I'm trying to automate csv insertion into a mysql database. I go through creating the database and tables in a Python script and then run it in a Jupyter Notebook. However, for some reason the final cursor.execute(SQL_STATEMENT)
seems to hang and I am not able to insert the csv values into the database. I get no logs suggesting why this is the case:
This is my csv_import_functions.py
:
import os
import numpy as np
import pandas as pd
import mysql.connector
def csv_files():
# get names of only csv files
csv_files = []
for file in os.listdir(os.getcwd()):
if file.endswith(".csv"):
csv_files.append(file)
return csv_files
def create_df(csv_files):
data_path = os.getcwd()+'/'
# loop through the files and create the dataframe
df = {}
for file in csv_files:
try:
df[file] = pd.read_csv(data_path+file)
except UnicodeDecodeError:
# if utf-8 encoding error
df[file] = pd.read_csv(data_path+file, encoding="ISO-8859-1")
print(file)
return df
def clean_tbl_name(filename):
# rename csv, force lower case, no spaces, no dashes
clean_tbl_name = filename.lower().replace(" ", "").replace(
"-", "_").replace(r"/", "_").replace("\\", "_").replace("$", "").replace("%", "")
tbl_name = '{0}'.format(clean_tbl_name.split('.')[0])
return tbl_name
def clean_colname(dataframe):
# force column names to be lower case, no spaces, no dashes
dataframe.columns = [x.lower().replace(" ", "_").replace("-", "_").replace(r"/", "_").replace(
"\\", "_").replace(".", "_").replace("$", "").replace("%", "") for x in dataframe.columns]
# processing data
replacements = {
'timedelta64[ns]': 'varchar(100)',
'object': 'varchar(100)',
'float64': 'float',
'int64': 'int',
'datetime64': 'timestamp'
}
col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(
dataframe.columns, dataframe.dtypes.replace(replacements)))
return col_str, dataframe.columns
def upload_to_db(host, database, user, password, tbl_name, col_str, file, dataframe, dataframe_columns):
conn_string = "host=%s, database=%s, user=%s, password=%s, port=%s" % (
host, database, user, password)
print("string is: " + conn_string)
conn = mysql.connector.connect(
host=host, database=database, user=user, password=password)
cursor = conn.cursor()
print('opened database successfully')
print("drop table if exists %s;" % (tbl_name))
print("create table %s (%s);" % (tbl_name, col_str))
# drop table with same name
cursor.execute("drop table if exists %s;" % (tbl_name))
# create table
cursor.execute("create table %s (%s);" % (tbl_name, col_str))
print('{0} was created successfully'.format(tbl_name))
# save df to csv
dataframe.to_csv(file, header=dataframe_columns,
index=False, encoding='utf-8')
col_names = col_str.replace(
' varchar(100)', '').replace(' int', '').replace(' float', '')
# upload to db
SQL_STATEMENT = """
LOAD DATA INFILE '%s' INTO TABLE %s
FIELDS TERMINATED BY ',' ENCLOSED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES
(%s);
""" % (os.getcwd().replace(os.sep, '/') + '/' + file, tbl_name, col_names)
print(SQL_STATEMENT)
cursor.execute(SQL_STATEMENT)
print('file copied to db')
cursor.execute("grant select on table %s to public" % tbl_name)
conn.commit()
cursor.close()
print('table {0} imported to db completed'.format(tbl_name))
return
And my Jupyter Notebook (main.ipynb
):
import os
import numpy as np
import pandas as pd
import mysql.connector
#main
from csv_import_functions import *
#settings
dataset_dir = 'datasets'
#db settings
host = 'localhost'
database = 'nba_data'
user = 'user'
password = 'password'
#configure environment and create main df
csv_files = csv_files()
df = create_df( csv_files)
for k in csv_files:
#call dataframe
dataframe = df[k]
#clean table name
tbl_name = clean_tbl_name(k)
#clean column names
col_str, dataframe.columns = clean_colname(dataframe)
#upload data to db
upload_to_db(host,
database,
user,
password,
tbl_name,
col_str,
file=k,
dataframe=dataframe,
dataframe_columns=dataframe.columns)
Finally, here is the output before it hangs:
nba-playbyplay.csv
string is: host=localhost, database=nba_data, user=user, password=password
opened database successfully
drop table if exists nba_playbyplay;
create table nba_playbyplay (url varchar(100), gametype varchar(100), location varchar(100), date varchar(100), time varchar(100), winningteam varchar(100), quarter int, secleft int, awayteam varchar(100), awayplay varchar(100), awayscore int, hometeam varchar(100), homeplay float, homescore int, shooter float, shottype float, shotoutcome float, shotdist float, assister float, blocker float, foultype float, fouler float, fouled float, rebounder float, reboundtype float, violationplayer float, violationtype float, timeoutteam float, freethrowshooter float, freethrowoutcome float, freethrownum float, entergame float, leavegame float, turnoverplayer float, turnovertype float, turnovercause float, turnovercauser float, jumpballawayplayer varchar(100), jumpballhomeplayer varchar(100), jumpballposs varchar(100));
Copyright License:
Author:「clattenburg cake」,Reproduced under the CC 4.0 BY-SA copyright license with link to original source & disclaimer.
Link to:https://stackoverflow.com/questions/71682385/automating-csv-to-mysql-insertion-in-python-cursor-execute-hangs