/general/mysql_bulk_insert.py
Python | 193 lines | 192 code | 0 blank | 1 comment | 0 complexity | 3882fb727024a60be05aeb7ac312d54a MD5 | raw file
Possible License(s): BSD-3-Clause
- # https://stackoverflow.com/questions/20868845/python-loading-a-set-of-csv-files-into-mysql
- config = {
- 'user': 'root',
- 'password': '',
- 'host': '127.0.0.1',
- 'port': '3306',
- 'database': '',
- 'raise_on_warnings': False,
- 'charset': 'utf8',
- }
- template = {
- 'dropDb': 'DROP DATABASE IF EXISTS {};',
- 'dropTable': 'DROP TABLE IF EXISTS {}.{};',
- 'batchLoad':"LOAD DATA LOCAL INFILE '{}' " \
- "INTO TABLE {}.{} " \
- "FIELDS " \
- "TERMINATED BY ',' " \
- "OPTIONALLY ENCLOSED BY '\"' " \
- "LINES TERMINATED BY '\\n' " \
- "IGNORE 1 LINES; ",
- 'createDb': 'CREATE DATABASE `{}` /*!40100 DEFAULT CHARACTER SET utf8 */;',
- 'checkTb': "SELECT COUNT(*) "\
- "FROM information_schema.tables "\
- "WHERE TABLE_SCHEMA = '{}' "\
- "AND TABLE_NAME = '{}';"\
- }
- def getConnection():
- try:
- cnx = mysql.connector.connect(**config)
- print('Connection susscessful')
- except mysql.connector.Error as err:
- if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
- print("Something is wrong with your user name or password")
- elif err.errno == errorcode.ER_BAD_DB_ERROR:
- print("Database does not exist")
- else:
- print(err)
- return cnx;
- def dropDbSQL(dbname):
- return template['dropDb'].format(dbname)
- def createDbSQL(dbname):
- return template['createDb'].format(dbname)
- def checkTableSQL(dbname, tbname):
- return template['checkTb'].format(dbname, tbname)
- def dropTableSQL(dbname, tbname):
- return template['dropTable'].format(dbname, tbname)
- def createTableSQL(dbname, tbname, header, addIdx = False):
- tableTemplate = \
- "CREATE TABLE {}.{}(" \
- "{}" \
- ") ENGINE = INNODB;"
- columnIdTemplate = "`{0}` BIGINT NULL, INDEX `idx.{0}` using hash (`{0}`)," if addIdx else "`{0}` BIGINT NULL,";
- columnDateTimeTemplate = "`{0}` DATETIME NULL,"; # Need to validate for different time formats
- columnTemplate = "`{0}` text NULL," # or change this to text or varchar(400)
- cols = ''
- idIdx = [i for i, x in enumerate(header) if x.lower().endswith("id")]
- dateTimeIdx = [i for i, x in enumerate(header) if x.lower().endswith("dttm")]
- for idx in xrange(0, len(header)):
- if idx in idIdx:
- cols += columnIdTemplate.format(header[idx])
- elif idx in dateTimeIdx:
- cols += columnDateTimeTemplate.format(header[idx])
- else:
- cols += columnTemplate.format(header[idx])
- cols = cols[:-1]
- return tableTemplate.format(dbname, tbname, cols)
- def insertTableSQL(dbname, tbname, filePath):
- return template['batchLoad'].format(filePath, dbname, tbname)
- def checkForDuplciates(header):
- # https://stackoverflow.com/questions/30650474/python-rename-duplicates-in-list-with-progressive-numbers-without-sorting-list
- from collections import Counter
- counts = Counter(header)
- for s, num in counts.items():
- if num > 1:
- for suffix in range(1, num + 1):
- header[header.index(s)] = s + str(suffix)
- return header
- def executeBatch(args):
- dirPath = args.f
- config['database'] = args.db
- config['password'] = args.p
- config['user'] = args.u if args.u else 'root'
- dbName = config['database'].strip()
- dropdb = args.newdb if args.newdb else False
- droptb = args.newtb if args.newtb else False
- connection = getConnection()
- if connection is not None:
- connection.database = dbName
- cursor = connection.cursor()
- csvFiles = glob.glob(os.path.join(dirPath, '*.csv'))
- try:
- if (dropdb):
- cursor.execute(dropDbSQL(dbName))
- cursor.execute(createDbSQL(dbName))
- for file in csvFiles:
- tbName = os.path.splitext(os.path.basename(file))[0].strip()
- # https://stackoverflow.com/questions/17262256/how-to-read-one-single-line-of-csv-data-in-python
- cursor.execute(checkTableSQL(dbName, tbName))
- result = cursor.fetchone()
- if droptb or result[0] == 0:
- with codecs.open(file, encoding="utf-8-sig") as f:
- reader = csv.DictReader(f, quotechar='"', delimiter=',',quoting=csv.QUOTE_ALL)
- header = checkForDuplciates(reader.fieldnames)
- print("Processing file {} ...".format(tbName))
- # use header to create table columns
- loadScript = insertTableSQL(dbName, tbName, f.name)
- cursor.execute(dropTableSQL(dbName, tbName))
- cursor.execute(createTableSQL(dbName, tbName, header, True))
- cursor.execute(loadScript)
- except mysql.connector.Error as err:
- if err.errno == errorcode.ER_CANT_CREATE_TABLE:
- print("Can't create table for file {} ...".format(tbName))
- if err.errno == errorcode.ER_TOO_BIG_ROWSIZE:
- print("Maximum row size for import reached for file {} ...".format(tbName))
- else:
- print("Processing failed for file {} ...".format(tbName))
- print(err.msg)
- print('Import completed ...')
- cursor.close()
- connection.commit()
- connection.close()
- print('Starting data import to austin database ...')
- #https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
- def str2bool(v):
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
- if __name__ == "__main__":
- import argparse, sys
- import csv
- import glob
- import os
- import codecs
- import mysql.connector
- from mysql.connector import errorcode
- parser = argparse.ArgumentParser()
- parser.add_argument('-db', help='database name')
- parser.add_argument('-f', help='input csv file path')
- parser.add_argument('-u', help='username')
- parser.add_argument('-p', help='password')
- parser.add_argument('-newdb', help='drop existing database', nargs='?',
- type = str2bool, const = True, default= True)
- parser.add_argument('-newtb', help='drop existing table', nargs='?',
- type=str2bool, const=True, default=True)
- args = parser.parse_args()
- print(args)
- # mysql_bulk_insert.py -f path -d dbname -p password -u username
- executeBatch(args)