PageRenderTime 55ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/general/mysql_bulk_insert.py

https://bitbucket.org/nandinisa/projects
Python | 193 lines | 192 code | 0 blank | 1 comment | 0 complexity | 3882fb727024a60be05aeb7ac312d54a MD5 | raw file
Possible License(s): BSD-3-Clause
  1. # https://stackoverflow.com/questions/20868845/python-loading-a-set-of-csv-files-into-mysql
  2. config = {
  3. 'user': 'root',
  4. 'password': '',
  5. 'host': '127.0.0.1',
  6. 'port': '3306',
  7. 'database': '',
  8. 'raise_on_warnings': False,
  9. 'charset': 'utf8',
  10. }
  11. template = {
  12. 'dropDb': 'DROP DATABASE IF EXISTS {};',
  13. 'dropTable': 'DROP TABLE IF EXISTS {}.{};',
  14. 'batchLoad':"LOAD DATA LOCAL INFILE '{}' " \
  15. "INTO TABLE {}.{} " \
  16. "FIELDS " \
  17. "TERMINATED BY ',' " \
  18. "OPTIONALLY ENCLOSED BY '\"' " \
  19. "LINES TERMINATED BY '\\n' " \
  20. "IGNORE 1 LINES; ",
  21. 'createDb': 'CREATE DATABASE `{}` /*!40100 DEFAULT CHARACTER SET utf8 */;',
  22. 'checkTb': "SELECT COUNT(*) "\
  23. "FROM information_schema.tables "\
  24. "WHERE TABLE_SCHEMA = '{}' "\
  25. "AND TABLE_NAME = '{}';"\
  26. }
  27. def getConnection():
  28. try:
  29. cnx = mysql.connector.connect(**config)
  30. print('Connection susscessful')
  31. except mysql.connector.Error as err:
  32. if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
  33. print("Something is wrong with your user name or password")
  34. elif err.errno == errorcode.ER_BAD_DB_ERROR:
  35. print("Database does not exist")
  36. else:
  37. print(err)
  38. return cnx;
  39. def dropDbSQL(dbname):
  40. return template['dropDb'].format(dbname)
  41. def createDbSQL(dbname):
  42. return template['createDb'].format(dbname)
  43. def checkTableSQL(dbname, tbname):
  44. return template['checkTb'].format(dbname, tbname)
  45. def dropTableSQL(dbname, tbname):
  46. return template['dropTable'].format(dbname, tbname)
  47. def createTableSQL(dbname, tbname, header, addIdx = False):
  48. tableTemplate = \
  49. "CREATE TABLE {}.{}(" \
  50. "{}" \
  51. ") ENGINE = INNODB;"
  52. columnIdTemplate = "`{0}` BIGINT NULL, INDEX `idx.{0}` using hash (`{0}`)," if addIdx else "`{0}` BIGINT NULL,";
  53. columnDateTimeTemplate = "`{0}` DATETIME NULL,"; # Need to validate for different time formats
  54. columnTemplate = "`{0}` text NULL," # or change this to text or varchar(400)
  55. cols = ''
  56. idIdx = [i for i, x in enumerate(header) if x.lower().endswith("id")]
  57. dateTimeIdx = [i for i, x in enumerate(header) if x.lower().endswith("dttm")]
  58. for idx in xrange(0, len(header)):
  59. if idx in idIdx:
  60. cols += columnIdTemplate.format(header[idx])
  61. elif idx in dateTimeIdx:
  62. cols += columnDateTimeTemplate.format(header[idx])
  63. else:
  64. cols += columnTemplate.format(header[idx])
  65. cols = cols[:-1]
  66. return tableTemplate.format(dbname, tbname, cols)
  67. def insertTableSQL(dbname, tbname, filePath):
  68. return template['batchLoad'].format(filePath, dbname, tbname)
  69. def checkForDuplciates(header):
  70. # https://stackoverflow.com/questions/30650474/python-rename-duplicates-in-list-with-progressive-numbers-without-sorting-list
  71. from collections import Counter
  72. counts = Counter(header)
  73. for s, num in counts.items():
  74. if num > 1:
  75. for suffix in range(1, num + 1):
  76. header[header.index(s)] = s + str(suffix)
  77. return header
  78. def executeBatch(args):
  79. dirPath = args.f
  80. config['database'] = args.db
  81. config['password'] = args.p
  82. config['user'] = args.u if args.u else 'root'
  83. dbName = config['database'].strip()
  84. dropdb = args.newdb if args.newdb else False
  85. droptb = args.newtb if args.newtb else False
  86. connection = getConnection()
  87. if connection is not None:
  88. connection.database = dbName
  89. cursor = connection.cursor()
  90. csvFiles = glob.glob(os.path.join(dirPath, '*.csv'))
  91. try:
  92. if (dropdb):
  93. cursor.execute(dropDbSQL(dbName))
  94. cursor.execute(createDbSQL(dbName))
  95. for file in csvFiles:
  96. tbName = os.path.splitext(os.path.basename(file))[0].strip()
  97. # https://stackoverflow.com/questions/17262256/how-to-read-one-single-line-of-csv-data-in-python
  98. cursor.execute(checkTableSQL(dbName, tbName))
  99. result = cursor.fetchone()
  100. if droptb or result[0] == 0:
  101. with codecs.open(file, encoding="utf-8-sig") as f:
  102. reader = csv.DictReader(f, quotechar='"', delimiter=',',quoting=csv.QUOTE_ALL)
  103. header = checkForDuplciates(reader.fieldnames)
  104. print("Processing file {} ...".format(tbName))
  105. # use header to create table columns
  106. loadScript = insertTableSQL(dbName, tbName, f.name)
  107. cursor.execute(dropTableSQL(dbName, tbName))
  108. cursor.execute(createTableSQL(dbName, tbName, header, True))
  109. cursor.execute(loadScript)
  110. except mysql.connector.Error as err:
  111. if err.errno == errorcode.ER_CANT_CREATE_TABLE:
  112. print("Can't create table for file {} ...".format(tbName))
  113. if err.errno == errorcode.ER_TOO_BIG_ROWSIZE:
  114. print("Maximum row size for import reached for file {} ...".format(tbName))
  115. else:
  116. print("Processing failed for file {} ...".format(tbName))
  117. print(err.msg)
  118. print('Import completed ...')
  119. cursor.close()
  120. connection.commit()
  121. connection.close()
  122. print('Starting data import to austin database ...')
  123. #https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
  124. def str2bool(v):
  125. if v.lower() in ('yes', 'true', 't', 'y', '1'):
  126. return True
  127. elif v.lower() in ('no', 'false', 'f', 'n', '0'):
  128. return False
  129. else:
  130. raise argparse.ArgumentTypeError('Boolean value expected.')
  131. if __name__ == "__main__":
  132. import argparse, sys
  133. import csv
  134. import glob
  135. import os
  136. import codecs
  137. import mysql.connector
  138. from mysql.connector import errorcode
  139. parser = argparse.ArgumentParser()
  140. parser.add_argument('-db', help='database name')
  141. parser.add_argument('-f', help='input csv file path')
  142. parser.add_argument('-u', help='username')
  143. parser.add_argument('-p', help='password')
  144. parser.add_argument('-newdb', help='drop existing database', nargs='?',
  145. type = str2bool, const = True, default= True)
  146. parser.add_argument('-newtb', help='drop existing table', nargs='?',
  147. type=str2bool, const=True, default=True)
  148. args = parser.parse_args()
  149. print(args)
  150. # mysql_bulk_insert.py -f path -d dbname -p password -u username
  151. executeBatch(args)