PageRenderTime 147ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/contactsjson/convert.py

https://gitlab.com/jeffglover/contactsjsonmod
Python | 111 lines | 53 code | 22 blank | 36 comment | 10 complexity | fbb8ad888c1c956e76515cc1c7bb383f MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import pandas
  3. import re
  4. import logging
  5. class RowSanitizer(object):
  6. '''
  7. Generic helper for sanitizing rows of a DataFrame
  8. '''
  9. def __init__(self, column_name, validator_func):
  10. self.column_name = column_name
  11. self.validator_func = validator_func
  12. def __call__(self, row):
  13. '''
  14. Santizes a row
  15. Returns True if valid and False if invalid
  16. '''
  17. return bool(self.validator_func(row[self.column_name]))
  18. class CSVtoJSON(object):
  19. '''
  20. converts CSV files to a single JSON file
  21. supports validating rows, currently when enabled expects an email column
  22. '''
  23. valid_email = re.compile(
  24. r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$")
  25. def __init__(self, input_files, output_file, index_column, santize_rows=True, preserve_index_column=True):
  26. self.input_files = input_files
  27. self.output_file = output_file
  28. self.index_column = index_column
  29. self.santize_rows = santize_rows
  30. self.preserve_index_column = preserve_index_column
  31. if self.santize_rows:
  32. # initilize the RowSantizer to run a regex match on the 'email'
  33. # column
  34. self.email_column = "email"
  35. self.email_sanitizer = RowSanitizer(
  36. self.email_column, self.valid_email.match)
  37. self.log = logging.getLogger(self.__class__.__name__)
  38. def convert(self):
  39. '''
  40. Entry point
  41. converts all input_files to a JSON file named output_file
  42. '''
  43. # GO!
  44. full_df = pandas.concat(self.read_csvs())
  45. # drop duplicates and warn the user if this happens
  46. prepared_df = full_df.drop_duplicates()
  47. droped_rows = len(full_df) - len(prepared_df)
  48. if droped_rows != 0:
  49. self.log.warn("droped %d duplicate rows", droped_rows)
  50. # reseting the index will move the index as a part of the DataFrame columns
  51. # which will preserve them when writing out to JSON
  52. if self.preserve_index_column:
  53. prepared_df.reset_index(inplace=True)
  54. try:
  55. prepared_df.to_json(self.output_file, orient='records')
  56. self.log.info("wrote JSON output -> %s", self.output_file)
  57. except Exception as e:
  58. # cath the Exception, log it then reraise it
  59. self.log.critical("unable to write file '%s'. %s", self.output_file, e)
  60. raise
  61. def read_csvs(self):
  62. '''
  63. Reads all the CSVs and returns as a list of DataFrames
  64. '''
  65. return [self.read_csv(filepath) for filepath in self.input_files]
  66. def read_csv(self, filepath):
  67. '''
  68. Converts a single CSV file to a DataFrame
  69. Returns a DataFrame
  70. '''
  71. self.log.debug("reading <- '%s'", filepath)
  72. df = None
  73. # read csv
  74. # expect column index_column to exist and it will be the index
  75. try:
  76. df = pandas.read_csv(filepath, index_col=self.index_column)
  77. if self.santize_rows:
  78. # santize the rows
  79. df['valid'] = df.apply(self.email_sanitizer, axis=1)
  80. # log invalid rows
  81. invalid_df = df[df['valid'] == False]
  82. for row in invalid_df.itertuples():
  83. self.log.warn("%s: invalid row: %s", filepath, row)
  84. # return all valid rows, drop the valid column that we created
  85. df = df[df['valid'] == True].drop('valid', axis=1)
  86. except TypeError as e:
  87. self.log.error("unable to process file '%s': %s", filepath, e)
  88. return df