/contactsjson/convert.py
Python | 111 lines | 53 code | 22 blank | 36 comment | 10 complexity | fbb8ad888c1c956e76515cc1c7bb383f MD5 | raw file
- # -*- coding: utf-8 -*-
- import pandas
- import re
- import logging
class RowSanitizer(object):
    '''
    Callable that validates a single column of a DataFrame row

    An instance is meant to be handed to DataFrame.apply(..., axis=1);
    it looks up one column of the row and passes the value to a validator.
    '''

    def __init__(self, column_name, validator_func):
        '''
        column_name: key used to look the value up in each row
        validator_func: callable taking the column value; its result is
                        interpreted by truthiness
        '''
        self.validator_func = validator_func
        self.column_name = column_name

    def __call__(self, row):
        '''
        Validates a row
        Returns True if the validator result is truthy, False otherwise
        '''
        cell = row[self.column_name]
        verdict = self.validator_func(cell)
        return True if verdict else False
class CSVtoJSON(object):
    '''
    converts CSV files to a single JSON file
    supports validating rows, currently when enabled expects an email column
    '''

    # Loose shape check for an e-mail address. The trailing group uses '+'
    # so that an address ending in a bare dot ("user@example.") is rejected.
    valid_email = re.compile(
        r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]+$")

    def __init__(self, input_files, output_file, index_column,
                 santize_rows=True, preserve_index_column=True):
        '''
        input_files: iterable of CSV sources handed to pandas.read_csv
        output_file: destination handed to DataFrame.to_json
        index_column: column used as the DataFrame index when reading
        santize_rows: when True, rows whose 'email' value is invalid are
                      dropped (and logged) while reading
        preserve_index_column: when True, the index column is written to
                               the JSON records as a regular field
        '''
        self.input_files = input_files
        self.output_file = output_file
        self.index_column = index_column
        self.santize_rows = santize_rows
        self.preserve_index_column = preserve_index_column
        if self.santize_rows:
            # initialize the RowSanitizer to validate the 'email' column
            self.email_column = "email"
            self.email_sanitizer = RowSanitizer(
                self.email_column, self._is_valid_email)
        self.log = logging.getLogger(self.__class__.__name__)

    @classmethod
    def _is_valid_email(cls, value):
        '''
        Returns True if value is a string matching valid_email

        Non-string values (for example the NaN pandas produces for an empty
        cell) make the regex raise TypeError; they are reported as invalid
        instead of aborting sanitization of the whole file.
        '''
        try:
            return cls.valid_email.match(value) is not None
        except TypeError:
            return False

    def convert(self):
        '''
        Entry point
        converts all input_files to a JSON file named output_file

        Any exception raised while writing is logged and re-raised.
        '''
        # pandas.concat skips None entries (files that could not be read)
        # and raises if there is nothing left to concatenate
        full_df = pandas.concat(self.read_csvs())
        # resetting the index moves it into the DataFrame columns, which
        # preserves it when writing out to JSON. This must happen BEFORE
        # de-duplication: drop_duplicates ignores the index, so rows that
        # differ only in their index value would otherwise be discarded.
        if self.preserve_index_column:
            full_df = full_df.reset_index()
        # drop duplicates and warn the user if this happens
        prepared_df = full_df.drop_duplicates()
        dropped_rows = len(full_df) - len(prepared_df)
        if dropped_rows:
            self.log.warning("droped %d duplicate rows", dropped_rows)
        try:
            prepared_df.to_json(self.output_file, orient='records')
            self.log.info("wrote JSON output -> %s", self.output_file)
        except Exception as e:
            # catch the Exception, log it then re-raise it
            self.log.critical(
                "unable to write file '%s'. %s", self.output_file, e)
            raise

    def read_csvs(self):
        '''
        Reads all the CSVs and returns as a list of DataFrames
        (an entry is None when the corresponding file could not be read)
        '''
        return [self.read_csv(filepath) for filepath in self.input_files]

    def read_csv(self, filepath):
        '''
        Converts a single CSV file to a DataFrame
        Returns a DataFrame, or None if pandas raised TypeError while reading

        When santize_rows is enabled only rows with a valid email are
        returned; invalid rows are logged as warnings.
        Raises KeyError if sanitizing is enabled and the email column is
        missing.
        '''
        self.log.debug("reading <- '%s'", filepath)
        # read csv
        # expect column index_column to exist and it will be the index
        try:
            df = pandas.read_csv(filepath, index_col=self.index_column)
        except TypeError as e:
            self.log.error("unable to process file '%s': %s", filepath, e)
            return None
        if not self.santize_rows:
            return df
        if self.email_column not in df.columns:
            raise KeyError(
                "%s: missing column '%s'" % (filepath, self.email_column))
        if df.empty:
            # nothing to validate; apply() on an empty frame would not
            # produce a boolean Series
            return df
        # the mask is kept outside the frame so that a real CSV column
        # named 'valid' is neither overwritten nor dropped
        valid = df.apply(self.email_sanitizer, axis=1).astype(bool)
        # log invalid rows
        for row in df[~valid].itertuples():
            self.log.warning("%s: invalid row: %s", filepath, row)
        # return all valid rows
        return df[valid]