convert.py | searchcode

/contactsjson/convert.py

https://gitlab.com/jeffglover/contactsjsonmod · Python · 111 lines · 53 code · 22 blank · 36 comment · 10 complexity · fbb8ad888c1c956e76515cc1c7bb383f MD5 · raw file

# -*- coding: utf-8 -*-

import pandas
import re
import logging


class RowSanitizer(object):
    '''
    Callable that checks a single column of a DataFrame row

    column_name is the key looked up in each row and validator_func is
    called with the value found under that key.
    '''

    def __init__(self, column_name, validator_func):
        self.validator_func = validator_func
        self.column_name = column_name

    def __call__(self, row):
        '''
        Validates a row

        Looks up column_name in the row, hands the value to validator_func
        and reduces whatever it returns to a plain boolean.
        Returns True if valid and False if invalid
        '''
        value = row[self.column_name]
        verdict = self.validator_func(value)
        return True if verdict else False


class CSVtoJSON(object):
    '''
    Converts one or more CSV files to a single JSON file

    Every input file is read into a DataFrame indexed by index_column, the
    frames are concatenated, duplicate rows are dropped and the result is
    written to output_file as a JSON array of records.

    Supports validating rows; currently, when enabled, every input file is
    expected to have an 'email' column. Rows whose email does not match
    valid_email are logged and discarded.
    '''

    # local-part@domain.tld -- the TLD needs at least one letter so that
    # addresses ending in a bare dot ("user@example.") are rejected
    valid_email = re.compile(
        r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]+$")

    def __init__(self, input_files, output_file, index_column, santize_rows=True, preserve_index_column=True):
        '''
        input_files: iterable of CSV paths (anything pandas.read_csv accepts)
        output_file: destination handed to DataFrame.to_json
        index_column: CSV column used as the DataFrame index
        santize_rows: when True, rows with an invalid email are dropped
        preserve_index_column: when True, the index column is written to the
            JSON records as a regular field
        '''
        self.input_files = input_files
        self.output_file = output_file
        self.index_column = index_column
        self.santize_rows = santize_rows
        self.preserve_index_column = preserve_index_column

        if self.santize_rows:
            # initialize the RowSanitizer to validate the 'email' column
            self.email_column = "email"
            self.email_sanitizer = RowSanitizer(
                self.email_column, self._is_valid_email)

        self.log = logging.getLogger(self.__class__.__name__)

    @classmethod
    def _is_valid_email(cls, value):
        '''
        Returns True if value is a string matching valid_email, else False

        Missing CSV cells arrive as float NaN; handing those to the regex
        raises TypeError, so anything the regex cannot examine is invalid.
        '''
        try:
            return cls.valid_email.match(value) is not None
        except TypeError:
            return False

    def convert(self):
        '''
        Entry point
        converts all input_files to a JSON file named output_file

        Re-raises any exception hit while writing the output, after logging
        it. pandas.concat raises ValueError when no frame could be read.
        '''
        full_df = pandas.concat(self.read_csvs())

        # drop duplicates and warn the user if this happens
        # NOTE: drop_duplicates compares column values only, the index
        # (index_column) does not take part in the comparison
        prepared_df = full_df.drop_duplicates()
        dropped_rows = len(full_df) - len(prepared_df)
        if dropped_rows:
            self.log.warning("droped %d duplicate rows", dropped_rows)

        # resetting the index moves it into the DataFrame columns, which
        # preserves it when writing out to JSON with orient='records'
        if self.preserve_index_column:
            prepared_df = prepared_df.reset_index()

        try:
            prepared_df.to_json(self.output_file, orient='records')
        except Exception as e:
            # catch the Exception, log it then re-raise it
            self.log.critical("unable to write file '%s'. %s", self.output_file, e)
            raise
        else:
            self.log.info("wrote JSON output -> %s", self.output_file)

    def read_csvs(self):
        '''
        Reads all the CSVs and returns as a list of DataFrames

        An entry is None for a file that read_csv could not process.
        '''
        return [self.read_csv(filepath) for filepath in self.input_files]

    def read_csv(self, filepath):
        '''
        Converts a single CSV file to a DataFrame

        Expects column index_column to exist; it becomes the index.
        Returns a DataFrame (only the valid rows when santize_rows is set),
        or None if a TypeError was raised while processing the file.
        '''
        self.log.debug("reading <- '%s'", filepath)

        try:
            df = pandas.read_csv(filepath, index_col=self.index_column)
            if self.santize_rows:
                df = self._drop_invalid_rows(df, filepath)
        except TypeError as e:
            self.log.error("unable to process file '%s': %s", filepath, e)
            # never hand back a frame that was only partially processed
            return None

        return df

    def _drop_invalid_rows(self, df, filepath):
        '''
        Logs every row rejected by email_sanitizer and returns the others

        The verdicts live in a separate boolean mask instead of a temporary
        column, so a CSV column that happens to be called 'valid' is left
        alone and a file without any data rows is handled as well.
        '''
        verdicts = [self.email_sanitizer(row) for _, row in df.iterrows()]
        valid = pandas.Series(verdicts, index=df.index, dtype=bool)

        # log invalid rows
        for row in df[~valid].itertuples():
            self.log.warning("%s: invalid row: %s", filepath, row)

        return df[valid]