/src/dedup_columns.py
Python | 97 lines | 79 code | 15 blank | 3 comment | 5 complexity | c11aabaabfa96cad32a5187689d75421 MD5 | raw file
- from subprocess import Popen, PIPE
- import pandas as pd
- import numpy as np
- import argparse
- import time
- from scipy.sparse import csr_matrix
- from sklearn import preprocessing
def get_hashes(f):
    """Compute an md5 hash of every data column of CSV file *f*.

    Writes a pickled DataFrame (index: column name; columns: 'no' = 1-based
    field number in the CSV, 'hash' = raw md5sum output bytes) to
    ``<f minus extension>_colhashes.pik`` for later use by dedup().
    """
    header = pd.read_csv(f, nrows=0, index_col='Id').columns
    hashmap = pd.DataFrame(index=header, columns=['no', 'hash'])
    hashmap.index.name = 'col'
    for idx, col in enumerate(header):
        # cut(1) fields are 1-based and field 1 is the Id column,
        # so the first data column is field 2.
        col_no = idx + 2
        command = 'cut -d, -f{} {}| tail -n +2 | md5sum'.format(col_no, f)
        # NOTE(review): shell=True with the filename interpolated into the
        # command string is only safe for trusted, shell-clean paths.
        proc = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = proc.communicate()
        # .ix was removed from pandas; .loc is the supported label indexer.
        hashmap.loc[col, :] = [col_no, stdout]
    hashmap.to_pickle('{}_colhashes.pik'.format(f[:-4]))
def save_sparse_csr(filename, array):
    """Persist CSR matrix *array* to ``filename``(.npz) via np.savez."""
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)
def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an .npz archive written by save_sparse_csr."""
    archive = np.load(filename)
    components = (archive['data'], archive['indices'], archive['indptr'])
    return csr_matrix(components, shape=archive['shape'])
def label_encode(df):
    """Integer-encode every column of *df* except 'Id'.

    Each non-Id column's values are mapped to codes 0..k-1 assigned in
    sorted order of the column's unique values — the same mapping that
    sklearn's LabelEncoder.fit_transform produces — implemented here with
    pd.factorize so the function needs no sklearn import. The 'Id' column
    is copied through unchanged. Returns a new DataFrame with the same
    columns and index as *df*.
    """
    df_enc = pd.DataFrame(columns=df.columns, index=df.index)
    for col in df.columns:
        if col == 'Id':
            df_enc[col] = df[col].values
        else:
            # sort=True assigns codes in sorted-unique order, matching
            # LabelEncoder's behavior exactly for homogeneous values.
            codes, _ = pd.factorize(df[col].values, sort=True)
            df_enc[col] = codes
    return df_enc
def dedup(filetype):
    """Drop duplicated columns from a train/test CSV pair based on the train file.

    Filetypes: ['categorical', 'date', 'numeric']. Reads the column-hash
    pickle produced by get_hashes plus data/train_<t>.csv and
    data/test_<t>.csv; writes sparse matrices and the surviving column
    names under data/dedup/.
    """
    t = filetype
    df_trn_hash = pd.read_pickle('data/train_{}_colhashes.pik'.format(t))
    # A column is "duplicated" when its md5 matches an earlier column's;
    # keep='first' retains the first occurrence of each hash.
    duplicated = df_trn_hash.duplicated(subset=['hash'], keep='first')
    print('{} / {} columns duplicated for train_{}. Dropping from both train and test sets.'.format(
        np.sum(duplicated), len(duplicated), t))
    keep_cols = ['Id'] + list(duplicated.index[~duplicated])

    # Process train file
    df_trn = pd.read_csv('data/train_{}.csv'.format(t))
    # .loc replaces the long-removed DataFrame.ix indexer.
    df_trn_dedup = df_trn.loc[:, keep_cols]
    if t == 'categorical':
        df_trn_dedup = label_encode(df_trn_dedup.fillna(''))
    df_trn_dedup_sparse = csr_matrix(df_trn_dedup.values)
    print('train size {}'.format(df_trn_dedup_sparse.shape))
    save_sparse_csr('data/dedup/train_{}_dedup_sparse'.format(t), df_trn_dedup_sparse)
    np.savetxt('data/dedup/{}_dedup_colnames.txt'.format(t), df_trn_dedup.columns, fmt='%s')
    # repr(...) called explicitly — the original passed the bound __repr__
    # method object to format(), printing the method instead of the repr.
    print('train {} converted to sparse: {}'.format(t, repr(df_trn_dedup_sparse)))
    # Release the dense train frames before loading the test file.
    del df_trn, df_trn_dedup

    # Process test file
    df_tst = pd.read_csv('data/test_{}.csv'.format(t))
    if t == 'numeric':
        # NOTE(review): the last surviving numeric column is excluded for
        # the test set — presumably a train-only target column; confirm.
        df_tst_dedup = df_tst[keep_cols[:-1]]
    else:
        df_tst_dedup = df_tst[keep_cols]
    if t == 'categorical':
        df_tst_dedup = label_encode(df_tst_dedup.fillna(''))
    df_tst_dedup_sparse = csr_matrix(df_tst_dedup.values)
    print('test size {}'.format(df_tst_dedup_sparse.shape))
    save_sparse_csr('data/dedup/test_{}_dedup_sparse'.format(t), df_tst_dedup_sparse)
    print('test {} converted to sparse: {}'.format(t, repr(df_tst_dedup_sparse)))
if __name__ == '__main__':
    # CLI entry point: dedupe one filetype's train/test column set.
    arg_parser = argparse.ArgumentParser()
    #arg_parser.add_argument('--filename', required=True, dest='filename')
    arg_parser.add_argument('--filetype', required=True, dest='filetype')
    parsed = arg_parser.parse_args()
    t0 = time.time()
    #get_hashes(parsed.filename)
    dedup(parsed.filetype)
    print('finished ({:.2f} sec elasped)'.format(time.time() - t0))