/src/dedup_columns.py
Python | 97 lines | 79 code | 15 blank | 3 comment | 5 complexity | c11aabaabfa96cad32a5187689d75421 MD5 | raw file
- from subprocess import Popen, PIPE
- import pandas as pd
- import numpy as np
- import argparse
- import time
- from scipy.sparse import csr_matrix
- from sklearn import preprocessing
def get_hashes(f):
    """Compute an md5 hash of every data column of CSV file *f*.

    Writes a pickled DataFrame (index: column name; columns: 'no' = 1-based
    field number in the CSV, 'hash' = raw md5sum output bytes) to
    ``<f minus extension>_colhashes.pik`` for later use by dedup().
    """
    header = pd.read_csv(f, nrows=0, index_col='Id').columns
    hashmap = pd.DataFrame(index=header, columns=['no', 'hash'])
    hashmap.index.name = 'col'
    for idx, col in enumerate(header):
        # cut(1) fields are 1-based and field 1 is the Id column,
        # so the first data column is field 2.
        col_no = idx + 2
        command = 'cut -d, -f{} {}| tail -n +2 | md5sum'.format(col_no, f)
        # NOTE(review): shell=True with the filename interpolated into the
        # command string is only safe for trusted, shell-clean paths.
        proc = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = proc.communicate()
        # .ix was removed from pandas; .loc is the supported label indexer.
        hashmap.loc[col, :] = [col_no, stdout]
    hashmap.to_pickle('{}_colhashes.pik'.format(f[:-4]))
def save_sparse_csr(filename, array):
    """Persist CSR matrix *array* to ``filename``(.npz) via np.savez."""
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)
def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an .npz archive written by save_sparse_csr."""
    archive = np.load(filename)
    components = (archive['data'], archive['indices'], archive['indptr'])
    return csr_matrix(components, shape=archive['shape'])
def label_encode(df):
    """Integer-encode every column of *df* except 'Id'.

    Each non-Id column's values are mapped to codes 0..k-1 assigned in
    sorted order of the column's unique values — the same mapping that
    sklearn's LabelEncoder.fit_transform produces — implemented here with
    pd.factorize so the function needs no sklearn import. The 'Id' column
    is copied through unchanged. Returns a new DataFrame with the same
    columns and index as *df*.
    """
    df_enc = pd.DataFrame(columns=df.columns, index=df.index)
    for col in df.columns:
        if col == 'Id':
            df_enc[col] = df[col].values
        else:
            # sort=True assigns codes in sorted-unique order, matching
            # LabelEncoder's behavior exactly for homogeneous values.
            codes, _ = pd.factorize(df[col].values, sort=True)
            df_enc[col] = codes
    return df_enc
def dedup(filetype):
    """Drop duplicated columns from a train/test CSV pair based on the train file.

    Filetypes: ['categorical', 'date', 'numeric']. Reads the column-hash
    pickle produced by get_hashes plus data/train_<t>.csv and
    data/test_<t>.csv; writes sparse matrices and the surviving column
    names under data/dedup/.
    """
    t = filetype
    df_trn_hash = pd.read_pickle('data/train_{}_colhashes.pik'.format(t))
    # A column is "duplicated" when its md5 matches an earlier column's;
    # keep='first' retains the first occurrence of each hash.
    duplicated = df_trn_hash.duplicated(subset=['hash'], keep='first')
    print('{} / {} columns duplicated for train_{}. Dropping from both train and test sets.'.format(
        np.sum(duplicated), len(duplicated), t))
    keep_cols = ['Id'] + list(duplicated.index[~duplicated])

    # Process train file
    df_trn = pd.read_csv('data/train_{}.csv'.format(t))
    # .loc replaces the long-removed DataFrame.ix indexer.
    df_trn_dedup = df_trn.loc[:, keep_cols]
    if t == 'categorical':
        df_trn_dedup = label_encode(df_trn_dedup.fillna(''))
    df_trn_dedup_sparse = csr_matrix(df_trn_dedup.values)
    print('train size {}'.format(df_trn_dedup_sparse.shape))
    save_sparse_csr('data/dedup/train_{}_dedup_sparse'.format(t), df_trn_dedup_sparse)
    np.savetxt('data/dedup/{}_dedup_colnames.txt'.format(t), df_trn_dedup.columns, fmt='%s')
    # repr(...) called explicitly — the original passed the bound __repr__
    # method object to format(), printing the method instead of the repr.
    print('train {} converted to sparse: {}'.format(t, repr(df_trn_dedup_sparse)))
    # Release the dense train frames before loading the test file.
    del df_trn, df_trn_dedup

    # Process test file
    df_tst = pd.read_csv('data/test_{}.csv'.format(t))
    if t == 'numeric':
        # NOTE(review): the last surviving numeric column is excluded for
        # the test set — presumably a train-only target column; confirm.
        df_tst_dedup = df_tst[keep_cols[:-1]]
    else:
        df_tst_dedup = df_tst[keep_cols]
    if t == 'categorical':
        df_tst_dedup = label_encode(df_tst_dedup.fillna(''))
    df_tst_dedup_sparse = csr_matrix(df_tst_dedup.values)
    print('test size {}'.format(df_tst_dedup_sparse.shape))
    save_sparse_csr('data/dedup/test_{}_dedup_sparse'.format(t), df_tst_dedup_sparse)
    print('test {} converted to sparse: {}'.format(t, repr(df_tst_dedup_sparse)))
if __name__ == '__main__':
    # CLI entry point: dedupe one filetype's train/test column set.
    arg_parser = argparse.ArgumentParser()
    #arg_parser.add_argument('--filename', required=True, dest='filename')
    arg_parser.add_argument('--filetype', required=True, dest='filetype')
    parsed = arg_parser.parse_args()
    t0 = time.time()
    #get_hashes(parsed.filename)
    dedup(parsed.filetype)
    print('finished ({:.2f} sec elasped)'.format(time.time() - t0))