PageRenderTime 231ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/src/dedup_columns.py

https://gitlab.com/mbay/bosch
Python | 97 lines | 79 code | 15 blank | 3 comment | 5 complexity | c11aabaabfa96cad32a5187689d75421 MD5 | raw file
  1. from subprocess import Popen, PIPE
  2. import pandas as pd
  3. import numpy as np
  4. import argparse
  5. import time
  6. from scipy.sparse import csr_matrix
  7. from sklearn import preprocessing
  8. def get_hashes(f):
  9. header = pd.read_csv(f, nrows=0, index_col='Id').columns
  10. hashmap = pd.DataFrame(index=header, columns=['no', 'hash'])
  11. hashmap.index.name = 'col'
  12. for idx, col in enumerate(header):
  13. col_no = idx + 2
  14. command = 'cut -d, -f{} {}| tail -n +2 | md5sum'.format(col_no, f)
  15. proc = Popen(command, stdout=PIPE, stderr=PIPE, shell=True)
  16. stdout, stderr = proc.communicate()
  17. hashmap.ix[col, :] = [col_no, stdout]
  18. hashmap.to_pickle('{}_colhashes.pik'.format(f[:-4]))
  19. def save_sparse_csr(filename,array):
  20. np.savez(filename,data = array.data ,indices=array.indices,
  21. indptr =array.indptr, shape=array.shape )
  22. def load_sparse_csr(filename):
  23. loader = np.load(filename)
  24. return csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
  25. shape = loader['shape'])
  26. def label_encode(df):
  27. df_enc = pd.DataFrame(columns=df.columns, index=df.index)
  28. for col in df.columns:
  29. if col == 'Id':
  30. df_enc[col] = df[col].values
  31. else:
  32. enc = preprocessing.LabelEncoder()
  33. df_enc[col] = enc.fit_transform(df[col].values)
  34. return df_enc
  35. def dedup(filetype):
  36. # Drop duplicated columns from test and train file based on the train file.
  37. # Filetypes: ['categorical', 'date', 'numeric']
  38. t = filetype
  39. df_trn_hash = pd.read_pickle('data/train_{}_colhashes.pik'.format(t))
  40. duplicated = df_trn_hash.duplicated(subset=['hash'], keep='first')
  41. print('{} / {} columns duplicated for train_{}. Droppping from both train and test sets.'.format(np.sum(duplicated), len(duplicated), t))
  42. # Process train file
  43. df_trn = pd.read_csv('data/train_{}.csv'.format(t))
  44. df_trn_dedup = df_trn.ix[:, ['Id'] + list(duplicated.index[~duplicated])]
  45. if t == 'categorical':
  46. df_trn_dedup = label_encode(df_trn_dedup.fillna(''))
  47. df_trn_dedup_sparse = csr_matrix(df_trn_dedup.values)
  48. print 'train size {}'.format(df_trn_dedup_sparse.shape)
  49. save_sparse_csr('data/dedup/train_{}_dedup_sparse'.format(t), df_trn_dedup_sparse)
  50. np.savetxt('data/dedup/{}_dedup_colnames.txt'.format(t), df_trn_dedup.columns, fmt='%s')
  51. print 'train {} converted to sparse: {}'.format(t, df_trn_dedup_sparse.__repr__)
  52. del df_trn, df_trn_dedup
  53. # Process test file
  54. df_tst = pd.read_csv('data/test_{}.csv'.format(t))
  55. if t == 'numeric':
  56. df_tst_dedup = df_tst[['Id'] + list(duplicated.index[~duplicated][:-1])]
  57. else:
  58. df_tst_dedup = df_tst[ ['Id'] + list(duplicated.index[~duplicated])]
  59. if t == 'categorical':
  60. df_tst_dedup = label_encode(df_tst_dedup.fillna(''))
  61. df_tst_dedup_sparse = csr_matrix(df_tst_dedup.values)
  62. print 'test size {}'.format(df_tst_dedup_sparse.shape)
  63. save_sparse_csr('data/dedup/test_{}_dedup_sparse'.format(t), df_tst_dedup_sparse)
  64. print 'test {} converted to sparse: {}'.format(t, df_tst_dedup_sparse.__repr__)
  65. if __name__ == '__main__':
  66. parser = argparse.ArgumentParser()
  67. #parser.add_argument('--filename', required=True, dest='filename')
  68. parser.add_argument('--filetype', required=True, dest='filetype')
  69. args = parser.parse_args()
  70. start = time.time()
  71. #get_hashes(args.filename)
  72. dedup(args.filetype)
  73. print('finished ({:.2f} sec elasped)'.format(time.time() - start))