PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/src/generate_e4.py

https://gitlab.com/tianzhou2011/talkingdata
Python | 213 lines | 210 code | 2 blank | 1 comment | 0 complexity | 3514441bc817a8ccea7060cdb35188a1 MD5 | raw file
  1. #!/usr/bin/env python
  2. from __future__ import division
  3. from scipy import sparse
  4. from sklearn.datasets import dump_svmlight_file
  5. from sklearn.preprocessing import LabelEncoder, OneHotEncoder
  6. import argparse
  7. import logging
  8. import numpy as np
  9. import os
  10. import pandas as pd
  11. import time
  12. def generate_feature(train_file, test_file, app_event_file, app_label_file, event_file, label_file,
  13. phone_file, train_feature_file, test_feature_file):
  14. """Generate features based on Dune Dweller's script."""
  15. logging.info('loading raw data files')
  16. trn = pd.read_csv(train_file, index_col='device_id', usecols=['device_id', 'group'])
  17. tst = pd.read_csv(test_file, index_col='device_id')
  18. app_event = pd.read_csv(app_event_file, usecols=['event_id', 'app_id', 'is_active'],
  19. dtype={'is_active': bool})
  20. app_label = pd.read_csv(app_label_file)
  21. event = pd.read_csv(event_file, parse_dates=['timestamp'], index_col='event_id')
  22. label = pd.read_csv(label_file, index_col='label_id')
  23. phone = pd.read_csv(phone_file)
  24. logging.info('removeing gender and age from training, set groups to 0 for test')
  25. tst['group'] = 0
  26. logging.info('label-encoding group in training data')
  27. trn.group = LabelEncoder().fit_transform(trn.group)
  28. logging.info('adding row ids to training and test data')
  29. trn['row_id_trn'] = np.arange(trn.shape[0])
  30. tst['row_id_tst'] = np.arange(tst.shape[0])
  31. logging.info('training data frame:\n{}'.format(trn.head()))
  32. logging.info('test data frame:\n{}'.format(trn.head()))
  33. logging.info('combining training and test data')
  34. df = pd.concat([trn, tst], axis=0)
  35. logging.info('removing duplicates from phone data')
  36. phone = phone.drop_duplicates('device_id', keep='first').set_index('device_id')
  37. logging.info('label-encodeing phone brand and device model')
  38. phone.ix[:, 'phone_brand'] = LabelEncoder().fit_transform(phone.phone_brand)
  39. logging.debug('# phone_brand: {}'.format(phone.phone_brand.nunique()))
  40. phone.ix[:, 'device_model'] = LabelEncoder().fit_transform(phone.device_model)
  41. logging.debug('# device_model: {}'.format(phone.device_model.nunique()))
  42. logging.info('phone data frame:\n{}'.format(phone.head()))
  43. logging.info('joining with phone data')
  44. trn = pd.merge(trn, phone, left_index=True, right_index=True, how='left')
  45. tst = pd.merge(tst, phone, left_index=True, right_index=True, how='left')
  46. X_brand_trn = sparse.csr_matrix((np.ones(trn.shape[0]), (trn.row_id_trn, trn.phone_brand)))
  47. X_brand_tst = sparse.csr_matrix((np.ones(tst.shape[0]), (tst.row_id_tst, tst.phone_brand)))
  48. logging.debug('phone brand data: train: {}, test: {}'.format(X_brand_trn.shape, X_brand_tst.shape))
  49. X_model_trn = sparse.csr_matrix((np.ones(trn.shape[0]), (trn.row_id_trn, trn.device_model)))
  50. X_model_tst = sparse.csr_matrix((np.ones(tst.shape[0]), (tst.row_id_tst, tst.device_model)))
  51. logging.debug('device model data: train: {}, test: {}'.format(X_model_trn.shape, X_model_tst.shape))
  52. logging.info('removing app labels not associated with app ids in app_event')
  53. app_label = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())]
  54. logging.info('label-encoding app_id in app_event')
  55. lbe_app_id = LabelEncoder()
  56. app_event.ix[:, 'app_id'] = lbe_app_id.fit_transform(app_event.app_id)
  57. logging.debug('# app_id: {}'.format(app_event.app_id.nunique()))
  58. logging.info('joining app event data with event data to get device ids')
  59. app_event = pd.merge(app_event, event[['device_id']], left_on='event_id', right_index=True, how='left')
  60. logging.info('joining app event data with training and test row ids')
  61. device_app_event = (app_event.groupby(['device_id', 'app_id'])['app_id']
  62. .agg(['size'])
  63. .merge(trn[['row_id_trn']], how='left', left_index=True, right_index=True)
  64. .merge(tst[['row_id_tst']], how='left', left_index=True, right_index=True)
  65. .reset_index())
  66. logging.debug('device_app_event:\n{}'.format(device_app_event.head()))
  67. device_app_event.columns = ['device_id', 'app_id', 'n_app_event', 'row_id_trn', 'row_id_tst']
  68. device_app_event_trn = device_app_event.dropna(subset=['row_id_trn'])
  69. device_app_event_tst = device_app_event.dropna(subset=['row_id_tst'])
  70. n_app = len(lbe_app_id.classes_)
  71. X_app_event_trn = sparse.csr_matrix((np.log2(1 + device_app_event_trn.n_app_event), (device_app_event_trn.row_id_trn,
  72. device_app_event_trn.app_id)), shape=(trn.shape[0], n_app))
  73. X_app_event_tst = sparse.csr_matrix((np.log2(1 + device_app_event_tst.n_app_event), (device_app_event_tst.row_id_tst,
  74. device_app_event_tst.app_id)), shape=(tst.shape[0], n_app))
  75. logging.debug('app event data: train: {}, test: {}'.format(X_app_event_trn.shape, X_app_event_tst.shape))
  76. logging.info('label-encoding app_id and label_id in app_label')
  77. app_label.ix[:, 'app_id'] = lbe_app_id.transform(app_label.app_id)
  78. lbe_label = LabelEncoder()
  79. app_label.ix[:, 'label_id'] = lbe_label.fit_transform(app_label.label_id)
  80. logging.info('joining app_label with app event data above to get device ids')
  81. device_app_label = pd.merge(device_app_event[['device_id', 'app_id']], app_label[['app_id', 'label_id']])
  82. logging.debug('device_app_label:\n{}'.format(device_app_label.head()))
  83. device_label = (device_app_label.groupby(['device_id', 'label_id'])['label_id'].agg(['size'])
  84. .merge(trn[['row_id_trn']], how='left', left_index=True, right_index=True)
  85. .merge(tst[['row_id_tst']], how='left', left_index=True, right_index=True)
  86. .reset_index())
  87. logging.debug('device_label:\n{}'.format(device_label.head()))
  88. device_label.columns = ['device_id', 'label_id', 'n_app_label', 'row_id_trn', 'row_id_tst']
  89. device_label_trn = device_label.dropna(subset=['row_id_trn'])
  90. device_label_tst = device_label.dropna(subset=['row_id_tst'])
  91. n_label = len(lbe_label.classes_)
  92. X_app_label_trn = sparse.csr_matrix((np.log2(1 + device_label_trn.n_app_label), (device_label_trn.row_id_trn,
  93. device_label_trn.label_id)), shape=(trn.shape[0], n_label))
  94. X_app_label_tst = sparse.csr_matrix((np.log2(1 + device_label_tst.n_app_label), (device_label_tst.row_id_tst,
  95. device_label_tst.label_id)), shape=(tst.shape[0], n_label))
  96. logging.debug('app label data: train: {}, test: {}'.format(X_app_label_trn.shape, X_app_label_tst.shape))
  97. # EA 1.0: stats on events pehour-of-the-week and week-of-the-year
  98. aux = app_event.merge(event[['timestamp']], how='left',left_on='event_id',right_index=True)
  99. aux['hour'] = aux['timestamp'].dt.hour
  100. # EA 1.2: stats of log1p(number of events in each week of the year)
  101. aux3 = aux.groupby(['device_id','app_id','hour'])['event_id'].count().reset_index().rename(columns={'event_id':'num_events'})
  102. # start with min num_events/hour
  103. stats_df = (aux3.groupby(['device_id','hour'])['num_events'].min().reset_index().rename(columns={'num_events':'nevents_min'}).fillna(0))
  104. # add max num_events/hour
  105. stats_df = (stats_df.merge(aux3.groupby(['device_id','hour'])['num_events'].max().reset_index().rename(columns={'num_events':'nevents_max'}),
  106. on=['device_id', 'hour'], how='outer', left_index=True, right_index=True).fillna(0))
  107. # add mean num_events/hour
  108. stats_df = (stats_df.merge(aux3.groupby(['device_id','hour'])['num_events'].mean().reset_index().rename(columns={'num_events':'nevents_mean'}),
  109. on=['device_id', 'hour'], how='outer', left_index=True, right_index=True).fillna(0))
  110. # add sum num_events/hour
  111. stats_df = (stats_df.merge(aux3.groupby(['device_id','hour'])['num_events'].sum().reset_index().rename(columns={'num_events':'nevents_sum'}),
  112. on=['device_id', 'hour'], how='outer', left_index=True, right_index=True).fillna(0))
  113. # add std num_events/hour
  114. stats_df = (stats_df.merge(aux3.groupby(['device_id','hour'])['num_events'].std().reset_index().rename(columns={'num_events':'nevents_std'}),
  115. on=['device_id', 'hour'], how='outer', left_index=True, right_index=True).fillna(0))
  116. stats_df.index = stats_df['device_id']
  117. stats_df.drop('device_id', axis=1, inplace=True)
  118. stats_df_hour = (stats_df.merge(trn[['row_id_trn']], how='left', left_index=True, right_index=True)
  119. .merge(tst[['row_id_tst']], how='left', left_index=True, right_index=True).reset_index())
  120. logging.debug('stats on num of events/hour for each device:\n{}'.format(stats_df_hour.head()))
  121. logging.debug('columns of stats_df_hour: {}'.format(stats_df_hour.columns))
  122. stats_df_hour_trn = stats_df_hour.dropna(subset=['row_id_trn'])
  123. stats_df_hour_tst = stats_df_hour.dropna(subset=['row_id_tst'])
  124. X_hour_min_nevents_trn = sparse.csr_matrix((np.log2(1 + stats_df_hour_trn.nevents_min), (stats_df_hour_trn.row_id_trn, stats_df_hour_trn.hour)), shape=(trn.shape[0], 24))
  125. X_hour_max_nevents_trn = sparse.csr_matrix((np.log2(1 + stats_df_hour_trn.nevents_max), (stats_df_hour_trn.row_id_trn, stats_df_hour_trn.hour)), shape=(trn.shape[0], 24))
  126. X_hour_mean_nevents_trn = sparse.csr_matrix((np.log2(1 + stats_df_hour_trn.nevents_mean), (stats_df_hour_trn.row_id_trn, stats_df_hour_trn.hour)), shape=(trn.shape[0], 24))
  127. X_hour_sum_nevents_trn = sparse.csr_matrix((np.log2(1 + stats_df_hour_trn.nevents_sum), (stats_df_hour_trn.row_id_trn, stats_df_hour_trn.hour)), shape=(trn.shape[0], 24))
  128. X_hour_std_nevents_trn = sparse.csr_matrix((np.log2(1 + stats_df_hour_trn.nevents_std), (stats_df_hour_trn.row_id_trn, stats_df_hour_trn.hour)), shape=(trn.shape[0], 24))
  129. X_hour_min_nevents_tst = sparse.csr_matrix((np.log2(1 + stats_df_hour_tst.nevents_min), (stats_df_hour_tst.row_id_tst, stats_df_hour_tst.hour)), shape=(tst.shape[0], 24))
  130. X_hour_max_nevents_tst = sparse.csr_matrix((np.log2(1 + stats_df_hour_tst.nevents_max), (stats_df_hour_tst.row_id_tst, stats_df_hour_tst.hour)), shape=(tst.shape[0], 24))
  131. X_hour_mean_nevents_tst = sparse.csr_matrix((np.log2(1 + stats_df_hour_tst.nevents_mean), (stats_df_hour_tst.row_id_tst, stats_df_hour_tst.hour)), shape=(tst.shape[0], 24))
  132. X_hour_sum_nevents_tst = sparse.csr_matrix((np.log2(1 + stats_df_hour_tst.nevents_sum), (stats_df_hour_tst.row_id_tst, stats_df_hour_tst.hour)), shape=(tst.shape[0], 24))
  133. X_hour_std_nevents_tst = sparse.csr_matrix((np.log2(1 + stats_df_hour_tst.nevents_std), (stats_df_hour_tst.row_id_tst, stats_df_hour_tst.hour)), shape=(tst.shape[0], 24))
  134. X_nevents_stats_hour_trn = sparse.hstack((X_hour_min_nevents_trn, X_hour_max_nevents_trn, X_hour_mean_nevents_trn, X_hour_sum_nevents_trn, X_hour_std_nevents_trn), format='csr')
  135. X_nevents_stats_hour_tst = sparse.hstack((X_hour_min_nevents_tst, X_hour_max_nevents_tst, X_hour_mean_nevents_tst, X_hour_sum_nevents_tst, X_hour_std_nevents_tst), format='csr')
  136. logging.debug('hourly stats on events data: train: {}, test: {}'.format(X_nevents_stats_hour_trn.shape, X_nevents_stats_hour_tst.shape))
  137. logging.info('combining all features - phone brand, device model, app_event, app_label')
  138. X_trn = sparse.hstack((X_brand_trn, X_model_trn, X_app_event_trn, X_app_label_trn, X_nevents_stats_hour_trn), format='csr')
  139. X_tst = sparse.hstack((X_brand_tst, X_model_tst, X_app_event_tst, X_app_label_tst, X_nevents_stats_hour_tst), format='csr')
  140. logging.debug('all features: train: {}, test: {}'.format(X_trn.shape, X_tst.shape))
  141. logging.info('saving as libSVM format')
  142. dump_svmlight_file(X_trn, trn.group, train_feature_file, zero_based=False)
  143. dump_svmlight_file(X_tst, tst.group, test_feature_file, zero_based=False)
  144. if __name__ == '__main__':
  145. logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
  146. level=logging.DEBUG)
  147. parser = argparse.ArgumentParser()
  148. parser.add_argument('--train-file', required=True, dest='train_file')
  149. parser.add_argument('--test-file', required=True, dest='test_file')
  150. parser.add_argument('--app-event-file', required=True, dest='app_event_file')
  151. parser.add_argument('--app-label-file', required=True, dest='app_label_file')
  152. parser.add_argument('--event-file', required=True, dest='event_file')
  153. parser.add_argument('--label-file', required=True, dest='label_file')
  154. parser.add_argument('--phone-file', required=True, dest='phone_file')
  155. parser.add_argument('--train-feature-file', required=True, dest='train_feature_file')
  156. parser.add_argument('--test-feature-file', required=True, dest='test_feature_file')
  157. args = parser.parse_args()
  158. start = time.time()
  159. generate_feature(args.train_file,
  160. args.test_file,
  161. args.app_event_file,
  162. args.app_label_file,
  163. args.event_file,
  164. args.label_file,
  165. args.phone_file,
  166. args.train_feature_file,
  167. args.test_feature_file)
  168. logging.info('finished ({:.2f} sec elasped)'.format(time.time() - start))