PageRenderTime 800ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/src/generate_baseline03.py

https://gitlab.com/tianzhou2011/talkingdata
Python | 397 lines | 249 code | 85 blank | 63 comment | 24 complexity | 5a63f7256ebbb4c305191b053fe8921d MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Sun Aug 28 15:26:57 2016
  4. @author: Luca
  5. """
  6. import numpy as np
  7. import keras
  8. import pandas as pd
  9. import os
  10. import sys
  11. import gc
  12. from random import shuffle
  13. from scipy import sparse
  14. from sklearn.preprocessing import LabelEncoder
  15. from sklearn.cross_validation import train_test_split
  16. from sklearn.metrics import log_loss
  17. from keras.layers.advanced_activations import PReLU
  18. from keras.models import Sequential
  19. from keras.layers import Dense, Dropout, Activation
  20. from keras.wrappers.scikit_learn import KerasClassifier
  21. from keras.utils import np_utils
  22. from keras.optimizers import SGD
  23. from keras.callbacks import EarlyStopping
  24. #------------------------------------------------- Parameters ----------------------------------------
  25. resamples = 30
  26. MIN_DF = 2
  27. MAX_DF = 0.25
  28. submission = 'baseline03'
  29. #------------------------------------------------- Write functions ----------------------------------------
  30. def rstr(df): return df.dtypes, df.head(3) ,df.apply(lambda x: [x.unique()]), df.apply(lambda x: [len(x.unique())]),df.shape
  31. def batch_generator(X, y, batch_size, shuffle):
  32. #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
  33. number_of_batches = np.ceil(X.shape[0]/batch_size)
  34. counter = 0
  35. sample_index = np.arange(X.shape[0])
  36. if shuffle:
  37. np.random.shuffle(sample_index)
  38. while True:
  39. batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
  40. X_batch = X[batch_index,:].toarray()
  41. y_batch = y[batch_index]
  42. counter += 1
  43. yield X_batch, y_batch
  44. if (counter == number_of_batches):
  45. if shuffle:
  46. np.random.shuffle(sample_index)
  47. counter = 0
  48. def batch_generatorp(X, batch_size, shuffle):
  49. number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
  50. counter = 0
  51. sample_index = np.arange(X.shape[0])
  52. while True:
  53. batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
  54. X_batch = X[batch_index, :].toarray()
  55. counter += 1
  56. yield X_batch
  57. if (counter == number_of_batches):
  58. counter = 0
  59. def baseline_model():
  60. # create model
  61. model = Sequential()
  62. model.add(Dense(150, input_dim=X_train.shape[1], init='normal'))
  63. model.add(PReLU())
  64. model.add(Dropout(0.4))
  65. model.add(Dense(50, input_dim=X_train.shape[1], init='normal'))
  66. model.add(PReLU())
  67. model.add(Dropout(0.2))
  68. model.add(Dense(12, init='normal', activation='softmax'))
  69. # Compile model
  70. model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta', metrics=['accuracy']) #logloss
  71. return model
  72. #------------------------------------------------ Read data from source files ------------------------------------
  73. if __name__ == '__main__':
  74. seed = 101
  75. np.random.seed(seed)
  76. datadir = 'input'
  77. print("### ----- PART 1 ----- ###")
  78. print("# Read app events")
  79. # By event_id it is possible to figure out the installed applications (and if they are active)
  80. types = {'event_id':np.uint32, 'app_id':np.str, 'is_installed':np.uint8,
  81. 'is_active':np.uint8}
  82. app_events = pd.read_csv(os.path.join(datadir,'app_events.csv'), dtype=types)
  83. app_events.head(5)
  84. app_events.info()
  85. # remove duplicates(app_id)
  86. app_events= app_events.groupby("event_id")["app_id"].apply(
  87. lambda x: " ".join(set("app_id:" + str(s) for s in x)))
  88. app_events.head(5)
  89. print("# Read Events")
  90. # By event_id we can associate a device, a time of the day, latitude and longitude
  91. events = pd.read_csv(os.path.join(datadir,'events.csv'), dtype={'device_id': np.str})
  92. events.head(5)
  93. events["app_id"] = events["event_id"].map(app_events)
  94. events = events.dropna()
  95. del app_events
  96. gc.collect()
  97. events = events[["device_id", "app_id"]]
  98. events.info()
  99. # remove duplicates(app_id) all events related to a device are grouped together
  100. events.loc[:,"device_id"].value_counts(ascending=True)
  101. events = events.groupby("device_id")["app_id"].apply(
  102. lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
  103. events = events.reset_index(name="app_id")
  104. from sklearn.feature_extraction.text import TfidfVectorizer
  105. tf_idf = TfidfVectorizer(tokenizer= lambda x: x.split(' '), max_df=MAX_DF, min_df=MIN_DF)
  106. tf_idf.fit(events.app_id)
  107. refuse_list = {element:True for element in tf_idf.stop_words_}
  108. # Remove too frequent and too little frequent apps
  109. events['app_id'] = events.app_id.apply(lambda x: ' '.join([j for j in x.split(' ') if j not in refuse_list]))
  110. # expand to multiple rows
  111. events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
  112. for _, row in events.iterrows()]).reset_index()
  113. events.columns = ['app_id', 'device_id']
  114. events.head(5)
  115. f3 = events[["device_id", "app_id"]] # app_id
  116. print("#Part1 formed")
  117. ##################
  118. # App labels
  119. ##################
  120. print("### ----- PART 2 ----- ###")
  121. print("# Read App labels")
  122. app_labels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
  123. label_cat = pd.read_csv(os.path.join(datadir,'label_categories.csv'))
  124. app_labels.info()
  125. label_cat.info()
  126. label_cat=label_cat[['label_id','category']]
  127. app_labels=app_labels.merge(label_cat,on='label_id',how='left')
  128. app_labels.head(3)
  129. events.head(3)
  130. #app_labels = app_labels.loc[app_labels.smaller_cat != "unknown_unknown"]
  131. #app_labels = app_labels.groupby("app_id")["category"].apply(
  132. # lambda x: ";".join(set("app_cat:" + str(s) for s in x)))
  133. app_labels = app_labels.groupby(["app_id","category"]).agg('size').reset_index()
  134. app_labels = app_labels[['app_id','category']]
  135. print("# App labels done")
  136. # Remove "app_id:" from column
  137. print("## Handling events data for merging with app lables")
  138. events['app_id'] = events['app_id'].map(lambda x : x.lstrip('app_id:'))
  139. events['app_id'] = events['app_id'].astype(str)
  140. app_labels['app_id'] = app_labels['app_id'].astype(str)
  141. app_labels.info()
  142. print("## Merge")
  143. events= pd.merge(events, app_labels, on = 'app_id',how='left').astype(str)
  144. #events['smaller_cat'].unique()
  145. # expand to multiple rows
  146. print("#Expand to multiple rows")
  147. #events= pd.concat([pd.Series(row['device_id'], row['category'].split(';'))
  148. # for _, row in events.iterrows()]).reset_index()
  149. #events.columns = ['app_cat', 'device_id']
  150. #events.head(5)
  151. #print(events.info())
  152. events= events.groupby(["device_id","category"]).agg('size').reset_index()
  153. events= events[['device_id','category']]
  154. events.head(10)
  155. print("# App labels done")
  156. f5 = events[["device_id", "category"]] # app_id
  157. # Can % total share be included as well?
  158. print("# App category part formed")
  159. ##################
  160. # Phone Brand
  161. ##################
  162. print("### ----- PART 3 ----- ###")
  163. print("# Read Phone Brand")
  164. pbd = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'),
  165. dtype={'device_id': np.str})
  166. pbd.drop_duplicates('device_id', keep='first', inplace=True)
  167. ##################
  168. # Train and Test
  169. ##################
  170. print("# Generate Train and Test")
  171. train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
  172. dtype={'device_id': np.str})
  173. train.drop(["age", "gender"], axis=1, inplace=True)
  174. test = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
  175. dtype={'device_id': np.str})
  176. test["group"] = np.nan
  177. train['leak'] = train.index / float(len(train))
  178. test['leak'] = test.index / float(len(test))
  179. split_len = len(train)
  180. # Group Labels
  181. Y = train["group"]
  182. lable_group = LabelEncoder()
  183. Y = lable_group.fit_transform(Y)
  184. device_id = test["device_id"]
  185. # Exploiting leak
  186. # Concat
  187. Df = pd.concat((train, test), axis=0, ignore_index=True)
  188. print("### ----- PART 4 ----- ###")
  189. Df = pd.merge(Df, pbd, how="left", on="device_id")
  190. Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
  191. Df["device_model"] = Df["device_model"].apply(
  192. lambda x: "device_model:" + str(x))
  193. ###################
  194. # Concat Feature
  195. ###################
  196. print("# Concat all features")
  197. f1 = Df[["device_id", "phone_brand"]] # phone_brand
  198. f2 = Df[["device_id", "device_model"]] # device_model
  199. events = None
  200. Df = None
  201. f1.columns.values[1] = "feature"
  202. f2.columns.values[1] = "feature"
  203. f5.columns.values[1] = "feature"
  204. f3.columns.values[1] = "feature"
  205. FLS = pd.concat((f1, f2, f3, f5), axis=0, ignore_index=True)
  206. FLS.info()
  207. ###################
  208. # User-Item Feature
  209. ###################
  210. print("# User-Item-Feature")
  211. device_ids = FLS["device_id"].unique()
  212. feature_cs = FLS["feature"].unique()
  213. data = np.ones(len(FLS))
  214. len(data)
  215. dec = LabelEncoder().fit(FLS["device_id"])
  216. row = dec.transform(FLS["device_id"])
  217. col = LabelEncoder().fit_transform(FLS["feature"])
  218. sparse_matrix = sparse.csr_matrix(
  219. (data, (row, col)), shape=(len(device_ids), len(feature_cs)))
  220. sparse_matrix.shape
  221. sys.getsizeof(sparse_matrix)
  222. sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0] # Remove the zero columns
  223. print("# Sparse matrix done")
  224. del FLS
  225. del data
  226. f1 = [1]
  227. f5 = [1]
  228. f2 = [1]
  229. f3 = [1]
  230. events = [1]
  231. ##################
  232. # Data
  233. ##################
  234. print("# Split data")
  235. train_row = dec.transform(train["device_id"])
  236. train_sp = sparse_matrix[train_row, :]
  237. test_row = dec.transform(test["device_id"])
  238. test_sp = sparse_matrix[test_row, :]
  239. with open('cv_id.txt','rb') as R:
  240. cv_prg = map(lambda x: int(x.strip()), R.readlines())
  241. cv_seq = [list() for i in range(5)]
  242. for n,v in enumerate(cv_prg):
  243. cv_seq[v-1].append(n)
  244. validation_scores = list()
  245. cv_predictions = np.zeros((train_sp.shape[0],12))
  246. rounds = list()
  247. for iterations in range(resamples):
  248. for cv in range(5):
  249. print 'CV fold %i' % (cv+1)
  250. insample = [item for k,sublist in enumerate(cv_seq) if k != cv for item in sublist]
  251. subsample = int(len(insample) * 0.80)
  252. print 'subsampling %i examples' % subsample
  253. outsample = cv_seq[cv]
  254. shuffle(insample)
  255. X_train, X_val, y_train, y_val = train_sp[insample[:subsample],:], train_sp[outsample,:], Y[insample[:subsample]], Y[outsample]
  256. ##################
  257. # Build Model
  258. ##################
  259. print("# Num of Features: ", X_train.shape[1])
  260. model=baseline_model()
  261. early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=2)
  262. fit= model.fit_generator(generator=batch_generator(X_train, y_train, 800, True),
  263. nb_epoch=50,
  264. samples_per_epoch=X_train.shape[0],
  265. validation_data=(X_val.todense(), y_val), verbose=2,
  266. callbacks=[early_stopping]
  267. )
  268. rounds.append(max(fit.epoch))
  269. # evaluate the model
  270. cv_predictions[outsample] = cv_predictions[outsample]+model.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
  271. ll_score = log_loss(y_val, cv_predictions[outsample])
  272. validation_scores.append(ll_score)
  273. print('logloss val {}'.format(ll_score))
  274. print('Expected logloss val {}'.format(np.mean(validation_scores)))
  275. print("# Averaged prediction")
  276. insample = range(train_sp.shape[0])
  277. subsample = int(len(insample) * 0.80)
  278. shuffle(insample)
  279. early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=2)
  280. fit= model.fit_generator(generator=batch_generator(train_sp[insample[:subsample],:], Y[insample[:subsample]], 800, True),
  281. nb_epoch=int(np.mean(rounds)),
  282. samples_per_epoch=X_train.shape[0],
  283. validation_data=(train_sp[insample[subsample:],:].todense(), Y[insample[subsample:]]),
  284. verbose=2,
  285. callbacks=[early_stopping]
  286. )
  287. if iterations==0:
  288. scores = model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
  289. else:
  290. scores += model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
  291. # Averaging of all results
  292. cv_predictions = cv_predictions / float(resamples)
  293. result = pd.DataFrame(scores / (float(resamples) * (float(cv)+1.0)) , columns=lable_group.classes_)
  294. result["device_id"] = device_id
  295. print(result.head(1))
  296. result = result.set_index("device_id")
  297. ##################
  298. # Saving Results
  299. ##################
  300. from sklearn.datasets import dump_svmlight_file
  301. # Metric
  302. with open('metric/'+submission+'.val','wb') as W:
  303. W.write(submission+'\t'+str(np.mean(validation_scores))+'\n')
  304. # Test predictions
  305. result.to_csv('tst/'+submission+'.csv', index=True, index_label='device_id')
  306. # Validation predictions
  307. np.savetxt('val/'+submission+'.val.yht', cv_predictions, delimiter=',')
  308. # Datasets
  309. dump_svmlight_file(X=train_sp, y=Y, f='feature/'+submission+'.trn.sps')
  310. dump_svmlight_file(X=test_sp, y=np.zeros(test_sp.shape[0]), f='feature/'+submission+'.tst.sps')
  311. print("Done")