
/src/CalmeToi.py

https://gitlab.com/tianzhou2011/talkingdata
# Bag of apps categories
# Bag of labels categories
# Include phone brand and device model
print("Initialize libraries")
import os
import sys
import gc
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from collections import Counter
from scipy import sparse
from sklearn import ensemble
from sklearn import metrics as skmetrics
from sklearn.cluster import DBSCAN
from sklearn.cross_validation import (StratifiedKFold, KFold,
                                      train_test_split, cross_val_score)
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from keras.layers import Dense, Dropout, Activation
from keras.layers.advanced_activations import PReLU
from keras.models import Sequential
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
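# NOTE: the imports above assume the stacks this script was written
# against: scikit-learn < 0.18 (sklearn.cross_validation) and Keras 1.x
# (nb_epoch / samples_per_epoch / init= further down). On current
# versions the same pieces live in sklearn.model_selection and use the
# Keras 2 argument names.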
# https://www.kaggle.com/poiss0nriot/talkingdata-mobile-user-demographics/bag-of-apps-keras-11-08-16-no-val
# ------------------------------------------------- Write functions ----------------------------------------
def rstr(df):
    """Quick structure summary: dtypes, head, uniques, unique counts, shape."""
    return (df.dtypes, df.head(3), df.apply(lambda x: [x.unique()]),
            df.apply(lambda x: [len(x.unique())]), df.shape)

def batch_generator(X, y, batch_size, shuffle):
    # chenglong's code for fitting from a generator
    # (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = int(np.ceil(X.shape[0] / batch_size))
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if counter == number_of_batches:
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    # prediction-time variant: yields X batches only, in order
    number_of_batches = int(np.ceil(X.shape[0] / batch_size))
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter == number_of_batches:
            counter = 0
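# Why generators: the device-feature matrix built below is a large sparse
# CSR matrix, but Keras Dense layers consume dense arrays. Each batch
# densifies only one slice via .toarray(), so memory stays bounded by the
# batch size. Illustrative arithmetic with the numbers used later in this
# script: 69,984 samples per epoch at batch_size=500 means
# ceil(69984 / 500) = 140 generator steps per epoch.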
# ------------------------------------------------ Read data from source files ------------------------------------
seed = 7
np.random.seed(seed)
datadir = '../input'
print("### ----- PART 1 ----- ###")
# Data - Events data
# Bag of apps
print("# Read app events")
app_events = pd.read_csv(os.path.join(datadir, 'app_events.csv'), dtype={'device_id': str})
app_events.head(5)
app_events.info()
# print(rstr(app_events))
# remove duplicates (app_id)
app_events = app_events.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))
app_events.head(5)
print("# Read Events")
events = pd.read_csv(os.path.join(datadir, 'events.csv'), dtype={'device_id': str})
events.head(5)
events["app_id"] = events["event_id"].map(app_events)
events = events.dropna()
del app_events
events = events[["device_id", "app_id"]]
events.info()
# 1 GB reduced to 34 MB
# remove duplicates (app_id)
events.loc[:, "device_id"].value_counts(ascending=True)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(" ".join(str(s) for s in x).split(" "))))
events = events.reset_index(name="app_id")
# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']
events.head(5)
f3 = events[["device_id", "app_id"]]  # app_id
print("#Part1 formed")
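# f3 is a long-format table with one row per (device_id, app_id) pair;
# below it is stacked with the brand, model and category tables and
# one-hot encoded into a single sparse device-by-feature matrix.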
##################
#   App labels
##################
print("### ----- PART 2 ----- ###")
print("# Read App labels")
app_labels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'))
label_cat = pd.read_csv(os.path.join(datadir, 'label_categories.csv'))
app_labels.info()
label_cat.info()
label_cat = label_cat[['label_id', 'category']]
app_labels = app_labels.merge(label_cat, on='label_id', how='left')
app_labels.head(3)
events.head(3)
# app_labels = app_labels.loc[app_labels.smaller_cat != "unknown_unknown"]
# app_labels = app_labels.groupby("app_id")["category"].apply(
#     lambda x: ";".join(set("app_cat:" + str(s) for s in x)))
app_labels = app_labels.groupby(["app_id", "category"]).agg('size').reset_index()
app_labels = app_labels[['app_id', 'category']]
print("# App labels done")
# Remove the "app_id:" prefix from the column
# (str.lstrip takes a character *set*, so a prefix-safe slice is used instead)
print("## Handling events data for merging with app labels")
events['app_id'] = events['app_id'].map(
    lambda x: x[len('app_id:'):] if x.startswith('app_id:') else x)
events['app_id'] = events['app_id'].astype(str)
app_labels['app_id'] = app_labels['app_id'].astype(str)
app_labels.info()
print("## Merge")
events = pd.merge(events, app_labels, on='app_id', how='left').astype(str)
# events['smaller_cat'].unique()
# expand to multiple rows
print("#Expand to multiple rows")
# events = pd.concat([pd.Series(row['device_id'], row['category'].split(';'))
#                     for _, row in events.iterrows()]).reset_index()
# events.columns = ['app_cat', 'device_id']
# events.head(5)
# print(events.info())
events = events.groupby(["device_id", "category"]).agg('size').reset_index()
events = events[['device_id', 'category']]
events.head(10)
print("# App labels done")
f5 = events[["device_id", "category"]]  # category
# Could the % share of the total also be included as a feature?
print("# App category part formed")
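# f5 mirrors f3 one level up: one row per (device_id, app-category) pair,
# so each device is described both by the apps it has used and by the
# label categories those apps belong to.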
##################
#   Phone Brand
##################
print("### ----- PART 3 ----- ###")
print("# Read Phone Brand")
pbd = pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'),
                  dtype={'device_id': str})
pbd.drop_duplicates('device_id', keep='first', inplace=True)
##################
#  Train and Test
##################
print("# Generate Train and Test")
train = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'),
                    dtype={'device_id': str})
train.drop(["age", "gender"], axis=1, inplace=True)
test = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'),
                   dtype={'device_id': str})
test["group"] = np.nan
split_len = len(train)
# Group labels
Y = train["group"]
label_group = LabelEncoder()
Y = label_group.fit_transform(Y)
device_id = test["device_id"]
# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)
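# Y now holds the target: the 12 TalkingData gender-age groups,
# label-encoded to integers 0..11, which is what the 12-unit softmax
# output layer and sparse_categorical_crossentropy below expect.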
print("### ----- PART 4 ----- ###")
Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))
###################
#  Concat Feature
###################
print("# Concat all features")
f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
events = None
Df = None
f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f5.columns.values[1] = "feature"
f3.columns.values[1] = "feature"
FLS = pd.concat((f1, f2, f3, f5), axis=0, ignore_index=True)
FLS.info()
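# FLS stacks everything into one long (device_id, feature) table. The
# string prefixes added above (e.g. "phone_brand:<brand>") keep the four
# feature namespaces from colliding now that they share a single column.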
###################
# User-Item Feature
###################
print("# User-Item-Feature")
device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()
data = np.ones(len(FLS))
len(data)
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))
sparse_matrix.shape
sys.getsizeof(sparse_matrix)
sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]
print("# Sparse matrix done")
del FLS
del data
f1 = [1]
f5 = [1]
f2 = [1]
f3 = [1]
events = [1]
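# Notes on the construction above: csr_matrix sums duplicate (row, col)
# entries, but the set()/groupby deduplication earlier means each
# (device, feature) pair occurs once, so the matrix is effectively a
# binary bag-of-features. The getnnz(0) > 0 slice drops all-zero columns,
# and rebinding f1..f5/events to tiny lists is the script's way of
# letting the big intermediate frames be garbage-collected.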
##################
#      Data
##################
print("# Split data")
train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]
test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]
X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=0.999, random_state=10)
##################
#  Feature Sel
##################
print("# Feature Selection")
# selector = SelectPercentile(f_classif, percentile=53)
# selector.fit(X_train, y_train)
# X_train.shape
# X_train = selector.transform(X_train)
# X_train.shape
# X_val = selector.transform(X_val)
# X_val.shape
# Selection using chi-square
# selector = SelectKBest(chi2, k=11155).fit(X_train, y_train)
# X_train.shape
# X_train = selector.transform(X_train)
# X_train.shape
# X_val = selector.transform(X_val)
# X_val.shape
print("# Num of Features: ", X_train.shape[1])
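# Both feature-selection variants are left disabled, so the model trains
# on all remaining columns. If re-enabled, note that chi2 (unlike
# f_classif) requires non-negative inputs, which this binary sparse
# matrix satisfies.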
##################
#  Build Model
##################
# act = keras.layers.advanced_activations.PReLU(init='zero', weights=None)
def baseline_model():
    # create model: a 150 -> 50 -> 12 MLP with PReLU activations and dropout
    model = Sequential()
    model.add(Dense(150, input_dim=X_train.shape[1], init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.4))
    model.add(Dense(50, init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(12, init='normal', activation='softmax'))
    # Compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta',
                  metrics=['accuracy'])  # optimizes the competition's logloss
    return model

model = baseline_model()
fit = model.fit_generator(generator=batch_generator(X_train, y_train, 500, True),
                          nb_epoch=16,
                          samples_per_epoch=69984,
                          validation_data=(X_val.todense(), y_val),
                          verbose=2)
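# samples_per_epoch=69984 is hard-coded and slightly below the number of
# rows left in X_train after the 0.999 split, so each "epoch" sees
# roughly, not exactly, the full training set.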
# evaluate the model
scores_val = model.predict_generator(generator=batch_generatorp(X_val, 500, False),
                                     val_samples=X_val.shape[0])
print('logloss val {}'.format(log_loss(y_val, scores_val)))
print("# Final prediction")
scores = model.predict_generator(generator=batch_generatorp(test_sp, 800, False),
                                 val_samples=test_sp.shape[0])
result = pd.DataFrame(scores, columns=label_group.classes_)
result["device_id"] = device_id
print(result.head(1))
result = result.set_index("device_id")
# result.to_csv('./sub_bagofapps7_keras_10_50_pt2_10epoch.csv', index=True, index_label='device_id')
# Dropout 0.2
# Validation 2.3017
result.to_csv('sub_bagofapps7_keras_150_pt4_50_pt2_15epoch_prelu_softmax.csv',
              index=True, index_label='device_id')
print("Done")