/src/generate_baseline03.py
- # -*- coding: utf-8 -*-
- """
- Created on Sun Aug 28 15:26:57 2016
- @author: Luca
- """
- import numpy as np
- import keras
- import pandas as pd
- import os
- import sys
- import gc
- from random import shuffle
- from scipy import sparse
- from sklearn.preprocessing import LabelEncoder
- from sklearn.cross_validation import train_test_split
- from sklearn.metrics import log_loss
- from keras.layers.advanced_activations import PReLU
- from keras.models import Sequential
- from keras.layers import Dense, Dropout, Activation
- from keras.wrappers.scikit_learn import KerasClassifier
- from keras.utils import np_utils
- from keras.optimizers import SGD
- from keras.callbacks import EarlyStopping
- #------------------------------------------------- Parameters ----------------------------------------
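- # resamples: number of times the whole CV + refit cycle is repeated (bagging)
- # MIN_DF / MAX_DF: document-frequency cutoffs used to drop too-rare / too-common app tokens
- # submission: file-name stem shared by every output artifact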
- resamples = 30
- MIN_DF = 2
- MAX_DF = 0.25
- submission = 'baseline03'
- #------------------------------------------------- Write functions ----------------------------------------
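- # rstr: quick exploratory summary of a DataFrame (dtypes, first rows, unique values
- # and counts per column, shape); kept for interactive inspection, not called below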
- def rstr(df): return df.dtypes, df.head(3), df.apply(lambda x: [x.unique()]), df.apply(lambda x: [len(x.unique())]), df.shape
- def batch_generator(X, y, batch_size, shuffle):
- # chenglong's code for fitting from a generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
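- # Keras cannot train on scipy sparse matrices directly, so each mini-batch is densified
- # with .toarray() just before being yielded; with shuffle=True the row order is reshuffled after every full pass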
- number_of_batches = np.ceil(X.shape[0] / float(batch_size))
- counter = 0
- sample_index = np.arange(X.shape[0])
- if shuffle:
- np.random.shuffle(sample_index)
- while True:
- batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
- X_batch = X[batch_index,:].toarray()
- y_batch = y[batch_index]
- counter += 1
- yield X_batch, y_batch
- if (counter == number_of_batches):
- if shuffle:
- np.random.shuffle(sample_index)
- counter = 0
- def batch_generatorp(X, batch_size, shuffle):
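- # Prediction-time counterpart of batch_generator: yields feature batches only (no labels)
- # in the original row order; the shuffle argument is accepted but never used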
- number_of_batches = np.ceil(X.shape[0] / float(batch_size))
- counter = 0
- sample_index = np.arange(X.shape[0])
- while True:
- batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
- X_batch = X[batch_index, :].toarray()
- counter += 1
- yield X_batch
- if (counter == number_of_batches):
- counter = 0
- def baseline_model():
- # create model
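- # Two hidden layers (150 and 50 units) with PReLU activations and dropout,
- # followed by a 12-way softmax over the gender-age groups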
- model = Sequential()
- model.add(Dense(150, input_dim=X_train.shape[1], init='normal'))
- model.add(PReLU())
- model.add(Dropout(0.4))
- model.add(Dense(50, init='normal'))
- model.add(PReLU())
- model.add(Dropout(0.2))
- model.add(Dense(12, init='normal', activation='softmax'))
- # Compile model
- model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta', metrics=['accuracy']) #logloss
- return model
- #------------------------------------------------ Read data from source files ------------------------------------
- if __name__ == '__main__':
- seed = 101
- np.random.seed(seed)
- datadir = 'input'
-
- print("### ----- PART 1 ----- ###")
-
- print("# Read app events")
- # By event_id it is possible to figure out the installed applications (and if they are active)
- types = {'event_id':np.uint32, 'app_id':np.str, 'is_installed':np.uint8,
- 'is_active':np.uint8}
- app_events = pd.read_csv(os.path.join(datadir,'app_events.csv'), dtype=types)
- app_events.head(5)
- app_events.info()
-
- # remove duplicates(app_id)
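- # Each event_id is collapsed into one space-separated string of unique 'app_id:<id>' tokens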
- app_events= app_events.groupby("event_id")["app_id"].apply(
- lambda x: " ".join(set("app_id:" + str(s) for s in x)))
- app_events.head(5)
-
- print("# Read Events")
- # By event_id we can associate a device, a time of the day, latitude and longitude
- events = pd.read_csv(os.path.join(datadir,'events.csv'), dtype={'device_id': np.str})
- events.head(5)
- events["app_id"] = events["event_id"].map(app_events)
- events = events.dropna()
- del app_events
- gc.collect()
-
- events = events[["device_id", "app_id"]]
- events.info()
-
- # Group all of a device's events together, keeping each app_id only once per device
- events.loc[:,"device_id"].value_counts(ascending=True)
-
- events = events.groupby("device_id")["app_id"].apply(
- lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
- events = events.reset_index(name="app_id")
-
- from sklearn.feature_extraction.text import TfidfVectorizer
- tf_idf = TfidfVectorizer(tokenizer= lambda x: x.split(' '), max_df=MAX_DF, min_df=MIN_DF)
- tf_idf.fit(events.app_id)
- refuse_list = {element:True for element in tf_idf.stop_words_}
- # Remove apps that are either too frequent or too rare
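- # Only the vectorizer's stop_words_ set is used: the tokens ignored because their
- # document frequency is above MAX_DF or below MIN_DF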
-
- events['app_id'] = events.app_id.apply(lambda x: ' '.join([j for j in x.split(' ') if j not in refuse_list]))
- # expand to multiple rows
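- # (each device's space-separated app list is exploded into one (app_id, device_id) row per token)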
- events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
- for _, row in events.iterrows()]).reset_index()
- events.columns = ['app_id', 'device_id']
- events.head(5)
- f3 = events[["device_id", "app_id"]] # app_id
-
- print("#Part1 formed")
-
- ##################
- # App labels
- ##################
-
- print("### ----- PART 2 ----- ###")
-
- print("# Read App labels")
- app_labels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
- label_cat = pd.read_csv(os.path.join(datadir,'label_categories.csv'))
- app_labels.info()
- label_cat.info()
- label_cat=label_cat[['label_id','category']]
-
- app_labels=app_labels.merge(label_cat,on='label_id',how='left')
- app_labels.head(3)
- events.head(3)
- #app_labels = app_labels.loc[app_labels.smaller_cat != "unknown_unknown"]
-
- #app_labels = app_labels.groupby("app_id")["category"].apply(
- # lambda x: ";".join(set("app_cat:" + str(s) for s in x)))
- app_labels = app_labels.groupby(["app_id","category"]).agg('size').reset_index()
- app_labels = app_labels[['app_id','category']]
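- # app_labels now holds one row per unique (app_id, category) pair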
- print("# App labels done")
-
- # Remove "app_id:" from column
- print("## Handling events data for merging with app lables")
- events['app_id'] = events['app_id'].map(lambda x: x[len('app_id:'):] if x.startswith('app_id:') else x)
- events['app_id'] = events['app_id'].astype(str)
- app_labels['app_id'] = app_labels['app_id'].astype(str)
- app_labels.info()
-
- print("## Merge")
-
- events= pd.merge(events, app_labels, on = 'app_id',how='left').astype(str)
-
- #events['smaller_cat'].unique()
-
- # expand to multiple rows
- print("#Expand to multiple rows")
- #events= pd.concat([pd.Series(row['device_id'], row['category'].split(';'))
- # for _, row in events.iterrows()]).reset_index()
- #events.columns = ['app_cat', 'device_id']
- #events.head(5)
- #print(events.info())
-
- events= events.groupby(["device_id","category"]).agg('size').reset_index()
- events= events[['device_id','category']]
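- # events now holds one row per unique (device_id, category) pair; the counts produced by agg('size') are discarded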
- events.head(10)
- print("# App labels done")
-
- f5 = events[["device_id", "category"]] # app_id
- # Can % total share be included as well?
- print("# App category part formed")
-
- ##################
- # Phone Brand
- ##################
- print("### ----- PART 3 ----- ###")
-
- print("# Read Phone Brand")
- pbd = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'),
- dtype={'device_id': np.str})
- pbd.drop_duplicates('device_id', keep='first', inplace=True)
-
- ##################
- # Train and Test
- ##################
- print("# Generate Train and Test")
-
- train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
- dtype={'device_id': np.str})
- train.drop(["age", "gender"], axis=1, inplace=True)
-
- test = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
- dtype={'device_id': np.str})
- test["group"] = np.nan
- train['leak'] = train.index / float(len(train))
- test['leak'] = test.index / float(len(test))
-
- split_len = len(train)
-
- # Group Labels
- Y = train["group"]
- label_group = LabelEncoder()
- Y = label_group.fit_transform(Y)
- device_id = test["device_id"]
-
- # Exploiting leak
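- # The 'leak' columns added above encode each device's normalized row position in the
- # original train/test files; row order was reported to carry some signal in this competition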
-
-
- # Concat
- Df = pd.concat((train, test), axis=0, ignore_index=True)
-
- print("### ----- PART 4 ----- ###")
-
- Df = pd.merge(Df, pbd, how="left", on="device_id")
- Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
- Df["device_model"] = Df["device_model"].apply(
- lambda x: "device_model:" + str(x))
-
-
- ###################
- # Concat Feature
- ###################
-
- print("# Concat all features")
-
- f1 = Df[["device_id", "phone_brand"]] # phone_brand
- f2 = Df[["device_id", "device_model"]] # device_model
-
- events = None
- Df = None
-
- f1.columns.values[1] = "feature"
- f2.columns.values[1] = "feature"
- f5.columns.values[1] = "feature"
- f3.columns.values[1] = "feature"
-
- FLS = pd.concat((f1, f2, f3, f5), axis=0, ignore_index=True)
-
- FLS.info()
-
- ###################
- # User-Item Feature
- ###################
- print("# User-Item-Feature")
-
- device_ids = FLS["device_id"].unique()
- feature_cs = FLS["feature"].unique()
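- # Build a binary device x feature matrix: LabelEncoder maps device_ids to rows and feature
- # strings to columns, and every (device, feature) occurrence contributes a 1 (duplicate pairs are summed by csr_matrix)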
-
- data = np.ones(len(FLS))
- len(data)
-
- dec = LabelEncoder().fit(FLS["device_id"])
- row = dec.transform(FLS["device_id"])
- col = LabelEncoder().fit_transform(FLS["feature"])
-
- sparse_matrix = sparse.csr_matrix(
- (data, (row, col)), shape=(len(device_ids), len(feature_cs)))
- sparse_matrix.shape
- sys.getsizeof(sparse_matrix)
-
- sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0] # Remove the zero columns
- print("# Sparse matrix done")
-
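- # Free memory: delete the large intermediates and overwrite the feature frames with tiny placeholders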
- del FLS
- del data
- f1 = [1]
- f5 = [1]
- f2 = [1]
- f3 = [1]
-
- events = [1]
-
- ##################
- # Data
- ##################
-
- print("# Split data")
- train_row = dec.transform(train["device_id"])
- train_sp = sparse_matrix[train_row, :]
-
- test_row = dec.transform(test["device_id"])
- test_sp = sparse_matrix[test_row, :]
-
- with open('cv_id.txt','rb') as R:
- cv_prg = map(lambda x: int(x.strip()), R.readlines())
-
- cv_seq = [list() for i in range(5)]
- for n,v in enumerate(cv_prg):
- cv_seq[v-1].append(n)
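- # cv_seq[k] now lists the training-row indices assigned to fold k+1 by cv_id.txt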
-
- validation_scores = list()
- cv_predictions = np.zeros((train_sp.shape[0],12))
-
- rounds = list()
-
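- # Bagged cross-validation: in each of the `resamples` repetitions every fold is held out in turn
- # while the network is trained on a random 80% subsample of the remaining folds;
- # out-of-fold predictions are accumulated in cv_predictions and averaged at the end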
- for iterations in range(resamples):
-
- for cv in range(5):
- print('CV fold %i' % (cv+1))
- insample = [item for k,sublist in enumerate(cv_seq) if k != cv for item in sublist]
- subsample = int(len(insample) * 0.80)
- print('subsampling %i examples' % subsample)
- outsample = cv_seq[cv]
- shuffle(insample)
- X_train, X_val, y_train, y_val = train_sp[insample[:subsample],:], train_sp[outsample,:], Y[insample[:subsample]], Y[outsample]
-
- ##################
- # Build Model
- ##################
-
- print("# Num of Features: ", X_train.shape[1])
-
- model=baseline_model()
- early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=2)
- fit= model.fit_generator(generator=batch_generator(X_train, y_train, 800, True),
- nb_epoch=50,
- samples_per_epoch=X_train.shape[0],
- validation_data=(X_val.todense(), y_val), verbose=2,
- callbacks=[early_stopping]
- )
- rounds.append(max(fit.epoch))
- # evaluate the model
- cv_predictions[outsample] = cv_predictions[outsample]+model.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
- ll_score = log_loss(y_val, cv_predictions[outsample])
- validation_scores.append(ll_score)
- print('logloss val {}'.format(ll_score))
-
- print('Expected logloss val {}'.format(np.mean(validation_scores)))
-
- print("# Averaged prediction")
- insample = range(train_sp.shape[0])
- subsample = int(len(insample) * 0.80)
- shuffle(insample)
- early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=2)
- fit= model.fit_generator(generator=batch_generator(train_sp[insample[:subsample],:], Y[insample[:subsample]], 800, True),
- nb_epoch=int(np.mean(rounds)),
- samples_per_epoch=X_train.shape[0],
- validation_data=(train_sp[insample[subsample:],:].todense(), Y[insample[subsample:]]),
- verbose=2,
- callbacks=[early_stopping]
- )
- if iterations==0:
- scores = model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
- else:
- scores += model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
-
- # Averaging of all results
- cv_predictions = cv_predictions / float(resamples)
- result = pd.DataFrame(scores / (float(resamples) * (float(cv)+1.0)), columns=label_group.classes_)
-
- result["device_id"] = device_id
- print(result.head(1))
- result = result.set_index("device_id")
-
- ##################
- # Saving Results
- ##################
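- # Outputs: mean validation logloss (metric/), test predictions (tst/), out-of-fold validation
- # predictions (val/), and the train/test feature matrices in svmlight format (feature/)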
- from sklearn.datasets import dump_svmlight_file
- # Metric
- with open('metric/'+submission+'.val','wb') as W:
- W.write(submission+'\t'+str(np.mean(validation_scores))+'\n')
- # Test predictions
- result.to_csv('tst/'+submission+'.csv', index=True, index_label='device_id')
- # Validation predictions
- np.savetxt('val/'+submission+'.val.yht', cv_predictions, delimiter=',')
- # Datasets
- dump_svmlight_file(X=train_sp, y=Y, f='feature/'+submission+'.trn.sps')
- dump_svmlight_file(X=test_sp, y=np.zeros(test_sp.shape[0]), f='feature/'+submission+'.tst.sps')
-
- print("Done")