
/src/generate_baseline03.py

https://gitlab.com/tianzhou2011/talkingdata

# -*- coding: utf-8 -*-
"""
Created on Sun Aug 28 15:26:57 2016
@author: Luca
"""
import numpy as np
import keras
import pandas as pd
import os
import sys
import gc
from random import shuffle
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from keras.layers.advanced_activations import PReLU
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping

#------------------------------------------------- Parameters ----------------------------------------
resamples = 30
MIN_DF = 2
MAX_DF = 0.25
submission = 'baseline03'

#------------------------------------------------- Write functions ----------------------------------------
def rstr(df): return df.dtypes, df.head(3), df.apply(lambda x: [x.unique()]), df.apply(lambda x: [len(x.unique())]), df.shape
def batch_generator(X, y, batch_size, shuffle):
    # chenglong's code for fitting from a generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
def batch_generatorp(X, batch_size, shuffle):
    # prediction generator: same idea as above, but yields unlabelled batches
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0
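
# A minimal, self-contained sanity check for the generators above (not part of the
# original pipeline and never called by it). The toy matrix, labels and batch size
# are made up for illustration only; the same idea applies to batch_generatorp.
def _demo_batch_generator():
    toy_X = sparse.csr_matrix(np.arange(12, dtype=np.float64).reshape(6, 2))
    toy_y = np.array([0, 1, 0, 1, 0, 1])
    gen = batch_generator(toy_X, toy_y, batch_size=4, shuffle=False)
    X_batch, y_batch = next(gen)
    # the first batch holds the first 4 rows, densified for Keras
    assert X_batch.shape == (4, 2) and len(y_batch) == 4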
def baseline_model():
    # create model (note: input_dim relies on the global X_train defined in the __main__ block below)
    model = Sequential()
    model.add(Dense(150, input_dim=X_train.shape[1], init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.4))
    model.add(Dense(50, input_dim=X_train.shape[1], init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(12, init='normal', activation='softmax'))
    # Compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  # logloss
    return model
#------------------------------------------------ Read data from source files ------------------------------------
if __name__ == '__main__':
    seed = 101
    np.random.seed(seed)
    datadir = 'input'
    print("### ----- PART 1 ----- ###")
    print("# Read app events")
    # By event_id it is possible to figure out the installed applications (and if they are active)
    types = {'event_id': np.uint32, 'app_id': np.str, 'is_installed': np.uint8,
             'is_active': np.uint8}
    app_events = pd.read_csv(os.path.join(datadir, 'app_events.csv'), dtype=types)
    app_events.head(5)
    app_events.info()
    # remove duplicates (app_id)
    app_events = app_events.groupby("event_id")["app_id"].apply(
        lambda x: " ".join(set("app_id:" + str(s) for s in x)))
    app_events.head(5)
    print("# Read Events")
    # By event_id we can associate a device, a time of the day, latitude and longitude
    events = pd.read_csv(os.path.join(datadir, 'events.csv'), dtype={'device_id': np.str})
    events.head(5)
    events["app_id"] = events["event_id"].map(app_events)
    events = events.dropna()
    del app_events
    gc.collect()
    events = events[["device_id", "app_id"]]
    events.info()
    # remove duplicates (app_id): all events related to a device are grouped together
    events.loc[:, "device_id"].value_counts(ascending=True)
    events = events.groupby("device_id")["app_id"].apply(
        lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
    events = events.reset_index(name="app_id")
    from sklearn.feature_extraction.text import TfidfVectorizer
    tf_idf = TfidfVectorizer(tokenizer=lambda x: x.split(' '), max_df=MAX_DF, min_df=MIN_DF)
    tf_idf.fit(events.app_id)
    refuse_list = {element: True for element in tf_idf.stop_words_}
    # Remove too frequent and too infrequent apps
    events['app_id'] = events.app_id.apply(lambda x: ' '.join([j for j in x.split(' ') if j not in refuse_list]))
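    # A minimal sketch (not called by the pipeline) of what the min_df / max_df pruning
    # above does: TfidfVectorizer collects tokens whose document frequency falls outside
    # those bounds in its stop_words_ attribute, and those are exactly the apps dropped.
    # The toy documents and thresholds below are illustrative only.
    def _demo_tfidf_pruning():
        toy_docs = ['a b c', 'a b', 'a d']          # 'a' appears in every document
        toy_tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(' '), max_df=0.7, min_df=2)
        toy_tfidf.fit(toy_docs)
        # 'a' exceeds max_df (3/3 > 0.7); 'c' and 'd' fall below min_df (seen only once)
        assert toy_tfidf.stop_words_ == {'a', 'c', 'd'}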
    # expand to multiple rows: one (device_id, app_id) pair per row
    events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                        for _, row in events.iterrows()]).reset_index()
    events.columns = ['app_id', 'device_id']
    events.head(5)
    f3 = events[["device_id", "app_id"]]  # app_id
    print("#Part1 formed")
    ##################
    # App labels
    ##################
    print("### ----- PART 2 ----- ###")
    print("# Read App labels")
    app_labels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'))
    label_cat = pd.read_csv(os.path.join(datadir, 'label_categories.csv'))
    app_labels.info()
    label_cat.info()
    label_cat = label_cat[['label_id', 'category']]
    app_labels = app_labels.merge(label_cat, on='label_id', how='left')
    app_labels.head(3)
    events.head(3)
    #app_labels = app_labels.loc[app_labels.smaller_cat != "unknown_unknown"]
    #app_labels = app_labels.groupby("app_id")["category"].apply(
    #    lambda x: ";".join(set("app_cat:" + str(s) for s in x)))
    app_labels = app_labels.groupby(["app_id", "category"]).agg('size').reset_index()
    app_labels = app_labels[['app_id', 'category']]
    print("# App labels done")
    # Remove "app_id:" from column
    print("## Handling events data for merging with app labels")
    events['app_id'] = events['app_id'].map(lambda x: x.lstrip('app_id:'))
    events['app_id'] = events['app_id'].astype(str)
    app_labels['app_id'] = app_labels['app_id'].astype(str)
    app_labels.info()
    print("## Merge")
    events = pd.merge(events, app_labels, on='app_id', how='left').astype(str)
    #events['smaller_cat'].unique()
    # expand to multiple rows
    print("#Expand to multiple rows")
    #events = pd.concat([pd.Series(row['device_id'], row['category'].split(';'))
    #    for _, row in events.iterrows()]).reset_index()
    #events.columns = ['app_cat', 'device_id']
    #events.head(5)
    #print(events.info())
    events = events.groupby(["device_id", "category"]).agg('size').reset_index()
    events = events[['device_id', 'category']]
    events.head(10)
    print("# App labels done")
    f5 = events[["device_id", "category"]]  # category
    # Can % total share be included as well?
    print("# App category part formed")
    ##################
    # Phone Brand
    ##################
    print("### ----- PART 3 ----- ###")
    print("# Read Phone Brand")
    pbd = pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'),
                      dtype={'device_id': np.str})
    pbd.drop_duplicates('device_id', keep='first', inplace=True)
    ##################
    # Train and Test
    ##################
    print("# Generate Train and Test")
    train = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'),
                        dtype={'device_id': np.str})
    train.drop(["age", "gender"], axis=1, inplace=True)
    test = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'),
                       dtype={'device_id': np.str})
    test["group"] = np.nan
    train['leak'] = train.index / float(len(train))
    test['leak'] = test.index / float(len(test))
    split_len = len(train)
    # Group Labels
    Y = train["group"]
    lable_group = LabelEncoder()
    Y = lable_group.fit_transform(Y)
    device_id = test["device_id"]
    # Exploiting leak
    # Concat
    Df = pd.concat((train, test), axis=0, ignore_index=True)
    print("### ----- PART 4 ----- ###")
    Df = pd.merge(Df, pbd, how="left", on="device_id")
    Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
    Df["device_model"] = Df["device_model"].apply(
        lambda x: "device_model:" + str(x))
    ###################
    # Concat Feature
    ###################
    print("# Concat all features")
    f1 = Df[["device_id", "phone_brand"]]   # phone_brand
    f2 = Df[["device_id", "device_model"]]  # device_model
    events = None
    Df = None
    f1.columns.values[1] = "feature"
    f2.columns.values[1] = "feature"
    f5.columns.values[1] = "feature"
    f3.columns.values[1] = "feature"
    FLS = pd.concat((f1, f2, f3, f5), axis=0, ignore_index=True)
    FLS.info()
    ###################
    # User-Item Feature
    ###################
    print("# User-Item-Feature")
    device_ids = FLS["device_id"].unique()
    feature_cs = FLS["feature"].unique()
    data = np.ones(len(FLS))
    len(data)
    dec = LabelEncoder().fit(FLS["device_id"])
    row = dec.transform(FLS["device_id"])
    col = LabelEncoder().fit_transform(FLS["feature"])
    sparse_matrix = sparse.csr_matrix(
        (data, (row, col)), shape=(len(device_ids), len(feature_cs)))
    sparse_matrix.shape
    sys.getsizeof(sparse_matrix)
    sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]  # Remove the zero columns
    print("# Sparse matrix done")
    del FLS
    del data
    f1 = [1]
    f5 = [1]
    f2 = [1]
    f3 = [1]
    events = [1]
    ##################
    # Data
    ##################
    print("# Split data")
    train_row = dec.transform(train["device_id"])
    train_sp = sparse_matrix[train_row, :]
    test_row = dec.transform(test["device_id"])
    test_sp = sparse_matrix[test_row, :]
    # cv_id.txt holds one fold id (1 to 5) per training row, aligned with gender_age_train
    with open('cv_id.txt', 'rb') as R:
        cv_prg = map(lambda x: int(x.strip()), R.readlines())
    cv_seq = [list() for i in range(5)]
    for n, v in enumerate(cv_prg):
        cv_seq[v-1].append(n)
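    # A hedged sketch (not called by the pipeline) of how a file like cv_id.txt could be
    # produced with the same old sklearn API imported in this script. Whether the author
    # generated the file this way is an assumption; only the format (one 1-based fold id
    # per line) matches what is parsed above. The output path is hypothetical.
    def _sketch_write_cv_id(labels, path='cv_id_example.txt', n_folds=5):
        from sklearn.cross_validation import StratifiedKFold
        fold_id = np.zeros(len(labels), dtype=int)
        for k, (_, val_idx) in enumerate(StratifiedKFold(labels, n_folds=n_folds,
                                                         shuffle=True, random_state=101)):
            fold_id[val_idx] = k + 1              # fold ids are 1-based, as expected above
        with open(path, 'w') as W_out:
            W_out.write('\n'.join(str(v) for v in fold_id) + '\n')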
    validation_scores = list()
    cv_predictions = np.zeros((train_sp.shape[0], 12))
    rounds = list()
    for iterations in range(resamples):
        for cv in range(5):
            print('CV fold %i' % (cv+1))
            insample = [item for k, sublist in enumerate(cv_seq) if k != cv for item in sublist]
            subsample = int(len(insample) * 0.80)
            print('subsampling %i examples' % subsample)
            outsample = cv_seq[cv]
            shuffle(insample)
            X_train, X_val, y_train, y_val = (train_sp[insample[:subsample], :], train_sp[outsample, :],
                                              Y[insample[:subsample]], Y[outsample])
            ##################
            # Build Model
            ##################
            print("# Num of Features: ", X_train.shape[1])
            model = baseline_model()
            early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=2)
            fit = model.fit_generator(generator=batch_generator(X_train, y_train, 800, True),
                                      nb_epoch=50,
                                      samples_per_epoch=X_train.shape[0],
                                      validation_data=(X_val.todense(), y_val), verbose=2,
                                      callbacks=[early_stopping]
                                      )
            rounds.append(max(fit.epoch))
            # evaluate the model
            cv_predictions[outsample] = cv_predictions[outsample] + model.predict_generator(generator=batch_generatorp(X_val, 400, False), val_samples=X_val.shape[0])
            ll_score = log_loss(y_val, cv_predictions[outsample])
            validation_scores.append(ll_score)
            print('logloss val {}'.format(ll_score))
            print('Expected logloss val {}'.format(np.mean(validation_scores)))
            print("# Averaged prediction")
            insample = range(train_sp.shape[0])
            subsample = int(len(insample) * 0.80)
            shuffle(insample)
            early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=2)
            fit = model.fit_generator(generator=batch_generator(train_sp[insample[:subsample], :], Y[insample[:subsample]], 800, True),
                                      nb_epoch=int(np.mean(rounds)),
                                      samples_per_epoch=X_train.shape[0],
                                      validation_data=(train_sp[insample[subsample:], :].todense(), Y[insample[subsample:]]),
                                      verbose=2,
                                      callbacks=[early_stopping]
                                      )
            if iterations == 0:
                scores = model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
            else:
                scores += model.predict_generator(generator=batch_generatorp(test_sp, 800, False), val_samples=test_sp.shape[0])
    # Averaging of all results
    cv_predictions = cv_predictions / float(resamples)
    result = pd.DataFrame(scores / (float(resamples) * (float(cv)+1.0)), columns=lable_group.classes_)
    result["device_id"] = device_id
    print(result.head(1))
    result = result.set_index("device_id")
    ##################
    # Saving Results
    ##################
    from sklearn.datasets import dump_svmlight_file
    # Metric
    with open('metric/'+submission+'.val', 'wb') as W:
        W.write(submission+'\t'+str(np.mean(validation_scores))+'\n')
    # Test predictions
    result.to_csv('tst/'+submission+'.csv', index=True, index_label='device_id')
    # Validation predictions
    np.savetxt('val/'+submission+'.val.yht', cv_predictions, delimiter=',')
    # Datasets
    dump_svmlight_file(X=train_sp, y=Y, f='feature/'+submission+'.trn.sps')
    dump_svmlight_file(X=test_sp, y=np.zeros(test_sp.shape[0]), f='feature/'+submission+'.tst.sps')
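    # A minimal sketch (not called by the pipeline) of reading the dumped datasets back;
    # load_svmlight_file is the standard counterpart of dump_svmlight_file in sklearn.datasets.
    def _demo_reload_features():
        from sklearn.datasets import load_svmlight_file
        X_trn, y_trn = load_svmlight_file('feature/'+submission+'.trn.sps')
        return X_trn.shape, y_trn.shape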
  311. print("Done")