/src/train_predict_keras3.py
Python | 157 lines | 125 code | 28 blank | 4 comment | 8 complexity | 7104ff39fc293e95068f9b5cad1bb906 MD5 | raw file
- #!/usr/bin/env python
- from __future__ import absolute_import, division, print_function
- from keras.callbacks import EarlyStopping
- from keras.models import Sequential
- from keras.layers.core import Dense, Dropout, Activation
- from keras.layers.normalization import BatchNormalization
- from keras.layers.advanced_activations import PReLU
- from keras.utils import np_utils
- from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
- from sklearn.cross_validation import StratifiedKFold
- from sklearn.metrics import log_loss
- import argparse
- import logging
- import numpy as np
- import os
- import pandas as pd
- import time
- from kaggler.data_io import load_data
- from const import N_CLASS, SEED
- np.random.seed(SEED)
def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield (dense X batch, y batch) pairs from a sparse matrix.

    Keras cannot fit directly on scipy sparse matrices, so each mini-batch
    is densified with .toarray() just before it is yielded.  After a full
    pass over the data the index order is reshuffled (when requested) and
    iteration restarts, so the generator never terminates.

    Adapted from Chenglong Chen's sparse-matrix generator:
    https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices
    """
    n_batches = np.ceil(X.shape[0] / batch_size)
    order = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(order)
    batch_no = 0
    while True:
        rows = order[batch_size * batch_no:batch_size * (batch_no + 1)]
        batch_no += 1
        yield X[rows, :].toarray(), y[rows]
        if batch_no == n_batches:
            # Completed one epoch: optionally reshuffle, then wrap around.
            if shuffle:
                np.random.shuffle(order)
            batch_no = 0
def batch_generatorp(X, batch_size, shuffle):
    """Endlessly yield dense mini-batches of rows from a sparse matrix X.

    Prediction-time companion of batch_generator: yields only feature
    batches (no labels) in the original row order.  The `shuffle`
    argument is unused; it is kept so the two generators share a
    call-site-compatible signature.

    Args:
        X: scipy sparse matrix of shape (n_samples, n_features).
        batch_size: rows per yielded batch (the last batch of a pass may
            be smaller).
        shuffle: ignored.

    Yields:
        Dense numpy arrays of consecutive rows, cycling over X forever.
    """
    # BUG FIX: this was `X.shape[0] / np.ceil(X.shape[0]/batch_size)`,
    # which is approximately the batch SIZE, not the batch COUNT.  The
    # `counter == number_of_batches` reset below then fired at the wrong
    # point (or never), so once sample_index was exhausted the generator
    # yielded empty batches.  Use the batch count, as batch_generator does.
    number_of_batches = np.ceil(X.shape[0] / batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter == number_of_batches:
            counter = 0
def baseline_model(dims, nb_classes=12):
    """Build and compile the 3-layer MLP used by this script.

    Architecture: Dense(150) -> PReLU -> Dropout(0.4) -> Dense(50) ->
    PReLU -> Dropout(0.2) -> softmax over `nb_classes` outputs.

    Args:
        dims: number of input features.
        nb_classes: number of output classes.  Defaults to 12, which was
            previously hard-coded; parameterized so the model can be
            reused for other class counts (e.g. driven by const.N_CLASS).

    Returns:
        A compiled Keras Sequential model using categorical cross-entropy
        (equivalent to multi-class log loss, the evaluation metric) with
        the Adadelta optimizer.
    """
    model = Sequential()
    model.add(Dense(150, input_dim=dims, init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.4))
    # Removed a redundant `input_dim=dims` here: Keras only honors
    # input_dim on the first layer of a Sequential model, so it was a no-op.
    model.add(Dense(50, init='normal'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(nb_classes, init='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adadelta',
                  metrics=['accuracy'])
    return model
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  cv_id_file, n_est=100, neurons=512, dropout=0.5, batch=16,
                  n_fold=5):
    """Train the Keras MLP with n-fold CV; save out-of-fold and test predictions.

    For each fold id i in 1..n_fold, a fresh model from baseline_model() is
    trained on all rows whose cv id != i and used to predict the held-out
    rows (accumulated into the out-of-fold matrix P_val) and the test set
    (accumulated into P_tst as a fold average).  Both probability matrices
    are written as CSV.

    Args:
        train_file: training data path for kaggler's load_data (sparse
            feature matrix + integer class labels).
        test_file: test data path in the same format.
        predict_valid_file: output CSV of out-of-fold class probabilities.
        predict_test_file: output CSV of fold-averaged test probabilities.
        cv_id_file: text file with one fold id (1..n_fold) per training row.
        n_est, neurons, dropout, batch: NOTE(review): only used to build the
            model/log name below -- they are NOT forwarded to the network or
            the fit call, which use hard-coded values (16 epochs, batch 500,
            samples_per_epoch 69984).  Confirm whether this is intentional.
        n_fold: number of CV folds encoded in cv_id_file.
    """
    # Strip the last 8 chars of the file name to get the feature-set name
    # (presumably a fixed-length extension such as '.trn.sps' -- TODO confirm).
    feature_name = os.path.basename(train_file)[:-8]
    model_name = 'keras3_{}_{}_{}_{}_{}'.format(
        n_est, neurons, dropout, batch, feature_name
    )

    # All progress is logged to a per-model log file.
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    Y = np_utils.to_categorical(y)  # one-hot labels for categorical_crossentropy
    X_tst, _ = load_data(test_file)

    nb_classes = Y.shape[1]
    dims = X.shape[1]
    logging.info('{} classes, {} dims'.format(nb_classes, dims))

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    # P_val: out-of-fold predictions for every training row.
    # P_tst: test predictions averaged over the n_fold models.
    P_val = np.zeros_like(Y)
    P_tst = np.zeros((X_tst.shape[0], nb_classes))
    for i in range(1, n_fold + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]

        logging.info('Training model #{}'.format(i))
        clf = baseline_model(dims)
        # Sparse X is fed through batch_generator (densified per batch);
        # samples_per_epoch is hard-coded rather than len(i_trn) -- see
        # the NOTE(review) in the docstring.
        clf.fit_generator(generator=batch_generator(X[i_trn], Y[i_trn], 500, True),
                          nb_epoch=16,
                          samples_per_epoch=69984,
                          validation_data=(X[i_val].todense(), Y[i_val]),
                          verbose=2)

        P_val[i_val] = clf.predict_generator(generator=batch_generatorp(X[i_val], 500, False),
                                             val_samples=X[i_val].shape[0])
        logging.info('CV #{} Log Loss: {:.6f}'.format(i, log_loss(Y[i_val], P_val[i_val])))

        P_tst += clf.predict_generator(generator=batch_generatorp(X_tst, 800, False),
                                       val_samples=X_tst.shape[0]) / n_fold

    logging.info('Saving normalized validation predictions...')
    logging.info('CV Log Loss: {:.6f}'.format(log_loss(Y, P_val)))
    np.savetxt(predict_valid_file, P_val, fmt='%.6f', delimiter=',')

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, P_tst, fmt='%.6f', delimiter=',')
if __name__ == '__main__':
    # Command-line interface: wire the CLI flags straight into train_predict.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--train-file', required=True, dest='train_file')
    arg_parser.add_argument('--test-file', required=True, dest='test_file')
    arg_parser.add_argument('--predict-valid-file', required=True,
                            dest='predict_valid_file')
    arg_parser.add_argument('--predict-test-file', required=True,
                            dest='predict_test_file')
    arg_parser.add_argument('--n-est', default=10, type=int, dest='n_est')
    arg_parser.add_argument('--batch-size', default=64, type=int,
                            dest='batch_size')
    arg_parser.add_argument('--neurons', default=512, type=int)
    arg_parser.add_argument('--dropout', default=0.5, type=float)
    arg_parser.add_argument('--cv-id', required=True, dest='cv_id_file')
    opts = arg_parser.parse_args()

    t0 = time.time()
    train_predict(train_file=opts.train_file,
                  test_file=opts.test_file,
                  predict_valid_file=opts.predict_valid_file,
                  predict_test_file=opts.predict_test_file,
                  cv_id_file=opts.cv_id_file,
                  n_est=opts.n_est,
                  neurons=opts.neurons,
                  dropout=opts.dropout,
                  batch=opts.batch_size)
    logging.info('finished ({:.2f} min elasped)'.format((time.time() - t0) /
                                                        60))