/src/ensemble2.ipynb
Jupyter | 256 lines | 256 code | 0 blank | 0 comment | 0 complexity | e1475208fe675bd059d75a52f94a13d0 MD5 | raw file
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 74,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n",
- " warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')\n"
- ]
- }
- ],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from __future__ import division\n",
- "from scipy.optimize import minimize \n",
- "from sklearn.metrics import mean_absolute_error as MAE\n",
- "from sklearn import base \n",
- "from sklearn.utils import check_random_state \n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "\n",
- "from kaggler.data_io import load_data\n",
- "from const import SEED"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Submission template, indexed by its first column.\n",
- "sub = pd.read_csv('../data/sample_submission.csv', index_col=0)\n",
- "\n",
- "# load_data returns a pair; the second value of the test file is not used.\n",
- "X, y = load_data('../build/feature/esb.esb3.trn.csv')\n",
- "X_tst, _ = load_data('../build/feature/esb.esb3.tst.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "class Minimizer(base.BaseEstimator):\n",
- "    \"\"\"Linear blender whose weights are found with scipy.optimize.minimize.\n",
- "\n",
- "    fit() searches for a coefficient vector x minimizing\n",
- "    MAE(exp(y), exp(X.dot(x))): X and y are treated as log-scale values and\n",
- "    the error is measured after exponentiating both sides.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    algo : str\n",
- "        Forwarded to scipy.optimize.minimize as `method`.\n",
- "    tol : float\n",
- "        Forwarded to scipy.optimize.minimize as `tol`.\n",
- "    random_state : anything accepted by sklearn.utils.check_random_state\n",
- "        Seeds the random starting point drawn in fit().\n",
- "\n",
- "    Attributes\n",
- "    ----------\n",
- "    coef_ : array\n",
- "        Solution reported by the optimizer (res.x); set by fit().\n",
- "    \"\"\"\n",
- "\n",
- "    def __init__(self, algo='Nelder-Mead', tol=1e-6, random_state=None):\n",
- "        # Keep constructor arguments untouched (scikit-learn convention) so\n",
- "        # get_params()/clone() see the original value and refitting with an\n",
- "        # integer seed is repeatable; the generator is built in fit().\n",
- "        self.algo = algo\n",
- "        self.tol = tol\n",
- "        self.random_state = random_state\n",
- "\n",
- "    def fit(self, X, y):\n",
- "        \"\"\"Optimize the coefficients, store them in coef_ and return self.\"\"\"\n",
- "        X = np.asarray(X)\n",
- "        # exp(y) does not depend on the coefficients: compute it once rather\n",
- "        # than on every objective evaluation.\n",
- "        y_exp = np.exp(np.asarray(y))\n",
- "        rng = check_random_state(self.random_state)\n",
- "\n",
- "        def objective(coef):\n",
- "            return MAE(y_exp, np.exp(X.dot(coef)))\n",
- "\n",
- "        res = minimize(objective,\n",
- "                       x0=rng.rand(X.shape[1]),  # one start weight per column\n",
- "                       method=self.algo,\n",
- "                       tol=self.tol)\n",
- "        self.coef_ = res.x\n",
- "\n",
- "        return self\n",
- "\n",
- "    def predict(self, X):\n",
- "        \"\"\"Return X.dot(coef_), on the same scale as the y passed to fit().\"\"\"\n",
- "        return np.asarray(X).dot(self.coef_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Shift every array by the same constant before taking the natural log.\n",
- "offset = 200\n",
- "ylog, Xlog, Xlog_tst = (np.log(values + offset) for values in (y, X, X_tst))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 85,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "MAE = 1118.2360\n",
- "SEED = 64\n",
- "coefficients: [ 2.18523446 -1.81484195 0.5794238 0.22792973 -0.14610499 -0.02820413]\n",
- "Best seed = 64\n"
- ]
- }
- ],
- "source": [
- "best_seed = 64\n",
- "best_loss = np.inf\n",
- "# Try 100 consecutive seeds starting from the initial best_seed. The range is\n",
- "# built once, so updating best_seed inside the loop does not change it.\n",
- "for seed in range(best_seed, best_seed + 100):\n",
- "    m = Minimizer(random_state=seed)\n",
- "    m.fit(Xlog, ylog)\n",
- "    plog = m.predict(Xlog)\n",
- "    p = np.exp(plog) - offset  # undo the log(. + offset) transform\n",
- "\n",
- "    loss = MAE(y, p)\n",
- "    if loss < best_loss:\n",
- "        best_loss = loss\n",
- "        best_seed = seed\n",
- "\n",
- "        # Report only seeds that improve on the best loss so far; reuse the\n",
- "        # loss already computed instead of calling MAE a second time.\n",
- "        print('MAE = {:.4f}\\nSEED = {}'.format(loss, seed))\n",
- "        print('coefficients: {}'.format(m.coef_))\n",
- "\n",
- "# NOTE: m, plog and p now belong to the last seed tried, not to best_seed.\n",
- "print('Best seed = {}'.format(best_seed))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "MAE = 1118.2360\n",
- "SEED = 64\n",
- "coefficients: [ 2.18523446 -1.81484195 0.5794238 0.22792973 -0.14610499 -0.02820413]\n"
- ]
- }
- ],
- "source": [
- "# Refit with the selected seed; fit() returns the estimator itself.\n",
- "seed = best_seed\n",
- "m = Minimizer(random_state=seed).fit(Xlog, ylog)\n",
- "\n",
- "# Predict on the log scale, then map back to the original scale.\n",
- "plog = m.predict(Xlog)\n",
- "p = np.exp(plog) - offset\n",
- "\n",
- "print('MAE = {:.4f}\\nSEED = {}'.format(MAE(y, p), seed))\n",
- "print('coefficients: {}'.format(m.coef_))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 87,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(2740.6679577421937, 2750.0757202846044)"
- ]
- },
- "execution_count": 87,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Score the test features and map the predictions back from the log scale.\n",
- "plog_tst = m.predict(Xlog_tst)\n",
- "p_tst = np.exp(plog_tst) - offset\n",
- "\n",
- "# Display the mean test prediction next to the mean training prediction.\n",
- "(p_tst.mean(), p.mean())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 88,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Use item assignment: attribute-style assignment (sub.loss = ...) only\n",
- "# updates a column that already exists and otherwise sets a plain attribute,\n",
- "# which would leave the written CSV without the predictions.\n",
- "sub['loss'] = p_tst\n",
- "sub.to_csv('../build/sub/minl_{}_esb.esb3.sub.csv'.format(seed))\n",
- "\n",
- "# Save the training-set and test-set predictions as plain text.\n",
- "np.savetxt('../build/val/minl_{}_esb.esb3.val.yht'.format(seed), p, fmt='%.6f')\n",
- "np.savetxt('../build/tst/minl_{}_esb.esb3.tst.yht'.format(seed), p_tst, fmt='%.6f')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "64"
- ]
- },
- "execution_count": 89,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "seed"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }