PageRenderTime 25ms CodeModel.GetById 6ms RepoModel.GetById 0ms app.codeStats 0ms

/src/ensemble2.ipynb

https://gitlab.com/jeongyoonlee/allstate-claims-severity
Jupyter | 256 lines | 256 code | 0 blank | 0 comment | 0 complexity | e1475208fe675bd059d75a52f94a13d0 MD5 | raw file
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 74,
  6. "metadata": {
  7. "collapsed": false
  8. },
  9. "outputs": [
  10. {
  11. "name": "stderr",
  12. "output_type": "stream",
  13. "text": [
  14. "/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n",
  15. " warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')\n"
  16. ]
  17. }
  18. ],
  19. "source": [
  20. "%matplotlib inline"
  21. ]
  22. },
  23. {
  24. "cell_type": "code",
  25. "execution_count": 4,
  26. "metadata": {
  27. "collapsed": false
  28. },
  29. "outputs": [],
  30. "source": [
  31. "from __future__ import division\n",
  32. "from scipy.optimize import minimize \n",
  33. "from sklearn.metrics import mean_absolute_error as MAE\n",
  34. "from sklearn import base \n",
  35. "from sklearn.utils import check_random_state \n",
  36. "import pandas as pd\n",
  37. "import numpy as np\n",
  38. "\n",
  39. "from kaggler.data_io import load_data\n",
  40. "from const import SEED"
  41. ]
  42. },
  43. {
  44. "cell_type": "code",
  45. "execution_count": 75,
  46. "metadata": {
  47. "collapsed": true
  48. },
  49. "outputs": [],
  50. "source": [
  51. "X, y = load_data('../build/feature/esb.esb3.trn.csv')\n",
  52. "X_tst, _ = load_data('../build/feature/esb.esb3.tst.csv')\n",
  53. "sub = pd.read_csv('../data/sample_submission.csv', index_col=0)"
  54. ]
  55. },
  56. {
  57. "cell_type": "code",
  58. "execution_count": 76,
  59. "metadata": {
  60. "collapsed": false
  61. },
  62. "outputs": [],
  63. "source": [
  64. "class Minimizer(base.BaseEstimator): \n",
  65. " \n",
  66. " def __init__(self, algo='Nelder-Mead', tol=1e-6, random_state=None):\n",
  67. " self.algo = algo \n",
  68. " self.tol = tol \n",
  69. " self.random_state = check_random_state(random_state) \n",
  70. " \n",
  71. " def fit(self, X, y): \n",
  72. " X = np.asarray(X) \n",
  73. " res = minimize(lambda x: MAE(np.exp(y), np.exp(X.dot(x))), \n",
  74. " x0=self.random_state.rand(X.shape[1]), \n",
  75. " method=self.algo, \n",
  76. " tol=self.tol) \n",
  77. " self.coef_ = res.x \n",
  78. " \n",
  79. " return self \n",
  80. " \n",
  81. " def predict(self, X): \n",
  82. " X = np.array(X) \n",
  83. " return X.dot(self.coef_) "
  84. ]
  85. },
  86. {
  87. "cell_type": "code",
  88. "execution_count": 77,
  89. "metadata": {
  90. "collapsed": true
  91. },
  92. "outputs": [],
  93. "source": [
  94. "offset = 200\n",
  95. "ylog = np.log(y + offset)\n",
  96. "Xlog = np.log(X + offset)\n",
  97. "Xlog_tst = np.log(X_tst + offset)"
  98. ]
  99. },
  100. {
  101. "cell_type": "code",
  102. "execution_count": 85,
  103. "metadata": {
  104. "collapsed": false
  105. },
  106. "outputs": [
  107. {
  108. "name": "stdout",
  109. "output_type": "stream",
  110. "text": [
  111. "MAE = 1118.2360\n",
  112. "SEED = 64\n",
  113. "coefficients: [ 2.18523446 -1.81484195 0.5794238 0.22792973 -0.14610499 -0.02820413]\n",
  114. "Best seed = 64\n"
  115. ]
  116. }
  117. ],
  118. "source": [
  119. "best_seed = 64\n",
  120. "best_loss = np.inf\n",
  121. "for seed in [best_seed + x for x in range(100)]:\n",
  122. " m = Minimizer(random_state=seed)\n",
  123. " m.fit(Xlog, ylog)\n",
  124. " plog = m.predict(Xlog)\n",
  125. " p = np.exp(plog) - offset\n",
  126. " \n",
  127. " loss = MAE(y, p)\n",
  128. " if loss < best_loss:\n",
  129. " best_loss = loss\n",
  130. " best_seed = seed\n",
  131. " \n",
  132. " print('MAE = {:.4f}\\nSEED = {}'.format(MAE(y, p), seed))\n",
  133. " print('coefficients: {}'.format(m.coef_))\n",
  134. " \n",
  135. "print('Best seed = {}'.format(best_seed))"
  136. ]
  137. },
  138. {
  139. "cell_type": "code",
  140. "execution_count": 86,
  141. "metadata": {
  142. "collapsed": false
  143. },
  144. "outputs": [
  145. {
  146. "name": "stdout",
  147. "output_type": "stream",
  148. "text": [
  149. "MAE = 1118.2360\n",
  150. "SEED = 64\n",
  151. "coefficients: [ 2.18523446 -1.81484195 0.5794238 0.22792973 -0.14610499 -0.02820413]\n"
  152. ]
  153. }
  154. ],
  155. "source": [
  156. "seed = best_seed\n",
  157. "m = Minimizer(random_state=seed)\n",
  158. "m.fit(Xlog, ylog)\n",
  159. "plog = m.predict(Xlog)\n",
  160. "p = np.exp(plog) - offset\n",
  161. "print('MAE = {:.4f}\\nSEED = {}'.format(MAE(y, p), seed))\n",
  162. "print('coefficients: {}'.format(m.coef_))"
  163. ]
  164. },
  165. {
  166. "cell_type": "code",
  167. "execution_count": 87,
  168. "metadata": {
  169. "collapsed": false
  170. },
  171. "outputs": [
  172. {
  173. "data": {
  174. "text/plain": [
  175. "(2740.6679577421937, 2750.0757202846044)"
  176. ]
  177. },
  178. "execution_count": 87,
  179. "metadata": {},
  180. "output_type": "execute_result"
  181. }
  182. ],
  183. "source": [
  184. "plog_tst = m.predict(Xlog_tst)\n",
  185. "p_tst = np.exp(plog_tst) - offset\n",
  186. "p_tst.mean(), p.mean()"
  187. ]
  188. },
  189. {
  190. "cell_type": "code",
  191. "execution_count": 88,
  192. "metadata": {
  193. "collapsed": true
  194. },
  195. "outputs": [],
  196. "source": [
  197. "sub.loss = p_tst\n",
  198. "sub.to_csv('../build/sub/minl_{}_esb.esb3.sub.csv'.format(seed))\n",
  199. "np.savetxt('../build/val/minl_{}_esb.esb3.val.yht'.format(seed), p, fmt='%.6f')\n",
  200. "np.savetxt('../build/tst/minl_{}_esb.esb3.tst.yht'.format(seed), p_tst, fmt='%.6f')"
  201. ]
  202. },
  203. {
  204. "cell_type": "code",
  205. "execution_count": 89,
  206. "metadata": {
  207. "collapsed": false
  208. },
  209. "outputs": [
  210. {
  211. "data": {
  212. "text/plain": [
  213. "64"
  214. ]
  215. },
  216. "execution_count": 89,
  217. "metadata": {},
  218. "output_type": "execute_result"
  219. }
  220. ],
  221. "source": [
  222. "seed"
  223. ]
  224. },
  225. {
  226. "cell_type": "code",
  227. "execution_count": null,
  228. "metadata": {
  229. "collapsed": true
  230. },
  231. "outputs": [],
  232. "source": []
  233. }
  234. ],
  235. "metadata": {
  236. "kernelspec": {
  237. "display_name": "Python 2",
  238. "language": "python",
  239. "name": "python2"
  240. },
  241. "language_info": {
  242. "codemirror_mode": {
  243. "name": "ipython",
  244. "version": 2
  245. },
  246. "file_extension": ".py",
  247. "mimetype": "text/x-python",
  248. "name": "python",
  249. "nbconvert_exporter": "python",
  250. "pygments_lexer": "ipython2",
  251. "version": "2.7.12"
  252. }
  253. },
  254. "nbformat": 4,
  255. "nbformat_minor": 0
  256. }