PageRenderTime 38ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/code/preprocessing.ipynb

https://gitlab.com/aakansh9/SPHERE-Challenge
Jupyter | 188 lines | 188 code | 0 blank | 0 comment | 0 complexity | a0a8f9df18794247acdcb7fe8ee76246 MD5 | raw file
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 5,
  6. "metadata": {
  7. "collapsed": true
  8. },
  9. "outputs": [],
  10. "source": [
  11. "import pandas as pd\n",
  12. "import numpy as np"
  13. ]
  14. },
  15. {
  16. "cell_type": "code",
  17. "execution_count": 6,
  18. "metadata": {
  19. "collapsed": true
  20. },
  21. "outputs": [],
  22. "source": [
  23. "accel00001 = pd.read_csv('/SPHERE-Challenge/data/raw_data/public_data/train/00001/acceleration.csv')\n",
  24. "target00001 = pd.read_csv('/SPHERE-Challenge/data/raw_data/public_data/train/00001/targets.csv')"
  25. ]
  26. },
  27. {
  28. "cell_type": "code",
  29. "execution_count": 7,
  30. "metadata": {
  31. "collapsed": false,
  32. "scrolled": true
  33. },
  34. "outputs": [],
  35. "source": [
  36. "def window_feat(t_start, t_end, df, columns):\n",
  37. " \n",
  38. " ''' Extract statistical features for a given window and sensor data frame df '''\n",
  39. " \n",
  40. " window_size = t_end - t_start\n",
  41. " window_data = df[(df['t'] >= t_start) & (df['t'] < t_end)]\n",
  42. " window_data = window_data[columns]\n",
  43. " \n",
  44. " feat = [] # empty features list\n",
  45. " fnames = [] # empty feature names list\n",
  46. "\n",
  47. " # quantiles\n",
  48. " feat = feat + window_data.quantile(q = [0, 0.25, 0.50, 0.75, 1]).values.flatten().tolist()\n",
  49. " fnames = ( fnames + \n",
  50. " [ (column + '_W' + str(window_size) + '_0Q') for column in window_data.columns] + \n",
  51. " [ (column + '_W' + str(window_size) + '_25Q') for column in window_data.columns] +\n",
  52. " [ (column + '_W' + str(window_size) + '_50Q') for column in window_data.columns] +\n",
  53. " [ (column + '_W' + str(window_size) + '_75Q') for column in window_data.columns] +\n",
  54. " [ (column + '_W' + str(window_size) + '_100Q') for column in window_data.columns] )\n",
  55. " \n",
  56. " # mean\n",
  57. " feat = feat + window_data.mean().tolist()\n",
  58. " fnames = fnames + [ (column + '_W' + str(window_size) + '_mean') for column in window_data.columns]\n",
  59. " \n",
  60. " # std\n",
  61. " feat = feat + window_data.std().tolist()\n",
  62. " fnames = fnames + [ (column + '_W' + str(window_size) + '_std') for column in window_data.columns]\n",
  63. " \n",
  64. " return feat, fnames"
  65. ]
  66. },
  67. {
  68. "cell_type": "code",
  69. "execution_count": 8,
  70. "metadata": {
  71. "collapsed": true
  72. },
  73. "outputs": [],
  74. "source": [
  75. "feats = []\n",
  76. "\n",
  77. "for idx in target00001.index: # [0,1823]\n",
  78. " \n",
  79. " s = target00001.loc[idx,'start']\n",
  80. " e = target00001.loc[idx,'end']\n",
  81. " \n",
  82. " # extract features\n",
  83. " feat_W1, fnames_W1 = window_feat(t_start = e - 1, t_end = e, df = accel00001, columns = ['x', 'y', 'z'])\n",
  84. " feat_W2, fnames_W2 = window_feat(t_start = e - 2, t_end = e, df = accel00001, columns = ['x', 'y', 'z'])\n",
  85. " feat_W3, fnames_W3 = window_feat(t_start = e - 3, t_end = e, df = accel00001, columns = ['x', 'y', 'z'])\n",
  86. " feat_W4, fnames_W4 = window_feat(t_start = e - 4, t_end = e, df = accel00001, columns = ['x', 'y', 'z'])\n",
  87. " \n",
  88. " # consolidate features\n",
  89. " feat = feat_W1 + feat_W2 + feat_W3 + feat_W4\n",
  90. " fnames = fnames_W1 + fnames_W2 + fnames_W3 + fnames_W4\n",
  91. " \n",
  92. " feats.append(feat)\n",
  93. " \n",
  94. "feats = pd.DataFrame(feats, columns = fnames)"
  95. ]
  96. },
  97. {
  98. "cell_type": "code",
  99. "execution_count": 33,
  100. "metadata": {
  101. "collapsed": false
  102. },
  103. "outputs": [],
  104. "source": [
  105. "# create features for all train data\n",
  106. "targets = target00001.iloc[:,2:22]"
  107. ]
  108. },
  109. {
  110. "cell_type": "code",
  111. "execution_count": 48,
  112. "metadata": {
  113. "collapsed": false
  114. },
  115. "outputs": [],
  116. "source": [
  117. "feats = feats[np.isfinite(targets.sum(axis=1, skipna=False))]\n",
  118. "targets = targets[np.isfinite(targets.sum(axis=1, skipna=False))]"
  119. ]
  120. },
  121. {
  122. "cell_type": "code",
  123. "execution_count": 52,
  124. "metadata": {
  125. "collapsed": false
  126. },
  127. "outputs": [],
  128. "source": [
  129. "import xgboost"
  130. ]
  131. },
  132. {
  133. "cell_type": "code",
  134. "execution_count": 54,
  135. "metadata": {
  136. "collapsed": false
  137. },
  138. "outputs": [
  139. {
  140. "ename": "ValueError",
  141. "evalue": "DataFrame for label cannot have multiple columns",
  142. "output_type": "error",
  143. "traceback": [
  144. "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
  145. "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
  146. "\u001b[1;32m<ipython-input-54-4c763af0299c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mxgboost\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDMatrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfeats\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlabel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtargets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
  147. "\u001b[1;32m/opt/conda/lib/python3.5/site-packages/xgboost-0.4-py3.5.egg/xgboost/core.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, label, missing, weight, silent, feature_names, feature_types)\u001b[0m\n\u001b[0;32m 220\u001b[0m \u001b[0mfeature_names\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m feature_types)\n\u001b[1;32m--> 222\u001b[1;33m \u001b[0mlabel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_maybe_pandas_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 223\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSTRING_TYPES\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  148. "\u001b[1;32m/opt/conda/lib/python3.5/site-packages/xgboost-0.4-py3.5.egg/xgboost/core.py\u001b[0m in \u001b[0;36m_maybe_pandas_label\u001b[1;34m(label)\u001b[0m\n\u001b[0;32m 164\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 165\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 166\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'DataFrame for label cannot have multiple columns'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 167\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 168\u001b[0m \u001b[0mlabel_dtypes\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlabel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtypes\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  149. "\u001b[1;31mValueError\u001b[0m: DataFrame for label cannot have multiple columns"
  150. ]
  151. }
  152. ],
  153. "source": [
  154. "xgboost.DMatrix(data = feats, label = targets)"
  155. ]
  156. },
  157. {
  158. "cell_type": "code",
  159. "execution_count": null,
  160. "metadata": {
  161. "collapsed": true
  162. },
  163. "outputs": [],
  164. "source": []
  165. }
  166. ],
  167. "metadata": {
  168. "kernelspec": {
  169. "display_name": "Python [Root]",
  170. "language": "python",
  171. "name": "Python [Root]"
  172. },
  173. "language_info": {
  174. "codemirror_mode": {
  175. "name": "ipython",
  176. "version": 3
  177. },
  178. "file_extension": ".py",
  179. "mimetype": "text/x-python",
  180. "name": "python",
  181. "nbconvert_exporter": "python",
  182. "pygments_lexer": "ipython3",
  183. "version": "3.5.2"
  184. }
  185. },
  186. "nbformat": 4,
  187. "nbformat_minor": 0
  188. }