PageRenderTime 152ms CodeModel.GetById 35ms RepoModel.GetById 0ms app.codeStats 0ms

/notebooks/get-data-2016.ipynb

https://bitbucket.org/aabtzu/mlkaggle
Jupyter | 410 lines | 410 code | 0 blank | 0 comment | 0 complexity | 581b303f3d20c6ebd4f9b143aa5ea7f9 MD5 | raw file
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 109,
  6. "metadata": {
  7. "collapsed": true
  8. },
  9. "outputs": [],
  10. "source": [
  11. "import pandas\n",
  12. "import requests\n",
  13. "import bs4\n",
  14. "import os"
  15. ]
  16. },
  17. {
  18. "cell_type": "code",
  19. "execution_count": 127,
  20. "metadata": {
  21. "collapsed": false
  22. },
  23. "outputs": [],
  24. "source": [
  25. "url = \"http://www.fftoday.com/nfl/schedule.php\"\n",
  26. "soup = bs4.BeautifulSoup(requests.get(url).text)\n",
  27. "df = pandas.read_html(str(soup.find_all('table')[8]))[0]\n",
  28. "df.columns = ['date', 'time', 'Visitor', 'Home Team']"
  29. ]
  30. },
  31. {
  32. "cell_type": "code",
  33. "execution_count": 128,
  34. "metadata": {
  35. "collapsed": false
  36. },
  37. "outputs": [],
  38. "source": [
  39. "# get rid of the pesky characters\n",
  40. "bad_chars = [u'\\xc2' , u'\\xa0', u' \\xb9']\n",
  41. "\n",
  42. "for cc in df.columns:\n",
  43. " for c in bad_chars:\n",
  44. " df[cc] = df[cc].str.replace(c,'')"
  45. ]
  46. },
  47. {
  48. "cell_type": "code",
  49. "execution_count": 129,
  50. "metadata": {
  51. "collapsed": false
  52. },
  53. "outputs": [],
  54. "source": [
  55. "# fill in dates\n",
  56. "for ii,row in df.iterrows():\n",
  57. " if ii > 0:\n",
  58. " if row['date'] == '':\n",
  59. " row['date'] = prev['date']\n",
  60. " prev = row \n",
  61. " \n",
  62. "# fix the rows \n",
  63. "df = df.drop_duplicates().dropna()\n",
  64. "df = df[1:]"
  65. ]
  66. },
  67. {
  68. "cell_type": "code",
  69. "execution_count": 130,
  70. "metadata": {
  71. "collapsed": false
  72. },
  73. "outputs": [],
  74. "source": [
  75. "# add year to dates\n",
  76. "df.date = df.date + ' 2016'\n",
  77. "\n",
  78. "# fix the ones in Jan\n",
  79. "date_filter = df.date.str.contains('Jan')\n",
  80. "df.loc[date_filter, 'date'] = df[date_filter].date.str.replace('2016', '2017')\n",
  81. "\n",
  82. "# convert to datetime\n",
  83. "df.date = pandas.to_datetime(df.date, format='%a %b %d %Y')"
  84. ]
  85. },
  86. {
  87. "cell_type": "code",
  88. "execution_count": 131,
  89. "metadata": {
  90. "collapsed": false
  91. },
  92. "outputs": [],
  93. "source": [
  94. "# define the root directory for the nfl code in $MLNFL_ROOT\n",
  95. "\n",
  96. "rootDir = '/Users/amit/repos/mlnfl/nfl'\n",
  97. "codeDir = \"\".join([rootDir, os.path.sep])\n",
  98. "dataRoot = \"\".join([codeDir, \"data\", os.path.sep])\n",
  99. "\n",
  100. "df.to_csv(dataRoot + '/nfl_schedule_2016.csv', index=False)"
  101. ]
  102. },
  103. {
  104. "cell_type": "code",
  105. "execution_count": 138,
  106. "metadata": {
  107. "collapsed": false,
  108. "scrolled": false
  109. },
  110. "outputs": [
  111. {
  112. "name": "stdout",
  113. "output_type": "stream",
  114. "text": [
  115. ">>Denver Broncos<<\n",
  116. ">>Baltimore Ravens<<\n",
  117. ">>Houston Texans<<\n",
  118. ">>New York Jets<<\n",
  119. ">>Philadelphia Eagles<<\n",
  120. ">>Jacksonville Jaguars<<\n",
  121. ">>Tennessee Titans<<\n",
  122. ">>New Orleans Saints<<\n",
  123. ">>Kansas City Chiefs<<\n",
  124. ">>Atlanta Falcons<<\n",
  125. ">>Seattle Seahawks<<\n",
  126. ">>Indianapolis Colts<<\n",
  127. ">>Dallas Cowboys<<\n",
  128. ">>Arizona Cardinals<<\n",
  129. ">>Washington Redskins<<\n",
  130. ">>San Francisco 49ers<<\n",
  131. ">>Buffalo Bills<<\n",
  132. ">>Cleveland Browns<<\n",
  133. ">>Pittsburgh Steelers<<\n",
  134. ">>Washington Redskins<<\n",
  135. ">>Houston Texans<<\n",
  136. ">>New England Patriots<<\n",
  137. ">>New York Giants<<\n",
  138. ">>Carolina Panthers<<\n",
  139. ">>Detroit Lions<<\n",
  140. ">>Los Angeles Rams<<\n",
  141. ">>Arizona Cardinals<<\n",
  142. ">>Oakland Raiders<<\n",
  143. ">>Denver Broncos<<\n",
  144. ">>San Diego Chargers<<\n",
  145. ">>Minnesota Vikings<<\n",
  146. ">>Chicago Bears<<\n",
  147. ">>New England Patriots<<\n",
  148. ">>Buffalo Bills<<\n",
  149. ">>Jacksonville Jaguars<<\n",
  150. ">>Miami Dolphins<<\n",
  151. ">>Cincinnati Bengals<<\n",
  152. ">>Green Bay Packers<<\n",
  153. ">>Carolina Panthers<<\n",
  154. ">>Tennessee Titans<<\n",
  155. ">>New York Giants<<\n",
  156. ">>Tampa Bay Buccaneers<<\n",
  157. ">>Seattle Seahawks<<\n",
  158. ">>Kansas City Chiefs<<\n",
  159. ">>Philadelphia Eagles<<\n",
  160. ">>Indianapolis Colts<<\n",
  161. ">>Dallas Cowboys<<\n",
  162. ">>New Orleans Saints<<\n",
  163. ">>Cincinnati Bengals<<\n",
  164. ">>Jacksonville Jaguars <<\n",
  165. ">>New England Patriots<<\n",
  166. ">>Atlanta Falcons<<\n",
  167. ">>Washington Redskins<<\n",
  168. ">>Chicago Bears<<\n",
  169. ">>Baltimore Ravens<<\n",
  170. ">>New York Jets<<\n",
  171. ">>Houston Texans<<\n",
  172. ">>Tampa Bay Buccaneers<<\n",
  173. ">>San Francisco 49ers<<\n",
  174. ">>Arizona Cardinals<<\n",
  175. ">>San Diego Chargers<<\n",
  176. ">>Pittsburgh Steelers<<\n",
  177. ">>Minnesota Vikings<<\n",
  178. ">>San Francisco 49ers<<\n",
  179. ">>Indianapolis Colts<<\n",
  180. ">>Minnesota Vikings<<\n",
  181. ">>Cleveland Browns<<\n",
  182. ">>Pittsburgh Steelers<<\n",
  183. ">>Detroit Lions<<\n",
  184. ">>Miami Dolphins<<\n",
  185. ">>Baltimore Ravens<<\n",
  186. ">>Denver Broncos<<\n",
  187. ">>Los Angeles Rams<<\n",
  188. ">>Dallas Cowboys<<\n",
  189. ">>Oakland Raiders<<\n",
  190. ">>Green Bay Packers<<\n",
  191. ">>Carolina Panthers<<\n",
  192. ">>San Diego Chargers<<\n",
  193. ">>New York Giants<<\n",
  194. ">>New Orleans Saints<<\n",
  195. ">>New England Patriots<<\n",
  196. ">>Tennessee Titans<<\n",
  197. ">>Chicago Bears<<\n",
  198. ">>Detroit Lions<<\n",
  199. ">>Washington Redskins<<\n",
  200. ">>Miami Dolphins<<\n",
  201. ">>Buffalo Bills<<\n",
  202. ">>Oakland Raiders<<\n",
  203. ">>Seattle Seahawks<<\n",
  204. ">>Green Bay Packers<<\n",
  205. ">>Houston Texans<<\n",
  206. ">>Arizona Cardinals<<\n",
  207. ">>Green Bay Packers<<\n",
  208. ">>Los Angeles Rams <<\n",
  209. ">>New York Jets<<\n",
  210. ">>Miami Dolphins<<\n",
  211. ">>Cincinnati Bengals<<\n",
  212. ">>Tennessee Titans<<\n",
  213. ">>Philadelphia Eagles<<\n",
  214. ">>Kansas City Chiefs<<\n",
  215. ">>Jacksonville Jaguars<<\n",
  216. ">>Detroit Lions<<\n",
  217. ">>Atlanta Falcons<<\n",
  218. ">>San Francisco 49ers<<\n",
  219. ">>Pittsburgh Steelers<<\n",
  220. ">>Arizona Cardinals<<\n",
  221. ">>Denver Broncos<<\n",
  222. ">>Tennessee Titans<<\n",
  223. ">>Cincinnati Bengals <<\n",
  224. ">>Houston Texans<<\n",
  225. ">>Atlanta Falcons<<\n",
  226. ">>Indianapolis Colts<<\n",
  227. ">>Buffalo Bills<<\n",
  228. ">>Cleveland Browns<<\n",
  229. ">>Tampa Bay Buccaneers<<\n",
  230. ">>New Orleans Saints<<\n",
  231. ">>Denver Broncos<<\n",
  232. ">>Carolina Panthers<<\n",
  233. ">>Dallas Cowboys<<\n",
  234. ">>Chicago Bears<<\n",
  235. ">>Tampa Bay Buccaneers<<\n",
  236. ">>Cleveland Browns<<\n",
  237. ">>Minnesota Vikings<<\n",
  238. ">>Kansas City Chiefs<<\n",
  239. ">>Miami Dolphins<<\n",
  240. ">>New York Giants<<\n",
  241. ">>Baltimore Ravens<<\n",
  242. ">>Los Angeles Rams<<\n",
  243. ">>San Francisco 49ers<<\n",
  244. ">>Green Bay Packers<<\n",
  245. ">>San Diego Chargers<<\n",
  246. ">>Oakland Raiders<<\n",
  247. ">>Seattle Seahawks<<\n",
  248. ">>Baltimore Ravens<<\n",
  249. ">>Philadelphia Eagles<<\n",
  250. ">>Tampa Bay Buccaneers<<\n",
  251. ">>New Orleans Saints<<\n",
  252. ">>Tennessee Titans<<\n",
  253. ">>Jacksonville Jaguars<<\n",
  254. ">>Carolina Panthers<<\n",
  255. ">>New York Jets<<\n",
  256. ">>Washington Redskins<<\n",
  257. ">>San Diego Chargers<<\n",
  258. ">>Pittsburgh Steelers<<\n",
  259. ">>Arizona Cardinals<<\n",
  260. ">>New England Patriots<<\n",
  261. ">>New York Giants<<\n",
  262. ">>Carolina Panthers<<\n",
  263. ">>Minnesota Vikings<<\n",
  264. ">>Dallas Cowboys<<\n",
  265. ">>Cincinnati Bengals<<\n",
  266. ">>New York Giants<<\n",
  267. ">>Detroit Lions<<\n",
  268. ">>Cleveland Browns<<\n",
  269. ">>Kansas City Chiefs<<\n",
  270. ">>Indianapolis Colts<<\n",
  271. ">>Los Angeles Rams<<\n",
  272. ">>San Francisco 49ers<<\n",
  273. ">>Seattle Seahawks<<\n",
  274. ">>Washington Redskins<<\n",
  275. ">>Oakland Raiders<<\n",
  276. ">>Detroit Lions<<\n",
  277. ">>Dallas Cowboys<<\n",
  278. ">>Indianapolis Colts<<\n",
  279. ">>Atlanta Falcons<<\n",
  280. ">>Baltimore Ravens<<\n",
  281. ">>Buffalo Bills<<\n",
  282. ">>New Orleans Saints<<\n",
  283. ">>Cleveland Browns<<\n",
  284. ">>Houston Texans<<\n",
  285. ">>Miami Dolphins<<\n",
  286. ">>Chicago Bears<<\n",
  287. ">>Tampa Bay Buccaneers<<\n",
  288. ">>Oakland Raiders<<\n",
  289. ">>Denver Broncos<<\n",
  290. ">>New York Jets<<\n",
  291. ">>Philadelphia Eagles<<\n",
  292. ">>Minnesota Vikings<<\n",
  293. ">>Jacksonville Jaguars<<\n",
  294. ">>New Orleans Saints<<\n",
  295. ">>Green Bay Packers<<\n",
  296. ">>Atlanta Falcons<<\n",
  297. ">>New England Patriots<<\n",
  298. ">>Baltimore Ravens<<\n",
  299. ">>Cincinnati Bengals<<\n",
  300. ">>Chicago Bears<<\n",
  301. ">>Oakland Raiders<<\n",
  302. ">>Pittsburgh Steelers<<\n",
  303. ">>San Diego Chargers<<\n",
  304. ">>Arizona Cardinals<<\n",
  305. ">>Seattle Seahawks<<\n",
  306. ">>New York Jets<<\n",
  307. ">>Kansas City Chiefs<<\n",
  308. ">>Miami Dolphins<<\n",
  309. ">>Detroit Lions<<\n",
  310. ">>Cleveland Browns<<\n",
  311. ">>Tennessee Titans<<\n",
  312. ">>Indianapolis Colts<<\n",
  313. ">>Jacksonville Jaguars<<\n",
  314. ">>Tampa Bay Buccaneers<<\n",
  315. ">>Buffalo Bills<<\n",
  316. ">>Carolina Panthers<<\n",
  317. ">>Philadelphia Eagles<<\n",
  318. ">>San Francisco 49ers<<\n",
  319. ">>Los Angeles Rams<<\n",
  320. ">>Green Bay Packers<<\n",
  321. ">>New York Giants<<\n",
  322. ">>New England Patriots<<\n",
  323. ">>Seattle Seahawks<<\n",
  324. ">>New York Jets<<\n",
  325. ">>Buffalo Bills<<\n",
  326. ">>New York Giants<<\n",
  327. ">>Chicago Bears<<\n",
  328. ">>Minnesota Vikings<<\n",
  329. ">>Houston Texans<<\n",
  330. ">>Baltimore Ravens<<\n",
  331. ">>Dallas Cowboys<<\n",
  332. ">>Kansas City Chiefs<<\n",
  333. ">>Arizona Cardinals<<\n",
  334. ">>Atlanta Falcons<<\n",
  335. ">>Denver Broncos<<\n",
  336. ">>San Diego Chargers<<\n",
  337. ">>Cincinnati Bengals<<\n",
  338. ">>Washington Redskins<<\n",
  339. ">>Philadelphia Eagles<<\n",
  340. ">>Carolina Panthers<<\n",
  341. ">>Buffalo Bills<<\n",
  342. ">>Green Bay Packers<<\n",
  343. ">>New England Patriots<<\n",
  344. ">>Cleveland Browns<<\n",
  345. ">>New Orleans Saints<<\n",
  346. ">>Jacksonville Jaguars<<\n",
  347. ">>Chicago Bears<<\n",
  348. ">>Oakland Raiders<<\n",
  349. ">>Seattle Seahawks<<\n",
  350. ">>Los Angeles Rams<<\n",
  351. ">>Houston Texans<<\n",
  352. ">>Pittsburgh Steelers<<\n",
  353. ">>Kansas City Chiefs<<\n",
  354. ">>Dallas Cowboys<<\n",
  355. ">>Cincinnati Bengals<<\n",
  356. ">>New York Jets<<\n",
  357. ">>Tampa Bay Buccaneers<<\n",
  358. ">>Minnesota Vikings<<\n",
  359. ">>Pittsburgh Steelers<<\n",
  360. ">>Philadelphia Eagles<<\n",
  361. ">>Detroit Lions<<\n",
  362. ">>Tennessee Titans<<\n",
  363. ">>Indianapolis Colts<<\n",
  364. ">>Miami Dolphins<<\n",
  365. ">>Atlanta Falcons<<\n",
  366. ">>Washington Redskins<<\n",
  367. ">>Los Angeles Rams<<\n",
  368. ">>San Diego Chargers<<\n",
  369. ">>Denver Broncos<<\n",
  370. ">>San Francisco 49ers<<\n"
  371. ]
  372. }
  373. ],
  374. "source": [
  375. "for ii, rr in df.iterrows():\n",
  376. " print '>>'+rr['Home Team']+'<<'"
  377. ]
  378. },
  379. {
  380. "cell_type": "code",
  381. "execution_count": null,
  382. "metadata": {
  383. "collapsed": true
  384. },
  385. "outputs": [],
  386. "source": []
  387. }
  388. ],
  389. "metadata": {
  390. "kernelspec": {
  391. "display_name": "Python 2",
  392. "language": "python",
  393. "name": "python2"
  394. },
  395. "language_info": {
  396. "codemirror_mode": {
  397. "name": "ipython",
  398. "version": 2
  399. },
  400. "file_extension": ".py",
  401. "mimetype": "text/x-python",
  402. "name": "python",
  403. "nbconvert_exporter": "python",
  404. "pygments_lexer": "ipython2",
  405. "version": "2.7.10"
  406. }
  407. },
  408. "nbformat": 4,
  409. "nbformat_minor": 0
  410. }