PageRenderTime 88ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/lec_04_scraping.ipynb

https://gitlab.com/xbsd/content
Jupyter | 379 lines | 379 code | 0 blank | 0 comment | 0 complexity | 147f2b8aec23bbaeb891cb5e2e3fb8bb MD5 | raw file
  1. {
  2. "metadata": {
  3. "name": ""
  4. },
  5. "nbformat": 3,
  6. "nbformat_minor": 0,
  7. "worksheets": [
  8. {
  9. "cells": [
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. "# Scraping Demo\n",
  15. "Companion to Lecture 4 of Harvard [CS109: Data Science](http://cs109.org)\n"
  16. ]
  17. },
  18. {
  19. "cell_type": "code",
  20. "collapsed": false,
  21. "input": [
  22. "import requests\n",
  23. "from pattern import web\n",
  24. "from BeautifulSoup import BeautifulSoup"
  25. ],
  26. "language": "python",
  27. "metadata": {},
  28. "outputs": [],
  29. "prompt_number": 1
  30. },
  31. {
  32. "cell_type": "markdown",
  33. "metadata": {},
  34. "source": [
  35. "# Task\n",
  36. "\n",
  37. "Find and print the movie title, list of genres, runtime, and score of all movies on [this page](http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012)"
  38. ]
  39. },
  40. {
  41. "cell_type": "markdown",
  42. "metadata": {},
  43. "source": [
  44. "### Two ways of making GET requests"
  45. ]
  46. },
  47. {
  48. "cell_type": "markdown",
  49. "metadata": {},
  50. "source": [
  51. "#### 1. Explicit URL"
  52. ]
  53. },
  54. {
  55. "cell_type": "code",
  56. "collapsed": false,
  57. "input": [
  58. "url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'\n",
  59. "r = requests.get(url)\n",
  60. "print r.url"
  61. ],
  62. "language": "python",
  63. "metadata": {},
  64. "outputs": [
  65. {
  66. "output_type": "stream",
  67. "stream": "stdout",
  68. "text": [
  69. "http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012\n"
  70. ]
  71. }
  72. ],
  73. "prompt_number": 7
  74. },
  75. {
  76. "cell_type": "markdown",
  77. "metadata": {},
  78. "source": [
  79. "#### 2. Base URL with a dictionary of GET parameters"
  80. ]
  81. },
  82. {
  83. "cell_type": "code",
  84. "collapsed": false,
  85. "input": [
  86. "url = 'http://www.imdb.com/search/title'\n",
  87. "params = dict(sort='num_votes,desc', start=1, title_type='feature', year='1950,2012')\n",
  88. "r = requests.get(url, params=params)\n",
  89. "print r.url # note: requests builds the full, URL-encoded query string for you"
  90. ],
  91. "language": "python",
  92. "metadata": {},
  93. "outputs": [
  94. {
  95. "output_type": "stream",
  96. "stream": "stdout",
  97. "text": [
  98. "http://www.imdb.com/search/title?sort=num_votes%2Cdesc&start=1&title_type=feature&year=1950%2C2012\n"
  99. ]
  100. }
  101. ],
  102. "prompt_number": 8
  103. },
  104. {
  105. "cell_type": "markdown",
  106. "metadata": {},
  107. "source": [
  108. "# Using Pattern"
  109. ]
  110. },
  111. {
  112. "cell_type": "code",
  113. "collapsed": false,
  114. "input": [
  115. "# pattern's by_tag accepts CSS-style 'tag.class' selectors, e.g. 'td.title'\n",
  116. "\n",
  117. "dom = web.Element(r.text)\n",
  118. "for movie in dom.by_tag('td.title'): \n",
  119. " title = movie.by_tag('a')[0].content\n",
  120. " genres = movie.by_tag('span.genre')[0].by_tag('a')\n",
  121. " genres = [g.content for g in genres]\n",
  122. " runtime = movie.by_tag('span.runtime')[0].content\n",
  123. " rating = movie.by_tag('span.value')[0].content\n",
  124. " print title, genres, runtime, rating"
  125. ],
  126. "language": "python",
  127. "metadata": {},
  128. "outputs": [
  129. {
  130. "output_type": "stream",
  131. "stream": "stdout",
  132. "text": [
  133. "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n",
  134. "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n",
  135. "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n",
  136. "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n",
  137. "Fight Club [u'Drama'] 139 mins. 8.9\n",
  138. "The Lord of the Rings: The Fellowship of the Ring"
  139. ]
  140. },
  141. {
  142. "output_type": "stream",
  143. "stream": "stdout",
  144. "text": [
  145. " [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n",
  146. "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n",
  147. "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n",
  148. "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n",
  149. "Forrest Gump [u'Drama', u'Romance'] 142 mins. 8.7\n",
  150. "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n",
  151. "The Lord of the Rings: The Two Towers"
  152. ]
  153. },
  154. {
  155. "output_type": "stream",
  156. "stream": "stdout",
  157. "text": [
  158. " [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n",
  159. "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n",
  160. "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n",
  161. "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n",
  162. "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n",
  163. "Star Wars"
  164. ]
  165. },
  166. {
  167. "output_type": "stream",
  168. "stream": "stdout",
  169. "text": [
  170. " [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 8.8\n",
  171. "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n",
  172. "Memento [u'Mystery', u'Thriller'] 113 mins. 8.6\n",
  173. "American Beauty [u'Drama'] 122 mins. 8.5\n",
  174. "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n",
  175. "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n",
  176. "The Departed"
  177. ]
  178. },
  179. {
  180. "output_type": "stream",
  181. "stream": "stdout",
  182. "text": [
  183. " [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n",
  184. "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n",
  185. "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n",
  186. "Star Wars: Episode V - The Empire Strikes Back"
  187. ]
  188. },
  189. {
  190. "output_type": "stream",
  191. "stream": "stdout",
  192. "text": [
  193. " [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n",
  194. "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n",
  195. "V for Vendetta [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n",
  196. "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n",
  197. "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n",
  198. "American History X"
  199. ]
  200. },
  201. {
  202. "output_type": "stream",
  203. "stream": "stdout",
  204. "text": [
  205. " [u'Crime', u'Drama'] 119 mins. 8.6\n",
  206. "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n",
  207. "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n",
  208. "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n",
  209. "Terminator 2: Judgment Day"
  210. ]
  211. },
  212. {
  213. "output_type": "stream",
  214. "stream": "stdout",
  215. "text": [
  216. " [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n",
  217. "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n",
  218. "Kill Bill: Vol. 1 [u'Action', u'Crime'] 111 mins. 8.2\n",
  219. "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n",
  220. "Sin City [u'Crime', u'Thriller'] 124 mins. 8.2\n",
  221. "Léon: The Professional"
  222. ]
  223. },
  224. {
  225. "output_type": "stream",
  226. "stream": "stdout",
  227. "text": [
  228. " [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n",
  229. "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n",
  230. "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n",
  231. "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n",
  232. "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n",
  233. "Eternal Sunshine of the Spotless Mind"
  234. ]
  235. },
  236. {
  237. "output_type": "stream",
  238. "stream": "stdout",
  239. "text": [
  240. " [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n",
  241. "Shutter Island [u'Drama', u'Thriller'] 138 mins. 8.0\n",
  242. "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n",
  243. "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n",
  244. "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n",
  245. "300"
  246. ]
  247. },
  248. {
  249. "output_type": "stream",
  250. "stream": "stdout",
  251. "text": [
  252. " [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 7.7\n"
  253. ]
  254. }
  255. ],
  256. "prompt_number": 9
  257. },
  258. {
  259. "cell_type": "markdown",
  260. "metadata": {},
  261. "source": [
  262. "# Using BeautifulSoup"
  263. ]
  264. },
  265. {
  266. "cell_type": "code",
  267. "collapsed": false,
  268. "input": [
  269. "bs = BeautifulSoup(r.text)\n",
  270. "for movie in bs.findAll('td', 'title'):\n",
  271. " title = movie.find('a').contents[0]\n",
  272. " genres = movie.find('span', 'genre').findAll('a')\n",
  273. " genres = [g.contents[0] for g in genres]\n",
  274. " runtime = movie.find('span', 'runtime').contents[0]\n",
  275. " rating = movie.find('span', 'value').contents[0]\n",
  276. " print title, genres, runtime, rating\n"
  277. ],
  278. "language": "python",
  279. "metadata": {},
  280. "outputs": [
  281. {
  282. "output_type": "stream",
  283. "stream": "stdout",
  284. "text": [
  285. "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n",
  286. "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n",
  287. "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n",
  288. "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n",
  289. "Fight Club [u'Drama'] 139 mins. 8.9\n",
  290. "The Lord of the Rings: The Fellowship of the Ring [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n",
  291. "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n",
  292. "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n",
  293. "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n",
  294. "Forrest Gump"
  295. ]
  296. },
  297. {
  298. "output_type": "stream",
  299. "stream": "stdout",
  300. "text": [
  301. " [u'Drama', u'Romance'] 142 mins. 8.7\n",
  302. "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n",
  303. "The Lord of the Rings: The Two Towers [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n",
  304. "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n",
  305. "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n",
  306. "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n",
  307. "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n",
  308. "Star Wars [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 8.8\n",
  309. "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n",
  310. "Memento"
  311. ]
  312. },
  313. {
  314. "output_type": "stream",
  315. "stream": "stdout",
  316. "text": [
  317. " [u'Mystery', u'Thriller'] 113 mins. 8.6\n",
  318. "American Beauty [u'Drama'] 122 mins. 8.5\n",
  319. "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n",
  320. "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n",
  321. "The Departed [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n",
  322. "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n",
  323. "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n",
  324. "Star Wars: Episode V - The Empire Strikes Back [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n",
  325. "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n",
  326. "V for Vendetta"
  327. ]
  328. },
  329. {
  330. "output_type": "stream",
  331. "stream": "stdout",
  332. "text": [
  333. " [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n",
  334. "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n",
  335. "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n",
  336. "American History X [u'Crime', u'Drama'] 119 mins. 8.6\n",
  337. "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n",
  338. "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n",
  339. "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n",
  340. "Terminator 2: Judgment Day [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n",
  341. "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n",
  342. "Kill Bill: Vol. 1"
  343. ]
  344. },
  345. {
  346. "output_type": "stream",
  347. "stream": "stdout",
  348. "text": [
  349. " [u'Action', u'Crime'] 111 mins. 8.2\n",
  350. "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n",
  351. "Sin City [u'Crime', u'Thriller'] 124 mins. 8.2\n",
  352. "Léon: The Professional [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n",
  353. "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n",
  354. "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n",
  355. "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n",
  356. "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n",
  357. "Eternal Sunshine of the Spotless Mind [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n",
  358. "Shutter Island"
  359. ]
  360. },
  361. {
  362. "output_type": "stream",
  363. "stream": "stdout",
  364. "text": [
  365. " [u'Drama', u'Thriller'] 138 mins. 8.0\n",
  366. "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n",
  367. "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n",
  368. "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n",
  369. "300 [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 7.7\n"
  370. ]
  371. }
  372. ],
  373. "prompt_number": 5
  374. }
  375. ],
  376. "metadata": {}
  377. }
  378. ]
  379. }