PageRenderTime 96ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Web_Scraping.ipynb

https://bitbucket.org/bdenton/python-seminar
Jupyter | 129 lines | 129 code | 0 blank | 0 comment | 0 complexity | e00d5ca8ebe40648a730c56647fcb5ca MD5 | raw file
  1. {
  2. "metadata": {
  3. "name": "Web_Scraping"
  4. },
  5. "nbformat": 3,
  6. "nbformat_minor": 0,
  7. "worksheets": [
  8. {
  9. "cells": [
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. "Grab data from a webpage and extract desired information\n",
  15. "-----"
  16. ]
  17. },
  18. {
  19. "cell_type": "code",
  20. "collapsed": false,
  21. "input": [
  22. "from urllib import urlopen\n",
  23. "from BeautifulSoup import BeautifulStoneSoup"
  24. ],
  25. "language": "python",
  26. "metadata": {},
  27. "outputs": []
  28. },
  29. {
  30. "cell_type": "code",
  31. "collapsed": false,
  32. "input": [
  33. "# Copy all content from the provided web page\n",
  34. "LLY_RSS = urlopen( \"http://apps.shareholder.com/rss/rss.aspx?channels=2886&companyid=LLY\" ).read()"
  35. ],
  36. "language": "python",
  37. "metadata": {},
  38. "outputs": []
  39. },
  40. {
  41. "cell_type": "code",
  42. "collapsed": false,
  43. "input": [
  44. "# Use BeautifulStoneSoup to parse webpage elements using XML tags\n",
  45. "soup = BeautifulStoneSoup(LLY_RSS)"
  46. ],
  47. "language": "python",
  48. "metadata": {},
  49. "outputs": []
  50. },
  51. {
  52. "cell_type": "code",
  53. "collapsed": false,
  54. "input": [
  55. "# Collect every matching XML element (tag included) of each kind into its own Python list\n",
  56. "title = soup.findAll('title')\n",
  57. "link = soup.findAll('link')\n",
  58. "pubDate = soup.findAll('pubDate')\n",
  59. "description = soup.findAll('description')"
  60. ],
  61. "language": "python",
  62. "metadata": {},
  63. "outputs": []
  64. },
  65. {
  66. "cell_type": "code",
  67. "collapsed": false,
  68. "input": [
  69. "# Print title and link for each news story on Lilly RSS feed\n",
  70. "for i in range(10):\n",
  71. " print title[i]\n",
  72. " print link[i]\n",
  73. " print \"\\n\"\n"
  74. ],
  75. "language": "python",
  76. "metadata": {},
  77. "outputs": []
  78. },
  79. {
  80. "cell_type": "code",
  81. "collapsed": false,
  82. "input": [
  83. "# Remove XML tags for prettier printing\n",
  84. "for i in range(10):\n",
  85. " print \"Title:\", str(title[i]).replace('<title>', '').replace('</title>', '')\n",
  86. " print \"Link: \" + str(link[i]).replace('<link>', '').replace('</link>', '')\n",
  87. " print \"\\n\""
  88. ],
  89. "language": "python",
  90. "metadata": {},
  91. "outputs": []
  92. },
  93. {
  94. "cell_type": "code",
  95. "collapsed": false,
  96. "input": [],
  97. "language": "python",
  98. "metadata": {},
  99. "outputs": []
  100. },
  101. {
  102. "cell_type": "code",
  103. "collapsed": false,
  104. "input": [],
  105. "language": "python",
  106. "metadata": {},
  107. "outputs": []
  108. },
  109. {
  110. "cell_type": "code",
  111. "collapsed": false,
  112. "input": [],
  113. "language": "python",
  114. "metadata": {},
  115. "outputs": []
  116. },
  117. {
  118. "cell_type": "code",
  119. "collapsed": false,
  120. "input": [],
  121. "language": "python",
  122. "metadata": {},
  123. "outputs": []
  124. }
  125. ],
  126. "metadata": {}
  127. }
  128. ]
  129. }