/src/Web_Scraping.ipynb
Jupyter | 129 lines | 129 code | 0 blank | 0 comment | 0 complexity | e00d5ca8ebe40648a730c56647fcb5ca MD5 | raw file
- {
- "metadata": {
- "name": "Web_Scraping"
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Grab data from a webpage and extract desired information\n",
- "-----"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "from urllib import urlopen\n",
- "from BeautifulSoup import BeautifulStoneSoup"
- ],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Download the raw content of the Lilly RSS feed into LLY_RSS\n",
- "LLY_RSS_URL = \"http://apps.shareholder.com/rss/rss.aspx?channels=2886&companyid=LLY\"\n",
- "response = urlopen(LLY_RSS_URL)\n",
- "try:\n",
- "    LLY_RSS = response.read()\n",
- "finally:\n",
- "    # Always release the HTTP connection, even if the read fails\n",
- "    response.close()"
- ],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Parse the downloaded feed with BeautifulStoneSoup (the XML-oriented parser)\n",
- "soup = BeautifulStoneSoup(markup=LLY_RSS)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Collect every <title>, <link>, <pubDate> and <description> element, one list per tag\n",
- "# NOTE(review): BeautifulSoup 3 appears to lowercase tag names while parsing,\n",
- "# so 'pubDate' may match nothing here -- confirm against the parsed soup.\n",
- "title, link, pubDate, description = map(\n",
- "    soup.findAll, ('title', 'link', 'pubDate', 'description'))"
- ],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Print title and link for each news story on Lilly RSS feed\n",
- "# Print the title and link elements of up to the first 10 entries in the Lilly RSS feed.\n",
- "# Slicing (instead of indexing 0..9) avoids an IndexError when fewer than 10 are present.\n",
- "for story_title, story_link in zip(title[:10], link[:10]):\n",
- "    print story_title\n",
- "    print story_link\n",
- "    print \"\\n\"\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Remove XML tags for prettier printing\n",
- "# Remove the enclosing XML tags for prettier printing\n",
- "def strip_tag(element, tag_name):\n",
- "    \"\"\"Return str(element) without its enclosing <tag_name>...</tag_name> markers.\n",
- "\n",
- "    str.strip('<title>') removes any of the characters '<', 't', 'i', 'l', 'e', '>'\n",
- "    from both ends, which also eats real leading/trailing letters of the text, so\n",
- "    the exact opening and closing tag strings are cut off instead.\n",
- "    \"\"\"\n",
- "    markup = str(element)\n",
- "    opening = '<%s>' % tag_name\n",
- "    closing = '</%s>' % tag_name\n",
- "    if markup.startswith(opening):\n",
- "        markup = markup[len(opening):]\n",
- "    if markup.endswith(closing):\n",
- "        markup = markup[:-len(closing)]\n",
- "    return markup\n",
- "\n",
- "# Slicing keeps the loop safe when fewer than 10 elements were found\n",
- "for story_title, story_link in zip(title[:10], link[:10]):\n",
- "    print \"Title:\", strip_tag(story_title, 'title')\n",
- "    print \"Link: \" + strip_tag(story_link, 'link')\n",
- "    print \"\\n\""
- ],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": []
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": []
- }
- ],
- "metadata": {}
- }
- ]
- }