/src/Scrape_XML.py
Python | 108 lines | 83 code | 10 blank | 15 comment | 8 complexity | c970abc6e0e10963039c94b679c69a7d MD5 | raw file
- #!/usr/bin/python
- #################################################################################
- # FILENAME : Scrape_XML.py #
- # AUTHOR : Brian Denton <brian.denton@gmail.com> #
- # DATE : 11/20/2012 #
- # DESCRIPTION: Scrape data from an XML table on a webpage and pass the data to #
- # R to demonstrate RPy2. #
- #################################################################################
- from urllib import urlopen
- from BeautifulSoup import BeautifulStoneSoup
- import re
- import rpy2.robjects as robjects
- from pandas import Series, DataFrame
- import pandas.rpy.common as com # pandas wrapper around RPy2 functions
# Download the raw markup of the BLS page.
BLS = urlopen("http://www.bls.gov/web/laus/laumstrk.htm").read()

# Parse the markup so individual tags can be searched.
soup = BeautifulStoneSoup(BLS)

# Collect every <table> element found in the page.
Tables = soup.findAll('table')

# The wanted table is the first one whose markup mentions "INDIANA"
# (compared case-insensitively). If no table matches, the [0] lookup
# raises IndexError.
TABLE_INDEX = [
    position
    for position, table in enumerate(Tables)
    if "INDIANA" in str(table).upper()
][0]

# Split the chosen table into its <tr> rows and skip the first row
# (presumably the header row -- confirm against the page).
DataTable = Tables[TABLE_INDEX].findAll('tr')[1:]
# Walk the data rows and build three parallel lists: the state name, its
# unemployment rate, and the map colour bucket chosen for that rate.
State = []
UnemploymentRate = []
MapColor = []
for row in DataTable:
    # Splitting the row's markup on angle brackets yields alternating tag
    # and text fragments; fragments 8 and 12 are read as the state name and
    # the rate (these positions depend on the page's markup -- re-check them
    # if the page layout changes).
    pieces = re.split('<|>', str(row))
    name = pieces[8]
    rate = float(pieces[12])

    State.append(name)
    UnemploymentRate.append(rate)

    # Bucket the rate: up to 6, above 6 up to 9, above 9. The final branch
    # is only reached when every comparison is false (e.g. a NaN rate).
    if rate <= 6:
        shade = "pink"
    elif 6 < rate <= 9:
        shade = "red"
    elif rate > 9:
        shade = "dark red"
    else:
        shade = ""
    MapColor.append(shade)
- # Say you are working with pandas data structures. For the purpose of this
- # illustration I will explicitly convert Python lists to pandas.Series to show
- # how pandas data objects can be converted to R objects. In a real application
- # it would not make sense to do this since it is easy to convert from Python
- # lists and other fundamental data structures to R objects without going through
- # pandas. My intent here is simply to demonstrate if you were using pandas to do
- # your analysis you can easily convert pandas data objects to R data objects. But
- # these translation functions are merely wrappers for RPy2 functions. So you
- # could also use the RPy2 functions directly.
# Wrap each plain list in a pandas Series and place it in a one-column
# DataFrame whose column name matches the variable name.
State = DataFrame(dict(State=Series(State)))
MapColor = DataFrame(dict(MapColor=Series(MapColor)))
## Python wrapper that converts its arguments to R objects with RPy2 (and the
## pandas RPy2 helpers) and then calls the R function map_plot.
def map_plot_py(regions, color, title,
                legend_placement, legend_text, legend_color, outfile):
    """Convert the arguments to R objects and call the R function ``map_plot``.

    The R function is loaded by sourcing ``../lib/map_plot.R`` (a path
    relative to the current working directory) on every call; what it does
    with the arguments is defined in that R file, not here.

    Args:
        regions: pandas object converted with ``com.convert_to_r_matrix``
            (the caller in this file passes a one-column DataFrame).
        color: pandas object converted with ``com.convert_to_r_matrix``
            (the caller in this file passes a one-column DataFrame).
        title: passed to ``map_plot`` unchanged.
        legend_placement: passed to ``map_plot`` unchanged.
        legend_text: sequence of strings, converted to an R string vector.
        legend_color: sequence of strings, converted to an R string vector.
        outfile: passed to ``map_plot`` unchanged.

    Returns:
        None.
    """
    # pandas objects go through the pandas wrapper; plain lists of strings
    # are converted directly with RPy2.
    regions_R = com.convert_to_r_matrix(regions)
    color_R = com.convert_to_r_matrix(color)
    legend_text_R = robjects.StrVector(legend_text)
    legend_color_R = robjects.StrVector(legend_color)

    # Sourcing the file defines map_plot in R's global environment; the
    # value returned by source() itself is not needed.
    robjects.r('source("../lib/map_plot.R")')
    map_plot_R = robjects.globalenv['map_plot']

    # Hand the converted objects to the R function by keyword.
    map_plot_R(regions=regions_R,
               color=color_R,
               title=title,
               legend_placement=legend_placement,
               legend_text=legend_text_R,
               legend_color=legend_color_R,
               outfile=outfile)
# Draw the map from the scraped states and their colour buckets.
map_plot_py(
    regions=State,
    color=MapColor,
    title="Unemployment Rate -- September 2012",
    legend_placement="bottomleft",
    legend_text=["<6%", "6%-9%", ">9%"],
    legend_color=["pink", "red", "dark red"],
    outfile="../plots/Unemployment_September_2012.pdf"
)
- ## END OF FILE