Scrape_XML.py - FILENAME : Scrape_XML.py # AUTHOR : Brian D…

/src/Scrape_XML.py

https://bitbucket.org/bdenton/python-seminar · Python · 108 lines · 53 code · 23 blank · 32 comment · 8 complexity · c970abc6e0e10963039c94b679c69a7d MD5 · raw file

#!/usr/bin/python
#################################################################################
# FILENAME   : Scrape_XML.py                                                    #
# AUTHOR     : Brian Denton <brian.denton@gmail.com>                            #
# DATE       : 11/20/2012                                                       #
# DESCRIPTION: Scrape data from an XML table on a webpage and pass the data to  #
#              R to demonstrate RPy2.                                           #
#################################################################################

from urllib import urlopen
from BeautifulSoup import BeautifulStoneSoup
import re
import rpy2.robjects as robjects
from pandas import Series, DataFrame
import pandas.rpy.common as com # pandas wrapper around RPy2 functions

# Copy all content from the provided web page. The handle is closed explicitly
# so the connection is released even if the read fails.
_response = urlopen( "http://www.bls.gov/web/laus/laumstrk.htm" )
try:
    BLS = _response.read()
finally:
    _response.close()

# Use BeautifulSoup to parse webpage elements using HTML tags
soup = BeautifulStoneSoup( BLS )

# Get all tables in the webpage
Tables = soup.findAll('table')

# Scan tables and find the index of the table with the data we want: the first
# table whose markup mentions Indiana (compared in upper case).
_matches = [i for i, table in enumerate( Tables )
            if "INDIANA" in str( table ).upper()]
if not _matches:
    # Same exception type the bare [0] lookup would raise, but with a message
    # that says what went wrong (e.g. the page layout changed).
    raise IndexError( "no table mentioning INDIANA was found on the page" )
TABLE_INDEX = _matches[0]

DataTable = Tables[TABLE_INDEX]

# Make a Python list out of the table's rows (tr is the XML tag for table row)
DataTable = DataTable.findAll('tr')

# Select the rows containing the desired data: everything after the first row
# (presumably the header row -- confirm against the page markup)
DataTable = DataTable[1:]

# Walk the table rows and build three parallel lists: the state name, its
# unemployment rate, and the map color bucket that rate falls into.
State = []
UnemploymentRate = []
MapColor = []

# Every '<' or '>' is a split point, so a row's markup is cut into tag names
# and text pieces that are then picked out by position.
tag_delimiter = re.compile( '<|>' )

for row in DataTable:
    pieces = tag_delimiter.split( str( row ) )
    name, rate = pieces[8], float( pieces[12] )

    if rate > 9:
        shade = "dark red"
    elif rate > 6:
        shade = "red"
    elif rate <= 6:
        shade = "pink"
    else:
        # Only reached when no comparison holds, i.e. the rate parsed as NaN
        shade = ""

    State.append( name )
    UnemploymentRate.append( rate )
    MapColor.append( shade )


# Say you are working with pandas data structures. For the purpose of this
# illustration I will explicitly convert Python lists to pandas.Series to show
# how pandas data objects can be converted to R objects. In a real application
# it would not make sense to do this, since it is easy to convert from Python
# lists and other fundamental data structures to R objects without going through
# pandas. My intent here is simply to demonstrate that, if you were using pandas
# to do your analysis, you could easily convert pandas data objects to R data
# objects. But these translation functions are merely wrappers for RPy2
# functions, so you could also use the RPy2 functions directly.

# Create example pandas data objects: each list becomes a Series, which is then
# wrapped in a one-column DataFrame whose column carries the variable's name
state_column = Series( State )
State = DataFrame( {'State': state_column} )

color_column = Series( MapColor )
MapColor = DataFrame( {'MapColor': color_column} )


## Write a Python wrapper function that converts Python data types to R data
## types using RPy2 and calls R function

def map_plot_py( regions, color, title,
                 legend_placement, legend_text, legend_color, outfile ):
    """Convert the arguments to R objects and call the R function map_plot.

    The R function is defined by sourcing "../lib/map_plot.R" on every call;
    the path is relative, so it is resolved against the current working
    directory.

    Arguments:
      regions, color   -- pandas objects, converted with
                          com.convert_to_r_matrix (the call in this file
                          passes one-column DataFrames)
      title            -- passed to map_plot unchanged
      legend_placement -- passed to map_plot unchanged
      legend_text      -- sequence of strings, converted to an R string vector
      legend_color     -- sequence of strings, converted to an R string vector
      outfile          -- passed to map_plot unchanged (presumably the path
                          of the plot file to write; defined by the R code)

    Returns None; whatever map_plot returns is discarded.
    """

    # Convert the pandas objects to R matrices
    regions_R = com.convert_to_r_matrix( regions )
    color_R = com.convert_to_r_matrix( color )

    # Convert the Python lists of strings to R string vectors
    legend_text_R = robjects.StrVector( legend_text )
    legend_color_R = robjects.StrVector( legend_color )

    # Load the R source, then look up the map_plot function it defined in R's
    # global environment and bind it to the local name map_plot_R. The value
    # returned by source() itself is not needed.
    robjects.r( 'source("../lib/map_plot.R")' )
    map_plot_R = robjects.globalenv['map_plot']

    # Pass the R data objects to the R function
    map_plot_R( regions = regions_R, color = color_R,
          title = title,
          legend_placement = legend_placement,
          legend_text = legend_text_R,
          legend_color = legend_color_R,
          outfile = outfile )

    return None


# Draw the map: collect the keyword arguments first, then hand them all to the
# wrapper in a single call.
plot_arguments = {
    'regions': State,
    'color': MapColor,
    'title': "Unemployment Rate -- September 2012",
    'legend_placement': "bottomleft",
    'legend_text': ["<6%","6%-9%",">9%"],
    'legend_color': ["pink","red","dark red"],
    'outfile': "../plots/Unemployment_September_2012.pdf",
}
map_plot_py( **plot_arguments )

## END OF FILE
Tech Fingerprint

Alerts (1)

'open(' Use 'with open()' to ensure Files are properly closed
18