PageRenderTime 41ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/src/Scrape_XML.py

https://bitbucket.org/bdenton/python-seminar
Python | 108 lines | 83 code | 10 blank | 15 comment | 8 complexity | c970abc6e0e10963039c94b679c69a7d MD5 | raw file
  1. #!/usr/bin/python
  2. #################################################################################
  3. # FILENAME : Scrape_XML.py #
  4. # AUTHOR : Brian Denton <brian.denton@gmail.com> #
  5. # DATE : 11/20/2012 #
  6. # DESCRIPTION: Scrape data from an XML table on a webpage and pass the data to #
  7. # R to demonstrate RPy2. #
  8. #################################################################################
  9. from urllib import urlopen
  10. from BeautifulSoup import BeautifulStoneSoup
  11. import re
  12. import rpy2.robjects as robjects
  13. from pandas import Series, DataFrame
  14. import pandas.rpy.common as com # pandas wrapper around RPy2 functions
  15. # Copy all content from the provided web page
  16. BLS = urlopen( "http://www.bls.gov/web/laus/laumstrk.htm" ).read()
  17. # Use BeautifulSoup to parse webpage elements using HTML tags
  18. soup = BeautifulStoneSoup( BLS )
  19. # Get all tables in the webpage
  20. Tables = soup.findAll('table')
  21. # Scan tables and find the index of the table with the data we want
  22. TABLE_INDEX = [i for i in range(len(Tables)) if "INDIANA" in str(Tables[i]).upper()][0]
  23. DataTable = Tables[TABLE_INDEX]
  24. # Make a Python list out of the table's rows (tr is the XML tag for table row)
  25. DataTable = DataTable.findAll('tr')
  26. # Select the rows containing the desired data
  27. DataTable = DataTable[1:]
  28. # Parse each row and populate lists for State and UnemploymentRate
  29. State = []
  30. UnemploymentRate = []
  31. MapColor = []
  32. for row in DataTable:
  33. row_list = re.split( '<|>', str(row) )
  34. state = row_list[8]
  35. ue_rate = float( row_list[12] )
  36. State.append( state )
  37. UnemploymentRate.append( ue_rate )
  38. if( ue_rate <= 6 ):
  39. MapColor.append( "pink" )
  40. elif( ue_rate > 6 and ue_rate <= 9 ):
  41. MapColor.append( "red" )
  42. elif( ue_rate > 9 ):
  43. MapColor.append( "dark red" )
  44. else:
  45. MapColor.append( "" )
  46. # Say you are working with pandas data structures. For the purpose of this
  47. # illustration I will explicitly convert Python lists to pandas.Series to show
  48. # how pandas data objects can be converted to R objects. In a real application
  49. # it would not make sense to do this since it is easy to convert from Python
  50. # lists and other fundamental data structures to R objects without going through
  51. # pandas. My intent here is simply to demonstrate if you were using pandas to do
  52. # your analysis you can easily convert pandas data objects to R data objects. But
  53. # these translation functions are merely wrappers for RPy2 functions. So you
  54. # could also use the RPy2 functions directly.
  55. # Create example pandas data objects
  56. State = DataFrame( {'State': Series( State )} )
  57. MapColor = DataFrame( {'MapColor' : Series( MapColor )} )
  58. ## Write a Python wrapper function that converts Python data types to R data
  59. ## types using RPy2 and calls R function
  60. def map_plot_py( regions, color, title,
  61. legend_placement, legend_text, legend_color, outfile ):
  62. # Convert Python lists to R string vectors
  63. regions_R = com.convert_to_r_matrix( regions )
  64. color_R = com.convert_to_r_matrix( color )
  65. legend_text_R = robjects.StrVector( legend_text )
  66. legend_color_R = robjects.StrVector( legend_color )
  67. # Load R source and bind the map_plot function to the name map_plot_R in the
  68. # Python global environment.
  69. source_R = robjects.r( 'source("../lib/map_plot.R")' )
  70. map_plot_R = robjects.globalenv['map_plot']
  71. # Pass R data vectors to R function
  72. map_plot_R( regions = regions_R, color = color_R,
  73. title = title,
  74. legend_placement = legend_placement,
  75. legend_text = legend_text_R,
  76. legend_color = legend_color_R,
  77. outfile = outfile )
  78. return None
  79. map_plot_py( regions = State, color = MapColor,
  80. title = "Unemployment Rate -- September 2012",
  81. legend_placement = "bottomleft",
  82. legend_text = ["<6%","6%-9%",">9%"],
  83. legend_color = ["pink","red","dark red"],
  84. outfile = "../plots/Unemployment_September_2012.pdf" )
  85. ## END OF FILE