PageRenderTime 48ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/fl/get_legislation.py

https://github.com/rshapiro/fiftystates
Python | 149 lines | 123 code | 15 blank | 11 comment | 10 complexity | aa9bab1a2a4c9e390cacb6d7a101ef5f MD5 | raw file
  1. #!/usr/bin/env python
  2. import urllib2
  3. import re
  4. import datetime as dt
  5. from BeautifulSoup import BeautifulSoup
  6. # ugly hack
  7. import sys
  8. sys.path.append('./scripts')
  9. from pyutils.legislation import *
  10. class FLLegislationScraper(LegislationScraper):
  11. state = 'fl'
  12. metadata = {
  13. 'state_name': 'Florida',
  14. 'legislature_name': 'Florida Legislature',
  15. 'upper_chamber_name': 'Senate',
  16. 'lower_chamber_name': 'House of Representatives',
  17. 'upper_title': 'Senator',
  18. 'lower_title': 'Representative',
  19. 'upper_term': 4,
  20. 'lower_term': 2,
  21. 'sessions': ['1998', '1999', '2000', '2001', '2002', '2003', '2004',
  22. '2005', '2006', '2007', '2008', '2009'],
  23. 'session_details': {
  24. '1998': {'years': [1998], 'sub_sessions': []},
  25. '1999': {'years': [1999], 'sub_sessions': []},
  26. '2000': {'years': [2000], 'sub_sessions': ['2000 A', '2000 O']},
  27. '2001': {'years': [2001],
  28. 'sub_sessions': ['2001 A', '2001 B', '2001 C']},
  29. '2002': {'years': [2002],
  30. 'sub_sessions': ['2002 D', '2002 E', '2002 O']},
  31. '2003': {'years': [2003], 'sub_sessions': ['2003 A', '2003 B',
  32. '2003 C', '2003 D',
  33. '2003 E']},
  34. '2004': {'years': [2004], 'sub_sessions': ['2004 A', '2004 O']},
  35. '2005': {'years': [2005], 'sub_sessions': ['2005 B']},
  36. '2006': {'years': [2006], 'sub_sessions': ['2006 O']},
  37. '2007': {'years': [2007],
  38. 'sub_sessions': ['2007 A', '2007 B', '2007 C', '2007 D']},
  39. '2008': {'years': [2008], 'sub_sessions': ['2008 O']},
  40. '2009': {'years': [2009], 'sub_sessions': ['2009 A']},
  41. }
  42. }
  43. def scrape_legislators(self, chamber, year):
  44. if year not in self.metadata['session_details']:
  45. raise NoDataForYear(year)
  46. def scrape_session(self, chamber, session):
  47. if chamber == 'upper':
  48. chamber_name = 'Senate'
  49. bill_abbr = 'S'
  50. elif chamber == 'lower':
  51. chamber_name = 'House'
  52. bill_abbr = 'H'
  53. # Base url for bills sorted by first letter of title
  54. base_url = 'http://www.flsenate.gov/Session/index.cfm?Mode=Bills&BI_Mode=ViewBySubject&Letter=%s&Year=%s&Chamber=%s'
  55. # Bill ID format
  56. bill_re = re.compile("%s (\d{4}[ABCDEO]?)" % bill_abbr)
  57. # Go through all sorted bill list pages
  58. for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
  59. bill_list_url = base_url % (letter, session.replace(' ', ''),
  60. chamber_name)
  61. self.log("Getting bill list for %s %s (%s)" % (chamber, session,
  62. letter))
  63. bill_list = BeautifulSoup(self.urlopen(bill_list_url))
  64. # Bill ID's are bold
  65. for b in bill_list.findAll('b'):
  66. if not b.string:
  67. continue
  68. match = bill_re.search(b.string)
  69. if match:
  70. # Bill ID and number
  71. bill_id = match.group(0)
  72. bill_number = match.group(1)
  73. # Get bill name and info url
  74. bill_link = b.parent.findNext('td').a
  75. bill_name = bill_link.string.strip()
  76. info_url = "http://www.flsenate.gov/Session/%s&Year=%s" % (bill_link['href'], session)
  77. # Add bill
  78. bill = Bill(session, chamber, bill_id, bill_name)
  79. # Get bill info page
  80. info_page = BeautifulSoup(self.urlopen(info_url))
  81. # Get all bill versions
  82. bill_table = info_page.find('a', attrs={'name':'BillText'}).parent.parent.findNext('tr').td.table
  83. if bill_table:
  84. for tr in bill_table.findAll('tr')[1:]:
  85. version_name = tr.td.string
  86. version_url = "http://www.flsenate.gov%s" % tr.a['href']
  87. bill.add_version(version_name, version_url)
  88. # Get actions
  89. hist_table = info_page.find('pre', "billhistory")
  90. hist = ""
  91. for line in hist_table.findAll(text=True):
  92. hist += line + "\n"
  93. hist = hist.replace(' ', ' ')
  94. act_re = re.compile('^ (\d\d/\d\d/\d\d) (SENATE|HOUSE) (.*\n(\s{16,16}.*\n){0,})', re.MULTILINE)
  95. for act_match in act_re.finditer(hist):
  96. action = act_match.group(3).replace('\n', ' ')
  97. action = re.sub('\s+', ' ', action).strip()
  98. if act_match.group(2) == 'SENATE':
  99. act_chamber = 'upper'
  100. else:
  101. act_chamber = 'lower'
  102. bill.add_action(act_chamber, action, act_match.group(1))
  103. # Get primary sponsor
  104. # Right now we just list the committee as the primary sponsor
  105. # for committee substituts. In the future, consider listing
  106. # committee separately and listing the original human
  107. # sponsors as primary
  108. spon_re = re.compile('by ([^;(\n]+;?|\w+)')
  109. sponsor = spon_re.search(hist).group(1).strip('; ')
  110. bill.add_sponsor('primary', sponsor)
  111. # Get co-sponsors
  112. cospon_re = re.compile('\((CO-SPONSORS|CO-AUTHORS)\) ([\w .]+(;[\w .\n]+){0,})', re.MULTILINE)
  113. cospon_match = cospon_re.search(hist)
  114. if cospon_match:
  115. for cosponsor in cospon_match.group(2).split(';'):
  116. cosponsor = cosponsor.replace('\n', '').strip()
  117. bill.add_sponsor('cosponsor', cosponsor)
  118. self.add_bill(bill)
  119. def scrape_bills(self, chamber, year):
  120. if year not in self.metadata['session_details']:
  121. raise NoDataForYear(year)
  122. self.scrape_session(chamber, year)
  123. for session in self.metadata['session_details'][year]['sub_sessions']:
  124. self.scrape_session(chamber, session)
  125. if __name__ == '__main__':
  126. FLLegislationScraper().run()