
/scripts/ca/get_legislation.py

https://github.com/jdunck/fiftystates
Python | 112 lines
#!/usr/bin/env python
import urllib2
import re
import datetime as dt

from BeautifulSoup import BeautifulSoup

# ugly hack
import sys
sys.path.append('./scripts')
from pyutils.legislation import LegislationScraper, NoDataForYear


class CALegislationScraper(LegislationScraper):

    state = 'ca'

    def get_bill_info(self, chamber, session, bill_id):
        detail_url = 'http://www.leginfo.ca.gov/cgi-bin/postquery?bill_number=%s_%s&sess=%s' % (bill_id[:2].lower(), bill_id[2:], session.replace('-', ''))
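        # Illustrative example (hypothetical values, derived only from the
        # format string above): for bill_id 'AB1234' and session '07-08',
        # detail_url would be
        # http://www.leginfo.ca.gov/cgi-bin/postquery?bill_number=ab_1234&sess=0708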

        # Get the details page and parse it with BeautifulSoup. These
        # pages contain a malformed 'p' tag that (certain versions of)
        # BS choke on, so we replace it with a regex before parsing.
        details_raw = urllib2.urlopen(detail_url).read()
        details_raw = details_raw.replace('<P ALIGN=CENTER">', '')
        details = BeautifulSoup(details_raw)

        # Get the history page (following a link from the details page).
        # Once again, we remove tags that BeautifulSoup chokes on
        # (including all meta tags, because bills with quotation marks
        # in the title come to us w/ malformed meta tags)
        hist_link = details.find(href=re.compile("_history.html"))
        hist_url = 'http://www.leginfo.ca.gov%s' % hist_link['href']
        history_raw = urllib2.urlopen(hist_url).read()
        history_raw = history_raw.replace('<! ****** document data starts here ******>', '')
        rem_meta = re.compile('</title>.*</head>', re.MULTILINE | re.DOTALL)
        history_raw = rem_meta.sub('</title></head>', history_raw)
        history = BeautifulSoup(history_raw)

        # Find title and add bill
        title_match = re.search('TOPIC\t:\s(\w.+\n(\t\w.*\n){0,})', history_raw, re.MULTILINE)
        bill_title = title_match.group(1).replace('\n', '').replace('\t', ' ')
        self.add_bill(chamber, session, bill_id, bill_title)

        # Find author (primary sponsor)
        sponsor_match = re.search('^AUTHOR\t:\s(.*)$', history_raw, re.MULTILINE)
        bill_sponsor = sponsor_match.group(1)
        self.add_sponsorship(chamber, session, bill_id, 'primary', bill_sponsor)
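
        # Illustrative note (not from the original file): the TOPIC and AUTHOR
        # patterns above assume the history page's plain-text header contains
        # tab-delimited lines shaped roughly like this hypothetical excerpt,
        # where <TAB> marks a tab character and wrapped TOPIC lines begin
        # with a tab:
        #
        #   AUTHOR<TAB>: Smith
        #   TOPIC<TAB>: An act relating to an example subject,
        #   <TAB>and declaring the urgency thereof.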

        # Get all versions of the bill
        text_re = '%s_%s_bill\w*\.html' % (bill_id[:2].lower(), bill_id[2:])
        links = details.find(text='Bill Text').parent.findAllNext(href=re.compile(text_re))
        for link in links:
            version_url = "http://www.leginfo.ca.gov%s" % link['href']

            # This name is not necessarily unique (for example, there may
            # be many versions called simply "Amended"). Perhaps we should
            # add a date or something to make it unique?
            version_name = link.parent.previousSibling.previousSibling.b.font.string

            self.add_bill_version(chamber, session, bill_id,
                                  version_name, version_url)

        # Get bill actions
        action_re = re.compile('^(\d{4})|^([\w.]{4,6}\s+\d{1,2})\s+(.*(\n\s+.*){0,})', re.MULTILINE)
        act_year = None
        for act_match in action_re.finditer(history.find('pre').contents[0]):
            # If we didn't match group 2 then this must be a year change
            if act_match.group(2) is None:
                act_year = act_match.group(1)
                continue

            # If it's not a year change, it must be an action
            act_date = act_match.group(2)
            action = act_match.group(3).replace('\n', '').replace('  ', ' ').replace('\t', ' ')
            self.add_action(chamber, session, bill_id, chamber,
                            action, "%s, %s" % (act_date, act_year))
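
    # Illustrative note (inferred from action_re, not taken from an actual
    # history page): inside the <pre> block, a bare four-digit line such as
    # '2007' updates act_year, while a line like
    #   'Sept.  12  Read third time.  Passed.'
    # (with optional indented continuation lines) is recorded as one action.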

    def scrape_session(self, chamber, session):
        if chamber == 'upper':
            chamber_name = 'senate'
            bill_abbr = 'SB'
        elif chamber == 'lower':
            chamber_name = 'assembly'
            bill_abbr = 'AB'

        # Get the list of all chamber bills for the given session
        # (text format, sorted by author)
        url = "http://www.leginfo.ca.gov/pub/%s/bill/index_%s_author_bill_topic" % (session, chamber_name)
        self.be_verbose("Getting bill list for %s %s" % (chamber, session))
        bill_list = urllib2.urlopen(url).read()

        bill_re = re.compile('\s+(%s\s+\d+)(.*(\n\s{31}.*){0,})' % bill_abbr,
                             re.MULTILINE)
        for bill_match in bill_re.finditer(bill_list):
            bill_id = bill_match.group(1).replace(' ', '')
            self.get_bill_info(chamber, session, bill_id)
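
    # Illustrative note (inferred from bill_re, not copied from the index
    # file): entries are expected to look roughly like
    #   '  AB  1234  An example topic that may wrap onto further lines'
    # with wrapped topic text indented by 31 spaces; group(1) yields
    # 'AB  1234', which becomes bill_id 'AB1234' once the spaces are removed.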

    def scrape_bills(self, chamber, year):
        # CA makes data available from 1993 on
        if int(year) < 1993 or int(year) > dt.date.today().year:
            raise NoDataForYear(year)

        # We expect the first year of a session (odd)
        if int(year) % 2 != 1:
            raise NoDataForYear(year)

        year1 = year[2:]
        year2 = str(int(year) + 1)[2:]
        session = "%s-%s" % (year1, year2)

        self.scrape_session(chamber, session)
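
    # Worked example (values are illustrative): scrape_bills('lower', '2007')
    # builds session '07-08', so scrape_session fetches
    # http://www.leginfo.ca.gov/pub/07-08/bill/index_assembly_author_bill_topic
    # and then scrapes each 'AB ...' bill it lists.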

if __name__ == '__main__':
    CALegislationScraper().run()
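
# Usage sketch (an assumption, not part of the original file): because of the
# sys.path.append('./scripts') hack at the top, the script expects to be run
# from the repository root, e.g.
#
#     python scripts/ca/get_legislation.py
#
# with LegislationScraper.run() (defined in pyutils.legislation, not shown
# here) driving the actual scrape.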