PageRenderTime 49ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/nc/get_legislation.py

https://github.com/gosuri/fiftystates
Python | 117 lines | 93 code | 18 blank | 6 comment | 17 complexity | b5b6f03d98accc6fec153c6ffbf8b32a MD5 | raw file
  1. import html5lib
  2. # ugly hack
  3. import sys
  4. sys.path.append('./scripts')
  5. from pyutils.legislation import LegislationScraper, NoDataForYear
  6. def clean_legislators(s):
  7. s = s.replace(' ', ' ').strip()
  8. return [l.strip() for l in s.split(';') if l]
  9. class NCLegislationScraper(LegislationScraper):
  10. state = 'nc'
  11. soup_parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder('beautifulsoup')).parse
  12. def get_bill_info(self, session, bill_id):
  13. bill_detail_url = 'http://www.ncga.state.nc.us/gascripts/BillLookUp/BillLookUp.pl?Session=%s&BillID=%s' % (session, bill_id)
  14. # parse the bill data page, finding the latest html text
  15. if bill_id[0] == 'H':
  16. chamber = 'lower'
  17. else:
  18. chamber = 'upper'
  19. bill_data = self.urlopen(bill_detail_url)
  20. bill_soup = self.soup_parser(bill_data)
  21. bill_title = bill_soup.findAll('div', style="text-align: center; font: bold 20px Arial; margin-top: 15px; margin-bottom: 8px;")[0].contents[0]
  22. self.add_bill(chamber, session, bill_id, bill_title)
  23. # get all versions
  24. links = bill_soup.findAll('a')
  25. for link in links:
  26. if link.has_key('href') and link['href'].startswith('/Sessions') and link['href'].endswith('.html'):
  27. version_name = link.parent.previousSibling.previousSibling.contents[0].replace(' ', ' ')
  28. version_url = 'http://www.ncga.state.nc.us' + link['href']
  29. self.add_bill_version(chamber, session, bill_id, version_name, version_url)
  30. # grab primary and cosponsors from table[6]
  31. tables = bill_soup.findAll('table')
  32. sponsor_rows = tables[6].findAll('tr')
  33. sponsors = clean_legislators(sponsor_rows[1].td.contents[0])
  34. for leg in sponsors:
  35. self.add_sponsorship(chamber, session, bill_id, 'primary', leg)
  36. cosponsors = clean_legislators(sponsor_rows[2].td.contents[0])
  37. for leg in cosponsors:
  38. self.add_sponsorship(chamber, session, bill_id, 'cosponsor', leg)
  39. # easier to read actions from the rss.. but perhaps favor less HTTP requests?
  40. rss_url = 'http://www.ncga.state.nc.us/gascripts/BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&view=history_rss' % (session, bill_id)
  41. rss_data = self.urlopen(rss_url)
  42. rss_soup = self.soup_parser(rss_data)
  43. # title looks like 'House Chamber: action'
  44. for item in rss_soup.findAll('item'):
  45. action = item.title.contents[0]
  46. pieces = item.title.contents[0].split(' Chamber: ')
  47. if len(pieces) == 2:
  48. action_chamber = pieces[0]
  49. action = pieces[1]
  50. else:
  51. action_chamber = None
  52. action = pieces[0]
  53. date = item.pubdate.contents[0]
  54. self.add_action(chamber, session, bill_id, action_chamber, action, date)
  55. def scrape_session(self, chamber, session):
  56. url = 'http://www.ncga.state.nc.us/gascripts/SimpleBillInquiry/displaybills.pl?Session=%s&tab=Chamber&Chamber=%s' % (session, chamber)
  57. self.be_verbose("Downloading %s" % url)
  58. data = self.urlopen(url)
  59. soup = self.soup_parser(data)
  60. rows = soup.findAll('table')[5].findAll('tr')[1:]
  61. for row in rows:
  62. td = row.find('td')
  63. bill_id = td.a.contents[0]
  64. self.get_bill_info(session, bill_id)
  65. def scrape_bills(self, chamber, year):
  66. year_mapping = {
  67. '1985': ('1985',),
  68. '1986': ('1985E1',),
  69. '1987': ('1987',),
  70. '1988': (),
  71. '1989': ('1989', '1989E1'),
  72. '1990': ('1989E2',),
  73. '1991': ('1991E1', '1991'),
  74. '1992': (),
  75. '1993': ('1993',),
  76. '1994': ('1993E1',),
  77. '1995': ('1995',),
  78. '1996': ('1995E1', '1995E2'),
  79. '1997': ('1997',),
  80. '1998': ('1997E1',),
  81. '1999': ('1999E1', '1999'),
  82. '2000': ('1999E2',),
  83. '2001': ('2001',),
  84. '2002': ('2001E1',),
  85. '2003': ('2003', '2002E1', '2003E2'),
  86. '2004': ('2003E3',),
  87. '2005': ('2005',),
  88. '2006': (),
  89. '2007': ('2007E1', '2007'),
  90. '2008': ('2007E2',),
  91. '2009': ('2009',),
  92. }
  93. chamber = {'lower':'House', 'upper':'Senate'}[chamber]
  94. if year not in year_mapping:
  95. raise NoDataForYear(year)
  96. for session in year_mapping[year]:
  97. self.scrape_session(chamber, session)
  98. if __name__ == '__main__':
  99. NCLegislationScraper().run()