PageRenderTime 186ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/ut/get_legislation.py

https://github.com/BRIMIL01/fiftystates
Python | 221 lines | 177 code | 35 blank | 9 comment | 26 complexity | 8030f949d235617f3334e0e9d921a752 MD5 | raw file
  1. #!/usr/bin/env python
  2. import urllib2
  3. import re
  4. import datetime as dt
  5. import html5lib
  6. # HACK: add ./scripts to sys.path so the pyutils package imported below can be found
  7. import sys
  8. sys.path.append('./scripts')
  9. from pyutils.legislation import *
  10. class UTLegislationScraper(LegislationScraper):
  11. state = 'ut'
  12. soup_parser = html5lib.HTMLParser(
  13. tree=html5lib.treebuilders.getTreeBuilder('beautifulsoup')).parse
  14. # TODO: Grab sessions/sub_sessions programmatically from the site
  15. metadata = {'state_name': 'Utah',
  16. 'legislature_name': 'Utah State Legislature',
  17. 'lower_chamber_name': 'House of Representatives',
  18. 'upper_chamber_name': 'Senate',
  19. 'lower_title': 'Representative',
  20. 'upper_title': 'Senator',
  21. 'lower_term': 2,
  22. 'upper_term': 4,
  23. 'sessions': ['1997', '1998', '1999', '2000', '2001', '2002',
  24. '2003', '2004', '2005', '2006', '2007', '2008',
  25. '2009'],
  26. 'session_details':
  27. {'1997': {'years': [1997],
  28. 'sub_sessions': ['1997 S1', '1997 S2']},
  29. '1998': {'years': [1998], 'sub_sessions': []},
  30. '1999': {'years': [1999], 'sub_sessions': []},
  31. '2000': {'years': [2000], 'sub_sessions': []},
  32. '2001': {'years': [2001],
  33. 'sub_sessions': ['2001 S1', '2001 S2']},
  34. '2002': {'years': [2002],
  35. 'sub_sessions': ['2002 S2', '2002 S3', '2002 S4',
  36. '2002 S5', '2002 S6']},
  37. '2003': {'years': [2003],
  38. 'sub_sessions': ['2003 S1', '2003 S2']},
  39. '2004': {'years': [2004],
  40. 'sub_sessions': ['2003 S3', '2003 S4']},
  41. '2005': {'years': [2005],
  42. 'sub_sessions': ['2004 S1', '2004 S2']},
  43. '2006': {'years': [2006],
  44. 'sub_sessions': ['2006 S3', '2006 S4', '2006 S5']},
  45. '2007': {'years': [2007],'sub_sessions': ['2007 S1']},
  46. '2008': {'years': [2008], 'sub_sessions': ['2008 S2']},
  47. '2009': {'years': [2009], 'sub_sessions': ['2009 S1']},
  48. }
  49. }
  50. def scrape_legislators(self, chamber, year):
  51. if year not in self.metadata['session_details']:
  52. raise NoDataForYear(year)
  53. if chamber == 'lower':
  54. title = 'Representative'
  55. else:
  56. title = 'Senator'
  57. url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % year
  58. leg_list = self.soup_parser(self.urlopen(url))
  59. for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
  60. tds = row.findAll('td')
  61. leg_title = tds[1].find(text=True)
  62. if leg_title == title:
  63. fullname = tds[0].find(text=True)
  64. last_name = fullname.split(',')[0]
  65. first_name = fullname.split(' ')[1]
  66. if len(fullname.split(' ')) > 2:
  67. middle_name = fullname.split(' ')[2]
  68. leg = Legislator(year, chamber, tds[3].find(text=True),
  69. fullname, first_name, last_name,
  70. middle_name, tds[2].find(text=True))
  71. self.add_legislator(leg)
  72. def parse_status(self, bill, url):
  73. chamber = bill['chamber']
  74. session = bill['session']
  75. bill_id = bill['bill_id']
  76. status = self.soup_parser(self.urlopen(url))
  77. act_table = status.table
  78. # Get actions
  79. for row in act_table.findAll('tr')[1:]:
  80. act_date = row.td.find(text=True)
  81. act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")
  82. action = row.findAll('td')[1].find(text=True)
  83. # If not specified, assume action occurred
  84. # in originating house
  85. actor = chamber
  86. split_action = action.split('/')
  87. if len(split_action) > 1:
  88. actor = split_action[0]
  89. if actor == 'House':
  90. actor = 'lower'
  91. elif actor == 'Senate':
  92. actor = 'upper'
  93. action = '/'.join(split_action[1:]).strip()
  94. bill.add_action(actor, action, act_date)
  95. # Check if this action is a vote
  96. links = row.findAll('a')
  97. if len(links) > 1:
  98. vote_url = links[-1]['href']
  99. # Committee votes are of a different format that
  100. # we don't handle yet
  101. if not vote_url.endswith('txt'):
  102. continue
  103. vote_url = '/'.join(url.split('/')[:-1]) + '/' + vote_url
  104. vote_page = self.urlopen(vote_url)
  105. vote_re = re.compile('YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
  106. '(.*)ABSENT( OR NOT VOTING)? -?\s?'
  107. '(\d+)(.*)',
  108. re.MULTILINE | re.DOTALL)
  109. match = vote_re.search(vote_page)
  110. yes_count = match.group(1)
  111. no_count = match.group(3)
  112. other_count = match.group(6)
  113. if int(yes_count) > int(no_count):
  114. passed = True
  115. else:
  116. passed = False
  117. if actor == 'upper' or actor == 'lower':
  118. vote_chamber = actor
  119. vote_location = ''
  120. else:
  121. vote_chamber = ''
  122. vote_location = actor
  123. vote = Vote(vote_chamber, act_date,
  124. action, passed, yes_count, no_count,
  125. other_count,
  126. location=vote_location)
  127. yes_votes = re.split('\s{2,}', match.group(2).strip())
  128. no_votes = re.split('\s{2,}', match.group(4).strip())
  129. other_votes = re.split('\s{2,}', match.group(7).strip())
  130. map(vote.yes, yes_votes)
  131. map(vote.no, no_votes)
  132. map(vote.other, other_votes)
  133. bill.add_vote(vote)
  134. def scrape_session(self, chamber, session):
  135. if chamber == "lower":
  136. bill_abbr = "HB"
  137. else:
  138. bill_abbr = "SB"
  139. bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
  140. session.replace(' ', ''))
  141. self.log("Getting bill list for %s, %s" % (session, chamber))
  142. try:
  143. base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
  144. except:
  145. # this session doesn't exist for this year
  146. return
  147. bill_list_link_re = re.compile('.*%s\d+ht.htm$' % bill_abbr)
  148. for link in base_bill_list.findAll('a', href=bill_list_link_re):
  149. bill_list = self.soup_parser(self.urlopen(link['href']))
  150. bill_link_re = re.compile('.*billhtm/%s.*.htm' % bill_abbr)
  151. for bill_link in bill_list.findAll('a', href=bill_link_re):
  152. bill_id = bill_link.find(text=True).strip()
  153. bill_info = self.soup_parser(self.urlopen(
  154. bill_link['href']))
  155. (bill_title, primary_sponsor) = bill_info.h3.contents[2].replace(
  156. ' ', ' ').strip().split(' -- ')
  157. bill = Bill(session, chamber, bill_id, bill_title)
  158. bill.add_sponsor('primary', primary_sponsor)
  159. status_re = re.compile('.*billsta/%s.*.htm' % bill_abbr.lower())
  160. status_link = bill_info.find('a', href=status_re)
  161. if status_link:
  162. self.parse_status(bill, status_link['href'])
  163. text_find = bill_info.find(text="Bill Text (If you are having trouble viewing PDF files, ")
  164. if text_find:
  165. text_link_re = re.compile('.*\.htm')
  166. for text_link in text_find.parent.parent.findAll(
  167. 'a', href=text_link_re)[1:]:
  168. version_name = text_link.previous.strip()
  169. bill.add_version(version_name, text_link['href'])
  170. self.add_bill(bill)
  171. def scrape_bills(self, chamber, year):
  172. if year not in self.metadata['session_details']:
  173. raise NoDataForYear(year)
  174. session = self.metadata['session_details'][year]
  175. self.scrape_session(chamber, year)
  176. for sub_session in session['sub_sessions']:
  177. self.scrape_session(chamber, sub_session)
  178. if __name__ == '__main__':
  179. UTLegislationScraper().run()