PageRenderTime 607ms CodeModel.GetById 32ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/pa/get_legislation.py

https://github.com/rcadby/fiftystates
Python | 329 lines | 290 code | 26 blank | 13 comment | 9 complexity | c0cd053f69ab4c27f4abec7cb213eb4e MD5 | raw file
  1. #!/usr/bin/env python
  2. from __future__ import with_statement
  3. import re
  4. import datetime as dt
  5. import calendar
  6. import sys
  7. import os
  8. from utils import (bill_abbr, start_year, parse_action_date,
  9. bill_list_url, history_url, info_url, vote_url,
  10. legislators_url)
  11. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  12. from pyutils.legislation import (LegislationScraper, Bill, Vote, Legislator,
  13. NoDataForYear)
  14. class PALegislationScraper(LegislationScraper):
  15. state = 'pa'
  16. metadata = {
  17. 'state_name': 'Pennsylvania',
  18. 'legislature_name': 'Pennsylvania General Assembly',
  19. 'upper_chamber_name': 'Senate',
  20. 'lower_chamber_name': 'House of Representatives',
  21. 'upper_title': 'Senator',
  22. 'lower_title': 'Representative',
  23. 'upper_term': 4,
  24. 'lower_term': 2,
  25. 'sessions': [],
  26. 'session_details': {},
  27. }
  28. def scrape_metadata(self):
  29. metadata_url = "http://www.legis.state.pa.us/cfdocs/"\
  30. "legis/home/session.cfm"
  31. with self.soup_context(metadata_url) as session_page:
  32. for option in session_page.find(id="BTI_sess").findAll('option'):
  33. if option['value'].endswith('_0'):
  34. year1 = int(option['value'][1:5])
  35. year2 = year1 + 1
  36. session = "%d-%d" % (year1, year2)
  37. self.metadata['sessions'].append(session)
  38. self.metadata['session_details'][session] = {
  39. 'years': [year1, year2],
  40. 'sub_sessions': [],
  41. }
  42. else:
  43. session = option.string[0:9]
  44. self.metadata['session_details'][session][
  45. 'sub_sessions'].append(option.string)
  46. # sessions were in reverse-chronological order
  47. self.metadata['sessions'].reverse()
  48. return self.metadata
  49. def scrape_session(self, chamber, session, special=0):
  50. session_url = bill_list_url(chamber, session, special)
  51. with self.soup_context(session_url) as bill_list_page:
  52. bill_link_re = "body=%s&type=(B|R)&bn=\d+" % bill_abbr(chamber)
  53. for link in bill_list_page.findAll(href=re.compile(bill_link_re)):
  54. self.parse_bill(chamber, session, special, link)
  55. def parse_bill(self, chamber, session, special, link):
  56. bill_number = link.contents[0]
  57. type = re.search('type=(B|R|)', link['href']).group(1)
  58. bill_id = "%s%s %s" % (bill_abbr(chamber), type, bill_number)
  59. bill_info_url = info_url(chamber, session, special, type, bill_number)
  60. with self.soup_context(bill_info_url) as info_page:
  61. title_label = info_page.find(text='Short Title:')
  62. title = title_label.findNext().contents[0]
  63. bill = Bill(session, chamber, bill_id, title)
  64. bill.add_source(bill_info_url)
  65. self.parse_bill_versions(bill, info_page)
  66. self.parse_history(bill, history_url(chamber, session, special,
  67. type, bill_number))
  68. self.parse_votes(bill, vote_url(chamber, session, special,
  69. type, bill_number))
  70. self.save_bill(bill)
  71. def parse_bill_versions(self, bill, info_page):
  72. """
  73. Grab links to all versions of a bill from its info page.
  74. """
  75. pn_table = info_page.find('div', {"class": 'pn_table'})
  76. text_rows = pn_table.findAll('tr')[1:]
  77. for row in text_rows:
  78. text_link = row.td.a
  79. text_url = 'http://www.legis.state.pa.us%s' % text_link['href']
  80. text_name = text_link.contents[0].strip()
  81. bill.add_version(text_name, text_url)
  82. def parse_history(self, bill, url):
  83. """
  84. Grab all history data (actions and votes) for a given bill provided
  85. the url to its history page.
  86. """
  87. bill.add_source(url)
  88. with self.soup_context(url) as history_page:
  89. self.parse_sponsors(bill, history_page)
  90. self.parse_actions(bill, history_page)
  91. def parse_sponsors(self, bill, history_page):
  92. """
  93. Grab all of a bill's sponsors from its history page.
  94. """
  95. # Sponsor format changed in 2009
  96. if int(start_year(bill['session'])) < 2009:
  97. sponsors = history_page.find(
  98. text='Sponsors:').parent.findNext('td').find(
  99. 'td').string.strip().replace(' and', ',').split(', ')
  100. bill.add_sponsor('primary', sponsors[0])
  101. for cosponsor in sponsors[1:]:
  102. bill.add_sponsor('cosponsor', cosponsor)
  103. else:
  104. sponsors = history_page.find(
  105. text='Sponsors:').parent.findNext().findAll('a')
  106. bill.add_sponsor('primary', sponsors[0].contents[0])
  107. for cosponsor in sponsors[1:]:
  108. bill.add_sponsor('cosponsor', cosponsor.contents[0])
  109. def parse_actions(self, bill, history_page):
  110. """
  111. Grab all of a bill's actions from its history page.
  112. """
  113. act_table = history_page.find(text="Actions:").parent.findNextSibling()
  114. act_chamber = bill['chamber']
  115. for row in act_table.findAll('tr'):
  116. act_raw = ""
  117. for node in row.td.div:
  118. if hasattr(node, 'contents'):
  119. if len(node.contents) > 0:
  120. act_raw += node.contents[0]
  121. else:
  122. act_raw += node
  123. act_raw = act_raw.replace('&#160;', ' ')
  124. act_match = re.match('(.*),\s+((\w+\.?) (\d+), (\d{4}))', act_raw)
  125. if act_match:
  126. date = parse_action_date(act_match.group(2).strip())
  127. bill.add_action(act_chamber, act_match.group(1),
  128. date)
  129. else:
  130. # Handle actions from the other chamber
  131. # ("In the (House|Senate)" row followed by actions that
  132. # took place in that chamber)
  133. cham_match = re.match('In the (House|Senate)', act_raw)
  134. if not cham_match:
  135. # Ignore?
  136. continue
  137. if cham_match.group(1) == 'House':
  138. act_chamber = 'lower'
  139. else:
  140. act_chamber = 'upper'
  141. def parse_votes(self, bill, url):
  142. """
  143. Grab all of the votes for a bill given the url of its primary
  144. votes page.
  145. """
  146. bill.add_source(url)
  147. with self.soup_context(url) as votes_page:
  148. for td in votes_page.findAll('td', {'class': 'vote'}):
  149. prev = td.findPrevious().contents[0].strip()
  150. if prev == 'Senate':
  151. chamber = 'upper'
  152. location = ''
  153. elif prev == 'House':
  154. chamber = 'lower'
  155. location = ''
  156. else:
  157. # Committee votes come in a number of different formats
  158. # that we don't handle yet
  159. continue
  160. chamber_votes_url = td.a['href']
  161. self.parse_chamber_votes(chamber, bill, chamber_votes_url)
  162. def parse_chamber_votes(self, chamber, bill, url):
  163. """
  164. Grab all votes for a bill that occurred in a given chamber.
  165. """
  166. bill.add_source(url)
  167. with self.soup_context(url) as chamber_votes_page:
  168. for link in chamber_votes_page.findAll(
  169. 'a', href=re.compile('rc_view')):
  170. vote_details_url = "http://www.legis.state.pa.us/CFDOCS/"\
  171. "Legis/RC/Public/%s" % link['href']
  172. vote = self.parse_vote_details(vote_details_url)
  173. bill.add_vote(vote)
  174. def parse_vote_details(self, url):
  175. """
  176. Grab the details of a specific vote, such as how each legislator
  177. voted.
  178. """
  179. def find_vote(letter):
  180. return vote_page.findAll('span', {'class': 'font8text'},
  181. text=letter)
  182. with self.soup_context(url) as vote_page:
  183. header = vote_page.find('div', {'class': 'subHdrGraphic'})
  184. if 'Senate' in header.string:
  185. chamber = 'upper'
  186. else:
  187. chamber = 'lower'
  188. # we'll use the link back to the bill as a base to
  189. # get the motion/date
  190. linkback = vote_page.find(
  191. 'a', href=re.compile('billinfo')).parent.parent
  192. date = linkback.find('div').string
  193. date = dt.datetime.strptime(date, "%A, %B %d, %Y")
  194. motion = linkback.findNextSibling('div')
  195. if motion.a:
  196. motion = "%s %s" % (motion.a.string,
  197. motion.contents[-1].string.strip())
  198. elif motion.span:
  199. motion = "%s %s" % (motion.span.string.strip(),
  200. motion.contents[-1].string.strip())
  201. else:
  202. motion = motion.string.strip().replace('&nbsp;', '')
  203. yes_count = int(vote_page.find('div', text='YEAS').next.string)
  204. no_count = int(vote_page.find('div', text='NAYS').next.string)
  205. lve_count = int(vote_page.find('div', text='LVE').next.string)
  206. nv_count = int(vote_page.find('div', text='N/V').next.string)
  207. other_count = lve_count + nv_count
  208. passed = yes_count > no_count
  209. vote = Vote(chamber, date, motion, passed, yes_count, no_count,
  210. other_count)
  211. vote.add_source(url)
  212. # find the votes by the inner text. because background colors lie.
  213. yes_votes = [vote.yes, find_vote('Y')]
  214. no_votes = [vote.no, find_vote('N')]
  215. nv_votes = [vote.other, find_vote('E') + find_vote('X')]
  216. for (action, votes) in (yes_votes, no_votes, nv_votes):
  217. for a_vote in votes:
  218. action(a_vote.parent.findNextSibling('span').string)
  219. if len(vote['yes_votes']) != yes_count:
  220. raise ScrapeError('wrong yes count %d/%d' %
  221. (len(vote['yes_votes']), yes_count))
  222. if len(vote['no_votes']) != no_count:
  223. raise ScrapeError('wrong no count %d/%d' %
  224. (len(vote['no_votes']), no_count))
  225. if len(vote['other_votes']) != other_count:
  226. raise ScrapeError('wrong other count %d/%d' %
  227. (len(vote['other_votes']), other_count))
  228. return vote
  229. def scrape_bills(self, chamber, year):
  230. session = "%s-%d" % (year, int(year) + 1)
  231. if not session in self.metadata['session_details']:
  232. raise NoDataForYear(year)
  233. self.scrape_session(chamber, session)
  234. specials = self.metadata['session_details'][session]['sub_sessions']
  235. for special in specials:
  236. session_num = re.search('#(\d+)', special).group(1)
  237. self.scrape_session(chamber, session, int(session_num))
  238. def scrape_legislators(self, chamber, year):
  239. # Pennsylvania doesn't make member lists easily available
  240. # for previous sessions, unfortunately
  241. if int(year) < 2009:
  242. #raise NoDataForYear(year)
  243. return
  244. session = "%s-%d" % (year, int(year) + 1)
  245. leg_list_url = legislators_url(chamber)
  246. with self.soup_context(leg_list_url) as member_list_page:
  247. for link in member_list_page.findAll(
  248. 'a', href=re.compile('_bio\.cfm\?id=')):
  249. full_name = link.contents[0][0:-4]
  250. last_name = full_name.split(',')[0]
  251. first_name = full_name.split(' ')[1]
  252. if len(full_name.split(' ')) > 2:
  253. middle_name = full_name.split(' ')[2].strip(',')
  254. else:
  255. middle_name = ''
  256. party = link.contents[0][-2]
  257. if party == 'R':
  258. party = "Republican"
  259. elif party == 'D':
  260. party = "Democrat"
  261. district = re.search(
  262. "District (\d+)", link.parent.contents[1]).group(1)
  263. legislator = Legislator(session, chamber, district,
  264. full_name, first_name, last_name,
  265. middle_name, party)
  266. legislator.add_source(leg_list_url)
  267. self.save_legislator(legislator)
  268. if __name__ == '__main__':
  269. PALegislationScraper.run()