# PageRenderTime 42ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms
#
# /scripts/vt/get_legislation.py
#
# https://github.com/rshapiro/fiftystates
# Python | 184 lines | 171 code | 11 blank | 2 comment | 8 complexity | c0265e19a582805e39cfe163664ea46d MD5 | raw file
  1. #!/usr/bin/env python
  2. import urllib2, urllib
  3. import re
  4. from BeautifulSoup import BeautifulSoup
  5. import datetime as dt
  6. import time
  7. # ugly hack
  8. import sys
  9. sys.path.append('./scripts')
  10. from pyutils.legislation import *
  11. class VTLegislationScraper(LegislationScraper):
  12. state = 'vt'
  13. metadata = {
  14. 'state_name': 'Vermont',
  15. 'legislature_name': 'Vermont General Assembly',
  16. 'upper_chamber_name': 'Senate',
  17. 'lower_chamber_name': 'House of Representatives',
  18. 'upper_term': 2,
  19. 'lower_term': 2,
  20. 'sessions': ['1987-1988', '1989-1990', '1991-1992', '1993-1994',
  21. '1995-1996', '1997-1998', '1999-2000', '2001-2002',
  22. '2003-2004', '2005-2006', '2007-2008', '2009-2010'],
  23. 'session_details': {
  24. '1987-1988': {'years': [1987, 1988], 'sub_sessions': []},
  25. '1989-1990': {'years': [1989, 1990], 'sub_sessions': []},
  26. '1991-1992': {'years': [1991, 1992], 'sub_sessions': []},
  27. '1993-1994': {'years': [1993, 1994], 'sub_sessions': []},
  28. '1995-1996': {'years': [1995, 1996], 'sub_sessions': []},
  29. '1997-1998': {'years': [1997, 1998], 'sub_sessions': []},
  30. '1999-2000': {'years': [1999, 2000], 'sub_sessions': []},
  31. '2001-2002': {'years': [2001, 2003], 'sub_sessions': []},
  32. '2003-2004': {'years': [2003, 2004], 'sub_sessions': []},
  33. '2005-2006': {'years': [2005, 2006], 'sub_sessions': []},
  34. '2007-2008': {'years': [2007, 2008], 'sub_sessions': []},
  35. '2009-2010': {'years': [2009, 2010], 'sub_sessions': []},
  36. }
  37. }
  38. def scrape_session_new(self, chamber, session):
  39. if chamber == "lower":
  40. bill_abbr = "H."
  41. else:
  42. bill_abbr = "S."
  43. bill_list_url = "http://www.leg.state.vt.us/docs/bills.cfm?Session=%s&Body=%s" % (session.split('-')[1], bill_abbr[0])
  44. self.log("Getting bill list for %s %s" % (chamber, session))
  45. bill_list = BeautifulSoup(self.urlopen(bill_list_url))
  46. bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
  47. for bill_link in bill_list.findAll('a', href=bill_link_re):
  48. bill_id = bill_link.string
  49. bill_title = bill_link.parent.findNext('b').string
  50. bill = Bill(session, chamber, bill_id, bill_title)
  51. bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']
  52. info_page = BeautifulSoup(self.urlopen(bill_info_url))
  53. text_links = info_page.findAll('blockquote')[1].findAll('a')
  54. for text_link in text_links:
  55. bill.add_version(text_link.string,
  56. "http://www.leg.state.vt.us" +
  57. text_link['href'])
  58. act_table = info_page.findAll('blockquote')[2].table
  59. for row in act_table.findAll('tr')[1:]:
  60. if row['bgcolor'] == 'Salmon':
  61. act_chamber = 'lower'
  62. else:
  63. act_chamber = 'upper'
  64. action = ""
  65. for s in row.findAll('td')[1].findAll(text=True):
  66. action += s + " "
  67. action = action.strip()
  68. if row.td.a:
  69. act_date = row.td.a.string.split(' ')[0]
  70. else:
  71. act_date = row.td.string.split(' ')[0]
  72. bill.add_action(act_chamber, action, act_date)
  73. sponsors = info_page.find(
  74. text='Sponsor(s):').parent.parent.findAll('b')
  75. bill.add_sponsor('primary', sponsors[0].string)
  76. for sponsor in sponsors[1:]:
  77. bill.add_sponsor('cosponsor', sponsor.string)
  78. self.add_bill(bill)
  79. def scrape_session_old(self, chamber, session):
  80. if chamber == "lower":
  81. bill_abbr = "H."
  82. chamber_name = "House"
  83. other_chamber = "Senate"
  84. else:
  85. bill_abbr = "S."
  86. chamber_name = "Senate"
  87. other_chamber = "House"
  88. start_date = '1/1/%s' % session.split('-')[0]
  89. data = urllib.urlencode({'Date': start_date,
  90. 'Body': bill_abbr[0],
  91. 'Session': session.split('-')[1]})
  92. bill_list_url = "http://www.leg.state.vt.us/database/rintro/results.cfm"
  93. self.log("Getting bill list for %s %s" % (chamber, session))
  94. bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))
  95. bill_link_re = re.compile('.*?Bill=%s.\d+.*' % bill_abbr[0])
  96. for bill_link in bill_list.findAll('a', href=bill_link_re):
  97. bill_id = bill_link.string
  98. bill_title = bill_link.parent.parent.findAll('td')[1].string
  99. bill = Bill(session, chamber, bill_id, bill_title)
  100. info_page = BeautifulSoup(self.urlopen(
  101. "http://www.leg.state.vt.us" + bill_link['href']))
  102. text_links = info_page.findAll('blockquote')[-1].findAll('a')
  103. for text_link in text_links:
  104. bill.add_version(text_link.string,
  105. "http://www.leg.state.vt.us" +
  106. text_link['href'])
  107. sponsors = info_page.find(
  108. text='Sponsor(s):').parent.findNext('td').findAll('b')
  109. bill.add_sponsor('primary', sponsors[0].string)
  110. for sponsor in sponsors[1:]:
  111. bill.add_sponsor('cosponsor', sponsor.string)
  112. # Grab actions from the originating chamber
  113. act_table = info_page.find(
  114. text='%s Status:' % chamber_name).findNext('table')
  115. for row in act_table.findAll('tr')[3:]:
  116. action = row.td.string.replace(' ', '').strip(':')
  117. act_date = row.findAll('td')[1].b.string.replace(' ', '')
  118. if act_date != "":
  119. detail = row.findAll('td')[2].b
  120. if detail and detail.string != "":
  121. action += ": %s" % detail.string.replace(' ', '')
  122. bill.add_action(chamber, action, act_date)
  123. # Grab actions from the other chamber
  124. act_table = info_page.find(
  125. text='%s Status:' % other_chamber).findNext('table')
  126. if act_table:
  127. if chamber == 'upper':
  128. act_chamber = 'lower'
  129. else:
  130. act_chamber = 'upper'
  131. for row in act_table.findAll('tr')[3:]:
  132. action = row.td.string.replace(' ', '').strip(':')
  133. act_date = row.findAll('td')[1].b.string.replace(
  134. ' ', '')
  135. if act_date != "":
  136. detail = row.findAll('td')[2].b
  137. if detail and detail.string != "":
  138. action += ": %s" % detail.string.replace(
  139. ' ', '')
  140. bill.add_action(act_chamber, action, act_date)
  141. self.add_bill(bill)
  142. def scrape_bills(self, chamber, year):
  143. session = "%s-%d" % (year, int(year) + 1)
  144. if session not in self.metadata['session_details']:
  145. raise NoDataForYear(year)
  146. if int(year) >= 2009:
  147. self.scrape_session_new(chamber, session)
  148. else:
  149. self.scrape_session_old(chamber, session)
  150. def scrape_legislators(self, chamber, year):
  151. session = "%s-%d" % (year, int(year) + 1)
  152. if session not in self.metadata['session_details']:
  153. raise NoDataForYear(year)
  154. if __name__ == '__main__':
  155. VTLegislationScraper().run()