PageRenderTime 31ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/fiftystates/scrape/wv/bills.py

https://github.com/runderwood/openstates
Python | 138 lines | 133 code | 3 blank | 2 comment | 1 complexity | ed2ba0145eb9409f27de10d6432bd815 MD5 | raw file
  1. #!/usr/bin/env python
  2. import urllib
  3. import re
  4. import datetime as dt
  5. import urllib2
  6. from BeautifulSoup import BeautifulSoup
  7. from fiftystates.scrape.bills import BillScraper, Bill
  8. def cleansource(data):
  9. '''Remove some irregularities from WV's HTML.
  10. It includes a spurious </HEAD> before the useful data begins and lines like '<option value="Bill"selected="selected">Bill</option>', in which the lack of a space between the attributes confuses BeautifulSoup.
  11. '''
  12. data = data.replace('</HEAD>', '')
  13. return re.sub('(="[^"]+")([a-zA-Z])', r'\1 \2', data)
  14. def cleansponsor(sponsor):
  15. if sponsor.endswith('President)'):
  16. # in the senate:
  17. # Soandso (Salutation President)
  18. return sponsor.split(' ')[0]
  19. if ' Speaker' in sponsor: # leading space in case there is a Rep. Speaker
  20. # in the house:
  21. # Salutation Speaker (Salutation Soandso)
  22. return sponsor.split(' ')[-1][:-1]
  23. return sponsor
  24. def issponsorlink(a):
  25. if 'title' in a:
  26. return (a['title'].startswith('View bills Delegate') or
  27. a['title'].startswith('View bills Senator'))
  28. return False
  29. def sessionexisted(data):
  30. return not re.search('Please choose another session', data)
  31. urlbase = 'http://www.legis.state.wv.us/Bill_Status/%s'
  32. class WVBillScraper(BillScraper):
  33. state = 'wv'
  34. session_abbrevs = 'RS 1X 2X 3X 4X 5X 6X 7X'.split()
  35. def scrape(self, chamber, year):
  36. if int(year) < 1993:
  37. raise NoDataForPeriod
  38. for session in self.session_abbrevs:
  39. if not self.scrape_session(chamber, session, year):
  40. return
  41. def scrape_session(self, chamber, session, year):
  42. if chamber == 'upper':
  43. c = 's'
  44. else:
  45. c = 'h'
  46. q = 'Bills_all_bills.cfm?year=%s&sessiontype=%s&btype=bill&orig=%s' % (
  47. year, session, c)
  48. try:
  49. with self.urlopen(urlbase % q) as data:
  50. if not sessionexisted(data):
  51. return False
  52. soup = BeautifulSoup(cleansource(data))
  53. rows = soup.findAll('table')[1].findAll('tr')[1:]
  54. for row in rows:
  55. histlink = urlbase % row.td.a['href']
  56. billid = row.td.a.contents[0].contents[0]
  57. self.scrape_bill(chamber, session, billid, histlink, year)
  58. return True
  59. except urllib2.HTTPError as e:
  60. if e.code == 500:
  61. # Nonexistent session
  62. return False
  63. else:
  64. raise e
  65. def scrape_bill(self, chamber, session, billid, histurl, year):
  66. if year[0] != 'R':
  67. session = year
  68. else:
  69. session = self.metadata['session_details'][year][
  70. 'sub_sessions'][int(year[0]) - 1]
  71. with self.urlopen(histurl) as data:
  72. soup = BeautifulSoup(cleansource(data))
  73. basicinfo = soup.findAll('div', id='bhistleft')[0]
  74. hist = basicinfo.table
  75. sponsor = None
  76. title = None
  77. for b in basicinfo.findAll('b'):
  78. if b.next.startswith('SUMMARY'):
  79. title = b.findNextSiblings(text=True)[0].strip()
  80. elif b.next.startswith('SPONSOR'):
  81. for a in b.findNextSiblings('a'):
  82. if not issponsorlink(a):
  83. break
  84. sponsor = cleansponsor(a.contents[0])
  85. bill = Bill(session, chamber, billid, title)
  86. if sponsor:
  87. bill.add_sponsor('primary', sponsor)
  88. for row in hist.findAll('tr'):
  89. link = row.td.a
  90. vlink = urlbase % link['href']
  91. vname = link.contents[0].strip()
  92. bill.add_version(vname, vlink)
  93. history = soup.findAll('div', id='bhisttab')[0].table
  94. rows = history.findAll('tr')[1:]
  95. for row in rows:
  96. tds = row.findAll('td')
  97. if len(tds) < 2:
  98. # This is not actually an action
  99. continue
  100. date, action = row.findAll('td')[:2]
  101. date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
  102. action = action.contents[0].strip()
  103. if 'House' in action:
  104. actor = 'lower'
  105. elif 'Senate' in action:
  106. actor = 'upper'
  107. else: # for lack of a better
  108. actor = chamber
  109. bill.add_action(actor, action, date)
  110. self.save_bill(bill)