
/fiftystates/scrape/nh/bills.py

https://github.com/runderwood/openstates
Python | 114 lines
#!/usr/bin/env python
import urllib
import re

from BeautifulSoup import BeautifulSoup

from fiftystates.scrape import NoDataForPeriod
from fiftystates.scrape.bills import Bill, BillScraper


class NHBillScraper(BillScraper):
    state = 'nh'

    def get_bill_text(self, url):
        # pull the href target out of a BeautifulSoup anchor tag
        regexp = re.compile("href=\"(\S*)\"")
        bill_url = regexp.search(str(url))
        return bill_url.group(1)

    def add_bill_sponsors(self, url):
        # currently only extracts the "Bill Status" link target;
        # sponsorship itself is not yet recorded (see todo below)
        regexp = re.compile("href=\"(\S*)\"")
        sponsor_url = regexp.search(str(url))
        sponsor_url = sponsor_url.group(1)
    def scrape(self, chamber, year):
        if year == '2009':
            # the 2009-2010 session spans two calendar years
            self.scrape_year(chamber, '2009', '2009-2010')
            self.scrape_year(chamber, '2010', '2009-2010')
        else:
            raise NoDataForPeriod(year)

    def scrape_year(self, chamber, year, session):
        # NH Senate (upper chamber) bills are prefixed "SB",
        # House (lower chamber) bills "HB"
        if chamber == 'upper':
            chamber_abbr = 'S'
        elif chamber == 'lower':
            chamber_abbr = 'H'
        # search form parameters (sent as a query string to the results page)
        values = [('txtsessionyear', year),
                  ('txttitle', ''),
                  ('txtlsrnumber', ''),
                  ('Submit1', 'Submit')]
        params = urllib.urlencode(values)
        search_url = 'http://www.gencourt.state.nh.us/bill_status/Results.aspx'

        # request page with list of all bills in year
        with self.urlopen(search_url + '?' + params) as doc:
            soup = BeautifulSoup(doc)

        # parse results
        bills = soup.find("table", {"class": "ptable"})  # currently unused
        trs = soup.findAll("tr")
        # go through the table rows with relevant data: the first relevant
        # row is at index 8 and each bill occupies 11 rows
        tr_start = 8
        tr_hop = 11
        i = 0
        while (tr_start + (tr_hop * i)) < len(trs):
            tr = trs[tr_start + (tr_hop * i)]
            i = i + 1

            # strip extra white space from the bill's display name
            raw_id = tr.find("big").string.strip()
            exp = re.compile("^(\w*)")
            bill_id = exp.search(raw_id).group(1)

            # check to see if it's in the proper chamber
            exp = re.compile("^" + chamber_abbr)
            if exp.search(bill_id) is None:
                continue  # in wrong house

            # check to see it is a bill and not a resolution
            exp = re.compile("B")
            if exp.search(bill_id) is None:
                continue  # not a bill

            # append the bill_id suffix if one exists
            exp = re.compile("(-\w*)$")
            res = exp.search(raw_id)
            if res is not None:
                bill_id = bill_id + res.group(1)

            # get bill title
            title = tr.findAll("b")[0]
            bill_title = title.nextSibling.string
            bill_title = bill_title.strip()
            bill_title = bill_title.encode('ascii', 'xmlcharrefreplace')
            # grab url of bill text and dispatch the other row links
            urls = tr.findAll("a")
            textexp = re.compile("Bill Text")
            textdoc = re.compile("Bill Docket")
            textstat = re.compile("Bill Status")
            textcall = re.compile("Roll Calls")
            textaudio = re.compile("Audio Files")
            bill_url = None  # avoid carrying a stale url over from a previous row
            for url in urls:
                if textexp.search(str(url.string)) is not None:
                    bill_url = self.get_bill_text(url)
                if textdoc.search(str(url.string)) is not None:
                    pass
                if textstat.search(str(url.string)) is not None:
                    self.add_bill_sponsors(url)
                if textcall.search(str(url.string)) is not None:
                    pass
                if textaudio.search(str(url.string)) is not None:
                    pass

            bill = Bill(session, chamber, bill_id, bill_title)
            if bill_url:
                bill.add_version("Bill text", bill_url)
            bill.add_source(search_url)
            self.save_bill(bill)

            # grabs sponsorship
            # todo: add sponsorship, audio, actions
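
A minimal sketch of how the sponsorship todo might be filled in. This is an assumption-laden illustration, not part of the scraper above: it assumes the "Bill Status" page lists sponsor names as links inside a "ptable" table (unverified against gencourt.state.nh.us) and that the fiftystates Bill class exposes an add_sponsor() helper. In this sketch add_bill_sponsors() returns a list of names, which the caller could attach after constructing the Bill.

    # hypothetical replacement for add_bill_sponsors(); the status-page
    # markup assumed below has not been checked against the live site
    def add_bill_sponsors(self, url):
        regexp = re.compile("href=\"(\S*)\"")
        match = regexp.search(str(url))
        if match is None:
            return []
        sponsor_url = match.group(1)

        # fetch the bill status page and parse it
        with self.urlopen(sponsor_url) as doc:
            soup = BeautifulSoup(doc)

        # assumed markup: sponsor names are anchor text inside a "ptable"
        table = soup.find("table", {"class": "ptable"})
        if table is None:
            return []
        return [a.string.strip() for a in table.findAll("a") if a.string]

The caller in scrape_year() would then collect sponsors = self.add_bill_sponsors(url) in the link loop and, once the Bill is created, call bill.add_sponsor("primary", name) for each name; primary versus cosponsor cannot be distinguished without knowing the actual page layout.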