PageRenderTime 48ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/nh/get_legislation.py

https://github.com/BRIMIL01/fiftystates
Python | 118 lines | 106 code | 9 blank | 3 comment | 2 complexity | 9b1c17f1735fa476e73e36063afe8b0f MD5 | raw file
  1. #!/usr/bin/python
  2. import urllib, urllib2
  3. import unicodedata;
  4. import re
  5. from BeautifulSoup import BeautifulSoup
  6. # ugly hack
  7. import sys
  8. sys.path.append('./scripts')
  9. from pyutils.legislation import LegislationScraper, NoDataForYear
  10. class NHLegislationScraper(LegislationScraper):
  11. state='nh'
  12. def get_bill_text(self, url):
  13. regexp = re.compile("href=\"(\S*)\"")
  14. bill_url = regexp.search(str(url))
  15. return bill_url.group(1)
  16. def add_bill_sponsors(self, url):
  17. regexp = re.compile("href=\"(\S*)\"")
  18. sponsor_url = regexp.search(str(url))
  19. sponsor_url= sponsor_url.group(1)
  20. def scrape_bills(self, chamber, year):
  21. if chamber == 'upper':
  22. chamber_abbr = 'H'
  23. elif chamber == 'lower':
  24. chamber_abbr = 'S'
  25. #set up POST data
  26. values = [('txtsessionyear',year),
  27. ('txttitle',''),
  28. ('txtlsrnumber',''),
  29. ('Submit1','Submit')]
  30. params = urllib.urlencode(values)
  31. search_url='http://www.gencourt.state.nh.us/bill_status/Results.aspx'
  32. #request page with list of all bills in year
  33. req = urllib2.Request(search_url, params)
  34. response = urllib2.urlopen(req)
  35. doc = response.read()
  36. soup = BeautifulSoup(doc)
  37. #parse results
  38. bills = soup.find("table",{"class":"ptable"})
  39. trs = soup.findAll("tr")
  40. #go through all of the table rows with relevant data
  41. tr_start = 8
  42. tr_hop = 11
  43. i = 0
  44. while (tr_start+(tr_hop*i)) < len(trs):
  45. tr = trs[tr_start+(tr_hop*i)]
  46. i = i + 1
  47. #strip off extra white space from name
  48. id = tr.find("big").string.strip()
  49. bill_id = tr.find("big").string.strip()
  50. exp = re.compile("^(\w*)")
  51. bill_id = exp.search(id).group(1)
  52. #check to see if its in the proper chamber
  53. exp = re.compile("^"+chamber_abbr)
  54. if exp.search(bill_id) == None:
  55. continue #in wrong house
  56. #check to see it is a bill and not a resolution
  57. exp = re.compile("B")
  58. if exp.search(bill_id) == None:
  59. continue #not a bill
  60. #get bill_id suffix if exists
  61. exp = re.compile("(-\w*)$")
  62. res = exp.search(id)
  63. if res != None:
  64. bill_id = bill_id + res.group(1)
  65. #get bill title
  66. title = tr.findAll("b")[0]
  67. #print title
  68. bill_title = title.nextSibling.string;
  69. bill_title = bill_title.strip()
  70. bill_title = bill_title.encode('ascii','xmlcharrefreplace')
  71. #grab url of bill text
  72. urls = tr.findAll("a")
  73. textexp = re.compile("Bill Text")
  74. textdoc = re.compile("Bill Docket")
  75. textstat = re.compile("Bill Status")
  76. textcall = re.compile("Roll Calls")
  77. textaudio = re.compile("Audio Files")
  78. for url in urls:
  79. if textexp.search(str(url.string)) != None:
  80. bill_url = self.get_bill_text(url)
  81. if textdoc.search(str(url.string)) != None:
  82. pass
  83. if textstat.search(str(url.string)) != None:
  84. add_bill_sponsors()
  85. if textcall.search(str(url.string)) != None:
  86. pass
  87. if textaudio.search(str(url.string)) != None:
  88. pass
  89. self.add_bill(chamber,year,bill_id,bill_title)
  90. self.add_bill_version(chamber,year,bill_id,"bill text",bill_url)
  91. #grabs sponsorship
  92. #todo: add sponsorship, audio, actions
  93. if __name__ == '__main__':
  94. NHLegislationScraper().run()