PageRenderTime 52ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/fiftystates/scrape/ct/bills.py

https://github.com/runderwood/openstates
Python | 293 lines | 269 code | 17 blank | 7 comment | 15 complexity | 076aaef30c13d032f74796954ecd2b78 MD5 | raw file
import re
import string

from BeautifulSoup import BeautifulSoup

from fiftystates.scrape import NoDataForPeriod
from fiftystates.scrape.bills import Bill, BillScraper
from fiftystates.scrape.votes import Vote
class CTBillScraper(BillScraper):
    #some constants
    # state abbreviation used by the fiftystates framework
    state = 'ct'
    # earliest year with bill-status data on cga.ct.gov
    min_year = 1991
    # Senate (upper) bills are numbered 5001-9999
    upper_bill_no_min = 5001
    upper_bill_no_max = 9999
    # House (lower) bills are numbered 1-5000
    lower_bill_no_min = 1
    lower_bill_no_max = 5000
  14. def scrape(self,chamber,year):
  15. if year < self.min_year:
  16. raise NoDataForPeriod(year)
  17. if chamber == 'upper':
  18. min = self.upper_bill_no_min
  19. max = self.upper_bill_no_max
  20. elif chamber == 'lower':
  21. min = self.lower_bill_no_min
  22. max = self.lower_bill_no_max
  23. for i in range(min,max+1):
  24. #obtain html
  25. index_file ='http://cga.ct.gov/asp/cgabillstatus/cgabillstatus.asp?selBillType=Bill&bill_num=%d&which_year=%s'\
  26. %(i,year)
  27. with self.urlopen(index_file) as doc:
  28. soup = BeautifulSoup(cleanup_html(doc))
  29. #check to see legislation exists
  30. if soup.find("div",{"class":"CGASubHeader"}) == None:
  31. continue #bill does not exist
  32. else:
  33. #find bill title
  34. tables = soup.find("table",{"class":"CGABlackOnWhite"})
  35. bill_title = tables.findAll("td")[1].contents[2]
  36. bill_summary = tables.findAll("td")[1].contents[5]
  37. bill = Bill(year,chamber,i, bill_title, summary=bill_summary)
  38. #find urls of bill versions
  39. self.add_bill_versions(bill,soup)
  40. #add sponsors
  41. self.add_bill_sponsors(bill,soup)
  42. self.add_bill_actions(bill,soup)
  43. self.add_bill_votes(bill,chamber,soup)
  44. self.save_bill(bill)
  45. def add_bill_sponsors(self,bill,soup):
  46. #add primary sponsors
  47. sponsors = soup.find("table",{"class":"CGABlackOnWhite"}).findAll("td")[2]
  48. num_sponsors = len(sponsors)/2
  49. for i in range(num_sponsors):
  50. name = sponsors.contents[(i+1)*2]
  51. if i == 0:
  52. name = name.replace("Introduced by:","")
  53. name = name.strip()
  54. if (len(name) > 0):
  55. bill.add_sponsor("primary",sponsors.contents[(i+1)*2])
  56. #find cosponsors, if exists
  57. for td in soup.findAll('td'): #ug, this is slow, can I be more efficient here?
  58. if (td.string != None) and (td.string.find("Co-sponsors of") > -1):
  59. contents = td.findAllNext('tr')[0].find('td').contents
  60. for item in contents:
  61. item = item.string
  62. if item != None:
  63. if item.find("<br") < 0:
  64. item = item.strip()
  65. if len(item) > 0:
  66. bill.add_sponsor("cosponsor",item)
  67. def add_bill_versions(self,bill,soup):
  68. ahrefs = soup.find("table",{"id":"CGABillText"}).findAll("a")
  69. for href in ahrefs:
  70. if(href.string != "[pdf]"):
  71. regexp = re.compile("href=\"(\S*)\"")
  72. bill_url = regexp.search(str(href))
  73. bill_url = bill_url.group(1)
  74. bill_url = "http://cga.ct.gov"+bill_url
  75. bill.add_version(href.string,bill_url)
  76. def add_bill_actions(self,bill,soup):
  77. for td in soup.findAll('td'): #ug, this is slow, can I be more efficient here?
  78. if (td.string != None) and (td.string.find("Bill History") > -1):
  79. action_table = td.findAllNext('table')[0].findAll('table')[1]
  80. for action in action_table.findAll("tr"):
  81. date = action.contents[2].string
  82. action = action.contents[len(action.contents)-1].string
  83. bill.add_action('',action,date)
  84. def add_bill_votes(self,bill,chamber,soup):
  85. #not all bills have votes
  86. vote_table = soup.find("table",{"id":"CGAVotes"})
  87. if vote_table != None:
  88. for line in vote_table:
  89. line = str(line)
  90. line = line.strip()
  91. if (len(line) > 0) and (line.find("CGAWhiteOnBlue") < 0):
  92. regexp = re.compile("href=\"(\S*)\"")
  93. vote_url = regexp.search(line)
  94. vote_url = vote_url.group(1)
  95. vote_url = "http://cga.ct.gov"+vote_url
  96. vote = self.scrape_votes(vote_url,chamber)
  97. print 'getting a vote', vote
  98. bill.add_vote(vote)
  99. #url is the url where the vote info page is. Returns Vote object
  100. def scrape_votes(self,url,chamb):
  101. with self.urlopen(url) as doc:
  102. soup = BeautifulSoup(doc)
  103. date=None
  104. motion=None
  105. yeas=None
  106. neas=None
  107. others=None
  108. passed=None
  109. chamber=chamb
  110. necessary=None
  111. vote=None
  112. fonts = soup.findAll('font')
  113. span = soup.findAll('span')
  114. if (len(fonts) + (len(span))) > 4: #data is vaguely structured
  115. if (len(fonts) < 4):
  116. fonts = span
  117. for line in fonts:
  118. #this could be sped up.
  119. line = str(line.contents[0])
  120. line = line.strip()
  121. if line.find("Taken on") > -1:
  122. #then the text is in the form of: "Take on <date> <reason>"
  123. split = line.split(None,3)
  124. date = split[2]
  125. if (len(split) > 3):
  126. motion=split[3]
  127. elif line.find("Those voting Yea") > -1:
  128. yeas = self.get_num_from_line(line)
  129. elif line.find("Those voting Nay") > -1:
  130. neas = self.get_num_from_line(line)
  131. elif line.find("Those absent and not voting") > -1:
  132. others = self.get_num_from_line(line)
  133. elif (line.find("Necessary for Adoption") > -1) or (line.find("Necessary for Passage") > -1):
  134. necessary = self.get_num_from_line(line)
  135. if yeas >= necessary:
  136. passed = True
  137. else:
  138. passed = False
  139. vote = Vote(chamber,date,motion,passed,yeas,neas,others)
  140. #figure out who voted for what
  141. table = soup.findAll('table')
  142. tds = table[len(table)-1].findAll('td')#get the last table
  143. vote_value = None
  144. digits = re.compile('^[\d ]+$')
  145. for cell in tds:
  146. string = cell.find('font')
  147. if (string == None):
  148. string = cell.find('span') #either we are looking at fonts or spans
  149. if (string != None):
  150. string = string.contents[0]
  151. string = string.strip()
  152. else:
  153. string = ''
  154. if (len(string) > 0) and (digits.search(string) == None):
  155. if vote_value == None:
  156. if (string == 'Y') or (string == 'N'):
  157. vote_value = string
  158. elif (string == 'X') or (string == 'A'):
  159. vote_value = 'X'
  160. else:
  161. if vote_value == 'Y':
  162. vote.yes(string)
  163. elif vote_value == 'N':
  164. vote.no(string)
  165. else:
  166. vote.other(string)
  167. vote_value = None
  168. else:
  169. #data is mostly unstructured. Have to sift through a string
  170. data = soup.find('pre')
  171. lines = data.contents[len(data.contents)-1]
  172. lines = lines.strip()
  173. exp = re.compile(r'\n+|\r+|\f+')
  174. lines = exp.split(lines)
  175. names = []
  176. for i in range(len(lines)):
  177. line = lines[i].strip()
  178. if line.find("Taken on") > -1:
  179. #then the text is in the form of: "Take on <date> <reason>"
  180. split = line.split(None,3)
  181. date = split[2]
  182. if (len(split) > 3):
  183. motion=split[3]
  184. elif line.find("Those voting Yea") > -1:
  185. yeas = self.get_num_from_line(line)
  186. elif line.find("Those voting Nay") > -1:
  187. neas = self.get_num_from_line(line)
  188. elif line.find("Those absent and not voting") > -1:
  189. others = self.get_num_from_line(line)
  190. elif (line.find("Necessary for Adoption") > -1) or (line.find("Necessary for Passage") > -1):
  191. if (line.find("Adoption") > -1):
  192. motion="Adoption"
  193. else:
  194. motion="Passage"
  195. necessary = self.get_num_from_line(line)
  196. elif (line.find("The following is the roll call vote:") > -1):
  197. break #the next lines contain actual votes
  198. #process the vote values
  199. if yeas >= necessary:
  200. passed = True
  201. else:
  202. passed = False
  203. vote = Vote(chamber,date,motion,passed,yeas,neas,others)
  204. lines = lines[i+1:]
  205. lines = string.join(lines,' ')
  206. lines = lines.split(' ')
  207. absent_vote_value = re.compile('^(X|A)$')
  208. yea_vote_value = re.compile('^Y$')
  209. nea_vote_value = re.compile('^N$')
  210. #there aren't two spaces between vote and name so it doesn't get parsed
  211. annoying_vote = re.compile('^(Y|X|A|N) ([\S ]+)$')
  212. digits = re.compile('^[\d ]+$')
  213. vote_value = None
  214. for word in lines:
  215. word = word.strip()
  216. if (len(word) > 0) and (digits.search(word) == None):
  217. word = strip_digits(word)
  218. if vote_value != None:
  219. if vote_value == 'Y':
  220. vote.yes(word)
  221. elif vote_value == 'N':
  222. vote.no(word)
  223. else:
  224. vote.other(word)
  225. vote_value = None
  226. elif absent_vote_value.match(word) != None:
  227. vote_value = 'X'
  228. elif yea_vote_value.match(word) != None:
  229. vote_value = 'Y'
  230. elif nea_vote_value.match(word) != None:
  231. vote_value = 'N'
  232. elif annoying_vote.match(word) != None:
  233. split = annoying_vote.match(word)
  234. vote_value = split.group(2)
  235. name = split.group(1)
  236. if vote_value == 'Y':
  237. vote.yes(name)
  238. elif vote_value == 'N':
  239. vote.no(name)
  240. else:
  241. vote.other(name)
  242. vote_value = None
  243. return vote
  244. def get_num_from_line(self,line):
  245. num = re.match('[\D]*(\d+)\w*$',line)
  246. num = num.group(1)
  247. return int(num)
  248. #all CT bill pages have a bug in them (three quotation marks in a row
  249. #in one of the href tags) that crashes beautiful soup. This funciton
  250. #returns a version of the html without this bug
  251. def cleanup_html(html):
  252. return html.replace('"""','"')
  253. #stripts digits at beginning of string
  254. def strip_digits(string):
  255. old = string
  256. exp = re.compile("^\d* *([\w.,'\s]+)$")
  257. string = exp.search(string)
  258. if string == None:
  259. return old
  260. return string.group(1).strip()