
/openstates/ut/bills.py

https://github.com/JoeGermuska/openstates
import re
import datetime
import logging

from billy.scrape.bills import BillScraper, Bill
from billy.scrape.votes import Vote

import lxml.html
import scrapelib

logger = logging.getLogger('openstates')

SUB_BLACKLIST = [
    "Second Substitute",
    "Third Substitute",
    "Fourth Substitute",
    "Fifth Substitute",
    "Sixth Substitute",
    "Seventh Substitute",
    "Eighth Substitute",
    "Ninth Substitute",
    "Substitute",  # Must stay last, so "Second Substitute" etc. are
                   # stripped whole rather than leaving "Second" behind.
]  # Pages are the same; we'll strip these designations from bills we catch.
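# A hypothetical example of the normalization this enables in scrape_bill
# below: "Second Substitute H.B. 12" has its blacklisted designation
# stripped and the whitespace collapsed, yielding "H.B. 12".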

class UTBillScraper(BillScraper):
    jurisdiction = 'ut'

    def accept_response(self, response):
        # Check for rate-limit pages.
        normal = super(UTBillScraper, self).accept_response(response)
        return (normal and
                'com.microsoft.jdbc.base.BaseSQLException' not in
                response.text and
                'java.sql.SQLException' not in
                response.text)
        # The UT site has been throwing a lot of transient DB errors;
        # rejecting those responses makes scrapelib back off and retry
        # whenever their site has an issue. It seems to happen often enough.

    def scrape(self, chamber, session):
        self.validate_session(session)

        if chamber == 'lower':
            bill_abbrs = r'HB|HCR|HJ|HR'
        else:
            bill_abbrs = r'SB|SCR|SJR|SR'
        bill_list_re = r'(%s).*ht\.htm' % bill_abbrs

        bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
            session.replace(' ', ''))

        page = self.urlopen(bill_list_url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_list_url)

        for link in page.xpath('//a'):
            if "href" not in link.attrib:
                continue  # XXX: There are some funky <a> tags here.

            if re.search(bill_list_re, link.attrib['href']):
                self.scrape_bill_list(chamber, session,
                                      link.attrib['href'])

    def scrape_bill_list(self, chamber, session, url):
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath('//a[contains(@href, "billhtm")]'):
            bill_id = link.xpath('string()').strip()
            self.scrape_bill(chamber, session, bill_id,
                             link.attrib['href'])

    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = self.urlopen(url)
        except scrapelib.HTTPError:
            self.warning("couldn't open %s, skipping bill" % url)
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        header = page.xpath('//h3/br')[0].tail.replace('&nbsp;', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
            bill_type = ['resolution']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']
        else:
            # Fallback so an unrecognized prefix doesn't raise a
            # NameError when bill_type is used below.
            bill_type = ['bill']

        # Strip substitute designations so revised bills keep one bill ID.
        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):
            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'], mimetype="text/html")
            next_link = link.getnext()
            if next_link.text == "PDF":
                bill.add_version(name, next_link.attrib['href'],
                                 mimetype="application/pdf")

        for link in page.xpath(
                "//a[contains(@href, 'fnotes') and text() = 'HTML']"):
            bill.add_document("Fiscal Note", link.attrib['href'])

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill['subjects'] = subjects

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)

    def parse_status(self, bill, url):
        page = self.urlopen(url)
        bill.add_source(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        uniqid = 0

        for row in page.xpath('//table/tr')[1:]:
            uniqid += 1
            date = row.xpath('string(td[1])')
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
            action = row.xpath('string(td[2])')
            actor = bill['chamber']

            # An action cell may carry an actor prefix before a slash,
            # e.g. "House/ ..." or "Senate/ ...".
            if '/' in action:
                actor = action.split('/')[0].strip()

                if actor == 'House':
                    actor = 'lower'
                elif actor == 'Senate':
                    actor = 'upper'
                elif actor == 'LFA':
                    actor = 'Office of the Legislative Fiscal Analyst'

                action = '/'.join(action.split('/')[1:]).strip()

            if action == 'Governor Signed':
                actor = 'executive'
                type = 'governor:signed'
            elif action == 'Governor Vetoed':
                actor = 'executive'
                type = 'governor:vetoed'
            elif action.startswith('1st reading'):
                type = ['bill:introduced', 'bill:reading:1']
            elif action == 'to Governor':
                type = 'governor:received'
            elif action == 'passed 3rd reading':
                type = 'bill:passed'
            elif action.startswith('passed 2nd & 3rd readings'):
                type = 'bill:passed'
            elif action == 'to standing committee':
                comm_link = row.xpath("td[3]/font/font/a")[0]
                comm = re.match(
                    r"writetxt\('(.*)'\)",
                    comm_link.attrib['onmouseover']).group(1)
                action = "to " + comm
                type = 'committee:referred'
            elif action.startswith('2nd reading'):
                type = 'bill:reading:2'
            elif action.startswith('3rd reading'):
                type = 'bill:reading:3'
            elif action == 'failed':
                type = 'bill:failed'
            elif action.startswith('2nd & 3rd readings'):
                type = ['bill:reading:2', 'bill:reading:3']
            elif action == 'passed 2nd reading':
                type = 'bill:reading:2'
            elif 'Comm - Favorable Recommendation' in action:
                type = 'committee:passed:favorable'
            elif action == 'committee report favorable':
                type = 'committee:passed:favorable'
            else:
                type = 'other'

            bill.add_action(actor, action, date, type=type,
                            _vote_id=uniqid)

            # Check whether this action has a linked vote.
            vote_links = row.xpath('./td[4]//a')
            for vote_link in vote_links:
                vote_url = vote_link.attrib['href']

                # Plain-text URLs hold floor roll calls; anything else is
                # an HTML vote page, which may in turn be routed to the
                # committee-vote parser.
                if not vote_url.endswith('txt'):
                    self.parse_html_vote(bill, actor, date, action,
                                         vote_url, uniqid)
                else:
                    self.parse_vote(bill, actor, date, action,
                                    vote_url, uniqid)
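
    # scrape_committee_vote below appears to assume a committee roll-call
    # page laid out roughly like this (a hypothetical sketch, inferred from
    # its XPath expressions): the first <table> holds one row whose cells
    # each begin with a bold heading whose tail carries "- N" for the
    # count, followed by voter names separated by <br> tags.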
    def scrape_committee_vote(self, bill, actor, date, motion, url, uniqid):
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        committee = page.xpath("//b")[0].text_content()
        votes = page.xpath("//table")[0]
        rows = votes.xpath(".//tr")[0]
        yno = rows.xpath(".//td")
        if len(yno) < 3:
            yes = yno[0]
            no, other = None, None
        else:
            yes, no, other = rows.xpath(".//td")

        def proc_block(obj):
            if obj is None:
                return {
                    "type": None,
                    "count": None,
                    "votes": []
                }
            typ = obj.xpath("./b")[0].text_content()
            count = obj.xpath(".//b")[0].tail.replace("-", "").strip()
            count = int(count)
            votes = []
            for vote in obj.xpath(".//br"):
                vote = vote.tail
                if vote:
                    vote = vote.strip()
                    votes.append(vote)
            return {
                "type": typ,
                "count": count,
                "votes": votes
            }

        vote_dict = {
            "yes": proc_block(yes),
            "no": proc_block(no),
            "other": proc_block(other),
        }

        yes_count = vote_dict['yes']['count']
        no_count = vote_dict['no']['count'] or 0
        other_count = vote_dict['other']['count'] or 0

        vote = Vote(
            actor,
            date,
            motion,
            (yes_count > no_count),
            yes_count,
            no_count,
            other_count,
            _vote_id=uniqid)
        vote.add_source(url)

        # Vote exposes yes/no/other methods matching the vote_dict keys.
        for key in vote_dict:
            for voter in vote_dict[key]['votes']:
                getattr(vote, key)(voter)

        bill.add_vote(vote)

    def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        descr = page.xpath("//b")[0].text_content()
        if "on voice vote" in descr:
            return
        if "committee" in descr.lower():
            return self.scrape_committee_vote(
                bill, actor, date, motion, url, uniqid
            )

        passed = None
        if "Passed" in descr:
            passed = True
        elif "Failed" in descr:
            passed = False
        elif "UTAH STATE LEGISLATURE" in descr:
            return
        else:
            logger.warning(descr)
            raise NotImplementedError("Can't see if we passed or failed")

        headings = page.xpath("//b")[1:]
        votes = page.xpath("//table")
        sets = zip(headings, votes)
        vdict = {}
        for (typ, votes) in sets:
            txt = typ.text_content()
            arr = [x.strip() for x in txt.split("-", 1)]
            if len(arr) != 2:
                continue
            v_txt, count = arr
            v_txt = v_txt.strip()
            count = int(count)
            people = [x.text_content().strip() for x in
                      votes.xpath(".//font[@face='Arial']")]
            vdict[v_txt] = {
                "count": count,
                "people": people
            }

        vote = Vote(actor, date,
                    motion,
                    passed,
                    vdict['Yeas']['count'],
                    vdict['Nays']['count'],
                    vdict['Absent or not voting']['count'],
                    _vote_id=uniqid)
        vote.add_source(url)

        for person in vdict['Yeas']['people']:
            vote.yes(person)
        for person in vdict['Nays']['people']:
            vote.no(person)
        for person in vdict['Absent or not voting']['people']:
            vote.other(person)

        logger.info(vote)
        bill.add_vote(vote)

    def parse_vote(self, bill, actor, date, motion, url, uniqid):
        page = self.urlopen(url)
        bill.add_source(url)
        vote_re = re.compile(r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                             r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
                             r'(\d+)(.*)',
                             re.MULTILINE | re.DOTALL)
        match = vote_re.search(page)
        yes_count = int(match.group(1))
        no_count = int(match.group(3))
        other_count = int(match.group(6))
        passed = yes_count > no_count

        if actor == 'upper' or actor == 'lower':
            vote_chamber = actor
            vote_location = ''
        else:
            vote_chamber = ''
            vote_location = actor

        vote = Vote(vote_chamber, date,
                    motion, passed, yes_count, no_count,
                    other_count,
                    location=vote_location,
                    _vote_id=uniqid)
        vote.add_source(url)

        # Names within each block are separated by runs of whitespace.
        yes_votes = re.split(r'\s{2,}', match.group(2).strip())
        no_votes = re.split(r'\s{2,}', match.group(4).strip())
        other_votes = re.split(r'\s{2,}', match.group(7).strip())

        for yes in yes_votes:
            if yes:
                vote.yes(yes)
        for no in no_votes:
            if no:
                vote.no(no)
        for other in other_votes:
            if other:
                vote.other(other)

        bill.add_vote(vote)
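
A minimal, self-contained sketch of the plain-text roll-call layout that
parse_vote's regular expression assumes. The sample text and names here are
hypothetical, not taken from the Utah site:

    import re

    vote_re = re.compile(r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                         r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
                         r'(\d+)(.*)',
                         re.MULTILINE | re.DOTALL)

    # Hypothetical roll-call text in the layout the regex expects.
    sample = ("YEAS - 2    Adams, J.  Baker, K.\n"
              "NAYS - 1    Clark, L.\n"
              "ABSENT OR NOT VOTING - 1    Davis, M.\n")

    match = vote_re.search(sample)
    print(int(match.group(1)), int(match.group(3)), int(match.group(6)))
    # -> 2 1 1
    print(re.split(r'\s{2,}', match.group(2).strip()))
    # -> ['Adams, J.', 'Baker, K.']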