
/scrapers/ne/bills.py

https://github.com/mattgrayson/openstates
import pytz
import urllib.parse
from datetime import datetime

from openstates.scrape import Scraper, Bill, VoteEvent
from utils import LXMLMixin

TIMEZONE = pytz.timezone("US/Central")

# Vote pages only record "yes"/"no" explicitly; anything else falls back to
# "other" where this map is consulted in scrape_votes().
VOTE_TYPE_MAP = {"yes": "yes", "no": "no"}


class NEBillScraper(Scraper, LXMLMixin):
    def scrape(self, session=None):
        if session is None:
            session = self.jurisdiction.legislative_sessions[-1]
            self.info("no session specified, using %s", session["identifier"])
        else:
            session = next(
                (
                    item
                    for item in self.jurisdiction.legislative_sessions
                    if item["identifier"] == session
                ),
                None,
            )

        start_year = datetime.strptime(session["start_date"], "%Y-%m-%d").year
        end_year = datetime.strptime(session["end_date"], "%Y-%m-%d").year
        yield from self.scrape_year(session["identifier"], start_year)
        if start_year != end_year:
            yield from self.scrape_year(session["identifier"], end_year)

    def scrape_year(self, session, year):
        main_url = (
            "https://nebraskalegislature.gov/bills/search_by_date.php?"
            "SessionDay={}".format(year)
        )
        page = self.lxmlize(main_url)

        document_links = self.get_nodes(
            page,
            '//div[@class="main-content"]//div[@class="table-responsive"]//'
            'table[@class="table"]/tbody/tr/td[1]/a',
        )

        for document_link in document_links:
            # bill_number = document_link.text
            bill_link = document_link.attrib["href"]

            # POST request for search form
            # post_dict = {'DocumentNumber': bill_number, 'Legislature': session}
            # headers = urllib.urlencode(post_dict)
            # bill_resp = self.post('http://nebraskalegislature.gov/bills/'
            #                       'search_by_number.php', data=post_dict)
            # bill_link = bill_resp.url
            # bill_page = bill_resp.text

            yield from self.bill_info(bill_link, session, main_url)
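    # Illustrative note on scrape_year() above: for year=2023 the listing URL it
    # builds is https://nebraskalegislature.gov/bills/search_by_date.php?SessionDay=2023
    # (a hypothetical year, shown only to make the query format concrete).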

    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page, '//div[@class="main-content"]//h2'
        ).text.split()
        bill_number = long_title[0]
        title = ""
        for x in range(2, len(long_title)):
            title += long_title[x] + " "
        title = title[0:-1]

        if not title:
            self.error("no title, skipping %s", bill_number)
            return

        bill_type = "resolution" if "LR" in bill_number else "bill"

        bill = Bill(bill_number, session, title, classification=bill_type)
        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            "//body/div[3]/div[2]/div[2]/div/div[3]/div[1]/ul/li[1]/a[1]/text()",
        )

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                "//body/div[3]/div[2]/div[2]/div/div[2]/div[1]/ul/li[1]/text()",
            )
            introduced_by = introduced_by.split("Introduced By:")[1].strip()

        introduced_by = introduced_by.strip()
        bill.add_sponsorship(
            name=introduced_by,
            entity_type="person",
            primary=True,
            classification="primary",
        )

        action_nodes = self.get_nodes(
            bill_page, '//div[@class="main-content"]/div[5]//table/tbody/tr'
        )

        # Default chamber in case a bill page lists no actions at all; also
        # passed to scrape_votes() below.
        actor = "legislature"
        for action_node in action_nodes:
            date = self.get_node(action_node, "./td[1]").text
            date = datetime.strptime(date, "%b %d, %Y")

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(action_node, "./td[2]").text_content()

            if "Governor" in action:
                actor = "executive"
            elif "Speaker" in action:
                actor = "legislature"
            else:
                actor = "legislature"

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=action_type,
            )

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page, "/html/body/div[3]/div[2]/div[2]/div/div[3]/div[2]/ul/li/a"
        )

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib["href"]
            # replace Current w/ session number
            version_url = version_url.replace("Current", session)
            bill.add_version_link(
                version_name, version_url, media_type="application/pdf"
            )

        soi = self.get_nodes(bill_page, ".//a[contains(text(), 'Statement of Intent')]")
        if soi:
            bill.add_document_link(
                "Statement of Intent", soi[0].get("href"), media_type="application/pdf"
            )

        comstmt = self.get_nodes(
            bill_page, ".//a[contains(text(), 'Committee Statement')]"
        )
        if comstmt:
            bill.add_document_link(
                "Committee Statement",
                comstmt[0].get("href"),
                media_type="application/pdf",
            )

        fn = self.get_nodes(bill_page, ".//a[contains(text(), 'Fiscal Note')]")
        if fn:
            bill.add_document_link(
                "Fiscal Note", fn[0].get("href"), media_type="application/pdf"
            )

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page, ".//div[contains(@class, 'amend-link')]/a"
        )

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib["href"]
            # skip over transcripts
            if "/AM/" not in amendment_url:
                continue
            bill.add_document_link(
                amendment_name, amendment_url, media_type="application/pdf"
            )

        yield bill
        yield from self.scrape_votes(bill, bill_page, actor)

    def scrape_amendments(self, bill, bill_page):
        amd_xpath = '//div[contains(@class,"amends") and not(contains(@class,"mb-3"))]'

        for row in bill_page.xpath(amd_xpath):
            status = row.xpath("string(./div[2])").strip()
            if "adopted" in status.lower():
                version_url = row.xpath("./div[1]/a/@href")[0]
                version_name = row.xpath("./div[1]/a/text()")[0]
                bill.add_version_link(
                    version_name,
                    version_url,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )

    def scrape_votes(self, bill, bill_page, chamber):
        vote_links = bill_page.xpath(
            '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
        )
        for vote_link in vote_links:
            vote_url = vote_link.attrib["href"]
            date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
            date = datetime.strptime(date_td.text, "%b %d, %Y")
            motion_text = motion_td.text_content()
            vote_page = self.lxmlize(vote_url)
            passed = "Passed" in motion_text or "Advanced" in motion_text
            cells = vote_page.xpath(
                '//div[contains(@class,"table-responsive")]/table//td'
            )
            vote = VoteEvent(
                bill=bill,
                chamber=chamber,
                start_date=TIMEZONE.localize(date),
                motion_text=motion_text,
                classification="passage",
                result="pass" if passed else "fail",
            )

            yes_count = self.process_count(vote_page, "Yes:")
            no_count = self.process_count(vote_page, "No:")
            exc_count = self.process_count(vote_page, "Excused - Not Voting:")
            absent_count = self.process_count(vote_page, "Absent - Not Voting:")
            present_count = self.process_count(vote_page, "Present - Not Voting:")

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", exc_count)
            vote.set_count("absent", absent_count)
            vote.set_count("abstain", present_count)

            query_params = urllib.parse.parse_qs(urllib.parse.urlparse(vote_url).query)
            vote.pupa_id = query_params["KeyID"][0]
            vote.add_source(vote_url)

            # Member names and their votes alternate across the table cells.
            for chunk in range(0, len(cells), 2):
                name = cells[chunk].text
                vote_type = cells[chunk + 1].text
                if name and vote_type:
                    vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), "other"), name)

            yield vote
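    # A minimal sketch of the pupa_id derivation above, assuming a view_votes
    # URL carrying a KeyID query parameter (the path and value here are
    # illustrative, not scraped data):
    #
    #   >>> from urllib.parse import parse_qs, urlparse
    #   >>> url = "https://nebraskalegislature.gov/FloorDocs/view_votes.php?KeyID=12345"
    #   >>> parse_qs(urlparse(url).query)["KeyID"][0]
    #   '12345'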

    # Find the vote count row containing row_string, and return the integer count
    def process_count(self, page, row_string):
        count_xpath = (
            'string(//ul[contains(@class,"list-unstyled")]/li[contains(text(),"{}")])'
        )
        count_text = page.xpath(count_xpath.format(row_string))
        return int("".join(x for x in count_text if x.isdigit()))
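    # For example, if the matched <li> text were "Yes: 32" (a hypothetical
    # value), the digit filter above reduces it to the integer count:
    #
    #   >>> int("".join(x for x in "Yes: 32" if x.isdigit()))
    #   32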

    def action_types(self, action):
        if "Date of introduction" in action:
            action_type = "introduction"
        elif "Referred to" in action:
            action_type = "referral-committee"
        elif "Indefinitely postponed" in action:
            action_type = "committee-failure"
        elif ("File" in action) or ("filed" in action):
            action_type = "filing"
        elif "Placed on Final Reading" in action:
            action_type = "reading-3"
        elif "Passed" in action or "President/Speaker signed" in action:
            action_type = "passage"
        elif "Presented to Governor" in action:
            action_type = "executive-receipt"
        elif "Approved by Governor" in action:
            action_type = "executive-signature"
        elif "Failed to pass notwithstanding the objections of the Governor" in action:
            action_type = "executive-veto"
        elif "Failed" in action:
            action_type = "failure"
        elif "Bill withdrawn" in action:
            action_type = "withdrawal"
        else:
            action_type = None
        return action_type
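    # A quick illustration of action_types() above (the action strings are
    # hypothetical examples, not scraped data):
    #   "Referred to Judiciary Committee"  -> "referral-committee"
    #   "Approved by Governor"             -> "executive-signature"
    #   "Routine housekeeping motion"      -> None (no classification)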