PageRenderTime 52ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/scrapers/tx/bills.py

https://github.com/sunlightlabs/openstates
Python | 302 lines | 285 code | 10 blank | 7 comment | 9 complexity | c0b3f379808ac1d9bf94ec51a7044b6b MD5 | raw file
Possible License(s): GPL-3.0
  1. import datetime
  2. import ftplib
  3. import re
  4. import time
  5. from urllib import parse as urlparse
  6. import xml.etree.cElementTree as etree
  7. from openstates.scrape import Scraper, Bill
  8. from openstates.scrape.base import ScrapeError
  9. from utils import LXMLMixin
  10. _action_re = (
  11. ("^Amended$", "amendment-passage"),
  12. (r"^Amendment\(s\) offered$", "amendment-introduction"),
  13. ("^Amendment amended$", "amendment-amendment"),
  14. ("^Amendment withdrawn$", "amendment-withdrawal"),
  15. ("^Passed$", "passage"),
  16. ("^Adopted$", "passage"),
  17. ("^Received (by|from) the.*Secretary of the Senate", "filing"),
  18. ("^Received (by|from) the", "introduction"),
  19. ("^Sent to the Governor", "executive-receipt"),
  20. ("^Signed by the Governor", "executive-signature"),
  21. ("^Effective on", "became-law"),
  22. ("^Vetoed by the Governor$", "executive-veto"),
  23. ("^Read first time$", ["introduction", "reading-1"]),
  24. ("^Read & adopted$", ["passage", "introduction"]),
  25. ("^Passed as amended$", "passage"),
  26. ("^Referred to", "referral-committee"),
  27. ("^Recommended to be sent to", "referral-committee"),
  28. (r"^Reported favorably w/o amendment\(s\)$", "committee-passage"),
  29. ("^Filed$", "filing"),
  30. ("^Read 3rd time$", "reading-3"),
  31. ("^Read 2nd time$", "reading-2"),
  32. ("^Reported favorably", "committee-passage-favorable"),
  33. ("^Effective immediately$", "became-law"),
  34. ("^Filed without the Governor's signature$", "became-law"),
  35. )
  36. def _categorize_action(action):
  37. for pattern, types in _action_re:
  38. if re.findall(pattern, action):
  39. return types
  40. return None
class TXBillScraper(Scraper, LXMLMixin):
    """Scraper for Texas bills.

    Bill-history XML files and witness-list files are discovered by walking
    the FTP server named by ``_FTP_ROOT``; companion bills are read from the
    page addressed by ``companion_url``.
    """

    # Host name of the FTP server that is walked for bill files.
    _FTP_ROOT = "ftp.legis.state.tx.us"
    # First letter of a bill ID -> chamber name.
    CHAMBERS = {"H": "lower", "S": "upper"}
    # One-letter code -> label used in witness-list document notes.  The code
    # is the character five positions from the end of a witness-list URL.
    NAME_SLUGS = {
        "I": "Introduced",
        "E": "Engrossed",
        "S": "Senate Committee Report",
        "H": "House Committee Report",
        "F": "Enrolled",
    }
    # URL template for the companion lookup page; the two placeholders are
    # filled with the formatted session code and the space-free bill ID.
    companion_url = (
        "https://capitol.texas.gov/BillLookup/Companions.aspx" "?LegSess={}&Bill={}"
    )
  54. def _get_ftp_files(self, dir_):
  55. """Recursively traverse an FTP directory, returning all files"""
  56. for i in range(3):
  57. try:
  58. ftp = ftplib.FTP(self._FTP_ROOT)
  59. break
  60. except (EOFError, ftplib.error_temp):
  61. time.sleep(2**i)
  62. else:
  63. raise Exception
  64. ftp.login()
  65. ftp.cwd("/" + dir_)
  66. self.info("Searching an FTP folder for files ({})".format(dir_))
  67. lines = []
  68. ftp.retrlines("LIST", lines.append)
  69. for line in lines:
  70. (_date, _time, is_dir, _file_size, name) = re.search(
  71. r"""(?x)
  72. ^(\d{2}-\d{2}-\d{2})\s+ # Date in mm-dd-yy
  73. (\d{2}:\d{2}[AP]M)\s+ # Time in hh:mmAM/PM
  74. (<DIR>)?\s+ # Directories will have an indicating flag
  75. (\d+)?\s+ # Files will have their size in bytes
  76. (.+?)\s*$ # Directory or file name is the remaining text
  77. """,
  78. line,
  79. ).groups()
  80. if is_dir:
  81. for item in self._get_ftp_files("/".join([dir_, name])):
  82. yield item
  83. else:
  84. yield "/".join(["ftp://" + self._FTP_ROOT, dir_, name])
  85. @staticmethod
  86. def _get_bill_id_from_file_path(file_path):
  87. bill_id = file_path.split("/")[-1].split(".")[0]
  88. identifier, number = re.search(r"([A-Z]{2}R?)0+(\d+)", bill_id).groups()
  89. # House and Senate Concurrent and Joint Resolutions files do not contain
  90. # the 'R' for resolution in file names. This is required to match
  91. # bill ID's later on.
  92. if re.match("[HS][CJ]", identifier):
  93. identifier += "R"
  94. return " ".join([identifier, number])
  95. def scrape(self, session=None, chamber=None):
  96. chambers = [chamber] if chamber else ["upper", "lower"]
  97. session_code = self._format_session(session)
  98. self.witnesses = []
  99. witness_files = self._get_ftp_files(
  100. "bills/{}/witlistbill/html".format(session_code)
  101. )
  102. for item in witness_files:
  103. bill_id = self._get_bill_id_from_file_path(item)
  104. self.witnesses.append((bill_id, item))
  105. history_files = self._get_ftp_files("bills/{}/billhistory".format(session_code))
  106. for bill_url in history_files:
  107. if "house" in bill_url:
  108. if "lower" in chambers:
  109. yield from self.scrape_bill(session, bill_url)
  110. elif "senate" in bill_url:
  111. if "upper" in chambers:
  112. yield from self.scrape_bill(session, bill_url)
  113. def scrape_bill(self, session, history_url):
  114. history_xml = self.get(history_url).text
  115. root = etree.fromstring(history_xml)
  116. bill_title = root.findtext("caption")
  117. if bill_title is None or "Bill does not exist" in history_xml:
  118. self.warning("Bill does not appear to exist")
  119. return
  120. bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
  121. chamber = self.CHAMBERS[bill_id[0]]
  122. if bill_id[1] == "B":
  123. bill_type = ["bill"]
  124. elif bill_id[1] == "R":
  125. bill_type = ["resolution"]
  126. elif bill_id[1:3] == "CR":
  127. bill_type = ["concurrent resolution"]
  128. elif bill_id[1:3] == "JR":
  129. bill_type = ["joint resolution"]
  130. else:
  131. raise ScrapeError("Invalid bill_id: %s" % bill_id)
  132. bill = Bill(
  133. bill_id,
  134. legislative_session=session,
  135. chamber=chamber,
  136. title=bill_title,
  137. classification=bill_type,
  138. )
  139. bill.add_source(history_url)
  140. bill_id_for_url = bill_id.replace(" ", "")
  141. bill.add_source(
  142. f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
  143. )
  144. for subject in root.iterfind("subjects/subject"):
  145. bill.add_subject(subject.text.strip())
  146. for version in root.iterfind("billtext/docTypes/bill/versions/version"):
  147. if not version:
  148. continue
  149. note = version.find("versionDescription").text
  150. html_url = version.find("WebHTMLURL").text
  151. bill.add_version_link(note=note, url=html_url, media_type="text/html")
  152. pdf_url = version.find("WebPDFURL").text
  153. bill.add_version_link(note=note, url=pdf_url, media_type="application/pdf")
  154. for analysis in root.iterfind("billtext/docTypes/analysis/versions/version"):
  155. if not analysis:
  156. continue
  157. description = analysis.find("versionDescription").text
  158. html_url = analysis.find("WebHTMLURL").text
  159. bill.add_document_link(
  160. note="Analysis ({})".format(description),
  161. url=html_url,
  162. media_type="text/html",
  163. )
  164. for fiscal_note in root.iterfind(
  165. "billtext/docTypes/fiscalNote/versions/version"
  166. ):
  167. if not fiscal_note:
  168. continue
  169. description = fiscal_note.find("versionDescription").text
  170. html_url = fiscal_note.find("WebHTMLURL").text
  171. bill.add_document_link(
  172. note="Fiscal Note ({})".format(description),
  173. url=html_url,
  174. media_type="text/html",
  175. )
  176. witnesses = [x for x in self.witnesses if x[0] == bill_id]
  177. for witness in witnesses:
  178. bill.add_document_link(
  179. note="Witness List ({})".format(self.NAME_SLUGS[witness[1][-5]]),
  180. url=witness[1],
  181. media_type="text/html",
  182. )
  183. for action in root.findall("actions/action"):
  184. act_date = datetime.datetime.strptime(
  185. action.findtext("date"), "%m/%d/%Y"
  186. ).date()
  187. action_number = action.find("actionNumber").text
  188. actor = {"H": "lower", "S": "upper", "E": "executive"}[action_number[0]]
  189. desc = action.findtext("description").strip()
  190. if desc == "Scheduled for public hearing on . . .":
  191. self.warning("Skipping public hearing action with no date")
  192. continue
  193. atype = _categorize_action(desc)
  194. act = bill.add_action(
  195. action.findtext("description"),
  196. act_date,
  197. chamber=actor,
  198. classification=atype,
  199. )
  200. if atype and "referral-committee" in atype:
  201. repls = ["Referred to", "Recommended to be sent to "]
  202. ctty = desc
  203. for r in repls:
  204. ctty = ctty.replace(r, "").strip()
  205. act.add_related_entity(name=ctty, entity_type="organization")
  206. for author in root.findtext("authors").split(" | "):
  207. if re.search(r"\S+", author.strip()) is not None:
  208. bill.add_sponsorship(
  209. author, classification="primary", entity_type="person", primary=True
  210. )
  211. for coauthor in root.findtext("coauthors").split(" | "):
  212. if re.search(r"\S+", coauthor.strip()) is not None:
  213. bill.add_sponsorship(
  214. coauthor,
  215. classification="cosponsor",
  216. entity_type="person",
  217. primary=False,
  218. )
  219. for sponsor in root.findtext("sponsors").split(" | "):
  220. if re.search(r"\S+", sponsor.strip()) is not None:
  221. bill.add_sponsorship(
  222. sponsor,
  223. classification="primary",
  224. entity_type="person",
  225. primary=True,
  226. )
  227. for cosponsor in root.findtext("cosponsors").split(" | "):
  228. if re.search(r"\S+", cosponsor.strip()) is not None:
  229. bill.add_sponsorship(
  230. cosponsor,
  231. classification="cosponsor",
  232. entity_type="person",
  233. primary=False,
  234. )
  235. if root.findtext("companions"):
  236. self._get_companion(bill)
  237. yield bill
  238. def _get_companion(self, bill):
  239. url = self.companion_url.format(
  240. self._format_session(bill.legislative_session),
  241. self._format_bill_id(bill.identifier),
  242. )
  243. page = self.lxmlize(url)
  244. links = page.xpath('//table[@id="Table6"]//a')
  245. for link in links:
  246. parsed = urlparse.urlparse(link.attrib["href"])
  247. query = urlparse.parse_qs(parsed.query)
  248. bill.add_related_bill(
  249. identifier=query["Bill"][0],
  250. legislative_session=query["LegSess"][0].replace("R", ""),
  251. relation_type="companion",
  252. )
  253. def _format_session(self, session):
  254. if len(session) == 2:
  255. session = session + "R"
  256. assert len(session) == 3, "Unable to handle the session name"
  257. return session
  258. def _format_bill_id(self, bill_id):
  259. return bill_id.replace(" ", "")