/scrapers/tx/bills.py
Python | 302 lines | 285 code | 10 blank | 7 comment | 9 complexity | c0b3f379808ac1d9bf94ec51a7044b6b MD5 | raw file
Possible License(s): GPL-3.0
- import datetime
- import ftplib
- import re
- import time
- from urllib import parse as urlparse
- import xml.etree.cElementTree as etree
- from openstates.scrape import Scraper, Bill
- from openstates.scrape.base import ScrapeError
- from utils import LXMLMixin
# Ordered (regex, classification) pairs consulted by ``_categorize_action``.
# The first pattern that matches an action description wins, so the order of
# the entries is significant. A classification is either a single string or
# a list of strings.
_action_re = (
    # Amendments
    (r"^Amended$", "amendment-passage"),
    (r"^Amendment\(s\) offered$", "amendment-introduction"),
    (r"^Amendment amended$", "amendment-amendment"),
    (r"^Amendment withdrawn$", "amendment-withdrawal"),
    # Floor passage
    (r"^Passed$", "passage"),
    (r"^Adopted$", "passage"),
    # Receipt; the more specific "Secretary of the Senate" form is tried first
    (r"^Received (by|from) the.*Secretary of the Senate", "filing"),
    (r"^Received (by|from) the", "introduction"),
    # Executive actions
    (r"^Sent to the Governor", "executive-receipt"),
    (r"^Signed by the Governor", "executive-signature"),
    (r"^Effective on", "became-law"),
    (r"^Vetoed by the Governor$", "executive-veto"),
    # Readings and adoption
    (r"^Read first time$", ["introduction", "reading-1"]),
    (r"^Read & adopted$", ["passage", "introduction"]),
    (r"^Passed as amended$", "passage"),
    # Committee referral and reports
    (r"^Referred to", "referral-committee"),
    (r"^Recommended to be sent to", "referral-committee"),
    (r"^Reported favorably w/o amendment\(s\)$", "committee-passage"),
    (r"^Filed$", "filing"),
    (r"^Read 3rd time$", "reading-3"),
    (r"^Read 2nd time$", "reading-2"),
    (r"^Reported favorably", "committee-passage-favorable"),
    # Becoming law without an explicit signature action
    (r"^Effective immediately$", "became-law"),
    (r"^Filed without the Governor's signature$", "became-law"),
)
def _categorize_action(action):
    """Classify an action description using the ``_action_re`` table.

    Returns the classification (a string or a list of strings) paired with
    the first pattern in ``_action_re`` that matches *action*, or ``None``
    when no pattern matches.
    """
    matching = (
        classification
        for regex, classification in _action_re
        if re.search(regex, action) is not None
    )
    # Table order decides ties: only the first hit is consumed.
    return next(matching, None)
class TXBillScraper(Scraper, LXMLMixin):
    """Scrape Texas bills from the legislature's FTP server.

    Bill-history XML files and witness-list HTML files are discovered by
    walking directories on ``_FTP_ROOT``; each history file is converted into
    a ``Bill`` with sources, subjects, versions, documents, actions,
    sponsorships and companion bills attached.
    """

    _FTP_ROOT = "ftp.legis.state.tx.us"
    CHAMBERS = {"H": "lower", "S": "upper"}
    # Looked up with the character at index -5 of a witness-list file URL
    # (see ``_add_witness_lists``) to label the witness list document.
    NAME_SLUGS = {
        "I": "Introduced",
        "E": "Engrossed",
        "S": "Senate Committee Report",
        "H": "House Committee Report",
        "F": "Enrolled",
    }
    companion_url = (
        "https://capitol.texas.gov/BillLookup/Companions.aspx?LegSess={}&Bill={}"
    )

    # One line of an FTP ``LIST`` response, compiled once for all listings.
    _FTP_LIST_RE = re.compile(
        r"""(?x)
        ^(\d{2}-\d{2}-\d{2})\s+   # Date in mm-dd-yy
        (\d{2}:\d{2}[AP]M)\s+     # Time in hh:mmAM/PM
        (<DIR>)?\s+               # Directories will have an indicating flag
        (\d+)?\s+                 # Files will have their size in bytes
        (.+?)\s*$                 # Directory or file name is the remaining text
        """
    )

    def _get_ftp_files(self, dir_):
        """Recursively traverse an FTP directory, yielding a URL for each file.

        Args:
            dir_: Directory path relative to the FTP root, without a leading
                slash.

        Yields:
            ``ftp://`` URLs of every file found under ``dir_``.

        Raises:
            ScrapeError: If the server cannot be reached after three attempts.
        """
        last_error = None
        for attempt in range(3):
            try:
                ftp = ftplib.FTP(self._FTP_ROOT)
                break
            except (EOFError, ftplib.error_temp) as exc:
                last_error = exc
                # Exponential back-off: 1s, 2s, 4s.
                time.sleep(2**attempt)
        else:
            raise ScrapeError(
                "Unable to connect to FTP server {}".format(self._FTP_ROOT)
            ) from last_error

        # The whole listing is read before anything is yielded, so the
        # connection can be closed before recursing into subdirectories
        # instead of being left open for every directory level.
        lines = []
        with ftp:
            ftp.login()
            ftp.cwd("/" + dir_)
            self.info("Searching an FTP folder for files ({})".format(dir_))
            ftp.retrlines("LIST", lines.append)

        for line in lines:
            match = self._FTP_LIST_RE.search(line)
            if match is None:
                self.warning(
                    "Skipping unrecognized FTP listing line: {!r}".format(line)
                )
                continue
            is_dir, name = match.group(3, 5)
            if is_dir:
                yield from self._get_ftp_files("/".join([dir_, name]))
            else:
                yield "/".join(["ftp://" + self._FTP_ROOT, dir_, name])

    @staticmethod
    def _get_bill_id_from_file_path(file_path):
        """Derive a bill ID such as ``"HB 1"`` from a file path or URL.

        The file name (text after the last ``/`` and before the first ``.``)
        must contain two capital letters, an optional ``R``, and a number
        that may be zero-padded.

        Raises:
            ScrapeError: If no bill ID can be found in the file name.
        """
        file_name = file_path.split("/")[-1].split(".")[0]
        # ``0*`` (not ``0+``) so numbers without zero padding also parse.
        match = re.search(r"([A-Z]{2}R?)0*(\d+)", file_name)
        if match is None:
            raise ScrapeError("Unable to find a bill ID in: %s" % file_path)
        identifier, number = match.groups()
        # House and Senate Concurrent and Joint Resolutions files do not contain
        # the 'R' for resolution in file names. This is required to match
        # bill ID's later on. ``fullmatch`` keeps an identifier that already
        # ends in 'R' from receiving a second one.
        if re.fullmatch("[HS][CJ]", identifier):
            identifier += "R"
        return " ".join([identifier, number])

    def scrape(self, session=None, chamber=None):
        """Yield a ``Bill`` for every bill-history file of the session.

        Args:
            session: Session name; passed through ``_format_session`` to
                build the FTP directory names.
            chamber: ``"upper"`` or ``"lower"`` to restrict the scrape; both
                chambers are scraped when it is falsy.
        """
        chambers = [chamber] if chamber else ["upper", "lower"]
        session_code = self._format_session(session)

        # (bill_id, url) pairs, consulted per bill in ``_add_witness_lists``.
        self.witnesses = [
            (self._get_bill_id_from_file_path(url), url)
            for url in self._get_ftp_files(
                "bills/{}/witlistbill/html".format(session_code)
            )
        ]

        history_files = self._get_ftp_files("bills/{}/billhistory".format(session_code))
        for bill_url in history_files:
            # The chamber is inferred from the URL text itself.
            if "house" in bill_url:
                bill_chamber = "lower"
            elif "senate" in bill_url:
                bill_chamber = "upper"
            else:
                continue
            if bill_chamber in chambers:
                yield from self.scrape_bill(session, bill_url)

    def scrape_bill(self, session, history_url):
        """Fetch one bill-history XML file and yield the ``Bill`` built from it.

        Nothing is yielded (a warning is logged) when the document has no
        ``caption`` element or contains the text "Bill does not exist".

        Args:
            session: Session name stored on the bill as ``legislative_session``.
            history_url: URL of the bill-history XML document.

        Raises:
            ScrapeError: If the bill ID does not denote a bill, resolution,
                concurrent resolution or joint resolution.
        """
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return

        # Drop the first space-separated token of the "bill" attribute.
        # NOTE(review): presumably a session label such as "87(R)" - confirm.
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)
        # Built with the same session/bill formatting as the companion URL
        # so both capitol.texas.gov links agree.
        bill.add_source(
            "https://capitol.texas.gov/BillLookup/History.aspx"
            "?LegSess={}&Bill={}".format(
                self._format_session(session), self._format_bill_id(bill_id)
            )
        )

        for subject in root.iterfind("subjects/subject"):
            subject_text = (subject.text or "").strip()
            if subject_text:
                bill.add_subject(subject_text)

        self._add_versions(bill, root)
        self._add_documents(bill, root)
        self._add_witness_lists(bill, bill_id)
        self._add_actions(bill, root)
        self._add_sponsorships(bill, root)

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill

    @staticmethod
    def _add_versions(bill, root):
        """Attach the HTML and PDF links of every bill text version."""
        link_kinds = (
            ("WebHTMLURL", "text/html"),
            ("WebPDFURL", "application/pdf"),
        )
        for version in root.iterfind("billtext/docTypes/bill/versions/version"):
            # Skip placeholder elements that have no child elements.
            if len(version) == 0:
                continue
            note = version.findtext("versionDescription")
            for tag, media_type in link_kinds:
                url = version.findtext(tag)
                if url:
                    bill.add_version_link(note=note, url=url, media_type=media_type)

    @staticmethod
    def _add_documents(bill, root):
        """Attach the HTML links of every analysis and fiscal note."""
        document_kinds = (
            ("billtext/docTypes/analysis/versions/version", "Analysis ({})"),
            ("billtext/docTypes/fiscalNote/versions/version", "Fiscal Note ({})"),
        )
        for path, note_format in document_kinds:
            for document in root.iterfind(path):
                # Skip placeholder elements that have no child elements.
                if len(document) == 0:
                    continue
                url = document.findtext("WebHTMLURL")
                if not url:
                    continue
                bill.add_document_link(
                    note=note_format.format(document.findtext("versionDescription")),
                    url=url,
                    media_type="text/html",
                )

    def _add_witness_lists(self, bill, bill_id):
        """Attach the witness lists collected by ``scrape`` for ``bill_id``."""
        for witness_bill_id, witness_url in self.witnesses:
            if witness_bill_id != bill_id:
                continue
            # The character at index -5 of the URL selects the label.
            bill.add_document_link(
                note="Witness List ({})".format(self.NAME_SLUGS[witness_url[-5]]),
                url=witness_url,
                media_type="text/html",
            )

    def _add_actions(self, bill, root):
        """Attach every action, classified through ``_categorize_action``."""
        actors = {"H": "lower", "S": "upper", "E": "executive"}
        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(
                action.findtext("date"), "%m/%d/%Y"
            ).date()
            # The first character of the action number selects the actor.
            actor = actors[action.find("actionNumber").text[0]]

            raw_description = action.findtext("description")
            desc = raw_description.strip()
            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            atype = _categorize_action(desc)
            act = bill.add_action(
                raw_description,
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                # What is left after removing the referral phrase is used as
                # the committee name.
                committee = desc
                for phrase in ("Referred to", "Recommended to be sent to "):
                    committee = committee.replace(phrase, "").strip()
                act.add_related_entity(name=committee, entity_type="organization")

    @staticmethod
    def _add_sponsorships(bill, root):
        """Attach authors, coauthors, sponsors and cosponsors.

        Each element holds names separated by ``" | "``. A missing element
        is treated as empty, and blank names are ignored.
        """
        sponsor_groups = (
            ("authors", "primary", True),
            ("coauthors", "cosponsor", False),
            ("sponsors", "primary", True),
            ("cosponsors", "cosponsor", False),
        )
        for tag, classification, primary in sponsor_groups:
            for raw_name in root.findtext(tag, default="").split(" | "):
                name = raw_name.strip()
                if not name:
                    continue
                bill.add_sponsorship(
                    name,
                    classification=classification,
                    entity_type="person",
                    primary=primary,
                )

    def _get_companion(self, bill):
        """Add a "companion" relation for each bill linked on the companions page.

        The page is fetched from ``companion_url``; the related bill's
        identifier and session are read from the ``Bill`` and ``LegSess``
        query parameters of each link inside the table with id ``Table6``.
        Links lacking either parameter are skipped.
        """
        url = self.companion_url.format(
            self._format_session(bill.legislative_session),
            self._format_bill_id(bill.identifier),
        )
        page = self.lxmlize(url)
        for link in page.xpath('//table[@id="Table6"]//a'):
            parsed = urlparse.urlparse(link.get("href", ""))
            query = urlparse.parse_qs(parsed.query)
            if "Bill" not in query or "LegSess" not in query:
                continue
            bill.add_related_bill(
                identifier=query["Bill"][0],
                legislative_session=query["LegSess"][0].replace("R", ""),
                relation_type="companion",
            )

    def _format_session(self, session):
        """Return the three-character session code for ``session``.

        A two-character name gets ``"R"`` appended; a three-character name is
        returned unchanged.

        Raises:
            AssertionError: If the result is not three characters long.
        """
        if len(session) == 2:
            session = session + "R"
        # Raised explicitly so the check survives ``python -O``.
        if len(session) != 3:
            raise AssertionError("Unable to handle the session name")
        return session

    def _format_bill_id(self, bill_id):
        """Return ``bill_id`` with all spaces removed, as used in URLs."""
        return bill_id.replace(" ", "")