PageRenderTime 24ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/scrapers/nm/committees.py

https://github.com/mattgrayson/openstates
Python | 147 lines | 143 code | 4 blank | 0 comment | 0 complexity | eff936e50b58941a1a9d19aee28e0c8a MD5 | raw file
  1. import collections
  2. from openstates.scrape import Scraper, Organization
  3. from utils import LXMLMixin
  4. base_url = "http://www.nmlegis.gov/Committee/"
  5. Member = collections.namedtuple("Member", "name role chamber")
  6. def clean_committee_name(name_to_clean):
  7. head, _sep, tail = (
  8. name_to_clean.replace("House ", "")
  9. .replace("Senate ", "")
  10. .replace("Subcommittee", "Committee")
  11. .rpartition(" Committee")
  12. )
  13. return head + tail
  14. class NMCommitteeScraper(Scraper, LXMLMixin):
  15. jurisdiction = "nm"
  16. def scrape(self, chamber=None):
  17. if chamber:
  18. chambers = [chamber]
  19. else:
  20. chambers = ["upper", "lower", "legislature"]
  21. # Xpath query string format for legislative chamber committee urls
  22. base_xpath = (
  23. '//table[@id="MainContent_gridView{0}Committees"]//a'
  24. '[contains(@id, "MainContent_gridView{1}Committees_link'
  25. '{2}Committee")]/@href'
  26. )
  27. chamber_paths = {
  28. "upper": {
  29. "url": "{}Senate_Standing".format(base_url),
  30. "chamber_xpath": base_xpath.format("Senate", "Senate", "Senate"),
  31. },
  32. "lower": {
  33. "url": "{}House_Standing".format(base_url),
  34. "chamber_xpath": base_xpath.format("House", "House", "House"),
  35. },
  36. "legislature": {
  37. "url": "{}Interim".format(base_url),
  38. "chamber_xpath": base_xpath.format("", "", ""),
  39. },
  40. }
  41. for chamber in chambers:
  42. page = self.lxmlize(chamber_paths[chamber]["url"])
  43. committee_urls = self.get_nodes(
  44. page, chamber_paths[chamber]["chamber_xpath"]
  45. )
  46. for committee_url in committee_urls:
  47. committee_page = self.lxmlize(committee_url)
  48. c_name = (
  49. committee_page.xpath(
  50. '//li/a[contains(@id, "siteMapBreadcrumbs_lnkPage_")]'
  51. )[-1]
  52. .text_content()
  53. .strip()
  54. )
  55. if c_name:
  56. members_xpath = (
  57. '//table[@id="MainContent_formView'
  58. "CommitteeInformation_grid"
  59. 'ViewCommitteeMembers"]/tbody/tr'
  60. )
  61. member_nodes = self.get_nodes(committee_page, members_xpath)
  62. tds = {"title": 0, "name": 1, "role": 3}
  63. members = []
  64. for member_node in member_nodes:
  65. m_title = member_node[tds["title"]].text_content()
  66. m_name = self.get_node(
  67. member_node[tds["name"]],
  68. ".//a[contains(@href, " '"/Members/Legislator?SponCode=")]',
  69. ).text_content()
  70. role = member_node[tds["role"]].text_content()
  71. if m_title == "Senator":
  72. m_chamber = "upper"
  73. elif m_title == "Representative":
  74. m_chamber = "lower"
  75. else:
  76. m_chamber = None
  77. if role in (
  78. "Chair",
  79. "Co-Chair",
  80. "Vice Chair",
  81. "Member",
  82. "Advisory",
  83. "Ranking Member",
  84. ):
  85. if chamber == "legislature":
  86. m_role = "interim {}".format(role.lower())
  87. else:
  88. m_role = role.lower()
  89. else:
  90. m_role = None
  91. if m_role:
  92. members.append(
  93. Member(name=m_name, role=m_role, chamber=m_chamber)
  94. )
  95. # Interim committees are collected during the scraping
  96. # for joint committees, and most interim committees
  97. # have members from both chambers. However, a small
  98. # number of interim committees (right now, just 1) have
  99. # only members from one chamber, so the chamber is set
  100. # to their chamber instead of 'legislature' for those
  101. # committees.
  102. if chamber == "legislature":
  103. m_chambers = set([mem.chamber for mem in members])
  104. if len(m_chambers) == 1:
  105. chamber = m_chambers.pop()
  106. committee = Organization(
  107. name=clean_committee_name(c_name),
  108. chamber=chamber,
  109. classification="committee",
  110. )
  111. for member in members:
  112. committee.add_member(member.name, member.role)
  113. committee.add_source(committee_url)
  114. if not committee._related:
  115. self.warning(
  116. "skipping blank committee {0} "
  117. "at {1}".format(c_name, committee_url)
  118. )
  119. else:
  120. yield committee
  121. else:
  122. self.warning(
  123. "No legislative committee found at " "{}".format(committee_url)
  124. )