PageRenderTime 73ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/billy/bin/update.py

https://github.com/sunlightlabs/billy
Python | 403 lines | 349 code | 38 blank | 16 comment | 37 complexity | ab315d4aa47720ee6580ba711eaadeff MD5 | raw file
  1. #!/usr/bin/env python
  2. from __future__ import print_function
  3. import os
  4. import sys
  5. import pdb
  6. import json
  7. import glob
  8. import logging
  9. import inspect
  10. import argparse
  11. import traceback
  12. import importlib
  13. import six
  14. import datetime as dt
  15. from billy.core import db
  16. from billy.core import settings, base_arg_parser
  17. from billy.scrape import ScrapeError, get_scraper, check_sessions
  18. from billy.utils import term_for_session
  19. from billy.scrape.validator import DatetimeValidator
  20. def _clear_scraped_data(output_dir, scraper_type=''):
  21. # make or clear directory for this type
  22. path = os.path.join(output_dir, scraper_type)
  23. try:
  24. os.makedirs(path)
  25. except OSError as e:
  26. if e.errno != 17:
  27. raise e
  28. else:
  29. for f in glob.glob(path + '/*.json'):
  30. os.remove(f)
  31. def _get_configured_scraper(scraper_type, options, metadata):
  32. try:
  33. ScraperClass = get_scraper(options.module, scraper_type)
  34. except ScrapeError as e:
  35. # silence error only when alldata is present
  36. if 'alldata' in options.types and ('no %s scraper found in' % scraper_type) in str(e):
  37. return None
  38. else:
  39. raise e
  40. return ScraperClass(metadata,
  41. output_dir=options.output_dir,
  42. strict_validation=options.strict,
  43. fastmode=options.fastmode)
  44. def _is_old_scrape(f):
  45. argspec = inspect.getargspec(f)
  46. return 'chamber' in argspec.args
  47. def _run_scraper(scraper_type, options, metadata):
  48. """
  49. scraper_type: bills, legislators, committees, votes
  50. """
  51. _clear_scraped_data(options.output_dir, scraper_type)
  52. scraper = _get_configured_scraper(scraper_type, options, metadata)
  53. ua_email = os.environ.get('BILLY_UA_EMAIL')
  54. if ua_email and scraper:
  55. scraper.user_agent += ' ({})'.format(ua_email)
  56. if not scraper:
  57. return [{
  58. "type": scraper_type,
  59. "start_time": dt.datetime.utcnow(),
  60. "noscraper": True,
  61. "end_time": dt.datetime.utcnow()
  62. }]
  63. runs = []
  64. # Removed from the inner loop due to non-bicameral scrapers
  65. scrape = {
  66. "type": scraper_type
  67. }
  68. scrape['start_time'] = dt.datetime.utcnow()
  69. if scraper_type in ('bills', 'votes', 'events'):
  70. times = options.sessions
  71. for time in times:
  72. scraper.validate_session(time, scraper.latest_only)
  73. elif scraper_type in ('committees', 'legislators'):
  74. times = options.terms
  75. for time in times:
  76. scraper.validate_term(time, scraper.latest_only)
  77. # run scraper against year/session/term
  78. for time in times:
  79. # old style
  80. chambers = options.chambers
  81. if scraper_type == 'events' and len(options.chambers) == 2:
  82. chambers.append('other')
  83. if _is_old_scrape(scraper.scrape):
  84. for chamber in chambers:
  85. scraper.scrape(chamber, time)
  86. else:
  87. scraper.scrape(time, chambers=chambers)
  88. # error out if events or votes don't scrape anything
  89. if not scraper.object_count and scraper_type not in ('events',
  90. 'votes'):
  91. raise ScrapeError("%s scraper didn't save any objects" %
  92. scraper_type)
  93. scrape['end_time'] = dt.datetime.utcnow()
  94. runs.append(scrape)
  95. return runs
  96. def _scrape_solo_bills(options, metadata):
  97. _clear_scraped_data(options.output_dir, 'bills')
  98. scraper = _get_configured_scraper('bills', options, metadata)
  99. if len(options.chambers) == 1:
  100. chamber = options.chambers[0]
  101. else:
  102. raise ScrapeError('must specify --chamber when providing a --bill')
  103. if len(options.sessions):
  104. session = list(options.sessions)[0]
  105. else:
  106. raise ScrapeError('must specify --session when providing a --bill')
  107. for bill_id in options.solo_bills:
  108. scraper.scrape_bill(chamber, session, bill_id)
  109. def _do_imports(abbrev, args):
  110. # do imports here so that scrape doesn't depend on mongo
  111. from billy.importers.metadata import import_metadata
  112. from billy.importers.bills import import_bills
  113. from billy.importers.legislators import import_legislators
  114. from billy.importers.committees import import_committees
  115. from billy.importers.events import import_events
  116. # always import metadata and districts
  117. import_metadata(abbrev)
  118. report = {}
  119. if 'legislators' in args.types:
  120. report['legislators'] = \
  121. import_legislators(abbrev, settings.BILLY_DATA_DIR)
  122. if 'bills' in args.types:
  123. report['bills'] = import_bills(abbrev, settings.BILLY_DATA_DIR)
  124. if 'committees' in args.types:
  125. report['committees'] = \
  126. import_committees(abbrev, settings.BILLY_DATA_DIR)
  127. return report
  128. def main():
  129. try:
  130. parser = argparse.ArgumentParser(
  131. description='update billy data',
  132. parents=[base_arg_parser],
  133. )
  134. what = parser.add_argument_group(
  135. 'what to scrape', 'flags that help select what data to scrape')
  136. scrape = parser.add_argument_group('scraper config',
  137. 'settings for the scraper')
  138. parser.add_argument('module', type=str, help='scraper module (eg. nc)')
  139. parser.add_argument('--pdb', action='store_true', default=False,
  140. help='invoke PDB when exception is raised')
  141. parser.add_argument('--ipdb', action='store_true', default=False,
  142. help='invoke PDB when exception is raised')
  143. parser.add_argument('--pudb', action='store_true', default=False,
  144. help='invoke PUDB when exception is raised')
  145. what.add_argument('-s', '--session', action='append',
  146. dest='sessions', default=[],
  147. help='session(s) to scrape')
  148. what.add_argument('-t', '--term', action='append', dest='terms',
  149. help='term(s) to scrape', default=[])
  150. for arg in ('upper', 'lower'):
  151. what.add_argument('--' + arg, action='append_const',
  152. dest='chambers', const=arg)
  153. for arg in ('bills', 'legislators', 'committees',
  154. 'votes', 'events',):
  155. what.add_argument('--' + arg, action='append_const', dest='types',
  156. const=arg)
  157. for arg in ('scrape', 'import', 'report', 'session-list'):
  158. parser.add_argument('--' + arg, dest='actions',
  159. action="append_const", const=arg,
  160. help='only run %s step' % arg)
  161. # special modes for debugging
  162. scrape.add_argument('--nonstrict', action='store_false', dest='strict',
  163. default=True, help="don't fail immediately when"
  164. " encountering validation warning")
  165. scrape.add_argument('--fastmode', help="scrape in fast mode",
  166. action="store_true", default=False)
  167. # scrapelib overrides
  168. scrape.add_argument('-r', '--rpm', action='store', type=int,
  169. dest='SCRAPELIB_RPM')
  170. scrape.add_argument('--timeout', action='store', type=int,
  171. dest='SCRAPELIB_TIMEOUT')
  172. scrape.add_argument('--retries', type=int,
  173. dest='SCRAPELIB_RETRY_ATTEMPTS')
  174. scrape.add_argument('--retry_wait', type=int,
  175. dest='SCRAPELIB_RETRY_WAIT_SECONDS')
  176. args = parser.parse_args()
  177. if args.pdb or args.pudb or args.ipdb:
  178. _debugger = pdb
  179. if args.pudb:
  180. try:
  181. import pudb
  182. _debugger = pudb
  183. except ImportError:
  184. pass
  185. if args.ipdb:
  186. try:
  187. import ipdb
  188. _debugger = ipdb
  189. except ImportError:
  190. pass
  191. # turn on PDB-on-error mode
  192. # stolen from http://stackoverflow.com/questions/1237379/
  193. # if this causes problems in interactive mode check that page
  194. def _tb_info(type, value, tb):
  195. traceback.print_exception(type, value, tb)
  196. _debugger.pm()
  197. sys.excepthook = _tb_info
  198. # inject scraper paths so scraper module can be found
  199. for newpath in settings.SCRAPER_PATHS:
  200. sys.path.insert(0, newpath)
  201. # get metadata
  202. module = importlib.import_module(args.module)
  203. metadata = module.metadata
  204. module_settings = getattr(module, 'settings', {})
  205. abbrev = metadata['abbreviation']
  206. # load module settings, then command line settings
  207. settings.update(module_settings)
  208. settings.update(args)
  209. # make output dir
  210. args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)
  211. # if terms aren't set, use latest
  212. if not args.terms:
  213. if args.sessions:
  214. for session in args.sessions:
  215. args.terms.append(
  216. term_for_session(metadata['abbreviation'], session,
  217. metadata))
  218. args.terms = list(set(args.terms or []))
  219. else:
  220. latest_term = metadata['terms'][-1]['name']
  221. args.terms = [latest_term]
  222. # only set sessions from terms if sessions weren't set
  223. elif not args.sessions:
  224. for term in metadata['terms']:
  225. if term['name'] in args.terms:
  226. args.sessions.extend(term['sessions'])
  227. # dedup sessions
  228. args.sessions = list(set(args.sessions or []))
  229. if not args.sessions:
  230. args.sessions = [metadata['terms'][-1]['sessions'][-1]]
  231. # determine chambers
  232. if not args.chambers:
  233. args.chambers = ['upper', 'lower']
  234. if not args.actions:
  235. args.actions = ['scrape', 'import', 'report']
  236. if not args.types:
  237. args.types = ['bills', 'legislators', 'votes', 'committees',
  238. 'alldata']
  239. plan = """billy-update abbr=%s
  240. actions=%s
  241. types=%s
  242. sessions=%s
  243. terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
  244. ','.join(args.sessions), ','.join(args.terms))
  245. logging.getLogger('billy').info(plan)
  246. scrape_data = {}
  247. if 'scrape' in args.actions:
  248. _clear_scraped_data(args.output_dir)
  249. # validate then write metadata
  250. if hasattr(module, 'session_list'):
  251. session_list = module.session_list()
  252. else:
  253. session_list = []
  254. check_sessions(metadata, session_list)
  255. try:
  256. schema_path = os.path.join(os.path.split(__file__)[0],
  257. '../schemas/metadata.json')
  258. schema = json.load(open(schema_path))
  259. validator = DatetimeValidator()
  260. validator.validate(metadata, schema)
  261. except ValueError as e:
  262. logging.getLogger('billy').warning(
  263. 'metadata validation error: ' + str(e))
  264. run_record = []
  265. exec_record = {
  266. "run_record": run_record,
  267. "args": sys.argv,
  268. }
  269. lex = None
  270. exc_traceback = None
  271. # start to run scrapers
  272. exec_start = dt.datetime.utcnow()
  273. # scraper order matters
  274. order = ('legislators', 'committees', 'votes', 'bills', 'events')
  275. _traceback = None
  276. try:
  277. for stype in order:
  278. if stype in args.types:
  279. run_record += _run_scraper(stype, args, metadata)
  280. except Exception as e:
  281. _traceback = _, _, exc_traceback = sys.exc_info()
  282. run_record += [{"exception": e, "type": stype}]
  283. lex = e
  284. exec_end = dt.datetime.utcnow()
  285. exec_record['started'] = exec_start
  286. exec_record['ended'] = exec_end
  287. scrape_data['scraped'] = exec_record
  288. scrape_data['abbr'] = abbrev
  289. for record in run_record:
  290. if "exception" in record:
  291. ex = record['exception']
  292. fb = traceback.format_exception(*_traceback)
  293. trace = ""
  294. for t in fb:
  295. trace += t
  296. record['exception'] = {
  297. "type": ex.__class__.__name__,
  298. "message": ex.message,
  299. 'traceback': trace
  300. }
  301. scrape_data['failure'] = True
  302. if lex:
  303. if 'import' in args.actions:
  304. try:
  305. db.billy_runs.save(scrape_data, safe=True)
  306. except Exception as e:
  307. print('mongo error:', e)
  308. six.reraise(lex, None, exc_traceback)
  309. # XXX: This should *NEVER* happen, but it has
  310. # in the past, so we're going to catch any errors
  311. # writing # to pymongo, and raise the original
  312. # exception rather then let it look like Mongo's fault.
  313. # Thanks for catching this, Thom.
  314. #
  315. # We lose the stack trace, but the Exception is the
  316. # same in every other way.
  317. # -- paultag
  318. raise
  319. # imports
  320. if 'import' in args.actions:
  321. import_report = _do_imports(abbrev, args)
  322. scrape_data['imported'] = import_report
  323. # We're tying the run-logging into the import stage - since import
  324. # already writes to the DB, we might as well throw this in too.
  325. db.billy_runs.save(scrape_data, safe=True)
  326. if 'session-list' in args.actions:
  327. if hasattr(module, 'session_list'):
  328. print("\n".join(module.session_list()))
  329. else:
  330. raise ScrapeError('session_list() is not defined')
  331. except ScrapeError as e:
  332. logging.getLogger('billy').critical('Error: %s', e)
  333. sys.exit(1)
  334. if __name__ == '__main__':
  335. main()