
/everyblock/everyblock/cities/nyc/graffiti/retrieval.py

https://github.com/UXE/everyblock
  1. """
  2. Screen scraper for NYC graffiti location data
  3. https://a002-oom01.nyc.gov/graffiti/
  4. More information is here:
  5. http://www.nyc.gov/html/cau/html/anti_graffiti/main.shtml
  6. """
  7. from ebdata.retrieval.scrapers.base import ScraperBroken
  8. from ebdata.retrieval.scrapers.list_detail import SkipRecord
  9. from ebdata.retrieval.scrapers.newsitem_list_detail import NewsItemListDetailScraper
  10. from ebpub.db.models import NewsItem
  11. from ebpub.utils.dates import parse_date
  12. import re

class GraffitiScraperBase(NewsItemListDetailScraper):
    has_detail = False

    def list_pages(self):
        # The search form is an ASP.NET page: a POST is only accepted if it
        # echoes back the __VIEWSTATE and __EVENTVALIDATION tokens embedded
        # in the initial GET response, so scrape those out first.
        html = self.get_html(self.source_url)
        m = re.search(r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="([^"]*)"', html)
        if not m:
            raise ScraperBroken('VIEWSTATE not found on %s' % self.source_url)
        viewstate = m.group(1)
        m = re.search(r'<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="([^"]*)"', html)
        if not m:
            raise ScraperBroken('EVENTVALIDATION not found on %s' % self.source_url)
        eventvalidation = m.group(1)
        yield self.get_html(self.source_url, {'__VIEWSTATE': viewstate, '__EVENTVALIDATION': eventvalidation, 'cmdFind': 'Find'})
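
    # For reference, the hidden inputs matched above appear in the page source
    # roughly like this (values abbreviated; ASP.NET regenerates them on every
    # request, which is why they are scraped fresh rather than hard-coded):
    #
    #   <input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUK..." />
    #   <input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="/wEWAgK..." />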

    def parse_list(self, page):
        # Normalize non-breaking spaces before running the row regex over
        # the whole results page.
        page = page.replace('&nbsp;', ' ')
        for record in self.parse_list_re.finditer(page):
            yield record.groupdict()

    def clean_list_record(self, record):
        # Collapse the three street columns into a single address string and
        # expand the borough abbreviation; rows with an unknown borough are
        # skipped rather than saved with bad data.
        record['waiver_date'] = parse_date(record['waiver_date'], '%m/%d/%y')
        record['address'] = ('%s %s %s' % (
            record.pop('street_number', '').strip(),
            record.pop('street_name', '').strip(),
            record.pop('street_suffix', '').strip(),
        )).strip()
        try:
            record['borough'] = {
                'BK': 'Brooklyn',
                'BX': 'The Bronx',
                'MN': 'Manhattan',
                'QS': 'Queens',
                'SI': 'Staten Island',
            }[record['borough']]
        except KeyError:
            raise SkipRecord('Invalid borough')
        return record
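

# An illustrative, framework-free version of the same fetch (not part of the
# original module). The `requests` dependency and the fetch_graffiti_page name
# are assumptions; the form fields mirror exactly what list_pages() posts.
def fetch_graffiti_page(url):
    import requests  # local import: optional dependency for this sketch only
    html = requests.get(url).text
    # Grab the ASP.NET hidden fields (__VIEWSTATE, __EVENTVALIDATION) as a
    # name -> value dict and echo them back, plus the "Find" button's field.
    fields = dict(re.findall(
        r'<input type="hidden" name="(__[A-Z]+)" id="__[A-Z]+" value="([^"]*)"',
        html))
    fields['cmdFind'] = 'Find'
    return requests.post(url, data=fields).text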


class PendingGraffitiScraper(GraffitiScraperBase):
    schema_slugs = ('graffiti-pending-cleanup',)
    parse_list_re = re.compile(r'(?si)<tr[^>]*>\s*<td[^>]*>(?P<street_number>[^<]*)</td><td[^>]*>(?P<street_name>[^<]*)</td><td[^>]*>(?P<street_suffix>[^<]*)</td><td[^>]*>(?P<borough>[^<]*)</td><td[^>]*>(?P<zipcode>[^<]*)</td><td[^>]*>[^<]*</td><td[^>]*>[^<]*</td><td[^>]*>[^<]*</td><td[^>]*>(?P<waiver_date>[^<]*)</td><td[^>]*>Waiver Received</td>\s*</tr>')
    source_url = 'https://a002-oom03.nyc.gov/graffiti/Pending.aspx'

    def existing_record(self, record):
        # A pending report counts as a duplicate when an item with the same
        # waiver date, address and borough has already been saved.
        try:
            qs = NewsItem.objects.filter(schema__id=self.schema.id, item_date=record['waiver_date'])
            qs = qs.by_attribute(self.schema_fields['address'], record['address'])
            qs = qs.by_attribute(self.schema_fields['borough'], record['borough'])
            return qs[0]
        except IndexError:
            return None

    def save(self, old_record, list_record, detail_record):
        if old_record is not None:
            # Graffiti data never changes, so we don't have to
            # worry about changing data that already exists.
            self.logger.debug('Data already exists')
            return
        attributes = {
            'address': list_record['address'],
            'borough': list_record['borough'],
        }
        self.create_newsitem(
            attributes,
            title='Graffiti reported at %s, %s' % (list_record['address'], list_record['borough']),
            url=self.source_url,
            item_date=list_record['waiver_date'],
            location_name='%s, %s' % (list_record['address'], list_record['borough']),
        )


class CompletedGraffitiScraper(GraffitiScraperBase):
    schema_slugs = ('graffiti-cleaned',)
    parse_list_re = re.compile(r'(?si)<tr[^>]*>\s*<td[^>]*>(?P<street_number>[^<]*)</td><td[^>]*>(?P<street_name>[^<]*)</td><td[^>]*>(?P<street_suffix>[^<]*)</td><td[^>]*>(?P<borough>[^<]*)</td><td[^>]*>(?P<zipcode>[^<]*)</td><td[^>]*>[^<]*</td><td[^>]*>[^<]*</td><td[^>]*>[^<]*</td><td[^>]*>(?P<waiver_date>\d\d/\d\d/\d\d)</td><td[^>]*>(?P<completed_on>\d\d/\d\d/\d\d)</td><td[^>]*>(?P<status>[^<]*)</td>\s*</tr>')
    source_url = 'https://a002-oom03.nyc.gov/graffiti/Completed.aspx'

    def clean_list_record(self, record):
        record = GraffitiScraperBase.clean_list_record(self, record)
        record['completed_on'] = parse_date(record['completed_on'], '%m/%d/%y')
        return record

    def existing_record(self, record):
        # Completed reports are deduplicated on completion date, address,
        # borough and waiver date.
        try:
            qs = NewsItem.objects.filter(schema__id=self.schema.id, item_date=record['completed_on'])
            qs = qs.by_attribute(self.schema_fields['address'], record['address'])
            qs = qs.by_attribute(self.schema_fields['borough'], record['borough'])
            qs = qs.by_attribute(self.schema_fields['waiver_date'], record['waiver_date'])
            return qs[0]
        except IndexError:
            return None

    def save(self, old_record, list_record, detail_record):
        status = self.get_or_create_lookup('status', list_record['status'], list_record['status'], make_text_slug=False)
        attributes = {
            'address': list_record['address'],
            'borough': list_record['borough'],
            'waiver_date': list_record['waiver_date'],
            'status': status.id,
        }
        values = {
            'title': 'Graffiti cleaned up at %s, %s' % (list_record['address'], list_record['borough']),
            'url': self.source_url,
            'item_date': list_record['completed_on'],
            'location_name': '%s, %s' % (list_record['address'], list_record['borough']),
        }
        if old_record is None:
            self.create_newsitem(attributes, **values)
        else:
            # Unlike pending reports, completed reports are updated in place
            # when they already exist.
            self.update_existing(old_record, values, attributes)
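

# Illustrative sanity check (not part of the original module): a synthetic
# table row, shaped after parse_list_re above, that the "pending" pattern
# should match. The real markup on Pending.aspx may of course differ.
def _check_pending_regex():
    sample = (
        '<tr class="row">'
        '<td>123</td><td>MAIN</td><td>ST</td><td>BK</td><td>11201</td>'
        '<td>-</td><td>-</td><td>-</td>'
        '<td>01/15/09</td><td>Waiver Received</td>'
        '</tr>'
    )
    m = PendingGraffitiScraper.parse_list_re.search(sample)
    assert m is not None, 'pending regex no longer matches the expected row shape'
    assert m.groupdict()['borough'] == 'BK'
    assert m.groupdict()['waiver_date'] == '01/15/09'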


def update_newest():
    s = PendingGraffitiScraper()
    s.update()
    s = CompletedGraffitiScraper()
    s.update()


if __name__ == "__main__":
    update_newest()
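
# Running this module directly (e.g. from a cron job) scrapes both the pending
# and completed pages in one pass via update_newest().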