PageRenderTime 54ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/everyblock/everyblock/states/california/liquor/retrieval.py

https://github.com/frankk00/openblock
Python | 277 lines | 200 code | 27 blank | 50 comment | 24 complexity | 20a31952ebacd6a825491622789d9b1d MD5 | raw file
  1. # Copyright 2007,2008,2009 Everyblock LLC
  2. #
  3. # This file is part of everyblock
  4. #
  5. # everyblock is free software: you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation, either version 3 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # everyblock is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with everyblock. If not, see <http://www.gnu.org/licenses/>.
  17. #
  18. """
  19. Screen scraper for California liquor-license data.
  20. http://www.abc.ca.gov/datport/SubscrMenu.asp
  21. """
  22. from ebdata.retrieval.scrapers.base import ScraperBroken
  23. from ebdata.retrieval.scrapers.list_detail import SkipRecord
  24. from ebdata.retrieval.scrapers.newsitem_list_detail import NewsItemListDetailScraper
  25. from ebpub.db.models import NewsItem
  26. from ebpub.utils.dates import parse_date
  27. import re
  28. GEOCODES = {
  29. 'la': 1993,
  30. 'sf': 3800,
  31. 'sanjose': 4313,
  32. }
  33. license_types_re = re.compile(r'(?si)<b>\d\) License Type:</b> (?P<license_type>.*?)</td></tr><tr><td><b>\s*License Type Status:</b> (?P<license_type_status>.*?)</td></tr><tr><td><b>\s*Status Date: </b> (?P<status_date>.*?) <b>\s*Term: (?P<term>.*?)</td></tr><tr><td><b>\s*Original Issue Date: </b> (?P<original_issue_date>.*?) <b>\s*Expiration Date: </b> (?P<expiration_date>.*?) </tr><tr><td><b>\s*Master: </b> (?P<master>.*?)\s*<b>\s*Duplicate: </b> (?P<duplicate>.*?) <b>\s*Fee Code: </b> (?P<fee_code>.*?)</td></tr>')
  34. detail_url = lambda page_id: 'http://www.abc.ca.gov/datport/LQSdata.asp?ID=%s' % page_id
  35. class LiquorLicenseScraper(NewsItemListDetailScraper):
  36. """
  37. A base class that encapsulates how to scrape California liquor licenses.
  38. Do not instantiate this class directly; use one of the subclasses.
  39. """
  40. schema_slugs = ('liquor-licenses',)
  41. has_detail = True
  42. parse_detail_re = re.compile(r'(?si)License Number: </b>.*? <b>\s*Status: </b>(?P<status>.*?)</td></tr><tr><td><b>Primary Owner: </b>(?P<primary_owner>.*?)</td></tr><td><b>ABC Office of Application: </b>(?P<office_of_application>.*?)</td></tr></tr><tr><td bgcolor=#260066 class=header><font color=white> <b>Business Name </font></b></td></tr>(?P<business_name>.*?)<tr><td bgcolor=#260066 class=header><font color=white> <b>Business Address </font></b></td><td bgcolor=#260066 class=header></td></tr><tr><td><b>Address: </b>(?P<address>.*?)<b>Census Tract: </b>(?P<census_tract>.*?)</td></tr><tr><td><b>City: </b>(?P<city>.*?) <b>County: </b>(?P<county>.*?)</td></tr><tr><td><b>State: </b>(?P<state>.*?) <b>Zip Code: </b>(?P<zipcode>.*?)</td></tr><tr><td bgcolor=#260066 class=header><font color=white> <b>Licensee Information </font></b></td></tr><tr><td><b>Licensee: </b>.*?</td></tr><tr><td bgcolor=#260066 class=header><font color=white> <B>License Types </font></b></td></tr><tr><td>(?P<license_types>.*?)<tr><td bgcolor=#260066 class=header><font color=white> <b>Current Disciplinary Action </font>')
  43. def parse_list(self, page):
  44. page = page.replace('&nbsp;', ' ')
  45. # First, get the report date by looking for "Report as of XXXX".
  46. m = re.search(r'(?i)report as of (\w+ \d\d?, \d\d\d\d)</U>', page)
  47. if not m:
  48. raise ScraperBroken('Could not find "Report as of" in page')
  49. report_date = parse_date(m.group(1), '%B %d, %Y')
  50. # Determine the headers by looking at the <th> tags, and clean them up
  51. # to match our style for keys in the list_record dictionary (lower
  52. # case, underscores instead of spaces).
  53. headers = [h.lower() for h in re.findall('(?i)<th[^>]*>(?:<a[^>]+>)?\s*(.*?)\s*(?:</a>)?</th>', page)]
  54. headers = [h.replace('<br>', ' ') for h in headers]
  55. headers = [re.sub(r'[^a-z]', ' ', h) for h in headers]
  56. headers = [re.sub(r'\s+', '_', h.strip()) for h in headers]
  57. # Dynamically construct a regex based on the number of headers.
  58. # Note that this assumes that at most *one* of the headers has an
  59. # empty name; if more than one header has an empty name, this regex
  60. # will have multiple named groups with the same name, which will cause
  61. # an error.
  62. pattern = '(?si)<tr valign=top class=report_column>%s</tr>'% '\s*'.join(['\s*<td[^>]*>\s*(?:<center>)?\s*(?P<%s>.*?)\s*(?:</center>)?\s*</td[^>]*>\s*' % (h or 'number') for h in headers])
  63. for record in re.finditer(pattern, page):
  64. yield dict(record.groupdict(), report_date=report_date)
  65. def clean_list_record(self, record):
  66. try:
  67. license_number = record.pop('license_num')
  68. except KeyError:
  69. license_number = record.pop('license_number')
  70. m = re.search(r'(?i)<a href=.*?LQSdata\.asp\?ID=(\d+)>\s*(\d+)\s*</a>', license_number)
  71. if not m:
  72. raise ScraperBroken('License number link not found in %r' % license_number)
  73. record['place_id'], record['license_number'] = m.groups()
  74. return record
  75. def get_detail(self, record):
  76. url = detail_url(record['place_id'])
  77. return self.get_html(url)
  78. def parse_detail(self, page, list_record):
  79. # They use a ton of &nbsp;s for some reason, so convert them to spaces
  80. # to make the parse_detail_re regex more readable.
  81. page = page.replace('&nbsp;', ' ')
  82. return NewsItemListDetailScraper.parse_detail(self, page, list_record)
  83. def clean_detail_record(self, record):
  84. if 'No Active DBA found' in record['business_name']:
  85. record['business_name'] = ''
  86. else:
  87. m = re.search(r'(?si)<tr><td><b>Doing Business As: </b>(.*?)</td></tr>', record['business_name'])
  88. if not m:
  89. raise ScraperBroken('Got unknown business_name value %r' % record['business_name'])
  90. record['business_name'] = m.group(1)
  91. record['address'] = record['address'].strip()
  92. # There can be multiple license types, so this requires further parsing
  93. # to create a list.
  94. license_types = []
  95. for m in license_types_re.finditer(record['license_types']):
  96. d = m.groupdict()
  97. d['status_date'] = parse_date(d['status_date'], '%d-%b-%Y')
  98. if not d['status_date']:
  99. # Skip license types that don't have a status date, because
  100. # a NewsItem is required to have an item_date, and we don't
  101. # care about licenses that don't have a change date.
  102. continue
  103. d['original_issue_date'] = parse_date(d['original_issue_date'], '%d-%b-%Y')
  104. d['expiration_date'] = parse_date(d['expiration_date'], '%d-%b-%Y')
  105. d['term'] = d['term'].replace('</B>', '').strip()
  106. license_types.append(d)
  107. record['license_types'] = license_types
  108. return record
  109. def existing_record(self, record):
  110. # We don't have enough information from the list_record to determine
  111. # whether this record exists.
  112. return None
  113. def detail_required(self, list_record, old_record):
  114. # Always download the detail page.
  115. return True
  116. def save(self, old_record, list_record, detail_record):
  117. # Each status change only applies to a single license type (e.g.
  118. # "Winegrower"). The list page says which license type we're interested
  119. # in, but only the detail page has the description, so we have to use
  120. # one to look up the other.
  121. try:
  122. license = [t for t in detail_record['license_types'] if t['license_type'][:2] == list_record['type']][0]
  123. except IndexError:
  124. raise ScraperBroken('License type %r not found on detail page' % list_record['type'])
  125. license_type = self.get_or_create_lookup('type', license['license_type'][5:], list_record['type'])
  126. status = self.get_or_create_lookup('status', license['license_type_status'], license['license_type_status'])
  127. if not list_record.has_key('action'):
  128. list_record['action'] = '' # Status changes do not have actions
  129. action = self.get_or_create_lookup('action', list_record['action'], list_record['action'])
  130. if self.record_type.code == 'STATUS_CHANGE':
  131. old_status = self.get_or_create_lookup('old_status', list_record['status_from'], list_record['status_from'])
  132. new_status = self.get_or_create_lookup('new_status', list_record['status_to'], list_record['status_to'])
  133. else:
  134. # New licesnses and new application have no old status.
  135. old_status = self.get_or_create_lookup('old_status', 'None', 'NONE')
  136. new_status = self.get_or_create_lookup('new_status', list_record['status'], list_record['status'])
  137. try:
  138. qs = NewsItem.objects.filter(schema__id=self.schema.id, item_date=list_record['report_date'])
  139. qs = qs.by_attribute(self.schema_fields['page_id'], list_record['place_id'])
  140. qs = qs.by_attribute(self.schema_fields['type'], license_type.id)
  141. if self.record_type.code == 'STATUS_CHANGE':
  142. qs = qs.by_attribute(self.schema_fields['old_status'], old_status.id)
  143. qs = qs.by_attribute(self.schema_fields['new_status'], new_status.id)
  144. else:
  145. qs = qs.by_attribute(self.schema_fields['action'], action.id)
  146. old_record = qs[0]
  147. except IndexError:
  148. pass
  149. else:
  150. return # No need to save again, if this record already exists.
  151. title = '%s for %s' % (self.record_type.name, detail_record['business_name'] or detail_record['primary_owner'])
  152. attributes = {
  153. 'page_id': list_record['place_id'],
  154. 'address': detail_record['address'],
  155. 'business_name': detail_record['business_name'],
  156. 'original_issue_date': license['original_issue_date'],
  157. 'expiration_date': license['expiration_date'],
  158. 'type': license_type.id,
  159. 'status': status.id,
  160. 'license_number': list_record['license_number'],
  161. 'primary_owner': detail_record['primary_owner'],
  162. 'action': action.id,
  163. 'record_type': self.record_type.id,
  164. 'old_status': old_status.id,
  165. 'new_status': new_status.id,
  166. }
  167. self.create_newsitem(
  168. attributes,
  169. title=title,
  170. url=detail_url(list_record['place_id']),
  171. item_date=license['status_date'],
  172. location_name=detail_record['address'],
  173. )
  174. class NewIssuedLicenseScraper(LiquorLicenseScraper):
  175. def __init__(self, geo_code):
  176. # geo_code is a numeric code describing the part of California we're
  177. # interested in. For all the choices, see page three of this PDF:
  178. # http://www.abc.ca.gov/datport/ABC_Data_Layout.PDF
  179. LiquorLicenseScraper.__init__(self)
  180. self.geo_code = str(geo_code)
  181. self.record_type = self.get_or_create_lookup('record_type', 'New license issued', 'ISSUED')
  182. def list_pages(self):
  183. yield self.get_html('http://www.abc.ca.gov/datport/SubscrOption.asp', {'SUBCRIT': 'p_DlyIssApp'})
  184. def clean_list_record(self, record):
  185. if record['geo_code'] != self.geo_code:
  186. raise SkipRecord
  187. record = LiquorLicenseScraper.clean_list_record(self, record)
  188. record['type'], record['dup'] = record.pop('type_dup').split('/')
  189. record['expir_date'] = parse_date(record['expir_date'], '%m/%d/%Y')
  190. return record
  191. class NewApplicationScraper(LiquorLicenseScraper):
  192. def __init__(self, geo_code):
  193. LiquorLicenseScraper.__init__(self)
  194. self.geo_code = str(geo_code)
  195. self.record_type = self.get_or_create_lookup('record_type', 'New application', 'APPLICATION')
  196. def list_pages(self):
  197. yield self.get_html('http://www.abc.ca.gov/datport/SubscrOption.asp', {'SUBCRIT': 'p_DlyNuApp'})
  198. def clean_list_record(self, record):
  199. if record['geo_code'] != self.geo_code:
  200. raise SkipRecord
  201. record = LiquorLicenseScraper.clean_list_record(self, record)
  202. record['type'], record['dup'] = record.pop('type_dup').split('/')
  203. return record
  204. class StatusChangeScraper(LiquorLicenseScraper):
  205. def __init__(self, geo_code):
  206. LiquorLicenseScraper.__init__(self)
  207. self.geo_code = str(geo_code)
  208. self.record_type = self.get_or_create_lookup('record_type', 'Status change', 'STATUS_CHANGE')
  209. def list_pages(self):
  210. yield self.get_html('http://www.abc.ca.gov/datport/SubscrOption.asp', {'SUBCRIT': 'p_DlyStat'})
  211. def clean_list_record(self, record):
  212. if record['geo_code'] != self.geo_code:
  213. raise SkipRecord
  214. record = LiquorLicenseScraper.clean_list_record(self, record)
  215. record['type'], record['dup'] = record.pop('type_dup').split('/')
  216. record['status_from'], record['status_to'] = record.pop('status_changed_from_to').split(' / ')
  217. record['transfer_info_from'], record['transfer_info_to'] = record.pop('transfer_info_from_to').split('/')
  218. record['orig_iss_date'] = parse_date(record['orig_iss_date'], '%m/%d/%Y')
  219. record['expir_date'] = parse_date(record['expir_date'], '%m/%d/%Y')
  220. return record
  221. def update_newest(geo_code):
  222. # San Francisco is geo_code 3800. San Jose is 4313. LA is 1933.
  223. # You can get the geo_code for a city by looking here:
  224. # http://www.abc.ca.gov/datport/SubDlyNuRep.asp
  225. s = NewIssuedLicenseScraper(geo_code)
  226. s.update()
  227. s = NewApplicationScraper(geo_code)
  228. s.update()
  229. s = StatusChangeScraper(geo_code)
  230. s.update()
  231. if __name__ == '__main__':
  232. import sys
  233. from ebdata.retrieval import log_debug
  234. try:
  235. geocode = GEOCODES[sys.argv[1]]
  236. except (KeyError, IndexError):
  237. print "Usage: retrieval.py %s" % '|'.join(GEOCODES.keys())
  238. sys.exit(0)
  239. update_newest(int(geocode))