PageRenderTime 30ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/scrape_cc/util.py

https://github.com/sunlightlabs/muni_words
Python | 198 lines | 195 code | 2 blank | 1 comment | 0 complexity | 6d88ab0934f96635dee5d15a3845cf8f MD5 | raw file
  1. import httplib2
  2. import urlparse
  3. import json
  4. import re
  5. from models import *
  6. import datetime
  7. import htmlentitydefs
  8. from django.contrib.gis.geos import Point
  9. from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
  10. from excludes import EXCLUDED
  11. h = httplib2.Http()
  12. today = datetime.datetime.today()
  13. six_months_ago = (today - datetime.timedelta(weeks=24)).strftime('%Y-%m-%dT%H:%M:%S')
  14. def granicus_json_scrape(domain, clip_id, raw = False):
  15. """
  16. Accepts any Granicus url with a 'clip_id' parameter and returns
  17. a Python object containing all the JSON found.
  18. """
  19. g_url = 'http://%s/JSON.php?clip_id=%s' % ( domain, clip_id)
  20. response, j = h.request(g_url)
  21. if response.get('status') == '200':
  22. try:
  23. j = json.loads(j, strict=False)
  24. except ValueError:
  25. ts = re.sub('([{,]\s+)([a-z]+)(: ")', lambda s: '%s"%s"%s' % (s.groups()[0], s.groups()[1], s.groups()[2]), j).replace("\\", "")
  26. try:
  27. j = json.loads(ts, strict=False)
  28. except UnicodeDecodeError:
  29. ts = unicode(ts, errors='ignore')
  30. j = json.loads(ts, strict=False)
  31. except:
  32. j = False
  33. else:
  34. j = False
  35. return j
  36. def strip_html(string):
  37. return ''.join([e for e in BeautifulSoup(string).recursiveChildGenerator() if isinstance(e, unicode)]).replace(' ', ' ')
  38. def build_db(domain, clip_id, muni, datestring):
  39. j = granicus_json_scrape(domain, clip_id)
  40. if j and len(j) > 0 and len(j[0]) > 0:
  41. text = " ".join([x["text"] if x['type'] != "meta" else "" for x in j[0] ])
  42. titles = " ".join([x["title"] if x['type'] == "meta" else "" for x in j[0] ])
  43. try:
  44. final_text = strip_html(BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES).contents[0])
  45. except:
  46. final_text = strip_html(text)
  47. try:
  48. final_titles = strip_html(BeautifulStoneSoup(titles, convertEntities=BeautifulStoneSoup.ALL_ENTITIES).contents[0])
  49. except:
  50. final_titles = strip_html(titles)
  51. if final_text == " ":
  52. cc = True
  53. else:
  54. cc = False
  55. try:
  56. a_date = datetime.datetime.strptime(datestring[:-6], '%Y-%m-%dT%H:%M:%S')
  57. t = Transcript.objects.get_or_create(clip_id=clip_id, muni=muni)[0]
  58. t.text = final_text
  59. t.titles = final_titles
  60. t.cc = cc
  61. t.date = a_date
  62. t.save()
  63. except Exception as e:
  64. print "Couldn't save transcript object - %s" % e
  65. def get_clips():
  66. munis = Muni.objects.all()
  67. count = 0
  68. total_munis = munis.count()
  69. for m in munis:
  70. print "Processing %s of %s total munis - %s" % (count, total_munis, m.name)
  71. es_filter_data = """{ "size": 1,
  72. "query": {
  73. "term": {
  74. "agency_id": "%s"
  75. }
  76. },
  77. "filter": { "range": {
  78. "datetime": {
  79. "from": "%s",
  80. "include_lower": true
  81. }
  82. }
  83. }
  84. }""" % (m.granicus_id, six_months_ago)
  85. resp, content = h.request("http://govflix.com/api", "POST", body=es_filter_data)
  86. if resp.get('status') == '200':
  87. total_json = json.loads(content, strict=False)
  88. total = total_json['hits']['total']
  89. es_filter_data = '{ "size": %d, "query": {"term": {"agency_id": "%s" }}, "filter": { "range": { "datetime": { "from": "%s", "include_lower": true}}}}' % ( total, m.granicus_id, six_months_ago)
  90. resp, content = h.request("http://govflix.com/api", "POST", body=es_filter_data)
  91. try:
  92. m_json = json.loads(content, strict=False)
  93. videos = m_json['hits']['hits']
  94. except:
  95. return
  96. for vid in videos:
  97. if vid['_type'] != 'video':
  98. continue
  99. else:
  100. try:
  101. build_db(m.host_url, vid['_source']['id'], m, vid['_source']['datetime'])
  102. except Exception as e:
  103. print "error in build_db for %s - %s - %s" % ( m.host_url, vid['_source']['id'], e)
  104. print "Processed %s videos" % len(videos)
  105. else:
  106. print "Response not OK"
  107. count += 1
  108. def check_exclude(muni):
  109. for e in EXCLUDED:
  110. if muni.granicus_id == e[1]:
  111. return True
  112. return False
  113. def remove_excluded():
  114. for ex in EXCLUDED:
  115. try:
  116. m = Muni.objects.get(granicus_id=ex[1])
  117. t = Transcript.objects.filter(muni=m).delete()
  118. m.delete()
  119. except:
  120. pass
  121. def import_muni():
  122. api_url = "http://govflix.com/api?type=agency&size=1"
  123. response, text = h.request(api_url)
  124. if response.get('status') == '200':
  125. agencies = json.loads(text, strict=False)
  126. total_agencies = agencies['hits']['total']
  127. response, text = h.request("http://govflix.com/api?type=agency&size=%s" % total_agencies)
  128. agencies = json.loads(text, strict=False)
  129. for agency in agencies['hits']['hits']:
  130. new_agency = Muni.objects.get_or_create(granicus_id=agency['_id'])[0]
  131. if not check_exclude(new_agency):
  132. new_agency.name = agency['_source']['name']
  133. new_agency.state = agency['_source']['state']
  134. new_agency.host_url = agency['_source']['host']
  135. try:
  136. pt = Point(agency['_source']['location'][1], agency['_source']['location'][0]) #this is long lat now, instead of lat long
  137. new_agency.lat_long = pt
  138. except:
  139. pass
  140. new_agency.save()
  141. def unescape(text):
  142. def fixup(m):
  143. text = m.group(0)
  144. if text[:2] == "&#":
  145. #character reference
  146. try:
  147. if text[:3] == "&#x":
  148. return unichr(int(text[3:-1], 15))
  149. else:
  150. return unichr(int(text[2:-1]))
  151. except ValueError:
  152. pass
  153. else:
  154. #named entity
  155. try:
  156. text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
  157. except KeyError:
  158. pass
  159. return text # leave as is
  160. return re.sub("&#?\w+;", fixup, text)