/scrape_cc/util.py
import httplib2
import json
import re
import datetime
import htmlentitydefs
from django.contrib.gis.geos import Point
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
from models import *
from excludes import EXCLUDED

h = httplib2.Http()
today = datetime.datetime.today()
six_months_ago = (today - datetime.timedelta(weeks=24)).strftime('%Y-%m-%dT%H:%M:%S')
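
# six_months_ago is an ISO-style timestamp such as '2013-04-01T18:30:00'
# (illustrative value); the Elasticsearch queries below compare it against
# the 'datetime' field in a range filter.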

def granicus_json_scrape(domain, clip_id, raw=False):  # 'raw' is accepted but never used below
    """
    Fetches http://<domain>/JSON.php?clip_id=<clip_id> and returns the parsed
    JSON as a Python object, or False if the request or parse fails.
    """
    g_url = 'http://%s/JSON.php?clip_id=%s' % (domain, clip_id)
    response, j = h.request(g_url)
    if response.get('status') == '200':
        try:
            j = json.loads(j, strict=False)
        except ValueError:
            # Granicus sometimes serves JSON with unquoted keys; quote them
            # and strip stray backslashes, then retry the parse.
            ts = re.sub(r'([{,]\s+)([a-z]+)(: ")', r'\1"\2"\3', j).replace("\\", "")
            try:
                j = json.loads(ts, strict=False)
            except UnicodeDecodeError:
                ts = unicode(ts, errors='ignore')
                j = json.loads(ts, strict=False)
            except:
                j = False
    else:
        j = False
    return j
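
# A minimal usage sketch (the host and clip_id are made-up values):
#   j = granicus_json_scrape('somecity.granicus.com', '42')
#   if j:
#       for entry in j[0]:
#           print entry['type']
# Returns False on a non-200 response or an unrecoverable parse failure.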

def strip_html(string):
    # Keep only the text nodes of the parsed markup, then collapse
    # non-breaking-space entities into plain spaces.
    return ''.join([e for e in BeautifulSoup(string).recursiveChildGenerator() if isinstance(e, unicode)]).replace('&nbsp;', ' ')
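
# For example (illustrative input):
#   strip_html(u'<p>Call&nbsp;to order</p>')  ->  u'Call to order'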

def build_db(domain, clip_id, muni, datestring):
    j = granicus_json_scrape(domain, clip_id)
    if j and len(j) > 0 and len(j[0]) > 0:
        # Caption entries carry the spoken text; 'meta' entries carry agenda titles.
        text = " ".join([x["text"] if x['type'] != "meta" else "" for x in j[0]])
        titles = " ".join([x["title"] if x['type'] == "meta" else "" for x in j[0]])
        try:
            final_text = strip_html(BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES).contents[0])
        except:
            final_text = strip_html(text)
        try:
            final_titles = strip_html(BeautifulStoneSoup(titles, convertEntities=BeautifulStoneSoup.ALL_ENTITIES).contents[0])
        except:
            final_titles = strip_html(titles)
        # Flag clips whose joined caption text came back empty (a bare space).
        cc = final_text == " "
        try:
            # Chop the trailing UTC offset (e.g. '-07:00') off the timestamp.
            a_date = datetime.datetime.strptime(datestring[:-6], '%Y-%m-%dT%H:%M:%S')
            t = Transcript.objects.get_or_create(clip_id=clip_id, muni=muni)[0]
            t.text = final_text
            t.titles = final_titles
            t.cc = cc
            t.date = a_date
            t.save()
        except Exception as e:
            print "Couldn't save transcript object - %s" % e
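
# A minimal call sketch (the host, clip id, and timestamp are made-up values;
# `some_muni` stands in for a Muni instance):
#   build_db('somecity.granicus.com', '42', some_muni, '2013-04-01T18:30:00-07:00')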

def get_clips():
    munis = Muni.objects.all()
    count = 0
    total_munis = munis.count()
    for m in munis:
        count += 1
        print "Processing %s of %s total munis - %s" % (count, total_munis, m.name)
        # Ask Elasticsearch for a single hit first, just to learn the total
        # number of videos for this agency within the last six months.
        es_filter_data = """{ "size": 1,
            "query": {
                "term": {
                    "agency_id": "%s"
                }
            },
            "filter": { "range": {
                    "datetime": {
                        "from": "%s",
                        "include_lower": true
                    }
                }
            }
        }""" % (m.granicus_id, six_months_ago)
        resp, content = h.request("http://govflix.com/api", "POST", body=es_filter_data)
        if resp.get('status') == '200':
            total_json = json.loads(content, strict=False)
            total = total_json['hits']['total']
            # Re-run the same query, sized to fetch every matching video at once.
            es_filter_data = '{ "size": %d, "query": {"term": {"agency_id": "%s" }}, "filter": { "range": { "datetime": { "from": "%s", "include_lower": true}}}}' % (total, m.granicus_id, six_months_ago)
            resp, content = h.request("http://govflix.com/api", "POST", body=es_filter_data)
            try:
                m_json = json.loads(content, strict=False)
                videos = m_json['hits']['hits']
            except:
                # A malformed response aborts the whole run.
                return
            for vid in videos:
                if vid['_type'] != 'video':
                    continue
                try:
                    build_db(m.host_url, vid['_source']['id'], m, vid['_source']['datetime'])
                except Exception as e:
                    print "error in build_db for %s - %s - %s" % (m.host_url, vid['_source']['id'], e)
            print "Processed %s videos" % len(videos)
        else:
            print "Response not OK"
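
# import_muni() and get_clips() are the batch drivers; a typical (assumed)
# invocation would be from a Django shell:
#   >>> from scrape_cc.util import import_muni, get_clips
#   >>> import_muni()
#   >>> get_clips()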

def check_exclude(muni):
    # EXCLUDED holds tuples whose second element is an agency's granicus_id.
    for e in EXCLUDED:
        if muni.granicus_id == e[1]:
            return True
    return False

def remove_excluded():
    for ex in EXCLUDED:
        try:
            m = Muni.objects.get(granicus_id=ex[1])
            Transcript.objects.filter(muni=m).delete()
            m.delete()
        except:
            pass

def import_muni():
    # Ask for a single agency first just to learn the total, then re-request
    # the full list in one call.
    api_url = "http://govflix.com/api?type=agency&size=1"
    response, text = h.request(api_url)
    if response.get('status') == '200':
        agencies = json.loads(text, strict=False)
        total_agencies = agencies['hits']['total']
        response, text = h.request("http://govflix.com/api?type=agency&size=%s" % total_agencies)
        agencies = json.loads(text, strict=False)
        for agency in agencies['hits']['hits']:
            new_agency = Muni.objects.get_or_create(granicus_id=agency['_id'])[0]
            if not check_exclude(new_agency):
                new_agency.name = agency['_source']['name']
                new_agency.state = agency['_source']['state']
                new_agency.host_url = agency['_source']['host']
                try:
                    # The source stores (lat, long) but Point takes (long, lat),
                    # so swap the coordinates.
                    pt = Point(agency['_source']['location'][1], agency['_source']['location'][0])
                    new_agency.lat_long = pt
                except:
                    pass
                new_agency.save()
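
# Note: GeoDjango's Point is (x, y), i.e. (longitude, latitude). For a stored
# location of [37.77, -122.42] (made-up San Francisco-ish lat/long), the call
# above builds Point(-122.42, 37.77).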

def unescape(text):
    """Replace HTML entities in `text` with their unicode characters."""
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
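
# For example:
#   unescape(u'Fish &amp; Chips &#x2013; &#8220;daily&#8221;')
#   returns u'Fish & Chips \u2013 \u201cdaily\u201d'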