/lib/geoalchemy.py
Python | 447 lines | 433 code | 0 blank | 14 comment | 0 complexity | f779a7bbee029f113e4722c6f8487153 MD5 | raw file
Possible License(s): BSD-3-Clause
- """
- Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 1. Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- """
- """
- Performs geolocation disambiguation
- """
- import sqlalchemy
- import sqlalchemy.orm as orm
- import sqlalchemy.ext.declarative as declarative
- import sqlalchemy.sql.expression as expression
- from sqlalchemy.sql.expression import bindparam
- import geoalchemy_util
- import itertools
- import os
- import datetime
- import re
- import sys
- from collections import defaultdict, Counter
- import pandas as pd
- import alchemy
- from alchemy.match import commit_inserts, commit_updates
- from tasks import bulk_commit_inserts, bulk_commit_updates
# Module-wide document type ('grant' or 'application'); reassigned by the
# __main__ block.  NOTE(review): a bare `global` statement at module scope is
# a no-op in Python -- the assignment below alone defines the global.
global doctype
doctype = ''
#The config file alchemy uses to store information
alchemy_config = alchemy.get_config()
#The path to the database which holds geolocation data
geo_data_dbpath = os.path.join(
    alchemy_config.get("location").get('path'),
    alchemy_config.get("location").get('database'))
# SQLite engine + session factory over the geolocation lookup database
geo_data_engine = sqlalchemy.create_engine('sqlite:///%s' % geo_data_dbpath)
geo_data_session_class = orm.sessionmaker(bind=geo_data_engine)
#Used to query the database that holds the data from google
#As well as a MaxMinds database containing every city in the world
geo_data_session = geo_data_session_class()
# Declarative base for the ORM models (RawGoogle, AllCities) defined below
base = declarative.declarative_base()
# Batch size used by the bulk insert/update commits
commit_freq = alchemy_config.get("location").get("commit_frequency")
- #Stores an address disambiguated by the Google API
class RawGoogle(base):
    """One address disambiguated by the Google geocoding API.

    Rows are looked up by their cleaned ``input_address`` to map raw parsed
    locations onto a canonical city/region/country plus coordinates.
    """
    __tablename__ = 'raw_google'
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    input_address = sqlalchemy.Column(sqlalchemy.String)
    city = sqlalchemy.Column(sqlalchemy.String)
    region = sqlalchemy.Column(sqlalchemy.String)
    country = sqlalchemy.Column(sqlalchemy.String)
    latitude = sqlalchemy.Column(sqlalchemy.REAL)
    longitude = sqlalchemy.Column(sqlalchemy.REAL)
    confidence = sqlalchemy.Column(sqlalchemy.REAL)

    def __init__(self, input_address, city, region, country, latitude, longitude, confidence):
        # Assign every constructor argument to the column of the same name.
        names = ('input_address', 'city', 'region', 'country',
                 'latitude', 'longitude', 'confidence')
        values = (input_address, city, region, country,
                  latitude, longitude, confidence)
        for name, value in zip(names, values):
            setattr(self, name, value)

    def __repr__(self):
        fields = (self.input_address, self.city, self.region, self.country,
                  self.latitude, self.longitude, self.confidence)
        return "<RawGoogle('%s','%s','%s','%s','%s','%s','%s')>" % fields
- #One of the cities in the world as stored in the MaxMinds database
class AllCities(base):
    """One city of the world, as stored in the MaxMinds database.

    Used as a fallback gazetteer when the Google lookup table has no match
    for a raw parsed location.
    """
    __tablename__ = 'all_cities'
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    city = sqlalchemy.Column(sqlalchemy.String)
    region = sqlalchemy.Column(sqlalchemy.String)
    country = sqlalchemy.Column(sqlalchemy.String)
    latitude = sqlalchemy.Column(sqlalchemy.REAL)
    longitude = sqlalchemy.Column(sqlalchemy.REAL)

    def __init__(self, city, region, country, latitude, longitude):
        # Assign every constructor argument to the column of the same name.
        names = ('city', 'region', 'country', 'latitude', 'longitude')
        values = (city, region, country, latitude, longitude)
        for name, value in zip(names, values):
            setattr(self, name, value)
- #Perform geocoding on the data stored in alchemy.db provided by parsing XML
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    """Geocode the raw locations parsed from XML into alchemy.db.

    Each raw location is first matched against the Google-disambiguated
    lookup table (raw_google); leftovers are fuzzy-matched against the
    MaxMinds all_cities table.  Matched groups are then written back via
    match_grouped_locations().

    limit/offset           -- window over the RawLocation table to process
    minimum_match_value    -- Levenshtein similarity threshold for the
                              all_cities fallback match
    doctype                -- 'grant' or 'application'; selects which schema
                              tables are read
    Returns False when there are no raw locations to process, else None.
    """
    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    t = datetime.datetime.now()
    print "geocoding started", t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    valid_input_addresses = construct_valid_input_addresses()
    #Get all of the raw locations in alchemy.db that were parsed from XML
    # NOTE(review): any other doctype value leaves raw_parsed_locations
    # unbound and the count() below raises NameError.
    if doctype == 'grant':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset)
    elif doctype == 'application':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset)
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations.count() == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    identified_grouped_locations = []
    unidentified_grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(valid_input_addresses, cleaned_location):
            matching_location = geo_data_session.query(RawGoogle).filter(
                RawGoogle.input_address==cleaned_location).first()
            # Group by coordinates: same lat/long == same place
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
            identified_grouped_locations.append({"raw_location": instance,
                                                 "matching_location": matching_location,
                                                 "grouping_id": grouping_id})
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            #Sort the locations by their country
            country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
            unidentified_grouped_locations.append({"raw_location": instance,
                                                   "cleaned_location": cleaned_location,
                                                   "country": country})
    print "locations grouped", datetime.datetime.now() - t
    print 'count of identified locations:', len(identified_grouped_locations)
    t = datetime.datetime.now()
    #We now have two lists of locations. First, consider the unmatched locations.
    keyfunc = lambda x:x["country"]
    #Sort the list by the country (groupby requires a pre-sorted input)
    unidentified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #country
    unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc))
    #Identify the correct location for each entry by comparing to all_cities
    identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t)
    print 'new count of identified locations:', len(identified_grouped_locations)
    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Perform a quickfix to correct state names
    geoalchemy_util.fix_state_abbreviations(identified_grouped_locations);
    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    identified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc))
    print "identified_grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #Match the locations
    match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session)
    alchemy_session.commit()
    print "Matches made!", datetime.datetime.now() - t
    if doctype == 'grant':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).all()
    elif doctype == 'application':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).all()
    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations.count())
- #Identify locations that the Google disambiguation couldn't resolve
- def identify_missing_locations(unidentified_grouped_locations_enum,
- identified_grouped_locations,
- minimum_match_value, t):
- #For each group of locations with the same country
- for i, item in unidentified_grouped_locations_enum:
- country, grouped_locations_list = item
- #Get a list of all cities that exist anywhere in that country
- all_cities_in_country = geo_data_session.query(AllCities.city, AllCities.region).filter_by(country=country)
- #Construct a name for each location that matches the normal cleaned location format
- all_cities_in_country = [geoalchemy_util.concatenate_location(x.city,
- x.region if geoalchemy_util.region_is_a_state(x.region) else '',
- country) for x in all_cities_in_country]
- #For each location found in this country, find its closest match
- #among the list of all cities from that country
- for grouped_location in grouped_locations_list:
- cleaned_location = grouped_location["cleaned_location"]
- closest_match = geoalchemy_util.get_closest_match_leven(cleaned_location, all_cities_in_country, minimum_match_value)
- #If no match was found or only the trivial match
- if closest_match=='' or closest_match==country:
- continue
- #If we have a good match, add it to the list of matched locations
- closest_match_split = re.split(",",closest_match)
- city = closest_match_split[0].strip()
- if len(closest_match_split)==3:
- region = closest_match_split[1].strip()
- country = closest_match_split[2].strip()
- matching_location = geo_data_session.query(AllCities).filter_by(city=city, region=region, country=country).first()
- else:
- country = closest_match_split[1].strip()
- matching_location = geo_data_session.query(AllCities).filter_by(city=city, country=country).first()
- if not matching_location:
- print 'Warning: all_cities match attempt failed for', cleaned_location.encode('utf8'), 'location not found'
- grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
- raw_location = grouped_location["raw_location"]
- identified_grouped_locations.append({"raw_location": raw_location,
- "matching_location": matching_location,
- "grouping_id": grouping_id})
- print 'all_cities found additional location for', raw_location
- """
- looks like there are not enough records being put into "update_statements". When I look at the rawlocation
- table and look for how many records in there have null location_ids, there are *far* too many.
- I'm looking at totalmatchgroups, which is the number of records that get update_statements generated,
- externalmatchgroups, which is the number of records that go into the process. I should also look
- at identified_grouped_locations_enum, which is a collection of all the grouped locations.
- """
def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    """Write one canonical location per group and link raw locations to it.

    identified_grouped_locations_enum -- enumerate(groupby(...)) over dicts of
        {"raw_location", "matching_location", "grouping_id"}, grouped by
        grouping_id
    t -- start timestamp (kept for interface compatibility; unused here)
    alchemy_session -- session over the alchemy database being updated

    Side effects: truncates/clears the location link tables, then flushes the
    module-level location_insert_statements / update_statements accumulators
    via the bulk commit helpers.  NOTE(review): reads the module global
    `doctype` rather than a parameter -- callers must have set it.
    """
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        #grouped_locations_list = a list of every grouped location with the same grouping_id
        # Note that a grouped_location is a dict, as described above
        #grouping_id = the grouping_id of all items in the list
        grouping_id, grouped_locations_list = item
        #We need to get only the RawLocation objects back from the grouped_location dict
        #match_group is the list of RawLocation objects which we call match on
        match_group = []
        first_pass = True
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if first_pass:
                first_matching_location = grouped_location["matching_location"]
                # BUG FIX: first_pass was never cleared, so the *last* member's
                # matching_location silently won; keep the first, as documented.
                first_pass = False
        """
        default is a dict containing the default values of the parameters
        (id, city, region, country, latitude, longitude)
        for all locations that are part of the same group.
        Here we set the defaults to be the values for the first entry in the grouped_locations_list
        In theory, all entries in the grouped_locations_list should have the same lat/long.
        """
        default = {"id": grouping_id, "city":first_matching_location.city,
                   "state":first_matching_location.region,
                   "country":first_matching_location.country,
                   "latitude":first_matching_location.latitude,
                   "longitude":first_matching_location.longitude}
        #No need to run match() if no matching location was found.
        if(grouping_id!="nolocationfound"):
            run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)
    # Clear the old link tables before the bulk re-insert below.
    if alchemy.is_mysql():
        alchemy_session.execute('truncate location; truncate assignee_location; truncate inventor_location;')
    else:
        alchemy_session.execute('delete from location;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_assignee;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_inventor;')
        alchemy_session.commit()
    if doctype == 'grant':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq, 'grant')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'grant')
    elif doctype == 'application':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.App_Location.__table__, alchemy.is_mysql(), commit_freq, 'application')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.App_RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'application')
    alchemy_session.commit()
    session_generator = alchemy.session_generator(dbtype=doctype)
    session = session_generator()
    session.commit()
def run_geo_match(key, default, match_group, counter, runtime, alchemy_session):
    """Record the disambiguated location for one group of raw locations.

    Currently a thin forwarder to geo_match().  `key`, `counter` and
    `runtime` are unused but kept so existing callers keep working.

    key -- grouping_id of the group (unused)
    default -- dict of canonical values (id, city, state, country,
        latitude, longitude) applied to the whole group
    match_group -- list of RawLocation objects in the group
    counter, runtime -- loop bookkeeping from the caller (unused)
    alchemy_session -- session passed through to geo_match()
    """
    # A long commented-out draft (frequency-based re-summarization of the
    # group, plus periodic commit logic) was dead code held in a no-op string
    # literal and has been removed.
    geo_match(match_group, alchemy_session, default)
# Module-level accumulators filled by geo_match(): one insert row per location
# group and one {'pk', 'update'} row per raw location.  They are flushed by
# match_grouped_locations() via the bulk commit helpers.
location_insert_statements = []
update_statements = []
- def geo_match(objects, session, default):
- freq = defaultdict(Counter)
- param = {}
- raw_objects = []
- clean_objects = []
- clean_cnt = 0
- clean_main = None
- class_type = None
- class_type = None
- for obj in objects:
- if not obj: continue
- class_type = obj.__related__
- raw_objects.append(obj)
- break
- for obj in raw_objects:
- for k, v in obj.summarize.iteritems():
- freq[k][v] += 1
- if "id" not in param:
- param["id"] = obj.uuid
- param["id"] = min(param["id"], obj.uuid)
- # create parameters based on most frequent
- for k in freq:
- if None in freq[k]:
- freq[k].pop(None)
- if "" in freq[k]:
- freq[k].pop("")
- if freq[k]:
- param[k] = freq[k].most_common(1)[0][0]
- param.update(default)
- if '?' in param['city']:
- print param['city']
- #TODO: Fix param city ?????
- location_insert_statements.append(param)
- tmpids = map(lambda x: x.id, objects)
- update_statements.extend([{'pk':x,'update':param['id']} for x in tmpids])
def clean_raw_locations_from_file(inputfilename, outputfilename):
    """Clean every raw location line of inputfilename into outputfilename.

    Lines are decoded as UTF-8, normalized with
    geoalchemy_util.clean_raw_location, re-encoded and written out.
    """
    # BUG FIX: files were opened without ever being closed; `with` guarantees
    # both handles are closed (and output flushed) even on error.
    with open(inputfilename, 'r') as inputfile:
        with open(outputfilename, 'w+') as outputfile:
            for line in inputfile:
                line = line.decode('utf8')
                line = geoalchemy_util.clean_raw_location(line)
                line = line.encode('utf8')
                outputfile.write(line)
- def analyze_input_addresses(inputfilename):
- valid_input_addresses = construct_valid_input_addresses()
- print datetime.datetime.now()
- inputfile = open(inputfilename, 'r')
- line_count=0
- good_count=0
- exists_in_all_cities_count=0
- #not_found_file = open('not_found.txt', 'w+')
- for line in inputfile:
- line = line.decode('utf8')
- input_address = geoalchemy_util.clean_raw_location(line)
- if input_address_exists(valid_input_addresses, input_address):
- good_count+=1
- #else:
- #not_found_file.write('{0}\n'.format(input_address.encode('utf8')))
- line_count+=1
- print 'All lines compared!'
- print '% good:', good_count*1.0/line_count
- print '% in all_cities:', exists_in_all_cities_count*1.0/line_count
- print datetime.datetime.now()
- def construct_valid_input_addresses():
- valid_input_addresses = set()
- temp = geo_data_session.query(RawGoogle.input_address).filter(RawGoogle.confidence>0)\
- .filter((RawGoogle.city!='') | (RawGoogle.region!=''))
- for row in temp:
- input_address = row.input_address
- valid_input_addresses.add(input_address)
- print 'List of all valid Google input_address values constructed with', len(valid_input_addresses), 'items'
- return valid_input_addresses
def input_address_exists(valid_input_addresses, input_address):
    """Return True if input_address is in the prebuilt valid-address set.

    valid_input_addresses -- set built by construct_valid_input_addresses(),
        or None if it was never constructed
    input_address -- cleaned address string to test
    """
    # BUG FIX: the original tested truthiness, so a legitimately constructed
    # but *empty* set was reported as "not constructed"; only None means that.
    if valid_input_addresses is None:
        print('Error: list of valid input addresses not constructed')
        return False
    return input_address in valid_input_addresses
- def find_difficult_locations_from_file(inputfilename, outputfilename):
- inputfile = open(inputfilename, 'r')
- outputfile = open(outputfilename, 'w+')
- t = datetime.datetime.now()
- all_japan_cities_query = geo_data_session.query(AllCities.city).filter(AllCities.country=='JP').group_by(AllCities.city).all()
- all_japan_cities = []
- for row in all_japan_cities_query:
- all_japan_cities.append(row.city)
- print 'list of all_japan_cities created', datetime.datetime.now()-t
- for line in inputfile:
- line = line.decode('utf8')
- line = geoalchemy_util.remove_eol_pattern.sub('', line)
- if line.endswith(', JP') or line.endswith(', JA'):
- city = line.split(',')[0].strip()
- most_similar_city = geoalchemy_util.get_closest_match_leven(city, all_japan_cities, 0.8)
- if most_similar_city!='':
- outputfile.write('{0}|{1}\n'.format(city.encode('utf8'), most_similar_city.encode('utf8')))
- print datetime.datetime.now()-t
if __name__=='__main__':
    # Run geocoding for both corpora in sequence.  The module-level `doctype`
    # global is set before each call because match_grouped_locations() reads
    # it directly instead of taking a parameter.
    # NOTE(review): `global` inside this top-level block is a no-op; the plain
    # assignments already rebind the module global.
    global doctype
    doctype='grant'
    main(doctype='grant')
    doctype = 'application'
    main(doctype='application')