PageRenderTime 50ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/geoalchemy.py

https://github.com/ryanwhalen/patentprocessor
Python | 447 lines | 433 code | 0 blank | 14 comment | 0 complexity | f779a7bbee029f113e4722c6f8487153 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. """
  2. Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are met:
  6. 1. Redistributions of source code must retain the above copyright notice, this
  7. list of conditions and the following disclaimer.
  8. 2. Redistributions in binary form must reproduce the above copyright notice,
  9. this list of conditions and the following disclaimer in the documentation
  10. and/or other materials provided with the distribution.
  11. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  12. ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  13. WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  14. DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  15. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  16. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  17. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  18. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  19. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  20. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. """
  22. """
  23. Performs geolocation disambiguation
  24. """
  25. import sqlalchemy
  26. import sqlalchemy.orm as orm
  27. import sqlalchemy.ext.declarative as declarative
  28. import sqlalchemy.sql.expression as expression
  29. from sqlalchemy.sql.expression import bindparam
  30. import geoalchemy_util
  31. import itertools
  32. import os
  33. import datetime
  34. import re
  35. import sys
  36. from collections import defaultdict, Counter
  37. import pandas as pd
  38. import alchemy
  39. from alchemy.match import commit_inserts, commit_updates
  40. from tasks import bulk_commit_inserts, bulk_commit_updates
# Module-level doctype flag ('grant' or 'application'); read by
# match_grouped_locations() and set by the __main__ block at the bottom of
# this file.  NOTE(review): a `global` statement at module scope is a no-op
# (module-level assignment is already global) -- kept byte-identical here.
global doctype
doctype = ''
#The config file alchemy uses to store information
alchemy_config = alchemy.get_config()
#The path to the database which holds geolocation data
# (built from the 'location' section of the alchemy config)
geo_data_dbpath = os.path.join(
    alchemy_config.get("location").get('path'),
    alchemy_config.get("location").get('database'))
geo_data_engine = sqlalchemy.create_engine('sqlite:///%s' % geo_data_dbpath)
geo_data_session_class = orm.sessionmaker(bind=geo_data_engine)
#Used to query the database that holds the data from google
#As well as a MaxMinds database containing every city in the world
geo_data_session = geo_data_session_class()
# Declarative base for the ORM mappings (RawGoogle, AllCities) below
base = declarative.declarative_base()
# How many statements to batch per commit in the bulk insert/update helpers
commit_freq = alchemy_config.get("location").get("commit_frequency")
#Stores an address disambiguated by the Google API
class RawGoogle(base):
    """
    ORM mapping for the `raw_google` table of the geolocation database:
    one row per input address that the Google geocoding API resolved.
    Rows with confidence > 0 and a non-empty city or region are treated as
    valid matches elsewhere in this module.
    """
    __tablename__ = 'raw_google'
    # "rowid" is SQLite's implicit integer primary key
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    # The cleaned address string as it was submitted to Google
    input_address = sqlalchemy.Column(sqlalchemy.String)
    city = sqlalchemy.Column(sqlalchemy.String)
    region = sqlalchemy.Column(sqlalchemy.String)
    country = sqlalchemy.Column(sqlalchemy.String)
    latitude = sqlalchemy.Column(sqlalchemy.REAL)
    longitude = sqlalchemy.Column(sqlalchemy.REAL)
    # Geocoder confidence score for this match
    confidence = sqlalchemy.Column(sqlalchemy.REAL)
    def __init__(self, input_address, city, region, country, latitude, longitude, confidence):
        self.input_address = input_address
        self.city = city
        self.region = region
        self.country = country
        self.latitude = latitude
        self.longitude = longitude
        self.confidence = confidence
    def __repr__(self):
        return "<RawGoogle('%s','%s','%s','%s','%s','%s','%s')>" % (self.input_address, self.city, self.region, self.country, self.latitude, self.longitude, self.confidence)
#One of the cities in the world as stored in the MaxMinds database
class AllCities(base):
    """
    ORM mapping for the `all_cities` table: one row per known city
    worldwide, from the MaxMind dataset.  Used as a fallback to match
    locations that the Google geocoding results could not identify.
    """
    __tablename__ = 'all_cities'
    # "rowid" is SQLite's implicit integer primary key
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    city = sqlalchemy.Column(sqlalchemy.String)
    region = sqlalchemy.Column(sqlalchemy.String)
    country = sqlalchemy.Column(sqlalchemy.String)
    latitude = sqlalchemy.Column(sqlalchemy.REAL)
    longitude = sqlalchemy.Column(sqlalchemy.REAL)
    def __init__(self, city, region, country, latitude, longitude):
        self.city = city
        self.region = region
        self.country = country
        self.latitude = latitude
        self.longitude = longitude
#Perform geocoding on the data stored in alchemy.db provided by parsing XML
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    """
    Disambiguate every raw parsed location in alchemy.db against the
    pre-geocoded Google results (raw_google) and, failing that, against the
    MaxMind all_cities table, then group and write the results back.

    limit, offset -- restrict which raw location rows are processed
    minimum_match_value -- Levenshtein similarity threshold for the
        all_cities fallback matching
    doctype -- 'grant' or 'application'; selects which schema to query
    Returns False when there are no raw locations to process.

    NOTE(review): match_grouped_locations() reads the module-level `doctype`
    global, which is only set in the __main__ block -- it is NOT updated by
    this parameter.  Calling main() directly with a doctype that differs
    from the global may mix schemas; confirm before reuse.
    """
    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    t = datetime.datetime.now()
    print "geocoding started", t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    valid_input_addresses = construct_valid_input_addresses()
    #Get all of the raw locations in alchemy.db that were parsed from XML
    if doctype == 'grant':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset)
    elif doctype == 'application':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset)
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations.count() == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    identified_grouped_locations = []
    unidentified_grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(valid_input_addresses, cleaned_location):
            matching_location = geo_data_session.query(RawGoogle).filter(
                RawGoogle.input_address==cleaned_location).first()
            #Locations sharing coordinates end up in the same group
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
            identified_grouped_locations.append({"raw_location": instance,
                                                 "matching_location": matching_location,
                                                 "grouping_id": grouping_id})
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            #Sort the locations by their country
            country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
            unidentified_grouped_locations.append({"raw_location": instance,
                                                   "cleaned_location": cleaned_location,
                                                   "country": country})
    print "locations grouped", datetime.datetime.now() - t
    print 'count of identified locations:', len(identified_grouped_locations)
    t = datetime.datetime.now()
    #We now have two lists of locations. First, consider the unmatched locations.
    keyfunc = lambda x:x["country"]
    #Sort the list by the country (groupby requires input sorted on its key)
    unidentified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #country
    unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc))
    #Identify the correct location for each entry by comparing to all_cities
    identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t)
    print 'new count of identified locations:', len(identified_grouped_locations)
    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Perform a quickfix to correct state names
    geoalchemy_util.fix_state_abbreviations(identified_grouped_locations);
    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    identified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc))
    print "identified_grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #Match the locations
    match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session)
    alchemy_session.commit()
    print "Matches made!", datetime.datetime.now() - t
    if doctype == 'grant':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).all()
    elif doctype == 'application':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).all()
    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations.count())
  179. #Identify locations that the Google disambiguation couldn't resolve
  180. def identify_missing_locations(unidentified_grouped_locations_enum,
  181. identified_grouped_locations,
  182. minimum_match_value, t):
  183. #For each group of locations with the same country
  184. for i, item in unidentified_grouped_locations_enum:
  185. country, grouped_locations_list = item
  186. #Get a list of all cities that exist anywhere in that country
  187. all_cities_in_country = geo_data_session.query(AllCities.city, AllCities.region).filter_by(country=country)
  188. #Construct a name for each location that matches the normal cleaned location format
  189. all_cities_in_country = [geoalchemy_util.concatenate_location(x.city,
  190. x.region if geoalchemy_util.region_is_a_state(x.region) else '',
  191. country) for x in all_cities_in_country]
  192. #For each location found in this country, find its closest match
  193. #among the list of all cities from that country
  194. for grouped_location in grouped_locations_list:
  195. cleaned_location = grouped_location["cleaned_location"]
  196. closest_match = geoalchemy_util.get_closest_match_leven(cleaned_location, all_cities_in_country, minimum_match_value)
  197. #If no match was found or only the trivial match
  198. if closest_match=='' or closest_match==country:
  199. continue
  200. #If we have a good match, add it to the list of matched locations
  201. closest_match_split = re.split(",",closest_match)
  202. city = closest_match_split[0].strip()
  203. if len(closest_match_split)==3:
  204. region = closest_match_split[1].strip()
  205. country = closest_match_split[2].strip()
  206. matching_location = geo_data_session.query(AllCities).filter_by(city=city, region=region, country=country).first()
  207. else:
  208. country = closest_match_split[1].strip()
  209. matching_location = geo_data_session.query(AllCities).filter_by(city=city, country=country).first()
  210. if not matching_location:
  211. print 'Warning: all_cities match attempt failed for', cleaned_location.encode('utf8'), 'location not found'
  212. grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
  213. raw_location = grouped_location["raw_location"]
  214. identified_grouped_locations.append({"raw_location": raw_location,
  215. "matching_location": matching_location,
  216. "grouping_id": grouping_id})
  217. print 'all_cities found additional location for', raw_location
  218. """
  219. looks like there are not enough records being put into "update_statements". When I look at the rawlocation
  220. table and look for how many records in there have null location_ids, there are *far* too many.
  221. I'm looking at totalmatchgroups, which is the number of records that get update_statements generated,
  222. externalmatchgroups, which is the number of records that go into the process. I should also look
  223. at identified_grouped_locations_enum, which is a collection of all the grouped locations.
  224. """
  225. def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
  226. if alchemy.is_mysql():
  227. alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
  228. alchemy_session.commit()
  229. for i, item in identified_grouped_locations_enum:
  230. #grouped_locations_list = a list of every grouped location with the same grouping_id
  231. # Note that a grouped_location is a dict, as described above
  232. #grouping_id = the grouping_id of all items in the list
  233. grouping_id, grouped_locations_list = item
  234. #We need to get only the RawLocation objects back from the grouped_location dict
  235. #match_group is the list of RawLocation objects which we call match on
  236. match_group = []
  237. first_pass=True
  238. for grouped_location in grouped_locations_list:
  239. match_group.append(grouped_location["raw_location"])
  240. if(first_pass):
  241. first_matching_location = grouped_location["matching_location"]
  242. """
  243. default is a dict containing the default values of the parameters
  244. (id, city, region, country, latitude, longtidue)
  245. for all locations that are part of the same group.
  246. Here we set the defaults to be the values for the first entry in the grouped_locations_list
  247. In theory, all entries in the grouped_locations_list should have the same lat/long.
  248. """
  249. default = {"id": grouping_id, "city":first_matching_location.city,
  250. "state":first_matching_location.region,
  251. "country":first_matching_location.country,
  252. "latitude":first_matching_location.latitude,
  253. "longitude":first_matching_location.longitude}
  254. #No need to run match() if no matching location was found.
  255. if(grouping_id!="nolocationfound"):
  256. run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)
  257. if alchemy.is_mysql():
  258. alchemy_session.execute('truncate location; truncate assignee_location; truncate inventor_location;')
  259. else:
  260. alchemy_session.execute('delete from location;')
  261. alchemy_session.commit()
  262. alchemy_session.execute('delete from location_assignee;')
  263. alchemy_session.commit()
  264. alchemy_session.execute('delete from location_inventor;')
  265. alchemy_session.commit()
  266. if doctype == 'grant':
  267. bulk_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq, 'grant')
  268. bulk_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'grant')
  269. elif doctype == 'application':
  270. bulk_commit_inserts(location_insert_statements, alchemy.schema.App_Location.__table__, alchemy.is_mysql(), commit_freq, 'application')
  271. bulk_commit_updates('location_id', update_statements, alchemy.schema.App_RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'application')
  272. alchemy_session.commit()
  273. session_generator = alchemy.session_generator(dbtype=doctype)
  274. session = session_generator()
  275. session.commit()
  276. def run_geo_match(key, default, match_group, counter, runtime, alchemy_session):
  277. most_freq = 0
  278. #If there is more than one key, we need to figure out what attributes
  279. #(city, region, country, latitude, longitude) to assign the group
  280. """
  281. if len(match_group) > 1:
  282. # if key exists, look at the frequency
  283. # to determine the default summarization
  284. clean = alchemy_session.query(alchemy.schema.Location).filter(alchemy.schema.Location.id == key).first()
  285. if clean:
  286. param = clean.summarize
  287. param.pop("id")
  288. param.pop("latitude")
  289. param.pop("longitude")
  290. loc = alchemy_session.query(alchemy.schema.RawLocation)\
  291. .filter(alchemy.schema.RawLocation.city == param["city"])\
  292. .filter(alchemy.schema.RawLocation.state == param["state"])\
  293. .filter(alchemy.schema.RawLocation.country == param["country"])\
  294. .first()
  295. if loc:
  296. most_freq = len(loc.rawassignees) + len(loc.rawinventors)
  297. default.update(param)
  298. # took a look at the frequency of the items in the match_group
  299. for loc in match_group:
  300. freq = len(loc.rawassignees) + len(loc.rawinventors)
  301. if freq > most_freq:
  302. default.update(loc.summarize)
  303. most_freq = freq"""
  304. geo_match(match_group, alchemy_session, default)
  305. #if (counter + 1) % alchemy_config.get("location").get("commit_frequency") == 0:
  306. # print " *", (counter + 1), datetime.datetime.now() - runtime
  307. # alchemy_session.commit()
  308. location_insert_statements = []
  309. update_statements = []
  310. def geo_match(objects, session, default):
  311. freq = defaultdict(Counter)
  312. param = {}
  313. raw_objects = []
  314. clean_objects = []
  315. clean_cnt = 0
  316. clean_main = None
  317. class_type = None
  318. class_type = None
  319. for obj in objects:
  320. if not obj: continue
  321. class_type = obj.__related__
  322. raw_objects.append(obj)
  323. break
  324. for obj in raw_objects:
  325. for k, v in obj.summarize.iteritems():
  326. freq[k][v] += 1
  327. if "id" not in param:
  328. param["id"] = obj.uuid
  329. param["id"] = min(param["id"], obj.uuid)
  330. # create parameters based on most frequent
  331. for k in freq:
  332. if None in freq[k]:
  333. freq[k].pop(None)
  334. if "" in freq[k]:
  335. freq[k].pop("")
  336. if freq[k]:
  337. param[k] = freq[k].most_common(1)[0][0]
  338. param.update(default)
  339. if '?' in param['city']:
  340. print param['city']
  341. #TODO: Fix param city ?????
  342. location_insert_statements.append(param)
  343. tmpids = map(lambda x: x.id, objects)
  344. update_statements.extend([{'pk':x,'update':param['id']} for x in tmpids])
  345. def clean_raw_locations_from_file(inputfilename, outputfilename):
  346. inputfile = open(inputfilename, 'r')
  347. outputfile = open(outputfilename, 'w+')
  348. for line in inputfile:
  349. line = line.decode('utf8')
  350. line = geoalchemy_util.clean_raw_location(line)
  351. line = line.encode('utf8')
  352. outputfile.write(line)
  353. def analyze_input_addresses(inputfilename):
  354. valid_input_addresses = construct_valid_input_addresses()
  355. print datetime.datetime.now()
  356. inputfile = open(inputfilename, 'r')
  357. line_count=0
  358. good_count=0
  359. exists_in_all_cities_count=0
  360. #not_found_file = open('not_found.txt', 'w+')
  361. for line in inputfile:
  362. line = line.decode('utf8')
  363. input_address = geoalchemy_util.clean_raw_location(line)
  364. if input_address_exists(valid_input_addresses, input_address):
  365. good_count+=1
  366. #else:
  367. #not_found_file.write('{0}\n'.format(input_address.encode('utf8')))
  368. line_count+=1
  369. print 'All lines compared!'
  370. print '% good:', good_count*1.0/line_count
  371. print '% in all_cities:', exists_in_all_cities_count*1.0/line_count
  372. print datetime.datetime.now()
  373. def construct_valid_input_addresses():
  374. valid_input_addresses = set()
  375. temp = geo_data_session.query(RawGoogle.input_address).filter(RawGoogle.confidence>0)\
  376. .filter((RawGoogle.city!='') | (RawGoogle.region!=''))
  377. for row in temp:
  378. input_address = row.input_address
  379. valid_input_addresses.add(input_address)
  380. print 'List of all valid Google input_address values constructed with', len(valid_input_addresses), 'items'
  381. return valid_input_addresses
  382. def input_address_exists(valid_input_addresses, input_address):
  383. if valid_input_addresses:
  384. return input_address in valid_input_addresses
  385. else:
  386. print 'Error: list of valid input addresses not constructed'
  387. return False
  388. def find_difficult_locations_from_file(inputfilename, outputfilename):
  389. inputfile = open(inputfilename, 'r')
  390. outputfile = open(outputfilename, 'w+')
  391. t = datetime.datetime.now()
  392. all_japan_cities_query = geo_data_session.query(AllCities.city).filter(AllCities.country=='JP').group_by(AllCities.city).all()
  393. all_japan_cities = []
  394. for row in all_japan_cities_query:
  395. all_japan_cities.append(row.city)
  396. print 'list of all_japan_cities created', datetime.datetime.now()-t
  397. for line in inputfile:
  398. line = line.decode('utf8')
  399. line = geoalchemy_util.remove_eol_pattern.sub('', line)
  400. if line.endswith(', JP') or line.endswith(', JA'):
  401. city = line.split(',')[0].strip()
  402. most_similar_city = geoalchemy_util.get_closest_match_leven(city, all_japan_cities, 0.8)
  403. if most_similar_city!='':
  404. outputfile.write('{0}|{1}\n'.format(city.encode('utf8'), most_similar_city.encode('utf8')))
  405. print datetime.datetime.now()-t
  406. if __name__=='__main__':
  407. global doctype
  408. doctype='grant'
  409. main(doctype='grant')
  410. doctype = 'application'
  411. main(doctype='application')