PageRenderTime 65ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/enrichment/legacy/alchemyapi/AlchemyEntityGeoCleanser.java

https://github.com/IKANOW/Infinit.e
Java | 629 lines | 398 code | 113 blank | 118 comment | 160 complexity | 2104d074580c4334434c3d2003ddd1f9 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi;
  17. import java.lang.reflect.Type;
  18. import java.net.UnknownHostException;
  19. import java.util.ArrayList;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.LinkedList;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Set;
  26. import java.util.regex.Matcher;
  27. import java.util.regex.Pattern;
  28. import org.apache.commons.lang.WordUtils;
  29. import com.mongodb.BasicDBObject;
  30. import com.mongodb.DBCollection;
  31. import com.mongodb.DBCursor;
  32. import com.mongodb.DBObject;
  33. import com.mongodb.MongoException;
  34. import com.google.gson.Gson;
  35. import com.google.gson.reflect.TypeToken;
  36. import com.ikanow.infinit.e.data_model.store.MongoDbManager;
  37. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  38. import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
  39. import com.ikanow.infinit.e.data_model.store.feature.geo.GeoFeaturePojo;
  40. //______________________________________________________________________________________
  41. public class AlchemyEntityGeoCleanser {
  42. // Stats:
  43. private int _nDocusModified = 0;
  44. private int _nDocusProcessed = 0;
  45. private int _nStayedWithOriginal = 0;
  46. private int _nMovedToRegion = 0;
  47. private int _nMovedToLargeCity = 0;
  48. private int _nMovedToForeignCity = 0;
  49. public int getDocusModified() { return _nDocusModified; }
  50. public int getDocusProcessed() { return _nDocusProcessed; }
  51. public int getStayedWithOriginal() { return _nStayedWithOriginal; }
  52. public int getMovedToRegion() { return _nMovedToRegion; }
  53. public int getMovedToLargeCity() { return _nMovedToLargeCity; }
  54. public int getMovedToForeignCity() { return _nMovedToForeignCity; }
  55. // Debug:
  56. private int _nDebugLevel = 0;
  57. public void setDebugLevel(int nDebugLevel) { //1==replacements, 2=feeds/candidate entities, 3=entities, 4=decomposition
  58. _nDebugLevel = nDebugLevel;
  59. }
  60. //______________________________________________________________________________________
  61. // Processing code
  62. //______________________________________________________________________________________
  63. // Top level logic
  64. // For running remotely
  65. // For cleaning local feeds, just call cleansePeopleInDocu(feed)
  66. // Host/Port - obvious
  67. // HexSlice - sub-samples somewhat efficiently, on last specified digits of _id
  68. // userQuery - lets the calling function decide what data to run on (probably for debugging)
  69. // nLimit - the max number of entries returned (for debugging)
  70. // bAlterDB - writes the results back to the DB (else it's just for debugging)
  71. public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB)
  72. throws NumberFormatException, UnknownHostException, MongoException
  73. {
  74. // Initialization (regexes and stuff)
  75. this.initialize();
  76. // Launch MongoDB query
  77. BasicDBObject query = userQuery;
  78. if (null == query) {
  79. new BasicDBObject();
  80. }
  81. // Just get the entity list out to save a few CPU cycles
  82. BasicDBObject outFields = new BasicDBObject();
  83. outFields.append(DocumentPojo.entities_, 1);
  84. outFields.append(DocumentPojo.url_, 1); // (help with debugging)
  85. outFields.append(DocumentPojo.title_, 1); // (help with debugging)
  86. DBCursor dbc = null;
  87. if (nLimit > 0) {
  88. dbc = _docsDB.find(query, outFields).limit(nLimit).skip(nSkip);
  89. }
  90. else { // Everything!
  91. dbc = _docsDB.find(query, outFields).skip(nSkip);
  92. }
  93. // Create POJO array of documents (definitely not the most efficient, but
  94. // will make integration with the harvester easier)
  95. List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
  96. // Loop over array and invoke the cleansing function for each one
  97. for (DocumentPojo docu: docus) {
  98. if (this.cleanseGeoInDocu(docu)) {
  99. this._nDocusModified++;
  100. if (bAlterDB) {
  101. BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_,
  102. (DBObject)com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities())));
  103. BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0);
  104. // Overwrite the existing entities list with the new one
  105. _docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true);
  106. // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it)
  107. }//TESTED
  108. }
  109. this._nDocusProcessed++;
  110. }
  111. }
  112. //________________________________________________
  113. // Initialization variables
  114. private DBCollection _docsDB = null;
  115. private DBCollection _georefDB = null;
  116. private static final String _stateList =
  117. "Alabama|Alaska|American Samoa|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|D\\.C\\.|District of Columbia|Florida|Georgia|Guam|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Northern Marianas Islands|Ohio|Oklahoma|Oregon|Pennsylvania|Puerto Rico|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Virgin Islands|Washington|West Virginia|Wisconsin|Wyoming";
  118. private Pattern _statesRegex = null;
  119. private static final String _abbrStateList = "(?:m\\.d|n\\.j|n.m|conn|mich|al\\.|d\\.c|vt|calif|wash\\.|ore\\.|ind\\.)\\.?";
  120. private Pattern _abbrStateRegex = null;
  121. //________________________________________________
  122. // Initialization code
  123. // Call with null/null to act on local objects vs fetching them from the DB
  124. public void initialize() throws NumberFormatException, UnknownHostException, MongoException {
  125. // MongoDB
  126. _docsDB = MongoDbManager.getDocument().getMetadata();
  127. _georefDB = MongoDbManager.getFeature().getGeo();
  128. // Regex of US states
  129. _statesRegex = Pattern.compile(_stateList);
  130. _abbrStateRegex = Pattern.compile(_abbrStateList);
  131. }
  132. //________________________________________________
  133. // Inner loop processing logic
  134. public static class Candidate {
  135. EntityPojo entity;
  136. LinkedList<GeoFeaturePojo> candidates;
  137. String state;
  138. Candidate(EntityPojo ent, LinkedList<GeoFeaturePojo> cands, String st)
  139. { entity = ent; candidates = cands; state = st; }
  140. }
  141. public boolean cleanseGeoInDocu(DocumentPojo doc) {
  142. boolean bChangedAnything = false;
  143. Map<String, Candidate> dubiousLocations = new HashMap<String, Candidate>();
  144. Set<String> otherRegions = new HashSet<String>();
  145. Set<String> otherCountries = new HashSet<String>();
  146. Set<String> otherCountriesOrRegionsReferenced = new HashSet<String>();
  147. //Debug
  148. if (_nDebugLevel >= 2) {
  149. System.out.println("+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getEntities().size());
  150. }
  151. // 1] First off, let's find anything location-based and also determine if it's bad or not
  152. if (null != doc.getEntities()) for (EntityPojo ent: doc.getEntities()) {
  153. boolean bStrongCandidate = false;
  154. // People: decompose names
  155. if (EntityPojo.Dimension.Where == ent.getDimension()) {
  156. // So locations get disambiguated to one of:
  157. // "<city-etc>, <region-or-country>", or "<region-or-country>"
  158. // though can also just be left as they are.
  159. String sActualName = ent.getActual_name().toLowerCase();
  160. if (!ent.getDisambiguatedName().toLowerCase().equals(sActualName)) {
  161. // It's been disambiguated
  162. //Debug
  163. if (_nDebugLevel >= 3) {
  164. System.out.println("disambiguous candidate: " + ent.getDisambiguatedName() + " VS " + ent.getActual_name()
  165. + " (" + ((null!=ent.getSemanticLinks())?ent.getSemanticLinks().size():0) + ")"
  166. );
  167. }
  168. // OK next step, is it a disambiguation to a US town?
  169. String splitMe[] = ent.getDisambiguatedName().split(", ");
  170. if (2 == splitMe.length) {
  171. String stateOrCountry = splitMe[1];
  172. Matcher m = _statesRegex.matcher(stateOrCountry);
  173. if (m.find()) { // This is a US disambiguation - high risk case
  174. // Short cut if state is already directly mentioned?
  175. stateOrCountry = stateOrCountry.toLowerCase();
  176. if (!otherRegions.contains(stateOrCountry)) { // See list below - no need to go any further
  177. // OK next step - is it a possible ambiguity:
  178. ArrayList<BasicDBObject> x = new ArrayList<BasicDBObject>();
  179. BasicDBObject inner0_0 = new BasicDBObject(MongoDbManager.not_, Pattern.compile("US"));
  180. BasicDBObject inner1_0 = new BasicDBObject("country_code", inner0_0);
  181. x.add(inner1_0);
  182. BasicDBObject inner0_1 = new BasicDBObject(MongoDbManager.gte_, 400000);
  183. BasicDBObject inner1_1 = new BasicDBObject("population", inner0_1);
  184. x.add(inner1_1);
  185. BasicDBObject dbo = new BasicDBObject();
  186. dbo.append("search_field", sActualName);
  187. dbo.append(MongoDbManager.or_, x);
  188. DBCursor dbc = _georefDB.find(dbo);
  189. if (dbc.size() >= 1) { // Problems!
  190. //Create list of candidates
  191. Type listType = new TypeToken<LinkedList<GeoFeaturePojo>>() {}.getType();
  192. LinkedList<GeoFeaturePojo> grpl = new Gson().fromJson(dbc.toArray().toString(), listType);
  193. //Debug
  194. if (_nDebugLevel >= 2) {
  195. System.out.println("\tERROR CANDIDATE: " + ent.getDisambiguatedName() + " VS " + ent.getActual_name()
  196. + " (" + dbc.count() + ")");
  197. if (_nDebugLevel >= 3) {
  198. for (GeoFeaturePojo grp: grpl) {
  199. System.out.println("\t\tCandidate:" + grp.getCity() + " / " + grp.getRegion() + " / " + grp.getCountry());
  200. }
  201. }
  202. }
  203. Candidate candidate = new Candidate(ent, grpl, stateOrCountry);
  204. dubiousLocations.put(ent.getIndex(), candidate);
  205. bStrongCandidate = true;
  206. } // if strong candidate
  207. }//TESTED ("reston, virginia" after "virginia/stateorcounty" mention)
  208. // (end if can't shortcut past all this)
  209. } // end if a US town
  210. } // end if in the format "A, B"
  211. } // if weak candidate
  212. //TESTED
  213. if (!bStrongCandidate) { // Obv can't count on a disambiguous candidate:
  214. String type = ent.getType().toLowerCase();
  215. if (type.equals("stateorcounty")) {
  216. String disName = ent.getDisambiguatedName().toLowerCase();
  217. if (_abbrStateRegex.matcher(disName).matches()) {
  218. otherRegions.add(getStateFromAbbr(disName));
  219. }
  220. else {
  221. otherRegions.add(ent.getDisambiguatedName().toLowerCase());
  222. }
  223. otherCountriesOrRegionsReferenced.add("united states");
  224. }//TESTED: "mich./stateorcounty"
  225. else if (type.equals("country")) {
  226. String disName = ent.getDisambiguatedName().toLowerCase();
  227. // Translation of known badly transcribed countries:
  228. // (England->UK)
  229. if (disName.equals("england")) {
  230. otherCountries.add("united kingdom");
  231. }//TESTED
  232. else {
  233. otherCountries.add(ent.getDisambiguatedName().toLowerCase());
  234. }
  235. }
  236. else if (type.equals("region")) {
  237. otherRegions.add(ent.getDisambiguatedName().toLowerCase());
  238. }
  239. else if (type.equals("city")) {
  240. String splitMe[] = ent.getDisambiguatedName().split(",\\s*");
  241. if (2 == splitMe.length) {
  242. otherCountriesOrRegionsReferenced.add(splitMe[1].toLowerCase());
  243. if (this._statesRegex.matcher(splitMe[1]).find()) {
  244. otherCountriesOrRegionsReferenced.add("united states");
  245. }//TESTED: "lexingon, kentucky/city"
  246. }
  247. }
  248. }//TESTED: just above clauses
  249. } // if location
  250. } // (end loop over entities)
  251. // Debug:
  252. if ((_nDebugLevel >= 3) && (!dubiousLocations.isEmpty())) {
  253. for (String s: otherRegions) {
  254. System.out.println("Strong region: " + s);
  255. }
  256. for (String s: otherCountries) {
  257. System.out.println("Strong countries: " + s);
  258. }
  259. for (String s: otherCountriesOrRegionsReferenced) {
  260. System.out.println("Weak regionscountries: " + s);
  261. }
  262. }
  263. // 2] The requirements and algorithm are discussed in
  264. // http://ikanow.jira.com/wiki/display/INF/Beta...+improving+AlchemyAPI+extraction+%28geo%29
  265. // Canonical cases:
  266. // Darfur -> Darfur, MN even though Sudan and sometimes Darfur, Sudan are present
  267. // Shanghai -> Shanghai, WV even though China is mentioned (and not WV)
  268. // Manchester -> Manchester village, NY (not Manchester, UK)
  269. // Philadelphia -> Philadelphia (village), NY (though NY is mentioned and not PA)
  270. // We're generating the following order
  271. // 10] Sitting tenant with strong direct
  272. // 15] Large city with strong direct
  273. // 20] Region with direct
  274. // 30] Large city with strong indirect
  275. // 40] Sitting tenant with strong indirect
  276. // 50] Region with indirect
  277. // 60] Another foreign possibility with strong direct
  278. // 70] Large city with weak direct
  279. // 72] Large city with weak indirect
  280. // 75] Large city with no reference
  281. // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant)
  282. // 80] Sitting tenant with any weak (US) direct or indirect
  283. // 90] Another foreign possibility with strong indirect
  284. // 100] Another foreign possibility with weak direct
  285. // 110] Another foreign possibility with weak indirect
  286. // 120] Region with no reference, if there is only 1
  287. // 130] Sitting tenant with none of the above (ie default)
  288. // 140] Anything else!
  289. for (Map.Entry<String, Candidate> pair: dubiousLocations.entrySet()) {
  290. EntityPojo ent = pair.getValue().entity;
  291. Candidate candidate = pair.getValue();
  292. // 2.1] Let's analyse the "sitting tenant"
  293. int nPrio = 130;
  294. GeoFeaturePojo currLeader = null;
  295. int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other)
  296. if (otherRegions.contains(candidate.state)) { // Strong direct ref, winner!
  297. nPrio = 10; // winner!
  298. }//TESTED: "san antonio, texas/city" vs "texas"
  299. else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) {
  300. // Indirect ref
  301. nPrio = 40; // good, but beatable...
  302. }//TESTED: "philadelphia (village), new york/city"
  303. else if (otherCountries.contains("united states")) { // Weak direct ref
  304. nPrio = 80; // better than nothing...
  305. }//TESTED: "apache, oklahoma/city"
  306. else if (otherCountriesOrRegionsReferenced.contains("united states")) { // Weak indirect ref
  307. nPrio = 80; // better than nothing...
  308. }//TESTED: "washington, d.c." have DC as stateorcounty, but US in countries list
  309. // Special case: we don't like "village":
  310. if ((80 != nPrio) && ent.getDisambiguatedName().contains("village") && !ent.getActual_name().contains("village"))
  311. {
  312. nPrio = 80;
  313. }//TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia"
  314. // Debug
  315. if (_nDebugLevel >= 2) {
  316. System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio);
  317. }
  318. // Alternatives
  319. if (nPrio > 10) {
  320. LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates;
  321. for (GeoFeaturePojo geo: geos) {
  322. int nAltPrio = 140;
  323. int nAltCase = -1;
  324. String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null;
  325. String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null;
  326. String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null;
  327. // 2.2] CASE 1: I'm a city with pop > 1M (best score 15)
  328. // 15] Large city with strong direct
  329. // 30] Large city with strong indirect
  330. // 70] Large city with weak direct
  331. // 72] Large city with weak indirect
  332. // 75] Large city with no reference
  333. if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) {
  334. nAltCase = 1;
  335. if ((null != region) && (otherRegions.contains(region))) {
  336. nAltPrio = 15; // strong direct
  337. }//TESTED: "dallas / Texas / United States = 15"
  338. else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
  339. nAltPrio = 30; // strong indirect
  340. }//TESTED: "sacramento / California / United State"
  341. else if ((null != country) && (otherCountries.contains(country))) {
  342. nAltPrio = 70; // weak direct
  343. }//TESTED: "berlin, germany", with "germany" directly mentioned
  344. else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
  345. nAltPrio = 72; // weak indirect
  346. }//TESTED: "los angeles / California / United States = 72"
  347. else {
  348. nAltPrio = 75; // just for being big!
  349. }//TESTED: "barcelona, spain"
  350. }
  351. // 2.3] CASE 2: I'm a region (best score=20, can beat current score)
  352. // 20] Region with direct
  353. // 50] Region with indirect
  354. // 120] Region with no reference, if there is only 1
  355. else if ((null == city) && (nPrio > 20)) {
  356. nAltCase = 2;
  357. if ((null != country) && (otherCountries.contains(country))) {
  358. nAltPrio = 20; // strong direct
  359. }//TESTED: (region) "Berlin, Germany" with "Germany" mentioned
  360. else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
  361. nAltPrio = 50; // strong indirect
  362. }//(haven't seen, but we'll live)
  363. else {
  364. nAltPrio = 120; // (just for being there)
  365. }//TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China"
  366. }
  367. // 2.4] CASE 3: I'm any foreign possibility (best score=60)
  368. // 60] Another foreign possibility with strong direct
  369. // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant)
  370. // 90] Another foreign possibility with strong indirect
  371. // 100] Another foreign possibility with weak direct
  372. // 110] Another foreign possibility with weak indirect
  373. else if (nPrio > 60) {
  374. nAltCase = 3;
  375. if ((null != region) && (otherRegions.contains(region))) {
  376. nAltPrio = 60; // strong direct
  377. // Double check we're not falling into the trap below:
  378. if (!geo.getCountry_code().equals("US")) {
  379. Matcher m = this._statesRegex.matcher(geo.getRegion());
  380. if (m.matches()) { // non US state matching against (probably) US state, disregard)
  381. nAltPrio = 140;
  382. }
  383. }//TESTED (same clause as below)
  384. }//TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause)
  385. if (nAltPrio > 60) { // (may need to re-run test)
  386. if ((null != country) && (otherCountries.contains(country))) {
  387. if (geo.getPopulation() < 100000) {
  388. nAltPrio = 90; // strong indirect
  389. } //TESTED: "washington / Villa Clara / Cuba"
  390. else {
  391. nAltPrio = 78; // strong indirect, with boost!
  392. } //TESTED: "geneva, Geneve, Switzerland", pop 180K
  393. }
  394. else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
  395. nAltPrio = 100; // weak direct
  396. }//TESTED: "lincoln / Lincolnshire / United Kingdom = 100"
  397. else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
  398. nAltPrio = 110; // weak indirect
  399. }//(haven't seen, but we'll live)
  400. }
  401. }
  402. // Debug:
  403. if ((_nDebugLevel >= 2) && (nAltPrio < 140)) {
  404. System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / " + geo.getCountry() + " score=" + nAltPrio);
  405. }
  406. // Outcome of results:
  407. if (nAltPrio < nPrio) {
  408. currLeader = geo;
  409. nPrio = nAltPrio;
  410. nCase = nAltCase;
  411. }
  412. } // end loop over alternativse
  413. if (null != currLeader) { // Need to change
  414. if (1 == nCase) {
  415. this._nMovedToLargeCity++;
  416. //(Cities are lower case in georef DB for some reason)
  417. String city = WordUtils.capitalize(currLeader.getCity());
  418. if (currLeader.getCountry_code().equals("US")) { // Special case: is this just the original?
  419. String region = currLeader.getRegion();
  420. if (region.equals("District of Columbia")) { // Special special case
  421. region = "D.C.";
  422. }
  423. String sCandidate = city + ", " + region;
  424. if (!sCandidate.equals(ent.getDisambiguatedName())) {
  425. ent.setDisambiguatedName(sCandidate);
  426. ent.setIndex(ent.getDisambiguatedName() + "/city");
  427. ent.setSemanticLinks(null);
  428. bChangedAnything = true;
  429. }//TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. -> Wash DC)
  430. else {
  431. this._nMovedToLargeCity--;
  432. _nStayedWithOriginal++;
  433. }//TESTED ("Washington DC", "San Juan, Puerto Rico")
  434. }//TESTED (see above)
  435. else {
  436. ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
  437. ent.setIndex(ent.getDisambiguatedName() + "/city");
  438. ent.setSemanticLinks(null);
  439. bChangedAnything = true;
  440. }//TESTED: "london, california/city to London, United Kingdom"
  441. }
  442. else if (2 == nCase) {
  443. this._nMovedToRegion++;
  444. ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry());
  445. ent.setIndex(ent.getDisambiguatedName() + "/region");
  446. ent.setSemanticLinks(null);
  447. bChangedAnything = true;
  448. }//TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above)
  449. else {
  450. //(Cities are lower case in georef DB for some reason)
  451. String city = WordUtils.capitalize(currLeader.getCity());
  452. this._nMovedToForeignCity++;
  453. ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
  454. ent.setIndex(ent.getDisambiguatedName() + "/city");
  455. ent.setSemanticLinks(null);
  456. bChangedAnything = true;
  457. }//TESTED: "Moved geneva, new york/city to Geneva, Switzerland"
  458. if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) {
  459. System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName());
  460. }
  461. }
  462. else {
  463. _nStayedWithOriginal++;
  464. }
  465. } // (if sitting tenant not holder)
  466. } // (end loop over candidates)
  467. if ((_nDebugLevel >= 1) && bChangedAnything) {
  468. System.out.println("\t(((Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getUrl() + ")))");
  469. }
  470. return bChangedAnything;
  471. }
  472. /////////////////////////////////////////////////////////////////////////////////////////////
  473. // Utility: state abbrievations:
  474. private static String getStateFromAbbr(String s) {
  475. if (s.endsWith(".")) {
  476. s = s.substring(0, s.length() - 1);
  477. }
  478. if (s.equals("m.d")) {
  479. s = "maryland";
  480. }
  481. else if (s.equals("n.m")) {
  482. s = "new mexico";
  483. }
  484. else if (s.equals("conn")) {
  485. s = "connecticut";
  486. }
  487. else if (s.equals("mich")) {
  488. s = "michigan";
  489. }
  490. else if (s.equals("n.j")) {
  491. s = "new jersey";
  492. }
  493. else if (s.equals("al")) {
  494. s = "alabama";
  495. }
  496. else if (s.equals("d.c")) {
  497. s = "district of columbia";
  498. }
  499. else if (s.equals("vt")) {
  500. s = "vermont";
  501. }
  502. else if (s.equals("calif")) {
  503. s = "california";
  504. }
  505. else if (s.equals("wash")) {
  506. s = "washington";
  507. }
  508. else if (s.equals("ore")) {
  509. s = "oregon";
  510. }
  511. else if (s.equals("ind")) {
  512. s = "indiana";
  513. }
  514. return s;
  515. }
  516. }