PageRenderTime 7092ms CodeModel.GetById 31ms RepoModel.GetById 2ms app.codeStats 0ms

/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/enrichment/legacy/alchemyapi/AlchemyEntityPersonCleanser.java

https://github.com/IKANOW/Infinit.e
Java | 877 lines | 563 code | 150 blank | 164 comment | 191 complexity | 183cbdca75a5f90805d9fc174ff468c5 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi;
  17. import java.io.PrintStream;
  18. import java.net.UnknownHostException;
  19. import java.util.HashMap;
  20. import java.util.HashSet;
  21. import java.util.LinkedList;
  22. import java.util.List;
  23. import java.util.Map;
  24. import java.util.Set;
  25. import java.util.regex.Matcher;
  26. import java.util.regex.Pattern;
  27. import com.mongodb.BasicDBObject;
  28. import com.mongodb.DBCollection;
  29. import com.mongodb.DBCursor;
  30. import com.mongodb.DBObject;
  31. import com.mongodb.MongoException;
  32. import com.google.gson.Gson;
  33. import com.ikanow.infinit.e.data_model.store.MongoDbManager;
  34. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  35. import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
  36. //______________________________________________________________________________________
  37. public class AlchemyEntityPersonCleanser {
  38. // Stats:
  39. private int _nDeduplications = 0;
  40. public int getDeduplications() { return _nDeduplications; }
  41. private int _nOneWordAssignments = 0;
  42. public int getOneWordAssignments() { return _nOneWordAssignments; }
  43. private int _nOneWordConversions = 0; // (a subset of the above)
  44. public int getOneWordConversions() { return _nOneWordConversions; }
  45. private int _nOneWordDeletions = 0;
  46. public int getOneWordDeletions() { return _nOneWordDeletions; }
  47. private int _nDocusModified = 0;
  48. public int getDocusModified() { return _nDocusModified; }
  49. private int _nDocusProcessed = 0;
  50. public int getDocusProcessed() { return _nDocusProcessed; }
  51. // Debug:
  52. private int _nDebugLevel = 0;
  53. public void setDebugLevel(int nDebugLevel) { //1==replacements, 2=feeds/candidate entities, 3=entities, 4=decomposition
  54. _nDebugLevel = nDebugLevel;
  55. }
  56. //______________________________________________________________________________________
  57. // Processing code
  58. //______________________________________________________________________________________
  59. // Top level logic
  60. // For running remotely
  61. // For cleaning local feeds, just call cleansePeopleInDocu(feed)
  62. // Host/Port - obvious
  63. // HexSlice - sub-samples somewhat efficiently, on last specified digits of _id
  64. // userQuery - lets the calling function decide what data to run on (probably for debugging)
  65. // nLimit - the max number of entries returned (for debugging)
  66. // bAlterDB - writes the results back to the DB (else it's just for debugging)
  67. public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB)
  68. throws NumberFormatException, UnknownHostException, MongoException
  69. {
  70. // Initialization (regexes and stuff)
  71. this.initialize();
  72. // Launch MongoDB query
  73. BasicDBObject query = userQuery;
  74. if (null == query) {
  75. new BasicDBObject();
  76. }
  77. // Just get the entity list out to save a few CPU cycles
  78. BasicDBObject outFields = new BasicDBObject();
  79. outFields.append(DocumentPojo.entities_, 1);
  80. outFields.append(DocumentPojo.url_, 1); // (help with debugging)
  81. outFields.append(DocumentPojo.title_, 1); // (help with debugging)
  82. DBCursor dbc = null;
  83. if (nLimit > 0) {
  84. dbc = docsDB.find(query, outFields).limit(nLimit).skip(nSkip);
  85. }
  86. else { // Everything!
  87. dbc = docsDB.find(query, outFields).skip(nSkip);
  88. }
  89. // Create POJO array of documents (definitely not the most efficient, but
  90. // will make integration with the harvester easier)
  91. List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
  92. // Loop over array and invoke the cleansing function for each one
  93. for (DocumentPojo docu: docus) {
  94. if (this.cleansePeopleInDocu(docu)) {
  95. this._nDocusModified++;
  96. if (bAlterDB) {
  97. BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_,
  98. (DBObject)com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities())));
  99. BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0);
  100. // Overwrite the existing entities list with the new one
  101. docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true);
  102. // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it)
  103. }//TESTED: checked on "Feed: Japan's Three Elections / 4c92863751cc2e59d612000b / 30"
  104. }
  105. this._nDocusProcessed++;
  106. }
  107. }
  108. //________________________________________________
  109. // Initialization variables
  110. static final private String _namePrefixes =
  111. "(?:(?:ms|miss|mrs|mr|master|rev|reverand|fr|father|dr|doctor|atty|prof|professor|hon|" +
  112. "pres|president|gov|governor|coach|ofc|supt|rep|representative|sen|senator|amb|ambassador|" +
  113. "pm|p\\.m|prime minister|judge|chief judge|" +
  114. "pvt|private|cpl|corporal|sgt|sargent|seargant|maj|major|cpt|captain|cmdr|command|lt|lieutenant|" +
  115. "lt col|lieutenant colonel|gen|general)\\.?\\s+)?";
  116. static final private String _weakPrefixes = "^\\s*(?:ms|miss|mrs|mr)\\.?\\s+";
  117. private Pattern _weakPrefixPattern = null;
  118. private Pattern _namePattern = null;
  119. private Pattern _nickNamePattern = null;
  120. //private Map<String, Set<String> > _nicknameListHash = new HashMap<String, Set<String> >();
  121. private DBCollection docsDB = null;
  122. //________________________________________________
  123. // Initialization code
  124. // Call with null/null to act on local objects vs fetching them from the DB
  125. public void initialize() throws NumberFormatException, UnknownHostException, MongoException {
  126. // MongoDB
  127. docsDB = MongoDbManager.getDocument().getMetadata();
  128. // Regex
  129. // (prefix) (first-name) [various names or nicknames] (last-name) \(disambiguation\)
  130. _namePattern = Pattern.compile(_namePrefixes +
  131. "(?:([^\\s()]+)\\s+)?" + // first-name, ws (capture)
  132. "((?:(?:(?:\\\"(?:[^\"]+)\\\")|(?:(?:[^\\s()]+)))\\s+)*)?" + // nick-names OR middle-names (capture the whole thing)
  133. "(?:([^\\s()]+[^\\s(),.;:])[,.;:]?\\s*)" + // last-name, ws (capture) - also remove closing punct
  134. "(?:\\((?:[^)]+)\\)?)?\\s*" // disambiguation
  135. );
  136. //TESTED: seen all of these clauses work
  137. // (note this "fails" with jr|jr.|iii etc - needs to be sorted out in the decomp fn below)
  138. _nickNamePattern = Pattern.compile("(?:(?:(?:\\\"([^\"]+)\\\")|(?:([^\\s()]+)))\\s+)");
  139. // (individual components within nick name)
  140. _weakPrefixPattern = Pattern.compile(_weakPrefixes);
  141. }
  142. //________________________________________________
  143. // Utility class used in processing function below
  144. static private class EntityInfo {
  145. EntityPojo entity = null;
  146. String firstName = "";
  147. String lastName = "";
  148. Set<String> middleOrNickNames = new HashSet<String>();
  149. //________________________________________________
  150. // Constructor - where most of the processing logic occurs, using the regexes defined above
  151. // One word case:
  152. EntityInfo(EntityPojo e) {
  153. this.entity = e;
  154. this.firstName = null; // (slightly hacky way of differentiating between one word and weak prefix cases...)
  155. this.lastName = e.getActual_name().toLowerCase();
  156. }
  157. // Complex case:
  158. EntityInfo(EntityPojo e, Matcher m, Pattern p) {
  159. this.entity = e;
  160. int nCount = m.groupCount();
  161. if (nCount > 0) {
  162. this.firstName = m.group(1);
  163. if (null == this.firstName) {
  164. this.firstName = "";
  165. }
  166. if (nCount > 1) {
  167. boolean bNeedNewLastName = false;
  168. this.lastName = m.group(nCount);
  169. if (null == this.lastName) {
  170. this.lastName = "";
  171. }
  172. if (this.lastName.matches("jr.?|[ivx]+.?")) {
  173. bNeedNewLastName = true;
  174. this.lastName = null;
  175. }
  176. LinkedList<String> lNicks = null;
  177. for (int i = 2; i < nCount; ++i) { // Should only be 1: the set of middle names
  178. String middleOrNick = m.group(i);
  179. if (null != middleOrNick) { // Have to decompose further, sigh
  180. lNicks = new LinkedList<String>();
  181. Matcher mN = p.matcher(middleOrNick);
  182. while (mN.find()) {
  183. String sNick = mN.group(1);
  184. if (null != sNick) {
  185. lNicks.add(sNick);
  186. }//TESTED: see below
  187. sNick = mN.group(2);
  188. if (null != sNick) {
  189. lNicks.add(sNick);
  190. }//TESTED: see below
  191. if (bNeedNewLastName) {
  192. this.lastName = sNick;
  193. }//TESTED: "julian clifton lewis jr."
  194. }
  195. }//TESTED: got "g. j. siegle, g. d. stetten", "drs. ghassan k. abou-alfa"
  196. // Also: teodore "ted" kaczynski, rep. anh "joseph" cao/person, spc. jason dean "j.d." hunt/person
  197. if (bNeedNewLastName && (null != this.lastName)) {
  198. lNicks.removeLast();
  199. } ///TESTED: "julian clifton lewis jr.", "charles allen, iii/person"
  200. } // (end loop over (1) un-decomposed set of middle names)
  201. for (String middleOrNick: lNicks) { // Loop over decomposed nicknames
  202. if (middleOrNick.endsWith(".")) { // Remove abbreviations from nickname
  203. middleOrNick = middleOrNick.substring(0, middleOrNick.length() - 1);
  204. if (0 == middleOrNick.length()) {
  205. middleOrNick = null;
  206. }
  207. }//TESTED: with "George W. Bush" etc
  208. if (null != middleOrNick) {
  209. this.middleOrNickNames.add(middleOrNick);
  210. }
  211. }// (end loop over decomposed middle names)
  212. if (null == this.lastName) {
  213. if (!this.firstName.isEmpty()) {
  214. this.lastName = this.firstName;
  215. this.firstName = "";
  216. }
  217. else {
  218. this.lastName = "Junior"; // (not expecting to see this ever)
  219. }
  220. }//TESTED: john chakwin jr./person
  221. if (bNeedNewLastName) {
  222. this.lastName = this.lastName.replaceFirst("[.,;:]+$", "");
  223. if (0 == this.lastName.length()) {
  224. this.lastName = "Junior"; // (not expecting to see this ever)
  225. }
  226. }//TESTED " martin luther king, jr./person", "charles allen, iii/person"
  227. } //(end if several names)
  228. } //(end if any names)
  229. }//TESTED: seen all these clauses work
  230. //________________________________________________
  231. // More utility
  232. private static List<EntityInfo> loadStringInfoMap(String s, Map<String, List<EntityInfo>> m) {
  233. List<EntityInfo> l = m.get(s);
  234. if (null == l) {
  235. l = new LinkedList<EntityInfo>();
  236. }
  237. m.put(s, l);
  238. return l;
  239. }
  240. public void loadEntityInfoIntoMap(Map<String, List<EntityInfo>> m) {
  241. if (!this.firstName.isEmpty()) {
  242. loadStringInfoMap(this.firstName, m).add(this);
  243. }
  244. if (!this.lastName.isEmpty()) {
  245. loadStringInfoMap(this.lastName, m).add(this);
  246. }
  247. for (String s: this.middleOrNickNames) {
  248. loadStringInfoMap(s, m).add(this);
  249. }
  250. }
  251. // Yet more utility:
  252. private boolean contains(String sName) {
  253. // We'll say true if info not interesting:
  254. if (this.firstName.isEmpty() && this.middleOrNickNames.isEmpty()) {
  255. return true;
  256. }
  257. return this.firstName.equals(sName) || this.lastName.equals(sName) ||
  258. this.middleOrNickNames.contains(sName);
  259. }//TESTED
  260. // More utility:
  261. public static void assimilate(EntityPojo changingToEnt, EntityPojo toChangeEnt) {
  262. changingToEnt.setFrequency(changingToEnt.getFrequency() + toChangeEnt.getFrequency());
  263. double dRelToDel = toChangeEnt.getRelevance();
  264. double dRelToInc = changingToEnt.getRelevance();
  265. dRelToInc = dRelToInc + 0.5*(1.0 - dRelToInc)*dRelToDel;
  266. // 0.5* just to dampen the effect
  267. changingToEnt.setRelevance(dRelToInc);
  268. }//TESTED: manually
  269. // Debug/Utility
  270. void print(PrintStream out) {
  271. out.print("Decomposition: " + this.firstName);
  272. for (String sName: this.middleOrNickNames) {
  273. out.print(" / ");
  274. out.print(sName);
  275. }
  276. out.print(" : " + this.middleOrNickNames.size());
  277. out.print(" | ");
  278. out.print(this.lastName);
  279. out.println();
  280. }
  281. };
  282. //________________________________________________
  283. // Inner loop processing logic
  284. // This gets quite involved ... get your code reading boots on...
  285. public boolean cleansePeopleInDocu(DocumentPojo doc) {
  286. boolean bChangedAnything = false;
  287. //Debug
  288. if (_nDebugLevel >= 2) {
  289. System.out.println("+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getEntities().size());
  290. }
  291. List<EntityInfo> oneWordEntities = new LinkedList<EntityInfo>();
  292. List<EntityInfo> decentQualityEntities = new LinkedList<EntityInfo>();
  293. Map<String, List<EntityInfo>> possibleMatches = new HashMap<String, List<EntityInfo>>();
  294. Map<String, List<EntityInfo>> qualityPossibleMatches = new HashMap<String, List<EntityInfo>>();
  295. Map<String, List<EntityInfo>> weakPrefixMatches = new HashMap<String, List<EntityInfo>>();
  296. Map<String, Set<EntityPojo>> possibleWhoMatches = new HashMap<String, Set<EntityPojo>>();
  297. // 1] First time through the array, extract the various components of the disambiguous and actual names
  298. if (null != doc.getEntities()) for (EntityPojo ent: doc.getEntities()) {
  299. // People: decompose names
  300. if (ent.getType().toLowerCase().equals("person")) {
  301. //Debug
  302. if (_nDebugLevel >= 3) {
  303. System.out.println("Entity1: " + ent.getIndex() + " - "+ ent.getActual_name() + " / " + ent.getDisambiguatedName());
  304. }
  305. if (ent.getActual_name().contains(" ")) { // else is definitely a "bad" entity
  306. boolean bNastyWeakPrefixCase = false;
  307. // Look for "Mr Whatever" mapped to something other than "Mr Whatever" (and other weak prefixes)
  308. if (!ent.getActual_name().equals(ent.getDisambiguatedName())) {
  309. String sActName = ent.getActual_name().toLowerCase();
  310. Matcher weakPrefixNameMatcher = _weakPrefixPattern.matcher(sActName);
  311. if (weakPrefixNameMatcher.find()) { // Starts with a weak prefix - problem candidate
  312. // Decompose the rest of the name to clarify
  313. Matcher actualNameMatcher = _namePattern.matcher(ent.getActual_name().toLowerCase());
  314. EntityInfo actualNameMatches = null;
  315. if (actualNameMatcher.matches()) {
  316. actualNameMatches = new EntityInfo(ent, actualNameMatcher, this._nickNamePattern);
  317. }
  318. if (null != actualNameMatches) {
  319. if (actualNameMatches.firstName.isEmpty() && actualNameMatches.middleOrNickNames.isEmpty()) {
  320. bNastyWeakPrefixCase = true;
  321. // Treat like a single word:
  322. oneWordEntities.add(actualNameMatches);
  323. //Debug
  324. if (_nDebugLevel >= 2) {
  325. System.out.println("Entity1.1: " + ent.getIndex() + " - "+ ent.getActual_name() + " / " + ent.getDisambiguatedName());
  326. if (null != actualNameMatches) actualNameMatches.print(System.out);
  327. }
  328. }
  329. }
  330. }//TESTED: "Mr. Gates / Bill Gates", "MR. GIBBS: / MR. GIBBS: " (etc), "Mr. James Snyder / James Snyder, Jr."
  331. //(also "Mr. B / Brandon Miller (lacrosse)", fails to match)
  332. }
  333. // Decompose dis name
  334. Matcher disNameMatcher = _namePattern.matcher(ent.getDisambiguatedName().toLowerCase());
  335. EntityInfo disNameMatches = null;
  336. if (disNameMatcher.matches()) {
  337. disNameMatches = new EntityInfo(ent, disNameMatcher, this._nickNamePattern);
  338. if (!bNastyWeakPrefixCase) {
  339. // Only save dis name for later matching - it's the most reliable
  340. disNameMatches.loadEntityInfoIntoMap(possibleMatches);
  341. // If the entity has a first and last name, then it's a candidate
  342. // for overwriting other entries
  343. if (!disNameMatches.firstName.isEmpty() && !disNameMatches.lastName.isEmpty()) {
  344. disNameMatches.loadEntityInfoIntoMap(qualityPossibleMatches);
  345. }
  346. decentQualityEntities.add(disNameMatches);
  347. } // end if not nasty weak prefix case
  348. else { // Save in a map to fix annoying 1-word case
  349. disNameMatches.loadEntityInfoIntoMap(weakPrefixMatches);
  350. }
  351. }
  352. // Some debug code:
  353. if (_nDebugLevel >= 4) {
  354. if (null != disNameMatches) disNameMatches.print(System.out);
  355. }
  356. }
  357. else {
  358. // Put this somewhere to be analyzed further
  359. oneWordEntities.add(new EntityInfo(ent));
  360. }
  361. }//TESTED: all these clauses
  362. else if (EntityPojo.Dimension.Who == ent.getDimension()) { // Others, see below
  363. // People can get confused with companies, so we'll allow some simple matching to occur
  364. String sWhoName = ent.getDisambiguatedName().toLowerCase();
  365. String sDecomposedWho[] = sWhoName.split("\\s+");
  366. for (String sWho: sDecomposedWho) {
  367. sWho = sWho.replaceFirst("[.;:,]+$", "");
  368. if (sWho.length() >= 3) { // Min allowed length, I think
  369. Set<EntityPojo> le = possibleWhoMatches.get(sWho);
  370. if (null == le) {
  371. le = new HashSet<EntityPojo>();
  372. possibleWhoMatches.put(sWho, le);
  373. }
  374. le.add(ent);
  375. }
  376. }//TESTED: by eye, pretty simple code
  377. //DEBUG
  378. if (_nDebugLevel >= 3) {
  379. System.out.println("Entity2: " + ent.getIndex() + " - "+ ent.getActual_name() + " / " + ent.getDisambiguatedName() + ": " + sDecomposedWho.length);
  380. }
  381. }//TESTED: see above
  382. } // (end first loop over entities)
  383. // 2.1] Loop over all the decent entries - are these possible duplicates?
  384. Map<EntityInfo, EntityInfo> changeDuplicateFromToMap = new HashMap<EntityInfo, EntityInfo>();
  385. for (EntityInfo info: decentQualityEntities) {
  386. if ((null != info.entity.getSemanticLinks()) && !info.entity.getSemanticLinks().isEmpty()) {
  387. continue; // (see below, "this" will always win out)
  388. } // TESTED (naoto kan, PM of Japan)
  389. List<EntityInfo> l1stName = qualityPossibleMatches.get(info.firstName);
  390. List<EntityInfo> lSurName = qualityPossibleMatches.get(info.lastName);
  391. Set<EntityInfo> candidateSet = new HashSet<EntityInfo>();
  392. Set<String> candidateFirstNames = new HashSet<String>();
  393. Set<String> candidateLastNames = new HashSet<String>();
  394. // Add the current entity being investigated:
  395. candidateSet.add(info);
  396. if (!info.firstName.isEmpty()) {
  397. candidateFirstNames.add(info.firstName);
  398. }
  399. candidateLastNames.add(info.lastName);
  400. if (null != l1stName) {
  401. for (EntityInfo possDup: l1stName) {
  402. if (possDup == info) continue;
  403. if (info.lastName.equals(possDup.lastName)
  404. ||
  405. (info.firstName.equals(possDup.firstName)
  406. && (info.contains(possDup.lastName)
  407. || possDup.contains(info.lastName)))
  408. )
  409. {
  410. // A] First name is somewhere in the possDup, and last names match - pretty good...
  411. // B] First names match, they both contain each others last names
  412. candidateSet.add(possDup);
  413. candidateLastNames.add(possDup.lastName);
  414. if (!possDup.firstName.isEmpty() && info.middleOrNickNames.isEmpty()) {
  415. candidateFirstNames.add(possDup.firstName);
  416. }
  417. } //TESTED: [A] "kim cocklin" vs "kim r. cocklin" - no failures seen
  418. // [B] "julian lewis" vs "julan clifton lewis BLAH", lots of others
  419. }
  420. } // (end matching 1st names)
  421. if (null != lSurName) {
  422. for (EntityInfo possDup: lSurName) {
  423. if (possDup == info) continue;
  424. if ((info.firstName.equals(possDup.firstName))
  425. ||
  426. (info.firstName.isEmpty() && info.lastName.equals(possDup.lastName)))
  427. {
  428. // A] Last name is somewhere in the possDup, and first names match - pretty good...
  429. // B] we have no first name and our last names match
  430. candidateSet.add(possDup);
  431. candidateLastNames.add(possDup.lastName);
  432. if (!possDup.firstName.isEmpty() && info.middleOrNickNames.isEmpty()) {
  433. candidateFirstNames.add(possDup.firstName);
  434. }
  435. } //TESTED: seen [A] ("frances k oldham" vs "frances oldham kelsey"), [B] (Dr Lang vs Daniel Lang) - also no false positives
  436. }
  437. } // (end matching last names)
  438. // 2.2] Now check out candidates...
  439. // Rule will be: if there's more than one "first name" or "last name" available
  440. // (Taking into account that a firstname can be last (no other names)
  441. // and a firstname can appear as a middle name (eg unknown prefix)), then do nothing (too much risk of getting things wrong)
  442. // Otherwise pick an entry with linkdata
  443. // Otherwise pick the highest relevance that has both first+last name
  444. if (candidateSet.size() > 1) { // (ie more than just "info")
  445. // Debug info
  446. if (_nDebugLevel >= 2) {
  447. {System.out.println("*** Candidates for " + info.entity.getIndex() + ": " + candidateFirstNames.size() + ", " + candidateLastNames.size());
  448. for (EntityInfo candidate: candidateSet) {
  449. System.out.println("...... Candidate: " + candidate.entity.getIndex());
  450. }}
  451. }
  452. boolean bTooConfusedToContinue = false;
  453. // In both these cases, allow multiple first/last names but only if
  454. // the 1st/last name is a nickname in every non-trivial case *bar one*
  455. // (ie we allow one non-matching name)
  456. if (candidateLastNames.size() > 1) {
  457. int nNonMatchingNames = 0;
  458. for (String sLastName: candidateLastNames) {
  459. for (EntityInfo possDup: candidateSet) {
  460. if (!possDup.contains(sLastName)) {
  461. nNonMatchingNames++;
  462. if (nNonMatchingNames > 1) {
  463. bTooConfusedToContinue = true;
  464. break;
  465. }
  466. }
  467. }
  468. if (bTooConfusedToContinue) {
  469. break;
  470. }
  471. } // (end loop over candidate first names)
  472. }//TESTED: "frances oldham kelsey" vs "frances kathleen oldham", works because
  473. // of only 1 non-matching name; for failure cases see identical code below
  474. //FAILURE: julian lewis/person VS julian clifton lewis jr.
  475. if (candidateFirstNames.size() > 1) {
  476. int nNonMatchingNames = 0;
  477. for (String s1stName: candidateFirstNames) {
  478. for (EntityInfo possDup: candidateSet) {
  479. if (!possDup.contains(s1stName)) {
  480. nNonMatchingNames++;
  481. if (nNonMatchingNames > 1) {
  482. bTooConfusedToContinue = true;
  483. break;
  484. }
  485. }
  486. }
  487. if (bTooConfusedToContinue) {
  488. break;
  489. }
  490. } // (end loop over candidate first names)
  491. } //TESTED: "dr lang" vs "daniel lang" and "david lang"
  492. if (!bTooConfusedToContinue) {
  493. EntityInfo chosenDup = null;
  494. double highestRel = 0.0;
  495. boolean bLinkedEntityFound = false;
  496. for (EntityInfo possDup: candidateSet) {
  497. if (possDup == info) continue; // Don't consider myself until later
  498. if ((null != possDup.entity.getSemanticLinks()) && !possDup.entity.getSemanticLinks().isEmpty()) {
  499. if (!bLinkedEntityFound) highestRel = 0.0;
  500. bLinkedEntityFound = true;
  501. }
  502. else if (bLinkedEntityFound) {
  503. continue; // (not allowed to compare unlinked vs linked)
  504. }
  505. double rel = possDup.entity.getRelevance();
  506. if (rel > highestRel) {
  507. highestRel = rel;
  508. chosenDup = possDup;
  509. }
  510. }//TESTED: seen highest rel in linked and unlinked cases
  511. if (!bLinkedEntityFound && !info.firstName.isEmpty()) {
  512. double rel = info.entity.getRelevance();
  513. if (rel > highestRel) {
  514. // Compare myself vs best unlinked
  515. if (_nDebugLevel >= 2) {
  516. System.out.println("KEEP " + info.entity.getIndex() + " OVER " + chosenDup.entity.getIndex());
  517. }
  518. chosenDup = null;
  519. }
  520. }//TESTED: eg "REPLACE dr. gish/person WITH dr. robert g. gish/person: 0.322918, false"/KEEP dr. robert g. gish/person OVER robert g. gish/person
  521. if (null != chosenDup) { // Need to change the entity...
  522. //(make it recursive to handle a->b, b->c, ie a,b->c (only hop once: so if a->b,b->c,c->a, just do a,b->c)
  523. if (_nDebugLevel >= 1) {
  524. System.out.println("REPLACE " + info.entity.getIndex() + " WITH " + chosenDup.entity.getIndex() + ": " + chosenDup.entity.getRelevance() + ", " + bLinkedEntityFound);
  525. }
  526. changeDuplicateFromToMap.put(info, chosenDup);
  527. }
  528. }//TESTED: (see non-trivial clauses above)
  529. //else System.out.println("Too confused......");
  530. } // end there are duplication candidates
  531. } // end loop over entities I'm checking for duplication
  532. // 2.3] Finally, actually make the duplication changes:
  533. for (Map.Entry<EntityInfo, EntityInfo> changePair: changeDuplicateFromToMap.entrySet()) {
  534. EntityInfo toChange = changePair.getKey();
  535. EntityInfo changingTo = changePair.getValue();
  536. // Handle "1-hop" recursion as discussed above
  537. if (null == (changingTo = changeDuplicateFromToMap.get(changingTo))) {
  538. changingTo = changePair.getValue(); // (change back again)
  539. }
  540. // Make the change:
  541. EntityPojo toChangeEnt = toChange.entity;
  542. EntityPojo changingToEnt = changingTo.entity;
  543. // Preferred option, improve stats of "change to" and then delete "to change"
  544. EntityInfo.assimilate(changingToEnt, toChangeEnt);
  545. toChange.entity = changingTo.entity; // (need to support 1-word replacement below)
  546. doc.getEntities().remove(toChangeEnt);
  547. //TESTED: 1-hop and dedup-and-1-word-replace
  548. // Other option: swap the important fields over - the problem with
  549. // this is that you get multiple entities with the same name
  550. // so we'll not go with that
  551. // toChangeEnt.getGazateer_index() = changingToEnt.getGazateer_index();
  552. // toChangeEnt.getDisambiguous_name() = changingToEnt.getDisambiguous_name();
  553. // toChangeEnt.linkdata = changingToEnt.linkdata;
  554. // Leave the stats alone ... it's all a little bit confusing
  555. this._nDeduplications++;
  556. bChangedAnything = true;
  557. }//TESTED: "HOPPING: miss oldham/person TO frances oldham kelsey/person TO frances oldham kelsey/person"
  558. // 3.1] The easiest case is one-word person entities, whether they've been
  559. // mapped to an actual dis-name or not....
  560. // If the one-word actual name maps to the first name, surname, nickname of a
  561. // "well qualified" entity (ie reasonable quality actual name)
  562. // (and there's only option, ie ignore "chao" vs "albert chao" and "anne chao")
  563. for (EntityInfo entInfo: oneWordEntities) {
  564. EntityPojo ent = entInfo.entity;
  565. List<EntityInfo> l = possibleMatches.get(entInfo.lastName);
  566. if (null != l) {
  567. // Debug:
  568. if (_nDebugLevel >= 2) {
  569. System.out.println("Candidate matches for " + ent.getActual_name() + " / " + ent.getDisambiguatedName() + ": ");
  570. }
  571. String disName = null;
  572. EntityInfo changeTo = null;
  573. boolean bMultipleDisNames = false;
  574. // (If there are multiple dis names, but one of them is mine, then I'm good to go
  575. // else I'm going to delete the one-word entity...)
  576. for (EntityInfo info: l) {
  577. // Debug:
  578. if (_nDebugLevel >= 2) {
  579. System.out.println("\tEntity3: " + info.entity.getIndex() + " - "+ info.entity.getActual_name() + " / " + info.entity.getDisambiguatedName()
  580. + ": " + info.entity.getRelevance() + " / " + info.entity.getFrequency() + " / " + info.entity.getTotalfrequency());
  581. }
  582. if (ent.getDisambiguatedName().equals(info.entity.getDisambiguatedName())) {
  583. // Found my man - dis name match between Alchemy and a candidate
  584. changeTo = info;
  585. bMultipleDisNames = true; // (Make debug printing easier)
  586. break;
  587. } //TESTED: (Couldn't actually find an example of this, but it's simple enough!)
  588. if (null == disName) {
  589. disName = info.entity.getDisambiguatedName();
  590. changeTo = info;
  591. }
  592. else if (!bMultipleDisNames) {
  593. if (!disName.equals(info.entity.getDisambiguatedName())) {
  594. bMultipleDisNames = true;
  595. changeTo = null; // Not going to be able to assign a better version
  596. }
  597. }
  598. } // end loop over 1-word candidates
  599. if (null != changeTo) {
  600. //Debug code
  601. if (_nDebugLevel >= 1) {
  602. System.out.println("1REPLACE/" + bMultipleDisNames + ": "+ ent.getActual_name() + " WITH " + changeTo.entity.getIndex() + " - "+ changeTo.entity.getActual_name() + " / " + changeTo.entity.getDisambiguatedName()
  603. + ": " + changeTo.entity.getRelevance() + " / " + changeTo.entity.getFrequency() + " / " + changeTo.entity.getTotalfrequency());
  604. }
  605. // Preferred option, improve stats of "change to" and then delete "to change"
  606. EntityInfo.assimilate(changeTo.entity, ent);
  607. doc.getEntities().remove(ent);
  608. //TESTED
  609. // Other option: swap the important fields over - the problem with
  610. // this is that you get multiple entities with the same name
  611. // so we'll not go with that
  612. // ent.getGazateer_index() = changeTo.entity.getGazateer_index();
  613. // ent.getDisambiguous_name() = changeTo.entity.getDisambiguous_name();
  614. // ent.linkdata = changeTo.entity.linkdata;
  615. // Leave the stats alone ... it's all a little bit confusing
  616. this._nOneWordAssignments++;
  617. bChangedAnything = true;
  618. }
  619. else if (null == changeTo) { // Expensive, but hopefully don't need to do that often
  620. //Debug
  621. if (_nDebugLevel >= 1) {
  622. System.out.println("DELETE " + ent.getActual_name() + " " + ent.getDisambiguatedName());
  623. }
  624. this._nOneWordDeletions++;
  625. doc.getEntities().remove(ent);
  626. bChangedAnything = true;
  627. }
  628. }//TESTED: various cases, changed and unchanged
  629. else { // No candidate matches, compare against other "who"s
  630. //Debug:
  631. if (_nDebugLevel >= 2) {
  632. System.out.println("No person candidate matches for " + ent.getActual_name());
  633. }
  634. // 3.2] also need to compare against companies ie other "who's" (if no people matches)
  635. String gazIndex = null;
  636. EntityPojo changeTo = null;
  637. Set<EntityPojo> possWhoSet = null;
  638. if (null == entInfo.firstName) { // Won't try to replace weak prefix entities with company names, obviously
  639. String stripActualName = ent.getActual_name().replaceAll("[;;.,]+$", "");
  640. possWhoSet = possibleWhoMatches.get(stripActualName.toLowerCase());
  641. if (null != possWhoSet) {
  642. // Check there's only 1 candidate, else nothing to be done
  643. for (EntityPojo possWho: possWhoSet) {
  644. //Debug
  645. if (_nDebugLevel >= 2) {
  646. System.out.println("Candidate company match: " + possWho.getIndex());
  647. }
  648. if (null == gazIndex) {
  649. gazIndex = possWho.getIndex();
  650. changeTo = possWho;
  651. }
  652. else if (!gazIndex.equals(possWho.getIndex())) {
  653. gazIndex = null;
  654. break;
  655. }
  656. }
  657. }//TESTED: "gala/person" vs "buckyball discovery gala/organization"
  658. //TESTED: multiple companies case: "Sinclair" vs "Sinclair Technologies Inc" and "Sinclair Holdings Inc"
  659. } //(end if a one-word vs weak prefix)
  660. if (null != gazIndex) { // Convert
  661. //Debug code
  662. if (_nDebugLevel >= 1) {
  663. System.out.println("COREPLACE " + ent.getActual_name() + " WITH "+ changeTo.getActual_name() + " / " + changeTo.getDisambiguatedName()
  664. + ": " + changeTo.getRelevance() + " / " + changeTo.getFrequency() + " / " + changeTo.getTotalfrequency());
  665. }
  666. // Preferred option, improve stats of "change to" and then delete "to change"
  667. EntityInfo.assimilate(changeTo, ent);
  668. doc.getEntities().remove(ent);
  669. //TESTED
  670. // Other option: swap the important fields over - the problem with
  671. // this is that you get multiple entities with the same name
  672. // so we'll not go with that
  673. // ent.getGazateer_index() = changeTo.getGazateer_index();
  674. // ent.getDisambiguous_name() = changeTo.getDisambiguous_name();
  675. // ent.type = changeTo.type;
  676. // ent.linkdata = changeTo.linkdata;
  677. // Leave the stats alone ... it's all a little bit confusing
  678. this._nOneWordAssignments++;
  679. this._nOneWordConversions++;
  680. bChangedAnything = true;
  681. }//TESTED: cut and paste of code from above, with type added
  682. else { // No candidate names or companies... (or too many companies)
  683. // Either stick with Alchemy's suggestion, if there is one
  684. // Or delete (always do this if many candidate companies)
  685. boolean bDelete = true;
  686. if ((null == possWhoSet) && !ent.getDisambiguatedName().matches("[^ -]*")) {
  687. bDelete = false;
  688. // Extra bit of logic needed:
  689. List<EntityInfo> lTmp = weakPrefixMatches.get(entInfo.lastName);
  690. // (Do I match a name from the weak prefix set, if so then delete me after all)
  691. if (null != lTmp) {
  692. if (lTmp.size() > 1) {
  693. bDelete = true;
  694. }
  695. else {
  696. EntityInfo tmpEntInfo = lTmp.get(0);
  697. if (tmpEntInfo.entity != entInfo.entity) {
  698. bDelete = true;
  699. }
  700. }
  701. }//TESTED: "Mrs. Obama / Barack Obama" (obv wrong but the logic is right!)
  702. //TESTED Ghodrat-ol-Ein
  703. }
  704. if (bDelete) {
  705. // Multiple possible companies OR no Alchemy suggestion
  706. // Expensive, but hopefully don't need to do that often
  707. this._nOneWordDeletions++;
  708. doc.getEntities().remove(ent);
  709. bChangedAnything = true;
  710. //Debug
  711. if (_nDebugLevel >= 1) {
  712. System.out.println("DELETE " + ent.getActual_name() + " " + ent.getDisambiguatedName());
  713. }
  714. } //TESTED: " Mrs. Clinton / Hillary Rodham Clinton" and "Hillary Edmund Hillary"
  715. else { // Leave Alchemy suggestion, nothing to do
  716. //Debug
  717. if (_nDebugLevel >= 2) {
  718. System.out.println("DON'T DELETE " + ent.getActual_name() + " " + ent.getDisambiguatedName());
  719. }
  720. }
  721. }//TESTED: by eye (eg deleted "Sinclair" in above "multiple companies case", "John" no mappingl didn't delete "Hitler")
  722. }// end if people candidates or other who candidates
  723. //TESTED: see above clauses
  724. } // end loop over 1 word entries
  725. return bChangedAnything;
  726. }
  727. } // end class AlchemyEntityPersonCleaner