PageRenderTime 74ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/enrichment/custom/StructuredAnalysisHarvester.java

https://github.com/IKANOW/Infinit.e
Java | 3617 lines | 2674 code | 380 blank | 563 comment | 935 complexity | bd317aa493c2dd1ee19c53fd373274ea MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.harvest.enrichment.custom;
  17. import java.util.ArrayList;
  18. import java.util.Arrays;
  19. import java.util.Date;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.Hashtable;
  23. import java.util.Iterator;
  24. import java.util.List;
  25. import java.util.Map;
  26. import java.util.Map.Entry;
  27. import java.util.Set;
  28. import java.util.regex.Matcher;
  29. import java.util.regex.Pattern;
  30. import javax.script.*;
  31. import org.apache.log4j.Logger;
  32. import org.bson.types.ObjectId;
  33. import org.json.JSONArray;
  34. import org.json.JSONException;
  35. import org.json.JSONObject;
  36. import com.google.gson.Gson;
  37. import com.google.gson.GsonBuilder;
  38. import com.ikanow.infinit.e.data_model.store.DbManager;
  39. import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
  40. import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.DocumentSpecPojo;
  41. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  42. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo;
  43. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo.GeoSpecPojo;
  44. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo.EntitySpecPojo;
  45. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo.AssociationSpecPojo;
  46. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  47. import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
  48. import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
  49. import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
  50. import com.ikanow.infinit.e.data_model.store.feature.geo.GeoFeaturePojo;
  51. import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
  52. import com.ikanow.infinit.e.harvest.HarvestContext;
  53. import com.ikanow.infinit.e.harvest.HarvestController;
  54. import com.ikanow.infinit.e.harvest.utils.DateUtility;
  55. import com.ikanow.infinit.e.harvest.utils.AssociationUtils;
  56. import com.ikanow.infinit.e.data_model.utils.DimensionUtility;
  57. import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
  58. import com.mongodb.BasicDBList;
  59. import com.mongodb.BasicDBObject;
  60. /**
  61. * StructuredAnalysisHarvester
  62. * @author cvitter
  63. */
  64. public class StructuredAnalysisHarvester
  65. {
  66. ///////////////////////////////////////////////////////////////////////////////////////////
  67. // NEW PROCESSING PIPELINE INTERFACE
  68. public void setContext(HarvestContext context) {
  69. _context = context;
  70. // Setup some globals if necessary
  71. if (null == _gson) {
  72. GsonBuilder gb = new GsonBuilder();
  73. _gson = gb.create();
  74. }
  75. }
  76. public void resetForNewDoc() {
  77. resetEntityCache();
  78. resetDocumentCache();
  79. }
  80. public void resetEntityCache() {
  81. // Clear geoMap before we start extracting entities and associations for each feed
  82. if (null != _entityMap) {
  83. if (!_geoMap.isEmpty()) _geoMap.clear();
  84. if (!_entityMap.isEmpty()) _entityMap.clear();
  85. // Fill in geoMap and entityMap with any existing docs/entities
  86. _entityMap = null;
  87. _geoMap = null;
  88. }
  89. }//TESTED (entity_cache_reset_test)
  90. public void resetDocumentCache() {
  91. this._document = null;
  92. this._docPojo = null;
  93. }
  94. // Load global functions
  95. // (scriptLang currently ignored)
  96. public void loadGlobalFunctions(List<String> imports, List<String> scripts, String scriptLang)
  97. {
  98. intializeScriptEngine();
  99. // Pass scripts into the engine
  100. try {
  101. // Retrieve and eval script files in s.scriptFiles
  102. if (imports != null) {
  103. for (String file : imports) {
  104. if (null != file) {
  105. _securityManager.eval(_scriptEngine, JavaScriptUtils.getJavaScriptFile(file));
  106. }
  107. }
  108. }//(end load imports)
  109. // Eval script passed in s.script
  110. if (null != scripts) {
  111. for (String script: scripts) {
  112. if (null != script) {
  113. _securityManager.eval(_scriptEngine, script);
  114. }
  115. }
  116. }//(end load scripts)
  117. }
  118. catch (ScriptException e) {
  119. this._context.getHarvestStatus().logMessage("ScriptException: " + e.getMessage(), true);
  120. logger.error("ScriptException: " + e.getMessage(), e);
  121. }
  122. }//TESTED (uah:import_and_lookup_test_uahSah.json)
  123. // Set the document level fields
  124. public void setDocumentMetadata(DocumentPojo doc, DocumentSpecPojo docMetadataConfig) throws JSONException, ScriptException {
  125. Gson g = _gson;
  126. intializeDocIfNeeded(doc, g);
  127. //TODO (INF-1938): allow setting of tags (here and in legacy code)
  128. // We'll just basically duplicate the code from executeHarvest() since it's pretty simple
  129. // and it isn't very easy to pull out the logic in there (which is unnecessarily complicated for
  130. // the pipeline version since you don't need to work out whether to generate the fields before or
  131. // after the other stages, you get to explicity specify)
  132. // Extract Title if applicable
  133. try {
  134. if (docMetadataConfig.title != null) {
  135. if (JavaScriptUtils.containsScript(docMetadataConfig.title)) {
  136. doc.setTitle((String)getValueFromScript(docMetadataConfig.title, null, null));
  137. }
  138. else {
  139. doc.setTitle(getFormattedTextFromField(docMetadataConfig.title, null));
  140. }
  141. }
  142. }
  143. catch (Exception e) {
  144. this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
  145. //DEBUG (don't output log messages per doc)
  146. //logger.error("title: " + e.getMessage(), e);
  147. }
  148. //TESTED (fulltext_docMetaTest)
  149. // Extract display URL if applicable
  150. try {
  151. if (docMetadataConfig.displayUrl != null) {
  152. if (JavaScriptUtils.containsScript(docMetadataConfig.displayUrl)) {
  153. doc.setDisplayUrl((String)getValueFromScript(docMetadataConfig.displayUrl, null, null));
  154. }
  155. else {
  156. doc.setDisplayUrl(getFormattedTextFromField(docMetadataConfig.displayUrl, null));
  157. }
  158. }
  159. }
  160. catch (Exception e) {
  161. this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
  162. //DEBUG (don't output log messages per doc)
  163. //logger.error("displayUrl: " + e.getMessage(), e);
  164. }
  165. //TESTED (fulltext_docMetaTest)
  166. // Extract Description if applicable
  167. try {
  168. if (docMetadataConfig.description != null) {
  169. if (JavaScriptUtils.containsScript(docMetadataConfig.description)) {
  170. doc.setDescription((String)getValueFromScript(docMetadataConfig.description, null, null));
  171. }
  172. else {
  173. doc.setDescription(getFormattedTextFromField(docMetadataConfig.description, null));
  174. }
  175. }
  176. }
  177. catch (Exception e) {
  178. this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);
  179. //DEBUG (don't output log messages per doc)
  180. //logger.error("description: " + e.getMessage(), e);
  181. }
  182. //TESTED (fulltext_docMetaTest)
  183. // Extract fullText if applicable
  184. try {
  185. if (docMetadataConfig.fullText != null) {
  186. if (JavaScriptUtils.containsScript(docMetadataConfig.fullText)) {
  187. doc.setFullText((String)getValueFromScript(docMetadataConfig.fullText, null, null));
  188. }
  189. else {
  190. doc.setFullText(getFormattedTextFromField(docMetadataConfig.fullText, null));
  191. }
  192. }
  193. }
  194. catch (Exception e) {
  195. this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);
  196. //DEBUG (don't output log messages per doc)
  197. //logger.error("fullText: " + e.getMessage(), e);
  198. }
  199. //TESTED (fulltext_docMetaTest)
  200. // Extract Published Date if applicable
  201. try {
  202. if (docMetadataConfig.publishedDate != null) {
  203. if (JavaScriptUtils.containsScript(docMetadataConfig.publishedDate)) {
  204. doc.setPublishedDate(new Date(
  205. DateUtility.parseDate((String)getValueFromScript(docMetadataConfig.publishedDate, null, null))));
  206. }
  207. else {
  208. doc.setPublishedDate(new Date(
  209. DateUtility.parseDate((String)getFormattedTextFromField(docMetadataConfig.publishedDate, null))));
  210. }
  211. }
  212. }
  213. catch (Exception e) {
  214. this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
  215. //DEBUG (don't output log messages per doc)
  216. //logger.error("publishedDate: " + e.getMessage(), e);
  217. }
  218. //TESTED (fulltext_docMetaTest)
  219. // Extract Document GEO if applicable
  220. try {
  221. if (docMetadataConfig.geotag != null) {
  222. doc.setDocGeo(getDocGeo(docMetadataConfig.geotag));
  223. }
  224. }
  225. catch (Exception e) {
  226. this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);
  227. //DEBUG (don't output log messages per doc)
  228. //logger.error("docGeo: " + e.getMessage(), e);
  229. }
  230. //TESTED (fulltext_docMetaTest)
  231. }
  232. //TESTED (fulltext_docMetaTest)
  233. // Set the entities
  234. StructuredAnalysisConfigPojo _pipelineTmpConfig = null;
  235. public void setEntities(DocumentPojo doc, List<EntitySpecPojo> entSpecs) throws JSONException, ScriptException {
  236. intializeDocIfNeeded(doc, _gson);
  237. if (null == _pipelineTmpConfig) {
  238. _pipelineTmpConfig = new StructuredAnalysisConfigPojo();
  239. }
  240. _pipelineTmpConfig.setEntities(entSpecs);
  241. expandIterationLoops(_pipelineTmpConfig);
  242. List<EntityPojo> ents = getEntities(_pipelineTmpConfig.getEntities(), doc);
  243. if (null == doc.getEntities()) { // (else has already been added by getEntities)
  244. doc.setEntities(ents);
  245. }
  246. }
  247. //TESTED (both first time through, and when adding to existing entities)
  248. // Set the associations
  249. public void setAssociations(DocumentPojo doc, List<AssociationSpecPojo> assocSpecs) throws JSONException, ScriptException {
  250. //TODO (INF-1922): Allow setting of directed sentiment (here and in legacy code)
  251. intializeDocIfNeeded(doc, _gson);
  252. if (null == _pipelineTmpConfig) {
  253. _pipelineTmpConfig = new StructuredAnalysisConfigPojo();
  254. }
  255. _pipelineTmpConfig.setAssociations(assocSpecs);
  256. expandIterationLoops(_pipelineTmpConfig);
  257. List<AssociationPojo> assocs = getAssociations(_pipelineTmpConfig.getAssociations(), doc);
  258. if (null == doc.getAssociations()) { // (else has already been added by getAssociations)
  259. doc.setAssociations(assocs);
  260. }
  261. }
  262. //TESTED (both first time through, and when adding to existing associations)
  263. ///////////////////////////////////////////////////////////////////////////////////////////
  264. // (Utility function for optimization)
  265. private void intializeDocIfNeeded(DocumentPojo f, Gson g) throws JSONException, ScriptException {
  266. if (null == _document) {
  267. // (don't need assocs or ents)
  268. List<EntityPojo> ents = f.getEntities();
  269. List<AssociationPojo> assocs = f.getAssociations();
  270. f.setEntities(null);
  271. f.setAssociations(null);
  272. try {
  273. // Convert the DocumentPojo Object to a JSON document using GsonBuilder
  274. String docStr = g.toJson(f);
  275. _document = new JSONObject(docStr);
  276. _docPojo = f;
  277. // Add the document (JSONObject) to the engine
  278. if (null != _scriptEngine) {
  279. _scriptEngine.put("document", docStr);
  280. _securityManager.eval(_scriptEngine, JavaScriptUtils.initScript);
  281. }
  282. }
  283. finally {
  284. f.setEntities(ents);
  285. f.setAssociations(assocs);
  286. }
  287. }
  288. }//TESTED
  289. ///////////////////////////////////////////////////////////////////////////////////////////
  290. // Loads the caches into script
  291. public void loadLookupCaches(Map<String, ObjectId> caches, Set<ObjectId> communityIds) {
  292. //grab any json cache and make it available to the engine
  293. try
  294. {
  295. if (null != caches) {
  296. CacheUtils.addJSONCachesToEngine(caches, _scriptEngine, _securityManager, communityIds, _context);
  297. }
  298. }
  299. catch (Exception ex)
  300. {
  301. _context.getHarvestStatus().logMessage("JSONcache: " + ex.getMessage(), true);
  302. //(no need to log this, appears in log under source -with URL- anyway):
  303. //logger.error("JSONcache: " + ex.getMessage(), ex);
  304. }
  305. }//TESTED (import_and_lookup_test_uahSah.json)
  306. ///////////////////////////////////////////////////////////////////////////////////////////
  307. // Tidy up metadata after processing
  308. public void removeUnwantedMetadataFields(String metaFields, DocumentPojo f)
  309. {
  310. if (null != f.getMetadata()) {
  311. if (null != metaFields) {
  312. boolean bInclude = true;
  313. if (metaFields.startsWith("+")) {
  314. metaFields = metaFields.substring(1);
  315. }
  316. else if (metaFields.startsWith("-")) {
  317. metaFields = metaFields.substring(1);
  318. bInclude = false;
  319. }
  320. String[] metaFieldArray = metaFields.split("\\s*,\\s*");
  321. if (bInclude) {
  322. Set<String> metaFieldSet = new HashSet<String>();
  323. metaFieldSet.addAll(Arrays.asList(metaFieldArray));
  324. Iterator<Entry<String, Object[]>> metaField = f.getMetadata().entrySet().iterator();
  325. while (metaField.hasNext()) {
  326. Entry<String, Object[]> metaFieldIt = metaField.next();
  327. if (!metaFieldSet.contains(metaFieldIt.getKey())) {
  328. metaField.remove();
  329. }
  330. }
  331. }
  332. else { // exclude case, easier
  333. for (String metaField: metaFieldArray) {
  334. if (!metaField.contains(".")) {
  335. f.getMetadata().remove(metaField);
  336. }
  337. else { // more complex case, nested delete
  338. MongoDbUtil.recursiveNestedMapDelete(metaField.split("\\s*\\.\\s*"), 0, f.getMetadata());
  339. }
  340. }//(end loop over metaFields)
  341. }//(end if exclude case)
  342. //TESTED: include (default + explicit) and exclude cases
  343. }
  344. }//(if metadata exists)
  345. }//TESTED (legacy code)
  346. public boolean rejectDoc(String rejectDocCriteria, DocumentPojo f) throws JSONException, ScriptException
  347. {
  348. return rejectDoc(rejectDocCriteria, f, true);
  349. }
  350. public boolean rejectDoc(String rejectDocCriteria, DocumentPojo f, boolean logMessage) throws JSONException, ScriptException
  351. {
  352. if (null != rejectDocCriteria) {
  353. intializeDocIfNeeded(f, _gson);
  354. Object o = getValueFromScript(rejectDocCriteria, null, null, false);
  355. if (null != o) {
  356. if (o instanceof String) {
  357. String rejectDoc = (String)o;
  358. if (null != rejectDoc) {
  359. if (logMessage) {
  360. this._context.getHarvestStatus().logMessage("SAH_reject: " + rejectDoc, true);
  361. }
  362. return true;
  363. }
  364. }
  365. else if (o instanceof Boolean) {
  366. Boolean rejectDoc = (Boolean)o;
  367. if (rejectDoc) {
  368. if (logMessage) {
  369. this._context.getHarvestStatus().logMessage("SAH_reject: reason not specified", true);
  370. }
  371. return true;
  372. }
  373. }
  374. else {
  375. if (logMessage) {
  376. this._context.getHarvestStatus().logMessage("SAH_reject: reason not specified", true);
  377. }
  378. return true;
  379. }
  380. }
  381. }
  382. return false;
  383. }//TESTED (storageSettings_test + legacy code)
  384. public void handleDocumentUpdates(String onUpdateScript, DocumentPojo f) throws JSONException, ScriptException
  385. {
  386. // Compare the new and old docs in the case when this doc is an update
  387. if ((null != onUpdateScript) && (null != f.getUpdateId())) {
  388. // (note we must be in integrated mode - not called from source/test - if f.getId() != null)
  389. intializeDocIfNeeded(f, _gson);
  390. BasicDBObject query1 = new BasicDBObject(DocumentPojo._id_, f.getUpdateId());
  391. BasicDBObject query2 = new BasicDBObject(DocumentPojo.updateId_, f.getUpdateId());
  392. BasicDBObject query = new BasicDBObject(DbManager.or_, Arrays.asList(query1, query2));
  393. BasicDBObject docObj = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query);
  394. if (null != docObj) {
  395. if (null == PARSING_SCRIPT) { // First time through, initialize parsing script
  396. // (to convert native JS return vals into something we can write into our metadata)
  397. PARSING_SCRIPT = JavaScriptUtils.generateParsingScript();
  398. }
  399. if (!_isParsingScriptInitialized) {
  400. _securityManager.eval(_scriptEngine, PARSING_SCRIPT);
  401. _isParsingScriptInitialized = true;
  402. }
  403. DocumentPojo doc = DocumentPojo.fromDb(docObj, DocumentPojo.class);
  404. _scriptEngine.put("old_document", _gson.toJson(doc));
  405. try {
  406. _securityManager.eval(_scriptEngine,JavaScriptUtils.initOnUpdateScript);
  407. Object returnVal = _securityManager.eval(_scriptEngine, onUpdateScript);
  408. BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, _scriptEngine);
  409. f.addToMetadata("_PERSISTENT_", outList.toArray());
  410. }
  411. catch (Exception e) {
  412. // Extra step here...
  413. if (null != doc.getMetadata()) { // Copy persistent metadata across...
  414. Object[] persist = doc.getMetadata().get("_PERSISTENT_");
  415. if (null != persist) {
  416. f.addToMetadata("_PERSISTENT_", persist);
  417. }
  418. this._context.getHarvestStatus().logMessage("SAH::onUpdateScript: " + e.getMessage(), true);
  419. //DEBUG (don't output log messages per doc)
  420. //logger.error("SAH::onUpdateScript: " + e.getMessage(), e);
  421. }
  422. //(TESTED)
  423. }
  424. //TODO (INF-1507): need to write more efficient code to deserialize metadata?
  425. }
  426. _document = null;
  427. _docPojo = null;
  428. intializeDocIfNeeded(f, _gson);
  429. }//TESTED (end if callback-on-update)
  430. }//TESTED (legacy code)
  431. ///////////////////////////////////////////////////////////////////////////////////////////
  432. // PROCESSING PIPELINE - UTILITIES
  433. // Initialize script engine - currently only JavaScript is supported
  434. public void intializeScriptEngine()
  435. {
  436. if (null == _scriptEngine) {
  437. //set up the security manager
  438. _securityManager = new JavascriptSecurityManager();
  439. _scriptFactory = new ScriptEngineManager();
  440. _scriptEngine = _scriptFactory.getEngineByName("JavaScript");
  441. if (null != _unstructuredHandler) { // Also initialize the scripting engine for the UAH
  442. _unstructuredHandler.set_sahEngine(_scriptEngine);
  443. _unstructuredHandler.set_sahSecurity(_securityManager);
  444. }
  445. // Make the engine invocable so that we can call functions in the script
  446. // using the inv.invokeFunction(function) method
  447. _scriptInvoker = (Invocable) _scriptEngine;
  448. }//(once only)
  449. }//TESTED
  450. ///////////////////////////////////////////////////////////////////////////////////////////
  451. ///////////////////////////////////////////////////////////////////////////////////////////
  452. ///////////////////////////////////////////////////////////////////////////////////////////
  453. // LEGACY CODE - USE TO SUPPORT OLD CODE FOR NOW + AS UTILITY CODE FOR THE PIPELINE LOGIC
  454. // Private class variables
  455. private static Logger logger;
  456. private JSONObject _document = null; //TODO (INF-2488): change all the JSONObject logic to LinkedHashMap and (generic) Array so can just replace this with a string...
  457. private DocumentPojo _docPojo = null;
  458. private Gson _gson = null;
  459. private JSONObject _iterator = null;
  460. private String _iteratorIndex = null;
  461. private static Pattern SUBSTITUTION_PATTERN = Pattern.compile("\\$([a-zA-Z._0-9]+)|\\$\\{([^}]+)\\}");
  462. private HashMap<String, GeoPojo> _geoMap = null;
  463. private HashSet<String> _entityMap = null;
  464. private HarvestContext _context;
  465. /**
  466. * Default Constructor
  467. */
  468. public StructuredAnalysisHarvester()
  469. {
  470. logger = Logger.getLogger(StructuredAnalysisHarvester.class);
  471. }
  472. // Allows the unstructured handler to take advantage of text created by this
  473. public void addUnstructuredHandler(UnstructuredAnalysisHarvester uap) {
  474. _unstructuredHandler = uap;
  475. }
  476. private UnstructuredAnalysisHarvester _unstructuredHandler = null;
  477. //
  478. private ScriptEngineManager _scriptFactory = null;
  479. private ScriptEngine _scriptEngine = null;
  480. private JavascriptSecurityManager _securityManager = null;
  481. private Invocable _scriptInvoker = null;
  482. private static String PARSING_SCRIPT = null;
  483. private boolean _isParsingScriptInitialized = false; // (needs to be done once per source)
  484. /**
  485. * executeHarvest(SourcePojo source, List<DocumentPojo> feeds) extracts document GEO, Entities,
  486. * and Associations based on the DocGeoSpec, EntitySpec, and AssociationSpec information contained
  487. * within the source document's StructuredAnalysis sections
  488. * @param source
  489. * @param docs
  490. * @return List<DocumentPojo>
  491. * @throws ScriptException
  492. */
  493. public List<DocumentPojo> executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> docs)
  494. {
  495. _context = contextController;
  496. if (null == _gson) {
  497. GsonBuilder gb = new GsonBuilder();
  498. _gson = gb.create();
  499. }
  500. Gson g = _gson;
  501. // Skip if the StructuredAnalysis object of the source is null
  502. if (source.getStructuredAnalysisConfig() != null)
  503. {
  504. StructuredAnalysisConfigPojo s = source.getStructuredAnalysisConfig();
  505. // (some pre-processing to expand the specs)
  506. expandIterationLoops(s);
  507. // Instantiate a new ScriptEngineManager and create an engine to execute
  508. // the type of script specified in StructuredAnalysisPojo.scriptEngine
  509. this.intializeScriptEngine();
  510. this.loadLookupCaches(s.getCaches(), source.getCommunityIds());
  511. // Iterate over each doc in docs, create entity and association pojo objects
  512. // to add to the feed using the source entity and association spec pojos
  513. Iterator<DocumentPojo> it = docs.iterator();
  514. int nDocs = 0;
  515. while (it.hasNext())
  516. {
  517. DocumentPojo f = it.next();
  518. nDocs++;
  519. try
  520. {
  521. resetEntityCache();
  522. _document = null;
  523. _docPojo = null;
  524. // (don't create this until needed, since it might need to be (re)serialized after a call
  525. // to the UAH which would obviously be undesirable)
  526. // If the script engine has been instantiated pass the feed document and any scripts
  527. if (_scriptEngine != null)
  528. {
  529. List<String> scriptList = null;
  530. List<String> scriptFileList = null;
  531. try {
  532. // Script code embedded in source
  533. scriptList = Arrays.asList(s.getScript());
  534. }
  535. catch (Exception e) {}
  536. try {
  537. // scriptFiles - can contain String[] of script files to import into the engine
  538. scriptFileList = Arrays.asList(s.getScriptFiles());
  539. }
  540. catch (Exception e) {}
  541. this.loadGlobalFunctions(scriptFileList, scriptList, s.getScriptEngine());
  542. }//TESTED
  543. // 1. Document level fields
  544. // Extract Title if applicable
  545. boolean bTryTitleLater = false;
  546. try {
  547. if (s.getTitle() != null)
  548. {
  549. intializeDocIfNeeded(f, g);
  550. if (JavaScriptUtils.containsScript(s.getTitle()))
  551. {
  552. f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
  553. }
  554. else
  555. {
  556. f.setTitle(getFormattedTextFromField(s.getTitle(), null));
  557. }
  558. if (null == f.getTitle()) {
  559. bTryTitleLater = true;
  560. }
  561. }
  562. }
  563. catch (Exception e)
  564. {
  565. this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
  566. //DEBUG (don't output log messages per doc)
  567. //logger.error("title: " + e.getMessage(), e);
  568. }
  569. // Extract Display URL if applicable
  570. boolean bTryDisplayUrlLater = false;
  571. try {
  572. if (s.getDisplayUrl() != null)
  573. {
  574. intializeDocIfNeeded(f, g);
  575. if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
  576. {
  577. f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
  578. }
  579. else
  580. {
  581. f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
  582. }
  583. if (null == f.getDisplayUrl()) {
  584. bTryDisplayUrlLater = true;
  585. }
  586. }
  587. }
  588. catch (Exception e)
  589. {
  590. this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
  591. //DEBUG (don't output log messages per doc)
  592. //logger.error("displayUrl: " + e.getMessage(), e);
  593. }
  594. //TOTEST
  595. // Extract Description if applicable
  596. boolean bTryDescriptionLater = false;
  597. try {
  598. if (s.getDescription() != null)
  599. {
  600. intializeDocIfNeeded(f, g);
  601. if (JavaScriptUtils.containsScript(s.getDescription()))
  602. {
  603. f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
  604. }
  605. else
  606. {
  607. f.setDescription(getFormattedTextFromField(s.getDescription(), null));
  608. }
  609. if (null == f.getDescription()) {
  610. bTryDescriptionLater = true;
  611. }
  612. }
  613. }
  614. catch (Exception e)
  615. {
  616. this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);
  617. //DEBUG (don't output log messages per doc)
  618. //logger.error("description: " + e.getMessage(), e);
  619. }
  620. // Extract fullText if applicable
  621. boolean bTryFullTextLater = false;
  622. try {
  623. if (s.getFullText() != null)
  624. {
  625. intializeDocIfNeeded(f, g);
  626. if (JavaScriptUtils.containsScript(s.getFullText()))
  627. {
  628. f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
  629. }
  630. else
  631. {
  632. f.setFullText(getFormattedTextFromField(s.getFullText(), null));
  633. }
  634. if (null == f.getFullText()) {
  635. bTryFullTextLater = true;
  636. }
  637. }
  638. }
  639. catch (Exception e)
  640. {
  641. this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);
  642. //DEBUG (don't output log messages per doc)
  643. //logger.error("fullText: " + e.getMessage(), e);
  644. }
  645. // Published date is done after the UAH
  646. // (since the UAH can't access it, and it might be populated via the UAH)
  647. // 2. UAH/extraction properties
  648. // Add fields to metadata that can be used to create entities and associations
  649. // (Either with the UAH, or with the entity extractor)
  650. try {
  651. boolean bMetadataChanged = false;
  652. if (null != this._unstructuredHandler)
  653. {
  654. try
  655. {
  656. this._unstructuredHandler.set_sahEngine(_scriptEngine);
  657. this._unstructuredHandler.set_sahSecurity(_securityManager);
  658. bMetadataChanged = this._unstructuredHandler.executeHarvest(_context, source, f, (1 == nDocs), it.hasNext());
  659. }
  660. catch (Exception e) {
  661. contextController.handleExtractError(e, source); //handle extractor error if need be
  662. it.remove(); // remove the document from the list...
  663. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  664. // (Note: this can't be source level error, so carry on harvesting - unlike below)
  665. continue;
  666. }
  667. }
  668. if (contextController.isEntityExtractionRequired(source))
  669. {
  670. bMetadataChanged = true;
  671. // Text/Entity Extraction
  672. List<DocumentPojo> toAdd = new ArrayList<DocumentPojo>(1);
  673. toAdd.add(f);
  674. try {
  675. contextController.extractTextAndEntities(toAdd, source, false, false);
  676. if (toAdd.isEmpty()) { // this failed...
  677. it.remove(); // remove the document from the list...
  678. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  679. continue;
  680. }//TESTED
  681. }
  682. catch (Exception e) {
  683. contextController.handleExtractError(e, source); //handle extractor error if need be
  684. it.remove(); // remove the document from the list...
  685. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  686. if (source.isHarvestBadSource())
  687. {
  688. // Source error, ignore all other documents
  689. while (it.hasNext()) {
  690. f = it.next();
  691. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  692. it.remove();
  693. }
  694. break;
  695. }
  696. else {
  697. continue;
  698. }
  699. //TESTED
  700. }
  701. }
  702. if (bMetadataChanged) {
  703. // Ugly, but need to re-create doc json because metadata has changed
  704. String sTmpFullText = f.getFullText();
  705. f.setFullText(null); // (no need to serialize this, can save some cycles)
  706. _document = null;
  707. _docPojo = null;
  708. intializeDocIfNeeded(f, g);
  709. f.setFullText(sTmpFullText); //(restore)
  710. }
  711. // Can copy metadata from old documents to new ones:
  712. handleDocumentUpdates(s.getOnUpdateScript(), f);
  713. // Check (based on the metadata and entities so far) whether to retain the doc
  714. if (rejectDoc(s.getRejectDocCriteria(), f)) {
  715. it.remove(); // remove the document from the list...
  716. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  717. continue;
  718. }
  719. }
  720. catch (Exception e) {
  721. this._context.getHarvestStatus().logMessage("SAH->UAH: " + e.getMessage(), true);
  722. //DEBUG (don't output log messages per doc)
  723. //logger.error("SAH->UAH: " + e.getMessage(), e);
  724. }
  725. // Now create document since there's no risk of having to re-serialize
  726. intializeDocIfNeeded(f, g);
  727. // 3. final doc-level metadata fields:
  728. // If description was null before might need to get it from a UAH field
  729. if (bTryTitleLater) {
  730. try {
  731. if (s.getTitle() != null)
  732. {
  733. intializeDocIfNeeded(f, g);
  734. if (JavaScriptUtils.containsScript(s.getTitle()))
  735. {
  736. f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
  737. }
  738. else
  739. {
  740. f.setTitle(getFormattedTextFromField(s.getTitle(), null));
  741. }
  742. }
  743. }
  744. catch (Exception e)
  745. {
  746. this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
  747. //DEBUG (don't output log messages per doc)
  748. //logger.error("title: " + e.getMessage(), e);
  749. }
  750. }
  751. // Extract Display URL if needed
  752. if (bTryDisplayUrlLater) {
  753. try {
  754. if (s.getDisplayUrl() != null)
  755. {
  756. intializeDocIfNeeded(f, g);
  757. if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
  758. {
  759. f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
  760. }
  761. else
  762. {
  763. f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
  764. }
  765. }
  766. }
  767. catch (Exception e)
  768. {
  769. this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
  770. //DEBUG (don't output log messages per doc)
  771. //logger.error("displayUrl: " + e.getMessage(), e);
  772. }
  773. }
  774. //TOTEST
  775. // If description was null before might need to get it from a UAH field
  776. if (bTryDescriptionLater) {
  777. try {
  778. if (s.getDescription() != null)
  779. {
  780. intializeDocIfNeeded(f, g);
  781. if (JavaScriptUtils.containsScript(s.getDescription()))
  782. {
  783. f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
  784. }
  785. else
  786. {
  787. f.setDescription(getFormattedTextFromField(s.getDescription(), null));
  788. }
  789. }
  790. }
  791. catch (Exception e)
  792. {
  793. this._context.getHarvestStatus().logMessage("description2: " + e.getMessage(), true);
  794. //DEBUG (don't output log messages per doc)
  795. //logger.error("description2: " + e.getMessage(), e);
  796. }
  797. }
  798. // If fullText was null before might need to get it from a UAH field
  799. if (bTryFullTextLater) {
  800. try {
  801. if (s.getFullText() != null)
  802. {
  803. intializeDocIfNeeded(f, g);
  804. if (JavaScriptUtils.containsScript(s.getFullText()))
  805. {
  806. f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
  807. }
  808. else
  809. {
  810. f.setFullText(getFormattedTextFromField(s.getFullText(), null));
  811. }
  812. }
  813. }
  814. catch (Exception e)
  815. {
  816. this._context.getHarvestStatus().logMessage("fullText2: " + e.getMessage(), true);
  817. //DEBUG (don't output log messages per doc)
  818. //logger.error("fullText2: " + e.getMessage(), e);
  819. }
  820. }
  821. // Extract Published Date if applicable
  822. if (s.getPublishedDate() != null)
  823. {
  824. if (JavaScriptUtils.containsScript(s.getPublishedDate()))
  825. {
  826. try
  827. {
  828. f.setPublishedDate(new Date(
  829. DateUtility.parseDate((String)getValueFromScript(s.getPublishedDate(), null, null))));
  830. }
  831. catch (Exception e)
  832. {
  833. this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
  834. }
  835. }
  836. else
  837. {
  838. try
  839. {
  840. f.setPublishedDate(new Date(
  841. DateUtility.parseDate((String)getFormattedTextFromField(s.getPublishedDate(), null))));
  842. }
  843. catch (Exception e)
  844. {
  845. this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
  846. }
  847. }
  848. }
  849. // 4. Entity level fields
  850. // Extract Document GEO if applicable
  851. if (s.getDocumentGeo() != null)
  852. {
  853. try
  854. {
  855. f.setDocGeo(getDocGeo(s.getDocumentGeo()));
  856. }
  857. catch (Exception e)
  858. {
  859. this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);
  860. }
  861. }
  862. // Extract Entities
  863. if (s.getEntities() != null)
  864. {
  865. f.setEntities(getEntities(s.getEntities(), f));
  866. }
  867. // Extract Associations
  868. if (s.getAssociations() != null)
  869. {
  870. f.setAssociations(getAssociations(s.getAssociations(), f));
  871. }
  872. // 5. Remove unwanted metadata fields
  873. removeUnwantedMetadataFields(s.getMetadataFields(), f);
  874. }
  875. catch (Exception e)
  876. {
  877. this._context.getHarvestStatus().logMessage("Unknown: " + e.getMessage(), true);
  878. //DEBUG (don't output log messages per doc)
  879. //logger.error("Unknown: " + e.getMessage(), e);
  880. }
  881. finally
  882. {
  883. _document = null;
  884. _docPojo = null;
  885. }
  886. } // (end loop over documents)
  887. } // (end if SAH specified)
  888. return docs;
  889. }
  890. /**
  891. * getEntities(EntitySpecPojo e, DocumentPojo f)
  892. *
  893. * @param e
  894. * @param f
  895. * @return List<EntityPojo>
  896. * @throws JSONException
  897. */
  898. private List<EntityPojo> getEntities(List<EntitySpecPojo> esps, DocumentPojo f) throws JSONException
  899. {
  900. //TODO (INF-1922): should I always create in a new list and then add on? because of the entity map below...
  901. // If the feed already has entities we want to add the new entities to the list of existing entities
  902. List<EntityPojo> entities = null;
  903. if (f.getEntities() != null)
  904. {
  905. entities = f.getEntities();
  906. }
  907. // Otherwise we create a new arraylist to hold the new entities we are adding
  908. else
  909. {
  910. entities = new ArrayList<EntityPojo>();
  911. }
  912. repopulateEntityCacheIfNeeded(f);
  913. // Iterate over each EntitySpecPojo and try to create an entity, or entities, from the data
  914. JSONObject metadata = null;
  915. if (_document.has("metadata")) {
  916. metadata = _document.getJSONObject("metadata");
  917. }
  918. for (EntitySpecPojo esp : esps)
  919. {
  920. try {
  921. List<EntityPojo> tempEntities = getEntities(esp, f, metadata);
  922. for (EntityPojo e : tempEntities)
  923. {
  924. entities.add(e);
  925. }
  926. }
  927. catch (Exception e) {} // (carry on, prob just a missing field in this doc)
  928. }
  929. return entities;
  930. }
  931. /**
  932. * getEntities
  933. * @param esp
  934. * @param f
  935. * @return
  936. */
  937. private List<EntityPojo> getEntities(EntitySpecPojo esp, DocumentPojo f, JSONObject currObj)
  938. {
  939. List<EntityPojo> entities = new ArrayList<EntityPojo>();
  940. // Does the entity contain a list of entities to iterate over -
  941. if (esp.getIterateOver() != null)
  942. {
  943. try
  944. {
  945. String iterateOver = esp.getIterateOver();
  946. // Check to see if the arrayRoot specified exists in the current doc before proceeding
  947. Object itEl = null;
  948. try {
  949. itEl = currObj.get(iterateOver);
  950. }
  951. catch (JSONException e) {} // carry on, trapped below...
  952. if (null == itEl) {
  953. return entities;
  954. }
  955. JSONArray entityRecords = null;
  956. try {
  957. entityRecords = currObj.getJSONArray(iterateOver);
  958. }
  959. catch (JSONException e) {} // carry on, trapped below...
  960. if (null == entityRecords) {
  961. entityRecords = new JSONArray();
  962. entityRecords.put(itEl);
  963. }
  964. //TESTED
  965. // Get the type of object contained in EntityRecords[0]
  966. String objType = entityRecords.get(0).getClass().toString();
  967. /*
  968. * EntityRecords is a simple String[] array of entities
  969. */
  970. if (objType.equalsIgnoreCase("class java.lang.String"))
  971. {
  972. // Iterate over array elements and extract entities
  973. for (int i = 0; i < entityRecords.length(); ++i)
  974. {
  975. String field = entityRecords.getString(i);
  976. long nIndex = Long.valueOf(i);
  977. if (null != esp.getType()) { // (else cannot be a valid entity, must just be a list)
  978. EntityPojo entity = getEntity(esp, field, String.valueOf(i), f);
  979. if (entity != null) entities.add(entity);
  980. }
  981. // Does the association break out into multiple associations?
  982. if (esp.getEntities() != null)
  983. {
  984. // Iterate over the associations and call getAssociations recursively
  985. for (EntitySpecPojo subEsp : esp.getEntities())
  986. {
  987. if (null != subEsp.getIterateOver()) {
  988. if (null == subEsp.getCreationCriteriaScript()) {
  989. _context.getHarvestStatus().logMessage(new StringBuffer("In iterator ").
  990. append(esp.getIterateOver()).append(", trying to loop over field '").
  991. append(subEsp.getIterateOver()).append("' in array of primitives.").toString(), true);
  992. }
  993. else {
  994. this.executeEntityAssociationValidation(subEsp.getCreationCriteriaScript(), field, Long.toString(nIndex));
  995. }
  996. // (any creation criteria script indicates user accepts it can be either)
  997. }
  998. if (null != subEsp.getDisambiguated_name()) {
  999. EntityPojo entity = getEntity(subEsp, field, String.valueOf(i), f);
  1000. if (entity != null) entities.add(entity);
  1001. }
  1002. }
  1003. }//TESTED (error case, mixed object)
  1004. }
  1005. }
  1006. /*
  1007. * EntityRecords is a JSONArray
  1008. */
  1009. else if (objType.equalsIgnoreCase("class org.json.JSONObject"))
  1010. {
  1011. // Iterate over array elements and extract entities
  1012. for (int i = 0; i < entityRecords.length(); ++i)
  1013. {
  1014. // Get JSONObject containing entity fields and pass entityElement
  1015. // into the script engine so scripts can access it
  1016. JSONObject savedIterator = null;
  1017. if (_scriptEngine != null)
  1018. {
  1019. _iterator = savedIterator = entityRecords.getJSONObject(i);
  1020. }
  1021. if (null != esp.getType()) { // (else cannot be a valid entity, must just be a list)
  1022. EntityPojo entity = getEntity(esp, null, String.valueOf(i), f);
  1023. if (entity != null) entities.add(entity);
  1024. }
  1025. // Does the entity break out into multiple entities?
  1026. if (esp.getEntities() != null)
  1027. {
  1028. // Iterate over the entities and call getEntities recursively
  1029. for (EntitySpecPojo subEsp : esp.getEntities())
  1030. {
  1031. _iterator = savedIterator; // (reset this)
  1032. List<EntityPojo> subEntities = getEntities(subEsp, f, _iterator);
  1033. for (EntityPojo e : subEntities)
  1034. {
  1035. entities.add(e);
  1036. }
  1037. }
  1038. }
  1039. }
  1040. }
  1041. if (_iterator != currObj) { // (ie at the top level)
  1042. _iterator = null;
  1043. }
  1044. }
  1045. catch (Exception e)
  1046. {
  1047. //e.printStackTrace();
  1048. //System.out.println(e.getMessage());
  1049. //logger.error("Exception: " + e.getMessage());
  1050. }
  1051. }
  1052. // Single entity
  1053. else
  1054. {
  1055. // Does the entity break out into multiple entities?
  1056. if (esp.getEntities() != null)
  1057. {
  1058. // Iterate over the entities and call getEntities recursively
  1059. for (EntitySpecPojo subEsp : esp.getEntities())
  1060. {
  1061. List<EntityPojo> subEntities = getEntities(subEsp, f, currObj);
  1062. for (EntityPojo e : subEntities)
  1063. {
  1064. entities.add(e);
  1065. }
  1066. }
  1067. }
  1068. else
  1069. {
  1070. EntityPojo entity = getEntity(esp, null, null, f);
  1071. if (entity != null) entities.add(entity);
  1072. }
  1073. }
  1074. return entities;
  1075. }
  1076. /**
  1077. * getEntity
  1078. * @param esp
  1079. * @param field
  1080. * @param index
  1081. * @param f
  1082. * @return
  1083. */
  1084. private EntityPojo getEntity(EntitySpecPojo esp, String field, String index, DocumentPojo f)
  1085. {
  1086. // If the EntitySpecPojo or DocumentPojo is null return null
  1087. if ((esp == null) || (f == null)) return null;
  1088. try
  1089. {
  1090. EntityPojo e = new EntityPojo();
  1091. // Parse creation criteria script to determine if the entity should be added
  1092. if (esp.getCreationCriteriaScript() != null && JavaScriptUtils.containsScript(esp.getCreationCriteriaScript()))
  1093. {
  1094. boolean addEntity = executeEntityAssociationValidation(esp.getCreationCriteriaScript(), field, index);
  1095. if (!addEntity) {
  1096. return null;
  1097. }
  1098. }
  1099. // Entity.disambiguous_name
  1100. String disambiguatedName = null;
  1101. if (JavaScriptUtils.containsScript(esp.getDisambiguated_name()))
  1102. {
  1103. disambiguatedName = (String)getValueFromScript(esp.getDisambiguated_name(), field, index);
  1104. }
  1105. else
  1106. {
  1107. if ((_iterator != null) && (esp.getDisambiguated_name().startsWith("$metadata.") || esp.getDisambiguated_name().startsWith("${metadata."))) {
  1108. if (_context.isStandalone()) { // (minor message, while debugging only)
  1109. _context.getHarvestStatus().logMessage("Warning: in disambiguated_name, using global $metadata when iterating", true);
  1110. }
  1111. }
  1112. // Field - passed in via simple string array from getEntities
  1113. if (field != null)
  1114. {
  1115. disambiguatedName = getFormattedTextFromField(esp.getDisambiguated_name(), field);
  1116. }
  1117. else
  1118. {
  1119. disambiguatedName = getFormattedTextFromField(esp.getDisambiguated_name(), field);
  1120. }
  1121. }
  1122. // Only proceed if disambiguousName contains a meaningful value
  1123. if (disambiguatedName != null && disambiguatedName.length() > 0)
  1124. {
  1125. e.setDisambiguatedName(disambiguatedName);
  1126. }
  1127. else // Always log failure to get a dname - to remove this, specify a creationCriteriaScript
  1128. {
  1129. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required disambiguated_name from: ").append(esp.getDisambiguated_name()).toString(), true);
  1130. return null;
  1131. }
  1132. // Entity.frequency (count)
  1133. String freq = "1";
  1134. if (esp.getFrequency() != null)
  1135. {
  1136. if (JavaScriptUtils.containsScript(esp.getFrequency()))
  1137. {
  1138. freq = getValueFromScript(esp.getFrequency(), field, index).toString();
  1139. }
  1140. else
  1141. {
  1142. freq = getFormattedTextFromField(esp.getFrequency(), field);
  1143. }
  1144. // Since we've specified freq, we're going to enforce it
  1145. if (null == freq) { // failed to get it
  1146. if (null == esp.getCreationCriteriaScript()) {
  1147. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required frequency from: ").append(esp.getFrequency()).toString(), true);
  1148. return null;
  1149. }
  1150. }
  1151. }
  1152. // Try converting the freq string value to its numeric (double) representation
  1153. Double frequency = (double) 0;
  1154. try
  1155. {
  1156. frequency = Double.parseDouble(freq);
  1157. }
  1158. catch (Exception e1)
  1159. {
  1160. this._context.getHarvestStatus().logMessage(e1.getMessage(), true);
  1161. return null;
  1162. }
  1163. // Only proceed if frequency > 0
  1164. if (frequency > 0)
  1165. {
  1166. e.setFrequency(frequency.longValue()); // Cast to long from double
  1167. }
  1168. else
  1169. {
  1170. return null;
  1171. }
  1172. // Entity.actual_name
  1173. String actualName = null;
  1174. if (esp.getActual_name() != null)
  1175. {
  1176. if (JavaScriptUtils.containsScript(esp.getActual_name()))
  1177. {
  1178. actualName = (String)getValueFromScript(esp.getActual_name(), field, index);
  1179. }
  1180. else
  1181. {
  1182. if ((_iterator != null) && (esp.getActual_name().startsWith("$metadata.") || esp.getActual_name().startsWith("${metadata."))) {
  1183. if (_context.isStandalone()) { // (minor message, while debugging only)
  1184. _context.getHarvestStatus().logMessage("Warning: in actual_name, using global $metadata when iterating", true);
  1185. }
  1186. }
  1187. actualName = getFormattedTextFromField(esp.getActual_name(), field);
  1188. }
  1189. // Since we've specified actual name, we're going to enforce it (unless otherwise specified)
  1190. if (null == actualName) { // failed to get it
  1191. if (null == esp.getCreationCriteriaScript()) {
  1192. if (_context.isStandalone()) { // (minor message, while debugging only)
  1193. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required actual_name from: ").append(esp.getActual_name()).toString(), true);
  1194. }
  1195. return null;
  1196. }
  1197. }
  1198. }
  1199. // If actualName == null set it equal to disambiguousName
  1200. if (actualName == null) actualName = disambiguatedName;
  1201. e.setActual_name(actualName);
  1202. // Entity.type
  1203. String type = null;
  1204. if (esp.getType() != null)
  1205. {
  1206. if (JavaScriptUtils.containsScript(esp.getType()))
  1207. {
  1208. type = (String)getValueFromScript(esp.getType(), field, index);
  1209. }
  1210. else
  1211. {
  1212. type = getFormattedTextFromField(esp.getType(), field);
  1213. }
  1214. // Since we've specified type, we're going to enforce it (unless otherwise specified)
  1215. if (null == type) { // failed to get it
  1216. if (null == esp.getCreationCriteriaScript()) {
  1217. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required type from: ").append(esp.getType()).toString(), true);
  1218. return null;
  1219. }
  1220. }
  1221. }
  1222. else
  1223. {
  1224. type = "Keyword";
  1225. }
  1226. e.setType(type);
  1227. // Entity.index
  1228. String entityIndex = disambiguatedName + "/" + type;
  1229. e.setIndex(entityIndex.toLowerCase());
  1230. // Now check if we already exist, discard if so:
  1231. if (_entityMap.contains(e.getIndex())) {
  1232. return null;
  1233. }
  1234. // Entity.dimension
  1235. String dimension = null;
  1236. if (esp.getDimension() != null)
  1237. {
  1238. if (JavaScriptUtils.containsScript(esp.getDimension()))
  1239. {
  1240. dimension = (String)getValueFromScript(esp.getDimension(), field, index);
  1241. }
  1242. else
  1243. {
  1244. dimension = getFormattedTextFromField(esp.getDimension(), field);
  1245. }
  1246. // Since we've specified dimension, we're going to enforce it (unless otherwise specified)
  1247. if (null == dimension) { // failed to get it
  1248. if (null == esp.getCreationCriteriaScript()) {
  1249. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required dimension from: ").append(esp.getDimension()).toString(), true);
  1250. return null;
  1251. }
  1252. }
  1253. }
  1254. if (null == dimension) {
  1255. try {
  1256. e.setDimension(DimensionUtility.getDimensionByType(type));
  1257. }
  1258. catch (java.lang.IllegalArgumentException ex) {
  1259. e.setDimension(EntityPojo.Dimension.What);
  1260. }
  1261. }
  1262. else {
  1263. try {
  1264. EntityPojo.Dimension enumDimension = EntityPojo.Dimension.valueOf(dimension);
  1265. if (null == enumDimension) {
  1266. _context.getHarvestStatus().logMessage(new StringBuffer("Invalid dimension: ").append(dimension).toString(), true);
  1267. return null; // (invalid dimension)
  1268. }
  1269. else {
  1270. e.setDimension(enumDimension);
  1271. }
  1272. }
  1273. catch (Exception e2) {
  1274. _context.getHarvestStatus().logMessage(new StringBuffer("Invalid dimension: ").append(dimension).toString(), true);
  1275. return null; // (invalid dimension)
  1276. }
  1277. }
  1278. // Entity.relevance
  1279. String relevance = "0";
  1280. if (esp.getRelevance() != null)
  1281. {
  1282. if (JavaScriptUtils.containsScript(esp.getRelevance()))
  1283. {
  1284. relevance = (String)getValueFromScript(esp.getRelevance(), field, index);
  1285. }
  1286. else
  1287. {
  1288. relevance = getFormattedTextFromField(esp.getRelevance(), field);
  1289. }
  1290. // Since we've specified relevance, we're going to enforce it (unless otherwise specified)
  1291. if (null == relevance) { // failed to get it
  1292. if (null == esp.getCreationCriteriaScript()) {
  1293. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required relevance from: ").append(esp.getRelevance()).toString(), true);
  1294. return null;
  1295. }
  1296. }
  1297. }
  1298. try {
  1299. e.setRelevance(Double.parseDouble(relevance));
  1300. }
  1301. catch (Exception e1) {
  1302. this._context.getHarvestStatus().logMessage(e1.getMessage(), true);
  1303. return null;
  1304. }
  1305. // Entity.sentiment (optional field)
  1306. if (esp.getSentiment() != null)
  1307. {
  1308. String sentiment;
  1309. if (JavaScriptUtils.containsScript(esp.getSentiment()))
  1310. {
  1311. sentiment = (String)getValueFromScript(esp.getSentiment(), field, index);
  1312. }
  1313. else
  1314. {
  1315. sentiment = getFormattedTextFromField(esp.getSentiment(), field);
  1316. }
  1317. // (sentiment is optional, even if specified)
  1318. if (null != sentiment) {
  1319. try {
  1320. double d = Double.parseDouble(sentiment);
  1321. e.setSentiment(d);
  1322. if (null == e.getSentiment()) {
  1323. if (_context.isStandalone()) { // (minor message, while debugging only)
  1324. _context.getHarvestStatus().logMessage(new StringBuffer("Invalid sentiment: ").append(sentiment).toString(), true);
  1325. }
  1326. }
  1327. }
  1328. catch (Exception e1) {
  1329. this._context.getHarvestStatus().logMessage(e1.getMessage(), true);
  1330. return null;
  1331. }
  1332. }
  1333. }
  1334. // Entity Link data:
  1335. if (esp.getLinkdata() != null)
  1336. {
  1337. String linkdata = null;
  1338. if (JavaScriptUtils.containsScript(esp.getLinkdata()))
  1339. {
  1340. linkdata = (String)getValueFromScript(esp.getLinkdata(), field, index);
  1341. }
  1342. else
  1343. {
  1344. linkdata = getFormattedTextFromField(esp.getLinkdata(), field);
  1345. }
  1346. // linkdata is optional, even if specified
  1347. if (null != linkdata) {
  1348. String[] links = linkdata.split("\\s+");
  1349. e.setSemanticLinks(Arrays.asList(links));
  1350. }
  1351. }
  1352. // Extract Entity GEO or set Entity Geo equal to DocGeo if specified via useDocGeo
  1353. if (esp.getGeotag() != null)
  1354. {
  1355. GeoPojo geo = getEntityGeo(esp.getGeotag(), null, field);
  1356. if (null != geo) {
  1357. e.setGeotag(geo);
  1358. }
  1359. // (Allow this field to be intrinsically optional)
  1360. // If no ontology type is specified, derive it from getEntityGeo:
  1361. if (null == esp.getOntology_type()) {
  1362. esp.setOntology_type(esp.getGeotag().getOntology_type());
  1363. }
  1364. }
  1365. else if (esp.getUseDocGeo() == true)
  1366. {
  1367. GeoPojo geo = getEntityGeo(null, f, field);
  1368. if (null != geo) {
  1369. e.setGeotag(geo);
  1370. }
  1371. // (Allow this field to be intrinsically optional)
  1372. }
  1373. // Entity.ontological_type (
  1374. String ontology_type = null;
  1375. if (esp.getOntology_type() != null)
  1376. {
  1377. if (JavaScriptUtils.containsScript(esp.getOntology_type()))
  1378. {
  1379. ontology_type = (String)getValueFromScript(esp.getOntology_type(), field, index);
  1380. }
  1381. else
  1382. {
  1383. ontology_type = getFormattedTextFromField(esp.getOntology_type(), field);
  1384. }
  1385. // Allow this field to be intrinsically optional
  1386. }
  1387. // If ontological_type == null, go fetch it from the internal lookup
  1388. if (ontology_type == null) {
  1389. e.setOntology_type(GeoOntologyMapping.mapEntityToOntology(type));
  1390. }
  1391. else if ('p' == GeoOntologyMapping.encodeOntologyCode(ontology_type) && !ontology_type.equals("point")) {
  1392. // In this case we don't recognize the ontology type so we'll overwrite it
  1393. e.setOntology_type(GeoOntologyMapping.mapEntityToOntology(type));
  1394. }
  1395. e.setOntology_type(ontology_type);
  1396. // Add the index and geotag to geomap to get used by associations with matching indexes
  1397. if (e.getGeotag() != null)
  1398. {
  1399. _geoMap.put(e.getIndex(), e.getGeotag());
  1400. }
  1401. _entityMap.add(e.getIndex());
  1402. return e;
  1403. }
  1404. catch (Exception ex)
  1405. {
  1406. return null;
  1407. }
  1408. }
  1409. /**
  1410. * getAssociations
  1411. * @param esps
  1412. * @param f
  1413. * @return
  1414. * @throws JSONException
  1415. */
  1416. private List<AssociationPojo> getAssociations(List<AssociationSpecPojo> esps, DocumentPojo f) throws JSONException
  1417. {
  1418. // If the feed already has associations we want to add the new associations to the list of existing associations
  1419. List<AssociationPojo> associations = null;
  1420. if (f.getAssociations() != null)
  1421. {
  1422. associations = f.getAssociations();
  1423. }
  1424. // Otherwise we create a new arraylist to hold the new associations we are adding
  1425. else
  1426. {
  1427. associations = new ArrayList<AssociationPojo>();
  1428. }
  1429. repopulateEntityCacheIfNeeded(f);
  1430. // Iterate over each AssociationSpecPojo and try to create an entity, or entities, from the data
  1431. JSONObject metadata = null;
  1432. if (_document.has("metadata")) {
  1433. metadata = _document.getJSONObject("metadata");
  1434. }
  1435. for (AssociationSpecPojo esp : esps)
  1436. {
  1437. try {
  1438. List<AssociationPojo> tempAssocs = getAssociations(esp, f, metadata);
  1439. if (null != tempAssocs) {
  1440. for (AssociationPojo e : tempAssocs)
  1441. {
  1442. associations.add(e);
  1443. }
  1444. }
  1445. }
  1446. catch (Exception e) {} // (prob just a missing field)
  1447. }
  1448. return associations;
  1449. }
  1450. /**
  1451. * getAssociations(List<AssociationSpecPojo> esps, DocumentPojo f)
  1452. * @param esps
  1453. * @param f
  1454. * @return List<AssociationPojo>
  1455. */
  1456. private List<AssociationPojo> getAssociations(AssociationSpecPojo esp, DocumentPojo f, JSONObject currObj)
  1457. {
  1458. List<AssociationPojo> associations = new ArrayList<AssociationPojo>();
  1459. try
  1460. {
  1461. //
  1462. if (esp.getIterateOver() != null)
  1463. {
  1464. String iterateOver = esp.getIterateOver();
  1465. String slashSplit[] = iterateOver.split("/");
  1466. String commaSplit[] = iterateOver.split(",");
  1467. // START - Multiplicative/Additive Association Creation
  1468. // entity1/entity2/geo_index/time_start/time_end or entity1,entity2,geo_index,time_start,time_end
  1469. if (slashSplit.length > 1 || commaSplit.length > 1)
  1470. {
  1471. ArrayList<String[]> assocsToCreate = new ArrayList<String[]> ();
  1472. // Multiplicative - entity1/entity2/geo_index/time_start/time_end
  1473. if (slashSplit.length > 1)
  1474. {
  1475. assocsToCreate = getMultiplicativeAssociations(esp, iterateOver, f);
  1476. }
  1477. // WARNING: This code has not been tested! It should work but...
  1478. // Additive - entity1,entity2,geo_index,time_start,time_end
  1479. else if (commaSplit.length > 1)
  1480. {
  1481. assocsToCreate = getAdditiveAssociations(esp, iterateOver, f);
  1482. }
  1483. // Iterate over each association String[] returned and (try to) create a new AssociationSpecPojo
  1484. if (assocsToCreate != null)
  1485. {
  1486. for (String[] assocToCreate : assocsToCreate)
  1487. {
  1488. JSONObject currIt = new JSONObject();
  1489. AssociationSpecPojo newAssoc = new AssociationSpecPojo();
  1490. // Entity1
  1491. if (assocToCreate[0] !=null) {
  1492. newAssoc.setEntity1_index(assocToCreate[0].replace("$", "${$}"));
  1493. currIt.put("entity1_index", assocToCreate[0]);
  1494. }
  1495. else {
  1496. newAssoc.setEntity1(esp.getEntity1());
  1497. newAssoc.setEntity1_index(esp.getEntity1_index());
  1498. }
  1499. // Entity2
  1500. if (assocToCreate[1] !=null) {
  1501. newAssoc.setEntity2_index(assocToCreate[1].replace("$", "${$}"));
  1502. currIt.put("entity2_index", assocToCreate[1]);
  1503. }
  1504. else {
  1505. newAssoc.setEntity2(esp.getEntity2());
  1506. newAssoc.setEntity2_index(esp.getEntity2_index());
  1507. }
  1508. // Geo_index
  1509. if (assocToCreate[2] !=null) {
  1510. newAssoc.setGeo_index(assocToCreate[2].replace("$", "${$}"));
  1511. currIt.put("geo_index", assocToCreate[2]);
  1512. }
  1513. else { newAssoc.setGeo_index(esp.getGeo_index()); }
  1514. // Time_start
  1515. if (assocToCreate[3] !=null) {
  1516. newAssoc.setTime_start(assocToCreate[3].replace("$", "${$}"));
  1517. currIt.put("time_start", assocToCreate[3]);
  1518. }
  1519. else { newAssoc.setTime_start(esp.getTime_start()); }
  1520. // Time_end
  1521. if (assocToCreate[4] !=null) {
  1522. newAssoc.setTime_end(assocToCreate[4].replace("$", "${$}"));
  1523. currIt.put("time_end", assocToCreate[4]);
  1524. }
  1525. else { newAssoc.setTime_end(esp.getTime_end()); }
  1526. // Misc. Fields to copy from the original pojo
  1527. newAssoc.setCreationCriteriaScript(esp.getCreationCriteriaScript());
  1528. newAssoc.setVerb(esp.getVerb());
  1529. newAssoc.setVerb_category(esp.getVerb_category());
  1530. newAssoc.setAssoc_type(esp.getAssoc_type());
  1531. newAssoc.setGeotag(esp.getGeotag());
  1532. // Create an association from the AssociationSpecPojo and document
  1533. JSONObject savedIterator = _iterator; // (just in case this needs to be retained - i don't think it does)
  1534. if (null != _scriptEngine) { // (in case no script engine specified)
  1535. _iterator = currIt;
  1536. }
  1537. AssociationPojo association = getAssociation(newAssoc, null, null, f);
  1538. if (association != null) associations.add(association);
  1539. _iterator = savedIterator;
  1540. }
  1541. //TESTED (including the ${$} escaping)
  1542. }
  1543. }
  1544. // END - Multiplicative/Additive Association Creation
  1545. //
  1546. else if (null != currObj) // Single field iterateOver
  1547. {
  1548. try
  1549. {
  1550. // Check to see if the arrayRoot specified exists in the current doc before proceeding
  1551. // Get array of association records from the specified root element
  1552. Object itEl = null;
  1553. try {
  1554. itEl = currObj.get(iterateOver);
  1555. }
  1556. catch (JSONException e) {} // carry on, trapped below...
  1557. if (null == itEl) {
  1558. return associations;
  1559. }
  1560. JSONArray assocRecords = null;
  1561. try {
  1562. assocRecords = currObj.getJSONArray(iterateOver);
  1563. }
  1564. catch (JSONException e) {} // carry on, trapped below...
  1565. if (null == assocRecords) {
  1566. assocRecords = new JSONArray();
  1567. assocRecords.put(itEl);
  1568. }
  1569. //TESTED
  1570. // Get the type of object contained in assocRecords[0]
  1571. if (assocRecords.length() > 0) {
  1572. String objType = assocRecords.get(0).getClass().toString();
  1573. // EntityRecords is a simple String[] array of associations
  1574. if (objType.equalsIgnoreCase("class java.lang.String"))
  1575. {
  1576. // Iterate over array elements and extract associations
  1577. for (int i = 0; i < assocRecords.length(); ++i)
  1578. {
  1579. String field = assocRecords.getString(i);
  1580. long nIndex = Long.valueOf(i);
  1581. if (null != esp.getVerb_category()) { // (ie a mandatory field is present)
  1582. AssociationPojo association = getAssociation(esp, field, nIndex, f);
  1583. if (association != null) associations.add(association);
  1584. }//TESTED
  1585. // Does the association break out into multiple associations?
  1586. if (esp.getAssociations() != null)
  1587. {
  1588. // Iterate over the associations and call getAssociations recursively
  1589. for (AssociationSpecPojo subEsp : esp.getAssociations())
  1590. {
  1591. if (null != subEsp.getIterateOver()) {
  1592. if (null == subEsp.getCreationCriteriaScript()) {
  1593. _context.getHarvestStatus().logMessage(new StringBuffer("In iterator ").
  1594. append(esp.getIterateOver()).append(", trying to loop over field '").
  1595. append(subEsp.getIterateOver()).append("' in array of primitives.").toString(), true);
  1596. }
  1597. else {
  1598. this.executeEntityAssociationValidation(subEsp.getCreationCriteriaScript(), field, Long.toString(nIndex));
  1599. }
  1600. // (any creation criteria script indicates user accepts it can be either)
  1601. }
  1602. if (null != subEsp.getVerb_category()) { // (ie a mandatory field is present)
  1603. AssociationPojo association = getAssociation(subEsp, field, nIndex, f);
  1604. if (association != null) associations.add(association);
  1605. }
  1606. }
  1607. }//TESTED (error case)
  1608. }
  1609. }
  1610. // EntityRecords is a JSONArray
  1611. else if (objType.equalsIgnoreCase("class org.json.JSONObject"))
  1612. {
  1613. // Iterate over array elements and extract associations
  1614. for (int i = 0; i < assocRecords.length(); ++i)
  1615. {
  1616. // Get JSONObject containing association fields and pass assocElement
  1617. // into the script engine so scripts can access it
  1618. JSONObject savedIterator = null;
  1619. if (_scriptEngine != null)
  1620. {
  1621. _iterator = savedIterator = assocRecords.getJSONObject(i);
  1622. }
  1623. if (null != esp.getVerb_category()) { // (ie a mandatory field is present)
  1624. AssociationPojo association = getAssociation(esp, null, Long.valueOf(i), f);
  1625. if (association != null) associations.add(association);
  1626. }//TESTED
  1627. // Does the association break out into multiple associations?
  1628. if (esp.getAssociations() != null)
  1629. {
  1630. // Iterate over the associations and call getAssociations recursively
  1631. for (AssociationSpecPojo subEsp : esp.getAssociations())
  1632. {
  1633. _iterator = savedIterator; // (reset this)
  1634. List<AssociationPojo> subAssocs = getAssociations(subEsp, f, _iterator);
  1635. for (AssociationPojo e : subAssocs)
  1636. {
  1637. associations.add(e);
  1638. }
  1639. }
  1640. }
  1641. }//(else if is json object)
  1642. }//(end if >0 array elements)
  1643. if (_iterator != currObj) { // top level
  1644. _iterator = null;
  1645. }
  1646. }
  1647. }
  1648. catch (Exception e)
  1649. {
  1650. //System.out.println(e.getMessage());
  1651. //DEBUG (don't output log messages per doc)
  1652. //logger.error("Exception: " + e.getMessage(), e);
  1653. }
  1654. }
  1655. }
  1656. //
  1657. else // No iterate over at all
  1658. {
  1659. AssociationPojo association = getAssociation(esp, null, null, f);
  1660. if (association != null) associations.add(association);
  1661. }
  1662. return associations;
  1663. }
  1664. catch (Exception e)
  1665. {
  1666. //DEBUG (don't output log messages per doc)
  1667. //logger.error("Exception: " + e.getMessage());
  1668. return null;
  1669. }
  1670. }
  1671. /**
  1672. * getMultiplicativeAssociations
  1673. * @param iterateOver
  1674. * @param f
  1675. * @return
  1676. */
  1677. private ArrayList<String[]> getMultiplicativeAssociations(AssociationSpecPojo esp, String iterateOver, DocumentPojo f)
  1678. {
  1679. // Split iterateOver into a list of fields
  1680. String[] entityFields = iterateOver.split("/");
  1681. // ArrayList to store association entities in and extract the entities (disambiguous names) from feed.entities
  1682. HashMap<String, ArrayList<String>> entityLists = extractEntityLists(esp, entityFields, f);
  1683. // Calculate the total number of associations to create from the EntitySpecPojo
  1684. Hashtable<String, Integer> assocCounts = getTotalNumberOfAssociations(entityLists, entityFields);
  1685. int totalNumberOfAssociations = (Integer) assocCounts.get("totalNumberOfAssociations");
  1686. if (totalNumberOfAssociations > 0)
  1687. {
  1688. ArrayList<String[]> assocsToCreate = new ArrayList<String[]> ();
  1689. int entity1Number = 1;
  1690. int entity2Number = 1;
  1691. int geoIndexNumber = 1;
  1692. int timeStartNumber = 1;
  1693. int timeEndNumber = 1;
  1694. for (int i = 0; i < totalNumberOfAssociations; i++)
  1695. {
  1696. try
  1697. {
  1698. String[] assocToCreate = new String[5];
  1699. // Entity1
  1700. if (entityLists.get("entity1") != null && entityLists.get("entity1").get(entity1Number - 1) != null)
  1701. {
  1702. assocToCreate[0] = entityLists.get("entity1").get(entity1Number - 1);
  1703. if (((Integer) assocCounts.get("entity1Count") > 1) && (i % (Integer) assocCounts.get("entity1Repeat") == 0)) entity1Number++;
  1704. if (entity1Number > entityLists.get("entity1").size()) entity1Number = 1;
  1705. }
  1706. // Entity2
  1707. if (entityLists.get("entity2") != null && entityLists.get("entity2").get(entity2Number - 1) != null)
  1708. {
  1709. assocToCreate[1] = entityLists.get("entity2").get(entity2Number - 1);
  1710. if (((Integer) assocCounts.get("entity2Count") > 1) && (i % (Integer) assocCounts.get("entity2Repeat") == 0)) entity2Number++;
  1711. if (entity2Number > entityLists.get("entity2").size()) entity2Number = 1;
  1712. }
  1713. // Geo_Index
  1714. if (entityLists.get("geo_index") != null && entityLists.get("geo_index").get(geoIndexNumber - 1) != null)
  1715. {
  1716. assocToCreate[2] = entityLists.get("geo_index").get(geoIndexNumber - 1);
  1717. if (((Integer) assocCounts.get("geoIndexCount") > 1) && (i % (Integer) assocCounts.get("geoIndexCount") == 0)) geoIndexNumber++;
  1718. if (geoIndexNumber > entityLists.get("geoIndexCount").size()) geoIndexNumber = 1;
  1719. }
  1720. // Time_Start
  1721. if (entityLists.get("time_start") != null && entityLists.get("time_start").get(timeStartNumber - 1) != null)
  1722. {
  1723. assocToCreate[3] = entityLists.get("time_start").get(timeStartNumber - 1);
  1724. if (((Integer) assocCounts.get("timeStartCount") > 1) && (i % (Integer) assocCounts.get("timeStartCount") == 0)) timeStartNumber++;
  1725. if (geoIndexNumber > entityLists.get("timeStartCount").size()) geoIndexNumber = 1;
  1726. }
  1727. // Time_End
  1728. if (entityLists.get("time_end") != null && entityLists.get("time_end").get(timeEndNumber - 1) != null)
  1729. {
  1730. assocToCreate[4] = entityLists.get("time_end").get(timeEndNumber - 1);
  1731. if (((Integer) assocCounts.get("timeEndCount") > 1) && (i % (Integer) assocCounts.get("timeEndCount") == 0)) timeEndNumber++;
  1732. }
  1733. assocsToCreate.add(assocToCreate);
  1734. }
  1735. catch (Exception e)
  1736. {
  1737. //e.printStackTrace();
  1738. //System.out.println(e.getMessage());
  1739. //logger.error("Exception: " + e.getMessage());
  1740. }
  1741. }
  1742. return assocsToCreate;
  1743. }
  1744. else
  1745. {
  1746. return null;
  1747. }
  1748. }
  1749. /**
  1750. * extractEntityLists
  1751. * @param esp
  1752. * @param entityFields
  1753. * @param f
  1754. * @return
  1755. */
  1756. private HashMap<String, ArrayList<String>> extractEntityLists(AssociationSpecPojo esp, String[] entityFields, DocumentPojo f)
  1757. {
  1758. // ArrayList to store association entities in
  1759. HashMap<String, ArrayList<String>> entityLists = new HashMap<String, ArrayList<String>>();
  1760. // Get the list of entities from the feed
  1761. List<EntityPojo> entities = f.getEntities();
  1762. // These are the fields over which we are iterating
  1763. for (String field : entityFields)
  1764. {
  1765. // Get the specified type for this field
  1766. String typeValue = getFieldValueFromAssociationSpecPojo(esp, field);
  1767. // Get the index for any entity that matches the type field
  1768. ArrayList<String> indexes = new ArrayList<String>();
  1769. if (typeValue != null)
  1770. {
  1771. for (EntityPojo e : entities)
  1772. {
  1773. if (e.getType().equalsIgnoreCase(typeValue))
  1774. {
  1775. if (null != e.getIndex()) {
  1776. indexes.add(e.getIndex()); // (I think the code will always take this branch)
  1777. }
  1778. else { // (this is just a harmless safety net I think)
  1779. indexes.add(new StringBuffer(e.getDisambiguatedName().toLowerCase()).append(typeValue.toLowerCase()).toString());
  1780. }
  1781. }
  1782. }
  1783. if (indexes.size() > 0) entityLists.put(field, indexes);
  1784. }
  1785. }
  1786. //TESTED (see INF1360_test_source.json:test1 for entities, :test5 for geo_index)
  1787. return entityLists;
  1788. }
  1789. /**
  1790. * getFieldValueFromAssociationSpecPojo
  1791. * @param esp
  1792. * @param field
  1793. * @return
  1794. */
  1795. private String getFieldValueFromAssociationSpecPojo(AssociationSpecPojo esp, String field)
  1796. {
  1797. if (field.equalsIgnoreCase("entity1"))
  1798. {
  1799. return esp.getEntity1();
  1800. }
  1801. else if (field.equalsIgnoreCase("entity2"))
  1802. {
  1803. return esp.getEntity2();
  1804. }
  1805. else if (field.equalsIgnoreCase("geo_index"))
  1806. {
  1807. return esp.getGeo_index();
  1808. }
  1809. else if (field.equalsIgnoreCase("time_start"))
  1810. {
  1811. return esp.getTime_start();
  1812. }
  1813. else if (field.equalsIgnoreCase("time_end"))
  1814. {
  1815. return esp.getTime_end();
  1816. }
  1817. else
  1818. {
  1819. return null;
  1820. }
  1821. }
  1822. /**
  1823. * getTotalNumberOfAssociations
  1824. * @param entityLists
  1825. * @return
  1826. */
  1827. private Hashtable<String, Integer> getTotalNumberOfAssociations(HashMap<String, ArrayList<String>> entityLists, String[] entityFields)
  1828. {
  1829. // Create Hashtable to hold count values referenced by name: i.e. totalNumberOfAssociations
  1830. Hashtable<String, Integer> retVal = new Hashtable<String, Integer>();
  1831. //
  1832. int entity1_count = 0;
  1833. int entity2_count = 0;
  1834. int geo_index_count = 0;
  1835. int time_start_count = 0;
  1836. int time_end_count = 0;
  1837. // Count up the total number of associations that need to be created
  1838. // Total Number of Associations = entity1 * entity2 * geo_index * time_start * time_end
  1839. // Note: Only calculates based on the fields passed in the entityFields String[] and
  1840. // the number of matching values in entityLists. If one of those values is 0 then the
  1841. // total number of associations = 0
  1842. int totalAssocs = 1;
  1843. for (String field : entityFields)
  1844. {
  1845. if (field.equalsIgnoreCase("entity1"))
  1846. {
  1847. entity1_count = (entityLists.get("entity1") != null) ? entityLists.get("entity1").size() : 0;
  1848. totalAssocs = totalAssocs * entity1_count;
  1849. }
  1850. if (field.equalsIgnoreCase("entity2"))
  1851. {
  1852. entity2_count = (entityLists.get("entity2") != null) ? entityLists.get("entity2").size() : 0;
  1853. totalAssocs = totalAssocs * entity2_count;
  1854. }
  1855. if (field.equalsIgnoreCase("geo_index"))
  1856. {
  1857. geo_index_count = (entityLists.get("geo_index") != null) ? entityLists.get("geo_index").size() : 0;
  1858. totalAssocs = totalAssocs * geo_index_count;
  1859. }
  1860. if (field.equalsIgnoreCase("time_start"))
  1861. {
  1862. time_start_count = (entityLists.get("time_start") != null) ? entityLists.get("time_start").size() : 0;
  1863. totalAssocs = totalAssocs * time_start_count;
  1864. }
  1865. if (field.equalsIgnoreCase("time_end"))
  1866. {
  1867. time_end_count = (entityLists.get("time_end") != null) ? entityLists.get("time_end").size() : 0;
  1868. totalAssocs = totalAssocs * time_end_count;
  1869. }
  1870. }
  1871. // Add total number of associations to the HashTable and return if the val == 0
  1872. retVal.put("totalNumberOfAssociations", totalAssocs);
  1873. if (totalAssocs == 0) return retVal;
  1874. if (entity1_count == 0) entity1_count = 1;
  1875. if (entity2_count == 0) entity2_count = 1;
  1876. if (geo_index_count == 0) geo_index_count = 1;
  1877. if (time_start_count == 0) time_start_count = 1;
  1878. if (time_end_count == 0) time_end_count = 1;
  1879. // Entity1
  1880. Double repeat = (double) (totalAssocs / entity1_count);
  1881. retVal.put("entity1Repeat", repeat.intValue());
  1882. retVal.put("entity1Count", entity1_count);
  1883. // Entity2
  1884. repeat = (double) (totalAssocs / entity1_count / entity2_count);
  1885. retVal.put("entity2Repeat", repeat.intValue());
  1886. retVal.put("entity2Count", entity2_count);
  1887. // Geo_Index
  1888. repeat = (double) (totalAssocs / entity1_count / entity2_count / geo_index_count);
  1889. retVal.put("geoIndexRepeat", repeat.intValue());
  1890. retVal.put("geoIndexCount", geo_index_count);
  1891. // Time_Start
  1892. repeat = (double) (totalAssocs / entity1_count / entity2_count / geo_index_count / time_start_count);
  1893. retVal.put("timeStartRepeat", repeat.intValue());
  1894. retVal.put("timeStartCount", time_start_count);
  1895. // Time_End
  1896. repeat = (double) (totalAssocs / entity1_count / entity2_count / geo_index_count / time_start_count / time_end_count);
  1897. retVal.put("timeEndRepeat", repeat.intValue());
  1898. retVal.put("timeEndCount", time_end_count);
  1899. return retVal;
  1900. }
  1901. /**
  1902. * getAdditiveAssociations
  1903. * @param iterateOver
  1904. * @param f
  1905. * @return
  1906. */
  1907. private ArrayList<String[]> getAdditiveAssociations(AssociationSpecPojo esp, String iterateOver, DocumentPojo f)
  1908. {
  1909. // Split iterateOver into a list of entities on ','
  1910. String[] entityFields = iterateOver.split(",");
  1911. // ArrayList to store association entities in and extract the entities (disambiguous names) from doc.entities
  1912. HashMap<String, ArrayList<String>> entityLists = extractEntityLists(esp, entityFields, f);
  1913. int itemCount = 0;
  1914. if (entityLists.size() > 0)
  1915. {
  1916. itemCount = (entityLists.get(entityFields[0]) != null) ? entityLists.get(entityFields[0]).size() : 0;
  1917. // Get an ArrayList<String> from entity1, entity2, geo_index, time_start and time_end fields as appropriate
  1918. ArrayList<String> entity1 = (entityLists.get("entity1") != null) ? entityLists.get("entity1") : null;
  1919. ArrayList<String> entity2 = (entityLists.get("entity2") != null) ? entityLists.get("entity2") : null;
  1920. ArrayList<String> geo_index = (entityLists.get("geo_index") != null) ? entityLists.get("geo_index") : null;
  1921. ArrayList<String> time_start = (entityLists.get("time_start") != null) ? entityLists.get("time_start") : null;
  1922. ArrayList<String> time_end = (entityLists.get("time_end") != null) ? entityLists.get("time_end") : null;
  1923. ArrayList<String[]> assocsToCreate = new ArrayList<String[]>();
  1924. for (int i = 0; i < itemCount; i++)
  1925. {
  1926. String[] assocToCreate = new String[5];
  1927. if (entity1 != null && entity1.get(i) != null) assocToCreate[0] = entity1.get(i);
  1928. if (entity2 != null && entity2.get(i) != null) assocToCreate[1] = entity2.get(i);
  1929. if (geo_index != null && geo_index.get(i) != null) assocToCreate[2] = geo_index.get(i);
  1930. if (time_start != null && time_start.get(i) != null) assocToCreate[3] = time_start.get(i);
  1931. if (time_end != null && time_end.get(i) != null) assocToCreate[4] = time_end.get(i);
  1932. // Only add assocToCreate to associationsToCreate if each field passed via entityFields has a value
  1933. boolean addAssocToCreate = true;
  1934. for (String s : entityFields)
  1935. {
  1936. if (s.equalsIgnoreCase("entity1") && assocToCreate[0] == null) { addAssocToCreate = false; break; }
  1937. if (s.equalsIgnoreCase("entity2") && assocToCreate[1] == null) { addAssocToCreate = false; break; }
  1938. if (s.equalsIgnoreCase("geo_index") && assocToCreate[2] == null) { addAssocToCreate = false; break; }
  1939. if (s.equalsIgnoreCase("time_start") && assocToCreate[3] == null) { addAssocToCreate = false; break; }
  1940. if (s.equalsIgnoreCase("time_end") && assocToCreate[4] == null) { addAssocToCreate = false; break; }
  1941. }
  1942. if (addAssocToCreate) assocsToCreate.add(assocToCreate);
  1943. }
  1944. return assocsToCreate;
  1945. }
  1946. else
  1947. {
  1948. return null;
  1949. }
  1950. }
  1951. /**
  1952. * getAssociation
  1953. * @param esp
  1954. * @param field
  1955. * @param count
  1956. * @param f
  1957. * @return
  1958. */
  1959. private AssociationPojo getAssociation(AssociationSpecPojo esp, String field, Long count, DocumentPojo f)
  1960. {
  1961. String index = (count != null) ? count.toString() : null;
  1962. try
  1963. {
  1964. AssociationPojo e = new AssociationPojo();
  1965. // If the AssociationSpecPojo has a creation criteria script check the association for validity
  1966. if (esp.getCreationCriteriaScript() != null && JavaScriptUtils.containsScript(esp.getCreationCriteriaScript()))
  1967. {
  1968. boolean addAssoc = executeEntityAssociationValidation(esp.getCreationCriteriaScript(), field, index);
  1969. if (!addAssoc) {
  1970. return null;
  1971. }
  1972. }
  1973. boolean bDontResolveToIndices = false; // (can always override to summary)
  1974. if (null != esp.getAssoc_type() && (esp.getAssoc_type().equalsIgnoreCase("summary"))) {
  1975. bDontResolveToIndices = true;
  1976. }
  1977. // Assoc.entity1
  1978. if ((esp.getEntity1() != null) || (esp.getEntity1_index() != null))
  1979. {
  1980. // Association.entity1_index
  1981. if (esp.getEntity1_index() != null)
  1982. {
  1983. if (JavaScriptUtils.containsScript(esp.getEntity1_index()))
  1984. {
  1985. String s = (String)getValueFromScript(esp.getEntity1_index(), field, index);
  1986. if (null != s) e.setEntity1_index(s.toLowerCase());
  1987. }
  1988. else
  1989. {
  1990. if ((_iterator != null) && (esp.getEntity1_index().startsWith("$metadata.") || esp.getEntity1_index().startsWith("${metadata."))) {
  1991. if (_context.isStandalone()) { // (minor message, while debugging only)
  1992. _context.getHarvestStatus().logMessage("Warning: in entity1_index, using global $metadata when iterating", true);
  1993. }
  1994. }
  1995. String s = getFormattedTextFromField(esp.getEntity1_index(), field);
  1996. if (null != s) e.setEntity1_index(s.toLowerCase());
  1997. }
  1998. if (null != e.getEntity1_index()) { // Convert to entity1
  1999. int nTypeIndex = e.getEntity1_index().lastIndexOf('/');
  2000. if (nTypeIndex > 0) {
  2001. e.setEntity1(e.getEntity1_index().substring(0, nTypeIndex));
  2002. if (!_entityMap.contains(e.getEntity1_index())) { // Needs to correlate with an entity
  2003. StringBuffer error = new StringBuffer("Failed to correlate entity1_index with: ").append(esp.getEntity1_index());
  2004. if (_context.isStandalone()) {
  2005. error.append(" using ").append(e.getEntity1_index());
  2006. }
  2007. _context.getHarvestStatus().logMessage(error.toString(), true);
  2008. e.setEntity1_index(null);
  2009. }//TESTED (INF1360_test_source.json:test8)
  2010. }
  2011. else { // index must be malformed
  2012. StringBuffer error = new StringBuffer("Malformed entity1_index with: ").append(esp.getEntity1_index());
  2013. if (_context.isStandalone()) {
  2014. error.append(" using ").append(e.getEntity1_index());
  2015. }
  2016. _context.getHarvestStatus().logMessage(error.toString(), true);
  2017. e.setEntity1_index(null);
  2018. }
  2019. }
  2020. }//TESTED (see INF1360_test_source.json:test2)
  2021. // entity1
  2022. if (null != esp.getEntity1()) {
  2023. if (JavaScriptUtils.containsScript(esp.getEntity1()))
  2024. {
  2025. e.setEntity1((String)getValueFromScript(esp.getEntity1(), field, index));
  2026. }
  2027. else
  2028. {
  2029. if ((_iterator != null) && (esp.getEntity1().startsWith("$metadata.") || esp.getEntity1().startsWith("${metadata."))) {
  2030. if (_context.isStandalone()) { // (minor message, while debugging only)
  2031. _context.getHarvestStatus().logMessage("Warning: in entity1, using global $metadata when iterating", true);
  2032. }
  2033. }
  2034. e.setEntity1(getFormattedTextFromField(esp.getEntity1(), field));
  2035. }
  2036. if (!bDontResolveToIndices && (null == e.getEntity1_index()))
  2037. {
  2038. // Try using the entity.disambiguated name, this isn't perfect because 2 entities with different
  2039. // types can have different dnames, but we'll try and then abandon if we get multiple hits
  2040. int nHits = 0;
  2041. String matchingIndex = null;
  2042. for (EntityPojo entity : f.getEntities())
  2043. {
  2044. if (entity.getDisambiguatedName().equalsIgnoreCase(e.getEntity1()))
  2045. {
  2046. nHits++;
  2047. if (1 == nHits) {
  2048. matchingIndex = entity.getIndex();
  2049. e.setEntity1_index(entity.getIndex());
  2050. }
  2051. else if (!matchingIndex.equals(entity.getIndex())) { // Ambiguous reference so bail out
  2052. StringBuffer error = new StringBuffer("Failed entity1_index disambiguation with: ").append(esp.getEntity1());
  2053. if (_context.isStandalone()) {
  2054. error.append(" using ").append(e.getEntity1());
  2055. }
  2056. _context.getHarvestStatus().logMessage(error.toString(), true);
  2057. e.setEntity1_index(null);
  2058. break;
  2059. }
  2060. }
  2061. } // (end loop across all indices)
  2062. }//TESTED (success and fail cases, see INF1360_test_source.json:test3)
  2063. } // (end no entity1_index extracted, entity1 specified)
  2064. // Quality checks:
  2065. if ((esp.getEntity1() != null) && (null == e.getEntity1()) && (null == esp.getCreationCriteriaScript())) {
  2066. // Specified this (entity1), so going to insist on it
  2067. if (_context.isStandalone()) { // (minor message, while debugging only)
  2068. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required entity1 from: ").append(esp.getEntity1()).toString(), true);
  2069. }
  2070. return null;
  2071. }
  2072. if ((esp.getEntity1_index() != null) && (null == e.getEntity1_index()) && (null == esp.getCreationCriteriaScript())) {
  2073. // Specified this (entity1_index), so going to insist on it
  2074. if (_context.isStandalone()) { // (minor message, while debugging only)
  2075. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required entity1_index from: ").append(esp.getEntity1_index()).toString(), true);
  2076. }
  2077. return null;
  2078. }
  2079. //TESTED INF1360_test_source:test7 (no criteria), test8 (criteria)
  2080. } // (end entity1)
  2081. // Assoc.entity2
  2082. if ((esp.getEntity2() != null) || (esp.getEntity2_index() != null))
  2083. {
  2084. // Association.entity2_index
  2085. if (esp.getEntity2_index() != null)
  2086. {
  2087. if (JavaScriptUtils.containsScript(esp.getEntity2_index()))
  2088. {
  2089. String s = (String)getValueFromScript(esp.getEntity2_index(), field, index);
  2090. if (null != s) e.setEntity2_index(s.toLowerCase());
  2091. }
  2092. else
  2093. {
  2094. if ((_iterator != null) && (esp.getEntity2_index().startsWith("$metadata.") || esp.getEntity2_index().startsWith("${metadata."))) {
  2095. if (_context.isStandalone()) { // (minor message, while debugging only)
  2096. _context.getHarvestStatus().logMessage("Warning: in entity2_index, using global $metadata when iterating", true);
  2097. }
  2098. }
  2099. String s = getFormattedTextFromField(esp.getEntity2_index(), field);
  2100. if (null != s) e.setEntity2_index(s.toLowerCase());
  2101. }
  2102. if (null != e.getEntity2_index()) { // Convert to entity2
  2103. int nTypeIndex = e.getEntity2_index().lastIndexOf('/');
  2104. if (nTypeIndex > 0) {
  2105. e.setEntity2(e.getEntity2_index().substring(0, nTypeIndex));
  2106. if (!_entityMap.contains(e.getEntity2_index())) { // Needs to correlate with an entity
  2107. StringBuffer error = new StringBuffer("Failed to correlate entity2_index with: ").append(esp.getEntity2_index());
  2108. if (_context.isStandalone()) {
  2109. error.append(" using ").append(e.getEntity2_index());
  2110. }
  2111. _context.getHarvestStatus().logMessage(error.toString(), true);
  2112. e.setEntity2_index(null);
  2113. }//TESTED (INF1360_test_source.json:test8)
  2114. }
  2115. else { // index must be malformed
  2116. StringBuffer error = new StringBuffer("Malformed entity2_index with: ").append(esp.getEntity2_index());
  2117. if (_context.isStandalone()) {
  2118. error.append(" using ").append(e.getEntity2_index());
  2119. }
  2120. _context.getHarvestStatus().logMessage(error.toString(), true);
  2121. e.setEntity2_index(null);
  2122. }
  2123. }
  2124. }//TESTED (see INF1360_test_source.json:test2)
  2125. // entity2
  2126. if (null != esp.getEntity2()) {
  2127. if (JavaScriptUtils.containsScript(esp.getEntity2()))
  2128. {
  2129. e.setEntity2((String)getValueFromScript(esp.getEntity2(), field, index));
  2130. }
  2131. else
  2132. {
  2133. if ((_iterator != null) && (esp.getEntity2().startsWith("$metadata.") || esp.getEntity2().startsWith("${metadata."))) {
  2134. if (_context.isStandalone()) { // (minor message, while debugging only)
  2135. _context.getHarvestStatus().logMessage("Warning: in entity2, using global $metadata when iterating", true);
  2136. }
  2137. }
  2138. e.setEntity2(getFormattedTextFromField(esp.getEntity2(), field));
  2139. }
  2140. if (!bDontResolveToIndices && (null == e.getEntity2_index()))
  2141. {
  2142. // Try using the entity.disambiguated name, this isn't perfect because 2 entities with different
  2143. // types can have different dnames, but we'll try and then abandon if we get multiple hits
  2144. int nHits = 0;
  2145. String matchingIndex = null;
  2146. for (EntityPojo entity : f.getEntities())
  2147. {
  2148. if (entity.getDisambiguatedName().equalsIgnoreCase(e.getEntity2()))
  2149. {
  2150. nHits++;
  2151. if (1 == nHits) {
  2152. matchingIndex = entity.getIndex();
  2153. e.setEntity2_index(entity.getIndex());
  2154. }
  2155. else if (!matchingIndex.equals(entity.getIndex())) { // Ambiguous reference so bail out
  2156. StringBuffer error = new StringBuffer("Failed entity2_index disambiguation with: ").append(esp.getEntity2());
  2157. if (_context.isStandalone()) {
  2158. error.append(" using ").append(e.getEntity2());
  2159. }
  2160. _context.getHarvestStatus().logMessage(error.toString(), true);
  2161. e.setEntity2_index(null);
  2162. break;
  2163. }
  2164. }
  2165. } // (end loop across all indices)
  2166. }//TESTED (success and fail cases, see INF1360_test_source.json:test3)
  2167. } // (end no entity2_index extracted, entity2 specified)
  2168. // Quality checks:
  2169. if ((esp.getEntity2() != null) && (null == e.getEntity2()) && (null == esp.getCreationCriteriaScript())) {
  2170. // Specified this (entity2), so going to insist on it
  2171. if (_context.isStandalone()) { // (minor message, while debugging only)
  2172. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required entity2 from: ").append(esp.getEntity2()).toString(), true);
  2173. }
  2174. return null;
  2175. }
  2176. if ((esp.getEntity2_index() != null) && (null == e.getEntity2_index()) && (null == esp.getCreationCriteriaScript())) {
  2177. // Specified this (entity2_index), so going to insist on it
  2178. if (_context.isStandalone()) { // (minor message, while debugging only)
  2179. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required entity2_index from: ").append(esp.getEntity2_index()).toString(), true);
  2180. }
  2181. return null;
  2182. }
  2183. //TESTED INF1360_test_source:test7 (no criteria), test8 (criteria)
  2184. } // (end entity2)
  2185. // Association.verb
  2186. if (esp.getVerb() != null)
  2187. {
  2188. if (JavaScriptUtils.containsScript(esp.getVerb()))
  2189. {
  2190. e.setVerb((String)getValueFromScript(esp.getVerb(), field, index));
  2191. }
  2192. else
  2193. {
  2194. e.setVerb(getFormattedTextFromField(esp.getVerb(), field));
  2195. }
  2196. if ((null == e.getVerb()) && (null == esp.getCreationCriteriaScript())) {
  2197. // Specified this, so going to insist on it
  2198. if (_context.isStandalone()) { // (minor message, while debugging only)
  2199. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required verb from: ").append(esp.getVerb()).toString(), true);
  2200. }
  2201. return null;
  2202. }
  2203. }
  2204. // Association.verb_category
  2205. if (esp.getVerb_category() != null)
  2206. {
  2207. if (JavaScriptUtils.containsScript(esp.getVerb_category()))
  2208. {
  2209. String s = (String)getValueFromScript(esp.getVerb_category(), field, index);
  2210. if (null != s) e.setVerb_category(s.toLowerCase());
  2211. }
  2212. else
  2213. {
  2214. String s = getFormattedTextFromField(esp.getVerb_category(), field);
  2215. if (null != s) e.setVerb_category(s.toLowerCase());
  2216. }
  2217. }
  2218. if (null == e.getVerb_category()) { // Needed: verb category (get from verb if not specified)
  2219. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required verb_category from: ").append(esp.getVerb_category()).toString(), true);
  2220. return null;
  2221. }
  2222. if (null == e.getVerb()) { // set from verb cat
  2223. e.setVerb(e.getVerb_category());
  2224. }
  2225. // Entity.start_time
  2226. if (esp.getTime_start() != null)
  2227. {
  2228. String startTimeString = null;
  2229. if (JavaScriptUtils.containsScript(esp.getTime_start()))
  2230. {
  2231. startTimeString = (String)getValueFromScript(esp.getTime_start(), field, index);
  2232. }
  2233. else
  2234. {
  2235. startTimeString = getFormattedTextFromField(esp.getTime_start(), field);
  2236. }
  2237. if (null != startTimeString) {
  2238. e.setTime_start(DateUtility.getIsoDateString(startTimeString));
  2239. }
  2240. // Allow this to be intrinsically optional
  2241. }
  2242. // Entity.end_time
  2243. if (esp.getTime_end() != null)
  2244. {
  2245. String endTimeString = null;
  2246. if (JavaScriptUtils.containsScript(esp.getTime_end()))
  2247. {
  2248. endTimeString = (String)getValueFromScript(esp.getTime_end(), field, index);
  2249. }
  2250. else
  2251. {
  2252. endTimeString = getFormattedTextFromField(esp.getTime_end(), field);
  2253. }
  2254. if (null != endTimeString) {
  2255. e.setTime_end(DateUtility.getIsoDateString(endTimeString));
  2256. }
  2257. // Allow this to be intrinsically optional
  2258. }
  2259. // Entity.geo_index
  2260. if (esp.getGeo_index() != null)
  2261. {
  2262. String geo_entity = null;
  2263. if (JavaScriptUtils.containsScript(esp.getGeo_index()))
  2264. {
  2265. geo_entity = (String)getValueFromScript(esp.getGeo_index(), field, index);
  2266. }
  2267. else
  2268. {
  2269. if ((_iterator != null) && (esp.getGeo_index().startsWith("$metadata.") || esp.getGeo_index().startsWith("${metadata."))) {
  2270. if (_context.isStandalone()) { // (minor message, while debugging only)
  2271. _context.getHarvestStatus().logMessage("Warning: in geo_index, using global $metadata when iterating", true);
  2272. }
  2273. }
  2274. geo_entity = getFormattedTextFromField(esp.getGeo_index(), field);
  2275. }
  2276. if (null != geo_entity) {
  2277. geo_entity = geo_entity.toLowerCase();
  2278. if (geo_entity.lastIndexOf('/') < 0) {
  2279. StringBuffer error = new StringBuffer("Malformed entity2_index with: ").append(esp.getGeo_index());
  2280. if (_context.isStandalone()) {
  2281. error.append(" using ").append(geo_entity);
  2282. }
  2283. _context.getHarvestStatus().logMessage(error.toString(), true);
  2284. geo_entity = null;
  2285. }
  2286. if (!_entityMap.contains(geo_entity)) {
  2287. StringBuffer error = new StringBuffer("Failed to disambiguate geo_index with: ").append(esp.getGeo_index());
  2288. if (_context.isStandalone()) {
  2289. error.append(" using ").append(geo_entity);
  2290. }
  2291. _context.getHarvestStatus().logMessage(error.toString(), true);
  2292. geo_entity = null;
  2293. }
  2294. //TESTED (INF1360_test_source:test4b)
  2295. }
  2296. //TESTED (INF1360_test_source:test4, test5, test6)
  2297. if (null != geo_entity) e.setGeo_index(geo_entity);
  2298. GeoPojo s1 = _geoMap.get(geo_entity);
  2299. e.setGeotag(s1);
  2300. //TESTED (INF1360_test_source:test4)
  2301. // Allow this to be intrinsically optional
  2302. }
  2303. // Get geo information based on geo tag
  2304. if (e.getGeotag() == null)
  2305. {
  2306. // Extract association geoTag if it exists in the association
  2307. if (esp.getGeotag() != null)
  2308. {
  2309. e.setGeotag(getEntityGeo(esp.getGeotag(), null, field));
  2310. }
  2311. // Otherwise search geoMap on index (entity1_index, entity2_index) for a geoTag
  2312. else
  2313. {
  2314. if (e.getEntity1_index() != null || e.getEntity2_index() != null)
  2315. {
  2316. GeoPojo s1 = _geoMap.get(e.getEntity1_index());
  2317. if (s1 != null)
  2318. {
  2319. e.setGeotag(s1);
  2320. e.setGeo_index(e.getEntity1_index());
  2321. }
  2322. else {
  2323. GeoPojo s2 = _geoMap.get(e.getEntity2_index());
  2324. if (s2 != null)
  2325. {
  2326. e.setGeotag(s2);
  2327. e.setGeo_index(e.getEntity2_index());
  2328. }
  2329. }
  2330. }
  2331. }
  2332. // Allow this to be intrinsically optional
  2333. }
  2334. // If all the indexes are null don't add the association
  2335. if (e.getEntity1_index() == null && e.getEntity2_index() == null && e.getGeo_index() == null) {
  2336. if (bDontResolveToIndices && _context.isStandalone()) { // (minor message, while debugging only)
  2337. _context.getHarvestStatus().logMessage("Warning: for summaries, at least one entity must be manually specified as an index", true);
  2338. }
  2339. return null;
  2340. }
  2341. // Calculate association type
  2342. if (bDontResolveToIndices) {
  2343. e.setAssociation_type("Summary");
  2344. }
  2345. else {
  2346. e.setAssociation_type(AssociationUtils.getAssocType(e));
  2347. if (null != esp.getAssoc_type()) {
  2348. if (!e.getAssociation_type().equals("Summary")) {
  2349. // Allowed to switch event<->fact
  2350. if (esp.getAssoc_type().equalsIgnoreCase("fact")) {
  2351. e.setAssociation_type("Fact");
  2352. }
  2353. else if (esp.getAssoc_type().equalsIgnoreCase("event")) {
  2354. e.setAssociation_type("Event");
  2355. }
  2356. }
  2357. }
  2358. }
  2359. return e;
  2360. }
  2361. catch (Exception e)
  2362. {
  2363. // This can happen as part of normal logic flow
  2364. //logger.error("Exception: " + e.getMessage());
  2365. return null;
  2366. }
  2367. }
  2368. /**
  2369. * getValueFromScript
  2370. * @param script
  2371. * @param value
  2372. * @param index
  2373. * @return
  2374. */
  2375. private Object getValueFromScript(String script, String value, String index)
  2376. {
  2377. return getValueFromScript(script, value, index, true);
  2378. }
  2379. private Object getValueFromScript(String script, String value, String index, boolean errorOnNull)
  2380. {
  2381. Object retVal = null;
  2382. try
  2383. {
  2384. // Create script object from entity or association JSON
  2385. if (_iterator != null)
  2386. {
  2387. if (null == _scriptEngine) {
  2388. throw new RuntimeException("Using script without specifying 'scriptEngine' field in 'structuredAnalysis'");
  2389. }
  2390. _scriptEngine.put("_iterator", _iterator);
  2391. _securityManager.eval(_scriptEngine, JavaScriptUtils.iteratorDocScript);
  2392. }
  2393. else {
  2394. _scriptEngine.put("_iterator", null);
  2395. }
  2396. // Pass value into script as _value so it is accessible
  2397. if (value != null) {
  2398. if (null == _scriptEngine) {
  2399. throw new RuntimeException("Using script without specifying 'scriptEngine' field in 'structuredAnalysis'");
  2400. }
  2401. _scriptEngine.put("_value", value);
  2402. }
  2403. else {
  2404. _scriptEngine.put("_value", null);
  2405. }
  2406. //
  2407. if (index != null) {
  2408. if (null == _scriptEngine) {
  2409. throw new RuntimeException("Using script without specifying 'scriptEngine' field in 'structuredAnalysis'");
  2410. }
  2411. _scriptEngine.put("_index", index);
  2412. }
  2413. else if (_iteratorIndex != null) {
  2414. if (null == _scriptEngine) {
  2415. throw new RuntimeException("Using script without specifying 'scriptEngine' field in 'structuredAnalysis'");
  2416. }
  2417. _scriptEngine.put("_index", _iteratorIndex);
  2418. }
  2419. else {
  2420. _scriptEngine.put("_index", null);
  2421. }
  2422. // $SCRIPT - string contains javacript to pass into the engine
  2423. // via .eval and then invoke to get a return value of type Object
  2424. if (script.toLowerCase().startsWith("$script"))
  2425. {
  2426. if (null == _scriptEngine) {
  2427. throw new RuntimeException("Using script without specifying 'scriptEngine' field in 'structuredAnalysis'");
  2428. }
  2429. _securityManager.eval(_scriptEngine, JavaScriptUtils.getScript(script));
  2430. //must turn the security back on when invoking calls
  2431. _securityManager.setJavascriptFlag(true);
  2432. try {
  2433. retVal = _scriptInvoker.invokeFunction(JavaScriptUtils.genericFunctionCall);
  2434. }
  2435. finally {
  2436. _securityManager.setJavascriptFlag(false);
  2437. }
  2438. }
  2439. // $FUNC - string contains the name of a function to call (i.e. getSometing(); )
  2440. else if (script.toLowerCase().startsWith("$func"))
  2441. {
  2442. if (null == _scriptEngine) {
  2443. throw new RuntimeException("Using script without specifying 'scriptEngine' field in 'structuredAnalysis'");
  2444. }
  2445. retVal = _securityManager.eval(_scriptEngine, JavaScriptUtils.getScript(script));
  2446. }
  2447. if (errorOnNull && (null == retVal) && _context.isStandalone()) { // Display warning:
  2448. StringBuffer error = new StringBuffer("Failed to get value from: ");
  2449. error.append("script=").append(script).append("; iterator=").append(null==_iterator?"null":_iterator.toString()).
  2450. append("; value=").append(null==value?"null":value).
  2451. append("; index=").append(index == null?_iteratorIndex:index);
  2452. _context.getHarvestStatus().logMessage(error.toString(), true);
  2453. }
  2454. }
  2455. catch (Exception e)
  2456. {
  2457. //e.printStackTrace();
  2458. StringBuffer error = HarvestExceptionUtils.createExceptionMessage(e);
  2459. error.append(": script=").append(script);
  2460. if (_context.isStandalone()) { // Standalone mode, provide more details
  2461. error.append("; iterator=").append(null==_iterator?"null":_iterator.toString()).
  2462. append("; value=").append(null==value?"null":value).
  2463. append("; index=").append(index == null?_iteratorIndex:index);
  2464. }
  2465. _context.getHarvestStatus().logMessage(error.toString(), true);
  2466. }
  2467. return retVal;
  2468. }
  2469. /**
  2470. * getDocGeo(DocGeoSpecPojo d)
  2471. * Convert the contents of a DocGeoSpecPojo to a GeoJSONPojo, i.e. return
  2472. * latitude and longitude for a feed
  2473. * @param d DocGeoSpecPojo
  2474. * @return GeoJSONPojo
  2475. */
  2476. private GeoPojo getDocGeo(GeoSpecPojo d)
  2477. {
  2478. GeoPojo docGeo = new GeoPojo();
  2479. String latValue = null;
  2480. String lonValue = null;
  2481. try
  2482. {
  2483. // The DocSpecGeoPojo already has lat and lon so we just need to retrieve the values
  2484. if ((d.getLat() != null) && (d.getLon() != null))
  2485. {
  2486. if (JavaScriptUtils.containsScript(d.getLat()))
  2487. {
  2488. latValue = (String)getValueFromScript(d.getLat(), null, null);
  2489. }
  2490. else
  2491. {
  2492. latValue = getStringFromJsonField(d.getLat(), null);
  2493. }
  2494. if (JavaScriptUtils.containsScript(d.getLat()))
  2495. {
  2496. lonValue = (String)getValueFromScript(d.getLon(), null, null);
  2497. }
  2498. else
  2499. {
  2500. lonValue = getStringFromJsonField(d.getLon(), null);
  2501. }
  2502. }
  2503. // Try and retrieve lat and lon using city, state, country values
  2504. else
  2505. {
  2506. String city, region, country, countryCode = null;
  2507. // Create a GeoReferencePojo from the DocSpecGeo object
  2508. GeoFeaturePojo g = new GeoFeaturePojo();
  2509. if (d.getCity() != null)
  2510. {
  2511. if (JavaScriptUtils.containsScript(d.getCity()))
  2512. {
  2513. city = (String)getValueFromScript(d.getCity(), null, null);
  2514. }
  2515. else
  2516. {
  2517. city = getFormattedTextFromField(d.getCity(), null);
  2518. }
  2519. g.setCity(city);
  2520. g.setSearch_field(city);
  2521. }
  2522. if (d.getStateProvince() != null)
  2523. {
  2524. if (JavaScriptUtils.containsScript(d.getStateProvince()))
  2525. {
  2526. region = (String)getValueFromScript(d.getStateProvince(), null, null);
  2527. }
  2528. else
  2529. {
  2530. region = getFormattedTextFromField(d.getStateProvince(), null);
  2531. }
  2532. g.setRegion(region);
  2533. if (g.getSearch_field() == null) g.setSearch_field(region);
  2534. }
  2535. if (d.getCountry() != null)
  2536. {
  2537. if (JavaScriptUtils.containsScript(d.getCountry()))
  2538. {
  2539. country = (String)getValueFromScript(d.getCountry(), null, null);
  2540. }
  2541. else
  2542. {
  2543. country = getFormattedTextFromField(d.getCountry(), null);
  2544. }
  2545. g.setCountry(country);
  2546. if (g.getSearch_field() == null) g.setSearch_field(country);
  2547. }
  2548. if (d.getCountryCode() != null)
  2549. {
  2550. if (JavaScriptUtils.containsScript(d.getCountryCode()))
  2551. {
  2552. countryCode = (String)getValueFromScript(d.getCountryCode(), null, null);
  2553. }
  2554. else
  2555. {
  2556. countryCode = getFormattedTextFromField(d.getCountryCode(), null);
  2557. }
  2558. g.setCountry_code(countryCode);
  2559. if (g.getSearch_field() == null) g.setSearch_field(countryCode);
  2560. }
  2561. // Send the GeoReferencePojo to enrichGeoInfo to attempt to get lat and lon values
  2562. boolean bStrictMatch = (null == d.getStrictMatch()) || d.getStrictMatch();
  2563. List<GeoFeaturePojo> gList = GeoReference.enrichGeoInfo(g, bStrictMatch, true, 1);
  2564. latValue = gList.get(0).getGeoindex().lat.toString();
  2565. lonValue = gList.get(0).getGeoindex().lon.toString();
  2566. }
  2567. // Set lat and long in DocGeo if possible
  2568. docGeo.lat = Double.parseDouble(latValue);
  2569. docGeo.lon = Double.parseDouble(lonValue);
  2570. if (docGeo.lat == 0 && docGeo.lon == 0) docGeo = null; // Don't save 0,0 vals
  2571. }
  2572. catch (Exception e)
  2573. {
  2574. if (null != d.getAlternatives()) {
  2575. for (GeoSpecPojo altIn: d.getAlternatives()) {
  2576. GeoPojo altOut = getDocGeo(altIn);
  2577. if (null != altOut) {
  2578. return altOut;
  2579. }
  2580. }
  2581. }
  2582. docGeo = null;
  2583. }
  2584. return docGeo;
  2585. }
  2586. /**
  2587. * getEntityGeo
  2588. * Get GeoPojo object for entities and associations
  2589. * @param gsp
  2590. * @return
  2591. */
  2592. private GeoPojo getEntityGeo(GeoSpecPojo gsp, DocumentPojo f, String field)
  2593. {
  2594. try
  2595. {
  2596. GeoPojo g = null;
  2597. Double dLat = (double) 0;
  2598. Double dLon = (double) 0;
  2599. if (gsp != null)
  2600. {
  2601. String latValue = null;
  2602. String lonValue = null;
  2603. // The GeoSpecPojo already has lat and lon so we just need to retrieve the values
  2604. if ((gsp.getLat() != null) && (gsp.getLon() != null)) {
  2605. if (JavaScriptUtils.containsScript(gsp.getLat()))
  2606. {
  2607. latValue = (String)getValueFromScript(gsp.getLat(), null, null);
  2608. }
  2609. else
  2610. {
  2611. latValue = getFormattedTextFromField(gsp.getLat(), field);
  2612. }
  2613. if (JavaScriptUtils.containsScript(gsp.getLon()))
  2614. {
  2615. lonValue = (String)getValueFromScript(gsp.getLon(), null, null);
  2616. }
  2617. else
  2618. {
  2619. lonValue = getFormattedTextFromField(gsp.getLon(), field);
  2620. }
  2621. if (latValue != null && lonValue != null)
  2622. {
  2623. dLat = Double.parseDouble(latValue);
  2624. dLon = Double.parseDouble(lonValue);
  2625. }
  2626. }
  2627. else
  2628. {
  2629. String city, region, country, countryCode = null;
  2630. // Create a GeoReferencePojo from the GeoSpec object
  2631. GeoFeaturePojo gfp = new GeoFeaturePojo();
  2632. if (gsp.getCity() != null)
  2633. {
  2634. if (JavaScriptUtils.containsScript(gsp.getCity()))
  2635. {
  2636. city = (String)getValueFromScript(gsp.getCity(), null, null);
  2637. }
  2638. else
  2639. {
  2640. city = getFormattedTextFromField(gsp.getCity(), null);
  2641. }
  2642. gfp.setCity(city);
  2643. gfp.setSearch_field(city);
  2644. }
  2645. if (gsp.getStateProvince() != null)
  2646. {
  2647. if (JavaScriptUtils.containsScript(gsp.getStateProvince()))
  2648. {
  2649. region = (String)getValueFromScript(gsp.getStateProvince(), null, null);
  2650. }
  2651. else
  2652. {
  2653. region = getFormattedTextFromField(gsp.getStateProvince(), null);
  2654. }
  2655. gfp.setRegion(region);
  2656. if (gfp.getSearch_field() == null) gfp.setSearch_field(region);
  2657. }
  2658. if (gsp.getCountry() != null)
  2659. {
  2660. if (JavaScriptUtils.containsScript(gsp.getCountry()))
  2661. {
  2662. country = (String)getValueFromScript(gsp.getCountry(), null, null);
  2663. }
  2664. else
  2665. {
  2666. country = getFormattedTextFromField(gsp.getCountry(), null);
  2667. }
  2668. gfp.setCountry(country);
  2669. if (gfp.getSearch_field() == null) gfp.setSearch_field(country);
  2670. }
  2671. if (gsp.getCountryCode() != null)
  2672. {
  2673. if (JavaScriptUtils.containsScript(gsp.getCountryCode()))
  2674. {
  2675. countryCode = (String)getValueFromScript(gsp.getCountryCode(), null, null);
  2676. }
  2677. else
  2678. {
  2679. countryCode = getFormattedTextFromField(gsp.getCountryCode(), null);
  2680. }
  2681. gfp.setCountry_code(countryCode);
  2682. // (Don't set to search field for country code - it will be equal to country...)
  2683. }
  2684. // Send the GeoReferencePojo to enrichGeoInfo to attempt to get lat and lon values
  2685. boolean bStrictMatch = (null == gsp.getStrictMatch()) || gsp.getStrictMatch();
  2686. List<GeoFeaturePojo> gList = GeoReference.enrichGeoInfo(gfp, bStrictMatch, true, 1);
  2687. GeoFeaturePojo firstGeo = gList.get(0);
  2688. latValue = firstGeo.getGeoindex().lat.toString();
  2689. lonValue = firstGeo.getGeoindex().lon.toString();
  2690. gsp.setOntology_type(firstGeo.getOntology_type());
  2691. // Set lat and long in DocGeo if possible
  2692. dLat = Double.parseDouble(latValue);
  2693. dLon = Double.parseDouble(lonValue);
  2694. }
  2695. }
  2696. if (dLat != 0 && dLon !=0)
  2697. {
  2698. g = new GeoPojo();
  2699. g.lat = dLat;
  2700. g.lon = dLon;
  2701. }
  2702. return g;
  2703. }
  2704. catch (Exception e) // If alternatives are specified we can try them instead
  2705. {
  2706. if (null != gsp.getAlternatives()) {
  2707. for (GeoSpecPojo altIn: gsp.getAlternatives()) {
  2708. GeoPojo altOut = getEntityGeo(altIn, f, field);
  2709. if (null != altOut) {
  2710. gsp.setOntology_type(altIn.getOntology_type());
  2711. return altOut;
  2712. }
  2713. }
  2714. }
  2715. return null;
  2716. }
  2717. }
  2718. /**
  2719. * executeEntityAssociationValidation
  2720. * @param s
  2721. * @param j
  2722. * @return
  2723. */
  2724. private Boolean executeEntityAssociationValidation(String s, String value, String index)
  2725. {
  2726. Boolean retVal = null;
  2727. try
  2728. {
  2729. // Run our script that checks whether or not the entity/association should be added
  2730. Object retValObj = getValueFromScript(s, value, index);
  2731. try {
  2732. retVal = (Boolean) retValObj;
  2733. }
  2734. catch (Exception e) {} // case exception handled below
  2735. if (null == retVal) { // If it's any string, then creation criteria == false and log message
  2736. _context.getHarvestStatus().logMessage((String)retValObj, true);
  2737. retVal = false;
  2738. }//TESTED
  2739. }
  2740. catch (Exception e)
  2741. {
  2742. _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
  2743. retVal = false;
  2744. }
  2745. return retVal;
  2746. }
  2747. /**
  2748. * getFormattedTextFromField
  2749. * Accepts a string value that can contain a combination of literal text and
  2750. * names of fields in the JSON document that need to be retrieved into the
  2751. * literal text, i.e.:
  2752. * 'On $metadata.reportdatetime MPD reported that a $metadata.offense occurred.'
  2753. * @param v - origString
  2754. * @return String
  2755. */
  2756. private String getFormattedTextFromField(String origString, String value)
  2757. {
  2758. // Don't bother running the rest of the code if there are no replacements to make (i.e. does not have $)
  2759. if (!origString.contains("$")) return origString;
  2760. StringBuffer sb = new StringBuffer();
  2761. Matcher m = SUBSTITUTION_PATTERN.matcher(origString);
  2762. int ncurrpos = 0;
  2763. // Iterate over each match found within the string and concatenate values together:
  2764. // string literal value + JSON field (matched pattern) retrieved
  2765. while (m.find())
  2766. {
  2767. int nnewpos = m.start();
  2768. sb.append(origString.substring(ncurrpos, nnewpos));
  2769. ncurrpos = m.end();
  2770. // Retrieve the field information matched with the RegEx
  2771. String match = (m.group(1) != null) ? m.group(1): m.group(2);
  2772. String sreplace;
  2773. if ((null != match) && match.equals("$")) { // $ escaping via ${$}
  2774. sreplace = "$";
  2775. }//TESTED
  2776. else {
  2777. // Retrieve the data from the JSON field and append
  2778. sreplace = getStringFromJsonField(match, value);
  2779. }
  2780. if (null == sreplace) {
  2781. return null;
  2782. }
  2783. sb.append( sreplace );
  2784. }
  2785. sb.append(origString.substring(ncurrpos));
  2786. return sb.toString();
  2787. }
  2788. /**
  2789. * getStringFromJsonField
  2790. * Takes string in the form of: node1.nodeN.fieldName and returns
  2791. * the value contained in the JSON for that field as an String
  2792. * Note: supports leading $s in the field name, $s get stripped
  2793. * out in getValueFromJsonField
  2794. * @param fieldLocation
  2795. * @return Object
  2796. */
  2797. private String getStringFromJsonField(String fieldLocation, String value)
  2798. {
  2799. try
  2800. {
  2801. if ((null != value) && fieldLocation.equalsIgnoreCase("value")) // ($value when iterating)
  2802. {
  2803. return value;
  2804. }//TESTED
  2805. if ((null == _iterator) && (null != _docPojo) && fieldLocation.equalsIgnoreCase("fullText")) { // another special case
  2806. return _docPojo.getFullText();
  2807. }//TESTED
  2808. return (String)getValueFromJsonField(fieldLocation);
  2809. }
  2810. catch (Exception e)
  2811. {
  2812. return null;
  2813. }
  2814. }
  2815. /**
  2816. * getValueFromJsonField
  2817. * Takes string in the form of: node1.node2.fieldName and returns
  2818. * the value contained in the JSON for that field as an Object
  2819. * Note: supports leading $s in the field name
  2820. * @param fieldLocation
  2821. * @return
  2822. */
  2823. private Object getValueFromJsonField(String fieldLocation)
  2824. {
  2825. try
  2826. {
  2827. // Strip out $ chars if present and then split on '.'
  2828. // to get the JSON node hierarchy and field name
  2829. String[] field = fieldLocation.replace("$", "").split("\\.");
  2830. StringBuffer node = new StringBuffer();
  2831. // JSON node = all strings in field[] (except for the last string in the array)
  2832. // concatenated together with the '.' char
  2833. if (field.length > 1)
  2834. {
  2835. for ( int i = 0; i < field.length - 1; i++ )
  2836. {
  2837. if (node.length() > 0) node.append(".");
  2838. node.append(field[i]);
  2839. }
  2840. }
  2841. // The field name is the final value in the array
  2842. String fieldName = field[field.length - 1];
  2843. return getValueFromJson(node.toString(), fieldName);
  2844. }
  2845. catch (Exception e)
  2846. {
  2847. // This can happen as part of normal logic flow
  2848. //logger.error("getValueFromJsonField Exception: " + e.getMessage());
  2849. return null;
  2850. }
  2851. }
  2852. /**
  2853. * getValueFromJson(String node, String field)
  2854. * Attempts to retrieve a value from the node/field and return
  2855. * and object containing the value to be converted by calling method
  2856. * @param node
  2857. * @param field
  2858. * @return Object o
  2859. */
  2860. private Object getValueFromJson(String node, String field)
  2861. {
  2862. JSONObject json = (_iterator != null) ? _iterator : _document;
  2863. Object o = null;
  2864. try
  2865. {
  2866. if (node.length() > 1)
  2867. {
  2868. // (removed the [] case, you'll need to do that with scripts unless you want [0] for every field)
  2869. // Mostly standard case $metadata(.object).+field
  2870. if (node.indexOf('.') > -1) {
  2871. String node_fields[] = node.split("\\.");
  2872. JSONObject jo = json;
  2873. for (String f: node_fields) {
  2874. Object testJo = jo.get(f);
  2875. if (testJo instanceof JSONArray) {
  2876. jo = ((JSONArray)testJo).getJSONObject(0);
  2877. }
  2878. else {
  2879. jo = (JSONObject)testJo;
  2880. }
  2881. }
  2882. Object testJo = jo.get(field);
  2883. if (testJo instanceof JSONArray) {
  2884. o = ((JSONArray)testJo).getString(0);
  2885. }
  2886. else {
  2887. o = testJo;
  2888. }
  2889. }
  2890. // Standard case - $metadata.field
  2891. else
  2892. {
  2893. JSONObject jo = json.getJSONObject(node);
  2894. Object testJo = jo.get(field);
  2895. if (testJo instanceof JSONArray)
  2896. {
  2897. o = ((JSONArray)testJo).getString(0);
  2898. }
  2899. else
  2900. {
  2901. o = testJo;
  2902. }
  2903. }
  2904. }
  2905. else
  2906. {
  2907. Object testJo = json.get(field);
  2908. if (testJo instanceof JSONArray)
  2909. {
  2910. o = ((JSONArray)testJo).getString(0);
  2911. }
  2912. else
  2913. {
  2914. o = testJo;
  2915. }
  2916. }
  2917. }
  2918. catch (Exception e)
  2919. {
  2920. // This can happen as part of normal logic flow
  2921. //logger.error("getValueFromJson Exception: " + e.getMessage());
  2922. return null;
  2923. }
  2924. return o;
  2925. }
  /////////////////////////////////////////////////////
  // Utility function to expand all "iterateOver"s of the format a.b.c

  /**
   * Rewrites the entity and association specs of {@code s} in place so that every
   * dotted "iterateOver" (a.b.c) becomes a chain of nested single-level specs
   * (a -> b -> c), with the original spec attached at the end of the chain and its own
   * iterateOver cleared. Specs sharing a level are grouped under the same chain spec:
   * the chain spec for each level is remembered in a map keyed by the path so far
   * ("a", "a.b", ...). A later non-dotted spec whose iterateOver matches a remembered
   * level is moved underneath that level's spec instead of staying at the top level.
   * <p>
   * Association specs are processed the same way, except that any association whose
   * iterateOver contains ',' or '/' is left untouched.
   * @param s the configuration whose entity/association lists are modified in place
   */
  private static void expandIterationLoops(StructuredAnalysisConfigPojo s) {

      // Entities first:

      // Path-so-far ("a", "a.b", ...) -> the spec that iterates over that level
      HashMap<String, EntitySpecPojo> nestedEntityMap = null;
      // Newly created chain heads; appended to the top-level list once iteration is done
      // (they cannot be added while the list is being iterated)
      ArrayList<EntitySpecPojo> newEntityEntries = null;
      if (null != s.getEntities()) {
          Iterator<EntitySpecPojo> entSpecIt = s.getEntities().iterator();
          while (entSpecIt.hasNext()) {
              EntitySpecPojo entS = entSpecIt.next();
              if ((null != entS.getIterateOver()) && (entS.getIterateOver().contains(".")))
              {
                  // (Associations additionally skip iterateOver values containing "," or "/";
                  //  noted here so it doesn't get forgotten in cut-and-pastes)

                  if (null == nestedEntityMap) { // (do need this map)
                      nestedEntityMap = new HashMap<String, EntitySpecPojo>();
                  }
                  if (null == newEntityEntries) {
                      newEntityEntries = new ArrayList<EntitySpecPojo>(10);
                  }
                  EntitySpecPojo prevLevelSpec = null;
                  String iterateOver = entS.getIterateOver() + "."; // (end with "." to make life easier)
                  entS.setIterateOver(null); // (this is now the end of the chain)
                  entSpecIt.remove(); // (so remove from the list)

                  // Once a level is missing from the map, every deeper level is new too
                  boolean bChainBroken = false;
                  // Walk the path one '.' at a time: nLastDot..nCurrDot brackets the current component
                  for (int nCurrDot = iterateOver.indexOf('.'), nLastDot = -1;
                          nCurrDot >= 0;
                          nLastDot = nCurrDot, nCurrDot = iterateOver.indexOf('.', nCurrDot + 1))
                  {
                      String currLevel = iterateOver.substring(0, nCurrDot); // (eg a, a.b, a.b.c)
                      String lastComp_currLevel = iterateOver.substring(nLastDot + 1, nCurrDot); // (eg a, b, c)

                      EntitySpecPojo currLevelSpec = null;
                      if (!bChainBroken) {
                          currLevelSpec = nestedEntityMap.get(currLevel);
                      }
                      if (null == currLevelSpec) {
                          bChainBroken = true; // (no point in doing any more lookups)
                          // Create the spec for this level and register it under its full path
                          currLevelSpec = new EntitySpecPojo();
                          nestedEntityMap.put(currLevel, currLevelSpec);
                          currLevelSpec.setIterateOver(lastComp_currLevel);
                          if (null != prevLevelSpec) { // add myself to the next level
                              if (null == prevLevelSpec.getEntities()) {
                                  prevLevelSpec.setEntities(new ArrayList<EntitySpecPojo>(5));
                              }
                              prevLevelSpec.getEntities().add(currLevelSpec);
                          }
                          else { // I am the first level, add myself to entity list
                              newEntityEntries.add(currLevelSpec); // (this is now the head of the chain)
                          }
                          prevLevelSpec = currLevelSpec;
                      }//TESTED
                      else { // We already have this level, so carry on:
                          prevLevelSpec = currLevelSpec; //(in case this was the last level...)
                          continue;
                      }//TESTED

                  } //(end loop over expansion levels)

                  // Add entS (ie the spec with the content) to the end of the chain
                  if (null != prevLevelSpec) { // (probably an internal logic error if not)
                      if (null == prevLevelSpec.getEntities()) {
                          prevLevelSpec.setEntities(new ArrayList<EntitySpecPojo>(5));
                      }
                      prevLevelSpec.getEntities().add(entS);
                  }//TESTED

              }//(end found entity with expandable iterateOver)
              else if (null != entS.getIterateOver()) { // Non-nested case, simpler

                  // (Associations additionally skip iterateOver values containing "," or "/";
                  //  noted here so it doesn't get forgotten in cut-and-pastes)

                  if (null == nestedEntityMap) { // (do need this map)
                      nestedEntityMap = new HashMap<String, EntitySpecPojo>();
                  }
                  //(and logic is different enough that it makes most sense to do separately rather than grovel to save a few lines)

                  EntitySpecPojo currSpec = nestedEntityMap.get(entS.getIterateOver());
                  if (null != currSpec) {
                      // A spec for this level already exists: move this one underneath it
                      entSpecIt.remove();
                      if (null == currSpec.getEntities()) {
                          currSpec.setEntities(new ArrayList<EntitySpecPojo>(5));
                      }
                      entS.setIterateOver(null);
                      currSpec.getEntities().add(entS);
                  }
                  else {
                      // First spec seen for this level: it stays in the list and becomes
                      // the spec that later specs for the same level are nested under
                      nestedEntityMap.put(entS.getIterateOver(), entS);
                  }
              }//TESTED

          }// (end loop over entities)

          if (null != newEntityEntries) {
              s.getEntities().addAll(newEntityEntries);
          }

      }//(end if entities)

      // Identical code for associations:
      // (the entity logic above with the types and a few variables renamed, plus the
      //  "," / "/" exclusion)

      // Path-so-far -> the association spec that iterates over that level
      HashMap<String, AssociationSpecPojo> nestedAssocMap = null;
      // Newly created chain heads; appended to the top-level list once iteration is done
      ArrayList<AssociationSpecPojo> newAssocEntries = null;
      if (null != s.getAssociations()) {
          Iterator<AssociationSpecPojo> assocSpecIt = s.getAssociations().iterator();
          while (assocSpecIt.hasNext()) {
              AssociationSpecPojo assocS = assocSpecIt.next();
              if ((null != assocS.getIterateOver()) && (assocS.getIterateOver().contains(".")))
              {
                  // For associations only: iterateOver values containing "," or "/" are not expanded
                  if (assocS.getIterateOver().contains(",") || assocS.getIterateOver().contains("/")) {
                      continue;
                  }//TESTED

                  if (null == nestedAssocMap) { // (do need this map)
                      nestedAssocMap = new HashMap<String, AssociationSpecPojo>();
                  }
                  if (null == newAssocEntries) {
                      newAssocEntries = new ArrayList<AssociationSpecPojo>(10);
                  }
                  AssociationSpecPojo prevLevelSpec = null;
                  String iterateOver = assocS.getIterateOver() + "."; // (end with "." to make life easier)
                  assocS.setIterateOver(null); // (this is now the end of the chain)
                  assocSpecIt.remove(); // (so remove from the list)

                  // Once a level is missing from the map, every deeper level is new too
                  boolean bChainBroken = false;
                  // Walk the path one '.' at a time: nLastDot..nCurrDot brackets the current component
                  for (int nCurrDot = iterateOver.indexOf('.'), nLastDot = -1;
                          nCurrDot >= 0;
                          nLastDot = nCurrDot, nCurrDot = iterateOver.indexOf('.', nCurrDot + 1))
                  {
                      String currLevel = iterateOver.substring(0, nCurrDot); // (eg a, a.b, a.b.c)
                      String lastComp_currLevel = iterateOver.substring(nLastDot + 1, nCurrDot); // (eg a, b, c)

                      AssociationSpecPojo currLevelSpec = null;
                      if (!bChainBroken) {
                          currLevelSpec = nestedAssocMap.get(currLevel);
                      }
                      if (null == currLevelSpec) {
                          bChainBroken = true; // (no point in doing any more lookups)
                          // Create the spec for this level and register it under its full path
                          currLevelSpec = new AssociationSpecPojo();
                          nestedAssocMap.put(currLevel, currLevelSpec);
                          currLevelSpec.setIterateOver(lastComp_currLevel);
                          if (null != prevLevelSpec) { // add myself to the next level
                              if (null == prevLevelSpec.getAssociations()) {
                                  prevLevelSpec.setAssociations(new ArrayList<AssociationSpecPojo>(5));
                              }
                              prevLevelSpec.getAssociations().add(currLevelSpec);
                          }
                          else { // I am the first level, add myself to association list
                              newAssocEntries.add(currLevelSpec); // (this is now the head of the chain)
                          }
                          prevLevelSpec = currLevelSpec;
                      }//TESTED
                      else { // We already have this level, so carry on:
                          prevLevelSpec = currLevelSpec; //(in case this was the last level...)
                          continue;
                      }//TESTED

                  } //(end loop over expansion levels)

                  // Add assocS (ie the spec with the content) to the end of the chain
                  if (null != prevLevelSpec) { // (probably an internal logic error if not)
                      if (null == prevLevelSpec.getAssociations()) {
                          prevLevelSpec.setAssociations(new ArrayList<AssociationSpecPojo>(5));
                      }
                      prevLevelSpec.getAssociations().add(assocS);
                  }//TESTED

              }//(end found association with expandable iterateOver)
              else if (null != assocS.getIterateOver()) { // Non-nested case, simpler

                  // For associations only: iterateOver values containing "," or "/" are not expanded
                  if (assocS.getIterateOver().contains(",") || assocS.getIterateOver().contains("/")) {
                      continue;
                  }//TESTED

                  if (null == nestedAssocMap) { // (do need this map)
                      nestedAssocMap = new HashMap<String, AssociationSpecPojo>();
                  }
                  //(and logic is different enough that it makes most sense to do separately rather than grovel to save a few lines)

                  AssociationSpecPojo currSpec = nestedAssocMap.get(assocS.getIterateOver());
                  if (null != currSpec) {
                      // A spec for this level already exists: move this one underneath it
                      assocSpecIt.remove();
                      if (null == currSpec.getAssociations()) {
                          currSpec.setAssociations(new ArrayList<AssociationSpecPojo>(5));
                      }
                      assocS.setIterateOver(null);
                      currSpec.getAssociations().add(assocS);
                  }
                  else {
                      // First spec seen for this level: it stays in the list and becomes
                      // the spec that later specs for the same level are nested under
                      nestedAssocMap.put(assocS.getIterateOver(), assocS);
                  }
              }//TESTED

          }// (end loop over associations)

          if (null != newAssocEntries) {
              s.getAssociations().addAll(newAssocEntries);
          }

      }//(end if associations)
  }
  3112. /////////////////////////////////////////////////////////////////////////////
  3113. // Share utility to repopulate the entity cache before ent/assoc processing
  3114. private void repopulateEntityCacheIfNeeded(DocumentPojo f)
  3115. {
  3116. if (null == _entityMap) {
  3117. _entityMap = new HashSet<String>();
  3118. _geoMap = new HashMap<String, GeoPojo>();
  3119. if (f.getEntities() != null)
  3120. {
  3121. for (EntityPojo ent: f.getEntities()) {
  3122. if (null != ent.getIndex()) {
  3123. _entityMap.add(ent.getIndex());
  3124. if (null != ent.getGeotag()) {
  3125. _geoMap.put(ent.getIndex(), ent.getGeotag());
  3126. }
  3127. }
  3128. }
  3129. }//TESTED (in INF_1360_test_source.json:test8, hand created f.entities containing "entity2/type2")
  3130. }
  3131. }
  3132. /////////////////////////////////////////////////////////////////////////////
  3133. //TEST CODE:
  3134. public static void main(String[] argv) {
  3135. // Test entity expansion:
  3136. StructuredAnalysisConfigPojo s = new StructuredAnalysisConfigPojo();
  3137. s.setEntities(new ArrayList<EntitySpecPojo>(20));
  3138. EntitySpecPojo e = null;
  3139. e = new EntitySpecPojo();
  3140. //a1
  3141. e.setIterateOver("a");
  3142. e.setDisambiguated_name("a.test1");
  3143. s.getEntities().add(e);
  3144. //a2
  3145. e = new EntitySpecPojo();
  3146. e.setIterateOver("a");
  3147. e.setDisambiguated_name("a.test2");
  3148. s.getEntities().add(e);
  3149. //x1
  3150. e = new EntitySpecPojo();
  3151. e.setIterateOver("x");
  3152. e.setDisambiguated_name("x.test1");
  3153. s.getEntities().add(e);
  3154. //a.b1
  3155. e = new EntitySpecPojo();
  3156. e.setIterateOver("a.b");
  3157. e.setDisambiguated_name("a.b.test1");
  3158. s.getEntities().add(e);
  3159. //a.b.c.d1
  3160. e = new EntitySpecPojo();
  3161. e.setIterateOver("a.b.c.d");
  3162. e.setDisambiguated_name("a.b.c.d.test1");
  3163. s.getEntities().add(e);
  3164. //a.b2
  3165. e = new EntitySpecPojo();
  3166. e.setIterateOver("a.b");
  3167. e.setDisambiguated_name("a.b.test2");
  3168. s.getEntities().add(e);
  3169. //p.q1
  3170. e = new EntitySpecPojo();
  3171. e.setIterateOver("p.q");
  3172. e.setDisambiguated_name("p.q.test1");
  3173. s.getEntities().add(e);
  3174. // null case
  3175. e = new EntitySpecPojo();
  3176. e.setDisambiguated_name("(null iterator)");
  3177. s.getEntities().add(e);
  3178. expandIterationLoops(s);
  3179. System.out.println("TEST1: ENTITY ITERATION EXPANSION: ");
  3180. System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(s));
  3181. s.setAssociations(new ArrayList<AssociationSpecPojo>(20));
  3182. AssociationSpecPojo assoc = null;
  3183. assoc = new AssociationSpecPojo();
  3184. //a1
  3185. assoc.setIterateOver("a");
  3186. assoc.setEntity1("a.test1");
  3187. s.getAssociations().add(assoc);
  3188. //a2
  3189. assoc = new AssociationSpecPojo();
  3190. assoc.setIterateOver("a");
  3191. assoc.setEntity1("a.test2");
  3192. s.getAssociations().add(assoc);
  3193. //x1
  3194. assoc = new AssociationSpecPojo();
  3195. assoc.setIterateOver("x");
  3196. assoc.setEntity1("x.test1");
  3197. s.getAssociations().add(assoc);
  3198. //a.b1
  3199. assoc = new AssociationSpecPojo();
  3200. assoc.setIterateOver("a.b");
  3201. assoc.setEntity1("a.b.test1");
  3202. s.getAssociations().add(assoc);
  3203. //a.b.c.d1
  3204. assoc =new AssociationSpecPojo();
  3205. assoc.setIterateOver("a.b.c.d");
  3206. assoc.setEntity1("a.b.c.d.test1");
  3207. s.getAssociations().add(assoc);
  3208. //a.b2
  3209. assoc =new AssociationSpecPojo();
  3210. assoc.setIterateOver("a.b");
  3211. assoc.setEntity1("a.b.test2");
  3212. s.getAssociations().add(assoc);
  3213. //p.q1
  3214. assoc =new AssociationSpecPojo();
  3215. assoc.setIterateOver("p.q");
  3216. assoc.setEntity1("p.q.test1");
  3217. s.getAssociations().add(assoc);
  3218. //"," case
  3219. assoc =new AssociationSpecPojo();
  3220. assoc.setIterateOver("p.q,RR");
  3221. assoc.setEntity1("ITERATE OVER p.q,RR");
  3222. s.getAssociations().add(assoc);
  3223. //"/" case
  3224. assoc =new AssociationSpecPojo();
  3225. assoc.setIterateOver("p.q/SS");
  3226. assoc.setEntity1("ITERATE OVER p.q/SS");
  3227. s.getAssociations().add(assoc);
  3228. // null case
  3229. assoc =new AssociationSpecPojo();
  3230. assoc.setEntity1("(null iterator)");
  3231. s.getAssociations().add(assoc);
  3232. //SHOULD HAVE TEST FOR ITERATE OVER p,q (now hand tested anyway)
  3233. expandIterationLoops(s);
  3234. System.out.println("TEST2: ASSOCIATION ITERATION EXPANSION: ");
  3235. System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(s));
  3236. }
  3237. }