PageRenderTime 58ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/enrichment/custom/StructuredAnalysisHarvester.java

https://github.com/IKANOW/Infinit.e
Java | 3617 lines | 2674 code | 380 blank | 563 comment | 935 complexity | bd317aa493c2dd1ee19c53fd373274ea MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.harvest.enrichment.custom;
  17. import java.util.ArrayList;
  18. import java.util.Arrays;
  19. import java.util.Date;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.Hashtable;
  23. import java.util.Iterator;
  24. import java.util.List;
  25. import java.util.Map;
  26. import java.util.Map.Entry;
  27. import java.util.Set;
  28. import java.util.regex.Matcher;
  29. import java.util.regex.Pattern;
  30. import javax.script.*;
  31. import org.apache.log4j.Logger;
  32. import org.bson.types.ObjectId;
  33. import org.json.JSONArray;
  34. import org.json.JSONException;
  35. import org.json.JSONObject;
  36. import com.google.gson.Gson;
  37. import com.google.gson.GsonBuilder;
  38. import com.ikanow.infinit.e.data_model.store.DbManager;
  39. import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
  40. import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.DocumentSpecPojo;
  41. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  42. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo;
  43. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo.GeoSpecPojo;
  44. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo.EntitySpecPojo;
  45. import com.ikanow.infinit.e.data_model.store.config.source.StructuredAnalysisConfigPojo.AssociationSpecPojo;
  46. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  47. import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
  48. import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
  49. import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
  50. import com.ikanow.infinit.e.data_model.store.feature.geo.GeoFeaturePojo;
  51. import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
  52. import com.ikanow.infinit.e.harvest.HarvestContext;
  53. import com.ikanow.infinit.e.harvest.HarvestController;
  54. import com.ikanow.infinit.e.harvest.utils.DateUtility;
  55. import com.ikanow.infinit.e.harvest.utils.AssociationUtils;
  56. import com.ikanow.infinit.e.data_model.utils.DimensionUtility;
  57. import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
  58. import com.mongodb.BasicDBList;
  59. import com.mongodb.BasicDBObject;
  60. /**
  61. * StructuredAnalysisHarvester
  62. * @author cvitter
  63. */
  64. public class StructuredAnalysisHarvester
  65. {
  66. ///////////////////////////////////////////////////////////////////////////////////////////
  67. // NEW PROCESSING PIPELINE INTERFACE
  68. public void setContext(HarvestContext context) {
  69. _context = context;
  70. // Setup some globals if necessary
  71. if (null == _gson) {
  72. GsonBuilder gb = new GsonBuilder();
  73. _gson = gb.create();
  74. }
  75. }
  76. public void resetForNewDoc() {
  77. resetEntityCache();
  78. resetDocumentCache();
  79. }
  80. public void resetEntityCache() {
  81. // Clear geoMap before we start extracting entities and associations for each feed
  82. if (null != _entityMap) {
  83. if (!_geoMap.isEmpty()) _geoMap.clear();
  84. if (!_entityMap.isEmpty()) _entityMap.clear();
  85. // Fill in geoMap and entityMap with any existing docs/entities
  86. _entityMap = null;
  87. _geoMap = null;
  88. }
  89. }//TESTED (entity_cache_reset_test)
  90. public void resetDocumentCache() {
  91. this._document = null;
  92. this._docPojo = null;
  93. }
  94. // Load global functions
  95. // (scriptLang currently ignored)
  96. public void loadGlobalFunctions(List<String> imports, List<String> scripts, String scriptLang)
  97. {
  98. intializeScriptEngine();
  99. // Pass scripts into the engine
  100. try {
  101. // Retrieve and eval script files in s.scriptFiles
  102. if (imports != null) {
  103. for (String file : imports) {
  104. if (null != file) {
  105. _securityManager.eval(_scriptEngine, JavaScriptUtils.getJavaScriptFile(file));
  106. }
  107. }
  108. }//(end load imports)
  109. // Eval script passed in s.script
  110. if (null != scripts) {
  111. for (String script: scripts) {
  112. if (null != script) {
  113. _securityManager.eval(_scriptEngine, script);
  114. }
  115. }
  116. }//(end load scripts)
  117. }
  118. catch (ScriptException e) {
  119. this._context.getHarvestStatus().logMessage("ScriptException: " + e.getMessage(), true);
  120. logger.error("ScriptException: " + e.getMessage(), e);
  121. }
  122. }//TESTED (uah:import_and_lookup_test_uahSah.json)
  123. // Set the document level fields
  124. public void setDocumentMetadata(DocumentPojo doc, DocumentSpecPojo docMetadataConfig) throws JSONException, ScriptException {
  125. Gson g = _gson;
  126. intializeDocIfNeeded(doc, g);
  127. //TODO (INF-1938): allow setting of tags (here and in legacy code)
  128. // We'll just basically duplicate the code from executeHarvest() since it's pretty simple
  129. // and it isn't very easy to pull out the logic in there (which is unnecessarily complicated for
  130. // the pipeline version since you don't need to work out whether to generate the fields before or
  131. // after the other stages, you get to explicity specify)
  132. // Extract Title if applicable
  133. try {
  134. if (docMetadataConfig.title != null) {
  135. if (JavaScriptUtils.containsScript(docMetadataConfig.title)) {
  136. doc.setTitle((String)getValueFromScript(docMetadataConfig.title, null, null));
  137. }
  138. else {
  139. doc.setTitle(getFormattedTextFromField(docMetadataConfig.title, null));
  140. }
  141. }
  142. }
  143. catch (Exception e) {
  144. this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
  145. //DEBUG (don't output log messages per doc)
  146. //logger.error("title: " + e.getMessage(), e);
  147. }
  148. //TESTED (fulltext_docMetaTest)
  149. // Extract display URL if applicable
  150. try {
  151. if (docMetadataConfig.displayUrl != null) {
  152. if (JavaScriptUtils.containsScript(docMetadataConfig.displayUrl)) {
  153. doc.setDisplayUrl((String)getValueFromScript(docMetadataConfig.displayUrl, null, null));
  154. }
  155. else {
  156. doc.setDisplayUrl(getFormattedTextFromField(docMetadataConfig.displayUrl, null));
  157. }
  158. }
  159. }
  160. catch (Exception e) {
  161. this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
  162. //DEBUG (don't output log messages per doc)
  163. //logger.error("displayUrl: " + e.getMessage(), e);
  164. }
  165. //TESTED (fulltext_docMetaTest)
  166. // Extract Description if applicable
  167. try {
  168. if (docMetadataConfig.description != null) {
  169. if (JavaScriptUtils.containsScript(docMetadataConfig.description)) {
  170. doc.setDescription((String)getValueFromScript(docMetadataConfig.description, null, null));
  171. }
  172. else {
  173. doc.setDescription(getFormattedTextFromField(docMetadataConfig.description, null));
  174. }
  175. }
  176. }
  177. catch (Exception e) {
  178. this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);
  179. //DEBUG (don't output log messages per doc)
  180. //logger.error("description: " + e.getMessage(), e);
  181. }
  182. //TESTED (fulltext_docMetaTest)
  183. // Extract fullText if applicable
  184. try {
  185. if (docMetadataConfig.fullText != null) {
  186. if (JavaScriptUtils.containsScript(docMetadataConfig.fullText)) {
  187. doc.setFullText((String)getValueFromScript(docMetadataConfig.fullText, null, null));
  188. }
  189. else {
  190. doc.setFullText(getFormattedTextFromField(docMetadataConfig.fullText, null));
  191. }
  192. }
  193. }
  194. catch (Exception e) {
  195. this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);
  196. //DEBUG (don't output log messages per doc)
  197. //logger.error("fullText: " + e.getMessage(), e);
  198. }
  199. //TESTED (fulltext_docMetaTest)
  200. // Extract Published Date if applicable
  201. try {
  202. if (docMetadataConfig.publishedDate != null) {
  203. if (JavaScriptUtils.containsScript(docMetadataConfig.publishedDate)) {
  204. doc.setPublishedDate(new Date(
  205. DateUtility.parseDate((String)getValueFromScript(docMetadataConfig.publishedDate, null, null))));
  206. }
  207. else {
  208. doc.setPublishedDate(new Date(
  209. DateUtility.parseDate((String)getFormattedTextFromField(docMetadataConfig.publishedDate, null))));
  210. }
  211. }
  212. }
  213. catch (Exception e) {
  214. this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
  215. //DEBUG (don't output log messages per doc)
  216. //logger.error("publishedDate: " + e.getMessage(), e);
  217. }
  218. //TESTED (fulltext_docMetaTest)
  219. // Extract Document GEO if applicable
  220. try {
  221. if (docMetadataConfig.geotag != null) {
  222. doc.setDocGeo(getDocGeo(docMetadataConfig.geotag));
  223. }
  224. }
  225. catch (Exception e) {
  226. this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);
  227. //DEBUG (don't output log messages per doc)
  228. //logger.error("docGeo: " + e.getMessage(), e);
  229. }
  230. //TESTED (fulltext_docMetaTest)
  231. }
  232. //TESTED (fulltext_docMetaTest)
  233. // Set the entities
  234. StructuredAnalysisConfigPojo _pipelineTmpConfig = null;
  235. public void setEntities(DocumentPojo doc, List<EntitySpecPojo> entSpecs) throws JSONException, ScriptException {
  236. intializeDocIfNeeded(doc, _gson);
  237. if (null == _pipelineTmpConfig) {
  238. _pipelineTmpConfig = new StructuredAnalysisConfigPojo();
  239. }
  240. _pipelineTmpConfig.setEntities(entSpecs);
  241. expandIterationLoops(_pipelineTmpConfig);
  242. List<EntityPojo> ents = getEntities(_pipelineTmpConfig.getEntities(), doc);
  243. if (null == doc.getEntities()) { // (else has already been added by getEntities)
  244. doc.setEntities(ents);
  245. }
  246. }
  247. //TESTED (both first time through, and when adding to existing entities)
  248. // Set the associations
  249. public void setAssociations(DocumentPojo doc, List<AssociationSpecPojo> assocSpecs) throws JSONException, ScriptException {
  250. //TODO (INF-1922): Allow setting of directed sentiment (here and in legacy code)
  251. intializeDocIfNeeded(doc, _gson);
  252. if (null == _pipelineTmpConfig) {
  253. _pipelineTmpConfig = new StructuredAnalysisConfigPojo();
  254. }
  255. _pipelineTmpConfig.setAssociations(assocSpecs);
  256. expandIterationLoops(_pipelineTmpConfig);
  257. List<AssociationPojo> assocs = getAssociations(_pipelineTmpConfig.getAssociations(), doc);
  258. if (null == doc.getAssociations()) { // (else has already been added by getAssociations)
  259. doc.setAssociations(assocs);
  260. }
  261. }
  262. //TESTED (both first time through, and when adding to existing associations)
  263. ///////////////////////////////////////////////////////////////////////////////////////////
  264. // (Utility function for optimization)
  265. private void intializeDocIfNeeded(DocumentPojo f, Gson g) throws JSONException, ScriptException {
  266. if (null == _document) {
  267. // (don't need assocs or ents)
  268. List<EntityPojo> ents = f.getEntities();
  269. List<AssociationPojo> assocs = f.getAssociations();
  270. f.setEntities(null);
  271. f.setAssociations(null);
  272. try {
  273. // Convert the DocumentPojo Object to a JSON document using GsonBuilder
  274. String docStr = g.toJson(f);
  275. _document = new JSONObject(docStr);
  276. _docPojo = f;
  277. // Add the document (JSONObject) to the engine
  278. if (null != _scriptEngine) {
  279. _scriptEngine.put("document", docStr);
  280. _securityManager.eval(_scriptEngine, JavaScriptUtils.initScript);
  281. }
  282. }
  283. finally {
  284. f.setEntities(ents);
  285. f.setAssociations(assocs);
  286. }
  287. }
  288. }//TESTED
  289. ///////////////////////////////////////////////////////////////////////////////////////////
  290. // Loads the caches into script
  291. public void loadLookupCaches(Map<String, ObjectId> caches, Set<ObjectId> communityIds) {
  292. //grab any json cache and make it available to the engine
  293. try
  294. {
  295. if (null != caches) {
  296. CacheUtils.addJSONCachesToEngine(caches, _scriptEngine, _securityManager, communityIds, _context);
  297. }
  298. }
  299. catch (Exception ex)
  300. {
  301. _context.getHarvestStatus().logMessage("JSONcache: " + ex.getMessage(), true);
  302. //(no need to log this, appears in log under source -with URL- anyway):
  303. //logger.error("JSONcache: " + ex.getMessage(), ex);
  304. }
  305. }//TESTED (import_and_lookup_test_uahSah.json)
  306. ///////////////////////////////////////////////////////////////////////////////////////////
  307. // Tidy up metadadata after processing
  308. public void removeUnwantedMetadataFields(String metaFields, DocumentPojo f)
  309. {
  310. if (null != f.getMetadata()) {
  311. if (null != metaFields) {
  312. boolean bInclude = true;
  313. if (metaFields.startsWith("+")) {
  314. metaFields = metaFields.substring(1);
  315. }
  316. else if (metaFields.startsWith("-")) {
  317. metaFields = metaFields.substring(1);
  318. bInclude = false;
  319. }
  320. String[] metaFieldArray = metaFields.split("\\s*,\\s*");
  321. if (bInclude) {
  322. Set<String> metaFieldSet = new HashSet<String>();
  323. metaFieldSet.addAll(Arrays.asList(metaFieldArray));
  324. Iterator<Entry<String, Object[]>> metaField = f.getMetadata().entrySet().iterator();
  325. while (metaField.hasNext()) {
  326. Entry<String, Object[]> metaFieldIt = metaField.next();
  327. if (!metaFieldSet.contains(metaFieldIt.getKey())) {
  328. metaField.remove();
  329. }
  330. }
  331. }
  332. else { // exclude case, easier
  333. for (String metaField: metaFieldArray) {
  334. if (!metaField.contains(".")) {
  335. f.getMetadata().remove(metaField);
  336. }
  337. else { // more complex case, nested delete
  338. MongoDbUtil.recursiveNestedMapDelete(metaField.split("\\s*\\.\\s*"), 0, f.getMetadata());
  339. }
  340. }//(end loop over metaFields)
  341. }//(end if exclude case)
  342. //TESTED: include (default + explicit) and exclude cases
  343. }
  344. }//(if metadata exists)
  345. }//TESTED (legacy code)
  346. public boolean rejectDoc(String rejectDocCriteria, DocumentPojo f) throws JSONException, ScriptException
  347. {
  348. return rejectDoc(rejectDocCriteria, f, true);
  349. }
  350. public boolean rejectDoc(String rejectDocCriteria, DocumentPojo f, boolean logMessage) throws JSONException, ScriptException
  351. {
  352. if (null != rejectDocCriteria) {
  353. intializeDocIfNeeded(f, _gson);
  354. Object o = getValueFromScript(rejectDocCriteria, null, null, false);
  355. if (null != o) {
  356. if (o instanceof String) {
  357. String rejectDoc = (String)o;
  358. if (null != rejectDoc) {
  359. if (logMessage) {
  360. this._context.getHarvestStatus().logMessage("SAH_reject: " + rejectDoc, true);
  361. }
  362. return true;
  363. }
  364. }
  365. else if (o instanceof Boolean) {
  366. Boolean rejectDoc = (Boolean)o;
  367. if (rejectDoc) {
  368. if (logMessage) {
  369. this._context.getHarvestStatus().logMessage("SAH_reject: reason not specified", true);
  370. }
  371. return true;
  372. }
  373. }
  374. else {
  375. if (logMessage) {
  376. this._context.getHarvestStatus().logMessage("SAH_reject: reason not specified", true);
  377. }
  378. return true;
  379. }
  380. }
  381. }
  382. return false;
  383. }//TESTED (storageSettings_test + legacy code)
  384. public void handleDocumentUpdates(String onUpdateScript, DocumentPojo f) throws JSONException, ScriptException
  385. {
  386. // Compare the new and old docs in the case when this doc is an update
  387. if ((null != onUpdateScript) && (null != f.getUpdateId())) {
  388. // (note we must be in integrated mode - not called from source/test - if f.getId() != null)
  389. intializeDocIfNeeded(f, _gson);
  390. BasicDBObject query1 = new BasicDBObject(DocumentPojo._id_, f.getUpdateId());
  391. BasicDBObject query2 = new BasicDBObject(DocumentPojo.updateId_, f.getUpdateId());
  392. BasicDBObject query = new BasicDBObject(DbManager.or_, Arrays.asList(query1, query2));
  393. BasicDBObject docObj = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query);
  394. if (null != docObj) {
  395. if (null == PARSING_SCRIPT) { // First time through, initialize parsing script
  396. // (to convert native JS return vals into something we can write into our metadata)
  397. PARSING_SCRIPT = JavaScriptUtils.generateParsingScript();
  398. }
  399. if (!_isParsingScriptInitialized) {
  400. _securityManager.eval(_scriptEngine, PARSING_SCRIPT);
  401. _isParsingScriptInitialized = true;
  402. }
  403. DocumentPojo doc = DocumentPojo.fromDb(docObj, DocumentPojo.class);
  404. _scriptEngine.put("old_document", _gson.toJson(doc));
  405. try {
  406. _securityManager.eval(_scriptEngine,JavaScriptUtils.initOnUpdateScript);
  407. Object returnVal = _securityManager.eval(_scriptEngine, onUpdateScript);
  408. BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, _scriptEngine);
  409. f.addToMetadata("_PERSISTENT_", outList.toArray());
  410. }
  411. catch (Exception e) {
  412. // Extra step here...
  413. if (null != doc.getMetadata()) { // Copy persistent metadata across...
  414. Object[] persist = doc.getMetadata().get("_PERSISTENT_");
  415. if (null != persist) {
  416. f.addToMetadata("_PERSISTENT_", persist);
  417. }
  418. this._context.getHarvestStatus().logMessage("SAH::onUpdateScript: " + e.getMessage(), true);
  419. //DEBUG (don't output log messages per doc)
  420. //logger.error("SAH::onUpdateScript: " + e.getMessage(), e);
  421. }
  422. //(TESTED)
  423. }
  424. //TODO (INF-1507): need to write more efficient code to deserialize metadata?
  425. }
  426. _document = null;
  427. _docPojo = null;
  428. intializeDocIfNeeded(f, _gson);
  429. }//TESTED (end if callback-on-update)
  430. }//TESTED (legacy code)
  431. ///////////////////////////////////////////////////////////////////////////////////////////
  432. // PROCESSING PIPELINE - UTILITIES
  433. // Intialize script engine - currently only Java script is supported
  434. public void intializeScriptEngine()
  435. {
  436. if (null == _scriptEngine) {
  437. //set up the security manager
  438. _securityManager = new JavascriptSecurityManager();
  439. _scriptFactory = new ScriptEngineManager();
  440. _scriptEngine = _scriptFactory.getEngineByName("JavaScript");
  441. if (null != _unstructuredHandler) { // Also initialize the scripting engine for the UAH
  442. _unstructuredHandler.set_sahEngine(_scriptEngine);
  443. _unstructuredHandler.set_sahSecurity(_securityManager);
  444. }
  445. // Make the engine invocable so that we can call functions in the script
  446. // using the inv.invokeFunction(function) method
  447. _scriptInvoker = (Invocable) _scriptEngine;
  448. }//(once only)
  449. }//TESTED
  450. ///////////////////////////////////////////////////////////////////////////////////////////
  451. ///////////////////////////////////////////////////////////////////////////////////////////
  452. ///////////////////////////////////////////////////////////////////////////////////////////
  453. // LEGACY CODE - USE TO SUPPORT OLD CODE FOR NOW + AS UTILITY CODE FOR THE PIPELINE LOGIC
  454. // Private class variables
  455. private static Logger logger;
  456. private JSONObject _document = null; //TODO (INF-2488): change all the JSONObject logic to LinkedHashMap and (generic) Array so can just replace this with a string...
  457. private DocumentPojo _docPojo = null;
  458. private Gson _gson = null;
  459. private JSONObject _iterator = null;
  460. private String _iteratorIndex = null;
  461. private static Pattern SUBSTITUTION_PATTERN = Pattern.compile("\\$([a-zA-Z._0-9]+)|\\$\\{([^}]+)\\}");
  462. private HashMap<String, GeoPojo> _geoMap = null;
  463. private HashSet<String> _entityMap = null;
  464. private HarvestContext _context;
  465. /**
  466. * Default Constructor
  467. */
  468. public StructuredAnalysisHarvester()
  469. {
  470. logger = Logger.getLogger(StructuredAnalysisHarvester.class);
  471. }
  472. // Allows the unstructured handler to take advantage of text created by this
  473. public void addUnstructuredHandler(UnstructuredAnalysisHarvester uap) {
  474. _unstructuredHandler = uap;
  475. }
  476. private UnstructuredAnalysisHarvester _unstructuredHandler = null;
  477. //
  478. private ScriptEngineManager _scriptFactory = null;
  479. private ScriptEngine _scriptEngine = null;
  480. private JavascriptSecurityManager _securityManager = null;
  481. private Invocable _scriptInvoker = null;
  482. private static String PARSING_SCRIPT = null;
  483. private boolean _isParsingScriptInitialized = false; // (needs to be done once per source)
  484. /**
  485. * executeHarvest(SourcePojo source, List<DocumentPojo> feeds) extracts document GEO, Entities,
  486. * and Associations based on the DocGeoSpec, EntitySpec, and AssociationSpec information contained
  487. * within the source document's StructuredAnalysis sections
  488. * @param source
  489. * @param docs
  490. * @return List<DocumentPojo>
  491. * @throws ScriptException
  492. */
  493. public List<DocumentPojo> executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> docs)
  494. {
  495. _context = contextController;
  496. if (null == _gson) {
  497. GsonBuilder gb = new GsonBuilder();
  498. _gson = gb.create();
  499. }
  500. Gson g = _gson;
  501. // Skip if the StructuredAnalysis object of the source is null
  502. if (source.getStructuredAnalysisConfig() != null)
  503. {
  504. StructuredAnalysisConfigPojo s = source.getStructuredAnalysisConfig();
  505. // (some pre-processing to expand the specs)
  506. expandIterationLoops(s);
  507. // Instantiate a new ScriptEngineManager and create an engine to execute
  508. // the type of script specified in StructuredAnalysisPojo.scriptEngine
  509. this.intializeScriptEngine();
  510. this.loadLookupCaches(s.getCaches(), source.getCommunityIds());
  511. // Iterate over each doc in docs, create entity and association pojo objects
  512. // to add to the feed using the source entity and association spec pojos
  513. Iterator<DocumentPojo> it = docs.iterator();
  514. int nDocs = 0;
  515. while (it.hasNext())
  516. {
  517. DocumentPojo f = it.next();
  518. nDocs++;
  519. try
  520. {
  521. resetEntityCache();
  522. _document = null;
  523. _docPojo = null;
  524. // (don't create this until needed, since it might need to be (re)serialized after a call
  525. // to the UAH which would obviously be undesirable)
  526. // If the script engine has been instantiated pass the feed document and any scripts
  527. if (_scriptEngine != null)
  528. {
  529. List<String> scriptList = null;
  530. List<String> scriptFileList = null;
  531. try {
  532. // Script code embedded in source
  533. scriptList = Arrays.asList(s.getScript());
  534. }
  535. catch (Exception e) {}
  536. try {
  537. // scriptFiles - can contain String[] of script files to import into the engine
  538. scriptFileList = Arrays.asList(s.getScriptFiles());
  539. }
  540. catch (Exception e) {}
  541. this.loadGlobalFunctions(scriptFileList, scriptList, s.getScriptEngine());
  542. }//TESTED
  543. // 1. Document level fields
  544. // Extract Title if applicable
  545. boolean bTryTitleLater = false;
  546. try {
  547. if (s.getTitle() != null)
  548. {
  549. intializeDocIfNeeded(f, g);
  550. if (JavaScriptUtils.containsScript(s.getTitle()))
  551. {
  552. f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
  553. }
  554. else
  555. {
  556. f.setTitle(getFormattedTextFromField(s.getTitle(), null));
  557. }
  558. if (null == f.getTitle()) {
  559. bTryTitleLater = true;
  560. }
  561. }
  562. }
  563. catch (Exception e)
  564. {
  565. this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
  566. //DEBUG (don't output log messages per doc)
  567. //logger.error("title: " + e.getMessage(), e);
  568. }
  569. // Extract Display URL if applicable
  570. boolean bTryDisplayUrlLater = false;
  571. try {
  572. if (s.getDisplayUrl() != null)
  573. {
  574. intializeDocIfNeeded(f, g);
  575. if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
  576. {
  577. f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
  578. }
  579. else
  580. {
  581. f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
  582. }
  583. if (null == f.getDisplayUrl()) {
  584. bTryDisplayUrlLater = true;
  585. }
  586. }
  587. }
  588. catch (Exception e)
  589. {
  590. this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
  591. //DEBUG (don't output log messages per doc)
  592. //logger.error("displayUrl: " + e.getMessage(), e);
  593. }
  594. //TOTEST
  595. // Extract Description if applicable
  596. boolean bTryDescriptionLater = false;
  597. try {
  598. if (s.getDescription() != null)
  599. {
  600. intializeDocIfNeeded(f, g);
  601. if (JavaScriptUtils.containsScript(s.getDescription()))
  602. {
  603. f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
  604. }
  605. else
  606. {
  607. f.setDescription(getFormattedTextFromField(s.getDescription(), null));
  608. }
  609. if (null == f.getDescription()) {
  610. bTryDescriptionLater = true;
  611. }
  612. }
  613. }
  614. catch (Exception e)
  615. {
  616. this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);
  617. //DEBUG (don't output log messages per doc)
  618. //logger.error("description: " + e.getMessage(), e);
  619. }
  620. // Extract fullText if applicable
  621. boolean bTryFullTextLater = false;
  622. try {
  623. if (s.getFullText() != null)
  624. {
  625. intializeDocIfNeeded(f, g);
  626. if (JavaScriptUtils.containsScript(s.getFullText()))
  627. {
  628. f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
  629. }
  630. else
  631. {
  632. f.setFullText(getFormattedTextFromField(s.getFullText(), null));
  633. }
  634. if (null == f.getFullText()) {
  635. bTryFullTextLater = true;
  636. }
  637. }
  638. }
  639. catch (Exception e)
  640. {
  641. this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);
  642. //DEBUG (don't output log messages per doc)
  643. //logger.error("fullText: " + e.getMessage(), e);
  644. }
  645. // Published date is done after the UAH
  646. // (since the UAH can't access it, and it might be populated via the UAH)
  647. // 2. UAH/extraction properties
  648. // Add fields to metadata that can be used to create entities and associations
  649. // (Either with the UAH, or with the entity extractor)
  650. try {
  651. boolean bMetadataChanged = false;
  652. if (null != this._unstructuredHandler)
  653. {
  654. try
  655. {
  656. this._unstructuredHandler.set_sahEngine(_scriptEngine);
  657. this._unstructuredHandler.set_sahSecurity(_securityManager);
  658. bMetadataChanged = this._unstructuredHandler.executeHarvest(_context, source, f, (1 == nDocs), it.hasNext());
  659. }
  660. catch (Exception e) {
  661. contextController.handleExtractError(e, source); //handle extractor error if need be
  662. it.remove(); // remove the document from the list...
  663. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  664. // (Note: this can't be source level error, so carry on harvesting - unlike below)
  665. continue;
  666. }
  667. }
  668. if (contextController.isEntityExtractionRequired(source))
  669. {
  670. bMetadataChanged = true;
  671. // Text/Entity Extraction
  672. List<DocumentPojo> toAdd = new ArrayList<DocumentPojo>(1);
  673. toAdd.add(f);
  674. try {
  675. contextController.extractTextAndEntities(toAdd, source, false, false);
  676. if (toAdd.isEmpty()) { // this failed...
  677. it.remove(); // remove the document from the list...
  678. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  679. continue;
  680. }//TESTED
  681. }
  682. catch (Exception e) {
  683. contextController.handleExtractError(e, source); //handle extractor error if need be
  684. it.remove(); // remove the document from the list...
  685. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  686. if (source.isHarvestBadSource())
  687. {
  688. // Source error, ignore all other documents
  689. while (it.hasNext()) {
  690. f = it.next();
  691. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  692. it.remove();
  693. }
  694. break;
  695. }
  696. else {
  697. continue;
  698. }
  699. //TESTED
  700. }
  701. }
  702. if (bMetadataChanged) {
  703. // Ugly, but need to re-create doc json because metadata has changed
  704. String sTmpFullText = f.getFullText();
  705. f.setFullText(null); // (no need to serialize this, can save some cycles)
  706. _document = null;
  707. _docPojo = null;
  708. intializeDocIfNeeded(f, g);
  709. f.setFullText(sTmpFullText); //(restore)
  710. }
  711. // Can copy metadata from old documents to new ones:
  712. handleDocumentUpdates(s.getOnUpdateScript(), f);
  713. // Check (based on the metadata and entities so far) whether to retain the doc
  714. if (rejectDoc(s.getRejectDocCriteria(), f)) {
  715. it.remove(); // remove the document from the list...
  716. f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  717. continue;
  718. }
  719. }
  720. catch (Exception e) {
  721. this._context.getHarvestStatus().logMessage("SAH->UAH: " + e.getMessage(), true);
  722. //DEBUG (don't output log messages per doc)
  723. //logger.error("SAH->UAH: " + e.getMessage(), e);
  724. }
  725. // Now create document since there's no risk of having to re-serialize
  726. intializeDocIfNeeded(f, g);
  727. // 3. final doc-level metadata fields:
  728. // If description was null before might need to get it from a UAH field
  729. if (bTryTitleLater) {
  730. try {
  731. if (s.getTitle() != null)
  732. {
  733. intializeDocIfNeeded(f, g);
  734. if (JavaScriptUtils.containsScript(s.getTitle()))
  735. {
  736. f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
  737. }
  738. else
  739. {
  740. f.setTitle(getFormattedTextFromField(s.getTitle(), null));
  741. }
  742. }
  743. }
  744. catch (Exception e)
  745. {
  746. this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
  747. //DEBUG (don't output log messages per doc)
  748. //logger.error("title: " + e.getMessage(), e);
  749. }
  750. }
  751. // Extract Display URL if needed
  752. if (bTryDisplayUrlLater) {
  753. try {
  754. if (s.getDisplayUrl() != null)
  755. {
  756. intializeDocIfNeeded(f, g);
  757. if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
  758. {
  759. f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
  760. }
  761. else
  762. {
  763. f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
  764. }
  765. }
  766. }
  767. catch (Exception e)
  768. {
  769. this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
  770. //DEBUG (don't output log messages per doc)
  771. //logger.error("displayUrl: " + e.getMessage(), e);
  772. }
  773. }
  774. //TOTEST
  775. // If description was null before might need to get it from a UAH field
  776. if (bTryDescriptionLater) {
  777. try {
  778. if (s.getDescription() != null)
  779. {
  780. intializeDocIfNeeded(f, g);
  781. if (JavaScriptUtils.containsScript(s.getDescription()))
  782. {
  783. f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
  784. }
  785. else
  786. {
  787. f.setDescription(getFormattedTextFromField(s.getDescription(), null));
  788. }
  789. }
  790. }
  791. catch (Exception e)
  792. {
  793. this._context.getHarvestStatus().logMessage("description2: " + e.getMessage(), true);
  794. //DEBUG (don't output log messages per doc)
  795. //logger.error("description2: " + e.getMessage(), e);
  796. }
  797. }
  798. // If fullText was null before might need to get it from a UAH field
  799. if (bTryFullTextLater) {
  800. try {
  801. if (s.getFullText() != null)
  802. {
  803. intializeDocIfNeeded(f, g);
  804. if (JavaScriptUtils.containsScript(s.getFullText()))
  805. {
  806. f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
  807. }
  808. else
  809. {
  810. f.setFullText(getFormattedTextFromField(s.getFullText(), null));
  811. }
  812. }
  813. }
  814. catch (Exception e)
  815. {
  816. this._context.getHarvestStatus().logMessage("fullText2: " + e.getMessage(), true);
  817. //DEBUG (don't output log messages per doc)
  818. //logger.error("fullText2: " + e.getMessage(), e);
  819. }
  820. }
  821. // Extract Published Date if applicable
  822. if (s.getPublishedDate() != null)
  823. {
  824. if (JavaScriptUtils.containsScript(s.getPublishedDate()))
  825. {
  826. try
  827. {
  828. f.setPublishedDate(new Date(
  829. DateUtility.parseDate((String)getValueFromScript(s.getPublishedDate(), null, null))));
  830. }
  831. catch (Exception e)
  832. {
  833. this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
  834. }
  835. }
  836. else
  837. {
  838. try
  839. {
  840. f.setPublishedDate(new Date(
  841. DateUtility.parseDate((String)getFormattedTextFromField(s.getPublishedDate(), null))));
  842. }
  843. catch (Exception e)
  844. {
  845. this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
  846. }
  847. }
  848. }
  849. // 4. Entity level fields
  850. // Extract Document GEO if applicable
  851. if (s.getDocumentGeo() != null)
  852. {
  853. try
  854. {
  855. f.setDocGeo(getDocGeo(s.getDocumentGeo()));
  856. }
  857. catch (Exception e)
  858. {
  859. this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);
  860. }
  861. }
  862. // Extract Entities
  863. if (s.getEntities() != null)
  864. {
  865. f.setEntities(getEntities(s.getEntities(), f));
  866. }
  867. // Extract Associations
  868. if (s.getAssociations() != null)
  869. {
  870. f.setAssociations(getAssociations(s.getAssociations(), f));
  871. }
  872. // 5. Remove unwanted metadata fields
  873. removeUnwantedMetadataFields(s.getMetadataFields(), f);
  874. }
  875. catch (Exception e)
  876. {
  877. this._context.getHarvestStatus().logMessage("Unknown: " + e.getMessage(), true);
  878. //DEBUG (don't output log messages per doc)
  879. //logger.error("Unknown: " + e.getMessage(), e);
  880. }
  881. finally
  882. {
  883. _document = null;
  884. _docPojo = null;
  885. }
  886. } // (end loop over documents)
  887. } // (end if SAH specified)
  888. return docs;
  889. }
  890. /**
  891. * getEntities(EntitySpecPojo e, DocumentPojo f)
  892. *
  893. * @param e
  894. * @param f
  895. * @return List<EntityPojo>
  896. * @throws JSONException
  897. */
  898. private List<EntityPojo> getEntities(List<EntitySpecPojo> esps, DocumentPojo f) throws JSONException
  899. {
  900. //TODO (INF-1922): should I always create in a new list and then add on? because of the entity map below...
  901. // If the feed already has entities we want to add the new entities to the list of existing entities
  902. List<EntityPojo> entities = null;
  903. if (f.getEntities() != null)
  904. {
  905. entities = f.getEntities();
  906. }
  907. // Otherwise we create a new arraylist to hold the new entities we are adding
  908. else
  909. {
  910. entities = new ArrayList<EntityPojo>();
  911. }
  912. repopulateEntityCacheIfNeeded(f);
  913. // Iterate over each EntitySpecPojo and try to create an entity, or entities, from the data
  914. JSONObject metadata = null;
  915. if (_document.has("metadata")) {
  916. metadata = _document.getJSONObject("metadata");
  917. }
  918. for (EntitySpecPojo esp : esps)
  919. {
  920. try {
  921. List<EntityPojo> tempEntities = getEntities(esp, f, metadata);
  922. for (EntityPojo e : tempEntities)
  923. {
  924. entities.add(e);
  925. }
  926. }
  927. catch (Exception e) {} // (carry on, prob just a missing field in this doc)
  928. }
  929. return entities;
  930. }
  931. /**
  932. * getEntities
  933. * @param esp
  934. * @param f
  935. * @return
  936. */
  937. private List<EntityPojo> getEntities(EntitySpecPojo esp, DocumentPojo f, JSONObject currObj)
  938. {
  939. List<EntityPojo> entities = new ArrayList<EntityPojo>();
  940. // Does the entity contain a list of entities to iterate over -
  941. if (esp.getIterateOver() != null)
  942. {
  943. try
  944. {
  945. String iterateOver = esp.getIterateOver();
  946. // Check to see if the arrayRoot specified exists in the current doc before proceeding
  947. Object itEl = null;
  948. try {
  949. itEl = currObj.get(iterateOver);
  950. }
  951. catch (JSONException e) {} // carry on, trapped below...
  952. if (null == itEl) {
  953. return entities;
  954. }
  955. JSONArray entityRecords = null;
  956. try {
  957. entityRecords = currObj.getJSONArray(iterateOver);
  958. }
  959. catch (JSONException e) {} // carry on, trapped below...
  960. if (null == entityRecords) {
  961. entityRecords = new JSONArray();
  962. entityRecords.put(itEl);
  963. }
  964. //TESTED
  965. // Get the type of object contained in EntityRecords[0]
  966. String objType = entityRecords.get(0).getClass().toString();
  967. /*
  968. * EntityRecords is a simple String[] array of entities
  969. */
  970. if (objType.equalsIgnoreCase("class java.lang.String"))
  971. {
  972. // Iterate over array elements and extract entities
  973. for (int i = 0; i < entityRecords.length(); ++i)
  974. {
  975. String field = entityRecords.getString(i);
  976. long nIndex = Long.valueOf(i);
  977. if (null != esp.getType()) { // (else cannot be a valid entity, must just be a list)
  978. EntityPojo entity = getEntity(esp, field, String.valueOf(i), f);
  979. if (entity != null) entities.add(entity);
  980. }
  981. // Does the association break out into multiple associations?
  982. if (esp.getEntities() != null)
  983. {
  984. // Iterate over the associations and call getAssociations recursively
  985. for (EntitySpecPojo subEsp : esp.getEntities())
  986. {
  987. if (null != subEsp.getIterateOver()) {
  988. if (null == subEsp.getCreationCriteriaScript()) {
  989. _context.getHarvestStatus().logMessage(new StringBuffer("In iterator ").
  990. append(esp.getIterateOver()).append(", trying to loop over field '").
  991. append(subEsp.getIterateOver()).append("' in array of primitives.").toString(), true);
  992. }
  993. else {
  994. this.executeEntityAssociationValidation(subEsp.getCreationCriteriaScript(), field, Long.toString(nIndex));
  995. }
  996. // (any creation criteria script indicates user accepts it can be either)
  997. }
  998. if (null != subEsp.getDisambiguated_name()) {
  999. EntityPojo entity = getEntity(subEsp, field, String.valueOf(i), f);
  1000. if (entity != null) entities.add(entity);
  1001. }
  1002. }
  1003. }//TESTED (error case, mixed object)
  1004. }
  1005. }
  1006. /*
  1007. * EntityRecords is a JSONArray
  1008. */
  1009. else if (objType.equalsIgnoreCase("class org.json.JSONObject"))
  1010. {
  1011. // Iterate over array elements and extract entities
  1012. for (int i = 0; i < entityRecords.length(); ++i)
  1013. {
  1014. // Get JSONObject containing entity fields and pass entityElement
  1015. // into the script engine so scripts can access it
  1016. JSONObject savedIterator = null;
  1017. if (_scriptEngine != null)
  1018. {
  1019. _iterator = savedIterator = entityRecords.getJSONObject(i);
  1020. }
  1021. if (null != esp.getType()) { // (else cannot be a valid entity, must just be a list)
  1022. EntityPojo entity = getEntity(esp, null, String.valueOf(i), f);
  1023. if (entity != null) entities.add(entity);
  1024. }
  1025. // Does the entity break out into multiple entities?
  1026. if (esp.getEntities() != null)
  1027. {
  1028. // Iterate over the entities and call getEntities recursively
  1029. for (EntitySpecPojo subEsp : esp.getEntities())
  1030. {
  1031. _iterator = savedIterator; // (reset this)
  1032. List<EntityPojo> subEntities = getEntities(subEsp, f, _iterator);
  1033. for (EntityPojo e : subEntities)
  1034. {
  1035. entities.add(e);
  1036. }
  1037. }
  1038. }
  1039. }
  1040. }
  1041. if (_iterator != currObj) { // (ie at the top level)
  1042. _iterator = null;
  1043. }
  1044. }
  1045. catch (Exception e)
  1046. {
  1047. //e.printStackTrace();
  1048. //System.out.println(e.getMessage());
  1049. //logger.error("Exception: " + e.getMessage());
  1050. }
  1051. }
  1052. // Single entity
  1053. else
  1054. {
  1055. // Does the entity break out into multiple entities?
  1056. if (esp.getEntities() != null)
  1057. {
  1058. // Iterate over the entities and call getEntities recursively
  1059. for (EntitySpecPojo subEsp : esp.getEntities())
  1060. {
  1061. List<EntityPojo> subEntities = getEntities(subEsp, f, currObj);
  1062. for (EntityPojo e : subEntities)
  1063. {
  1064. entities.add(e);
  1065. }
  1066. }
  1067. }
  1068. else
  1069. {
  1070. EntityPojo entity = getEntity(esp, null, null, f);
  1071. if (entity != null) entities.add(entity);
  1072. }
  1073. }
  1074. return entities;
  1075. }
  1076. /**
  1077. * getEntity
  1078. * @param esp
  1079. * @param field
  1080. * @param index
  1081. * @param f
  1082. * @return
  1083. */
  1084. private EntityPojo getEntity(EntitySpecPojo esp, String field, String index, DocumentPojo f)
  1085. {
  1086. // If the EntitySpecPojo or DocumentPojo is null return null
  1087. if ((esp == null) || (f == null)) return null;
  1088. try
  1089. {
  1090. EntityPojo e = new EntityPojo();
  1091. // Parse creation criteria script to determine if the entity should be added
  1092. if (esp.getCreationCriteriaScript() != null && JavaScriptUtils.containsScript(esp.getCreationCriteriaScript()))
  1093. {
  1094. boolean addEntity = executeEntityAssociationValidation(esp.getCreationCriteriaScript(), field, index);
  1095. if (!addEntity) {
  1096. return null;
  1097. }
  1098. }
  1099. // Entity.disambiguous_name
  1100. String disambiguatedName = null;
  1101. if (JavaScriptUtils.containsScript(esp.getDisambiguated_name()))
  1102. {
  1103. disambiguatedName = (String)getValueFromScript(esp.getDisambiguated_name(), field, index);
  1104. }
  1105. else
  1106. {
  1107. if ((_iterator != null) && (esp.getDisambiguated_name().startsWith("$metadata.") || esp.getDisambiguated_name().startsWith("${metadata."))) {
  1108. if (_context.isStandalone()) { // (minor message, while debugging only)
  1109. _context.getHarvestStatus().logMessage("Warning: in disambiguated_name, using global $metadata when iterating", true);
  1110. }
  1111. }
  1112. // Field - passed in via simple string array from getEntities
  1113. if (field != null)
  1114. {
  1115. disambiguatedName = getFormattedTextFromField(esp.getDisambiguated_name(), field);
  1116. }
  1117. else
  1118. {
  1119. disambiguatedName = getFormattedTextFromField(esp.getDisambiguated_name(), field);
  1120. }
  1121. }
  1122. // Only proceed if disambiguousName contains a meaningful value
  1123. if (disambiguatedName != null && disambiguatedName.length() > 0)
  1124. {
  1125. e.setDisambiguatedName(disambiguatedName);
  1126. }
  1127. else // Always log failure to get a dname - to remove this, specify a creationCriteriaScript
  1128. {
  1129. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required disambiguated_name from: ").append(esp.getDisambiguated_name()).toString(), true);
  1130. return null;
  1131. }
  1132. // Entity.frequency (count)
  1133. String freq = "1";
  1134. if (esp.getFrequency() != null)
  1135. {
  1136. if (JavaScriptUtils.containsScript(esp.getFrequency()))
  1137. {
  1138. freq = getValueFromScript(esp.getFrequency(), field, index).toString();
  1139. }
  1140. else
  1141. {
  1142. freq = getFormattedTextFromField(esp.getFrequency(), field);
  1143. }
  1144. // Since we've specified freq, we're going to enforce it
  1145. if (null == freq) { // failed to get it
  1146. if (null == esp.getCreationCriteriaScript()) {
  1147. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required frequency from: ").append(esp.getFrequency()).toString(), true);
  1148. return null;
  1149. }
  1150. }
  1151. }
  1152. // Try converting the freq string value to its numeric (double) representation
  1153. Double frequency = (double) 0;
  1154. try
  1155. {
  1156. frequency = Double.parseDouble(freq);
  1157. }
  1158. catch (Exception e1)
  1159. {
  1160. this._context.getHarvestStatus().logMessage(e1.getMessage(), true);
  1161. return null;
  1162. }
  1163. // Only proceed if frequency > 0
  1164. if (frequency > 0)
  1165. {
  1166. e.setFrequency(frequency.longValue()); // Cast to long from double
  1167. }
  1168. else
  1169. {
  1170. return null;
  1171. }
  1172. // Entity.actual_name
  1173. String actualName = null;
  1174. if (esp.getActual_name() != null)
  1175. {
  1176. if (JavaScriptUtils.containsScript(esp.getActual_name()))
  1177. {
  1178. actualName = (String)getValueFromScript(esp.getActual_name(), field, index);
  1179. }
  1180. else
  1181. {
  1182. if ((_iterator != null) && (esp.getActual_name().startsWith("$metadata.") || esp.getActual_name().startsWith("${metadata."))) {
  1183. if (_context.isStandalone()) { // (minor message, while debugging only)
  1184. _context.getHarvestStatus().logMessage("Warning: in actual_name, using global $metadata when iterating", true);
  1185. }
  1186. }
  1187. actualName = getFormattedTextFromField(esp.getActual_name(), field);
  1188. }
  1189. // Since we've specified actual name, we're going to enforce it (unless otherwise specified)
  1190. if (null == actualName) { // failed to get it
  1191. if (null == esp.getCreationCriteriaScript()) {
  1192. if (_context.isStandalone()) { // (minor message, while debugging only)
  1193. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required actual_name from: ").append(esp.getActual_name()).toString(), true);
  1194. }
  1195. return null;
  1196. }
  1197. }
  1198. }
  1199. // If actualName == null set it equal to disambiguousName
  1200. if (actualName == null) actualName = disambiguatedName;
  1201. e.setActual_name(actualName);
  1202. // Entity.type
  1203. String type = null;
  1204. if (esp.getType() != null)
  1205. {
  1206. if (JavaScriptUtils.containsScript(esp.getType()))
  1207. {
  1208. type = (String)getValueFromScript(esp.getType(), field, index);
  1209. }
  1210. else
  1211. {
  1212. type = getFormattedTextFromField(esp.getType(), field);
  1213. }
  1214. // Since we've specified type, we're going to enforce it (unless otherwise specified)
  1215. if (null == type) { // failed to get it
  1216. if (null == esp.getCreationCriteriaScript()) {
  1217. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required type from: ").append(esp.getType()).toString(), true);
  1218. return null;
  1219. }
  1220. }
  1221. }
  1222. else
  1223. {
  1224. type = "Keyword";
  1225. }
  1226. e.setType(type);
  1227. // Entity.index
  1228. String entityIndex = disambiguatedName + "/" + type;
  1229. e.setIndex(entityIndex.toLowerCase());
  1230. // Now check if we already exist, discard if so:
  1231. if (_entityMap.contains(e.getIndex())) {
  1232. return null;
  1233. }
  1234. // Entity.dimension
  1235. String dimension = null;
  1236. if (esp.getDimension() != null)
  1237. {
  1238. if (JavaScriptUtils.containsScript(esp.getDimension()))
  1239. {
  1240. dimension = (String)getValueFromScript(esp.getDimension(), field, index);
  1241. }
  1242. else
  1243. {
  1244. dimension = getFormattedTextFromField(esp.getDimension(), field);
  1245. }
  1246. // Since we've specified dimension, we're going to enforce it (unless otherwise specified)
  1247. if (null == dimension) { // failed to get it
  1248. if (null == esp.getCreationCriteriaScript()) {
  1249. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required dimension from: ").append(esp.getDimension()).toString(), true);
  1250. return null;
  1251. }
  1252. }
  1253. }
  1254. if (null == dimension) {
  1255. try {
  1256. e.setDimension(DimensionUtility.getDimensionByType(type));
  1257. }
  1258. catch (java.lang.IllegalArgumentException ex) {
  1259. e.setDimension(EntityPojo.Dimension.What);
  1260. }
  1261. }
  1262. else {
  1263. try {
  1264. EntityPojo.Dimension enumDimension = EntityPojo.Dimension.valueOf(dimension);
  1265. if (null == enumDimension) {
  1266. _context.getHarvestStatus().logMessage(new StringBuffer("Invalid dimension: ").append(dimension).toString(), true);
  1267. return null; // (invalid dimension)
  1268. }
  1269. else {
  1270. e.setDimension(enumDimension);
  1271. }
  1272. }
  1273. catch (Exception e2) {
  1274. _context.getHarvestStatus().logMessage(new StringBuffer("Invalid dimension: ").append(dimension).toString(), true);
  1275. return null; // (invalid dimension)
  1276. }
  1277. }
  1278. // Entity.relevance
  1279. String relevance = "0";
  1280. if (esp.getRelevance() != null)
  1281. {
  1282. if (JavaScriptUtils.containsScript(esp.getRelevance()))
  1283. {
  1284. relevance = (String)getValueFromScript(esp.getRelevance(), field, index);
  1285. }
  1286. else
  1287. {
  1288. relevance = getFormattedTextFromField(esp.getRelevance(), field);
  1289. }
  1290. // Since we've specified relevance, we're going to enforce it (unless otherwise specified)
  1291. if (null == relevance) { // failed to get it
  1292. if (null == esp.getCreationCriteriaScript()) {
  1293. _context.getHarvestStatus().logMessage(new StringBuffer("Failed to get required relevance from: ").append(esp.getRelevance()).toString(), true);
  1294. return null;
  1295. }
  1296. }
  1297. }
  1298. try {
  1299. e.setRelevance(Double.parseDouble(relevance));
  1300. }
  1301. catch (Exception e1) {
  1302. this._context.getHarvestStatus().logMessage(e1.getMessage(), true);
  1303. return null;
  1304. }
  1305. // Entity.sentiment (optional field)
  1306. if (esp.getSentiment() != null)
  1307. {
  1308. String sentiment;
  1309. if (JavaScriptUtils.containsScript(esp.getSentiment()))
  1310. {
  1311. sentiment = (String)getValueFromScript(esp.getSentiment(), field, index);
  1312. }
  1313. else
  1314. {
  1315. sentiment = getFormattedTextFromField(esp.getSentiment(), field);
  1316. }
  1317. // (sentiment is optional, even if specified)
  1318. if (null != sentiment) {
  1319. try {
  1320. double d = Double.parseDouble(sentiment);
  1321. e.setSentiment(d);
  1322. if (null == e.getSentiment()) {
  1323. if (_context.isStandalone()) { // (minor message, while debugging only)
  1324. _context.getHarvestStatus().logMessage(new StringBuffer("Invalid sentiment: ").append(sentiment).toString(), true);
  1325. }
  1326. }
  1327. }
  1328. catch (Exception e1) {
  1329. this._context.getHarvestStatus().logMessage(e1.getMessage(), true);
  1330. return null;
  1331. }
  1332. }
  1333. }
  1334. // Entity Link data:
  1335. if (esp.getLinkdata() != null)
  1336. {
  1337. String linkdata = null;
  1338. if (JavaScriptUtils.containsScript(esp.getLinkdata()))
  1339. {
  1340. linkdata = (String)getValueFromScript(esp.getLinkdata(), field, index);
  1341. }
  1342. else
  1343. {
  1344. linkdata = getFormattedTextFromField(esp.getLinkdata(), field);
  1345. }
  1346. // linkdata i…

Large files are truncated, but you can click here to view the full file