PageRenderTime 84ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 1ms

/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/enrichment/custom/UnstructuredAnalysisHarvester.java

https://github.com/IKANOW/Infinit.e
Java | 1601 lines | 1224 code | 164 blank | 213 comment | 439 complexity | 671c69d5a300cc43ea46d863a4bdb470 MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.harvest.enrichment.custom;
  17. import java.io.ByteArrayInputStream;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.io.InputStreamReader;
  21. import java.io.StringWriter;
  22. import java.net.URL;
  23. import java.net.URLConnection;
  24. import java.util.ArrayList;
  25. import java.util.Arrays;
  26. import java.util.HashMap;
  27. import java.util.HashSet;
  28. import java.util.Iterator;
  29. import java.util.LinkedHashMap;
  30. import java.util.LinkedList;
  31. import java.util.List;
  32. import java.util.Map;
  33. import java.util.Scanner;
  34. import java.util.Set;
  35. import java.util.regex.Matcher;
  36. import java.util.regex.Pattern;
  37. import javax.script.ScriptEngine;
  38. import javax.script.ScriptEngineManager;
  39. import javax.script.ScriptException;
  40. import javax.xml.parsers.ParserConfigurationException;
  41. import javax.xml.stream.XMLInputFactory;
  42. import javax.xml.stream.XMLStreamReader;
  43. import javax.xml.transform.Transformer;
  44. import javax.xml.transform.TransformerException;
  45. import javax.xml.transform.TransformerFactory;
  46. import javax.xml.transform.dom.DOMSource;
  47. import javax.xml.transform.stream.StreamResult;
  48. import javax.xml.xpath.XPath;
  49. import javax.xml.xpath.XPathConstants;
  50. import javax.xml.xpath.XPathExpressionException;
  51. import javax.xml.xpath.XPathFactory;
  52. import org.apache.commons.lang.StringEscapeUtils;
  53. import org.apache.log4j.Logger;
  54. import org.bson.types.ObjectId;
  55. import org.htmlcleaner.CleanerProperties;
  56. import org.htmlcleaner.DomSerializer;
  57. import org.htmlcleaner.HtmlCleaner;
  58. import org.htmlcleaner.TagNode;
  59. import org.json.JSONException;
  60. import org.json.JSONObject;
  61. import org.json.XML;
  62. import org.w3c.dom.Document;
  63. import org.w3c.dom.NamedNodeMap;
  64. import org.w3c.dom.Node;
  65. import org.w3c.dom.NodeList;
  66. import com.google.gson.Gson;
  67. import com.google.gson.GsonBuilder;
  68. import com.google.gson.stream.JsonReader;
  69. import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
  70. import com.ikanow.infinit.e.data_model.store.config.source.SimpleTextCleanserPojo;
  71. import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.ManualTextExtractionSpecPojo;
  72. import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.MetadataSpecPojo;
  73. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  74. import com.ikanow.infinit.e.data_model.store.config.source.SourceRssConfigPojo;
  75. import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo;
  76. import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.Context;
  77. import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.metaField;
  78. import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
  79. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  80. import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
  81. import com.ikanow.infinit.e.harvest.HarvestContext;
  82. import com.ikanow.infinit.e.harvest.HarvestController;
  83. import com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser;
  84. import com.ikanow.infinit.e.harvest.extraction.document.file.XmlToMetadataParser;
  85. import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
  86. import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
  87. import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
  88. import com.ikanow.infinit.e.harvest.utils.ProxyManager;
  89. import com.mongodb.BasicDBList;
  90. /**
  91. * UnstructuredAnalysisHarvester
  92. */
  93. public class UnstructuredAnalysisHarvester {
  94. ///////////////////////////////////////////////////////////////////////////////////////////
  95. // NEW PROCESSING PIPELINE INTERFACE
  96. //TODO (INF-1922): Handle headers and footers
/**
 * Sets the harvest context used by the pipeline entry points for status logging.
 *
 * @param context the harvest context for the current source/thread
 */
public void setContext(HarvestContext context) {
    _context = context;
    //TODO: need to set up the javascript engine just once - can't do it here though
    // because this might be called before the SAH is setup...
}
  102. // Transform the doc's text (go get it if necessary)
  103. public String doManualTextEnrichment(DocumentPojo doc, List<ManualTextExtractionSpecPojo> textExtractors, SourceRssConfigPojo feedConfig) throws IOException {
  104. String cachedFullText = null;
  105. // Map to the legacy format and then call the legacy code
  106. ArrayList<SimpleTextCleanserPojo> mappedTextExtractors = new ArrayList<SimpleTextCleanserPojo>(textExtractors.size());
  107. for (ManualTextExtractionSpecPojo textExtractor: textExtractors) {
  108. if (DocumentPojo.fullText_.equalsIgnoreCase(textExtractor.fieldName)) {
  109. boolean fullTextNeeded = (null == doc.getFullText()); // (check here so we can cache it)
  110. if (fullTextNeeded) {
  111. getRawTextFromUrlIfNeeded(doc, feedConfig);
  112. // (if transforming full text then grab the raw body from the URL if necessary)
  113. cachedFullText = doc.getFullText();
  114. }//TOTEST
  115. }
  116. SimpleTextCleanserPojo mappedTextExtractor = new SimpleTextCleanserPojo();
  117. mappedTextExtractor.setField(textExtractor.fieldName);
  118. mappedTextExtractor.setFlags(textExtractor.flags);
  119. mappedTextExtractor.setScript(textExtractor.script);
  120. mappedTextExtractor.setScriptlang(textExtractor.scriptlang);
  121. mappedTextExtractor.setReplacement(textExtractor.replacement);
  122. mappedTextExtractors.add(mappedTextExtractor);
  123. }
  124. this.cleanseText(mappedTextExtractors, doc);
  125. return cachedFullText;
  126. }
  127. //TESTED (fulltext_regexTests.json)
  128. public void processMetadataChain(DocumentPojo doc, List<MetadataSpecPojo> metadataFields, SourceRssConfigPojo feedConfig) throws IOException
  129. {
  130. // Map metadata list to a legacy meta format (they're really similar...)
  131. UnstructuredAnalysisConfigPojo.metaField mappedEl = new UnstructuredAnalysisConfigPojo.metaField();
  132. boolean textSet = false;
  133. for (MetadataSpecPojo meta: metadataFields) {
  134. mappedEl.fieldName = meta.fieldName;
  135. mappedEl.context = Context.All;
  136. mappedEl.flags = meta.flags;
  137. if (null == mappedEl.flags) {
  138. mappedEl.flags = "";
  139. }
  140. if (mappedEl.flags.isEmpty() || mappedEl.flags.contains("t")) {
  141. if (!textSet) {
  142. getRawTextFromUrlIfNeeded(doc, feedConfig);
  143. textSet = true;
  144. }
  145. }//TESTED (content_needed_test)
  146. mappedEl.scriptlang = meta.scriptlang;
  147. mappedEl.script = meta.script;
  148. mappedEl.replace = meta.replace;
  149. mappedEl.groupNum = null;
  150. //(no group num - just use replace, and flags "o" for xpath/gN:-1)
  151. this.processMeta(doc, mappedEl, doc.getFullText(), null, null);
  152. }
  153. //TODO (INF-1922) (store/index)
  154. }
  155. //TESTED (fulltext_regexTests.json)
  156. ///////////////////////////////////////////////////////////////////////////////////////////
  157. // PROCESSING PIPELINE - UTILITIES
  158. public void getRawTextFromUrlIfNeeded(DocumentPojo doc, SourceRssConfigPojo feedConfig) throws IOException {
  159. if (null != doc.getFullText()) { // Nothing to do
  160. return;
  161. }
  162. Scanner s = null;
  163. try {
  164. URL url = new URL(doc.getUrl());
  165. URLConnection urlConnect = null;
  166. if (null != feedConfig) {
  167. urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
  168. if (null != feedConfig.getUserAgent()) {
  169. urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
  170. }// TESTED
  171. if (null != feedConfig.getHttpFields()) {
  172. for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
  173. urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
  174. }
  175. }//TOTEST
  176. }
  177. else {
  178. urlConnect = url.openConnection();
  179. }
  180. InputStream urlStream = null;
  181. try {
  182. urlStream = urlConnect.getInputStream();
  183. }
  184. catch (Exception e) { // Try one more time, this time exception out all the way
  185. if (null != feedConfig) {
  186. urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
  187. if (null != feedConfig.getUserAgent()) {
  188. urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
  189. }// TESTED
  190. if (null != feedConfig.getHttpFields()) {
  191. for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
  192. urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
  193. }
  194. }//TESTED
  195. }
  196. else {
  197. urlConnect = url.openConnection();
  198. }
  199. urlStream = urlConnect.getInputStream();
  200. }
  201. s = new Scanner(urlStream, "UTF-8");
  202. doc.setFullText(s.useDelimiter("\\A").next());
  203. }
  204. finally { //(release resources)
  205. if (null != s) {
  206. s.close();
  207. }
  208. }
  209. }//TESTED (cut-and-paste from existing code, so new testing very cursory)
  210. ///////////////////////////////////////////////////////////////////////////////////////////
  211. ///////////////////////////////////////////////////////////////////////////////////////////
  212. ///////////////////////////////////////////////////////////////////////////////////////////
  213. // LEGACY CODE - USE TO SUPPORT OLD CODE FOR NOW + AS UTILITY CODE FOR THE PIPELINE LOGIC
// Per-source state (this class is stateful - one instance per harvest thread; not thread-safe)
private Pattern headerPattern = null; // compiled from uap.getHeaderRegEx()
private Pattern footerPattern = null; // compiled from uap.getFooterRegEx()
private UnstructuredAnalysisConfigPojo savedUap = null; // last config seen, so patterns are recompiled only when the config changes
// Javascript handling, if needed
private ScriptEngineManager factory = null;
private ScriptEngine engine = null;
private static String parsingScript = null;
// Using Tika to process documents:
TextExtractorTika tikaExtractor = null; // lazily created on first tika-enabled document
private HarvestContext _context = null;
private Logger logger = Logger.getLogger(UnstructuredAnalysisHarvester.class);
// (some web scraping may be needed)
private long nBetweenDocs_ms = -1; // politeness delay between per-doc web fetches; -1 == not yet resolved
// (set this in execute harvest - makes it easy to only set once in the per doc version called in bulk from the SAH)
// Ensure we don't get long list of duplicates for commonly occurring words
private HashSet<String> regexDuplicates = null; // per-doc "fieldName:value" keys already emitted
private HtmlCleaner cleaner = null; // lazily created for xpath metadata extraction
//if the sah already init'd an engine we'll just use it
private ScriptEngine _sahEngine = null;
private JavascriptSecurityManager securityManager = null;
/**
 * Default Constructor - all state is initialized lazily per harvest.
 */
public UnstructuredAnalysisHarvester() {
}
  241. // For harvest pipeline, just ensures duplicate map exists and is empty for each doc
  242. public void resetForNewDoc() {
  243. if ((null == regexDuplicates) || (!regexDuplicates.isEmpty())) {
  244. regexDuplicates = new HashSet<String>();
  245. }
  246. }
  247. /**
  248. * executeHarvest(SourcePojo source, List<DocumentPojo> feeds)
  249. *
  250. * @param source
  251. * @param feeds
  252. * @return List<DocumentPojo>
  253. */
  254. public List<DocumentPojo> executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> documents)
  255. {
  256. nBetweenDocs_ms = -1;
  257. // Can override the default (feed) wait time from within the source (eg
  258. // for sites that we know don't get upset about getting hammered)
  259. if (null != source.getRssConfig()) {
  260. if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
  261. nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
  262. }
  263. }
  264. if (-1 == nBetweenDocs_ms) {
  265. PropertiesManager props = new PropertiesManager();
  266. nBetweenDocs_ms = props.getWebCrawlWaitTime();
  267. }
  268. // TESTED: default and overridden values
  269. _context = contextController;
  270. UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();
  271. if (uap != null) {
  272. boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed");
  273. String headerRegEx = uap.getHeaderRegEx();
  274. String footerRegEx = uap.getFooterRegEx();
  275. List<metaField> meta = uap.getMeta();
  276. if (headerRegEx != null)
  277. headerPattern = createRegex(headerRegEx, uap.getHeaderRegExFlags());
  278. if (footerRegEx != null)
  279. footerPattern = createRegex(footerRegEx, uap.getFooterRegExFlags());
  280. Iterator<DocumentPojo> it = documents.iterator();
  281. int nDocs = 0;
  282. while (it.hasNext()) {
  283. nDocs++;
  284. DocumentPojo d = it.next();
  285. regexDuplicates = new HashSet<String>();
  286. cleaner = null;
  287. // For feeds, may need to go get the document text manually,
  288. // it's a bit horrible since
  289. // obviously may then go get the data again for full text
  290. // extraction
  291. boolean bFetchedUrl = false;
  292. if (bGetRawDoc && (null == d.getFullText())) {
  293. if (null == source.getRssConfig()) {
  294. source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
  295. }
  296. // (first time through, sleep following a URL/RSS access)
  297. if ((1 == nDocs) && (null != source.getUrl())) { // (have already made a call to RSS (or "searchConfig" URL)
  298. try {
  299. Thread.sleep(nBetweenDocs_ms);
  300. } catch (InterruptedException e) {
  301. }
  302. }
  303. // TESTED (first time only, correct value after searchConfig override)
  304. try {
  305. if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
  306. // Special case: if tika enabled then do that first
  307. if (null == tikaExtractor) {
  308. tikaExtractor = new TextExtractorTika();
  309. tikaExtractor.extractText(d);
  310. }
  311. }
  312. else {
  313. this.getRawTextFromUrlIfNeeded(d, source.getRssConfig());
  314. }
  315. bFetchedUrl = true;
  316. } catch (Exception e) { // Failed to get full text twice, remove doc
  317. contextController.handleExtractError(e, source); //handle extractor error if need be
  318. it.remove();
  319. d.setTempSource(null); // (can safely corrupt this doc since it's been removed)
  320. continue;
  321. }
  322. }
  323. long nTime_ms = System.currentTimeMillis();
  324. // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
  325. try {
  326. processBody(d, meta, true, source, uap);
  327. } catch (Exception e) {
  328. this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
  329. //DEBUG (don't output log messages per doc)
  330. //logger.error("processBody1: " + e.getMessage(), e);
  331. }
  332. try {
  333. if (uap.getSimpleTextCleanser() != null) {
  334. cleanseText(uap.getSimpleTextCleanser(), d);
  335. }
  336. } catch (Exception e) {
  337. this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
  338. //DEBUG (don't output log messages per doc)
  339. //logger.error("cleanseText: " + e.getMessage(), e);
  340. }
  341. try {
  342. processHeader(headerPattern, d, meta, source, uap);
  343. processFooter(footerPattern, d, meta, source, uap);
  344. } catch (Exception e) {
  345. this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
  346. //DEBUG (don't output log messages per doc)
  347. //logger.error("header/footerPattern: " + e.getMessage(), e);
  348. }
  349. try {
  350. processBody(d, meta, false, source, uap);
  351. } catch (Exception e) {
  352. this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
  353. //DEBUG (don't output log messages per doc)
  354. //logger.error("processBody2: " + e.getMessage(), e);
  355. }
  356. if (it.hasNext() && bFetchedUrl) {
  357. nTime_ms = nBetweenDocs_ms
  358. - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
  359. if (nTime_ms > 0) {
  360. try {
  361. Thread.sleep(nTime_ms);
  362. } catch (InterruptedException e) {
  363. }
  364. }
  365. } // (end politeness delay for URL getting from a single source (likely site)
  366. }
  367. return documents;
  368. }
  369. return new ArrayList<DocumentPojo>();
  370. }
  371. /**
  372. * executeHarvest For single-feed calls (note exception handling happens in
  373. * SAH)
  374. *
  375. * @param source
  376. * @param doc
  377. * @return
  378. * @throws ExtractorDocumentLevelException
  379. */
  380. public boolean executeHarvest(HarvestContext context, SourcePojo source, DocumentPojo doc, boolean bFirstTime, boolean bMoreDocs) throws ExtractorDocumentLevelException
  381. {
  382. regexDuplicates = new HashSet<String>();
  383. cleaner = null;
  384. boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed")
  385. && (null == doc.getFullText());
  386. // (ie don't have full text and will need to go fetch it from network)
  387. if (bFirstTime) {
  388. nBetweenDocs_ms = -1; // (reset eg bewteen searchConfig and SAH)
  389. }
  390. if ((-1 == nBetweenDocs_ms) && bGetRawDoc && (bMoreDocs || bFirstTime)) { // (don't bother if not using it...)
  391. // Can override the default (feed) wait time from within the source
  392. // (eg for sites that we know
  393. // don't get upset about getting hammered)
  394. if (null != source.getRssConfig()) {
  395. if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
  396. nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
  397. }
  398. }
  399. if (-1 == nBetweenDocs_ms) { // (ie not overridden so use default)
  400. PropertiesManager props = new PropertiesManager();
  401. nBetweenDocs_ms = props.getWebCrawlWaitTime();
  402. }
  403. } // TESTED (overridden and using system default)
  404. _context = context;
  405. UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();
  406. int nChanges = 0;
  407. if (null != doc.getMetaData()) {
  408. nChanges = doc.getMetaData().size();
  409. }
  410. boolean bFetchedUrl = false;
  411. if (bGetRawDoc) {
  412. if (null == source.getRssConfig()) {
  413. source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
  414. }
  415. try {
  416. // Workaround for observed twitter bug (first access after the
  417. // RSS was gzipped)
  418. if (bFirstTime) {
  419. // (first time through, sleep following a URL/RSS access)
  420. if (null != source.getUrl()) { // (have already made a call to RSS (or "searchConfig" URL)
  421. try {
  422. Thread.sleep(nBetweenDocs_ms);
  423. } catch (InterruptedException e) {
  424. }
  425. }
  426. // TESTED
  427. }
  428. if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
  429. // Special case: if tika enabled then do that first
  430. if (null == tikaExtractor) {
  431. tikaExtractor = new TextExtractorTika();
  432. tikaExtractor.extractText(doc);
  433. }
  434. }
  435. else {
  436. getRawTextFromUrlIfNeeded(doc, source.getRssConfig());
  437. }
  438. bFetchedUrl = true;
  439. } catch (Exception e) { // Failed to get full text twice... remove doc and carry on
  440. throw new ExtractorDocumentLevelException(e.getMessage());
  441. }
  442. }
  443. long nTime_ms = System.currentTimeMillis();
  444. // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
  445. if (uap != null) {
  446. List<metaField> meta = uap.getMeta();
  447. if (savedUap != uap) {
  448. String headerRegEx = uap.getHeaderRegEx();
  449. String footerRegEx = uap.getFooterRegEx();
  450. if (headerRegEx != null)
  451. headerPattern = Pattern.compile(headerRegEx, Pattern.DOTALL);
  452. if (footerRegEx != null)
  453. footerPattern = Pattern.compile(footerRegEx, Pattern.DOTALL);
  454. savedUap = uap;
  455. }
  456. try {
  457. processBody(doc, meta, true, source, uap);
  458. } catch (Exception e) {
  459. this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
  460. //DEBUG (don't output log messages per doc)
  461. //logger.error("processBody1: " + e.getMessage(), e);
  462. }
  463. try {
  464. if (uap.getSimpleTextCleanser() != null) {
  465. cleanseText(uap.getSimpleTextCleanser(), doc);
  466. }
  467. } catch (Exception e) {
  468. this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
  469. //DEBUG (don't output log messages per doc)
  470. //logger.error("cleanseText: " + e.getMessage(), e);
  471. }
  472. try {
  473. processHeader(headerPattern, doc, meta, source, uap);
  474. processFooter(footerPattern, doc, meta, source, uap);
  475. } catch (Exception e) {
  476. this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
  477. //DEBUG (don't output log messages per doc)
  478. //logger.error("header/footerPattern: " + e.getMessage(), e);
  479. }
  480. try {
  481. processBody(doc, meta, false, source, uap);
  482. } catch (Exception e) {
  483. this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
  484. //DEBUG (don't output log messages per doc)
  485. //logger.error("processBody2: " + e.getMessage(), e);
  486. }
  487. }
  488. if (bMoreDocs && bFetchedUrl) {
  489. nTime_ms = nBetweenDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
  490. if (nTime_ms > 0) {
  491. try {
  492. Thread.sleep(nTime_ms);
  493. } catch (InterruptedException e) {
  494. }
  495. }
  496. } // (end politeness delay for URL getting from a single source (likely site)
  497. if (null != doc.getMetaData()) {
  498. if (nChanges != doc.getMetaData().size()) {
  499. return true;
  500. }
  501. }
  502. return false;
  503. }
  504. /**
  505. * processHeader
  506. *
  507. * @param headerPattern
  508. * @param f
  509. * @param meta
  510. */
  511. private void processHeader(Pattern headerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
  512. {
  513. if (headerPattern != null) {
  514. Matcher headerMatcher = headerPattern.matcher(f.getFullText());
  515. String headerText = null;
  516. while (headerMatcher.find()) {
  517. if (headerMatcher.start() == 0) {
  518. headerText = headerMatcher.group(0);
  519. f.setHeaderEndIndex(headerText.length());
  520. for (int i = 1; i < headerMatcher.groupCount() + 1; i++) {
  521. f.addToHeader(headerMatcher.group(i).trim());
  522. }
  523. break;
  524. }
  525. }
  526. if (null != headerText && null != meta) {
  527. for (metaField m : meta) {
  528. if (m.context == Context.Header || m.context == Context.All) {
  529. this.processMeta(f, m, headerText, source, uap);
  530. }
  531. }
  532. }
  533. }
  534. }
/**
 * processFooter
 *
 * Matches the configured footer regex against the document's full text, records where
 * the footer starts, adds each capture group to the document, then runs Footer/All-context
 * metadata extractors over the matched footer text.
 *
 * @param footerPattern compiled footer regex (null == no footer processing)
 * @param f the document being processed
 * @param meta metadata extraction specs (may be null)
 */
private void processFooter(Pattern footerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
{
    if (footerPattern != null) {
        Matcher footerMatcher = footerPattern.matcher(f.getFullText());
        String footerText = null;
        while (footerMatcher.find()) {
            footerText = footerMatcher.group(0);
            int docLength = f.getFullText().length();
            // NOTE(review): this start index is only correct if the (first) match sits at the
            // very end of the document; a mid-document match gives a wrong index - confirm intent
            f.setFooterStartIndex(docLength - footerMatcher.group(0).length());
            for (int i = 1; i < footerMatcher.groupCount() + 1; i++) {
                // NOTE(review): addToHeader looks like a copy-paste from processHeader -
                // presumably this should be addToFooter (if DocumentPojo has one); also
                // group(i) is null for non-participating optional groups, so trim() can NPE
                // (caught and logged by the caller) - confirm before changing
                f.addToHeader(footerMatcher.group(i).trim());
            }
            break; // (only the first match is used)
        }
        if (null != footerText && null != meta) {
            for (metaField m : meta) {
                if (m.context == Context.Footer || m.context == Context.All) {
                    this.processMeta(f, m, footerText, source, uap);
                }
            }
        }
    }
}
  565. /**
  566. * processBody
  567. *
  568. * @param f
  569. * @param meta
  570. */
  571. private void processBody(DocumentPojo f, List<metaField> meta, boolean bPreCleansing, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
  572. {
  573. if (null != meta) {
  574. for (metaField m : meta) {
  575. if ((bPreCleansing && (m.context == Context.First))
  576. || (!bPreCleansing && (m.context == Context.Body || m.context == Context.All))) {
  577. String toProcess = f.getBody();
  578. if (toProcess == null)
  579. toProcess = f.getDescription();
  580. if (null != toProcess) {
  581. this.processMeta(f, m, toProcess, source, uap);
  582. }
  583. }
  584. }
  585. }
  586. }
  587. /**
  588. * processMeta - handle an individual field
  589. */
  590. private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source, UnstructuredAnalysisConfigPojo uap) {
  591. boolean bAllowDuplicates = false;
  592. if ((null != m.flags) && m.flags.contains("U")) {
  593. bAllowDuplicates = true;
  594. }
  595. if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {
  596. Pattern metaPattern = createRegex(m.script, m.flags);
  597. int timesToRun = 1;
  598. Object[] currField = null;
  599. if ((null != m.flags) && m.flags.contains("c")) {
  600. currField = f.getMetadata().get(m.fieldName);
  601. }
  602. if (null != currField) { // chained metadata
  603. timesToRun = currField.length;
  604. text = (String)currField[0];
  605. }//TESTED
  606. Matcher matcher = metaPattern.matcher(text);
  607. LinkedList<String> Llist = null;
  608. for (int ii = 0; ii < timesToRun; ++ii) {
  609. if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
  610. text = (String)currField[ii];
  611. matcher = metaPattern.matcher(text);
  612. }//TESTED
  613. StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
  614. int nFieldNameLen = m.fieldName.length() + 1;
  615. try {
  616. while (matcher.find()) {
  617. if (null == Llist) {
  618. Llist = new LinkedList<String>();
  619. }
  620. if (null == m.groupNum) {
  621. m.groupNum = 0;
  622. }
  623. String toAdd = matcher.group(m.groupNum);
  624. if (null != m.replace) {
  625. toAdd = metaPattern.matcher(toAdd).replaceFirst(
  626. m.replace);
  627. }
  628. if ((null != m.flags) && m.flags.contains("H")) {
  629. toAdd = StringEscapeUtils.unescapeHtml(toAdd);
  630. }
  631. prefix.setLength(nFieldNameLen);
  632. prefix.append(toAdd);
  633. String dupCheck = prefix.toString();
  634. if (!regexDuplicates.contains(dupCheck)) {
  635. Llist.add(toAdd);
  636. if (!bAllowDuplicates) {
  637. regexDuplicates.add(dupCheck);
  638. }
  639. }
  640. }
  641. } catch (Exception e) {
  642. this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
  643. }
  644. }//(end metadata chaining handling)
  645. if (null != Llist) {
  646. if (null != currField) { // (overwrite)
  647. f.getMetadata().put(m.fieldName, Llist.toArray());
  648. }
  649. else {
  650. f.addToMetadata(m.fieldName, Llist.toArray());
  651. }
  652. }//TESTED
  653. }
  654. else if (m.scriptlang.equalsIgnoreCase("javascript"))
  655. {
  656. if (null == f.getMetadata()) {
  657. f.setMetadata(new LinkedHashMap<String, Object[]>());
  658. }
  659. //set the script engine up if necessary
  660. if ((null != source) && (null != uap)) {
  661. //(these are null if called from new processing pipeline vs legacy code)
  662. intializeScriptEngine(source, uap);
  663. }
  664. try
  665. {
  666. //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
  667. // (also should be able to use SAH _document object I think?)
  668. // Javascript: the user passes in
  669. Object[] currField = f.getMetadata().get(m.fieldName);
  670. if ((null == m.flags) || m.flags.isEmpty()) {
  671. if (null == currField) {
  672. engine.put("text", text);
  673. engine.put("_iterator", null);
  674. }
  675. //(otherwise will just pass the current fields in there)
  676. }
  677. else { // flags specified
  678. if (m.flags.contains("t")) { // text
  679. engine.put("text", text);
  680. }
  681. if (m.flags.contains("d")) { // entire document (minus ents and assocs)
  682. GsonBuilder gb = new GsonBuilder();
  683. Gson g = gb.create();
  684. List<EntityPojo> ents = f.getEntities();
  685. List<AssociationPojo> assocs = f.getAssociations();
  686. try {
  687. f.setEntities(null);
  688. f.setAssociations(null);
  689. engine.put("document", g.toJson(f));
  690. securityManager.eval(engine, JavaScriptUtils.initScript);
  691. }
  692. finally {
  693. f.setEntities(ents);
  694. f.setAssociations(assocs);
  695. }
  696. }
  697. if (m.flags.contains("m")) { // metadata
  698. GsonBuilder gb = new GsonBuilder();
  699. Gson g = gb.create();
  700. engine.put("_metadata", g.toJson(f.getMetadata()));
  701. securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
  702. }
  703. }//(end flags processing)
  704. if (null != currField) {
  705. f.getMetadata().remove(m.fieldName);
  706. GsonBuilder gb = new GsonBuilder();
  707. Gson g = gb.create();
  708. engine.put("_iterator", g.toJson(currField));
  709. securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
  710. }
  711. //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)
  712. Object returnVal = securityManager.eval(engine, m.script);
  713. if (null != returnVal) {
  714. if (returnVal instanceof String) { // The only easy case
  715. Object[] array = new Object[1];
  716. if ((null != m.flags) && m.flags.contains("H")) {
  717. returnVal = StringEscapeUtils.unescapeHtml((String)returnVal);
  718. }
  719. array[0] = returnVal;
  720. f.addToMetadata(m.fieldName, array);
  721. } else { // complex object or array - in either case the engine turns these into
  722. // internal.NativeArray or internal.NativeObject
  723. BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
  724. f.addToMetadata(m.fieldName, outList.toArray());
  725. }
  726. }
  727. } catch (ScriptException e) {
  728. _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
  729. // Just do nothing and log
  730. // e.printStackTrace();
  731. //DEBUG (don't output log messages per doc)
  732. //logger.error(e.getMessage());
  733. } catch (Exception e) {
  734. _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
  735. // Just do nothing and log
  736. // e.printStackTrace();
  737. //DEBUG (don't output log messages per doc)
  738. //logger.error(e.getMessage());
  739. }
  740. } else if (m.scriptlang.equalsIgnoreCase("xpath")) {
  741. String xpath = m.script;
  742. try {
  743. createHtmlCleanerIfNeeded();
  744. int timesToRun = 1;
  745. Object[] currField = null;
  746. if ((null != m.flags) && m.flags.contains("c")) {
  747. currField = f.getMetadata().get(m.fieldName);
  748. }
  749. if (null != currField) { // chained metadata
  750. f.getMetadata().remove(m.fieldName); // (so will add to the end)
  751. timesToRun = currField.length;
  752. text = (String)currField[0];
  753. }//TESTED
  754. for (int ii = 0; ii < timesToRun; ++ii) {
  755. if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
  756. text = (String)currField[ii];
  757. }//TESTED
  758. TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));
  759. //NewCode : Only use html cleaner for cleansing
  760. //use JAXP for full Xpath lib
  761. Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
  762. String extraRegex = extractRegexFromXpath(xpath);
  763. if (extraRegex != null)
  764. xpath = xpath.replace(extraRegex, "");
  765. XPath xpa = XPathFactory.newInstance().newXPath();
  766. NodeList res = (NodeList)xpa.evaluate(xpath, doc, XPathConstants.NODESET);
  767. if (res.getLength() > 0)
  768. {
  769. if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
  770. m.groupNum = -1; // (see bConvertToObject below)
  771. }
  772. StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
  773. int nFieldNameLen = m.fieldName.length() + 1;
  774. ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
  775. boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
  776. boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
  777. for (int i= 0; i< res.getLength(); i++)
  778. {
  779. Node info_node = res.item(i);
  780. if ((null != m.flags) && (m.flags.contains("g"))) {
  781. Llist.add(parseHtmlTable(info_node, m.replace));
  782. }
  783. else if (bConvertToObject || convertToXml) {
  784. // Try to create a JSON object out of this
  785. StringWriter writer = new StringWriter();
  786. try {
  787. Transformer transformer = TransformerFactory.newInstance().newTransformer();
  788. transformer.transform(new DOMSource(info_node), new StreamResult(writer));
  789. } catch (TransformerException e1) {
  790. continue;
  791. }
  792. if (bConvertToObject) {
  793. try {
  794. JSONObject subObj = XML.toJSONObject(writer.toString());
  795. if (xpath.endsWith("*")) { // (can have any number of different names here)
  796. Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
  797. }//TESTED
  798. else {
  799. String[] rootNames = JSONObject.getNames(subObj);
  800. if (1 == rootNames.length) {
  801. // (don't think it can't be any other number in fact)
  802. subObj = subObj.getJSONObject(rootNames[0]);
  803. }
  804. boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
  805. Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj, bUnescapeHtml));
  806. }//TESTED
  807. }
  808. catch (JSONException e) { // Just carry on
  809. continue;
  810. }
  811. //TESTED
  812. }
  813. else { // leave in XML form
  814. Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
  815. }//TESTED (xpath_test.json)
  816. }
  817. else { // Treat this as string, either directly or via regex
  818. String info = info_node.getTextContent().trim();
  819. if (extraRegex == null || extraRegex.isEmpty()) {
  820. prefix.setLength(nFieldNameLen);
  821. prefix.append(info);
  822. String dupCheck = prefix.toString();
  823. if (!regexDuplicates.contains(dupCheck)) {
  824. if ((null != m.flags) && m.flags.contains("H")) {
  825. info = StringEscapeUtils.unescapeHtml(info);
  826. }
  827. Llist.add(info);
  828. if (!bAllowDuplicates) {
  829. regexDuplicates.add(dupCheck);
  830. }
  831. }
  832. }
  833. else { // Apply regex to the string
  834. Pattern dataRegex = createRegex(extraRegex, m.flags);
  835. Matcher dataMatcher = dataRegex.matcher(info);
  836. boolean result = dataMatcher.find();
  837. while (result) {
  838. String toAdd;
  839. if (m.groupNum != null)
  840. toAdd = dataMatcher.group(m.groupNum);
  841. else
  842. toAdd = dataMatcher.group();
  843. prefix.setLength(nFieldNameLen);
  844. prefix.append(toAdd);
  845. String dupCheck = prefix.toString();
  846. if (!regexDuplicates.contains(dupCheck)) {
  847. if ((null != m.flags) && m.flags.contains("H")) {
  848. toAdd = StringEscapeUtils.unescapeHtml(toAdd);
  849. }
  850. Llist.add(toAdd);
  851. if (!bAllowDuplicates) {
  852. regexDuplicates.add(dupCheck);
  853. }
  854. }
  855. result = dataMatcher.find();
  856. }
  857. }//(regex vs no regex)
  858. }//(end string vs object)
  859. }
  860. if (Llist.size() > 0) {
  861. f.addToMetadata(m.fieldName, Llist.toArray());
  862. }
  863. }
  864. }//(end loop over metadata objects if applicable)
  865. } catch (IOException ioe) {
  866. _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(), true);
  867. // Just do nothing and log
  868. //DEBUG (don't output log messages per doc)
  869. //logger.error(ioe.getMessage());
  870. } catch (ParserConfigurationException e1) {
  871. _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
  872. // Just do nothing and log
  873. //DEBUG (don't output log messages per doc)
  874. //logger.error(e1.getMessage());
  875. } catch (XPathExpressionException e1) {
  876. _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
  877. }
  878. }
  879. else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
  880. // which one?
  881. try {
  882. boolean json = false;
  883. boolean xml = false;
  884. for (int i = 0; i < 128; ++i) {
  885. if ('<' == text.charAt(i)) {
  886. xml = true;
  887. break;
  888. }
  889. if ('{' == text.charAt(i)) {
  890. json = true;
  891. break;
  892. }
  893. if (!Character.isSpaceChar(text.charAt(i))) {
  894. break;
  895. }
  896. }//TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)
  897. List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
  898. if (xml) {
  899. XmlToMetadataParser parser = new XmlToMetadataParser(Arrays.asList(m.script.split("\\s*,\\s*")), null, null, null, null, null, Integer.MAX_VALUE);
  900. XMLInputFactory factory = XMLInputFactory.newInstance();
  901. factory.setProperty(XMLInputFactory.IS_COALESCING, true);
  902. factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
  903. XMLStreamReader reader = null;
  904. try {
  905. reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
  906. docs = parser.parseDocument(reader, true);
  907. }
  908. finally {
  909. if (null != reader) reader.close();
  910. }
  911. }//TESTED (meta_stream_test, test1)
  912. if (json) {
  913. JsonReader jsonReader = null;
  914. try {
  915. JsonToMetadataParser parser = new JsonToMetadataParser(null, Arrays.asList(m.script.split("\\s*,\\s*")), null, null, Integer.MAX_VALUE);
  916. jsonReader = new JsonReader(new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
  917. jsonReader.setLenient(true);
  918. docs = parser.parseDocument(jsonReader, true);
  919. }
  920. finally {
  921. if (null != jsonReader) jsonReader.close();
  922. }
  923. }//TESTED (meta_stream_test test2)
  924. if (!docs.isEmpty()) {
  925. ArrayList<String> Llist = new ArrayList<String>(docs.size());
  926. for (DocumentPojo doc: docs) {
  927. if (null != doc.getFullText()) {
  928. Llist.add(doc.getFullText());
  929. }
  930. }
  931. if (Llist.size() > 0) {
  932. f.addToMetadata(m.fieldName, Llist.toArray());
  933. }
  934. }//TESTED (meta_stream_test test1,test2)
  935. }//(end try)
  936. catch (Exception e) { // various parsing errors
  937. _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
  938. }
  939. }//TESTED (meta_stream_test)
  940. // (don't currently support other script types)
  941. }
  942. private static String extractRegexFromXpath(String original_xpath) {
  943. Pattern addedRegex = Pattern.compile("regex\\(.*\\)\\s*$", Pattern.MULTILINE | Pattern.DOTALL);
  944. Matcher matcher = addedRegex.matcher(original_xpath);
  945. boolean matchFound = matcher.find();
  946. if (matchFound) {
  947. try {
  948. return matcher.group();
  949. } catch (Exception e) {
  950. return null;
  951. }
  952. }
  953. return null;
  954. }
/**
 * Runs each configured text cleanser, in order, against the relevant field of a
 * single document. Supported targets: "fulltext", "description" and "title"
 * (which honor the "+" concatenation flag via a lazily created StringBuffer per
 * field), and "metadata.&lt;name&gt;" (replace-only, applied element-by-element to
 * String metadata values; non-String elements are passed through unchanged).
 *
 * @param simpleTextCleanser the ordered list of cleanser specifications to apply
 * @param document the document whose fields are cleansed in place
 */
private void cleanseText(List<SimpleTextCleanserPojo> simpleTextCleanser, DocumentPojo document)
{
	// Store these since can re-generate them by concatenation
	// (each builder stays null until a "+" cleanser first targets that field)
	StringBuffer fullTextBuilder = null;
	StringBuffer descriptionBuilder = null;
	StringBuffer titleBuilder = null;
	// (note no support for metadata concatenation, replace only)
	// Iterate over the cleanser functions that need to run on each feed
	for (SimpleTextCleanserPojo s : simpleTextCleanser) {
		boolean bConcat = (null != s.getFlags()) && s.getFlags().contains("+");
		boolean bUsingJavascript = ((null != s.getScriptlang()) && s.getScriptlang().equalsIgnoreCase("javascript"));
		if (s.getField().equalsIgnoreCase("fulltext")) {
			// (javascript cleansers may run even on a null field - they can build it from the document)
			if ((null != document.getFullText()) || bUsingJavascript) {
				StringBuffer myBuilder = fullTextBuilder;
				// A non-concat cleanser following concat ones: flush the accumulated
				// text into the document first so this cleanser sees/replaces it all
				if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
					document.setFullText(myBuilder.toString());
					myBuilder.setLength(0);
				} //TESTED
				String res = cleanseField(document.getFullText(),
						s.getScriptlang(), s.getScript(), s.getFlags(),
						s.getReplacement(), document);
				if (bConcat) {
					if (null == myBuilder) {
						fullTextBuilder = myBuilder = new StringBuffer();
					}
					myBuilder.append(res).append('\n');
				}
				else {
					document.setFullText(res);
				}
			}
		} //TESTED
		else if (s.getField().equalsIgnoreCase("description")) {
			// (same lazy-builder flush/concat logic as the "fulltext" branch above)
			if ((null != document.getDescription()) || bUsingJavascript) {
				StringBuffer myBuilder = descriptionBuilder;
				if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
					document.setDescription(myBuilder.toString());
					myBuilder.setLength(0);
				} //TESTED
				String res = cleanseField(document.getDescription(),
						s.getScriptlang(), s.getScript(), s.getFlags(),
						s.getReplacement(), document);
				if (bConcat) {
					if (null == myBuilder) {
						descriptionBuilder = myBuilder = new StringBuffer();
					}
					myBuilder.append(res).append('\n');
				}
				else {
					document.setDescription(res);
				}
			}
		} //TESTED
		else if (s.getField().equalsIgnoreCase("title")) {
			// (same lazy-builder flush/concat logic as the "fulltext" branch above)
			if ((null != document.getTitle()) || bUsingJavascript) {
				StringBuffer myBuilder = titleBuilder;
				if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
					document.setTitle(myBuilder.toString());
					myBuilder.setLength(0);
				} //TESTED
				String res = cleanseField(document.getTitle(),
						s.getScriptlang(), s.getScript(), s.getFlags(),
						s.getReplacement(), document);
				if (bConcat) {
					if (null == myBuilder) {
						titleBuilder = myBuilder = new StringBuffer();
					}
					myBuilder.append(res).append('\n');
				}
				else {
					document.setTitle(res);
				}
			}
		} //TESTED
		else if (s.getField().startsWith("metadata.")) {
			// (note no support for metadata concatenation, replace only)
			String metaField = s.getField().substring(9); // (9 for"metadata.")
			Object[] meta = document.getMetadata().get(metaField);
			if ((null != meta) && (meta.length > 0)) {
				Object[] newMeta = new Object[meta.length];
				for (int i = 0; i < meta.length; ++i) {
					Object metaValue = meta[i];
					if (metaValue instanceof String) {
						// Only String metadata values are cleansed; other types pass through
						newMeta[i] = (Object) cleanseField(
								(String) metaValue, s.getScriptlang(),
								s.getScript(), s.getFlags(),
								s.getReplacement(), document);
					} else {
						newMeta[i] = metaValue;
					}
				}
				// Overwrite the old fields
				document.addToMetadata(metaField, newMeta);
			}
		}
		// This is sufficient fields for the moment
	} // (end loop over fields)
	// Handle any left over cases:
	// (if the last cleanser for a field was a concat, the builder was never
	//  flushed inside the loop, so write it back to the document here)
	if ((null != fullTextBuilder) && (fullTextBuilder.length() > 0)) {
		document.setFullText(fullTextBuilder.toString());
	} //TESTED
	if ((null != descriptionBuilder) && (descriptionBuilder.length() > 0)) {
		document.setDescription(descriptionBuilder.toString());
	} //TESTED
	if ((null != titleBuilder) && (titleBuilder.length() > 0)) {
		document.setTitle(titleBuilder.toString());
	} //TESTED
}// TESTED
/**
 * Cleanses a single String field using the specified script language:
 * - "regex" (or null): regex replace via String.replaceAll / createRegex with flags
 * - "xpath": cleans the field as HTML, evaluates the xpath, and joins the results
 *   with '\n' (empty string if nothing matched - unlike regex, no match means no output)
 * - "javascript": evaluates the script in the sandboxed engine with inputs selected
 *   by flags ("t"=text [default], "d"=document JSON, "m"=metadata JSON)
 * The "H" flag HTML-decodes the result in all three modes; "x" (xpath only) keeps
 * matched nodes as XML strings rather than text content.
 *
 * @param field the current field value to cleanse (assumed non-null except for javascript)
 * @param scriptLang "regex" (or null), "xpath" or "javascript"
 * @param script the regex / xpath expression / javascript source to apply
 * @param flags optional flag characters, see above
 * @param replaceWith the regex replacement string (regex mode only)
 * @param f the owning document (used for javascript inputs and error context)
 * @return the cleansed field, or the original value if an error was logged
 */
private String cleanseField(String field, String scriptLang, String script,
		String flags, String replaceWith, DocumentPojo f)
{
	if ((null == scriptLang) || scriptLang.equalsIgnoreCase("regex")) {
		if (null == flags) {
			return field.replaceAll(script, replaceWith);
		}
		else {
			if (flags.contains("H")) { // HTML decode
				return StringEscapeUtils.unescapeHtml(createRegex(script,flags).matcher(field).replaceAll(replaceWith));
			} else {
				return createRegex(script, flags).matcher(field).replaceAll(replaceWith);
			}
		}
	}
	else if (scriptLang.equalsIgnoreCase("xpath")) {
		try {
			createHtmlCleanerIfNeeded();
			// NOTE(review): getBytes() uses the platform default charset - confirm
			// documents are normalized upstream or this may mangle non-ASCII text
			TagNode node = cleaner.clean(new ByteArrayInputStream(field.getBytes()));
			Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
			XPath xpa = XPathFactory.newInstance().newXPath();
			NodeList res = (NodeList)xpa.evaluate(script, doc, XPathConstants.NODESET);
			if (0 == res.getLength()) { // No match, just return "", unlike regex we don't want anything if we don't match...
				return "";
			}
			else {
				StringBuffer sb = new StringBuffer();
				for (int i= 0; i< res.getLength(); i++) {
					if (0 != i) {
						sb.append('\n');
					}
					Node info_node = res.item(i);
					if ((null != flags) && flags.contains("H")) { // HTML decode
						sb.append(StringEscapeUtils.unescapeHtml(info_node.getTextContent().trim()));
					}
					else if ((null != flags) && flags.contains("x")) { // Leave as XML string
						StringWriter writer = new StringWriter();
						try {
							Transformer transformer = TransformerFactory.newInstance().newTransformer();
							transformer.transform(new DOMSource(info_node), new StreamResult(writer));
							sb.append(writer.toString().substring(38)); // (step over <?xml etc?> see under metadata field extraction
						}
						catch (TransformerException e1) { // (do nothing just skip)
						}
					}
					else {
						sb.append(info_node.getTextContent().trim());
					}
				}
				return sb.toString();
			}//TESTED (xpath_test: object - multiple and single, text)
		} catch (IOException e) {
			_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
		}
		catch (XPathExpressionException e) {
			_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
		}
		catch (ParserConfigurationException e) {
			_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
		}
		// (falls through to return the original field unchanged on error)
	}
	else if (scriptLang.equalsIgnoreCase("javascript")) {
		try {
			SourcePojo src = f.getTempSource();
			intializeScriptEngine(src, src.getUnstructuredAnalysisConfig());
			// Setup input:
			if (null == flags) {
				flags = "t";
			}
			if (flags.contains("t")) { // text
				engine.put("text", field);
			}
			if (flags.contains("d")) { // entire document
				GsonBuilder gb = new GsonBuilder();
				Gson g = gb.create();
				// Temporarily detach entities/associations so they are not serialized
				// into the "document" JSON; restored in the finally block
				List<EntityPojo> ents = f.getEntities();
				List<AssociationPojo> assocs = f.getAssociations();
				try {
					f.setEntities(null);
					f.setAssociations(null);
					engine.put("document", g.toJson(f));
					securityManager.eval(engine, JavaScriptUtils.initScript);
				}
				finally {
					f.setEntities(ents);
					f.setAssociations(assocs);
				}
			}
			if (flags.contains("m")) { // metadata
				GsonBuilder gb = new GsonBuilder();
				Gson g = gb.create();
				engine.put("_metadata", g.toJson(f.getMetadata()));
				securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
			}
			Object returnVal = securityManager.eval(engine, script);
			field = (String) returnVal; // (If not a string or is null then will exception out)
			if ((null != flags) && flags.contains("H") && (null != field)) { // HTML decode
				field = StringEscapeUtils.unescapeHtml(field);
			}
		}
		catch (Exception e) {
			_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
			// Just do nothing and log
			// e.printStackTrace();
			//DEBUG (don't output log messages per doc)
			//logger.error(e.getMessage());
		}
	}
	return field;
}
  1187. // Handles parsing of HTML tables to Objects that can be easily printed as JSON. (flag = g)
  1188. // 1] No Replace Value - The first row of the table will be set as the headers
  1189. // 2] Replace Value = "[]" - Headers will be set to the column count number (beginning with 0) eg "0","1"
  1190. // 3a] Replace Value = "[one,two,three]" - The provided headers will be set as the headers
  1191. // 3b] Replace Values set, but more data columns than values provided - Additional columns that were not
// specified will be assigned its column count number. eg "specified","1","2"
  1193. // 4] Replace Value = "[one,null,three]" - Columns specified as null in the provided header will be skipped.
  1194. // eg "one","three"
  1195. private static HashMap<String, Object> parseHtmlTable(Node table_node, String replaceWith)
  1196. {
  1197. if (table_node.getNodeName().equalsIgnoreCase("table") && table_node.hasChildNodes())
  1198. {
  1199. Node topNode = table_node;
  1200. boolean tbody = table_node.getFirstChild().getNodeName().equalsIgnoreCase("tbody");
  1201. if (tbody)
  1202. topNode = table_node.getFirstChild();
  1203. if (topNode.hasChildNodes())
  1204. {
  1205. NodeList rows = topNode.getChildNodes();
  1206. List<String> headers = null;
  1207. ArrayList<HashMap<String, String>> data = null;
  1208. int headerLength = 0;
  1209. boolean[] skip = null;
  1210. if (null != replaceWith)
  1211. {
  1212. if (replaceWith.equals("[]")){
  1213. headers = new ArrayList<String>();
  1214. headerLength = 0;
  1215. } // TESTED (by eye - 2)
  1216. else
  1217. {
  1218. //Remove square brackets
  1219. if(replaceWith.startsWith("[") && replaceWith.endsWith("]"))
  1220. replaceWith = replaceWith.substring(1, replaceWith.length()-1);
  1221. //Turn the provided list of headers into a list object
  1222. headers = Arrays.asList(replaceWith.split("\\s*,\\s*"));
  1223. headerLength = headers.size();
  1224. skip = new boolean[headerLength];
  1225. for(int h = 0; h < headerLength; h++)
  1226. {
  1227. String val = headers.get(h);
  1228. if (val.length() == 0 || val.equalsIgnoreCase("null"))
  1229. skip[h] = true;
  1230. else
  1231. skip[h] = false;
  1232. }
  1233. }// TESTED (by eye - 3a)
  1234. }
  1235. //traverse rows
  1236. for(int i = 0; i < rows.getLength(); i++)
  1237. {
  1238. Node row = rows.item(i);
  1239. if (row.getNodeName().equalsIgnoreCase("tr") || row.getNodeName().equalsIgnoreCase("th"))
  1240. {
  1241. //If the header value has not been set, the first row will be set as the headers
  1242. if (null == headers)
  1243. {
  1244. //Traverse through cells
  1245. headers = new ArrayList<String>();
  1246. if (row.hasChildNodes())
  1247. {
  1248. NodeList cells = row.getChildNodes();
  1249. headerLength = cells.getLength();
  1250. skip = new boolean[headerLength];
  1251. for (int j = 0; j < headerLength; j++)
  1252. {
  1253. headers.add(cells.item(j).getTextContent());
  1254. skip[j] = false;
  1255. }
  1256. } // TESTED (by eye - 1)
  1257. }
  1258. else
  1259. {
  1260. if (null == data)
  1261. {
  1262. data = new ArrayList<HashMap<String,String>>();
  1263. }
  1264. if (row.hasChildNodes())
  1265. {
  1266. HashMap<String,String> cellList = new HashMap<String,String>();
  1267. NodeList cells = row.getChildNodes();
  1268. for (int j = 0; j < cells.getLength();

Large files files are truncated, but you can click here to view the full file