/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/enrichment/custom/UnstructuredAnalysisHarvester.java
- /*******************************************************************************
- * Copyright 2012, The Infinit.e Open Source Project.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- ******************************************************************************/
- package com.ikanow.infinit.e.harvest.enrichment.custom;
- import java.io.ByteArrayInputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.StringWriter;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Iterator;
- import java.util.LinkedHashMap;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.Map;
- import java.util.Scanner;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import javax.script.ScriptEngine;
- import javax.script.ScriptEngineManager;
- import javax.script.ScriptException;
- import javax.xml.parsers.ParserConfigurationException;
- import javax.xml.stream.XMLInputFactory;
- import javax.xml.stream.XMLStreamReader;
- import javax.xml.transform.Transformer;
- import javax.xml.transform.TransformerException;
- import javax.xml.transform.TransformerFactory;
- import javax.xml.transform.dom.DOMSource;
- import javax.xml.transform.stream.StreamResult;
- import javax.xml.xpath.XPath;
- import javax.xml.xpath.XPathConstants;
- import javax.xml.xpath.XPathExpressionException;
- import javax.xml.xpath.XPathFactory;
- import org.apache.commons.lang.StringEscapeUtils;
- import org.apache.log4j.Logger;
- import org.bson.types.ObjectId;
- import org.htmlcleaner.CleanerProperties;
- import org.htmlcleaner.DomSerializer;
- import org.htmlcleaner.HtmlCleaner;
- import org.htmlcleaner.TagNode;
- import org.json.JSONException;
- import org.json.JSONObject;
- import org.json.XML;
- import org.w3c.dom.Document;
- import org.w3c.dom.NamedNodeMap;
- import org.w3c.dom.Node;
- import org.w3c.dom.NodeList;
- import com.google.gson.Gson;
- import com.google.gson.GsonBuilder;
- import com.google.gson.stream.JsonReader;
- import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
- import com.ikanow.infinit.e.data_model.store.config.source.SimpleTextCleanserPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.ManualTextExtractionSpecPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.MetadataSpecPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
- import com.ikanow.infinit.e.data_model.store.config.source.SourceRssConfigPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.Context;
- import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.metaField;
- import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
- import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
- import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
- import com.ikanow.infinit.e.harvest.HarvestContext;
- import com.ikanow.infinit.e.harvest.HarvestController;
- import com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser;
- import com.ikanow.infinit.e.harvest.extraction.document.file.XmlToMetadataParser;
- import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
- import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
- import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
- import com.ikanow.infinit.e.harvest.utils.ProxyManager;
- import com.mongodb.BasicDBList;
- /**
- * UnstructuredAnalysisHarvester
- *
- * Applies a source's unstructured analysis configuration to harvested documents:
- * manual text extraction and cleansing, plus metadata extraction via regex, xpath,
- * javascript, or streaming XML/JSON parsing. Supports both the new processing
- * pipeline interface and the legacy per-source executeHarvest() calls.
- */
- public class UnstructuredAnalysisHarvester {
-
- ///////////////////////////////////////////////////////////////////////////////////////////
-
- // NEW PROCESSING PIPELINE INTERFACE
- //TODO (INF-1922): Handle headers and footers
-
- public void setContext(HarvestContext context) {
- _context = context;
-
- //TODO: need to set up the javascript engine just once - can't do it here though
- // because this might be called before the SAH is setup...
- }
-
- // Transform the doc's text (go get it if necessary)
-
- public String doManualTextEnrichment(DocumentPojo doc, List<ManualTextExtractionSpecPojo> textExtractors, SourceRssConfigPojo feedConfig) throws IOException {
- String cachedFullText = null;
- // Map to the legacy format and then call the legacy code
- ArrayList<SimpleTextCleanserPojo> mappedTextExtractors = new ArrayList<SimpleTextCleanserPojo>(textExtractors.size());
- for (ManualTextExtractionSpecPojo textExtractor: textExtractors) {
- if (DocumentPojo.fullText_.equalsIgnoreCase(textExtractor.fieldName)) {
- boolean fullTextNeeded = (null == doc.getFullText()); // (check here so we can cache it)
- if (fullTextNeeded) {
- getRawTextFromUrlIfNeeded(doc, feedConfig);
- // (if transforming full text then grab the raw body from the URL if necessary)
- cachedFullText = doc.getFullText();
- }//TOTEST
- }
- SimpleTextCleanserPojo mappedTextExtractor = new SimpleTextCleanserPojo();
- mappedTextExtractor.setField(textExtractor.fieldName);
- mappedTextExtractor.setFlags(textExtractor.flags);
- mappedTextExtractor.setScript(textExtractor.script);
- mappedTextExtractor.setScriptlang(textExtractor.scriptlang);
- mappedTextExtractor.setReplacement(textExtractor.replacement);
- mappedTextExtractors.add(mappedTextExtractor);
- }
- this.cleanseText(mappedTextExtractors, doc);
-
- return cachedFullText;
- }
- //TESTED (fulltext_regexTests.json)
-
- public void processMetadataChain(DocumentPojo doc, List<MetadataSpecPojo> metadataFields, SourceRssConfigPojo feedConfig) throws IOException
- {
- // Map metadata list to a legacy meta format (they're really similar...)
- UnstructuredAnalysisConfigPojo.metaField mappedEl = new UnstructuredAnalysisConfigPojo.metaField();
- boolean textSet = false;
- for (MetadataSpecPojo meta: metadataFields) {
- mappedEl.fieldName = meta.fieldName;
- mappedEl.context = Context.All;
- mappedEl.flags = meta.flags;
- if (null == mappedEl.flags) {
- mappedEl.flags = "";
- }
- if (mappedEl.flags.isEmpty() || mappedEl.flags.contains("t")) {
- if (!textSet) {
- getRawTextFromUrlIfNeeded(doc, feedConfig);
- textSet = true;
- }
- }//TESTED (content_needed_test)
-
- mappedEl.scriptlang = meta.scriptlang;
- mappedEl.script = meta.script;
- mappedEl.replace = meta.replace;
- mappedEl.groupNum = null;
- //(no group num - just use replace, and flags "o" for xpath/gN:-1)
-
- this.processMeta(doc, mappedEl, doc.getFullText(), null, null);
- }
- //TODO (INF-1922) (store/index)
- }
- //TESTED (fulltext_regexTests.json)
-
- ///////////////////////////////////////////////////////////////////////////////////////////
-
- // PROCESSING PIPELINE - UTILITIES
-
- public void getRawTextFromUrlIfNeeded(DocumentPojo doc, SourceRssConfigPojo feedConfig) throws IOException {
- if (null != doc.getFullText()) { // Nothing to do
- return;
- }
- Scanner s = null;
- try {
- URL url = new URL(doc.getUrl());
- URLConnection urlConnect = null;
- if (null != feedConfig) {
- urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
- if (null != feedConfig.getUserAgent()) {
- urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
- }// TESTED
- if (null != feedConfig.getHttpFields()) {
- for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
- urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
- }
- }//TOTEST
- }
- else {
- urlConnect = url.openConnection();
- }
- InputStream urlStream = null;
- try {
- urlStream = urlConnect.getInputStream();
- }
- catch (Exception e) { // Try one more time, this time exception out all the way
- if (null != feedConfig) {
- urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
- if (null != feedConfig.getUserAgent()) {
- urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
- }// TESTED
- if (null != feedConfig.getHttpFields()) {
- for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
- urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
- }
- }//TESTED
- }
- else {
- urlConnect = url.openConnection();
- }
- urlStream = urlConnect.getInputStream();
- }
- s = new Scanner(urlStream, "UTF-8");
- s.useDelimiter("\\A"); // (single token == the entire stream)
- doc.setFullText(s.hasNext() ? s.next() : ""); // (hasNext() guards against NoSuchElementException on an empty stream)
- }
- finally { //(release resources)
- if (null != s) {
- s.close();
- }
- }
-
- }//TESTED (cut-and-paste from existing code, so new testing very cursory)
-
- ///////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////
- ///////////////////////////////////////////////////////////////////////////////////////////
-
- // LEGACY CODE - USE TO SUPPORT OLD CODE FOR NOW + AS UTILITY CODE FOR THE PIPELINE LOGIC
-
- // Per-source state
- private Pattern headerPattern = null;
- private Pattern footerPattern = null;
- private UnstructuredAnalysisConfigPojo savedUap = null;
- // Javascript handling, if needed
- private ScriptEngineManager factory = null;
- private ScriptEngine engine = null;
- private static String parsingScript = null;
- // Using Tika to process documents:
- TextExtractorTika tikaExtractor = null;
-
- private HarvestContext _context = null;
- private Logger logger = Logger
- .getLogger(UnstructuredAnalysisHarvester.class);
- // (some web scraping may be needed)
- private long nBetweenDocs_ms = -1;
- // (set this in executeHarvest - makes it easy to set it only once in the per-doc version called in bulk from the SAH)
- // Ensure we don't get a long list of duplicates for commonly occurring words
- private HashSet<String> regexDuplicates = null;
- private HtmlCleaner cleaner = null;
-
- //if the sah already init'd an engine we'll just use it
- private ScriptEngine _sahEngine = null;
- private JavascriptSecurityManager securityManager = null;
-
- /**
- * Default Constructor
- */
- public UnstructuredAnalysisHarvester() {
- }
- // For harvest pipeline, just ensures duplicate map exists and is empty for each doc
- public void resetForNewDoc() {
- if ((null == regexDuplicates) || (!regexDuplicates.isEmpty())) {
- regexDuplicates = new HashSet<String>();
- }
- }
-
- /**
- * executeHarvest(SourcePojo source, List<DocumentPojo> feeds)
- *
- * @param source
- * @param feeds
- * @return List<DocumentPojo>
- */
- public List<DocumentPojo> executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> documents)
- {
- nBetweenDocs_ms = -1;
- // Can override the default (feed) wait time from within the source (eg
- // for sites that we know don't get upset about getting hammered)
- if (null != source.getRssConfig()) {
- if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
- nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
- }
- }
- if (-1 == nBetweenDocs_ms) {
- PropertiesManager props = new PropertiesManager();
- nBetweenDocs_ms = props.getWebCrawlWaitTime();
- }
- // TESTED: default and overridden values
- _context = contextController;
- UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();
- if (uap != null) {
- boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed");
- String headerRegEx = uap.getHeaderRegEx();
- String footerRegEx = uap.getFooterRegEx();
- List<metaField> meta = uap.getMeta();
- if (headerRegEx != null)
- headerPattern = createRegex(headerRegEx, uap.getHeaderRegExFlags());
- if (footerRegEx != null)
- footerPattern = createRegex(footerRegEx, uap.getFooterRegExFlags());
- Iterator<DocumentPojo> it = documents.iterator();
- int nDocs = 0;
- while (it.hasNext()) {
- nDocs++;
- DocumentPojo d = it.next();
- regexDuplicates = new HashSet<String>();
- cleaner = null;
- // For feeds we may need to fetch the document text manually here,
- // which is a bit ugly since the full text extractor may then
- // fetch the same data again later
- boolean bFetchedUrl = false;
- if (bGetRawDoc && (null == d.getFullText())) {
- if (null == source.getRssConfig()) {
- source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
- }
- // (first time through, sleep following a URL/RSS access)
- if ((1 == nDocs) && (null != source.getUrl())) { // (have already made a call to the RSS (or "searchConfig") URL)
- try {
- Thread.sleep(nBetweenDocs_ms);
- } catch (InterruptedException e) {
- }
- }
- // TESTED (first time only, correct value after searchConfig override)
- try {
- if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
- // Special case: if tika enabled then do that first
- if (null == tikaExtractor) {
- tikaExtractor = new TextExtractorTika();
- }
- tikaExtractor.extractText(d); // (run extraction for every doc - the null check above only guards construction)
- }
- else {
- this.getRawTextFromUrlIfNeeded(d, source.getRssConfig());
- }
- bFetchedUrl = true;
-
- } catch (Exception e) { // Failed to get full text twice, remove doc
- contextController.handleExtractError(e, source); //handle extractor error if need be
- it.remove();
- d.setTempSource(null); // (can safely corrupt this doc since it's been removed)
- continue;
- }
- }
- long nTime_ms = System.currentTimeMillis();
- // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
- try {
- processBody(d, meta, true, source, uap);
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("processBody1: " + e.getMessage(), e);
- }
- try {
- if (uap.getSimpleTextCleanser() != null) {
- cleanseText(uap.getSimpleTextCleanser(), d);
- }
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("cleanseText: " + e.getMessage(), e);
- }
- try {
- processHeader(headerPattern, d, meta, source, uap);
- processFooter(footerPattern, d, meta, source, uap);
-
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("header/footerPattern: " + e.getMessage(), e);
- }
- try {
- processBody(d, meta, false, source, uap);
-
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("processBody2: " + e.getMessage(), e);
- }
- if (it.hasNext() && bFetchedUrl) {
- nTime_ms = nBetweenDocs_ms
- - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
- if (nTime_ms > 0) {
- try {
- Thread.sleep(nTime_ms);
- } catch (InterruptedException e) {
- }
- }
- } // (end politeness delay for URL fetching from a single source (likely a single site))
- }
- return documents;
- }
- return new ArrayList<DocumentPojo>();
- }
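- // (illustrative sketch of the wait-time resolution above, not additional logic:
- //   long wait = (src.getRssConfig() != null && src.getRssConfig().getWaitTimeOverride_ms() != null)
- //       ? src.getRssConfig().getWaitTimeOverride_ms()
- //       : new PropertiesManager().getWebCrawlWaitTime();
- //  the actual per-doc sleep is then max(0, wait - processingTime_ms))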
- /**
- * executeHarvest For single-feed calls (note exception handling happens in
- * SAH)
- *
- * @param source
- * @param doc
- * @return
- * @throws ExtractorDocumentLevelException
- */
- public boolean executeHarvest(HarvestContext context, SourcePojo source, DocumentPojo doc, boolean bFirstTime, boolean bMoreDocs) throws ExtractorDocumentLevelException
- {
- regexDuplicates = new HashSet<String>();
- cleaner = null;
- boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed")
- && (null == doc.getFullText());
- // (ie don't have full text and will need to go fetch it from network)
- if (bFirstTime) {
- nBetweenDocs_ms = -1; // (reset eg between searchConfig and SAH)
- }
- if ((-1 == nBetweenDocs_ms) && bGetRawDoc && (bMoreDocs || bFirstTime)) { // (don't bother if not using it...)
- // Can override the default (feed) wait time from within the source
- // (eg for sites that we know
- // don't get upset about getting hammered)
- if (null != source.getRssConfig()) {
- if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
- nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
- }
- }
- if (-1 == nBetweenDocs_ms) { // (ie not overridden so use default)
- PropertiesManager props = new PropertiesManager();
- nBetweenDocs_ms = props.getWebCrawlWaitTime();
- }
- } // TESTED (overridden and using system default)
- _context = context;
- UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();
- int nChanges = 0;
- if (null != doc.getMetaData()) {
- nChanges = doc.getMetaData().size();
- }
- boolean bFetchedUrl = false;
- if (bGetRawDoc) {
- if (null == source.getRssConfig()) {
- source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
- }
- try {
- // Workaround for observed twitter bug (first access after the
- // RSS was gzipped)
- if (bFirstTime) {
- // (first time through, sleep following a URL/RSS access)
- if (null != source.getUrl()) { // (have already made a call to the RSS (or "searchConfig") URL)
- try {
- Thread.sleep(nBetweenDocs_ms);
- } catch (InterruptedException e) {
- }
- }
- // TESTED
- }
-
- if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
- // Special case: if tika enabled then do that first
- if (null == tikaExtractor) {
- tikaExtractor = new TextExtractorTika();
- }
- tikaExtractor.extractText(doc); // (run extraction for every doc - the null check above only guards construction)
- }
- else {
- getRawTextFromUrlIfNeeded(doc, source.getRssConfig());
- }
- bFetchedUrl = true;
-
- } catch (Exception e) { // Failed to get full text twice... remove doc and carry on
- throw new ExtractorDocumentLevelException(e.getMessage());
- }
- }
- long nTime_ms = System.currentTimeMillis();
- // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
- if (uap != null) {
- List<metaField> meta = uap.getMeta();
- if (savedUap != uap) {
- String headerRegEx = uap.getHeaderRegEx();
- String footerRegEx = uap.getFooterRegEx();
- if (headerRegEx != null)
- headerPattern = Pattern.compile(headerRegEx, Pattern.DOTALL);
- if (footerRegEx != null)
- footerPattern = Pattern.compile(footerRegEx, Pattern.DOTALL);
- savedUap = uap;
- }
- try {
- processBody(doc, meta, true, source, uap);
-
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("processBody1: " + e.getMessage(), e);
- }
- try {
- if (uap.getSimpleTextCleanser() != null) {
- cleanseText(uap.getSimpleTextCleanser(), doc);
- }
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("cleanseText: " + e.getMessage(), e);
- }
- try {
- processHeader(headerPattern, doc, meta, source, uap);
- processFooter(footerPattern, doc, meta, source, uap);
-
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("header/footerPattern: " + e.getMessage(), e);
- }
- try {
- processBody(doc, meta, false, source, uap);
-
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
- //DEBUG (don't output log messages per doc)
- //logger.error("processBody2: " + e.getMessage(), e);
- }
- }
- if (bMoreDocs && bFetchedUrl) {
- nTime_ms = nBetweenDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
- if (nTime_ms > 0) {
- try {
- Thread.sleep(nTime_ms);
- } catch (InterruptedException e) {
- }
- }
- } // (end politeness delay for URL fetching from a single source (likely a single site))
- if (null != doc.getMetaData()) {
- if (nChanges != doc.getMetaData().size()) {
- return true;
- }
- }
- return false;
- }
- /**
- * processHeader
- *
- * @param headerPattern
- * @param f
- * @param meta
- */
- private void processHeader(Pattern headerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
- {
- if (headerPattern != null) {
- Matcher headerMatcher = headerPattern.matcher(f.getFullText());
- String headerText = null;
- while (headerMatcher.find()) {
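- // (a header match only counts if it is anchored at the very start of the document)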
- if (headerMatcher.start() == 0) {
- headerText = headerMatcher.group(0);
- f.setHeaderEndIndex(headerText.length());
- for (int i = 1; i < headerMatcher.groupCount() + 1; i++) {
- f.addToHeader(headerMatcher.group(i).trim());
- }
- break;
- }
- }
- if (null != headerText && null != meta) {
- for (metaField m : meta) {
- if (m.context == Context.Header || m.context == Context.All) {
- this.processMeta(f, m, headerText, source, uap);
- }
- }
- }
- }
- }
- /**
- * processFooter
- *
- * @param footerPattern
- * @param f
- * @param meta
- */
- private void processFooter(Pattern footerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
- {
- if (footerPattern != null) {
- Matcher footerMatcher = footerPattern.matcher(f.getFullText());
- String footerText = null;
- while (footerMatcher.find()) {
- footerText = footerMatcher.group(0);
- int docLength = f.getFullText().length();
- f.setFooterStartIndex(docLength - footerMatcher.group(0).length());
- for (int i = 1; i < footerMatcher.groupCount() + 1; i++) {
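- // (note: footer groups are appended via addToHeader - this mirrors processHeader and looks like a copy/paste holdover)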
- f.addToHeader(footerMatcher.group(i).trim());
- }
- break;
- }
- if (null != footerText && null != meta) {
- for (metaField m : meta) {
- if (m.context == Context.Footer || m.context == Context.All) {
- this.processMeta(f, m, footerText, source, uap);
- }
- }
- }
- }
- }
- /**
- * processBody
- *
- * @param f
- * @param meta
- */
- private void processBody(DocumentPojo f, List<metaField> meta, boolean bPreCleansing, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
- {
- if (null != meta) {
- for (metaField m : meta) {
- if ((bPreCleansing && (m.context == Context.First))
- || (!bPreCleansing && (m.context == Context.Body || m.context == Context.All))) {
- String toProcess = f.getBody();
- if (toProcess == null)
- toProcess = f.getDescription();
- if (null != toProcess) {
- this.processMeta(f, m, toProcess, source, uap);
- }
- }
- }
- }
- }
- /**
- * processMeta - handle an individual field
- */
- private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source, UnstructuredAnalysisConfigPojo uap) {
- boolean bAllowDuplicates = false;
- if ((null != m.flags) && m.flags.contains("U")) {
- bAllowDuplicates = true;
- }
- if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {
- Pattern metaPattern = createRegex(m.script, m.flags);
- int timesToRun = 1;
- Object[] currField = null;
- if ((null != m.flags) && m.flags.contains("c")) {
- currField = f.getMetadata().get(m.fieldName);
- }
- if (null != currField) { // chained metadata
- timesToRun = currField.length;
- text = (String)currField[0];
- }//TESTED
- Matcher matcher = metaPattern.matcher(text);
- LinkedList<String> Llist = null;
-
- for (int ii = 0; ii < timesToRun; ++ii) {
- if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
- text = (String)currField[ii];
- matcher = metaPattern.matcher(text);
- }//TESTED
-
- StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
- int nFieldNameLen = m.fieldName.length() + 1;
-
- try {
- while (matcher.find()) {
- if (null == Llist) {
- Llist = new LinkedList<String>();
- }
- if (null == m.groupNum) {
- m.groupNum = 0;
- }
- String toAdd = matcher.group(m.groupNum);
- if (null != m.replace) {
- toAdd = metaPattern.matcher(toAdd).replaceFirst(
- m.replace);
- }
- if ((null != m.flags) && m.flags.contains("H")) {
- toAdd = StringEscapeUtils.unescapeHtml(toAdd);
- }
- prefix.setLength(nFieldNameLen);
- prefix.append(toAdd);
- String dupCheck = prefix.toString();
-
- if (!regexDuplicates.contains(dupCheck)) {
- Llist.add(toAdd);
- if (!bAllowDuplicates) {
- regexDuplicates.add(dupCheck);
- }
- }
- }
- } catch (Exception e) {
- this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
- }
- }//(end metadata chaining handling)
- if (null != Llist) {
- if (null != currField) { // (overwrite)
- f.getMetadata().put(m.fieldName, Llist.toArray());
- }
- else {
- f.addToMetadata(m.fieldName, Llist.toArray());
- }
- }//TESTED
- }
- else if (m.scriptlang.equalsIgnoreCase("javascript"))
- {
- if (null == f.getMetadata()) {
- f.setMetadata(new LinkedHashMap<String, Object[]>());
- }
- //set the script engine up if necessary
- if ((null != source) && (null != uap)) {
- //(these are null if called from new processing pipeline vs legacy code)
- intializeScriptEngine(source, uap);
- }
-
- try
- {
- //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
- // (also should be able to use SAH _document object I think?)
-
- // Javascript: depending on m.flags, pass the user's script "text", "document", and/or "_metadata" (see below)
- Object[] currField = f.getMetadata().get(m.fieldName);
- if ((null == m.flags) || m.flags.isEmpty()) {
- if (null == currField) {
- engine.put("text", text);
- engine.put("_iterator", null);
- }
- //(otherwise will just pass the current fields in there)
- }
- else { // flags specified
- if (m.flags.contains("t")) { // text
- engine.put("text", text);
- }
- if (m.flags.contains("d")) { // entire document (minus ents and assocs)
- GsonBuilder gb = new GsonBuilder();
- Gson g = gb.create();
- List<EntityPojo> ents = f.getEntities();
- List<AssociationPojo> assocs = f.getAssociations();
- try {
- f.setEntities(null);
- f.setAssociations(null);
- engine.put("document", g.toJson(f));
- securityManager.eval(engine, JavaScriptUtils.initScript);
- }
- finally {
- f.setEntities(ents);
- f.setAssociations(assocs);
- }
- }
- if (m.flags.contains("m")) { // metadata
- GsonBuilder gb = new GsonBuilder();
- Gson g = gb.create();
- engine.put("_metadata", g.toJson(f.getMetadata()));
- securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
- }
- }//(end flags processing)
-
- if (null != currField) {
- f.getMetadata().remove(m.fieldName);
-
- GsonBuilder gb = new GsonBuilder();
- Gson g = gb.create();
- engine.put("_iterator", g.toJson(currField));
- securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
- }
- //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)
- Object returnVal = securityManager.eval(engine, m.script);
- if (null != returnVal) {
- if (returnVal instanceof String) { // The only easy case
- Object[] array = new Object[1];
- if ((null != m.flags) && m.flags.contains("H")) {
- returnVal = StringEscapeUtils.unescapeHtml((String)returnVal);
- }
- array[0] = returnVal;
- f.addToMetadata(m.fieldName, array);
- } else { // complex object or array - in either case the engine turns these into
- // internal.NativeArray or internal.NativeObject
-
- BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
- f.addToMetadata(m.fieldName, outList.toArray());
- }
- }
- } catch (ScriptException e) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- // Just do nothing and log
- // e.printStackTrace();
- //DEBUG (don't output log messages per doc)
- //logger.error(e.getMessage());
- } catch (Exception e) {
-
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- // Just do nothing and log
- // e.printStackTrace();
- //DEBUG (don't output log messages per doc)
- //logger.error(e.getMessage());
- }
- } else if (m.scriptlang.equalsIgnoreCase("xpath")) {
- String xpath = m.script;
-
- try {
- createHtmlCleanerIfNeeded();
- int timesToRun = 1;
- Object[] currField = null;
- if ((null != m.flags) && m.flags.contains("c")) {
- currField = f.getMetadata().get(m.fieldName);
- }
- if (null != currField) { // chained metadata
- f.getMetadata().remove(m.fieldName); // (so will add to the end)
- timesToRun = currField.length;
- text = (String)currField[0];
- }//TESTED
- for (int ii = 0; ii < timesToRun; ++ii) {
- if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
- text = (String)currField[ii];
- }//TESTED
-
- TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));
-
- // New code: use the html cleaner only for cleansing,
- // and JAXP for the full XPath library
- Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
-
-
- String extraRegex = extractRegexFromXpath(xpath);
-
- if (extraRegex != null)
- xpath = xpath.replace(extraRegex, "");
-
- XPath xpa = XPathFactory.newInstance().newXPath();
- NodeList res = (NodeList)xpa.evaluate(xpath, doc, XPathConstants.NODESET);
-
- if (res.getLength() > 0)
- {
- if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
- m.groupNum = -1; // (see bConvertToObject below)
- }
- StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
- int nFieldNameLen = m.fieldName.length() + 1;
- ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
- boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
- boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
- for (int i= 0; i< res.getLength(); i++)
- {
- Node info_node = res.item(i);
- if ((null != m.flags) && (m.flags.contains("g"))) {
- Llist.add(parseHtmlTable(info_node, m.replace));
- }
- else if (bConvertToObject || convertToXml) {
- // Try to create a JSON object out of this
- StringWriter writer = new StringWriter();
- try {
- Transformer transformer = TransformerFactory.newInstance().newTransformer();
- transformer.transform(new DOMSource(info_node), new StreamResult(writer));
- } catch (TransformerException e1) {
- continue;
- }
-
- if (bConvertToObject) {
- try {
- JSONObject subObj = XML.toJSONObject(writer.toString());
- if (xpath.endsWith("*")) { // (can have any number of different names here)
- Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
- }//TESTED
- else {
- String[] rootNames = JSONObject.getNames(subObj);
- if (1 == rootNames.length) {
- // (don't think it can be any other number in fact)
- subObj = subObj.getJSONObject(rootNames[0]);
- }
- boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
- Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj, bUnescapeHtml));
- }//TESTED
- }
- catch (JSONException e) { // Just carry on
- continue;
- }
- //TESTED
- }
- else { // leave in XML form
- Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
- }//TESTED (xpath_test.json)
- }
- else { // Treat this as string, either directly or via regex
- String info = info_node.getTextContent().trim();
- if (extraRegex == null || extraRegex.isEmpty()) {
- prefix.setLength(nFieldNameLen);
- prefix.append(info);
- String dupCheck = prefix.toString();
-
- if (!regexDuplicates.contains(dupCheck)) {
- if ((null != m.flags) && m.flags.contains("H")) {
- info = StringEscapeUtils.unescapeHtml(info);
- }
- Llist.add(info);
- if (!bAllowDuplicates) {
- regexDuplicates.add(dupCheck);
- }
- }
- }
- else { // Apply regex to the string
- Pattern dataRegex = createRegex(extraRegex, m.flags);
- Matcher dataMatcher = dataRegex.matcher(info);
- boolean result = dataMatcher.find();
- while (result) {
- String toAdd;
- if (m.groupNum != null)
- toAdd = dataMatcher.group(m.groupNum);
- else
- toAdd = dataMatcher.group();
- prefix.setLength(nFieldNameLen);
- prefix.append(toAdd);
- String dupCheck = prefix.toString();
-
- if (!regexDuplicates.contains(dupCheck)) {
- if ((null != m.flags) && m.flags.contains("H")) {
- toAdd = StringEscapeUtils.unescapeHtml(toAdd);
- }
- Llist.add(toAdd);
- if (!bAllowDuplicates) {
- regexDuplicates.add(dupCheck);
- }
- }
-
- result = dataMatcher.find();
- }
- }//(regex vs no regex)
- }//(end string vs object)
- }
- if (Llist.size() > 0) {
- f.addToMetadata(m.fieldName, Llist.toArray());
- }
- }
- }//(end loop over metadata objects if applicable)
- } catch (IOException ioe) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(), true);
- // Just do nothing and log
- //DEBUG (don't output log messages per doc)
- //logger.error(ioe.getMessage());
- } catch (ParserConfigurationException e1) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
- // Just do nothing and log
- //DEBUG (don't output log messages per doc)
- //logger.error(e1.getMessage());
- } catch (XPathExpressionException e1) {
- _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
- }
- }
- else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
- // which one?
- try {
- boolean json = false;
- boolean xml = false;
- for (int i = 0, n = Math.min(128, text.length()); i < n; ++i) { // (bounded, so short texts don't throw StringIndexOutOfBoundsException)
- if ('<' == text.charAt(i)) {
- xml = true;
- break;
- }
- if ('{' == text.charAt(i)) {
- json = true;
- break;
- }
- if (!Character.isSpaceChar(text.charAt(i))) {
- break;
- }
- }//TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)
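- // (ie content type is sniffed from the first non-space character: '<' => XML,
- //  '{' => JSON, anything else within the first 128 chars => neither parser runs)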
-
- List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
- if (xml) {
- XmlToMetadataParser parser = new XmlToMetadataParser(Arrays.asList(m.script.split("\\s*,\\s*")), null, null, null, null, null, Integer.MAX_VALUE);
- XMLInputFactory factory = XMLInputFactory.newInstance();
- factory.setProperty(XMLInputFactory.IS_COALESCING, true);
- factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
- XMLStreamReader reader = null;
- try {
- reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
- docs = parser.parseDocument(reader, true);
- }
- finally {
- if (null != reader) reader.close();
- }
- }//TESTED (meta_stream_test, test1)
- if (json) {
- JsonReader jsonReader = null;
- try {
- JsonToMetadataParser parser = new JsonToMetadataParser(null, Arrays.asList(m.script.split("\\s*,\\s*")), null, null, Integer.MAX_VALUE);
- jsonReader = new JsonReader(new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
- jsonReader.setLenient(true);
- docs = parser.parseDocument(jsonReader, true);
- }
- finally {
- if (null != jsonReader) jsonReader.close();
- }
- }//TESTED (meta_stream_test test2)
-
- if (!docs.isEmpty()) {
- ArrayList<String> Llist = new ArrayList<String>(docs.size());
- for (DocumentPojo doc: docs) {
- if (null != doc.getFullText()) {
- Llist.add(doc.getFullText());
- }
- }
- if (Llist.size() > 0) {
- f.addToMetadata(m.fieldName, Llist.toArray());
- }
- }//TESTED (meta_stream_test test1,test2)
- }//(end try)
- catch (Exception e) { // various parsing errors
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- }
- }//TESTED (meta_stream_test)
-
- // (don't currently support other script types)
- }
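- // Illustrative regex example for processMeta (hedged - the field values below are
- // hypothetical, not from a shipped source config):
- //   metaField m: fieldName = "ticket", scriptlang = "regex",
- //                script = "TICKET-(\\d+)", groupNum = 1, flags = "i"
- //   processMeta(doc, m, "See TICKET-17, then ticket-17 again", source, uap)
- //   => doc.metadata.ticket == ["17"] - the second match is suppressed via the
- //      regexDuplicates set unless flags also contains "U"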
- private static String extractRegexFromXpath(String original_xpath) {
- Pattern addedRegex = Pattern.compile("regex\\(.*\\)\\s*$", Pattern.MULTILINE | Pattern.DOTALL);
- Matcher matcher = addedRegex.matcher(original_xpath);
- boolean matchFound = matcher.find();
- if (matchFound) {
- try {
- return matcher.group();
- } catch (Exception e) {
- return null;
- }
- }
- return null;
- }
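- // (returns the trailing "regex(...)" expression from an xpath that ends with one,
- //  or null if none is present; the caller strips it from the xpath and applies it
- //  to each node's text content - note the matched string, wrapper included, is
- //  what gets compiled by createRegex)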
- /**
- * cleanseText
- *
- * @param source
- * @param documents
- * @return
- */
- private void cleanseText(List<SimpleTextCleanserPojo> simpleTextCleanser, DocumentPojo document)
- {
- // Buffers for fields that may be re-generated by concatenation (see the "+" flag below)
- StringBuffer fullTextBuilder = null;
- StringBuffer descriptionBuilder = null;
- StringBuffer titleBuilder = null;
- // (note no support for metadata concatenation, replace only)
-
- // Iterate over the cleanser functions that need to run on each feed
- for (SimpleTextCleanserPojo s : simpleTextCleanser) {
- boolean bConcat = (null != s.getFlags()) && s.getFlags().contains("+");
-
- boolean bUsingJavascript = ((null != s.getScriptlang()) && s.getScriptlang().equalsIgnoreCase("javascript"));
- if (s.getField().equalsIgnoreCase("fulltext")) {
- if ((null != document.getFullText()) || bUsingJavascript) {
- StringBuffer myBuilder = fullTextBuilder;
-
- if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
- document.setFullText(myBuilder.toString());
- myBuilder.setLength(0);
- } //TESTED
-
- String res = cleanseField(document.getFullText(),
- s.getScriptlang(), s.getScript(), s.getFlags(),
- s.getReplacement(), document);
- if (bConcat) {
- if (null == myBuilder) {
- fullTextBuilder = myBuilder = new StringBuffer();
- }
- myBuilder.append(res).append('\n');
- }
- else {
- document.setFullText(res);
- }
- }
- } //TESTED
- else if (s.getField().equalsIgnoreCase("description")) {
- if ((null != document.getDescription()) || bUsingJavascript) {
- StringBuffer myBuilder = descriptionBuilder;
-
- if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
- document.setDescription(myBuilder.toString());
- myBuilder.setLength(0);
- } //TESTED
-
- String res = cleanseField(document.getDescription(),
- s.getScriptlang(), s.getScript(), s.getFlags(),
- s.getReplacement(), document);
-
- if (bConcat) {
- if (null == myBuilder) {
- descriptionBuilder = myBuilder = new StringBuffer();
- }
- myBuilder.append(res).append('\n');
- }
- else {
- document.setDescription(res);
- }
- }
- } //TESTED
- else if (s.getField().equalsIgnoreCase("title")) {
- if ((null != document.getTitle()) || bUsingJavascript) {
- StringBuffer myBuilder = titleBuilder;
-
- if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
- document.setTitle(myBuilder.toString());
- myBuilder.setLength(0);
- } //TESTED
-
- String res = cleanseField(document.getTitle(),
- s.getScriptlang(), s.getScript(), s.getFlags(),
- s.getReplacement(), document);
- if (bConcat) {
- if (null == myBuilder) {
- titleBuilder = myBuilder = new StringBuffer();
- }
- myBuilder.append(res).append('\n');
- }
- else {
- document.setTitle(res);
- }
- }
- } //TESTED
- else if (s.getField().startsWith("metadata.")) {
- // (note no support for metadata concatenation, replace only)
- String metaField = s.getField().substring(9); // (9 == "metadata.".length())
- Object[] meta = document.getMetadata().get(metaField);
- if ((null != meta) && (meta.length > 0)) {
- Object[] newMeta = new Object[meta.length];
- for (int i = 0; i < meta.length; ++i) {
- Object metaValue = meta[i];
- if (metaValue instanceof String) {
- newMeta[i] = (Object) cleanseField(
- (String) metaValue, s.getScriptlang(),
- s.getScript(), s.getFlags(),
- s.getReplacement(), document);
- } else {
- newMeta[i] = metaValue;
- }
- }
- // Overwrite the old fields
- document.addToMetadata(metaField, newMeta);
- }
- }
- // These fields are sufficient for the moment
-
- } // (end loop over fields)
-
- // Handle any left over cases:
- if ((null != fullTextBuilder) && (fullTextBuilder.length() > 0)) {
- document.setFullText(fullTextBuilder.toString());
- } //TESTED
- if ((null != descriptionBuilder) && (descriptionBuilder.length() > 0)) {
- document.setDescription(descriptionBuilder.toString());
- } //TESTED
- if ((null != titleBuilder) && (titleBuilder.length() > 0)) {
- document.setTitle(titleBuilder.toString());
- } //TESTED
-
- }// TESTED
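- // (note on the "+" flag above: when successive cleanser entries target the same
- //  field and have flags containing "+", their results are accumulated in a
- //  per-field StringBuffer joined with '\n' instead of overwriting the field; the
- //  buffered value is written back in the "left over cases" block at the end)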
- /**
- * cleanseField
- *
- * @param field
- * @param script
- * @param replaceWith
- */
- private String cleanseField(String field, String scriptLang, String script,
- String flags, String replaceWith, DocumentPojo f)
- {
- if ((null == scriptLang) || scriptLang.equalsIgnoreCase("regex")) {
- if (null == flags) {
- return field.replaceAll(script, replaceWith);
- }
- else {
- if (flags.contains("H")) { // HTML decode
- return StringEscapeUtils.unescapeHtml(createRegex(script,flags).matcher(field).replaceAll(replaceWith));
- } else {
- return createRegex(script, flags).matcher(field).replaceAll(replaceWith);
- }
- }
- }
- else if (scriptLang.equalsIgnoreCase("xpath")) {
-
- try {
- createHtmlCleanerIfNeeded();
- TagNode node = cleaner.clean(new ByteArrayInputStream(field.getBytes()));
- Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
- XPath xpa = XPathFactory.newInstance().newXPath();
-
- NodeList res = (NodeList)xpa.evaluate(script, doc, XPathConstants.NODESET);
- if (0 == res.getLength()) { // No match: just return "" - unlike the regex case we don't want anything if we don't match
- return "";
- }
- else {
- StringBuffer sb = new StringBuffer();
- for (int i= 0; i< res.getLength(); i++) {
- if (0 != i) {
- sb.append('\n');
- }
- Node info_node = res.item(i);
- if ((null != flags) && flags.contains("H")) { // HTML decode
- sb.append(StringEscapeUtils.unescapeHtml(info_node.getTextContent().trim()));
- }
- else if ((null != flags) && flags.contains("x")) { // Leave as XML string
- StringWriter writer = new StringWriter();
- try {
- Transformer transformer = TransformerFactory.newInstance().newTransformer();
- transformer.transform(new DOMSource(info_node), new StreamResult(writer));
- sb.append(writer.toString().substring(38)); // (step over the <?xml ... ?> declaration - see the metadata field extraction above)
- }
- catch (TransformerException e1) { // (do nothing just skip)
- }
- }
- else {
- sb.append(info_node.getTextContent().trim());
- }
- }
- return sb.toString();
- }//TESTED (xpath_test: object - multiple and single, text)
-
- } catch (IOException e) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- }
- catch (XPathExpressionException e) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- }
- catch (ParserConfigurationException e) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- }
- }
- else if (scriptLang.equalsIgnoreCase("javascript")) {
- try {
- SourcePojo src = f.getTempSource();
- intializeScriptEngine(src, src.getUnstructuredAnalysisConfig());
- // Setup input:
- if (null == flags) {
- flags = "t";
- }
- if (flags.contains("t")) { // text
- engine.put("text", field);
- }
- if (flags.contains("d")) { // entire document
- GsonBuilder gb = new GsonBuilder();
- Gson g = gb.create();
- List<EntityPojo> ents = f.getEntities();
- List<AssociationPojo> assocs = f.getAssociations();
- try {
- f.setEntities(null);
- f.setAssociations(null);
- engine.put("document", g.toJson(f));
- securityManager.eval(engine, JavaScriptUtils.initScript);
- }
- finally {
- f.setEntities(ents);
- f.setAssociations(assocs);
- }
- }
- if (flags.contains("m")) { // metadata
- GsonBuilder gb = new GsonBuilder();
- Gson g = gb.create();
- engine.put("_metadata", g.toJson(f.getMetadata()));
- securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
- }
- Object returnVal = securityManager.eval(engine, script);
- field = (String) returnVal; // (If not a string or is null then will exception out)
- if ((null != flags) && flags.contains("H") && (null != field)) { // HTML decode
- field = StringEscapeUtils.unescapeHtml(field);
- }
- }
- catch (Exception e) {
- _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
- // Just do nothing and log
- // e.printStackTrace();
- //DEBUG (don't output log messages per doc)
- //logger.error(e.getMessage());
- }
- }
- return field;
- }
-
- // Handles parsing of HTML tables to Objects that can be easily printed as JSON. (flag = g)
- // 1] No Replace Value - The first row of the table will be set as the headers
- // 2] Replace Value = "[]" - Headers will be set to the column count number (beginning with 0) eg "0","1"
- // 3a] Replace Value = "[one,two,three]" - The provided headers will be set as the headers
- // 3b] Replace Values set, but more data columns than values provided - Additional columns that were not
- // specified will be assigned their column count number. eg "specified","1","2"
- // 4] Replace Value = "[one,null,three]" - Columns specified as null in the provided header will be skipped.
- // eg "one","three"
-
- private static HashMap<String, Object> parseHtmlTable(Node table_node, String replaceWith)
- {
- if (table_node.getNodeName().equalsIgnoreCase("table") && table_node.hasChildNodes())
- {
- Node topNode = table_node;
-
- boolean tbody = table_node.getFirstChild().getNodeName().equalsIgnoreCase("tbody");
-
- if (tbody)
- topNode = table_node.getFirstChild();
-
- if (topNode.hasChildNodes())
- {
- NodeList rows = topNode.getChildNodes();
-
- List<String> headers = null;
- ArrayList<HashMap<String, String>> data = null;
- int headerLength = 0;
- boolean[] skip = null;
-
- if (null != replaceWith)
- {
- if (replaceWith.equals("[]")){
- headers = new ArrayList<String>();
- headerLength = 0;
- } // TESTED (by eye - 2)
- else
- {
- //Remove square brackets
- if(replaceWith.startsWith("[") && replaceWith.endsWith("]"))
- replaceWith = replaceWith.substring(1, replaceWith.length()-1);
- //Turn the provided list of headers into a list object
- headers = Arrays.asList(replaceWith.split("\\s*,\\s*"));
- headerLength = headers.size();
- skip = new boolean[headerLength];
- for(int h = 0; h < headerLength; h++)
- {
- String val = headers.get(h);
- if (val.length() == 0 || val.equalsIgnoreCase("null"))
- skip[h] = true;
- else
- skip[h] = false;
- }
-
- }// TESTED (by eye - 3a)
- }
-
- //traverse rows
- for(int i = 0; i < rows.getLength(); i++)
- {
- Node row = rows.item(i);
- if (row.getNodeName().equalsIgnoreCase("tr") || row.getNodeName().equalsIgnoreCase("th"))
- {
- //If the header value has not been set, the first row will be set as the headers
- if (null == headers)
- {
- //Traverse through cells
- headers = new ArrayList<String>();
- if (row.hasChildNodes())
- {
- NodeList cells = row.getChildNodes();
- headerLength = cells.getLength();
- skip = new boolean[headerLength];
- for (int j = 0; j < headerLength; j++)
- {
- headers.add(cells.item(j).getTextContent());
- skip[j] = false;
- }
- } // TESTED (by eye - 1)
- }
- else
- {
- if (null == data)
- {
- data = new ArrayList<HashMap<String,String>>();
- }
- if (row.hasChildNodes())
- {
- HashMap<String,String> cellList = new HashMap<String,String>();
- NodeList cells = row.getChildNodes();
- for (int j = 0; j < cells.getLength(); j++)
- {
- // Skip Code (TESTED by eye - 4)
- if (headerLength == 0 || (j < headerLength && skip[j] == false))
- {
- String key = Integer.toString(j); // TESTED (by eye - 3b)
- if (j < headerLength)
- key = headers.get(j);
-
- cellList.put(key, cells.item(j).getTextContent());
- }
- }
- data.add(cellList);
- }
-
- }
- }
- }
- //Create final hashmap containing attributes
- HashMap<String,Object> table_attrib = new HashMap<String, Object>();
-
- NamedNodeMap nnm = table_node.getAttributes();
- for (int i = 0; i < nnm.getLength(); i++)
- {
- Node att = nnm.item(i);
- table_attrib.put(att.getNodeName(), att.getNodeValue());
- }
- table_attrib.put("table", data);
-
- //TESTED (by eye) attributes added to table value
- // eg: {"id":"search","cellpadding":"1","table":[{"Status":"B","two":"ONE6313" ......
-
- return table_attrib;
- }
- }
- return null;
- }
- private static Pattern createRegex(String regEx, String flags) {
- int nflags = 0;
- if (null != flags) {
- for (int i = 0; i < flags.length(); ++i) {
- char c = flags.charAt(i);
- switch (c) {
- case 'm':
- nflags |= Pattern.MULTILINE;
- break;
- case 'i':
- nflags |= Pattern.CASE_INSENSITIVE;
- break;
- case 'd':
- nflags |= Pattern.DOTALL;
- break;
- case 'u':
- nflags |= Pattern.UNICODE_CASE;
- break;
- case 'n':
- nflags |= Pattern.UNIX_LINES;
- break;
- }
- }
- }
- return Pattern.compile(regEx, nflags);
- }
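- // (eg createRegex("^title:\\s*(.*)$", "mi") compiles with
- //  Pattern.MULTILINE | Pattern.CASE_INSENSITIVE, so "Title: A\ntitle: B" matches
- //  twice with group(1) == "A" then "B"; unrecognized flag chars - like the "H"/"o"/"c"
- //  processing flags used elsewhere - are simply ignored here)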
- // Utility to minimise the number of times the cleaner is created
-
- private void createHtmlCleanerIfNeeded()
- {
- if (null == cleaner) {
- cleaner = new HtmlCleaner();
- CleanerProperties props = cleaner.getProperties();
- props.setAllowHtmlInsideAttributes(true);
- props.setAllowMultiWordAttributes(true);
- props.setRecognizeUnicodeChars(true);
- props.setOmitComments(true);
- props.setTreatUnknownTagsAsContent(false);
- props.setTranslateSpecialEntities(true);
- props.setTransResCharsToNCR(true);
- props.setNamespacesAware(false);
- }
- }
- public void set_sahEngine(ScriptEngine _sahEngine) {
- this._sahEngine = _sahEngine;
- }
- public ScriptEngine get_sahEngine() {
- return _sahEngine;
- }
- public void set_sahSecurity(JavascriptSecurityManager _securityManager) {
- this.securityManager = _securityManager;
- }
- public JavascriptSecurityManager get_sahSecurity() {
- return securityManager;
- }
- ///////////////////////////////////////////////////
-
- // Javascript scripting utilities:
-
- public void intializeScriptEngine(SourcePojo source, UnstructuredAnalysisConfigPojo uap) {
- if ( null == engine )
- {
- //use the passed in sah one if possible
- if ( null != this.get_sahEngine())
- {
- engine = this.get_sahEngine();
- }
- else if (null == factory) //otherwise create our own
- {
- //set up the security manager
- securityManager = new JavascriptSecurityManager();
-
- factory = new ScriptEngineManager();
- engine = factory.getEngineByName("JavaScript");
- //grab any json cache and make it available to the engine
- }
- //once engine is created, do some initialization
- if ( null != engine )
- {
- if (null != source) {
- loadLookupCaches(uap.getCaches(), source.getCommunityIds());
- List<String> scriptFiles = null;
- if (null != uap.getScriptFiles()) {
- scriptFiles = Arrays.asList(uap.getScriptFiles());
- }
- loadGlobalFunctions(scriptFiles, uap.getScript());
- }
- if (null == parsingScript) {
- parsingScript = JavaScriptUtils.generateParsingScript();
- }
- try {
- securityManager.eval(engine, parsingScript);
- }
- catch (ScriptException e) { // Just do nothing and log
- e.printStackTrace();
- logger.error("intializeScriptEngine: " + e.getMessage());
- }
-
- }
- }//end start engine up
-
- }//TESTED (legacy + imports_and_lookup_test.json + imports_and_lookup_test_uahSah.json)
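- // (typical wiring, hedged - intended call order only: the SAH calls
- //  set_sahEngine(...)/set_sahSecurity(...) first so the UAH reuses its engine;
- //  otherwise this method builds its own ScriptEngineManager/engine, then loads
- //  lookup caches, script files, and the shared parsing script)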
-
- //////////////////////////////////////////////////////
-
- // Utilities that in legacy mode are called from the initializeScriptEngine, but can be called
- // standalone in the pipelined mode:
-
- public void loadLookupCaches(Map<String, ObjectId> caches, Set<ObjectId> communityIds)
- {
- try
- {
- if (null != caches) {
- CacheUtils.addJSONCachesToEngine(caches, engine, securityManager, communityIds, _context);
- }
- }
- catch (Exception ex)
- {
- _context.getHarvestStatus().logMessage("JSONcache: " + ex.getMessage(), true);
- //(no need to log this, appears in log under source -with URL- anyway):
- //logger.error("JSONcache: " + ex.getMessage(), ex);
- }
- }//TESTED (legacy + imports_and_lookup_test.json)
-
- public void loadGlobalFunctions(List<String> imports, String script)
- {
- // Pass scripts into the engine
- try
- {
- // Eval script passed in s.script
- if (script != null) securityManager.eval(engine, script);
-
- // Retrieve and eval script files in s.scriptFiles
- if (imports != null)
- {
- for (String file : imports)
- {
- securityManager.eval(engine, JavaScriptUtils.getJavaScriptFile(file));
- }
- }
- }
- catch (ScriptException e)
- {
- this._context.getHarvestStatus().logMessage("ScriptException: " + e.getMessage(), true);
- //DEBUG (only once per message, but should be spotted at the debug stage anyway)
- //logger.error("ScriptException: " + e.getMessage(), e);
- }
-
- }//TESTED (legacy + imports_and_lookup_test.json)
- }