
/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/store/config/source/SourcePojo.java

https://github.com/IKANOW/Infinit.e
Java | 840 lines | 644 code | 79 blank | 117 comment | 163 complexity | e12d5f0172eb8bcd32459aec83cc4f03 MD5
Possible License(s): Apache-2.0 (per the file header below)
  1. /*******************************************************************************
  2. * Copyright 2012 The Infinit.e Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.data_model.store.config.source;
  17. import java.io.UnsupportedEncodingException;
  18. import java.lang.reflect.Type;
  19. import java.security.MessageDigest;
  20. import java.security.NoSuchAlgorithmException;
  21. import java.util.Date;
  22. import java.util.HashSet;
  23. import java.util.LinkedHashMap;
  24. import java.util.List;
  25. import java.util.Map;
  26. import java.util.Set;
  27. import java.util.regex.Matcher;
  28. import java.util.regex.Pattern;
  29. import org.apache.commons.codec.binary.Base64;
  30. import org.bson.types.ObjectId;
  31. import com.google.gson.Gson;
  32. import com.google.gson.GsonBuilder;
  33. import com.google.gson.JsonDeserializationContext;
  34. import com.google.gson.JsonDeserializer;
  35. import com.google.gson.JsonElement;
  36. import com.google.gson.JsonParseException;
  37. import com.google.gson.JsonSerializationContext;
  38. import com.google.gson.JsonSerializer;
  39. import com.google.gson.reflect.TypeToken;
  40. import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
  41. import com.ikanow.infinit.e.data_model.store.social.authentication.AuthenticationPojo;
  42. import com.mongodb.BasicDBObject;
  43. /**
  44. * Class used to establish the source information for a feed;
  45. * it defines the data necessary to create a feed in the system.
  46. *
  47. * @author cmorgan
  48. *
  49. */
  50. public class SourcePojo extends BaseDbPojo {
  51. // Standard static function for readability
  52. @SuppressWarnings("unchecked")
  53. static public TypeToken<List<SourcePojo>> listType() { return new TypeToken<List<SourcePojo>>(){}; }
  54. /**
  55. * Private Class Variables
  56. */
  57. // Metadata fields
  58. private ObjectId _id = null;
  59. final public static String _id_ = "_id";
  60. private Date created = null;
  61. final public static String created_ = "created";
  62. private Date modified = null;
  63. final public static String modified_ = "modified";
  64. private String url = null;
  65. final public static String url_ = "url";
  66. private String title = null;
  67. final public static String title_ = "title";
  68. private Boolean isPublic = null; // if false then many fields are removed when viewed by non-owners/moderators/admins
  69. final public static String isPublic_ = "isPublic";
  70. private Boolean partiallyPublished = null; // if fields are removed based on isPublic then this is set to true
  71. final public static String partiallyPublished_ = "partiallyPublished";
  72. private ObjectId ownerId = null;
  73. final public static String ownerId_ = "ownerId";
  74. private String author = null;
  75. final public static String author_ = "author";
  76. private String mediaType = null;
  77. final public static String mediaType_ = "mediaType";
  78. private String key = null;
  79. final public static String key_ = "key";
  80. private String description = null;
  81. final public static String description_ = "description";
  82. private Set<String> tags = null;
  83. final public static String tags_ = "tags";
  84. private Set<ObjectId> communityIds = null;
  85. final public static String communityIds_ = "communityIds";
  86. private boolean isApproved = false;
  87. final public static String isApproved_ = "isApproved";
  88. private boolean harvestBadSource = false;
  89. final public static String harvestBadSource_ = "harvestBadSource";
  90. private String extractType = null; // (in pipeline mode, copied across from pipeline)
  91. final public static String extractType_ = "extractType";
  92. private String shah256Hash = null;
  93. final public static String shah256Hash_ = "shah256Hash";
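// (Note: "shah256Hash" is the field name as stored; the digest itself is SHA-256, see generateShah256Hash_internal() below.)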
  94. // Control fields used everywhere
  95. private Integer searchCycle_secs = null; // Determines the time between searches; if unset, the source is re-searched as quickly as the harvest can cycle
  96. // (in pipeline mode, copied across from pipeline)
  97. final public static String searchCycle_secs_ = "searchCycle_secs";
  98. private Integer distributionFactor;
  99. final public static String distributionFactor_ = "distributionFactor";
  100. public static class SourceSearchIndexFilter {
  101. public Boolean indexOnIngest = null; // (if specified and false, default:true, then don't index the docs at all)
  102. public String entityFilter = null; // (regex applied to entity indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
  103. public String assocFilter = null; // (regex applied to new-line separated association indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
  104. public String entityGeoFilter = null; // (regex applied to entity indexes if the entity has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
  105. public String assocGeoFilter = null; // (regex applied to new-line separated association indexes if the association has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
  106. public String fieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
  107. public String metadataFieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
  108. // temp:
  109. public transient Pattern entityFilterRegex;
  110. public transient Pattern assocFilterRegex;
  111. public transient Pattern entityGeoFilterRegex;
  112. public transient Pattern assocGeoFilterRegex;
  113. }
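// (Illustrative note: each filter above is a regex or field list whose optional leading "+" or "-" marks it as an
//  include or exclude filter; initSearchIndexFilter() below just strips that prefix and compiles the remainder,
//  e.g. a hypothetical entityFilter of "-test_entity/.*" would be compiled as the pattern "test_entity/.*".)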
  114. // PROCESSING PIPELINE
  115. private List<SourcePipelinePojo> processingPipeline;
  116. final public static String processingPipeline_ = "processingPipeline";
  117. // LEGACY CODE, IGNORED IN PROCESSING-PIPELINE MODE
  118. private SourceHarvestStatusPojo harvest = null;
  119. final public static String harvest_ = "harvest";
  120. private SourceDatabaseConfigPojo database = null;
  121. final public static String database_ = "database";
  122. private SourceNoSqlConfigPojo nosql = null;
  123. final public static String nosql_ = "nosql";
  124. private SourceFileConfigPojo file = null;
  125. final public static String file_ = "file";
  126. private SourceRssConfigPojo rss = null;
  127. final public static String rss_ = "rss";
  128. private AuthenticationPojo authentication = null;
  129. final public static String authentication_ = "authentication";
  130. private String useExtractor = null;
  131. final public static String useExtractor_ = "useExtractor";
  132. private String useTextExtractor = null;
  133. final public static String useTextExtractor_ = "useTextExtractor";
  134. private StructuredAnalysisConfigPojo structuredAnalysis = null;
  135. final public static String structuredAnalysis_ = "structuredAnalysis";
  136. private UnstructuredAnalysisConfigPojo unstructuredAnalysis = null;
  137. final public static String unstructuredAnalysis_ = "unstructuredAnalysis";
  138. private Integer maxDocs = null; // Limits the number of docs that can be stored for this source at any one time
  139. final public static String maxDocs_ = "maxDocs";
  140. private Integer throttleDocs = null; // Limits the number of docs that can be harvested in one cycle (cannot be higher than system setting in harvest.maxdocs_persource)
  141. final public static String throttleDocs_ = "throttleDocs";
  142. private Boolean duplicateExistingUrls; // If false (default: true), will ignore docs already harvested by other sources in the community
  143. final public static String duplicateExistingUrls_ = "duplicateExistingUrls";
  144. private Boolean appendTagsToDocs = null; // if true (default) source tags are appended to the document
  145. final public static String appendTagsToDocs_ = "appendTagsToDocs";
  146. private SourceSearchIndexFilter searchIndexFilter = null; // Optional, allows the source builder to configure which fields are searchable
  147. final public static String searchIndexFilter_ = "searchIndexFilter";
  148. private LinkedHashMap<String, String> extractorOptions = null; // Optional, overrides the per-extractor configuration options, where permissible
  149. final public static String extractorOptions_ = "extractorOptions";
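// (Convention note: each <fieldName>_ String constant mirrors the corresponding field name, presumably so that
//  DB queries (e.g. BasicDBObject keys) and JSON handling can reference fields without hardcoded strings.)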
  150. //////////////////////////////////////
  151. // Gets and sets
  152. public AuthenticationPojo getAuthentication() {
  153. return authentication;
  154. }
  155. public void setAuthentication(AuthenticationPojo authentication) {
  156. this.authentication = authentication;
  157. }
  158. public SourceFileConfigPojo getFileConfig() {
  159. return file;
  160. }
  161. public void setFileConfig(SourceFileConfigPojo file) {
  162. this.file = file;
  163. }
  164. public SourceRssConfigPojo getRssConfig() {
  165. return rss;
  166. }
  167. public void setRssConfig(SourceRssConfigPojo rss) {
  168. this.rss = rss;
  169. }
  170. public SourceDatabaseConfigPojo getDatabaseConfig() {
  171. return database;
  172. }
  173. public void setDatabaseConfig(SourceDatabaseConfigPojo database) {
  174. this.database = database;
  175. }
  176. public ObjectId getId() {
  177. return _id;
  178. }
  179. public void setId(ObjectId id) {
  180. this._id = id;
  181. }
  182. public String getKey() {
  183. return key;
  184. }
  185. public void setKey(String key) {
  186. this.key = key;
  187. }
  188. public Date getCreated() {
  189. return created;
  190. }
  191. public void setCreated(Date created) {
  192. this.created = created;
  193. }
  194. public Date getModified() {
  195. return modified;
  196. }
  197. public void setModified(Date modified) {
  198. this.modified = modified;
  199. }
  200. public String getUrl() {
  201. return url;
  202. }
  203. public void setUrl(String url) {
  204. this.url = url;
  205. }
  206. public String getTitle() {
  207. return title;
  208. }
  209. public void setTitle(String title) {
  210. this.title = title;
  211. }
  212. public String getDescription() {
  213. return description;
  214. }
  215. public void setDescription(String description) {
  216. this.description = description;
  217. }
  218. public String getMediaType() {
  219. return mediaType;
  220. }
  221. public void setMediaType(String mediaType) {
  222. this.mediaType = mediaType;
  223. }
  224. public String getExtractType() {
  225. return extractType;
  226. }
  227. public void setExtractType(String extractType) {
  228. this.extractType = extractType;
  229. }
  230. public Boolean getIsPublic() {
  231. return isPublic;
  232. }
  233. public boolean isPublic() {
  234. return (isPublic == null)?false:isPublic; // (ie defaults to false)
  235. }
  236. public void setIsPublic(Boolean isPublic) {
  237. this.isPublic = isPublic;
  238. }
  239. public void setPublic(boolean isPublic) {
  240. this.isPublic = isPublic;
  241. }
  242. public String getAuthor() {
  243. return author;
  244. }
  245. public void setAuthor(String author) {
  246. this.author = author;
  247. }
  248. /**
  249. * Get the tags
  250. */
  251. public Set<String> getTags() {
  252. return tags;
  253. }
  254. /**
  255. * Set the tags
  256. */
  257. public void setTags(Set<String> tags) {
  258. this.tags = tags;
  259. }
  260. /**
  261. * @param ownerID the ownerID to set
  262. */
  263. public void setOwnerId(ObjectId ownerID) {
  264. this.ownerId = ownerID;
  265. }
  266. /**
  267. * @return the ownerID
  268. */
  269. public ObjectId getOwnerId() {
  270. return ownerId;
  271. }
  272. public SourcePojo() {
  273. }
  274. public void setHarvestStatus(SourceHarvestStatusPojo harvest) {
  275. this.harvest = harvest;
  276. }
  277. public SourceHarvestStatusPojo getHarvestStatus() {
  278. return harvest;
  279. }
  280. public void setApproved(boolean isApproved) {
  281. this.isApproved = isApproved;
  282. }
  283. public boolean isApproved() {
  284. return isApproved;
  285. }
  286. public void addToCommunityIds(ObjectId communityID) {
  287. if (null == this.communityIds) {
  288. this.communityIds = new HashSet<ObjectId>();
  289. }
  290. this.communityIds.add(communityID);
  291. }
  292. public void removeFromCommunityIds(ObjectId communityID) {
  293. if (null != this.communityIds) {
  294. this.communityIds.remove(communityID);
  295. }
  296. }
  297. public Set<ObjectId> getCommunityIds() {
  298. return communityIds;
  299. }
  300. public void setCommunityIds(Set<ObjectId> ids) {
  301. communityIds = ids;
  302. }
  303. public void setHarvestBadSource(boolean harvestBadSource) {
  304. this.harvestBadSource = harvestBadSource;
  305. }
  306. public boolean isHarvestBadSource() {
  307. return harvestBadSource;
  308. }
  309. /**
  310. * @param useExtractor the useExtractor to set
  311. */
  312. public void setUseExtractor(String useExtractor) {
  313. this.useExtractor = useExtractor;
  314. }
  315. /**
  316. * @return the useExtractor
  317. */
  318. public String useExtractor() {
  319. return useExtractor;
  320. }
  321. /**
  322. * @param useTextExtractor the useTextExtractor to set
  323. */
  324. public void setUseTextExtractor(String useTextExtractor) {
  325. this.useTextExtractor = useTextExtractor;
  326. }
  327. /**
  328. * @return the useTextExtractor
  329. */
  330. public String useTextExtractor() {
  331. return useTextExtractor;
  332. }
  333. /**
  334. * @param structuredAnalysis the structuredAnalysis to set
  335. */
  336. public void setStructuredAnalysisConfig(StructuredAnalysisConfigPojo structuredAnalysis) {
  337. this.structuredAnalysis = structuredAnalysis;
  338. }
  339. /**
  340. * @return the structuredAnalysis
  341. */
  342. public StructuredAnalysisConfigPojo getStructuredAnalysisConfig() {
  343. return structuredAnalysis;
  344. }
  345. /**
  346. * @param unstructuredAnalysis the unstructuredAnalysis to set
  347. */
  348. public void setUnstructuredAnalysisConfig(UnstructuredAnalysisConfigPojo unstructuredAnalysis) {
  349. this.unstructuredAnalysis = unstructuredAnalysis;
  350. }
  351. /**
  352. * @return the unstructuredAnalysis
  353. */
  354. public UnstructuredAnalysisConfigPojo getUnstructuredAnalysisConfig() {
  355. return unstructuredAnalysis;
  356. }
  357. /**
  358. * generateShah256Hash - calls generateShah256Hash_internal, swallowing any exceptions
  359. */
  360. public void generateShah256Hash()
  361. {
  362. try
  363. {
  364. generateShah256Hash_internal();
  365. }
  366. catch (Exception e)
  367. {
  368. }
  369. }
  370. /**
  371. * getShah256Hash - calls generateShah256Hash_internal if shah256Hash is null
  372. * @return the shah256Hash, or null if generation fails
  373. */
  374. public String getShah256Hash()
  375. {
  376. if (null != shah256Hash )
  377. {
  378. return shah256Hash;
  379. }
  380. else
  381. {
  382. try
  383. {
  384. generateShah256Hash_internal();
  385. return shah256Hash;
  386. }
  387. catch (Exception e)
  388. {
  389. return null;
  390. }
  391. }
  392. }
  393. // Utility:
  394. /**
  395. * generateSourceKey
  396. * Strips out http://, smb://, '/', ':', etc. from the URL field to generate the source key
  397. * Example: http://www.ikanow.com/rss -> www.ikanow.com.rss
  398. */
  399. public String generateSourceKey()
  400. {
  401. String s = getRepresentativeUrl(); // (supports all cases - note we are guaranteed to have a URL by this point)
  402. if (null == s) {
  403. return null;
  404. }
  405. int nIndex = s.indexOf('?');
  406. final int nMaxLen = 64; // (+24 for the object id, + random other stuff, keeps it in the <100 range)
  407. if (nIndex >= 0) {
  408. if (nIndex > nMaxLen) {
  409. nIndex = nMaxLen; // (ie max length)
  410. }
  411. StringBuilder sb = new StringBuilder(s.substring(0, nIndex));
  412. sb.append(".").append(s.length() - nIndex).append('.').append(Math.abs(s.hashCode()) % 100);
  413. s = sb.toString();
  414. }
  415. else if (s.length() > nMaxLen) {
  416. s = s.substring(0, nMaxLen);
  417. }
  418. //TESTED (urls with and without ?)
  419. s = s.replaceAll("http://|https://|smb://|ftp://|ftps://|file://|[/:+?&(),#]", ".");
  420. if (s.startsWith(".")) s = s.substring(1);
  421. return s;
  422. }
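// (Illustrative example of the query-string branch above: for a hypothetical URL "http://www.ikanow.com/rss?type=full"
//  the key keeps the part before the '?', appends the length of the trailing portion and a hash-derived number in 0-99,
//  then replaces the protocol and separators with '.', giving something like "www.ikanow.com.rss.10.NN".)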
  423. /**
  424. * generateShah256Hash
  425. * Combines the required and optional fields of a SourcePojo into a string that is
  426. * then hashed using SHA-256 and saved to the SourcePojo.shah256Hash field;
  427. * this value is used to determine source uniqueness
  428. * @throws NoSuchAlgorithmException
  429. * @throws UnsupportedEncodingException
  430. */
  431. private void generateShah256Hash_internal() throws NoSuchAlgorithmException, UnsupportedEncodingException
  432. {
  433. // Create StringBuilder with fields to use to establish source *processing* uniqueness
  434. StringBuilder sb = new StringBuilder();
  435. // (Note: by "source processing uniqueness" we mean that, *for a specific doc URL*, 2 sources with the same hash would process it identically)
  436. // So fields like key, URL, media type, tags, etc. aren't included in the hash
  437. if (null != processingPipeline) { // new processing pipeline contains all the logic that determines a source's processing
  438. for (SourcePipelinePojo pxPipe: processingPipeline) {
  439. if ((null == pxPipe.feed) && (null == pxPipe.web)) { // (these are too difficult to pull the URL out of)
  440. String fileUrl = null;
  441. if (null != pxPipe.file) {
  442. fileUrl = pxPipe.file.getUrl();
  443. pxPipe.file.setUrl(null);
  444. }
  445. // (don't bother with the DB config because its URL is so intertwined with its processing)
  446. sb.append(new Gson().toJson(pxPipe));
  447. if (null != fileUrl) {
  448. pxPipe.file.setUrl(fileUrl);
  449. } // (stay idempotent)
  450. }
  451. }
  452. }//TESTED
  453. else { //legacy case
  454. // Required Fields
  455. sb.append(this.extractType);
  456. // Optional fields
  457. if (this.extractType != null) sb.append(this.extractType);
  458. if (this.useExtractor != null) sb.append(this.useExtractor);
  459. if (this.useTextExtractor != null) sb.append(this.useTextExtractor);
  460. // Generate a hash of all the objects using the ORM layer
  461. SourcePojo newSrc = new SourcePojo();
  462. newSrc.setId(null); // (in case this is auto set by the c'tor)
  463. newSrc.setAuthentication(this.authentication);
  464. newSrc.setDatabaseConfig(this.database);
  465. newSrc.setFileConfig(this.file);
  466. // Don't include RSS config since it can contain URLs
  467. newSrc.setStructuredAnalysisConfig(this.structuredAnalysis);
  468. newSrc.setUnstructuredAnalysisConfig(this.unstructuredAnalysis);
  469. sb.append(((BasicDBObject)newSrc.toDb()).toString());
  470. }//TESTED (legacy)
  471. // Create MessageDigest and set shah256Hash value
  472. MessageDigest md = MessageDigest.getInstance("SHA-256");
  473. md.update(sb.toString().getBytes("UTF-8"));
  474. shah256Hash = Base64.encodeBase64String(md.digest());
  475. }
  476. public Integer getSearchCycle_secs() {
  477. return searchCycle_secs;
  478. }
  479. public void setSearchCycle_secs(Integer searchCycle_secs) {
  480. this.searchCycle_secs = searchCycle_secs;
  481. }
  482. public void setMaxDocs(Integer maxDocs) {
  483. this.maxDocs = maxDocs;
  484. }
  485. public Integer getMaxDocs() {
  486. return maxDocs;
  487. }
  488. public void setReachedMaxDocs() {
  489. this.reachedMaxDocs = true;
  490. }
  491. public boolean reachedMaxDocs() {
  492. return reachedMaxDocs;
  493. }
  494. public void setDuplicateExistingUrls(Boolean duplicateExistingUrls) {
  495. this.duplicateExistingUrls = duplicateExistingUrls;
  496. }
  497. public boolean getDuplicateExistingUrls() { // (defaults to true)
  498. return duplicateExistingUrls == null ? true : duplicateExistingUrls;
  499. }
  500. public SourceSearchIndexFilter getSearchIndexFilter() {
  501. initSearchIndexFilter(searchIndexFilter);
  502. return searchIndexFilter;
  503. }
  504. public void setSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
  505. this.searchIndexFilter = searchIndexFilter;
  506. }
  507. ///////////////////////////////////////////////////////////////////////////////////
  508. // Transient state (implementation details)
  509. transient private boolean reachedMaxDocs = false;
  510. // (if set to true, means that the next search cycle won't be applied - otherwise if you only search once per day
  511. // and only process 5K docs/search, it can take a while to build up large repositories)
  512. private transient Set<Integer> distributionTokens; // (temporary internal state for managing intra-source distribution)
  513. // Build some regexes:
  514. public static void initSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
  515. if (null != searchIndexFilter) { // Initialize regex
  516. if ((null != searchIndexFilter.assocFilter) && (null == searchIndexFilter.assocFilterRegex)) {
  517. if (searchIndexFilter.assocFilter.startsWith("+") || searchIndexFilter.assocFilter.startsWith("-")) {
  518. searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
  519. }
  520. else {
  521. searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
  522. }
  523. }
  524. if ((null != searchIndexFilter.assocGeoFilter) && (null == searchIndexFilter.assocGeoFilterRegex)) {
  525. if (searchIndexFilter.assocGeoFilter.startsWith("+") || searchIndexFilter.assocGeoFilter.startsWith("-")) {
  526. searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
  527. }
  528. else {
  529. searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
  530. }
  531. }
  532. if ((null != searchIndexFilter.entityFilter) && (null == searchIndexFilter.entityFilterRegex)) {
  533. if (searchIndexFilter.entityFilter.startsWith("+") || searchIndexFilter.entityFilter.startsWith("-")) {
  534. searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter.substring(1), Pattern.CASE_INSENSITIVE);
  535. }
  536. else {
  537. searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter, Pattern.CASE_INSENSITIVE);
  538. }
  539. }
  540. if ((null != searchIndexFilter.entityGeoFilter) && (null == searchIndexFilter.entityGeoFilterRegex)) {
  541. if (searchIndexFilter.entityGeoFilter.startsWith("+") || searchIndexFilter.entityGeoFilter.startsWith("-")) {
  542. searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter.substring(1), Pattern.CASE_INSENSITIVE);
  543. }
  544. else {
  545. searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter, Pattern.CASE_INSENSITIVE);
  546. }
  547. }
  548. } // (end if search filter specified)
  549. }//(end initialize search filter)
  550. public void setExtractorOptions(LinkedHashMap<String, String> extractorOptions) {
  551. this.extractorOptions = extractorOptions;
  552. }
  553. public LinkedHashMap<String, String> getExtractorOptions() {
  554. return extractorOptions;
  555. }
  556. //TESTED
  557. public void setProcessingPipeline(List<SourcePipelinePojo> processingPipeline) {
  558. this.processingPipeline = processingPipeline;
  559. }
  560. public List<SourcePipelinePojo> getProcessingPipeline() {
  561. return processingPipeline;
  562. }
  563. public void setAppendTagsToDocs(Boolean appendTagsToDocs) {
  564. this.appendTagsToDocs = appendTagsToDocs;
  565. }
  566. public Boolean getAppendTagsToDocs() {
  567. return appendTagsToDocs;
  568. }
  569. public void setNoSql(SourceNoSqlConfigPojo noSql) {
  570. this.nosql = noSql;
  571. }
  572. public SourceNoSqlConfigPojo getNoSql() {
  573. return nosql;
  574. }
  575. public void setDistributionFactor(Integer distributionFactor) {
  576. this.distributionFactor = distributionFactor;
  577. }
  578. public Integer getDistributionFactor() {
  579. return distributionFactor;
  580. }
  581. public void setDistributionTokens(Set<Integer> distributionTokens) {
  582. this.distributionTokens = distributionTokens;
  583. }
  584. public Set<Integer> getDistributionTokens() {
  585. return distributionTokens;
  586. }
  587. public void setThrottleDocs(Integer throttleDocs) {
  588. this.throttleDocs = throttleDocs;
  589. }
  590. public Integer getThrottleDocs() {
  591. return throttleDocs;
  592. }
  593. ///////////////////////////////////////////////////////////////////
  594. // Serialization/deserialization utils:
  595. // (Ugh needed because extractorOptions keys can contain "."s)
  596. public GsonBuilder extendBuilder(GsonBuilder gp) {
  597. return gp.registerTypeAdapter(SourcePojo.class, new SourcePojoDeserializer()).
  598. registerTypeAdapter(SourcePojo.class, new SourcePojoSerializer());
  599. }
  600. protected static class SourcePojoDeserializer implements JsonDeserializer<SourcePojo>
  601. {
  602. @Override
  603. public SourcePojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
  604. {
  605. SourcePojo src = BaseDbPojo.getDefaultBuilder().create().fromJson(json, SourcePojo.class);
  606. if (null != src.extractorOptions) {
  607. src.extractorOptions = decodeKeysForDatabaseStorage(src.extractorOptions);
  608. }
  609. if (null != src.processingPipeline) {
  610. for (SourcePipelinePojo pxPipe: src.processingPipeline) {
  611. if ((null != pxPipe.web) || (null != pxPipe.feed)) {
  612. SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
  613. if (null != webOrFeed.getHttpFields()) {
  614. webOrFeed.setHttpFields(decodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
  615. }
  616. }//TESTED (added httpFields by hand)
  617. // (don't do lookup tables, "."s aren't allowed in their keys)
  618. if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
  619. pxPipe.featureEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
  620. }//TESTED (basic_web_test_ocOptions)
  621. if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
  622. pxPipe.textEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
  623. }//TESTED (c/p basic_web_test_ocOptions)
  624. }
  625. }
  626. return src;
  627. }//TESTED (with and without extractor options)
  628. }
  629. protected static class SourcePojoSerializer implements JsonSerializer<SourcePojo>
  630. {
  631. @Override
  632. public JsonElement serialize(SourcePojo src, Type typeOfT, JsonSerializationContext context) throws JsonParseException
  633. {
  634. if (null != src.extractorOptions) {
  635. src.extractorOptions = encodeKeysForDatabaseStorage(src.extractorOptions);
  636. }
  637. if (null != src.processingPipeline) {
  638. for (SourcePipelinePojo pxPipe: src.processingPipeline) {
  639. if ((null != pxPipe.web) || (null != pxPipe.feed)) {
  640. SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
  641. if (null != webOrFeed.getHttpFields()) {
  642. webOrFeed.setHttpFields(encodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
  643. }
  644. }//TESTED (added httpFields by hand)
  645. // (don't do lookup tables, "."s aren't allowed in their keys)
  646. if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
  647. pxPipe.featureEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
  648. }//TESTED (basic_web_test_ocOptions)
  649. if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
  650. pxPipe.textEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
  651. }//TESTED (c/p basic_web_test_ocOptions)
  652. }
  653. }
  654. // GSON transformation:
  655. JsonElement je = SourcePojo.getDefaultBuilder().create().toJsonTree(src, typeOfT);
  656. return je;
  657. }//TESTED (with and without extractor options)
  658. }
  659. // Utilities for handling processing pipeline
  660. // Decode/Encode utilities
  661. private static LinkedHashMap<String, String> decodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
  662. LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
  663. for (Map.Entry<String, String> entry: in.entrySet()) {
  664. transformed.put(entry.getKey().replace("%2e", "."), entry.getValue());
  665. }
  666. return transformed;
  667. }//TESTED (legacy)
  668. private static LinkedHashMap<String, String> encodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
  669. LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
  670. for (Map.Entry<String, String> entry: in.entrySet()) {
  671. transformed.put(entry.getKey().replace(".", "%2e"), entry.getValue());
  672. }
  673. return transformed;
  674. }//TESTED (legacy)
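// (Illustrative example: a hypothetical extractorOptions key "app.maxDepth" is stored as "app%2emaxDepth",
//  since '.' is not allowed in MongoDB document keys; decodeKeysForDatabaseStorage() reverses this on read.)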
  675. // (ugh, need to store this logstash-domain-specific information here; it might need to be updated from time to time but should remain reasonably simple)
  676. private static Pattern _getLogstashUrlRegex = Pattern.compile("(?:bucket|host|url|uri|path)[\\s\\n\\r]*=>[\\s\\n\\r]*['\"]([^'\"]+)", Pattern.CASE_INSENSITIVE);
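// (Illustrative only: for a logstash config fragment such as   path => "/var/log/messages"   the regex above
//  captures "/var/log/messages", which getRepresentativeUrl() below then uses as the source's representative URL.)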
  677. public String getRepresentativeUrl() {
  678. if (null == this.getProcessingPipeline()) {
  679. if (null != this.getUrl()) {
  680. return this.getUrl();
  681. }
  682. else if ((null != this.getRssConfig()) && (null != this.getRssConfig().getExtraUrls()) && !this.getRssConfig().getExtraUrls().isEmpty()) {
  683. return this.getRssConfig().getExtraUrls().get(0).url;
  684. }
  685. }
  686. else if (!this.getProcessingPipeline().isEmpty()) {
  687. SourcePipelinePojo px = this.getProcessingPipeline().get(0);
  688. if (null != px.file) {
  689. return px.file.getUrl();
  690. }
  691. else if (null != px.database) {
  692. return px.database.getUrl();
  693. }
  694. else if (null != px.logstash) {
  695. String url = null;
  696. try {
  697. Matcher m1 = _getLogstashUrlRegex.matcher(px.logstash.config);
  698. if (m1.find()) { // (get the first)
  699. url = m1.group(1);
  700. }
  701. }
  702. catch (Exception e) {} // return null will error out
  703. return url;
  704. }
  705. else {
  706. SourceRssConfigPojo webOrFeed = px.feed;
  707. if (null == webOrFeed) {
  708. webOrFeed = px.web;
  709. }
  710. if ((null != webOrFeed) && (null != webOrFeed.getExtraUrls()) && !webOrFeed.getExtraUrls().isEmpty()) {
  711. return webOrFeed.getExtraUrls().get(0).url;
  712. }
  713. }
  714. }
  715. return null;
  716. }//TESTED (legacy+basic_web_test_ocOptions)
  717. public void fillInSourcePipelineFields() {
  718. if (null != this.getProcessingPipeline()) {
  719. this.extractType = null; // always derive from the px pipeline, ignore user input
  720. for (SourcePipelinePojo px: this.getProcessingPipeline()) {
  721. if (null != px.file) {
  722. this.extractType = "File";
  723. }
  724. else if (null != px.database) {
  725. this.extractType = "Database";
  726. }
  727. else if (null != px.logstash) {
  728. this.extractType = "Logstash";
  729. }
  730. else if ((null != px.web) || (null != px.feed)) {
  731. this.extractType = "Feed";
  732. }
  733. if (null != px.harvest) {
  734. if (null != px.harvest.searchCycle_secs) {
  735. if ((null == searchCycle_secs) || (searchCycle_secs > 0)) {
  736. searchCycle_secs = Math.abs(px.harvest.searchCycle_secs);
  737. }
  738. else { // (searchCycle_secs < 0 ie want to suspend source)
  739. searchCycle_secs = -Math.abs(px.harvest.searchCycle_secs);
  740. }
  741. distributionFactor = px.harvest.distributionFactor;
  742. }//TESTED
  743. else if ((null != searchCycle_secs) && (searchCycle_secs < 0)) {
  744. // No search cycle specified, source suspended
  745. searchCycle_secs = -1;
  746. }//TESTED
  747. else { // No search cycle specified and source not suspended
  748. searchCycle_secs = null;
  749. }//TESTED
  750. break;
  751. }
  752. }
  753. }//TESTED
  754. }
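// (Illustrative example: a pipeline "harvest" element with searchCycle_secs=3600 makes the source re-search hourly;
//  if the top-level searchCycle_secs was already negative (source suspended), the value is applied with a negative
//  sign so the source stays suspended, as in the logic above.)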
  755. public Boolean getPartiallyPublished() {
  756. return partiallyPublished;
  757. }
  758. public void setPartiallyPublished(Boolean partiallyPublished) {
  759. this.partiallyPublished = partiallyPublished;
  760. }
  761. }
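
For reference, the following is a minimal usage sketch, not part of the original file: it populates a legacy-mode (non-pipeline) source, derives the key and hash, and serializes it with the custom Gson adapters, using only methods visible above. It assumes the surrounding com.ikanow.infinit.e.data_model classes plus the Gson and MongoDB-driver dependencies from the imports are on the classpath, and it is placed in the same package so that BaseDbPojo.getDefaultBuilder() is accessible; all values are illustrative.

package com.ikanow.infinit.e.data_model.store.config.source;

import java.util.Date;
import org.bson.types.ObjectId;
import com.ikanow.infinit.e.data_model.store.BaseDbPojo;

public class SourcePojoUsageSketch {
    public static void main(String[] args) {
        // Hypothetical legacy-mode (non-pipeline) source; all values are illustrative only
        SourcePojo src = new SourcePojo();
        src.setTitle("Example RSS source");
        src.setUrl("http://www.ikanow.com/rss");
        src.setCreated(new Date());
        src.setModified(new Date());
        src.setOwnerId(new ObjectId());
        src.addToCommunityIds(new ObjectId());

        // Derived fields: the key comes from the URL, the hash from the processing configuration
        src.setKey(src.generateSourceKey());   // e.g. "www.ikanow.com.rss"
        String hash = src.getShah256Hash();    // generated lazily if not already set

        // Serialize with the custom adapters registered by extendBuilder()
        // (needed because extractorOptions keys may contain '.')
        String json = src.extendBuilder(BaseDbPojo.getDefaultBuilder()).create().toJson(src);

        System.out.println(src.getKey() + " : " + hash);
        System.out.println(json);
    }
}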