/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/store/config/source/SourcePojo.java
Java | 840 lines | 644 code | 79 blank | 117 comment | 163 complexity | e12d5f0172eb8bcd32459aec83cc4f03 MD5 | raw file
Possible License(s): BSD-3-Clause
- /*******************************************************************************
- * Copyright 2012 The Infinit.e Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- ******************************************************************************/
- package com.ikanow.infinit.e.data_model.store.config.source;
- import java.io.UnsupportedEncodingException;
- import java.lang.reflect.Type;
- import java.security.MessageDigest;
- import java.security.NoSuchAlgorithmException;
- import java.util.Date;
- import java.util.HashSet;
- import java.util.LinkedHashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.commons.codec.binary.Base64;
- import org.bson.types.ObjectId;
- import com.google.gson.Gson;
- import com.google.gson.GsonBuilder;
- import com.google.gson.JsonDeserializationContext;
- import com.google.gson.JsonDeserializer;
- import com.google.gson.JsonElement;
- import com.google.gson.JsonParseException;
- import com.google.gson.JsonSerializationContext;
- import com.google.gson.JsonSerializer;
- import com.google.gson.reflect.TypeToken;
- import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
- import com.ikanow.infinit.e.data_model.store.social.authentication.AuthenticationPojo;
- import com.mongodb.BasicDBObject;
- /**
- * Class used to establish the source information for a feed
- * this defines the data necessary to create a feed in the system
- *
- * @author cmorgan
- *
- */
- public class SourcePojo extends BaseDbPojo {
// Standard static function for readability
/** Gson type token used to (de)serialize a JSON array of sources as {@code List<SourcePojo>}. */
@SuppressWarnings("unchecked")
static public TypeToken<List<SourcePojo>> listType() { return new TypeToken<List<SourcePojo>>(){}; }
/**
 * Private Class Variables
 * (each persisted field is paired with a public static String constant holding the
 *  MongoDB field name, so queries elsewhere can be built without hardcoded strings)
 */

// Metadata fields

private ObjectId _id = null; // MongoDB primary key
final public static String _id_ = "_id";
private Date created = null; // when the source was created
final public static String created_ = "created";
private Date modified = null; // when the source was last modified
final public static String modified_ = "modified";
private String url = null; // top-level URL (legacy mode; pipeline sources carry URLs per-element, see getRepresentativeUrl)
final public static String url_ = "url";
private String title = null; // human-readable title
final public static String title_ = "title";
private Boolean isPublic = null; // if false then many fields are removed when viewed by non-owners/moderators/admins
final public static String isPublic_ = "isPublic";
private Boolean partiallyPublished = null; // if fields are removed based on isPublic then this is set to true
final public static String partiallyPublished_ = "partiallyPublished";
private ObjectId ownerId = null; // id of the owning user
final public static String ownerId_ = "ownerId";
private String author = null;
final public static String author_ = "author";

private String mediaType = null;
final public static String mediaType_ = "mediaType";
private String key = null; // unique-ish key derived from the URL, see generateSourceKey()
final public static String key_ = "key";
private String description = null;
final public static String description_ = "description";
private Set<String> tags = null; // free-form tags (optionally appended to harvested docs, see appendTagsToDocs)
final public static String tags_ = "tags";

private Set<ObjectId> communityIds = null; // ids of the communities this source belongs to
final public static String communityIds_ = "communityIds";

private boolean isApproved = false; // approval flag (presumably gates harvesting - confirm against harvester code)
final public static String isApproved_ = "isApproved";
private boolean harvestBadSource = false;
final public static String harvestBadSource_ = "harvestBadSource";

private String extractType = null; // (in pipeline mode, copied across from pipeline)
final public static String extractType_ = "extractType";

// SHA-256 of the processing-relevant config, used for source uniqueness - see generateShah256Hash()
// (NOTE: "shah" is a typo for "sha", but the name is persisted in the DB so it must not be renamed)
private String shah256Hash = null;
final public static String shah256Hash_ = "shah256Hash";
// Control fields used everywhere

private Integer searchCycle_secs = null; // Determines the time between searches, defaults as quickly as the harvest can cycle
// (in pipeline mode, copied across from pipeline; a negative value means the source is suspended - see fillInSourcePipelineFields)
final public static String searchCycle_secs_ = "searchCycle_secs";
private Integer distributionFactor; // (in pipeline mode, copied across from pipeline harvest element)
final public static String distributionFactor_ = "distributionFactor";
-
/**
 * Optional per-source filter controlling which entities/associations/fields are written
 * to the search index. The String filters are user-specified; the transient Pattern
 * fields are compiled lazily from them by SourcePojo.initSearchIndexFilter().
 */
public static class SourceSearchIndexFilter {
	public Boolean indexOnIngest = null; // (if specified and false, default:true, then don't index the docs at all)
	public String entityFilter = null; // (regex applied to entity indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String assocFilter = null; // (regex applied to new-line separated association indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String entityGeoFilter = null; // (regex applied to entity indexes if the entity has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String assocGeoFilter = null; // (regex applied to new-line separated association indexes if the association has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String fieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String metadataFieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)

	// temp: compiled forms of the filters above, populated on demand by initSearchIndexFilter()
	public transient Pattern entityFilterRegex;
	public transient Pattern assocFilterRegex;
	public transient Pattern entityGeoFilterRegex;
	public transient Pattern assocGeoFilterRegex;
}
-
// PROCESSING PIPELINE

// Ordered list of processing steps; when non-null it takes precedence over the legacy fields below
private List<SourcePipelinePojo> processingPipeline;
final public static String processingPipeline_ = "processingPipeline";

// LEGACY CODE, IGNORED IN PROCESSING-PIPELINE MODE
private SourceHarvestStatusPojo harvest = null; // harvester-maintained status
final public static String harvest_ = "harvest";
private SourceDatabaseConfigPojo database = null;
final public static String database_ = "database";
private SourceNoSqlConfigPojo nosql = null;
final public static String nosql_ = "nosql";

private SourceFileConfigPojo file = null;
final public static String file_ = "file";
private SourceRssConfigPojo rss = null;
final public static String rss_ = "rss";

private AuthenticationPojo authentication = null; // credentials for protected sources
final public static String authentication_ = "authentication";

private String useExtractor = null; // name of the entity extractor to use
final public static String useExtractor_ = "useExtractor";
private String useTextExtractor = null; // name of the text extractor to use
final public static String useTextExtractor_ = "useTextExtractor";

private StructuredAnalysisConfigPojo structuredAnalysis = null;
final public static String structuredAnalysis_ = "structuredAnalysis";
private UnstructuredAnalysisConfigPojo unstructuredAnalysis = null;
final public static String unstructuredAnalysis_ = "unstructuredAnalysis";

private Integer maxDocs = null; // Limits the number of docs that can be stored for this source at any one time
final public static String maxDocs_ = "maxDocs";
private Integer throttleDocs = null; // Limits the number of docs that can be harvested in one cycle (cannot be higher than system setting in harvest.maxdocs_persource)
final public static String throttleDocs_ = "throttleDocs";
private Boolean duplicateExistingUrls; // If false (defaults: true) will ignore docs harvested by other sources in the community
final public static String duplicateExistingUrls_ = "duplicateExistingUrls";
private Boolean appendTagsToDocs = null; // if true (default) source tags are appended to the document

final public static String appendTagsToDocs_ = "appendTagsToDocs";

private SourceSearchIndexFilter searchIndexFilter = null; // Optional, allows the source builder to configure which fields are searchable
final public static String searchIndexFilter_ = "searchIndexFilter";

// Optional, overrides the per-extractor configuration options, where permissible
// (keys may contain "."s, which MongoDB disallows - hence the custom serializers at the bottom of this file)
private LinkedHashMap<String, String> extractorOptions = null;
final public static String extractorOptions_ = "extractorOptions";
-
- //////////////////////////////////////
-
- // Gets and sets
-
// Simple accessors for the persisted fields above

public AuthenticationPojo getAuthentication() {
	return authentication;
}
public void setAuthentication(AuthenticationPojo authentication) {
	this.authentication = authentication;
}
public SourceFileConfigPojo getFileConfig() {
	return file;
}
public void setFileConfig(SourceFileConfigPojo file) {
	this.file = file;
}
public SourceRssConfigPojo getRssConfig() {
	return rss;
}
public void setRssConfig(SourceRssConfigPojo rss) {
	this.rss = rss;
}
public SourceDatabaseConfigPojo getDatabaseConfig() {
	return database;
}
public void setDatabaseConfig(SourceDatabaseConfigPojo database) {
	this.database = database;
}
public ObjectId getId() {
	return _id;
}
public void setId(ObjectId id) {
	this._id = id;
}
public String getKey() {
	return key;
}
public void setKey(String key) {
	this.key = key;
}
public Date getCreated() {
	return created;
}
public void setCreated(Date created) {
	this.created = created;
}
public Date getModified() {
	return modified;
}
public void setModified(Date modified) {
	this.modified = modified;
}
public String getUrl() {
	return url;
}
public void setUrl(String url) {
	this.url = url;
}
public String getTitle() {
	return title;
}
public void setTitle(String title) {
	this.title = title;
}
public String getDescription() {
	return description;
}
public void setDescription(String description) {
	this.description = description;
}
public String getMediaType() {
	return mediaType;
}
public void setMediaType(String mediaType) {
	this.mediaType = mediaType;
}
public String getExtractType() {
	return extractType;
}
public void setExtractType(String extractType) {
	this.extractType = extractType;
}
/** @return the raw tri-state flag (null when never set) */
public Boolean getIsPublic() {
	return isPublic;
}
/** @return the effective visibility; an unset (null) flag is treated as private */
public boolean isPublic() {
	return (isPublic == null)?false:isPublic; // (ie defaults to false)
}
public void setIsPublic(Boolean isPublic) {
	this.isPublic = isPublic;
}
public void setPublic(boolean isPublic) {
	this.isPublic = isPublic;
}
public String getAuthor() {
	return author;
}
public void setAuthor(String author) {
	this.author = author;
}
/**
 * Get the tags
 */
public Set<String> getTags() {
	return tags;
}
/**
 * Set the tags
 */
public void setTags(Set<String> tags) {
	this.tags = tags;
}

/**
 * @param ownerID the ownerID to set
 */
public void setOwnerId(ObjectId ownerID) {
	this.ownerId = ownerID;
}
/**
 * @return the ownerID
 */
public ObjectId getOwnerId() {
	return ownerId;
}
/** No-arg constructor, required by the Gson/ORM layer; all fields keep their declared defaults. */
public SourcePojo() {

}
public void setHarvestStatus(SourceHarvestStatusPojo harvest) {
	this.harvest = harvest;
}
public SourceHarvestStatusPojo getHarvestStatus() {
	return harvest;
}
public void setApproved(boolean isApproved) {
	this.isApproved = isApproved;
}
public boolean isApproved() {
	return isApproved;
}
/** Adds a community id, lazily allocating the backing set on first use. */
public void addToCommunityIds(ObjectId communityID) {
	if (null == this.communityIds) {
		this.communityIds = new HashSet<ObjectId>();
	}
	this.communityIds.add(communityID);
}
/** Removes a community id; safe no-op when the set was never created. */
public void removeFromCommunityIds(ObjectId communityID) {
	if (null != this.communityIds) {
		this.communityIds.remove(communityID);
	}
}
public Set<ObjectId> getCommunityIds() {
	return communityIds;
}
public void setCommunityIds(Set<ObjectId> ids) {
	communityIds = ids;
}
public void setHarvestBadSource(boolean harvestBadSource) {
	this.harvestBadSource = harvestBadSource;
}
public boolean isHarvestBadSource() {
	return harvestBadSource;
}
/**
 * @param useExtractor the useExtractor to set
 */
public void setUseExtractor(String useExtractor) {
	this.useExtractor = useExtractor;
}
/**
 * @return the useExtractor
 * (note: unconventional getter name, no "get" prefix - kept for caller compatibility)
 */
public String useExtractor() {
	return useExtractor;
}
/**
 * @param useTextExtractor the useTextExtractor to set
 */
public void setUseTextExtractor(String useTextExtractor) {
	this.useTextExtractor = useTextExtractor;
}
/**
 * @return the useTextExtractor
 * (note: unconventional getter name, no "get" prefix - kept for caller compatibility)
 */
public String useTextExtractor() {
	return useTextExtractor;
}
/**
 * @param structuredAnalysis the structuredAnalysis to set
 */
public void setStructuredAnalysisConfig(StructuredAnalysisConfigPojo structuredAnalysis) {
	this.structuredAnalysis = structuredAnalysis;
}
/**
 * @return the structuredAnalysis
 */
public StructuredAnalysisConfigPojo getStructuredAnalysisConfig() {
	return structuredAnalysis;
}

/**
 * @param unstructuredAnalysis the unstructuredAnalysis to set
 */
public void setUnstructuredAnalysisConfig(UnstructuredAnalysisConfigPojo unstructuredAnalysis) {
	this.unstructuredAnalysis = unstructuredAnalysis;
}
/**
 * @return the unstructuredAnalysis
 */
public UnstructuredAnalysisConfigPojo getUnstructuredAnalysisConfig() {
	return unstructuredAnalysis;
}
- /**
- * setShah256Hash - calls generateShah256Hash
- */
- public void generateShah256Hash()
- {
- try
- {
- generateShah256Hash_internal();
- }
- catch (Exception e)
- {
-
- }
- }
- /**
- * getShah256Hash - calls generateShah256Hash if shah256Hash is null
- * @return
- */
- public String getShah256Hash()
- {
- if (null != shah256Hash )
- {
- return shah256Hash;
- }
- else
- {
- try
- {
- generateShah256Hash_internal();
- return shah256Hash;
- }
- catch (Exception e)
- {
- return null;
- }
- }
- }
- // Utility:
-
- /**
- * generateSourceKey
- * Strips out http://, smb:// /, :, etc. from the URL field to generate
- * Example: http://www.ikanow.com/rss -> www.ikanow.com.rss
- */
- public String generateSourceKey()
- {
- String s = getRepresentativeUrl(); // (supports all cases - note we are guaranteed to have a URL by this point)
- if (null == s) {
- return null;
- }
-
- int nIndex = s.indexOf('?');
- final int nMaxLen = 64; // (+24 for the object id, + random other stuff, keeps it in the <100 range)
- if (nIndex >= 0) {
- if (nIndex > nMaxLen) {
- nIndex = nMaxLen; // (ie max length)
- }
- StringBuffer sb = new StringBuffer(s.substring(0, nIndex));
- sb.append(".").append(s.length() - nIndex).append('.').append(Math.abs(s.hashCode()) % 100);
- s = sb.toString();
- }
- else if (s.length() > nMaxLen) {
- s = s.substring(0, nMaxLen);
- }
- //TESTED (urls with and without ?)
-
- s = s.replaceAll("http://|https://|smb://|ftp://|ftps://|file://|[/:+?&(),#]", ".");
- if (s.startsWith(".")) s = s.substring(1);
- return s;
- }
/**
 * generateShah256Hash
 * Combines the required and optional fields of a SourcePojo into a string that is
 * then hashed using SHAH-256 and saved to the SourePojo.shah256Hash field;
 * this value is used to determine source uniqueness
 * (NOTE: the exact concatenation below is part of the hash contract - stored hashes
 *  would all change if any append were reordered or removed, so do not "clean it up")
 * @throws NoSuchAlgorithmException
 * @throws UnsupportedEncodingException
 */
private void generateShah256Hash_internal() throws NoSuchAlgorithmException, UnsupportedEncodingException
{
	// Create StringBuffer with fields to use to establish source *processing* uniqueness
	StringBuffer sb = new StringBuffer();
	// (Note what I mean by "source processing uniqueness" is that, *for a specific doc URL* 2 sources would process it identically)
	// So fields like key,URL,media type,tags,etc aren't included in the hash

	if (null != processingPipeline) { // new processing pipeline contains all the logic that determines a source's processing
		for (SourcePipelinePojo pxPipe: processingPipeline) {
			if ((null == pxPipe.feed) && (null == pxPipe.web)) { // (these are too difficult to pull the URL out of)
				// Temporarily null out the file URL so it doesn't contribute to the hash
				String fileUrl = null;
				if (null != pxPipe.file) {
					fileUrl = pxPipe.file.getUrl();
					pxPipe.file.setUrl(null);
				}
				// (don't bother with DB because its URL is so intertwined with its processing)
				sb.append(new Gson().toJson(pxPipe));
				if (null != fileUrl) {
					pxPipe.file.setUrl(fileUrl);
				} // (stay idempotent)
			}
		}
	}//TESTED
	else { //legacy case

		// Required Fields
		// (NOTE(review): appends the literal string "null" when extractType is unset, and when it
		//  IS set it gets appended a second time just below - redundant-looking but hash-stable)
		sb.append(this.extractType);

		// Optional fields
		if (this.extractType != null) sb.append(this.extractType);
		if (this.useExtractor != null) sb.append(this.useExtractor);
		if (this.useTextExtractor != null) sb.append(this.useTextExtractor);

		// Generate a hash of all the objects using the ORM layer
		SourcePojo newSrc = new SourcePojo();
		newSrc.setId(null); // (in case this is auto set by the c'tor)
		newSrc.setAuthentication(this.authentication);
		newSrc.setDatabaseConfig(this.database);
		newSrc.setFileConfig(this.file);
		// Don't include RSS config since it can contain URLs
		newSrc.setStructuredAnalysisConfig(this.structuredAnalysis);
		newSrc.setUnstructuredAnalysisConfig(this.unstructuredAnalysis);
		sb.append(((BasicDBObject)newSrc.toDb()).toString());

	}//TESTED (legacy)

	// Create MessageDigest and set shah256Hash value
	MessageDigest md = MessageDigest.getInstance("SHA-256");
	md.update(sb.toString().getBytes("UTF-8"));
	shah256Hash = Base64.encodeBase64String(md.digest());
}
public Integer getSearchCycle_secs() {
	return searchCycle_secs;
}
public void setSearchCycle_secs(Integer searchCycle_secs) {
	this.searchCycle_secs = searchCycle_secs;
}
public void setMaxDocs(Integer maxDocs) {
	this.maxDocs = maxDocs;
}
public Integer getMaxDocs() {
	return maxDocs;
}
/** Latches the transient reachedMaxDocs flag (see field comment below); never cleared here. */
public void setReachedMaxDocs() {
	this.reachedMaxDocs = true;
}
public boolean reachedMaxDocs() {
	return reachedMaxDocs;
}
public void setDuplicateExistingUrls(Boolean duplicateExistingUrls) {
	this.duplicateExistingUrls = duplicateExistingUrls;
}
public boolean getDuplicateExistingUrls() { // (defaults to true)
	return duplicateExistingUrls == null ? true : duplicateExistingUrls;
}
/** Returns the filter, first (lazily) compiling its regex Patterns via initSearchIndexFilter. */
public SourceSearchIndexFilter getSearchIndexFilter() {
	initSearchIndexFilter(searchIndexFilter);
	return searchIndexFilter;
}
public void setSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
	this.searchIndexFilter = searchIndexFilter;
}
///////////////////////////////////////////////////////////////////////////////////

// Transient state (implementation details) - not persisted to the DB

transient private boolean reachedMaxDocs = false;
// (if set to true, means that the next search cycle won't be applied - otherwise if you only search once per day
// and only process 5K docs/search, it can take a while to build up large repositories)

private transient Set<Integer> distributionTokens; // (temporary internal state for managing intra-source distribution)

// Build some regexes:
- public static void initSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
- if (null != searchIndexFilter) { // Initialize regex
- if ((null != searchIndexFilter.assocFilter) && (null == searchIndexFilter.assocFilterRegex)) {
- if (searchIndexFilter.assocFilter.startsWith("+") || searchIndexFilter.assocFilter.startsWith("-")) {
- searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- else {
- searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- }
- if ((null != searchIndexFilter.assocGeoFilter) && (null == searchIndexFilter.assocGeoFilterRegex)) {
- if (searchIndexFilter.assocGeoFilter.startsWith("+") || searchIndexFilter.assocGeoFilter.startsWith("-")) {
- searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- else {
- searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- }
- if ((null != searchIndexFilter.entityFilter) && (null == searchIndexFilter.entityFilterRegex)) {
- if (searchIndexFilter.entityFilter.startsWith("+") || searchIndexFilter.entityFilter.startsWith("-")) {
- searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter.substring(1), Pattern.CASE_INSENSITIVE);
- }
- else {
- searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter, Pattern.CASE_INSENSITIVE);
- }
- }
- if ((null != searchIndexFilter.entityGeoFilter) && (null == searchIndexFilter.entityGeoFilterRegex)) {
- if (searchIndexFilter.entityGeoFilter.startsWith("+") || searchIndexFilter.entityGeoFilter.startsWith("-")) {
- searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter.substring(1), Pattern.CASE_INSENSITIVE);
- }
- else {
- searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter, Pattern.CASE_INSENSITIVE);
- }
- }
- } // (end if search filter specified)
- }//(end initialize search filter)
public void setExtractorOptions(LinkedHashMap<String, String> extractorOptions) {
	this.extractorOptions = extractorOptions;
}
public LinkedHashMap<String, String> getExtractorOptions() {
	return extractorOptions;
}
//TESTED
public void setProcessingPipeline(List<SourcePipelinePojo> processingPipeline) {
	this.processingPipeline = processingPipeline;
}
public List<SourcePipelinePojo> getProcessingPipeline() {
	return processingPipeline;
}
public void setAppendTagsToDocs(Boolean appendTagsToDocs) {
	this.appendTagsToDocs = appendTagsToDocs;
}
public Boolean getAppendTagsToDocs() {
	return appendTagsToDocs;
}
public void setNoSql(SourceNoSqlConfigPojo noSql) {
	this.nosql = noSql;
}
public SourceNoSqlConfigPojo getNoSql() {
	return nosql;
}
public void setDistributionFactor(Integer distributionFactor) {
	this.distributionFactor = distributionFactor;
}
public Integer getDistributionFactor() {
	return distributionFactor;
}
// (distributionTokens is transient working state, not persisted config)
public void setDistributionTokens(Set<Integer> distributionTokens) {
	this.distributionTokens = distributionTokens;
}
public Set<Integer> getDistributionTokens() {
	return distributionTokens;
}
public void setThrottleDocs(Integer throttleDocs) {
	this.throttleDocs = throttleDocs;
}
public Integer getThrottleDocs() {
	return throttleDocs;
}
///////////////////////////////////////////////////////////////////

// Serialization/deserialization utils:
// (Ugh needed because extractorOptions keys can contain "."s)

/**
 * Registers this class's custom Gson (de)serializers, which escape/unescape "."s
 * in map keys for DB storage (presumably a BaseDbPojo extension hook - confirm).
 * Note both adapters are registered for the same type: Gson keeps the serializer
 * and deserializer halves separately, so both take effect.
 * @param gp the builder to extend
 * @return the same builder, for chaining
 */
public GsonBuilder extendBuilder(GsonBuilder gp) {
	return gp.registerTypeAdapter(SourcePojo.class, new SourcePojoDeserializer()).
		registerTypeAdapter(SourcePojo.class, new SourcePojoSerializer());
}
-
/**
 * Custom Gson deserializer: delegates to the default builder, then walks the result
 * decoding "%2e" back to "." in map keys that may legally contain dots
 * (extractorOptions, web/feed httpFields, feature/text engineConfig).
 */
protected static class SourcePojoDeserializer implements JsonDeserializer<SourcePojo>
{
	@Override
	public SourcePojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
	{
		// Default deserialization first (avoids infinite recursion into this adapter)
		SourcePojo src = BaseDbPojo.getDefaultBuilder().create().fromJson(json, SourcePojo.class);
		if (null != src.extractorOptions) {
			src.extractorOptions = decodeKeysForDatabaseStorage(src.extractorOptions);
		}
		if (null != src.processingPipeline) {
			for (SourcePipelinePojo pxPipe: src.processingPipeline) {
				if ((null != pxPipe.web) || (null != pxPipe.feed)) {
					SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
					if (null != webOrFeed.getHttpFields()) {
						webOrFeed.setHttpFields(decodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
					}
				}//TESTED (added httpFields by hand)
				// (don't do lookup tables, "."s aren't allowed in their keys)
				if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
					pxPipe.featureEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
				}//TESTED (basic_web_test_ocOptions)
				if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
					pxPipe.textEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
				}//TESTED (c/p basic_web_test_ocOptions)
			}
		}
		return src;
	}//TESTED (with and without extractor options)
}
-
/**
 * Custom Gson serializer: encodes "." as "%2e" in the same map keys the deserializer
 * decodes, then delegates to the default builder.
 * NOTE(review): the encoding mutates {@code src} IN PLACE, so after serialization the
 * live object holds encoded keys (re-encoding is a no-op since "%2e" contains no ".",
 * but in-memory readers of those maps would see encoded keys) - confirm callers expect this.
 */
protected static class SourcePojoSerializer implements JsonSerializer<SourcePojo>
{
	@Override
	public JsonElement serialize(SourcePojo src, Type typeOfT, JsonSerializationContext context) throws JsonParseException
	{
		if (null != src.extractorOptions) {
			src.extractorOptions = encodeKeysForDatabaseStorage(src.extractorOptions);
		}
		if (null != src.processingPipeline) {
			for (SourcePipelinePojo pxPipe: src.processingPipeline) {
				if ((null != pxPipe.web) || (null != pxPipe.feed)) {
					SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
					if (null != webOrFeed.getHttpFields()) {
						webOrFeed.setHttpFields(encodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
					}
				}//TESTED (added httpFields by hand)
				// (don't do lookup tables, "."s aren't allowed in their keys)
				if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
					pxPipe.featureEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
				}//TESTED (basic_web_test_ocOptions)
				if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
					pxPipe.textEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
				}//TESTED (c/p basic_web_test_ocOptions)
			}
		}
		// GSON transformation:
		JsonElement je = SourcePojo.getDefaultBuilder().create().toJsonTree(src, typeOfT);

		return je;
	}//TESTED (with and without extractor options)
}
- // Utilities for handling processing pipeline
-
- // Decode/Encode utilities
-
- private static LinkedHashMap<String, String> decodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
- LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
- for (Map.Entry<String, String> entry: in.entrySet()) {
- transformed.put(entry.getKey().replace("%2e", "."), entry.getValue());
- }
- return transformed;
- }//TESTED (legacy)
- private static LinkedHashMap<String, String> encodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
- LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
- for (Map.Entry<String, String> entry: in.entrySet()) {
- transformed.put(entry.getKey().replace(".", "%2e"), entry.getValue());
- }
- return transformed;
- }//TESTED (legacy)

// Extracts the first bucket/host/url/uri/path value from a logstash config block, eg: host => "foo"
//(ugh need to store this logstash-domain-specific information here, might need to update it from time to time but should remain reasonably simple)
private static Pattern _getLogstashUrlRegex = Pattern.compile("(?:bucket|host|url|uri|path)[\\s\\n\\r]*=>[\\s\\n\\r]*['\"]([^'\"]+)", Pattern.CASE_INSENSITIVE);
-
- public String getRepresentativeUrl() {
- if (null == this.getProcessingPipeline()) {
- if (null != this.getUrl()) {
- return this.getUrl();
- }
- else if ((null != this.getRssConfig()) && (null != this.getRssConfig().getExtraUrls()) && !this.getRssConfig().getExtraUrls().isEmpty()) {
- return this.getRssConfig().getExtraUrls().get(0).url;
- }
- }
- else if (!this.getProcessingPipeline().isEmpty()) {
- SourcePipelinePojo px = this.getProcessingPipeline().get(0);
- if (null != px.file) {
- return px.file.getUrl();
- }
- else if (null != px.database) {
- return px.database.getUrl();
- }
- else if (null != px.logstash) {
- String url = null;
- try {
- Matcher m1 = _getLogstashUrlRegex.matcher(px.logstash.config);
- if (m1.find()) { // (get the first)
- url = m1.group(1);
- }
- }
- catch (Exception e) {} // return null will error out
- return url;
- }
- else {
- SourceRssConfigPojo webOrFeed = px.feed;
- if (null == webOrFeed) {
- webOrFeed = px.web;
- }
- if ((null != webOrFeed) && (null != webOrFeed.getExtraUrls()) && !webOrFeed.getExtraUrls().isEmpty()) {
- return webOrFeed.getExtraUrls().get(0).url;
- }
- }
- }
- return null;
- }//TESTED (legacy+basic_web_test_ocOptions)
-
/**
 * For pipeline sources, derives the top-level extractType/searchCycle_secs/
 * distributionFactor fields from the pipeline elements. extractType is overwritten
 * as the loop walks the pipeline, so it reflects the last input element seen before
 * the loop exits; the loop stops at the first element carrying a harvest block.
 * No-op for legacy (non-pipeline) sources.
 */
public void fillInSourcePipelineFields() {
	if (null != this.getProcessingPipeline()) {
		this.extractType = null; // always derive from the px pipeline, ignore user input

		for (SourcePipelinePojo px: this.getProcessingPipeline()) {
			if (null != px.file) {
				this.extractType = "File";
			}
			else if (null != px.database) {
				this.extractType = "Database";
			}
			else if (null != px.logstash) {
				this.extractType = "Logstash";
			}
			else if ((null != px.web) || (null != px.feed)) {
				this.extractType = "Feed";
			}
			if (null != px.harvest) {
				if (null != px.harvest.searchCycle_secs) {
					// Preserve the sign of any existing suspended state when copying the cycle across
					if ((null == searchCycle_secs) || (searchCycle_secs > 0)) {
						searchCycle_secs = Math.abs(px.harvest.searchCycle_secs);
					}
					else { // (searchCycle_secs < 0 ie want to suspend source)
						searchCycle_secs = -Math.abs(px.harvest.searchCycle_secs);
					}
					// (note: distributionFactor is only copied when searchCycle_secs is specified)
					distributionFactor = px.harvest.distributionFactor;
				}//TESTED
				else if ((null != searchCycle_secs) && (searchCycle_secs < 0)) {
					// No search cycle specified, source suspended
					searchCycle_secs = -1;
				}//TESTED
				else { // No search cycle specified and source not suspended
					searchCycle_secs = null;
				}//TESTED
				break; // (only the first harvest element applies)
			}
		}
	}//TESTED
}
/** @return whether fields were stripped from this (non-public) source when viewed by a non-owner (see isPublic) */
public Boolean getPartiallyPublished() {
	return partiallyPublished;
}
public void setPartiallyPublished(Boolean partiallyPublished) {
	this.partiallyPublished = partiallyPublished;
}
- }