/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/store/config/source/SourcePojo.java
Java | 840 lines | 644 code | 79 blank | 117 comment | 163 complexity | e12d5f0172eb8bcd32459aec83cc4f03 MD5 | raw file
Possible License(s): BSD-3-Clause
- /*******************************************************************************
- * Copyright 2012 The Infinit.e Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- ******************************************************************************/
- package com.ikanow.infinit.e.data_model.store.config.source;
- import java.io.UnsupportedEncodingException;
- import java.lang.reflect.Type;
- import java.security.MessageDigest;
- import java.security.NoSuchAlgorithmException;
- import java.util.Date;
- import java.util.HashSet;
- import java.util.LinkedHashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.commons.codec.binary.Base64;
- import org.bson.types.ObjectId;
- import com.google.gson.Gson;
- import com.google.gson.GsonBuilder;
- import com.google.gson.JsonDeserializationContext;
- import com.google.gson.JsonDeserializer;
- import com.google.gson.JsonElement;
- import com.google.gson.JsonParseException;
- import com.google.gson.JsonSerializationContext;
- import com.google.gson.JsonSerializer;
- import com.google.gson.reflect.TypeToken;
- import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
- import com.ikanow.infinit.e.data_model.store.social.authentication.AuthenticationPojo;
- import com.mongodb.BasicDBObject;
- /**
- * Class used to establish the source information for a feed
- * this defines the data necessary to create a feed in the system
- *
- * @author cmorgan
- *
- */
- public class SourcePojo extends BaseDbPojo {
// Standard static function for readability
/** Gson type token used to (de)serialize a JSON array of sources as {@code List<SourcePojo>}. */
@SuppressWarnings("unchecked")
static public TypeToken<List<SourcePojo>> listType() { return new TypeToken<List<SourcePojo>>(){}; }
/**
 * Private Class Variables
 * (each persisted field is paired with a public static String constant holding the
 *  MongoDB field name, so queries elsewhere can be built without hardcoded strings)
 */

// Metadata fields

private ObjectId _id = null; // MongoDB primary key
final public static String _id_ = "_id";
private Date created = null; // when the source was created
final public static String created_ = "created";
private Date modified = null; // when the source was last modified
final public static String modified_ = "modified";
private String url = null; // top-level URL (legacy mode; pipeline sources carry URLs per-element, see getRepresentativeUrl)
final public static String url_ = "url";
private String title = null; // human-readable title
final public static String title_ = "title";
private Boolean isPublic = null; // if false then many fields are removed when viewed by non-owners/moderators/admins
final public static String isPublic_ = "isPublic";
private Boolean partiallyPublished = null; // if fields are removed based on isPublic then this is set to true
final public static String partiallyPublished_ = "partiallyPublished";
private ObjectId ownerId = null; // id of the owning user
final public static String ownerId_ = "ownerId";
private String author = null;
final public static String author_ = "author";

private String mediaType = null;
final public static String mediaType_ = "mediaType";
private String key = null; // unique-ish key derived from the URL, see generateSourceKey()
final public static String key_ = "key";
private String description = null;
final public static String description_ = "description";
private Set<String> tags = null; // free-form tags (optionally appended to harvested docs, see appendTagsToDocs)
final public static String tags_ = "tags";

private Set<ObjectId> communityIds = null; // ids of the communities this source belongs to
final public static String communityIds_ = "communityIds";

private boolean isApproved = false; // approval flag (presumably gates harvesting - confirm against harvester code)
final public static String isApproved_ = "isApproved";
private boolean harvestBadSource = false;
final public static String harvestBadSource_ = "harvestBadSource";

private String extractType = null; // (in pipeline mode, copied across from pipeline)
final public static String extractType_ = "extractType";

// SHA-256 of the processing-relevant config, used for source uniqueness - see generateShah256Hash()
// (NOTE: "shah" is a typo for "sha", but the name is persisted in the DB so it must not be renamed)
private String shah256Hash = null;
final public static String shah256Hash_ = "shah256Hash";
// Control fields used everywhere

private Integer searchCycle_secs = null; // Determines the time between searches, defaults as quickly as the harvest can cycle
// (in pipeline mode, copied across from pipeline; a negative value means the source is suspended - see fillInSourcePipelineFields)
final public static String searchCycle_secs_ = "searchCycle_secs";
private Integer distributionFactor; // (in pipeline mode, copied across from pipeline harvest element)
final public static String distributionFactor_ = "distributionFactor";
-
/**
 * Optional per-source filter controlling which entities/associations/fields are written
 * to the search index. The String filters are user-specified; the transient Pattern
 * fields are compiled lazily from them by SourcePojo.initSearchIndexFilter().
 */
public static class SourceSearchIndexFilter {
	public Boolean indexOnIngest = null; // (if specified and false, default:true, then don't index the docs at all)
	public String entityFilter = null; // (regex applied to entity indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String assocFilter = null; // (regex applied to new-line separated association indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String entityGeoFilter = null; // (regex applied to entity indexes if the entity has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String assocGeoFilter = null; // (regex applied to new-line separated association indexes if the association has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String fieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
	public String metadataFieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)

	// temp: compiled forms of the filters above, populated on demand by initSearchIndexFilter()
	public transient Pattern entityFilterRegex;
	public transient Pattern assocFilterRegex;
	public transient Pattern entityGeoFilterRegex;
	public transient Pattern assocGeoFilterRegex;
}
-
// PROCESSING PIPELINE

// Ordered list of processing steps; when non-null it takes precedence over the legacy fields below
private List<SourcePipelinePojo> processingPipeline;
final public static String processingPipeline_ = "processingPipeline";

// LEGACY CODE, IGNORED IN PROCESSING-PIPELINE MODE
private SourceHarvestStatusPojo harvest = null; // harvester-maintained status
final public static String harvest_ = "harvest";
private SourceDatabaseConfigPojo database = null;
final public static String database_ = "database";
private SourceNoSqlConfigPojo nosql = null;
final public static String nosql_ = "nosql";

private SourceFileConfigPojo file = null;
final public static String file_ = "file";
private SourceRssConfigPojo rss = null;
final public static String rss_ = "rss";

private AuthenticationPojo authentication = null; // credentials for protected sources
final public static String authentication_ = "authentication";

private String useExtractor = null; // name of the entity extractor to use
final public static String useExtractor_ = "useExtractor";
private String useTextExtractor = null; // name of the text extractor to use
final public static String useTextExtractor_ = "useTextExtractor";

private StructuredAnalysisConfigPojo structuredAnalysis = null;
final public static String structuredAnalysis_ = "structuredAnalysis";
private UnstructuredAnalysisConfigPojo unstructuredAnalysis = null;
final public static String unstructuredAnalysis_ = "unstructuredAnalysis";

private Integer maxDocs = null; // Limits the number of docs that can be stored for this source at any one time
final public static String maxDocs_ = "maxDocs";
private Integer throttleDocs = null; // Limits the number of docs that can be harvested in one cycle (cannot be higher than system setting in harvest.maxdocs_persource)
final public static String throttleDocs_ = "throttleDocs";
private Boolean duplicateExistingUrls; // If false (defaults: true) will ignore docs harvested by other sources in the community
final public static String duplicateExistingUrls_ = "duplicateExistingUrls";
private Boolean appendTagsToDocs = null; // if true (default) source tags are appended to the document

final public static String appendTagsToDocs_ = "appendTagsToDocs";

private SourceSearchIndexFilter searchIndexFilter = null; // Optional, allows the source builder to configure which fields are searchable
final public static String searchIndexFilter_ = "searchIndexFilter";

// Optional, overrides the per-extractor configuration options, where permissible
// (keys may contain "."s, which MongoDB disallows - hence the custom serializers at the bottom of this file)
private LinkedHashMap<String, String> extractorOptions = null;
final public static String extractorOptions_ = "extractorOptions";
-
- //////////////////////////////////////
-
- // Gets and sets
-
// Simple accessors for the persisted fields above

public AuthenticationPojo getAuthentication() {
	return authentication;
}
public void setAuthentication(AuthenticationPojo authentication) {
	this.authentication = authentication;
}
public SourceFileConfigPojo getFileConfig() {
	return file;
}
public void setFileConfig(SourceFileConfigPojo file) {
	this.file = file;
}
public SourceRssConfigPojo getRssConfig() {
	return rss;
}
public void setRssConfig(SourceRssConfigPojo rss) {
	this.rss = rss;
}
public SourceDatabaseConfigPojo getDatabaseConfig() {
	return database;
}
public void setDatabaseConfig(SourceDatabaseConfigPojo database) {
	this.database = database;
}
public ObjectId getId() {
	return _id;
}
public void setId(ObjectId id) {
	this._id = id;
}
public String getKey() {
	return key;
}
public void setKey(String key) {
	this.key = key;
}
public Date getCreated() {
	return created;
}
public void setCreated(Date created) {
	this.created = created;
}
public Date getModified() {
	return modified;
}
public void setModified(Date modified) {
	this.modified = modified;
}
public String getUrl() {
	return url;
}
public void setUrl(String url) {
	this.url = url;
}
public String getTitle() {
	return title;
}
public void setTitle(String title) {
	this.title = title;
}
public String getDescription() {
	return description;
}
public void setDescription(String description) {
	this.description = description;
}
public String getMediaType() {
	return mediaType;
}
public void setMediaType(String mediaType) {
	this.mediaType = mediaType;
}
public String getExtractType() {
	return extractType;
}
public void setExtractType(String extractType) {
	this.extractType = extractType;
}
/** @return the raw tri-state flag (null when never set) */
public Boolean getIsPublic() {
	return isPublic;
}
/** @return the effective visibility; an unset (null) flag is treated as private */
public boolean isPublic() {
	return (isPublic == null)?false:isPublic; // (ie defaults to false)
}
public void setIsPublic(Boolean isPublic) {
	this.isPublic = isPublic;
}
public void setPublic(boolean isPublic) {
	this.isPublic = isPublic;
}
public String getAuthor() {
	return author;
}
public void setAuthor(String author) {
	this.author = author;
}
/**
 * Get the tags
 */
public Set<String> getTags() {
	return tags;
}
/**
 * Set the tags
 */
public void setTags(Set<String> tags) {
	this.tags = tags;
}

/**
 * @param ownerID the ownerID to set
 */
public void setOwnerId(ObjectId ownerID) {
	this.ownerId = ownerID;
}
/**
 * @return the ownerID
 */
public ObjectId getOwnerId() {
	return ownerId;
}
/** No-arg constructor, required by the Gson/ORM layer; all fields keep their declared defaults. */
public SourcePojo() {

}
public void setHarvestStatus(SourceHarvestStatusPojo harvest) {
	this.harvest = harvest;
}
public SourceHarvestStatusPojo getHarvestStatus() {
	return harvest;
}
public void setApproved(boolean isApproved) {
	this.isApproved = isApproved;
}
public boolean isApproved() {
	return isApproved;
}
/** Adds a community id, lazily allocating the backing set on first use. */
public void addToCommunityIds(ObjectId communityID) {
	if (null == this.communityIds) {
		this.communityIds = new HashSet<ObjectId>();
	}
	this.communityIds.add(communityID);
}
/** Removes a community id; safe no-op when the set was never created. */
public void removeFromCommunityIds(ObjectId communityID) {
	if (null != this.communityIds) {
		this.communityIds.remove(communityID);
	}
}
public Set<ObjectId> getCommunityIds() {
	return communityIds;
}
public void setCommunityIds(Set<ObjectId> ids) {
	communityIds = ids;
}
public void setHarvestBadSource(boolean harvestBadSource) {
	this.harvestBadSource = harvestBadSource;
}
public boolean isHarvestBadSource() {
	return harvestBadSource;
}
/**
 * @param useExtractor the useExtractor to set
 */
public void setUseExtractor(String useExtractor) {
	this.useExtractor = useExtractor;
}
/**
 * @return the useExtractor
 * (note: unconventional getter name, no "get" prefix - kept for caller compatibility)
 */
public String useExtractor() {
	return useExtractor;
}
/**
 * @param useTextExtractor the useTextExtractor to set
 */
public void setUseTextExtractor(String useTextExtractor) {
	this.useTextExtractor = useTextExtractor;
}
/**
 * @return the useTextExtractor
 * (note: unconventional getter name, no "get" prefix - kept for caller compatibility)
 */
public String useTextExtractor() {
	return useTextExtractor;
}
/**
 * @param structuredAnalysis the structuredAnalysis to set
 */
public void setStructuredAnalysisConfig(StructuredAnalysisConfigPojo structuredAnalysis) {
	this.structuredAnalysis = structuredAnalysis;
}
/**
 * @return the structuredAnalysis
 */
public StructuredAnalysisConfigPojo getStructuredAnalysisConfig() {
	return structuredAnalysis;
}

/**
 * @param unstructuredAnalysis the unstructuredAnalysis to set
 */
public void setUnstructuredAnalysisConfig(UnstructuredAnalysisConfigPojo unstructuredAnalysis) {
	this.unstructuredAnalysis = unstructuredAnalysis;
}
/**
 * @return the unstructuredAnalysis
 */
public UnstructuredAnalysisConfigPojo getUnstructuredAnalysisConfig() {
	return unstructuredAnalysis;
}
- /**
- * setShah256Hash - calls generateShah256Hash
- */
- public void generateShah256Hash()
- {
- try
- {
- generateShah256Hash_internal();
- }
- catch (Exception e)
- {
-
- }
- }
- /**
- * getShah256Hash - calls generateShah256Hash if shah256Hash is null
- * @return
- */
- public String getShah256Hash()
- {
- if (null != shah256Hash )
- {
- return shah256Hash;
- }
- else
- {
- try
- {
- generateShah256Hash_internal();
- return shah256Hash;
- }
- catch (Exception e)
- {
- return null;
- }
- }
- }
- // Utility:
-
- /**
- * generateSourceKey
- * Strips out http://, smb:// /, :, etc. from the URL field to generate
- * Example: http://www.ikanow.com/rss -> www.ikanow.com.rss
- */
- public String generateSourceKey()
- {
- String s = getRepresentativeUrl(); // (supports all cases - note we are guaranteed to have a URL by this point)
- if (null == s) {
- return null;
- }
-
- int nIndex = s.indexOf('?');
- final int nMaxLen = 64; // (+24 for the object id, + random other stuff, keeps it in the <100 range)
- if (nIndex >= 0) {
- if (nIndex > nMaxLen) {
- nIndex = nMaxLen; // (ie max length)
- }
- StringBuffer sb = new StringBuffer(s.substring(0, nIndex));
- sb.append(".").append(s.length() - nIndex).append('.').append(Math.abs(s.hashCode()) % 100);
- s = sb.toString();
- }
- else if (s.length() > nMaxLen) {
- s = s.substring(0, nMaxLen);
- }
- //TESTED (urls with and without ?)
-
- s = s.replaceAll("http://|https://|smb://|ftp://|ftps://|file://|[/:+?&(),#]", ".");
- if (s.startsWith(".")) s = s.substring(1);
- return s;
- }
/**
 * generateShah256Hash
 * Combines the required and optional fields of a SourcePojo into a string that is
 * then hashed using SHAH-256 and saved to the SourePojo.shah256Hash field;
 * this value is used to determine source uniqueness
 * (NOTE: the exact concatenation below is part of the hash contract - stored hashes
 *  would all change if any append were reordered or removed, so do not "clean it up")
 * @throws NoSuchAlgorithmException
 * @throws UnsupportedEncodingException
 */
private void generateShah256Hash_internal() throws NoSuchAlgorithmException, UnsupportedEncodingException
{
	// Create StringBuffer with fields to use to establish source *processing* uniqueness
	StringBuffer sb = new StringBuffer();
	// (Note what I mean by "source processing uniqueness" is that, *for a specific doc URL* 2 sources would process it identically)
	// So fields like key,URL,media type,tags,etc aren't included in the hash

	if (null != processingPipeline) { // new processing pipeline contains all the logic that determines a source's processing
		for (SourcePipelinePojo pxPipe: processingPipeline) {
			if ((null == pxPipe.feed) && (null == pxPipe.web)) { // (these are too difficult to pull the URL out of)
				// Temporarily null out the file URL so it doesn't contribute to the hash
				String fileUrl = null;
				if (null != pxPipe.file) {
					fileUrl = pxPipe.file.getUrl();
					pxPipe.file.setUrl(null);
				}
				// (don't bother with DB because its URL is so intertwined with its processing)
				sb.append(new Gson().toJson(pxPipe));
				if (null != fileUrl) {
					pxPipe.file.setUrl(fileUrl);
				} // (stay idempotent)
			}
		}
	}//TESTED
	else { //legacy case

		// Required Fields
		// (NOTE(review): appends the literal string "null" when extractType is unset, and when it
		//  IS set it gets appended a second time just below - redundant-looking but hash-stable)
		sb.append(this.extractType);

		// Optional fields
		if (this.extractType != null) sb.append(this.extractType);
		if (this.useExtractor != null) sb.append(this.useExtractor);
		if (this.useTextExtractor != null) sb.append(this.useTextExtractor);

		// Generate a hash of all the objects using the ORM layer
		SourcePojo newSrc = new SourcePojo();
		newSrc.setId(null); // (in case this is auto set by the c'tor)
		newSrc.setAuthentication(this.authentication);
		newSrc.setDatabaseConfig(this.database);
		newSrc.setFileConfig(this.file);
		// Don't include RSS config since it can contain URLs
		newSrc.setStructuredAnalysisConfig(this.structuredAnalysis);
		newSrc.setUnstructuredAnalysisConfig(this.unstructuredAnalysis);
		sb.append(((BasicDBObject)newSrc.toDb()).toString());

	}//TESTED (legacy)

	// Create MessageDigest and set shah256Hash value
	MessageDigest md = MessageDigest.getInstance("SHA-256");
	md.update(sb.toString().getBytes("UTF-8"));
	shah256Hash = Base64.encodeBase64String(md.digest());
}
public Integer getSearchCycle_secs() {
	return searchCycle_secs;
}
public void setSearchCycle_secs(Integer searchCycle_secs) {
	this.searchCycle_secs = searchCycle_secs;
}
public void setMaxDocs(Integer maxDocs) {
	this.maxDocs = maxDocs;
}
public Integer getMaxDocs() {
	return maxDocs;
}
/** Latches the transient reachedMaxDocs flag (see field comment below); never cleared here. */
public void setReachedMaxDocs() {
	this.reachedMaxDocs = true;
}
public boolean reachedMaxDocs() {
	return reachedMaxDocs;
}
public void setDuplicateExistingUrls(Boolean duplicateExistingUrls) {
	this.duplicateExistingUrls = duplicateExistingUrls;
}
public boolean getDuplicateExistingUrls() { // (defaults to true)
	return duplicateExistingUrls == null ? true : duplicateExistingUrls;
}
/** Returns the filter, first (lazily) compiling its regex Patterns via initSearchIndexFilter. */
public SourceSearchIndexFilter getSearchIndexFilter() {
	initSearchIndexFilter(searchIndexFilter);
	return searchIndexFilter;
}
public void setSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
	this.searchIndexFilter = searchIndexFilter;
}
///////////////////////////////////////////////////////////////////////////////////

// Transient state (implementation details) - not persisted to the DB

transient private boolean reachedMaxDocs = false;
// (if set to true, means that the next search cycle won't be applied - otherwise if you only search once per day
// and only process 5K docs/search, it can take a while to build up large repositories)

private transient Set<Integer> distributionTokens; // (temporary internal state for managing intra-source distribution)

// Build some regexes:
- public static void initSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
- if (null != searchIndexFilter) { // Initialize regex
- if ((null != searchIndexFilter.assocFilter) && (null == searchIndexFilter.assocFilterRegex)) {
- if (searchIndexFilter.assocFilter.startsWith("+") || searchIndexFilter.assocFilter.startsWith("-")) {
- searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- else {
- searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- }
- if ((null != searchIndexFilter.assocGeoFilter) && (null == searchIndexFilter.assocGeoFilterRegex)) {
- if (searchIndexFilter.assocGeoFilter.startsWith("+") || searchIndexFilter.assocGeoFilter.startsWith("-")) {
- searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- else {
- searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
- }
- }
- if ((null != searchIndexFilter.entityFilter) && (null == searchIndexFilter.entityFilterRegex)) {
- if (searchIndexFilter.entityFilter.startsWith("+") || searchIndexFilter.entityFilter.startsWith("-")) {
- searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter.substring(1), Pattern.CASE_INSENSITIVE);
- }
- else {
- searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter, Pattern.CASE_INSENSITIVE);
- }
- }
- if ((null != searchIndexFilter.entityGeoFilter) && (null == searchIndexFilter.entityGeoFilterRegex)) {
- if (searchIndexFilter.entityGeoFilter.startsWith("+") || searchIndexFilter.entityGeoFilter.startsWith("-")) {
- searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter.substring(1), Pattern.CASE_INSENSITIVE);
- }
- else {
- searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter, Pattern.CASE_INSENSITIVE);
- }
- }
- } // (end if search filter specified)
- }//(end initialize search filter)
public void setExtractorOptions(LinkedHashMap<String, String> extractorOptions) {
	this.extractorOptions = extractorOptions;
}
public LinkedHashMap<String, String> getExtractorOptions() {
	return extractorOptions;
}
//TESTED
public void setProcessingPipeline(List<SourcePipelinePojo> processingPipeline) {
	this.processingPipeline = processingPipeline;
}
public List<SourcePipelinePojo> getProcessingPipeline() {
	return processingPipeline;
}
public void setAppendTagsToDocs(Boolean appendTagsToDocs) {
	this.appendTagsToDocs = appendTagsToDocs;
}
public Boolean getAppendTagsToDocs() {
	return appendTagsToDocs;
}
public void setNoSql(SourceNoSqlConfigPojo noSql) {
	this.nosql = noSql;
}
public SourceNoSqlConfigPojo getNoSql() {
	return nosql;
}
public void setDistributionFactor(Integer distributionFactor) {
	this.distributionFactor = distributionFactor;
}
public Integer getDistributionFactor() {
	return distributionFactor;
}
// (distributionTokens is transient working state, not persisted config)
public void setDistributionTokens(Set<Integer> distributionTokens) {
	this.distributionTokens = distributionTokens;
}
public Set<Integer> getDistributionTokens() {
	return distributionTokens;
}
public void setThrottleDocs(Integer throttleDocs) {
	this.throttleDocs = throttleDocs;
}
public Integer getThrottleDocs() {
	return throttleDocs;
}
///////////////////////////////////////////////////////////////////

// Serialization/deserialization utils:
// (Ugh needed because extractorOptions keys can contain "."s)

/**
 * Registers this class's custom Gson (de)serializers, which escape/unescape "."s
 * in map keys for DB storage (presumably a BaseDbPojo extension hook - confirm).
 * Note both adapters are registered for the same type: Gson keeps the serializer
 * and deserializer halves separately, so both take effect.
 * @param gp the builder to extend
 * @return the same builder, for chaining
 */
public GsonBuilder extendBuilder(GsonBuilder gp) {
	return gp.registerTypeAdapter(SourcePojo.class, new SourcePojoDeserializer()).
		registerTypeAdapter(SourcePojo.class, new SourcePojoSerializer());
}
-
/**
 * Custom Gson deserializer: delegates to the default builder, then walks the result
 * decoding "%2e" back to "." in map keys that may legally contain dots
 * (extractorOptions, web/feed httpFields, feature/text engineConfig).
 */
protected static class SourcePojoDeserializer implements JsonDeserializer<SourcePojo>
{
	@Override
	public SourcePojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
	{
		// Default deserialization first (avoids infinite recursion into this adapter)
		SourcePojo src = BaseDbPojo.getDefaultBuilder().create().fromJson(json, SourcePojo.class);
		if (null != src.extractorOptions) {
			src.extractorOptions = decodeKeysForDatabaseStorage(src.extractorOptions);
		}
		if (null != src.processingPipeline) {
			for (SourcePipelinePojo pxPipe: src.processingPipeline) {
				if ((null != pxPipe.web) || (null != pxPipe.feed)) {
					SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
					if (null != webOrFeed.getHttpFields()) {
						webOrFeed.setHttpFields(decodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
					}
				}//TESTED (added httpFields by hand)
				// (don't do lookup tables, "."s aren't allowed in their keys)
				if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
					pxPipe.featureEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
				}//TESTED (basic_web_test_ocOptions)
				if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
					pxPipe.textEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
				}//TESTED (c/p basic_web_test_ocOptions)
			}
		}
		return src;
	}//TESTED (with and without extractor options)
}
-
/**
 * Custom Gson serializer: encodes "." as "%2e" in the same map keys the deserializer
 * decodes, then delegates to the default builder.
 * NOTE(review): the encoding mutates {@code src} IN PLACE, so after serialization the
 * live object holds encoded keys (re-encoding is a no-op since "%2e" contains no ".",
 * but in-memory readers of those maps would see encoded keys) - confirm callers expect this.
 */
protected static class SourcePojoSerializer implements JsonSerializer<SourcePojo>
{
	@Override
	public JsonElement serialize(SourcePojo src, Type typeOfT, JsonSerializationContext context) throws JsonParseException
	{
		if (null != src.extractorOptions) {
			src.extractorOptions = encodeKeysForDatabaseStorage(src.extractorOptions);
		}
		if (null != src.processingPipeline) {
			for (SourcePipelinePojo pxPipe: src.processingPipeline) {
				if ((null != pxPipe.web) || (null != pxPipe.feed)) {
					SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
					if (null != webOrFeed.getHttpFields()) {
						webOrFeed.setHttpFields(encodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
					}
				}//TESTED (added httpFields by hand)
				// (don't do lookup tables, "."s aren't allowed in their keys)
				if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
					pxPipe.featureEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
				}//TESTED (basic_web_test_ocOptions)
				if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
					pxPipe.textEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
				}//TESTED (c/p basic_web_test_ocOptions)
			}
		}
		// GSON transformation:
		JsonElement je = SourcePojo.getDefaultBuilder().create().toJsonTree(src, typeOfT);

		return je;
	}//TESTED (with and without extractor options)
}
- // Utilities for handling processing pipeline
-
- // Decode/Encode utilities
-
- private static LinkedHashMap<String, String> decodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
- LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
- for (Map.Entry<String, String> entry: in.entrySet()) {
- transformed.put(entry.getKey().replace("%2e", "."), entry.getValue());
- }
- return transformed;
- }//TESTED (legacy)
- private static LinkedHashMap<String, String> encodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
- LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
- for (Map.Entry<String, String> entry: in.entrySet()) {
- transformed.put(entry.getKey().replace(".", "%2e"), entry.getValue());
- }
- return transformed;
- }//TESTED (legacy)

// Extracts the first bucket/host/url/uri/path value from a logstash config block, eg: host => "foo"
//(ugh need to store this logstash-domain-specific information here, might need to update it from time to time but should remain reasonably simple)
private static Pattern _getLogstashUrlRegex = Pattern.compile("(?:bucket|host|url|uri|path)[\\s\\n\\r]*=>[\\s\\n\\r]*['\"]([^'\"]+)", Pattern.CASE_INSENSITIVE);
-
- public String getRepresentativeUrl() {
- if (null == this.getProcessingPipeline()) {
- if (null != this.getUrl()) {
- return this.getUrl();
- }
- else if ((null != this.getRssConfig()) && (null != this.getRssConfig().getExtraUrls()) && !this.getRssConfig().getExtraUrls().isEmpty()) {
- return this.getRssConfig().getExtraUrls().get(0).url;
- }
- }
- else if (!this.getProcessingPipeline().isEmpty()) {
- SourcePipelinePojo px = this.getProcessingPipeline().get(0);
- if (null != px.file) {
- return px.file.getUrl();
- }
- else if (null != px.database) {
- return px.database.getUrl();
- }
- else if (null != px.logstash) {
- String url = null;
- try {
- Matcher m1 = _getLogstashUrlRegex.matcher(px.logstash.config);
- if (m1.find()) { // (get the first)
- url = m1.group(1);
- }
- }
- catch (Exception e) {} // return null will error out
- return url;
- }
- else {
- SourceRssConfigPojo webOrFeed = px.feed;
- if (null == webOrFeed) {
- webOrFeed = px.web;
- }
- if ((null != webOrFeed) && (null != webOrFeed.getExtraUrls()) && !webOrFeed.getExtraUrls().isEmpty()) {
- return webOrFeed.getExtraUrls().get(0).url;
- }
- }
- }
- return null;
- }//TESTED (legacy+basic_web_test_ocOptions)
-
/**
 * For pipeline sources, derives the top-level extractType/searchCycle_secs/
 * distributionFactor fields from the pipeline elements. extractType is overwritten
 * as the loop walks the pipeline, so it reflects the last input element seen before
 * the loop exits; the loop stops at the first element carrying a harvest block.
 * No-op for legacy (non-pipeline) sources.
 */
public void fillInSourcePipelineFields() {
	if (null != this.getProcessingPipeline()) {
		this.extractType = null; // always derive from the px pipeline, ignore user input

		for (SourcePipelinePojo px: this.getProcessingPipeline()) {
			if (null != px.file) {
				this.extractType = "File";
			}
			else if (null != px.database) {
				this.extractType = "Database";
			}
			else if (null != px.logstash) {
				this.extractType = "Logstash";
			}
			else if ((null != px.web) || (null != px.feed)) {
				this.extractType = "Feed";
			}
			if (null != px.harvest) {
				if (null != px.harvest.searchCycle_secs) {
					// Preserve the sign of any existing suspended state when copying the cycle across
					if ((null == searchCycle_secs) || (searchCycle_secs > 0)) {
						searchCycle_secs = Math.abs(px.harvest.searchCycle_secs);
					}
					else { // (searchCycle_secs < 0 ie want to suspend source)
						searchCycle_secs = -Math.abs(px.harvest.searchCycle_secs);
					}
					// (note: distributionFactor is only copied when searchCycle_secs is specified)
					distributionFactor = px.harvest.distributionFactor;
				}//TESTED
				else if ((null != searchCycle_secs) && (searchCycle_secs < 0)) {
					// No search cycle specified, source suspended
					searchCycle_secs = -1;
				}//TESTED
				else { // No search cycle specified and source not suspended
					searchCycle_secs = null;
				}//TESTED
				break; // (only the first harvest element applies)
			}
		}
	}//TESTED
}
/** @return whether fields were stripped from this (non-public) source when viewed by a non-owner (see isPublic) */
public Boolean getPartiallyPublished() {
	return partiallyPublished;
}
public void setPartiallyPublished(Boolean partiallyPublished) {
	this.partiallyPublished = partiallyPublished;
}
- }