
/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/store/config/source/SourcePojo.java

https://github.com/IKANOW/Infinit.e
Java | 840 lines | 644 code | 79 blank | 117 comment | 163 complexity
Possible License(s): BSD-3-Clause
/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.data_model.store.config.source;

import java.io.UnsupportedEncodingException;
import java.lang.reflect.Type;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;
import org.bson.types.ObjectId;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonParseException;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import com.google.gson.reflect.TypeToken;

import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
import com.ikanow.infinit.e.data_model.store.social.authentication.AuthenticationPojo;
import com.mongodb.BasicDBObject;
/**
 * Establishes the source information for a feed: defines the data necessary
 * to create a feed in the system.
 *
 * @author cmorgan
 */
public class SourcePojo extends BaseDbPojo {
    // Standard static function for readability
    @SuppressWarnings("unchecked")
    static public TypeToken<List<SourcePojo>> listType() { return new TypeToken<List<SourcePojo>>(){}; }

    /**
     * Private Class Variables
     */

    // Metadata fields
    private ObjectId _id = null;
    final public static String _id_ = "_id";
    private Date created = null;
    final public static String created_ = "created";
    private Date modified = null;
    final public static String modified_ = "modified";
    private String url = null;
    final public static String url_ = "url";
    private String title = null;
    final public static String title_ = "title";
    private Boolean isPublic = null; // if false then many fields are removed when viewed by non-owners/moderators/admins
    final public static String isPublic_ = "isPublic";
    private Boolean partiallyPublished = null; // if fields are removed based on isPublic then this is set to true
    final public static String partiallyPublished_ = "partiallyPublished";
    private ObjectId ownerId = null;
    final public static String ownerId_ = "ownerId";
    private String author = null;
    final public static String author_ = "author";
    private String mediaType = null;
    final public static String mediaType_ = "mediaType";
    private String key = null;
    final public static String key_ = "key";
    private String description = null;
    final public static String description_ = "description";
    private Set<String> tags = null;
    final public static String tags_ = "tags";
    private Set<ObjectId> communityIds = null;
    final public static String communityIds_ = "communityIds";
    private boolean isApproved = false;
    final public static String isApproved_ = "isApproved";
    private boolean harvestBadSource = false;
    final public static String harvestBadSource_ = "harvestBadSource";
    private String extractType = null; // (in pipeline mode, copied across from pipeline)
    final public static String extractType_ = "extractType";
    private String shah256Hash = null;
    final public static String shah256Hash_ = "shah256Hash";

    // Control fields used everywhere
    private Integer searchCycle_secs = null; // Determines the time between searches; defaults to as fast as the harvest can cycle
                                             // (in pipeline mode, copied across from pipeline)
    final public static String searchCycle_secs_ = "searchCycle_secs";
    private Integer distributionFactor;
    final public static String distributionFactor_ = "distributionFactor";

    public static class SourceSearchIndexFilter {
        public Boolean indexOnIngest = null; // (if specified and false, default:true, then don't index the docs at all)
        public String entityFilter = null; // (regex applied to entity indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String assocFilter = null; // (regex applied to new-line separated association indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String entityGeoFilter = null; // (regex applied to entity indexes if the entity has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String assocGeoFilter = null; // (regex applied to new-line separated association indexes if the association has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String fieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String metadataFieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        // temp:
        public transient Pattern entityFilterRegex;
        public transient Pattern assocFilterRegex;
        public transient Pattern entityGeoFilterRegex;
        public transient Pattern assocGeoFilterRegex;
    }
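    // Illustrative filter usage: entityFilter = "-(?i)spam.*" excludes any entity index matching the
    // regex, while "+(?i)cyber.*" (or no prefix) keeps only matching entries; the patterns are
    // compiled lazily by initSearchIndexFilter() below.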
    // PROCESSING PIPELINE
    private List<SourcePipelinePojo> processingPipeline;
    final public static String processingPipeline_ = "processingPipeline";

    // LEGACY CODE, IGNORED IN PROCESSING-PIPELINE MODE
    private SourceHarvestStatusPojo harvest = null;
    final public static String harvest_ = "harvest";
    private SourceDatabaseConfigPojo database = null;
    final public static String database_ = "database";
    private SourceNoSqlConfigPojo nosql = null;
    final public static String nosql_ = "nosql";
    private SourceFileConfigPojo file = null;
    final public static String file_ = "file";
    private SourceRssConfigPojo rss = null;
    final public static String rss_ = "rss";
    private AuthenticationPojo authentication = null;
    final public static String authentication_ = "authentication";
    private String useExtractor = null;
    final public static String useExtractor_ = "useExtractor";
    private String useTextExtractor = null;
    final public static String useTextExtractor_ = "useTextExtractor";
    private StructuredAnalysisConfigPojo structuredAnalysis = null;
    final public static String structuredAnalysis_ = "structuredAnalysis";
    private UnstructuredAnalysisConfigPojo unstructuredAnalysis = null;
    final public static String unstructuredAnalysis_ = "unstructuredAnalysis";
    private Integer maxDocs = null; // Limits the number of docs that can be stored for this source at any one time
    final public static String maxDocs_ = "maxDocs";
    private Integer throttleDocs = null; // Limits the number of docs that can be harvested in one cycle (cannot be higher than system setting in harvest.maxdocs_persource)
    final public static String throttleDocs_ = "throttleDocs";
    private Boolean duplicateExistingUrls; // If false (default: true), ignores docs already harvested by other sources in the community
    final public static String duplicateExistingUrls_ = "duplicateExistingUrls";
    private Boolean appendTagsToDocs = null; // if true (default) source tags are appended to the document
    final public static String appendTagsToDocs_ = "appendTagsToDocs";
    private SourceSearchIndexFilter searchIndexFilter = null; // Optional, allows the source builder to configure which fields are searchable
    final public static String searchIndexFilter_ = "searchIndexFilter";
    private LinkedHashMap<String, String> extractorOptions = null; // Optional, overrides the per-extractor configuration options, where permissible
    final public static String extractorOptions_ = "extractorOptions";
    //////////////////////////////////////
    // Gets and sets

    public AuthenticationPojo getAuthentication() {
        return authentication;
    }
    public void setAuthentication(AuthenticationPojo authentication) {
        this.authentication = authentication;
    }
    public SourceFileConfigPojo getFileConfig() {
        return file;
    }
    public void setFileConfig(SourceFileConfigPojo file) {
        this.file = file;
    }
    public SourceRssConfigPojo getRssConfig() {
        return rss;
    }
    public void setRssConfig(SourceRssConfigPojo rss) {
        this.rss = rss;
    }
    public SourceDatabaseConfigPojo getDatabaseConfig() {
        return database;
    }
    public void setDatabaseConfig(SourceDatabaseConfigPojo database) {
        this.database = database;
    }
    public ObjectId getId() {
        return _id;
    }
    public void setId(ObjectId id) {
        this._id = id;
    }
    public String getKey() {
        return key;
    }
    public void setKey(String key) {
        this.key = key;
    }
    public Date getCreated() {
        return created;
    }
    public void setCreated(Date created) {
        this.created = created;
    }
    public Date getModified() {
        return modified;
    }
    public void setModified(Date modified) {
        this.modified = modified;
    }
    public String getUrl() {
        return url;
    }
    public void setUrl(String url) {
        this.url = url;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getDescription() {
        return description;
    }
    public void setDescription(String description) {
        this.description = description;
    }
    public String getMediaType() {
        return mediaType;
    }
    public void setMediaType(String mediaType) {
        this.mediaType = mediaType;
    }
    public String getExtractType() {
        return extractType;
    }
    public void setExtractType(String extractType) {
        this.extractType = extractType;
    }
    public Boolean getIsPublic() {
        return isPublic;
    }
    public boolean isPublic() {
        return (isPublic == null) ? false : isPublic; // (ie defaults to false)
    }
    public void setIsPublic(Boolean isPublic) {
        this.isPublic = isPublic;
    }
    public void setPublic(boolean isPublic) {
        this.isPublic = isPublic;
    }
    public String getAuthor() {
        return author;
    }
    public void setAuthor(String author) {
        this.author = author;
    }
    /**
     * Get the tags
     */
    public Set<String> getTags() {
        return tags;
    }
    /**
     * Set the tags
     */
    public void setTags(Set<String> tags) {
        this.tags = tags;
    }
    /**
     * @param ownerID the ownerID to set
     */
    public void setOwnerId(ObjectId ownerID) {
        this.ownerId = ownerID;
    }
    /**
     * @return the ownerID
     */
    public ObjectId getOwnerId() {
        return ownerId;
    }
    public SourcePojo() {
    }
    public void setHarvestStatus(SourceHarvestStatusPojo harvest) {
        this.harvest = harvest;
    }
    public SourceHarvestStatusPojo getHarvestStatus() {
        return harvest;
    }
    public void setApproved(boolean isApproved) {
        this.isApproved = isApproved;
    }
    public boolean isApproved() {
        return isApproved;
    }
    public void addToCommunityIds(ObjectId communityID) {
        if (null == this.communityIds) {
            this.communityIds = new HashSet<ObjectId>();
        }
        this.communityIds.add(communityID);
    }
    public void removeFromCommunityIds(ObjectId communityID) {
        if (null != this.communityIds) {
            this.communityIds.remove(communityID);
        }
    }
    public Set<ObjectId> getCommunityIds() {
        return communityIds;
    }
    public void setCommunityIds(Set<ObjectId> ids) {
        communityIds = ids;
    }
    public void setHarvestBadSource(boolean harvestBadSource) {
        this.harvestBadSource = harvestBadSource;
    }
    public boolean isHarvestBadSource() {
        return harvestBadSource;
    }
    /**
     * @param useExtractor the useExtractor to set
     */
    public void setUseExtractor(String useExtractor) {
        this.useExtractor = useExtractor;
    }
    /**
     * @return the useExtractor
     */
    public String useExtractor() {
        return useExtractor;
    }
    /**
     * @param useTextExtractor the useTextExtractor to set
     */
    public void setUseTextExtractor(String useTextExtractor) {
        this.useTextExtractor = useTextExtractor;
    }
    /**
     * @return the useTextExtractor
     */
    public String useTextExtractor() {
        return useTextExtractor;
    }
    /**
     * @param structuredAnalysis the structuredAnalysis to set
     */
    public void setStructuredAnalysisConfig(StructuredAnalysisConfigPojo structuredAnalysis) {
        this.structuredAnalysis = structuredAnalysis;
    }
    /**
     * @return the structuredAnalysis
     */
    public StructuredAnalysisConfigPojo getStructuredAnalysisConfig() {
        return structuredAnalysis;
    }
    /**
     * @param unstructuredAnalysis the unstructuredAnalysis to set
     */
    public void setUnstructuredAnalysisConfig(UnstructuredAnalysisConfigPojo unstructuredAnalysis) {
        this.unstructuredAnalysis = unstructuredAnalysis;
    }
    /**
     * @return the unstructuredAnalysis
     */
    public UnstructuredAnalysisConfigPojo getUnstructuredAnalysisConfig() {
        return unstructuredAnalysis;
    }
    /**
     * generateShah256Hash - calls generateShah256Hash_internal, swallowing any exception
     */
    public void generateShah256Hash()
    {
        try
        {
            generateShah256Hash_internal();
        }
        catch (Exception e)
        {
        }
    }
    /**
     * getShah256Hash - calls generateShah256Hash_internal if shah256Hash is null
     * @return the hash, or null if it could not be generated
     */
    public String getShah256Hash()
    {
        if (null != shah256Hash)
        {
            return shah256Hash;
        }
        else
        {
            try
            {
                generateShah256Hash_internal();
                return shah256Hash;
            }
            catch (Exception e)
            {
                return null;
            }
        }
    }
    // Utility:

    /**
     * generateSourceKey
     * Strips out http://, smb://, "/", ":", etc. from the URL field to generate a key
     * Example: http://www.ikanow.com/rss -> www.ikanow.com.rss
     */
    public String generateSourceKey()
    {
        String s = getRepresentativeUrl(); // (supports all cases - note we are guaranteed to have a URL by this point)
        if (null == s) {
            return null;
        }
        int nIndex = s.indexOf('?');
        final int nMaxLen = 64; // (+24 for the object id, + random other stuff, keeps it in the <100 range)
        if (nIndex >= 0) {
            if (nIndex > nMaxLen) {
                nIndex = nMaxLen; // (ie max length)
            }
            StringBuffer sb = new StringBuffer(s.substring(0, nIndex));
            sb.append(".").append(s.length() - nIndex).append('.').append(Math.abs(s.hashCode()) % 100);
            s = sb.toString();
        }
        else if (s.length() > nMaxLen) {
            s = s.substring(0, nMaxLen);
        }
        //TESTED (urls with and without ?)
        s = s.replaceAll("http://|https://|smb://|ftp://|ftps://|file://|[/:+?&(),#]", ".");
        if (s.startsWith(".")) s = s.substring(1);
        return s;
    }
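    // Illustrative example of the query-string branch: the key keeps the (truncated) prefix and
    // appends the length of the remainder plus a 2-digit hash, so http://www.ikanow.com/rss?type=a
    // becomes "www.ikanow.com.rss.7.NN" (where NN = Math.abs(hashCode()) % 100 of the full URL).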
    /**
     * generateShah256Hash
     * Combines the required and optional fields of a SourcePojo into a string that is
     * then hashed using SHA-256 and saved to the SourcePojo.shah256Hash field;
     * this value is used to determine source uniqueness
     * @throws NoSuchAlgorithmException
     * @throws UnsupportedEncodingException
     */
    private void generateShah256Hash_internal() throws NoSuchAlgorithmException, UnsupportedEncodingException
    {
        // Create StringBuffer with fields to use to establish source *processing* uniqueness
        StringBuffer sb = new StringBuffer();
        // (Note what I mean by "source processing uniqueness" is that, *for a specific doc URL*, 2 sources would process it identically)
        // So fields like key, URL, media type, tags, etc aren't included in the hash

        if (null != processingPipeline) { // new processing pipeline contains all the logic that determines a source's processing
            for (SourcePipelinePojo pxPipe: processingPipeline) {
                if ((null == pxPipe.feed) && (null == pxPipe.web)) { // (these are too difficult to pull the URL out of)
                    String fileUrl = null;
                    if (null != pxPipe.file) {
                        fileUrl = pxPipe.file.getUrl();
                        pxPipe.file.setUrl(null);
                    }
                    // (don't bother with DB because its URL is so intertwined with its processing)
                    sb.append(new Gson().toJson(pxPipe));
                    if (null != fileUrl) {
                        pxPipe.file.setUrl(fileUrl);
                    } // (stay idempotent)
                }
            }
        }//TESTED
        else { //legacy case
            // Required Fields
            sb.append(this.extractType);
            // Optional fields
            if (this.extractType != null) sb.append(this.extractType);
            if (this.useExtractor != null) sb.append(this.useExtractor);
            if (this.useTextExtractor != null) sb.append(this.useTextExtractor);
            // Generate a hash of all the objects using the ORM layer
            SourcePojo newSrc = new SourcePojo();
            newSrc.setId(null); // (in case this is auto set by the c'tor)
            newSrc.setAuthentication(this.authentication);
            newSrc.setDatabaseConfig(this.database);
            newSrc.setFileConfig(this.file);
            // Don't include RSS config since it can contain URLs
            newSrc.setStructuredAnalysisConfig(this.structuredAnalysis);
            newSrc.setUnstructuredAnalysisConfig(this.unstructuredAnalysis);
            sb.append(((BasicDBObject)newSrc.toDb()).toString());
        }//TESTED (legacy)

        // Create MessageDigest and set shah256Hash value
        MessageDigest md = MessageDigest.getInstance("SHA-256");
        md.update(sb.toString().getBytes("UTF-8"));
        shah256Hash = Base64.encodeBase64String(md.digest());
    }
    public Integer getSearchCycle_secs() {
        return searchCycle_secs;
    }
    public void setSearchCycle_secs(Integer searchCycle_secs) {
        this.searchCycle_secs = searchCycle_secs;
    }
    public void setMaxDocs(Integer maxDocs) {
        this.maxDocs = maxDocs;
    }
    public Integer getMaxDocs() {
        return maxDocs;
    }
    public void setReachedMaxDocs() {
        this.reachedMaxDocs = true;
    }
    public boolean reachedMaxDocs() {
        return reachedMaxDocs;
    }
    public void setDuplicateExistingUrls(Boolean duplicateExistingUrls) {
        this.duplicateExistingUrls = duplicateExistingUrls;
    }
    public boolean getDuplicateExistingUrls() { // (defaults to true)
        return duplicateExistingUrls == null ? true : duplicateExistingUrls;
    }
    public SourceSearchIndexFilter getSearchIndexFilter() {
        initSearchIndexFilter(searchIndexFilter);
        return searchIndexFilter;
    }
    public void setSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
        this.searchIndexFilter = searchIndexFilter;
    }

    ///////////////////////////////////////////////////////////////////////////////////
    // Transient state (implementation details)

    transient private boolean reachedMaxDocs = false;
    // (if set to true, means that the next search cycle won't be applied - otherwise if you only search once per day
    //  and only process 5K docs/search, it can take a while to build up large repositories)
    private transient Set<Integer> distributionTokens; // (temporary internal state for managing intra-source distribution)
    // Build some regexes:
    public static void initSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
        if (null != searchIndexFilter) { // Initialize regex
            if ((null != searchIndexFilter.assocFilter) && (null == searchIndexFilter.assocFilterRegex)) {
                if (searchIndexFilter.assocFilter.startsWith("+") || searchIndexFilter.assocFilter.startsWith("-")) {
                    searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
                }
                else {
                    searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
                }
            }
            if ((null != searchIndexFilter.assocGeoFilter) && (null == searchIndexFilter.assocGeoFilterRegex)) {
                if (searchIndexFilter.assocGeoFilter.startsWith("+") || searchIndexFilter.assocGeoFilter.startsWith("-")) {
                    searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter.substring(1), Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
                }
                else {
                    searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter, Pattern.CASE_INSENSITIVE|Pattern.DOTALL|Pattern.MULTILINE);
                }
            }
            if ((null != searchIndexFilter.entityFilter) && (null == searchIndexFilter.entityFilterRegex)) {
                if (searchIndexFilter.entityFilter.startsWith("+") || searchIndexFilter.entityFilter.startsWith("-")) {
                    searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter.substring(1), Pattern.CASE_INSENSITIVE);
                }
                else {
                    searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter, Pattern.CASE_INSENSITIVE);
                }
            }
            if ((null != searchIndexFilter.entityGeoFilter) && (null == searchIndexFilter.entityGeoFilterRegex)) {
                if (searchIndexFilter.entityGeoFilter.startsWith("+") || searchIndexFilter.entityGeoFilter.startsWith("-")) {
                    searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter.substring(1), Pattern.CASE_INSENSITIVE);
                }
                else {
                    searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter, Pattern.CASE_INSENSITIVE);
                }
            }
        } // (end if search filter specified)
    }//(end initialize search filter)
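    // (Note the association filters are compiled with DOTALL|MULTILINE since they are applied to
    //  new-line separated association indexes, whereas the entity filters only need CASE_INSENSITIVE;
    //  the leading "+"/"-" include/exclude marker is stripped before compilation.)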
    public void setExtractorOptions(LinkedHashMap<String, String> extractorOptions) {
        this.extractorOptions = extractorOptions;
    }
    public LinkedHashMap<String, String> getExtractorOptions() {
        return extractorOptions;
    }
    //TESTED
    public void setProcessingPipeline(List<SourcePipelinePojo> processingPipeline) {
        this.processingPipeline = processingPipeline;
    }
    public List<SourcePipelinePojo> getProcessingPipeline() {
        return processingPipeline;
    }
    public void setAppendTagsToDocs(Boolean appendTagsToDocs) {
        this.appendTagsToDocs = appendTagsToDocs;
    }
    public Boolean getAppendTagsToDocs() {
        return appendTagsToDocs;
    }
    public void setNoSql(SourceNoSqlConfigPojo noSql) {
        this.nosql = noSql;
    }
    public SourceNoSqlConfigPojo getNoSql() {
        return nosql;
    }
    public void setDistributionFactor(Integer distributionFactor) {
        this.distributionFactor = distributionFactor;
    }
    public Integer getDistributionFactor() {
        return distributionFactor;
    }
    public void setDistributionTokens(Set<Integer> distributionTokens) {
        this.distributionTokens = distributionTokens;
    }
    public Set<Integer> getDistributionTokens() {
        return distributionTokens;
    }
    public void setThrottleDocs(Integer throttleDocs) {
        this.throttleDocs = throttleDocs;
    }
    public Integer getThrottleDocs() {
        return throttleDocs;
    }
    ///////////////////////////////////////////////////////////////////
    // Serialization/deserialization utils:
    // (Ugh needed because extractorOptions keys can contain "."s)

    public GsonBuilder extendBuilder(GsonBuilder gp) {
        return gp.registerTypeAdapter(SourcePojo.class, new SourcePojoDeserializer()).
                    registerTypeAdapter(SourcePojo.class, new SourcePojoSerializer());
    }
    protected static class SourcePojoDeserializer implements JsonDeserializer<SourcePojo>
    {
        @Override
        public SourcePojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
        {
            SourcePojo src = BaseDbPojo.getDefaultBuilder().create().fromJson(json, SourcePojo.class);
            if (null != src.extractorOptions) {
                src.extractorOptions = decodeKeysForDatabaseStorage(src.extractorOptions);
            }
            if (null != src.processingPipeline) {
                for (SourcePipelinePojo pxPipe: src.processingPipeline) {
                    if ((null != pxPipe.web) || (null != pxPipe.feed)) {
                        SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
                        if (null != webOrFeed.getHttpFields()) {
                            webOrFeed.setHttpFields(decodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
                        }
                    }//TESTED (added httpFields by hand)
                    // (don't do lookup tables, "."s aren't allowed in their keys)
                    if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
                        pxPipe.featureEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
                    }//TESTED (basic_web_test_ocOptions)
                    if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
                        pxPipe.textEngine.engineConfig = decodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
                    }//TESTED (c/p basic_web_test_ocOptions)
                }
            }
            return src;
        }//TESTED (with and without extractor options)
    }
    protected static class SourcePojoSerializer implements JsonSerializer<SourcePojo>
    {
        @Override
        public JsonElement serialize(SourcePojo src, Type typeOfT, JsonSerializationContext context) throws JsonParseException
        {
            if (null != src.extractorOptions) {
                src.extractorOptions = encodeKeysForDatabaseStorage(src.extractorOptions);
            }
            if (null != src.processingPipeline) {
                for (SourcePipelinePojo pxPipe: src.processingPipeline) {
                    if ((null != pxPipe.web) || (null != pxPipe.feed)) {
                        SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
                        if (null != webOrFeed.getHttpFields()) {
                            webOrFeed.setHttpFields(encodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
                        }
                    }//TESTED (added httpFields by hand)
                    // (don't do lookup tables, "."s aren't allowed in their keys)
                    if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
                        pxPipe.featureEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.featureEngine.engineConfig);
                    }//TESTED (basic_web_test_ocOptions)
                    if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
                        pxPipe.textEngine.engineConfig = encodeKeysForDatabaseStorage(pxPipe.textEngine.engineConfig);
                    }//TESTED (c/p basic_web_test_ocOptions)
                }
            }
            // GSON transformation:
            JsonElement je = SourcePojo.getDefaultBuilder().create().toJsonTree(src, typeOfT);
            return je;
        }//TESTED (with and without extractor options)
    }

    // Utilities for handling processing pipeline
    // Decode/Encode utilities
    private static LinkedHashMap<String, String> decodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
        LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
        for (Map.Entry<String, String> entry: in.entrySet()) {
            transformed.put(entry.getKey().replace("%2e", "."), entry.getValue());
        }
        return transformed;
    }//TESTED (legacy)
    private static LinkedHashMap<String, String> encodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
        LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
        for (Map.Entry<String, String> entry: in.entrySet()) {
            transformed.put(entry.getKey().replace(".", "%2e"), entry.getValue());
        }
        return transformed;
    }//TESTED (legacy)
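    // (Illustrative example: an extractorOptions key such as "a.dotted.key" is written to the
    //  database as "a%2edotted%2ekey", since "." is not a legal character in MongoDB field names,
    //  and decoded back to the original form on deserialization.)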
    //(ugh need to store this logstash-domain-specific information here, might need to update it from time to time but should remain reasonably simple)
    private static Pattern _getLogstashUrlRegex = Pattern.compile("(?:bucket|host|url|uri|path)[\\s\\n\\r]*=>[\\s\\n\\r]*['\"]([^'\"]+)", Pattern.CASE_INSENSITIVE);

    public String getRepresentativeUrl() {
        if (null == this.getProcessingPipeline()) {
            if (null != this.getUrl()) {
                return this.getUrl();
            }
            else if ((null != this.getRssConfig()) && (null != this.getRssConfig().getExtraUrls()) && !this.getRssConfig().getExtraUrls().isEmpty()) {
                return this.getRssConfig().getExtraUrls().get(0).url;
            }
        }
        else if (!this.getProcessingPipeline().isEmpty()) {
            SourcePipelinePojo px = this.getProcessingPipeline().get(0);
            if (null != px.file) {
                return px.file.getUrl();
            }
            else if (null != px.database) {
                return px.database.getUrl();
            }
            else if (null != px.logstash) {
                String url = null;
                try {
                    Matcher m1 = _getLogstashUrlRegex.matcher(px.logstash.config);
                    if (m1.find()) { // (get the first)
                        url = m1.group(1);
                    }
                }
                catch (Exception e) {} // return null will error out
                return url;
            }
            else {
                SourceRssConfigPojo webOrFeed = px.feed;
                if (null == webOrFeed) {
                    webOrFeed = px.web;
                }
                if ((null != webOrFeed) && (null != webOrFeed.getExtraUrls()) && !webOrFeed.getExtraUrls().isEmpty()) {
                    return webOrFeed.getExtraUrls().get(0).url;
                }
            }
        }
        return null;
    }//TESTED (legacy+basic_web_test_ocOptions)
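    // (For pipeline sources the representative URL comes from the first pipeline element: its file
    //  URL, else its database URL, else a URL scraped from its logstash config via the regex above,
    //  else the first extraUrl of its web/feed block; legacy sources fall back to the top-level url
    //  or the first rss extraUrl.)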
    public void fillInSourcePipelineFields() {
        if (null != this.getProcessingPipeline()) {
            this.extractType = null; // always derive from the px pipeline, ignore user input
            for (SourcePipelinePojo px: this.getProcessingPipeline()) {
                if (null != px.file) {
                    this.extractType = "File";
                }
                else if (null != px.database) {
                    this.extractType = "Database";
                }
                else if (null != px.logstash) {
                    this.extractType = "Logstash";
                }
                else if ((null != px.web) || (null != px.feed)) {
                    this.extractType = "Feed";
                }
                if (null != px.harvest) {
                    if (null != px.harvest.searchCycle_secs) {
                        if ((null == searchCycle_secs) || (searchCycle_secs > 0)) {
                            searchCycle_secs = Math.abs(px.harvest.searchCycle_secs);
                        }
                        else { // (searchCycle_secs < 0 ie want to suspend source)
                            searchCycle_secs = -Math.abs(px.harvest.searchCycle_secs);
                        }
                        distributionFactor = px.harvest.distributionFactor;
                    }//TESTED
                    else if ((null != searchCycle_secs) && (searchCycle_secs < 0)) {
                        // No search cycle specified, source suspended
                        searchCycle_secs = -1;
                    }//TESTED
                    else { // No search cycle specified and source not suspended
                        searchCycle_secs = null;
                    }//TESTED
                    break;
                }
            }
        }//TESTED
    }
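    // (ie extractType is re-derived from the pipeline contents, and searchCycle_secs /
    //  distributionFactor are copied across from the first "harvest" pipeline element, keeping
    //  searchCycle_secs negative where the source had already been suspended.)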
    public Boolean getPartiallyPublished() {
        return partiallyPublished;
    }
    public void setPartiallyPublished(Boolean partiallyPublished) {
        this.partiallyPublished = partiallyPublished;
    }
}
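// Minimal usage sketch (illustrative only; assumes the BaseDbPojo ORM layer and MongoDB driver are
// available on the classpath, as the hash generation serializes the pojo via toDb()):
//
//   SourcePojo src = new SourcePojo();
//   src.setTitle("Example RSS source");
//   src.setUrl("http://www.ikanow.com/rss");
//   src.setExtractType("Feed");
//   src.setKey(src.generateSourceKey());   // -> "www.ikanow.com.rss"
//   String hash = src.getShah256Hash();    // Base64(SHA-256) of the processing-relevant config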