PageRenderTime 34ms CodeModel.GetById 8ms RepoModel.GetById 0ms app.codeStats 0ms

/core/infinit.e.processing.generic.library/src/com/ikanow/infinit/e/processing/generic/store_and_index/StoreAndIndexManager.java

https://github.com/IKANOW/Infinit.e
Java | 1009 lines | 603 code | 140 blank | 266 comment | 167 complexity | 62586a4c4c797a04244a503a51f21aa4 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.processing.generic.store_and_index;
  17. import java.net.UnknownHostException;
  18. import java.util.Date;
  19. import java.util.HashSet;
  20. import java.util.Iterator;
  21. import java.util.LinkedList;
  22. import java.util.List;
  23. import java.util.Map;
  24. import java.util.Set;
  25. import java.util.TreeMap;
  26. import org.apache.log4j.Logger;
  27. import org.bson.types.ObjectId;
  28. import org.elasticsearch.index.query.BaseQueryBuilder;
  29. import org.elasticsearch.index.query.QueryBuilders;
  30. import com.google.gson.reflect.TypeToken;
  31. import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
  32. import com.ikanow.infinit.e.data_model.index.IndexManager;
  33. import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
  34. import com.ikanow.infinit.e.data_model.store.DbManager;
  35. import com.ikanow.infinit.e.data_model.store.MongoDbManager;
  36. import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
  37. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  38. import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
  39. import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
  40. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  41. import com.ikanow.infinit.e.data_model.utils.PropertiesManager;
  42. import com.mongodb.BasicDBObject;
  43. import com.mongodb.CommandResult;
  44. import com.mongodb.DBCollection;
  45. import com.mongodb.DBCursor;
  46. /**
  47. * Class used to commit records to backend storage during harvest process
  48. * @author cmorgan
  49. *
  50. */
  51. public class StoreAndIndexManager {
  52. // Initialize the Logger
  53. private static final Logger logger = Logger.getLogger(StoreAndIndexManager.class);
  54. private static boolean _diagnosticMode = false;
  55. public static void setDiagnosticMode(boolean bMode) { _diagnosticMode = bMode; }
  56. private int nMaxContentLen_bytes = 100000; // (100KB default max)
  57. private boolean bStoreRawContent = false; // (store the raw as well as the processed data)
  58. private boolean bStoreMetadataAsContent = false; // (store the metadata in the content block)
  59. public final static String DELETION_INDICATOR = "?DEL?";
  60. private String harvesterUUID = null;
  61. public String getUUID() { return harvesterUUID; }
  62. public StoreAndIndexManager() {
  63. com.ikanow.infinit.e.processing.generic.utils.PropertiesManager pm =
  64. new com.ikanow.infinit.e.processing.generic.utils.PropertiesManager();
  65. int nMaxContent = pm.getMaxContentSize();
  66. if (nMaxContent > -1) {
  67. nMaxContentLen_bytes = nMaxContent;
  68. }
  69. bStoreRawContent = pm.storeRawContent();
  70. bStoreMetadataAsContent = pm.storeMetadataAsContent();
  71. try {
  72. StringBuffer sb = new StringBuffer(DELETION_INDICATOR).append(java.net.InetAddress.getLocalHost().getHostName());
  73. harvesterUUID = sb.toString();
  74. } catch (UnknownHostException e) {
  75. harvesterUUID = DELETION_INDICATOR + "UNKNOWN";
  76. }
  77. }
  78. /////////////////////////////////////////////////////////////////////////////////////////////////
  79. /////////////////////////////////////////////////////////////////////////////////////////////////
  80. // Datastore addition
  81. /**
  82. * Add a list of doc documents to the data store
  83. * @param feeds
  84. */
  85. public void addToDatastore(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source) {
  86. try {
  87. // Create collection manager
  88. // Add to data store
  89. addToDatastore(DbManager.getDocument().getMetadata(), docs);
  90. } catch (Exception e) {
  91. // If an exception occurs log the error
  92. logger.error("Exception Message: " + e.getMessage(), e);
  93. }
  94. // (note: currently modifies docs, see DocumentIndexPojoMap, so beware if using after this point)
  95. if (bSaveContent) {
  96. saveContent(docs);
  97. }
  98. boolean index = true;
  99. if ((null != source) && (null != source.getSearchIndexFilter())) {
  100. if (null != source.getSearchIndexFilter().indexOnIngest) {
  101. index = source.getSearchIndexFilter().indexOnIngest;
  102. }
  103. }
  104. if (index) {
  105. this.addToSearch(docs);
  106. }
  107. }//TESTED
  108. /////////////////////////////////////////////////////////////////////////////////////////////////
  109. // Utilities
  110. /**
  111. * Add a single doc document to the datastore
  112. * @param col
  113. * @param doc
  114. */
  115. private void addToDatastore(DBCollection col, DocumentPojo doc) {
  116. if (!_diagnosticMode) {
  117. if (!docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
  118. doc.makeFullTextNonTransient(); // (ie store full text in this case)
  119. }
  120. col.save(doc.toDb());
  121. }
  122. else {
  123. System.out.println("StoreAndIndexManager.addToDatastore: " + ((BasicDBObject)doc.toDb()).toString());
  124. }
  125. }//TESTED
  126. /**
  127. * Add a list of doc documents to the data store
  128. * @param feeds
  129. */
  130. private void addToDatastore(DBCollection col, List<DocumentPojo> docs) {
  131. // Store the knowledge in the feeds collection in the harvester db
  132. for ( DocumentPojo f : docs) {
  133. // Set an _id before writing it to the datastore,
  134. // so the same _id gets written to the index
  135. // NOTE WE OVERWRITE ANY TRANSIENT IDS THAT MIGHT HAVE BEEN SET eg BY REMOVE CODE
  136. f.setId(new ObjectId());
  137. // Check geo-size: need to add to a different index if so, for memory usage reasons
  138. if (null == f.getLocs()) { // (can be set by update/deletion code also)
  139. if (DocumentPojoIndexMap.hasManyGeos(f)) {
  140. f.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
  141. // (note this check isn't stateless, it actually populates "locs" at the same time)
  142. // therefore...
  143. }
  144. }
  145. Set<String> locs = f.getLocs();
  146. f.setLocs(null);
  147. addToDatastore(col, f);
  148. f.setLocs(locs);
  149. }
  150. }//TESTED
  151. //////////////////////////////////////////////////////////////////////////////////////
  152. /**
  153. * Save the fulltext of a pojo to mongo for using later
  154. *
  155. * @param docs
  156. */
  157. private void saveContent(List<DocumentPojo> docs)
  158. {
  159. try
  160. {
  161. DBCollection contentDb = DbManager.getDocument().getContent();
  162. for ( DocumentPojo doc : docs )
  163. {
  164. boolean bStoreContent = true;
  165. bStoreContent &= (0 != nMaxContentLen_bytes); // (otherwise it's turned off)
  166. bStoreContent &= this.bStoreMetadataAsContent || ((null != doc.getFullText()) && !doc.getFullText().isEmpty());
  167. boolean bDocHasExternalContent = docHasExternalContent(doc.getUrl(), doc.getSourceUrl());
  168. if (bStoreContent && bDocHasExternalContent) {
  169. try
  170. {
  171. String rawText = this.bStoreRawContent ? doc.getRawFullText() : null;
  172. DocumentPojo meta = bStoreMetadataAsContent ? doc : null;
  173. CompressedFullTextPojo gzippedContent = new CompressedFullTextPojo(doc.getUrl(), doc.getSourceKey(), doc.getCommunityId(),
  174. doc.getFullText(), rawText, meta, nMaxContentLen_bytes);
  175. if (null != gzippedContent.getUrl()) {
  176. // Be efficient and write field-by-field vs using JSON conversion
  177. BasicDBObject query = new BasicDBObject(CompressedFullTextPojo.url_, gzippedContent.getUrl());
  178. query.put(CompressedFullTextPojo.sourceKey_, gzippedContent.getSourceKey());
  179. BasicDBObject update = gzippedContent.getUpdate();
  180. if (!_diagnosticMode) {
  181. contentDb.update(query, update, true, false); // (ie upsert, supported because query includes shard key==url)
  182. }
  183. else {
  184. System.out.println("StoreAndIndexManager.savedContent, save content: " + gzippedContent.getUrl());
  185. }
  186. }
  187. }
  188. catch (Exception ex)
  189. {
  190. // Do nothing, just carry on
  191. ex.printStackTrace();
  192. }
  193. }//TESTED
  194. }
  195. }
  196. catch (Exception ex)
  197. {
  198. // This is a more serious error
  199. logger.error(ex.getMessage());
  200. }
  201. }//TESTED (not changed since by-eye testing in Beta)
  202. /////////////////////////////////////////////////////////////////////////////////////////////////
  203. /////////////////////////////////////////////////////////////////////////////////////////////////
  204. // Datastore removal
  205. /**
  206. * This function removes documents "soft deleted" by this harvester
  207. */
  208. public void removeSoftDeletedDocuments()
  209. {
  210. BasicDBObject query = new BasicDBObject(DocumentPojo.url_, harvesterUUID);
  211. if (_diagnosticMode) {
  212. System.out.println("Soft delete: " + DbManager.getDocument().getMetadata().count(query));
  213. }
  214. else {
  215. DbManager.getDocument().getMetadata().remove(query);
  216. }
  217. }//TESTED
  218. /**
  219. * Low level utility to abstract soft deletion
  220. * We're using URL because 1) we cant' use a shard key
  221. * 2) it needs to be an indexed field
  222. * 3) ideally one that is likely to be cached in memory
  223. * 4) one that minimizes the chance of having to move the document when modifying the field
  224. * (I also considered sourceUrl or an all new field, they _might_ be better because smaller, but conversely
  225. * would be less likely to be cached and most importantly there's the risk of 4)
  226. */
  227. private BasicDBObject _softDeleter = null;
  228. private BasicDBObject getSoftDeleteUpdate()
  229. {
  230. if (null == _softDeleter) {
  231. BasicDBObject softDeleter = new BasicDBObject(DocumentPojo.url_, harvesterUUID);
  232. softDeleter.put(DocumentPojo.index_, DELETION_INDICATOR);
  233. // (used in CustomHadoopTaskLauncher.createConfigXML)
  234. _softDeleter = new BasicDBObject(DbManager.set_, softDeleter);
  235. }
  236. return _softDeleter;
  237. }//TESTED
  238. /**
  239. * Remove a list of doc documents from the data store (you have their _id and sourceKey)
  240. *
  241. * CALLED FROM: resizeDB() <- FILLS IN _ID, SOURCEKEY, INDEX, URL, SOURCEURL
  242. */
  243. public void removeFromDatastore_byId(List<DocumentPojo> docs) {
  244. try {
  245. // Remove from data store
  246. removeFromDatastore_byId(DbManager.getDocument().getMetadata(), docs);
  247. this.removeFromSearch(docs);
  248. } catch (Exception e) {
  249. // If an exception occurs log the error
  250. logger.error("Exception Message: " + e.getMessage(), e);
  251. }
  252. }//TESTED
  253. /**
  254. * Remove a list of doc documents from the data store (you have their url) AND ALSO the search index
  255. *
  256. * @param docs - child function needs url (optionally sourceUrl) set - child function requires sourceKey
  257. * this function needs id and index both of which are set by the child stack
  258. *
  259. * CALLED FROM: MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
  260. * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
  261. * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
  262. * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX, _ID
  263. * updateHarvestStatus(...)
  264. */
  265. public ObjectId removeFromDatastore_byURL(List<DocumentPojo> docs) {
  266. // Remove from data store:
  267. ObjectId nextId = null;
  268. try {
  269. nextId = removeFromDatastore_byURL(DbManager.getDocument().getMetadata(), docs);
  270. // ^^^ adds "created" (if updateId set), "_id" and "index" to the doc and expands "sourceUrl" docs (adding "_id" and "index")
  271. } catch (Exception e) {
  272. // If an exception occurs log the error
  273. logger.error("Exception Message: " + e.getMessage(), e);
  274. }
  275. // Remove from index:
  276. try {
  277. this.removeFromSearch(docs);
  278. } catch (Exception e) {
  279. // If an exception occurs log the error
  280. logger.error("Exception Message: " + e.getMessage(), e);
  281. }
  282. return nextId;
  283. }//TESTED
  284. /**
  285. * Remove a list of doc documents from the data store (you have a source key, so you can go much quicker)
  286. * CALLED FROM: deleteSource(...)
  287. * @returns the number of docs deleted
  288. */
  289. public long removeFromDatastoreAndIndex_bySourceKey(String sourceKey, ObjectId lessThanId, boolean definitelyNoContent, String communityId) {
  290. try {
  291. if (!definitelyNoContent) {
  292. DbManager.getDocument().getContent().remove(new BasicDBObject(CompressedFullTextPojo.sourceKey_, sourceKey));
  293. // (will just check index and pull out if the doc has no external content)
  294. }
  295. BasicDBObject query = new BasicDBObject(DocumentPojo.sourceKey_, sourceKey);
  296. if (null != lessThanId) { // Multiple threads running for this source
  297. // First check whether one of the other threads has already deleted the source:
  298. BasicDBObject oneFinalCheckQuery = new BasicDBObject(DocumentPojo.sourceKey_, sourceKey);
  299. BasicDBObject oneFinalCheckFields = new BasicDBObject(DocumentPojo.index_, 1);
  300. BasicDBObject firstDocToBeUpdated = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(oneFinalCheckQuery, oneFinalCheckFields);
  301. if ((null == firstDocToBeUpdated) || firstDocToBeUpdated.getString(DocumentPojo.index_, "").equals(DELETION_INDICATOR))
  302. {
  303. //(ie grab the first doc in natural order and tell me if it's been soft-deleted yet, if so do nothing)
  304. return 0;
  305. }//TESTED
  306. // That check isn't perfect because of race conditions, so we'll still add the !="?DEL?" check to the
  307. // update as well:
  308. query.put(DocumentPojo._id_, new BasicDBObject(DbManager.lte_, lessThanId));
  309. query.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, DELETION_INDICATOR));
  310. }//TESTED
  311. BasicDBObject softDeleter = getSoftDeleteUpdate();
  312. DbManager.getDocument().getMetadata().update(query, softDeleter, false, true);
  313. // (don't do getLastError just yet since it can block waiting for completion)
  314. // Quick delete for index though:
  315. StringBuffer sb = new StringBuffer(DocumentPojoIndexMap.manyGeoDocumentIndexCollection_).append(",docs_").append(communityId).append('/').append(DocumentPojoIndexMap.documentType_);
  316. ElasticSearchManager indexManager = IndexManager.getIndex(sb.toString());
  317. BaseQueryBuilder soloOrCombinedQuery = QueryBuilders.termQuery(DocumentPojo.sourceKey_, sourceKey);
  318. if (null != lessThanId) {
  319. //(_id isn't indexed - _uid is and == _type + "#" + _id)
  320. soloOrCombinedQuery = QueryBuilders.boolQuery().must(soloOrCombinedQuery).
  321. must(QueryBuilders.rangeQuery("_uid").lte("document_index#" + lessThanId.toString()));
  322. }//TESTED
  323. indexManager.doDeleteByQuery(soloOrCombinedQuery);
  324. CommandResult result = DbManager.getDocument().getLastError("metadata");
  325. return result.getLong("n", 0);
  326. } catch (Exception e) {
  327. // If an exception occurs log the error
  328. logger.error("Exception Message: " + e.getMessage(), e);
  329. }
  330. return 0;
  331. }//TESTED
  332. /**
  333. * Remove a list of doc documents from the data store and index (you have a source URL, so you can go much quicker)
  334. *
  335. * CALLED FROM: removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
  336. * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
  337. * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
  338. * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
  339. * @returns the number of docs deleted
  340. */
  341. private ElasticSearchManager _cachedIndexManagerForSourceXxxDeletion = null;
  342. private ObjectId _cachedCommunityIdForSourceXxxDeletion = null;
  343. public long removeFromDatastoreAndIndex_bySourceUrl(String sourceUrl, String sourceKey, ObjectId communityId) {
  344. try {
  345. // (never any content)
  346. BasicDBObject query = new BasicDBObject(DocumentPojo.sourceUrl_, sourceUrl);
  347. query.put(DocumentPojo.sourceKey_, sourceKey);
  348. BasicDBObject softDeleter = getSoftDeleteUpdate();
  349. DbManager.getDocument().getMetadata().update(query, softDeleter, false, true);
  350. CommandResult result = DbManager.getDocument().getLastError("metadata");
  351. // Quick delete for index though:
  352. if (!communityId.equals(_cachedCommunityIdForSourceXxxDeletion)) {
  353. StringBuffer sb = new StringBuffer(DocumentPojoIndexMap.manyGeoDocumentIndexCollection_).append(",docs_").append(communityId).append('/').append(DocumentPojoIndexMap.documentType_);
  354. _cachedIndexManagerForSourceXxxDeletion = IndexManager.getIndex(sb.toString());
  355. _cachedCommunityIdForSourceXxxDeletion = communityId;
  356. }//TESTED
  357. _cachedIndexManagerForSourceXxxDeletion.doDeleteByQuery(
  358. QueryBuilders.boolQuery()
  359. .must(QueryBuilders.termQuery(DocumentPojo.sourceUrl_, sourceUrl))
  360. .must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sourceKey))
  361. );
  362. return result.getLong("n", 0);
  363. } catch (Exception e) {
  364. // If an exception occurs log the error
  365. logger.error("Exception Message: " + e.getMessage(), e);
  366. }
  367. return 0;
  368. }//TESTED
  369. /////////////////////////////////////////////////////////////////////////////////////////////////
  370. // Utility
  371. /**
  372. * Remove a list of doc documents from the data store + adds _id and index doc fields to retrieve to support de-index
  373. * (also adds created to docs with an updateId so the created remains ~the same)
  374. * (Will in theory support arbitrary sourceUrl/sourceKey operators but in practice these will always be from a single source)
  375. * @param docs - needs url (optionally sourceUrl) set - child function requires sourceKey
  376. * @param col
  377. * @param feeds
  378. *
  379. * CALLED FROM: removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
  380. * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
  381. * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
  382. * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
  383. * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX, _ID
  384. * updateHarvestStatus(...)
  385. */
  386. private ObjectId removeFromDatastore_byURL(DBCollection col, List<DocumentPojo> docs) {
  387. ObjectId nextId = null;
  388. BasicDBObject fields = new BasicDBObject();
  389. fields.put(DocumentPojo.created_, 1); // (If we're getting the deleted doc fields, get this and have exact created time)
  390. fields.put(DocumentPojo.index_, 1); // This is needed for the removeFromSearch() called from parent removeFromDatastore_URL
  391. TreeMap<String,DocumentPojo> sourceUrlToKeyMap = null;
  392. HashSet<String> deletedSources = null;
  393. // Store the knowledge in the feeds collection in the harvester db
  394. Iterator<DocumentPojo> docIt = docs.iterator();
  395. while (docIt.hasNext()) {
  396. DocumentPojo f = docIt.next();
  397. nextId = f.getId(); // (only interested in the pruneSource case, in which case _id is set on input)
  398. if ((null != f.getSourceUrl()) && (null == f.getUrl())) { // special case ... delete all these documents...
  399. if ((null == deletedSources) || !deletedSources.contains(f.getSourceKey())) { // (don't bother deleting sourceURL if deleting source)
  400. if (null == sourceUrlToKeyMap) {
  401. sourceUrlToKeyMap = new TreeMap<String,DocumentPojo>();
  402. }
  403. sourceUrlToKeyMap.put(f.getSourceUrl(), f);
  404. }//TESTED
  405. docIt.remove(); // (so don't miscount number of docs; processed below)
  406. }
  407. else if (null != f.getSourceKey() && (null == f.getSourceUrl()) && (null == f.getUrl())) {
  408. // Even more special case: delete entire sourceKey
  409. if (null == deletedSources) {
  410. deletedSources = new HashSet<String>();
  411. }
  412. if (!deletedSources.contains(f.getSourceKey())) {
  413. deletedSources.add(f.getSourceKey());
  414. long srcRemoved = removeFromDatastoreAndIndex_bySourceKey(f.getSourceKey(), f.getId(), true, f.getCommunityId().toString());
  415. if (srcRemoved > 0) {
  416. updateDocCountsOnTheFly(-srcRemoved, f.getSourceKey(), f.getCommunityId());
  417. }
  418. }
  419. docIt.remove(); // (so don't miscount number of docs)
  420. }//TESTED
  421. else {
  422. removeFromDatastore_byURL(col, f, fields,
  423. StoreAndIndexManager.docHasExternalContent(f.getUrl(), f.getSourceUrl()));
  424. // (adds "_id", "index")
  425. }
  426. }//TESTED
  427. // Now tidy up sourceUrls, do some caching across sourceKey/community for performance
  428. String sourceKey = null; // (if deleting sourceKey don't bother deleting any sourceUrls)
  429. long removed = 0; // (from special operations)
  430. String cachedSourceKey = null; // (will handle multiple source keys, although that can't currently happen in practice)
  431. ObjectId communityId = null;
  432. if (null != sourceUrlToKeyMap) for (Map.Entry<String, DocumentPojo> entry: sourceUrlToKeyMap.entrySet()) {
  433. String srcUrl = entry.getKey();
  434. DocumentPojo doc = entry.getValue();
  435. sourceKey = doc.getSourceKey();
  436. communityId = doc.getCommunityId();
  437. if (sourceKey != cachedSourceKey) { // ptr comparison by design
  438. if (removed > 0) {
  439. updateDocCountsOnTheFly(-removed, sourceKey, communityId);
  440. removed = 0;
  441. }//TESTED
  442. cachedSourceKey = sourceKey;
  443. }
  444. removed += removeFromDatastoreAndIndex_bySourceUrl(srcUrl, sourceKey, communityId);
  445. }//TESTED
  446. if ((removed > 0) && (null != sourceKey)) {
  447. updateDocCountsOnTheFly(-removed, sourceKey, communityId);
  448. }//TESTED
  449. return nextId;
  450. }//TESTED
  451. public void updateDocCountsOnTheFly(long docIncrement, String sourceKey, ObjectId communityId)
  452. {
  453. DbManager.getDocument().getCounts().update(new BasicDBObject(DocCountPojo._id_, communityId),
  454. new BasicDBObject(DbManager.inc_, new BasicDBObject(DocCountPojo.doccount_, docIncrement)));
  455. DbManager.getIngest().getSource().update(new BasicDBObject(SourcePojo.key_, sourceKey),
  456. new BasicDBObject(MongoDbManager.inc_,
  457. new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, docIncrement))
  458. );
  459. }//TESTED
  460. /**
  461. * Remove a doc from the data store
  462. * @param col
  463. * @param doc
  464. * @param fields - fields to retrieve
  465. *
  466. * CALLED FROM: removeFromDataStore_byId(List<doc>, bDeleteContent)
  467. * resizeDB() <- FILLS IN _ID, SOURCEKEY, INDEX, SOURCEURL
  468. */
  469. private void removeFromDatastore_byId(DBCollection col, List<DocumentPojo> docs) {
  470. // Store the knowledge in the feeds collection in the harvester db
  471. for ( DocumentPojo f : docs) {
  472. removeFromDatastore_byId(col, f);
  473. }
  474. }//TESTED
  475. /**
  476. * Remove a doc from the data store
  477. * @param col
  478. * @param doc - assumes _id set
  479. * @param fields - fields to retrieve (set in outside the doc loop for performance, url, index, sourceKey)
  480. *
  481. * CALLED FROM: removeFromDataStore_byId(col, List<doc>, bDeleteContent)
  482. * removeFromDataStore_byId(List<doc>, bDeleteContent)
  483. * resizeDB() <- _ID, SOURCEKEY, INDEX, SOURCEURL
  484. */
  485. private void removeFromDatastore_byId(DBCollection col, DocumentPojo doc) {
  486. boolean bDeleteContent = docHasExternalContent(doc.getUrl(), doc.getSourceUrl());
  487. if (bDeleteContent) {
  488. // Remove its content also:
  489. if (!_diagnosticMode) {
  490. BasicDBObject contentQuery = new BasicDBObject(DocumentPojo.url_, doc.getUrl());
  491. contentQuery.put(DocumentPojo.sourceKey_, doc.getSourceKey());
  492. DbManager.getDocument().getContent().remove(contentQuery);
  493. }
  494. else {
  495. System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete content: " + doc.getSourceKey() + "/" + doc.getUrl());
  496. }
  497. }
  498. // Update Mongodb with the data
  499. BasicDBObject query = new BasicDBObject();
  500. query.put(DocumentPojo.sourceKey_, doc.getSourceKey());
  501. query.put(DocumentPojo._id_, doc.getId());
  502. query.put(DocumentPojo.sourceKey_, doc.getSourceKey()); // (needed because on newer machines this is the shard key)
  503. if (!_diagnosticMode) {
  504. BasicDBObject softDelete = getSoftDeleteUpdate();
  505. col.update(query, softDelete);
  506. // (can do this on sharded collections because it uses sourceKey+_id, the shard key)
  507. }
  508. else { // (diagnostic mode)
  509. if (null != col.findOne(query)) {
  510. System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete: " + doc.toDb().toString());
  511. }
  512. else {
  513. System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete: DOC NOT FOUND");
  514. }
  515. }
  516. }//TESTED (1.1)
  517. /**
  518. * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion)
  519. * @param col
  520. * @param doc - needs url, sourceKey set
  521. * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance
  522. *
  523. * CALLED FROM: removeFromDatastore_byURL(col, List<doc>, bDeleteContent) <- ADDS INDEX, CREATED TO FIELDS
  524. * removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
  525. * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
  526. * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
  527. * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
  528. * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX
  529. * updateHarvestStatus(...)
  530. */
  531. private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields, boolean bDeleteContent) {
  532. // 1] Create the query to soft delete the document
  533. BasicDBObject query = new BasicDBObject();
  534. query.put(DocumentPojo.url_, doc.getUrl());
  535. query.put(DocumentPojo.sourceKey_, doc.getSourceKey());
  536. // 2] Delete the content if needed
  537. if (bDeleteContent) {
  538. if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
  539. if (!_diagnosticMode) {
  540. DbManager.getDocument().getContent().remove(query);
  541. }
  542. else {
  543. System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: " + doc.getSourceKey() + "/" + doc.getUrl());
  544. }
  545. }
  546. }
  547. //TESTED
  548. // 3] Work out which fields we have and which (if any we need to go and fetch):
  549. boolean needToFindAndModify = false;
  550. if (null == doc.getId()) { // This is called from processDocuments
  551. if (null != doc.getUpdateId()) { // update case...
  552. doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good)
  553. // (doc.index is populated but may not be correct because of the "many geos" workaround):
  554. if (DocumentPojoIndexMap.hasManyGeos(doc)) {
  555. doc.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
  556. // (note this check isn't stateless, it actually populates "locs" at the same time
  557. // this is handled in addToDatastore (update case), temp removed when adding to DB
  558. }//TESTED (2.1.2, diagnostic mode, doc2)
  559. }
  560. else { // Not an update case, we're going to have to grab the document after all, which is a bit slower
  561. needToFindAndModify = true;
  562. }
  563. }//TESTED (2.1.2, diagnostic mode, doc2)
  564. if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore
  565. if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc)
  566. doc.setCreated(new Date(doc.getUpdateId().getTime()));
  567. }//TESTED (2.1.2, diagnostic mode, doc2)
  568. }
  569. // (if we're here and index is not set, then it is intended to be null)
  570. // 4] Update the doc_metadata collection
  571. BasicDBObject softDelete = getSoftDeleteUpdate();
  572. BasicDBObject deadDoc = null; // (not normally needed)
  573. if (needToFindAndModify) { // less pleasant, need to go grab the doc
  574. deadDoc = (BasicDBObject) col.findOne(query, fields);
  575. }//TESTED (2.1.2)
  576. if (!_diagnosticMode) {
  577. col.update(query, softDelete, false, true); // (needs to be multi- even though there's a single element for sharding reasons)
  578. }//TESTED (2.1.2)
  579. // 5] Add fields if necessary
  580. if (null != deadDoc) {
  581. doc.setCreated((Date) deadDoc.get(DocumentPojo.created_));
  582. // (if getting this doc anyway then might as well get the created)
  583. doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_));
  584. doc.setIndex((String) deadDoc.get(DocumentPojo.index_));
  585. if (_diagnosticMode) {
  586. System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString());
  587. }
  588. }//TESTED (2.1.2)
  589. else if (_diagnosticMode) {
  590. if (!needToFindAndModify) {
  591. System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted " + doc.toDb().toString());
  592. }
  593. else {
  594. System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString());
  595. }
  596. }//TESTED (2.1.2)
  597. }//TESSTED (2.1.2)
  598. /////////////////////////////////////////////////////////////////////////////////////////////////
  599. /////////////////////////////////////////////////////////////////////////////////////////////////
  600. // Synchronize database with index
  601. /**
  602. * Add a list of feeds to the full text index
  603. * @param docs
  604. */
  605. public void addToSearch(List<DocumentPojo> docs)
  606. {
  607. String sSavedIndex = null;
  608. ElasticSearchManager indexManager = null;
  609. LinkedList<DocumentPojo> tmpDocs = new LinkedList<DocumentPojo>();
  610. int nTmpDocs = 0;
  611. for ( DocumentPojo doc : docs )
  612. {
  613. String sThisDocIndex = doc.getIndex();
  614. if ((null == sSavedIndex) || (null == sThisDocIndex) || !sSavedIndex.equals(sThisDocIndex)) { // Change index
  615. if (null != indexManager) { // ie not first time through, bulk add what docs we have
  616. sendToIndex(indexManager, tmpDocs);
  617. // (ie with the *old* index manager)
  618. nTmpDocs = 0;
  619. }
  620. sSavedIndex = sThisDocIndex;
  621. if ((null == sSavedIndex) || (sSavedIndex.equals(DocumentPojoIndexMap.globalDocumentIndex_))) {
  622. indexManager = IndexManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  623. }
  624. else {
  625. indexManager = IndexManager.getIndex(new StringBuffer(sSavedIndex).append('/').
  626. append(DocumentPojoIndexMap.documentType_).toString());
  627. }
  628. }//TESTED
  629. tmpDocs.add(doc);
  630. nTmpDocs++;
  631. if (nTmpDocs > 5000) { // some sensible upper limit
  632. sendToIndex(indexManager, tmpDocs);
  633. nTmpDocs = 0;
  634. }
  635. if (_diagnosticMode) {
  636. System.out.println("StoreAndIndexManager.addToSearch, add: " + doc.getId() + " + " +
  637. ((null != doc.getEntities())?("" + doc.getEntities().size()):"0") + " entities, " +
  638. ((null != doc.getAssociations())?("" + doc.getAssociations().size()):"0") + " assocs, " +
  639. ((null != doc.getLocs())?("" + doc.getLocs().size()):"0") + " locs"
  640. );
  641. }
  642. }// (end loop over docs)
  643. // Bulk add remaining docs
  644. sendToIndex(indexManager, tmpDocs);
  645. }//TESTED (not change since by-eye testing in Beta)
  646. // Utility required by the above function
  647. private void sendToIndex(ElasticSearchManager indexManager, LinkedList<DocumentPojo> docsToAdd) {
  648. try {
  649. if (!docsToAdd.isEmpty()) {
  650. if (!_diagnosticMode) {
  651. indexManager.bulkAddDocuments(IndexManager.mapListToIndex(docsToAdd, new TypeToken<LinkedList<DocumentPojo>>(){},
  652. new DocumentPojoIndexMap()), DocumentPojo._id_, null, true);
  653. }
  654. else {
  655. System.out.println("StoreAndIndexManager.addToSearch: index " + docsToAdd.size() + " documents to " + indexManager.getIndexName());
  656. }
  657. docsToAdd.clear();
  658. }
  659. }
  660. catch (Exception ex)
  661. {
  662. ex.printStackTrace();
  663. logger.error("Exception Message saving document to ES: " + ex.getMessage(), ex);
  664. }
  665. }//TESTED
  /**
   * Removes a list of documents from the full text index
   * @param docs the documents to remove (just need the id and the index and any events)
   */
  670. public void removeFromSearch(List<DocumentPojo> docs)
  671. {
  672. String sIndex = null;
  673. ElasticSearchManager indexManager = null;
  674. LinkedList<String> tmpDocs = new LinkedList<String>();
  675. int nTmpDocs = 0;
  676. for ( DocumentPojo doc : docs )
  677. {
  678. if (null == doc.getId()) { // Normally this will be sourceUrls, eg files pointing to many docs
  679. continue; // (ie can just ignore)
  680. }
  681. if ((null != doc.getIndex()) && doc.getIndex().equals("?DEL?")) {
  682. continue; //(must have already been deleted, so can ignore)
  683. }
  684. if ((null == sIndex) || (null == doc.getIndex()) || !sIndex.equals(doc.getIndex())) { // Change index
  685. if (null != indexManager) { // ie not first time through, bulk delete what docs we have
  686. deleteFromIndex(indexManager, tmpDocs); // (clears tmpDocs)
  687. // (ie with the *old* index manager)
  688. nTmpDocs = 0;
  689. }
  690. sIndex = doc.getIndex();
  691. if ((null == sIndex) || (sIndex.equals(DocumentPojoIndexMap.globalDocumentIndex_))) {
  692. indexManager = IndexManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  693. }
  694. else {
  695. indexManager = IndexManager.getIndex(new StringBuffer(sIndex).append('/').append(DocumentPojoIndexMap.documentType_).toString());
  696. }
  697. }//TESTED
  698. tmpDocs.add(doc.getId().toString());
  699. nTmpDocs++;
  700. if (nTmpDocs > 5000) { // some sensible upper limit
  701. deleteFromIndex(indexManager, tmpDocs); // (clears tmpDocs)
  702. nTmpDocs = 0;
  703. }
  704. //delete from event search
  705. if (_diagnosticMode) {
  706. System.out.println("StoreAndIndexManager.removeFromSearch, remove: " + doc.getId() + " + " +
  707. ((null != doc.getEntities())?("" + doc.getEntities().size()):"0") + " entities, " +
  708. ((null != doc.getAssociations())?("" + doc.getAssociations().size()):"0") + " assocs, " +
  709. ((null != doc.getLocs())?("" + doc.getLocs().size()):"0") + " locs"
  710. );
  711. }
  712. } // (end loop over docs)
  713. // Bulk remove remaining docs
  714. deleteFromIndex(indexManager, tmpDocs);
  715. }//TESTED (not change since by-eye testing in Beta)
  716. /////////////////////////////////////////////////////////////////////////////////////////////////
  717. // Utility required by the above function
  718. private void deleteFromIndex(ElasticSearchManager indexManager, LinkedList<String> docsToDelete) {
  719. try {
  720. if (!docsToDelete.isEmpty()) {
  721. if (!_diagnosticMode) {
  722. indexManager.bulkDeleteDocuments(docsToDelete);
  723. }
  724. else {
  725. System.out.println("StoreAndIndexManager.removeFromSearch: index " + docsToDelete.size() + " documents from " + indexManager.getIndexName());
  726. }
  727. docsToDelete.clear();
  728. }
  729. }
  730. catch (Exception ex)
  731. {
  732. ex.printStackTrace();
  733. logger.error("Exception Message deleting document from ES: " + ex.getMessage(), ex);
  734. }
  735. }//TESTED
  736. /////////////////////////////////////////////////////////////////////////////////////////////////
  737. /////////////////////////////////////////////////////////////////////////////////////////////////
  738. // Handle resizing the DB if it gets too large
  739. // Utility function for diagnostic prints etc
  740. public long getDatabaseSize() {
  741. return DbManager.getDocument().getMetadata().count();
  742. }
  /**
   * Checks whether the DB storage requirements are met and,
   * if not, starts removing the oldest docs to get back under the limit
   *
   * @return false if the DB was already within bounds (nothing is removed), true if a resize was attempted
   *         (true is also returned when the attempt fails - the error is logged)
   */
  749. public boolean resizeDB()
  750. {
  751. return resizeDB(-1);
  752. }
  753. public boolean resizeDB(long capacityOverride)
  754. {
  755. //Do quick check to check if we are already under storage requirements
  756. if ( checkStorageCapacity(capacityOverride) ) {
  757. return false;
  758. }
  759. else
  760. {
  761. //if quick check fails, start removing docs to get under requirement
  762. try
  763. {
  764. long currDocsInDB = DbManager.getDocument().getMetadata().count();
  765. long storageCap = (capacityOverride == -1L) ? new PropertiesManager().getStorageCapacity() : capacityOverride;
  766. List<DocumentPojo> docsToRemove = getLeastActiveDocs((int) (currDocsInDB-storageCap));
  767. // (populates docsToRemove with _id and sourceKey - needed to support doc_metadata sharding)
  768. removeFromDatastore_byId(docsToRemove); // (remove content since don't know if it exists)
  769. //(^ this also removes from index)
  770. return true;
  771. }
  772. catch (Exception e)
  773. {
  774. // If an exception occurs log the error
  775. logger.error("Exception Message: " + e.getMessage(), e);
  776. return true;
  777. }
  778. }
  779. }//TESTED
  780. /////////////////////////////////////////////////////////////////////////////////////////////////
  781. // Utility
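  /**
   * This method checks if the doc count is within the storage threshold
   * (the capacity set in properties, unless an override other than -1 is passed in)
   * @param capacityOverride the threshold to use, or -1 to use the one set in properties
   * @return true if the doc count is at or below the threshold, false if not
   */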
  787. private boolean checkStorageCapacity(long capacityOverride)
  788. {
  789. long currDocsInDB = 0;
  790. try {
  791. currDocsInDB = DbManager.getDocument().getMetadata().count();
  792. } catch (Exception e ) {
  793. // If an exception occurs log the error
  794. logger.error("Exception Message: " + e.getMessage(), e);
  795. }
  796. long storageCapacity = (-1L == capacityOverride) ? new PropertiesManager().getStorageCapacity() : capacityOverride;
  797. return (currDocsInDB <= storageCapacity);
  798. }
  /**
   * Returns a list of the least active documents
   * (the list contains at most numDocs documents)
   *
   * @param numDocs Number of documents to return that are least active
   * @return a list of documents that are least active in DB (each populated with _id and sourceKey - needed to support doc_metadata sharding)
   */
  807. private List<DocumentPojo> getLeastActiveDocs(int numDocs)
  808. {
  809. List<DocumentPojo> olddocs = null;
  810. //TODO (INF-1301): WRITE AN ALGORITHM TO CALCULATE THIS BASED ON USAGE, just using time last accessed currently
  811. //give a weight to documents age and documents activity to calculate
  812. //least active (current incarnation doesn't work)
  813. try
  814. {
  815. BasicDBObject fields = new BasicDBObject(DocumentPojo._id_, 1);
  816. fields.put(DocumentPojo.sourceKey_, 1);
  817. fields.put(DocumentPojo.index_, 1);
  818. fields.put(DocumentPojo.sourceUrl_, 1);
  819. fields.put(DocumentPojo.url_, 1);
  820. DBCursor dbc = DbManager.getDocument().getMetadata().find(new BasicDBObject(), fields).
  821. sort(new BasicDBObject(DocumentPojo._id_,1)).limit(numDocs);
  822. // (note, just retrieve _id and sourceKey fields: _id starts with timestamp so these are approximately oldest created)
  823. olddocs = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
  824. }
  825. catch (Exception e )
  826. {
  827. // If an exception occurs log the error
  828. logger.error("Exception Message: " + e.getMessage(), e);
  829. }
  830. return olddocs;
  831. }//TESTED (1.1)
  832. ////////////////////////////////////////////////////////////////////////////////
  833. ////////////////////////////////////////////////////////////////////////////////
  834. // Utility
  835. // Utility function to decide if we need to add/remove content via the external table
  836. // (ie JDBC and XML have their content as part of their metadata, eg fields
  837. // others like HTTP and Files can have large amounts of content that we don't want to store in the DB object)
  838. // Called from: (noted here because this needs to be tidied up at some point)
  839. // StoreAndIndexManager.addToDatastore
  840. // MongoDocumentTxfer.doTransfer
  841. // SourceUtils.pruneSource
  842. // StoreAndIndexManager.removeFromDataStore_by(Id|SourceKey|Url)
  843. // StoreAndIndexManager.saveContent
  844. static public boolean docHasExternalContent(String url, String srcUrl) {
  845. //TODO: INF-1367: there's an issue with this .. suppose it's some enormous JSON file
  846. // and we excise a bunch of JSON files from the metadata (after using them for processing)
  847. // seems like we should have an optional keepExternalContent that defaults to the return value
  848. // of this function, but you can override from the SAH or whatever
  849. if (null != srcUrl) { // must be either JSON or XML or *sv
  850. return false;
  851. }
  852. else if (null == url) { // no idea, pathological case?!
  853. return true;
  854. }
  855. else if (url.startsWith("jdbc:")) { // DB entry
  856. return false;
  857. }
  858. else if (url.startsWith("inf://custom/")) { // custom entry
  859. return false;
  860. }
  861. else if ((url.startsWith("smb://") || url.startsWith("file:") || url.startsWith("s3://") || url.startsWith("inf://")) &&
  862. (url.endsWith(".xml") || url.endsWith(".json") || url.endsWith("sv")))
  863. // JSON/XML/*sv but 1 doc/file
  864. {
  865. return false;
  866. }
  867. return true;
  868. }
  869. }