/core/infinit.e.processing.generic.library/src/com/ikanow/infinit/e/processing/generic/store_and_index/StoreAndIndexManager.java
- /*******************************************************************************
- * Copyright 2012, The Infinit.e Open Source Project.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- ******************************************************************************/
- package com.ikanow.infinit.e.processing.generic.store_and_index;
- import java.net.UnknownHostException;
- import java.util.Date;
- import java.util.HashSet;
- import java.util.Iterator;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import java.util.TreeMap;
- import org.apache.log4j.Logger;
- import org.bson.types.ObjectId;
- import org.elasticsearch.index.query.BaseQueryBuilder;
- import org.elasticsearch.index.query.QueryBuilders;
- import com.google.gson.reflect.TypeToken;
- import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
- import com.ikanow.infinit.e.data_model.index.IndexManager;
- import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
- import com.ikanow.infinit.e.data_model.store.DbManager;
- import com.ikanow.infinit.e.data_model.store.MongoDbManager;
- import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
- import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
- import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
- import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
- import com.ikanow.infinit.e.data_model.utils.PropertiesManager;
- import com.mongodb.BasicDBObject;
- import com.mongodb.CommandResult;
- import com.mongodb.DBCollection;
- import com.mongodb.DBCursor;
- /**
- * Class used to commit records to backend storage during harvest process
- * @author cmorgan
- *
- */
- public class StoreAndIndexManager {
- // Initialize the Logger
- private static final Logger logger = Logger.getLogger(StoreAndIndexManager.class);
-
- private static boolean _diagnosticMode = false;
- public static void setDiagnosticMode(boolean bMode) { _diagnosticMode = bMode; }
-
- private int nMaxContentLen_bytes = 100000; // (100KB default max)
- private boolean bStoreRawContent = false; // (store the raw as well as the processed data)
- private boolean bStoreMetadataAsContent = false; // (store the metadata in the content block)
-
- public final static String DELETION_INDICATOR = "?DEL?";
- private String harvesterUUID = null;
- public String getUUID() { return harvesterUUID; }
-
- public StoreAndIndexManager() {
- com.ikanow.infinit.e.processing.generic.utils.PropertiesManager pm =
- new com.ikanow.infinit.e.processing.generic.utils.PropertiesManager();
-
- int nMaxContent = pm.getMaxContentSize();
- if (nMaxContent > -1) {
- nMaxContentLen_bytes = nMaxContent;
- }
- bStoreRawContent = pm.storeRawContent();
- bStoreMetadataAsContent = pm.storeMetadataAsContent();
-
- try {
- StringBuffer sb = new StringBuffer(DELETION_INDICATOR).append(java.net.InetAddress.getLocalHost().getHostName());
- harvesterUUID = sb.toString();
- } catch (UnknownHostException e) {
- harvesterUUID = DELETION_INDICATOR + "UNKNOWN";
- }
- }
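- 
- // For illustration (hypothetical host name): with the default properties and a host named
- // "harvest01", this leaves nMaxContentLen_bytes at 100000 and sets harvesterUUID to
- // "?DEL?harvest01". Because soft-deleted docs are tagged with url == harvesterUUID,
- // each harvester only ever reclaims its *own* soft-deleted documents.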
-
- /////////////////////////////////////////////////////////////////////////////////////////////////
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Datastore addition
-
- /**
-  * Add a list of documents to the data store, optionally saving content and indexing them
-  * @param docs the documents to store
-  * @param bSaveContent whether to save the (compressed) full text to the content collection
-  * @param source the source being harvested (used to check the per-source index-on-ingest setting)
-  */
- public void addToDatastore(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source) {
- try {
- // Add to the metadata data store
- addToDatastore(DbManager.getDocument().getMetadata(), docs);
- } catch (Exception e) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
- // (note: currently modifies docs, see DocumentPojoIndexMap, so beware if using after this point)
- if (bSaveContent) {
- saveContent(docs);
- }
- boolean index = true;
- if ((null != source) && (null != source.getSearchIndexFilter())) {
- if (null != source.getSearchIndexFilter().indexOnIngest) {
- index = source.getSearchIndexFilter().indexOnIngest;
- }
- }
- if (index) {
- this.addToSearch(docs);
- }
-
- }//TESTED
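- 
- // A minimal usage sketch (hypothetical caller - the doc list and source come from the harvester):
- //
- //   StoreAndIndexManager storeManager = new StoreAndIndexManager();
- //   List<DocumentPojo> harvestedDocs = ...; // (output of the harvest/enrichment pipeline)
- //   SourcePojo source = ...;                // (the source currently being harvested)
- //   storeManager.addToDatastore(harvestedDocs, true, source);
- //   // (writes metadata, saves compressed content, and indexes unless the source's
- //   //  searchIndexFilter.indexOnIngest flag disables indexing on ingest)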
-
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Utilities
-
- /**
- * Add a single doc document to the datastore
- * @param col
- * @param doc
- */
- private void addToDatastore(DBCollection col, DocumentPojo doc) {
- if (!_diagnosticMode) {
- if (!docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
- doc.makeFullTextNonTransient(); // (ie store full text in this case)
- }
- col.save(doc.toDb());
- }
- else {
- System.out.println("StoreAndIndexManager.addToDatastore: " + ((BasicDBObject)doc.toDb()).toString());
- }
- }//TESTED
-
- /**
-  * Add a list of documents to the data store
-  * @param col the metadata collection to write to
-  * @param docs the documents to store
-  */
- private void addToDatastore(DBCollection col, List<DocumentPojo> docs) {
- // Store the knowledge in the feeds collection in the harvester db
- for ( DocumentPojo f : docs) {
-
- // Set an _id before writing it to the datastore,
- // so the same _id gets written to the index
- // NOTE WE OVERWRITE ANY TRANSIENT IDS THAT MIGHT HAVE BEEN SET eg BY REMOVE CODE
- f.setId(new ObjectId());
-
- // Check geo-size: need to add to a different index if so, for memory usage reasons
- if (null == f.getLocs()) { // (can be set by update/deletion code also)
- if (DocumentPojoIndexMap.hasManyGeos(f)) {
- f.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
- // (note this check isn't stateless, it actually populates "locs" at the same time)
- // therefore...
- }
- }
- Set<String> locs = f.getLocs();
- f.setLocs(null);
-
- addToDatastore(col, f);
-
- f.setLocs(locs);
- }
- }//TESTED
- //////////////////////////////////////////////////////////////////////////////////////
- /**
-  * Save the full text of each document to MongoDB for later use
-  *
-  * @param docs the documents whose content should be saved
- */
- private void saveContent(List<DocumentPojo> docs)
- {
- try
- {
- DBCollection contentDb = DbManager.getDocument().getContent();
-
- for ( DocumentPojo doc : docs )
- {
- boolean bStoreContent = true;
- bStoreContent &= (0 != nMaxContentLen_bytes); // (otherwise it's turned off)
- bStoreContent &= this.bStoreMetadataAsContent || ((null != doc.getFullText()) && !doc.getFullText().isEmpty());
- boolean bDocHasExternalContent = docHasExternalContent(doc.getUrl(), doc.getSourceUrl());
-
- if (bStoreContent && bDocHasExternalContent) {
- try
- {
- String rawText = this.bStoreRawContent ? doc.getRawFullText() : null;
- DocumentPojo meta = bStoreMetadataAsContent ? doc : null;
- CompressedFullTextPojo gzippedContent = new CompressedFullTextPojo(doc.getUrl(), doc.getSourceKey(), doc.getCommunityId(),
- doc.getFullText(), rawText, meta, nMaxContentLen_bytes);
-
- if (null != gzippedContent.getUrl()) {
- // Be efficient and write field-by-field vs using JSON conversion
- BasicDBObject query = new BasicDBObject(CompressedFullTextPojo.url_, gzippedContent.getUrl());
- query.put(CompressedFullTextPojo.sourceKey_, gzippedContent.getSourceKey());
- BasicDBObject update = gzippedContent.getUpdate();
- if (!_diagnosticMode) {
- contentDb.update(query, update, true, false); // (ie upsert, supported because query includes shard key==url)
- }
- else {
- System.out.println("StoreAndIndexManager.savedContent, save content: " + gzippedContent.getUrl());
- }
- }
- }
- catch (Exception ex)
- {
- // Do nothing, just carry on
- ex.printStackTrace();
- }
- }//TESTED
- }
- }
- catch (Exception ex)
- {
- // This is a more serious error
- logger.error(ex.getMessage());
- }
- }//TESTED (not changed since by-eye testing in Beta)
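- 
- // For illustration (hypothetical values): for a doc at "http://www.example.com/article.html"
- // from source "feed.example.com.1234", the upsert above runs against the content collection as:
- //   query:  { url: "http://www.example.com/article.html", sourceKey: "feed.example.com.1234" }
- //   update: (the field-by-field update built by CompressedFullTextPojo.getUpdate())
- //   flags:  upsert == true, multi == false (safe because the query includes the shard key, url)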
-
-
- /////////////////////////////////////////////////////////////////////////////////////////////////
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Datastore removal
- /**
- * This function removes documents "soft deleted" by this harvester
- */
-
- public void removeSoftDeletedDocuments()
- {
- BasicDBObject query = new BasicDBObject(DocumentPojo.url_, harvesterUUID);
-
- if (_diagnosticMode) {
- System.out.println("Soft delete: " + DbManager.getDocument().getMetadata().count(query));
- }
- else {
- DbManager.getDocument().getMetadata().remove(query);
- }
- }//TESTED
-
- /**
- * Low level utility to abstract soft deletion
-  * We're using URL because 1) we can't use a shard key,
-  * 2) it needs to be an indexed field,
-  * 3) ideally one that is likely to be cached in memory, and
-  * 4) one that minimizes the chance of having to move the document when modifying the field.
-  * (I also considered sourceUrl or an all-new field; they _might_ be better because smaller, but conversely
-  * would be less likely to be cached, and most importantly there's the risk of 4.)
- */
-
- private BasicDBObject _softDeleter = null;
-
- private BasicDBObject getSoftDeleteUpdate()
- {
- if (null == _softDeleter) {
- BasicDBObject softDeleter = new BasicDBObject(DocumentPojo.url_, harvesterUUID);
- softDeleter.put(DocumentPojo.index_, DELETION_INDICATOR);
- // (used in CustomHadoopTaskLauncher.createConfigXML)
- _softDeleter = new BasicDBObject(DbManager.set_, softDeleter);
- }
- return _softDeleter;
- }//TESTED
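- 
- // For illustration (hypothetical host name "harvest01"), the cached soft-delete update is:
- //   { $set: { url: "?DEL?harvest01", index: "?DEL?" } }
- // and removeSoftDeletedDocuments() later reclaims those docs with the matching query:
- //   { url: "?DEL?harvest01" }
- // (url is indexed, so both the soft delete and the eventual hard delete stay cheap)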
- /**
-  * Remove a list of documents from the data store (you have their _id and sourceKey)
- *
- * CALLED FROM: resizeDB() <- FILLS IN _ID, SOURCEKEY, INDEX, URL, SOURCEURL
- */
- public void removeFromDatastore_byId(List<DocumentPojo> docs) {
- try {
- // Remove from data store
- removeFromDatastore_byId(DbManager.getDocument().getMetadata(), docs);
- this.removeFromSearch(docs);
-
- } catch (Exception e) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
- }//TESTED
-
- /**
-  * Remove a list of documents from the data store (you have their url) AND ALSO from the search index
-  *
-  * @param docs - the child function needs url (optionally sourceUrl) and sourceKey set;
-  *               this function needs _id and index, both of which are set by the child call stack
- *
- * CALLED FROM: MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
- * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
- * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
- * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX, _ID
- * updateHarvestStatus(...)
- */
- public ObjectId removeFromDatastore_byURL(List<DocumentPojo> docs) {
-
- // Remove from data store:
- ObjectId nextId = null;
- try {
- nextId = removeFromDatastore_byURL(DbManager.getDocument().getMetadata(), docs);
- // ^^^ adds "created" (if updateId set), "_id" and "index" to the doc and expands "sourceUrl" docs (adding "_id" and "index")
-
- } catch (Exception e) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
-
- // Remove from index:
-
- try {
- this.removeFromSearch(docs);
-
- } catch (Exception e) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
-
- return nextId;
- }//TESTED
-
- /**
-  * Remove a list of documents from the data store (you have a source key, so you can go much quicker)
- * CALLED FROM: deleteSource(...)
- * @returns the number of docs deleted
- */
- public long removeFromDatastoreAndIndex_bySourceKey(String sourceKey, ObjectId lessThanId, boolean definitelyNoContent, String communityId) {
-
- try {
- if (!definitelyNoContent) {
- DbManager.getDocument().getContent().remove(new BasicDBObject(CompressedFullTextPojo.sourceKey_, sourceKey));
- // (will just check index and pull out if the doc has no external content)
- }
- BasicDBObject query = new BasicDBObject(DocumentPojo.sourceKey_, sourceKey);
- if (null != lessThanId) { // Multiple threads running for this source
- // First check whether one of the other threads has already deleted the source:
- BasicDBObject oneFinalCheckQuery = new BasicDBObject(DocumentPojo.sourceKey_, sourceKey);
- BasicDBObject oneFinalCheckFields = new BasicDBObject(DocumentPojo.index_, 1);
- BasicDBObject firstDocToBeUpdated = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(oneFinalCheckQuery, oneFinalCheckFields);
- if ((null == firstDocToBeUpdated) || firstDocToBeUpdated.getString(DocumentPojo.index_, "").equals(DELETION_INDICATOR))
- {
- //(ie grab the first doc in natural order and tell me if it's been soft-deleted yet, if so do nothing)
- return 0;
- }//TESTED
-
- // That check isn't perfect because of race conditions, so we'll still add the !="?DEL?" check to the
- // update as well:
- query.put(DocumentPojo._id_, new BasicDBObject(DbManager.lte_, lessThanId));
- query.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, DELETION_INDICATOR));
- }//TESTED
-
- BasicDBObject softDeleter = getSoftDeleteUpdate();
- DbManager.getDocument().getMetadata().update(query, softDeleter, false, true);
- // (don't do getLastError just yet since it can block waiting for completion)
-
- // Quick delete for index though:
- StringBuffer sb = new StringBuffer(DocumentPojoIndexMap.manyGeoDocumentIndexCollection_).append(",docs_").append(communityId).append('/').append(DocumentPojoIndexMap.documentType_);
- ElasticSearchManager indexManager = IndexManager.getIndex(sb.toString());
- BaseQueryBuilder soloOrCombinedQuery = QueryBuilders.termQuery(DocumentPojo.sourceKey_, sourceKey);
- if (null != lessThanId) {
- //(_id isn't indexed - _uid is and == _type + "#" + _id)
- soloOrCombinedQuery = QueryBuilders.boolQuery().must(soloOrCombinedQuery).
- must(QueryBuilders.rangeQuery("_uid").lte("document_index#" + lessThanId.toString()));
-
- }//TESTED
- indexManager.doDeleteByQuery(soloOrCombinedQuery);
-
- CommandResult result = DbManager.getDocument().getLastError("metadata");
- return result.getLong("n", 0);
-
- } catch (Exception e) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
- return 0;
- }//TESTED
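- 
- // For illustration (hypothetical values, and assuming the pojo field maps to "sourceKey" in the
- // index): with sourceKey "feed.example.com.1234" and a non-null lessThanId, the
- // delete-by-query above is equivalent to the ES query:
- //   { "bool": { "must": [
- //     { "term":  { "sourceKey": "feed.example.com.1234" } },
- //     { "range": { "_uid": { "lte": "document_index#<lessThanId>" } } }
- //   ] } }
- // (_uid is used because _id isn't indexed; it sorts as type + "#" + id, hence the prefix)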
-
-
- /**
-  * Remove a list of documents from the data store and index (you have a source URL, so you can go much quicker)
- *
- * CALLED FROM: removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
- * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
- * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
- * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
- * @returns the number of docs deleted
- */
-
- private ElasticSearchManager _cachedIndexManagerForSourceXxxDeletion = null;
- private ObjectId _cachedCommunityIdForSourceXxxDeletion = null;
- public long removeFromDatastoreAndIndex_bySourceUrl(String sourceUrl, String sourceKey, ObjectId communityId) {
-
- try {
- // (never any content)
- BasicDBObject query = new BasicDBObject(DocumentPojo.sourceUrl_, sourceUrl);
- query.put(DocumentPojo.sourceKey_, sourceKey);
- BasicDBObject softDeleter = getSoftDeleteUpdate();
- DbManager.getDocument().getMetadata().update(query, softDeleter, false, true);
- CommandResult result = DbManager.getDocument().getLastError("metadata");
-
- // Quick delete for index though:
- if (!communityId.equals(_cachedCommunityIdForSourceXxxDeletion)) {
- StringBuffer sb = new StringBuffer(DocumentPojoIndexMap.manyGeoDocumentIndexCollection_).append(",docs_").append(communityId).append('/').append(DocumentPojoIndexMap.documentType_);
- _cachedIndexManagerForSourceXxxDeletion = IndexManager.getIndex(sb.toString());
- _cachedCommunityIdForSourceXxxDeletion = communityId;
- }//TESTED
- _cachedIndexManagerForSourceXxxDeletion.doDeleteByQuery(
- QueryBuilders.boolQuery()
- .must(QueryBuilders.termQuery(DocumentPojo.sourceUrl_, sourceUrl))
- .must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sourceKey))
- );
-
- return result.getLong("n", 0);
-
- } catch (Exception e) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
- return 0;
- }//TESTED
-
-
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Utility
-
- /**
-  * Remove a list of documents from the data store; also adds the _id and index fields to each doc to support de-indexing
-  * (also adds created to docs with an updateId so the created remains ~the same)
-  * (Will in theory support arbitrary sourceUrl/sourceKey operators but in practice these will always be from a single source)
-  * @param docs - needs url (optionally sourceUrl) set; the child function requires sourceKey
-  * @param col the metadata collection to update
- *
- * CALLED FROM: removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
- * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
- * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
- * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
- * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX, _ID
- * updateHarvestStatus(...)
- */
- private ObjectId removeFromDatastore_byURL(DBCollection col, List<DocumentPojo> docs) {
- ObjectId nextId = null;
- BasicDBObject fields = new BasicDBObject();
- fields.put(DocumentPojo.created_, 1); // (If we're getting the deleted doc fields, get this and have exact created time)
- fields.put(DocumentPojo.index_, 1); // This is needed for the removeFromSearch() called from parent removeFromDatastore_URL
-
- TreeMap<String,DocumentPojo> sourceUrlToKeyMap = null;
- HashSet<String> deletedSources = null;
- // Store the knowledge in the feeds collection in the harvester db
- Iterator<DocumentPojo> docIt = docs.iterator();
- while (docIt.hasNext()) {
- DocumentPojo f = docIt.next();
- nextId = f.getId(); // (only interested in the pruneSource case, in which case _id is set on input)
-
- if ((null != f.getSourceUrl()) && (null == f.getUrl())) { // special case ... delete all these documents...
- if ((null == deletedSources) || !deletedSources.contains(f.getSourceKey())) { // (don't bother deleting sourceURL if deleting source)
- if (null == sourceUrlToKeyMap) {
- sourceUrlToKeyMap = new TreeMap<String,DocumentPojo>();
- }
- sourceUrlToKeyMap.put(f.getSourceUrl(), f);
- }//TESTED
- docIt.remove(); // (so don't miscount number of docs; processed below)
- }
- else if (null != f.getSourceKey() && (null == f.getSourceUrl()) && (null == f.getUrl())) {
- // Even more special case: delete entire sourceKey
- if (null == deletedSources) {
- deletedSources = new HashSet<String>();
- }
- if (!deletedSources.contains(f.getSourceKey())) {
- deletedSources.add(f.getSourceKey());
- long srcRemoved = removeFromDatastoreAndIndex_bySourceKey(f.getSourceKey(), f.getId(), true, f.getCommunityId().toString());
- if (srcRemoved > 0) {
- updateDocCountsOnTheFly(-srcRemoved, f.getSourceKey(), f.getCommunityId());
- }
- }
- docIt.remove(); // (so don't miscount number of docs)
- }//TESTED
- else {
- removeFromDatastore_byURL(col, f, fields,
- StoreAndIndexManager.docHasExternalContent(f.getUrl(), f.getSourceUrl()));
- // (adds "_id", "index")
- }
- }//TESTED
- // Now tidy up sourceUrls, do some caching across sourceKey/community for performance
- String sourceKey = null; // (if deleting sourceKey don't bother deleting any sourceUrls)
- long removed = 0; // (from special operations)
- String cachedSourceKey = null; // (will handle multiple source keys, although that can't currently happen in practice)
- ObjectId cachedCommunityId = null;
- ObjectId communityId = null;
- if (null != sourceUrlToKeyMap) for (Map.Entry<String, DocumentPojo> entry: sourceUrlToKeyMap.entrySet()) {
- String srcUrl = entry.getKey();
- DocumentPojo doc = entry.getValue();
- sourceKey = doc.getSourceKey();
- communityId = doc.getCommunityId();
- if (sourceKey != cachedSourceKey) { // ptr comparison by design
- if ((removed > 0) && (null != cachedSourceKey)) {
- // (flush the count accumulated against the *previous* source key, not the new one)
- updateDocCountsOnTheFly(-removed, cachedSourceKey, cachedCommunityId);
- removed = 0;
- }//TESTED
- cachedSourceKey = sourceKey;
- cachedCommunityId = communityId;
- }
- removed += removeFromDatastoreAndIndex_bySourceUrl(srcUrl, sourceKey, communityId);
- }//TESTED
- if ((removed > 0) && (null != sourceKey)) {
- updateDocCountsOnTheFly(-removed, sourceKey, communityId);
- }//TESTED
- return nextId;
- }//TESTED
-
- public void updateDocCountsOnTheFly(long docIncrement, String sourceKey, ObjectId communityId)
- {
- DbManager.getDocument().getCounts().update(new BasicDBObject(DocCountPojo._id_, communityId),
- new BasicDBObject(DbManager.inc_, new BasicDBObject(DocCountPojo.doccount_, docIncrement)));
- DbManager.getIngest().getSource().update(new BasicDBObject(SourcePojo.key_, sourceKey),
- new BasicDBObject(MongoDbManager.inc_,
- new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, docIncrement))
- );
- }//TESTED
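- 
- // For illustration (hypothetical values): updateDocCountsOnTheFly(-25, "feed.example.com.1234", communityId)
- // issues two $inc updates, one per collection:
- //   doc counts collection: query { _id: <communityId> },           update { $inc: { <doccount field>: -25 } }
- //   source collection:     query { key: "feed.example.com.1234" }, update { $inc: { <harvest doc-count field>: -25 } }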
-
- /**
- * Remove a doc from the data store
- * @param col
- * @param doc
- * @param fields - fields to retrieve
- *
- * CALLED FROM: removeFromDataStore_byId(List<doc>, bDeleteContent)
- * resizeDB() <- FILLS IN _ID, SOURCEKEY, INDEX, SOURCEURL
- */
- private void removeFromDatastore_byId(DBCollection col, List<DocumentPojo> docs) {
- // Store the knowledge in the feeds collection in the harvester db
- for ( DocumentPojo f : docs) {
- removeFromDatastore_byId(col, f);
- }
- }//TESTED
-
- /**
- * Remove a doc from the data store
- * @param col
- * @param doc - assumes _id set
- * @param fields - fields to retrieve (set in outside the doc loop for performance, url, index, sourceKey)
- *
- * CALLED FROM: removeFromDataStore_byId(col, List<doc>, bDeleteContent)
- * removeFromDataStore_byId(List<doc>, bDeleteContent)
- * resizeDB() <- _ID, SOURCEKEY, INDEX, SOURCEURL
- */
- private void removeFromDatastore_byId(DBCollection col, DocumentPojo doc) {
-
- boolean bDeleteContent = docHasExternalContent(doc.getUrl(), doc.getSourceUrl());
-
- if (bDeleteContent) {
- // Remove its content also:
- if (!_diagnosticMode) {
- BasicDBObject contentQuery = new BasicDBObject(DocumentPojo.url_, doc.getUrl());
- contentQuery.put(DocumentPojo.sourceKey_, doc.getSourceKey());
- DbManager.getDocument().getContent().remove(contentQuery);
- }
- else {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete content: " + doc.getSourceKey() + "/" + doc.getUrl());
- }
- }
-
- // Update MongoDB with the data
- BasicDBObject query = new BasicDBObject();
- query.put(DocumentPojo._id_, doc.getId());
- query.put(DocumentPojo.sourceKey_, doc.getSourceKey()); // (needed because on newer machines this is the shard key)
-
- if (!_diagnosticMode) {
- BasicDBObject softDelete = getSoftDeleteUpdate();
- col.update(query, softDelete);
- // (can do this on sharded collections because it uses sourceKey+_id, the shard key)
- }
- else { // (diagnostic mode)
- if (null != col.findOne(query)) {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete: " + doc.toDb().toString());
- }
- else {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete: DOC NOT FOUND");
- }
- }
- }//TESTED (1.1)
-
- /**
- * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion)
- * @param col
- * @param doc - needs url, sourceKey set
- * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance
- *
- * CALLED FROM: removeFromDatastore_byURL(col, List<doc>, bDeleteContent) <- ADDS INDEX, CREATED TO FIELDS
- * removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
- * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
- * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
- * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
- * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX
- * updateHarvestStatus(...)
- */
- private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields, boolean bDeleteContent) {
-
- // 1] Create the query to soft delete the document
-
- BasicDBObject query = new BasicDBObject();
- query.put(DocumentPojo.url_, doc.getUrl());
- query.put(DocumentPojo.sourceKey_, doc.getSourceKey());
- // 2] Delete the content if needed
-
- if (bDeleteContent) {
- if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
- if (!_diagnosticMode) {
- DbManager.getDocument().getContent().remove(query);
- }
- else {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: " + doc.getSourceKey() + "/" + doc.getUrl());
- }
- }
- }
- //TESTED
-
- // 3] Work out which fields we have and which (if any we need to go and fetch):
-
- boolean needToFindAndModify = false;
-
- if (null == doc.getId()) { // This is called from processDocuments
-
- if (null != doc.getUpdateId()) { // update case...
- doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good)
- // (doc.index is populated but may not be correct because of the "many geos" workaround):
- if (DocumentPojoIndexMap.hasManyGeos(doc)) {
- doc.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
- // (note this check isn't stateless, it actually populates "locs" at the same time
- // this is handled in addToDatastore (update case), temp removed when adding to DB
- }//TESTED (2.1.2, diagnostic mode, doc2)
- }
- else { // Not an update case, we're going to have to grab the document after all, which is a bit slower
- needToFindAndModify = true;
- }
- }//TESTED (2.1.2, diagnostic mode, doc2)
- if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore
- if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc)
- doc.setCreated(new Date(doc.getUpdateId().getTime()));
- }//TESTED (2.1.2, diagnostic mode, doc2)
- }
- // (if we're here and index is not set, then it is intended to be null)
-
- // 4] Update the doc_metadata collection
-
- BasicDBObject softDelete = getSoftDeleteUpdate();
- BasicDBObject deadDoc = null; // (not normally needed)
-
- if (needToFindAndModify) { // less pleasant, need to go grab the doc
- deadDoc = (BasicDBObject) col.findOne(query, fields);
- }//TESTED (2.1.2)
-
- if (!_diagnosticMode) {
- col.update(query, softDelete, false, true); // (needs to be multi- even though there's a single element for sharding reasons)
- }//TESTED (2.1.2)
-
- // 5] Add fields if necessary
-
- if (null != deadDoc) {
- doc.setCreated((Date) deadDoc.get(DocumentPojo.created_));
- // (if getting this doc anyway then might as well get the created)
- doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_));
- doc.setIndex((String) deadDoc.get(DocumentPojo.index_));
-
- if (_diagnosticMode) {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString());
- }
- }//TESTED (2.1.2)
- else if (_diagnosticMode) {
- if (!needToFindAndModify) {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted " + doc.toDb().toString());
- }
- else {
- System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString());
- }
- }//TESTED (2.1.2)
- }//TESTED (2.1.2)
- /////////////////////////////////////////////////////////////////////////////////////////////////
- /////////////////////////////////////////////////////////////////////////////////////////////////
- // Synchronize database with index
-
- /**
-  * Add a list of documents to the full text index
- * @param docs
- */
- public void addToSearch(List<DocumentPojo> docs)
- {
- String sSavedIndex = null;
- ElasticSearchManager indexManager = null;
- LinkedList<DocumentPojo> tmpDocs = new LinkedList<DocumentPojo>();
- int nTmpDocs = 0;
- for ( DocumentPojo doc : docs )
- {
- String sThisDocIndex = doc.getIndex();
-
- if ((null == sSavedIndex) || (null == sThisDocIndex) || !sSavedIndex.equals(sThisDocIndex)) { // Change index
-
- if (null != indexManager) { // ie not first time through, bulk add what docs we have
- sendToIndex(indexManager, tmpDocs);
- // (ie with the *old* index manager)
- nTmpDocs = 0;
- }
- sSavedIndex = sThisDocIndex;
- if ((null == sSavedIndex) || (sSavedIndex.equals(DocumentPojoIndexMap.globalDocumentIndex_))) {
- indexManager = IndexManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
- }
- else {
- indexManager = IndexManager.getIndex(new StringBuffer(sSavedIndex).append('/').
- append(DocumentPojoIndexMap.documentType_).toString());
- }
- }//TESTED
-
- tmpDocs.add(doc);
- nTmpDocs++;
-
- if (nTmpDocs > 5000) { // some sensible upper limit
- sendToIndex(indexManager, tmpDocs);
- nTmpDocs = 0;
- }
-
- if (_diagnosticMode) {
- System.out.println("StoreAndIndexManager.addToSearch, add: " + doc.getId() + " + " +
- ((null != doc.getEntities())?("" + doc.getEntities().size()):"0") + " entities, " +
- ((null != doc.getAssociations())?("" + doc.getAssociations().size()):"0") + " assocs, " +
- ((null != doc.getLocs())?("" + doc.getLocs().size()):"0") + " locs"
- );
- }
- }// (end loop over docs)
-
- // Bulk add remaining docs
- sendToIndex(indexManager, tmpDocs);
-
- }//TESTED (not changed since by-eye testing in Beta)
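- 
- // Note on batching: the loop above flushes whenever the index changes between consecutive
- // documents (or after 5000 docs), so callers get the fewest bulk requests when the doc list
- // is grouped by index. For illustration, the index sequence [A, A, B, A] produces three bulk
- // writes ([A, A], [B], [A]) whereas [A, A, A, B] produces two ([A, A, A], [B]).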
-
- // Utility required by the above function
-
- private void sendToIndex(ElasticSearchManager indexManager, LinkedList<DocumentPojo> docsToAdd) {
- try {
- if (!docsToAdd.isEmpty()) {
- if (!_diagnosticMode) {
- indexManager.bulkAddDocuments(IndexManager.mapListToIndex(docsToAdd, new TypeToken<LinkedList<DocumentPojo>>(){},
- new DocumentPojoIndexMap()), DocumentPojo._id_, null, true);
- }
- else {
- System.out.println("StoreAndIndexManager.addToSearch: index " + docsToAdd.size() + " documents to " + indexManager.getIndexName());
- }
- docsToAdd.clear();
- }
- }
- catch (Exception ex)
- {
- ex.printStackTrace();
- logger.error("Exception Message saving document to ES: " + ex.getMessage(), ex);
- }
- }//TESTED
-
- /**
- *
- * @param docs (just need the id and the index and any events)
- */
- public void removeFromSearch(List<DocumentPojo> docs)
- {
- String sIndex = null;
- ElasticSearchManager indexManager = null;
- LinkedList<String> tmpDocs = new LinkedList<String>();
- int nTmpDocs = 0;
- for ( DocumentPojo doc : docs )
- {
- if (null == doc.getId()) { // Normally this will be sourceUrls, eg files pointing to many docs
- continue; // (ie can just ignore)
- }
- if ((null != doc.getIndex()) && doc.getIndex().equals("?DEL?")) {
- continue; //(must have already been deleted, so can ignore)
- }
- if ((null == sIndex) || (null == doc.getIndex()) || !sIndex.equals(doc.getIndex())) { // Change index
-
- if (null != indexManager) { // ie not first time through, bulk delete what docs we have
- deleteFromIndex(indexManager, tmpDocs); // (clears tmpDocs)
- // (ie with the *old* index manager)
- nTmpDocs = 0;
- }
- sIndex = doc.getIndex();
- if ((null == sIndex) || (sIndex.equals(DocumentPojoIndexMap.globalDocumentIndex_))) {
- indexManager = IndexManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
- }
- else {
- indexManager = IndexManager.getIndex(new StringBuffer(sIndex).append('/').append(DocumentPojoIndexMap.documentType_).toString());
- }
- }//TESTED
-
- tmpDocs.add(doc.getId().toString());
- nTmpDocs++;
-
- if (nTmpDocs > 5000) { // some sensible upper limit
- deleteFromIndex(indexManager, tmpDocs); // (clears tmpDocs)
- nTmpDocs = 0;
- }
-
- //delete from event search
- if (_diagnosticMode) {
- System.out.println("StoreAndIndexManager.removeFromSearch, remove: " + doc.getId() + " + " +
- ((null != doc.getEntities())?("" + doc.getEntities().size()):"0") + " entities, " +
- ((null != doc.getAssociations())?("" + doc.getAssociations().size()):"0") + " assocs, " +
- ((null != doc.getLocs())?("" + doc.getLocs().size()):"0") + " locs"
- );
- }
- } // (end loop over docs)
-
- // Bulk remove remaining docs
- deleteFromIndex(indexManager, tmpDocs);
-
- }//TESTED (not changed since by-eye testing in Beta)
-
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Utility required by the above function
-
- private void deleteFromIndex(ElasticSearchManager indexManager, LinkedList<String> docsToDelete) {
- try {
- if (!docsToDelete.isEmpty()) {
- if (!_diagnosticMode) {
- indexManager.bulkDeleteDocuments(docsToDelete);
- }
- else {
- System.out.println("StoreAndIndexManager.removeFromSearch: index " + docsToDelete.size() + " documents from " + indexManager.getIndexName());
- }
- docsToDelete.clear();
- }
- }
- catch (Exception ex)
- {
- ex.printStackTrace();
- logger.error("Exception Message deleting document from ES: " + ex.getMessage(), ex);
- }
- }//TESTED
-
- /////////////////////////////////////////////////////////////////////////////////////////////////
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Handle resizing the DB if it gets too large
- // Utility function for diagnostic prints etc
- public long getDatabaseSize() {
- return DbManager.getDocument().getMetadata().count();
- }
- /**
- * This function checks if DB storage requirements are met,
- * if not it will start removing docs based on least used/oldest
- *
-  * @return false if the DB was already within bounds (no resize needed), true if a resize was attempted (even if an error occurred)
- */
- public boolean resizeDB()
- {
- return resizeDB(-1);
- }
- public boolean resizeDB(long capacityOverride)
- {
- //Do quick check to check if we are already under storage requirements
- if ( checkStorageCapacity(capacityOverride) ) {
- return false;
- }
- else
- {
- //if quick check fails, start removing docs to get under requirement
- try
- {
- long currDocsInDB = DbManager.getDocument().getMetadata().count();
- long storageCap = (capacityOverride == -1L) ? new PropertiesManager().getStorageCapacity() : capacityOverride;
- List<DocumentPojo> docsToRemove = getLeastActiveDocs((int) (currDocsInDB-storageCap));
- // (populates docsToRemove with _id and sourceKey - needed to support doc_metadata sharding)
-
- removeFromDatastore_byId(docsToRemove); // (remove content since don't know if it exists)
- //(^ this also removes from index)
- return true;
- }
- catch (Exception e)
- {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- return true;
- }
- }
- }//TESTED
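- 
- // For illustration (hypothetical numbers): with 1,050,000 docs in the metadata collection and a
- // configured storage capacity of 1,000,000, resizeDB() soft-deletes the 50,000 oldest-created
- // docs (approximated by ascending _id) and removes them from the search index.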
- /////////////////////////////////////////////////////////////////////////////////////////////////
-
- // Utility
-
- /**
-  * This method checks if the doc count is
-  * below the threshold set in the properties
-  * @return true if below the threshold, false if not
- */
- private boolean checkStorageCapacity(long capacityOverride)
- {
- long currDocsInDB = 0;
- try {
- currDocsInDB = DbManager.getDocument().getMetadata().count();
- } catch (Exception e ) {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
- long storageCapacity = (-1L == capacityOverride) ? new PropertiesManager().getStorageCapacity() : capacityOverride;
- return (currDocsInDB <= storageCapacity);
- }
- /**
- * Returns a list of the least active documents
- * List is of length numDocs
- *
-  * @param numDocs Number of least active documents to return
-  * @return a list of the least active documents in the DB (each populated with _id and sourceKey - needed to support doc_metadata sharding)
- */
- private List<DocumentPojo> getLeastActiveDocs(int numDocs)
- {
- List<DocumentPojo> olddocs = null;
- //TODO (INF-1301): WRITE AN ALGORITHM TO CALCULATE THIS BASED ON USAGE, just using time last accessed currently
- //give a weight to documents age and documents activity to calculate
- //least active (current incarnation doesn't work)
- try
- {
- BasicDBObject fields = new BasicDBObject(DocumentPojo._id_, 1);
- fields.put(DocumentPojo.sourceKey_, 1);
- fields.put(DocumentPojo.index_, 1);
- fields.put(DocumentPojo.sourceUrl_, 1);
- fields.put(DocumentPojo.url_, 1);
- DBCursor dbc = DbManager.getDocument().getMetadata().find(new BasicDBObject(), fields).
- sort(new BasicDBObject(DocumentPojo._id_,1)).limit(numDocs);
- // (note, just retrieve the fields needed for deletion: _id starts with a timestamp so these are approximately the oldest created)
- olddocs = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
- }
- catch (Exception e )
- {
- // If an exception occurs log the error
- logger.error("Exception Message: " + e.getMessage(), e);
- }
- return olddocs;
- }//TESTED (1.1)
- ////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////
- // Utility
- // Utility function to decide if we need to add/remove content via the external table
- // (ie JDBC and XML have their content as part of their metadata, eg fields
- // others like HTTP and Files can have large amounts of content that we don't want to store in the DB object)
- // Called from: (noted here because this needs to be tidied up at some point)
- // StoreAndIndexManager.addToDatastore
- // MongoDocumentTxfer.doTransfer
- // SourceUtils.pruneSource
- // StoreAndIndexManager.removeFromDataStore_by(Id|SourceKey|Url)
- // StoreAndIndexManager.saveContent
-
- static public boolean docHasExternalContent(String url, String srcUrl) {
- //TODO: INF-1367: there's an issue with this .. suppose it's some enormous JSON file
- // and we excise a bunch of JSON fields from the metadata (after using them for processing)
- // seems like we should have an optional keepExternalContent that defaults to the return value
- // of this function, but you can override from the SAH or whatever
-
- if (null != srcUrl) { // must be either JSON or XML or *sv
- return false;
- }
- else if (null == url) { // no idea, pathological case?!
- return true;
- }
- else if (url.startsWith("jdbc:")) { // DB entry
- return false;
- }
- else if (url.startsWith("inf://custom/")) { // custom entry
- return false;
- }
- else if ((url.startsWith("smb://") || url.startsWith("file:") || url.startsWith("s3://") || url.startsWith("inf://")) &&
- (url.endsWith(".xml") || url.endsWith(".json") || url.endsWith("sv")))
- // JSON/XML/*sv but 1 doc/file
- {
- return false;
- }
- return true;
- }
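- 
- // For illustration (hypothetical URLs), the rules above give:
- //   docHasExternalContent("http://www.example.com/article.html", null) -> true  (content stored externally)
- //   docHasExternalContent("jdbc:mysql://db.example.com/feeds", null)   -> false (DB record)
- //   docHasExternalContent("file:///data/batch1.json", null)            -> false (JSON but 1 doc/file)
- //   docHasExternalContent("http://www.example.com/feed.xml#item1",
- //                         "http://www.example.com/feed.xml")           -> false (sourceUrl set)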
- }