PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/core/infinit.e.processing.generic.library/src/com/ikanow/infinit/e/processing/generic/store_and_index/StoreAndIndexManager.java

https://github.com/IKANOW/Infinit.e
Java | 1009 lines | 603 code | 140 blank | 266 comment | 167 complexity | 62586a4c4c797a04244a503a51f21aa4 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.processing.generic.store_and_index;
  17. import java.net.UnknownHostException;
  18. import java.util.Date;
  19. import java.util.HashSet;
  20. import java.util.Iterator;
  21. import java.util.LinkedList;
  22. import java.util.List;
  23. import java.util.Map;
  24. import java.util.Set;
  25. import java.util.TreeMap;
  26. import org.apache.log4j.Logger;
  27. import org.bson.types.ObjectId;
  28. import org.elasticsearch.index.query.BaseQueryBuilder;
  29. import org.elasticsearch.index.query.QueryBuilders;
  30. import com.google.gson.reflect.TypeToken;
  31. import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
  32. import com.ikanow.infinit.e.data_model.index.IndexManager;
  33. import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
  34. import com.ikanow.infinit.e.data_model.store.DbManager;
  35. import com.ikanow.infinit.e.data_model.store.MongoDbManager;
  36. import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
  37. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  38. import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
  39. import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
  40. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  41. import com.ikanow.infinit.e.data_model.utils.PropertiesManager;
  42. import com.mongodb.BasicDBObject;
  43. import com.mongodb.CommandResult;
  44. import com.mongodb.DBCollection;
  45. import com.mongodb.DBCursor;
  46. /**
  47. * Class used to commit records to backend storage during harvest process
  48. * @author cmorgan
  49. *
  50. */
  51. public class StoreAndIndexManager {
    // Initialize the Logger
    private static final Logger logger = Logger.getLogger(StoreAndIndexManager.class);
    // When true, no datastore/index writes are performed; the intended action is printed to stdout instead
    private static boolean _diagnosticMode = false;
    public static void setDiagnosticMode(boolean bMode) { _diagnosticMode = bMode; }
    private int nMaxContentLen_bytes = 100000; // (100KB default max)
    private boolean bStoreRawContent = false; // (store the raw as well as the processed data)
    private boolean bStoreMetadataAsContent = false; // (store the metadata in the content block)
    // Marker value written into soft-deleted documents (also the prefix of harvesterUUID below)
    public final static String DELETION_INDICATOR = "?DEL?";
    // "?DEL?" + local hostname; written into the url field of soft-deleted docs so each
    // harvester can later find (and physically remove) its own soft deletions
    private String harvesterUUID = null;
    public String getUUID() { return harvesterUUID; }
  62. public StoreAndIndexManager() {
  63. com.ikanow.infinit.e.processing.generic.utils.PropertiesManager pm =
  64. new com.ikanow.infinit.e.processing.generic.utils.PropertiesManager();
  65. int nMaxContent = pm.getMaxContentSize();
  66. if (nMaxContent > -1) {
  67. nMaxContentLen_bytes = nMaxContent;
  68. }
  69. bStoreRawContent = pm.storeRawContent();
  70. bStoreMetadataAsContent = pm.storeMetadataAsContent();
  71. try {
  72. StringBuffer sb = new StringBuffer(DELETION_INDICATOR).append(java.net.InetAddress.getLocalHost().getHostName());
  73. harvesterUUID = sb.toString();
  74. } catch (UnknownHostException e) {
  75. harvesterUUID = DELETION_INDICATOR + "UNKNOWN";
  76. }
  77. }
  78. /////////////////////////////////////////////////////////////////////////////////////////////////
  79. /////////////////////////////////////////////////////////////////////////////////////////////////
  80. // Datastore addition
  81. /**
  82. * Add a list of doc documents to the data store
  83. * @param feeds
  84. */
  85. public void addToDatastore(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source) {
  86. try {
  87. // Create collection manager
  88. // Add to data store
  89. addToDatastore(DbManager.getDocument().getMetadata(), docs);
  90. } catch (Exception e) {
  91. // If an exception occurs log the error
  92. logger.error("Exception Message: " + e.getMessage(), e);
  93. }
  94. // (note: currently modifies docs, see DocumentIndexPojoMap, so beware if using after this point)
  95. if (bSaveContent) {
  96. saveContent(docs);
  97. }
  98. boolean index = true;
  99. if ((null != source) && (null != source.getSearchIndexFilter())) {
  100. if (null != source.getSearchIndexFilter().indexOnIngest) {
  101. index = source.getSearchIndexFilter().indexOnIngest;
  102. }
  103. }
  104. if (index) {
  105. this.addToSearch(docs);
  106. }
  107. }//TESTED
  108. /////////////////////////////////////////////////////////////////////////////////////////////////
  109. // Utilities
  110. /**
  111. * Add a single doc document to the datastore
  112. * @param col
  113. * @param doc
  114. */
  115. private void addToDatastore(DBCollection col, DocumentPojo doc) {
  116. if (!_diagnosticMode) {
  117. if (!docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
  118. doc.makeFullTextNonTransient(); // (ie store full text in this case)
  119. }
  120. col.save(doc.toDb());
  121. }
  122. else {
  123. System.out.println("StoreAndIndexManager.addToDatastore: " + ((BasicDBObject)doc.toDb()).toString());
  124. }
  125. }//TESTED
  126. /**
  127. * Add a list of doc documents to the data store
  128. * @param feeds
  129. */
  130. private void addToDatastore(DBCollection col, List<DocumentPojo> docs) {
  131. // Store the knowledge in the feeds collection in the harvester db
  132. for ( DocumentPojo f : docs) {
  133. // Set an _id before writing it to the datastore,
  134. // so the same _id gets written to the index
  135. // NOTE WE OVERWRITE ANY TRANSIENT IDS THAT MIGHT HAVE BEEN SET eg BY REMOVE CODE
  136. f.setId(new ObjectId());
  137. // Check geo-size: need to add to a different index if so, for memory usage reasons
  138. if (null == f.getLocs()) { // (can be set by update/deletion code also)
  139. if (DocumentPojoIndexMap.hasManyGeos(f)) {
  140. f.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
  141. // (note this check isn't stateless, it actually populates "locs" at the same time)
  142. // therefore...
  143. }
  144. }
  145. Set<String> locs = f.getLocs();
  146. f.setLocs(null);
  147. addToDatastore(col, f);
  148. f.setLocs(locs);
  149. }
  150. }//TESTED
  151. //////////////////////////////////////////////////////////////////////////////////////
  152. /**
  153. * Save the fulltext of a pojo to mongo for using later
  154. *
  155. * @param docs
  156. */
  157. private void saveContent(List<DocumentPojo> docs)
  158. {
  159. try
  160. {
  161. DBCollection contentDb = DbManager.getDocument().getContent();
  162. for ( DocumentPojo doc : docs )
  163. {
  164. boolean bStoreContent = true;
  165. bStoreContent &= (0 != nMaxContentLen_bytes); // (otherwise it's turned off)
  166. bStoreContent &= this.bStoreMetadataAsContent || ((null != doc.getFullText()) && !doc.getFullText().isEmpty());
  167. boolean bDocHasExternalContent = docHasExternalContent(doc.getUrl(), doc.getSourceUrl());
  168. if (bStoreContent && bDocHasExternalContent) {
  169. try
  170. {
  171. String rawText = this.bStoreRawContent ? doc.getRawFullText() : null;
  172. DocumentPojo meta = bStoreMetadataAsContent ? doc : null;
  173. CompressedFullTextPojo gzippedContent = new CompressedFullTextPojo(doc.getUrl(), doc.getSourceKey(), doc.getCommunityId(),
  174. doc.getFullText(), rawText, meta, nMaxContentLen_bytes);
  175. if (null != gzippedContent.getUrl()) {
  176. // Be efficient and write field-by-field vs using JSON conversion
  177. BasicDBObject query = new BasicDBObject(CompressedFullTextPojo.url_, gzippedContent.getUrl());
  178. query.put(CompressedFullTextPojo.sourceKey_, gzippedContent.getSourceKey());
  179. BasicDBObject update = gzippedContent.getUpdate();
  180. if (!_diagnosticMode) {
  181. contentDb.update(query, update, true, false); // (ie upsert, supported because query includes shard key==url)
  182. }
  183. else {
  184. System.out.println("StoreAndIndexManager.savedContent, save content: " + gzippedContent.getUrl());
  185. }
  186. }
  187. }
  188. catch (Exception ex)
  189. {
  190. // Do nothing, just carry on
  191. ex.printStackTrace();
  192. }
  193. }//TESTED
  194. }
  195. }
  196. catch (Exception ex)
  197. {
  198. // This is a more serious error
  199. logger.error(ex.getMessage());
  200. }
  201. }//TESTED (not changed since by-eye testing in Beta)
  202. /////////////////////////////////////////////////////////////////////////////////////////////////
  203. /////////////////////////////////////////////////////////////////////////////////////////////////
  204. // Datastore removal
  205. /**
  206. * This function removes documents "soft deleted" by this harvester
  207. */
  208. public void removeSoftDeletedDocuments()
  209. {
  210. BasicDBObject query = new BasicDBObject(DocumentPojo.url_, harvesterUUID);
  211. if (_diagnosticMode) {
  212. System.out.println("Soft delete: " + DbManager.getDocument().getMetadata().count(query));
  213. }
  214. else {
  215. DbManager.getDocument().getMetadata().remove(query);
  216. }
  217. }//TESTED
  218. /**
  219. * Low level utility to abstract soft deletion
  220. * We're using URL because 1) we cant' use a shard key
  221. * 2) it needs to be an indexed field
  222. * 3) ideally one that is likely to be cached in memory
  223. * 4) one that minimizes the chance of having to move the document when modifying the field
  224. * (I also considered sourceUrl or an all new field, they _might_ be better because smaller, but conversely
  225. * would be less likely to be cached and most importantly there's the risk of 4)
  226. */
  227. private BasicDBObject _softDeleter = null;
  228. private BasicDBObject getSoftDeleteUpdate()
  229. {
  230. if (null == _softDeleter) {
  231. BasicDBObject softDeleter = new BasicDBObject(DocumentPojo.url_, harvesterUUID);
  232. softDeleter.put(DocumentPojo.index_, DELETION_INDICATOR);
  233. // (used in CustomHadoopTaskLauncher.createConfigXML)
  234. _softDeleter = new BasicDBObject(DbManager.set_, softDeleter);
  235. }
  236. return _softDeleter;
  237. }//TESTED
  238. /**
  239. * Remove a list of doc documents from the data store (you have their _id and sourceKey)
  240. *
  241. * CALLED FROM: resizeDB() <- FILLS IN _ID, SOURCEKEY, INDEX, URL, SOURCEURL
  242. */
  243. public void removeFromDatastore_byId(List<DocumentPojo> docs) {
  244. try {
  245. // Remove from data store
  246. removeFromDatastore_byId(DbManager.getDocument().getMetadata(), docs);
  247. this.removeFromSearch(docs);
  248. } catch (Exception e) {
  249. // If an exception occurs log the error
  250. logger.error("Exception Message: " + e.getMessage(), e);
  251. }
  252. }//TESTED
  253. /**
  254. * Remove a list of doc documents from the data store (you have their url) AND ALSO the search index
  255. *
  256. * @param docs - child function needs url (optionally sourceUrl) set - child function requires sourceKey
  257. * this function needs id and index both of which are set by the child stack
  258. *
  259. * CALLED FROM: MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
  260. * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
  261. * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
  262. * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX, _ID
  263. * updateHarvestStatus(...)
  264. */
  265. public ObjectId removeFromDatastore_byURL(List<DocumentPojo> docs) {
  266. // Remove from data store:
  267. ObjectId nextId = null;
  268. try {
  269. nextId = removeFromDatastore_byURL(DbManager.getDocument().getMetadata(), docs);
  270. // ^^^ adds "created" (if updateId set), "_id" and "index" to the doc and expands "sourceUrl" docs (adding "_id" and "index")
  271. } catch (Exception e) {
  272. // If an exception occurs log the error
  273. logger.error("Exception Message: " + e.getMessage(), e);
  274. }
  275. // Remove from index:
  276. try {
  277. this.removeFromSearch(docs);
  278. } catch (Exception e) {
  279. // If an exception occurs log the error
  280. logger.error("Exception Message: " + e.getMessage(), e);
  281. }
  282. return nextId;
  283. }//TESTED
    /**
     * Remove a list of doc documents from the data store (you have a source key, so you can go much quicker)
     * CALLED FROM: deleteSource(...)
     * @param sourceKey the source whose docs are to be (soft) deleted
     * @param lessThanId if non-null, multiple threads are deleting this source; only docs with _id &lt;= lessThanId are touched
     * @param definitelyNoContent if true, skips clearing the external content collection
     * @param communityId used to build the search index name for the quick index delete
     * @returns the number of docs deleted (0 on error or if another thread already deleted the source)
     */
    public long removeFromDatastoreAndIndex_bySourceKey(String sourceKey, ObjectId lessThanId, boolean definitelyNoContent, String communityId) {
        try {
            if (!definitelyNoContent) {
                DbManager.getDocument().getContent().remove(new BasicDBObject(CompressedFullTextPojo.sourceKey_, sourceKey));
                // (will just check index and pull out if the doc has no external content)
            }
            BasicDBObject query = new BasicDBObject(DocumentPojo.sourceKey_, sourceKey);
            if (null != lessThanId) { // Multiple threads running for this source
                // First check whether one of the other threads has already deleted the source:
                BasicDBObject oneFinalCheckQuery = new BasicDBObject(DocumentPojo.sourceKey_, sourceKey);
                BasicDBObject oneFinalCheckFields = new BasicDBObject(DocumentPojo.index_, 1);
                BasicDBObject firstDocToBeUpdated = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(oneFinalCheckQuery, oneFinalCheckFields);
                if ((null == firstDocToBeUpdated) || firstDocToBeUpdated.getString(DocumentPojo.index_, "").equals(DELETION_INDICATOR))
                {
                    //(ie grab the first doc in natural order and tell me if it's been soft-deleted yet, if so do nothing)
                    return 0;
                }//TESTED
                // That check isn't perfect because of race conditions, so we'll still add the !="?DEL?" check to the
                // update as well:
                query.put(DocumentPojo._id_, new BasicDBObject(DbManager.lte_, lessThanId));
                query.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, DELETION_INDICATOR));
            }//TESTED
            // Soft delete everything matching in Mongo (multi-update):
            BasicDBObject softDeleter = getSoftDeleteUpdate();
            DbManager.getDocument().getMetadata().update(query, softDeleter, false, true);
            // (don't do getLastError just yet since it can block waiting for completion)
            // Quick delete for index though:
            StringBuffer sb = new StringBuffer(DocumentPojoIndexMap.manyGeoDocumentIndexCollection_).append(",docs_").append(communityId).append('/').append(DocumentPojoIndexMap.documentType_);
            ElasticSearchManager indexManager = IndexManager.getIndex(sb.toString());
            BaseQueryBuilder soloOrCombinedQuery = QueryBuilders.termQuery(DocumentPojo.sourceKey_, sourceKey);
            if (null != lessThanId) {
                //(_id isn't indexed - _uid is and == _type + "#" + _id)
                soloOrCombinedQuery = QueryBuilders.boolQuery().must(soloOrCombinedQuery).
                        must(QueryBuilders.rangeQuery("_uid").lte("document_index#" + lessThanId.toString()));
            }//TESTED
            indexManager.doDeleteByQuery(soloOrCombinedQuery);
            // Now (after the index delete has had time to run) fetch the Mongo update count:
            CommandResult result = DbManager.getDocument().getLastError("metadata");
            return result.getLong("n", 0);
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
        return 0;
    }//TESTED
    /**
     * Remove a list of doc documents from the data store and index (you have a source URL, so you can go much quicker)
     *
     * CALLED FROM: removeFromDataStore_byURL(List&lt;doc&gt;, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
     *              MongoDocumentTxfer.doDelete(...) &lt;- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
     *              processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
     *                  DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
     * @returns the number of docs deleted
     */
    // Cache of the last-used index manager/community, since in practice this is called
    // repeatedly for the same community during a single source's processing:
    private ElasticSearchManager _cachedIndexManagerForSourceXxxDeletion = null;
    private ObjectId _cachedCommunityIdForSourceXxxDeletion = null;
    public long removeFromDatastoreAndIndex_bySourceUrl(String sourceUrl, String sourceKey, ObjectId communityId) {
        try {
            // (never any content)
            // Soft delete all docs matching sourceUrl+sourceKey in Mongo (multi-update):
            BasicDBObject query = new BasicDBObject(DocumentPojo.sourceUrl_, sourceUrl);
            query.put(DocumentPojo.sourceKey_, sourceKey);
            BasicDBObject softDeleter = getSoftDeleteUpdate();
            DbManager.getDocument().getMetadata().update(query, softDeleter, false, true);
            CommandResult result = DbManager.getDocument().getLastError("metadata");
            // Quick delete for index though:
            if (!communityId.equals(_cachedCommunityIdForSourceXxxDeletion)) {
                StringBuffer sb = new StringBuffer(DocumentPojoIndexMap.manyGeoDocumentIndexCollection_).append(",docs_").append(communityId).append('/').append(DocumentPojoIndexMap.documentType_);
                _cachedIndexManagerForSourceXxxDeletion = IndexManager.getIndex(sb.toString());
                _cachedCommunityIdForSourceXxxDeletion = communityId;
            }//TESTED
            _cachedIndexManagerForSourceXxxDeletion.doDeleteByQuery(
                    QueryBuilders.boolQuery()
                            .must(QueryBuilders.termQuery(DocumentPojo.sourceUrl_, sourceUrl))
                            .must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sourceKey))
            );
            return result.getLong("n", 0);
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
        return 0;
    }//TESTED
  369. /////////////////////////////////////////////////////////////////////////////////////////////////
  370. // Utility
  371. /**
  372. * Remove a list of doc documents from the data store + adds _id and index doc fields to retrieve to support de-index
  373. * (also adds created to docs with an updateId so the created remains ~the same)
  374. * (Will in theory support arbitrary sourceUrl/sourceKey operators but in practice these will always be from a single source)
  375. * @param docs - needs url (optionally sourceUrl) set - child function requires sourceKey
  376. * @param col
  377. * @param feeds
  378. *
  379. * CALLED FROM: removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
  380. * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
  381. * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
  382. * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
  383. * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX, _ID
  384. * updateHarvestStatus(...)
  385. */
  386. private ObjectId removeFromDatastore_byURL(DBCollection col, List<DocumentPojo> docs) {
  387. ObjectId nextId = null;
  388. BasicDBObject fields = new BasicDBObject();
  389. fields.put(DocumentPojo.created_, 1); // (If we're getting the deleted doc fields, get this and have exact created time)
  390. fields.put(DocumentPojo.index_, 1); // This is needed for the removeFromSearch() called from parent removeFromDatastore_URL
  391. TreeMap<String,DocumentPojo> sourceUrlToKeyMap = null;
  392. HashSet<String> deletedSources = null;
  393. // Store the knowledge in the feeds collection in the harvester db
  394. Iterator<DocumentPojo> docIt = docs.iterator();
  395. while (docIt.hasNext()) {
  396. DocumentPojo f = docIt.next();
  397. nextId = f.getId(); // (only interested in the pruneSource case, in which case _id is set on input)
  398. if ((null != f.getSourceUrl()) && (null == f.getUrl())) { // special case ... delete all these documents...
  399. if ((null == deletedSources) || !deletedSources.contains(f.getSourceKey())) { // (don't bother deleting sourceURL if deleting source)
  400. if (null == sourceUrlToKeyMap) {
  401. sourceUrlToKeyMap = new TreeMap<String,DocumentPojo>();
  402. }
  403. sourceUrlToKeyMap.put(f.getSourceUrl(), f);
  404. }//TESTED
  405. docIt.remove(); // (so don't miscount number of docs; processed below)
  406. }
  407. else if (null != f.getSourceKey() && (null == f.getSourceUrl()) && (null == f.getUrl())) {
  408. // Even more special case: delete entire sourceKey
  409. if (null == deletedSources) {
  410. deletedSources = new HashSet<String>();
  411. }
  412. if (!deletedSources.contains(f.getSourceKey())) {
  413. deletedSources.add(f.getSourceKey());
  414. long srcRemoved = removeFromDatastoreAndIndex_bySourceKey(f.getSourceKey(), f.getId(), true, f.getCommunityId().toString());
  415. if (srcRemoved > 0) {
  416. updateDocCountsOnTheFly(-srcRemoved, f.getSourceKey(), f.getCommunityId());
  417. }
  418. }
  419. docIt.remove(); // (so don't miscount number of docs)
  420. }//TESTED
  421. else {
  422. removeFromDatastore_byURL(col, f, fields,
  423. StoreAndIndexManager.docHasExternalContent(f.getUrl(), f.getSourceUrl()));
  424. // (adds "_id", "index")
  425. }
  426. }//TESTED
  427. // Now tidy up sourceUrls, do some caching across sourceKey/community for performance
  428. String sourceKey = null; // (if deleting sourceKey don't bother deleting any sourceUrls)
  429. long removed = 0; // (from special operations)
  430. String cachedSourceKey = null; // (will handle multiple source keys, although that can't currently happen in practice)
  431. ObjectId communityId = null;
  432. if (null != sourceUrlToKeyMap) for (Map.Entry<String, DocumentPojo> entry: sourceUrlToKeyMap.entrySet()) {
  433. String srcUrl = entry.getKey();
  434. DocumentPojo doc = entry.getValue();
  435. sourceKey = doc.getSourceKey();
  436. communityId = doc.getCommunityId();
  437. if (sourceKey != cachedSourceKey) { // ptr comparison by design
  438. if (removed > 0) {
  439. updateDocCountsOnTheFly(-removed, sourceKey, communityId);
  440. removed = 0;
  441. }//TESTED
  442. cachedSourceKey = sourceKey;
  443. }
  444. removed += removeFromDatastoreAndIndex_bySourceUrl(srcUrl, sourceKey, communityId);
  445. }//TESTED
  446. if ((removed > 0) && (null != sourceKey)) {
  447. updateDocCountsOnTheFly(-removed, sourceKey, communityId);
  448. }//TESTED
  449. return nextId;
  450. }//TESTED
  451. public void updateDocCountsOnTheFly(long docIncrement, String sourceKey, ObjectId communityId)
  452. {
  453. DbManager.getDocument().getCounts().update(new BasicDBObject(DocCountPojo._id_, communityId),
  454. new BasicDBObject(DbManager.inc_, new BasicDBObject(DocCountPojo.doccount_, docIncrement)));
  455. DbManager.getIngest().getSource().update(new BasicDBObject(SourcePojo.key_, sourceKey),
  456. new BasicDBObject(MongoDbManager.inc_,
  457. new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, docIncrement))
  458. );
  459. }//TESTED
  460. /**
  461. * Remove a doc from the data store
  462. * @param col
  463. * @param doc
  464. * @param fields - fields to retrieve
  465. *
  466. * CALLED FROM: removeFromDataStore_byId(List<doc>, bDeleteContent)
  467. * resizeDB() <- FILLS IN _ID, SOURCEKEY, INDEX, SOURCEURL
  468. */
  469. private void removeFromDatastore_byId(DBCollection col, List<DocumentPojo> docs) {
  470. // Store the knowledge in the feeds collection in the harvester db
  471. for ( DocumentPojo f : docs) {
  472. removeFromDatastore_byId(col, f);
  473. }
  474. }//TESTED
  475. /**
  476. * Remove a doc from the data store
  477. * @param col
  478. * @param doc - assumes _id set
  479. * @param fields - fields to retrieve (set in outside the doc loop for performance, url, index, sourceKey)
  480. *
  481. * CALLED FROM: removeFromDataStore_byId(col, List<doc>, bDeleteContent)
  482. * removeFromDataStore_byId(List<doc>, bDeleteContent)
  483. * resizeDB() <- _ID, SOURCEKEY, INDEX, SOURCEURL
  484. */
  485. private void removeFromDatastore_byId(DBCollection col, DocumentPojo doc) {
  486. boolean bDeleteContent = docHasExternalContent(doc.getUrl(), doc.getSourceUrl());
  487. if (bDeleteContent) {
  488. // Remove its content also:
  489. if (!_diagnosticMode) {
  490. BasicDBObject contentQuery = new BasicDBObject(DocumentPojo.url_, doc.getUrl());
  491. contentQuery.put(DocumentPojo.sourceKey_, doc.getSourceKey());
  492. DbManager.getDocument().getContent().remove(contentQuery);
  493. }
  494. else {
  495. System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete content: " + doc.getSourceKey() + "/" + doc.getUrl());
  496. }
  497. }
  498. // Update Mongodb with the data
  499. BasicDBObject query = new BasicDBObject();
  500. query.put(DocumentPojo.sourceKey_, doc.getSourceKey());
  501. query.put(DocumentPojo._id_, doc.getId());
  502. query.put(DocumentPojo.sourceKey_, doc.getSourceKey()); // (needed because on newer machines this is the shard key)
  503. if (!_diagnosticMode) {
  504. BasicDBObject softDelete = getSoftDeleteUpdate();
  505. col.update(query, softDelete);
  506. // (can do this on sharded collections because it uses sourceKey+_id, the shard key)
  507. }
  508. else { // (diagnostic mode)
  509. if (null != col.findOne(query)) {
  510. System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete: " + doc.toDb().toString());
  511. }
  512. else {
  513. System.out.println("StoreAndIndexManager.removeFromDatastore_byId, delete: DOC NOT FOUND");
  514. }
  515. }
  516. }//TESTED (1.1)
    /**
     * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion)
     * @param col the doc metadata collection
     * @param doc - needs url, sourceKey set
     * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance
     * @param bDeleteContent whether to also remove the doc's externally-stored content
     *
     * CALLED FROM: removeFromDatastore_byURL(col, List&lt;doc&gt;, bDeleteContent) &lt;- ADDS INDEX, CREATED TO FIELDS
     *              removeFromDataStore_byURL(List&lt;doc&gt;, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
     *              MongoDocumentTxfer.doDelete(...) &lt;- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
     *              processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
     *                  DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
     *              pruneSource(source, ...) &lt;- SETS URL, SOURCE URL, SOURCE KEY, INDEX
     *              updateHarvestStatus(...)
     */
    private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields, boolean bDeleteContent) {
        // 1] Create the query to soft delete the document
        BasicDBObject query = new BasicDBObject();
        query.put(DocumentPojo.url_, doc.getUrl());
        query.put(DocumentPojo.sourceKey_, doc.getSourceKey());
        // 2] Delete the content if needed
        if (bDeleteContent) {
            if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
                if (!_diagnosticMode) {
                    DbManager.getDocument().getContent().remove(query);
                }
                else {
                    System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: " + doc.getSourceKey() + "/" + doc.getUrl());
                }
            }
        }
        //TESTED
        // 3] Work out which fields we have and which (if any we need to go and fetch):
        boolean needToFindAndModify = false;
        if (null == doc.getId()) { // This is called from processDocuments
            if (null != doc.getUpdateId()) { // update case...
                doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good)
                // (doc.index is populated but may not be correct because of the "many geos" workaround):
                if (DocumentPojoIndexMap.hasManyGeos(doc)) {
                    doc.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
                    // (note this check isn't stateless, it actually populates "locs" at the same time
                    // this is handled in addToDatastore (update case), temp removed when adding to DB
                }//TESTED (2.1.2, diagnostic mode, doc2)
            }
            else { // Not an update case, we're going to have to grab the document after all, which is a bit slower
                needToFindAndModify = true;
            }
        }//TESTED (2.1.2, diagnostic mode, doc2)
        if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore
            if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc)
                // (an ObjectId embeds its creation timestamp, hence getTime())
                doc.setCreated(new Date(doc.getUpdateId().getTime()));
            }//TESTED (2.1.2, diagnostic mode, doc2)
        }
        // (if we're here and index is not set, then it is intended to be null)
        // 4] Update the doc_metadata collection
        BasicDBObject softDelete = getSoftDeleteUpdate();
        BasicDBObject deadDoc = null; // (not normally needed)
        if (needToFindAndModify) { // less pleasant, need to go grab the doc
            deadDoc = (BasicDBObject) col.findOne(query, fields);
        }//TESTED (2.1.2)
        if (!_diagnosticMode) {
            col.update(query, softDelete, false, true); // (needs to be multi- even though there's a single element for sharding reasons)
        }//TESTED (2.1.2)
        // 5] Add fields if necessary
        if (null != deadDoc) {
            doc.setCreated((Date) deadDoc.get(DocumentPojo.created_));
            // (if getting this doc anyway then might as well get the created)
            doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_));
            doc.setIndex((String) deadDoc.get(DocumentPojo.index_));
            if (_diagnosticMode) {
                System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString());
            }
        }//TESTED (2.1.2)
        else if (_diagnosticMode) {
            if (!needToFindAndModify) {
                System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted " + doc.toDb().toString());
            }
            else {
                System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString());
            }
        }//TESTED (2.1.2)
    }//TESTED (2.1.2)
  598. /////////////////////////////////////////////////////////////////////////////////////////////////
  599. /////////////////////////////////////////////////////////////////////////////////////////////////
  600. // Synchronize database with index
  601. /**
  602. * Add a list of feeds to the full text index
  603. * @param docs
  604. */
  605. public void addToSearch(List<DocumentPojo> docs)
  606. {
  607. String sSavedIndex = null;
  608. ElasticSearchManager indexManager = null;
  609. LinkedList<DocumentPojo> tmpDocs = new LinkedList<DocumentPojo>();
  610. int nTmpDocs = 0;
  611. for ( DocumentPojo doc : docs )
  612. {
  613. String sThisDocIndex = doc.getIndex();
  614. if ((null == sSavedIndex) || (null == sThisDocIndex) || !sSavedIndex.equals(sThisDocIndex)) { // Change index
  615. if (null != indexManager) { // ie not first time through, bulk add what docs we have
  616. sendToIndex(indexManager, tmpDocs);
  617. // (ie with the *old* index manager)
  618. nTmpDocs = 0;
  619. }
  620. sSavedIndex = sThisDocIndex;
  621. if ((null == sSavedIndex) || (sSavedIndex.equals(DocumentPojoIndexMap.globalDocumentIndex_))) {
  622. indexManager = IndexManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  623. }
  624. else {
  625. indexManager = IndexManager.getIndex(new StringBuffer(sSavedIndex).append('/').
  626. append(DocumentPojoIndexMap.documentType_).toString());
  627. }
  628. }//TESTED
  629. tmpDocs.add(doc);
  630. nTmpDocs++;
  631. if (nTmpDocs > 5000) { // some sensible upper limit
  632. sendToIndex(indexManager, tmpDocs);
  633. nTmpDocs = 0;
  634. }
  635. if (_diagnosticMode) {
  636. System.out.println("StoreAndIndexManager.addToSearch, add: " + doc.getId() + " + " +
  637. ((null != doc.getEntities())?("" + doc.getEntities().size()):"0") + " entities, " +
  638. ((null != doc.getAssociations())?("" + doc.getAssociations().size()):"0") + " assocs, " +
  639. ((null != doc.getLocs())?("" + doc.getLocs().size()):"0") + " locs"
  640. );
  641. }
  642. }// (end loop over docs)
  643. // Bulk add remaining docs
  644. sendToIndex(indexManager, tmpDocs);
  645. }//TESTED (not change since by-eye testing in Beta)
  646. // Utility required by the above function
  647. private void sendToIndex(ElasticSearchManager indexManager, LinkedList<DocumentPojo> docsToAdd) {
  648. try {
  649. if (!docsToAdd.isEmpty()) {
  650. if (!_diagnosticMode) {
  651. indexManager.bulkAddDocuments(IndexManager.mapListToIndex(docsToAdd, new TypeToken<LinkedList<DocumentPojo>>(){},
  652. new DocumentPojoIndexMap()), DocumentPojo._id_, null, true);
  653. }
  654. else {
  655. System.out.println("StoreAndIndexManager.addToSearch: index " + docsToAdd.size() + " documents to " + indexManager.getIndexName());
  656. }
  657. docsToAdd.clear();
  658. }
  659. }
  660. catch (Exception ex)
  661. {
  662. ex.printStackTrace();
  663. logger.error("Exception Message saving document to ES: " + ex.getMessage(), ex);
  664. }
  665. }//TESTED
  666. /**
  667. *
  668. * @param docs (just need the id and the index and any events)
  669. */
  670. public void removeFromSearch(List<DocumentPojo> docs)
  671. {
  672. String sIndex = null;
  673. ElasticSearchManager indexManager = null;
  674. LinkedList<String> tmpDocs = new LinkedList<String>();
  675. int nTmpDocs = 0;
  676. for ( DocumentPojo doc : docs )
  677. {
  678. if (null == doc.getId()) { // Normally this will be sourceUrls, eg files pointing to many docs
  679. continue; // (ie can just ignore)
  680. }
  681. if ((null != doc.getIndex()) && doc.getIndex().equals("?DEL?")) {
  682. continue; //(must have already been deleted, so can ignore)
  683. }
  684. if ((null == sIndex) || (null == doc.getIndex()) || !sIndex.equals(doc.getIndex())) { // Change index
  685. if (null != indexManager) { // ie not first time through, bulk delete what docs we have
  686. deleteFromIndex(indexManager, tmpDocs); // (clears tmpDocs)
  687. // (ie with the *old* index manager)
  688. nTmpDocs = 0;
  689. }
  690. sIndex = doc.getIndex();
  691. if ((null == sIndex) || (sIndex.equals(DocumentPojoIndexMap.globalDocumentIndex_))) {
  692. indexManager = IndexManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  693. }
  694. else {
  695. indexManager = IndexManager.getIndex(new StringBuffer(sIndex).append('/').append(DocumentPojoIndexMap.documentType_).toString());
  696. }
  697. }//TESTED
  698. tmpDocs.add(doc.getId().toString());
  699. nTmpDocs++;
  700. if (nTmpDocs > 5000) { // some sensible upper limit
  701. deleteFromIndex(indexManager, tmpDocs); // (clears tmpDocs)
  702. nTmpDocs = 0;
  703. }
  704. //delete from event search
  705. if (_diagnosticMode) {
  706. System.out.println("StoreAndIndexManager.removeFromSearch, remove: " + doc.getId() + " + " +
  707. ((null != doc.getEntities())?("" + doc.getEntities().size()):"0") + " entities, " +
  708. ((null != doc.getAssociations())?("" + doc.getAssociations().size()):"0") + " assocs, " +
  709. ((null != doc.getLocs())?("" + doc.getLocs().size()):"0") + " locs"
  710. );
  711. }
  712. } // (end loop over docs)
  713. // Bulk remove remaining docs
  714. deleteFromIndex(indexManager, tmpDocs);
  715. }//TESTED (not change since by-eye testing in Beta)
  716. /////////////////////////////////////////////////////////////////////////////////////////////////
  717. // Utility required by the above function
  718. private void deleteFromIndex(ElasticSearchManager indexManager, LinkedList<String> docsToDelete) {
  719. try {
  720. if (!docsToDelete.isEmpty()) {
  721. if (!_diagnosticMode) {
  722. indexManager.bulkDeleteDocuments(docsToDelete);
  723. }
  724. else {
  725. System.out.println("StoreAndIndexManager.removeFromSearch: index " + docsToDelete.size() + " documents from " + indexManager.getIndexName());
  726. }
  727. docsToDelete.clear();
  728. }
  729. }
  730. catch (Exception ex)
  731. {
  732. ex.printStackTrace();
  733. logger.error("Exception Message deleting document from ES: " + ex.getMessage(), ex);
  734. }
  735. }//TESTED
  736. /////////////////////////////////////////////////////////////////////////////////////////////////
  737. /////////////////////////////////////////////////////////////////////////////////////////////////
  738. // Handle resizing the DB if it gets too large
  739. // Utility function for diagnostic prints etc
  740. public long getDatabaseSize() {
  741. return DbManager.getDocument().getMetadata().count();
  742. }
/**
 * This function checks if DB storage requirements are met,
 * if not it will start removing docs based on least used/oldest.
 *
 * NOTE(review): the implementation actually returns false when the DB is
 * already within bounds (no work needed) and true when a resize was
 * attempted - including when the attempt failed with an exception. The
 * original comment ("true once DB is within bounds, false if an error
 * occurs") did not match the code.
 *
 * @return false if already within capacity, true if a resize was attempted
 */
public boolean resizeDB()
{
	return resizeDB(-1); // (-1 == use the storage capacity configured in properties)
}
/**
 * Resizes the DB if over capacity by deleting the (approximately) oldest documents.
 * @param capacityOverride max number of docs to keep, or -1 to use the configured capacity
 * @return false if the DB was already within capacity; true if a resize was
 *         attempted (NOTE(review): also returns true if the attempt threw - callers
 *         appear to treat the return as "did work happen", confirm before changing)
 */
public boolean resizeDB(long capacityOverride)
{
	//Do quick check to check if we are already under storage requirements
	if ( checkStorageCapacity(capacityOverride) ) {
		return false;
	}
	else
	{
		//if quick check fails, start removing docs to get under requirement
		try
		{
			long currDocsInDB = DbManager.getDocument().getMetadata().count();
			long storageCap = (capacityOverride == -1L) ? new PropertiesManager().getStorageCapacity() : capacityOverride;
			// (excess doc count cast to int - assumes the overshoot fits in an int)
			List<DocumentPojo> docsToRemove = getLeastActiveDocs((int) (currDocsInDB-storageCap));
				// (populates docsToRemove with _id and sourceKey - needed to support doc_metadata sharding)

			removeFromDatastore_byId(docsToRemove); // (remove content since don't know if it exists)
			//(^ this also removes from index)

			return true;
		}
		catch (Exception e)
		{
			// If an exception occurs log the error
			logger.error("Exception Message: " + e.getMessage(), e);
			return true; // (true even on failure - see javadoc note above)
		}
	}
}//TESTED
  780. /////////////////////////////////////////////////////////////////////////////////////////////////
  781. // Utility
  782. /**
  783. * This method checks if doc count is
  784. * below threshhold set in properties
  785. * @return true is below threshhold, false if not
  786. */
  787. private boolean checkStorageCapacity(long capacityOverride)
  788. {
  789. long currDocsInDB = 0;
  790. try {
  791. currDocsInDB = DbManager.getDocument().getMetadata().count();
  792. } catch (Exception e ) {
  793. // If an exception occurs log the error
  794. logger.error("Exception Message: " + e.getMessage(), e);
  795. }
  796. long storageCapacity = (-1L == capacityOverride) ? new PropertiesManager().getStorageCapacity() : capacityOverride;
  797. return (currDocsInDB <= storageCapacity);
  798. }
  799. /**
  800. * Returns a list of the least active documents
  801. * List is of length numDocs
  802. *
  803. * @param numDocs Number of documents to return that are least active
  804. * @return a list of documents that are least active in DB (populates docsToRemove with _id and sourceKey - needed to support doc_metadata sharding)
  805. )
  806. */
  807. private List<DocumentPojo> getLeastActiveDocs(int numDocs)
  808. {
  809. List<DocumentPojo> olddocs = null;
  810. //TODO (INF-1301): WRITE AN ALGORITHM TO CALCULATE THIS BASED ON USAGE, just using time last accessed currently
  811. //give a weight to documents age and documents activity to calculate
  812. //least active (current incarnation doesn't work)
  813. try
  814. {
  815. BasicDBObject fields = new BasicDBObject(DocumentPojo._id_, 1);
  816. fields.put(DocumentPojo.sourceKey_, 1);
  817. fields.put(DocumentPojo.index_, 1);
  818. fields.put(DocumentPojo.sourceUrl_, 1);
  819. fields.put(DocumentPojo.url_, 1);
  820. DBCursor dbc = DbManager.getDocument().getMetadata().find(new BasicDBObject(), fields).
  821. sort(new BasicDBObject(DocumentPojo._id_,1)).limit(numDocs);
  822. // (note, just retrieve _id and sourceKey fields: _id starts with timestamp so these are approximately oldest created)
  823. olddocs = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
  824. }
  825. catch (Exception e )
  826. {
  827. // If an exception occurs log the error
  828. logger.error("Exception Message: " + e.getMessage(), e);
  829. }
  830. return olddocs;
  831. }//TESTED (1.1)
  832. ////////////////////////////////////////////////////////////////////////////////
  833. ////////////////////////////////////////////////////////////////////////////////
  834. // Utility
  835. // Utility function to decide if we need to add/remove content via the external table
  836. // (ie JDBC and XML have their content as part of their metadata, eg fields
  837. // others like HTTP and Files can have large amounts of content that we don't want to store in the DB object)
  838. // Called from: (noted here because this needs to be tidied up at some point)
  839. // StoreAndIndexManager.addToDatastore
  840. // MongoDocumentTxfer.doTransfer
  841. // SourceUtils.pruneSource
  842. // StoreAndIndexManager.removeFromDataStore_by(Id|SourceKey|Url)
  843. // StoreAndIndexManager.saveContent
  844. static public boolean docHasExternalContent(String url, String srcUrl) {
  845. //TODO: INF-1367: there's an issue with this .. suppose it's some enormous JSON file
  846. // and we excise a bunch of JSON files from the metadata (after using them for processing)
  847. // seems like we should have an optional keepExternalContent that defaults to the return value
  848. // of this function, but you can override from the SAH or whatever
  849. if (null != srcUrl) { // must be either JSON or XML or *sv
  850. return false;
  851. }
  852. else if (null == url) { // no idea, pathological case?!
  853. return true;
  854. }
  855. else if (url.startsWith("jdbc:")) { // DB entry
  856. return false;
  857. }
  858. else if (url.startsWith("inf://custom/")) { // custom entry
  859. return false;
  860. }
  861. else if ((url.startsWith("smb://") || url.startsWith("file:") || url.startsWith("s3://") || url.startsWith("inf://")) &&
  862. (url.endsWith(".xml") || url.endsWith(".json") || url.endsWith("sv")))
  863. // JSON/XML/*sv but 1 doc/file
  864. {
  865. return false;
  866. }
  867. return true;
  868. }
  869. }