/core/infinit.e.processing.generic.library/src/com/ikanow/infinit/e/processing/generic/GenericProcessingController.java
Java | 813 lines | 510 code | 107 blank | 196 comment | 118 complexity | ee5054c21a6815af08f46f41752bc0e4 MD5 | raw file
Possible License(s): BSD-3-Clause
- /*******************************************************************************
- * Copyright 2012, The Infinit.e Open Source Project.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- ******************************************************************************/
- package com.ikanow.infinit.e.processing.generic;
- import java.util.HashMap;
- import java.util.List;
- //import org.apache.log4j.Logger;
- import org.bson.types.ObjectId;
- import org.elasticsearch.common.settings.ImmutableSettings;
- import org.elasticsearch.common.settings.ImmutableSettings.Builder;
- import com.google.gson.Gson;
- import com.ikanow.infinit.e.data_model.InfiniteEnums;
- import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
- import com.ikanow.infinit.e.data_model.index.IndexManager;
- import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
- import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
- import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap;
- import com.ikanow.infinit.e.data_model.store.DbManager;
- import com.ikanow.infinit.e.data_model.store.MongoDbManager;
- import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
- import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
- import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
- import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
- import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
- import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
- import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
- import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
- import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
- import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo;
- import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
- import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
- import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
- import com.mongodb.BasicDBObject;
- import com.mongodb.DBCollection;
- import com.mongodb.DBCursor;
- import com.mongodb.DBObject;
- import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
- import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
- import org.elasticsearch.cluster.metadata.AliasMetaData;
- import org.elasticsearch.common.collect.CrossVersionImmutableMapOfImmutableMaps;
- //DEBUG (alias corruption)
- //import org.elasticsearch.action.admin.indices.status.IndexStatus;
- //import org.elasticsearch.action.admin.indices.status.IndicesStatusRequest;
- //import org.elasticsearch.action.admin.indices.status.IndicesStatusResponse;
- public class GenericProcessingController {
- //NOTE THIS FUNCTION SHOULD CONTAIN NO STATE SINCE IT CAN BE RUN ACROSS MULTIPLE THREADS
-
- //(Nothing currently to log)
- //private static final Logger logger = Logger.getLogger(GenericProcessingController.class);
- ///////////////////////////////////////////////////////////////////////////////////////
- //
- // Set up the databases and indexes
-
- public void Initialize() {
- InitializeDatabase();
- InitializeIndex(false, false, false);
- // (Don't delete anything, obviously)
- }
-
- public void InitializeDatabase() {
- // Add indices:
- try
- {
- PropertiesManager pm = new PropertiesManager();
-
- ////////////////////////
- //
- // Remove old indexes, mostly just old code that is no longer needed
- //
- dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.url_, 1);
- dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.sourceKey_, 2);
- dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceUrl_, 1);
- dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceKey_, 1);
- dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.title_, 1);
- // (Title simply not needed, that was a mistake from an early iteration)
- dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.updateId_, 1);
- dropIndexIfItExists(DbManager.getSocial().getShare(), "type", 1);
- dropIndexIfItExists(DbManager.getSocial().getCookies(), "apiKey", 1);
- dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 2);
- dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 2);
- // (see shard keys below, these legacy ones can appear if the DB is restored from a different machine's backup)
- dropIndexIfNotNeeded(DbManager.getDocument().getContent(), "sourceKey_1_url_1", 0, "sourceKey_2_url_2", 0);
- dropIndexIfNotNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_1", 0, "sourceKey_1__id_-1", 0);
-
- ////////////////////////
- //
- // Indexes needed for sharding:
- //
- // ** Content (has changed a bit)
- BasicDBObject compIndex = new BasicDBObject(CompressedFullTextPojo.sourceKey_, 1);
- compIndex.put(CompressedFullTextPojo.url_, 1);
- addIndexIfNeeded(DbManager.getDocument().getContent(), "sourceKey_2_url_2", 0, compIndex); // (remove legacy 2_2 and replace with 1_1, which supports shards)
- // ** Metadata
- // Add {_id:1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!)
- compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1);
- compIndex.put(DocumentPojo._id_, 1);
- addIndexIfNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_-1", 0, compIndex); // (remove legacy 1_-1 and replace with 1_1, which supports shards)
- // ** Entities and associations
- DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1));
- DbManager.getFeature().getAssociation().ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1));
-
- ////////////////////////
- //
- // Other indexes
- //
- // Needed to handle updates of large files containing many URLs:
- DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
- // Needed for duplicate checking
- // (Compound index lets me access {url, sourceKey}, {url} efficiently ... but need sourceKey separately to do {sourceKey})
- compIndex = new BasicDBObject(DocumentPojo.url_, 1);
- compIndex.put(DocumentPojo.sourceKey_, 1);
- DbManager.getDocument().getMetadata().ensureIndex(compIndex);
- // Needed to handle document updates
- DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.updateId_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
- // Needed to update documents' entities' doc counts
- if (!pm.getAggregationDisabled()) {
- compIndex = new BasicDBObject(EntityPojo.docQuery_index_, 1);
- compIndex.put(DocumentPojo.communityId_, 1);
- DbManager.getDocument().getMetadata().ensureIndex(compIndex);
- }
- // Needed for keeping source/community doc counts
- compIndex = new BasicDBObject(DocCountPojo._id_, 1);
- compIndex.put(DocCountPojo.doccount_, 1);
- DbManager.getDocument().getCounts().ensureIndex(compIndex);
- // Needed for keep tracking of entities
- DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1));
- DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1));
- // Needed for background re-calculation
- DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
- DbManager.getFeature().getAssociation().ensureIndex(new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
- // Needed for geo-location in the entity pipeline
- DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1));
- DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1));
- DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d"));
- // Needed for source management
- DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1));
- DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1));
- DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1));
- DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1));
- // Searching shares
- // Compound index lets me access {type, communities._id}, {type} efficiently
- compIndex = new BasicDBObject("type", 1);
- compIndex.put("communities._id", 1);
- DbManager.getSocial().getShare().ensureIndex(compIndex);
- // User logins
- DbManager.getSocial().getCookies().ensureIndex(new BasicDBObject("apiKey", 2), new BasicDBObject(MongoDbManager.sparse_, true));
- // Custom job scheduling
- DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1));
- //TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2)
- DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1), new BasicDBObject(MongoDbManager.sparse_, false));
- // DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
- // dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 1);
- DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1), new BasicDBObject(MongoDbManager.sparse_, false));
- // DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
- // dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 1);
- }
- catch (Exception e) {
- e.printStackTrace();
- throw new RuntimeException(e.getMessage());
- }
- }//TESTED (not changed since by-eye test in Beta)
-
- // Some *DB* index utilities (note: not the Lucene index)
-
- private static void addIndexIfNeeded(DBCollection coll, String indexToCheck, int nIndexIndex, BasicDBObject newIndex)
- {
- StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck);
- if (0 != nIndexIndex) {
- indexNameStrBuff.append("_").append(nIndexIndex);
- }
- String indexName2 = indexNameStrBuff.toString();
- List<DBObject> list = coll.getIndexInfo();
- for (DBObject dbo: list) {
- String name = (String) dbo.get("name");
- if (indexName2.equalsIgnoreCase(name)) {
- return; // no need to create a new index
- }
- }
- // If we're here then we didn't find the index so create a new index
- try {
- coll.ensureIndex(newIndex);
- }
- catch (Exception e) {}
- }//TESTED
-
- private static void dropIndexIfNotNeeded(DBCollection coll, String indexToCheck, int nIndexToCheckIndex, String indexToDelete, int nIndexToDeleteIndex)
- {
- StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck);
- if (0 != nIndexToCheckIndex) {
- indexNameStrBuff.append("_").append(nIndexToCheckIndex);
- }
- String indexToCheck2 = indexNameStrBuff.toString();
- indexNameStrBuff.setLength(0);
- indexNameStrBuff.append(indexToDelete);
- if (0 != nIndexToDeleteIndex) {
- indexNameStrBuff.append("_").append(nIndexToDeleteIndex);
- }
-
- boolean foundIndexToDelete = false;
- boolean foundIndexToCheck = false;
- String indexToDelete2 = indexNameStrBuff.toString();
- List<DBObject> list = coll.getIndexInfo();
- for (DBObject dbo: list) {
- String name = (String) dbo.get("name");
- if (indexToCheck2.equalsIgnoreCase(name)) {
- foundIndexToCheck = true;
- }
- else if (indexToDelete2.equalsIgnoreCase(name)) {
- foundIndexToDelete = true;
- }
- }
- if (foundIndexToCheck && foundIndexToDelete) {
- try {
- coll.dropIndex(indexToDelete2);
- }
- catch (Exception e) {}
- }
- }//TESTED
- private void dropIndexIfItExists(DBCollection coll, String indexName, int nIndexIndex)
- {
- StringBuffer indexNameStrBuff = new StringBuffer(indexName);
- if (0 != nIndexIndex) {
- indexNameStrBuff.append("_").append(nIndexIndex);
- }
- String indexName2 = indexNameStrBuff.toString();
- List<DBObject> list = coll.getIndexInfo();
- for (DBObject dbo: list) {
- String name = (String) dbo.get("name");
- if (indexName2.equalsIgnoreCase(name)) {
- try {
- coll.dropIndex(name);
- }
- catch (Exception e) {}
- }
- }
- }//TESTED
-
- /////////////////////////////////////////////////////////
- // Lucene index initialization
-
- // (Note some of the code below is duplicated in MongoDocumentTxfer, so make sure you sync changes)
- public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature) {
- InitializeIndex(bDeleteDocs, bDeleteEntityFeature, bDeleteEventFeature, false);
- }
- public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature, boolean bRebuildDocsIndex) {
-
- try { //create elasticsearch indexes
-
- PropertiesManager pm = new PropertiesManager();
-
- if (!pm.getAggregationDisabled()) {
-
- boolean languageNormalization = pm.getNormalizeEncoding();
-
- Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
- localSettingsEvent.put("number_of_shards", 1).put("number_of_replicas", 0);
- localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
- if (languageNormalization) {
- localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer","icu_folding","standard","lowercase");
- }
- else {
- localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard","lowercase");
- }
-
- Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
- localSettingsGaz.put("number_of_shards", 1).put("number_of_replicas", 0);
- localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
- if (languageNormalization) {
- localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer","icu_folding","standard","lowercase");
- }
- else {
- localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard","lowercase");
- }
-
- //event feature
- String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class);
- ElasticSearchManager eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent);
- if (null == eventIndex) { // (if has been previously referenced in this process space)
- eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
- }
- eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_);
- if (bDeleteEventFeature) {
- eventIndex.deleteMe();
- eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent);
- }
- //entity feature
- String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class);
- ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz);
- if (null == entityIndex) { // (if has been previously referenced in this process space)
- entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
- }
- entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_);
- if (bDeleteEntityFeature) {
- entityIndex.deleteMe();
- entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz);
- }
- }
-
- //DOCS - much more complicated than anything else
- boolean bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
- // (ie if main doc index doesn't exist then always rebuild all indexes)
-
- if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing...
- try { Thread.sleep(60000); } catch (Exception e) {}
- bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
- }
- bRebuildDocsIndex |= bPingMainIndexFailed;
-
- // check the main index has the "collection" alias - if not then rebuild everything
- if (!bPingMainIndexFailed && (null == _aliasInfo)) {
- ElasticSearchManager docIndex = ElasticSearchManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
- ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
- _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
- if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) {
- bRebuildDocsIndex = true;
- }
- } //TESTED
-
- createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
- createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);
-
- // Some hardwired dummy communities
- createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
- createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
- // (create dummy index used to keep personal group aliases)
-
- if (bRebuildDocsIndex || bDeleteDocs) {
- // OK, going to have different shards for different communities:
- // Get a list of all the communities:
-
- BasicDBObject query = new BasicDBObject();
- BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
- fieldsToDrop.put("communityAttributes", 0);
- fieldsToDrop.put("userAttributes", 0);
- DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);
-
- List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
- int i = 0;
- System.out.println("Initializing " + dbc.size() + " indexes:");
- for (int j = 0; j < 2; ++j) {
- for (DBObject dbotmp: tmparray) {
- if ((++i % 100) == 0) {
- System.out.println("Initialized " + i + " indexes.");
- }
- BasicDBObject dbo = (BasicDBObject) dbotmp;
-
- // OK, going to see if there are any sources with this group id, create a new index if so:
- // (Don't use CommunityPojo data model here for performance reasons....
- // (Also, haven't gotten round to porting CommunityPojo field access to using static fields))
- ObjectId communityId = (ObjectId) dbo.get("_id");
- boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
- boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
- ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");
-
- createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup, bSystemGroup, bDeleteDocs, j==0);
-
- }//end loop over communities
- }// end loop over communities - first time parents only
- } // (end if need to do big loop over all sources)
- }
- catch (Exception e)
- {
- e.printStackTrace();
- throw new RuntimeException(e.getMessage());
- }
- }//TESTED (not changed since by-eye test in Beta - retested after moving code into createCommunityDocIndex below)
-
- ///////////////////////////////////////////////////////////////////////////////////////
-
- // Utility code for creating community indexes
-
- public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex)
- {
- createCommunityDocIndex(nameOrCommunityIdStr, parentCommunityId, bPersonalGroup, bSystemGroup, bClearIndex, false);
- }
-
- public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId,
- boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex, boolean bParentsOnly)
- {
- //create elasticsearch indexes
- PropertiesManager pm = new PropertiesManager();
- boolean languageNormalization = pm.getNormalizeEncoding();
- int nPreferredReplicas = pm.getMaxIndexReplicas();
-
- String docMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class);
-
- String sGroupIndex = null; // for indexing, ie always a single index
- String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc
- try {
- sGroupIndex = new StringBuffer("doc_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString();
- sAliasIndex = new StringBuffer("docs_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString();
- }
- catch (Exception e) {
- sGroupIndex = nameOrCommunityIdStr;
- if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) {
- sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
- }
- else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
- sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
- }
- else { // fallback
- sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
- }
- //TESTED
- }
- if (!bPersonalGroup) {
-
- if (null == parentCommunityId) {
-
- int nShards = bSystemGroup? 10 : 5 ; // (system group is largest)
-
- // Remove the alias, in case it exists:
- // Then create an index with this name:
- Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder();
- localSettingsGroupIndex.put("number_of_shards", nShards).put("number_of_replicas", nPreferredReplicas);
- if (languageNormalization) {
- localSettingsGroupIndex.put("index.analysis.analyzer.default.tokenizer","standard");
- localSettingsGroupIndex.putArray("index.analysis.analyzer.default.filter", "icu_normalizer","icu_folding","standard","lowercase","stop");
- }//TESTED
- ElasticSearchManager docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
- if (null == docIndex) { // index has already been referenced, hence createIndex returns null
- docIndex = IndexManager.getIndex(sGroupIndex);
- }
- if (bClearIndex) {
- docIndex.deleteMe();
- docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
- }
- if (null != docIndex) {
- try {
- docIndex.pingIndex(); // (wait until it's created itself)
- }
- catch (Exception e) {} // (just make sure this doesn't die horribly)
- }
- else {
- docIndex = IndexManager.getIndex(sGroupIndex);
- }
- if (null != docIndex) { // should always be true
- docIndex.createAlias(sAliasIndex);
- docIndex.closeIndex();
- }
- }
- else if (!bParentsOnly) { // A sub-index of a parent
-
- parentCommunityId = getRootCommunity(parentCommunityId);
-
- if (null != parentCommunityId) {
- String parentCommunityIdStr = parentCommunityId.toString();
-
- String sParentGroupIndex = new StringBuffer("doc_").append(new ObjectId(parentCommunityIdStr).toString()).toString();
- ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex);
-
- //DEBUG (alias corruption)
- // if (null == _aliasInfo) {
- // ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
- // _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
- // }
- // else {
- // if (_aliasInfo.containsKey(sGroupIndex)) { // has no aliases, we're not good
- // return;
- // }
- // else {
- // //DEBUG
- // System.out.println("Alias " + sGroupIndex + " has no aliases (but should)");
- // ElasticSearchManager docIndex2 = IndexManager.getIndex(sGroupIndex);
- // docIndex2.deleteMe();
- // }
- // }
-
- docIndex.createAlias(sGroupIndex); // for indexing
- // (this is going to be tricky when the functionality is fully implemented
- // because it will need to handle the parent index splitting)
- docIndex.createAlias(sAliasIndex); // for queries
- docIndex.closeIndex();
- // (do nothing on delete - that will be handled at the parent index level)
- }
- }
- //TESTED (parents, children, and personal + docs_ aliases)
- }
- else { // (Personal group)
- // Just create the dummy index, no different to getting it in practice
- Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder();
- localSettingsGroupIndex.put("number_of_shards", 1).put("number_of_replicas", 0); // (ie guaranteed to be local to each ES node)
- ElasticSearchManager dummyGroupIndex = IndexManager.createIndex(DocumentPojoIndexMap.dummyDocumentIndex_, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
- if (null == dummyGroupIndex) {
- dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
- }
-
- // Just create an alias, so that queries work arbitrarily:
- dummyGroupIndex.createAlias(sGroupIndex); // (at some point we should delete the sGroupIndex alias, but leave it in for bw compatibility for now)
- dummyGroupIndex.createAlias(sAliasIndex); // (never index dummy indices so only need query index)
- // (do nothing on delete since don't have any docs in here anyway)
- }
- }
- //TESTED (including new docs_ alias)
- ///////////////////////////
-
- // (this utility function is needed for the legacy case where empty communities were
- // treated as aliases of the dummy community ... first time I encounter a community, I need
- // to recreate it...)
-
- public static void recreateCommunityDocIndex_unknownFields(ObjectId communityId, boolean bDeleteFirst) {
- CommunityPojo cp = CommunityPojo.fromDb(MongoDbManager.getSocial().getCommunity().findOne(new BasicDBObject("_id", communityId)), CommunityPojo.class);
- if (null != cp) {
- deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), true);
- // (in the legacy world this would have been treated as a "personal" ie equivalently to a dummy community ...
- // this does nothing if it's already a real community)
-
- if (bDeleteFirst) {
- deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity());
- }
- createCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity(), cp.getIsSystemCommunity(), false);
- }
- }
- //TESTED
-
- ///////////////////////////
-
- public static void deleteCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup) {
-
- String sGroupIndex = null; // for indexing, ie always a single index
- String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc
- ObjectId communityId = null;
- try {
- communityId = new ObjectId(nameOrCommunityIdStr);
- sGroupIndex = new StringBuffer("doc_").append(communityId.toString()).toString();
- sAliasIndex = new StringBuffer("docs_").append(communityId.toString()).toString();
- }
- catch (Exception e) {
- sGroupIndex = nameOrCommunityIdStr;
- if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) {
- sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
- }
- else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
- sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
- }
- else { // fallback
- sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
- }
- //TESTED
- }
- if (bPersonalGroup) {
- ElasticSearchManager dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
- dummyGroupIndex.removeAlias(sAliasIndex);
- dummyGroupIndex.removeAlias(sGroupIndex);
- }
- else if (null != parentCommunityId) {
-
- parentCommunityId = getRootCommunity(parentCommunityId);
- if (null != parentCommunityId) {
- String sParentGroupIndex = new StringBuffer("doc_").append(parentCommunityId.toString()).toString();
- ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex);
- docIndex.removeAlias(sGroupIndex);
- docIndex.removeAlias(sAliasIndex);
- docIndex.closeIndex();
- }
- }
- else {
- ElasticSearchManager docIndex = IndexManager.getIndex(sGroupIndex);
- docIndex.deleteMe();
- }
- //TESTED (parent, children, and personal)
-
- // Also need to delete any records indexes:
- // It's a bit more complex because we're not exactly sure which indexes exist:
-
- if (null != communityId) {
- ElasticSearchManager indexMgr = ElasticSearchManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
- // (just something that's guaranteed to exist)
- String stashedIndex = "recs_" + communityId.toString();
- String liveIndicesPrefix = "recs_t_" + communityId.toString();
-
- ClusterStateResponse clusterState = indexMgr.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
- String indices[] = clusterState.getState().getMetaData().getConcreteAllOpenIndices();
- for (String index: indices) {
- if (index.startsWith(stashedIndex) || index.startsWith(liveIndicesPrefix)) {
- ElasticSearchManager.getIndex(index).deleteMe();
- }
- }//TESTED
-
- // THIS CODE ONLY WORKS ON ES-1.0+ ... so have replaced with the less efficient code above
-
- // First off: stashed interface:
-
- // String stashedIndex = "recs_" + communityId.toString();
- // ClusterStateResponse retVal = indexMgr.getRawClient().admin().cluster().prepareState()
- // .setIndices(stashedIndex)
- // .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
- //
- // if (!retVal.getState().getMetaData().getIndices().isEmpty()) {
- // ElasticSearchManager.getIndex(stashedIndex).deleteMe();
- // }//TESTED
- // // (else doesn't exist...)
- //
- // // Second: all the time-indexed versions
- //
- // String indexPattern = new StringBuffer("recs_t_").append(communityId.toString()).append("*").toString();
- // retVal = indexMgr.getRawClient().admin().cluster().prepareState()
- // .setIndices(indexPattern)
- // .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
- //
- // for (IndexMetaData indexMetadata: retVal.getState().getMetaData()) {
- // ElasticSearchManager.getIndex(indexMetadata.index()).deleteMe();
- // }//TESTED
- }//TESTED
- }
- //TESTED (personal and system)
-
- ///////////////////////////
-
- // Utility function to get the root community of a community hierarchy, since you can't add aliases to aliases
-
- static ObjectId getRootCommunity(ObjectId parentCommunityId) {
-
- for (;;) {
- BasicDBObject query = new BasicDBObject("_id", parentCommunityId);
- BasicDBObject field = new BasicDBObject("parentId", 1);
- BasicDBObject retVal = (BasicDBObject) MongoDbManager.getSocial().getCommunity().findOne(query, field);
- if (null == retVal) { // (shouldn't ever happen)
- return parentCommunityId;
- }
- ObjectId tmp = retVal.getObjectId("parentId", null);
- if (null == tmp) { // (no more parents)
- return parentCommunityId;
- }
- if (tmp.equals(parentCommunityId)) { // (shouldn't ever happen but will prevent infinite loop)
- return parentCommunityId;
- }
- parentCommunityId = tmp;
- }
- }//TESTED (cases where have and don't have parent id)
-
- ///////////////////////////////////////////////////////////////////////////////////////
- //
- // Interface to handle scalable indexes.
- // Currently this is a dummy interface, but it will make it easy to split the indexes in the future.
- private static HashMap<String, String> _docIndexMap = null; // created lazily by getIndex(): cache key (caller's community id string, or the resolved name for non-ObjectId inputs) -> document index name
- private static String _assocIndex = null; // set lazily by getIndex() to AssociationFeaturePojoIndexMap.indexName_
- private static String _entityIndex = null; // set lazily by getIndex() to EntityFeaturePojoIndexMap.indexName_
- private static CrossVersionImmutableMapOfImmutableMaps<AliasMetaData> _aliasInfo = null; // NOTE(review): not used by getIndex(); presumably cached alias metadata - confirm against its users
-
- //TODO (INF-1136): Test and integrate this (phase 1), then implement the index splitting code (phase 2)
-
- public static synchronized String getIndex(String communityIdOrIndexStr) {
- if (communityIdOrIndexStr == EntityFeaturePojoIndexMap.indexName_) { // pointer == intended
- if (null == _entityIndex) {
- _entityIndex = EntityFeaturePojoIndexMap.indexName_;
- }
- return _entityIndex;
- }
- else if (communityIdOrIndexStr == AssociationFeaturePojoIndexMap.indexName_) { // pointer == intended
- if (null == _assocIndex) {
- _assocIndex = AssociationFeaturePojoIndexMap.indexName_;
- }
- return _assocIndex;
- }
- else { // Documents
-
- if (null == _docIndexMap) {
- _docIndexMap = new HashMap<String, String>();
- }
- String sAliasIndex;
- try {
- sAliasIndex = new StringBuffer("doc_").append(new ObjectId(communityIdOrIndexStr).toString()).toString();
- }
- catch (Exception e) {
- if (DocumentPojoIndexMap.globalDocumentIndex_.equals(communityIdOrIndexStr)) {
- communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
- }
- else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(communityIdOrIndexStr)) {
- communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
- }
- else { // fallback
- communityIdOrIndexStr = sAliasIndex = communityIdOrIndexStr.replaceAll("doc(?:ument)?_", "");
- }
- }
- String sDocIndex = _docIndexMap.get(communityIdOrIndexStr);
- if (null == sDocIndex) {
- sDocIndex = sAliasIndex;
- _docIndexMap.put(communityIdOrIndexStr, sAliasIndex);
- }
- return sDocIndex;
- }
- }
- //TOTEST (lots of cases)
-
- ///////////////////////////////////////////////////////////////////////////////////////
- //
- // Enrich and store documents (source is optional - can choose not to index if set)
- // (and remove any documents)
-
- public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete)
- {
- processDocuments(harvestType, toAdd, toUpdate_subsetOfAdd, toDelete, null);
- }
- public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete, SourcePojo source)
- {
- PropertiesManager props = new PropertiesManager();
-
- // Note: toAdd = toAdd(old) + toUpdate
- // Need to treat updates as follows:
- // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the future comments etc)
- // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...)
- toDelete.addAll(toUpdate_subsetOfAdd);
- StoreAndIndexManager storageManager = new StoreAndIndexManager();
- storageManager.removeFromDatastore_byURL(toDelete);
- // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester)
- // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...)
-
- // Aggregation:
- // 1+2. Create aggregate entities/events ("features") and write them to the DB
- // (then can store feeds - doesn't matter that the event/entities have been modified by the aggregation)
- // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and events
- // 4. (Scheduled for efficiency) Synchronize with index [after this, queries can find them - so (2) must have happened]
- // (Syncronization currently "corrupts" the entities so needs to be run last)
- AggregationManager perSourceAggregation = null;
-
- if (!props.getAggregationDisabled()) {
- perSourceAggregation = new AggregationManager();
- }
-
- // 1+2]
- if (null != perSourceAggregation) {
- perSourceAggregation.doAggregation(toAdd, toDelete);
- perSourceAggregation.createOrUpdateFeatureEntries();
- }
-
- // Save feeds to feeds collection in MongoDb
- // (second field determines if content gets saved)
- if (null != perSourceAggregation) {
- perSourceAggregation.applyAggregationToDocs(toAdd);
- // (First save aggregated statistics back to the docs' entity/event instances)
- }
- storeFeeds(toAdd, (harvestType != InfiniteEnums.DATABASE), source);
- // Then finish aggregation:
-
- if (null != perSourceAggregation) {
- // 3]
- perSourceAggregation.runScheduledDocumentUpdates();
-
- // 4] This needs to happen last because it "corrupts" the entities and events
- perSourceAggregation.runScheduledSynchronization();
- }
-
- }//TESTED (by eye - logic is v simple)
-
- ///////////////////////////////////////////////////////////////////////////////////////
- //
- // STORAGE AND INDEXING
- //
- //////////////////////////////////////////////////////////////////////////////////////
- /**
- * Writes the feeds to the DB and index
- *
- * @param feeds list of feeds to be added to db
- */
- private void storeFeeds(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source)
- {
- if ( null != docs && docs.size() > 0 )
- {
- StoreAndIndexManager store = new StoreAndIndexManager();
- store.addToDatastore(docs, bSaveContent, source);
- }
- }//TESTED (by eye)
-
- // See StoreAndIndexManager
-
- ///////////////////////////////////////////////////////////////////////////////////////
- //
- // AGGREGATION
- //
- //////////////////////////////////////////////////////////////////////////////////////
- // See AggregationManager
-
- }