PageRenderTime 100ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/core/infinit.e.processing.generic.library/src/com/ikanow/infinit/e/processing/generic/GenericProcessingController.java

https://github.com/IKANOW/Infinit.e
Java | 813 lines | 510 code | 107 blank | 196 comment | 118 complexity | ee5054c21a6815af08f46f41752bc0e4 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.processing.generic;
  17. import java.util.HashMap;
  18. import java.util.List;
  19. //import org.apache.log4j.Logger;
  20. import org.bson.types.ObjectId;
  21. import org.elasticsearch.common.settings.ImmutableSettings;
  22. import org.elasticsearch.common.settings.ImmutableSettings.Builder;
  23. import com.google.gson.Gson;
  24. import com.ikanow.infinit.e.data_model.InfiniteEnums;
  25. import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
  26. import com.ikanow.infinit.e.data_model.index.IndexManager;
  27. import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
  28. import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
  29. import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap;
  30. import com.ikanow.infinit.e.data_model.store.DbManager;
  31. import com.ikanow.infinit.e.data_model.store.MongoDbManager;
  32. import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
  33. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  34. import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
  35. import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
  36. import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
  37. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  38. import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
  39. import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
  40. import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
  41. import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo;
  42. import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
  43. import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
  44. import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
  45. import com.mongodb.BasicDBObject;
  46. import com.mongodb.DBCollection;
  47. import com.mongodb.DBCursor;
  48. import com.mongodb.DBObject;
  49. import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
  50. import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
  51. import org.elasticsearch.cluster.metadata.AliasMetaData;
  52. import org.elasticsearch.common.collect.CrossVersionImmutableMapOfImmutableMaps;
  53. //DEBUG (alias corruption)
  54. //import org.elasticsearch.action.admin.indices.status.IndexStatus;
  55. //import org.elasticsearch.action.admin.indices.status.IndicesStatusRequest;
  56. //import org.elasticsearch.action.admin.indices.status.IndicesStatusResponse;
  57. public class GenericProcessingController {
  58. //NOTE THIS FUNCTION SHOULD CONTAIN NO STATE SINCE IT CAN BE RUN ACROSS MULTIPLE THREADS
  59. //(Nothing currently to log)
  60. //private static final Logger logger = Logger.getLogger(GenericProcessingController.class);
  61. ///////////////////////////////////////////////////////////////////////////////////////
  62. //
  63. // Set up the databases and indexes
  64. public void Initialize() {
  65. InitializeDatabase();
  66. InitializeIndex(false, false, false);
  67. // (Don't delete anything, obviously)
  68. }
  69. public void InitializeDatabase() {
  70. // Add indices:
  71. try
  72. {
  73. PropertiesManager pm = new PropertiesManager();
  74. ////////////////////////
  75. //
  76. // Remove old indexes, mostly just old code that is no longer needed
  77. //
  78. dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.url_, 1);
  79. dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.sourceKey_, 2);
  80. dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceUrl_, 1);
  81. dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceKey_, 1);
  82. dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.title_, 1);
  83. // (Title simply not needed, that was a mistake from an early iteration)
  84. dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.updateId_, 1);
  85. dropIndexIfItExists(DbManager.getSocial().getShare(), "type", 1);
  86. dropIndexIfItExists(DbManager.getSocial().getCookies(), "apiKey", 1);
  87. dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 2);
  88. dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 2);
  89. // (see shard keys below, these legacy ones can appear if the DB is restored from a different machine's backup)
  90. dropIndexIfNotNeeded(DbManager.getDocument().getContent(), "sourceKey_1_url_1", 0, "sourceKey_2_url_2", 0);
  91. dropIndexIfNotNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_1", 0, "sourceKey_1__id_-1", 0);
  92. ////////////////////////
  93. //
  94. // Indexes needed for sharding:
  95. //
  96. // ** Content (has changed a bit)
  97. BasicDBObject compIndex = new BasicDBObject(CompressedFullTextPojo.sourceKey_, 1);
  98. compIndex.put(CompressedFullTextPojo.url_, 1);
  99. addIndexIfNeeded(DbManager.getDocument().getContent(), "sourceKey_2_url_2", 0, compIndex); // (remove legacy 2_2 and replace with 1_1, which supports shards)
  100. // ** Metadata
  101. // Add {_id:1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!)
  102. compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1);
  103. compIndex.put(DocumentPojo._id_, 1);
  104. addIndexIfNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_-1", 0, compIndex); // (remove legacy 1_-1 and replace with 1_1, which supports shards)
  105. // ** Entities and associations
  106. DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1));
  107. DbManager.getFeature().getAssociation().ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1));
  108. ////////////////////////
  109. //
  110. // Other indexes
  111. //
  112. // Needed to handle updates of large files containing many URLs:
  113. DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
  114. // Needed for duplicate checking
  115. // (Compound index lets me access {url, sourceKey}, {url} efficiently ... but need sourceKey separately to do {sourceKey})
  116. compIndex = new BasicDBObject(DocumentPojo.url_, 1);
  117. compIndex.put(DocumentPojo.sourceKey_, 1);
  118. DbManager.getDocument().getMetadata().ensureIndex(compIndex);
  119. // Needed to handle document updates
  120. DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.updateId_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
  121. // Needed to update documents' entities' doc counts
  122. if (!pm.getAggregationDisabled()) {
  123. compIndex = new BasicDBObject(EntityPojo.docQuery_index_, 1);
  124. compIndex.put(DocumentPojo.communityId_, 1);
  125. DbManager.getDocument().getMetadata().ensureIndex(compIndex);
  126. }
  127. // Needed for keeping source/community doc counts
  128. compIndex = new BasicDBObject(DocCountPojo._id_, 1);
  129. compIndex.put(DocCountPojo.doccount_, 1);
  130. DbManager.getDocument().getCounts().ensureIndex(compIndex);
  131. // Needed for keep tracking of entities
  132. DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1));
  133. DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1));
  134. // Needed for background re-calculation
  135. DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
  136. DbManager.getFeature().getAssociation().ensureIndex(new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
  137. // Needed for geo-location in the entity pipeline
  138. DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1));
  139. DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1));
  140. DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d"));
  141. // Needed for source management
  142. DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1));
  143. DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1));
  144. DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1));
  145. DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1));
  146. // Searching shares
  147. // Compound index lets me access {type, communities._id}, {type} efficiently
  148. compIndex = new BasicDBObject("type", 1);
  149. compIndex.put("communities._id", 1);
  150. DbManager.getSocial().getShare().ensureIndex(compIndex);
  151. // User logins
  152. DbManager.getSocial().getCookies().ensureIndex(new BasicDBObject("apiKey", 2), new BasicDBObject(MongoDbManager.sparse_, true));
  153. // Custom job scheduling
  154. DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1));
  155. //TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2)
  156. DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1), new BasicDBObject(MongoDbManager.sparse_, false));
  157. // DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
  158. // dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 1);
  159. DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1), new BasicDBObject(MongoDbManager.sparse_, false));
  160. // DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
  161. // dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 1);
  162. }
  163. catch (Exception e) {
  164. e.printStackTrace();
  165. throw new RuntimeException(e.getMessage());
  166. }
  167. }//TESTED (not changed since by-eye test in Beta)
  168. // Some *DB* index utilities (note: not the Lucene index)
  169. private static void addIndexIfNeeded(DBCollection coll, String indexToCheck, int nIndexIndex, BasicDBObject newIndex)
  170. {
  171. StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck);
  172. if (0 != nIndexIndex) {
  173. indexNameStrBuff.append("_").append(nIndexIndex);
  174. }
  175. String indexName2 = indexNameStrBuff.toString();
  176. List<DBObject> list = coll.getIndexInfo();
  177. for (DBObject dbo: list) {
  178. String name = (String) dbo.get("name");
  179. if (indexName2.equalsIgnoreCase(name)) {
  180. return; // no need to create a new index
  181. }
  182. }
  183. // If we're here then we didn't find the index so create a new index
  184. try {
  185. coll.ensureIndex(newIndex);
  186. }
  187. catch (Exception e) {}
  188. }//TESTED
  189. private static void dropIndexIfNotNeeded(DBCollection coll, String indexToCheck, int nIndexToCheckIndex, String indexToDelete, int nIndexToDeleteIndex)
  190. {
  191. StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck);
  192. if (0 != nIndexToCheckIndex) {
  193. indexNameStrBuff.append("_").append(nIndexToCheckIndex);
  194. }
  195. String indexToCheck2 = indexNameStrBuff.toString();
  196. indexNameStrBuff.setLength(0);
  197. indexNameStrBuff.append(indexToDelete);
  198. if (0 != nIndexToDeleteIndex) {
  199. indexNameStrBuff.append("_").append(nIndexToDeleteIndex);
  200. }
  201. boolean foundIndexToDelete = false;
  202. boolean foundIndexToCheck = false;
  203. String indexToDelete2 = indexNameStrBuff.toString();
  204. List<DBObject> list = coll.getIndexInfo();
  205. for (DBObject dbo: list) {
  206. String name = (String) dbo.get("name");
  207. if (indexToCheck2.equalsIgnoreCase(name)) {
  208. foundIndexToCheck = true;
  209. }
  210. else if (indexToDelete2.equalsIgnoreCase(name)) {
  211. foundIndexToDelete = true;
  212. }
  213. }
  214. if (foundIndexToCheck && foundIndexToDelete) {
  215. try {
  216. coll.dropIndex(indexToDelete2);
  217. }
  218. catch (Exception e) {}
  219. }
  220. }//TESTED
  221. private void dropIndexIfItExists(DBCollection coll, String indexName, int nIndexIndex)
  222. {
  223. StringBuffer indexNameStrBuff = new StringBuffer(indexName);
  224. if (0 != nIndexIndex) {
  225. indexNameStrBuff.append("_").append(nIndexIndex);
  226. }
  227. String indexName2 = indexNameStrBuff.toString();
  228. List<DBObject> list = coll.getIndexInfo();
  229. for (DBObject dbo: list) {
  230. String name = (String) dbo.get("name");
  231. if (indexName2.equalsIgnoreCase(name)) {
  232. try {
  233. coll.dropIndex(name);
  234. }
  235. catch (Exception e) {}
  236. }
  237. }
  238. }//TESTED
  239. /////////////////////////////////////////////////////////
  240. // Lucene index initialization
  241. // (Note some of the code below is duplicated in MongoDocumentTxfer, so make sure you sync changes)
  242. public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature) {
  243. InitializeIndex(bDeleteDocs, bDeleteEntityFeature, bDeleteEventFeature, false);
  244. }
  245. public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature, boolean bRebuildDocsIndex) {
  246. try { //create elasticsearch indexes
  247. PropertiesManager pm = new PropertiesManager();
  248. if (!pm.getAggregationDisabled()) {
  249. boolean languageNormalization = pm.getNormalizeEncoding();
  250. Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
  251. localSettingsEvent.put("number_of_shards", 1).put("number_of_replicas", 0);
  252. localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
  253. if (languageNormalization) {
  254. localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer","icu_folding","standard","lowercase");
  255. }
  256. else {
  257. localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard","lowercase");
  258. }
  259. Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
  260. localSettingsGaz.put("number_of_shards", 1).put("number_of_replicas", 0);
  261. localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
  262. if (languageNormalization) {
  263. localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer","icu_folding","standard","lowercase");
  264. }
  265. else {
  266. localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard","lowercase");
  267. }
  268. //event feature
  269. String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class);
  270. ElasticSearchManager eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent);
  271. if (null == eventIndex) { // (if has been previously referenced in this process space)
  272. eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
  273. }
  274. eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_);
  275. if (bDeleteEventFeature) {
  276. eventIndex.deleteMe();
  277. eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent);
  278. }
  279. //entity feature
  280. String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class);
  281. ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz);
  282. if (null == entityIndex) { // (if has been previously referenced in this process space)
  283. entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
  284. }
  285. entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_);
  286. if (bDeleteEntityFeature) {
  287. entityIndex.deleteMe();
  288. entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz);
  289. }
  290. }
  291. //DOCS - much more complicated than anything else
  292. boolean bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  293. // (ie if main doc index doesn't exist then always rebuild all indexes)
  294. if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing...
  295. try { Thread.sleep(60000); } catch (Exception e) {}
  296. bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  297. }
  298. bRebuildDocsIndex |= bPingMainIndexFailed;
  299. // check the main index has the "collection" alias - if not then rebuild everything
  300. if (!bPingMainIndexFailed && (null == _aliasInfo)) {
  301. ElasticSearchManager docIndex = ElasticSearchManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  302. ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
  303. _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
  304. if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) {
  305. bRebuildDocsIndex = true;
  306. }
  307. } //TESTED
  308. createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
  309. createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);
  310. // Some hardwired dummy communities
  311. createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
  312. createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
  313. // (create dummy index used to keep personal group aliases)
  314. if (bRebuildDocsIndex || bDeleteDocs) {
  315. // OK, going to have different shards for different communities:
  316. // Get a list of all the communities:
  317. BasicDBObject query = new BasicDBObject();
  318. BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
  319. fieldsToDrop.put("communityAttributes", 0);
  320. fieldsToDrop.put("userAttributes", 0);
  321. DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);
  322. List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
  323. int i = 0;
  324. System.out.println("Initializing " + dbc.size() + " indexes:");
  325. for (int j = 0; j < 2; ++j) {
  326. for (DBObject dbotmp: tmparray) {
  327. if ((++i % 100) == 0) {
  328. System.out.println("Initialized " + i + " indexes.");
  329. }
  330. BasicDBObject dbo = (BasicDBObject) dbotmp;
  331. // OK, going to see if there are any sources with this group id, create a new index if so:
  332. // (Don't use CommunityPojo data model here for performance reasons....
  333. // (Also, haven't gotten round to porting CommunityPojo field access to using static fields))
  334. ObjectId communityId = (ObjectId) dbo.get("_id");
  335. boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
  336. boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
  337. ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");
  338. createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup, bSystemGroup, bDeleteDocs, j==0);
  339. }//end loop over communities
  340. }// end loop over communities - first time parents only
  341. } // (end if need to do big loop over all sources)
  342. }
  343. catch (Exception e)
  344. {
  345. e.printStackTrace();
  346. throw new RuntimeException(e.getMessage());
  347. }
  348. }//TESTED (not changed since by-eye test in Beta - retested after moving code into createCommunityDocIndex below)
  349. ///////////////////////////////////////////////////////////////////////////////////////
  350. // Utility code for creating community indexes
  351. public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex)
  352. {
  353. createCommunityDocIndex(nameOrCommunityIdStr, parentCommunityId, bPersonalGroup, bSystemGroup, bClearIndex, false);
  354. }
  355. public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId,
  356. boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex, boolean bParentsOnly)
  357. {
  358. //create elasticsearch indexes
  359. PropertiesManager pm = new PropertiesManager();
  360. boolean languageNormalization = pm.getNormalizeEncoding();
  361. int nPreferredReplicas = pm.getMaxIndexReplicas();
  362. String docMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class);
  363. String sGroupIndex = null; // for indexing, ie always a single index
  364. String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc
  365. try {
  366. sGroupIndex = new StringBuffer("doc_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString();
  367. sAliasIndex = new StringBuffer("docs_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString();
  368. }
  369. catch (Exception e) {
  370. sGroupIndex = nameOrCommunityIdStr;
  371. if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) {
  372. sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
  373. }
  374. else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
  375. sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
  376. }
  377. else { // fallback
  378. sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
  379. }
  380. //TESTED
  381. }
  382. if (!bPersonalGroup) {
  383. if (null == parentCommunityId) {
  384. int nShards = bSystemGroup? 10 : 5 ; // (system group is largest)
  385. // Remove the alias, in case it exists:
  386. // Then create an index with this name:
  387. Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder();
  388. localSettingsGroupIndex.put("number_of_shards", nShards).put("number_of_replicas", nPreferredReplicas);
  389. if (languageNormalization) {
  390. localSettingsGroupIndex.put("index.analysis.analyzer.default.tokenizer","standard");
  391. localSettingsGroupIndex.putArray("index.analysis.analyzer.default.filter", "icu_normalizer","icu_folding","standard","lowercase","stop");
  392. }//TESTED
  393. ElasticSearchManager docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
  394. if (null == docIndex) { // index has already been referenced, hence createIndex returns null
  395. docIndex = IndexManager.getIndex(sGroupIndex);
  396. }
  397. if (bClearIndex) {
  398. docIndex.deleteMe();
  399. docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
  400. }
  401. if (null != docIndex) {
  402. try {
  403. docIndex.pingIndex(); // (wait until it's created itself)
  404. }
  405. catch (Exception e) {} // (just make sure this doesn't die horribly)
  406. }
  407. else {
  408. docIndex = IndexManager.getIndex(sGroupIndex);
  409. }
  410. if (null != docIndex) { // should always be true
  411. docIndex.createAlias(sAliasIndex);
  412. docIndex.closeIndex();
  413. }
  414. }
  415. else if (!bParentsOnly) { // A sub-index of a parent
  416. parentCommunityId = getRootCommunity(parentCommunityId);
  417. if (null != parentCommunityId) {
  418. String parentCommunityIdStr = parentCommunityId.toString();
  419. String sParentGroupIndex = new StringBuffer("doc_").append(new ObjectId(parentCommunityIdStr).toString()).toString();
  420. ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex);
  421. //DEBUG (alias corruption)
  422. // if (null == _aliasInfo) {
  423. // ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
  424. // _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
  425. // }
  426. // else {
  427. // if (_aliasInfo.containsKey(sGroupIndex)) { // has no aliases, we're not good
  428. // return;
  429. // }
  430. // else {
  431. // //DEBUG
  432. // System.out.println("Alias " + sGroupIndex + " has no aliases (but should)");
  433. // ElasticSearchManager docIndex2 = IndexManager.getIndex(sGroupIndex);
  434. // docIndex2.deleteMe();
  435. // }
  436. // }
  437. docIndex.createAlias(sGroupIndex); // for indexing
  438. // (this is going to be tricky when the functionality is fully implemented
  439. // because it will need to handle the parent index splitting)
  440. docIndex.createAlias(sAliasIndex); // for queries
  441. docIndex.closeIndex();
  442. // (do nothing on delete - that will be handled at the parent index level)
  443. }
  444. }
  445. //TESTED (parents, children, and personal + docs_ aliases)
  446. }
  447. else { // (Personal group)
  448. // Just create the dummy index, no different to getting it in practice
  449. Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder();
  450. localSettingsGroupIndex.put("number_of_shards", 1).put("number_of_replicas", 0); // (ie guaranteed to be local to each ES node)
  451. ElasticSearchManager dummyGroupIndex = IndexManager.createIndex(DocumentPojoIndexMap.dummyDocumentIndex_, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
  452. if (null == dummyGroupIndex) {
  453. dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
  454. }
  455. // Just create an alias, so that queries work arbitrarily:
  456. dummyGroupIndex.createAlias(sGroupIndex); // (at some point we should delete the sGroupIndex alias, but leave it in for bw compatibility for now)
  457. dummyGroupIndex.createAlias(sAliasIndex); // (never index dummy indices so only need query index)
  458. // (do nothing on delete since don't have any docs in here anyway)
  459. }
  460. }
  461. //TESTED (including new docs_ alias)
  462. ///////////////////////////
  463. // (this utility function is needed for the legacy case where empty communities were
  464. // treated as aliases of the dummy community ... first time I encounter a community, I need
  465. // to recreate it...)
  466. public static void recreateCommunityDocIndex_unknownFields(ObjectId communityId, boolean bDeleteFirst) {
  467. CommunityPojo cp = CommunityPojo.fromDb(MongoDbManager.getSocial().getCommunity().findOne(new BasicDBObject("_id", communityId)), CommunityPojo.class);
  468. if (null != cp) {
  469. deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), true);
  470. // (in the legacy world this would have been treated as a "personal" ie equivalently to a dummy community ...
  471. // this does nothing if it's already a real community)
  472. if (bDeleteFirst) {
  473. deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity());
  474. }
  475. createCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity(), cp.getIsSystemCommunity(), false);
  476. }
  477. }
  478. //TESTED
  479. ///////////////////////////
  480. public static void deleteCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup) {
  481. String sGroupIndex = null; // for indexing, ie always a single index
  482. String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc
  483. ObjectId communityId = null;
  484. try {
  485. communityId = new ObjectId(nameOrCommunityIdStr);
  486. sGroupIndex = new StringBuffer("doc_").append(communityId.toString()).toString();
  487. sAliasIndex = new StringBuffer("docs_").append(communityId.toString()).toString();
  488. }
  489. catch (Exception e) {
  490. sGroupIndex = nameOrCommunityIdStr;
  491. if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) {
  492. sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
  493. }
  494. else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
  495. sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
  496. }
  497. else { // fallback
  498. sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
  499. }
  500. //TESTED
  501. }
  502. if (bPersonalGroup) {
  503. ElasticSearchManager dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
  504. dummyGroupIndex.removeAlias(sAliasIndex);
  505. dummyGroupIndex.removeAlias(sGroupIndex);
  506. }
  507. else if (null != parentCommunityId) {
  508. parentCommunityId = getRootCommunity(parentCommunityId);
  509. if (null != parentCommunityId) {
  510. String sParentGroupIndex = new StringBuffer("doc_").append(parentCommunityId.toString()).toString();
  511. ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex);
  512. docIndex.removeAlias(sGroupIndex);
  513. docIndex.removeAlias(sAliasIndex);
  514. docIndex.closeIndex();
  515. }
  516. }
  517. else {
  518. ElasticSearchManager docIndex = IndexManager.getIndex(sGroupIndex);
  519. docIndex.deleteMe();
  520. }
  521. //TESTED (parent, children, and personal)
  522. // Also need to delete any records indexes:
  523. // It's a bit more complex because we're not exactly sure which indexes exist:
  524. if (null != communityId) {
  525. ElasticSearchManager indexMgr = ElasticSearchManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
  526. // (just something that's guaranteed to exist)
  527. String stashedIndex = "recs_" + communityId.toString();
  528. String liveIndicesPrefix = "recs_t_" + communityId.toString();
  529. ClusterStateResponse clusterState = indexMgr.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
  530. String indices[] = clusterState.getState().getMetaData().getConcreteAllOpenIndices();
  531. for (String index: indices) {
  532. if (index.startsWith(stashedIndex) || index.startsWith(liveIndicesPrefix)) {
  533. ElasticSearchManager.getIndex(index).deleteMe();
  534. }
  535. }//TESTED
  536. // THIS CODE ONLY WORKS ON ES-1.0+ ... so have replaced with the less efficient code above
  537. // First off: stashed interface:
  538. // String stashedIndex = "recs_" + communityId.toString();
  539. // ClusterStateResponse retVal = indexMgr.getRawClient().admin().cluster().prepareState()
  540. // .setIndices(stashedIndex)
  541. // .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
  542. //
  543. // if (!retVal.getState().getMetaData().getIndices().isEmpty()) {
  544. // ElasticSearchManager.getIndex(stashedIndex).deleteMe();
  545. // }//TESTED
  546. // // (else doesn't exist...)
  547. //
  548. // // Second: all the time-indexed versions
  549. //
  550. // String indexPattern = new StringBuffer("recs_t_").append(communityId.toString()).append("*").toString();
  551. // retVal = indexMgr.getRawClient().admin().cluster().prepareState()
  552. // .setIndices(indexPattern)
  553. // .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
  554. //
  555. // for (IndexMetaData indexMetadata: retVal.getState().getMetaData()) {
  556. // ElasticSearchManager.getIndex(indexMetadata.index()).deleteMe();
  557. // }//TESTED
  558. }//TESTED
  559. }
  560. //TESTED (personal and system)
  561. ///////////////////////////
  562. // Utility function to get the root community of a community hierarchy, since you can't add aliases to aliases
  563. static ObjectId getRootCommunity(ObjectId parentCommunityId) {
  564. for (;;) {
  565. BasicDBObject query = new BasicDBObject("_id", parentCommunityId);
  566. BasicDBObject field = new BasicDBObject("parentId", 1);
  567. BasicDBObject retVal = (BasicDBObject) MongoDbManager.getSocial().getCommunity().findOne(query, field);
  568. if (null == retVal) { // (shouldn't ever happen)
  569. return parentCommunityId;
  570. }
  571. ObjectId tmp = retVal.getObjectId("parentId", null);
  572. if (null == tmp) { // (no more parents)
  573. return parentCommunityId;
  574. }
  575. if (tmp.equals(parentCommunityId)) { // (shouldn't ever happen but will prevent infinite loop)
  576. return parentCommunityId;
  577. }
  578. parentCommunityId = tmp;
  579. }
  580. }//TESTED (cases where have and don't have parent id)
  581. ///////////////////////////////////////////////////////////////////////////////////////
  582. //
  583. // Interface to handle scaleable indexes
  584. // Currently this is a dummy interface, but it will make it easy to split the indexes in the future
  585. private static HashMap<String, String> _docIndexMap = null;
  586. private static String _assocIndex = null;
  587. private static String _entityIndex = null;
  588. private static CrossVersionImmutableMapOfImmutableMaps<AliasMetaData> _aliasInfo = null;
  589. //TODO (INF-1136): Test and integrate this (phase 1), then implement the index splitting code (phase 2)
  590. public static synchronized String getIndex(String communityIdOrIndexStr) {
  591. if (communityIdOrIndexStr == EntityFeaturePojoIndexMap.indexName_) { // pointer == intended
  592. if (null == _entityIndex) {
  593. _entityIndex = EntityFeaturePojoIndexMap.indexName_;
  594. }
  595. return _entityIndex;
  596. }
  597. else if (communityIdOrIndexStr == AssociationFeaturePojoIndexMap.indexName_) { // pointer == intended
  598. if (null == _assocIndex) {
  599. _assocIndex = AssociationFeaturePojoIndexMap.indexName_;
  600. }
  601. return _assocIndex;
  602. }
  603. else { // Documents
  604. if (null == _docIndexMap) {
  605. _docIndexMap = new HashMap<String, String>();
  606. }
  607. String sAliasIndex;
  608. try {
  609. sAliasIndex = new StringBuffer("doc_").append(new ObjectId(communityIdOrIndexStr).toString()).toString();
  610. }
  611. catch (Exception e) {
  612. if (DocumentPojoIndexMap.globalDocumentIndex_.equals(communityIdOrIndexStr)) {
  613. communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
  614. }
  615. else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(communityIdOrIndexStr)) {
  616. communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
  617. }
  618. else { // fallback
  619. communityIdOrIndexStr = sAliasIndex = communityIdOrIndexStr.replaceAll("doc(?:ument)?_", "");
  620. }
  621. }
  622. String sDocIndex = _docIndexMap.get(communityIdOrIndexStr);
  623. if (null == sDocIndex) {
  624. sDocIndex = sAliasIndex;
  625. _docIndexMap.put(communityIdOrIndexStr, sAliasIndex);
  626. }
  627. return sDocIndex;
  628. }
  629. }
  630. //TOTEST (lots of cases)
  631. ///////////////////////////////////////////////////////////////////////////////////////
  632. //
  633. // Enrich and store documents (source is optional - can choose not to index if set)
  634. // (and remove any documents)
  635. public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete)
  636. {
  637. processDocuments(harvestType, toAdd, toUpdate_subsetOfAdd, toDelete, null);
  638. }
  639. public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete, SourcePojo source)
  640. {
  641. PropertiesManager props = new PropertiesManager();
  642. // Note: toAdd = toAdd(old) + toUpdate
  643. // Need to treat updates as follows:
  644. // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the future comments etc)
  645. // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...)
  646. toDelete.addAll(toUpdate_subsetOfAdd);
  647. StoreAndIndexManager storageManager = new StoreAndIndexManager();
  648. storageManager.removeFromDatastore_byURL(toDelete);
  649. // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester)
  650. // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...)
  651. // Aggregation:
  652. // 1+2. Create aggregate entities/events ("features") and write them to the DB
  653. // (then can store feeds - doesn't matter that the event/entities have been modified by the aggregation)
  654. // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and events
  655. // 4. (Scheduled for efficiency) Synchronize with index [after this, queries can find them - so (2) must have happened]
  656. // (Syncronization currently "corrupts" the entities so needs to be run last)
  657. AggregationManager perSourceAggregation = null;
  658. if (!props.getAggregationDisabled()) {
  659. perSourceAggregation = new AggregationManager();
  660. }
  661. // 1+2]
  662. if (null != perSourceAggregation) {
  663. perSourceAggregation.doAggregation(toAdd, toDelete);
  664. perSourceAggregation.createOrUpdateFeatureEntries();
  665. }
  666. // Save feeds to feeds collection in MongoDb
  667. // (second field determines if content gets saved)
  668. if (null != perSourceAggregation) {
  669. perSourceAggregation.applyAggregationToDocs(toAdd);
  670. // (First save aggregated statistics back to the docs' entity/event instances)
  671. }
  672. storeFeeds(toAdd, (harvestType != InfiniteEnums.DATABASE), source);
  673. // Then finish aggregation:
  674. if (null != perSourceAggregation) {
  675. // 3]
  676. perSourceAggregation.runScheduledDocumentUpdates();
  677. // 4] This needs to happen last because it "corrupts" the entities and events
  678. perSourceAggregation.runScheduledSynchronization();
  679. }
  680. }//TESTED (by eye - logic is v simple)
  681. ///////////////////////////////////////////////////////////////////////////////////////
  682. //
  683. // STORAGE AND INDEXING
  684. //
  685. //////////////////////////////////////////////////////////////////////////////////////
  686. /**
  687. * Writes the feeds to the DB and index
  688. *
  689. * @param feeds list of feeds to be added to db
  690. */
  691. private void storeFeeds(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source)
  692. {
  693. if ( null != docs && docs.size() > 0 )
  694. {
  695. StoreAndIndexManager store = new StoreAndIndexManager();
  696. store.addToDatastore(docs, bSaveContent, source);
  697. }
  698. }//TESTED (by eye)
  699. // See StoreAndIndexManager
  700. ///////////////////////////////////////////////////////////////////////////////////////
  701. //
  702. // AGGREGATION
  703. //
  704. //////////////////////////////////////////////////////////////////////////////////////
  705. // See AggregationManager
  706. }