PageRenderTime 94ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/utility/infinit.e.mongo-indexer/src/com/ikanow/infinit/e/utility/MongoEntityFeatureTxfer.java

https://github.com/IKANOW/Infinit.e
Java | 434 lines | 233 code | 81 blank | 120 comment | 49 complexity | b23ac34c8fa7f8e0c75ac98b1ba028d0 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.utility;
  17. import java.io.IOException;
  18. import java.lang.reflect.Type;
  19. import java.net.UnknownHostException;
  20. import java.util.ArrayList;
  21. import java.util.LinkedList;
  22. import java.util.List;
  23. import org.bson.BSONObject;
  24. import org.bson.types.ObjectId;
  25. import org.elasticsearch.common.settings.ImmutableSettings;
  26. import org.elasticsearch.common.settings.ImmutableSettings.Builder;
  27. import com.google.gson.Gson;
  28. import com.google.gson.reflect.TypeToken;
  29. import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
  30. import com.ikanow.infinit.e.data_model.index.IndexManager;
  31. import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
  32. import com.ikanow.infinit.e.data_model.store.DbManager;
  33. import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
  34. import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
  35. import com.mongodb.BasicDBObject;
  36. import com.mongodb.DBCollection;
  37. import com.mongodb.DBCursor;
  38. import com.mongodb.Mongo;
  39. import com.mongodb.MongoException;
  40. public class MongoEntityFeatureTxfer
  41. {
  42. //___________________________________________________________________________________________________
  43. // MAIN
  44. /**
  45. * @param args: 0 is config location, 1 is query, 2 is delete/split (optional)
  46. * to run 3 options:
  47. * Transfer: config query(opt)
  48. * Delete: config query delete
  49. * Split: config query split
  50. *
  51. * @throws MongoException
  52. * @throws NumberFormatException
  53. * @throws IOException
  54. */
  55. public static void main(String sConfigPath, String sQuery, boolean bDelete, boolean bRebuildIndex, int nSkip, int nLimit, String chunksDescription) throws NumberFormatException, MongoException, IOException {
  56. MongoEntityFeatureTxfer txferManager = new MongoEntityFeatureTxfer();
  57. // Command line processing
  58. com.ikanow.infinit.e.data_model.Globals.setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
  59. if (null != sConfigPath) {
  60. com.ikanow.infinit.e.data_model.Globals.overrideConfigLocation(sConfigPath);
  61. }
  62. if (bRebuildIndex) {
  63. new GenericProcessingController().InitializeIndex(false, true, false);
  64. }
  65. BasicDBObject query = null;
  66. if (null == sQuery) {
  67. query = new BasicDBObject();
  68. }
  69. else {
  70. query = (BasicDBObject) com.mongodb.util.JSON.parse(sQuery);
  71. }
  72. if (bDelete) {
  73. MongoEntityFeatureTxfer.doDelete(query, nLimit);
  74. }
  75. else {
  76. if (null == chunksDescription) {
  77. txferManager.doTransfer(query, nSkip, nLimit, null);
  78. }
  79. else {
  80. txferManager.doChunkedTransfer(query, nSkip, nLimit, chunksDescription);
  81. }
  82. }
  83. }
  84. //___________________________________________________________________________________________________
  85. // Wrapper for doing transfer in chunks:
  86. private void doChunkedTransfer(BasicDBObject query, int nSkip, int nLimit, String chunksDescription) throws IOException
  87. {
  88. List<BasicDBObject> chunkList = MongoIndexerUtils.getChunks("feature.entity", chunksDescription);
  89. System.out.println("CHUNKS: Found " + chunkList.size() + " chunks");
  90. //DEBUG
  91. //System.out.println("Chunklist= " + chunkList);
  92. for (BasicDBObject chunk: chunkList) {
  93. BasicDBObject cleanQuery = new BasicDBObject();
  94. cleanQuery.putAll((BSONObject)query);
  95. String id = null;
  96. try {
  97. id = (String) chunk.remove("$id");
  98. System.out.println("CHUNK: " + id);
  99. doTransfer(cleanQuery, 0, 0, chunk);
  100. }
  101. catch (Exception e) {
  102. System.out.println("FAILED CHUNK: " + id + " ... " + e.getMessage());
  103. }
  104. }
  105. }//TESTED
  106. //___________________________________________________________________________________________________
  107. // PROCESSING LOOP (new interface)
  108. private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk)
  109. {
  110. ElasticSearchManager elasticManager = null;
  111. // Initialize the DB:
  112. DBCollection entityFeatureDB = DbManager.getFeature().getEntity();
  113. // Initialize the ES (create the index if it doesn't already):
  114. // 1. Set-up the entity feature index
  115. String indexName = "entity_index";
  116. ElasticSearchManager.setDefaultClusterName("infinite-aws");
  117. // (delete the index)
  118. //elasticManager = ElasticSearchManager.getIndex(indexName);
  119. //elasticManager.deleteMe();
  120. // Create the index if necessary
  121. String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class);
  122. Builder localSettings = ImmutableSettings.settingsBuilder();
  123. localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
  124. localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
  125. localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");
  126. elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);
  127. // Get the index (necessary if already created)
  128. if (null == elasticManager)
  129. {
  130. elasticManager = ElasticSearchManager.getIndex(indexName);
  131. }
  132. // Now query the DB:
  133. DBCursor dbc = null;
  134. dbc = entityFeatureDB.find(query);
  135. if (null != chunk) {
  136. if (chunk.containsField(DbManager.min_)) {
  137. dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
  138. }
  139. if (chunk.containsField(DbManager.max_)) {
  140. dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
  141. }
  142. }
  143. dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
  144. if (null == chunk) {
  145. int nCount = dbc.count() - nSkip;
  146. if (nCount < 0) nCount = 0;
  147. System.out.println("Found " + nCount + " records to sync, process first " + (0==nLimit?nCount:nLimit));
  148. if (0 == nCount) { // Nothing to do...
  149. return;
  150. }
  151. }
  152. int nSynced = 0;
  153. List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
  154. while ( dbc.hasNext() )
  155. {
  156. EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(),EntityFeaturePojo.class);
  157. if (null != feature.getAlias()) { // (some corrupt gazateer entry)
  158. // Handle groups (system group is: "4c927585d591d31d7b37097a")
  159. // if there is no community id, add system group (something is wrong if this happens?)
  160. if (null == feature.getCommunityId())
  161. {
  162. feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
  163. }
  164. }
  165. entities.add(feature);
  166. nSynced++;
  167. // Add the entities
  168. if ( entities.size() > 1000 )
  169. {
  170. elasticManager.bulkAddDocuments(
  171. IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()),
  172. "_id", null, true);
  173. // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)
  174. entities = new ArrayList<EntityFeaturePojo>();
  175. }
  176. }
  177. //write whatevers left
  178. elasticManager.bulkAddDocuments(
  179. IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()),
  180. "_id", null, true);
  181. // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)
  182. if (null != chunk) {
  183. System.out.println("Found " + nSynced + " records to sync in chunk");
  184. }
  185. }
  186. //___________________________________________________________________________________________________
  187. // DELETE DOCUMENTS FROM A QUERY
  188. static void doDelete(BasicDBObject query, int nLimit)
  189. {
  190. doDelete(query, nLimit, false);
  191. }
  192. static void doDelete(BasicDBObject query, int nLimit, boolean automatedRequest)
  193. {
  194. try
  195. {
  196. // Initialize the DB:
  197. DBCollection entityFeatureDB = DbManager.getFeature().getEntity();
  198. ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("entity_index");
  199. BasicDBObject fields = new BasicDBObject();
  200. fields.put(EntityFeaturePojo.index_, 1);
  201. fields.put(EntityFeaturePojo.communityId_, 1);
  202. DBCursor cur = entityFeatureDB.find(query, fields).limit(nLimit);
  203. // (this internally works in batches of 1000)
  204. if (automatedRequest) {
  205. System.out.println("Found " + cur.count() + " records to delete from _id list");
  206. }
  207. else {
  208. System.out.println("Found " + cur.count() + " records to delete from " + query.toString());
  209. }
  210. if (nLimit > 0) {
  211. System.out.println("(limited to " + nLimit + " records)");
  212. }
  213. int nArraySize = (cur.count() > 1000) ? 1000 : cur.count();
  214. ArrayList<EntityFeaturePojo> batchList = new ArrayList<EntityFeaturePojo>(nArraySize);
  215. while (cur.hasNext())
  216. {
  217. EntityFeaturePojo gp = EntityFeaturePojo.fromDb(cur.next(),EntityFeaturePojo.class);
  218. batchList.add(gp);
  219. if (batchList.size() >= nArraySize) {
  220. internalDelete(batchList, elasticManager);
  221. batchList.clear();
  222. }
  223. }
  224. if (!batchList.isEmpty()) {
  225. internalDelete(batchList, elasticManager);
  226. }
  227. entityFeatureDB.remove(query);
  228. } catch (NumberFormatException e) {
  229. e.printStackTrace();
  230. } catch (MongoException e) {
  231. e.printStackTrace();
  232. }
  233. finally
  234. {
  235. }
  236. }//TESTED
  237. // Batch delete
  238. private static void internalDelete(List<EntityFeaturePojo> entitiesToDelete, ElasticSearchManager esMgr) {
  239. List<String> esids = new ArrayList<String>(entitiesToDelete.size());
  240. for (EntityFeaturePojo gp: entitiesToDelete) {
  241. esids.add(new StringBuffer(gp.getIndex()).append(':').append(gp.getCommunityId().toString()).toString());
  242. }
  243. esMgr.bulkDeleteDocuments(esids);
  244. }//TESTED
  245. //___________________________________________________________________________________________________
  246. // TEST CODE
  247. @SuppressWarnings("unused")
  248. private void doUnitTestCode(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
  249. BasicDBObject query, int nLimit)
  250. {
  251. Mongo mongoDB = null;
  252. ElasticSearchManager elasticManager = null;
  253. try {
  254. // Initialize the DB:
  255. mongoDB = new Mongo(sMongoDbHost, Integer.parseInt(sMongoDbPort));
  256. DBCollection gazDB = mongoDB.getDB("feature").getCollection("entity");
  257. // Initialize the ES (create the index if it doesn't already):
  258. // 1. Set-up the entity feature index
  259. String indexName = "entity_index";
  260. //TEST: delete the index:
  261. // elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
  262. // elasticManager.deleteMe();
  263. //TEST: create the index
  264. // String sMapping = new Gson().toJson(new GazateerPojo.Mapping(), GazateerPojo.Mapping.class);
  265. // Builder localSettings = ImmutableSettings.settingsBuilder();
  266. // localSettings.put("number_of_shards", 1).put("number_of_replicas", 0); q
  267. // elasticManager = ElasticSearchManager.createIndex
  268. // (indexName, false,
  269. // sElasticHost + ":" + sElasticPort,
  270. // sMapping, localSettings);
  271. //TEST: delete the index:
  272. // elasticManager.deleteMe();
  273. //TEST: get the index:
  274. // elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
  275. // Now query the DB:
  276. DBCursor dbc = null;
  277. if (nLimit > 0) {
  278. dbc = gazDB.find(query).limit(nLimit);
  279. }
  280. else { // Everything!
  281. dbc = gazDB.find(query);
  282. }
  283. Type listType = new TypeToken<ArrayList<EntityFeaturePojo>>() {}.getType();
  284. List<EntityFeaturePojo> entities = new Gson().fromJson(dbc.toArray().toString(), listType);
  285. //Debug:
  286. List<String> entIds = new LinkedList<String>();
  287. // Loop over array and invoke the cleansing function for each one
  288. for (EntityFeaturePojo ent: entities) {
  289. if (null != ent.getAlias()) { // (some corrupt gazateer entry)
  290. //Debug:
  291. //System.out.println("entity=" + ent.getGazateerIndex());
  292. //System.out.println("aliases=" + Arrays.toString(ent.getAlias().toArray()));
  293. // Insert into the elasticsearch index
  294. //Debug:
  295. //System.out.println(new Gson().toJson(ent, GazateerPojo.class));
  296. // Handle groups (system group is: "4c927585d591d31d7b37097a")
  297. if (null == ent.getCommunityId()) {
  298. ent.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
  299. }
  300. //TEST: index documemt
  301. // ent.synchronizeWithIndex();
  302. // boolean b = elasticManager.addDocument(ent, ent.getGazateerIndex(), true);
  303. //TEST: remove document
  304. //b = elasticManager.removeDocument(ent.getGazateerIndex());
  305. //TEST: (part of get, bulk add/delete)
  306. entIds.add(ent.getIndex());
  307. // Debug:
  308. // if (!b) {
  309. // System.out.println("Didn't add " + ent.getGazateerIndex());
  310. // }
  311. }
  312. } // End loop over entities
  313. //TEST: bulk delete
  314. //elasticManager.bulkAddDocuments(entities, "index", null);
  315. //elasticManager.bulkDeleteDocuments(entIds);
  316. //TEST: get document
  317. // elasticManager.getRawClient().admin().indices().refresh(Requests.refreshRequest(indexName)).actionGet();
  318. // for (String id: entIds) {
  319. // Map<String, GetField> results = elasticManager.getDocument(id,"doccount", "disambiguated_name");
  320. // System.out.println(id + ": " + results.get("doccount").values().get(0) + " , " + results.get("disambiguated_name").values().get(0));
  321. // }
  322. //TEST: search
  323. // elasticManager.getRawClient().admin().indices().refresh(Requests.refreshRequest(indexName)).actionGet();
  324. // SearchRequestBuilder searchOptions = elasticManager.getSearchOptions();
  325. // XContentQueryBuilder queryObj = QueryBuilders.matchAllQuery();
  326. // searchOptions.addSort("doccount", SortOrder.DESC);
  327. // searchOptions.addFields("doccount", "type");
  328. // SearchResponse rsp = elasticManager.doQuery(queryObj, searchOptions);
  329. // SearchHit[] docs = rsp.getHits().getHits();
  330. // for (SearchHit hit: docs) {
  331. // String id = hit.getId();
  332. // Long doccount = (Long) hit.field("doccount").value();
  333. // String type = (String) hit.field("type").value();
  334. // System.out.println(id + ": " + doccount + ", " + type);
  335. // }
  336. } catch (NumberFormatException e) {
  337. e.printStackTrace();
  338. } catch (UnknownHostException e) {
  339. e.printStackTrace();
  340. } catch (MongoException e) {
  341. e.printStackTrace();
  342. }
  343. finally {
  344. if (null != mongoDB) {
  345. mongoDB.close();
  346. }
  347. if (null != elasticManager) {
  348. //NB not sure when exactly to call this - probably can just not bother?
  349. //elasticManager.getRawClient().close();
  350. }
  351. }
  352. }
  353. }