
/utility/infinit.e.mongo-indexer/src/com/ikanow/infinit/e/utility/MongoDocumentTxfer.java

https://github.com/IKANOW/Infinit.e
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.utility;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.bson.BSONObject;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;

import com.google.gson.Gson;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
import com.ikanow.infinit.e.processing.generic.aggregation.AssociationBackgroundAggregationManager;
import com.ikanow.infinit.e.processing.generic.aggregation.EntityBackgroundAggregationManager;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
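
/**
 * Utility for keeping the Elasticsearch document index in sync with the MongoDB document store:
 * transfers the documents matching a query into the index (optionally re-running entity and
 * association aggregation), rebuilds or verifies per-community indexes, and can instead delete
 * the matching documents while decrementing the community and source document counts.
 */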
public class MongoDocumentTxfer {

    //___________________________________________________________________________________________________

    // MAIN

    /**
     * @param sConfigPath location of the Infinit.e configuration (null to use the default)
     * @param sQuery JSON query selecting the documents to process (null or "{}" means all documents)
     * @param bDelete delete the matching documents instead of transferring them
     * @param bRebuildIndex rebuild the document index (up front if no query, else on the fly)
     * @param bVerifyIndex verify the index before doing anything else
     * @param bUpdateFeatures also update entity and association features via aggregation
     * @param nSkip number of matching documents to skip
     * @param nLimit maximum number of documents to process (0 == no limit)
     * @param chunksDescription if non-null, process the collection chunk by chunk
     * @throws MongoException
     * @throws NumberFormatException
     * @throws IOException
     */
    public static void main(String sConfigPath, String sQuery, boolean bDelete, boolean bRebuildIndex, boolean bVerifyIndex, boolean bUpdateFeatures, int nSkip, int nLimit, String chunksDescription) throws NumberFormatException, MongoException, IOException {

        // Command line processing
        com.ikanow.infinit.e.data_model.Globals.setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
        if (null != sConfigPath) {
            com.ikanow.infinit.e.data_model.Globals.overrideConfigLocation(sConfigPath);
        }
        boolean bRebuildIndexOnFly = false;
        if (bRebuildIndex && ((null == sQuery) || sQuery.equals("{}"))) { // (else will do them 1-by-1)
            new GenericProcessingController().InitializeIndex(true, false, false);
        }
        else {
            // Have seen odd transport timeouts on occasion: this should ensure they never happen
            new GenericProcessingController().InitializeIndex(false, false, false, bVerifyIndex);
                // (don't delete anything, but do recalc)

            if (bRebuildIndex) {
                bRebuildIndexOnFly = true;
            }
        }
        if (bVerifyIndex && (0 == nLimit) && (null == sQuery)) {
            // Index verification with nothing else to do
            return;
        }

        MongoDocumentTxfer txferManager = new MongoDocumentTxfer(bRebuildIndexOnFly);

        BasicDBObject query = null;
        if (null == sQuery) {
            query = new BasicDBObject();
        }
        else {
            query = (BasicDBObject) com.mongodb.util.JSON.parse(sQuery);
        }
        if (!bDelete) {
            if (null != chunksDescription) {
                txferManager.doChunkedTransfer(query, nSkip, nLimit, bUpdateFeatures, chunksDescription);
            }
            else {
                txferManager.doTransfer(query, nSkip, nLimit, bUpdateFeatures, null);
            }
        }
        else {
            txferManager.doDelete(query, nLimit);
        }
    }

    public MongoDocumentTxfer(boolean bRebuildIndexOnFly) {
        if (bRebuildIndexOnFly) {
            _deletedIndex = new TreeSet<String>();
            _deletedIndex.add(DocumentPojoIndexMap.manyGeoDocumentIndex_); // (don't ever delete this on the fly, it contains docs matching other queries)
        }
    }

    //___________________________________________________________________________________________________
    // Wrapper for doing transfer in chunks:
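    // (Each chunk from MongoIndexerUtils.getChunks is applied as $min/$max bounds on the
    //  doc_metadata.metadata cursor, so the collection is walked one chunk at a time and
    //  a failure only aborts the current chunk.)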
    private void doChunkedTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, String chunksDescription) throws IOException
    {
        List<BasicDBObject> chunkList = MongoIndexerUtils.getChunks("doc_metadata.metadata", chunksDescription);
        System.out.println("CHUNKS: Found " + chunkList.size() + " chunks");

        //DEBUG
        //System.out.println("Chunklist= " + chunkList);

        for (BasicDBObject chunk: chunkList) {
            BasicDBObject cleanQuery = new BasicDBObject();
            cleanQuery.putAll((BSONObject) query);
            String id = null;
            try {
                id = (String) chunk.remove("$id");
                System.out.println("CHUNK: " + id);
                doTransfer(cleanQuery, 0, 0, bAggregate, chunk);
            }
            catch (Exception e) {
                System.out.println("FAILED CHUNK: " + id + " ... " + e.getMessage());
            }
        }
    }//TESTED
    //___________________________________________________________________________________________________

    // PROCESSING LOOP (new interface)

    private Map<String, SourcePojo> _sourceCache = new HashMap<String, SourcePojo>();
    private TreeSet<String> _deletedIndex = null;
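
    /**
     * Transfers the documents matching "query" (optionally restricted to one chunk) into the
     * search index: re-reads the gzipped full text from the content collection where needed,
     * re-applies (tidied) source tags, and, if bAggregate is set, batches the docs per community
     * and re-runs entity/association aggregation every 10,000 documents.
     */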
    private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk) throws IOException
    {
        PropertiesManager pm = new PropertiesManager();
        int nMaxContentSize_bytes = pm.getMaxContentSize();

        // Initialize the DB:
        DBCollection docsDB = DbManager.getDocument().getMetadata();
        DBCollection contentDB = DbManager.getDocument().getContent();
        DBCollection sourcesDB = DbManager.getIngest().getSource();

        ElasticSearchManager.setDefaultClusterName("infinite-aws");

        // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)
        // (Ignore soft-deleted records:)
        if (null == query) {
            query = new BasicDBObject();
        }
        // Optimize communityId into sourceKeys...
        if ((null != query.get(DocumentPojo.communityId_)) && (null == query.get(DocumentPojo.sourceKey_)))
        {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId));
                String[] sourceKeys = new String[dbc.count()];
                int added = 0;
                for (DBObject dbo: dbc) {
                    sourceKeys[added++] = (String) dbo.get(SourcePojo.key_);
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                System.out.println("(Optimized simple community query to " + sourceKeys.length + " source key(s))");
            }
            catch (Exception e) {
                System.out.println("(Can't optimize complex community query)");
            }
        }
        // Ignore deleted objects
        Object sourceKeyQuery = query.get(DocumentPojo.sourceKey_);
        if (null == sourceKeyQuery) {
            query.put(DocumentPojo.sourceKey_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
        }//TESTED
        else if (sourceKeyQuery instanceof BasicDBObject) {
            ((BasicDBObject) sourceKeyQuery).append("$regex", "^[^?]");
        }//TESTED

        //DEBUG
        //System.out.println("COMBINED QUERY= " + query.toString());

        // If aggregating, kick off the background aggregation thread
        if (bAggregate) {
            EntityBackgroundAggregationManager.startThread();
            AssociationBackgroundAggregationManager.startThread();
        }

        //Debug:
        DBCursor dbc = null;
        dbc = docsDB.find(query);
        if (null != chunk) {
            if (chunk.containsField(DbManager.min_)) {
                dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
            }
            if (chunk.containsField(DbManager.max_)) {
                dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
            }
        }
        dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
        if (null == chunk) {
            int nCount = dbc.count() - nSkip;
            if (nCount < 0) nCount = 0;
            System.out.println("Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
            if (0 == nCount) { // Nothing to do...
                return;
            }
        }
        byte[] storageArray = new byte[200000];

        int nSynced = 0;
        LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
        Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
        ObjectId currCommunityId = null;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
            String sDocIndex = doc.getIndex();
            if (null == sDocIndex) {
                sDocIndex = "document_index";
            }
            if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
                _deletedIndex.add(sDocIndex);
                rebuildIndex(sDocIndex);
                try { // (Just in case the index requires some time to sort itself out)
                    Thread.sleep(1000);
                } catch (InterruptedException e) {}
            }

            //Debug:
            //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

            // Get the content:
            if ((0 != nMaxContentSize_bytes) && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl()))
            {
                BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
                BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields);
                if (null != dboContent) {
                    byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                    ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                    GZIPInputStream gzip = new GZIPInputStream(in);
                    int nRead = 0;
                    StringBuffer output = new StringBuffer();
                    while (nRead >= 0) {
                        nRead = gzip.read(storageArray, 0, 200000);
                        if (nRead > 0) {
                            String s = new String(storageArray, 0, nRead, "UTF-8");
                            output.append(s);
                        }
                    }
                    doc.setFullText(output.toString());
                }
            }
            // (else document has full text already)

            // Get tags, if necessary:
            // Always overwrite tags - one of the reasons we might choose to migrate
            // Also may need source in order to support source index filtering
            SourcePojo src = _sourceCache.get(doc.getSourceKey());
            if (null == src) {
                //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
                BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
                if (null != srcDbo) {
                    src = SourcePojo.fromDb(srcDbo, SourcePojo.class);
                    _sourceCache.put(doc.getSourceKey(), src);
                    doc.setTempSource(src); // (needed for source index filtering)
                }
            }
            if (null != src) {
                if (null != src.getTags()) {
                    Set<String> tagsTidied = new TreeSet<String>();
                    for (String s: src.getTags()) {
                        String ss = s.trim().toLowerCase();
                        tagsTidied.add(ss);
                    }
                    // May also want to write this back to the DB:
                    //TODO (INF-2223): Handle append tags or not in the pipeline...
                    if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                        if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                            BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
                            updateQuery.put(DocumentPojo._id_, doc.getId());
                            docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(
                                    DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
                        }
                        doc.setTags(tagsTidied); // (just copy ptr across)
                    }
                }
            }
            // 2. Update the index with the new document
            // (Optionally also update entity and assoc features)

            if (bAggregate) {
                if (null == currCommunityId) {
                    currCommunityId = doc.getCommunityId();
                }
                else if (!currCommunityId.equals(doc.getCommunityId())) {
                    LinkedList<DocumentPojo> perCommunityDocList = null;
                    if (null == communityList) { // (very first time we see > 1 community)
                        communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                        perCommunityDocList = new LinkedList<DocumentPojo>();
                        perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
                        communityList.put(currCommunityId, perCommunityDocList);
                    }
                    currCommunityId = doc.getCommunityId();
                    perCommunityDocList = communityList.get(currCommunityId);
                    if (null == perCommunityDocList) {
                        perCommunityDocList = new LinkedList<DocumentPojo>();
                        communityList.put(currCommunityId, perCommunityDocList);
                    }
                    perCommunityDocList.add(doc);
                }
            }//TESTED

            nSynced++;
            docsToTransfer.add(doc);
            if (0 == (nSynced % 10000)) {
                StoreAndIndexManager manager = new StoreAndIndexManager();

                if (bAggregate) {
                    // Loop over communities and aggregate each one then store the modified entities/assocs
                    doAggregation(communityList, docsToTransfer);
                    communityList = null; // (in case the next 10,000 docs are all in the same community!)
                    currCommunityId = null;
                }//TOTEST

                manager.addToSearch(docsToTransfer);
                docsToTransfer.clear();

                System.out.println("(Synced " + nSynced + " records)");
            }

        } // (End loop over docs)
        // Sync remaining docs

        if (!docsToTransfer.isEmpty()) {
            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs
                doAggregation(communityList, docsToTransfer);
            }
            StoreAndIndexManager manager = new StoreAndIndexManager();
            manager.addToSearch(docsToTransfer);
        }

        if (null != chunk) {
            System.out.println("Found " + nSynced + " records to sync in chunk");
        }

        if (bAggregate) {
            System.out.println("Completed. You can hit CTRL+C at any time.");
            System.out.println("By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
            try {
                Thread.sleep(300000);
            } catch (InterruptedException e) {}

            // Turn off so we can exit
            EntityBackgroundAggregationManager.stopThreadAndWait();
            AssociationBackgroundAggregationManager.stopThreadAndWait();
        }
    }

    //___________________________________________________________________________________________________
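
    /**
     * Runs entity/association aggregation over the supplied documents (per community when the
     * batch spans more than one community, otherwise over the single combined list), then writes
     * the resulting entities/associations back into each document's metadata record.
     */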
    private void doAggregation(Map<ObjectId, LinkedList<DocumentPojo>> communityList, LinkedList<DocumentPojo> singleList) {
        if (null == communityList) { // (just one community, this one is easy)
            AggregationManager aggManager = new AggregationManager();
            aggManager.doAggregation(singleList, new LinkedList<DocumentPojo>());
            aggManager.createOrUpdateFeatureEntries();
            aggManager.applyAggregationToDocs(singleList);
            aggManager.runScheduledDocumentUpdates();
            aggManager.runScheduledSynchronization();
        }
        else {
            for (Map.Entry<ObjectId, LinkedList<DocumentPojo>> entry: communityList.entrySet()) {
                AggregationManager aggManager = new AggregationManager();
                aggManager.doAggregation(entry.getValue(), new LinkedList<DocumentPojo>());
                aggManager.createOrUpdateFeatureEntries();
                aggManager.applyAggregationToDocs(entry.getValue());
                aggManager.runScheduledDocumentUpdates();
                aggManager.runScheduledSynchronization();
            }
        }//TESTED

        // Finally, need to update all the docs (ick)
        DocumentPojo dummy = new DocumentPojo();
        for (DocumentPojo doc: singleList) {
            boolean bEnts = (null != doc.getEntities()) && !doc.getEntities().isEmpty();
            boolean bAssocs = (null != doc.getAssociations()) && !doc.getAssociations().isEmpty();

            if (bEnts || bAssocs) {
                dummy.setEntities(doc.getEntities());
                dummy.setAssociations(doc.getAssociations());
                DBObject toWrite = dummy.toDb();
                BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
                updateQuery.put(DocumentPojo._id_, doc.getId());
                MongoDbManager.getDocument().getMetadata().update(updateQuery, new BasicDBObject(MongoDbManager.set_, toWrite));
            }//TESTED

        }// (end loop over docs)
    }//TESTED
    //___________________________________________________________________________________________________

    // Utility function for the above, rebuilds an index

    private void rebuildIndex(String indexName) {

        if (indexName.startsWith("doc_")) { // Else not eligible...
            try {
                ObjectId communityId = new ObjectId(indexName.substring(4));
                GenericProcessingController.recreateCommunityDocIndex_unknownFields(communityId, true);
            }
            catch (Exception e) { // I guess this wasn't a valid community?!
                e.printStackTrace();
            }
        }
    }
    //TESTED (by hand, it's a straight call of tested GPC code anyway)

    //___________________________________________________________________________________________________
    // DELETE DOCUMENTS FROM A QUERY
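
    /**
     * Removes the documents matching the query from the data store and search index (via
     * StoreAndIndexManager), updates entity features based on the deleted documents, and
     * decrements the per-community and per-source document counts accordingly.
     */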
    private void doDelete(BasicDBObject query, int nLimit)
    {
        try {
            // Get the documents to delete
            BasicDBObject queryFields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
            queryFields.put(DocumentPojo.sourceUrl_, 1);
            queryFields.put(DocumentPojo.url_, 1);
            queryFields.put(DocumentPojo.communityId_, 1);
            queryFields.put(DocumentPojo.index_, 1);

            DBCursor cur = DbManager.getDocument().getMetadata().find(query, queryFields).limit(nLimit);
                // (this internally works in batches of 1000)

            System.out.println("Found " + cur.count() + " records to delete");
            if (nLimit > 0) {
                System.out.println("(limited to " + nLimit + " records)");
            }

            List<DocumentPojo> docs = DocumentPojo.listFromDb(cur, DocumentPojo.listType());

            // Keep track of number of docs per community getting deleted
            Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
            Map<String, Integer> sourceKeyMap = new HashMap<String, Integer>();
            for (DocumentPojo doc: docs) {
                if (null != doc.getSourceKey()) { // (can only happen by error, still)
                    ObjectId community = doc.getCommunityId();
                    Integer count = communityMap.get(community);
                    communityMap.put(community, (count == null ? 1 : count + 1));

                    int nSpecialFormat = doc.getSourceKey().indexOf('#');
                    String sourceKey = doc.getSourceKey();
                    if (nSpecialFormat > 0) {
                        sourceKey = sourceKey.substring(0, nSpecialFormat);
                    }
                    Integer count2 = sourceKeyMap.get(sourceKey);
                    sourceKeyMap.put(sourceKey, (count2 == null ? 1 : count2 + 1));
                }
            }

            StoreAndIndexManager dataStore = new StoreAndIndexManager();
            dataStore.removeFromDatastore_byURL(docs);
            AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
            dataStore.removeSoftDeletedDocuments();
            AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());

            // Actually update the DB counts:
            for (Map.Entry<ObjectId, Integer> communityInfo: communityMap.entrySet()) {
                System.out.println("Removed " + communityInfo.getValue() + " records from community " + communityInfo.getKey());
                DbManager.getDocument().getCounts().update(new BasicDBObject("_id", communityInfo.getKey()),
                        new BasicDBObject("$inc", new BasicDBObject("doccount", -communityInfo.getValue())));
            }
            for (Map.Entry<String, Integer> sourceInfo: sourceKeyMap.entrySet()) {
                System.out.println("Removed " + sourceInfo.getValue() + " records from source " + sourceInfo.getKey());
                DbManager.getIngest().getSource().update(new BasicDBObject("key", sourceInfo.getKey()),
                        new BasicDBObject("$inc", new BasicDBObject("harvest.doccount", -sourceInfo.getValue())));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    //___________________________________________________________________________________________________
    //___________________________________________________________________________________________________
    //___________________________________________________________________________________________________
    //___________________________________________________________________________________________________

    // UNIT/FUNCTIONAL/COVERAGE TEST CODE

    @SuppressWarnings("unused")
    private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
            BasicDBObject query, int nLimit)
    {
        ElasticSearchManager elasticManager = null;
        try {
            // Initialize the DB:
            DBCollection feedsDB = DbManager.getDocument().getMetadata();
            DBCollection contentDB = DbManager.getDocument().getContent();
            DBCollection sourcesDB = DbManager.getIngest().getSource();

            String indexName = "document_index";

            // Test/debug recreate the index
            if (true) {

                // (delete the index)
                System.out.println("Deleting index...");
                elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
                elasticManager.deleteMe();
                //(also deletes the child index - same index, different type)

                // Create the index if necessary
                String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class);
                Builder localSettings = ImmutableSettings.settingsBuilder();
                localSettings.put("number_of_shards", 10).put("number_of_replicas", 2);
                System.out.println("Creating index..." + sMapping);
                elasticManager = ElasticSearchManager.createIndex
                                    (indexName, null, false,
                                        sElasticHost + ":" + sElasticPort,
                                        sMapping, localSettings);
            }
            // Get the index (necessary if already created)
            if (null == elasticManager) {
                elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
            }

            // Get the feeds from the DB:

            //Debug:
            //System.out.println("Querying DB...");

            DBCursor dbc = feedsDB.find(query).limit(nLimit);
            byte[] storageArray = new byte[200000];
            while (dbc.hasNext()) {
                BasicDBObject dbo = (BasicDBObject) dbc.next();
                DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);

                //Debug:
                System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());

                // Get the content:
                BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
                if (null != dboContent) {
                    byte[] compressedData = ((byte[]) dboContent.get("gzip_content"));
                    ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                    GZIPInputStream gzip = new GZIPInputStream(in);
                    int nRead = gzip.read(storageArray, 0, 200000);
                    String s = new String(storageArray, 0, nRead, "UTF-8");
                    doc.setFullText(s);
                }
                // Get tag:
                SourcePojo src = _sourceCache.get(doc.getSourceKey());
                if (null == src) {
                    BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject("key", doc.getSourceKey()));
                    if (null != srcDbo) {
                        src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);
                        _sourceCache.put(doc.getSourceKey(), src);
                    }
                }
                if (null != src) {
                    Set<String> tagsTidied = new TreeSet<String>();
                    for (String s: src.getTags()) {
                        String ss = s.trim().toLowerCase();
                        tagsTidied.add(ss);
                    }
                    doc.setTags(tagsTidied);
                }

                //TEST: set dynamic field
                // Lots of testing of dynamic dates:
                //  feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
                //  String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated());
                //  feed.addToMetadata("another_dateISO", s1);
                //  String s1_5 = new SimpleDateFormat().format(feed.getCreated());
                //  feed.addToMetadata("another_dateTimeJava", s1_5);
                //  String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated());
                //  feed.addToMetadata("another_dateYYYYMMDD", s2);
                //  String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated());
                //  feed.addToMetadata("another_dateRFC822", s3);
                //  feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString());
                //  // Testing of the string field types
                //  feed.addToMetadata("my_comment", "Testing this ABCDEFG");
                //  feed.addToMetadata("my_term", "Testing this UVWXYZ");
                //  feed.addToMetadata("my_text", "Testing this 123456");
                //  // Test an array of longs:
                //  Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L;
                //  feed.addToMetadata("md_long", tl);

                //TEST: some dummy event timestamp adding code (not seeing much/any in the data)
                //  if (null != feed.getEvents()) {
                //      int i = 0;
                //      for (EventPojo evt: feed.getEvents()) {
                //          //1: Add single date
                //          if (0 == i) {
                //              evt.time_start = "2011-01-01";
                //          }
                //          //2: Add short span
                //          if (1 == i) {
                //              evt.time_start = "2010-04-06";
                //              evt.time_end = "2010-08-09";
                //          }
                //          //3: Add cross-yr span
                //          if (2 == i) {
                //              evt.time_start = "2012-06-05";
                //              evt.time_end = "2013-09-05";
                //          }
                //          //4: Add too long span
                //          if (3 == i) {
                //              evt.time_start = "2012-04-06";
                //              evt.time_end = "2014-04-09";
                //          }
                //          i++;
                //      }
                //  }
                // For event adding, see data_model.test.TestCode
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            //nothing to do
        }
    }
}