PageRenderTime 1037ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/core/infinit.e.core.server/src/com/ikanow/infinit/e/core/utils/SourceUtils.java

https://github.com/IKANOW/Infinit.e
Java | 863 lines | 573 code | 131 blank | 159 comment | 210 complexity | 18e5a2ec05c596c45d71e0a51d0954cc MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012, The Infinit.e Open Source Project.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.core.utils;
  17. import java.net.InetAddress;
  18. import java.util.Arrays;
  19. import java.util.Date;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.LinkedList;
  23. import java.util.List;
  24. import java.util.Map;
  25. import org.apache.log4j.Logger;
  26. import org.bson.types.ObjectId;
  27. import com.google.gson.reflect.TypeToken;
  28. import com.ikanow.infinit.e.data_model.InfiniteEnums;
  29. import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
  30. import com.ikanow.infinit.e.data_model.store.DbManager;
  31. import com.ikanow.infinit.e.data_model.store.MongoDbManager;
  32. import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
  33. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  34. import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
  35. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  36. import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
  37. import com.mongodb.BasicDBObject;
  38. import com.mongodb.DBCursor;
  39. public class SourceUtils {
  40. private static Logger logger = Logger.getLogger(SourceUtils.class);
  41. private static final long _ONEDAY = 24L*3600L*1000L;
  42. /////////////////////////////////////////////////////////////////////////////////////
  43. // Utilities common to both harvester and synchronization
  44. /////////////////////////////////////////////////////////////////////////////////////
  45. public static boolean checkDbSyncLock() {
  46. DBCursor dbc = DbManager.getFeature().getSyncLock().find();
  47. if (0 == dbc.count()) {
  48. return false; // working fine
  49. }
  50. Date now = new Date();
  51. while (dbc.hasNext()) {
  52. BasicDBObject sync_lock = (BasicDBObject) dbc.next();
  53. Object lastSyncObj = sync_lock.get("last_sync");
  54. if (null != lastSyncObj) {
  55. try {
  56. Date last_sync = (Date) lastSyncObj;
  57. if (last_sync.getTime() + _ONEDAY > now.getTime()) {
  58. return true; // (ie sync object exists and is < 1 day old)
  59. }
  60. }
  61. catch (Exception e) {
  62. // class cast, do nothing
  63. }
  64. }
  65. } // (end "loop over" 1 object in sync_lock DB)
  66. return false;
  67. }
  68. //TESTED (active lock, no lock, old lock)
  69. /////////////////////////////////////////////////////////////////////////////////////
  70. // Get all sources to be harvested (max 500 per cycle, in order of harvesting so nothing should get lost)
  71. public static LinkedList<SourcePojo> getSourcesToWorkOn(String sCommunityOverride, String sSourceId, boolean bSync, boolean bDistributed) {
  72. // Add harvest types to the DB
  73. com.ikanow.infinit.e.harvest.utils.PropertiesManager props = new com.ikanow.infinit.e.harvest.utils.PropertiesManager();
  74. int nMaxSources = 1000;
  75. if (!bSync) {
  76. nMaxSources = props.getMaxSourcesPerHarvest(); // (normally 0 == no limit)
  77. }
  78. String sTypes = props.getHarvesterTypes();
  79. String sType[] = sTypes.split("\\s*,\\s*");
  80. String sTypeCase[] = new String[sType.length*2];
  81. for (int i = 0; i < sType.length; i++) {
  82. String s = sType[i];
  83. sTypeCase[2*i] = s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();
  84. sTypeCase[2*i + 1] = s.toLowerCase();
  85. }
  86. BasicDBObject harvestTypes = new BasicDBObject(MongoDbManager.in_, sTypeCase);
  87. LinkedList<SourcePojo> sources = null;
  88. try
  89. {
  90. BasicDBObject query = null;
  91. BasicDBObject adminUpdateQuery = new BasicDBObject();
  92. if (bDistributed) {
  93. Date now = new Date();
  94. query = generateNotInProgressClause(now);
  95. // (just don't waste time on things currently being harvested)
  96. // Also need to ignore any sources that have just been synced by a different node...
  97. if (bSync) {
  98. Date recentlySynced = new Date(now.getTime() - 1800*1000); //(ie not synced within 1/2 hour)
  99. query.put(SourceHarvestStatusPojo.sourceQuery_synced_, new BasicDBObject(MongoDbManager.lt_, recentlySynced));
  100. // (will know synced exists because we set it below - the sort doesn't work without its being set for all records)
  101. }
  102. else if (null == sSourceId) { // for harvest, try to take into account the effect of search cycles
  103. // (if manually setting the source then ignore this obviously...)
  104. addSearchCycleClause(query, now);
  105. }
  106. }
  107. else {
  108. query = new BasicDBObject();
  109. }
  110. if (null == sSourceId) {
  111. query.put(SourcePojo.isApproved_, true);
  112. }
  113. if (!bSync && (null == sSourceId)) {
  114. query.put(SourcePojo.harvestBadSource_, new BasicDBObject(MongoDbManager.ne_, true)); // (ie true or doesn't exist)
  115. // (still sync bad sources)
  116. }
  117. query.put(SourcePojo.extractType_, harvestTypes);
  118. if (null != sCommunityOverride) {
  119. query.put(SourcePojo.communityIds_, new ObjectId(sCommunityOverride));
  120. adminUpdateQuery.put(SourcePojo.communityIds_, new ObjectId(sCommunityOverride));
  121. }
  122. else if (null != sSourceId) {
  123. try {
  124. query.put(SourcePojo._id_, new ObjectId(sSourceId));
  125. adminUpdateQuery.put(SourcePojo._id_, new ObjectId(sSourceId));
  126. }
  127. catch (Exception e) { // Allow either _id or key to be used as the id...
  128. query.put(SourcePojo.key_, sSourceId);
  129. adminUpdateQuery.put(SourcePojo.key_, sSourceId);
  130. }
  131. }
  132. BasicDBObject orderBy = new BasicDBObject();
  133. if (bSync) {
  134. orderBy.put(SourceHarvestStatusPojo.sourceQuery_synced_, 1);
  135. }
  136. else {
  137. orderBy.put(SourceHarvestStatusPojo.sourceQuery_harvested_, 1);
  138. }
  139. //(note although there's a complex query preceding this, it should be using the above index
  140. // anyway so there should be some benefit to this)
  141. BasicDBObject fields = new BasicDBObject();
  142. if (bDistributed) {
  143. // Mainly just _id and extractType, we'll get these for debugging
  144. fields.put(SourcePojo._id_, 1);
  145. fields.put(SourcePojo.extractType_, 1);
  146. fields.put(SourcePojo.key_, 1);
  147. fields.put(SourceHarvestStatusPojo.sourceQuery_harvested_, 1);
  148. fields.put(SourceHarvestStatusPojo.sourceQuery_synced_, 1);
  149. fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1);
  150. if (null != sSourceId) {
  151. //put a random field in just so we know it's a source override:
  152. fields.put(SourcePojo.ownerId_, 1);
  153. //(plus don't add searchCycle, we're just going to ignore it anyway)
  154. }//TESTED
  155. else {
  156. fields.put(SourcePojo.searchCycle_secs_, 1);
  157. }//TESTED
  158. // (need these for distributed logic)
  159. fields.put(SourcePojo.distributionFactor_, 1);
  160. fields.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_, 1);
  161. }
  162. // (first off, set the harvest/sync date for any sources that don't have it set,
  163. // needed because sort doesn't return records without the sorting field)
  164. Date yesteryear = new Date(new Date().getTime() - 365L*_ONEDAY);
  165. // (NOTE this time being >=1 yr is depended upon by applications, so you don't get to change it. Ever)
  166. if (bSync) {
  167. adminUpdateQuery.put(SourceHarvestStatusPojo.sourceQuery_synced_, new BasicDBObject(MongoDbManager.exists_, false));
  168. DbManager.getIngest().getSource().update(adminUpdateQuery,
  169. new BasicDBObject(MongoDbManager.set_, new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, yesteryear)), false, true);
  170. }
  171. else {
  172. adminUpdateQuery.put(SourceHarvestStatusPojo.sourceQuery_harvested_, new BasicDBObject(MongoDbManager.exists_, false));
  173. DbManager.getIngest().getSource().update(adminUpdateQuery,
  174. new BasicDBObject(MongoDbManager.set_, new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, yesteryear)), false, true);
  175. }
  176. // (then perform query)
  177. DBCursor cur = DbManager.getIngest().getSource().find(query, fields).sort(orderBy).limit(nMaxSources);
  178. sources = SourcePojo.listFromDb(cur, new TypeToken<LinkedList<SourcePojo>>(){});
  179. }
  180. catch (Exception e)
  181. {
  182. logger.error("Exception Message getting sources for sync: " + e.getMessage(), e);
  183. }
  184. return sources;
  185. }//TESTED (mostly unchanged from "tested" Beta version - few changes for distribution tested by eye)
  186. /////////////////////////////////////////////////////////////////////////////////////
  187. // Share sources to be harvested across all running harvesters
  188. public static LinkedList<SourcePojo> getDistributedSourceList(LinkedList<SourcePojo> uncheckedSources, String sSourceType, boolean bSync)
  189. {
  190. Date now = new Date();
  191. LinkedList<SourcePojo> nextSetToProcess = new LinkedList<SourcePojo>();
  192. // Some additional distributed logic
  193. LinkedList<SourcePojo> putMeBackAtTheStart_distributed = null;
  194. PropertiesManager pm = new PropertiesManager();
  195. int nBatchSize = pm.getDistributionBatchSize(bSync);
  196. Long defaultSearchCycle_ms = pm.getMinimumHarvestTimePerSourceMs();
  197. // The logic for getting the next set of sources is:
  198. // 2] Get the oldest 20 sources that are:
  199. // 2.1] In progress and >a day old (assume the harvester/sync running them crashed)
  200. // 2.2] Not in progress and have either never been harvested or synced, or in age of how long ago
  201. for (int nNumSourcesGot = 0; (nNumSourcesGot < nBatchSize) && (!uncheckedSources.isEmpty()); ) {
  202. BasicDBObject query = generateNotInProgressClause(now);
  203. SourcePojo candidate = null;
  204. synchronized (SourceUtils.class) { // (can be called across multiple threads)
  205. candidate = uncheckedSources.pop();
  206. }
  207. //DEBUG
  208. //System.out.println(" CANDIDATE=" + candidate.getKey() + " ..." + candidate.getId());
  209. if ((null != sSourceType) && !candidate.getExtractType().equalsIgnoreCase(sSourceType)) {
  210. continue;
  211. }
  212. HarvestEnum candidateStatus = null;
  213. if (null != candidate.getHarvestStatus()) {
  214. candidateStatus = candidate.getHarvestStatus().getHarvest_status();
  215. }
  216. if (bSync && (null == candidateStatus)) { // Don't sync unharvested sources, obviously!
  217. continue;
  218. }
  219. //(DISTRIBUTON LOGIC)
  220. // Checking whether to respect the searchCycle_secs for distributed sources is a bit more complex
  221. boolean isDistributed = (null != candidate.getDistributionFactor());
  222. boolean distributedInProcess = isDistributed &&
  223. candidate.reachedMaxDocs() || // (<- only set inside a process)
  224. ((null != candidate.getHarvestStatus()) && // (robustness)
  225. (null != candidate.getHarvestStatus().getDistributionTokensFree()) && // (else starting out)
  226. (candidate.getDistributionFactor() != candidate.getHarvestStatus().getDistributionTokensFree()));
  227. // (else this is the start)
  228. //(TESTED - local and distributed)
  229. //(END DISTRIBUTON LOGIC)
  230. if (((HarvestEnum.success_iteration != candidateStatus) && !distributedInProcess)
  231. ||
  232. ((null != candidate.getSearchCycle_secs()) && (candidate.getSearchCycle_secs() < 0)))
  233. {
  234. // (ie EITHER we're not iteration OR we're disabled)
  235. //(^^^ don't respect iteration status if source manually disabled)
  236. if ((null != candidate.getSearchCycle_secs()) || (null != defaultSearchCycle_ms)) {
  237. if (null == candidate.getSearchCycle_secs()) {
  238. candidate.setSearchCycle_secs((int)(defaultSearchCycle_ms/1000));
  239. }
  240. if (candidate.getSearchCycle_secs() < 0) {
  241. continue; // negative search cycle => disabled
  242. }
  243. if ((null != candidate.getHarvestStatus()) && (null != candidate.getHarvestStatus().getHarvested())) {
  244. //(ie the source has been harvested, and there is a non-default search cycle setting)
  245. if ((candidate.getHarvestStatus().getHarvested().getTime() + 1000L*candidate.getSearchCycle_secs())
  246. > now.getTime())
  247. {
  248. if ((HarvestEnum.in_progress != candidateStatus) && (null != candidateStatus) && (null == candidate.getOwnerId()))
  249. {
  250. //(^^ last test, if it's in_progress then it died recently (or hasn't started) so go ahead and harvest anyway)
  251. // (also hacky use of getOwnerId just to see if this is a source override source or not)
  252. continue; // (too soon since the last harvest...)
  253. }//TESTED (including hacky use of ownerId)
  254. }
  255. }
  256. }//TESTED
  257. }
  258. //TESTED: manually disabled (ignore), not success_iteration (ignore if outside cycle), success_iteration (always process)
  259. query.put(SourcePojo._id_, candidate.getId());
  260. BasicDBObject modifyClause = new BasicDBObject();
  261. modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.in_progress.toString());
  262. if (bSync) {
  263. modifyClause.put(SourceHarvestStatusPojo.sourceQuery_synced_, now);
  264. }
  265. else {
  266. modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvested_, now);
  267. }
  268. modifyClause.put(SourceHarvestStatusPojo.sourceQuery_lastHarvestedBy_, getHostname());
  269. BasicDBObject modify = new BasicDBObject(MongoDbManager.set_, modifyClause);
  270. try {
  271. BasicDBObject dbo = (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query, modify);
  272. if (null != dbo) {
  273. SourcePojo fullSource = SourcePojo.fromDb(dbo, SourcePojo.class);
  274. nextSetToProcess.add(fullSource);
  275. nNumSourcesGot++;
  276. ////////////////////////////////////////////////////////////////////////
  277. //
  278. // DISTRIBUTION LOGIC:
  279. // If distributionFactor set then grab one token and set state back to
  280. // success_iteration, to allow other threads/processes to grab me
  281. if ((null != fullSource.getDistributionFactor()) && !bSync)
  282. {
  283. // Get the current distribution token
  284. int distributionToken = 0;
  285. boolean bReset = false;
  286. if ((null == fullSource.getHarvestStatus()) || (null == fullSource.getHarvestStatus().getDistributionTokensFree())) {
  287. distributionToken = fullSource.getDistributionFactor();
  288. // (also set up some parameters so don't need to worry about null checks later)
  289. if (null == fullSource.getHarvestStatus()) {
  290. fullSource.setHarvestStatus(new SourceHarvestStatusPojo());
  291. }
  292. fullSource.getHarvestStatus().setDistributionTokensFree(distributionToken);
  293. fullSource.getHarvestStatus().setDistributionTokensComplete(0);
  294. }
  295. else {
  296. distributionToken = fullSource.getHarvestStatus().getDistributionTokensFree();
  297. //Check last harvested time to ensure this isn't an old state (reset if so)
  298. if ((distributionToken != fullSource.getDistributionFactor()) ||
  299. (0 != fullSource.getHarvestStatus().getDistributionTokensComplete()))
  300. {
  301. if (null != fullSource.getHarvestStatus().getRealHarvested()) { // harvested is useless here because it's already been updated
  302. if ((new Date().getTime() - fullSource.getHarvestStatus().getRealHarvested().getTime()) >
  303. _ONEDAY) // (ie older than a day)
  304. {
  305. distributionToken = fullSource.getDistributionFactor(); // ie start again
  306. }
  307. }
  308. }//TESTED
  309. }//(end check for any existing state)
  310. if (distributionToken == fullSource.getDistributionFactor()) {
  311. bReset = true; // (first time through, might as well go ahead and reset to ensure all the vars are present)
  312. }
  313. // If in error then just want to grab all remaining tokens and reset the status
  314. if (HarvestEnum.error == fullSource.getHarvestStatus().getHarvest_status()) { // currently an error
  315. if (distributionToken != fullSource.getDistributionFactor()) { // In the middle, ie just errored
  316. fullSource.setDistributionTokens(new HashSet<Integer>());
  317. while (distributionToken > 0) {
  318. distributionToken--;
  319. fullSource.getDistributionTokens().add(distributionToken);
  320. }
  321. BasicDBObject dummy = new BasicDBObject();
  322. bReset = updateHarvestDistributionState_tokenComplete(fullSource, HarvestEnum.error, dummy, dummy);
  323. // (then finish off completion down below)
  324. }
  325. }//TESTED (error mode, 2 cases: complete and incomplete)
  326. //DEBUG
  327. //System.out.println(" DIST_SOURCE=" + fullSource.getKey() + "/" + fullSource.getDistributionFactor() + ": " + distributionToken + ", " + bReset);
  328. //(note we'll see this even if searchCycle is set because the "source" var (which still has the old
  329. // state) is stuck back at the start of uncheckedList, so each harvester will see the source >1 time)
  330. if (0 != distributionToken) { // (else no available tokens for this cycle)
  331. distributionToken--;
  332. fullSource.setDistributionTokens(new HashSet<Integer>());
  333. fullSource.getDistributionTokens().add(distributionToken);
  334. // Remove one of the available tokens (they don't get reset until the source is complete)
  335. updateHarvestDistributionState_newToken(fullSource.getId(), distributionToken, HarvestEnum.success_iteration, bReset);
  336. // After this loop is complete, put back at the start of the unchecked list
  337. // so another thread can pick up more tokens:
  338. if (null == putMeBackAtTheStart_distributed) {
  339. putMeBackAtTheStart_distributed = new LinkedList<SourcePojo>();
  340. }
  341. putMeBackAtTheStart_distributed.add(candidate);
  342. // Before adding back to list, set a transient field to ensure it bypasses any search cycle checks
  343. // (for in process logic where we won't see the update status from the DB)
  344. candidate.setReachedMaxDocs();
  345. // Reset full source's status so we know if we started in success/error/success_iteration
  346. if (null == candidateStatus) {
  347. candidateStatus = HarvestEnum.success;
  348. }
  349. fullSource.getHarvestStatus().setHarvest_status(candidateStatus);
  350. } // (end if available tokens)
  351. else { // (don't process, just set back to original status)
  352. HarvestEnum harvestStatus = HarvestEnum.success;
  353. if (null != fullSource.getHarvestStatus()) {
  354. if (null != fullSource.getHarvestStatus().getHarvest_status()) {
  355. harvestStatus = fullSource.getHarvestStatus().getHarvest_status();
  356. }
  357. }
  358. if (bReset) { // resetting back to 10
  359. distributionToken = fullSource.getDistributionFactor();
  360. }
  361. updateHarvestDistributionState_newToken(fullSource.getId(), distributionToken, harvestStatus, bReset);
  362. // (bReset can be true in the error case handled above)
  363. nextSetToProcess.removeLast();
  364. nNumSourcesGot--;
  365. }//TESTED
  366. }//TESTED
  367. else if (bSync) {
  368. // Not allowed to sync "distributed in progress"
  369. if ((null != fullSource.getHarvestStatus()) || (null != fullSource.getHarvestStatus().getDistributionTokensFree())) {
  370. if (null == fullSource.getHarvestStatus().getHarvest_status()) { // (shouldn't ever happen)
  371. fullSource.getHarvestStatus().setHarvest_status(HarvestEnum.success_iteration);
  372. }
  373. if (fullSource.getHarvestStatus().getDistributionTokensFree() != fullSource.getDistributionFactor()) {
  374. updateHarvestDistributionState_newToken(fullSource.getId(), fullSource.getHarvestStatus().getDistributionTokensFree(), fullSource.getHarvestStatus().getHarvest_status(), false);
  375. nextSetToProcess.removeLast();
  376. nNumSourcesGot--;
  377. }
  378. }
  379. }//TESTED
  380. //
  381. //(end DISTRIBUTION LOGIC)
  382. ////////////////////////////////////////////////////////////////////////
  383. }//(end found source - note could have been gazumped by a different thread in the meantime, and that's fine)
  384. }
  385. catch (Exception e) {
  386. // Unset the in-progress clause for robustness
  387. modifyClause = new BasicDBObject();
  388. modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.error.toString());
  389. modify = new BasicDBObject(MongoDbManager.set_, modifyClause);
  390. DbManager.getIngest().getSource().update(query, modify);
  391. // This source has failed somehow, just carry on
  392. logger.error("Source " + candidate.getKey() + " has errored during distribution " + e.getMessage());
  393. e.printStackTrace();
  394. }
  395. } // (end loop over unchecked sources until we have >20)
  396. // Little bit more distribution logic:
  397. if (null != putMeBackAtTheStart_distributed) {
  398. synchronized (SourceUtils.class) { // (can be called across multiple threads)
  399. for (SourcePojo distSource: putMeBackAtTheStart_distributed) {
  400. uncheckedSources.addFirst(distSource);
  401. }
  402. }
  403. }//TESTED
  404. return nextSetToProcess;
  405. } //TESTED
  406. /////////////////////////////////////////////////////////////////////////////////////
  407. // Sub-utility function used by both the above functions
  408. private static BasicDBObject generateNotInProgressClause(Date date) {
  409. //24hrs ago
  410. Date oldDate = new Date(date.getTime() - _ONEDAY);
  411. // This query says: if the query isn't in progress [1] (or the harvest object doesn't exist [3,4]) ... or if it is but nothing's happened in 24 hours [2]
  412. BasicDBObject subclause1 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
  413. new BasicDBObject(MongoDbManager.ne_, HarvestEnum.in_progress.toString()));
  414. BasicDBObject subclause2 = new BasicDBObject();
  415. subclause2.put(SourceHarvestStatusPojo.sourceQuery_harvested_, new BasicDBObject(MongoDbManager.lt_, oldDate));
  416. // (always check for harvested, don't care if synced isn't happening regularly)
  417. BasicDBObject subclause3 = new BasicDBObject(SourcePojo.harvest_, new BasicDBObject(MongoDbManager.exists_, false));
  418. BasicDBObject subclause4 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
  419. new BasicDBObject(MongoDbManager.exists_, false));
  420. BasicDBObject clause = new BasicDBObject(MongoDbManager.or_, Arrays.asList(subclause1, subclause2, subclause3, subclause4));
  421. return clause;
  422. }//TESTED
  423. //(NOTE: IF RUN IN CONJUNCTION WITH "ABUSIVE" MAP/REDUCE WILL CAUSE DB HANG)
  424. private static void addSearchCycleClause(BasicDBObject currQuery, Date now) {
  425. BasicDBObject subclause1 = new BasicDBObject(SourcePojo.searchCycle_secs_, new BasicDBObject(MongoDbManager.exists_, false));
  426. StringBuffer js = new StringBuffer();
  427. js.append("(null == this.harvest) || ('success_iteration'== this.harvest.harvest_status) || (null == this.harvest.harvested) || (null == this.searchCycle_secs) || ((this.searchCycle_secs >= 0) && ((this.harvest.harvested.getTime() + 1000*this.searchCycle_secs) <= ");
  428. js.append(now.getTime());
  429. js.append("))");
  430. BasicDBObject subclause2 = new BasicDBObject(MongoDbManager.where_, js.toString());
  431. currQuery.append(MongoDbManager.or_, Arrays.asList(subclause1, subclause2));
  432. }//TESTED (by hand/eye)
  433. public static void checkSourcesHaveHashes(String sCommunityOverride, String sSourceDebug) {
  434. BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_, new BasicDBObject(MongoDbManager.exists_, false));
  435. if (null != sCommunityOverride) {
  436. query.put(SourcePojo.communityIds_, new ObjectId(sCommunityOverride));
  437. }
  438. if (null != sSourceDebug) {
  439. try {
  440. query.put(SourcePojo._id_, new ObjectId(sSourceDebug));
  441. }
  442. catch (Exception e) { // Allow key also
  443. query.put(SourcePojo.key_, sSourceDebug);
  444. }
  445. }
  446. DBCursor dbc = DbManager.getIngest().getSource().find(query);
  447. int nSrcFixCount = 0;
  448. while (dbc.hasNext()) {
  449. SourcePojo src = SourcePojo.fromDb(dbc.next(), SourcePojo.class);
  450. nSrcFixCount++;
  451. src.generateShah256Hash();
  452. DbManager.getIngest().getSource().update(new BasicDBObject(SourcePojo._id_, src.getId()),
  453. new BasicDBObject(MongoDbManager.set_, new BasicDBObject(SourcePojo.shah256Hash_, src.getShah256Hash())));
  454. }
  455. if (nSrcFixCount > 0) {
  456. logger.info("Core.Server: Fixed " + nSrcFixCount + " missing source hash(es)");
  457. }
  458. }//TESTED (by hand/eye)
  459. ////////////////////////////////////////////////////////////////////////////////////////////
  460. // Synchronization specific utilities
  461. // Updates "in_progress" to either "success" or "error"
  462. public static void updateSyncStatus(SourcePojo source, HarvestEnum harvestStatus) {
  463. BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
  464. BasicDBObject update = new BasicDBObject(MongoDbManager.set_, new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, harvestStatus.toString()));
  465. DbManager.getIngest().getSource().update(query, update);
  466. }
  467. ////////////////////////////////////////////////////////////////////////////////////////////
  468. // Harvest specific source utilities
  469. // Updates "in_progress" to either "success" or "error", increments the doccount (per source and per community)
  470. public static void updateHarvestStatus(SourcePojo source, HarvestEnum harvestStatus, List<DocumentPojo> added, long nDocsDeleted, String extraMessage) {
  471. // Handle successful harvests where the max docs were reached, so don't want to respect the searchCycle
  472. if ((harvestStatus == HarvestEnum.success) && (source.reachedMaxDocs())) {
  473. harvestStatus = HarvestEnum.success_iteration;
  474. }
  475. // Always update status object in order to release the "in_progress" lock
  476. // (make really really sure we don't exception out before doing this!)
  477. BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
  478. BasicDBObject setClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, harvestStatus.toString());
  479. if ((null != added) && !added.isEmpty()) {
  480. setClause.put(SourceHarvestStatusPojo.sourceQuery_extracted_, new Date());
  481. }
  482. if (null != extraMessage) {
  483. if ((null == source.getHarvestStatus()) || (null == source.getHarvestStatus().getHarvest_message())) {
  484. setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, extraMessage);
  485. }
  486. else {
  487. source.getHarvestStatus().setHarvest_message(source.getHarvestStatus().getHarvest_message() + "\n" + extraMessage);
  488. setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, source.getHarvestStatus().getHarvest_message());
  489. }
  490. }
  491. BasicDBObject update = new BasicDBObject(MongoDbManager.set_, setClause);
  492. int docsAdded = 0;
  493. if (null != added) {
  494. docsAdded = added.size();
  495. }
  496. BasicDBObject incClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, docsAdded - nDocsDeleted);
  497. update.put(MongoDbManager.inc_, incClause);
  498. if (null != source.getDistributionTokens()) { // Distribution logic (specified and also enabled - eg ignore Feed/DB)
  499. updateHarvestDistributionState_tokenComplete(source, harvestStatus, incClause, setClause);
  500. }
  501. if (setClause.isEmpty()) { // (ie got removed by the distribution logic above)
  502. update.remove(MongoDbManager.set_);
  503. }//TESTED
  504. long nTotalDocsAfterInsert = 0;
  505. BasicDBObject fieldsToReturn = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
  506. BasicDBObject updatedSource =
  507. (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query, fieldsToReturn, null, false, update, true, false);
  508. BasicDBObject harvestStatusObj = (BasicDBObject) updatedSource.get(SourcePojo.harvest_);
  509. if (null != harvestStatusObj) {
  510. Long docCount = harvestStatusObj.getLong(SourceHarvestStatusPojo.doccount_);
  511. if (null != docCount) {
  512. nTotalDocsAfterInsert = docCount;
  513. }
  514. }
  515. //TESTED
  516. // Prune documents if necessary
  517. if ((null != source.getMaxDocs()) && (nTotalDocsAfterInsert > source.getMaxDocs())) {
  518. long nToPrune = (nTotalDocsAfterInsert - source.getMaxDocs());
  519. SourceUtils.pruneSource(source, (int) nToPrune);
  520. nDocsDeleted += nToPrune;
  521. // And update to reflect that it now has max docs...
  522. BasicDBObject update2_1 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, source.getMaxDocs());
  523. BasicDBObject update2 = new BasicDBObject(DbManager.set_, update2_1);
  524. DbManager.getIngest().getSource().update(query, update2);
  525. }
  526. //TESTED
  527. // (OK now the only thing we really had to do is complete, add some handy metadata)
  528. // Also update the document count table in doc_metadata:
  529. if (docsAdded > 0) {
  530. if (1 == source.getCommunityIds().size()) { // (simple/usual case, just 1 community)
  531. query = new BasicDBObject(DocCountPojo._id_, source.getCommunityIds().iterator().next());
  532. update = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(DocCountPojo.doccount_, docsAdded - nDocsDeleted));
  533. if ((docsAdded != 0) || (nDocsDeleted != 0)) {
  534. update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
  535. }
  536. DbManager.getDocument().getCounts().update(query, update, true, false);
  537. }
  538. else if (!source.getCommunityIds().isEmpty()) { // Complex case since docs can belong to diff communities (but they're usually somewhat grouped)
  539. Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
  540. for (DocumentPojo doc: added) {
  541. ObjectId communityId = doc.getCommunityId();
  542. Integer count = communityMap.get(communityId);
  543. communityMap.put(communityId, (count == null ? 1 : count + 1));
  544. }//end loop over added documents (updating the separate community counts)
  545. long nDocsDeleted_byCommunity = nDocsDeleted/source.getCommunityIds().size();
  546. // (can't do better than assume a uniform distribution - the whole thing gets recalculated weekly anyway...)
  547. for (Map.Entry<ObjectId, Integer> communityInfo: communityMap.entrySet()) {
  548. query = new BasicDBObject(DocCountPojo._id_, communityInfo.getKey());
  549. update = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(DocCountPojo.doccount_, communityInfo.getValue() - nDocsDeleted_byCommunity));
  550. if ((communityInfo.getValue() != 0) || (nDocsDeleted_byCommunity != 0)) {
  551. update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
  552. }
  553. DbManager.getDocument().getCounts().update(query, update, true, false);
  554. // (true for upsert, false for multi add)
  555. }
  556. }//(never called in practice - tested up until 5/2/2014)
  557. }
  558. }//TESTED (actually, except for multi community sources, which can't happen at the moment anyway)
  559. ////////////////////////////////////////////////////////////////////////////////////////////
  560. // Maps string type in source pojo to enum
  561. public static int getHarvestType(SourcePojo source) {
  562. if (source.getExtractType().equalsIgnoreCase("database")) {
  563. return InfiniteEnums.DATABASE;
  564. }
  565. else if (source.getExtractType().equalsIgnoreCase("file")) {
  566. return InfiniteEnums.FILES;
  567. }
  568. else {
  569. return InfiniteEnums.FEEDS;
  570. }
  571. }//TESTED
  572. ////////////////////////////////////////////////////////////////////////////////////////////
  573. /**
  574. * Changes all sources badSource flag to false so it will be attempted again on
  575. * the next harvest cycle.
  576. *
  577. * NOTE: If mutliple harvesters are called with reset flag they will all
  578. * set the bad source flag back to true for all sources
  579. *
  580. */
  581. public static void resetBadSources()
  582. {
  583. try
  584. {
  585. BasicDBObject query = new BasicDBObject();
  586. query.put(MongoDbManager.set_,new BasicDBObject(SourcePojo.harvestBadSource_, false));
  587. DbManager.getIngest().getSource().update(new BasicDBObject(), query, false, true);
  588. }
  589. catch (Exception e)
  590. {
  591. logger.error("Exception Message reseting feeds badsource flag: " + e.getMessage(), e);
  592. }
  593. }//TESTED (unchanged from working Beta version)
  594. /////////////////////////////////////////////////////////////////////////////////////
  595. // Prune sources with max doc settings
  596. private static void pruneSource(SourcePojo source, int nToPrune)
  597. {
  598. int nDocsDeleted = 0;
  599. // (code taken mostly from SourceHandler.deleteSource)
  600. if (null != source.getKey()) { // or may delete everything!
  601. BasicDBObject docQuery = new BasicDBObject(DocumentPojo.sourceKey_, source.getKey());
  602. docQuery.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (robustness)
  603. BasicDBObject sortField = new BasicDBObject(DocumentPojo._id_, 1);
  604. BasicDBObject docFields = new BasicDBObject();
  605. docFields.append(DocumentPojo.url_, 1);
  606. docFields.append(DocumentPojo.sourceUrl_, 1);
  607. docFields.append(DocumentPojo.index_, 1);
  608. docFields.append(DocumentPojo.sourceKey_, 1);
  609. StoreAndIndexManager dataStore = new StoreAndIndexManager();
  610. ObjectId nextId = null;
  611. while (nToPrune > 0) {
  612. int nToDelete = nToPrune;
  613. if (nToDelete > 10000) {
  614. nToDelete = 10000;
  615. }
  616. if (null != nextId) {
  617. docQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.gt_, nextId));
  618. }//TESTED (by hand)
  619. DBCursor dbc = DbManager.getDocument().getMetadata().find(docQuery, docFields).sort(sortField).limit(nToDelete);
  620. // (ie batches of 10K, ascending ordered by _id)
  621. nToPrune -= nToDelete;
  622. if (0 == nDocsDeleted) {
  623. nDocsDeleted = dbc.count();
  624. }
  625. if (0 == dbc.size()) {
  626. break;
  627. }
  628. List<DocumentPojo> docs = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
  629. nextId = dataStore.removeFromDatastore_byURL(docs);
  630. }
  631. }
  632. // No need to do anything related to soft deletion, this is all handled when the harvest ends
  633. }//TESTED
  634. //////////////////////////////////////////////////////
  635. // Utility to get harvest name for display purposes
  636. private static String _harvestHostname = null;
  637. private static String getHostname() {
  638. // (just get the hostname once)
  639. if (null == _harvestHostname) {
  640. try {
  641. _harvestHostname = InetAddress.getLocalHost().getHostName();
  642. } catch (Exception e) {
  643. _harvestHostname = "UNKNOWN";
  644. }
  645. }
  646. return _harvestHostname;
  647. }//TESTED
  648. ////////////////////////////////////////////////////////////////////////////////////////////
  649. ////////////////////////////////////////////////////////////////////////////////////////////
  650. // DISTRIBUTION UTILITIES
  651. //
  652. // Update the distibution state BEFORE the source is processed
  653. // (note can set in here because currently the status is in_process so no other threads can touch it)
  654. //
  655. private static void updateHarvestDistributionState_newToken(ObjectId sourceId, int distributionTokensFree, HarvestEnum harvestStatus, boolean bResetOldState) {
  656. BasicDBObject query = new BasicDBObject(SourcePojo._id_, sourceId);
  657. BasicDBObject setClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_, distributionTokensFree);
  658. if (bResetOldState) {
  659. setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0);
  660. setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false);
  661. }//TESTED
  662. setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, harvestStatus.toString());
  663. BasicDBObject update = new BasicDBObject(MongoDbManager.set_, setClause);
  664. MongoDbManager.getIngest().getSource().update(query, update, false, false);
  665. //DEBUG
  666. //System.out.println(" NEW_TOKEN=" + query.toString() + " / " + update.toString());
  667. }//TESTED
  668. //
  669. // Update the distibution state AFTER the source is processed
  670. // (note can set here if source is complete because that means no other thread can have control)
  671. // returns true if harvest is complete
  672. //
  673. // NOTE this isn't called if an error occurs during the ingest cycle (which is where almost all the errors are called)
  674. // as a result, the source will linger with incomplete/unavailable tokens until it's seen by the getDistributedSourceList
  675. // again - normally this will be quick because the sources keep getting put back on the uncheckedList
  676. //
  677. private static boolean updateHarvestDistributionState_tokenComplete(SourcePojo source, HarvestEnum harvestStatus, BasicDBObject incClause, BasicDBObject setClause) {
  678. // Update tokens complete, and retrieve modified version
  679. int nTokensToBeCleared = source.getDistributionTokens().size();
  680. BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
  681. BasicDBObject modify = new BasicDBObject(MongoDbManager.inc_,
  682. new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, nTokensToBeCleared));
  683. BasicDBObject fields = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 1);
  684. fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1);
  685. fields.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, 1);
  686. BasicDBObject partial = (BasicDBObject) MongoDbManager.getIngest().getSource().findAndModify(query, fields, null, false, modify, true, false);
  687. //(return new version - ensures previous increments have been taken into account)
  688. // Two cases: source complete (all tokens obtained), source incomplete:
  689. if (null != partial) { // (else yikes!)
  690. BasicDBObject partialStatus = (BasicDBObject) partial.get(SourcePojo.harvest_);
  691. if (null != partialStatus) { // (else yikes!)
  692. int nTokensComplete = partialStatus.getInt(SourceHarvestStatusPojo.distributionTokensComplete_, 0);
  693. // (note after increment)
  694. // COMPLETE: reset parameters, status -> error (if anything has errored), success (all done), success_iteration (more to do)
  695. if (nTokensComplete == source.getDistributionFactor()) {
  696. setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0);
  697. setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_, source.getDistributionFactor());
  698. setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false); // (resetting this)
  699. // This source is now complete
  700. String status = partialStatus.getString(SourceHarvestStatusPojo.harvest_status_, null);
  701. Boolean reachedLimit = partialStatus.getBoolean(SourceHarvestStatusPojo.distributionReachedLimit_, false) || source.reachedMaxDocs();
  702. if ((null != status)
  703. && ((status.equalsIgnoreCase(HarvestEnum.error.toString()) || (HarvestEnum.error == harvestStatus))))
  704. {
  705. setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.error.toString());
  706. }//TESTED (current and previous state == error)
  707. else if (reachedLimit || (HarvestEnum.success_iteration == harvestStatus)) {
  708. setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.success_iteration.toString());
  709. }//TESTED (from previous or current state)
  710. // (else leave with default of success)
  711. //DEBUG
  712. //System.out.println(Thread.currentThread().getName() + " COMPLETE_SRC COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);
  713. return true;
  714. }//TESTED
  715. else { // Not complete
  716. // If we're here then we're only allowed to update the status to error
  717. if (HarvestEnum.error != harvestStatus) {
  718. setClause.remove(SourceHarvestStatusPojo.sourceQuery_harvest_status_);
  719. }//TESTED
  720. if (source.reachedMaxDocs()) {
  721. setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, true);
  722. }//TESTED
  723. //DEBUG
  724. //System.out.println(Thread.currentThread().getName() + " COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);
  725. return false;
  726. }//(end is complete or not)
  727. //TESTED (reached max limit)
  728. }//(end found partial source status, else catastrophic failure)
  729. }//(end found partial source, else catastrophic failure)
  730. return false;
  731. }//TESTED
  732. }