
/Collectiondb.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 4250 lines | 2151 code | 607 blank | 1492 comment | 397 complexity | 6c162a71eb7c344fb15b6f36dc9b5b10 MD5
Possible License(s): Apache-2.0
  1. #include "gb-include.h"
  2. #include "Collectiondb.h"
  3. //#include "CollectionRec.h"
  4. #include "Xml.h"
  5. #include "Url.h"
  6. #include "Loop.h"
  7. #include "Spider.h" // for calling SpiderLoop::collectionsUpdated()
  8. #include "Posdb.h"
  9. //#include "Indexdb.h"
  10. #include "Datedb.h"
  11. #include "Titledb.h"
  12. //#include "Revdb.h"
  13. //#include "Sections.h"
  14. #include "Placedb.h"
  15. #include "Tagdb.h"
  16. #include "Catdb.h"
  17. #include "Tfndb.h"
  18. #include "Spider.h"
  19. //#include "Checksumdb.h"
  20. #include "Clusterdb.h"
  21. #include "Spider.h"
  22. #include "Repair.h"
  23. #include "Users.h"
  24. #include "Parms.h"
  25. void testRegex ( ) ;
  26. HashTableX g_collTable;
  27. // a global class extern'd in .h file
  28. Collectiondb g_collectiondb;
  29. Collectiondb::Collectiondb ( ) {
  30. m_wrapped = 0;
  31. m_numRecs = 0;
  32. m_numRecsUsed = 0;
  33. m_numCollsSwappedOut = 0;
  34. m_initializing = false;
  35. //m_lastUpdateTime = 0LL;
  36. m_needsSave = false;
  37. // sanity
  38. if ( RDB_END2 >= RDB_END ) return;
  39. log("db: increase RDB_END2 to at least %"INT32" in "
  40. "Collectiondb.h",(int32_t)RDB_END);
  41. char *xx=NULL;*xx=0;
  42. }
  43. // reset rdb
  44. void Collectiondb::reset() {
  45. log(LOG_INFO,"db: resetting collectiondb.");
  46. for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
  47. if ( ! m_recs[i] ) continue;
  48. mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
  49. delete ( m_recs[i] );
  50. m_recs[i] = NULL;
  51. }
  52. m_numRecs = 0;
  53. m_numRecsUsed = 0;
  54. g_collTable.reset();
  55. }
  56. /*
  57. bool Collectiondb::init ( bool isDump ) {
  58. reset();
  59. if ( g_isYippy ) return true;
  60. // reset # of recs
  61. //m_numRecs = 0;
  62. //m_numRecsUsed = 0;
  63. // . now load ALL recs
  64. // . returns false and sets g_errno on error
  65. if ( ! load ( isDump ) ) return false;
  66. // update time
  67. updateTime();
  68. // so we don't save again
  69. m_needsSave = false;
  70. // sanity
  71. if ( RDB_END2 < RDB_END ) {
  72. log("db: increase RDB_END2 to at least %"INT32" in "
  73. "Collectiondb.h",(int32_t)RDB_END);
  74. char *xx=NULL;*xx=0;
  75. }
  76. // if it set g_errno, return false
  77. //if ( g_errno ) return log("admin: Had init error: %s.",
  78. // mstrerror(g_errno));
  79. g_errno = 0;
  80. // otherwise, true, even if reloadList() blocked
  81. return true;
  82. }
  83. */
  84. extern bool g_inAutoSave;
  85. // . save to disk
  86. // . returns false if blocked, true otherwise
  87. bool Collectiondb::save ( ) {
  88. if ( g_conf.m_readOnlyMode ) return true;
  89. if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
  90. return true;
  91. // which collection rec needs a save
  92. for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
  93. if ( ! m_recs[i] ) continue;
  94. // temp debug message
  95. //logf(LOG_DEBUG,"admin: SAVING collection #%"INT32" ANYWAY",i);
  96. if ( ! m_recs[i]->m_needsSave ) continue;
  97. // if we core in malloc we won't be able to save the
  98. // coll.conf files
  99. if ( m_recs[i]->m_isCustomCrawl &&
  100. g_inMemFunction &&
  101. g_hostdb.m_hostId != 0 )
  102. continue;
  103. //log(LOG_INFO,"admin: Saving collection #%"INT32".",i);
  104. m_recs[i]->save ( );
  105. }
  106. // oh well
  107. return true;
  108. }
  109. ///////////
  110. //
  111. // fill up our m_recs[] array based on the coll.*.*/coll.conf files
  112. //
  113. ///////////
  114. bool Collectiondb::loadAllCollRecs ( ) {
  115. m_initializing = true;
  116. char dname[1024];
  117. // MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
  118. sprintf ( dname , "%s" , g_hostdb.m_dir );
  119. Dir d;
  120. d.set ( dname );
  121. if ( ! d.open ()) return log("admin: Could not load collection config "
  122. "files.");
  123. int32_t count = 0;
  124. char *f;
  125. while ( ( f = d.getNextFilename ( "*" ) ) ) {
  126. // skip if first char not "coll."
  127. if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
  128. // must end on a digit (i.e. coll.main.0)
  129. if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
  130. // count them
  131. count++;
  132. }
  133. // reset directory for another scan
  134. d.set ( dname );
  135. if ( ! d.open ()) return log("admin: Could not load collection config "
  136. "files.");
  137. // note it
  138. //log(LOG_INFO,"db: loading collection config files.");
  139. // . scan through all subdirs in the collections dir
  140. // . they should be like, "coll.main/" and "coll.mycollection/"
  141. while ( ( f = d.getNextFilename ( "*" ) ) ) {
  142. // skip if first char not "coll."
  143. if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
  144. // must end on a digit (i.e. coll.main.0)
  145. if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
  146. // point to collection
  147. char *coll = f + 5;
  148. // NULL terminate at .
  149. char *pp = strchr ( coll , '.' );
  150. if ( ! pp ) continue;
  151. *pp = '\0';
  152. // get collnum
  153. collnum_t collnum = atol ( pp + 1 );
  154. // add it
  155. if ( ! addExistingColl ( coll , collnum ) )
  156. return false;
  157. // swap it out if we got 100+ collections
  158. // if ( count < 100 ) continue;
  159. // CollectionRec *cr = getRec ( collnum );
  160. // if ( cr ) cr->swapOut();
  161. }
  162. // if no existing recs added... add coll.main.0 always at startup
  163. if ( m_numRecs == 0 ) {
  164. log("admin: adding main collection.");
  165. addNewColl ( "main",
  166. 0 , // customCrawl ,
  167. NULL,
  168. 0 ,
  169. true , // bool saveIt ,
  170. // Parms.cpp reserves this so it can be sure
  171. // to add the same collnum to every shard
  172. 0 );
  173. }
  174. m_initializing = false;
  175. // note it
  176. //log(LOG_INFO,"db: Loaded data for %"INT32" collections. Ranging from "
  177. // "collection #0 to #%"INT32".",m_numRecsUsed,m_numRecs-1);
  178. // update the time
  179. //updateTime();
  180. // don't clean the tree if just dumpin
  181. //if ( isDump ) return true;
  182. return true;
  183. }
  184. // after we've initialized all rdbs in main.cpp call this to clean out
  185. // our rdb trees
  186. bool Collectiondb::cleanTrees ( ) {
  187. // remove any nodes with illegal collnums
  188. Rdb *r;
  189. //r = g_indexdb.getRdb();
  190. //r->m_tree.cleanTree ((char **)r->m_bases);
  191. r = g_posdb.getRdb();
  192. //r->m_tree.cleanTree ();//(char **)r->m_bases);
  193. r->m_buckets.cleanBuckets();
  194. //r = g_datedb.getRdb();
  195. //r->m_tree.cleanTree ((char **)r->m_bases);
  196. r = g_titledb.getRdb();
  197. r->m_tree.cleanTree ();//(char **)r->m_bases);
  198. //r = g_revdb.getRdb();
  199. //r->m_tree.cleanTree ((char **)r->m_bases);
  200. //r = g_sectiondb.getRdb();
  201. //r->m_tree.cleanTree ((char **)r->m_bases);
  202. //r = g_checksumdb.getRdb();
  203. //r->m_tree.cleanTree ((char **)r->m_bases);
  204. //r = g_tfndb.getRdb();
  205. //r->m_tree.cleanTree ((char **)r->m_bases);
  206. r = g_spiderdb.getRdb();
  207. r->m_tree.cleanTree ();//(char **)r->m_bases);
  208. r = g_doledb.getRdb();
  209. r->m_tree.cleanTree ();//(char **)r->m_bases);
  210. // success
  211. return true;
  212. }
  213. /*
  214. void Collectiondb::updateTime() {
  215. // get time now in milliseconds
  216. int64_t newTime = gettimeofdayInMilliseconds();
  217. // change it
  218. if ( m_lastUpdateTime == newTime ) newTime++;
  219. // update it
  220. m_lastUpdateTime = newTime;
  221. // we need a save
  222. m_needsSave = true;
  223. }
  224. */
  225. #include "Statsdb.h"
  226. #include "Cachedb.h"
  227. #include "Syncdb.h"
  228. // same as addOldColl()
  229. bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
  230. int32_t i = collnum;
  231. // ensure does not already exist in memory
  232. collnum_t oldCollnum = getCollnum(coll);
  233. if ( oldCollnum >= 0 ) {
  234. g_errno = EEXIST;
  235. log("admin: Trying to create collection \"%s\" but "
  236. "already exists in memory. Do an ls on "
  237. "the working dir to see if there are two "
  238. "collection dirs with the same coll name",coll);
  239. char *xx=NULL;*xx=0;
  240. }
  241. // also try by #, i've seen this happen too
  242. CollectionRec *ocr = getRec ( i );
  243. if ( ocr ) {
  244. g_errno = EEXIST;
  245. log("admin: Collection id %i is in use already by "
  246. "%s, so we can not add %s. moving %s to trash."
  247. ,(int)i,ocr->m_coll,coll,coll);
  248. SafeBuf cmd;
  249. int64_t now = gettimeofdayInMilliseconds();
  250. cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%"UINT64
  251. , coll
  252. ,(int)i
  253. , coll
  254. ,(int)i
  255. , now );
  256. //log("admin: %s",cmd.getBufStart());
  257. gbsystem ( cmd.getBufStart() );
  258. return true;
  259. }
  260. // create the record in memory
  261. CollectionRec *cr = new (CollectionRec);
  262. if ( ! cr )
  263. return log("admin: Failed to allocated %"INT32" bytes for new "
  264. "collection record for \"%s\".",
  265. (int32_t)sizeof(CollectionRec),coll);
  266. mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
  267. // set collnum right for g_parms.setToDefault() call just in case
  268. // because before it was calling CollectionRec::reset() which
  269. // was resetting the RdbBases for the m_collnum which was garbage
  270. // and ended up resetting random collections' rdb. but now
  271. // CollectionRec::CollectionRec() sets m_collnum to -1 so we should
  272. // not need this!
  273. //cr->m_collnum = oldCollnum;
  274. // get the default.conf from working dir if there
  275. g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
  276. strcpy ( cr->m_coll , coll );
  277. cr->m_collLen = gbstrlen ( coll );
  278. cr->m_collnum = i;
  279. // point to this, so Rdb and RdbBase can reference it
  280. coll = cr->m_coll;
  281. //log("admin: loaded old coll \"%s\"",coll);
  282. // load coll.conf file
  283. if ( ! cr->load ( coll , i ) ) {
  284. mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
  285. log("admin: Failed to load coll.%s.%"INT32"/coll.conf",coll,i);
  286. delete ( cr );
  287. if ( m_recs ) m_recs[i] = NULL;
  288. return false;
  289. }
  290. if ( ! registerCollRec ( cr , false ) ) return false;
  291. // always index spider status docs now for custom crawls
  292. if ( cr->m_isCustomCrawl )
  293. cr->m_indexSpiderReplies = true;
  294. // and don't do link voting, will help speed up
  295. if ( cr->m_isCustomCrawl ) {
  296. cr->m_getLinkInfo = false;
  297. cr->m_computeSiteNumInlinks = false;
  298. // limit each shard to 5 spiders per collection to prevent
  299. // ppl from spidering the web and hogging up resources
  300. cr->m_maxNumSpiders = 5;
  301. // diffbot download docs up to 50MB so we don't truncate
  302. // things like sitemap.xml. but keep regular html pages
  303. // 1MB
  304. cr->m_maxTextDocLen = 1024*1024;
  305. // xml, pdf, etc can be this. 50MB
  306. cr->m_maxOtherDocLen = 50000000;
  307. }
  308. // we need to compile the regular expressions or update the url
  309. // filters with new logic that maps crawlbot parms to url filters
  310. return cr->rebuildUrlFilters ( );
  311. }
  312. // . add a new rec
  313. // . returns false and sets g_errno on error
  314. // . was addRec()
  315. // . "isDump" is true if we don't need to initialize all the rdbs etc
  316. // because we are doing a './gb dump ...' cmd to dump out data from
  317. // one Rdb which we will custom initialize in main.cpp where the dump
  318. // code is. like for instance, posdb.
  319. // . "customCrawl" is 0 for a regular collection, 1 for a simple crawl
  320. // 2 for a bulk job. diffbot terminology.
  321. bool Collectiondb::addNewColl ( char *coll ,
  322. char customCrawl ,
  323. char *cpc ,
  324. int32_t cpclen ,
  325. bool saveIt ,
  326. // Parms.cpp reserves this so it can be sure
  327. // to add the same collnum to every shard
  328. collnum_t newCollnum ) {
  329. //do not send add/del coll request until we are in sync with shard!!
  330. // just return ETRYAGAIN for the parmlist...
  331. // ensure coll name is legit
  332. char *p = coll;
  333. for ( ; *p ; p++ ) {
  334. if ( is_alnum_a(*p) ) continue;
  335. if ( *p == '-' ) continue;
  336. if ( *p == '_' ) continue; // underscore now allowed
  337. break;
  338. }
  339. if ( *p ) {
  340. g_errno = EBADENGINEER;
  341. log("admin: \"%s\" is a malformed collection name because it "
  342. "contains the '%c' character.",coll,*p);
  343. return false;
  344. }
  345. // . scan for holes
  346. // . i is also known as the collection id
  347. //int32_t i = (int32_t)newCollnum;
  348. // no longer fill empty slots because if they do a reset then
  349. // a new rec right away it will be filled with msg4 recs not
  350. // destined for it. Later we will have to recycle some how!!
  351. //else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
  352. // right now we #define collnum_t int16_t. so do not breach that!
  353. //if ( m_numRecs < 0x7fff ) {
  354. // // set it
  355. // i = m_numRecs;
  356. // // claim it
  357. // // we don't do it here, because we check i below and
  358. // // increment m_numRecs below.
  359. // //m_numRecs++;
  360. //}
  361. // TODO: scan for holes here...
  362. //else {
  363. if ( newCollnum < 0 ) { char *xx=NULL;*xx=0; }
  364. // ceiling?
  365. //int64_t maxColls = 1LL<<(sizeof(collnum_t)*8);
  366. //if ( i >= maxColls ) {
  367. // g_errno = ENOBUFS;
  368. // return log("admin: Limit of %"INT64" collection reached. "
  369. // "Collection not created.",maxColls);
  370. //}
  371. // if empty... bail, no longer accepted, use "main"
  372. if ( ! coll || !coll[0] ) {
  373. g_errno = EBADENGINEER;
  374. return log("admin: Trying to create a new collection "
  375. "but no collection name provided. Use the \"c\" "
  376. "cgi parameter to specify it.");
  377. }
  378. // or if too big
  379. if ( gbstrlen(coll) > MAX_COLL_LEN ) {
  380. g_errno = ENOBUFS;
  381. return log("admin: Trying to create a new collection "
  382. "whose name \"%s\" of %i chars is longer than the "
  383. "max of %"INT32" chars.",coll,gbstrlen(coll),
  384. (int32_t)MAX_COLL_LEN);
  385. }
  386. // ensure does not already exist in memory
  387. if ( getCollnum ( coll ) >= 0 ) {
  388. g_errno = EEXIST;
  389. log("admin: Trying to create collection \"%s\" but "
  390. "already exists in memory.",coll);
  391. // just let it pass...
  392. g_errno = 0 ;
  393. return true;
  394. }
  395. // MDW: ensure not created on disk since time of last load
  396. char dname[512];
  397. sprintf(dname, "%scoll.%s.%"INT32"/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
  398. DIR *dir = opendir ( dname );
  399. if ( dir ) closedir ( dir );
  400. if ( dir ) {
  401. g_errno = EEXIST;
  402. return log("admin: Trying to create collection %s but "
  403. "directory %s already exists on disk.",coll,dname);
  404. }
  405. // create the record in memory
  406. CollectionRec *cr = new (CollectionRec);
  407. if ( ! cr )
  408. return log("admin: Failed to allocated %"INT32" bytes for new "
  409. "collection record for \"%s\".",
  410. (int32_t)sizeof(CollectionRec),coll);
  411. // register the mem
  412. mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
  413. // get copy collection
  414. //CollectionRec *cpcrec = NULL;
  415. //if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
  416. //if ( cpc && cpc[0] && ! cpcrec )
  417. // log("admin: Collection \"%s\" to copy config from does not "
  418. // "exist.",cpc);
  419. // set collnum right for g_parms.setToDefault() call
  420. //cr->m_collnum = newCollnum;
  421. // . get the default.conf from working dir if there
  422. // . i think this calls CollectionRec::reset() which resets all of its
  423. // rdbbase classes for its collnum so m_collnum needs to be right
  424. //g_parms.setToDefault( (char *)cr );
  425. // get the default.conf from working dir if there
  426. //g_parms.setToDefault( (char *)cr , OBJ_COLL );
  427. g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
  428. // put search results back so it doesn't mess up results in qatest123
  429. if ( strcmp(coll,"qatest123") == 0 )
  430. cr->m_sameLangWeight = 20.0;
  431. /*
  432. // the default conf file
  433. char tmp1[1024];
  434. sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );
  435. // . set our parms from the file.
  436. // . accepts OBJ_COLLECTIONREC or OBJ_CONF
  437. g_parms.setFromFile ( cr , NULL , tmp1 );
  438. */
  439. // this will override all
  440. // if ( cpcrec ) {
  441. // // copy it, but not the timedb hashtable, etc.
  442. // int32_t size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
  443. // // JAB: bad gbmemcpy - no donut!
  444. // // this is not how objects are supposed to be copied!!!
  445. // gbmemcpy ( cr , cpcrec , size);
  446. // }
  447. // set coll id and coll name for coll id #i
  448. strcpy ( cr->m_coll , coll );
  449. cr->m_collLen = gbstrlen ( coll );
  450. cr->m_collnum = newCollnum;
  451. // point to this, so Rdb and RdbBase can reference it
  452. coll = cr->m_coll;
  453. //
  454. // BEGIN NEW CODE
  455. //
  456. //
  457. // get token and crawlname if customCrawl is 1 or 2
  458. //
  459. char *token = NULL;
  460. char *crawl = NULL;
  461. SafeBuf tmp;
  462. // . return true with g_errno set on error
  463. // . if we fail to set a parm right we should force ourselves
   464. // out of sync
  465. if ( customCrawl ) {
  466. if ( ! tmp.safeStrcpy ( coll ) ) return true;
  467. token = tmp.getBufStart();
  468. // diffbot coll name format is <token>-<crawlname>
  469. char *h = strchr ( tmp.getBufStart() , '-' );
  470. if ( ! h ) {
  471. log("crawlbot: bad custom collname");
  472. g_errno = EBADENGINEER;
  473. mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
  474. delete ( cr );
  475. return true;
  476. }
  477. *h = '\0';
  478. crawl = h + 1;
  479. if ( ! crawl[0] ) {
  480. log("crawlbot: bad custom crawl name");
  481. mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
  482. delete ( cr );
  483. g_errno = EBADENGINEER;
  484. return true;
  485. }
  486. // or if too big!
  487. if ( gbstrlen(crawl) > 30 ) {
  488. log("crawlbot: crawlbot crawl NAME is over 30 chars");
  489. mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
  490. delete ( cr );
  491. g_errno = EBADENGINEER;
  492. return true;
  493. }
  494. }
  495. //log("parms: added new collection \"%s\"", collName );
  496. cr->m_maxToCrawl = -1;
  497. cr->m_maxToProcess = -1;
  498. if ( customCrawl ) {
  499. // always index spider status docs now
  500. cr->m_indexSpiderReplies = true;
  501. // remember the token
  502. cr->m_diffbotToken.set ( token );
  503. cr->m_diffbotCrawlName.set ( crawl );
  504. // bring this back
  505. cr->m_diffbotApiUrl.set ( "" );
  506. cr->m_diffbotUrlCrawlPattern.set ( "" );
  507. cr->m_diffbotUrlProcessPattern.set ( "" );
  508. cr->m_diffbotPageProcessPattern.set ( "" );
  509. cr->m_diffbotUrlCrawlRegEx.set ( "" );
  510. cr->m_diffbotUrlProcessRegEx.set ( "" );
  511. cr->m_diffbotMaxHops = -1;
  512. cr->m_spiderStatus = SP_INITIALIZING;
  513. // do not spider more than this many urls total.
  514. // -1 means no max.
  515. cr->m_maxToCrawl = 100000;
  516. // do not process more than this. -1 means no max.
  517. cr->m_maxToProcess = 100000;
  518. // -1 means no max
  519. cr->m_maxCrawlRounds = -1;
  520. // diffbot download docs up to 10MB so we don't truncate
  521. // things like sitemap.xml
  522. cr->m_maxTextDocLen = 10000000;
  523. cr->m_maxOtherDocLen = 10000000;
  524. // john wants deduping on by default to avoid
  525. // processing similar pgs
  526. cr->m_dedupingEnabled = true;
  527. // show the ban links in the search results. the
  528. // collection name is cryptographic enough to show that
  529. cr->m_isCustomCrawl = customCrawl;
  530. cr->m_diffbotOnlyProcessIfNewUrl = true;
  531. // default respider to off
  532. cr->m_collectiveRespiderFrequency = 0.0;
  533. //cr->m_restrictDomain = true;
  534. // reset the crawl stats
  535. // always turn off gigabits so &s=1000 can do summary skipping
  536. cr->m_docsToScanForTopics = 0;
  537. // turn off link voting, etc. to speed up
  538. cr->m_getLinkInfo = false;
  539. cr->m_computeSiteNumInlinks = false;
  540. }
  541. // . this will core if a host was dead and then when it came
  542. // back up host #0's parms.cpp told it to add a new coll
  543. cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
  544. cr->m_diffbotCrawlEndTime = 0;
  545. // . just the basics on these for now
  546. // . if certain parms are changed then the url filters
  547. // must be rebuilt, as well as possibly the waiting tree!!!
  548. // . need to set m_urlFiltersHavePageCounts etc.
  549. cr->rebuildUrlFilters ( );
  550. cr->m_useRobotsTxt = true;
   551. // reset crawler stats. they should be loaded from crawlinfo.txt
  552. memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
  553. memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
  554. // note that
  555. log("colldb: initial revival for %s",cr->m_coll);
  556. // . assume we got some urls ready to spider
  557. // . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
  558. // urls it spidered in that time these will get set to 0 and it
  559. // will send out an email alert if m_sentCrawlDoneAlert is not true.
  560. cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
  561. cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
  562. // set some defaults. max spiders for all priorities in this
  563. // collection. NO, default is in Parms.cpp.
  564. //cr->m_maxNumSpiders = 10;
  565. //cr->m_needsSave = 1;
  566. // start the spiders!
  567. cr->m_spideringEnabled = true;
  568. // override this?
  569. saveIt = true;
  570. //
  571. // END NEW CODE
  572. //
  573. //log("admin: adding coll \"%s\" (new=%"INT32")",coll,(int32_t)isNew);
  574. // MDW: create the new directory
  575. retry22:
  576. if ( ::mkdir ( dname ,
  577. getDirCreationFlags() ) ) {
  578. // S_IRUSR | S_IWUSR | S_IXUSR |
  579. // S_IRGRP | S_IWGRP | S_IXGRP |
  580. // S_IROTH | S_IXOTH ) ) {
  581. // valgrind?
  582. if ( errno == EINTR ) goto retry22;
  583. g_errno = errno;
  584. mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
  585. delete ( cr );
  586. return log("admin: Creating directory %s had error: "
  587. "%s.", dname,mstrerror(g_errno));
  588. }
  589. // save it into this dir... might fail!
  590. if ( saveIt && ! cr->save() ) {
  591. mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
  592. delete ( cr );
  593. return log("admin: Failed to save file %s: %s",
  594. dname,mstrerror(g_errno));
  595. }
  596. if ( ! registerCollRec ( cr , true ) )
  597. return false;
  598. // add the rdbbases for this coll, CollectionRec::m_bases[]
  599. if ( ! addRdbBasesForCollRec ( cr ) )
  600. return false;
  601. return true;
  602. }
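// Accessors used by Rdb/RdbBase to attach or detach the per-collection RdbBase
// for a given rdbId. setBasePtr() refuses to overwrite a non-NULL base; Rdb's
// delete path clears the slot by passing base = NULL.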
  603. void CollectionRec::setBasePtr ( char rdbId , class RdbBase *base ) {
  604. // if in the process of swapping in, this will be false...
  605. //if ( m_swappedOut ) { char *xx=NULL;*xx=0; }
  606. if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
  607. // Rdb::deleteColl() will call this even though we are swapped in
  608. // but it calls it with "base" set to NULL after it nukes the RdbBase
  609. // so check if base is null here.
  610. if ( base && m_bases[ (unsigned char)rdbId ]){ char *xx=NULL;*xx=0; }
  611. m_bases [ (unsigned char)rdbId ] = base;
  612. }
  613. RdbBase *CollectionRec::getBasePtr ( char rdbId ) {
  614. if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
  615. return m_bases [ (unsigned char)rdbId ];
  616. }
  617. static bool s_inside = false;
  618. // . returns NULL w/ g_errno set on error.
  619. // . TODO: ensure not called from in thread, not thread safe
  620. RdbBase *CollectionRec::getBase ( char rdbId ) {
  621. if ( s_inside ) { char *xx=NULL;*xx=0; }
  622. if ( ! m_swappedOut ) return m_bases[(unsigned char)rdbId];
  623. log("cdb: swapin collnum=%"INT32"",(int32_t)m_collnum);
  624. // sanity!
  625. if ( g_threads.amThread() ) { char *xx=NULL;*xx=0; }
  626. s_inside = true;
  627. // turn off quickpoll to avoid getbase() being re-called and
  628. // coring from s_inside being true
  629. int32_t saved = g_conf.m_useQuickpoll;
  630. g_conf.m_useQuickpoll = false;
  631. // load them back in. return NULL w/ g_errno set on error.
  632. if ( ! g_collectiondb.addRdbBasesForCollRec ( this ) ) {
  633. log("coll: error swapin: %s",mstrerror(g_errno));
  634. g_conf.m_useQuickpoll = saved;
  635. s_inside = false;
  636. return NULL;
  637. }
  638. g_conf.m_useQuickpoll = saved;
  639. s_inside = false;
  640. g_collectiondb.m_numCollsSwappedOut--;
  641. m_swappedOut = false;
  642. log("coll: swapin was successful for collnum=%"INT32"",(int32_t)m_collnum);
  643. return m_bases[(unsigned char)rdbId];
  644. }
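// Free this collection's RdbBase objects (and their files/maps) to reclaim
// memory. getBase() above swaps them back in on demand via
// Collectiondb::addRdbBasesForCollRec().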
  645. bool CollectionRec::swapOut ( ) {
  646. if ( m_swappedOut ) return true;
  647. log("cdb: swapout collnum=%"INT32"",(int32_t)m_collnum);
  648. // free all RdbBases in each rdb
  649. for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
  650. Rdb *rdb = g_process.m_rdbs[i];
  651. // this frees all the RdbBase::m_files and m_maps for the base
  652. rdb->resetBase ( m_collnum );
  653. }
  654. // now free each base itself
  655. for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
  656. RdbBase *base = m_bases[i];
  657. if ( ! base ) continue;
  658. mdelete (base, sizeof(RdbBase), "Rdb Coll");
  659. delete (base);
  660. m_bases[i] = NULL;
  661. }
  662. m_swappedOut = true;
  663. g_collectiondb.m_numCollsSwappedOut++;
  664. return true;
  665. }
  666. // . called only by addNewColl() and by addExistingColl()
  667. bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
  668. // add m_recs[] and to hashtable
  669. if ( ! setRecPtr ( cr->m_collnum , cr ) )
  670. return false;
  671. return true;
  672. }
  673. // swap it in
  674. bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
  675. for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
  676. CollectionRec *cr = m_recs[i];
  677. if ( ! cr ) continue;
  678. // skip if swapped out
  679. if ( cr->m_swappedOut ) continue;
  680. // add rdb base files etc. for it
  681. addRdbBasesForCollRec ( cr );
  682. }
  683. // now clean the trees. moved this into here from
  684. // addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
  685. // now from getBase() to load on-demand for saving memory
  686. cleanTrees();
  687. return true;
  688. }
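// Create the per-collection RdbBase in each active rdb (posdb, titledb, tagdb,
// clusterdb, linkdb, spiderdb, doledb) for this CollectionRec. Called at
// startup and again by getBase() when swapping a collection back in.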
  689. bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
  690. char *coll = cr->m_coll;
  691. //////
  692. //
  693. // if we are doing a dump from the command line, skip this stuff
  694. //
  695. //////
  696. if ( g_dumpMode ) return true;
  697. // tell rdbs to add one, too
  698. //if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  699. if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  700. //if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  701. if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  702. //if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  703. //if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  704. if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  705. //if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  706. //if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  707. //if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  708. if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  709. if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  710. if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  711. if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
  712. // now clean the trees
  713. //cleanTrees();
  714. // debug message
  715. //log ( LOG_INFO, "db: verified collection \"%s\" (%"INT32").",
  716. // coll,(int32_t)cr->m_collnum);
  717. // tell SpiderCache about this collection, it will create a
  718. // SpiderCollection class for it.
  719. //g_spiderCache.reset1();
  720. // success
  721. return true;
  722. hadError:
  723. log("db: error registering coll: %s",mstrerror(g_errno));
  724. return false;
  725. }
  726. /*
  727. bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
  728. if ( r->getLong("admin",1) == 0 ) return false;
  729. if ( g_conf.isMasterAdmin ( s , r ) ) return true;
  730. char *c = r->getString ( "c" );
  731. CollectionRec *cr = getRec ( c );
  732. if ( ! cr ) return false;
  733. return g_users.hasPermission ( r , PAGE_SEARCH );
  734. //return cr->hasPermission ( r , s );
  735. }
  736. void savingCheckWrapper1 ( int fd , void *state ) {
  737. WaitEntry *we = (WaitEntry *)state;
  738. // no state?
  739. if ( ! we ) { log("colldb: we1 is null"); return; }
  740. // unregister too
  741. g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
  742. // if it blocked again i guess tree is still saving
  743. if ( ! g_collectiondb.resetColl ( we->m_coll ,
  744. we ,
  745. we->m_purgeSeeds))
  746. return;
  747. // all done
  748. we->m_callback ( we->m_state );
  749. }
  750. void savingCheckWrapper2 ( int fd , void *state ) {
  751. WaitEntry *we = (WaitEntry *)state;
  752. // no state?
  753. if ( ! we ) { log("colldb: we2 is null"); return; }
  754. // unregister too
  755. g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
  756. // if it blocked again i guess tree is still saving
  757. if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
  758. // all done
  759. we->m_callback ( we->m_state );
  760. }
  761. */
  762. /*
  763. // delete all records checked in the list
  764. bool Collectiondb::deleteRecs ( HttpRequest *r ) {
  765. for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
  766. char *f = r->getField ( i );
  767. if ( strncmp ( f , "del" , 3 ) != 0 ) continue;
  768. char *coll = f + 3;
  769. //if ( ! is_digit ( f[3] ) ) continue;
  770. //int32_t h = atol ( f + 3 );
  771. deleteRec ( coll , NULL );
  772. }
  773. return true;
  774. }
  775. */
  776. /*
  777. // . delete a collection
  778. // . this uses blocking unlinks, may make non-blocking later
  779. // . returns false if blocked, true otherwise
  780. bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
  781. // force on for now
  782. //deleteTurkdb = true;
  783. // no spiders can be out. they may be referencing the CollectionRec
  784. // in XmlDoc.cpp... quite likely.
  785. //if ( g_conf.m_spideringEnabled ||
  786. // g_spiderLoop.m_numSpidersOut > 0 ) {
  787. // log("admin: Can not delete collection while "
  788. // "spiders are enabled or active.");
  789. // return false;
  790. //}
  791. // ensure it's not NULL
  792. if ( ! coll ) {
  793. log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
  794. g_errno = ENOTFOUND;
  795. return true;
  796. }
  797. // find the rec for this collection
  798. collnum_t collnum = getCollnum ( coll );
  799. return deleteRec2 ( collnum , we );
  800. }
  801. */
  802. // if there is an outstanding disk read thread or merge thread then
  803. // Spider.cpp will handle the delete in the callback.
  804. // this is now tryToDeleteSpiderColl in Spider.cpp
  805. /*
  806. void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
  807. sc->m_deleteMyself = true;
  808. // if not currently being accessed nuke it now
  809. if ( ! sc->m_msg5.m_waitingForList &&
  810. ! sc->m_msg5b.m_waitingForList &&
  811. ! sc->m_msg1.m_mcast.m_inUse ) {
  812. mdelete ( sc, sizeof(SpiderColl),"nukecr2");
  813. delete ( sc );
  814. return;
  815. }
  816. }
  817. */
  818. /// this deletes the collection, not just part of a reset.
  819. bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
  820. // do not allow this if in repair mode
  821. if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
  822. log("admin: Can not delete collection while in repair mode.");
  823. g_errno = EBADENGINEER;
  824. return true;
  825. }
  826. // bitch if not found
  827. if ( collnum < 0 ) {
  828. g_errno = ENOTFOUND;
  829. log(LOG_LOGIC,"admin: Collection #%"INT32" is bad, "
  830. "delete failed.",(int32_t)collnum);
  831. return true;
  832. }
  833. CollectionRec *cr = m_recs [ collnum ];
  834. if ( ! cr ) {
  835. log("admin: Collection id problem. Delete failed.");
  836. g_errno = ENOTFOUND;
  837. return true;
  838. }
  839. if ( g_process.isAnyTreeSaving() ) {
  840. // note it
  841. log("admin: tree is saving. waiting2.");
  842. // all done
  843. return false;
  844. }
  845. // spiders off
  846. //if ( cr->m_spiderColl &&
  847. // cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
  848. // log("admin: Can not delete collection while "
  849. // "spiders are outstanding for collection. Turn off "
  850. // "spiders and wait for them to exit.");
  851. // return false;
  852. //}
  853. char *coll = cr->m_coll;
  854. // note it
  855. log(LOG_INFO,"db: deleting coll \"%s\" (%"INT32")",coll,
  856. (int32_t)cr->m_collnum);
  857. // we need a save
  858. m_needsSave = true;
   859. // nuke doleIpTable, waitingTree and waitingTable
  860. /*
  861. SpiderColl *sc = g_spiderCache.getSpiderColl ( collnum );
  862. sc->m_waitingTree.clear();
  863. sc->m_waitingTable.clear();
  864. sc->m_doleIpTable.clear();
  865. g_spiderLoop.m_lockTable.clear();
  866. g_spiderLoop.m_lockCache.clear(0);
  867. sc->m_lastDownloadCache.clear(collnum);
  868. */
  869. // CAUTION: tree might be in the middle of saving
  870. // we deal with this in Process.cpp now
  871. // remove from spider cache, tell it to sync up with collectiondb
  872. //g_spiderCache.reset1();
  873. // . TODO: remove from g_sync
  874. // . remove from all rdbs
  875. //g_indexdb.getRdb()->delColl ( coll );
  876. g_posdb.getRdb()->delColl ( coll );
  877. //g_datedb.getRdb()->delColl ( coll );
  878. g_titledb.getRdb()->delColl ( coll );
  879. //g_revdb.getRdb()->delColl ( coll );
  880. //g_sectiondb.getRdb()->delColl ( coll );
  881. g_tagdb.getRdb()->delColl ( coll );
  882. // let's preserve the tags... they have all the turk votes in them
  883. //if ( deleteTurkdb ) {
  884. //}
  885. //g_catdb.getRdb()->delColl ( coll );
  886. //g_checksumdb.getRdb()->delColl ( coll );
  887. g_spiderdb.getRdb()->delColl ( coll );
  888. g_doledb.getRdb()->delColl ( coll );
  889. //g_tfndb.getRdb()->delColl ( coll );
  890. g_clusterdb.getRdb()->delColl ( coll );
  891. g_linkdb.getRdb()->delColl ( coll );
  892. // reset spider info
  893. SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
  894. if ( sc ) {
  895. // remove locks from lock table:
  896. sc->clearLocks();
  897. //sc->m_collnum = newCollnum;
  898. //sc->reset();
  899. // you have to set this for tryToDeleteSpiderColl to
  900. // actually have a shot at deleting it
  901. sc->m_deleteMyself = true;
   902. // cr will be invalid shortly after this
  903. // MDW: this is causing the core...
  904. // use fake ptrs for easier debugging
  905. //sc->m_cr = (CollectionRec *)0x99999;//NULL;
  906. //sc->m_cr = NULL;
  907. sc->setCollectionRec ( NULL );
  908. // this will put it on "death row" so it will be deleted
  909. // once Msg5::m_waitingForList/Merge is NULL
  910. tryToDeleteSpiderColl ( sc ,"10");
  911. //mdelete ( sc, sizeof(SpiderColl),"nukecr2");
  912. //delete ( sc );
  913. // don't let cr reference us anymore, sc is on deathrow
  914. // and "cr" is delete below!
  915. //cr->m_spiderColl = (SpiderColl *)0x8888;//NULL;
  916. cr->m_spiderColl = NULL;
  917. }
  918. // the bulk urls file too i guess
  919. if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
  920. SafeBuf bu;
  921. bu.safePrintf("%sbulkurls-%s.txt",
  922. g_hostdb.m_dir , cr->m_coll );
  923. File bf;
  924. bf.set ( bu.getBufStart() );
  925. if ( bf.doesExist() ) bf.unlink();
  926. }
  927. // now remove from list of collections that might need a disk merge
  928. removeFromMergeLinkedList ( cr );
  929. //////
  930. //
  931. // remove from m_recs[]
  932. //
  933. //////
  934. setRecPtr ( cr->m_collnum , NULL );
  935. // free it
  936. mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
  937. delete ( cr );
  938. // do not do this here in case spiders were outstanding
  939. // and they added a new coll right away and it ended up getting
  940. // recs from the deleted coll!!
  941. //while ( ! m_recs[m_numRecs-1] ) m_numRecs--;
  942. // update the time
  943. //updateTime();
  944. // done
  945. return true;
  946. }
  947. //#include "PageTurk.h"
  948. /*
  949. // . reset a collection
  950. // . returns false if blocked and will call callback
  951. bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
  952. // ensure it's not NULL
  953. if ( ! coll ) {
  954. log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
  955. g_errno = ENOCOLLREC;
  956. return true;
  957. }
  958. // get the CollectionRec for "qatest123"
  959. CollectionRec *cr = getRec ( coll ); // "qatest123" );
  960. // must be there. if not, we create test i guess
  961. if ( ! cr ) {
  962. log("db: could not get coll rec \"%s\" to reset", coll);
  963. char *xx=NULL;*xx=0;
  964. }
  965. return resetColl2 ( cr->m_collnum, purgeSeeds);
  966. }
  967. */
  968. // ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
  969. bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
  970. // an add, make sure big enough
  971. int32_t need = ((int32_t)collnum+1)*sizeof(CollectionRec *);
  972. int32_t have = m_recPtrBuf.getLength();
  973. int32_t need2 = need - have;
  974. // if already big enough
  975. if ( need2 <= 0 ) {
  976. m_recs [ collnum ] = NULL;
  977. return true;
  978. }
  979. m_recPtrBuf.setLabel ("crecptrb");
  980. // . true here means to clear the new space to zeroes
  981. // . this shit works based on m_length not m_capacity
  982. if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
  983. log("admin: error growing rec ptr buf2.");
  984. return false;
  985. }
  986. // sanity
  987. if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
  988. // set it
  989. m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
  990. // update length of used bytes in case we re-alloc
  991. m_recPtrBuf.setLength ( need );
  992. // re-max
  993. int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
  994. // sanity
  995. if ( collnum >= max ) { char *xx=NULL;*xx=0; }
  996. // initialize slot
  997. m_recs [ collnum ] = NULL;
  998. return true;
  999. }
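// Install cr as m_recs[collnum], or remove the entry when cr is NULL, keeping
// g_collTable (the name-to-collnum hash) and m_numRecsUsed in sync.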
  1000. bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
  1001. // first time init hashtable that maps coll to collnum
  1002. if ( g_collTable.m_numSlots == 0 &&
  1003. ! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
  1004. false,0,"nhshtbl"))
  1005. return false;
  1006. // sanity
  1007. if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
  1008. // sanity
  1009. int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
  1010. // set it
  1011. m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
   1012. // tell spiders to re-update the active list
  1013. g_spiderLoop.m_activeListValid = false;
  1014. g_spiderLoop.m_activeListModified = true;
  1015. // a delete?
  1016. if ( ! cr ) {
  1017. // sanity
  1018. if ( collnum >= max ) { char *xx=NULL;*xx=0; }
  1019. // get what's there
  1020. CollectionRec *oc = m_recs[collnum];
  1021. // let it go
  1022. m_recs[collnum] = NULL;
  1023. // if nothing already, done
  1024. if ( ! oc ) return true;
  1025. // tally it up
  1026. m_numRecsUsed--;
  1027. // delete key
  1028. int64_t h64 = hash64n(oc->m_coll);
  1029. // if in the hashtable UNDER OUR COLLNUM then nuke it
  1030. // otherwise, we might be called from resetColl2()
  1031. void *vp = g_collTable.getValue ( &h64 );
  1032. if ( ! vp ) return true;
  1033. collnum_t ct = *(collnum_t *)vp;
  1034. if ( ct != collnum ) return true;
  1035. g_collTable.removeKey ( &h64 );
  1036. return true;
  1037. }
  1038. // ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
  1039. if ( ! growRecPtrBuf ( collnum ) )
  1040. return false;
  1041. // sanity
  1042. if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }
  1043. // add to hash table to map name to collnum_t
  1044. int64_t h64 = hash64n(cr->m_coll);
  1045. // debug
  1046. //log("coll: adding key %"INT64" for %s",h64,cr->m_coll);
  1047. if ( ! g_collTable.addKey ( &h64 , &collnum ) )
  1048. return false;
  1049. // ensure last is NULL
  1050. m_recs[collnum] = cr;
  1051. // count it
  1052. m_numRecsUsed++;
  1053. //log("coll: adding key4 %"UINT64" for coll \"%s\" (%"INT32")",h64,cr->m_coll,
  1054. // (int32_t)i);
  1055. // reserve it
  1056. if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;
  1057. // sanity to make sure collectionrec ptrs are legit
  1058. for ( int32_t j = 0 ; j < m_numRecs ; j++ ) {
  1059. if ( ! m_recs[j] ) continue;
  1060. if ( m_recs[j]->m_collnum == 1 ) continue;
  1061. }
  1062. // update the time
  1063. //updateTime();
  1064. return true;
  1065. }
  1066. // moves a file by first trying rename, then copying since cross device renaming doesn't work
  1067. // returns 0 on success
  1068. int mv(char* src, char* dest) {
  1069. int status = rename( src , dest );
  1070. if (status == 0)
  1071. return 0;
  1072. FILE *fsrc, *fdest;
  1073. fsrc = fopen(src, "r");
  1074. if (fsrc == NULL)
  1075. return -1;
  1076. fdest = fopen(dest, "w");
  1077. if (fdest == NULL) {
  1078. fclose(fsrc);
  1079. return -1;
  1080. }
  1081. const int BUF_SIZE = 1024;
  1082. char buf[BUF_SIZE];
  1083. while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
  1084. int read = fread(buf, 1, BUF_SIZE, fsrc);
  1085. fwrite(buf, 1, read, fdest);
  1086. }
   1087. int err = ferror(fdest) || ferror(fsrc); // check before fclose() invalidates the streams
   1088. fclose(fsrc);
   1089. fclose(fdest);
   1090. if (err) return -1;
   1091. remove(src);
   1092. return 0;
  1093. }
  1094. // . returns false if we need a re-call, true if we completed
  1095. // . returns true with g_errno set on error
  1096. bool Collectiondb::resetColl2( collnum_t oldCollnum,
  1097. collnum_t newCollnum,
  1098. //WaitEntry *we,
  1099. bool purgeSeeds){
  1100. // save parms in case we block
  1101. //we->m_purgeSeeds = purgeSeeds;
  1102. // now must be "qatest123" only for now
  1103. //if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
  1104. // no spiders can be out. they may be referencing the CollectionRec
  1105. // in XmlDoc.cpp... quite likely.
  1106. //if ( g_conf.m_spideringEnabled ||
  1107. // g_spiderLoop.m_numSpidersOut > 0 ) {
  1108. // log("admin: Can not delete collection while "
  1109. // "spiders are enabled or active.");
  1110. // return false;
  1111. //}
  1112. // do not allow this if in repair mode
  1113. if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
  1114. log("admin: Can not delete collection while in repair mode.");
  1115. g_errno = EBADENGINEER;
  1116. return true;
  1117. }
  1118. //log("admin: resetting collnum %"INT32"",(int32_t)oldCollnum);
  1119. // CAUTION: tree might be in the middle of saving
  1120. // we deal with this in Process.cpp now
  1121. if ( g_process.isAnyTreeSaving() ) {
  1122. // we could not complete...
  1123. return false;
  1124. }
  1125. CollectionRec *cr = m_recs [ oldCollnum ];
  1126. // let's reset crawlinfo crap
  1127. cr->m_globalCrawlInfo.reset();
  1128. cr->m_localCrawlInfo.reset();
  1129. //collnum_t oldCollnum = cr->m_collnum;
  1130. //collnum_t newCollnum = m_numRecs;
  1131. // in case of bulk job, be sure to save list of spots
  1132. // copy existing list to a /tmp, where they will later be transferred back to the new folder
  1133. // now i just store in the root working dir... MDW
  1134. /*
  1135. char oldbulkurlsname[1036];
  1136. snprintf(oldbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)oldCollnum);
  1137. char newbulkurlsname[1036];
  1138. snprintf(newbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)newCollnum);
  1139. char tmpbulkurlsname[1036];
  1140. snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%"INT32".bulkurls.txt",cr->m_coll,(int32_t)oldCollnum);
  1141. if (cr->m_isCustomCrawl == 2)
  1142. mv( oldbulkurlsname , tmpbulkurlsname );
  1143. */
  1144. // reset spider info
  1145. SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
  1146. if ( sc ) {
  1147. // remove locks from lock table:
  1148. sc->clearLocks();
  1149. // don't do this anymore, just nuke it in case
  1150. // m_populatingDoledb was true etc. there are too many
  1151. // flags to worry about
  1152. //sc->m_collnum = newCollnum;
  1153. //sc->reset();
  1154. // this will put it on "death row" so it will be deleted
  1155. // once Msg5::m_waitingForList/Merge is NULL
  1156. tryToDeleteSpiderColl ( sc,"11" );
  1157. //mdelete ( sc, sizeof(SpiderColl),"nukecr2");
  1158. //delete ( sc );
  1159. cr->m_spiderColl = NULL;
  1160. }
  1161. // reset spider round
  1162. cr->m_spiderRoundNum = 0;
  1163. cr->m_spiderRoundStartTime = 0;
  1164. cr->m_spiderStatus = SP_INITIALIZING; // this is 0
  1165. //cr->m_spiderStatusMsg = NULL;
  1166. // reset seed buf
  1167. if ( purgeSeeds ) {
  1168. // free the buffer of seed urls
  1169. cr->m_diffbotSeeds.purge();
  1170. // reset seed dedup table
  1171. HashTableX *ht = &cr->m_seedHashTable;
  1172. ht->reset();
  1173. }
  1174. // so XmlDoc.cpp can detect if the collection was reset since it
  1175. // launched its spider:
  1176. cr->m_lastResetCount++;
  1177. if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;
  1178. // advance sanity check. did we wrap around?
  1179. // right now we #define collnum_t int16_t
  1180. if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
  1181. // make a new collnum so records in transit will not be added
  1182. // to any rdb...
  1183. cr->m_collnum = newCollnum;
  1184. // update the timestamps since we are restarting/resetting
  1185. cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
  1186. cr->m_diffbotCrawlEndTime = 0;
  1187. ////////
  1188. //
  1189. // ALTER m_recs[] array
  1190. //
  1191. ////////
  1192. // Rdb::resetColl() needs to know the new cr so it can move
  1193. // the RdbBase into cr->m_bases[rdbId] array. recycling.
  1194. setRecPtr ( newCollnum , cr );
  1195. // a new directory then since we changed the collnum
  1196. char dname[512];
  1197. sprintf(dname, "%scoll.%s.%"INT32"/",
  1198. g_hostdb.m_dir,
  1199. cr->m_coll,
  1200. (int32_t)newCollnum);
  1201. DIR *dir = opendir ( dname );
  1202. if ( dir )
  1203. closedir ( dir );
  1204. if ( dir ) {
  1205. //g_errno = EEXIST;
  1206. log("admin: Trying to create collection %s but "
  1207. "directory %s already exists on disk.",cr->m_coll,dname);
  1208. }
  1209. if ( ::mkdir ( dname ,
  1210. getDirCreationFlags() ) ) {
  1211. // S_IRUSR | S_IWUSR | S_IXUSR |
  1212. // S_IRGRP | S_IWGRP | S_IXGRP |
  1213. // S_IROTH | S_IXOTH ) ) {
  1214. // valgrind?
  1215. //if ( errno == EINTR ) goto retry22;
  1216. //g_errno = errno;
  1217. log("admin: Creating directory %s had error: "
  1218. "%s.", dname,mstrerror(g_errno));
  1219. }
  1220. // be sure to copy back the bulk urls for bulk jobs
  1221. // MDW: now i just store that file in the root working dir
  1222. //if (cr->m_isCustomCrawl == 2)
  1223. // mv( tmpbulkurlsname, newbulkurlsname );
  1224. // . unlink all the *.dat and *.map files for this coll in its subdir
  1225. // . remove all recs from this collnum from m_tree/m_buckets
  1226. // . updates RdbBase::m_collnum
  1227. // . so for the tree it just needs to mark the old collnum recs
  1228. // with a collnum -1 in case it is saving...
  1229. g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1230. g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1231. g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1232. g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1233. g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1234. g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1235. g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
  1236. // reset crawl status too!
  1237. cr->m_spiderStatus = SP_INITIALIZING;
  1238. // . set m_recs[oldCollnum] to NULL and remove from hash table
  1239. // . do after calls to deleteColl() above so it won't crash
  1240. setRecPtr ( oldCollnum , NULL );
  1241. // save coll.conf to new directory
  1242. cr->save();
  1243. // and clear the robots.txt cache in case we recently spidered a
  1244. // robots.txt, we don't want to use it, we want to use the one we
  1245. // have in the test-parser subdir so we are consistent
  1246. //RdbCache *robots = Msg13::getHttpCacheRobots();
  1247. //RdbCache *others = Msg13::getHttpCacheOthers();
  1248. // clear() was removed do to possible corruption
  1249. //robots->clear ( oldCollnum );
  1250. //others->clear ( oldCollnum );
  1251. //g_templateTable.reset();
  1252. //g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
  1253. // repopulate CollectionRec::m_sortByDateTable. should be empty
  1254. // since we are resetting here.
  1255. //initSortByDateTable ( coll );
  1256. // done
  1257. return true;
  1258. }
  1259. // a hack function
  1260. bool addCollToTable ( char *coll , collnum_t collnum ) {
  1261. // readd it to the hashtable that maps name to collnum too
  1262. int64_t h64 = hash64n(coll);
  1263. g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
  1264. false,0,"nhshtbl");
  1265. return g_collTable.addKey ( &h64 , &collnum );
  1266. }
  1267. // get coll rec specified in the HTTP request
  1268. CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
  1269. char *coll = r->getString ( "c" );
  1270. if ( coll && ! coll[0] ) coll = NULL;
  1271. // maybe it is crawlbot?
  1272. char *name = NULL;
  1273. char *token = NULL;
  1274. if ( ! coll ) {
  1275. name = r->getString("name");
  1276. token = r->getString("token");
  1277. }
  1278. char tmp[MAX_COLL_LEN+1];
  1279. if ( ! coll && token && name ) {
  1280. snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
  1281. coll = tmp;
  1282. }
  1283. // default to main first
  1284. if ( ! coll && useDefaultRec ) {
  1285. CollectionRec *cr = g_collectiondb.getRec("main");
  1286. if ( cr ) return cr;
  1287. }
  1288. // try next in line
  1289. if ( ! coll && useDefaultRec ) {
  1290. return getFirstRec ();
  1291. }
  1292. // give up?
  1293. if ( ! coll ) return NULL;
  1294. //if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
  1295. return g_collectiondb.getRec ( coll );
  1296. }
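// return the collection name from the request's "c" parm, falling back to
// "main" and then to the first loaded collection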
  1297. char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
  1298. char *coll = r->getString ( "c" );
  1299. if ( coll && ! coll[0] ) coll = NULL;
  1300. if ( coll ) return coll;
  1301. CollectionRec *cr = NULL;
  1302. // default to main first
  1303. if ( ! coll ) {
  1304. cr = g_collectiondb.getRec("main");
  1305. // CAUTION: cr could be deleted so don't trust this ptr
  1306. // if you give up control of the cpu
  1307. if ( cr ) return cr->m_coll;
  1308. }
  1309. // try next in line
  1310. if ( ! coll ) {
  1311. cr = getFirstRec ();
  1312. if ( cr ) return cr->m_coll;
  1313. }
  1314. // give up?
  1315. return NULL;
  1316. }
  1317. //CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
  1318. // char *coll = getDefaultColl();
  1319. // return g_collectiondb.getRec(coll);
  1320. //}
  1321. // . get collectionRec from name
  1322. // . returns NULL if not available
  1323. CollectionRec *Collectiondb::getRec ( char *coll ) {
  1324. if ( ! coll ) coll = "";
  1325. return getRec ( coll , gbstrlen(coll) );
  1326. }
  1327. CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
  1328. if ( ! coll ) coll = "";
  1329. collnum_t collnum = getCollnum ( coll , collLen );
  1330. if ( collnum < 0 ) return NULL;
  1331. return m_recs [ (int32_t)collnum ];
  1332. }
  1333. CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
  1334. if ( collnum >= m_numRecs || collnum < 0 ) {
  1335. // Rdb::resetBase() gets here, so don't always log.
  1336. // it is called from CollectionRec::reset() which is called
  1337. // from the CollectionRec constructor and ::load() so
  1338. // it won't have anything in rdb at that time
  1339. //log("colldb: collnum %"INT32" > numrecs = %"INT32"",
  1340. // (int32_t)collnum,(int32_t)m_numRecs);
  1341. return NULL;
  1342. }
  1343. return m_recs[collnum];
  1344. }
  1345. //CollectionRec *Collectiondb::getDefaultRec ( ) {
  1346. // if ( ! g_conf.m_defaultColl[0] ) return NULL; // no default?
  1347. // collnum_t collnum = getCollnum ( g_conf.m_defaultColl );
  1348. // if ( collnum < (collnum_t)0 ) return NULL;
  1349. // return m_recs[(int32_t)collnum];
  1350. //}
  1351. CollectionRec *Collectiondb::getFirstRec ( ) {
  1352. for ( int32_t i = 0 ; i < m_numRecs ; i++ )
  1353. if ( m_recs[i] ) return m_recs[i];
  1354. return NULL;
  1355. }
  1356. collnum_t Collectiondb::getFirstCollnum ( ) {
  1357. for ( int32_t i = 0 ; i < m_numRecs ; i++ )
  1358. if ( m_recs[i] ) return i;
  1359. return (collnum_t)-1;
  1360. }
  1361. char *Collectiondb::getFirstCollName ( ) {
  1362. for ( int32_t i = 0 ; i < m_numRecs ; i++ )
  1363. if ( m_recs[i] ) return m_recs[i]->m_coll;
  1364. return NULL;
  1365. }
  1366. char *Collectiondb::getCollName ( collnum_t collnum ) {
   1367. if ( collnum < 0 || collnum >= m_numRecs ) return NULL;
  1368. if ( ! m_recs[(int32_t)collnum] ) return NULL;
  1369. return m_recs[collnum]->m_coll;
  1370. }
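// map a collection name to its collnum_t via the g_collTable hash;
// returns -1 if the name is unknown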
  1371. collnum_t Collectiondb::getCollnum ( char *coll ) {
  1372. int32_t clen = 0;
  1373. if ( coll ) clen = gbstrlen(coll );
  1374. return getCollnum ( coll , clen );
  1375. /*
  1376. //if ( ! coll ) coll = "";
  1377. // default empty collection names
  1378. if ( coll && ! coll[0] ) coll = NULL;
  1379. if ( ! coll ) coll = g_conf.m_defaultColl;
  1380. if ( ! coll || ! coll[0] ) coll = "main";
  1381. // This is necessary for Statsdb to work, as it is
  1382. // not associated with any collection. Is this
  1383. // necessary for Catdb?
  1384. if ( coll[0]=='s' && coll[1] =='t' &&
  1385. strcmp ( "statsdb\0", coll ) == 0)
  1386. return 0;
  1387. if ( coll[0]=='f' && coll[1]=='a' &&
  1388. strcmp ( "facebookdb\0", coll ) == 0)
  1389. return 0;
  1390. if ( coll[0]=='a' && coll[1]=='c' &&
  1391. strcmp ( "accessdb\0", coll ) == 0)
  1392. return 0;
  1393. // because diffbot may have thousands of crawls/collections
  1394. // let's improve the speed here. try hashing it...
  1395. int64_t h64 = hash64n(coll);
  1396. void *vp = g_collTable.getValue ( &h64 );
  1397. if ( ! vp ) return -1; // not found
  1398. return *(collnum_t *)vp;
  1399. */
  1400. /*
  1401. for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
  1402. if ( ! m_recs[i] ) continue;
  1403. if ( m_recs[i]->m_coll[0] != coll[0] ) continue;
  1404. if ( strcmp ( m_recs[i]->m_coll , coll ) == 0 ) return i;
  1405. }
  1406. //if ( strcmp ( "catdb\0", coll ) == 0) return 0;
  1407. return (collnum_t)-1; // not found
  1408. */
  1409. }
  1410. collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
  1411. // default empty collection names
  1412. if ( coll && ! coll[0] ) coll = NULL;
  1413. if ( ! coll ) {
  1414. coll = g_conf.m_defaultColl;
  1415. if ( coll ) clen = gbstrlen(coll);
  1416. else clen = 0;
  1417. }
  1418. if ( ! coll || ! coll[0] ) {
  1419. coll = "main";
  1420. clen = gbstrlen(coll);
  1421. }
  1422. // This is necessary for Statsdb to work, as it is
  1423. //if ( ! coll ) coll = "";
  1424. // not associated with any collection. Is this
  1425. // necessary for Catdb?
  1426. if ( coll[0]=='s' && coll[1] =='t' &&
  1427. strcmp ( "statsdb\0", coll ) == 0)
  1428. return 0;
   1429. if ( coll[0]=='f' && coll[1]=='a' &&
  1430. strcmp ( "facebookdb\0", coll ) == 0)
  1431. return 0;
  1432. if ( coll[0]=='a' && coll[1]=='c' &&
  1433. strcmp ( "accessdb\0", coll ) == 0)
  1434. return 0;
  1435. // because diffbot may have thousands of crawls/collections
  1436. // let's improve the speed here. try hashing it...
  1437. int64_t h64 = hash64(coll,clen);
  1438. void *vp = g_collTable.getValue ( &h64 );
  1439. if ( ! vp ) return -1; // not found
  1440. return *(collnum_t *)vp;
  1441. /*
  1442. for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
  1443. if ( ! m_recs[i] ) continue;
  1444. if ( m_recs[i]->m_collLen != clen ) continue;
  1445. if ( strncmp(m_recs[i]->m_coll,coll,clen) == 0 ) return i;
  1446. }
  1447. //if ( strncmp ( "catdb\0", coll, clen ) == 0) return 0;
  1448. return (collnum_t)-1; // not found
  1449. */
  1450. }
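// Editor's note: the block below is an illustrative sketch, not part of the
// original source. It shows the idea behind g_collTable: instead of scanning
// m_recs[] linearly (see the commented-out loop above), the collection name
// is hashed once and the collnum is looked up directly. std::map and a small
// FNV-1a hash stand in for the real HashTableX / hash64() here.
#include <map>
#include <stdint.h>
#include <string.h>

static std::map<uint64_t,int16_t> s_exampleCollTable;

// toy 64-bit string hash (FNV-1a); the real code uses hash64()
static uint64_t exampleHash64 ( const char *s , int32_t len ) {
        uint64_t h = 0xcbf29ce484222325ULL;
        for ( int32_t i = 0 ; i < len ; i++ ) {
                h ^= (unsigned char)s[i];
                h *= 0x100000001b3ULL;
        }
        return h;
}

// register a collection name under its hash (done when a coll rec is added)
static void exampleRegisterColl ( const char *name , int16_t collnum ) {
        s_exampleCollTable[ exampleHash64(name,(int32_t)strlen(name)) ] = collnum;
}

// the lookup mirrors getCollnum(): hash the name, probe the table, -1 if absent
static int16_t exampleGetCollnum ( const char *name ) {
        std::map<uint64_t,int16_t>::const_iterator it =
                s_exampleCollTable.find( exampleHash64(name,(int32_t)strlen(name)) );
        if ( it == s_exampleCollTable.end() ) return -1;
        return it->second;
}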
  1451. //collnum_t Collectiondb::getNextCollnum ( collnum_t collnum ) {
  1452. // for ( int32_t i = (int32_t)collnum + 1 ; i < m_numRecs ; i++ )
  1453. // if ( m_recs[i] ) return i;
  1454. // // no next one, use -1
  1455. // return (collnum_t) -1;
  1456. //}
  1457. // what collnum will be used the next time a coll is added?
  1458. collnum_t Collectiondb::reserveCollNum ( ) {
  1459. if ( m_numRecs < 0x7fff ) {
  1460. collnum_t next = m_numRecs;
1461. // make the ptr NULL at least to accommodate the
1462. // loops that scan up to m_numRecs, lest we core
  1463. growRecPtrBuf ( next );
  1464. m_numRecs++;
  1465. return next;
  1466. }
  1467. // collnum_t is signed right now because we use -1 to indicate a
  1468. // bad collnum.
  1469. int32_t scanned = 0;
  1470. // search for an empty slot
  1471. for ( int32_t i = m_wrapped ; ; i++ ) {
  1472. // because collnum_t is 2 bytes, signed, limit this here
  1473. if ( i > 0x7fff ) i = 0;
  1474. // how can this happen?
  1475. if ( i < 0 ) i = 0;
  1476. // if we scanned the max # of recs we could have, we are done
  1477. if ( ++scanned >= m_numRecs ) break;
  1478. // skip if this is in use
  1479. if ( m_recs[i] ) continue;
  1480. // start after this one next time
  1481. m_wrapped = i+1;
  1482. // note it
  1483. log("colldb: returning wrapped collnum "
  1484. "of %"INT32"",(int32_t)i);
  1485. return (collnum_t)i;
  1486. }
  1487. log("colldb: no new collnum available. consider upping collnum_t");
  1488. // none available!!
  1489. return -1;
  1490. }
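// Editor's note: illustrative sketch, not original code. It restates the
// reservation policy above in isolation: grow into fresh slots until the
// signed 16-bit collnum space is exhausted, then recycle holes left by
// deleted collections, remembering the scan position across calls so we do
// not rescan the same prefix every time. "used" stands in for m_recs[i].
#include <stdint.h>

static int32_t exampleReserveSlot ( const bool *used , int32_t numSlots ,
                                    int32_t *wrapCursor ) {
        // still below the cap? hand out the next never-used slot
        // (the real code also grows the rec pointer buffer and bumps
        // m_numRecs here)
        if ( numSlots < 0x7fff ) return numSlots;
        // otherwise hunt for a freed slot, starting at the saved cursor
        int32_t scanned = 0;
        for ( int32_t i = *wrapCursor ; ; i++ ) {
                if ( i >= numSlots || i < 0 ) i = 0;  // wrap around
                if ( ++scanned > numSlots ) break;    // looked at every slot
                if ( used[i] ) continue;              // still occupied
                *wrapCursor = i + 1;                  // resume here next time
                return i;
        }
        return -1; // every slot is taken
}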
  1491. ///////////////
  1492. //
  1493. // COLLECTIONREC
  1494. //
  1495. ///////////////
  1496. #include "gb-include.h"
  1497. //#include "CollectionRec.h"
  1498. //#include "Collectiondb.h"
  1499. #include "HttpServer.h" // printColors2()
  1500. #include "Msg5.h"
  1501. #include "Threads.h"
  1502. #include "Datedb.h"
  1503. #include "Timedb.h"
  1504. #include "Spider.h"
  1505. #include "Process.h"
  1506. static CollectionRec g_default;
  1507. CollectionRec::CollectionRec() {
  1508. m_nextLink = NULL;
  1509. m_prevLink = NULL;
  1510. m_spiderCorruptCount = 0;
  1511. m_collnum = -1;
  1512. m_coll[0] = '\0';
  1513. m_updateRoundNum = 0;
  1514. m_swappedOut = false;
  1515. //m_numSearchPwds = 0;
  1516. //m_numBanIps = 0;
  1517. //m_numSearchIps = 0;
  1518. //m_numSpamIps = 0;
  1519. //m_numAdminPwds = 0;
  1520. //m_numAdminIps = 0;
  1521. memset ( m_bases , 0 , sizeof(RdbBase *)*RDB_END );
  1522. // how many keys in the tree of each rdb? we now store this stuff
  1523. // here and not in RdbTree.cpp because we no longer have a maximum
  1524. // # of collection recs... MAX_COLLS. each is a 32-bit "int32_t" so
  1525. // it is 4 * RDB_END...
  1526. memset ( m_numNegKeysInTree , 0 , 4*RDB_END );
  1527. memset ( m_numPosKeysInTree , 0 , 4*RDB_END );
  1528. m_spiderColl = NULL;
  1529. m_overflow = 0x12345678;
  1530. m_overflow2 = 0x12345678;
  1531. // the spiders are currently uninhibited i guess
  1532. m_spiderStatus = SP_INITIALIZING; // this is 0
  1533. //m_spiderStatusMsg = NULL;
  1534. // for Url::getSite()
  1535. m_updateSiteRulesTable = 1;
  1536. //m_lastUpdateTime = 0LL;
  1537. m_clickNScrollEnabled = false;
  1538. // inits for sortbydatetable
  1539. m_inProgress = false;
  1540. m_msg5 = NULL;
  1541. m_importState = NULL;
  1542. // JAB - track which regex parsers have been initialized
  1543. //log(LOG_DEBUG,"regex: %p initializing empty parsers", m_pRegExParser);
  1544. // clear these out so Parms::calcChecksum can work:
  1545. memset( m_spiderFreqs, 0, MAX_FILTERS*sizeof(*m_spiderFreqs) );
  1546. //for ( int i = 0; i < MAX_FILTERS ; i++ )
  1547. // m_spiderQuotas[i] = -1;
  1548. memset( m_spiderPriorities, 0,
  1549. MAX_FILTERS*sizeof(*m_spiderPriorities) );
  1550. memset ( m_harvestLinks,0,MAX_FILTERS);
  1551. memset ( m_forceDelete,0,MAX_FILTERS);
  1552. //memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
  1553. //for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
  1554. // *(m_searchPwds[i]) = '\0';
  1555. //}
  1556. //for ( int i = 0; i < MAX_ADMIN_PASSWORDS; i++ ) {
  1557. // *(m_adminPwds[i]) = '\0';
  1558. //}
  1559. //memset( m_banIps, 0, MAX_BANNED_IPS*sizeof(*m_banIps) );
  1560. //memset( m_searchIps, 0, MAX_SEARCH_IPS*sizeof(*m_searchIps) );
  1561. //memset( m_spamIps, 0, MAX_SPAM_IPS*sizeof(*m_spamIps) );
  1562. //memset( m_adminIps, 0, MAX_ADMIN_IPS*sizeof(*m_adminIps) );
  1563. //for ( int i = 0; i < MAX_FILTERS; i++ ) {
  1564. // //m_pRegExParser[i] = NULL;
  1565. // *(m_regExs[i]) = '\0';
  1566. //}
  1567. m_numRegExs = 0;
  1568. //m_requests = 0;
  1569. //m_replies = 0;
  1570. //m_doingCallbacks = false;
  1571. m_lastResetCount = 0;
  1572. // regex_t types
  1573. m_hasucr = false;
  1574. m_hasupr = false;
  1575. // for diffbot caching the global spider stats
  1576. reset();
  1577. // add default reg ex if we do not have one
  1578. //setUrlFiltersToDefaults();
  1579. //rebuildUrlFilters();
  1580. }
  1581. CollectionRec::~CollectionRec() {
  1582. //invalidateRegEx ();
  1583. reset();
  1584. }
  1585. // new collection recs get this called on them
  1586. void CollectionRec::setToDefaults ( ) {
  1587. g_parms.setFromFile ( this , NULL , NULL , OBJ_COLL );
  1588. // add default reg ex
  1589. //setUrlFiltersToDefaults();
  1590. rebuildUrlFilters();
  1591. }
  1592. void CollectionRec::reset() {
  1593. //log("coll: resetting collnum=%"INT32"",(int32_t)m_collnum);
  1594. // . grows dynamically
  1595. // . setting to 0 buckets should never have error
  1596. //m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
  1597. // regex_t types
  1598. if ( m_hasucr ) regfree ( &m_ucr );
  1599. if ( m_hasupr ) regfree ( &m_upr );
  1600. m_hasucr = false;
  1601. m_hasupr = false;
  1602. m_sendingAlertInProgress = false;
  1603. // make sure we do not leave spiders "hanging" waiting for their
  1604. // callback to be called... and it never gets called
  1605. //if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
  1606. //if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
  1607. //if ( m_replies != m_requests ) { char *xx=NULL;*xx=0; }
  1608. m_localCrawlInfo.reset();
  1609. m_globalCrawlInfo.reset();
  1610. //m_requests = 0;
  1611. //m_replies = 0;
  1612. // free all RdbBases in each rdb
  1613. for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
  1614. Rdb *rdb = g_process.m_rdbs[i];
  1615. rdb->resetBase ( m_collnum );
  1616. }
  1617. for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
  1618. RdbBase *base = m_bases[i];
  1619. if ( ! base ) continue;
  1620. mdelete (base, sizeof(RdbBase), "Rdb Coll");
  1621. delete (base);
  1622. }
  1623. SpiderColl *sc = m_spiderColl;
  1624. // debug hack thing
  1625. //if ( sc == (SpiderColl *)0x8888 ) return;
  1626. // if never made one, we are done
  1627. if ( ! sc ) return;
  1628. // spider coll also!
  1629. sc->m_deleteMyself = true;
  1630. // if not currently being accessed nuke it now
  1631. tryToDeleteSpiderColl ( sc ,"12");
  1632. // if ( ! sc->m_msg5.m_waitingForList &&
  1633. // ! sc->m_msg5b.m_waitingForList &&
  1634. // ! sc->m_msg1.m_mcast.m_inUse ) {
  1635. // mdelete ( sc, sizeof(SpiderColl),"nukecr2");
  1636. // delete ( sc );
  1637. // }
  1638. }
  1639. CollectionRec *g_cr = NULL;
  1640. // . load this data from a conf file
1641. // . values we do not explicitly have will be taken from the "default"
1642. // collection config file. if it does not have them then we use
1643. // the values we received from the call to setToDefaults()
  1644. // . returns false and sets g_errno on load error
  1645. bool CollectionRec::load ( char *coll , int32_t i ) {
  1646. // also reset some counts not included in parms list
  1647. reset();
  1648. // before we load, set to defaults in case some are not in xml file
  1649. g_parms.setToDefault ( (char *)this , OBJ_COLL , this );
  1650. // get the filename with that id
  1651. File f;
  1652. char tmp2[1024];
1653. snprintf ( tmp2 , 1023 , "%scoll.%s.%"INT32"/coll.conf", g_hostdb.m_dir , coll,i);
  1654. f.set ( tmp2 );
  1655. if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
  1656. // set our collection number
  1657. m_collnum = i;
  1658. // set our collection name
  1659. m_collLen = gbstrlen ( coll );
  1660. strcpy ( m_coll , coll );
  1661. if ( ! g_conf.m_doingCommandLine )
  1662. log(LOG_INFO,"db: Loading conf for collection %s (%"INT32")",coll,
  1663. (int32_t)m_collnum);
  1664. // collection name HACK for backwards compatibility
  1665. //if ( strcmp ( coll , "main" ) == 0 ) {
  1666. // m_coll[0] = '\0';
  1667. // m_collLen = 0;
  1668. //}
  1669. // the default conf file
  1670. char tmp1[1024];
  1671. snprintf ( tmp1 , 1023, "%sdefault.conf" , g_hostdb.m_dir );
  1672. // . set our parms from the file.
  1673. // . accepts OBJ_COLLECTIONREC or OBJ_CONF
  1674. g_parms.setFromFile ( this , tmp2 , tmp1 , OBJ_COLL );
  1675. // add default reg ex IFF there are no url filters there now
  1676. //if(m_numRegExs == 0) rebuildUrlFilters();//setUrlFiltersToDefaults();
1677. // this only rebuilds them if necessary
  1678. rebuildUrlFilters();//setUrlFiltersToDefaults();
  1679. // temp check
  1680. //testRegex();
  1681. //
  1682. // LOAD the crawlinfo class in the collectionrec for diffbot
  1683. //
  1684. // LOAD LOCAL
  1685. snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
  1686. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  1687. log(LOG_DEBUG,"db: Loading %s",tmp1);
  1688. m_localCrawlInfo.reset();
  1689. SafeBuf sb;
1690. // fillFromFile returns 0 if the file does not exist, -1 on read error
  1691. if ( sb.fillFromFile ( tmp1 ) > 0 )
  1692. //m_localCrawlInfo.setFromSafeBuf(&sb);
  1693. // it is binary now
  1694. gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
  1695. // if it had corrupted data from saving corrupted mem zero it out
  1696. CrawlInfo *stats = &m_localCrawlInfo;
  1697. // point to the stats for that host
  1698. int64_t *ss = (int64_t *)stats;
  1699. // are stats crazy?
  1700. bool crazy = false;
  1701. for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
  1702. // crazy stat?
  1703. if ( *ss > 1000000000LL ||
  1704. *ss < -1000000000LL ) {
  1705. crazy = true;
  1706. break;
  1707. }
  1708. ss++;
  1709. }
  1710. if ( m_localCrawlInfo.m_collnum != m_collnum )
  1711. crazy = true;
  1712. if ( crazy ) {
  1713. log("coll: had crazy spider stats for coll %s. zeroing out.",
  1714. m_coll);
  1715. m_localCrawlInfo.reset();
  1716. }
  1717. if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
  1718. log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
  1719. m_coll,
  1720. (int32_t)m_collnum,
  1721. (int32_t)m_localCrawlInfo.m_hasUrlsReadyToSpider);
1722. // we introduced the "this round" counts, so don't start them at 0!!
  1723. if ( m_spiderRoundNum == 0 &&
  1724. m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
  1725. m_localCrawlInfo.m_pageDownloadSuccesses ) {
  1726. log("coll: fixing process count this round for %s",m_coll);
  1727. m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
  1728. m_localCrawlInfo.m_pageDownloadSuccesses;
  1729. }
1730. // we introduced the "this round" counts, so don't start them at 0!!
  1731. if ( m_spiderRoundNum == 0 &&
  1732. m_localCrawlInfo.m_pageProcessSuccessesThisRound <
  1733. m_localCrawlInfo.m_pageProcessSuccesses ) {
  1734. log("coll: fixing process count this round for %s",m_coll);
  1735. m_localCrawlInfo.m_pageProcessSuccessesThisRound =
  1736. m_localCrawlInfo.m_pageProcessSuccesses;
  1737. }
  1738. // fix from old bug that was fixed
  1739. //if ( m_spiderRoundNum == 0 &&
  1740. // m_collectiveRespiderFrequency > 0.0 &&
  1741. // m_localCrawlInfo.m_sentCrawlDoneAlert ) {
  1742. // log("coll: bug fix: resending email alert for coll %s (%"INT32") "
  1743. // "of respider freq %f",m_coll,(int32_t)m_collnum,
  1744. // m_collectiveRespiderFrequency);
  1745. // m_localCrawlInfo.m_sentCrawlDoneAlert = false;
  1746. //}
  1747. // LOAD GLOBAL
  1748. snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
  1749. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  1750. log(LOG_DEBUG,"db: Loading %s",tmp1);
  1751. m_globalCrawlInfo.reset();
  1752. sb.reset();
  1753. if ( sb.fillFromFile ( tmp1 ) > 0 )
  1754. //m_globalCrawlInfo.setFromSafeBuf(&sb);
  1755. // it is binary now
  1756. gbmemcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
  1757. if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
  1758. log("coll: Loaded %s (%"INT32") global hasurlsready=%"INT32"",
  1759. m_coll,
  1760. (int32_t)m_collnum,
  1761. (int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
  1762. // the list of ip addresses that we have detected as being throttled
1763. // and that we therefore back off from and use proxies for
  1764. if ( ! g_conf.m_doingCommandLine ) {
  1765. sb.reset();
  1766. sb.safePrintf("%scoll.%s.%"INT32"/",
  1767. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  1768. m_twitchyTable.m_allocName = "twittbl";
  1769. m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
  1770. }
  1771. ////////////
  1772. //
  1773. // PAGE COUNT TABLE for doing quotas in url filters
  1774. //
  1775. /////////////
1776. // load it up if it's there on disk
  1777. //snprintf ( tmp1 , 1023, "/coll.%s.%"INT32"/pagecounts.dat",
  1778. // m_coll , (int32_t)m_collnum );
  1779. //if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
  1780. // log("db: failed to load page count table: %s",
  1781. // mstrerror(g_errno));
  1782. // ignore errors i guess
  1783. g_errno = 0;
  1784. // fix for diffbot, spider time deduping
  1785. if ( m_isCustomCrawl ) m_dedupingEnabled = true;
  1786. // always turn off gigabits so &s=1000 can do summary skipping
  1787. if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;
  1788. // make min to merge smaller than normal since most collections are
  1789. // small and we want to reduce the # of vfds (files) we have
  1790. if ( m_isCustomCrawl ) {
  1791. m_posdbMinFilesToMerge = 6;
  1792. m_titledbMinFilesToMerge = 4;
  1793. m_linkdbMinFilesToMerge = 3;
  1794. m_tagdbMinFilesToMerge = 2;
  1795. }
  1796. // always turn on distributed spider locking because otherwise
  1797. // we end up calling Msg50 which calls Msg25 for the same root url
  1798. // at the same time, thereby wasting massive resources. it is also
1799. // dangerous to run without this because webmasters get pissed when
  1800. // we slam their servers.
  1801. // This is now deprecated...
  1802. //m_useSpiderLocks = false;
  1803. // and all pages downloaded from a particular ip should be done
  1804. // by the same host in our cluster to prevent webmaster rage
  1805. //m_distributeSpiderGet = true;
  1806. //initSortByDateTable(m_coll);
  1807. return true;
  1808. }
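// Editor's note: illustrative sketch, not original code. It isolates the
// load-then-sanity-check pattern used above for localcrawlinfo.dat and
// globalcrawlinfo.dat: the struct is persisted as raw bytes, so after
// reading it back we refuse counters that are obviously corrupt rather than
// trusting them. ExampleCrawlInfo, the field count and the +/-1e9 bound are
// stand-ins mirroring CrawlInfo / NUMCRAWLSTATS.
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct ExampleCrawlInfo {
        int64_t m_stats[8];  // crawl counters, like CrawlInfo's first fields
        int16_t m_collnum;   // which collection wrote the file
};

// returns true and fills "ci" only if the file exists, is complete and sane
static bool exampleLoadCrawlInfo ( const char *path , int16_t expectedCollnum ,
                                   ExampleCrawlInfo *ci ) {
        memset ( ci , 0 , sizeof(*ci) );
        FILE *f = fopen ( path , "rb" );
        if ( ! f ) return false; // no file yet: caller starts from zero
        size_t n = fread ( ci , 1 , sizeof(*ci) , f );
        fclose ( f );
        bool bad = ( n != sizeof(*ci) );
        // reject "crazy" counters, e.g. written out by corrupted memory
        for ( int32_t i = 0 ; ! bad && i < 8 ; i++ )
                if ( ci->m_stats[i] >  1000000000LL ||
                     ci->m_stats[i] < -1000000000LL ) bad = true;
        // stats stamped with a different collnum are discarded too
        if ( ci->m_collnum != expectedCollnum ) bad = true;
        if ( bad ) { memset ( ci , 0 , sizeof(*ci) ); return false; }
        return true;
}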
  1809. /*
  1810. bool CollectionRec::countEvents ( ) {
  1811. // set our m_numEventsOnHost value
  1812. log("coll: loading event count termlist gbeventcount");
  1813. // temporarily turn off threads
  1814. bool enabled = g_threads.areThreadsEnabled();
  1815. g_threads.disableThreads();
  1816. // count them
  1817. m_numEventsOnHost = 0;
  1818. // 1MB at a time
  1819. int32_t minRecSizes = 1000000;
  1820. // look up this termlist, gbeventcount which we index in XmlDoc.cpp
  1821. int64_t termId = hash64n("gbeventcount") & TERMID_MASK;
  1822. // make datedb key from it
  1823. key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
  1824. key128_t endKey = g_datedb.makeEndKey ( termId , 0 );
  1825. Msg5 msg5;
  1826. RdbList list;
  1827. // . init m_numEventsOnHost by getting the exact length of that
  1828. // termlist on this host
  1829. // . send in the ping request packet so all hosts can total up
  1830. // . Rdb.cpp should be added to incrementally so we should have no
  1831. // double positives.
  1832. // . Rdb.cpp should inspect each datedb rec for this termid in
  1833. // a fast an efficient manner
  1834. loop:
  1835. // use msg5 to get the list, should ALWAYS block since no threads
  1836. if ( ! msg5.getList ( RDB_DATEDB ,
  1837. m_coll ,
  1838. &list ,
  1839. (char *)&startKey ,
  1840. (char *)&endKey ,
  1841. minRecSizes ,
  1842. true , // includeTree ,
  1843. false , // add to cache?
  1844. 0 , // max cache age
  1845. 0 , // startFileNum ,
  1846. -1 , // numFiles ,
  1847. NULL , // state
  1848. NULL , // callback
  1849. 0 , // niceness
  1850. false , // err correction?
  1851. NULL , // cache key ptr
  1852. 0 , // retry num
  1853. -1 , // maxRetries
  1854. true , // compensate for merge
  1855. -1LL , // sync point
  1856. NULL )){// msg5b
  1857. // not allowed to block!
  1858. char *xx=NULL;*xx=0; }
  1859. // scan the list, score is how many valid events from that docid
  1860. uint32_t total = 0;
  1861. for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
  1862. unsigned char *rec = (unsigned char *)list.getCurrentRec();
  1863. // in datedb score is byte #5
  1864. total += (255-rec[5]);
  1865. }
  1866. // declare
  1867. char *lastKeyPtr;
  1868. key128_t newStartKey;
  1869. // add to count. datedb uses half keys so subtract 6 bytes
  1870. // since the termids will be the same...
  1871. //m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
  1872. m_numEventsOnHost += total;
  1873. // bail if under limit
  1874. if ( list.getListSize() < minRecSizes ) goto done;
  1875. // update key
  1876. lastKeyPtr = list.m_listEnd - 10;
  1877. // we make a new start key
  1878. list.getKey ( lastKeyPtr , (char *)&newStartKey );
  1879. // maxxed out?
  1880. if ( newStartKey.n0==0xffffffffffffffffLL &&
  1881. newStartKey.n1==0xffffffffffffffffLL )
  1882. goto done;
  1883. // sanity check
  1884. if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
  1885. if ( newStartKey > endKey ) { char *xx=NULL;*xx=0; }
  1886. // inc it
  1887. newStartKey.n0++;
  1888. // in the top if the bottom wrapped
  1889. if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
  1890. // assign
  1891. startKey = newStartKey;
  1892. // and loop back up for more now
  1893. goto loop;
  1894. done:
  1895. // update all colls count
  1896. g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;
  1897. if ( enabled ) g_threads.enableThreads();
  1898. log("coll: got %"INT32" local events in termlist",m_numEventsOnHost);
  1899. // set "m_hasDocQualityFiler"
  1900. //updateFilters();
  1901. return true;
  1902. }
  1903. */
  1904. bool CollectionRec::rebuildUrlFilters2 ( ) {
  1905. // tell spider loop to update active list
  1906. g_spiderLoop.m_activeListValid = false;
  1907. bool rebuild = true;
  1908. if ( m_numRegExs == 0 )
  1909. rebuild = true;
1910. // don't touch it if not supposed to, as long as we have some already
  1911. //if ( m_urlFiltersProfile != UFP_NONE )
  1912. // rebuild = true;
  1913. // never for custom crawls however
  1914. if ( m_isCustomCrawl )
  1915. rebuild = false;
  1916. char *s = m_urlFiltersProfile.getBufStart();
  1917. // support the old UFP_CUSTOM, etc. numeric values
  1918. if ( !strcmp(s,"0" ) )
  1919. s = "custom";
  1920. // UFP_WEB SUPPORT
  1921. if ( !strcmp(s,"1" ) )
  1922. s = "web";
  1923. // UFP_NEWS
  1924. if ( !strcmp(s,"2" ) )
  1925. s = "shallow";
  1926. // leave custom profiles alone
  1927. if ( !strcmp(s,"custom" ) )
  1928. rebuild = false;
  1929. //if ( m_numRegExs > 0 && strcmp(m_regExs[m_numRegExs-1],"default") )
  1930. // addDefault = true;
  1931. if ( ! rebuild ) return true;
  1932. if ( !strcmp(s,"shallow" ) )
  1933. return rebuildShallowRules();
  1934. //if ( strcmp(s,"web") )
  1935. // just fall through for that
  1936. if ( !strcmp(s,"english") )
  1937. return rebuildLangRules( "en","com,us,gov");
  1938. if ( !strcmp(s,"german") )
  1939. return rebuildLangRules( "de","de");
  1940. if ( !strcmp(s,"french") )
  1941. return rebuildLangRules( "fr","fr");
  1942. if ( !strcmp(s,"norwegian") )
  1943. return rebuildLangRules( "nl","nl");
  1944. if ( !strcmp(s,"spanish") )
  1945. return rebuildLangRules( "es","es");
  1946. //if ( m_urlFiltersProfile == UFP_EURO )
  1947. // return rebuildLangRules( "de,fr,nl,es,sv,no,it",
  1948. // "com,gov,org,de,fr,nl,es,sv,no,it");
  1949. if ( !strcmp(s,"romantic") )
  1950. return rebuildLangRules("en,de,fr,nl,es,sv,no,it,fi,pt",
  1951. "de,fr,nl,es,sv,no,it,fi,pt,"
  1952. "com,gov,org"
  1953. );
  1954. if ( !strcmp(s,"chinese") )
  1955. return rebuildLangRules( "zh_cn,zh_tw","cn");
  1956. int32_t n = 0;
  1957. /*
  1958. m_regExs[n].set("default");
  1959. m_regExs[n].nullTerm();
  1960. m_spiderFreqs [n] = 30; // 30 days default
  1961. m_spiderPriorities[n] = 0;
  1962. m_maxSpidersPerRule[n] = 99;
  1963. m_spiderIpWaits[n] = 1000;
  1964. m_spiderIpMaxSpiders[n] = 7;
  1965. m_harvestLinks[n] = 1;
  1966. */
  1967. // max spiders per ip
  1968. int32_t ipms = 7;
  1969. m_regExs[n].set("isreindex");
  1970. m_harvestLinks [n] = 1;
  1971. m_spiderFreqs [n] = 0; // 30 days default
  1972. m_maxSpidersPerRule [n] = 99; // max spiders
  1973. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  1974. m_spiderIpWaits [n] = 1000; // same ip wait
  1975. m_spiderPriorities [n] = 80;
  1976. n++;
  1977. m_regExs[n].set("ismedia");
  1978. m_harvestLinks [n] = 1;
  1979. m_spiderFreqs [n] = 0; // 30 days default
  1980. m_maxSpidersPerRule [n] = 99; // max spiders
  1981. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  1982. m_spiderIpWaits [n] = 1000; // same ip wait
  1983. m_spiderPriorities [n] = 100; // delete!
  1984. m_forceDelete [n] = 1;
  1985. n++;
  1986. // if not in the site list then nuke it
  1987. m_regExs[n].set("!ismanualadd && !insitelist");
  1988. m_harvestLinks [n] = 1;
  1989. m_spiderFreqs [n] = 0; // 30 days default
  1990. m_maxSpidersPerRule [n] = 99; // max spiders
  1991. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  1992. m_spiderIpWaits [n] = 1000; // same ip wait
  1993. m_spiderPriorities [n] = 100;
  1994. m_forceDelete [n] = 1;
  1995. n++;
  1996. m_regExs[n].set("errorcount>=3 && hastmperror");
  1997. m_harvestLinks [n] = 1;
  1998. m_spiderFreqs [n] = 1; // 30 days default
  1999. m_maxSpidersPerRule [n] = 1; // max spiders
  2000. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2001. m_spiderIpWaits [n] = 1000; // same ip wait
  2002. m_spiderPriorities [n] = 100;
  2003. m_forceDelete [n] = 1;
  2004. n++;
  2005. m_regExs[n].set("errorcount>=1 && hastmperror");
  2006. m_harvestLinks [n] = 1;
  2007. m_spiderFreqs [n] = 1; // 30 days default
  2008. m_maxSpidersPerRule [n] = 1; // max spiders
  2009. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2010. m_spiderIpWaits [n] = 1000; // same ip wait
  2011. m_spiderPriorities [n] = 45;
  2012. if ( ! strcmp(s,"news") )
  2013. m_spiderFreqs [n] = .00347; // 5 mins
  2014. n++;
2015. // a non-temporary error, like a 404? retry after 5 days i guess
  2016. m_regExs[n].set("errorcount>=1");
  2017. m_harvestLinks [n] = 1;
  2018. m_spiderFreqs [n] = 5; // 5 day retry
  2019. m_maxSpidersPerRule [n] = 1; // max spiders
  2020. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2021. m_spiderIpWaits [n] = 1000; // same ip wait
  2022. m_spiderPriorities [n] = 2;
  2023. m_forceDelete [n] = 1;
  2024. n++;
  2025. m_regExs[n].set("isaddurl");
  2026. m_harvestLinks [n] = 1;
  2027. m_spiderFreqs [n] = 7; // 30 days default
  2028. m_maxSpidersPerRule [n] = 99; // max spiders
  2029. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2030. m_spiderIpWaits [n] = 1000; // same ip wait
  2031. m_spiderPriorities [n] = 85;
  2032. if ( ! strcmp(s,"news") )
  2033. m_spiderFreqs [n] = .00347; // 5 mins
  2034. n++;
2035. // >7 unique c block parent request urls means it is important!
  2036. m_regExs[n].set("numinlinks>7 && isnew");
  2037. m_harvestLinks [n] = 1;
  2038. m_spiderFreqs [n] = 7; // 30 days default
  2039. m_maxSpidersPerRule [n] = 9; // max spiders
  2040. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2041. m_spiderIpWaits [n] = 1000; // same ip wait
  2042. m_spiderPriorities [n] = 52;
  2043. if ( ! strcmp(s,"news") )
  2044. m_spiderFreqs [n] = .00347; // 5 mins
  2045. n++;
2046. // >7 unique c block parent request urls means it is important!
  2047. m_regExs[n].set("numinlinks>7");
  2048. m_harvestLinks [n] = 1;
  2049. m_spiderFreqs [n] = 7; // 30 days default
  2050. m_maxSpidersPerRule [n] = 9; // max spiders
  2051. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2052. m_spiderIpWaits [n] = 1000; // same ip wait
  2053. m_spiderPriorities [n] = 51;
  2054. if ( ! strcmp(s,"news") )
  2055. m_spiderFreqs [n] = .00347; // 5 mins
  2056. n++;
  2057. m_regExs[n].set("hopcount==0 && iswww && isnew");
  2058. m_harvestLinks [n] = 1;
  2059. m_spiderFreqs [n] = 7; // 30 days default
  2060. m_maxSpidersPerRule [n] = 9; // max spiders
  2061. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2062. m_spiderIpWaits [n] = 1000; // same ip wait
  2063. m_spiderPriorities [n] = 50;
  2064. if ( ! strcmp(s,"news") )
  2065. m_spiderFreqs [n] = .00347; // 5 mins
  2066. n++;
  2067. m_regExs[n].set("hopcount==0 && iswww");
  2068. m_harvestLinks [n] = 1;
  2069. m_spiderFreqs [n] = 7.0; // days b4 respider
  2070. m_maxSpidersPerRule [n] = 9; // max spiders
  2071. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2072. m_spiderIpWaits [n] = 1000; // same ip wait
  2073. m_spiderPriorities [n] = 48;
  2074. if ( ! strcmp(s,"news") )
  2075. m_spiderFreqs [n] = .00347; // 5 mins
  2076. n++;
  2077. m_regExs[n].set("hopcount==0 && isnew");
  2078. m_harvestLinks [n] = 1;
  2079. m_spiderFreqs [n] = 7.0;
  2080. m_maxSpidersPerRule [n] = 9; // max spiders
  2081. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2082. m_spiderIpWaits [n] = 1000; // same ip wait
  2083. m_spiderPriorities [n] = 49;
  2084. if ( ! strcmp(s,"news") )
  2085. m_spiderFreqs [n] = .00347; // 5 mins
  2086. n++;
  2087. m_regExs[n].set("hopcount==0");
  2088. m_harvestLinks [n] = 1;
  2089. m_spiderFreqs [n] = 10.0;
  2090. m_maxSpidersPerRule [n] = 9; // max spiders
  2091. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2092. m_spiderIpWaits [n] = 1000; // same ip wait
  2093. m_spiderPriorities [n] = 47;
  2094. if ( ! strcmp(s,"news") )
  2095. m_spiderFreqs [n] = .00347; // 5 mins
  2096. n++;
  2097. m_regExs[n].set("isparentrss && isnew");
  2098. m_harvestLinks [n] = 1;
  2099. m_spiderFreqs [n] = 7; // 30 days default
  2100. m_maxSpidersPerRule [n] = 9; // max spiders
  2101. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2102. m_spiderIpWaits [n] = 1000; // same ip wait
  2103. m_spiderPriorities [n] = 45;
  2104. if ( ! strcmp(s,"news") )
  2105. m_spiderFreqs [n] = .00347; // 5 mins
  2106. n++;
  2107. m_regExs[n].set("isparentsitemap && isnew");
  2108. m_harvestLinks [n] = 1;
  2109. m_spiderFreqs [n] = 7; // 30 days default
  2110. m_maxSpidersPerRule [n] = 9; // max spiders
  2111. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2112. m_spiderIpWaits [n] = 1000; // same ip wait
  2113. m_spiderPriorities [n] = 44;
  2114. if ( ! strcmp(s,"news") )
  2115. m_spiderFreqs [n] = .00347; // 5 mins
  2116. n++;
  2117. m_regExs[n].set("isparentrss");
  2118. m_harvestLinks [n] = 1;
  2119. m_spiderFreqs [n] = 20.0; // 30 days default
  2120. m_maxSpidersPerRule [n] = 9; // max spiders
  2121. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2122. m_spiderIpWaits [n] = 1000; // same ip wait
  2123. m_spiderPriorities [n] = 43;
  2124. if ( ! strcmp(s,"news") )
  2125. m_spiderFreqs [n] = .00347; // 5 mins
  2126. n++;
  2127. m_regExs[n].set("isparentsitemap");
  2128. m_harvestLinks [n] = 1;
  2129. m_spiderFreqs [n] = 20.0; // 30 days default
  2130. m_maxSpidersPerRule [n] = 9; // max spiders
  2131. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2132. m_spiderIpWaits [n] = 1000; // same ip wait
  2133. m_spiderPriorities [n] = 42;
  2134. if ( ! strcmp(s,"news") )
  2135. m_spiderFreqs [n] = .00347; // 5 mins
  2136. n++;
  2137. m_regExs[n].set("hopcount==1 && isnew");
  2138. m_harvestLinks [n] = 1;
  2139. m_spiderFreqs [n] = 20.0;
  2140. m_maxSpidersPerRule [n] = 9; // max spiders
  2141. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2142. m_spiderIpWaits [n] = 1000; // same ip wait
  2143. m_spiderPriorities [n] = 40;
  2144. if ( ! strcmp(s,"news") )
  2145. m_spiderFreqs [n] = .04166; // 60 minutes
  2146. n++;
  2147. m_regExs[n].set("hopcount==1");
  2148. m_harvestLinks [n] = 1;
  2149. m_spiderFreqs [n] = 20.0;
  2150. m_maxSpidersPerRule [n] = 9; // max spiders
  2151. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2152. m_spiderIpWaits [n] = 1000; // same ip wait
  2153. m_spiderPriorities [n] = 39;
  2154. if ( ! strcmp(s,"news") )
  2155. m_spiderFreqs [n] = .04166; // 60 minutes
  2156. n++;
  2157. m_regExs[n].set("hopcount==2 && isnew");
  2158. m_harvestLinks [n] = 1;
  2159. m_spiderFreqs [n] = 40;
  2160. m_maxSpidersPerRule [n] = 9; // max spiders
  2161. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2162. m_spiderIpWaits [n] = 1000; // same ip wait
  2163. m_spiderPriorities [n] = 30;
2164. // do not harvest links if we are spidering NEWS
  2165. if ( ! strcmp(s,"news") ) {
  2166. m_spiderFreqs [n] = 5.0;
  2167. m_harvestLinks [n] = 0;
  2168. }
  2169. n++;
  2170. m_regExs[n].set("hopcount==2");
  2171. m_harvestLinks [n] = 1;
  2172. m_spiderFreqs [n] = 40;
  2173. m_maxSpidersPerRule [n] = 9; // max spiders
  2174. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2175. m_spiderIpWaits [n] = 1000; // same ip wait
  2176. m_spiderPriorities [n] = 29;
2177. // do not harvest links if we are spidering NEWS
  2178. if ( ! strcmp(s,"news") ) {
  2179. m_spiderFreqs [n] = 5.0;
  2180. m_harvestLinks [n] = 0;
  2181. }
  2182. n++;
  2183. m_regExs[n].set("hopcount>=3 && isnew");
  2184. m_harvestLinks [n] = 1;
  2185. m_spiderFreqs [n] = 60;
  2186. m_maxSpidersPerRule [n] = 9; // max spiders
  2187. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2188. m_spiderIpWaits [n] = 1000; // same ip wait
  2189. m_spiderPriorities [n] = 20;
2190. // turn off spidering if hopcount is too big and we are spidering NEWS
  2191. if ( ! strcmp(s,"news") ) {
  2192. m_maxSpidersPerRule [n] = 0;
  2193. m_harvestLinks [n] = 0;
  2194. }
  2195. else {
  2196. n++;
  2197. }
  2198. m_regExs[n].set("hopcount>=3");
  2199. m_harvestLinks [n] = 1;
  2200. m_spiderFreqs [n] = 60;
  2201. m_maxSpidersPerRule [n] = 9; // max spiders
  2202. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2203. m_spiderIpWaits [n] = 1000; // same ip wait
  2204. m_spiderPriorities [n] = 19;
2205. // turn off spidering if hopcount is too big and we are spidering NEWS
  2206. if ( ! strcmp(s,"news") ) {
  2207. m_maxSpidersPerRule [n] = 0;
  2208. m_harvestLinks [n] = 0;
  2209. }
  2210. else {
  2211. n++;
  2212. }
  2213. /*
  2214. m_regExs[n].set("isnew");
  2215. m_harvestLinks [n] = 1;
  2216. m_spiderFreqs [n] = resp4;
  2217. m_maxSpidersPerRule [n] = 9; // max spiders
  2218. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2219. m_spiderIpWaits [n] = 1000; // same ip wait
  2220. m_spiderPriorities [n] = 2;
  2221. n++;
  2222. */
  2223. m_regExs[n].set("default");
  2224. m_harvestLinks [n] = 1;
  2225. m_spiderFreqs [n] = 60;
  2226. m_maxSpidersPerRule [n] = 9; // max spiders
  2227. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2228. m_spiderIpWaits [n] = 1000; // same ip wait
  2229. m_spiderPriorities [n] = 1;
  2230. if ( ! strcmp(s,"news") ) {
  2231. m_maxSpidersPerRule [n] = 0;
  2232. m_harvestLinks [n] = 0;
  2233. }
  2234. n++;
  2235. m_numRegExs = n;
  2236. m_numRegExs2 = n;
  2237. m_numRegExs3 = n;
  2238. m_numRegExs10 = n;
  2239. m_numRegExs5 = n;
  2240. m_numRegExs6 = n;
  2241. m_numRegExs8 = n;
  2242. m_numRegExs7 = n;
  2243. // more rules
  2244. //m_spiderDiffbotApiNum[n] = 1;
  2245. //m_numRegExs11++;
  2246. //m_spiderDiffbotApiUrl[n].set("");
  2247. //m_spiderDiffbotApiUrl[n].nullTerm();
  2248. //m_numRegExs11++;
  2249. return true;
  2250. }
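// Editor's note: illustrative sketch, not original code. Each "n++" block
// above fills one row across the parallel arrays (m_regExs, m_spiderFreqs,
// m_spiderPriorities, ...). Bundling a row into a struct makes the shape of
// the url-filter table explicit; the names below are hypothetical.
#include <stdint.h>

struct ExampleUrlFilterRule {
        const char *m_expression;       // e.g. "hopcount==0 && iswww"
        float       m_respiderDays;     // days before a matching url respiders
        int32_t     m_maxSpiders;       // concurrency cap for the whole rule
        int32_t     m_maxSpidersPerIp;  // concurrency cap per IP
        int32_t     m_sameIpWaitMs;     // crawl delay per IP in milliseconds
        int32_t     m_priority;         // higher priority is spidered sooner
        bool        m_harvestLinks;     // follow outlinks of matching pages?
        bool        m_forceDelete;      // matching pages get removed instead
};

// append one rule, mirroring the repeated "m_regExs[n].set(...); ...; n++;"
static int32_t exampleAddRule ( ExampleUrlFilterRule *table , int32_t n ,
                                ExampleUrlFilterRule row ) {
        table[n] = row;
        return n + 1;
}

// usage, mirroring the first two rows built above:
//   ExampleUrlFilterRule rules[96];
//   int32_t n = 0;
//   ExampleUrlFilterRule r1 = { "isreindex", 0, 99, 1, 1000, 80, true, false };
//   ExampleUrlFilterRule r2 = { "ismedia",   0, 99, 1, 1000, 100, true, true };
//   n = exampleAddRule ( rules , n , r1 );
//   n = exampleAddRule ( rules , n , r2 );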
  2251. bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
  2252. // max spiders per ip
  2253. int32_t ipms = 7;
  2254. int32_t n = 0;
  2255. m_regExs[n].set("isreindex");
  2256. m_harvestLinks [n] = 1;
  2257. m_spiderFreqs [n] = 0; // 30 days default
  2258. m_maxSpidersPerRule [n] = 99; // max spiders
  2259. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2260. m_spiderIpWaits [n] = 1000; // same ip wait
  2261. m_spiderPriorities [n] = 80;
  2262. n++;
  2263. m_regExs[n].set("ismedia");
  2264. m_harvestLinks [n] = 1;
  2265. m_spiderFreqs [n] = 0; // 30 days default
  2266. m_maxSpidersPerRule [n] = 99; // max spiders
  2267. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2268. m_spiderIpWaits [n] = 1000; // same ip wait
  2269. m_spiderPriorities [n] = 100; // delete!
  2270. m_forceDelete [n] = 1;
  2271. n++;
  2272. // if not in the site list then nuke it
  2273. m_regExs[n].set("!ismanualadd && !insitelist");
  2274. m_harvestLinks [n] = 1;
  2275. m_spiderFreqs [n] = 0; // 30 days default
  2276. m_maxSpidersPerRule [n] = 99; // max spiders
  2277. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2278. m_spiderIpWaits [n] = 1000; // same ip wait
  2279. m_spiderPriorities [n] = 100; // delete!
  2280. m_forceDelete [n] = 1;
  2281. n++;
  2282. m_regExs[n].set("errorcount>=3 && hastmperror");
  2283. m_harvestLinks [n] = 1;
  2284. m_spiderFreqs [n] = 1; // 30 days default
  2285. m_maxSpidersPerRule [n] = 1; // max spiders
  2286. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2287. m_spiderIpWaits [n] = 1000; // same ip wait
  2288. m_spiderPriorities [n] = 100;
  2289. m_forceDelete [n] = 1;
  2290. n++;
  2291. m_regExs[n].set("errorcount>=1 && hastmperror");
  2292. m_harvestLinks [n] = 1;
  2293. m_spiderFreqs [n] = 1; // 30 days default
  2294. m_maxSpidersPerRule [n] = 1; // max spiders
  2295. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2296. m_spiderIpWaits [n] = 1000; // same ip wait
  2297. m_spiderPriorities [n] = 45;
  2298. n++;
  2299. m_regExs[n].set("isaddurl");
  2300. m_harvestLinks [n] = 1;
  2301. m_spiderFreqs [n] = 7; // 30 days default
  2302. m_maxSpidersPerRule [n] = 99; // max spiders
  2303. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2304. m_spiderIpWaits [n] = 1000; // same ip wait
  2305. m_spiderPriorities [n] = 85;
  2306. n++;
  2307. m_regExs[n].reset();
  2308. m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && tld==%s",
  2309. tldStr);
  2310. m_harvestLinks [n] = 1;
  2311. m_spiderFreqs [n] = 7; // 30 days default
  2312. m_maxSpidersPerRule [n] = 9; // max spiders
  2313. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2314. m_spiderIpWaits [n] = 1000; // same ip wait
  2315. m_spiderPriorities [n] = 50;
  2316. n++;
  2317. m_regExs[n].reset();
  2318. m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && "
  2319. "parentlang==%s,xx"
  2320. ,langStr);
  2321. m_harvestLinks [n] = 1;
  2322. m_spiderFreqs [n] = 7; // 30 days default
  2323. m_maxSpidersPerRule [n] = 9; // max spiders
  2324. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2325. m_spiderIpWaits [n] = 1000; // same ip wait
  2326. m_spiderPriorities [n] = 50;
  2327. n++;
  2328. // m_regExs[n].set("hopcount==0 && iswww && isnew");
  2329. // m_harvestLinks [n] = 1;
  2330. // m_spiderFreqs [n] = 7; // 30 days default
  2331. // m_maxSpidersPerRule [n] = 9; // max spiders
  2332. // m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2333. // m_spiderIpWaits [n] = 1000; // same ip wait
  2334. // m_spiderPriorities [n] = 20;
  2335. // n++;
  2336. m_regExs[n].reset();
  2337. m_regExs[n].safePrintf("hopcount==0 && iswww && tld==%s",tldStr);
  2338. m_harvestLinks [n] = 1;
  2339. m_spiderFreqs [n] = 7.0; // days b4 respider
  2340. m_maxSpidersPerRule [n] = 9; // max spiders
  2341. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2342. m_spiderIpWaits [n] = 1000; // same ip wait
  2343. m_spiderPriorities [n] = 48;
  2344. n++;
  2345. m_regExs[n].reset();
  2346. m_regExs[n].safePrintf("hopcount==0 && iswww && parentlang==%s,xx",
  2347. langStr);
  2348. m_harvestLinks [n] = 1;
  2349. m_spiderFreqs [n] = 7.0; // days b4 respider
  2350. m_maxSpidersPerRule [n] = 9; // max spiders
  2351. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2352. m_spiderIpWaits [n] = 1000; // same ip wait
  2353. m_spiderPriorities [n] = 48;
  2354. n++;
  2355. m_regExs[n].set("hopcount==0 && iswww");
  2356. m_harvestLinks [n] = 1;
  2357. m_spiderFreqs [n] = 7.0; // days b4 respider
  2358. m_maxSpidersPerRule [n] = 9; // max spiders
  2359. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2360. m_spiderIpWaits [n] = 1000; // same ip wait
  2361. m_spiderPriorities [n] = 19;
  2362. n++;
  2363. m_regExs[n].reset();
  2364. m_regExs[n].safePrintf("hopcount==0 && isnew && tld==%s",tldStr);
  2365. m_harvestLinks [n] = 1;
  2366. m_spiderFreqs [n] = 7.0;
  2367. m_maxSpidersPerRule [n] = 9; // max spiders
  2368. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2369. m_spiderIpWaits [n] = 1000; // same ip wait
  2370. m_spiderPriorities [n] = 49;
  2371. n++;
  2372. m_regExs[n].reset();
  2373. m_regExs[n].safePrintf("hopcount==0 && isnew && parentlang==%s,xx",
  2374. langStr);
  2375. m_harvestLinks [n] = 1;
  2376. m_spiderFreqs [n] = 7.0;
  2377. m_maxSpidersPerRule [n] = 9; // max spiders
  2378. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2379. m_spiderIpWaits [n] = 1000; // same ip wait
  2380. m_spiderPriorities [n] = 49;
  2381. n++;
  2382. m_regExs[n].set("hopcount==0 && isnew");
  2383. m_harvestLinks [n] = 1;
  2384. m_spiderFreqs [n] = 7.0;
  2385. m_maxSpidersPerRule [n] = 9; // max spiders
  2386. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2387. m_spiderIpWaits [n] = 1000; // same ip wait
  2388. m_spiderPriorities [n] = 18;
  2389. n++;
  2390. m_regExs[n].reset();
  2391. m_regExs[n].safePrintf("hopcount==0 && tld==%s",tldStr);
  2392. m_harvestLinks [n] = 1;
  2393. m_spiderFreqs [n] = 10.0;
  2394. m_maxSpidersPerRule [n] = 9; // max spiders
  2395. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2396. m_spiderIpWaits [n] = 1000; // same ip wait
  2397. m_spiderPriorities [n] = 47;
  2398. n++;
  2399. m_regExs[n].reset();
  2400. m_regExs[n].safePrintf("hopcount==0 && parentlang==%s,xx",langStr);
  2401. m_harvestLinks [n] = 1;
  2402. m_spiderFreqs [n] = 10.0;
  2403. m_maxSpidersPerRule [n] = 9; // max spiders
  2404. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2405. m_spiderIpWaits [n] = 1000; // same ip wait
  2406. m_spiderPriorities [n] = 47;
  2407. n++;
  2408. m_regExs[n].set("hopcount==0");
  2409. m_harvestLinks [n] = 1;
  2410. m_spiderFreqs [n] = 10.0;
  2411. m_maxSpidersPerRule [n] = 9; // max spiders
  2412. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2413. m_spiderIpWaits [n] = 1000; // same ip wait
  2414. m_spiderPriorities [n] = 17;
  2415. n++;
  2416. m_regExs[n].reset();
  2417. m_regExs[n].safePrintf("hopcount==1 && isnew && tld==%s",tldStr);
  2418. m_harvestLinks [n] = 1;
  2419. m_spiderFreqs [n] = 20.0;
  2420. m_maxSpidersPerRule [n] = 9; // max spiders
  2421. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2422. m_spiderIpWaits [n] = 1000; // same ip wait
  2423. m_spiderPriorities [n] = 40;
  2424. n++;
  2425. m_regExs[n].reset();
  2426. m_regExs[n].safePrintf("hopcount==1 && isnew && parentlang==%s,xx",
2427. langStr);
  2428. m_harvestLinks [n] = 1;
  2429. m_spiderFreqs [n] = 20.0;
  2430. m_maxSpidersPerRule [n] = 9; // max spiders
  2431. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2432. m_spiderIpWaits [n] = 1000; // same ip wait
  2433. m_spiderPriorities [n] = 40;
  2434. n++;
  2435. m_regExs[n].set("hopcount==1 && isnew");
  2436. m_harvestLinks [n] = 1;
  2437. m_spiderFreqs [n] = 20.0;
  2438. m_maxSpidersPerRule [n] = 9; // max spiders
  2439. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2440. m_spiderIpWaits [n] = 1000; // same ip wait
  2441. m_spiderPriorities [n] = 16;
  2442. n++;
  2443. m_regExs[n].reset();
  2444. m_regExs[n].safePrintf("hopcount==1 && tld==%s",tldStr);
  2445. m_harvestLinks [n] = 1;
  2446. m_spiderFreqs [n] = 20.0;
  2447. m_maxSpidersPerRule [n] = 9; // max spiders
  2448. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2449. m_spiderIpWaits [n] = 1000; // same ip wait
  2450. m_spiderPriorities [n] = 39;
  2451. n++;
  2452. m_regExs[n].reset();
  2453. m_regExs[n].safePrintf("hopcount==1 && parentlang==%s,xx",langStr);
  2454. m_harvestLinks [n] = 1;
  2455. m_spiderFreqs [n] = 20.0;
  2456. m_maxSpidersPerRule [n] = 9; // max spiders
  2457. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2458. m_spiderIpWaits [n] = 1000; // same ip wait
  2459. m_spiderPriorities [n] = 39;
  2460. n++;
  2461. m_regExs[n].set("hopcount==1");
  2462. m_harvestLinks [n] = 1;
  2463. m_spiderFreqs [n] = 20.0;
  2464. m_maxSpidersPerRule [n] = 9; // max spiders
  2465. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2466. m_spiderIpWaits [n] = 1000; // same ip wait
  2467. m_spiderPriorities [n] = 15;
  2468. n++;
  2469. m_regExs[n].reset();
  2470. m_regExs[n].safePrintf("hopcount==2 && isnew && tld==%s",tldStr);
  2471. m_harvestLinks [n] = 1;
  2472. m_spiderFreqs [n] = 40;
  2473. m_maxSpidersPerRule [n] = 9; // max spiders
  2474. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2475. m_spiderIpWaits [n] = 1000; // same ip wait
  2476. m_spiderPriorities [n] = 30;
  2477. n++;
  2478. m_regExs[n].reset();
  2479. m_regExs[n].safePrintf("hopcount==2 && isnew && parentlang==%s,xx",
  2480. langStr);
  2481. m_harvestLinks [n] = 1;
  2482. m_spiderFreqs [n] = 40;
  2483. m_maxSpidersPerRule [n] = 9; // max spiders
  2484. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2485. m_spiderIpWaits [n] = 1000; // same ip wait
  2486. m_spiderPriorities [n] = 30;
  2487. n++;
  2488. m_regExs[n].set("hopcount==2 && isnew");
  2489. m_harvestLinks [n] = 1;
  2490. m_spiderFreqs [n] = 40;
  2491. m_maxSpidersPerRule [n] = 9; // max spiders
  2492. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2493. m_spiderIpWaits [n] = 1000; // same ip wait
  2494. m_spiderPriorities [n] = 14;
  2495. n++;
  2496. m_regExs[n].reset();
  2497. m_regExs[n].safePrintf("hopcount==2 && tld==%s",tldStr);
  2498. m_harvestLinks [n] = 1;
  2499. m_spiderFreqs [n] = 40;
  2500. m_maxSpidersPerRule [n] = 9; // max spiders
  2501. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2502. m_spiderIpWaits [n] = 1000; // same ip wait
  2503. m_spiderPriorities [n] = 29;
  2504. n++;
  2505. m_regExs[n].reset();
  2506. m_regExs[n].safePrintf("hopcount==2 && parentlang==%s,xx",langStr);
  2507. m_harvestLinks [n] = 1;
  2508. m_spiderFreqs [n] = 40;
  2509. m_maxSpidersPerRule [n] = 9; // max spiders
  2510. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2511. m_spiderIpWaits [n] = 1000; // same ip wait
  2512. m_spiderPriorities [n] = 29;
  2513. n++;
  2514. m_regExs[n].set("hopcount==2");
  2515. m_harvestLinks [n] = 1;
  2516. m_spiderFreqs [n] = 40;
  2517. m_maxSpidersPerRule [n] = 9; // max spiders
  2518. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2519. m_spiderIpWaits [n] = 1000; // same ip wait
  2520. m_spiderPriorities [n] = 13;
  2521. n++;
  2522. m_regExs[n].reset();
  2523. m_regExs[n].safePrintf("hopcount>=3 && isnew && tld==%s",tldStr);
  2524. m_harvestLinks [n] = 1;
  2525. m_spiderFreqs [n] = 60;
  2526. m_maxSpidersPerRule [n] = 9; // max spiders
  2527. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2528. m_spiderIpWaits [n] = 1000; // same ip wait
  2529. m_spiderPriorities [n] = 22;
  2530. n++;
  2531. m_regExs[n].reset();
  2532. m_regExs[n].safePrintf("hopcount>=3 && isnew && parentlang==%s,xx",
  2533. langStr);
  2534. m_harvestLinks [n] = 1;
  2535. m_spiderFreqs [n] = 60;
  2536. m_maxSpidersPerRule [n] = 9; // max spiders
  2537. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2538. m_spiderIpWaits [n] = 1000; // same ip wait
  2539. m_spiderPriorities [n] = 22;
  2540. n++;
  2541. m_regExs[n].set("hopcount>=3 && isnew");
  2542. m_harvestLinks [n] = 1;
  2543. m_spiderFreqs [n] = 60;
  2544. m_maxSpidersPerRule [n] = 9; // max spiders
  2545. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2546. m_spiderIpWaits [n] = 1000; // same ip wait
  2547. m_spiderPriorities [n] = 12;
  2548. n++;
  2549. m_regExs[n].reset();
  2550. m_regExs[n].safePrintf("hopcount>=3 && tld==%s",tldStr);
  2551. m_harvestLinks [n] = 1;
  2552. m_spiderFreqs [n] = 60;
  2553. m_maxSpidersPerRule [n] = 9; // max spiders
  2554. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2555. m_spiderIpWaits [n] = 1000; // same ip wait
  2556. m_spiderPriorities [n] = 21;
  2557. n++;
  2558. m_regExs[n].reset();
  2559. m_regExs[n].safePrintf("hopcount>=3 && parentlang==%s,xx",langStr);
  2560. m_harvestLinks [n] = 1;
  2561. m_spiderFreqs [n] = 60;
  2562. m_maxSpidersPerRule [n] = 9; // max spiders
  2563. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2564. m_spiderIpWaits [n] = 1000; // same ip wait
  2565. m_spiderPriorities [n] = 21;
  2566. n++;
  2567. m_regExs[n].set("hopcount>=3");
  2568. m_harvestLinks [n] = 1;
  2569. m_spiderFreqs [n] = 60;
  2570. m_maxSpidersPerRule [n] = 9; // max spiders
  2571. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2572. m_spiderIpWaits [n] = 1000; // same ip wait
  2573. m_spiderPriorities [n] = 11;
  2574. n++;
  2575. m_regExs[n].set("default");
  2576. m_harvestLinks [n] = 1;
  2577. m_spiderFreqs [n] = 60;
  2578. m_maxSpidersPerRule [n] = 9; // max spiders
  2579. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2580. m_spiderIpWaits [n] = 1000; // same ip wait
  2581. m_spiderPriorities [n] = 1;
  2582. n++;
  2583. m_numRegExs = n;
  2584. m_numRegExs2 = n;
  2585. m_numRegExs3 = n;
  2586. m_numRegExs10 = n;
  2587. m_numRegExs5 = n;
  2588. m_numRegExs6 = n;
  2589. m_numRegExs8 = n;
  2590. m_numRegExs7 = n;
2591. // done rebuilding language-specific rules
  2592. return true;
  2593. }
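// Editor's note: illustrative sketch, not original code. rebuildLangRules()
// above takes a comma-separated language list and a comma-separated TLD list
// and, for each hopcount bucket, emits one rule keyed on tld== and one keyed
// on parentlang==. The helper below shows that expansion for a single bucket;
// the function name and buffer handling are hypothetical.
#include <stdio.h>
#include <stdint.h>

// exampleBuildLangRule(buf, sizeof(buf), 1, true, "de", NULL)
//   -> "hopcount==1 && isnew && parentlang==de,xx"
// exampleBuildLangRule(buf, sizeof(buf), 1, false, NULL, "de")
//   -> "hopcount==1 && tld==de"
static void exampleBuildLangRule ( char *buf , size_t bufSize ,
                                   int32_t hopcount , bool isNew ,
                                   const char *langStr , const char *tldStr ) {
        if ( tldStr )
                snprintf ( buf , bufSize ,
                           "hopcount==%d%s && tld==%s" ,
                           (int)hopcount , isNew ? " && isnew" : "" , tldStr );
        else
                snprintf ( buf , bufSize ,
                           "hopcount==%d%s && parentlang==%s,xx" ,
                           (int)hopcount , isNew ? " && isnew" : "" , langStr );
}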
  2594. bool CollectionRec::rebuildShallowRules ( ) {
  2595. // max spiders per ip
  2596. int32_t ipms = 7;
  2597. int32_t n = 0;
  2598. m_regExs[n].set("isreindex");
  2599. m_harvestLinks [n] = 1;
  2600. m_spiderFreqs [n] = 0; // 30 days default
  2601. m_maxSpidersPerRule [n] = 99; // max spiders
  2602. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2603. m_spiderIpWaits [n] = 1000; // same ip wait
  2604. m_spiderPriorities [n] = 80;
  2605. n++;
  2606. m_regExs[n].set("ismedia");
  2607. m_harvestLinks [n] = 1;
  2608. m_spiderFreqs [n] = 0; // 30 days default
  2609. m_maxSpidersPerRule [n] = 99; // max spiders
  2610. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2611. m_spiderIpWaits [n] = 1000; // same ip wait
  2612. m_spiderPriorities [n] = 100; // delete!
  2613. m_forceDelete [n] = 1;
  2614. n++;
  2615. // if not in the site list then nuke it
  2616. m_regExs[n].set("!ismanualadd && !insitelist");
  2617. m_harvestLinks [n] = 1;
  2618. m_spiderFreqs [n] = 0; // 30 days default
  2619. m_maxSpidersPerRule [n] = 99; // max spiders
  2620. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2621. m_spiderIpWaits [n] = 1000; // same ip wait
  2622. m_spiderPriorities [n] = 100; // delete!
  2623. m_forceDelete [n] = 1;
  2624. n++;
  2625. m_regExs[n].set("errorcount>=3 && hastmperror");
  2626. m_harvestLinks [n] = 1;
  2627. m_spiderFreqs [n] = 1; // 30 days default
  2628. m_maxSpidersPerRule [n] = 1; // max spiders
  2629. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2630. m_spiderIpWaits [n] = 1000; // same ip wait
  2631. m_spiderPriorities [n] = 100;
  2632. m_forceDelete [n] = 1;
  2633. n++;
  2634. m_regExs[n].set("errorcount>=1 && hastmperror");
  2635. m_harvestLinks [n] = 1;
  2636. m_spiderFreqs [n] = 1; // 30 days default
  2637. m_maxSpidersPerRule [n] = 1; // max spiders
  2638. m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
  2639. m_spiderIpWaits [n] = 1000; // same ip wait
  2640. m_spiderPriorities [n] = 45;
  2641. n++;
  2642. m_regExs[n].set("isaddurl");
  2643. m_harvestLinks [n] = 1;
  2644. m_spiderFreqs [n] = 7; // 30 days default
  2645. m_maxSpidersPerRule [n] = 99; // max spiders
  2646. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2647. m_spiderIpWaits [n] = 1000; // same ip wait
  2648. m_spiderPriorities [n] = 85;
  2649. n++;
  2650. //
  2651. // stop if hopcount>=2 for things tagged shallow in sitelist
  2652. //
  2653. m_regExs[n].set("tag:shallow && hopcount>=2");
  2654. m_harvestLinks [n] = 1;
  2655. m_spiderFreqs [n] = 40;
  2656. m_maxSpidersPerRule [n] = 0; // max spiders
  2657. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2658. m_spiderIpWaits [n] = 1000; // same ip wait
  2659. m_spiderPriorities [n] = 30;
  2660. n++;
  2661. // if # of pages in this site indexed is >= 10 then stop as well...
  2662. m_regExs[n].set("tag:shallow && sitepages>=10");
  2663. m_harvestLinks [n] = 1;
  2664. m_spiderFreqs [n] = 40;
  2665. m_maxSpidersPerRule [n] = 0; // max spiders
  2666. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2667. m_spiderIpWaits [n] = 1000; // same ip wait
  2668. m_spiderPriorities [n] = 30;
  2669. n++;
  2670. m_regExs[n].set("hopcount==0 && iswww && isnew");
  2671. m_harvestLinks [n] = 1;
  2672. m_spiderFreqs [n] = 7; // 30 days default
  2673. m_maxSpidersPerRule [n] = 9; // max spiders
  2674. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2675. m_spiderIpWaits [n] = 1000; // same ip wait
  2676. m_spiderPriorities [n] = 50;
  2677. n++;
  2678. m_regExs[n].set("hopcount==0 && iswww");
  2679. m_harvestLinks [n] = 1;
  2680. m_spiderFreqs [n] = 7.0; // days b4 respider
  2681. m_maxSpidersPerRule [n] = 9; // max spiders
  2682. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2683. m_spiderIpWaits [n] = 1000; // same ip wait
  2684. m_spiderPriorities [n] = 48;
  2685. n++;
  2686. m_regExs[n].set("hopcount==0 && isnew");
  2687. m_harvestLinks [n] = 1;
  2688. m_spiderFreqs [n] = 7.0;
  2689. m_maxSpidersPerRule [n] = 9; // max spiders
  2690. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2691. m_spiderIpWaits [n] = 1000; // same ip wait
  2692. m_spiderPriorities [n] = 49;
  2693. n++;
  2694. m_regExs[n].set("hopcount==0");
  2695. m_harvestLinks [n] = 1;
  2696. m_spiderFreqs [n] = 10.0;
  2697. m_maxSpidersPerRule [n] = 9; // max spiders
  2698. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2699. m_spiderIpWaits [n] = 1000; // same ip wait
  2700. m_spiderPriorities [n] = 47;
  2701. n++;
  2702. m_regExs[n].set("hopcount==1 && isnew");
  2703. m_harvestLinks [n] = 1;
  2704. m_spiderFreqs [n] = 20.0;
  2705. m_maxSpidersPerRule [n] = 9; // max spiders
  2706. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2707. m_spiderIpWaits [n] = 1000; // same ip wait
  2708. m_spiderPriorities [n] = 40;
  2709. n++;
  2710. m_regExs[n].set("hopcount==1");
  2711. m_harvestLinks [n] = 1;
  2712. m_spiderFreqs [n] = 20.0;
  2713. m_maxSpidersPerRule [n] = 9; // max spiders
  2714. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2715. m_spiderIpWaits [n] = 1000; // same ip wait
  2716. m_spiderPriorities [n] = 39;
  2717. n++;
  2718. m_regExs[n].set("hopcount==2 && isnew");
  2719. m_harvestLinks [n] = 1;
  2720. m_spiderFreqs [n] = 40;
  2721. m_maxSpidersPerRule [n] = 9; // max spiders
  2722. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2723. m_spiderIpWaits [n] = 1000; // same ip wait
  2724. m_spiderPriorities [n] = 30;
  2725. n++;
  2726. m_regExs[n].set("hopcount==2");
  2727. m_harvestLinks [n] = 1;
  2728. m_spiderFreqs [n] = 40;
  2729. m_maxSpidersPerRule [n] = 9; // max spiders
  2730. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2731. m_spiderIpWaits [n] = 1000; // same ip wait
  2732. m_spiderPriorities [n] = 29;
  2733. n++;
  2734. m_regExs[n].set("hopcount>=3 && isnew");
  2735. m_harvestLinks [n] = 1;
  2736. m_spiderFreqs [n] = 60;
  2737. m_maxSpidersPerRule [n] = 9; // max spiders
  2738. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2739. m_spiderIpWaits [n] = 1000; // same ip wait
  2740. m_spiderPriorities [n] = 22;
  2741. n++;
  2742. m_regExs[n].set("hopcount>=3");
  2743. m_harvestLinks [n] = 1;
  2744. m_spiderFreqs [n] = 60;
  2745. m_maxSpidersPerRule [n] = 9; // max spiders
  2746. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2747. m_spiderIpWaits [n] = 1000; // same ip wait
  2748. m_spiderPriorities [n] = 21;
  2749. n++;
  2750. m_regExs[n].set("default");
  2751. m_harvestLinks [n] = 1;
  2752. m_spiderFreqs [n] = 60;
  2753. m_maxSpidersPerRule [n] = 9; // max spiders
  2754. m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
  2755. m_spiderIpWaits [n] = 1000; // same ip wait
  2756. m_spiderPriorities [n] = 1;
  2757. n++;
  2758. m_numRegExs = n;
  2759. m_numRegExs2 = n;
  2760. m_numRegExs3 = n;
  2761. m_numRegExs10 = n;
  2762. m_numRegExs5 = n;
  2763. m_numRegExs6 = n;
  2764. m_numRegExs8 = n;
  2765. m_numRegExs7 = n;
  2766. // done rebuilding SHALLOW rules
  2767. return true;
  2768. }
  2769. /*
  2770. bool CrawlInfo::print (SafeBuf *sb ) {
  2771. return sb->safePrintf("objectsAdded:%"INT64"\n"
  2772. "objectsDeleted:%"INT64"\n"
  2773. "urlsConsidered:%"INT64"\n"
  2774. "downloadAttempts:%"INT64"\n"
  2775. "downloadSuccesses:%"INT64"\n"
  2776. "processAttempts:%"INT64"\n"
  2777. "processSuccesses:%"INT64"\n"
  2778. "lastupdate:%"UINT32"\n"
  2779. , m_objectsAdded
  2780. , m_objectsDeleted
  2781. , m_urlsConsidered
  2782. , m_pageDownloadAttempts
  2783. , m_pageDownloadSuccesses
  2784. , m_pageProcessAttempts
  2785. , m_pageProcessSuccesses
  2786. , m_lastUpdateTime
  2787. );
  2788. }
  2789. bool CrawlInfo::setFromSafeBuf (SafeBuf *sb ) {
  2790. return sscanf(sb->getBufStart(),
  2791. "objectsAdded:%"INT64"\n"
  2792. "objectsDeleted:%"INT64"\n"
  2793. "urlsConsidered:%"INT64"\n"
  2794. "downloadAttempts:%"INT64"\n"
  2795. "downloadSuccesses:%"INT64"\n"
  2796. "processAttempts:%"INT64"\n"
  2797. "processSuccesses:%"INT64"\n"
  2798. "lastupdate:%"UINT32"\n"
  2799. , &m_objectsAdded
  2800. , &m_objectsDeleted
  2801. , &m_urlsConsidered
  2802. , &m_pageDownloadAttempts
  2803. , &m_pageDownloadSuccesses
  2804. , &m_pageProcessAttempts
  2805. , &m_pageProcessSuccesses
  2806. , &m_lastUpdateTime
  2807. );
  2808. }
  2809. */
  2810. // returns false on failure and sets g_errno, true otherwise
  2811. bool CollectionRec::save ( ) {
  2812. if ( g_conf.m_readOnlyMode ) return true;
  2813. //File f;
  2814. char tmp[1024];
  2815. //sprintf ( tmp , "%scollections/%"INT32".%s/c.conf",
  2816. // g_hostdb.m_dir,m_id,m_coll);
  2817. // collection name HACK for backwards compatibility
  2818. //if ( m_collLen == 0 )
  2819. // sprintf ( tmp , "%scoll.main/coll.conf", g_hostdb.m_dir);
  2820. //else
  2821. snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/coll.conf",
  2822. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  2823. if ( ! g_parms.saveToXml ( (char *)this , tmp ,OBJ_COLL)) return false;
  2824. // log msg
  2825. //log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
  2826. //
  2827. // save the crawlinfo class in the collectionrec for diffbot
  2828. //
  2829. // SAVE LOCAL
  2830. snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
  2831. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  2832. //log("coll: saving %s",tmp);
  2833. // in case emergency save from malloc core, do not alloc
  2834. char stack[1024];
  2835. SafeBuf sb(stack,1024);
  2836. //m_localCrawlInfo.print ( &sb );
  2837. // binary now
  2838. sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
  2839. if ( sb.safeSave ( tmp ) == -1 ) {
  2840. log("db: failed to save file %s : %s",
  2841. tmp,mstrerror(g_errno));
  2842. g_errno = 0;
  2843. }
  2844. // SAVE GLOBAL
  2845. snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
  2846. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  2847. //log("coll: saving %s",tmp);
  2848. sb.reset();
  2849. //m_globalCrawlInfo.print ( &sb );
  2850. // binary now
  2851. sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
  2852. if ( sb.safeSave ( tmp ) == -1 ) {
  2853. log("db: failed to save file %s : %s",
  2854. tmp,mstrerror(g_errno));
  2855. g_errno = 0;
  2856. }
  2857. // the list of ip addresses that we have detected as being throttled
  2858. // and therefore back off and use proxies for
  2859. sb.reset();
  2860. sb.safePrintf("%scoll.%s.%"INT32"/",
  2861. g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
  2862. m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
  2863. // do not need a save now
  2864. m_needsSave = false;
  2865. // waiting tree is saved in SpiderCache::save() called by Process.cpp
  2866. //SpiderColl *sc = m_spiderColl;
  2867. //if ( ! sc ) return true;
  2868. // save page count table which has # of pages indexed per
  2869. // subdomain/site and firstip for doing quotas in url filters table
  2870. //snprintf ( tmp , 1023, "coll.%s.%"INT32"/pagecounts.dat",
  2871. // m_coll , (int32_t)m_collnum );
  2872. //if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
  2873. // log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
  2874. // g_errno = 0;
  2875. //}
  2876. return true;
  2877. }
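// For illustration, a collection named "main" with collnum 0 (hypothetical
// names) would be saved under g_hostdb.m_dir as:
//   coll.main.0/coll.conf              - all OBJ_COLL parms as xml
//   coll.main.0/localcrawlinfo.dat     - raw binary CrawlInfo
//   coll.main.0/globalcrawlinfo.dat    - raw binary CrawlInfo
//   coll.main.0/ipstouseproxiesfor.dat - m_twitchyTable of throttled ips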
  2878. // calls hasPermission() below
  2879. bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
  2880. int32_t plen;
  2881. char *p = r->getString ( "pwd" , &plen );
  2882. int32_t ip = s->m_ip;
  2883. return hasPermission ( p , plen , ip );
  2884. }
  2885. // . is this ip allowed to act as a spam assassin for this collection?
  2886. bool CollectionRec::isAssassin ( int32_t ip ) {
  2887. // ok, make sure they came from an acceptable IP
  2888. //for ( int32_t i = 0 ; i < m_numSpamIps ; i++ )
  2889. // // they also have a matching IP, so they now have permission
  2890. // if ( m_spamIps[i] == ip ) return true;
  2891. return false;
  2892. }
  2893. // . does this password work for this collection?
  2894. bool CollectionRec::hasPermission ( char *p, int32_t plen , int32_t ip ) {
  2895. // just return true
  2896. // collection permission is checked from Users::verifyColl
  2897. // in User::getUserType for every request
  2898. return true;
  2899. // scan the passwords
  2900. // MDW: no longer, this is too vulnerable!!!
  2901. /*
  2902. for ( int32_t i = 0 ; i < m_numAdminPwds ; i++ ) {
  2903. int32_t len = gbstrlen ( m_adminPwds[i] );
  2904. if ( len != plen ) continue;
  2905. if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
  2906. // otherwise it's a match!
  2907. //goto checkIp;
  2908. // . matching one password is good enough now, default OR
  2909. // . because just matching an IP is good enough security,
  2910. // there is really no need for both IP AND passwd match
  2911. return true;
  2912. }
  2913. */
  2914. // . if had passwords but the provided one didn't match, return false
  2915. // . matching one password is good enough now, default OR
  2916. //if ( m_numPasswords > 0 ) return false;
  2917. // checkIp:
  2918. // ok, make sure they came from an acceptable IP
  2919. //for ( int32_t i = 0 ; i < m_numAdminIps ; i++ )
  2920. // // they also have a matching IP, so they now have permission
  2921. // if ( m_adminIps[i] == ip ) return true;
  2922. // if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
  2923. //if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
  2924. // if they did not match an ip or password, even if both lists
  2925. // are empty, do not allow access... this prevents security breaches
  2926. // by accident
  2927. return false;
  2928. // if there were IPs then they failed to get in
  2929. //if ( m_numAdminIps > 0 ) return false;
  2930. // otherwise, they made it
  2931. //return true;
  2932. }
  2933. // can this ip perform a search or add url on this collection?
  2934. bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
  2935. // get the ip
  2936. int32_t ip = 0; if ( s ) ip = s->m_ip;
  2937. // and the ip domain
  2938. int32_t ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
  2939. // and top 2 bytes for the israel isp that has this huge block
  2940. int32_t ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
  2941. // is it in the ban list?
  2942. /*
  2943. for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
  2944. if ( isIpTop ( m_banIps[i] ) ) {
  2945. if ( m_banIps[i] == ipt ) return false;
  2946. continue;
  2947. }
  2948. // check for ip domain match if this banned ip is an ip domain
  2949. if ( isIpDom ( m_banIps[i] ) ) {
  2950. if ( m_banIps[i] == ipd ) return false;
  2951. continue;
  2952. }
  2953. // otherwise it's just a single banned ip
  2954. if ( m_banIps[i] == ip ) return false;
  2955. }
  2956. */
  2957. // check the encapsulated ip, if any
  2958. // 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
  2959. // /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
  2960. // +nj+state HTTP/1.0
  2961. /*
  2962. if ( encapIp ) {
  2963. ipd = ipdom ( encapIp );
  2964. ip = encapIp;
  2965. for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
  2966. if ( isIpDom ( m_banIps[i] ) ) {
  2967. if ( m_banIps[i] == ipd ) return false;
  2968. continue;
  2969. }
  2970. if ( isIpTop ( m_banIps[i] ) ) {
  2971. if ( m_banIps[i] == ipt ) return false;
  2972. continue;
  2973. }
  2974. if ( m_banIps[i] == ip ) return false;
  2975. }
  2976. }
  2977. */
  2978. return true;
  2979. /*
  2980. // do we have an "only" list?
  2981. if ( m_numSearchIps == 0 ) return true;
  2982. // it must be in that list if we do
  2983. for ( int32_t i = 0 ; i < m_numSearchIps ; i++ ) {
  2984. // check for ip domain match if this banned ip is an ip domain
  2985. if ( isIpDom ( m_searchIps[i] ) ) {
  2986. if ( m_searchIps[i] == ipd ) return true;
  2987. continue;
  2988. }
  2989. // otherwise it's just a single ip
  2990. if ( m_searchIps[i] == ip ) return true;
  2991. }
  2992. */
  2993. // otherwise no permission
  2994. return false;
  2995. }
  2996. bool expandRegExShortcuts ( SafeBuf *sb ) ;
  2997. void nukeDoledb ( collnum_t collnum );
  2998. // rebuild the regexes related to diffbot, such as the one for the URL pattern
  2999. bool CollectionRec::rebuildDiffbotRegexes() {
  3000. //logf(LOG_DEBUG,"db: rebuilding url filters");
  3001. char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
  3002. if ( ucp && ! ucp[0] ) ucp = NULL;
  3003. // get the regexes
  3004. if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
  3005. if ( ucp && ! ucp[0] ) ucp = NULL;
  3006. char *upp = m_diffbotUrlProcessPattern.getBufStart();
  3007. if ( upp && ! upp[0] ) upp = NULL;
  3008. if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
  3009. if ( upp && ! upp[0] ) upp = NULL;
  3010. char *ppp = m_diffbotPageProcessPattern.getBufStart();
  3011. if ( ppp && ! ppp[0] ) ppp = NULL;
  3012. // recompiling regexes starts now
  3013. if ( m_hasucr ) {
  3014. regfree ( &m_ucr );
  3015. m_hasucr = false;
  3016. }
  3017. if ( m_hasupr ) {
  3018. regfree ( &m_upr );
  3019. m_hasupr = false;
  3020. }
  3021. // copy into tmpbuf
  3022. SafeBuf tmp;
  3023. char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
  3024. if ( rx && ! rx[0] ) rx = NULL;
  3025. if ( rx ) {
  3026. tmp.reset();
  3027. tmp.safeStrcpy ( rx );
  3028. expandRegExShortcuts ( &tmp );
  3029. m_hasucr = true;
  3030. }
  3031. if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
  3032. REG_EXTENDED| //REG_ICASE|
  3033. REG_NEWLINE ) ) { // |REG_NOSUB) ) {
  3034. // error!
  3035. log("coll: regcomp %s failed: %s. "
  3036. "Ignoring.",
  3037. rx,mstrerror(errno));
  3038. regfree ( &m_ucr );
  3039. m_hasucr = false;
  3040. }
  3041. rx = m_diffbotUrlProcessRegEx.getBufStart();
  3042. if ( rx && ! rx[0] ) rx = NULL;
  3043. if ( rx ) m_hasupr = true;
  3044. if ( rx ) {
  3045. tmp.reset();
  3046. tmp.safeStrcpy ( rx );
  3047. expandRegExShortcuts ( &tmp );
  3048. m_hasupr = true;
  3049. }
  3050. if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
  3051. REG_EXTENDED| // REG_ICASE|
  3052. REG_NEWLINE ) ) { // |REG_NOSUB) ) {
  3053. // error!
  3054. log("coll: regcomp %s failed: %s. "
  3055. "Ignoring.",
  3056. rx,mstrerror(errno));
  3057. regfree ( &m_upr );
  3058. m_hasupr = false;
  3059. }
  3060. return true;
  3061. }
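// Illustrative sketch only (this helper does not exist in the code base):
// once rebuildDiffbotRegexes() has set m_hasucr, the compiled url crawl
// regex can be tested against a candidate url with a standard regexec()
// call, the same way testRegex() does further below:
//
//   static bool matchesUrlCrawlRegEx ( CollectionRec *cr , char *url ) {
//       if ( ! cr->m_hasucr ) return false;          // nothing compiled
//       // regexec() returns 0 on a match
//       return regexec ( &cr->m_ucr , url , 0 , NULL , 0 ) == 0;
//   }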
  3062. bool CollectionRec::rebuildUrlFiltersDiffbot() {
  3063. //logf(LOG_DEBUG,"db: rebuilding url filters");
  3064. char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
  3065. if ( ucp && ! ucp[0] ) ucp = NULL;
  3066. // if we had a regex, that works for this purpose as well
  3067. if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
  3068. if ( ucp && ! ucp[0] ) ucp = NULL;
  3069. char *upp = m_diffbotUrlProcessPattern.getBufStart();
  3070. if ( upp && ! upp[0] ) upp = NULL;
  3071. // if we had a regex, that works for this purpose as well
  3072. if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
  3073. if ( upp && ! upp[0] ) upp = NULL;
  3074. char *ppp = m_diffbotPageProcessPattern.getBufStart();
  3075. if ( ppp && ! ppp[0] ) ppp = NULL;
  3076. ///////
  3077. //
  3078. // recompile regular expressions
  3079. //
  3080. ///////
  3081. if ( m_hasucr ) {
  3082. regfree ( &m_ucr );
  3083. m_hasucr = false;
  3084. }
  3085. if ( m_hasupr ) {
  3086. regfree ( &m_upr );
  3087. m_hasupr = false;
  3088. }
  3089. // copy into tmpbuf
  3090. SafeBuf tmp;
  3091. char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
  3092. if ( rx && ! rx[0] ) rx = NULL;
  3093. if ( rx ) {
  3094. tmp.reset();
  3095. tmp.safeStrcpy ( rx );
  3096. expandRegExShortcuts ( &tmp );
  3097. m_hasucr = true;
  3098. }
  3099. int32_t err;
  3100. if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
  3101. REG_EXTENDED| //REG_ICASE|
  3102. REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
  3103. // error!
  3104. char errbuf[1024];
  3105. regerror(err,&m_ucr,errbuf,1000);
  3106. log("coll: regcomp %s failed: %s. "
  3107. "Ignoring.",
  3108. rx,errbuf);
  3109. regfree ( &m_ucr );
  3110. m_hasucr = false;
  3111. }
  3112. rx = m_diffbotUrlProcessRegEx.getBufStart();
  3113. if ( rx && ! rx[0] ) rx = NULL;
  3114. if ( rx ) m_hasupr = true;
  3115. if ( rx ) {
  3116. tmp.reset();
  3117. tmp.safeStrcpy ( rx );
  3118. expandRegExShortcuts ( &tmp );
  3119. m_hasupr = true;
  3120. }
  3121. if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
  3122. REG_EXTENDED| // REG_ICASE|
  3123. REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
  3124. char errbuf[1024];
  3125. regerror(err,&m_upr,errbuf,1000);
  3126. // error!
  3127. log("coll: regcomp %s failed: %s. "
  3128. "Ignoring.",
  3129. rx,errbuf);
  3130. regfree ( &m_upr );
  3131. m_hasupr = false;
  3132. }
  3133. // what diffbot url to use for processing
  3134. char *api = m_diffbotApiUrl.getBufStart();
  3135. if ( api && ! api[0] ) api = NULL;
  3136. // convert from seconds to milliseconds. default is 250ms?
  3137. int32_t wait = (int32_t)(m_collectiveCrawlDelay * 1000.0);
  3138. // default to 250ms i guess. -1 means unset i think.
  3139. if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
  3140. bool isEthan = false;
  3141. if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
  3142. // it looks like we are assuming all crawls are repeating so that
  3143. // &rountStart=<currenttime> or &roundStart=0 which is the same
  3144. // thing, will trigger a re-crawl. so if collectiveRespiderFreq
  3145. // is 0 assume it is like 999999.0 days. so that stuff works.
  3146. // also i had to make the "default" rule below always have a respider
  3147. // freq of 0.0 so it will respider right away if we make it past the
  3148. // "lastspidertime>={roundstart}" rule which we will if they
  3149. // set the roundstart time to the current time using &roundstart=0
  3150. float respiderFreq = m_collectiveRespiderFrequency;
  3151. if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
  3152. // lower from 7 to 1 since we have so many collections now
  3153. // ok, now we have far fewer colls so raise it back to 7
  3154. int32_t diffbotipms = 7;//1; // 7
  3155. // make the gigablast regex table just "default" so it does no
  3156. // filtering, but accepts all urls. we will add code to pass the urls
  3157. // through m_diffbotUrlCrawlPattern alternatively. if that itself
  3158. // is empty, we will just restrict to the seed urls subdomain.
  3159. for ( int32_t i = 0 ; i < MAX_FILTERS ; i++ ) {
  3160. m_regExs[i].purge();
  3161. m_spiderPriorities[i] = 0;
  3162. m_maxSpidersPerRule [i] = 100;
  3163. // when someone has a bulk job of thousands of different
  3164. // domains it slows diffbot back-end down, so change this
  3165. // down from 100 if doing a bulk job
  3166. if ( m_isCustomCrawl == 2 )
  3167. m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster
  3168. m_spiderIpWaits [i] = wait;
  3169. m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
  3170. // ethan wants some speed
  3171. // if ( isEthan )
  3172. // m_spiderIpMaxSpiders[i] = 30;
  3173. //m_spidersEnabled [i] = 1;
  3174. m_spiderFreqs [i] = respiderFreq;
  3175. //m_spiderDiffbotApiUrl[i].purge();
  3176. m_harvestLinks[i] = true;
  3177. m_forceDelete [i] = false;
  3178. }
  3179. int32_t i = 0;
  3180. // 1st one! for query reindex/ query delete
  3181. m_regExs[i].set("isreindex");
  3182. m_spiderIpMaxSpiders [i] = 10;
  3183. m_spiderPriorities [i] = 70;
  3184. i++;
  3185. // 2nd rule: delete media urls that were not manually added
  3186. m_regExs[i].set("ismedia && !ismanualadd");
  3187. m_maxSpidersPerRule [i] = 0;
  3188. m_spiderPriorities [i] = 100; // delete!
  3189. m_forceDelete [i] = 1;
  3190. i++;
  3191. // de-prioritize fakefirstip urls so we don't give the impression our
  3192. // spiders are slow. like if someone adds a bulk job with 100,000 urls
  3193. // then we sit there processing them to look up their ips and add a real
  3194. // spider request (if it falls onto the same shard) before we actually
  3195. // do any real spidering. so keep the priority here low.
  3196. m_regExs[i].set("isfakeip");
  3197. m_maxSpidersPerRule [i] = 7;
  3198. m_spiderIpMaxSpiders [i] = 7;
  3199. m_spiderPriorities [i] = 20;
  3200. m_spiderIpWaits [i] = 0;
  3201. i++;
  3202. // hopcount filter if asked for
  3203. if( m_diffbotMaxHops >= 0 ) {
  3204. // transform long to string
  3205. char numstr[21]; // enough to hold all numbers up to 64-bits
  3206. sprintf(numstr, "%"INT32"", (int32_t)m_diffbotMaxHops);
  3207. // form regEx like: hopcount>3
  3208. char hopcountStr[30];
  3209. strcpy(hopcountStr, "hopcount>");
  3210. strcat(hopcountStr, numstr);
  3211. m_regExs[i].set(hopcountStr);
  3212. // means DELETE :
  3213. m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
  3214. // just don't spider
  3215. m_maxSpidersPerRule[i] = 0;
  3216. // compatibility with m_spiderRoundStartTime:
  3217. m_spiderFreqs[i] = 0.0;
  3218. i++;
  3219. }
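// e.g. with m_diffbotMaxHops == 2 the rule built above is "hopcount>2"
// with priority 0, zero max spiders and a 0.0 respider frequency, so
// anything more than two hops from a seed is simply never downloaded.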
  3220. // 2nd default filter
  3221. // always turn this on for now. they need to add domains they want
  3222. // to crawl as seeds so they do not spider the web.
  3223. // no because FTB seeds with link pages that link to another
  3224. // domain. they just need to be sure to supply a crawl pattern
  3225. // to avoid spidering the whole web.
  3226. //
  3227. // if they did not EXPLICITLY provide a url crawl pattern or
  3228. // url crawl regex then restrict to seeds to prevent from spidering
  3229. // the entire internet.
  3230. //if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
  3231. // MDW: even if they supplied a crawl pattern let's restrict to seed
  3232. // domains 12/15/14
  3233. m_regExs[i].set("!isonsamedomain && !ismanualadd");
  3234. m_maxSpidersPerRule [i] = 0;
  3235. m_spiderPriorities [i] = 100; // delete!
  3236. m_forceDelete [i] = 1;
  3237. i++;
  3238. //}
  3239. bool ucpHasPositive = false;
  3240. // . scan them to see if all patterns start with '!' or not
  3241. // . if pattern starts with ! it is negative, otherwise positive
  3242. if ( ucp ) ucpHasPositive = hasPositivePattern ( ucp );
  3243. // if no crawl regex, and it has a crawl pattern consisting of
  3244. // only negative patterns then restrict to domains of seeds
  3245. if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
  3246. m_regExs[i].set("!isonsamedomain && !ismanualadd");
  3247. m_maxSpidersPerRule [i] = 0;
  3248. m_spiderPriorities [i] = 100; // delete!
  3249. m_forceDelete [i] = 1;
  3250. i++;
  3251. }
  3252. // don't bother re-spidering old pages if hopcount == maxhopcount
  3253. // and 'only process new urls' is true, because we don't need to
  3254. // harvest outlinks from them.
  3255. if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
  3256. // only crawls, not bulk jobs
  3257. m_isCustomCrawl == 1 ) {
  3258. m_regExs[i].purge();
  3259. m_regExs[i].safePrintf("isindexed && hopcount==%"INT32,
  3260. m_diffbotMaxHops );
  3261. m_spiderPriorities [i] = 14;
  3262. m_spiderFreqs [i] = 0.0;
  3263. m_maxSpidersPerRule [i] = 0; // turn off spiders
  3264. m_harvestLinks [i] = false;
  3265. i++;
  3266. }
  3267. // 3rd rule for respidering
  3268. // put this above the errorcount>= rules below otherwise the crawl
  3269. // may never advance its round because it keeps retrying a ton of
  3270. // errored urls.
  3271. if ( respiderFreq > 0.0 ) {
  3272. m_regExs[i].set("lastspidertime>={roundstart}");
  3273. // do not "remove" from index
  3274. m_spiderPriorities [i] = 10;
  3275. // just turn off spidering. if we were to set priority to
  3276. // filtered it would be removed from index!
  3277. //m_spidersEnabled [i] = 0;
  3278. m_maxSpidersPerRule[i] = 0;
  3279. // temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
  3280. // which has been obsoleted, but we are running old code now!
  3281. //m_spiderDiffbotApiUrl[i].set ( api );
  3282. i++;
  3283. }
  3284. // if doing a one-shot crawl limit error retries to 3 times or
  3285. // if no urls currently available to spider, whichever comes first.
  3286. else {
  3287. m_regExs[i].set("errorcount>=3");
  3288. m_spiderPriorities [i] = 11;
  3289. m_spiderFreqs [i] = 0.0416;
  3290. m_maxSpidersPerRule [i] = 0; // turn off spiders
  3291. i++;
  3292. }
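// NOTE: respiderFreq is clamped to 3652.5 above whenever
// m_collectiveRespiderFrequency <= 0.0, so it is always positive here and
// this else branch (and the "hasreply" block further down that also tests
// respiderFreq <= 0.0) appears to be unreachable as written.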
  3293. // diffbot needs to retry even on 500 or 404 errors since sometimes
  3294. // a seed url gets a 500 error mistakenly and it halts the crawl.
  3295. // so take out "!hastmperror".
  3296. m_regExs[i].set("errorcount>=1 && !hastmperror");
  3297. m_spiderPriorities [i] = 14;
  3298. m_spiderFreqs [i] = 0.0416; // every hour
  3299. //m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
  3300. i++;
  3301. // for docs with one temp error, retry again quickly
  3302. m_regExs[i].set("errorcount==1 && hastmperror");
  3303. m_spiderPriorities [i] = 40;
  3304. m_spiderFreqs [i] = 0.001; // 86 seconds
  3305. i++;
  3306. // for docs with two temp errors, back off a little more
  3307. m_regExs[i].set("errorcount==2 && hastmperror");
  3308. m_spiderPriorities [i] = 40;
  3309. m_spiderFreqs [i] = 0.003; // 3*86 seconds (was 24 hrs)
  3310. i++;
  3311. // excessive errors? (tcp/dns timed out, etc.) back off to every 1/4 day
  3312. m_regExs[i].set("errorcount>=3 && hastmperror");
  3313. m_spiderPriorities [i] = 39;
  3314. m_spiderFreqs [i] = .25; // 1/4 day
  3315. // if bulk job, do not download a url more than 3 times
  3316. if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
  3317. i++;
  3318. // if collectiverespiderfreq is 0 or less then do not RE-spider
  3319. // documents already indexed.
  3320. if ( respiderFreq <= 0.0 ) { // else {
  3321. // this does NOT work! error docs continuously respider
  3322. // because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
  3323. //m_regExs[i].set("isindexed");
  3324. m_regExs[i].set("hasreply");
  3325. m_spiderPriorities [i] = 10;
  3326. // just turn off spidering. if we were to set priority to
  3327. // filtered it would be removed from index!
  3328. //m_spidersEnabled [i] = 0;
  3329. m_maxSpidersPerRule[i] = 0;
  3330. // temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
  3331. // which has been obsoleted, but we are running old code now!
  3332. //m_spiderDiffbotApiUrl[i].set ( api );
  3333. i++;
  3334. }
  3335. // url crawl and PAGE process pattern
  3336. if ( ucp && ! upp && ppp ) {
  3337. // if just matches ucp, just crawl it, do not process
  3338. m_regExs[i].set("matchesucp");
  3339. m_spiderPriorities [i] = 53;
  3340. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3341. // let's always make this without delay because if we
  3342. // restart the round we want these to process right away
  3343. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3344. i++;
  3345. // crawl everything else, but don't harvest links,
  3346. // we have to see if the page content matches the "ppp"
  3347. // to determine whether the page should be processed or not.
  3348. m_regExs[i].set("default");
  3349. m_spiderPriorities [i] = 52;
  3350. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3351. // let's always make this without delay because if we
  3352. // restart the round we want these to process right away
  3353. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3354. m_harvestLinks [i] = false;
  3355. i++;
  3356. goto done;
  3357. }
  3358. // url crawl and process pattern
  3359. if ( ucp && upp ) {
  3360. m_regExs[i].set("matchesucp && matchesupp");
  3361. m_spiderPriorities [i] = 55;
  3362. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3363. // let's always make this without delay because if we
  3364. // restart the round we want these to process right away
  3365. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3366. //m_spiderDiffbotApiUrl[i].set ( api );
  3367. i++;
  3368. // if just matches ucp, just crawl it, do not process
  3369. m_regExs[i].set("matchesucp");
  3370. m_spiderPriorities [i] = 53;
  3371. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3372. // let's always make this without delay because if we
  3373. // restart the round we want these to process right away
  3374. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3375. i++;
  3376. // just process, do not spider links if does not match ucp
  3377. m_regExs[i].set("matchesupp");
  3378. m_spiderPriorities [i] = 54;
  3379. m_harvestLinks [i] = false;
  3380. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3381. // let's always make this without delay because if we
  3382. // restart the round we want these to process right away
  3383. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3384. //m_spiderDiffbotApiUrl[i].set ( api );
  3385. i++;
  3386. // do not crawl anything else
  3387. m_regExs[i].set("default");
  3388. m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
  3389. // don't spider
  3390. m_maxSpidersPerRule[i] = 0;
  3391. // this needs to be zero so &spiderRoundStart=0
  3392. // functionality which sets m_spiderRoundStartTime
  3393. // to the current time works
  3394. // otherwise Spider.cpp's getSpiderTimeMS() returns a time
  3395. // in the future and we can't force the round
  3396. m_spiderFreqs[i] = 0.0;
  3397. i++;
  3398. }
  3399. // harvest links if we should crawl it
  3400. if ( ucp && ! upp ) {
  3401. m_regExs[i].set("matchesucp");
  3402. m_spiderPriorities [i] = 53;
  3403. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3404. // let's always make this without delay because if we
  3405. // restart the round we want these to process right away.
  3406. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3407. // process everything since upp is empty
  3408. //m_spiderDiffbotApiUrl[i].set ( api );
  3409. i++;
  3410. // do not crawl anything else
  3411. m_regExs[i].set("default");
  3412. m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
  3413. // don't delete, just don't spider
  3414. m_maxSpidersPerRule[i] = 0;
  3415. // this needs to be zero so &spiderRoundStart=0
  3416. // functionality which sets m_spiderRoundStartTime
  3417. // to the current time works
  3418. // otherwise Spider.cpp's getSpiderTimeMS() returns a time
  3419. // in the future and we can't force the round
  3420. m_spiderFreqs[i] = 0.0;
  3421. i++;
  3422. }
  3423. // just process
  3424. if ( upp && ! ucp ) {
  3425. m_regExs[i].set("matchesupp");
  3426. m_spiderPriorities [i] = 54;
  3427. if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
  3428. // let's always make this without delay because if we
  3429. // restart the round we want these to process right away
  3430. if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
  3431. //m_harvestLinks [i] = false;
  3432. //m_spiderDiffbotApiUrl[i].set ( api );
  3433. i++;
  3434. // crawl everything by default, no processing
  3435. m_regExs[i].set("default");
  3436. m_spiderPriorities [i] = 50;
  3437. // this needs to be zero so &spiderRoundStart=0
  3438. // functionality which sets m_spiderRoundStartTime
  3439. // to the current time works
  3440. // otherwise Spider.cpp's getSpiderTimeMS() returns a time
  3441. // in the future and we can't force the round
  3442. m_spiderFreqs[i] = 0.0;
  3443. i++;
  3444. }
  3445. // no restraints
  3446. if ( ! upp && ! ucp ) {
  3447. // crawl everything by default, no processing
  3448. m_regExs[i].set("default");
  3449. m_spiderPriorities [i] = 50;
  3450. // this needs to be zero so &spiderRoundStart=0
  3451. // functionality which sets m_spiderRoundStartTime
  3452. // to the current time works
  3453. // otherwise Spider.cpp's getSpiderTimeMS() returns a time
  3454. // in the future and we can't force the round
  3455. m_spiderFreqs[i] = 0.0;
  3456. //m_spiderDiffbotApiUrl[i].set ( api );
  3457. i++;
  3458. }
  3459. done:
  3460. m_numRegExs = i;
  3461. m_numRegExs2 = i;
  3462. m_numRegExs3 = i;
  3463. m_numRegExs10 = i;
  3464. m_numRegExs5 = i;
  3465. m_numRegExs6 = i;
  3466. //m_numRegExs7 = i;
  3467. m_numRegExs8 = i;
  3468. m_numRegExs7 = i;
  3469. //m_numRegExs11 = i;
  3470. //char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
  3471. //if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }
  3472. return true;
  3473. }
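// For example, a plain crawl with no url crawl/process patterns or
// regexes (ucp, upp, ppp all NULL), m_diffbotMaxHops < 0 and
// m_diffbotOnlyProcessIfNewUrl off ends up with roughly this rule order:
// "isreindex", "ismedia && !ismanualadd" (delete), "isfakeip",
// "!isonsamedomain && !ismanualadd" (delete),
// "lastspidertime>={roundstart}", the errorcount rules, and finally
// "default" at priority 50.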
  3474. // . anytime the url filters are updated, this function is called
  3475. // . it is also called on load of the collection at startup
  3476. bool CollectionRec::rebuildUrlFilters ( ) {
  3477. if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
  3478. log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
  3479. m_urlFiltersProfile.getBufStart());
  3480. // if not a custom crawl, and no expressions, add a default one
  3481. //if ( m_numRegExs == 0 && ! m_isCustomCrawl ) {
  3482. // setUrlFiltersToDefaults();
  3483. //}
  3484. // if not a custom crawl then set the url filters based on
  3485. // the url filter profile, if any
  3486. if ( ! m_isCustomCrawl )
  3487. rebuildUrlFilters2();
  3488. // set this so we know whether we have to keep track of page counts
  3489. // per subdomain/site and per domain. if the url filters have
  3490. // 'sitepages' 'domainpages' 'domainadds' or 'siteadds' we have to keep
  3491. // the count table SpiderColl::m_pageCountTable.
  3492. m_urlFiltersHavePageCounts = false;
  3493. for ( int32_t i = 0 ; i < m_numRegExs ; i++ ) {
  3494. // get the ith rule
  3495. SafeBuf *sb = &m_regExs[i];
  3496. char *p = sb->getBufStart();
  3497. if ( strstr(p,"sitepages") ||
  3498. strstr(p,"domainpages") ||
  3499. strstr(p,"siteadds") ||
  3500. strstr(p,"domainadds") ) {
  3501. m_urlFiltersHavePageCounts = true;
  3502. break;
  3503. }
  3504. }
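// e.g. a (hypothetical) rule expression like "siteadds>=200 && hopcount>=1"
// contains "siteadds", so m_urlFiltersHavePageCounts becomes true and the
// SpiderColl::m_pageCountTable mentioned above gets maintained for this
// collection.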
  3505. // if collection is brand new being called from addNewColl()
  3506. // then sc will be NULL
  3507. SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(m_collnum);
  3508. // . do not do this at startup
  3509. // . this essentially resets doledb
  3510. if ( g_doledb.m_rdb.m_initialized &&
  3511. // somehow this is initialized before we set m_recs[m_collnum]
  3512. // so we gotta do the two checks below...
  3513. sc &&
  3514. // must be a valid coll
  3515. m_collnum < g_collectiondb.m_numRecs &&
  3516. g_collectiondb.m_recs[m_collnum] ) {
  3517. log("coll: resetting doledb for %s (%li)",m_coll,
  3518. (long)m_collnum);
  3519. // clear doledb recs from tree
  3520. //g_doledb.getRdb()->deleteAllRecs ( m_collnum );
  3521. nukeDoledb ( m_collnum );
  3522. // add it back
  3523. //if ( ! g_doledb.getRdb()->addRdbBase2 ( m_collnum ) )
  3524. // log("coll: error re-adding doledb for %s",m_coll);
  3525. // just start this over...
  3526. // . MDW left off here
  3527. //tryToDelete ( sc );
  3528. // maybe this is good enough
  3529. //if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
  3530. //CollectionRec *cr = sc->m_cr;
  3531. // . rebuild sitetable? in PageBasic.cpp.
  3532. // . re-adds seed spider requests using msg4
  3533. // . true = addSeeds
  3534. // . no, don't do this now because we call updateSiteList()
  3535. // when we have &sitelist=xxxx in the request which will
  3536. // handle updating those tables
  3537. //updateSiteListTables ( m_collnum ,
  3538. // true ,
  3539. // cr->m_siteListBuf.getBufStart() );
  3540. }
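// (presumably because doledb holds urls that were doled out under the old
// priorities; nuking it forces them to be re-ranked against the new
// filters)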
  3541. // If the crawl is not generated by crawlbot, then we will just update
  3542. // the regexes concerning the urls to process
  3543. rebuildDiffbotRegexes();
  3544. if ( ! m_isCustomCrawl ){
  3545. return true;
  3546. }
  3547. // on the other hand, if it is a crawlbot job, then by convention the url filters are all set
  3548. // to some default ones.
  3549. return rebuildUrlFiltersDiffbot();
  3550. }
  3551. // for some reason the libc we use doesn't support these shortcuts,
  3552. // so expand them to something it does support
  3553. bool expandRegExShortcuts ( SafeBuf *sb ) {
  3554. if ( ! sb->safeReplace3 ( "\\d" , "[0-9]" ) ) return false;
  3555. if ( ! sb->safeReplace3 ( "\\D" , "[^0-9]" ) ) return false;
  3556. if ( ! sb->safeReplace3 ( "\\l" , "[a-z]" ) ) return false;
  3557. if ( ! sb->safeReplace3 ( "\\a" , "[A-Za-z]" ) ) return false;
  3558. if ( ! sb->safeReplace3 ( "\\u" , "[A-Z]" ) ) return false;
  3559. if ( ! sb->safeReplace3 ( "\\w" , "[A-Za-z0-9_]" ) ) return false;
  3560. if ( ! sb->safeReplace3 ( "\\W" , "[^A-Za-z0-9_]" ) ) return false;
  3561. return true;
  3562. }
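// e.g. a user-supplied pattern like ".*/article\d+\.html" becomes
// ".*/article[0-9]+\.html" before it is handed to regcomp().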
  3563. void testRegex ( ) {
  3564. //
  3565. // TEST
  3566. //
  3567. char *rx;
  3568. rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=\\d";
  3569. rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";
  3570. rx = ".*?article[0-9]*?.html";
  3571. regex_t ucr;
  3572. int32_t err;
  3573. if ( ( err = regcomp ( &ucr , rx ,
  3574. REG_ICASE
  3575. |REG_EXTENDED
  3576. //|REG_NEWLINE
  3577. //|REG_NOSUB
  3578. ) ) ) {
  3579. // error!
  3580. char errbuf[1024];
  3581. regerror(err,&ucr,errbuf,1000);
  3582. log("xmldoc: regcomp %s failed: %s. "
  3583. "Ignoring.",
  3584. rx,errbuf);
  3585. }
  3586. logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
  3587. //char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
  3588. char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";
  3589. if ( regexec(&ucr,url,0,NULL,0) )
  3590. logf(LOG_DEBUG,"db: failed to match %s on %s",
  3591. url,rx);
  3592. else
  3593. logf(LOG_DEBUG,"db: MATCHED %s on %s",
  3594. url,rx);
  3595. exit(0);
  3596. }
  3597. int64_t CollectionRec::getNumDocsIndexed() {
  3598. RdbBase *base = getBase(RDB_TITLEDB);//m_bases[RDB_TITLEDB];
  3599. if ( ! base ) return 0LL;
  3600. return base->getNumGlobalRecs();
  3601. }
  3602. // messes with m_spiderColl->m_sendLocalCrawlInfoToHost[MAX_HOSTS]
  3603. // so we do not have to keep sending this huge msg!
  3604. bool CollectionRec::shouldSendLocalCrawlInfoToHost ( int32_t hostId ) {
  3605. if ( ! m_spiderColl ) return false;
  3606. if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
  3607. if ( hostId >= g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
  3608. // sanity
  3609. return m_spiderColl->m_sendLocalCrawlInfoToHost[hostId];
  3610. }
  3611. void CollectionRec::localCrawlInfoUpdate() {
  3612. if ( ! m_spiderColl ) return;
  3613. // turn on all the flags
  3614. memset(m_spiderColl->m_sendLocalCrawlInfoToHost,1,g_hostdb.m_numHosts);
  3615. }
  3616. // right after we copy it for sending we set this so we do not send
  3617. // again unless localCrawlInfoUpdate() is called
  3618. void CollectionRec::sentLocalCrawlInfoToHost ( int32_t hostId ) {
  3619. if ( ! m_spiderColl ) return;
  3620. m_spiderColl->m_sendLocalCrawlInfoToHost[hostId] = 0;
  3621. }
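// Typical flow (sketch): when local crawl stats change, localCrawlInfoUpdate()
// marks every host dirty; the sender checks shouldSendLocalCrawlInfoToHost(h)
// before including m_localCrawlInfo in a message to host h, and calls
// sentLocalCrawlInfoToHost(h) right after copying it out so unchanged stats
// are not resent.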