/Collectiondb.cpp
Possible License(s): Apache-2.0
- #include "gb-include.h"
- #include "Collectiondb.h"
- //#include "CollectionRec.h"
- #include "Xml.h"
- #include "Url.h"
- #include "Loop.h"
- #include "Spider.h" // for calling SpiderLoop::collectionsUpdated()
- #include "Posdb.h"
- //#include "Indexdb.h"
- #include "Datedb.h"
- #include "Titledb.h"
- //#include "Revdb.h"
- //#include "Sections.h"
- #include "Placedb.h"
- #include "Tagdb.h"
- #include "Catdb.h"
- #include "Tfndb.h"
- #include "Spider.h"
- //#include "Checksumdb.h"
- #include "Clusterdb.h"
- #include "Spider.h"
- #include "Repair.h"
- #include "Users.h"
- #include "Parms.h"
- void testRegex ( ) ;
- HashTableX g_collTable;
- // a global class extern'd in .h file
- Collectiondb g_collectiondb;
- Collectiondb::Collectiondb ( ) {
- m_wrapped = 0;
- m_numRecs = 0;
- m_numRecsUsed = 0;
- m_numCollsSwappedOut = 0;
- m_initializing = false;
- //m_lastUpdateTime = 0LL;
- m_needsSave = false;
- // sanity: if RDB_END2 is big enough we are done
- if ( RDB_END2 >= RDB_END ) return;
- log("db: increase RDB_END2 to at least %"INT32" in "
- "Collectiondb.h",(int32_t)RDB_END);
- char *xx=NULL;*xx=0; // intentional crash, gb's assert-style abort
- }
- // reset rdb
- void Collectiondb::reset() {
- log(LOG_INFO,"db: resetting collectiondb.");
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
- delete ( m_recs[i] );
- m_recs[i] = NULL;
- }
- m_numRecs = 0;
- m_numRecsUsed = 0;
- g_collTable.reset();
- }
- /*
- bool Collectiondb::init ( bool isDump ) {
- reset();
- if ( g_isYippy ) return true;
- // reset # of recs
- //m_numRecs = 0;
- //m_numRecsUsed = 0;
- // . now load ALL recs
- // . returns false and sets g_errno on error
- if ( ! load ( isDump ) ) return false;
- // update time
- updateTime();
- // so we don't save again
- m_needsSave = false;
- // sanity
- if ( RDB_END2 < RDB_END ) {
- log("db: increase RDB_END2 to at least %"INT32" in "
- "Collectiondb.h",(int32_t)RDB_END);
- char *xx=NULL;*xx=0;
- }
- // if it set g_errno, return false
- //if ( g_errno ) return log("admin: Had init error: %s.",
- // mstrerror(g_errno));
- g_errno = 0;
- // otherwise, true, even if reloadList() blocked
- return true;
- }
- */
- extern bool g_inAutoSave;
- // . save to disk
- // . returns false if blocked, true otherwise
- bool Collectiondb::save ( ) {
- if ( g_conf.m_readOnlyMode ) return true;
- if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
- return true;
- // which collection rec needs a save
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- // temp debug message
- //logf(LOG_DEBUG,"admin: SAVING collection #%"INT32" ANYWAY",i);
- if ( ! m_recs[i]->m_needsSave ) continue;
- // if we core in malloc we won't be able to save the
- // coll.conf files
- if ( m_recs[i]->m_isCustomCrawl &&
- g_inMemFunction &&
- g_hostdb.m_hostId != 0 )
- continue;
- //log(LOG_INFO,"admin: Saving collection #%"INT32".",i);
- m_recs[i]->save ( );
- }
- // oh well
- return true;
- }
- ///////////
- //
- // fill up our m_recs[] array based on the coll.*.*/coll.conf files
- //
- ///////////
- bool Collectiondb::loadAllCollRecs ( ) {
- m_initializing = true;
- char dname[1024];
- // MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
- sprintf ( dname , "%s" , g_hostdb.m_dir );
- Dir d;
- d.set ( dname );
- if ( ! d.open ()) return log("admin: Could not load collection config "
- "files.");
- int32_t count = 0;
- char *f;
- while ( ( f = d.getNextFilename ( "*" ) ) ) {
- // skip if first char not "coll."
- if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
- // must end on a digit (i.e. coll.main.0)
- if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
- // count them
- count++;
- }
- // reset directory for another scan
- d.set ( dname );
- if ( ! d.open ()) return log("admin: Could not load collection config "
- "files.");
- // note it
- //log(LOG_INFO,"db: loading collection config files.");
- // . scan through all subdirs in the collections dir
- // . they should be like, "coll.main/" and "coll.mycollection/"
- while ( ( f = d.getNextFilename ( "*" ) ) ) {
- // skip if first char not "coll."
- if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
- // must end on a digit (i.e. coll.main.0)
- if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
- // point to collection
- char *coll = f + 5;
- // NULL terminate at .
- char *pp = strchr ( coll , '.' );
- if ( ! pp ) continue;
- *pp = '\0';
- // get collnum
- collnum_t collnum = atol ( pp + 1 );
- // add it
- if ( ! addExistingColl ( coll , collnum ) )
- return false;
- // swap it out if we got 100+ collections
- // if ( count < 100 ) continue;
- // CollectionRec *cr = getRec ( collnum );
- // if ( cr ) cr->swapOut();
- }
- // if no existing recs added... add coll.main.0 always at startup
- if ( m_numRecs == 0 ) {
- log("admin: adding main collection.");
- addNewColl ( "main",
- 0 , // customCrawl ,
- NULL,
- 0 ,
- true , // bool saveIt ,
- // Parms.cpp reserves this so it can be sure
- // to add the same collnum to every shard
- 0 );
- }
- m_initializing = false;
- // note it
- //log(LOG_INFO,"db: Loaded data for %"INT32" collections. Ranging from "
- // "collection #0 to #%"INT32".",m_numRecsUsed,m_numRecs-1);
- // update the time
- //updateTime();
- // don't clean the tree if just dumpin
- //if ( isDump ) return true;
- return true;
- }
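- // A minimal sketch of the directory naming convention parsed above
- // (hypothetical values): "coll.main.0" yields coll="main" and
- // collnum=0; "coll.mysite.3" yields coll="mysite" and collnum=3.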
- // after we've initialized all rdbs in main.cpp call this to clean out
- // our rdb trees
- bool Collectiondb::cleanTrees ( ) {
- // remove any nodes with illegal collnums
- Rdb *r;
- //r = g_indexdb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- r = g_posdb.getRdb();
- //r->m_tree.cleanTree ();//(char **)r->m_bases);
- r->m_buckets.cleanBuckets();
- //r = g_datedb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- r = g_titledb.getRdb();
- r->m_tree.cleanTree ();//(char **)r->m_bases);
- //r = g_revdb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- //r = g_sectiondb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- //r = g_checksumdb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- //r = g_tfndb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- r = g_spiderdb.getRdb();
- r->m_tree.cleanTree ();//(char **)r->m_bases);
- r = g_doledb.getRdb();
- r->m_tree.cleanTree ();//(char **)r->m_bases);
- // success
- return true;
- }
- /*
- void Collectiondb::updateTime() {
- // get time now in milliseconds
- int64_t newTime = gettimeofdayInMilliseconds();
- // change it
- if ( m_lastUpdateTime == newTime ) newTime++;
- // update it
- m_lastUpdateTime = newTime;
- // we need a save
- m_needsSave = true;
- }
- */
- #include "Statsdb.h"
- #include "Cachedb.h"
- #include "Syncdb.h"
- // same as addOldColl()
- bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
- int32_t i = collnum;
- // ensure does not already exist in memory
- collnum_t oldCollnum = getCollnum(coll);
- if ( oldCollnum >= 0 ) {
- g_errno = EEXIST;
- log("admin: Trying to create collection \"%s\" but "
- "already exists in memory. Do an ls on "
- "the working dir to see if there are two "
- "collection dirs with the same coll name",coll);
- char *xx=NULL;*xx=0;
- }
- // also try by #, i've seen this happen too
- CollectionRec *ocr = getRec ( i );
- if ( ocr ) {
- g_errno = EEXIST;
- log("admin: Collection id %i is in use already by "
- "%s, so we can not add %s. moving %s to trash."
- ,(int)i,ocr->m_coll,coll,coll);
- SafeBuf cmd;
- int64_t now = gettimeofdayInMilliseconds();
- cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%"UINT64
- , coll
- ,(int)i
- , coll
- ,(int)i
- , now );
- //log("admin: %s",cmd.getBufStart());
- gbsystem ( cmd.getBufStart() );
- return true;
- }
- // create the record in memory
- CollectionRec *cr = new (CollectionRec);
- if ( ! cr )
- return log("admin: Failed to allocated %"INT32" bytes for new "
- "collection record for \"%s\".",
- (int32_t)sizeof(CollectionRec),coll);
- mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
- // set collnum right for g_parms.setToDefault() call just in case
- // because before it was calling CollectionRec::reset() which
- // was resetting the RdbBases for the m_collnum which was garbage
- // and ended up resetting random collections' rdb. but now
- // CollectionRec::CollectionRec() sets m_collnum to -1 so we should
- // not need this!
- //cr->m_collnum = oldCollnum;
- // get the default.conf from working dir if there
- g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
- strcpy ( cr->m_coll , coll );
- cr->m_collLen = gbstrlen ( coll );
- cr->m_collnum = i;
- // point to this, so Rdb and RdbBase can reference it
- coll = cr->m_coll;
- //log("admin: loaded old coll \"%s\"",coll);
- // load coll.conf file
- if ( ! cr->load ( coll , i ) ) {
- mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
- log("admin: Failed to load coll.%s.%"INT32"/coll.conf",coll,i);
- delete ( cr );
- if ( m_recs ) m_recs[i] = NULL;
- return false;
- }
- if ( ! registerCollRec ( cr , false ) ) return false;
- // always index spider status docs now for custom crawls
- if ( cr->m_isCustomCrawl )
- cr->m_indexSpiderReplies = true;
- // and don't do link voting, will help speed up
- if ( cr->m_isCustomCrawl ) {
- cr->m_getLinkInfo = false;
- cr->m_computeSiteNumInlinks = false;
- // limit each shard to 5 spiders per collection to prevent
- // ppl from spidering the web and hogging up resources
- cr->m_maxNumSpiders = 5;
- // diffbot download docs up to 50MB so we don't truncate
- // things like sitemap.xml. but keep regular html pages
- // 1MB
- cr->m_maxTextDocLen = 1024*1024;
- // xml, pdf, etc can be this. 50MB
- cr->m_maxOtherDocLen = 50000000;
- }
- // we need to compile the regular expressions or update the url
- // filters with new logic that maps crawlbot parms to url filters
- return cr->rebuildUrlFilters ( );
- }
- // . add a new rec
- // . returns false and sets g_errno on error
- // . was addRec()
- // . "isDump" is true if we don't need to initialize all the rdbs etc
- // because we are doing a './gb dump ...' cmd to dump out data from
- // one Rdb which we will custom initialize in main.cpp where the dump
- // code is. like for instance, posdb.
- // . "customCrawl" is 0 for a regular collection, 1 for a simple crawl
- // 2 for a bulk job. diffbot terminology.
- bool Collectiondb::addNewColl ( char *coll ,
- char customCrawl ,
- char *cpc ,
- int32_t cpclen ,
- bool saveIt ,
- // Parms.cpp reserves this so it can be sure
- // to add the same collnum to every shard
- collnum_t newCollnum ) {
- //do not send add/del coll request until we are in sync with shard!!
- // just return ETRYAGAIN for the parmlist...
- // ensure coll name is legit
- char *p = coll;
- for ( ; *p ; p++ ) {
- if ( is_alnum_a(*p) ) continue;
- if ( *p == '-' ) continue;
- if ( *p == '_' ) continue; // underscore now allowed
- break;
- }
- if ( *p ) {
- g_errno = EBADENGINEER;
- log("admin: \"%s\" is a malformed collection name because it "
- "contains the '%c' character.",coll,*p);
- return false;
- }
- // . scan for holes
- // . i is also known as the collection id
- //int32_t i = (int32_t)newCollnum;
- // no longer fill empty slots because if they do a reset and then
- // add a new rec right away it will be filled with msg4 recs not
- // destined for it. Later we will have to recycle somehow!!
- //else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
- // right now we #define collnum_t int16_t. so do not breach that!
- //if ( m_numRecs < 0x7fff ) {
- // // set it
- // i = m_numRecs;
- // // claim it
- // // we don't do it here, because we check i below and
- // // increment m_numRecs below.
- // //m_numRecs++;
- //}
- // TODO: scan for holes here...
- //else {
- if ( newCollnum < 0 ) { char *xx=NULL;*xx=0; }
- // ceiling?
- //int64_t maxColls = 1LL<<(sizeof(collnum_t)*8);
- //if ( i >= maxColls ) {
- // g_errno = ENOBUFS;
- // return log("admin: Limit of %"INT64" collection reached. "
- // "Collection not created.",maxColls);
- //}
- // if empty... bail, no longer accepted, use "main"
- if ( ! coll || !coll[0] ) {
- g_errno = EBADENGINEER;
- return log("admin: Trying to create a new collection "
- "but no collection name provided. Use the \"c\" "
- "cgi parameter to specify it.");
- }
- // or if too big
- if ( gbstrlen(coll) > MAX_COLL_LEN ) {
- g_errno = ENOBUFS;
- return log("admin: Trying to create a new collection "
- "whose name \"%s\" of %i chars is longer than the "
- "max of %"INT32" chars.",coll,gbstrlen(coll),
- (int32_t)MAX_COLL_LEN);
- }
-
- // ensure does not already exist in memory
- if ( getCollnum ( coll ) >= 0 ) {
- g_errno = EEXIST;
- log("admin: Trying to create collection \"%s\" but "
- "already exists in memory.",coll);
- // just let it pass...
- g_errno = 0 ;
- return true;
- }
- // MDW: ensure not created on disk since time of last load
- char dname[512];
- sprintf(dname, "%scoll.%s.%"INT32"/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
- DIR *dir = opendir ( dname );
- if ( dir ) closedir ( dir );
- if ( dir ) {
- g_errno = EEXIST;
- return log("admin: Trying to create collection %s but "
- "directory %s already exists on disk.",coll,dname);
- }
- // create the record in memory
- CollectionRec *cr = new (CollectionRec);
- if ( ! cr )
- return log("admin: Failed to allocated %"INT32" bytes for new "
- "collection record for \"%s\".",
- (int32_t)sizeof(CollectionRec),coll);
- // register the mem
- mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
- // get copy collection
- //CollectionRec *cpcrec = NULL;
- //if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
- //if ( cpc && cpc[0] && ! cpcrec )
- // log("admin: Collection \"%s\" to copy config from does not "
- // "exist.",cpc);
- // set collnum right for g_parms.setToDefault() call
- //cr->m_collnum = newCollnum;
- // . get the default.conf from working dir if there
- // . i think this calls CollectionRec::reset() which resets all of its
- // rdbbase classes for its collnum so m_collnum needs to be right
- //g_parms.setToDefault( (char *)cr );
- // get the default.conf from working dir if there
- //g_parms.setToDefault( (char *)cr , OBJ_COLL );
- g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
- // restore the old same-language weight so it doesn't mess up
- // expected results in the qatest123 QA collection
- if ( strcmp(coll,"qatest123") == 0 )
- cr->m_sameLangWeight = 20.0;
- /*
- // the default conf file
- char tmp1[1024];
- sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );
- // . set our parms from the file.
- // . accepts OBJ_COLLECTIONREC or OBJ_CONF
- g_parms.setFromFile ( cr , NULL , tmp1 );
- */
- // this will override all
- // if ( cpcrec ) {
- // // copy it, but not the timedb hashtable, etc.
- // int32_t size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
- // // JAB: bad gbmemcpy - no donut!
- // // this is not how objects are supposed to be copied!!!
- // gbmemcpy ( cr , cpcrec , size);
- // }
- // set coll id and coll name for coll id #i
- strcpy ( cr->m_coll , coll );
- cr->m_collLen = gbstrlen ( coll );
- cr->m_collnum = newCollnum;
- // point to this, so Rdb and RdbBase can reference it
- coll = cr->m_coll;
- //
- // BEGIN NEW CODE
- //
- //
- // get token and crawlname if customCrawl is 1 or 2
- //
- char *token = NULL;
- char *crawl = NULL;
- SafeBuf tmp;
- // . return true with g_errno set on error
- // . if we fail to set a parm right we should force ourselves
- // out of sync
- if ( customCrawl ) {
- if ( ! tmp.safeStrcpy ( coll ) ) return true;
- token = tmp.getBufStart();
- // diffbot coll name format is <token>-<crawlname>
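- // e.g. (hypothetical values) coll "0123abcd-mycrawl" splits into
- // token "0123abcd" and crawl name "mycrawl"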
- char *h = strchr ( tmp.getBufStart() , '-' );
- if ( ! h ) {
- log("crawlbot: bad custom collname");
- g_errno = EBADENGINEER;
- mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
- delete ( cr );
- return true;
- }
- *h = '\0';
- crawl = h + 1;
- if ( ! crawl[0] ) {
- log("crawlbot: bad custom crawl name");
- mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
- delete ( cr );
- g_errno = EBADENGINEER;
- return true;
- }
- // or if too big!
- if ( gbstrlen(crawl) > 30 ) {
- log("crawlbot: crawlbot crawl NAME is over 30 chars");
- mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
- delete ( cr );
- g_errno = EBADENGINEER;
- return true;
- }
- }
- //log("parms: added new collection \"%s\"", collName );
- cr->m_maxToCrawl = -1;
- cr->m_maxToProcess = -1;
- if ( customCrawl ) {
- // always index spider status docs now
- cr->m_indexSpiderReplies = true;
- // remember the token
- cr->m_diffbotToken.set ( token );
- cr->m_diffbotCrawlName.set ( crawl );
- // bring this back
- cr->m_diffbotApiUrl.set ( "" );
- cr->m_diffbotUrlCrawlPattern.set ( "" );
- cr->m_diffbotUrlProcessPattern.set ( "" );
- cr->m_diffbotPageProcessPattern.set ( "" );
- cr->m_diffbotUrlCrawlRegEx.set ( "" );
- cr->m_diffbotUrlProcessRegEx.set ( "" );
- cr->m_diffbotMaxHops = -1;
-
- cr->m_spiderStatus = SP_INITIALIZING;
- // do not spider more than this many urls total.
- // -1 means no max.
- cr->m_maxToCrawl = 100000;
- // do not process more than this. -1 means no max.
- cr->m_maxToProcess = 100000;
- // -1 means no max
- cr->m_maxCrawlRounds = -1;
- // diffbot download docs up to 10MB so we don't truncate
- // things like sitemap.xml
- cr->m_maxTextDocLen = 10000000;
- cr->m_maxOtherDocLen = 10000000;
- // john wants deduping on by default to avoid
- // processing similar pgs
- cr->m_dedupingEnabled = true;
- // show the ban links in the search results. the
- // collection name is cryptographic enough to show that
- cr->m_isCustomCrawl = customCrawl;
- cr->m_diffbotOnlyProcessIfNewUrl = true;
- // default respider to off
- cr->m_collectiveRespiderFrequency = 0.0;
- //cr->m_restrictDomain = true;
- // reset the crawl stats
- // always turn off gigabits so &s=1000 can do summary skipping
- cr->m_docsToScanForTopics = 0;
- // turn off link voting, etc. to speed up
- cr->m_getLinkInfo = false;
- cr->m_computeSiteNumInlinks = false;
- }
- // . use the no-core version here: getTimeGlobal() would core if a
- // host was dead and then, when it came back up, host #0's Parms.cpp
- // told it to add a new coll
- cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
- cr->m_diffbotCrawlEndTime = 0;
-
- // . just the basics on these for now
- // . if certain parms are changed then the url filters
- // must be rebuilt, as well as possibly the waiting tree!!!
- // . need to set m_urlFiltersHavePageCounts etc.
- cr->rebuildUrlFilters ( );
- cr->m_useRobotsTxt = true;
- // reset crawler stats.they should be loaded from crawlinfo.txt
- memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
- memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
- // note that
- log("colldb: initial revival for %s",cr->m_coll);
- // . assume we got some urls ready to spider
- // . Spider.cpp will wait SPIDER_DONE_TIME seconds, and if it has
- // spidered no urls in that time these will get set to 0 and it
- // will send out an email alert if m_sentCrawlDoneAlert is not true.
- cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
- cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
- // set some defaults. max spiders for all priorities in this
- // collection. NO, default is in Parms.cpp.
- //cr->m_maxNumSpiders = 10;
- //cr->m_needsSave = 1;
- // start the spiders!
- cr->m_spideringEnabled = true;
- // override this?
- saveIt = true;
- //
- // END NEW CODE
- //
- //log("admin: adding coll \"%s\" (new=%"INT32")",coll,(int32_t)isNew);
- // MDW: create the new directory
- retry22:
- if ( ::mkdir ( dname ,
- getDirCreationFlags() ) ) {
- // S_IRUSR | S_IWUSR | S_IXUSR |
- // S_IRGRP | S_IWGRP | S_IXGRP |
- // S_IROTH | S_IXOTH ) ) {
- // valgrind?
- if ( errno == EINTR ) goto retry22;
- g_errno = errno;
- mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
- delete ( cr );
- return log("admin: Creating directory %s had error: "
- "%s.", dname,mstrerror(g_errno));
- }
- // save it into this dir... might fail!
- if ( saveIt && ! cr->save() ) {
- mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
- delete ( cr );
- return log("admin: Failed to save file %s: %s",
- dname,mstrerror(g_errno));
- }
- if ( ! registerCollRec ( cr , true ) )
- return false;
- // add the rdbbases for this coll, CollectionRec::m_bases[]
- if ( ! addRdbBasesForCollRec ( cr ) )
- return false;
- return true;
- }
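- // A minimal usage sketch (hypothetical caller; the collnum normally
- // comes from Parms.cpp so every shard adds the same one):
- //
- // collnum_t cn = g_collectiondb.reserveCollNum();
- // if ( cn >= 0 &&
- // ! g_collectiondb.addNewColl ( "mycoll",0,NULL,0,true,cn ) )
- // log("admin: add failed: %s",mstrerror(g_errno));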
- void CollectionRec::setBasePtr ( char rdbId , class RdbBase *base ) {
- // if in the process of swapping in, this will be false...
- //if ( m_swappedOut ) { char *xx=NULL;*xx=0; }
- if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
- // Rdb::deleteColl() will call this even though we are swapped in
- // but it calls it with "base" set to NULL after it nukes the RdbBase
- // so check if base is null here.
- if ( base && m_bases[ (unsigned char)rdbId ]){ char *xx=NULL;*xx=0; }
- m_bases [ (unsigned char)rdbId ] = base;
- }
- RdbBase *CollectionRec::getBasePtr ( char rdbId ) {
- if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
- return m_bases [ (unsigned char)rdbId ];
- }
- static bool s_inside = false;
- // . returns NULL w/ g_errno set on error.
- // . TODO: ensure not called from in thread, not thread safe
- RdbBase *CollectionRec::getBase ( char rdbId ) {
- if ( s_inside ) { char *xx=NULL;*xx=0; }
- if ( ! m_swappedOut ) return m_bases[(unsigned char)rdbId];
- log("cdb: swapin collnum=%"INT32"",(int32_t)m_collnum);
- // sanity!
- if ( g_threads.amThread() ) { char *xx=NULL;*xx=0; }
- s_inside = true;
- // turn off quickpoll to avoid getbase() being re-called and
- // coring from s_inside being true
- int32_t saved = g_conf.m_useQuickpoll;
- g_conf.m_useQuickpoll = false;
- // load them back in. return NULL w/ g_errno set on error.
- if ( ! g_collectiondb.addRdbBasesForCollRec ( this ) ) {
- log("coll: error swapin: %s",mstrerror(g_errno));
- g_conf.m_useQuickpoll = saved;
- s_inside = false;
- return NULL;
- }
- g_conf.m_useQuickpoll = saved;
- s_inside = false;
- g_collectiondb.m_numCollsSwappedOut--;
- m_swappedOut = false;
- log("coll: swapin was successful for collnum=%"INT32"",(int32_t)m_collnum);
- return m_bases[(unsigned char)rdbId];
- }
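- // A minimal sketch of the on-demand swap-in above (hypothetical
- // caller): callers never test m_swappedOut themselves, they just
- // call getBase() and handle a NULL return:
- //
- // RdbBase *base = cr->getBase ( RDB_POSDB );
- // if ( ! base ) log("db: swapin failed: %s",mstrerror(g_errno));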
- bool CollectionRec::swapOut ( ) {
- if ( m_swappedOut ) return true;
- log("cdb: swapout collnum=%"INT32"",(int32_t)m_collnum);
- // free all RdbBases in each rdb
- for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
- Rdb *rdb = g_process.m_rdbs[i];
- // this frees all the RdbBase::m_files and m_maps for the base
- rdb->resetBase ( m_collnum );
- }
- // now free each base itself
- for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
- RdbBase *base = m_bases[i];
- if ( ! base ) continue;
- mdelete (base, sizeof(RdbBase), "Rdb Coll");
- delete (base);
- m_bases[i] = NULL;
- }
- m_swappedOut = true;
- g_collectiondb.m_numCollsSwappedOut++;
- return true;
- }
- // . called only by addNewColl() and by addExistingColl()
- bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
- // add m_recs[] and to hashtable
- if ( ! setRecPtr ( cr->m_collnum , cr ) )
- return false;
- return true;
- }
- // swap it in
- bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- CollectionRec *cr = m_recs[i];
- if ( ! cr ) continue;
- // skip if swapped out
- if ( cr->m_swappedOut ) continue;
- // add rdb base files etc. for it
- addRdbBasesForCollRec ( cr );
- }
- // now clean the trees. moved this into here from
- // addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
- // now from getBase() to load on-demand for saving memory
- cleanTrees();
- return true;
- }
- bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
- char *coll = cr->m_coll;
- //////
- //
- // if we are doing a dump from the command line, skip this stuff
- //
- //////
- if ( g_dumpMode ) return true;
- // tell rdbs to add one, too
- //if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
-
- if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- // now clean the trees
- //cleanTrees();
- // debug message
- //log ( LOG_INFO, "db: verified collection \"%s\" (%"INT32").",
- // coll,(int32_t)cr->m_collnum);
- // tell SpiderCache about this collection, it will create a
- // SpiderCollection class for it.
- //g_spiderCache.reset1();
- // success
- return true;
- hadError:
- log("db: error registering coll: %s",mstrerror(g_errno));
- return false;
- }
- /*
- bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
- if ( r->getLong("admin",1) == 0 ) return false;
- if ( g_conf.isMasterAdmin ( s , r ) ) return true;
- char *c = r->getString ( "c" );
- CollectionRec *cr = getRec ( c );
- if ( ! cr ) return false;
- return g_users.hasPermission ( r , PAGE_SEARCH );
- //return cr->hasPermission ( r , s );
- }
- void savingCheckWrapper1 ( int fd , void *state ) {
- WaitEntry *we = (WaitEntry *)state;
- // no state?
- if ( ! we ) { log("colldb: we1 is null"); return; }
- // unregister too
- g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
- // if it blocked again i guess tree is still saving
- if ( ! g_collectiondb.resetColl ( we->m_coll ,
- we ,
- we->m_purgeSeeds))
- return;
- // all done
- we->m_callback ( we->m_state );
- }
- void savingCheckWrapper2 ( int fd , void *state ) {
- WaitEntry *we = (WaitEntry *)state;
- // no state?
- if ( ! we ) { log("colldb: we2 is null"); return; }
- // unregister too
- g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
- // if it blocked again i guess tree is still saving
- if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
- // all done
- we->m_callback ( we->m_state );
- }
- */
- /*
- // delete all records checked in the list
- bool Collectiondb::deleteRecs ( HttpRequest *r ) {
- for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
- char *f = r->getField ( i );
- if ( strncmp ( f , "del" , 3 ) != 0 ) continue;
- char *coll = f + 3;
- //if ( ! is_digit ( f[3] ) ) continue;
- //int32_t h = atol ( f + 3 );
- deleteRec ( coll , NULL );
- }
- return true;
- }
- */
- /*
- // . delete a collection
- // . this uses blocking unlinks, may make non-blocking later
- // . returns false if blocked, true otherwise
- bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
- // force on for now
- //deleteTurkdb = true;
- // no spiders can be out. they may be referencing the CollectionRec
- // in XmlDoc.cpp... quite likely.
- //if ( g_conf.m_spideringEnabled ||
- // g_spiderLoop.m_numSpidersOut > 0 ) {
- // log("admin: Can not delete collection while "
- // "spiders are enabled or active.");
- // return false;
- //}
- // ensure it's not NULL
- if ( ! coll ) {
- log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
- g_errno = ENOTFOUND;
- return true;
- }
- // find the rec for this collection
- collnum_t collnum = getCollnum ( coll );
- return deleteRec2 ( collnum , we );
- }
- */
- // if there is an outstanding disk read thread or merge thread then
- // Spider.cpp will handle the delete in the callback.
- // this is now tryToDeleteSpiderColl in Spider.cpp
- /*
- void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
- sc->m_deleteMyself = true;
- // if not currently being accessed nuke it now
- if ( ! sc->m_msg5.m_waitingForList &&
- ! sc->m_msg5b.m_waitingForList &&
- ! sc->m_msg1.m_mcast.m_inUse ) {
- mdelete ( sc, sizeof(SpiderColl),"nukecr2");
- delete ( sc );
- return;
- }
- }
- */
- /// this deletes the collection, not just part of a reset.
- bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
- // do not allow this if in repair mode
- if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
- log("admin: Can not delete collection while in repair mode.");
- g_errno = EBADENGINEER;
- return true;
- }
- // bitch if not found
- if ( collnum < 0 ) {
- g_errno = ENOTFOUND;
- log(LOG_LOGIC,"admin: Collection #%"INT32" is bad, "
- "delete failed.",(int32_t)collnum);
- return true;
- }
- CollectionRec *cr = m_recs [ collnum ];
- if ( ! cr ) {
- log("admin: Collection id problem. Delete failed.");
- g_errno = ENOTFOUND;
- return true;
- }
- if ( g_process.isAnyTreeSaving() ) {
- // note it
- log("admin: tree is saving. waiting2.");
- // all done
- return false;
- }
- // spiders off
- //if ( cr->m_spiderColl &&
- // cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
- // log("admin: Can not delete collection while "
- // "spiders are outstanding for collection. Turn off "
- // "spiders and wait for them to exit.");
- // return false;
- //}
- char *coll = cr->m_coll;
- // note it
- log(LOG_INFO,"db: deleting coll \"%s\" (%"INT32")",coll,
- (int32_t)cr->m_collnum);
- // we need a save
- m_needsSave = true;
- // nuke doleiptable and waintree and waitingtable
- /*
- SpiderColl *sc = g_spiderCache.getSpiderColl ( collnum );
- sc->m_waitingTree.clear();
- sc->m_waitingTable.clear();
- sc->m_doleIpTable.clear();
- g_spiderLoop.m_lockTable.clear();
- g_spiderLoop.m_lockCache.clear(0);
- sc->m_lastDownloadCache.clear(collnum);
- */
- // CAUTION: tree might be in the middle of saving
- // we deal with this in Process.cpp now
- // remove from spider cache, tell it to sync up with collectiondb
- //g_spiderCache.reset1();
- // . TODO: remove from g_sync
- // . remove from all rdbs
- //g_indexdb.getRdb()->delColl ( coll );
- g_posdb.getRdb()->delColl ( coll );
- //g_datedb.getRdb()->delColl ( coll );
- g_titledb.getRdb()->delColl ( coll );
- //g_revdb.getRdb()->delColl ( coll );
- //g_sectiondb.getRdb()->delColl ( coll );
- g_tagdb.getRdb()->delColl ( coll );
- // let's preserve the tags... they have all the turk votes in them
- //if ( deleteTurkdb ) {
- //}
- //g_catdb.getRdb()->delColl ( coll );
- //g_checksumdb.getRdb()->delColl ( coll );
- g_spiderdb.getRdb()->delColl ( coll );
- g_doledb.getRdb()->delColl ( coll );
- //g_tfndb.getRdb()->delColl ( coll );
- g_clusterdb.getRdb()->delColl ( coll );
- g_linkdb.getRdb()->delColl ( coll );
- // reset spider info
- SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
- if ( sc ) {
- // remove locks from lock table:
- sc->clearLocks();
- //sc->m_collnum = newCollnum;
- //sc->reset();
- // you have to set this for tryToDeleteSpiderColl to
- // actually have a shot at deleting it
- sc->m_deleteMyself = true;
- // cr will be invalid shortly after this
- // MDW: this is causing the core...
- // use fake ptrs for easier debugging
- //sc->m_cr = (CollectionRec *)0x99999;//NULL;
- //sc->m_cr = NULL;
- sc->setCollectionRec ( NULL );
- // this will put it on "death row" so it will be deleted
- // once Msg5::m_waitingForList/Merge is NULL
- tryToDeleteSpiderColl ( sc ,"10");
- //mdelete ( sc, sizeof(SpiderColl),"nukecr2");
- //delete ( sc );
- // don't let cr reference us anymore, sc is on deathrow
- // and "cr" is delete below!
- //cr->m_spiderColl = (SpiderColl *)0x8888;//NULL;
- cr->m_spiderColl = NULL;
- }
- // the bulk urls file too i guess
- if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
- SafeBuf bu;
- bu.safePrintf("%sbulkurls-%s.txt",
- g_hostdb.m_dir , cr->m_coll );
- File bf;
- bf.set ( bu.getBufStart() );
- if ( bf.doesExist() ) bf.unlink();
- }
- // now remove from list of collections that might need a disk merge
- removeFromMergeLinkedList ( cr );
- //////
- //
- // remove from m_recs[]
- //
- //////
- setRecPtr ( cr->m_collnum , NULL );
- // free it
- mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
- delete ( cr );
- // do not do this here in case spiders were outstanding
- // and they added a new coll right away and it ended up getting
- // recs from the deleted coll!!
- //while ( ! m_recs[m_numRecs-1] ) m_numRecs--;
- // update the time
- //updateTime();
- // done
- return true;
- }
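- // A minimal usage sketch (hypothetical caller). deleteRec2() returns
- // false while a tree save is in progress, so callers must retry:
- //
- // collnum_t cn = g_collectiondb.getCollnum ( "oldcoll" );
- // if ( cn >= 0 && ! g_collectiondb.deleteRec2 ( cn ) )
- // log("admin: tree saving, retry delete later");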
- //#include "PageTurk.h"
- /*
- // . reset a collection
- // . returns false if blocked and will call callback
- bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
- // ensure it's not NULL
- if ( ! coll ) {
- log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
- g_errno = ENOCOLLREC;
- return true;
- }
- // get the CollectionRec for "qatest123"
- CollectionRec *cr = getRec ( coll ); // "qatest123" );
- // must be there. if not, we create test i guess
- if ( ! cr ) {
- log("db: could not get coll rec \"%s\" to reset", coll);
- char *xx=NULL;*xx=0;
- }
- return resetColl2 ( cr->m_collnum, purgeSeeds);
- }
- */
- // ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
- bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
- // an add, make sure big enough
- int32_t need = ((int32_t)collnum+1)*sizeof(CollectionRec *);
- int32_t have = m_recPtrBuf.getLength();
- int32_t need2 = need - have;
- // if already big enough
- if ( need2 <= 0 ) {
- m_recs [ collnum ] = NULL;
- return true;
- }
- m_recPtrBuf.setLabel ("crecptrb");
- // . true here means to clear the new space to zeroes
- // . this shit works based on m_length not m_capacity
- if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
- log("admin: error growing rec ptr buf2.");
- return false;
- }
- // sanity
- if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
- // set it
- m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
- // update length of used bytes in case we re-alloc
- m_recPtrBuf.setLength ( need );
- // re-max
- int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
- // sanity
- if ( collnum >= max ) { char *xx=NULL;*xx=0; }
- // initialize slot
- m_recs [ collnum ] = NULL;
- return true;
- }
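- // e.g. growing for collnum 2 reserves room for 3 pointers,
- // (2+1)*sizeof(CollectionRec *) = 24 bytes on a 64-bit build, and
- // zeroes the new slots so scans over m_recs[] see NULL, not garbage.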
- bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
- // first time init hashtable that maps coll to collnum
- if ( g_collTable.m_numSlots == 0 &&
- ! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
- false,0,"nhshtbl"))
- return false;
- // sanity
- if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
- // sanity
- int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
- // set it
- m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
- // tell spiders to re-update the active list
- g_spiderLoop.m_activeListValid = false;
- g_spiderLoop.m_activeListModified = true;
- // a delete?
- if ( ! cr ) {
- // sanity
- if ( collnum >= max ) { char *xx=NULL;*xx=0; }
- // get what's there
- CollectionRec *oc = m_recs[collnum];
- // let it go
- m_recs[collnum] = NULL;
- // if nothing already, done
- if ( ! oc ) return true;
- // tally it up
- m_numRecsUsed--;
- // delete key
- int64_t h64 = hash64n(oc->m_coll);
- // if in the hashtable UNDER OUR COLLNUM then nuke it
- // otherwise, we might be called from resetColl2()
- void *vp = g_collTable.getValue ( &h64 );
- if ( ! vp ) return true;
- collnum_t ct = *(collnum_t *)vp;
- if ( ct != collnum ) return true;
- g_collTable.removeKey ( &h64 );
- return true;
- }
- // ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
- if ( ! growRecPtrBuf ( collnum ) )
- return false;
- // sanity
- if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }
- // add to hash table to map name to collnum_t
- int64_t h64 = hash64n(cr->m_coll);
- // debug
- //log("coll: adding key %"INT64" for %s",h64,cr->m_coll);
- if ( ! g_collTable.addKey ( &h64 , &collnum ) )
- return false;
- // ensure last is NULL
- m_recs[collnum] = cr;
- // count it
- m_numRecsUsed++;
- //log("coll: adding key4 %"UINT64" for coll \"%s\" (%"INT32")",h64,cr->m_coll,
- // (int32_t)i);
- // reserve it
- if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;
- // sanity to make sure collectionrec ptrs are legit: the dereference
- // in the loop below will fault on a bogus pointer
- for ( int32_t j = 0 ; j < m_numRecs ; j++ ) {
- if ( ! m_recs[j] ) continue;
- if ( m_recs[j]->m_collnum == 1 ) continue;
- }
- // update the time
- //updateTime();
- return true;
- }
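- // A minimal sketch of the two call patterns (hypothetical collnum):
- //
- // setRecPtr ( cn , cr ); // add: the registerCollRec() path
- // setRecPtr ( cn , NULL ); // delete: deleteRec2()/resetColl2() path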
- // moves a file by first trying rename, then copying since cross device renaming doesn't work
- // returns 0 on success
- int mv(char* src, char* dest) {
- int status = rename( src , dest );
- if (status == 0)
- return 0;
- FILE *fsrc, *fdest;
- fsrc = fopen(src, "r");
- if (fsrc == NULL)
- return -1;
- fdest = fopen(dest, "w");
- if (fdest == NULL) {
- fclose(fsrc);
- return -1;
- }
- const int BUF_SIZE = 1024;
- char buf[BUF_SIZE];
- while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
- int read = fread(buf, 1, BUF_SIZE, fsrc);
- fwrite(buf, 1, read, fdest);
- }
- // check stream errors BEFORE fclose(); calling ferror() on a
- // closed FILE* is undefined behavior
- bool hadError = ferror(fdest) || ferror(fsrc);
- fclose(fsrc);
- fclose(fdest);
- if (hadError)
- return -1;
- remove(src);
- return 0;
- }
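- // A minimal usage sketch (hypothetical paths), matching the bulk url
- // moves commented out in resetColl2() below:
- //
- // if ( mv ( "/tmp/coll.main.0.bulkurls.txt" ,
- // "/data/coll.main.0/bulkurls.txt" ) != 0 )
- // log("admin: bulk url move failed");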
- // . returns false if we need a re-call, true if we completed
- // . returns true with g_errno set on error
- bool Collectiondb::resetColl2( collnum_t oldCollnum,
- collnum_t newCollnum,
- //WaitEntry *we,
- bool purgeSeeds){
- // save parms in case we block
- //we->m_purgeSeeds = purgeSeeds;
- // now must be "qatest123" only for now
- //if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
- // no spiders can be out. they may be referencing the CollectionRec
- // in XmlDoc.cpp... quite likely.
- //if ( g_conf.m_spideringEnabled ||
- // g_spiderLoop.m_numSpidersOut > 0 ) {
- // log("admin: Can not delete collection while "
- // "spiders are enabled or active.");
- // return false;
- //}
- // do not allow this if in repair mode
- if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
- log("admin: Can not delete collection while in repair mode.");
- g_errno = EBADENGINEER;
- return true;
- }
- //log("admin: resetting collnum %"INT32"",(int32_t)oldCollnum);
- // CAUTION: tree might be in the middle of saving
- // we deal with this in Process.cpp now
- if ( g_process.isAnyTreeSaving() ) {
- // we could not complete...
- return false;
- }
- CollectionRec *cr = m_recs [ oldCollnum ];
- // let's reset crawlinfo crap
- cr->m_globalCrawlInfo.reset();
- cr->m_localCrawlInfo.reset();
- //collnum_t oldCollnum = cr->m_collnum;
- //collnum_t newCollnum = m_numRecs;
- // in case of bulk job, be sure to save list of spots
- // copy existing list to /tmp, where it will later be transferred back to the new folder
- // now i just store in the root working dir... MDW
- /*
- char oldbulkurlsname[1036];
- snprintf(oldbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)oldCollnum);
- char newbulkurlsname[1036];
- snprintf(newbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)newCollnum);
- char tmpbulkurlsname[1036];
- snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%"INT32".bulkurls.txt",cr->m_coll,(int32_t)oldCollnum);
- if (cr->m_isCustomCrawl == 2)
- mv( oldbulkurlsname , tmpbulkurlsname );
- */
- // reset spider info
- SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
- if ( sc ) {
- // remove locks from lock table:
- sc->clearLocks();
- // don't do this anymore, just nuke it in case
- // m_populatingDoledb was true etc. there are too many
- // flags to worry about
- //sc->m_collnum = newCollnum;
- //sc->reset();
- // this will put it on "death row" so it will be deleted
- // once Msg5::m_waitingForList/Merge is NULL
- tryToDeleteSpiderColl ( sc,"11" );
- //mdelete ( sc, sizeof(SpiderColl),"nukecr2");
- //delete ( sc );
- cr->m_spiderColl = NULL;
- }
- // reset spider round
- cr->m_spiderRoundNum = 0;
- cr->m_spiderRoundStartTime = 0;
- cr->m_spiderStatus = SP_INITIALIZING; // this is 0
- //cr->m_spiderStatusMsg = NULL;
- // reset seed buf
- if ( purgeSeeds ) {
- // free the buffer of seed urls
- cr->m_diffbotSeeds.purge();
- // reset seed dedup table
- HashTableX *ht = &cr->m_seedHashTable;
- ht->reset();
- }
- // so XmlDoc.cpp can detect if the collection was reset since it
- // launched its spider:
- cr->m_lastResetCount++;
- if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;
- // advance sanity check. did we wrap around?
- // right now we #define collnum_t int16_t
- if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
- // make a new collnum so records in transit will not be added
- // to any rdb...
- cr->m_collnum = newCollnum;
- // update the timestamps since we are restarting/resetting
- cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
- cr->m_diffbotCrawlEndTime = 0;
- ////////
- //
- // ALTER m_recs[] array
- //
- ////////
- // Rdb::resetColl() needs to know the new cr so it can move
- // the RdbBase into cr->m_bases[rdbId] array. recycling.
- setRecPtr ( newCollnum , cr );
- // a new directory then since we changed the collnum
- char dname[512];
- sprintf(dname, "%scoll.%s.%"INT32"/",
- g_hostdb.m_dir,
- cr->m_coll,
- (int32_t)newCollnum);
- DIR *dir = opendir ( dname );
- if ( dir )
- closedir ( dir );
- if ( dir ) {
- //g_errno = EEXIST;
- log("admin: Trying to create collection %s but "
- "directory %s already exists on disk.",cr->m_coll,dname);
- }
- if ( ::mkdir ( dname ,
- getDirCreationFlags() ) ) {
- // S_IRUSR | S_IWUSR | S_IXUSR |
- // S_IRGRP | S_IWGRP | S_IXGRP |
- // S_IROTH | S_IXOTH ) ) {
- // valgrind?
- //if ( errno == EINTR ) goto retry22;
- //g_errno = errno;
- log("admin: Creating directory %s had error: "
- "%s.", dname,mstrerror(g_errno));
- }
- // be sure to copy back the bulk urls for bulk jobs
- // MDW: now i just store that file in the root working dir
- //if (cr->m_isCustomCrawl == 2)
- // mv( tmpbulkurlsname, newbulkurlsname );
- // . unlink all the *.dat and *.map files for this coll in its subdir
- // . remove all recs from this collnum from m_tree/m_buckets
- // . updates RdbBase::m_collnum
- // . so for the tree it just needs to mark the old collnum recs
- // with a collnum -1 in case it is saving...
- g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
- // reset crawl status too!
- cr->m_spiderStatus = SP_INITIALIZING;
- // . set m_recs[oldCollnum] to NULL and remove from hash table
- // . do after calls to deleteColl() above so it won't crash
- setRecPtr ( oldCollnum , NULL );
- // save coll.conf to new directory
- cr->save();
- // and clear the robots.txt cache in case we recently spidered a
- // robots.txt, we don't want to use it, we want to use the one we
- // have in the test-parser subdir so we are consistent
- //RdbCache *robots = Msg13::getHttpCacheRobots();
- //RdbCache *others = Msg13::getHttpCacheOthers();
- // clear() was removed due to possible corruption
- //robots->clear ( oldCollnum );
- //others->clear ( oldCollnum );
- //g_templateTable.reset();
- //g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
- // repopulate CollectionRec::m_sortByDateTable. should be empty
- // since we are resetting here.
- //initSortByDateTable ( coll );
- // done
- return true;
- }
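- // A minimal usage sketch (hypothetical caller): a reset keeps the coll
- // name but moves it to a fresh collnum, so in-flight msg4 adds keyed
- // to the old collnum can no longer land in its rdbs:
- //
- // collnum_t nc = g_collectiondb.reserveCollNum();
- // if ( nc >= 0 )
- // g_collectiondb.resetColl2 ( cr->m_collnum , nc , true );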
- // a hack function
- bool addCollToTable ( char *coll , collnum_t collnum ) {
- // readd it to the hashtable that maps name to collnum too
- int64_t h64 = hash64n(coll);
- g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
- false,0,"nhshtbl");
- return g_collTable.addKey ( &h64 , &collnum );
- }
- // get coll rec specified in the HTTP request
- CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
- char *coll = r->getString ( "c" );
- if ( coll && ! coll[0] ) coll = NULL;
- // maybe it is crawlbot?
- char *name = NULL;
- char *token = NULL;
- if ( ! coll ) {
- name = r->getString("name");
- token = r->getString("token");
- }
- char tmp[MAX_COLL_LEN+1];
- if ( ! coll && token && name ) {
- snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
- coll = tmp;
- }
- // default to main first
- if ( ! coll && useDefaultRec ) {
- CollectionRec *cr = g_collectiondb.getRec("main");
- if ( cr ) return cr;
- }
- // try next in line
- if ( ! coll && useDefaultRec ) {
- return getFirstRec ();
- }
- // give up?
- if ( ! coll ) return NULL;
- //if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
- return g_collectiondb.getRec ( coll );
- }
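- // e.g. (hypothetical request) "?token=abc&name=mycrawl" with no "c"
- // parm resolves to the crawlbot coll "abc-mycrawl" via the snprintf
- // above.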
- char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
- char *coll = r->getString ( "c" );
- if ( coll && ! coll[0] ) coll = NULL;
- if ( coll ) return coll;
- CollectionRec *cr = NULL;
- // default to main first
- if ( ! coll ) {
- cr = g_collectiondb.getRec("main");
- // CAUTION: cr could be deleted so don't trust this ptr
- // if you give up control of the cpu
- if ( cr ) return cr->m_coll;
- }
- // try next in line
- if ( ! coll ) {
- cr = getFirstRec ();
- if ( cr ) return cr->m_coll;
- }
- // give up?
- return NULL;
- }
- //CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
- // char *coll = getDefaultColl();
- // return g_collectiondb.getRec(coll);
- //}
- // . get collectionRec from name
- // . returns NULL if not available
- CollectionRec *Collectiondb::getRec ( char *coll ) {
- if ( ! coll ) coll = "";
- return getRec ( coll , gbstrlen(coll) );
- }
- CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
- if ( ! coll ) coll = "";
- collnum_t collnum = getCollnum ( coll , collLen );
- if ( collnum < 0 ) return NULL;
- return m_recs [ (int32_t)collnum ];
- }
- CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
- if ( collnum >= m_numRecs || collnum < 0 ) {
- // Rdb::resetBase() gets here, so don't always log.
- // it is called from CollectionRec::reset() which is called
- // from the CollectionRec constructor and ::load() so
- // it won't have anything in rdb at that time
- //log("colldb: collnum %"INT32" > numrecs = %"INT32"",
- // (int32_t)collnum,(int32_t)m_numRecs);
- return NULL;
- }
- return m_recs[collnum];
- }
- //CollectionRec *Collectiondb::getDefaultRec ( ) {
- // if ( ! g_conf.m_defaultColl[0] ) return NULL; // no default?
- // collnum_t collnum = getCollnum ( g_conf.m_defaultColl );
- // if ( collnum < (collnum_t)0 ) return NULL;
- // return m_recs[(int32_t)collnum];
- //}
- CollectionRec *Collectiondb::getFirstRec ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ )
- if ( m_recs[i] ) return m_recs[i];
- return NULL;
- }
- collnum_t Collectiondb::getFirstCollnum ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ )
- if ( m_recs[i] ) return i;
- return (collnum_t)-1;
- }
- char *Collectiondb::getFirstCollName ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ )
- if ( m_recs[i] ) return m_recs[i]->m_coll;
- return NULL;
- }
- char *Collectiondb::getCollName ( collnum_t collnum ) {
- if ( collnum < 0 || collnum >= m_numRecs ) return NULL;
- if ( ! m_recs[(int32_t)collnum] ) return NULL;
- return m_recs[collnum]->m_coll;
- }
- collnum_t Collectiondb::getCollnum ( char *coll ) {
- int32_t clen = 0;
- if ( coll ) clen = gbstrlen(coll );
- return getCollnum ( coll , clen );
- /*
- //if ( ! coll ) coll = "";
- // default empty collection names
- if ( coll && ! coll[0] ) coll = NULL;
- if ( ! coll ) coll = g_conf.m_defaultColl;
- if ( ! coll || ! coll[0] ) coll = "main";
- // This is necessary for Statsdb to work, as it is
- // not associated with any collection. Is this
- // necessary for Catdb?
- if ( coll[0]=='s' && coll[1] =='t' &&
- strcmp ( "statsdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='f' && coll[1]=='a' &&
- strcmp ( "facebookdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='a' && coll[1]=='c' &&
- strcmp ( "accessdb\0", coll ) == 0)
- return 0;
- // because diffbot may have thousands of crawls/collections
- // let's improve the speed here. try hashing it...
- int64_t h64 = hash64n(coll);
- void *vp = g_collTable.getValue ( &h64 );
- if ( ! vp ) return -1; // not found
- return *(collnum_t *)vp;
- */
- /*
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- if ( m_recs[i]->m_coll[0] != coll[0] ) continue;
- if ( strcmp ( m_recs[i]->m_coll , coll ) == 0 ) return i;
- }
- //if ( strcmp ( "catdb\0", coll ) == 0) return 0;
- return (collnum_t)-1; // not found
- */
- }
- collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
- // default empty collection names
- if ( coll && ! coll[0] ) coll = NULL;
- if ( ! coll ) {
- coll = g_conf.m_defaultColl;
- if ( coll ) clen = gbstrlen(coll);
- else clen = 0;
- }
- if ( ! coll || ! coll[0] ) {
- coll = "main";
- clen = gbstrlen(coll);
- }
- //if ( ! coll ) coll = "";
- // This is necessary for Statsdb to work, as it is
- // not associated with any collection. Is this
- // necessary for Catdb?
- if ( coll[0]=='s' && coll[1] =='t' &&
- strcmp ( "statsdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='f' && coll[1]=='a' &&
- strcmp ( "facebookdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='a' && coll[1]=='c' &&
- strcmp ( "accessdb\0", coll ) == 0)
- return 0;
- // because diffbot may have thousands of crawls/collections
- // let's improve the speed here. try hashing it...
- int64_t h64 = hash64(coll,clen);
- void *vp = g_collTable.getValue ( &h64 );
- if ( ! vp ) return -1; // not found
- return *(collnum_t *)vp;
- /*
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- if ( m_recs[i]->m_collLen != clen ) continue;
- if ( strncmp(m_recs[i]->m_coll,coll,clen) == 0 ) return i;
- }
- //if ( strncmp ( "catdb\0", coll, clen ) == 0) return 0;
- return (collnum_t)-1; // not found
- */
- }
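- // A minimal usage sketch (hypothetical coll name): the hash lookup
- // above makes this O(1) even with thousands of diffbot collections:
- //
- // collnum_t cn = g_collectiondb.getCollnum ( "mycoll" );
- // if ( cn < 0 ) log("db: no such coll");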
- //collnum_t Collectiondb::getNextCollnum ( collnum_t collnum ) {
- // for ( int32_t i = (int32_t)collnum + 1 ; i < m_numRecs ; i++ )
- // if ( m_recs[i] ) return i;
- // // no next one, use -1
- // return (collnum_t) -1;
- //}
- // what collnum will be used the next time a coll is added?
- collnum_t Collectiondb::reserveCollNum ( ) {
- if ( m_numRecs < 0x7fff ) {
- collnum_t next = m_numRecs;
- // make the ptr NULL at least to accommodate the
- // loops that scan up to m_numRecs, lest we core
- growRecPtrBuf ( next );
- m_numRecs++;
- return next;
- }
- // collnum_t is signed right now because we use -1 to indicate a
- // bad collnum.
- int32_t scanned = 0;
- // search for an empty slot
- for ( int32_t i = m_wrapped ; ; i++ ) {
- // because collnum_t is 2 bytes, signed, limit this here
- if ( i > 0x7fff ) i = 0;
- // how can this happen?
- if ( i < 0 ) i = 0;
- // if we scanned the max # of recs we could have, we are done
- if ( ++scanned >= m_numRecs ) break;
- // skip if this is in use
- if ( m_recs[i] ) continue;
- // start after this one next time
- m_wrapped = i+1;
- // note it
- log("colldb: returning wrapped collnum "
- "of %"INT32"",(int32_t)i);
- return (collnum_t)i;
- }
- log("colldb: no new collnum available. consider upping collnum_t");
- // none available!!
- return -1;
- }
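- // Note: collnum_t is #defined as int16_t, so at most 0x7fff (32767)
- // collnums exist before the wrap-around scan above must find a hole
- // freed by a prior delete.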
- ///////////////
- //
- // COLLECTIONREC
- //
- ///////////////
- #include "gb-include.h"
- //#include "CollectionRec.h"
- //#include "Collectiondb.h"
- #include "HttpServer.h" // printColors2()
- #include "Msg5.h"
- #include "Threads.h"
- #include "Datedb.h"
- #include "Timedb.h"
- #include "Spider.h"
- #include "Process.h"
- static CollectionRec g_default;
- CollectionRec::CollectionRec() {
- m_nextLink = NULL;
- m_prevLink = NULL;
- m_spiderCorruptCount = 0;
- m_collnum = -1;
- m_coll[0] = '\0';
- m_updateRoundNum = 0;
- m_swappedOut = false;
- //m_numSearchPwds = 0;
- //m_numBanIps = 0;
- //m_numSearchIps = 0;
- //m_numSpamIps = 0;
- //m_numAdminPwds = 0;
- //m_numAdminIps = 0;
- memset ( m_bases , 0 , sizeof(RdbBase *)*RDB_END );
- // how many keys in the tree of each rdb? we now store this stuff
- // here and not in RdbTree.cpp because we no longer have a maximum
- // # of collection recs... MAX_COLLS. each is a 32-bit "int32_t" so
- // it is 4 * RDB_END...
- memset ( m_numNegKeysInTree , 0 , 4*RDB_END );
- memset ( m_numPosKeysInTree , 0 , 4*RDB_END );
- m_spiderColl = NULL;
- // sentinel (canary) values used to detect memory overruns
- m_overflow = 0x12345678;
- m_overflow2 = 0x12345678;
- // the spiders are currently uninhibited i guess
- m_spiderStatus = SP_INITIALIZING; // this is 0
- //m_spiderStatusMsg = NULL;
- // for Url::getSite()
- m_updateSiteRulesTable = 1;
- //m_lastUpdateTime = 0LL;
- m_clickNScrollEnabled = false;
- // inits for sortbydatetable
- m_inProgress = false;
- m_msg5 = NULL;
- m_importState = NULL;
- // JAB - track which regex parsers have been initialized
- //log(LOG_DEBUG,"regex: %p initializing empty parsers", m_pRegExParser);
- // clear these out so Parms::calcChecksum can work:
- memset( m_spiderFreqs, 0, MAX_FILTERS*sizeof(*m_spiderFreqs) );
- //for ( int i = 0; i < MAX_FILTERS ; i++ )
- // m_spiderQuotas[i] = -1;
- memset( m_spiderPriorities, 0,
- MAX_FILTERS*sizeof(*m_spiderPriorities) );
- memset ( m_harvestLinks,0,MAX_FILTERS);
- memset ( m_forceDelete,0,MAX_FILTERS);
- //memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
- //for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
- // *(m_searchPwds[i]) = '\0';
- //}
- //for ( int i = 0; i < MAX_ADMIN_PASSWORDS; i++ ) {
- // *(m_adminPwds[i]) = '\0';
- //}
- //memset( m_banIps, 0, MAX_BANNED_IPS*sizeof(*m_banIps) );
- //memset( m_searchIps, 0, MAX_SEARCH_IPS*sizeof(*m_searchIps) );
- //memset( m_spamIps, 0, MAX_SPAM_IPS*sizeof(*m_spamIps) );
- //memset( m_adminIps, 0, MAX_ADMIN_IPS*sizeof(*m_adminIps) );
- //for ( int i = 0; i < MAX_FILTERS; i++ ) {
- // //m_pRegExParser[i] = NULL;
- // *(m_regExs[i]) = '\0';
- //}
- m_numRegExs = 0;
- //m_requests = 0;
- //m_replies = 0;
- //m_doingCallbacks = false;
- m_lastResetCount = 0;
- // regex_t types
- m_hasucr = false;
- m_hasupr = false;
- // for diffbot caching the global spider stats
- reset();
- // add default reg ex if we do not have one
- //setUrlFiltersToDefaults();
- //rebuildUrlFilters();
- }
- CollectionRec::~CollectionRec() {
- //invalidateRegEx ();
- reset();
- }
- // new collection recs get this called on them
- void CollectionRec::setToDefaults ( ) {
- g_parms.setFromFile ( this , NULL , NULL , OBJ_COLL );
- // add default reg ex
- //setUrlFiltersToDefaults();
- rebuildUrlFilters();
- }
- void CollectionRec::reset() {
- //log("coll: resetting collnum=%"INT32"",(int32_t)m_collnum);
- // . grows dynamically
- // . setting to 0 buckets should never have error
- //m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
- // regex_t types
- if ( m_hasucr ) regfree ( &m_ucr );
- if ( m_hasupr ) regfree ( &m_upr );
- m_hasucr = false;
- m_hasupr = false;
- m_sendingAlertInProgress = false;
- // make sure we do not leave spiders "hanging" waiting for their
- // callback to be called... and it never gets called
- //if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
- //if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
- //if ( m_replies != m_requests ) { char *xx=NULL;*xx=0; }
- m_localCrawlInfo.reset();
- m_globalCrawlInfo.reset();
- //m_requests = 0;
- //m_replies = 0;
- // free all RdbBases in each rdb
- for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
- Rdb *rdb = g_process.m_rdbs[i];
- rdb->resetBase ( m_collnum );
- }
- for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
- RdbBase *base = m_bases[i];
- if ( ! base ) continue;
- mdelete (base, sizeof(RdbBase), "Rdb Coll");
- delete (base);
- }
- SpiderColl *sc = m_spiderColl;
- // debug hack thing
- //if ( sc == (SpiderColl *)0x8888 ) return;
- // if never made one, we are done
- if ( ! sc ) return;
- // spider coll also!
- sc->m_deleteMyself = true;
- // if not currently being accessed nuke it now
- tryToDeleteSpiderColl ( sc ,"12");
- // if ( ! sc->m_msg5.m_waitingForList &&
- // ! sc->m_msg5b.m_waitingForList &&
- // ! sc->m_msg1.m_mcast.m_inUse ) {
- // mdelete ( sc, sizeof(SpiderColl),"nukecr2");
- // delete ( sc );
- // }
- }
- CollectionRec *g_cr = NULL;
- // . load this data from a conf file
- // . values we do not explicitly have will be taken from "default",
- // collection config file. if it does not have them then we use
- // the value we received from call to setToDefaults()
- // . returns false and sets g_errno on load error
- bool CollectionRec::load ( char *coll , int32_t i ) {
- // also reset some counts not included in parms list
- reset();
- // before we load, set to defaults in case some are not in xml file
- g_parms.setToDefault ( (char *)this , OBJ_COLL , this );
- // get the filename with that id
- File f;
- char tmp2[1024];
- sprintf ( tmp2 , "%scoll.%s.%"INT32"/coll.conf", g_hostdb.m_dir , coll,i);
- f.set ( tmp2 );
- if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
- // set our collection number
- m_collnum = i;
- // set our collection name
- m_collLen = gbstrlen ( coll );
- strcpy ( m_coll , coll );
- if ( ! g_conf.m_doingCommandLine )
- log(LOG_INFO,"db: Loading conf for collection %s (%"INT32")",coll,
- (int32_t)m_collnum);
- // collection name HACK for backwards compatibility
- //if ( strcmp ( coll , "main" ) == 0 ) {
- // m_coll[0] = '\0';
- // m_collLen = 0;
- //}
- // the default conf file
- char tmp1[1024];
- snprintf ( tmp1 , 1023, "%sdefault.conf" , g_hostdb.m_dir );
- // . set our parms from the file.
- // . accepts OBJ_COLLECTIONREC or OBJ_CONF
- g_parms.setFromFile ( this , tmp2 , tmp1 , OBJ_COLL );
- // add default reg ex IFF there are no url filters there now
- //if(m_numRegExs == 0) rebuildUrlFilters();//setUrlFiltersToDefaults();
- // this only rebuild them if necessary
- rebuildUrlFilters();//setUrlFiltersToDefaults();
- // temp check
- //testRegex();
- //
- // LOAD the crawlinfo class in the collectionrec for diffbot
- //
- // LOAD LOCAL
- snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- log(LOG_DEBUG,"db: Loading %s",tmp1);
- m_localCrawlInfo.reset();
- SafeBuf sb;
- // fillFromFile() returns 0 if the file does not exist, -1 on read error
- int32_t nb = sb.fillFromFile ( tmp1 );
- //m_localCrawlInfo.setFromSafeBuf(&sb);
- // it is binary now. clamp the copy so a corrupt or oversized file
- // can not overrun the in-memory CrawlInfo.
- if ( nb > (int32_t)sizeof(CrawlInfo) ) nb = (int32_t)sizeof(CrawlInfo);
- if ( nb > 0 )
- gbmemcpy ( &m_localCrawlInfo , sb.getBufStart() , nb );
- // if the file held corrupted data (e.g. saved from corrupted memory)
- // zero it out
- CrawlInfo *stats = &m_localCrawlInfo;
- // point to the stats for that host
- int64_t *ss = (int64_t *)stats;
- // are stats crazy?
- bool crazy = false;
- for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
- // crazy stat?
- if ( *ss > 1000000000LL ||
- *ss < -1000000000LL ) {
- crazy = true;
- break;
- }
- ss++;
- }
- if ( m_localCrawlInfo.m_collnum != m_collnum )
- crazy = true;
- if ( crazy ) {
- log("coll: had crazy spider stats for coll %s. zeroing out.",
- m_coll);
- m_localCrawlInfo.reset();
- }
- if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
- log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
- m_coll,
- (int32_t)m_collnum,
- (int32_t)m_localCrawlInfo.m_hasUrlsReadyToSpider);
- // we introduced the per-round counts later, so don't start them at 0!!
- if ( m_spiderRoundNum == 0 &&
- m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
- m_localCrawlInfo.m_pageDownloadSuccesses ) {
- log("coll: fixing process count this round for %s",m_coll);
- m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
- m_localCrawlInfo.m_pageDownloadSuccesses;
- }
- // we introduced the per-round counts later, so don't start them at 0!!
- if ( m_spiderRoundNum == 0 &&
- m_localCrawlInfo.m_pageProcessSuccessesThisRound <
- m_localCrawlInfo.m_pageProcessSuccesses ) {
- log("coll: fixing process count this round for %s",m_coll);
- m_localCrawlInfo.m_pageProcessSuccessesThisRound =
- m_localCrawlInfo.m_pageProcessSuccesses;
- }
- // fix from old bug that was fixed
- //if ( m_spiderRoundNum == 0 &&
- // m_collectiveRespiderFrequency > 0.0 &&
- // m_localCrawlInfo.m_sentCrawlDoneAlert ) {
- // log("coll: bug fix: resending email alert for coll %s (%"INT32") "
- // "of respider freq %f",m_coll,(int32_t)m_collnum,
- // m_collectiveRespiderFrequency);
- // m_localCrawlInfo.m_sentCrawlDoneAlert = false;
- //}
- // LOAD GLOBAL
- snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- log(LOG_DEBUG,"db: Loading %s",tmp1);
- m_globalCrawlInfo.reset();
- sb.reset();
- nb = sb.fillFromFile ( tmp1 );
- //m_globalCrawlInfo.setFromSafeBuf(&sb);
- // it is binary now. clamp as above to avoid overrunning CrawlInfo.
- if ( nb > (int32_t)sizeof(CrawlInfo) ) nb = (int32_t)sizeof(CrawlInfo);
- if ( nb > 0 )
- gbmemcpy ( &m_globalCrawlInfo , sb.getBufStart() , nb );
- if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
- log("coll: Loaded %s (%"INT32") global hasurlsready=%"INT32"",
- m_coll,
- (int32_t)m_collnum,
- (int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
- // the list of ip addresses that we have detected as being throttled
- // and therefore back off and use proxies for
- if ( ! g_conf.m_doingCommandLine ) {
- sb.reset();
- sb.safePrintf("%scoll.%s.%"INT32"/",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- m_twitchyTable.m_allocName = "twittbl";
- m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
- }
-
- ////////////
- //
- // PAGE COUNT TABLE for doing quotas in url filters
- //
- /////////////
- // load it up if it's there on disk
- //snprintf ( tmp1 , 1023, "/coll.%s.%"INT32"/pagecounts.dat",
- // m_coll , (int32_t)m_collnum );
- //if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
- // log("db: failed to load page count table: %s",
- // mstrerror(g_errno));
- // ignore errors i guess
- g_errno = 0;
- // fix for diffbot, spider time deduping
- if ( m_isCustomCrawl ) m_dedupingEnabled = true;
- // always turn off gigabits so &s=1000 can do summary skipping
- if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;
- // make min to merge smaller than normal since most collections are
- // small and we want to reduce the # of vfds (files) we have
- if ( m_isCustomCrawl ) {
- m_posdbMinFilesToMerge = 6;
- m_titledbMinFilesToMerge = 4;
- m_linkdbMinFilesToMerge = 3;
- m_tagdbMinFilesToMerge = 2;
- }
- // always turn on distributed spider locking because otherwise
- // we end up calling Msg50 which calls Msg25 for the same root url
- // at the same time, thereby wasting massive resources. it is also
- // dangerous to run without this because webmasters get pissed when
- // we slam their servers.
- // This is now deprecated...
- //m_useSpiderLocks = false;
- // and all pages downloaded from a particular ip should be done
- // by the same host in our cluster to prevent webmaster rage
- //m_distributeSpiderGet = true;
- //initSortByDateTable(m_coll);
- return true;
- }
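- // illustrative sketch, not part of the original file: the "crazy
- // stats" check in load() above could be factored into a helper that
- // validates any binary-loaded CrawlInfo before trusting it. the
- // helper name is hypothetical.
- /*
- static bool crawlInfoLooksSane ( CrawlInfo *ci , collnum_t expected ) {
- // the counters live in a contiguous block of int64_t's
- int64_t *ss = (int64_t *)ci;
- for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ , ss++ )
- // a counter beyond +/- one billion means corruption
- if ( *ss > 1000000000LL || *ss < -1000000000LL )
- return false;
- // the snapshot must belong to this collection
- return ci->m_collnum == expected;
- }
- */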
- /*
- bool CollectionRec::countEvents ( ) {
- // set our m_numEventsOnHost value
- log("coll: loading event count termlist gbeventcount");
- // temporarily turn off threads
- bool enabled = g_threads.areThreadsEnabled();
- g_threads.disableThreads();
- // count them
- m_numEventsOnHost = 0;
- // 1MB at a time
- int32_t minRecSizes = 1000000;
- // look up this termlist, gbeventcount which we index in XmlDoc.cpp
- int64_t termId = hash64n("gbeventcount") & TERMID_MASK;
- // make datedb key from it
- key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
- key128_t endKey = g_datedb.makeEndKey ( termId , 0 );
-
- Msg5 msg5;
- RdbList list;
- // . init m_numEventsOnHost by getting the exact length of that
- // termlist on this host
- // . send in the ping request packet so all hosts can total up
- // . Rdb.cpp should be added to incrementally so we should have no
- // double counting.
- // . Rdb.cpp should inspect each datedb rec for this termid in
- // a fast and efficient manner
- loop:
- // use msg5 to get the list; it should never block since threads are off
- if ( ! msg5.getList ( RDB_DATEDB ,
- m_coll ,
- &list ,
- (char *)&startKey ,
- (char *)&endKey ,
- minRecSizes ,
- true , // includeTree ,
- false , // add to cache?
- 0 , // max cache age
- 0 , // startFileNum ,
- -1 , // numFiles ,
- NULL , // state
- NULL , // callback
- 0 , // niceness
- false , // err correction?
- NULL , // cache key ptr
- 0 , // retry num
- -1 , // maxRetries
- true , // compensate for merge
- -1LL , // sync point
- NULL )){// msg5b
- // not allowed to block!
- char *xx=NULL;*xx=0; }
- // scan the list, score is how many valid events from that docid
- uint32_t total = 0;
- for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
- unsigned char *rec = (unsigned char *)list.getCurrentRec();
- // in datedb score is byte #5
- total += (255-rec[5]);
- }
- // declare
- char *lastKeyPtr;
- key128_t newStartKey;
- // add to count. datedb uses half keys so subtract 6 bytes
- // since the termids will be the same...
- //m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
- m_numEventsOnHost += total;
- // bail if under limit
- if ( list.getListSize() < minRecSizes ) goto done;
- // update key
- lastKeyPtr = list.m_listEnd - 10;
- // we make a new start key
- list.getKey ( lastKeyPtr , (char *)&newStartKey );
- // maxxed out?
- if ( newStartKey.n0==0xffffffffffffffffLL &&
- newStartKey.n1==0xffffffffffffffffLL )
- goto done;
- // sanity check
- if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
- if ( newStartKey > endKey ) { char *xx=NULL;*xx=0; }
- // inc it
- newStartKey.n0++;
- // carry into the top half if the bottom half wrapped
- if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
- // assign
- startKey = newStartKey;
- // and loop back up for more now
- goto loop;
- done:
- // update all colls count
- g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;
- if ( enabled ) g_threads.enableThreads();
- log("coll: got %"INT32" local events in termlist",m_numEventsOnHost);
- // set "m_hasDocQualityFiler"
- //updateFilters();
- return true;
- }
- */
- bool CollectionRec::rebuildUrlFilters2 ( ) {
- // tell spider loop to update active list
- g_spiderLoop.m_activeListValid = false;
- bool rebuild = true;
- if ( m_numRegExs == 0 )
- rebuild = true;
- // don't touch it if not supposed to, as long as we have some already
- //if ( m_urlFiltersProfile != UFP_NONE )
- // rebuild = true;
- // never for custom crawls however
- if ( m_isCustomCrawl )
- rebuild = false;
- char *s = m_urlFiltersProfile.getBufStart();
- // support the old UFP_CUSTOM, etc. numeric values
- if ( !strcmp(s,"0" ) )
- s = "custom";
- // UFP_WEB SUPPORT
- if ( !strcmp(s,"1" ) )
- s = "web";
- // UFP_NEWS (now maps to the "shallow" profile)
- if ( !strcmp(s,"2" ) )
- s = "shallow";
- // leave custom profiles alone
- if ( !strcmp(s,"custom" ) )
- rebuild = false;
-
- //if ( m_numRegExs > 0 && strcmp(m_regExs[m_numRegExs-1],"default") )
- // addDefault = true;
- if ( ! rebuild ) return true;
- if ( !strcmp(s,"shallow" ) )
- return rebuildShallowRules();
- //if ( strcmp(s,"web") )
- // just fall through for that
- if ( !strcmp(s,"english") )
- return rebuildLangRules( "en","com,us,gov");
- if ( !strcmp(s,"german") )
- return rebuildLangRules( "de","de");
- if ( !strcmp(s,"french") )
- return rebuildLangRules( "fr","fr");
- if ( !strcmp(s,"norwegian") )
- return rebuildLangRules( "nl","nl");
- if ( !strcmp(s,"spanish") )
- return rebuildLangRules( "es","es");
- //if ( m_urlFiltersProfile == UFP_EURO )
- // return rebuildLangRules( "de,fr,nl,es,sv,no,it",
- // "com,gov,org,de,fr,nl,es,sv,no,it");
- if ( !strcmp(s,"romantic") )
- return rebuildLangRules("en,de,fr,nl,es,sv,no,it,fi,pt",
- "de,fr,nl,es,sv,no,it,fi,pt,"
- "com,gov,org"
- );
- if ( !strcmp(s,"chinese") )
- return rebuildLangRules( "zh_cn,zh_tw","cn");
- int32_t n = 0;
- /*
- m_regExs[n].set("default");
- m_regExs[n].nullTerm();
- m_spiderFreqs [n] = 30; // 30 days default
- m_spiderPriorities[n] = 0;
- m_maxSpidersPerRule[n] = 99;
- m_spiderIpWaits[n] = 1000;
- m_spiderIpMaxSpiders[n] = 7;
- m_harvestLinks[n] = 1;
- */
- // max spiders per ip
- int32_t ipms = 7;
- m_regExs[n].set("isreindex");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 80;
- n++;
- m_regExs[n].set("ismedia");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100; // delete!
- m_forceDelete [n] = 1;
- n++;
- // if not in the site list then nuke it
- m_regExs[n].set("!ismanualadd && !insitelist");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100;
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("errorcount>=3 && hastmperror");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 1; // 30 days default
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100;
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("errorcount>=1 && hastmperror");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 1; // 30 days default
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 45;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- // a non-temporary error, like a 404? retry once every 5 days i guess
- m_regExs[n].set("errorcount>=1");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 5; // 5 day retry
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 2;
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("isaddurl");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 85;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- // 7+ unique c block parent request urls means it is important!
- m_regExs[n].set("numinlinks>7 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 52;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- // 7+ unique c block parent request urls means it is important!
- m_regExs[n].set("numinlinks>7");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 51;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("hopcount==0 && iswww && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 50;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("hopcount==0 && iswww");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0; // days b4 respider
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 48;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("hopcount==0 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 49;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("hopcount==0");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 10.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 47;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("isparentrss && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 45;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("isparentsitemap && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 44;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("isparentrss");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 43;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("isparentsitemap");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 42;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .00347; // 5 mins
- n++;
- m_regExs[n].set("hopcount==1 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 40;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .04166; // 60 minutes
- n++;
- m_regExs[n].set("hopcount==1");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 39;
- if ( ! strcmp(s,"news") )
- m_spiderFreqs [n] = .04166; // 60 minutes
- n++;
- m_regExs[n].set("hopcount==2 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 30;
- // do not harvest links if we are spidering news
- if ( ! strcmp(s,"news") ) {
- m_spiderFreqs [n] = 5.0;
- m_harvestLinks [n] = 0;
- }
- n++;
- m_regExs[n].set("hopcount==2");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 29;
- // do not harvest links if we are spidering news
- if ( ! strcmp(s,"news") ) {
- m_spiderFreqs [n] = 5.0;
- m_harvestLinks [n] = 0;
- }
- n++;
- m_regExs[n].set("hopcount>=3 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 20;
- // turn off spidering if hopcount is too big and we are spidering news
- if ( ! strcmp(s,"news") ) {
- m_maxSpidersPerRule [n] = 0;
- m_harvestLinks [n] = 0;
- }
- else {
- n++;
- }
- m_regExs[n].set("hopcount>=3");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 19;
- // turn off spidering if hopcount is too big and we are spidering news
- if ( ! strcmp(s,"news") ) {
- m_maxSpidersPerRule [n] = 0;
- m_harvestLinks [n] = 0;
- }
- else {
- n++;
- }
- /*
- m_regExs[n].set("isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = resp4;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 2;
- n++;
- */
- m_regExs[n].set("default");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 1;
- if ( ! strcmp(s,"news") ) {
- m_maxSpidersPerRule [n] = 0;
- m_harvestLinks [n] = 0;
- }
- n++;
- m_numRegExs = n;
- m_numRegExs2 = n;
- m_numRegExs3 = n;
- m_numRegExs10 = n;
- m_numRegExs5 = n;
- m_numRegExs6 = n;
- m_numRegExs8 = n;
- m_numRegExs7 = n;
- // more rules
- //m_spiderDiffbotApiNum[n] = 1;
- //m_numRegExs11++;
- //m_spiderDiffbotApiUrl[n].set("");
- //m_spiderDiffbotApiUrl[n].nullTerm();
- //m_numRegExs11++;
- return true;
- }
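- // illustrative sketch, not part of the original file: the rows built
- // above are parallel arrays that get evaluated top-down elsewhere
- // (see getUrlFilterNum()); the first row whose expression matches
- // wins, which is why "default" is always appended last. conceptually,
- // with matchesExpression() as a hypothetical stand-in for the real
- // expression evaluator:
- /*
- int32_t getMatchingRuleNum ( CollectionRec *cr , SpiderRequest *sreq ) {
- for ( int32_t j = 0 ; j < cr->m_numRegExs ; j++ )
- if ( matchesExpression ( cr->m_regExs[j].getBufStart(), sreq ) )
- return j;
- // should not happen since the last rule is "default"
- return cr->m_numRegExs - 1;
- }
- */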
- bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
- // max spiders per ip
- int32_t ipms = 7;
- int32_t n = 0;
- m_regExs[n].set("isreindex");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 80;
- n++;
- m_regExs[n].set("ismedia");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100; // delete!
- m_forceDelete [n] = 1;
- n++;
- // if not in the site list then nuke it
- m_regExs[n].set("!ismanualadd && !insitelist");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100; // delete!
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("errorcount>=3 && hastmperror");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 1; // 30 days default
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100;
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("errorcount>=1 && hastmperror");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 1; // 30 days default
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 45;
- n++;
- m_regExs[n].set("isaddurl");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 85;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && tld==%s",
- tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 50;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && "
- "parentlang==%s,xx"
- ,langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 50;
- n++;
- // m_regExs[n].set("hopcount==0 && iswww && isnew");
- // m_harvestLinks [n] = 1;
- // m_spiderFreqs [n] = 7; // 30 days default
- // m_maxSpidersPerRule [n] = 9; // max spiders
- // m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- // m_spiderIpWaits [n] = 1000; // same ip wait
- // m_spiderPriorities [n] = 20;
- // n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && iswww && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0; // days b4 respider
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 48;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && iswww && parentlang==%s,xx",
- langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0; // days b4 respider
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 48;
- n++;
- m_regExs[n].set("hopcount==0 && iswww");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0; // days b4 respider
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 19;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && isnew && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 49;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && isnew && parentlang==%s,xx",
- langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 49;
- n++;
- m_regExs[n].set("hopcount==0 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 18;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 10.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 47;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==0 && parentlang==%s,xx",langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 10.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 47;
- n++;
- m_regExs[n].set("hopcount==0");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 10.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 17;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==1 && isnew && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 40;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==1 && isnew && parentlang==%s,xx",
- tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 40;
- n++;
- m_regExs[n].set("hopcount==1 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 16;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==1 && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 39;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==1 && parentlang==%s,xx",langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 39;
- n++;
- m_regExs[n].set("hopcount==1");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 15;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==2 && isnew && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 30;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==2 && isnew && parentlang==%s,xx",
- langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 30;
- n++;
- m_regExs[n].set("hopcount==2 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 14;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==2 && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 29;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount==2 && parentlang==%s,xx",langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 29;
- n++;
- m_regExs[n].set("hopcount==2");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 13;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount>=3 && isnew && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 22;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount>=3 && isnew && parentlang==%s,xx",
- langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 22;
- n++;
- m_regExs[n].set("hopcount>=3 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 12;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount>=3 && tld==%s",tldStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 21;
- n++;
- m_regExs[n].reset();
- m_regExs[n].safePrintf("hopcount>=3 && parentlang==%s,xx",langStr);
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 21;
- n++;
- m_regExs[n].set("hopcount>=3");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 11;
- n++;
- m_regExs[n].set("default");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 1;
- n++;
- m_numRegExs = n;
- m_numRegExs2 = n;
- m_numRegExs3 = n;
- m_numRegExs10 = n;
- m_numRegExs5 = n;
- m_numRegExs6 = n;
- m_numRegExs8 = n;
- m_numRegExs7 = n;
- // done rebuilding language-specific rules
- return true;
- }
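- // usage note, not part of the original file: for the "german" profile
- // above this is called as rebuildLangRules("de","de"), which expands
- // into rules like "hopcount==0 && iswww && isnew && tld==de" and
- // "hopcount==0 && iswww && isnew && parentlang==de,xx" (the ",xx"
- // keeps pages whose language is still unknown in play).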
- bool CollectionRec::rebuildShallowRules ( ) {
- // max spiders per ip
- int32_t ipms = 7;
- int32_t n = 0;
- m_regExs[n].set("isreindex");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 80;
- n++;
- m_regExs[n].set("ismedia");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100; // delete!
- m_forceDelete [n] = 1;
- n++;
- // if not in the site list then nuke it
- m_regExs[n].set("!ismanualadd && !insitelist");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 0; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100; // delete!
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("errorcount>=3 && hastmperror");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 1; // 30 days default
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 100;
- m_forceDelete [n] = 1;
- n++;
- m_regExs[n].set("errorcount>=1 && hastmperror");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 1; // 30 days default
- m_maxSpidersPerRule [n] = 1; // max spiders
- m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 45;
- n++;
- m_regExs[n].set("isaddurl");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 99; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 85;
- n++;
- //
- // stop if hopcount>=2 for things tagged shallow in sitelist
- //
- m_regExs[n].set("tag:shallow && hopcount>=2");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 0; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 30;
- n++;
- // if # of pages in this site indexed is >= 10 then stop as well...
- m_regExs[n].set("tag:shallow && sitepages>=10");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 0; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 30;
- n++;
- m_regExs[n].set("hopcount==0 && iswww && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7; // 30 days default
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 50;
- n++;
- m_regExs[n].set("hopcount==0 && iswww");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0; // days b4 respider
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 48;
- n++;
- m_regExs[n].set("hopcount==0 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 7.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 49;
- n++;
- m_regExs[n].set("hopcount==0");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 10.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 47;
- n++;
- m_regExs[n].set("hopcount==1 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 40;
- n++;
- m_regExs[n].set("hopcount==1");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 20.0;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 39;
- n++;
- m_regExs[n].set("hopcount==2 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 30;
- n++;
- m_regExs[n].set("hopcount==2");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 40;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 29;
- n++;
- m_regExs[n].set("hopcount>=3 && isnew");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 22;
- n++;
- m_regExs[n].set("hopcount>=3");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 21;
- n++;
- m_regExs[n].set("default");
- m_harvestLinks [n] = 1;
- m_spiderFreqs [n] = 60;
- m_maxSpidersPerRule [n] = 9; // max spiders
- m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
- m_spiderIpWaits [n] = 1000; // same ip wait
- m_spiderPriorities [n] = 1;
- n++;
- m_numRegExs = n;
- m_numRegExs2 = n;
- m_numRegExs3 = n;
- m_numRegExs10 = n;
- m_numRegExs5 = n;
- m_numRegExs6 = n;
- m_numRegExs8 = n;
- m_numRegExs7 = n;
- // done rebuilding SHALLOW rules
- return true;
- }
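- // usage note, not part of the original file: "shallow" differs from
- // the web profile mainly in the two tag:shallow rules above. a site
- // tagged shallow in the site list stops being spidered once
- // hopcount>=2 or once 10 of its pages are indexed, because
- // m_maxSpidersPerRule is 0 for those two rows.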
- /*
- bool CrawlInfo::print (SafeBuf *sb ) {
- return sb->safePrintf("objectsAdded:%"INT64"\n"
- "objectsDeleted:%"INT64"\n"
- "urlsConsidered:%"INT64"\n"
- "downloadAttempts:%"INT64"\n"
- "downloadSuccesses:%"INT64"\n"
- "processAttempts:%"INT64"\n"
- "processSuccesses:%"INT64"\n"
- "lastupdate:%"UINT32"\n"
- , m_objectsAdded
- , m_objectsDeleted
- , m_urlsConsidered
- , m_pageDownloadAttempts
- , m_pageDownloadSuccesses
- , m_pageProcessAttempts
- , m_pageProcessSuccesses
- , m_lastUpdateTime
- );
- }
- bool CrawlInfo::setFromSafeBuf (SafeBuf *sb ) {
- return sscanf(sb->getBufStart(),
- "objectsAdded:%"INT64"\n"
- "objectsDeleted:%"INT64"\n"
- "urlsConsidered:%"INT64"\n"
- "downloadAttempts:%"INT64"\n"
- "downloadSuccesses:%"INT64"\n"
- "processAttempts:%"INT64"\n"
- "processSuccesses:%"INT64"\n"
- "lastupdate:%"UINT32"\n"
- , &m_objectsAdded
- , &m_objectsDeleted
- , &m_urlsConsidered
- , &m_pageDownloadAttempts
- , &m_pageDownloadSuccesses
- , &m_pageProcessAttempts
- , &m_pageProcessSuccesses
- , &m_lastUpdateTime
- );
- }
- */
-
- // returns false on failure and sets g_errno, true otherwise
- bool CollectionRec::save ( ) {
- if ( g_conf.m_readOnlyMode ) return true;
- //File f;
- char tmp[1024];
- //sprintf ( tmp , "%scollections/%"INT32".%s/c.conf",
- // g_hostdb.m_dir,m_id,m_coll);
- // collection name HACK for backwards compatibility
- //if ( m_collLen == 0 )
- // sprintf ( tmp , "%scoll.main/coll.conf", g_hostdb.m_dir);
- //else
- snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/coll.conf",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- if ( ! g_parms.saveToXml ( (char *)this , tmp ,OBJ_COLL)) return false;
- // log msg
- //log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
- //
- // save the crawlinfo class in the collectionrec for diffbot
- //
- // SAVE LOCAL
- snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- //log("coll: saving %s",tmp);
- // in case emergency save from malloc core, do not alloc
- char stack[1024];
- SafeBuf sb(stack,1024);
- //m_localCrawlInfo.print ( &sb );
- // binary now
- sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
- if ( sb.safeSave ( tmp ) == -1 ) {
- log("db: failed to save file %s : %s",
- tmp,mstrerror(g_errno));
- g_errno = 0;
- }
- // SAVE GLOBAL
- snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- //log("coll: saving %s",tmp);
- sb.reset();
- //m_globalCrawlInfo.print ( &sb );
- // binary now
- sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
- if ( sb.safeSave ( tmp ) == -1 ) {
- log("db: failed to save file %s : %s",
- tmp,mstrerror(g_errno));
- g_errno = 0;
- }
- // the list of ip addresses that we have detected as being throttled
- // and therefore back off and use proxies for
- sb.reset();
- sb.safePrintf("%scoll.%s.%"INT32"/",
- g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
- m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
- // do not need a save now
- m_needsSave = false;
- // waiting tree is saved in SpiderCache::save() called by Process.cpp
- //SpiderColl *sc = m_spiderColl;
- //if ( ! sc ) return true;
- // save page count table which has # of pages indexed per
- // subdomain/site and firstip for doing quotas in url filters table
- //snprintf ( tmp , 1023, "coll.%s.%"INT32"/pagecounts.dat",
- // m_coll , (int32_t)m_collnum );
- //if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
- // log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
- // g_errno = 0;
- //}
- return true;
- }
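- // illustrative sketch, not part of the original file: the stack-backed
- // SafeBuf above is the pattern for writes that must still work during
- // an emergency save from a malloc core, when allocating is unsafe.
- // the general shape, with a hypothetical helper name:
- /*
- static void emergencyWrite ( void *blob , int32_t size , char *path ) {
- char stack[1024];
- // SafeBuf borrows the caller's stack buffer and only mallocs
- // if the payload outgrows it
- SafeBuf sb ( stack , 1024 );
- sb.safeMemcpy ( blob , size );
- if ( sb.safeSave ( path ) == -1 )
- log("db: failed to save file %s : %s",
- path , mstrerror(g_errno) );
- }
- */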
- // calls hasPermission() below
- bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
- int32_t plen;
- char *p = r->getString ( "pwd" , &plen );
- int32_t ip = s->m_ip;
- return hasPermission ( p , plen , ip );
- }
- // . does this ip have assassin (spam-control) permission on this collection?
- bool CollectionRec::isAssassin ( int32_t ip ) {
- // ok, make sure they came from an acceptable IP
- //for ( int32_t i = 0 ; i < m_numSpamIps ; i++ )
- // // they also have a matching IP, so they now have permission
- // if ( m_spamIps[i] == ip ) return true;
- return false;
- }
- // . does this password work for this collection?
- bool CollectionRec::hasPermission ( char *p, int32_t plen , int32_t ip ) {
- // just return true
- // collection permission is checked from Users::verifyColl
- // in User::getUserType for every request
- return true;
- // scan the passwords
- // MDW: no longer, this is too vulnerable!!!
- /*
- for ( int32_t i = 0 ; i < m_numAdminPwds ; i++ ) {
- int32_t len = gbstrlen ( m_adminPwds[i] );
- if ( len != plen ) continue;
- if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
- // otherwise it's a match!
- //goto checkIp;
- // . matching one password is good enough now, default OR
- // . because just matching an IP is good enough security,
- // there is really no need for both IP AND passwd match
- return true;
- }
- */
- // . if had passwords but the provided one didn't match, return false
- // . matching one password is good enough now, default OR
- //if ( m_numPasswords > 0 ) return false;
- // checkIp:
- // ok, make sure they came from an acceptable IP
- //for ( int32_t i = 0 ; i < m_numAdminIps ; i++ )
- // // they also have a matching IP, so they now have permission
- // if ( m_adminIps[i] == ip ) return true;
- // if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
- //if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
- // if they did not match an ip or password, even if both lists
- // are empty, do not allow access... this prevents security breaches
- // by accident
- return false;
- // if there were IPs then they failed to get in
- //if ( m_numAdminIps > 0 ) return false;
- // otherwise, they made it
- //return true;
- }
- // can this ip perform a search or add url on this collection?
- bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
- // get the ip
- int32_t ip = 0; if ( s ) ip = s->m_ip;
- // and the ip domain
- int32_t ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
- // and the top 2 bytes for the Israeli ISP that has this huge block
- int32_t ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
- // is it in the ban list?
- /*
- for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
- if ( isIpTop ( m_banIps[i] ) ) {
- if ( m_banIps[i] == ipt ) return false;
- continue;
- }
- // check for ip domain match if this banned ip is an ip domain
- if ( isIpDom ( m_banIps[i] ) ) {
- if ( m_banIps[i] == ipd ) return false;
- continue;
- }
- // otherwise it's just a single banned ip
- if ( m_banIps[i] == ip ) return false;
- }
- */
- // check the encapsulated ip if any
- // 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
- // /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
- // +nj+state HTTP/1.0
- /*
- if ( encapIp ) {
- ipd = ipdom ( encapIp );
- ip = encapIp;
- for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
- if ( isIpDom ( m_banIps[i] ) ) {
- if ( m_banIps[i] == ipd ) return false;
- continue;
- }
- if ( isIpTop ( m_banIps[i] ) ) {
- if ( m_banIps[i] == ipt ) return false;
- continue;
- }
- if ( m_banIps[i] == ip ) return false;
- }
- }
- */
- return true;
- /*
- // do we have an "only" list?
- if ( m_numSearchIps == 0 ) return true;
- // it must be in that list if we do
- for ( int32_t i = 0 ; i < m_numSearchIps ; i++ ) {
- // check for ip domain match if this banned ip is an ip domain
- if ( isIpDom ( m_searchIps[i] ) ) {
- if ( m_searchIps[i] == ipd ) return true;
- continue;
- }
- // otherwise it's just a single ip
- if ( m_searchIps[i] == ip ) return true;
- }
- */
- // otherwise no permission
- return false;
- }
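- // note, not part of the original file: the three granularities
- // computed above compare at different widths: a plain entry must
- // match the full address, ipdom() appears to match on the class-C
- // (/24) prefix, and iptop() on the top 2 bytes (a /16, per the
- // comment above), so a single ban entry can cover one host, a whole
- // /24, or a whole /16.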
- bool expandRegExShortcuts ( SafeBuf *sb ) ;
- void nukeDoledb ( collnum_t collnum );
- // rebuild the regexes related to diffbot, such as the one for the URL pattern
- bool CollectionRec::rebuildDiffbotRegexes() {
- //logf(LOG_DEBUG,"db: rebuilding url filters");
- char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
- if ( ucp && ! ucp[0] ) ucp = NULL;
- // get the regexes
- if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
- if ( ucp && ! ucp[0] ) ucp = NULL;
- char *upp = m_diffbotUrlProcessPattern.getBufStart();
- if ( upp && ! upp[0] ) upp = NULL;
- if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
- if ( upp && ! upp[0] ) upp = NULL;
- char *ppp = m_diffbotPageProcessPattern.getBufStart();
- if ( ppp && ! ppp[0] ) ppp = NULL;
- // recompiling regexes starts now
- if ( m_hasucr ) {
- regfree ( &m_ucr );
- m_hasucr = false;
- }
- if ( m_hasupr ) {
- regfree ( &m_upr );
- m_hasupr = false;
- }
- // copy into tmpbuf
- SafeBuf tmp;
- char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
- if ( rx && ! rx[0] ) rx = NULL;
- if ( rx ) {
- tmp.reset();
- tmp.safeStrcpy ( rx );
- expandRegExShortcuts ( &tmp );
- m_hasucr = true;
- }
- if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
- REG_EXTENDED| //REG_ICASE|
- REG_NEWLINE ) ) { // |REG_NOSUB) ) {
- // error!
- log("coll: regcomp %s failed: %s. "
- "Ignoring.",
- rx,mstrerror(errno));
- regfree ( &m_ucr );
- m_hasucr = false;
- }
- rx = m_diffbotUrlProcessRegEx.getBufStart();
- if ( rx && ! rx[0] ) rx = NULL;
- if ( rx ) m_hasupr = true;
- if ( rx ) {
- tmp.reset();
- tmp.safeStrcpy ( rx );
- expandRegExShortcuts ( &tmp );
- m_hasupr = true;
- }
- if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
- REG_EXTENDED| // REG_ICASE|
- REG_NEWLINE ) ) { // |REG_NOSUB) ) {
- // error!
- log("coll: regcomp %s failed: %s. "
- "Ignoring.",
- rx,mstrerror(errno));
- regfree ( &m_upr );
- m_hasupr = false;
- }
- return true;
- }
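- // illustrative sketch, not part of the original file: since regcomp()
- // reports failures via its return code rather than errno, both of the
- // compiles above capture the code and format it with regerror(). a
- // hypothetical shared helper would look like:
- /*
- static bool compileRegex ( regex_t *re , char *rx ) {
- int32_t err = regcomp ( re , rx , REG_EXTENDED|REG_NEWLINE );
- if ( ! err ) return true;
- char errbuf[1024];
- regerror ( err , re , errbuf , 1000 );
- log("coll: regcomp %s failed: %s. Ignoring.", rx , errbuf );
- return false;
- }
- */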
- bool CollectionRec::rebuildUrlFiltersDiffbot() {
- //logf(LOG_DEBUG,"db: rebuilding url filters");
- char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
- if ( ucp && ! ucp[0] ) ucp = NULL;
- // if we had a regex, that works for this purpose as well
- if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
- if ( ucp && ! ucp[0] ) ucp = NULL;
- char *upp = m_diffbotUrlProcessPattern.getBufStart();
- if ( upp && ! upp[0] ) upp = NULL;
- // if we had a regex, that works for this purpose as well
- if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
- if ( upp && ! upp[0] ) upp = NULL;
- char *ppp = m_diffbotPageProcessPattern.getBufStart();
- if ( ppp && ! ppp[0] ) ppp = NULL;
- ///////
- //
- // recompile regular expressions
- //
- ///////
- if ( m_hasucr ) {
- regfree ( &m_ucr );
- m_hasucr = false;
- }
- if ( m_hasupr ) {
- regfree ( &m_upr );
- m_hasupr = false;
- }
- // copy into tmpbuf
- SafeBuf tmp;
- char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
- if ( rx && ! rx[0] ) rx = NULL;
- if ( rx ) {
- tmp.reset();
- tmp.safeStrcpy ( rx );
- expandRegExShortcuts ( &tmp );
- m_hasucr = true;
- }
- int32_t err;
- if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
- REG_EXTENDED| //REG_ICASE|
- REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
- // error!
- char errbuf[1024];
- regerror(err,&m_ucr,errbuf,1000);
- log("coll: regcomp %s failed: %s. "
- "Ignoring.",
- rx,errbuf);
- regfree ( &m_ucr );
- m_hasucr = false;
- }
- rx = m_diffbotUrlProcessRegEx.getBufStart();
- if ( rx && ! rx[0] ) rx = NULL;
- if ( rx ) m_hasupr = true;
- if ( rx ) {
- tmp.reset();
- tmp.safeStrcpy ( rx );
- expandRegExShortcuts ( &tmp );
- m_hasupr = true;
- }
- if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
- REG_EXTENDED| // REG_ICASE|
- REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
- char errbuf[1024];
- regerror(err,&m_upr,errbuf,1000);
- // error!
- log("coll: regcomp %s failed: %s. "
- "Ignoring.",
- rx,errbuf);
- regfree ( &m_upr );
- m_hasupr = false;
- }
- // what diffbot url to use for processing
- char *api = m_diffbotApiUrl.getBufStart();
- if ( api && ! api[0] ) api = NULL;
- // convert from seconds to milliseconds. default is 250ms?
- int32_t wait = (int32_t)(m_collectiveCrawlDelay * 1000.0);
- // default to 250ms i guess. -1 means unset i think.
- if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
- bool isEthan = false;
- if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
- // it looks like we are assuming all crawls are repeating so that
- // &roundStart=<currenttime> or &roundStart=0 which is the same
- // thing, will trigger a re-crawl. so if collectiveRespiderFreq
- // is 0 assume it is 3652.5 days (about ten years). so that stuff works.
- // also i had to make the "default" rule below always have a respider
- // freq of 0.0 so it will respider right away if we make it past the
- // "lastspidertime>={roundstart}" rule which we will if they
- // set the roundstart time to the current time using &roundstart=0
- float respiderFreq = m_collectiveRespiderFrequency;
- if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
- // lower from 7 to 1 since we have so many collections now
- // ok, now we have much less colls so raise back to 7
- int32_t diffbotipms = 7;//1; // 7
- // make the gigablast regex table just "default" so it does no
- // filtering and accepts all urls. we will add code to pass the urls
- // through m_diffbotUrlCrawlPattern alternatively. if that itself
- // is empty, we will just restrict to the seed urls subdomain.
- for ( int32_t i = 0 ; i < MAX_FILTERS ; i++ ) {
- m_regExs[i].purge();
- m_spiderPriorities[i] = 0;
- m_maxSpidersPerRule [i] = 100;
- // when someone has a bulk job of thousands of different
- // domains it slows diffbot back-end down, so change this
- // from 100 down to a couple if doing a bulk job
- if ( m_isCustomCrawl == 2 )
- m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster
- m_spiderIpWaits [i] = wait;
- m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
- // ethan wants some speed
- // if ( isEthan )
- // m_spiderIpMaxSpiders[i] = 30;
- //m_spidersEnabled [i] = 1;
- m_spiderFreqs [i] = respiderFreq;
- //m_spiderDiffbotApiUrl[i].purge();
- m_harvestLinks[i] = true;
- m_forceDelete [i] = false;
- }
- int32_t i = 0;
- // 1st one! for query reindex/ query delete
- m_regExs[i].set("isreindex");
- m_spiderIpMaxSpiders [i] = 10;
- m_spiderPriorities [i] = 70;
- i++;
- // 2nd default url
- m_regExs[i].set("ismedia && !ismanualadd");
- m_maxSpidersPerRule [i] = 0;
- m_spiderPriorities [i] = 100; // delete!
- m_forceDelete [i] = 1;
- i++;
- // de-prioritize fakefirstip urls so we don't give the impression our
- // spiders are slow. like if someone adds a bulk job with 100,000 urls
- // then we sit there processing them to look up their ips and add a real
- // spider request (if it falls onto the same shard) before we actually
- // do any real spidering. so keep the priority here low.
- m_regExs[i].set("isfakeip");
- m_maxSpidersPerRule [i] = 7;
- m_spiderIpMaxSpiders [i] = 7;
- m_spiderPriorities [i] = 20;
- m_spiderIpWaits [i] = 0;
- i++;
- // hopcount filter if asked for
- if( m_diffbotMaxHops >= 0 ) {
- // form a rule like: hopcount>3
- char hopcountStr[30];
- snprintf(hopcountStr, sizeof(hopcountStr), "hopcount>%"INT32"",
- (int32_t)m_diffbotMaxHops);
- m_regExs[i].set(hopcountStr);
- // means DELETE :
- m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
- // just don't spider
- m_maxSpidersPerRule[i] = 0;
- // compatibility with m_spiderRoundStartTime:
- m_spiderFreqs[i] = 0.0;
- i++;
- }
- // 2nd default filter
- // always turn this on for now. they need to add domains they want
- // to crawl as seeds so they do not spider the web.
- // no because FTB seeds with link pages that link to another
- // domain. they just need to be sure to supply a crawl pattern
- // to avoid spidering the whole web.
- //
- // if they did not EXPLICITLY provide a url crawl pattern or
- // url crawl regex then restrict to seeds to prevent from spidering
- // the entire internet.
- //if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
- // MDW: even if they supplied a crawl pattern let's restrict to seed
- // domains 12/15/14
- m_regExs[i].set("!isonsamedomain && !ismanualadd");
- m_maxSpidersPerRule [i] = 0;
- m_spiderPriorities [i] = 100; // delete!
- m_forceDelete [i] = 1;
- i++;
- //}
- bool ucpHasPositive = false;
- // . scan them to see if all patterns start with '!' or not
- // . if pattern starts with ! it is negative, otherwise positive
- if ( ucp ) ucpHasPositive = hasPositivePattern ( ucp );
- // if no crawl regex, and it has a crawl pattern consisting of
- // only negative patterns then restrict to domains of seeds
- if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
- m_regExs[i].set("!isonsamedomain && !ismanualadd");
- m_maxSpidersPerRule [i] = 0;
- m_spiderPriorities [i] = 100; // delete!
- m_forceDelete [i] = 1;
- i++;
- }
- // don't bother re-spidering old pages if hopcount == maxhopcount
- // and "only process new urls" is true, because we don't need to
- // harvest outlinks from them.
- if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
- // only crawls, not bulk jobs
- m_isCustomCrawl == 1 ) {
- m_regExs[i].purge();
- m_regExs[i].safePrintf("isindexed && hopcount==%"INT32,
- m_diffbotMaxHops );
- m_spiderPriorities [i] = 14;
- m_spiderFreqs [i] = 0.0;
- m_maxSpidersPerRule [i] = 0; // turn off spiders
- m_harvestLinks [i] = false;
- i++;
- }
- // 3rd rule for respidering
- // put this above the errocount>= rules below otherwise the crawl
- // may never advance its round because it keeps retrying a ton of
- // errored urls.
- if ( respiderFreq > 0.0 ) {
- m_regExs[i].set("lastspidertime>={roundstart}");
- // do not "remove" from index
- m_spiderPriorities [i] = 10;
- // just turn off spidering. if we were to set priority to
- // filtered it would be removed from index!
- //m_spidersEnabled [i] = 0;
- m_maxSpidersPerRule[i] = 0;
- // temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
- // which has been obsoleted, but we are running old code now!
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- }
- // if doing a one-shot crawl limit error retries to 3 times or
- // if no urls currently available to spider, whichever comes first.
- else {
- m_regExs[i].set("errorcount>=3");
- m_spiderPriorities [i] = 11;
- m_spiderFreqs [i] = 0.0416;
- m_maxSpidersPerRule [i] = 0; // turn off spiders
- i++;
- }
- // diffbot needs to retry even on 500 or 404 errors since sometimes
- // a seed url gets a 500 error mistakenly and it halts the crawl.
- // so take out "!hastmperror".
- m_regExs[i].set("errorcount>=1 && !hastmperror");
- m_spiderPriorities [i] = 14;
- m_spiderFreqs [i] = 0.0416; // every hour
- //m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
- i++;
- // first temporary error: retry quickly, after ~86 seconds
- m_regExs[i].set("errorcount==1 && hastmperror");
- m_spiderPriorities [i] = 40;
- m_spiderFreqs [i] = 0.001; // 86 seconds
- i++;
- // second temporary error: back off a bit, retry after ~4 minutes
- m_regExs[i].set("errorcount==2 && hastmperror");
- m_spiderPriorities [i] = 40;
- m_spiderFreqs [i] = 0.003; // 3*86 seconds (was 24 hrs)
- i++;
- // excessive temporary errors? (tcp/dns timed out, etc.)
- // back off to one retry every 1/4 day
- m_regExs[i].set("errorcount>=3 && hastmperror");
- m_spiderPriorities [i] = 39;
- m_spiderFreqs [i] = .25; // 1/4 day
- // if bulk job, do not download a url more than 3 times
- if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
- i++;
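- // recap of the error ladder above, assuming m_spiderFreqs is in
- // days as the inline comments suggest:
- //   errorcount>=1 && !hastmperror -> pri 14, retry ~hourly (0.0416)
- //   errorcount==1 &&  hastmperror -> pri 40, retry ~86s    (0.001)
- //   errorcount==2 &&  hastmperror -> pri 40, retry ~4min   (0.003)
- //   errorcount>=3 &&  hastmperror -> pri 39, retry ~6hrs   (0.25)
- //                                    (bulk jobs: spiders off)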
- // if collectiverespiderfreq is 0 or less then do not RE-spider
- // documents already indexed.
- if ( respiderFreq <= 0.0 ) { // else {
- // this does NOT work! error docs continuously respider
- // because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
- //m_regExs[i].set("isindexed");
- m_regExs[i].set("hasreply");
- m_spiderPriorities [i] = 10;
- // just turn off spidering. if we were to set priority to
- // filtered it would be removed from index!
- //m_spidersEnabled [i] = 0;
- m_maxSpidersPerRule[i] = 0;
- // temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
- // which has been obsoleted, but we are running old code now!
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- }
- // url crawl and PAGE process pattern
- if ( ucp && ! upp && ppp ) {
- // if just matches ucp, just crawl it, do not process
- m_regExs[i].set("matchesucp");
- m_spiderPriorities [i] = 53;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- i++;
- // crawl everything else but don't harvest links; we have to see
- // if the page content matches the "ppp" to determine whether
- // the page should be processed or not.
- m_regExs[i].set("default");
- m_spiderPriorities [i] = 52;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- m_harvestLinks [i] = false;
- i++;
- goto done;
- }
- // url crawl and process pattern
- if ( ucp && upp ) {
- m_regExs[i].set("matchesucp && matchesupp");
- m_spiderPriorities [i] = 55;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- // if just matches ucp, just crawl it, do not process
- m_regExs[i].set("matchesucp");
- m_spiderPriorities [i] = 53;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- i++;
- // just process, do not spider links if does not match ucp
- m_regExs[i].set("matchesupp");
- m_spiderPriorities [i] = 54;
- m_harvestLinks [i] = false;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- // do not crawl anything else
- m_regExs[i].set("default");
- m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
- // don't spider
- m_maxSpidersPerRule[i] = 0;
- // this needs to be zero so &spiderRoundStart=0
- // functionality which sets m_spiderRoundStartTime
- // to the current time works
- // otherwise Spider.cpp's getSpiderTimeMS() returns a time
- // in the future and we can't force the round
- m_spiderFreqs[i] = 0.0;
- i++;
- }
- // harvest links if we should crawl it
- if ( ucp && ! upp ) {
- m_regExs[i].set("matchesucp");
- m_spiderPriorities [i] = 53;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away.
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- // process everything since upp is empty
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- // do not crawl anything else
- m_regExs[i].set("default");
- m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
- // don't delete, just don't spider
- m_maxSpidersPerRule[i] = 0;
- // this needs to be zero so &spiderRoundStart=0
- // functionality which sets m_spiderRoundStartTime
- // to the current time works
- // otherwise Spider.cpp's getSpiderTimeMS() returns a time
- // in the future and we can't force the round
- m_spiderFreqs[i] = 0.0;
- i++;
- }
- // just process
- if ( upp && ! ucp ) {
- m_regExs[i].set("matchesupp");
- m_spiderPriorities [i] = 54;
- if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
- // let's always make this without delay because if we
- // restart the round we want these to process right away
- if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
- //m_harvestLinks [i] = false;
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- // crawl everything by default, no processing
- m_regExs[i].set("default");
- m_spiderPriorities [i] = 50;
- // this needs to be zero so &spiderRoundStart=0
- // functionality which sets m_spiderRoundStartTime
- // to the current time works
- // otherwise Spider.cpp's getSpiderTimeMS() returns a time
- // in the future and we can't force the round
- m_spiderFreqs[i] = 0.0;
- i++;
- }
- // no restraints
- if ( ! upp && ! ucp ) {
- // crawl everything by default, no processing
- m_regExs[i].set("default");
- m_spiderPriorities [i] = 50;
- // this needs to be zero so &spiderRoundStart=0
- // functionality which sets m_spiderRoundStartTime
- // to the current time works
- // otherwise Spider.cpp's getSpiderTimeMS() returns a time
- // in the future and we can't force the round
- m_spiderFreqs[i] = 0.0;
- //m_spiderDiffbotApiUrl[i].set ( api );
- i++;
- }
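- // recap of the branches above (ucp = url crawl pattern, upp = url
- // process pattern, ppp = page process pattern):
- //   ucp, ppp, no upp : matchesucp @53; default @52, no link harvest
- //   ucp and upp      : both @55; matchesucp @53; matchesupp @54
- //                      (no links); default blocked (pri 0, 0 spiders)
- //   ucp only         : matchesucp @53; default blocked, not deleted
- //   upp only         : matchesupp @54; default crawled @50
- //   neither          : default crawled @50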
- done:
- m_numRegExs = i;
- m_numRegExs2 = i;
- m_numRegExs3 = i;
- m_numRegExs10 = i;
- m_numRegExs5 = i;
- m_numRegExs6 = i;
- //m_numRegExs7 = i;
- m_numRegExs8 = i;
- m_numRegExs7 = i;
- //m_numRegExs11 = i;
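- // these m_numRegExs* counters presumably mirror the parallel rule
- // arrays (frequencies, priorities, max spiders, etc.) and must all
- // equal the number of rules emitted above.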
- //char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
- //if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }
- return true;
- }
- // . anytime the url filters are updated, this function is called
- // . it is also called on load of the collection at startup
- bool CollectionRec::rebuildUrlFilters ( ) {
- if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
- log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
- m_urlFiltersProfile.getBufStart());
- // if not a custom crawl, and no expressions, add a default one
- //if ( m_numRegExs == 0 && ! m_isCustomCrawl ) {
- // setUrlFiltersToDefaults();
- //}
- // if not a custom crawl then set the url filters based on
- // the url filter profile, if any
- if ( ! m_isCustomCrawl )
- rebuildUrlFilters2();
- // set this so we know whether we have to keep track of page counts
- // per subdomain/site and per domain. if the url filters have
- // 'sitepages' 'domainpages' 'domainadds' or 'siteadds' we have to keep
- // the count table SpiderColl::m_pageCountTable.
- m_urlFiltersHavePageCounts = false;
- for ( int32_t i = 0 ; i < m_numRegExs ; i++ ) {
- // get the ith rule
- SafeBuf *sb = &m_regExs[i];
- char *p = sb->getBufStart();
- if ( strstr(p,"sitepages") ||
- strstr(p,"domainpages") ||
- strstr(p,"siteadds") ||
- strstr(p,"domainadds") ) {
- m_urlFiltersHavePageCounts = true;
- break;
- }
- }
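- // e.g. a hypothetical rule like "sitepages>=100 && hopcount>=1"
- // contains "sitepages", so the scan above would set
- // m_urlFiltersHavePageCounts and SpiderColl would keep
- // m_pageCountTable up to date for this collection.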
- // if the collection is brand new and we are called from
- // addNewColl() then sc will be NULL
- SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(m_collnum);
- // . do not do this at startup
- // . this essentially resets doledb
- if ( g_doledb.m_rdb.m_initialized &&
- // somehow this is initialized before we set m_recs[m_collnum]
- // so we gotta do the two checks below...
- sc &&
- // must be a valid coll
- m_collnum < g_collectiondb.m_numRecs &&
- g_collectiondb.m_recs[m_collnum] ) {
- log("coll: resetting doledb for %s (%li)",m_coll,
- (long)m_collnum);
-
- // clear doledb recs from tree
- //g_doledb.getRdb()->deleteAllRecs ( m_collnum );
- nukeDoledb ( m_collnum );
-
- // add it back
- //if ( ! g_doledb.getRdb()->addRdbBase2 ( m_collnum ) )
- // log("coll: error re-adding doledb for %s",m_coll);
-
- // just start this over...
- // . MDW left off here
- //tryToDelete ( sc );
- // maybe this is good enough
- //if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
-
- //CollectionRec *cr = sc->m_cr;
- // . rebuild sitetable? in PageBasic.cpp.
- // . re-adds seed spider requests using msg4
- // . true = addSeeds
- // . no, don't do this now because we call updateSiteList()
- // when we have &sitelist=xxxx in the request which will
- // handle updating those tables
- //updateSiteListTables ( m_collnum ,
- // true ,
- // cr->m_siteListBuf.getBufStart() );
- }
- // if the crawl is not generated by crawlbot then we just update
- // the compiled regexes for the urls to crawl and process
- rebuildDiffbotRegexes();
- if ( ! m_isCustomCrawl ){
- return true;
- }
- // on the other hand, if it is a crawlbot job then by convention
- // the url filters are all set to the canned defaults.
- return rebuildUrlFiltersDiffbot();
- }
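- // call-flow sketch (hedged): rebuildUrlFilters() runs on parm
- // changes and at collection load. a minimal manual invocation
- // might look like the following, assuming getRec() looks up a
- // collection by name:
- //   CollectionRec *cr = g_collectiondb.getRec ( "main" );
- //   if ( cr ) cr->rebuildUrlFilters();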
- // for some reason the libc we use doesn't support these shortcuts,
- // so expand them to something it does support
- bool expandRegExShortcuts ( SafeBuf *sb ) {
- if ( ! sb->safeReplace3 ( "\\d" , "[0-9]" ) ) return false;
- if ( ! sb->safeReplace3 ( "\\D" , "[^0-9]" ) ) return false;
- if ( ! sb->safeReplace3 ( "\\l" , "[a-z]" ) ) return false;
- if ( ! sb->safeReplace3 ( "\\a" , "[A-Za-z]" ) ) return false;
- if ( ! sb->safeReplace3 ( "\\u" , "[A-Z]" ) ) return false;
- if ( ! sb->safeReplace3 ( "\\w" , "[A-Za-z0-9_]" ) ) return false;
- if ( ! sb->safeReplace3 ( "\\W" , "[^A-Za-z0-9_]" ) ) return false;
- return true;
- }
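- /*
- // minimal usage sketch (hedged): expand the shortcuts in a pattern
- // before handing it to the system regex compiler, much as
- // testRegex() below compiles a literal pattern.
- SafeBuf sb;
- sb.set ( "article\\d+\\.html" );
- expandRegExShortcuts ( &sb ); // now "article[0-9]+\\.html"
- regex_t re;
- if ( regcomp ( &re, sb.getBufStart(), REG_ICASE|REG_EXTENDED ) == 0 ) {
- // match candidate urls with regexec(&re,url,0,NULL,0) == 0
- regfree ( &re );
- }
- */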
- void testRegex ( ) {
- //
- // TEST
- //
- char *rx;
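- // each assignment below clobbers the previous one; only the last
- // pattern is compiled. the first two are kept as alternates for
- // manual testing.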
- rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=\\d";
- rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";
- rx = ".*?article[0-9]*?.html";
- regex_t ucr;
- int32_t err;
- if ( ( err = regcomp ( &ucr , rx ,
- REG_ICASE
- |REG_EXTENDED
- //|REG_NEWLINE
- //|REG_NOSUB
- ) ) ) {
- // error!
- char errbuf[1024];
- regerror(err,&ucr,errbuf,sizeof(errbuf));
- log("xmldoc: regcomp %s failed: %s. "
- "Aborting test.",
- rx,errbuf);
- // don't fall through to regexec() with an uncompiled regex
- exit(1);
- }
- logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
- //char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
- char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";
- if ( regexec(&ucr,url,0,NULL,0) )
- logf(LOG_DEBUG,"db: failed to match %s on %s",
- url,rx);
- else
- logf(LOG_DEBUG,"db: MATCHED %s on %s",
- url,rx);
- exit(0);
- }
- int64_t CollectionRec::getNumDocsIndexed() {
- RdbBase *base = getBase(RDB_TITLEDB);//m_bases[RDB_TITLEDB];
- if ( ! base ) return 0LL;
- return base->getNumGlobalRecs();
- }
- // messes with m_spiderColl->m_sendLocalCrawlInfoToHost[MAX_HOSTS]
- // so we do not have to keep sending this huge msg!
- bool CollectionRec::shouldSendLocalCrawlInfoToHost ( int32_t hostId ) {
- if ( ! m_spiderColl ) return false;
- // sanity
- if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
- if ( hostId >= g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
- return m_spiderColl->m_sendLocalCrawlInfoToHost[hostId];
- }
- void CollectionRec::localCrawlInfoUpdate() {
- if ( ! m_spiderColl ) return;
- // turn on all the flags
- memset(m_spiderColl->m_sendLocalCrawlInfoToHost,1,g_hostdb.m_numHosts);
- }
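- // note: m_sendLocalCrawlInfoToHost holds one byte per host, so the
- // memset above flags every host as needing a fresh copy of our
- // local crawl info on its next request.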
- // right after we copy it for sending we set this so we do not send
- // again unless localCrawlInfoUpdate() is called
- void CollectionRec::sentLocalCrawlInfoToHost ( int32_t hostId ) {
- if ( ! m_spiderColl ) return;
- m_spiderColl->m_sendLocalCrawlInfoToHost[hostId] = 0;
- }