/Collectiondb.cpp
C++ | 4250 lines | 2151 code | 607 blank | 1492 comment | 397 complexity | 6c162a71eb7c344fb15b6f36dc9b5b10 MD5 | raw file
Possible License(s): Apache-2.0
Large files are truncated, but you can click here to view the full file
- #include "gb-include.h"
- #include "Collectiondb.h"
- //#include "CollectionRec.h"
- #include "Xml.h"
- #include "Url.h"
- #include "Loop.h"
- #include "Spider.h" // for calling SpiderLoop::collectionsUpdated()
- #include "Posdb.h"
- //#include "Indexdb.h"
- #include "Datedb.h"
- #include "Titledb.h"
- //#include "Revdb.h"
- //#include "Sections.h"
- #include "Placedb.h"
- #include "Tagdb.h"
- #include "Catdb.h"
- #include "Tfndb.h"
- #include "Spider.h"
- //#include "Checksumdb.h"
- #include "Clusterdb.h"
- #include "Spider.h"
- #include "Repair.h"
- #include "Users.h"
- #include "Parms.h"
- void testRegex ( ) ;
- HashTableX g_collTable;
- // a global class extern'd in .h file
- Collectiondb g_collectiondb;
- Collectiondb::Collectiondb ( ) {
- m_wrapped = 0;
- m_numRecs = 0;
- m_numRecsUsed = 0;
- m_numCollsSwappedOut = 0;
- m_initializing = false;
- //m_lastUpdateTime = 0LL;
- m_needsSave = false;
- // sanity
- if ( RDB_END2 >= RDB_END ) return;
- log("db: increase RDB_END2 to at least %"INT32" in "
- "Collectiondb.h",(int32_t)RDB_END);
- char *xx=NULL;*xx=0;
- }
- // reset rdb
- void Collectiondb::reset() {
- log(LOG_INFO,"db: resetting collectiondb.");
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
- delete ( m_recs[i] );
- m_recs[i] = NULL;
- }
- m_numRecs = 0;
- m_numRecsUsed = 0;
- g_collTable.reset();
- }
- /*
- bool Collectiondb::init ( bool isDump ) {
- reset();
- if ( g_isYippy ) return true;
- // reset # of recs
- //m_numRecs = 0;
- //m_numRecsUsed = 0;
- // . now load ALL recs
- // . returns false and sets g_errno on error
- if ( ! load ( isDump ) ) return false;
- // update time
- updateTime();
- // so we don't save again
- m_needsSave = false;
- // sanity
- if ( RDB_END2 < RDB_END ) {
- log("db: increase RDB_END2 to at least %"INT32" in "
- "Collectiondb.h",(int32_t)RDB_END);
- char *xx=NULL;*xx=0;
- }
- // if it set g_errno, return false
- //if ( g_errno ) return log("admin: Had init error: %s.",
- // mstrerror(g_errno));
- g_errno = 0;
- // otherwise, true, even if reloadList() blocked
- return true;
- }
- */
- extern bool g_inAutoSave;
- // . save to disk
- // . returns false if blocked, true otherwise
- bool Collectiondb::save ( ) {
- if ( g_conf.m_readOnlyMode ) return true;
- if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
- return true;
- // which collection rec needs a save
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- // temp debug message
- //logf(LOG_DEBUG,"admin: SAVING collection #%"INT32" ANYWAY",i);
- if ( ! m_recs[i]->m_needsSave ) continue;
- // if we core in malloc we won't be able to save the
- // coll.conf files
- if ( m_recs[i]->m_isCustomCrawl &&
- g_inMemFunction &&
- g_hostdb.m_hostId != 0 )
- continue;
- //log(LOG_INFO,"admin: Saving collection #%"INT32".",i);
- m_recs[i]->save ( );
- }
- // oh well
- return true;
- }
///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
// . scans the working dir for subdirs named "coll.<name>.<collnum>" and
//   loads each via addExistingColl()
// . if no collection dirs exist, creates the default "main" collection
// . returns false on error, true otherwise
bool Collectiondb::loadAllCollRecs ( ) {
	m_initializing = true;
	char dname[1024];
	// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
	sprintf ( dname , "%s" , g_hostdb.m_dir );
	Dir d;
	d.set ( dname );
	if ( ! d.open ()) return log("admin: Could not load collection config "
				     "files.");
	// first pass: count the collection dirs
	// NOTE(review): "count" is never consumed below -- the swap-out
	// logic that used it is commented out -- so this pass is dead work
	int32_t count = 0;
	char *f;
	while ( ( f = d.getNextFilename ( "*" ) ) ) {
		// skip if first char not "coll."
		if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
		// must end on a digit (i.e. coll.main.0)
		if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
		// count them
		count++;
	}
	// reset directory for another scan
	d.set ( dname );
	if ( ! d.open ()) return log("admin: Could not load collection config "
				     "files.");
	// . second pass: scan through all subdirs in the collections dir
	// . they should be like, "coll.main/" and "coll.mycollection/"
	while ( ( f = d.getNextFilename ( "*" ) ) ) {
		// skip if first char not "coll."
		if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
		// must end on a digit (i.e. coll.main.0)
		if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
		// point to collection name, just past the "coll." prefix
		char *coll = f + 5;
		// NULL terminate at the '.' separating name from collnum
		char *pp = strchr ( coll , '.' );
		if ( ! pp ) continue;
		*pp = '\0';
		// get collnum from the trailing digits
		collnum_t collnum = atol ( pp + 1 );
		// add it; propagate failure to the caller
		if ( ! addExistingColl ( coll , collnum ) )
			return false;
	}
	// if no existing recs added... add coll.main.0 always at startup
	if ( m_numRecs == 0 ) {
		log("admin: adding main collection.");
		addNewColl ( "main",
			     0 , // customCrawl ,
			     NULL,
			     0 ,
			     true , // bool saveIt ,
			     // Parms.cpp reserves this so it can be sure
			     // to add the same collnum to every shard
			     0 );
	}
	m_initializing = false;
	return true;
}
- // after we've initialized all rdbs in main.cpp call this to clean out
- // our rdb trees
- bool Collectiondb::cleanTrees ( ) {
- // remove any nodes with illegal collnums
- Rdb *r;
- //r = g_indexdb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- r = g_posdb.getRdb();
- //r->m_tree.cleanTree ();//(char **)r->m_bases);
- r->m_buckets.cleanBuckets();
- //r = g_datedb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- r = g_titledb.getRdb();
- r->m_tree.cleanTree ();//(char **)r->m_bases);
- //r = g_revdb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- //r = g_sectiondb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- //r = g_checksumdb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- //r = g_tfndb.getRdb();
- //r->m_tree.cleanTree ((char **)r->m_bases);
- r = g_spiderdb.getRdb();
- r->m_tree.cleanTree ();//(char **)r->m_bases);
- r = g_doledb.getRdb();
- r->m_tree.cleanTree ();//(char **)r->m_bases);
- // success
- return true;
- }
- /*
- void Collectiondb::updateTime() {
- // get time now in milliseconds
- int64_t newTime = gettimeofdayInMilliseconds();
- // change it
- if ( m_lastUpdateTime == newTime ) newTime++;
- // update it
- m_lastUpdateTime = newTime;
- // we need a save
- m_needsSave = true;
- }
- */
- #include "Statsdb.h"
- #include "Cachedb.h"
- #include "Syncdb.h"
// same as addOldColl()
// . load one collection ("coll" with id "collnum") that already exists on
//   disk as coll.<coll>.<collnum>/coll.conf
// . returns false on error, true otherwise
// . cores deliberately if the coll NAME is already registered in memory
//   (means two coll dirs share a name -- must be fixed by hand)
// . if the collNUM is already taken by another coll, moves this one's dir
//   into trash/ and returns true (the coll is skipped, not loaded)
bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
	int32_t i = collnum;
	// ensure does not already exist in memory
	collnum_t oldCollnum = getCollnum(coll);
	if ( oldCollnum >= 0 ) {
		g_errno = EEXIST;
		log("admin: Trying to create collection \"%s\" but "
		    "already exists in memory. Do an ls on "
		    "the working dir to see if there are two "
		    "collection dirs with the same coll name",coll);
		// intentional crash
		char *xx=NULL;*xx=0;
	}
	// also try by #, i've seen this happen too
	CollectionRec *ocr = getRec ( i );
	if ( ocr ) {
		g_errno = EEXIST;
		log("admin: Collection id %i is in use already by "
		    "%s, so we can not add %s. moving %s to trash."
		    ,(int)i,ocr->m_coll,coll,coll);
		SafeBuf cmd;
		// timestamp the trash dir name so repeats don't collide
		int64_t now = gettimeofdayInMilliseconds();
		cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%"UINT64
				 , coll
				 ,(int)i
				 , coll
				 ,(int)i
				 , now );
		gbsystem ( cmd.getBufStart() );
		return true;
	}
	// create the record in memory
	CollectionRec *cr = new (CollectionRec);
	if ( ! cr )
		return log("admin: Failed to allocated %"INT32" bytes for new "
			   "collection record for \"%s\".",
			   (int32_t)sizeof(CollectionRec),coll);
	mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
	// set collnum right for g_parms.setToDefault() call just in case
	// because before it was calling CollectionRec::reset() which
	// was resetting the RdbBases for the m_collnum which was garbage
	// and ended up resetting random collections' rdb. but now
	// CollectionRec::CollectionRec() sets m_collnum to -1 so we should
	// not need this!
	//cr->m_collnum = oldCollnum;
	// get the default.conf from working dir if there
	g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
	strcpy ( cr->m_coll , coll );
	cr->m_collLen = gbstrlen ( coll );
	cr->m_collnum = i;
	// point to this, so Rdb and RdbBase can reference it
	coll = cr->m_coll;
	// load coll.conf file; on failure free the rec and bail
	if ( ! cr->load ( coll , i ) ) {
		mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
		log("admin: Failed to load coll.%s.%"INT32"/coll.conf",coll,i);
		delete ( cr );
		if ( m_recs ) m_recs[i] = NULL;
		return false;
	}
	// add to m_recs[] and the name hashtable
	if ( ! registerCollRec ( cr , false ) ) return false;
	// always index spider status docs now for custom crawls
	if ( cr->m_isCustomCrawl )
		cr->m_indexSpiderReplies = true;
	// and don't do link voting, will help speed up
	if ( cr->m_isCustomCrawl ) {
		cr->m_getLinkInfo = false;
		cr->m_computeSiteNumInlinks = false;
		// limit each shard to 5 spiders per collection to prevent
		// ppl from spidering the web and hogging up resources
		cr->m_maxNumSpiders = 5;
		// diffbot download docs up to 50MB so we don't truncate
		// things like sitemap.xml. but keep regular html pages
		// 1MB
		cr->m_maxTextDocLen = 1024*1024;
		// xml, pdf, etc can be this. 50MB
		cr->m_maxOtherDocLen = 50000000;
	}
	// we need to compile the regular expressions or update the url
	// filters with new logic that maps crawlbot parms to url filters
	return cr->rebuildUrlFilters ( );
}
// . add a new rec
// . returns false and sets g_errno on error
// . was addRec()
// . "customCrawl" is 0 for a regular collection, 1 for a simple crawl
//   2 for a bulk job. diffbot terminology.
// . "newCollnum" must be >= 0; Parms.cpp reserves it so the same collnum
//   is added on every shard
// . "cpc"/"cpclen" (copy-collection source) are currently unused
bool Collectiondb::addNewColl ( char *coll ,
				char customCrawl ,
				char *cpc ,
				int32_t cpclen ,
				bool saveIt ,
				// Parms.cpp reserves this so it can be sure
				// to add the same collnum to every shard
				collnum_t newCollnum ) {
	// ensure coll name is legit: alnum plus '-' and '_' only
	char *p = coll;
	for ( ; *p ; p++ ) {
		if ( is_alnum_a(*p) ) continue;
		if ( *p == '-' ) continue;
		if ( *p == '_' ) continue; // underscore now allowed
		break;
	}
	if ( *p ) {
		g_errno = EBADENGINEER;
		log("admin: \"%s\" is a malformed collection name because it "
		    "contains the '%c' character.",coll,*p);
		return false;
	}
	// caller must have reserved a valid collnum
	if ( newCollnum < 0 ) { char *xx=NULL;*xx=0; }
	// if empty... bail, no longer accepted, use "main"
	if ( ! coll || !coll[0] ) {
		g_errno = EBADENGINEER;
		return log("admin: Trying to create a new collection "
			   "but no collection name provided. Use the \"c\" "
			   "cgi parameter to specify it.");
	}
	// or if too big
	if ( gbstrlen(coll) > MAX_COLL_LEN ) {
		g_errno = ENOBUFS;
		return log("admin: Trying to create a new collection "
			   "whose name \"%s\" of %i chars is longer than the "
			   "max of %"INT32" chars.",coll,gbstrlen(coll),
			   (int32_t)MAX_COLL_LEN);
	}
	// ensure does not already exist in memory
	if ( getCollnum ( coll ) >= 0 ) {
		g_errno = EEXIST;
		log("admin: Trying to create collection \"%s\" but "
		    "already exists in memory.",coll);
		// just let it pass...
		g_errno = 0 ;
		return true;
	}
	// MDW: ensure not created on disk since time of last load
	char dname[512];
	sprintf(dname, "%scoll.%s.%"INT32"/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
	DIR *dir = opendir ( dname );
	if ( dir ) closedir ( dir );
	if ( dir ) {
		g_errno = EEXIST;
		return log("admin: Trying to create collection %s but "
			   "directory %s already exists on disk.",coll,dname);
	}
	// create the record in memory
	CollectionRec *cr = new (CollectionRec);
	if ( ! cr )
		return log("admin: Failed to allocated %"INT32" bytes for new "
			   "collection record for \"%s\".",
			   (int32_t)sizeof(CollectionRec),coll);
	// register the mem
	mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
	// get the default.conf from working dir if there
	g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
	// put search results back so it doesn't mess up results in qatest123
	if ( strcmp(coll,"qatest123") == 0 )
		cr->m_sameLangWeight = 20.0;
	// set coll id and coll name for coll id #i
	strcpy ( cr->m_coll , coll );
	cr->m_collLen = gbstrlen ( coll );
	cr->m_collnum = newCollnum;
	// point to this, so Rdb and RdbBase can reference it
	coll = cr->m_coll;
	//
	// get token and crawlname if customCrawl is 1 or 2
	//
	char *token = NULL;
	char *crawl = NULL;
	SafeBuf tmp;
	// . return true with g_errno set on error
	// . if we fail to set a parm right we should force ourselves
	//   out sync
	if ( customCrawl ) {
		if ( ! tmp.safeStrcpy ( coll ) ) return true;
		token = tmp.getBufStart();
		// diffbot coll name format is <token>-<crawlname>
		char *h = strchr ( tmp.getBufStart() , '-' );
		if ( ! h ) {
			log("crawlbot: bad custom collname");
			g_errno = EBADENGINEER;
			mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
			delete ( cr );
			return true;
		}
		// split token from crawl name at the '-'
		*h = '\0';
		crawl = h + 1;
		if ( ! crawl[0] ) {
			log("crawlbot: bad custom crawl name");
			mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
			delete ( cr );
			g_errno = EBADENGINEER;
			return true;
		}
		// or if too big!
		if ( gbstrlen(crawl) > 30 ) {
			log("crawlbot: crawlbot crawl NAME is over 30 chars");
			mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
			delete ( cr );
			g_errno = EBADENGINEER;
			return true;
		}
	}
	// crawl/process caps default to unlimited for regular collections
	cr->m_maxToCrawl = -1;
	cr->m_maxToProcess = -1;
	if ( customCrawl ) {
		// always index spider status docs now
		cr->m_indexSpiderReplies = true;
		// remember the token
		cr->m_diffbotToken.set ( token );
		cr->m_diffbotCrawlName.set ( crawl );
		// bring this back
		cr->m_diffbotApiUrl.set ( "" );
		cr->m_diffbotUrlCrawlPattern.set ( "" );
		cr->m_diffbotUrlProcessPattern.set ( "" );
		cr->m_diffbotPageProcessPattern.set ( "" );
		cr->m_diffbotUrlCrawlRegEx.set ( "" );
		cr->m_diffbotUrlProcessRegEx.set ( "" );
		cr->m_diffbotMaxHops = -1;
		cr->m_spiderStatus = SP_INITIALIZING;
		// do not spider more than this many urls total.
		// -1 means no max.
		cr->m_maxToCrawl = 100000;
		// do not process more than this. -1 means no max.
		cr->m_maxToProcess = 100000;
		// -1 means no max
		cr->m_maxCrawlRounds = -1;
		// diffbot download docs up to 10MB so we don't truncate
		// things like sitemap.xml
		cr->m_maxTextDocLen = 10000000;
		cr->m_maxOtherDocLen = 10000000;
		// john wants deduping on by default to avoid
		// processing similar pgs
		cr->m_dedupingEnabled = true;
		// show the ban links in the search results. the
		// collection name is cryptographic enough to show that
		cr->m_isCustomCrawl = customCrawl;
		cr->m_diffbotOnlyProcessIfNewUrl = true;
		// default respider to off
		cr->m_collectiveRespiderFrequency = 0.0;
		// always turn off gigabits so &s=1000 can do summary skipping
		cr->m_docsToScanForTopics = 0;
		// turn off link voting, etc. to speed up
		cr->m_getLinkInfo = false;
		cr->m_computeSiteNumInlinks = false;
	}
	// . this will core if a host was dead and then when it came
	//   back up host #0's parms.cpp told it to add a new coll
	cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
	cr->m_diffbotCrawlEndTime = 0;
	// . just the basics on these for now
	// . if certain parms are changed then the url filters
	//   must be rebuilt, as well as possibly the waiting tree!!!
	// . need to set m_urlFiltersHavePageCounts etc.
	cr->rebuildUrlFilters ( );
	cr->m_useRobotsTxt = true;
	// reset crawler stats.they should be loaded from crawlinfo.txt
	memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
	memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
	// note that
	log("colldb: initial revival for %s",cr->m_coll);
	// . assume we got some urls ready to spider
	// . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
	//   urls it spidered in that time these will get set to 0 and it
	//   will send out an email alert if m_sentCrawlDoneAlert is not true.
	cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
	cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
	// start the spiders!
	cr->m_spideringEnabled = true;
	// override this? new collections are always saved to disk below
	saveIt = true;
	// MDW: create the new directory
 retry22:
	if ( ::mkdir ( dname ,
		       getDirCreationFlags() ) ) {
		// EINTR can happen under valgrind; just retry the mkdir
		if ( errno == EINTR ) goto retry22;
		g_errno = errno;
		mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
		delete ( cr );
		return log("admin: Creating directory %s had error: "
			   "%s.", dname,mstrerror(g_errno));
	}
	// save it into this dir... might fail!
	if ( saveIt && ! cr->save() ) {
		mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
		delete ( cr );
		return log("admin: Failed to save file %s: %s",
			   dname,mstrerror(g_errno));
	}
	// add to m_recs[] and the name hashtable
	if ( ! registerCollRec ( cr , true ) )
		return false;
	// add the rdbbases for this coll, CollectionRec::m_bases[]
	if ( ! addRdbBasesForCollRec ( cr ) )
		return false;
	return true;
}
- void CollectionRec::setBasePtr ( char rdbId , class RdbBase *base ) {
- // if in the process of swapping in, this will be false...
- //if ( m_swappedOut ) { char *xx=NULL;*xx=0; }
- if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
- // Rdb::deleteColl() will call this even though we are swapped in
- // but it calls it with "base" set to NULL after it nukes the RdbBase
- // so check if base is null here.
- if ( base && m_bases[ (unsigned char)rdbId ]){ char *xx=NULL;*xx=0; }
- m_bases [ (unsigned char)rdbId ] = base;
- }
- RdbBase *CollectionRec::getBasePtr ( char rdbId ) {
- if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
- return m_bases [ (unsigned char)rdbId ];
- }
// re-entry guard: set while a swap-in is in progress below
static bool s_inside = false;
// . returns the RdbBase for "rdbId", re-creating all this collection's
//   RdbBases first if it had been swapped out to save memory
// . returns NULL w/ g_errno set on error.
// . TODO: ensure not called from in thread, not thread safe
RdbBase *CollectionRec::getBase ( char rdbId ) {
	// re-entering during a swap-in is a fatal logic error
	if ( s_inside ) { char *xx=NULL;*xx=0; }
	// fast path: bases are already resident
	if ( ! m_swappedOut ) return m_bases[(unsigned char)rdbId];
	log("cdb: swapin collnum=%"INT32"",(int32_t)m_collnum);
	// sanity! swap-in must happen on the main thread
	if ( g_threads.amThread() ) { char *xx=NULL;*xx=0; }
	s_inside = true;
	// turn off quickpoll to avoid getbase() being re-called and
	// coring from s_inside being true
	int32_t saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	// load them back in. return NULL w/ g_errno set on error.
	if ( ! g_collectiondb.addRdbBasesForCollRec ( this ) ) {
		log("coll: error swapin: %s",mstrerror(g_errno));
		// restore quickpoll and clear the guard before bailing
		g_conf.m_useQuickpoll = saved;
		s_inside = false;
		return NULL;
	}
	g_conf.m_useQuickpoll = saved;
	s_inside = false;
	// bookkeeping: one fewer collection is swapped out now
	g_collectiondb.m_numCollsSwappedOut--;
	m_swappedOut = false;
	log("coll: swapin was successful for collnum=%"INT32"",(int32_t)m_collnum);
	return m_bases[(unsigned char)rdbId];
}
- bool CollectionRec::swapOut ( ) {
- if ( m_swappedOut ) return true;
- log("cdb: swapout collnum=%"INT32"",(int32_t)m_collnum);
- // free all RdbBases in each rdb
- for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
- Rdb *rdb = g_process.m_rdbs[i];
- // this frees all the RdbBase::m_files and m_maps for the base
- rdb->resetBase ( m_collnum );
- }
- // now free each base itself
- for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
- RdbBase *base = m_bases[i];
- if ( ! base ) continue;
- mdelete (base, sizeof(RdbBase), "Rdb Coll");
- delete (base);
- m_bases[i] = NULL;
- }
- m_swappedOut = true;
- g_collectiondb.m_numCollsSwappedOut++;
- return true;
- }
- // . called only by addNewColl() and by addExistingColl()
- bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
- // add m_recs[] and to hashtable
- if ( ! setRecPtr ( cr->m_collnum , cr ) )
- return false;
- return true;
- }
- // swap it in
- bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- CollectionRec *cr = m_recs[i];
- if ( ! cr ) continue;
- // skip if swapped out
- if ( cr->m_swappedOut ) continue;
- // add rdb base files etc. for it
- addRdbBasesForCollRec ( cr );
- }
- // now clean the trees. moved this into here from
- // addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
- // now from getBase() to load on-demand for saving memory
- cleanTrees();
- return true;
- }
- bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
- char *coll = cr->m_coll;
- //////
- //
- // if we are doing a dump from the command line, skip this stuff
- //
- //////
- if ( g_dumpMode ) return true;
- // tell rdbs to add one, too
- //if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
-
- if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- //if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
- // now clean the trees
- //cleanTrees();
- // debug message
- //log ( LOG_INFO, "db: verified collection \"%s\" (%"INT32").",
- // coll,(int32_t)cr->m_collnum);
- // tell SpiderCache about this collection, it will create a
- // SpiderCollection class for it.
- //g_spiderCache.reset1();
- // success
- return true;
- hadError:
- log("db: error registering coll: %s",mstrerror(g_errno));
- return false;
- }
- /*
- bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
- if ( r->getLong("admin",1) == 0 ) return false;
- if ( g_conf.isMasterAdmin ( s , r ) ) return true;
- char *c = r->getString ( "c" );
- CollectionRec *cr = getRec ( c );
- if ( ! cr ) return false;
- return g_users.hasPermission ( r , PAGE_SEARCH );
- //return cr->hasPermission ( r , s );
- }
- void savingCheckWrapper1 ( int fd , void *state ) {
- WaitEntry *we = (WaitEntry *)state;
- // no state?
- if ( ! we ) { log("colldb: we1 is null"); return; }
- // unregister too
- g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
- // if it blocked again i guess tree is still saving
- if ( ! g_collectiondb.resetColl ( we->m_coll ,
- we ,
- we->m_purgeSeeds))
- return;
- // all done
- we->m_callback ( we->m_state );
- }
- void savingCheckWrapper2 ( int fd , void *state ) {
- WaitEntry *we = (WaitEntry *)state;
- // no state?
- if ( ! we ) { log("colldb: we2 is null"); return; }
- // unregister too
- g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
- // if it blocked again i guess tree is still saving
- if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
- // all done
- we->m_callback ( we->m_state );
- }
- */
- /*
- // delete all records checked in the list
- bool Collectiondb::deleteRecs ( HttpRequest *r ) {
- for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
- char *f = r->getField ( i );
- if ( strncmp ( f , "del" , 3 ) != 0 ) continue;
- char *coll = f + 3;
- //if ( ! is_digit ( f[3] ) ) continue;
- //int32_t h = atol ( f + 3 );
- deleteRec ( coll , NULL );
- }
- return true;
- }
- */
- /*
- // . delete a collection
- // . this uses blocking unlinks, may make non-blocking later
- // . returns false if blocked, true otherwise
- bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
- // force on for now
- //deleteTurkdb = true;
- // no spiders can be out. they may be referencing the CollectionRec
- // in XmlDoc.cpp... quite likely.
- //if ( g_conf.m_spideringEnabled ||
- // g_spiderLoop.m_numSpidersOut > 0 ) {
- // log("admin: Can not delete collection while "
- // "spiders are enabled or active.");
- // return false;
- //}
- // ensure it's not NULL
- if ( ! coll ) {
- log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
- g_errno = ENOTFOUND;
- return true;
- }
- // find the rec for this collection
- collnum_t collnum = getCollnum ( coll );
- return deleteRec2 ( collnum , we );
- }
- */
- // if there is an outstanding disk read thread or merge thread then
- // Spider.cpp will handle the delete in the callback.
- // this is now tryToDeleteSpiderColl in Spider.cpp
- /*
- void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
- sc->m_deleteMyself = true;
- // if not currently being accessed nuke it now
- if ( ! sc->m_msg5.m_waitingForList &&
- ! sc->m_msg5b.m_waitingForList &&
- ! sc->m_msg1.m_mcast.m_inUse ) {
- mdelete ( sc, sizeof(SpiderColl),"nukecr2");
- delete ( sc );
- return;
- }
- }
- */
/// this deletes the collection, not just part of a reset.
// . removes the collection from every rdb, puts its SpiderColl on
//   "death row", removes it from the merge list and frees the rec
// . returns false if blocked (a tree save is in progress; caller must
//   retry), true otherwise (even on error, with g_errno set)
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
	// do not allow this if in repair mode
	if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
		log("admin: Can not delete collection while in repair mode.");
		g_errno = EBADENGINEER;
		return true;
	}
	// bitch if not found
	if ( collnum < 0 ) {
		g_errno = ENOTFOUND;
		log(LOG_LOGIC,"admin: Collection #%"INT32" is bad, "
		    "delete failed.",(int32_t)collnum);
		return true;
	}
	CollectionRec *cr = m_recs [ collnum ];
	if ( ! cr ) {
		log("admin: Collection id problem. Delete failed.");
		g_errno = ENOTFOUND;
		return true;
	}
	// CAUTION: a tree might be in the middle of saving; we can not
	// delete out from under it, so tell the caller to retry
	if ( g_process.isAnyTreeSaving() ) {
		// note it
		log("admin: tree is saving. waiting2.");
		// all done
		return false;
	}
	char *coll = cr->m_coll;
	// note it
	log(LOG_INFO,"db: deleting coll \"%s\" (%"INT32")",coll,
	    (int32_t)cr->m_collnum);
	// we need a save
	m_needsSave = true;
	// remove the collection from every live rdb
	g_posdb.getRdb()->delColl ( coll );
	g_titledb.getRdb()->delColl ( coll );
	g_tagdb.getRdb()->delColl ( coll );
	g_spiderdb.getRdb()->delColl ( coll );
	g_doledb.getRdb()->delColl ( coll );
	g_clusterdb.getRdb()->delColl ( coll );
	g_linkdb.getRdb()->delColl ( coll );
	// reset spider info
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
	if ( sc ) {
		// remove locks from lock table:
		sc->clearLocks();
		// you have to set this for tryToDeleteSpiderColl to
		// actually have a shot at deleting it
		sc->m_deleteMyself = true;
		// cr will be invalid shortly after this, so detach it
		// from the SpiderColl now
		sc->setCollectionRec ( NULL );
		// this will put it on "death row" so it will be deleted
		// once Msg5::m_waitingForList/Merge is NULL
		tryToDeleteSpiderColl ( sc ,"10");
		// don't let cr reference us anymore, sc is on deathrow
		// and "cr" is deleted below!
		cr->m_spiderColl = NULL;
	}
	// the bulk urls file too i guess (host 0 owns it)
	if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
		SafeBuf bu;
		bu.safePrintf("%sbulkurls-%s.txt",
			      g_hostdb.m_dir , cr->m_coll );
		File bf;
		bf.set ( bu.getBufStart() );
		if ( bf.doesExist() ) bf.unlink();
	}
	// now remove from list of collections that might need a disk merge
	removeFromMergeLinkedList ( cr );
	// remove from m_recs[] and the name hashtable
	setRecPtr ( cr->m_collnum , NULL );
	// free it
	mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
	delete ( cr );
	// do not shrink m_numRecs here in case spiders were outstanding
	// and they added a new coll right away and it ended up getting
	// recs from the deleted coll!!
	// done
	return true;
}
- //#include "PageTurk.h"
- /*
- // . reset a collection
- // . returns false if blocked and will call callback
- bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
- // ensure it's not NULL
- if ( ! coll ) {
- log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
- g_errno = ENOCOLLREC;
- return true;
- }
- // get the CollectionRec for "qatest123"
- CollectionRec *cr = getRec ( coll ); // "qatest123" );
- // must be there. if not, we create test i guess
- if ( ! cr ) {
- log("db: could not get coll rec \"%s\" to reset", coll);
- char *xx=NULL;*xx=0;
- }
- return resetColl2 ( cr->m_collnum, purgeSeeds);
- }
- */
- // ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
- bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
- // an add, make sure big enough
- int32_t need = ((int32_t)collnum+1)*sizeof(CollectionRec *);
- int32_t have = m_recPtrBuf.getLength();
- int32_t need2 = need - have;
- // if already big enough
- if ( need2 <= 0 ) {
- m_recs [ collnum ] = NULL;
- return true;
- }
- m_recPtrBuf.setLabel ("crecptrb");
- // . true here means to clear the new space to zeroes
- // . this shit works based on m_length not m_capacity
- if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
- log("admin: error growing rec ptr buf2.");
- return false;
- }
- // sanity
- if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
- // set it
- m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
- // update length of used bytes in case we re-alloc
- m_recPtrBuf.setLength ( need );
- // re-max
- int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
- // sanity
- if ( collnum >= max ) { char *xx=NULL;*xx=0; }
- // initialize slot
- m_recs [ collnum ] = NULL;
- return true;
- }
// . points m_recs[collnum] at "cr", or clears the slot when cr is NULL
// . keeps g_collTable (name hash -> collnum), m_numRecs and m_numRecsUsed
//   in sync with the pointer array
// . returns false on hash-table init/add failure, true otherwise
bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
	// first-time init of the hashtable that maps coll name to collnum
	if ( g_collTable.m_numSlots == 0 &&
	     ! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
			       false,0,"nhshtbl"))
		return false;
	// sanity
	if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
	// current capacity of the ptr array, in slots
	int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
	// refresh m_recs in case m_recPtrBuf was re-allocated elsewhere
	m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
	// tell spiders to re-update the active list
	g_spiderLoop.m_activeListValid = false;
	g_spiderLoop.m_activeListModified = true;
	// a delete?
	if ( ! cr ) {
		// sanity
		if ( collnum >= max ) { char *xx=NULL;*xx=0; }
		// get what's there
		CollectionRec *oc = m_recs[collnum];
		// let it go
		m_recs[collnum] = NULL;
		// if the slot was already empty, nothing more to undo
		if ( ! oc ) return true;
		// tally it up
		m_numRecsUsed--;
		// key for the name -> collnum table
		int64_t h64 = hash64n(oc->m_coll);
		// only remove the name mapping if it still points at OUR
		// collnum. resetColl2() re-adds the same name under a NEW
		// collnum before clearing the old slot, so blindly removing
		// the key here would wipe out that new mapping.
		void *vp = g_collTable.getValue ( &h64 );
		if ( ! vp ) return true;
		collnum_t ct = *(collnum_t *)vp;
		if ( ct != collnum ) return true;
		g_collTable.removeKey ( &h64 );
		return true;
	}
	// an add: ensure m_recs[] is big enough for m_recs[collnum]
	if ( ! growRecPtrBuf ( collnum ) )
		return false;
	// sanity: the rec must already know its own collnum
	if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }
	// add to hash table to map name to collnum_t
	int64_t h64 = hash64n(cr->m_coll);
	if ( ! g_collTable.addKey ( &h64 , &collnum ) )
		return false;
	// install the pointer
	m_recs[collnum] = cr;
	// count it
	m_numRecsUsed++;
	// raise the high-water mark so scans over m_recs[] cover this slot
	if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;
	// leftover debug scan: dereferences each live rec ptr so a bad
	// pointer crashes here rather than somewhere less obvious
	for ( int32_t j = 0 ; j < m_numRecs ; j++ ) {
		if ( ! m_recs[j] ) continue;
		if ( m_recs[j]->m_collnum == 1 ) continue;
	}
	return true;
}
// Moves a file by first trying rename(), then falling back to a manual
// copy + unlink, since rename() fails across device/filesystem boundaries
// (EXDEV). Returns 0 on success, -1 on error.
// BUGFIX: the old version called ferror() on both streams AFTER fclose(),
// which is undefined behavior, never checked the fwrite() result, and
// could leave a truncated destination file behind on a failed copy.
int mv(char* src, char* dest) {
	// fast path: same-device move
	if ( rename( src , dest ) == 0 )
		return 0;
	FILE *fsrc = fopen(src, "r");
	if (fsrc == NULL)
		return -1;
	FILE *fdest = fopen(dest, "w");
	if (fdest == NULL) {
		fclose(fsrc);
		return -1;
	}
	const int BUF_SIZE = 1024;
	char buf[BUF_SIZE];
	bool ok = true;
	while (!feof(fsrc)) {
		size_t n = fread(buf, 1, BUF_SIZE, fsrc);
		if (ferror(fsrc)) { ok = false; break; }
		// a short fwrite means a write error (disk full, etc.)
		if (n > 0 && fwrite(buf, 1, n, fdest) != n) { ok = false; break; }
	}
	// inspect stream state BEFORE closing; a FILE* is invalid after
	// fclose()
	if (ferror(fdest)) ok = false;
	fclose(fsrc);
	// fclose() flushes buffered output; a failed flush means the copy
	// is incomplete
	if (fclose(fdest) != 0) ok = false;
	if (!ok) {
		// don't leave a truncated/partial destination behind
		remove(dest);
		return -1;
	}
	// copy succeeded; now remove the source to complete the "move"
	remove(src);
	return 0;
}
// . resets a collection: moves its CollectionRec from oldCollnum to
//   newCollnum and wipes all rdb data stored under oldCollnum
// . returns false if we need a re-call (an rdb tree was mid-save), true
//   if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
			       collnum_t newCollnum,
			       bool purgeSeeds){
	// do not allow this if in repair mode
	if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
		log("admin: Can not delete collection while in repair mode.");
		g_errno = EBADENGINEER;
		return true;
	}
	// CAUTION: a tree might be in the middle of saving; we deal with
	// retrying in Process.cpp now
	if ( g_process.isAnyTreeSaving() ) {
		// we could not complete... caller must re-call us
		return false;
	}
	CollectionRec *cr = m_recs [ oldCollnum ];
	// reset the crawl stats since this is a fresh start
	cr->m_globalCrawlInfo.reset();
	cr->m_localCrawlInfo.reset();
	// reset spider info
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
	if ( sc ) {
		// remove locks from lock table:
		sc->clearLocks();
		// don't try to reset/reuse the SpiderColl; there are too
		// many in-flight flags (m_populatingDoledb etc.) to worry
		// about. this puts it on "death row" so it is deleted once
		// Msg5::m_waitingForList/Merge is NULL
		tryToDeleteSpiderColl ( sc,"11" );
		// cr must not reference the doomed SpiderColl anymore
		cr->m_spiderColl = NULL;
	}
	// reset spider round
	cr->m_spiderRoundNum = 0;
	cr->m_spiderRoundStartTime = 0;
	cr->m_spiderStatus = SP_INITIALIZING; // this is 0
	// reset seed buf
	if ( purgeSeeds ) {
		// free the buffer of seed urls
		cr->m_diffbotSeeds.purge();
		// reset seed dedup table
		HashTableX *ht = &cr->m_seedHashTable;
		ht->reset();
	}
	// so XmlDoc.cpp can detect if the collection was reset since it
	// launched its spider:
	cr->m_lastResetCount++;
	if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;
	// advance sanity check. did we wrap around?
	// right now we #define collnum_t int16_t
	if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
	// give the rec its new collnum so records in transit will not be
	// added to any rdb under the old one...
	cr->m_collnum = newCollnum;
	// update the timestamps since we are restarting/resetting
	cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
	cr->m_diffbotCrawlEndTime = 0;
	// install the rec under its NEW collnum before clearing the old
	// slot: Rdb::resetColl() needs the new cr so it can move the
	// RdbBase into the cr->m_bases[rdbId] array (recycling)
	setRecPtr ( newCollnum , cr );
	// a new directory then, since we changed the collnum
	char dname[512];
	sprintf(dname, "%scoll.%s.%"INT32"/",
		g_hostdb.m_dir,
		cr->m_coll,
		(int32_t)newCollnum);
	DIR *dir = opendir ( dname );
	if ( dir )
		closedir ( dir );
	if ( dir ) {
		// not fatal; we just warn and fall through to mkdir
		log("admin: Trying to create collection %s but "
		    "directory %s already exists on disk.",cr->m_coll,dname);
	}
	if ( ::mkdir ( dname ,
		       getDirCreationFlags() ) ) {
		// not fatal either; the coll.conf save below will fail
		// loudly if the dir is truly unusable
		log("admin: Creating directory %s had error: "
		    "%s.", dname,mstrerror(g_errno));
	}
	// . unlink all the *.dat and *.map files for this coll in its subdir
	// . remove all recs for this collnum from m_tree/m_buckets
	// . updates RdbBase::m_collnum
	// . for the tree it just marks the old collnum recs with a
	//   collnum of -1 in case it is saving...
	g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	// reset crawl status too!
	cr->m_spiderStatus = SP_INITIALIZING;
	// . set m_recs[oldCollnum] to NULL and remove from hash table
	// . must come AFTER the deleteColl() calls above so they can still
	//   see the old slot without crashing
	setRecPtr ( oldCollnum , NULL );
	// save coll.conf to the new directory
	cr->save();
	// done
	return true;
}
- // a hack function
- bool addCollToTable ( char *coll , collnum_t collnum ) {
- // readd it to the hashtable that maps name to collnum too
- int64_t h64 = hash64n(coll);
- g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
- false,0,"nhshtbl");
- return g_collTable.addKey ( &h64 , &collnum );
- }
- // get coll rec specified in the HTTP request
- CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
- char *coll = r->getString ( "c" );
- if ( coll && ! coll[0] ) coll = NULL;
- // maybe it is crawlbot?
- char *name = NULL;
- char *token = NULL;
- if ( ! coll ) {
- name = r->getString("name");
- token = r->getString("token");
- }
- char tmp[MAX_COLL_LEN+1];
- if ( ! coll && token && name ) {
- snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
- coll = tmp;
- }
- // default to main first
- if ( ! coll && useDefaultRec ) {
- CollectionRec *cr = g_collectiondb.getRec("main");
- if ( cr ) return cr;
- }
- // try next in line
- if ( ! coll && useDefaultRec ) {
- return getFirstRec ();
- }
- // give up?
- if ( ! coll ) return NULL;
- //if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
- return g_collectiondb.getRec ( coll );
- }
- char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
- char *coll = r->getString ( "c" );
- if ( coll && ! coll[0] ) coll = NULL;
- if ( coll ) return coll;
- CollectionRec *cr = NULL;
- // default to main first
- if ( ! coll ) {
- cr = g_collectiondb.getRec("main");
- // CAUTION: cr could be deleted so don't trust this ptr
- // if you give up control of the cpu
- if ( cr ) return cr->m_coll;
- }
- // try next in line
- if ( ! coll ) {
- cr = getFirstRec ();
- if ( cr ) return cr->m_coll;
- }
- // give up?
- return NULL;
- }
- //CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
- // char *coll = getDefaultColl();
- // return g_collectiondb.getRec(coll);
- //}
- // . get collectionRec from name
- // . returns NULL if not available
- CollectionRec *Collectiondb::getRec ( char *coll ) {
- if ( ! coll ) coll = "";
- return getRec ( coll , gbstrlen(coll) );
- }
- CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
- if ( ! coll ) coll = "";
- collnum_t collnum = getCollnum ( coll , collLen );
- if ( collnum < 0 ) return NULL;
- return m_recs [ (int32_t)collnum ];
- }
- CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
- if ( collnum >= m_numRecs || collnum < 0 ) {
- // Rdb::resetBase() gets here, so don't always log.
- // it is called from CollectionRec::reset() which is called
- // from the CollectionRec constructor and ::load() so
- // it won't have anything in rdb at that time
- //log("colldb: collnum %"INT32" > numrecs = %"INT32"",
- // (int32_t)collnum,(int32_t)m_numRecs);
- return NULL;
- }
- return m_recs[collnum];
- }
- //CollectionRec *Collectiondb::getDefaultRec ( ) {
- // if ( ! g_conf.m_defaultColl[0] ) return NULL; // no default?
- // collnum_t collnum = getCollnum ( g_conf.m_defaultColl );
- // if ( collnum < (collnum_t)0 ) return NULL;
- // return m_recs[(int32_t)collnum];
- //}
- CollectionRec *Collectiondb::getFirstRec ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ )
- if ( m_recs[i] ) return m_recs[i];
- return NULL;
- }
- collnum_t Collectiondb::getFirstCollnum ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ )
- if ( m_recs[i] ) return i;
- return (collnum_t)-1;
- }
- char *Collectiondb::getFirstCollName ( ) {
- for ( int32_t i = 0 ; i < m_numRecs ; i++ )
- if ( m_recs[i] ) return m_recs[i]->m_coll;
- return NULL;
- }
- char *Collectiondb::getCollName ( collnum_t collnum ) {
- if ( collnum < 0 || collnum > m_numRecs ) return NULL;
- if ( ! m_recs[(int32_t)collnum] ) return NULL;
- return m_recs[collnum]->m_coll;
- }
- collnum_t Collectiondb::getCollnum ( char *coll ) {
- int32_t clen = 0;
- if ( coll ) clen = gbstrlen(coll );
- return getCollnum ( coll , clen );
- /*
- //if ( ! coll ) coll = "";
- // default empty collection names
- if ( coll && ! coll[0] ) coll = NULL;
- if ( ! coll ) coll = g_conf.m_defaultColl;
- if ( ! coll || ! coll[0] ) coll = "main";
- // This is necessary for Statsdb to work, as it is
- // not associated with any collection. Is this
- // necessary for Catdb?
- if ( coll[0]=='s' && coll[1] =='t' &&
- strcmp ( "statsdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='f' && coll[1]=='a' &&
- strcmp ( "facebookdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='a' && coll[1]=='c' &&
- strcmp ( "accessdb\0", coll ) == 0)
- return 0;
- // because diffbot may have thousands of crawls/collections
- // let's improve the speed here. try hashing it...
- int64_t h64 = hash64n(coll);
- void *vp = g_collTable.getValue ( &h64 );
- if ( ! vp ) return -1; // not found
- return *(collnum_t *)vp;
- */
- /*
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- if ( m_recs[i]->m_coll[0] != coll[0] ) continue;
- if ( strcmp ( m_recs[i]->m_coll , coll ) == 0 ) return i;
- }
- //if ( strcmp ( "catdb\0", coll ) == 0) return 0;
- return (collnum_t)-1; // not found
- */
- }
- collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
- // default empty collection names
- if ( coll && ! coll[0] ) coll = NULL;
- if ( ! coll ) {
- coll = g_conf.m_defaultColl;
- if ( coll ) clen = gbstrlen(coll);
- else clen = 0;
- }
- if ( ! coll || ! coll[0] ) {
- coll = "main";
- clen = gbstrlen(coll);
- }
- // This is necessary for Statsdb to work, as it is
- //if ( ! coll ) coll = "";
- // not associated with any collection. Is this
- // necessary for Catdb?
- if ( coll[0]=='s' && coll[1] =='t' &&
- strcmp ( "statsdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='f' && coll[1]=='f' &&
- strcmp ( "facebookdb\0", coll ) == 0)
- return 0;
- if ( coll[0]=='a' && coll[1]=='c' &&
- strcmp ( "accessdb\0", coll ) == 0)
- return 0;
- // because diffbot may have thousands of crawls/collections
- // let's improve the speed here. try hashing it...
- int64_t h64 = hash64(coll,clen);
- void *vp = g_collTable.getValue ( &h64 );
- if ( ! vp ) return -1; // not found
- return *(collnum_t *)vp;
- /*
- for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
- if ( ! m_recs[i] ) continue;
- if ( m_recs[i]->m_collLen != clen ) continue;
- if ( strncmp(m_recs[i]->m_coll,coll,clen) == 0 ) return i;
- }
- //if ( strncmp ( "catdb\0", coll, clen ) == 0) return 0;
- return (collnum_t)-1; // not found
- */
- }
- //collnum_t Collectiondb::getNextCollnum ( collnum_t collnum ) {
- // for ( int32_t i = (int32_t)collnum + 1 ; i < m_numRecs ; i++ )
- // if ( m_recs[i] ) return i;
- // // no next one, use -1
- // return (collnum_t) -1;
- //}
- // what collnum will be used the next time a coll is added?
- collnum_t Collectiondb::reserveCollNum ( ) {
- if ( m_numRecs < 0x7fff ) {
- collnum_t next = m_numRecs;
- // make the ptr NULL at least to accommodate the
- // loop that scan up to m_numRecs lest we core
- growRecPtrBuf ( next );
- m_numRecs++;
- return next;
- }
- // c…
Large files files are truncated, but you can click here to view the full file