/Blaster.cpp
C++ | 1285 lines | 887 code | 92 blank | 306 comment | 176 complexity | 9271c9afd5c844ce5bbb58a22b3bca01 MD5 | raw file
Possible License(s): Apache-2.0
- // Matt Wells, copyright Sep 2001
- // the main program that brings it all together
- #include "gb-include.h"
- #include "Blaster.h"
- #include "Titledb.h" // TITLEREC_CURRENT_VERSION
- #include "Linkdb.h"
- Blaster g_blaster;
- static void gotDocWrapper1 ( void *state , TcpSocket *s ) ;
- static void gotDocWrapper2 ( void *state , TcpSocket *s ) ;
- static void gotDocWrapper3 ( void *state , TcpSocket *s ) ;
- static void gotDocWrapper4 ( void *state , TcpSocket *s ) ;
- static void sleepWrapper ( int fd , void *state ) ;
- static void sleepWrapperLog(int fd, void *state);
- Blaster::Blaster() {}
- Blaster::~Blaster() {
- if (m_buf1)
- mfree(m_buf1,m_bufSize1,"blaster1");
- if (m_buf2)
- mfree(m_buf2,m_bufSize2,"blaster2");
- }
- bool Blaster::init(){
- // let's ensure our core file can dump
- struct rlimit lim;
- lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
- if ( setrlimit(RLIMIT_CORE,&lim) )
- log("blaster::setrlimit: %s", mstrerror(errno) );
-
- g_conf.m_maxMem = 500000000;
-
- // init our table for doing zobrist hashing
- if ( ! hashinit() ) {
- log("blaster::hashinit failed" ); return 0; }
- // init the memory class after conf since it gets maxMem from Conf
- if ( ! g_mem.init ( ) ) {//200000000 ) ) {
- log("blaster::Mem init failed" ); return 0; }
- // start up log file
- if ( ! g_log.init( "/tmp/blasterLog" ) ) {
- log("blaster::Log open /tmp/blasterLog failed" ); return 0; }
- /*
- // get dns ip from /etc/resolv.conf
- g_conf.m_dnsIps[0] = 0;
- FILE *fd = fopen ( "/etc/resolv.conf" , "r" );
- if ( ! fd ) {
- log("blaster::fopen: /etc/resolve.conf %s",
- mstrerror(errno)); return 0; }
- char tmp[1024];
- while ( fgets ( tmp , 1024 , fd ) ) {
- // tmp buf ptr
- char *p = tmp;
- // skip comments
- if ( *p == '#' ) continue;
- // skip nameserver name
- if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ;
- // skip spaces
- while ( isspace ( *p ) ) p++;
- // if this is not a digit, continue
- if ( ! isdigit(*p) ) continue;
- // get ip
- g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) );
- // done
- break;
- }
- fclose ( fd );
- // if no dns server found, bail
- if ( g_conf.m_dnsIps[0] == 0 ) {
- log("blaster:: no dns ip found in /etc/resolv.conf");return 0;}
- // hack # of dns servers
- g_conf.m_numDns = 1;
- g_conf.m_dnsPorts[0] = 53;
- */
- g_conf.m_askRootNameservers = true;
- //g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
- //g_conf.m_dnsClientPort = 9909;
- g_conf.m_dnsMaxCacheMem = 1024*10;
- // hack http server port to -1 (none)
- //g_conf.m_httpPort = 0;
- g_conf.m_httpMaxSockets = 200;
- //g_conf.m_httpMaxReadBufSize = 102*1024*1024;
- g_conf.m_httpMaxSendBufSize = 16*1024;
- // init the loop
- if ( ! g_loop.init() ) {
- log("blaster::Loop init failed" ); return 0; }
- // . then dns client
- // . server should listen to a socket and register with g_loop
- if ( ! g_dns.init(6000) ) {
- log("blaster::Dns client init failed" ); return 0; }
- // . then webserver
- // . server should listen to a socket and register with g_loop
- if ( ! g_httpServer.init( 8333 , 9334 ) ) {
- log("blaster::HttpServer init failed" ); return 0; }
- return 1;
- }
-
- void Blaster::runBlaster(char *file1,char *file2,
- int32_t maxNumThreads, int32_t wait, bool isLogFile,
- bool verbose,bool justDisplay,
- bool useProxy ,
- bool injectUrlWithLinks ,
- bool injectUrl ) {
- if (!init())
- return;
- m_blasterDiff=true;
- if (!file2)
- m_blasterDiff=false;
-
- // set File class
- File f1;
- f1.set ( file1 );
- // open files
- if ( ! f1.open ( O_RDONLY ) ) {
- log("blaster:open: %s %s",file1,mstrerror(g_errno));
- return;
- }
- // get file size
- int32_t fileSize1 = f1.getFileSize() ;
- // store a \0 at the end
- int32_t m_bufSize1 = fileSize1 + 1;
- m_doInjectionWithLinks = injectUrlWithLinks;
- m_doInjection = injectUrl;
- // make buffers to hold all
- m_buf1 = (char *) mmalloc ( m_bufSize1 , "blaster1" );
- if ( ! m_buf1) {
- log("blaster:mmalloc: %s",mstrerror(errno));
- return;
- }
- //char *bufEnd = buf + bufSize;
- // set m_p1
- m_p1 = m_buf1;
- m_p1end = m_buf1 + m_bufSize1 - 1;
- // read em all in
- if ( ! f1.read ( m_buf1 , fileSize1 , 0 ) ) {
- log("blaster:read: %s %s",file1,mstrerror(g_errno));
- return;
- }
- // change \n to \0
- //char *p = buf;
- int32_t n = 0;
- for ( int32_t i = 0 ; i < m_bufSize1 ; i++ ) {
- if ( m_buf1[i] != '\n' ) continue;
- m_buf1[i] = '\0';
- n++;
- }
- if (m_blasterDiff){
- File f2;
- f2.set ( file2 );
- if ( ! f2.open ( O_RDONLY ) ) {
- log("blaster:open: %s %s",file2,mstrerror(g_errno));
- return;
- }
- int32_t fileSize2 = f2.getFileSize() ;
- int32_t m_bufSize2 = fileSize2 + 1;
- m_buf2 = (char *) mmalloc ( m_bufSize2 , "blaster2" );
- if ( ! m_buf2) {
- log("blaster:mmalloc: %s",mstrerror(errno));
- return;
- }
- // set m_p2
- m_p2 = m_buf2;
- m_p2end = m_buf2 + m_bufSize2 - 1;
- if ( ! f2.read ( m_buf2 , fileSize2 , 0 ) ) {
- log("blaster:read: %s %s",file2,mstrerror(g_errno));
- return;
- }
- int32_t m=0;
- for ( int32_t i = 0 ; i < m_bufSize2 ; i++ ) {
- if ( m_buf2[i] != '\n' ) continue;
- m_buf2[i] = '\0';
- m++;
- }
- // Working on only the least number of urls from both files,
- //because we need to work in pairs
- if (m<n) n=m;
- else m=n;
- m_totalUrls=n;
- // should we print out all the logs?
- m_verbose=verbose;
- // Should we use the proxy for getting the first Doc
- m_useProxy=useProxy;
- // Should we just display the not present links and not fetch
- // the page to see if they are actually present ?
- m_justDisplay=justDisplay;
- }
- else{
- m_isLogFile=isLogFile;
-
- /*if reading a gigablast log file, find the lines that have
- GET and POST commands for search, and register a sleep
- callback for those lines with sleepWrapperLog*/
- if(!isLogFile)
- m_totalUrls=n;
- else {
- m_totalUrls=0;
- char *p=m_buf1;
- char *pend=p+m_bufSize1;
-
- // start is the time in milliseconds of the first log
- // message
- int64_t start=atoll(m_buf1);
- while(p<pend) {
- char *lineStart=p;
- char *urlStart=strstr(p," GET /search");
- if (!urlStart)
- urlStart=strstr(p," POST /search");
- if(!urlStart){
- p+=gbstrlen(p)+1; //goto next line
- continue;
- }
- urlStart++;
- m_wait=atoll(lineStart)-start;
- // register it here
- g_loop.registerSleepCallback(m_wait ,
- urlStart,
- sleepWrapperLog);
- m_totalUrls++;
- p+=gbstrlen(p)+1;
- }
- }
- }
- log(LOG_INIT,"blaster: read %"INT32" urls into memory",
- m_totalUrls );
- if(!isLogFile){
- // get min time between each spider in milliseconds
- m_wait = wait;
-
- // # of threads
- m_maxNumThreads = maxNumThreads;
-
- m_launched=0;
-
- m_portSwitch = 0;
- //if ( argc == 4 ) m_portSwitch = 1;
- //else m_portSwitch = 0;
-
- // start our spider loop
- //startSpidering( );
-
- // wakeup wrapper every X ms
- g_loop.registerSleepCallback ( m_wait , NULL ,
- sleepWrapper );
- }
- // this print to print how many docs have been processed
- m_print=false;
- m_startTime=gettimeofdayInMilliseconds();
- m_totalDone=0;
- // . now start g_loops main interrupt handling loop
- // . it should block forever
- // . when it gets a signal it dispatches to a server or db to handle it
- if ( ! g_loop.runLoop() ) {
- log("blaster::runLoop failed" ); return; }
- // dummy return (0-->normal exit status for the shell)
- return;
- }
- void sleepWrapper ( int fd , void *state ) {
- g_blaster.startBlastering();
- }
- void sleepWrapperLog(int fd, void *state) {
- // unregister the sleepCallback
- g_loop.unregisterSleepCallback(state,sleepWrapperLog);
- g_blaster.processLogFile(state);
- }
- void Blaster:: processLogFile(void *state){
- // No need to print how many docs processed in log
- // because this is called at epochs given in the log
- char *urlStart=(char*)state;
- if (!urlStart){
- log(LOG_WARN,"blaster: got NULL urlStart");
- return;
- }
- // log(LOG_WARN,"blaster:: Line is %s",urlStart);
- char tmp[1024];
- if (urlStart[0]=='P'){ //POST
- // advance by "POST /search HTTP/1.1 " = 22 chars
- urlStart+=22;
- sprintf(tmp,"http://www.gigablast.com/search?%s",urlStart);
- }
- else if (urlStart[0]=='G'){ //GET
- // advance by "GET "= 4 chars
- urlStart+=4;
- char *end=strstr(urlStart," HTTP/1.");
- if (end)
- end[0]='\0';
- sprintf(tmp,"http://www.gigablast.com%s",urlStart);
- }
- // log(LOG_WARN,"blaster: URL=%s",tmp);
- StateBD *st;
- try { st = new (StateBD); }
- catch ( ... ) {
- g_errno = ENOMEM;
- log("blaster: Failed. "
- "Could not allocate %"INT32" bytes for query. "
- "Returning HTTP status of 500.",
- (int32_t)sizeof(StateBD));
- return;
- }
- mnew ( st , sizeof(StateBD) , "BlasterDiff3" );
- //st->m_u1.set(tmp,gbstrlen(tmp));
- st->m_buf1=NULL;
- // get it
- bool status = g_httpServer.getDoc ( tmp, // &(st->m_u1) , // url
- 0 , // ip (none)
- 0 , // offset
- -1 , // size
- 0 , // ifModifiedSince
- st, // state
- gotDocWrapper1, // callback
- 20*1000, // timeout
- 0, // proxy ip
- 0, // proxy port
- 30*1024*1024, //maxLen
- 30*1024*1024);//maxOtherLen
- // continue if it blocked
- if ( status )
- // else there was error
- log("blaster: got doc %s: %s", urlStart,mstrerror(g_errno) );
- return;
- }
-
- void Blaster::startBlastering(){
- int64_t now=gettimeofdayInMilliseconds();
- if(m_print && m_totalDone>0 && (m_totalDone % 20)==0){
- log("blaster: Processed %"INT32" urls in %"INT32" ms",m_totalDone,
- (int32_t) (now-m_startTime));
- m_print=false;
- }
- //Launch the maximum number of threads that are allowed
- while ( m_p1 < m_p1end && m_launched < m_maxNumThreads && m_totalUrls){
- // clear any error
- g_errno = 0;
- // make a new state
- StateBD *st;
- try { st = new (StateBD); }
- catch ( ... ) {
- g_errno = ENOMEM;
- log("blaster: Failed. "
- "Could not allocate %"INT32" bytes for query. "
- "Returning HTTP status of 500.",
- (int32_t)sizeof(StateBD));
- return;
- }
- mnew ( st , sizeof(StateBD) , "BlasterDiff3" );
- st->m_buf1=NULL;
- m_totalUrls--;
- // make into a url class. Set both u1 and u2 here.
- //st->m_u1.set ( m_p1 , gbstrlen(m_p1) );
- st->m_u1 = m_p1;
- // is it an injection url
- if ( m_doInjection || m_doInjectionWithLinks ) {
- // get host #0 i guess
- Host *h0 = g_hostdb.getHost(0);
- if ( ! h0 ) { char *xx=NULL;*xx=0; }
- static bool s_flag = true;
- if ( s_flag ) {
- s_flag = false;
- log("blaster: injecting to host #0 at %s on "
- "http/tcp port %"INT32"",
- iptoa(h0->m_ip),
- (int32_t)h0->m_httpPort);
- }
- // use spiderlinks=1 so we add the outlinks to spiderdb
- // but that will slow the spider rate down since it
- // will have to do a dns lookup on the domain of every
- // outlink.
- st->m_injectUrl.safePrintf("http://127.0.0.1:8000/"
- "admin/inject?");
- if ( m_doInjectionWithLinks )
- st->m_injectUrl.safePrintf("spiderlinks=1&");
- else
- st->m_injectUrl.safePrintf("spiderlinks=0&");
- st->m_injectUrl.safePrintf("u=");
- st->m_injectUrl.urlEncode(m_p1);
- st->m_injectUrl.pushChar('\0');
- st->m_u1 = st->m_injectUrl.getBufStart();
- }
- // skip to next url
- m_p1 += gbstrlen ( m_p1 ) + 1;
- if (m_blasterDiff){
- //st->m_u2.set ( m_p2 , gbstrlen(m_p2) );
- st->m_u2 = m_p2;
- m_p2 += gbstrlen ( m_p2 ) + 1;
- }
- // log(LOG_WARN,"\n");
- log(LOG_WARN,"blaster: Downloading %s",st->m_u1);
- // set port if port switch is true
- //if ( m_portSwitch ) {
- // int32_t r = rand() % 32;
- // u.setPort ( 8000 + r );
- //}
- // count it
- m_launched++;
- int32_t ip=0;
- int32_t port=0;
- if (m_useProxy){
- ip=atoip("66.154.102.20",13);
- port=3128;
- }
- // get it
- bool status = g_httpServer.getDoc ( st->m_u1 , // url
- 0, // ip
- 0 , // offset
- -1 , // size
- 0 , // ifModifiedSince
- st , // state
- gotDocWrapper1, // callback
- 60*1000, // timeout
- ip,
- port,
- 30*1024*1024, //maxLen
- 30*1024*1024);
- // continue if it blocked
- if ( ! status ) continue;
- // If not blocked, there is an error.
- m_launched--;
- // log msg
- log("From file1, got doc1 %s: %s", st->m_u1 ,
- mstrerror(g_errno) );
- // we gotta wait
- break;
- }
- // bail if not done yet
- //if ( m_launched > 0 ) return;
- if (m_totalUrls) return;
- //otherwise return if launched have not come back
- if (m_launched) return;
- // exit now
- // g_conf.save();
- // closeALL(NULL,NULL);
- exit ( 0 );
- }
- void gotDocWrapper1 ( void *state , TcpSocket *s ) {
- g_blaster.gotDoc1(state,s);
- }
- void Blaster::gotDoc1( void *state, TcpSocket *s){
- StateBD *st=(StateBD *)state;
- // Even if we loose the request, still count it as done.
- m_totalDone++;
- m_print=true;
- // bail if got cut off
- if ( s->m_readOffset == 0 ) {
- log("blaster: lost the Request in gotDoc1");
- m_launched--;
- freeStateBD(st);
- return;
- }
- //if we are not doing diff
- if (!m_blasterDiff){
- m_launched--;
- }
- int64_t now = gettimeofdayInMilliseconds();
- // get hash
- char *reply = s->m_readBuf ;
- int32_t size = s->m_readOffset;
- HttpMime mime;
- mime.set ( reply , size , NULL );
- char *content = reply + mime.getMimeLen();
- int32_t contentLen = size - mime.getMimeLen();
- uint32_t h = hash32 ( content , contentLen );
- // log msg
- if ( g_errno )
- logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) %s : %s",
- s->m_readOffset ,
- (int32_t)(now - s->m_startTime) ,
- st->m_u1 ,
- mstrerror(g_errno) );
- else
- logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) "
- "(hash=%"XINT32") %s",
- s->m_readOffset ,
- (int32_t)(now - s->m_startTime) ,
- h ,
- st->m_u1 );
- if (!m_blasterDiff){
- // try to launch another if not using log file
- freeStateBD(st);
- if (!m_isLogFile){
- startBlastering();
- }
- if (m_isLogFile && --m_totalUrls==0) exit(0);
- return;
- }
- // Store the buffer from socket so that it does not get destroyed
- // at the end. Also, add another space because in gotDoc2 xml.set
- // demands the content to be null ended, so we need to store the
- // null character there. So as a precaution, just allocating the
- // max buf size.
- st->m_buf1=(char*) mcalloc(s->m_readBufSize,"Blaster5");
- gbmemcpy(st->m_buf1,s->m_readBuf,s->m_readOffset);
- //st->m_buf1=(char*) mdup(s->m_readBuf,s->m_readOffset,"Blaster5");
- st->m_buf1Len=s->m_readOffset;
- st->m_buf1MaxLen=s->m_readBufSize;
- // . don't let TcpServer free m_buf when socket is recycled/closed
- // . we own it now and are responsible for freeing it. DON'T do this
- // because I believe this makes malloc crash, since TcpServer says
- // that it has freed the memory so malloc tries to allocate wrong
- // memory and gives a seg fault.
- // s->m_readBuf = NULL;
-
- log(LOG_WARN,"blaster: Downloading %s",st->m_u2);
- //char *ss="www.gigablast.com/search?q=hoopla&code=gbmonitor";
- // st->m_u2.set(ss,gbstrlen(ss));
- // get it
- bool status = g_httpServer.getDoc ( st->m_u2 , // url
- 0,//ip
- 0 , // offset
- -1 , // size
- 0 , // ifModifiedSince
- st , // state
- gotDocWrapper2, // callback
- 60*1000, // timeout
- 0,//atoip("66.154.102.20",13),//proxy ip
- 0,//3128,//80, // proxy port
- 30*1024*1024, //maxLen
- 30*1024*1024);//maxOtherLen
- // continue if it blocked
- if ( ! status ) return;
- // If not blocked, there is an error.
- m_launched--;
- // log msg
- log("From file2, gotdoc2 %s: %s", st->m_u2,
- mstrerror(g_errno) );
- // No need to point p2 ahead because already been done
- // Free stateBD
- freeStateBD(st);
- return;
-
- }
- void gotDocWrapper2 ( void *state , TcpSocket *s ) {
- g_blaster.gotDoc2(state,s);
- }
- void Blaster::gotDoc2 ( void *state, TcpSocket *s){
- StateBD *st=(StateBD *)state;
- // bail if got cut off
- if ( s->m_readOffset == 0 ) {
- log("blaster: Lost the Request in gotDoc2");
- m_launched--;
- //No need to point p2
- // Free stateBD
- freeStateBD(st);
- return;
- }
-
- // . don't let TcpServer free m_buf when socket is recycled/closed
- // . we own it now and are responsible for freeing it
- // s->m_readBuf = NULL;
- int64_t now = gettimeofdayInMilliseconds();
- // So now after getting both docIds, get their contents
- char *reply1 = st->m_buf1 ;
- int32_t size1 = st->m_buf1Len;
- HttpMime mime1;
- mime1.set ( reply1 , size1 , NULL );
- char *content1 = reply1 + mime1.getMimeLen();
- int32_t content1Len = size1 - mime1.getMimeLen();
- uint32_t h = hash32 ( content1 , content1Len );
- // log msg
- if ( g_errno )
- logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) %s : %s",
- s->m_readOffset ,
- (int32_t)(now - s->m_startTime) ,
- st->m_u2 ,
- mstrerror(g_errno) );
- else
- logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) "
- "(hash=%"XINT32") %s",
- s->m_readOffset ,
- (int32_t)(now - s->m_startTime) ,
- h ,
- st->m_u2 );
- if (m_verbose){
- log(LOG_WARN,"blaster: content1len=%"INT32", Content1 is =%s",
- content1Len,content1);
- log(LOG_WARN,"\n");
- }
- char *reply2 = s->m_readBuf ;
- int32_t size2 = s->m_readOffset;
- HttpMime mime2;
- mime2.set ( reply2 , size2 , NULL );
- char *content2 = reply2 + mime2.getMimeLen();
- int32_t content2Len = size2 - mime2.getMimeLen();
- if (m_verbose)
- log(LOG_WARN,"blaster: content2len=%"INT32", Content2 is =%s",
- content2Len,content2);
- // Now that we've got the contents, lets get the url links out
- // of these pages.Passing them to function getSearchLinks should
- // get the first x links found out.
- /* st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3");
- st->m_links2=st->m_links1+100*MAX_URL_LEN;
- st->m_numLinks1=100;
- st->m_numLinks2=100;*/
- /* int32_t numLinks1=getSearchLinks(content1,content1Len,
- st->m_links1,st->m_numLinks1);
- int32_t numLinks2=getSearchLinks(content2,content2Len,
- st->m_links2,st->m_numLinks2);*/
- content1[content1Len]='\0';
- //int16_t csEnum1= get_iana_charset(mime1.getCharset(),
- // mime1.getCharsetLen());
- /* if (csEnum1== csUnknown)
- log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
- Xml xml1;
- // assume utf8
- if (!xml1.set(content1,
- content1Len,
- false,
- 0,
- false,
- TITLEREC_CURRENT_VERSION ,
- true , // set parents
- 0 , // niceness
- CT_XML )){ // content type
- log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
- }
- Links links1;
- Url parent; parent.set ( st->m_u1);
- if (!links1.set(false , // userellnofollow
- &xml1,
- &parent,//mime1.getLocationUrl(), parent Url
- false, // setLinkHashes
- NULL , // baseUrl
- TITLEREC_CURRENT_VERSION, // version
- 0 , // niceness
- false , // parent is permalink?
- NULL )) { // oldLinks
- log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
- }
- content2[content2Len]='\0';
- //int16_t csEnum2= get_iana_charset(mime2.getCharset(),
- // mime2.getCharsetLen());
- /* if (csEnum2== csUnknown)
- log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
- Xml xml2;
- if (!xml2.set(content2,
- content2Len,
- false,
- 0,
- false,
- TITLEREC_CURRENT_VERSION,
- true , // setparents
- 0 , // niceness
- CT_XML )){
- log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
- }
- Links links2;
- parent.set(st->m_u2);
- if (!links2.set(0,//siterec xml
- &xml2,
- &parent,//&st->m_u2,//mime2.getLocationUrl(),
- false,
- NULL,
- TITLEREC_CURRENT_VERSION,
- 0,
- false,
- NULL)){
- log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
- }
-
- // put the hash of the sites into a hashtable, since we have
- // about a 100 or so of them
- HashTableT<uint32_t, bool> urlHash;
- // put the urls from doc2 into the hastable, but first check if
- // they are links to google or gigablast (for now). For msn and
- // yahoo we have to add other checks.
- char domain2[256];
- int32_t dlen = 0;
- char *dom = getDomFast ( st->m_u2 , &dlen );
- if ( dom ) strncpy(domain2,dom,dlen);
- domain2[dlen]='\0';
- for (int32_t i=0;i<links2.getNumLinks();i++){
- // The dots check if exactly google or gigablast are present
- // in the link
- char *ss=links2.getLink(i);
- char *p;
- p=strstr(ss,domain2);
- if(p) continue;
- p=strstr(ss,"google.");
- if(p) continue;
- p=strstr(ss,"cache:"); //googles cache page
- if(p) continue;
- p= strstr(ss,"gigablast.");
- if(p) continue;
- p= strstr(ss,"web.archive.org");//older copies on gigablast
- if(p) continue;
- p= strstr(ss,"search.yahoo.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"search.msn.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"s.teoma.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"search.dmoz.org");//from gigablast search
- if(p) continue;
- p= strstr(ss,"www.answers.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"cc.msncache.com");//msn's cache page
- if(p) continue;
- if (m_verbose)
- log(LOG_WARN,"blaster: link in Doc2=%s"
- ,links2.getLink(i));
- uint32_t h=hash32Lower_a(links2.getLink(i),
- links2.getLinkLen(i));
- //should i check for conflict. no, because it doesn't matter
- urlHash.addKey(h,1);
- }
- // now check if the urls from doc1 are in doc2. save the
- // ones that are not
- // in there for later.
- /* int32_t numUrlsToCheck=links2.getNumLinks();*/
- int32_t numUrlsNotFound=0;
- /*if (numLinks1<numUrlsToCheck)
- numUrlsToCheck=numLinks1;*/
- char domain1[256];
- dlen = 0;
- dom = getDomFast ( st->m_u1 ,&dlen );
- if ( dom ) strncpy(domain1,dom,dlen);
- domain1[dlen]='\0';
- for (int32_t i=0;i<links1.getNumLinks();i++){
- char *ss=links1.getLink(i);
- char *p;
- p=strstr(ss,domain1);
- if(p) continue;
- p=strstr(ss,"google.");
- if(p) continue;
- p=strstr(ss,"cache:"); //googles cache page
- if(p) continue;
- p= strstr(ss,"gigablast.");
- if(p) continue;
- p= strstr(ss,"web.archive.org");//older copies on gigablast
- if(p) continue;
- p= strstr(ss,"search.yahoo.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"search.msn.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"s.teoma.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"search.dmoz.org");//from gigablast search
- if(p) continue;
- p= strstr(ss,"www.answers.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"cc.msncache.com");//msn's cache page
- if(p) continue;
- if (m_verbose)
- log(LOG_WARN,"blaster: link in Doc1=%s"
- ,links1.getLink(i));
- uint32_t h=hash32Lower_a(links1.getLink(i),
- links1.getLinkLen(i));
- int32_t slot= urlHash.getSlot(h);
- if(slot!=-1) continue;
- // if url is not present, get its doc.
- if (m_verbose || m_justDisplay)
- log(LOG_WARN,"blaster: NOT FOUND %s in %s"
- ,links1.getLink(i),domain2);
- numUrlsNotFound++;
- //Don't do anything else if just have to display the urls
- if (m_justDisplay) continue;
- //now get the doc of these urls
- //initialize
- st->m_numUrlDocsReceived=0;
- StateBD2 *st2;
- try { st2 = new (StateBD2); }
- catch ( ... ) {
- g_errno = ENOMEM;
- log("blaster: Failed. "
- "Could not allocate %"INT32" bytes for query. "
- "Returning HTTP status of 500.",
- (int32_t)sizeof(StateBD2));
- return;
- }
- mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
- //Point to the big state;
- st2->m_st=st;
- //Msg16 does 6 redirects, so I do 6 too
- st2->m_numRedirects=6;
- //st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
- st2->m_url = links1.getLink(i);
- // No need for a proxy ip here, since we are fetching
- // doc's from different IPs. Faster this way
- bool status = g_httpServer.getDoc ( st2->m_url, // url
- 0,//ip
- 0 , // offset
- -1 , // size
- 0 , // ifModifiedSince
- st2, // state
- gotDocWrapper3, // callback
- 60*1000, // timeout
- 0, // proxy ip
- 0, // proxy port
- 30*1024*1024, //maxLen
- 30*1024*1024);//maxOtherLen
- // continue if it blocked
- if ( ! status ) continue;
- // If not blocked, there is an error.
- st->m_numUrlDocsReceived++;
- }
- st->m_numUrlDocsSent=numUrlsNotFound;
- //There might have been an error while sending the docs, so if there
- //has been put a check
- if ( st->m_numUrlDocsReceived > 0 &&
- st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){
- log(LOG_WARN,"blaster: %"INT32" docs could not be sent due to "
- "error",st->m_numUrlDocsReceived);
- m_launched--;
- freeStateBD(st);
- return;
- }
-
- if (numUrlsNotFound==0){
- //job done for this pair
- log(LOG_WARN,"blaster: All urls from %s found in "
- "%s",domain1,domain2);
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- return;
- }
- log(LOG_WARN,"blaster: %"INT32" urls from %s Not found in %s",
- numUrlsNotFound,domain1,domain2);
- if(m_justDisplay){
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- }
- return;
- }
// This is not a generic function as yet. Gigablast stores the link in tag
// <span class="url"> and google stores it in tag <font color=#008000>.
// This function is not being used as yet (it is compiled out) because Xml
// and Links are used instead.
#if 0
// Copies the text found between each openTag ... closeTag pair, starting
// the search at p, into consecutive MAX_URL_LEN-byte slots of "links".
// Stops after numLinks urls, when p reaches pend, or when no further pair
// is found. Urls that do not fit a slot are truncated to MAX_URL_LEN-1
// bytes; every slot is NUL-terminated. Returns the number of urls stored.
static int32_t extractTaggedUrls ( char *p ,
				   char *pend ,
				   const char *openTag ,
				   const char *closeTag ,
				   char *links ,
				   int32_t numLinks ) {
	const int32_t openLen = (int32_t)strlen(openTag);
	int32_t linksFound = 0;
	while ( p && p < pend && linksFound < numLinks ) {
		p = strstr ( p , openTag );
		if ( ! p ) break;
		char *p2 = strstr ( p , closeTag );
		if ( ! p2 ) break;
		// point to the url
		p += openLen;
		// keep it inside its slot, leaving room for the '\0'
		int32_t length = p2 - p;
		if ( length >= MAX_URL_LEN ) length = MAX_URL_LEN - 1;
		char *slot = links + linksFound * MAX_URL_LEN;
		strncpy ( slot , p , length );
		slot[length] = '\0';
		log(LOG_WARN,"blaster:the url is=%s",slot);
		linksFound++;
	}
	return linksFound;
}

// Pulls the result urls out of a search results page.
//
// content, contentLen  the page; must be NUL-terminated (strstr is used)
// links                out: numLinks slots of MAX_URL_LEN bytes each
// numLinks             capacity of "links" in urls
//
// The page is treated as a gigablast page if it contains
// <span class="url">, otherwise as a google page.
// Returns the number of urls stored in "links".
int32_t Blaster::getSearchLinks(char *content,
				int32_t contentLen,
				char *links,
				int32_t numLinks){
	if (contentLen<19) {
		log(LOG_WARN,"blaster: Contentlen is less");
		return 0;
	}
	char *pend = content + contentLen;
	// gigablast page?
	if ( strstr(content,"<span class=\"url\">") )
		return extractTaggedUrls ( content , pend ,
					   "<span class=\"url\">" ,
					   "</span>" ,
					   links , numLinks );
	// otherwise assume google
	return extractTaggedUrls ( content , pend ,
				   "<font color=#008000>" ,
				   "</font>" ,
				   links , numLinks );
}
#endif
- void gotDocWrapper3 ( void *state , TcpSocket *s ) {
- g_blaster.gotDoc3(state,s);
- }
- void Blaster::gotDoc3 ( void *state, TcpSocket *s){
- StateBD2 *st2=(StateBD2 *)state;
- StateBD *st=st2->m_st;
- if (!s) {
- log (LOG_WARN,"blaster: Got a null s in gotDoc3."
- "Happened because ip could not be found");
- st->m_numUrlDocsReceived++;
- //Free StateBD2
- mdelete(st2,sizeof(StateBD2),"Blaster4");
- if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- }
- return;
- }
- // bail if got cut off
- if ( s->m_readOffset == 0 ) {
- log("blasterDiff : lost the Request in gotDoc3");
- st->m_numUrlDocsReceived++;
- //Free StateBD2
- mdelete(st2,sizeof(StateBD2),"Blaster4");
- if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- }
- return;
- }
- char *reply = s->m_readBuf ;
- int32_t size = s->m_readOffset;
- HttpMime mime;
- mime.set(reply,size,NULL);
- int32_t httpStatus=mime.getHttpStatus();
- if(httpStatus==404){
- if (m_verbose)
- log(LOG_WARN,"blaster: The page was not found - 404");
- st->m_numUrlDocsReceived++;
- }
- // If the url is a redirect check if it is still http (might have
- // become https or something else, in which case we aren't going to
- // follow it
- else if (httpStatus>=300){
- Url *u=mime.getLocationUrl();
- //If max number of redirects done, bail
- if(!st2->m_numRedirects--){
- log(LOG_WARN,"blaster: Max number of redirects "
- "reached.");
- st->m_numUrlDocsReceived++;
- }
- //check if it is still http (might have become https or
- // something else, in which case we aren't going to follow it
- else if (!u->isHttp()){
- log(LOG_WARN,"blaster: Redirection not for an http "
- "page for url %s",u->getUrl());
- st->m_numUrlDocsReceived++;
- }
- // sometimes idiots don't supply us with a Location: mime
- else if ( u->getUrlLen() == 0 ) {
- log(LOG_WARN,"blaster: Redirect url is of 0 length");
- st->m_numUrlDocsReceived++;
- }
- else{
- // I'm not checking as yet if the redirect url is the
- // same as the earlier url, as I've set the max number
- // of redirs to 6 Now lets get the redirect url. Do not
- // increase the numDocsReceived because this wrapper
- // will be called back for the page
- if (m_verbose)
- log(LOG_WARN,"blaster: Downloading redirect"
- " %s",u->getUrl());
- //Changing the url to the new place
- //st2->m_url.set(u,false);
- st2->m_url = u->getUrl();
- bool status = g_httpServer.getDoc (st2->m_url, // url
- 0,//ip
- 0 , // offset
- -1 , // size
- 0 ,
- st2 , // state
- gotDocWrapper3,
- 60*1000, // timeout
- 0, // proxy ip
- 0, // proxy port
- 30*1024*1024, //maxLen
- 30*1024*1024);
- // If not blocked, there is an error.
- if (status )
- st->m_numUrlDocsReceived++;
- }
- }
- else if(httpStatus<200){
- log(LOG_WARN,"blaster: Bad HTTP status %"INT32"",httpStatus);
- st->m_numUrlDocsReceived++;
- }
- else{
- // This means the page is still there, somewhere. Status must
- // be 200 So find it on server2. This server is assumed to be
- // running an instance of gb, so it shall be given the query in
- // the format 'xxxxx.com/search?q=url%3Ayyyy&code=gbmonitor.
- // Then check if we have the exact page in the search results
- // that have come back. So now the problem is that we do
- // not know which url has been got. So I get the location
- // url from mime.
- // The site name is in st->m_u2.getSite()
- // But copy it because it is not nulled.
- char tmp[1024];
- //char site[1024];//how long could a site be?
- int32_t siteLen = 0;
- char *site = getHostFast(st->m_u2,&siteLen);
- char c = site[siteLen];
- site[siteLen] = 0;
- //strncpy(site,st->m_u2.getSite(),
- // st->m_u2.getSiteLen());
- //site[st->m_u2.getSiteLen()]='\0';
- sprintf(tmp,"%ssearch?"
- "code=gbmonitor&"
- "q=url%%3A%s",site,st2->m_url);
- site[siteLen] = c;
- if (m_verbose)
- log(LOG_WARN,"blaster: Checking %s",tmp);
- //Url u;
- //u.set(tmp,gbstrlen(tmp));
- //Now get the doc
- bool status = g_httpServer.getDoc ( tmp,//&u,
- 0,//ip
- 0, // offset
- -1 , // size
- 0 ,
- st , // state
- gotDocWrapper4,
- 60*1000, // timeout
- 0,//atoip("66.154.102.20",13),//proxy ip
- 0,//3128,//proxy port
- 30*1024*1024,
- 30*1024*1024);
- // continue if it blocked
- // If not blocked, there is an error. Since we are
- // getting the doc from a gigablast server, report it
- if (status ){
- st->m_numUrlDocsReceived++;
- log(LOG_WARN,"blaster: could not get back"
- "%s from server in gotDoc3",tmp);
- }
- }
- // If we reached here, that means all the url redirects have been
- // finished, and there is no need for st2. Free it
- mdelete(st2,sizeof(StateBD2),"Blaster4");
- if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- }
- return;
- }
- void gotDocWrapper4 ( void *state , TcpSocket *s ) {
- g_blaster.gotDoc4(state,s);
- }
- void Blaster::gotDoc4 ( void *state, TcpSocket *s){
- StateBD *st=(StateBD *)state;
- st->m_numUrlDocsReceived++;
- if (!s) {
- //Shouldn't happen, but still putting a checkpoint
- log (LOG_WARN,"blaster: Got a null s in gotDoc4."
- "Happened because ip could not be found for gigablast"
- "server");
- if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- }
- return;
- }
- // bail if got cut off
- if ( s->m_readOffset == 0 ) {
- log("blasterDiff : lost the Request in gotDoc4");
- if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
- m_launched--;
- freeStateBD(st);
- }
- return;
- }
- char *reply = s->m_readBuf ;
- int32_t size = s->m_readOffset;
- HttpMime mime;
- mime.set ( reply , size , NULL );
- char *content = reply + mime.getMimeLen();
- int32_t contentLen = size - mime.getMimeLen();
- //int16_t csEnum = get_iana_charset(mime.getCharset(),
- // mime.getCharsetLen());
- /* if (csEnum == csUnknown)
- log(LOG_DEBUG, "blaster: Unknown charset : %s", mime.getCharset());*/
-
- Xml xml;
- if (!xml.set(
- content,
- contentLen,
- false,
- 0,
- false,
- TITLEREC_CURRENT_VERSION,
- true, // setparents
- 0, // niceness
- CT_XML )){
- log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
- }
- Links links;
- Url *url=mime.getLocationUrl();
- if (!links.set(0,//siterec xml
- &xml,
- url,
- false,
- NULL,
- TITLEREC_CURRENT_VERSION,
- 0,
- false,
- NULL)){
- log(LOG_WARN, "blaster: Couldn't set Links class in gotDoc4");
- }
- for (int32_t i=0;i<links.getNumLinks();i++){
- char *ss=links.getLink(i);
- char *p;
- // This page *should* always be a gigablast page. So not adding
- // checks for msn or yahoo or google page.
- p=strstr(ss,"google.");
- if(p) continue;
- p=strstr(ss,"cache:"); //googles cache page
- if(p) continue;
- p= strstr(ss,"gigablast.");
- if(p) continue;
- p= strstr(ss,"web.archive.org");//older copies on gigablast
- if(p) continue;
- p= strstr(ss,"search.yahoo.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"search.msn.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"s.teoma.com");//from gigablast search
- if(p) continue;
- p= strstr(ss,"search.dmoz.org");//from gigablast search
- if(p) continue;
- p= strstr(ss,"www.answers.com");//from gigablast search
- if(p) continue;
- if (m_verbose)
- log(LOG_WARN,"blaster: Link Present on server2=%s",ss);
- }
-
- // So if one of the links that is returned is the exact url,
- // then we know that the url is present.So get the url from the
- // mime, search for it in the links that are returned.
- char tmp[1024];
- char *sendBuf=s->m_sendBuf;
- char *p1,*p2;
- // First get the Host, which is the domain. Since socket s is going to
- // be useless after this function, changing m_sendBuf instead of using
- // more space
- p1=strstr(sendBuf,"%3A");
- if(p1){
- p1+=3;
- p2=strstr(p1," HTTP");
- if (p2){
- //Since I do not care about the sendbuf anymore
- *p2='\0';
- }
- }
- if (!p1 || !p2){
- log(LOG_WARN,"blasterdiff: Could not find search link"
- "from m_sendBuf in gotdoc4");
- }
- else{
- sprintf(tmp,"%s",p1);
- //log(LOG_WARN,"blaster: tmp in gotDoc4 = %s",tmp);
- bool isFound=false;
- // So now we search for tmp in the links
- for (int32_t i=0;i<links.getNumLinks();i++){
- if(strstr(links.getLink(i),tmp) &&
- links.getLinkLen(i)==(int)gbstrlen(tmp)){
- isFound=true;
- log(LOG_WARN,"blaster: %s in results1 but not"
- " in results2 for query %s but does exist"
- " in server2",tmp,st->m_u1);//->getQuery()
- }
- }
- if (!isFound)
- log(LOG_WARN,"blaster: %s in results1 but not"
- " in results2 for query %s and does NOT exist"
- " in server2",tmp,st->m_u1); // ->getQuery()
- }
-
- if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
- m_launched--;
- // Free stateBD
- freeStateBD(st);
- }
- return;
- }
- void Blaster::freeStateBD(StateBD *st){
- // Free stateBD's buf
- if (!st) return;
- if (st->m_buf1)
- mfree(st->m_buf1,st->m_buf1MaxLen,"Blaster5");
- mdelete(st,sizeof(StateBD),"Blaster3");
- }