PageRenderTime 2003ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/Blaster.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 1285 lines | 887 code | 92 blank | 306 comment | 176 complexity | 9271c9afd5c844ce5bbb58a22b3bca01 MD5 | raw file
Possible License(s): Apache-2.0
  1. // Matt Wells, copyright Sep 2001
  2. // the main program that brings it all together
  3. #include "gb-include.h"
  4. #include "Blaster.h"
  5. #include "Titledb.h" // TITLEREC_CURRENT_VERSION
  6. #include "Linkdb.h"
  7. Blaster g_blaster;
  8. static void gotDocWrapper1 ( void *state , TcpSocket *s ) ;
  9. static void gotDocWrapper2 ( void *state , TcpSocket *s ) ;
  10. static void gotDocWrapper3 ( void *state , TcpSocket *s ) ;
  11. static void gotDocWrapper4 ( void *state , TcpSocket *s ) ;
  12. static void sleepWrapper ( int fd , void *state ) ;
  13. static void sleepWrapperLog(int fd, void *state);
  14. Blaster::Blaster() {}
  15. Blaster::~Blaster() {
  16. if (m_buf1)
  17. mfree(m_buf1,m_bufSize1,"blaster1");
  18. if (m_buf2)
  19. mfree(m_buf2,m_bufSize2,"blaster2");
  20. }
  21. bool Blaster::init(){
  22. // let's ensure our core file can dump
  23. struct rlimit lim;
  24. lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
  25. if ( setrlimit(RLIMIT_CORE,&lim) )
  26. log("blaster::setrlimit: %s", mstrerror(errno) );
  27. g_conf.m_maxMem = 500000000;
  28. // init our table for doing zobrist hashing
  29. if ( ! hashinit() ) {
  30. log("blaster::hashinit failed" ); return 0; }
  31. // init the memory class after conf since it gets maxMem from Conf
  32. if ( ! g_mem.init ( ) ) {//200000000 ) ) {
  33. log("blaster::Mem init failed" ); return 0; }
  34. // start up log file
  35. if ( ! g_log.init( "/tmp/blasterLog" ) ) {
  36. log("blaster::Log open /tmp/blasterLog failed" ); return 0; }
  37. /*
  38. // get dns ip from /etc/resolv.conf
  39. g_conf.m_dnsIps[0] = 0;
  40. FILE *fd = fopen ( "/etc/resolv.conf" , "r" );
  41. if ( ! fd ) {
  42. log("blaster::fopen: /etc/resolve.conf %s",
  43. mstrerror(errno)); return 0; }
  44. char tmp[1024];
  45. while ( fgets ( tmp , 1024 , fd ) ) {
  46. // tmp buf ptr
  47. char *p = tmp;
  48. // skip comments
  49. if ( *p == '#' ) continue;
  50. // skip nameserver name
  51. if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ;
  52. // skip spaces
  53. while ( isspace ( *p ) ) p++;
  54. // if this is not a digit, continue
  55. if ( ! isdigit(*p) ) continue;
  56. // get ip
  57. g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) );
  58. // done
  59. break;
  60. }
  61. fclose ( fd );
  62. // if no dns server found, bail
  63. if ( g_conf.m_dnsIps[0] == 0 ) {
  64. log("blaster:: no dns ip found in /etc/resolv.conf");return 0;}
  65. // hack # of dns servers
  66. g_conf.m_numDns = 1;
  67. g_conf.m_dnsPorts[0] = 53;
  68. */
  69. g_conf.m_askRootNameservers = true;
  70. //g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
  71. //g_conf.m_dnsClientPort = 9909;
  72. g_conf.m_dnsMaxCacheMem = 1024*10;
  73. // hack http server port to -1 (none)
  74. //g_conf.m_httpPort = 0;
  75. g_conf.m_httpMaxSockets = 200;
  76. //g_conf.m_httpMaxReadBufSize = 102*1024*1024;
  77. g_conf.m_httpMaxSendBufSize = 16*1024;
  78. // init the loop
  79. if ( ! g_loop.init() ) {
  80. log("blaster::Loop init failed" ); return 0; }
  81. // . then dns client
  82. // . server should listen to a socket and register with g_loop
  83. if ( ! g_dns.init(6000) ) {
  84. log("blaster::Dns client init failed" ); return 0; }
  85. // . then webserver
  86. // . server should listen to a socket and register with g_loop
  87. if ( ! g_httpServer.init( 8333 , 9334 ) ) {
  88. log("blaster::HttpServer init failed" ); return 0; }
  89. return 1;
  90. }
  91. void Blaster::runBlaster(char *file1,char *file2,
  92. int32_t maxNumThreads, int32_t wait, bool isLogFile,
  93. bool verbose,bool justDisplay,
  94. bool useProxy ,
  95. bool injectUrlWithLinks ,
  96. bool injectUrl ) {
  97. if (!init())
  98. return;
  99. m_blasterDiff=true;
  100. if (!file2)
  101. m_blasterDiff=false;
  102. // set File class
  103. File f1;
  104. f1.set ( file1 );
  105. // open files
  106. if ( ! f1.open ( O_RDONLY ) ) {
  107. log("blaster:open: %s %s",file1,mstrerror(g_errno));
  108. return;
  109. }
  110. // get file size
  111. int32_t fileSize1 = f1.getFileSize() ;
  112. // store a \0 at the end
  113. int32_t m_bufSize1 = fileSize1 + 1;
  114. m_doInjectionWithLinks = injectUrlWithLinks;
  115. m_doInjection = injectUrl;
  116. // make buffers to hold all
  117. m_buf1 = (char *) mmalloc ( m_bufSize1 , "blaster1" );
  118. if ( ! m_buf1) {
  119. log("blaster:mmalloc: %s",mstrerror(errno));
  120. return;
  121. }
  122. //char *bufEnd = buf + bufSize;
  123. // set m_p1
  124. m_p1 = m_buf1;
  125. m_p1end = m_buf1 + m_bufSize1 - 1;
  126. // read em all in
  127. if ( ! f1.read ( m_buf1 , fileSize1 , 0 ) ) {
  128. log("blaster:read: %s %s",file1,mstrerror(g_errno));
  129. return;
  130. }
  131. // change \n to \0
  132. //char *p = buf;
  133. int32_t n = 0;
  134. for ( int32_t i = 0 ; i < m_bufSize1 ; i++ ) {
  135. if ( m_buf1[i] != '\n' ) continue;
  136. m_buf1[i] = '\0';
  137. n++;
  138. }
  139. if (m_blasterDiff){
  140. File f2;
  141. f2.set ( file2 );
  142. if ( ! f2.open ( O_RDONLY ) ) {
  143. log("blaster:open: %s %s",file2,mstrerror(g_errno));
  144. return;
  145. }
  146. int32_t fileSize2 = f2.getFileSize() ;
  147. int32_t m_bufSize2 = fileSize2 + 1;
  148. m_buf2 = (char *) mmalloc ( m_bufSize2 , "blaster2" );
  149. if ( ! m_buf2) {
  150. log("blaster:mmalloc: %s",mstrerror(errno));
  151. return;
  152. }
  153. // set m_p2
  154. m_p2 = m_buf2;
  155. m_p2end = m_buf2 + m_bufSize2 - 1;
  156. if ( ! f2.read ( m_buf2 , fileSize2 , 0 ) ) {
  157. log("blaster:read: %s %s",file2,mstrerror(g_errno));
  158. return;
  159. }
  160. int32_t m=0;
  161. for ( int32_t i = 0 ; i < m_bufSize2 ; i++ ) {
  162. if ( m_buf2[i] != '\n' ) continue;
  163. m_buf2[i] = '\0';
  164. m++;
  165. }
  166. // Working on only the least number of urls from both files,
  167. //because we need to work in pairs
  168. if (m<n) n=m;
  169. else m=n;
  170. m_totalUrls=n;
  171. // should we print out all the logs?
  172. m_verbose=verbose;
  173. // Should we use the proxy for getting the first Doc
  174. m_useProxy=useProxy;
  175. // Should we just display the not present links and not fetch
  176. // the page to see if they are actually present ?
  177. m_justDisplay=justDisplay;
  178. }
  179. else{
  180. m_isLogFile=isLogFile;
  181. /*if reading a gigablast log file, find the lines that have
  182. GET and POST commands for search, and register a sleep
  183. callback for those lines with sleepWrapperLog*/
  184. if(!isLogFile)
  185. m_totalUrls=n;
  186. else {
  187. m_totalUrls=0;
  188. char *p=m_buf1;
  189. char *pend=p+m_bufSize1;
  190. // start is the time in milliseconds of the first log
  191. // message
  192. int64_t start=atoll(m_buf1);
  193. while(p<pend) {
  194. char *lineStart=p;
  195. char *urlStart=strstr(p," GET /search");
  196. if (!urlStart)
  197. urlStart=strstr(p," POST /search");
  198. if(!urlStart){
  199. p+=gbstrlen(p)+1; //goto next line
  200. continue;
  201. }
  202. urlStart++;
  203. m_wait=atoll(lineStart)-start;
  204. // register it here
  205. g_loop.registerSleepCallback(m_wait ,
  206. urlStart,
  207. sleepWrapperLog);
  208. m_totalUrls++;
  209. p+=gbstrlen(p)+1;
  210. }
  211. }
  212. }
  213. log(LOG_INIT,"blaster: read %"INT32" urls into memory",
  214. m_totalUrls );
  215. if(!isLogFile){
  216. // get min time between each spider in milliseconds
  217. m_wait = wait;
  218. // # of threads
  219. m_maxNumThreads = maxNumThreads;
  220. m_launched=0;
  221. m_portSwitch = 0;
  222. //if ( argc == 4 ) m_portSwitch = 1;
  223. //else m_portSwitch = 0;
  224. // start our spider loop
  225. //startSpidering( );
  226. // wakeup wrapper every X ms
  227. g_loop.registerSleepCallback ( m_wait , NULL ,
  228. sleepWrapper );
  229. }
  230. // this print to print how many docs have been processed
  231. m_print=false;
  232. m_startTime=gettimeofdayInMilliseconds();
  233. m_totalDone=0;
  234. // . now start g_loops main interrupt handling loop
  235. // . it should block forever
  236. // . when it gets a signal it dispatches to a server or db to handle it
  237. if ( ! g_loop.runLoop() ) {
  238. log("blaster::runLoop failed" ); return; }
  239. // dummy return (0-->normal exit status for the shell)
  240. return;
  241. }
  242. void sleepWrapper ( int fd , void *state ) {
  243. g_blaster.startBlastering();
  244. }
  245. void sleepWrapperLog(int fd, void *state) {
  246. // unregister the sleepCallback
  247. g_loop.unregisterSleepCallback(state,sleepWrapperLog);
  248. g_blaster.processLogFile(state);
  249. }
  250. void Blaster:: processLogFile(void *state){
  251. // No need to print how many docs processed in log
  252. // because this is called at epochs given in the log
  253. char *urlStart=(char*)state;
  254. if (!urlStart){
  255. log(LOG_WARN,"blaster: got NULL urlStart");
  256. return;
  257. }
  258. // log(LOG_WARN,"blaster:: Line is %s",urlStart);
  259. char tmp[1024];
  260. if (urlStart[0]=='P'){ //POST
  261. // advance by "POST /search HTTP/1.1 " = 22 chars
  262. urlStart+=22;
  263. sprintf(tmp,"http://www.gigablast.com/search?%s",urlStart);
  264. }
  265. else if (urlStart[0]=='G'){ //GET
  266. // advance by "GET "= 4 chars
  267. urlStart+=4;
  268. char *end=strstr(urlStart," HTTP/1.");
  269. if (end)
  270. end[0]='\0';
  271. sprintf(tmp,"http://www.gigablast.com%s",urlStart);
  272. }
  273. // log(LOG_WARN,"blaster: URL=%s",tmp);
  274. StateBD *st;
  275. try { st = new (StateBD); }
  276. catch ( ... ) {
  277. g_errno = ENOMEM;
  278. log("blaster: Failed. "
  279. "Could not allocate %"INT32" bytes for query. "
  280. "Returning HTTP status of 500.",
  281. (int32_t)sizeof(StateBD));
  282. return;
  283. }
  284. mnew ( st , sizeof(StateBD) , "BlasterDiff3" );
  285. //st->m_u1.set(tmp,gbstrlen(tmp));
  286. st->m_buf1=NULL;
  287. // get it
  288. bool status = g_httpServer.getDoc ( tmp, // &(st->m_u1) , // url
  289. 0 , // ip (none)
  290. 0 , // offset
  291. -1 , // size
  292. 0 , // ifModifiedSince
  293. st, // state
  294. gotDocWrapper1, // callback
  295. 20*1000, // timeout
  296. 0, // proxy ip
  297. 0, // proxy port
  298. 30*1024*1024, //maxLen
  299. 30*1024*1024);//maxOtherLen
  300. // continue if it blocked
  301. if ( status )
  302. // else there was error
  303. log("blaster: got doc %s: %s", urlStart,mstrerror(g_errno) );
  304. return;
  305. }
  306. void Blaster::startBlastering(){
  307. int64_t now=gettimeofdayInMilliseconds();
  308. if(m_print && m_totalDone>0 && (m_totalDone % 20)==0){
  309. log("blaster: Processed %"INT32" urls in %"INT32" ms",m_totalDone,
  310. (int32_t) (now-m_startTime));
  311. m_print=false;
  312. }
  313. //Launch the maximum number of threads that are allowed
  314. while ( m_p1 < m_p1end && m_launched < m_maxNumThreads && m_totalUrls){
  315. // clear any error
  316. g_errno = 0;
  317. // make a new state
  318. StateBD *st;
  319. try { st = new (StateBD); }
  320. catch ( ... ) {
  321. g_errno = ENOMEM;
  322. log("blaster: Failed. "
  323. "Could not allocate %"INT32" bytes for query. "
  324. "Returning HTTP status of 500.",
  325. (int32_t)sizeof(StateBD));
  326. return;
  327. }
  328. mnew ( st , sizeof(StateBD) , "BlasterDiff3" );
  329. st->m_buf1=NULL;
  330. m_totalUrls--;
  331. // make into a url class. Set both u1 and u2 here.
  332. //st->m_u1.set ( m_p1 , gbstrlen(m_p1) );
  333. st->m_u1 = m_p1;
  334. // is it an injection url
  335. if ( m_doInjection || m_doInjectionWithLinks ) {
  336. // get host #0 i guess
  337. Host *h0 = g_hostdb.getHost(0);
  338. if ( ! h0 ) { char *xx=NULL;*xx=0; }
  339. static bool s_flag = true;
  340. if ( s_flag ) {
  341. s_flag = false;
  342. log("blaster: injecting to host #0 at %s on "
  343. "http/tcp port %"INT32"",
  344. iptoa(h0->m_ip),
  345. (int32_t)h0->m_httpPort);
  346. }
  347. // use spiderlinks=1 so we add the outlinks to spiderdb
  348. // but that will slow the spider rate down since it
  349. // will have to do a dns lookup on the domain of every
  350. // outlink.
  351. st->m_injectUrl.safePrintf("http://127.0.0.1:8000/"
  352. "admin/inject?");
  353. if ( m_doInjectionWithLinks )
  354. st->m_injectUrl.safePrintf("spiderlinks=1&");
  355. else
  356. st->m_injectUrl.safePrintf("spiderlinks=0&");
  357. st->m_injectUrl.safePrintf("u=");
  358. st->m_injectUrl.urlEncode(m_p1);
  359. st->m_injectUrl.pushChar('\0');
  360. st->m_u1 = st->m_injectUrl.getBufStart();
  361. }
  362. // skip to next url
  363. m_p1 += gbstrlen ( m_p1 ) + 1;
  364. if (m_blasterDiff){
  365. //st->m_u2.set ( m_p2 , gbstrlen(m_p2) );
  366. st->m_u2 = m_p2;
  367. m_p2 += gbstrlen ( m_p2 ) + 1;
  368. }
  369. // log(LOG_WARN,"\n");
  370. log(LOG_WARN,"blaster: Downloading %s",st->m_u1);
  371. // set port if port switch is true
  372. //if ( m_portSwitch ) {
  373. // int32_t r = rand() % 32;
  374. // u.setPort ( 8000 + r );
  375. //}
  376. // count it
  377. m_launched++;
  378. int32_t ip=0;
  379. int32_t port=0;
  380. if (m_useProxy){
  381. ip=atoip("66.154.102.20",13);
  382. port=3128;
  383. }
  384. // get it
  385. bool status = g_httpServer.getDoc ( st->m_u1 , // url
  386. 0, // ip
  387. 0 , // offset
  388. -1 , // size
  389. 0 , // ifModifiedSince
  390. st , // state
  391. gotDocWrapper1, // callback
  392. 60*1000, // timeout
  393. ip,
  394. port,
  395. 30*1024*1024, //maxLen
  396. 30*1024*1024);
  397. // continue if it blocked
  398. if ( ! status ) continue;
  399. // If not blocked, there is an error.
  400. m_launched--;
  401. // log msg
  402. log("From file1, got doc1 %s: %s", st->m_u1 ,
  403. mstrerror(g_errno) );
  404. // we gotta wait
  405. break;
  406. }
  407. // bail if not done yet
  408. //if ( m_launched > 0 ) return;
  409. if (m_totalUrls) return;
  410. //otherwise return if launched have not come back
  411. if (m_launched) return;
  412. // exit now
  413. // g_conf.save();
  414. // closeALL(NULL,NULL);
  415. exit ( 0 );
  416. }
  417. void gotDocWrapper1 ( void *state , TcpSocket *s ) {
  418. g_blaster.gotDoc1(state,s);
  419. }
  420. void Blaster::gotDoc1( void *state, TcpSocket *s){
  421. StateBD *st=(StateBD *)state;
  422. // Even if we loose the request, still count it as done.
  423. m_totalDone++;
  424. m_print=true;
  425. // bail if got cut off
  426. if ( s->m_readOffset == 0 ) {
  427. log("blaster: lost the Request in gotDoc1");
  428. m_launched--;
  429. freeStateBD(st);
  430. return;
  431. }
  432. //if we are not doing diff
  433. if (!m_blasterDiff){
  434. m_launched--;
  435. }
  436. int64_t now = gettimeofdayInMilliseconds();
  437. // get hash
  438. char *reply = s->m_readBuf ;
  439. int32_t size = s->m_readOffset;
  440. HttpMime mime;
  441. mime.set ( reply , size , NULL );
  442. char *content = reply + mime.getMimeLen();
  443. int32_t contentLen = size - mime.getMimeLen();
  444. uint32_t h = hash32 ( content , contentLen );
  445. // log msg
  446. if ( g_errno )
  447. logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) %s : %s",
  448. s->m_readOffset ,
  449. (int32_t)(now - s->m_startTime) ,
  450. st->m_u1 ,
  451. mstrerror(g_errno) );
  452. else
  453. logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) "
  454. "(hash=%"XINT32") %s",
  455. s->m_readOffset ,
  456. (int32_t)(now - s->m_startTime) ,
  457. h ,
  458. st->m_u1 );
  459. if (!m_blasterDiff){
  460. // try to launch another if not using log file
  461. freeStateBD(st);
  462. if (!m_isLogFile){
  463. startBlastering();
  464. }
  465. if (m_isLogFile && --m_totalUrls==0) exit(0);
  466. return;
  467. }
  468. // Store the buffer from socket so that it does not get destroyed
  469. // at the end. Also, add another space because in gotDoc2 xml.set
  470. // demands the content to be null ended, so we need to store the
  471. // null character there. So as a precaution, just allocating the
  472. // max buf size.
  473. st->m_buf1=(char*) mcalloc(s->m_readBufSize,"Blaster5");
  474. gbmemcpy(st->m_buf1,s->m_readBuf,s->m_readOffset);
  475. //st->m_buf1=(char*) mdup(s->m_readBuf,s->m_readOffset,"Blaster5");
  476. st->m_buf1Len=s->m_readOffset;
  477. st->m_buf1MaxLen=s->m_readBufSize;
  478. // . don't let TcpServer free m_buf when socket is recycled/closed
  479. // . we own it now and are responsible for freeing it. DON'T do this
  480. // because I believe this makes malloc crash, since TcpServer says
  481. // that it has freed the memory so malloc tries to allocate wrong
  482. // memory and gives a seg fault.
  483. // s->m_readBuf = NULL;
  484. log(LOG_WARN,"blaster: Downloading %s",st->m_u2);
  485. //char *ss="www.gigablast.com/search?q=hoopla&code=gbmonitor";
  486. // st->m_u2.set(ss,gbstrlen(ss));
  487. // get it
  488. bool status = g_httpServer.getDoc ( st->m_u2 , // url
  489. 0,//ip
  490. 0 , // offset
  491. -1 , // size
  492. 0 , // ifModifiedSince
  493. st , // state
  494. gotDocWrapper2, // callback
  495. 60*1000, // timeout
  496. 0,//atoip("66.154.102.20",13),//proxy ip
  497. 0,//3128,//80, // proxy port
  498. 30*1024*1024, //maxLen
  499. 30*1024*1024);//maxOtherLen
  500. // continue if it blocked
  501. if ( ! status ) return;
  502. // If not blocked, there is an error.
  503. m_launched--;
  504. // log msg
  505. log("From file2, gotdoc2 %s: %s", st->m_u2,
  506. mstrerror(g_errno) );
  507. // No need to point p2 ahead because already been done
  508. // Free stateBD
  509. freeStateBD(st);
  510. return;
  511. }
  512. void gotDocWrapper2 ( void *state , TcpSocket *s ) {
  513. g_blaster.gotDoc2(state,s);
  514. }
  515. void Blaster::gotDoc2 ( void *state, TcpSocket *s){
  516. StateBD *st=(StateBD *)state;
  517. // bail if got cut off
  518. if ( s->m_readOffset == 0 ) {
  519. log("blaster: Lost the Request in gotDoc2");
  520. m_launched--;
  521. //No need to point p2
  522. // Free stateBD
  523. freeStateBD(st);
  524. return;
  525. }
  526. // . don't let TcpServer free m_buf when socket is recycled/closed
  527. // . we own it now and are responsible for freeing it
  528. // s->m_readBuf = NULL;
  529. int64_t now = gettimeofdayInMilliseconds();
  530. // So now after getting both docIds, get their contents
  531. char *reply1 = st->m_buf1 ;
  532. int32_t size1 = st->m_buf1Len;
  533. HttpMime mime1;
  534. mime1.set ( reply1 , size1 , NULL );
  535. char *content1 = reply1 + mime1.getMimeLen();
  536. int32_t content1Len = size1 - mime1.getMimeLen();
  537. uint32_t h = hash32 ( content1 , content1Len );
  538. // log msg
  539. if ( g_errno )
  540. logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) %s : %s",
  541. s->m_readOffset ,
  542. (int32_t)(now - s->m_startTime) ,
  543. st->m_u2 ,
  544. mstrerror(g_errno) );
  545. else
  546. logf(LOG_INFO,"blaster: got doc (%"INT32") (%"INT32" ms) "
  547. "(hash=%"XINT32") %s",
  548. s->m_readOffset ,
  549. (int32_t)(now - s->m_startTime) ,
  550. h ,
  551. st->m_u2 );
  552. if (m_verbose){
  553. log(LOG_WARN,"blaster: content1len=%"INT32", Content1 is =%s",
  554. content1Len,content1);
  555. log(LOG_WARN,"\n");
  556. }
  557. char *reply2 = s->m_readBuf ;
  558. int32_t size2 = s->m_readOffset;
  559. HttpMime mime2;
  560. mime2.set ( reply2 , size2 , NULL );
  561. char *content2 = reply2 + mime2.getMimeLen();
  562. int32_t content2Len = size2 - mime2.getMimeLen();
  563. if (m_verbose)
  564. log(LOG_WARN,"blaster: content2len=%"INT32", Content2 is =%s",
  565. content2Len,content2);
  566. // Now that we've got the contents, lets get the url links out
  567. // of these pages.Passing them to function getSearchLinks should
  568. // get the first x links found out.
  569. /* st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3");
  570. st->m_links2=st->m_links1+100*MAX_URL_LEN;
  571. st->m_numLinks1=100;
  572. st->m_numLinks2=100;*/
  573. /* int32_t numLinks1=getSearchLinks(content1,content1Len,
  574. st->m_links1,st->m_numLinks1);
  575. int32_t numLinks2=getSearchLinks(content2,content2Len,
  576. st->m_links2,st->m_numLinks2);*/
  577. content1[content1Len]='\0';
  578. //int16_t csEnum1= get_iana_charset(mime1.getCharset(),
  579. // mime1.getCharsetLen());
  580. /* if (csEnum1== csUnknown)
  581. log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
  582. Xml xml1;
  583. // assume utf8
  584. if (!xml1.set(content1,
  585. content1Len,
  586. false,
  587. 0,
  588. false,
  589. TITLEREC_CURRENT_VERSION ,
  590. true , // set parents
  591. 0 , // niceness
  592. CT_XML )){ // content type
  593. log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
  594. }
  595. Links links1;
  596. Url parent; parent.set ( st->m_u1);
  597. if (!links1.set(false , // userellnofollow
  598. &xml1,
  599. &parent,//mime1.getLocationUrl(), parent Url
  600. false, // setLinkHashes
  601. NULL , // baseUrl
  602. TITLEREC_CURRENT_VERSION, // version
  603. 0 , // niceness
  604. false , // parent is permalink?
  605. NULL )) { // oldLinks
  606. log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
  607. }
  608. content2[content2Len]='\0';
  609. //int16_t csEnum2= get_iana_charset(mime2.getCharset(),
  610. // mime2.getCharsetLen());
  611. /* if (csEnum2== csUnknown)
  612. log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
  613. Xml xml2;
  614. if (!xml2.set(content2,
  615. content2Len,
  616. false,
  617. 0,
  618. false,
  619. TITLEREC_CURRENT_VERSION,
  620. true , // setparents
  621. 0 , // niceness
  622. CT_XML )){
  623. log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
  624. }
  625. Links links2;
  626. parent.set(st->m_u2);
  627. if (!links2.set(0,//siterec xml
  628. &xml2,
  629. &parent,//&st->m_u2,//mime2.getLocationUrl(),
  630. false,
  631. NULL,
  632. TITLEREC_CURRENT_VERSION,
  633. 0,
  634. false,
  635. NULL)){
  636. log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
  637. }
  638. // put the hash of the sites into a hashtable, since we have
  639. // about a 100 or so of them
  640. HashTableT<uint32_t, bool> urlHash;
  641. // put the urls from doc2 into the hastable, but first check if
  642. // they are links to google or gigablast (for now). For msn and
  643. // yahoo we have to add other checks.
  644. char domain2[256];
  645. int32_t dlen = 0;
  646. char *dom = getDomFast ( st->m_u2 , &dlen );
  647. if ( dom ) strncpy(domain2,dom,dlen);
  648. domain2[dlen]='\0';
  649. for (int32_t i=0;i<links2.getNumLinks();i++){
  650. // The dots check if exactly google or gigablast are present
  651. // in the link
  652. char *ss=links2.getLink(i);
  653. char *p;
  654. p=strstr(ss,domain2);
  655. if(p) continue;
  656. p=strstr(ss,"google.");
  657. if(p) continue;
  658. p=strstr(ss,"cache:"); //googles cache page
  659. if(p) continue;
  660. p= strstr(ss,"gigablast.");
  661. if(p) continue;
  662. p= strstr(ss,"web.archive.org");//older copies on gigablast
  663. if(p) continue;
  664. p= strstr(ss,"search.yahoo.com");//from gigablast search
  665. if(p) continue;
  666. p= strstr(ss,"search.msn.com");//from gigablast search
  667. if(p) continue;
  668. p= strstr(ss,"s.teoma.com");//from gigablast search
  669. if(p) continue;
  670. p= strstr(ss,"search.dmoz.org");//from gigablast search
  671. if(p) continue;
  672. p= strstr(ss,"www.answers.com");//from gigablast search
  673. if(p) continue;
  674. p= strstr(ss,"cc.msncache.com");//msn's cache page
  675. if(p) continue;
  676. if (m_verbose)
  677. log(LOG_WARN,"blaster: link in Doc2=%s"
  678. ,links2.getLink(i));
  679. uint32_t h=hash32Lower_a(links2.getLink(i),
  680. links2.getLinkLen(i));
  681. //should i check for conflict. no, because it doesn't matter
  682. urlHash.addKey(h,1);
  683. }
  684. // now check if the urls from doc1 are in doc2. save the
  685. // ones that are not
  686. // in there for later.
  687. /* int32_t numUrlsToCheck=links2.getNumLinks();*/
  688. int32_t numUrlsNotFound=0;
  689. /*if (numLinks1<numUrlsToCheck)
  690. numUrlsToCheck=numLinks1;*/
  691. char domain1[256];
  692. dlen = 0;
  693. dom = getDomFast ( st->m_u1 ,&dlen );
  694. if ( dom ) strncpy(domain1,dom,dlen);
  695. domain1[dlen]='\0';
  696. for (int32_t i=0;i<links1.getNumLinks();i++){
  697. char *ss=links1.getLink(i);
  698. char *p;
  699. p=strstr(ss,domain1);
  700. if(p) continue;
  701. p=strstr(ss,"google.");
  702. if(p) continue;
  703. p=strstr(ss,"cache:"); //googles cache page
  704. if(p) continue;
  705. p= strstr(ss,"gigablast.");
  706. if(p) continue;
  707. p= strstr(ss,"web.archive.org");//older copies on gigablast
  708. if(p) continue;
  709. p= strstr(ss,"search.yahoo.com");//from gigablast search
  710. if(p) continue;
  711. p= strstr(ss,"search.msn.com");//from gigablast search
  712. if(p) continue;
  713. p= strstr(ss,"s.teoma.com");//from gigablast search
  714. if(p) continue;
  715. p= strstr(ss,"search.dmoz.org");//from gigablast search
  716. if(p) continue;
  717. p= strstr(ss,"www.answers.com");//from gigablast search
  718. if(p) continue;
  719. p= strstr(ss,"cc.msncache.com");//msn's cache page
  720. if(p) continue;
  721. if (m_verbose)
  722. log(LOG_WARN,"blaster: link in Doc1=%s"
  723. ,links1.getLink(i));
  724. uint32_t h=hash32Lower_a(links1.getLink(i),
  725. links1.getLinkLen(i));
  726. int32_t slot= urlHash.getSlot(h);
  727. if(slot!=-1) continue;
  728. // if url is not present, get its doc.
  729. if (m_verbose || m_justDisplay)
  730. log(LOG_WARN,"blaster: NOT FOUND %s in %s"
  731. ,links1.getLink(i),domain2);
  732. numUrlsNotFound++;
  733. //Don't do anything else if just have to display the urls
  734. if (m_justDisplay) continue;
  735. //now get the doc of these urls
  736. //initialize
  737. st->m_numUrlDocsReceived=0;
  738. StateBD2 *st2;
  739. try { st2 = new (StateBD2); }
  740. catch ( ... ) {
  741. g_errno = ENOMEM;
  742. log("blaster: Failed. "
  743. "Could not allocate %"INT32" bytes for query. "
  744. "Returning HTTP status of 500.",
  745. (int32_t)sizeof(StateBD2));
  746. return;
  747. }
  748. mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
  749. //Point to the big state;
  750. st2->m_st=st;
  751. //Msg16 does 6 redirects, so I do 6 too
  752. st2->m_numRedirects=6;
  753. //st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
  754. st2->m_url = links1.getLink(i);
  755. // No need for a proxy ip here, since we are fetching
  756. // doc's from different IPs. Faster this way
  757. bool status = g_httpServer.getDoc ( st2->m_url, // url
  758. 0,//ip
  759. 0 , // offset
  760. -1 , // size
  761. 0 , // ifModifiedSince
  762. st2, // state
  763. gotDocWrapper3, // callback
  764. 60*1000, // timeout
  765. 0, // proxy ip
  766. 0, // proxy port
  767. 30*1024*1024, //maxLen
  768. 30*1024*1024);//maxOtherLen
  769. // continue if it blocked
  770. if ( ! status ) continue;
  771. // If not blocked, there is an error.
  772. st->m_numUrlDocsReceived++;
  773. }
  774. st->m_numUrlDocsSent=numUrlsNotFound;
  775. //There might have been an error while sending the docs, so if there
  776. //has been put a check
  777. if ( st->m_numUrlDocsReceived > 0 &&
  778. st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){
  779. log(LOG_WARN,"blaster: %"INT32" docs could not be sent due to "
  780. "error",st->m_numUrlDocsReceived);
  781. m_launched--;
  782. freeStateBD(st);
  783. return;
  784. }
  785. if (numUrlsNotFound==0){
  786. //job done for this pair
  787. log(LOG_WARN,"blaster: All urls from %s found in "
  788. "%s",domain1,domain2);
  789. m_launched--;
  790. // Free stateBD
  791. freeStateBD(st);
  792. return;
  793. }
  794. log(LOG_WARN,"blaster: %"INT32" urls from %s Not found in %s",
  795. numUrlsNotFound,domain1,domain2);
  796. if(m_justDisplay){
  797. m_launched--;
  798. // Free stateBD
  799. freeStateBD(st);
  800. }
  801. return;
  802. }
  803. // This is not a generic function as yet. Gigablast stores the link in tag
  804. // <span class="url"> and google stores it in tag <font color=#008000>. Takes
  805. // the content to search for links, the array in which to store the links and
  806. // the length of the array as arguments.Returns number of links it found in
  807. // the page. This function is not being used as yet as Xml and Links are used
  808. #if 0
  809. int32_t Blaster::getSearchLinks(char *content,
  810. int32_t contentLen,
  811. char *links,
  812. int32_t numLinks){
  813. char *p=content;
  814. char *pend=content+contentLen;
  815. char *p2;
  816. int32_t linksFound=0;
  817. //considering code given is raw=1
  818. /* while (p<pend){
  819. if (p=strstr(p,"http://"))
  820. p2=strstr(p,"\n");
  821. else break;
  822. int32_t length=p2-p;
  823. if (length>=MAX_URL_LEN) length=255;
  824. strncpy(links+linksFound*MAX_URL_LEN,p,length);
  825. links[linksFound*MAX_URL_LEN+length]='\0';
  826. log(LOG_WARN,"blaster: The url is=%s",
  827. links+linksFound*MAX_URL_LEN);
  828. linksFound++;
  829. p+=7;
  830. }
  831. return linksFound;*/
  832. // Deciding if it is gigablast 1 or google 0 or else 2
  833. int32_t isGB;
  834. if (contentLen<19) {
  835. log(LOG_WARN,"blaster: Contentlen is less");
  836. return 0;
  837. }
  838. if (strstr(content,"<span class=\"url\">"))
  839. isGB=1;
  840. else isGB=0;
  841. p=content;
  842. if (isGB){
  843. while (p && p<pend && linksFound<numLinks){
  844. p=strstr(p,"<span class=\"url\">");
  845. if (!p) break;
  846. p2=strstr(p,"</span>");
  847. if (!p2) break;
  848. //point to the url
  849. p+=18;
  850. //Check if it is in bounds. Also need to put '\0' at
  851. // the end.
  852. int32_t length=p2-p;
  853. if (length>=MAX_URL_LEN) length=MAX_URL_LEN-1;
  854. //Copy into the links buffer
  855. strncpy(links+linksFound*MAX_URL_LEN,p,length);
  856. links[linksFound*MAX_URL_LEN+length]='\0';
  857. log(LOG_WARN,"blaster:the url is=%s",
  858. links+linksFound*MAX_URL_LEN);
  859. //advance p2 too
  860. p2+=7;
  861. linksFound++;
  862. }
  863. }
  864. else{
  865. while (p && p<pend && linksFound<numLinks){
  866. p=strstr(p,"<font color=#008000>");
  867. if(!p) break;
  868. p2=strstr(p,"</font>");
  869. if (!p2) break;
  870. //point to the url
  871. p+=20;
  872. //Check if it is in bounds. Also need to put '\0' at
  873. // the end.
  874. int32_t length=p2-p;
  875. if (length>=MAX_URL_LEN) length=255;
  876. //Copy into the links buffer
  877. strncpy(links+linksFound*MAX_URL_LEN,p,length);
  878. links[linksFound*MAX_URL_LEN+length]='\0';
  879. log(LOG_WARN,"blaster:the url is=%s",
  880. links+linksFound*MAX_URL_LEN);
  881. //advance p2 too
  882. p2+=7;
  883. linksFound++;
  884. }
  885. }
  886. return linksFound;
  887. }
  888. #endif
  889. void gotDocWrapper3 ( void *state , TcpSocket *s ) {
  890. g_blaster.gotDoc3(state,s);
  891. }
  892. void Blaster::gotDoc3 ( void *state, TcpSocket *s){
  893. StateBD2 *st2=(StateBD2 *)state;
  894. StateBD *st=st2->m_st;
  895. if (!s) {
  896. log (LOG_WARN,"blaster: Got a null s in gotDoc3."
  897. "Happened because ip could not be found");
  898. st->m_numUrlDocsReceived++;
  899. //Free StateBD2
  900. mdelete(st2,sizeof(StateBD2),"Blaster4");
  901. if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
  902. m_launched--;
  903. // Free stateBD
  904. freeStateBD(st);
  905. }
  906. return;
  907. }
  908. // bail if got cut off
  909. if ( s->m_readOffset == 0 ) {
  910. log("blasterDiff : lost the Request in gotDoc3");
  911. st->m_numUrlDocsReceived++;
  912. //Free StateBD2
  913. mdelete(st2,sizeof(StateBD2),"Blaster4");
  914. if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
  915. m_launched--;
  916. // Free stateBD
  917. freeStateBD(st);
  918. }
  919. return;
  920. }
  921. char *reply = s->m_readBuf ;
  922. int32_t size = s->m_readOffset;
  923. HttpMime mime;
  924. mime.set(reply,size,NULL);
  925. int32_t httpStatus=mime.getHttpStatus();
  926. if(httpStatus==404){
  927. if (m_verbose)
  928. log(LOG_WARN,"blaster: The page was not found - 404");
  929. st->m_numUrlDocsReceived++;
  930. }
  931. // If the url is a redirect check if it is still http (might have
  932. // become https or something else, in which case we aren't going to
  933. // follow it
  934. else if (httpStatus>=300){
  935. Url *u=mime.getLocationUrl();
  936. //If max number of redirects done, bail
  937. if(!st2->m_numRedirects--){
  938. log(LOG_WARN,"blaster: Max number of redirects "
  939. "reached.");
  940. st->m_numUrlDocsReceived++;
  941. }
  942. //check if it is still http (might have become https or
  943. // something else, in which case we aren't going to follow it
  944. else if (!u->isHttp()){
  945. log(LOG_WARN,"blaster: Redirection not for an http "
  946. "page for url %s",u->getUrl());
  947. st->m_numUrlDocsReceived++;
  948. }
  949. // sometimes idiots don't supply us with a Location: mime
  950. else if ( u->getUrlLen() == 0 ) {
  951. log(LOG_WARN,"blaster: Redirect url is of 0 length");
  952. st->m_numUrlDocsReceived++;
  953. }
  954. else{
  955. // I'm not checking as yet if the redirect url is the
  956. // same as the earlier url, as I've set the max number
  957. // of redirs to 6 Now lets get the redirect url. Do not
  958. // increase the numDocsReceived because this wrapper
  959. // will be called back for the page
  960. if (m_verbose)
  961. log(LOG_WARN,"blaster: Downloading redirect"
  962. " %s",u->getUrl());
  963. //Changing the url to the new place
  964. //st2->m_url.set(u,false);
  965. st2->m_url = u->getUrl();
  966. bool status = g_httpServer.getDoc (st2->m_url, // url
  967. 0,//ip
  968. 0 , // offset
  969. -1 , // size
  970. 0 ,
  971. st2 , // state
  972. gotDocWrapper3,
  973. 60*1000, // timeout
  974. 0, // proxy ip
  975. 0, // proxy port
  976. 30*1024*1024, //maxLen
  977. 30*1024*1024);
  978. // If not blocked, there is an error.
  979. if (status )
  980. st->m_numUrlDocsReceived++;
  981. }
  982. }
  983. else if(httpStatus<200){
  984. log(LOG_WARN,"blaster: Bad HTTP status %"INT32"",httpStatus);
  985. st->m_numUrlDocsReceived++;
  986. }
  987. else{
  988. // This means the page is still there, somewhere. Status must
  989. // be 200 So find it on server2. This server is assumed to be
  990. // running an instance of gb, so it shall be given the query in
  991. // the format 'xxxxx.com/search?q=url%3Ayyyy&code=gbmonitor.
  992. // Then check if we have the exact page in the search results
  993. // that have come back. So now the problem is that we do
  994. // not know which url has been got. So I get the location
  995. // url from mime.
  996. // The site name is in st->m_u2.getSite()
  997. // But copy it because it is not nulled.
  998. char tmp[1024];
  999. //char site[1024];//how long could a site be?
  1000. int32_t siteLen = 0;
  1001. char *site = getHostFast(st->m_u2,&siteLen);
  1002. char c = site[siteLen];
  1003. site[siteLen] = 0;
  1004. //strncpy(site,st->m_u2.getSite(),
  1005. // st->m_u2.getSiteLen());
  1006. //site[st->m_u2.getSiteLen()]='\0';
  1007. sprintf(tmp,"%ssearch?"
  1008. "code=gbmonitor&"
  1009. "q=url%%3A%s",site,st2->m_url);
  1010. site[siteLen] = c;
  1011. if (m_verbose)
  1012. log(LOG_WARN,"blaster: Checking %s",tmp);
  1013. //Url u;
  1014. //u.set(tmp,gbstrlen(tmp));
  1015. //Now get the doc
  1016. bool status = g_httpServer.getDoc ( tmp,//&u,
  1017. 0,//ip
  1018. 0, // offset
  1019. -1 , // size
  1020. 0 ,
  1021. st , // state
  1022. gotDocWrapper4,
  1023. 60*1000, // timeout
  1024. 0,//atoip("66.154.102.20",13),//proxy ip
  1025. 0,//3128,//proxy port
  1026. 30*1024*1024,
  1027. 30*1024*1024);
  1028. // continue if it blocked
  1029. // If not blocked, there is an error. Since we are
  1030. // getting the doc from a gigablast server, report it
  1031. if (status ){
  1032. st->m_numUrlDocsReceived++;
  1033. log(LOG_WARN,"blaster: could not get back"
  1034. "%s from server in gotDoc3",tmp);
  1035. }
  1036. }
  1037. // If we reached here, that means all the url redirects have been
  1038. // finished, and there is no need for st2. Free it
  1039. mdelete(st2,sizeof(StateBD2),"Blaster4");
  1040. if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
  1041. m_launched--;
  1042. // Free stateBD
  1043. freeStateBD(st);
  1044. }
  1045. return;
  1046. }
  1047. void gotDocWrapper4 ( void *state , TcpSocket *s ) {
  1048. g_blaster.gotDoc4(state,s);
  1049. }
  1050. void Blaster::gotDoc4 ( void *state, TcpSocket *s){
  1051. StateBD *st=(StateBD *)state;
  1052. st->m_numUrlDocsReceived++;
  1053. if (!s) {
  1054. //Shouldn't happen, but still putting a checkpoint
  1055. log (LOG_WARN,"blaster: Got a null s in gotDoc4."
  1056. "Happened because ip could not be found for gigablast"
  1057. "server");
  1058. if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
  1059. m_launched--;
  1060. // Free stateBD
  1061. freeStateBD(st);
  1062. }
  1063. return;
  1064. }
  1065. // bail if got cut off
  1066. if ( s->m_readOffset == 0 ) {
  1067. log("blasterDiff : lost the Request in gotDoc4");
  1068. if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
  1069. m_launched--;
  1070. freeStateBD(st);
  1071. }
  1072. return;
  1073. }
  1074. char *reply = s->m_readBuf ;
  1075. int32_t size = s->m_readOffset;
  1076. HttpMime mime;
  1077. mime.set ( reply , size , NULL );
  1078. char *content = reply + mime.getMimeLen();
  1079. int32_t contentLen = size - mime.getMimeLen();
  1080. //int16_t csEnum = get_iana_charset(mime.getCharset(),
  1081. // mime.getCharsetLen());
  1082. /* if (csEnum == csUnknown)
  1083. log(LOG_DEBUG, "blaster: Unknown charset : %s", mime.getCharset());*/
  1084. Xml xml;
  1085. if (!xml.set(
  1086. content,
  1087. contentLen,
  1088. false,
  1089. 0,
  1090. false,
  1091. TITLEREC_CURRENT_VERSION,
  1092. true, // setparents
  1093. 0, // niceness
  1094. CT_XML )){
  1095. log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
  1096. }
  1097. Links links;
  1098. Url *url=mime.getLocationUrl();
  1099. if (!links.set(0,//siterec xml
  1100. &xml,
  1101. url,
  1102. false,
  1103. NULL,
  1104. TITLEREC_CURRENT_VERSION,
  1105. 0,
  1106. false,
  1107. NULL)){
  1108. log(LOG_WARN, "blaster: Couldn't set Links class in gotDoc4");
  1109. }
  1110. for (int32_t i=0;i<links.getNumLinks();i++){
  1111. char *ss=links.getLink(i);
  1112. char *p;
  1113. // This page *should* always be a gigablast page. So not adding
  1114. // checks for msn or yahoo or google page.
  1115. p=strstr(ss,"google.");
  1116. if(p) continue;
  1117. p=strstr(ss,"cache:"); //googles cache page
  1118. if(p) continue;
  1119. p= strstr(ss,"gigablast.");
  1120. if(p) continue;
  1121. p= strstr(ss,"web.archive.org");//older copies on gigablast
  1122. if(p) continue;
  1123. p= strstr(ss,"search.yahoo.com");//from gigablast search
  1124. if(p) continue;
  1125. p= strstr(ss,"search.msn.com");//from gigablast search
  1126. if(p) continue;
  1127. p= strstr(ss,"s.teoma.com");//from gigablast search
  1128. if(p) continue;
  1129. p= strstr(ss,"search.dmoz.org");//from gigablast search
  1130. if(p) continue;
  1131. p= strstr(ss,"www.answers.com");//from gigablast search
  1132. if(p) continue;
  1133. if (m_verbose)
  1134. log(LOG_WARN,"blaster: Link Present on server2=%s",ss);
  1135. }
  1136. // So if one of the links that is returned is the exact url,
  1137. // then we know that the url is present.So get the url from the
  1138. // mime, search for it in the links that are returned.
  1139. char tmp[1024];
  1140. char *sendBuf=s->m_sendBuf;
  1141. char *p1,*p2;
  1142. // First get the Host, which is the domain. Since socket s is going to
  1143. // be useless after this function, changing m_sendBuf instead of using
  1144. // more space
  1145. p1=strstr(sendBuf,"%3A");
  1146. if(p1){
  1147. p1+=3;
  1148. p2=strstr(p1," HTTP");
  1149. if (p2){
  1150. //Since I do not care about the sendbuf anymore
  1151. *p2='\0';
  1152. }
  1153. }
  1154. if (!p1 || !p2){
  1155. log(LOG_WARN,"blasterdiff: Could not find search link"
  1156. "from m_sendBuf in gotdoc4");
  1157. }
  1158. else{
  1159. sprintf(tmp,"%s",p1);
  1160. //log(LOG_WARN,"blaster: tmp in gotDoc4 = %s",tmp);
  1161. bool isFound=false;
  1162. // So now we search for tmp in the links
  1163. for (int32_t i=0;i<links.getNumLinks();i++){
  1164. if(strstr(links.getLink(i),tmp) &&
  1165. links.getLinkLen(i)==(int)gbstrlen(tmp)){
  1166. isFound=true;
  1167. log(LOG_WARN,"blaster: %s in results1 but not"
  1168. " in results2 for query %s but does exist"
  1169. " in server2",tmp,st->m_u1);//->getQuery()
  1170. }
  1171. }
  1172. if (!isFound)
  1173. log(LOG_WARN,"blaster: %s in results1 but not"
  1174. " in results2 for query %s and does NOT exist"
  1175. " in server2",tmp,st->m_u1); // ->getQuery()
  1176. }
  1177. if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
  1178. m_launched--;
  1179. // Free stateBD
  1180. freeStateBD(st);
  1181. }
  1182. return;
  1183. }
  1184. void Blaster::freeStateBD(StateBD *st){
  1185. // Free stateBD's buf
  1186. if (!st) return;
  1187. if (st->m_buf1)
  1188. mfree(st->m_buf1,st->m_buf1MaxLen,"Blaster5");
  1189. mdelete(st,sizeof(StateBD),"Blaster3");
  1190. }