PageRenderTime 706ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/blaster2.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 459 lines | 303 code | 51 blank | 105 comment | 85 complexity | 0a966ef03754a0f2052d9f9296327718 MD5 | raw file
Possible License(s): Apache-2.0
  1. // Matt Wells, copyright Sep 2001
  2. // the main program that brings it all together
  3. #include "gb-include.h"
  4. #include "Mem.h"
  5. #include "Conf.h"
  6. #include "Dns.h"
  7. #include "HttpServer.h"
  8. #include "Loop.h"
  9. #include <sys/resource.h> // setrlimit
  10. #include "SafeBuf.h"
  11. static void startSpidering ( ) ;
  12. static void gotDocWrapper ( void *state , TcpSocket *s ) ;
  13. static void sleepWrapper ( int fd , void *state ) ;
  14. bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
  15. bool g_recoveryMode;
  16. int g_inMemcpy;
  17. int32_t g_recoveryLevel;
  18. static int32_t s_maxNumThreads = 1 ;
  19. static int32_t s_launched = 0;
  20. static int32_t s_total = 0;
  21. static char *s_p = NULL;
  22. static char *s_pend = NULL;
  23. static bool s_portSwitch = 0;
  24. static int32_t s_wait;
  25. static int32_t s_lastTime = 0;
  26. static int32_t s_printIt = true;
  27. static char s_append[512];
  28. static SafeBuf s_words;
  29. static SafeBuf s_windices;
  30. static char *s_server = NULL;
  31. static int32_t s_numRandWords = 0;
  32. int32_t getRandomWords(char *buf, char *bufend, int32_t numWords);
  33. bool getWords();
  34. bool mainShutdown ( bool urgent ) { return true; }
  35. bool closeAll ( void *state , void (* callback)(void *state) ) {return true;}
  36. bool allExit ( ) {return true;}
  37. int main ( int argc , char *argv[] ) {
  38. // let's ensure our core file can dump
  39. struct rlimit lim;
  40. lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
  41. if ( setrlimit(RLIMIT_CORE,&lim) )
  42. log("blaster::setrlimit: %s", mstrerror(errno) );
  43. //g_conf.m_maxMem = 500000000;
  44. // init our table for doing zobrist hashing
  45. if ( ! hashinit() ) {
  46. log("blaster::hashinit failed" ); return 1; }
  47. // init the memory class after conf since it gets maxMem from Conf
  48. //if ( ! g_mem.init ( 20000000 ) ) {
  49. // log("blaster::Mem init failed" ); return 1; }
  50. //g_mem.m_maxMem = 200000000;
  51. // start up log file
  52. if ( ! g_log.init( "/tmp/blasterLog" ) ) {
  53. log("blaster::Log open /tmp/blasterLog failed" ); return 1; }
  54. // get dns ip from /etc/resolv.conf
  55. g_conf.m_dnsIps[0] = 0;
  56. FILE *fd = fopen ( "/etc/resolv.conf" , "r" );
  57. if ( ! fd ) {
  58. log("blaster::fopen: /etc/resolve.conf %s",
  59. mstrerror(errno)); return 1; }
  60. char tmp[1024];
  61. while ( fgets ( tmp , 1024 , fd ) ) {
  62. // tmp buf ptr
  63. char *p = tmp;
  64. // skip comments
  65. if ( *p == '#' ) continue;
  66. // skip nameserver name
  67. if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ;
  68. // skip spaces
  69. while ( isspace ( *p ) ) p++;
  70. // if this is not a digit, continue
  71. if ( ! isdigit(*p) ) continue;
  72. // get ip
  73. g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) );
  74. // done
  75. break;
  76. }
  77. fclose ( fd );
  78. // if no dns server found, bail
  79. if ( g_conf.m_dnsIps[0] == 0 ) {
  80. log("blaster:: no dns ip found in /etc/resolv.conf");return 1;}
  81. // hack # of dns servers
  82. g_conf.m_numDns = 1;
  83. g_conf.m_dnsPorts[0] = 53;
  84. //g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
  85. //g_conf.m_dnsClientPort = 9909;
  86. g_conf.m_dnsMaxCacheMem = 1024*10;
  87. // hack http server port to -1 (none)
  88. //g_conf.m_httpPort = 0;
  89. g_conf.m_httpMaxSockets = 200;
  90. //g_conf.m_httpMaxReadBufSize = 102*1024*1024;
  91. g_conf.m_httpMaxSendBufSize = 16*1024;
  92. //g_conf.m_httpMaxDownloadSockets = 200;
  93. if ( argc != 4 && argc != 5 && argc !=6 ) {
  94. printUsage:
  95. fprintf(stderr,"USAGE: blaster [fileOfUrls | -r<num random words><server>] [maxNumThreads] [wait in ms] "
  96. "<lines to skip> <string to append>\n");
  97. fprintf(stderr,"USAGE: examples:\n");
  98. fprintf(stderr,"USAGE: ./blaster queries.fromlog 10 1\n");
  99. fprintf(stderr,"USAGE: ./blaster -r3http://www.gigablast.com/index.php?q= 1 100\n");
  100. return 1;
  101. }
  102. fprintf(stderr,"Logging to /tmp/blasterLog\n");
  103. // init the loop
  104. if ( ! g_loop.init() ) {
  105. log("blaster::Loop init failed" ); return 1; }
  106. // . then dns client
  107. // . server should listen to a socket and register with g_loop
  108. if ( ! g_dns.init(6000) ) {
  109. log("blaster::Dns client init failed" ); return 1; }
  110. // . then webserver
  111. // . server should listen to a socket and register with g_loop
  112. for(int32_t i = 0; i < 50; i++) {
  113. if ( ! g_httpServer.init( 8333 + i, 9334+i ) ) {
  114. log("blaster::HttpServer init failed" );
  115. //return 1;
  116. }
  117. else break;
  118. }
  119. // set File class
  120. char *fname = argv[1];
  121. int32_t fnameLen = gbstrlen(fname);
  122. int32_t fileSize = 0;
  123. int32_t bufSize = 0;
  124. char *buf = NULL;
  125. int32_t n = 0;
  126. //should we generate random queries?
  127. if(fnameLen > 2 && fname[0] == '-' && fname[1] == 'r') {
  128. char *p = fname + 2;
  129. s_numRandWords = atoi( p );
  130. while(is_digit(*p)) p++;
  131. getWords();
  132. if(*p == '\0') goto printUsage;
  133. s_server = p;
  134. log("blaster server is %s", s_server);
  135. // char x[1024];
  136. // while(1) {
  137. // int32_t l = getRandomWords(x, x + 1024, s_numRandWords);
  138. // *(x + l) = '\0';
  139. // log("blaster: %s", x);
  140. // }
  141. // exit(1);
  142. }
  143. else { //it is a real file
  144. File f;
  145. f.set ( fname );
  146. // open file
  147. if ( ! f.open ( O_RDONLY ) ) {
  148. log("blaster::open: %s %s",fname,mstrerror(g_errno));
  149. return 1;
  150. }
  151. // get file size
  152. fileSize = f.getFileSize() ;
  153. // store a \0 at the end
  154. bufSize = fileSize + 1;
  155. // make buffer to hold all
  156. buf = (char *) mmalloc ( bufSize , "blaster" );
  157. if ( ! buf) {log("blaster::mmalloc: %s",mstrerror(errno));return 1;}
  158. //char *bufEnd = buf + bufSize;
  159. // set s_p
  160. s_p = buf;
  161. s_pend = buf + bufSize - 1;
  162. // read em all in
  163. if ( ! f.read ( buf , fileSize , 0 ) ) {
  164. log("blaster::read: %s %s",fname,mstrerror(g_errno));return 1;}
  165. // change \n to \0
  166. //char *p = buf;
  167. for ( int32_t i = 0 ; i < bufSize ; i++ ) {
  168. if ( buf[i] != '\n' ) continue;
  169. buf[i] = '\0';
  170. n++;
  171. }
  172. f.close();
  173. }
  174. // log a msg
  175. log(LOG_INIT,"blaster: read %"INT32" urls into memory", n );
  176. int32_t linesToSkip = 0;
  177. if ( argc >= 5 ) {
  178. linesToSkip = atoi ( argv[4] );
  179. log (LOG_INIT,"blaster: skipping %"INT32" urls",linesToSkip);
  180. }
  181. for ( int32_t i = 0; i < linesToSkip && s_p < s_pend; i++ )
  182. s_p += gbstrlen(s_p) + 1;
  183. if ( argc == 6 ) {
  184. int32_t len = gbstrlen ( argv[5] );
  185. if ( len > 512 )
  186. len = 512;
  187. strncpy ( s_append , argv[5] , gbstrlen (argv[5]) );
  188. }
  189. else
  190. s_append[0] = '\0';
  191. // get min time between each spider in milliseconds
  192. s_wait = atoi( argv[3] );
  193. // # of threads
  194. s_maxNumThreads = 1;
  195. s_maxNumThreads = atoi ( argv[2] );
  196. s_portSwitch = 0;
  197. //if ( argc == 4 ) s_portSwitch = 1;
  198. //else s_portSwitch = 0;
  199. // start our spider loop
  200. //startSpidering( );
  201. // wakeup wrapper every X ms
  202. g_loop.registerSleepCallback ( s_wait , NULL , sleepWrapper );
  203. //msg10.addUrls ( uu , gbstrlen(uu)+1, NULL,0,time(0),4,true,NULL,NULL);
  204. // . now start g_loops main interrupt handling loop
  205. // . it should block forever
  206. // . when it gets a signal it dispatches to a server or db to handle it
  207. if ( ! g_loop.runLoop() ) {
  208. log("blaster::runLoop failed" ); return 1; }
  209. // dummy return (0-->normal exit status for the shell)
  210. return 0;
  211. }
  212. void sleepWrapper ( int fd , void *state ) {
  213. startSpidering();
  214. }
  215. void startSpidering ( ) {
  216. // url class for parsing/normalizing url
  217. Url u;
  218. // count total urls done
  219. static int64_t s_startTime = 0;
  220. // set startTime
  221. if ( s_startTime == 0 ) s_startTime = gettimeofdayInMilliseconds();
  222. // get time now
  223. int64_t now = gettimeofdayInMilliseconds();
  224. // elapsed time to do all urls
  225. double took = (double)(now - s_startTime) / 1000.0 ;
  226. // log this every 20 urls
  227. if ( s_printIt && s_total > 0 && ( s_total % 20 ) == 0 ) {
  228. logf(LOG_INFO,"did %"INT32" urls in %f seconds. %f urls per second."
  229. " threads now = %"INT32".",
  230. s_total , took , ((double)s_total) / took, s_launched);
  231. s_printIt = false;
  232. }
  233. // did we wait int32_t enough?
  234. if ( now - s_lastTime < s_wait ) return;
  235. s_lastTime = now;
  236. // . use HttpServer.getDoc() to fetch it
  237. // . fetch X at a time
  238. while ( (s_server || s_p < s_pend) && s_launched < s_maxNumThreads ) {
  239. // clear any error
  240. g_errno = 0;
  241. //append s_append to the url
  242. char url[MAX_URL_LEN];
  243. char *p = url;
  244. char *pend = url + MAX_URL_LEN;
  245. char *t = NULL;
  246. if(s_server) {
  247. int32_t len = gbstrlen(s_server);
  248. gbmemcpy ( p, s_server, len);
  249. p += len;
  250. p += getRandomWords(p, pend, s_numRandWords);
  251. int32_t appendLen = gbstrlen(s_append);
  252. if ( p + appendLen < pend ) {
  253. gbmemcpy ( p, s_append, gbstrlen(s_append) );
  254. p += gbstrlen(s_append);
  255. }
  256. *p++ = '\0';
  257. u.set ( url , p - url);
  258. t = g_mem.strdup(url, "saved url");
  259. }
  260. else {
  261. gbmemcpy ( p, s_p, gbstrlen(s_p));
  262. p += gbstrlen ( s_p );
  263. if ( gbstrlen(s_p) + gbstrlen(s_append) < MAX_URL_LEN )
  264. gbmemcpy ( p, s_append, gbstrlen(s_append) );
  265. p += gbstrlen(s_append);
  266. //null end
  267. *p ='\0';
  268. // make into a url class
  269. u.set ( url , gbstrlen(url) );
  270. // set port if port switch is true
  271. //if ( s_portSwitch ) {
  272. // int32_t r = rand() % 32;
  273. // u.setPort ( 8000 + r );
  274. //}
  275. // save s_p
  276. t = s_p;
  277. // skip to next url
  278. s_p += gbstrlen ( s_p ) + 1;
  279. }
  280. // count it
  281. s_launched++;
  282. // get it
  283. bool status = g_httpServer.getDoc ( u.getUrl() , // url
  284. 0, // ip
  285. 0 , // offset
  286. -1 , // size
  287. 0 , // ifModifiedSince
  288. (void *)t , // state
  289. gotDocWrapper, // callback
  290. 20*1000, // timeout
  291. 0, // proxy ip
  292. 0, // proxy port
  293. 30*1024*1024, //maxLen
  294. 30*1024*1024);//maxOtherLen
  295. // continue if it blocked
  296. if ( ! status ) continue;
  297. // otherwise, got it right away
  298. s_launched--;
  299. // log msg
  300. log("got doc1 %s: %s", u.getUrl() , mstrerror(g_errno) );
  301. // we gotta wait
  302. break;
  303. }
  304. // bail if not done yet
  305. //if ( s_launched > 0 ) return;
  306. if ( s_server || s_p < s_pend ) return;
  307. // otherwise, we're all done
  308. logf(LOG_INFO,"blaster: did %"INT32" urls in %f seconds. %f urls per "
  309. "second.",
  310. s_total , took , ((double)s_total) / took );
  311. // exit now
  312. exit ( 0 );
  313. }
  314. void gotDocWrapper ( void *state , TcpSocket *s ) {
  315. // no longer launched
  316. s_launched--;
  317. char* url = (char*)state;
  318. // bail if got cut off
  319. if ( s->m_readOffset == 0 ) {
  320. log("lost %s",(char *) state);
  321. if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
  322. return;
  323. }
  324. // got one more result page
  325. s_total++;
  326. // allow printing
  327. s_printIt = true;
  328. // get time now
  329. int64_t now = gettimeofdayInMilliseconds();
  330. // get hash
  331. char *reply = s->m_readBuf ;
  332. int32_t size = s->m_readOffset;
  333. HttpMime mime;
  334. mime.set ( reply , size , NULL );
  335. char *content = reply + mime.getMimeLen();
  336. int32_t contentLen = size - mime.getMimeLen();
  337. int32_t status = mime.getHttpStatus();
  338. uint32_t h = hash32 ( content , contentLen );
  339. char *p = mime.getMime();
  340. char *pend = p + mime.getMimeLen();
  341. char message[256];
  342. int32_t mlen = 0;
  343. // parse status message out of response
  344. // HTTP/1.0
  345. while ( p < pend && !isspace(*p) ) p++;
  346. // skip space
  347. while ( p < pend && isspace(*p) ) p++;
  348. // copy to end of line
  349. while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
  350. message[mlen++] = *p;
  351. }
  352. message[mlen] = '\0';
  353. // log msg
  354. if ( g_errno )
  355. logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) %s : "
  356. "%s", status,
  357. s->m_readOffset ,
  358. (int32_t)(now - s->m_startTime) ,
  359. (char *)state ,
  360. mstrerror(g_errno) );
  361. else
  362. logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) "
  363. "(hash=%"XINT32") %s", status,
  364. s->m_readOffset ,
  365. (int32_t)(now - s->m_startTime) ,
  366. h ,
  367. (char *)state );
  368. if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
  369. // try to launch another
  370. startSpidering();
  371. }
  372. int32_t getRandomWords(char *buf, char *bufend, int32_t numWords) {
  373. int32_t totalWords = s_windices.length() / sizeof(int32_t);
  374. char *p = buf;
  375. while(1) {
  376. int32_t wordNum = rand() % totalWords;
  377. int32_t windex = *(int32_t*)(&s_windices[wordNum*sizeof(int32_t)]);
  378. int32_t wlen = gbstrlen(&s_words[windex]);
  379. if(wlen + 1 + p >= bufend) return p - buf;
  380. gbmemcpy(p, &s_words[windex], wlen);
  381. p += wlen;
  382. if(--numWords <= 0) return p - buf;
  383. *p++ = '+';
  384. }
  385. return p - buf;
  386. }
  387. bool getWords() {
  388. FILE *fd = fopen ( "/usr/share/dict/words" , "r" );
  389. if ( ! fd ) {
  390. log("blaster:: failed to open /usr/share/dict/words %s",
  391. mstrerror(errno));
  392. return 1;
  393. }
  394. char tmp[1024];
  395. while ( fgets ( tmp , 1024 , fd ) ) {
  396. int32_t len = gbstrlen(tmp);
  397. if(len > 2 && tmp[len-2] == 's' && tmp[len-3] == '\'') continue;
  398. s_windices += s_words.length();
  399. s_words.safeMemcpy(tmp, len-1); //copy in data minus the newline
  400. s_words += '\0';
  401. }
  402. fclose ( fd );
  403. log("blaster: read %"INT32" words, "
  404. "%"INT32" bytes in from dictionary.",
  405. (int32_t)(s_windices.length() / sizeof(int32_t)),
  406. (int32_t)s_words.length());
  407. return true;
  408. }