PageRenderTime 46ms CodeModel.GetById 19ms app.highlight 23ms RepoModel.GetById 0ms app.codeStats 0ms

/blaster2.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 459 lines | 303 code | 51 blank | 105 comment | 85 complexity | 0a966ef03754a0f2052d9f9296327718 MD5 | raw file
Possible License(s): Apache-2.0
  1// Matt Wells, copyright Sep 2001
  2
  3// the main program that brings it all together
  4
  5#include "gb-include.h"
  6
  7#include "Mem.h"
  8#include "Conf.h"
  9#include "Dns.h"
 10#include "HttpServer.h"
 11#include "Loop.h"
 12#include <sys/resource.h>  // setrlimit
 13#include "SafeBuf.h"
 14
 15static void startSpidering ( ) ;
 16static void gotDocWrapper ( void *state , TcpSocket *s ) ;
 17static void sleepWrapper ( int fd , void *state ) ;
 18
 19bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
 20bool g_recoveryMode;
 21int g_inMemcpy;
 22int32_t g_recoveryLevel;
 23
 24static int32_t  s_maxNumThreads = 1 ;
 25static int32_t  s_launched   = 0;
 26static int32_t  s_total      = 0;
 27static char *s_p          = NULL;
 28static char *s_pend       = NULL;
 29static bool  s_portSwitch = 0;
 30static int32_t  s_wait;
 31static int32_t  s_lastTime   = 0;
 32static int32_t  s_printIt    = true;
 33static char  s_append[512];
 34static SafeBuf s_words;
 35static SafeBuf s_windices;
 36static char *s_server = NULL;
 37static int32_t  s_numRandWords = 0;
 38int32_t getRandomWords(char *buf, char *bufend, int32_t numWords);
 39bool getWords();
 40
 41
 42bool mainShutdown ( bool urgent ) { return true; }
 43bool closeAll ( void *state , void (* callback)(void *state) ) {return true;}
 44bool allExit ( ) {return true;}
 45
 46int main ( int argc , char *argv[] ) {
 47	// let's ensure our core file can dump
 48	struct rlimit lim;
 49	lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
 50	if ( setrlimit(RLIMIT_CORE,&lim) )
 51		log("blaster::setrlimit: %s", mstrerror(errno) );
 52
 53	//g_conf.m_maxMem = 500000000;
 54
 55	// init our table for doing zobrist hashing
 56	if ( ! hashinit() ) {
 57		log("blaster::hashinit failed" ); return 1; }
 58
 59	// init the memory class after conf since it gets maxMem from Conf
 60	//if ( ! g_mem.init ( 20000000 ) ) {
 61	//	log("blaster::Mem init failed" ); return 1; }
 62	//g_mem.m_maxMem = 200000000;
 63	// start up log file
 64	if ( ! g_log.init( "/tmp/blasterLog" )        ) {
 65		log("blaster::Log open /tmp/blasterLog failed" ); return 1; }
 66
 67	// get dns ip from /etc/resolv.conf
 68	g_conf.m_dnsIps[0] = 0;
 69	FILE *fd = fopen ( "/etc/resolv.conf" , "r" );
 70	if ( ! fd ) {
 71		log("blaster::fopen: /etc/resolve.conf %s",
 72		    mstrerror(errno)); return 1; }
 73
 74	char tmp[1024];
 75	while ( fgets ( tmp , 1024 , fd ) ) {
 76		// tmp buf ptr
 77		char *p = tmp;
 78		// skip comments
 79		if ( *p == '#' ) continue;
 80		// skip nameserver name
 81		if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ;
 82		// skip spaces
 83		while ( isspace ( *p ) ) p++;
 84		// if this is not a digit, continue
 85		if ( ! isdigit(*p) ) continue;
 86		// get ip
 87		g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) );
 88		// done
 89		break;
 90	}
 91	fclose ( fd );
 92
 93
 94	// if no dns server found, bail
 95	if ( g_conf.m_dnsIps[0] == 0 ) {
 96		log("blaster:: no dns ip found in /etc/resolv.conf");return 1;}
 97
 98	// hack # of dns servers
 99	g_conf.m_numDns         = 1;
100	g_conf.m_dnsPorts[0]    = 53;
101	//g_conf.m_dnsIps  [0]    = atoip ( "192.168.0.1", 11 );
102	//g_conf.m_dnsClientPort  = 9909;
103	g_conf.m_dnsMaxCacheMem = 1024*10;
104	// hack http server port to -1 (none)
105	//g_conf.m_httpPort           = 0;
106	g_conf.m_httpMaxSockets     = 200;
107	//g_conf.m_httpMaxReadBufSize = 102*1024*1024;
108	g_conf.m_httpMaxSendBufSize = 16*1024;
109	//g_conf.m_httpMaxDownloadSockets = 200;
110
111	if ( argc != 4 && argc != 5 && argc !=6 ) {
112	printUsage:
113		fprintf(stderr,"USAGE: blaster [fileOfUrls | -r<num random words><server>] [maxNumThreads] [wait in ms] " 
114		    "<lines to skip> <string to append>\n");
115		fprintf(stderr,"USAGE: examples:\n");
116		fprintf(stderr,"USAGE:  ./blaster queries.fromlog 10 1\n");
117		fprintf(stderr,"USAGE:  ./blaster -r3http://www.gigablast.com/index.php?q= 1 100\n");
118		return 1; 
119	}
120
121	fprintf(stderr,"Logging to /tmp/blasterLog\n");
122
123	// init the loop
124	if ( ! g_loop.init() ) {
125		log("blaster::Loop init failed" ); return 1; }
126	// . then dns client
127	// . server should listen to a socket and register with g_loop
128	if ( ! g_dns.init(6000)        ) {
129		log("blaster::Dns client init failed" ); return 1; }
130	// . then webserver
131	// . server should listen to a socket and register with g_loop
132	for(int32_t i = 0; i < 50; i++) {
133		if ( ! g_httpServer.init( 8333 + i, 9334+i ) ) {
134			log("blaster::HttpServer init failed" ); 
135			//return 1; 
136		}
137		else break;
138	}
139	// set File class
140	char *fname = argv[1];
141	int32_t fnameLen = gbstrlen(fname);
142	int32_t fileSize = 0;
143	int32_t bufSize = 0;
144	char *buf = NULL;
145	int32_t  n = 0;
146
147	//should we generate random queries?
148	if(fnameLen > 2 && fname[0] == '-' && fname[1] == 'r') {
149		char *p = fname + 2;
150		s_numRandWords = atoi( p );
151		while(is_digit(*p)) p++;
152		getWords();
153		
154		if(*p == '\0') goto printUsage;
155		s_server = p;
156		log("blaster server is %s", s_server);
157		//		char x[1024];
158		// 		while(1) {
159		// 			int32_t l = getRandomWords(x, x + 1024, s_numRandWords);
160		// 			*(x + l) = '\0';
161		// 			log("blaster: %s", x);
162		// 		}
163		//		exit(1);
164	}
165	else { //it is a real file
166		File f;
167		f.set ( fname );
168
169		// open file
170		if ( ! f.open ( O_RDONLY ) ) {
171			log("blaster::open: %s %s",fname,mstrerror(g_errno)); 
172			return 1; 
173		}
174
175		// get file size
176		fileSize = f.getFileSize() ;
177
178		// store a \0 at the end
179		bufSize = fileSize + 1;
180
181		// make buffer to hold all
182		buf = (char *) mmalloc ( bufSize , "blaster" );
183		if ( ! buf) {log("blaster::mmalloc: %s",mstrerror(errno));return 1;}
184
185		//char *bufEnd = buf + bufSize;
186
187		// set s_p
188		s_p    = buf;
189		s_pend = buf + bufSize - 1;
190
191		// read em all in
192		if ( ! f.read ( buf , fileSize , 0 ) ) {
193			log("blaster::read: %s %s",fname,mstrerror(g_errno));return 1;}
194
195		// change \n to \0
196		//char *p = buf;
197		for ( int32_t i = 0 ; i < bufSize ; i++ ) {
198			if ( buf[i] != '\n' ) continue;
199			buf[i] = '\0';
200			n++;
201		}
202
203		f.close();
204	}
205	// log a msg
206	log(LOG_INIT,"blaster: read %"INT32" urls into memory", n );
207
208	int32_t linesToSkip = 0;
209	if ( argc >=  5 ) {
210		linesToSkip = atoi ( argv[4] );
211		log (LOG_INIT,"blaster: skipping %"INT32" urls",linesToSkip);
212	}
213	for ( int32_t i = 0; i < linesToSkip && s_p < s_pend; i++ )
214		s_p += gbstrlen(s_p) + 1;
215	
216	if ( argc == 6 ) {
217		int32_t len  = gbstrlen ( argv[5] );
218		if ( len > 512 )
219			len = 512;
220		strncpy ( s_append , argv[5] , gbstrlen (argv[5]) );
221	}
222	else
223		s_append[0] = '\0';
224
225	// get min time between each spider in milliseconds
226	s_wait = atoi( argv[3] );
227
228	// # of threads
229	s_maxNumThreads = 1;
230	s_maxNumThreads = atoi ( argv[2] );
231
232	s_portSwitch = 0;
233	//if ( argc == 4 ) s_portSwitch = 1;
234	//else             s_portSwitch = 0;
235
236	// start our spider loop
237	//startSpidering( );
238
239	// wakeup wrapper every X ms
240	g_loop.registerSleepCallback ( s_wait , NULL , sleepWrapper );
241
242	//msg10.addUrls ( uu , gbstrlen(uu)+1, NULL,0,time(0),4,true,NULL,NULL);
243	// . now start g_loops main interrupt handling loop
244	// . it should block forever
245	// . when it gets a signal it dispatches to a server or db to handle it
246	if ( ! g_loop.runLoop()    ) {
247		log("blaster::runLoop failed" ); return 1; }
248	// dummy return (0-->normal exit status for the shell)
249	return 0;
250}
251
252void sleepWrapper ( int fd , void *state ) {
253	startSpidering();
254}
255
256
257void startSpidering ( ) {
258	// url class for parsing/normalizing url
259	Url u;
260	// count total urls done
261	static int64_t s_startTime = 0;
262	// set startTime
263	if ( s_startTime == 0 ) s_startTime = gettimeofdayInMilliseconds();
264	// get time now
265	int64_t now = gettimeofdayInMilliseconds();
266	// elapsed time to do all urls
267	double took = (double)(now - s_startTime) / 1000.0 ;
268	// log this every 20 urls
269	if ( s_printIt && s_total > 0 && ( s_total % 20 ) == 0 ) {
270		logf(LOG_INFO,"did %"INT32" urls in %f seconds. %f urls per second."
271		    " threads now = %"INT32".",
272		    s_total ,  took , ((double)s_total) / took, s_launched);
273		s_printIt = false;
274	}
275	// did we wait int32_t enough?
276	if ( now - s_lastTime < s_wait ) return;
277	s_lastTime = now;
278	// . use HttpServer.getDoc() to fetch it
279	// . fetch X at a time
280	while ( (s_server || s_p < s_pend) && s_launched < s_maxNumThreads ) {
281		// clear any error
282		g_errno = 0;
283		//append s_append to the url
284		char url[MAX_URL_LEN];
285		char *p = url;
286		char *pend = url + MAX_URL_LEN;
287		char *t = NULL;
288
289		if(s_server) {
290			int32_t len = gbstrlen(s_server);
291			gbmemcpy ( p, s_server, len);
292			p += len;
293			p += getRandomWords(p, pend, s_numRandWords);
294			int32_t appendLen = gbstrlen(s_append);
295			if ( p + appendLen < pend ) {
296				gbmemcpy ( p, s_append, gbstrlen(s_append) );
297				p += gbstrlen(s_append);
298			}
299			*p++ = '\0';
300			u.set ( url , p - url);
301			t = g_mem.strdup(url, "saved url");
302		}
303		else {
304			gbmemcpy ( p, s_p, gbstrlen(s_p));
305			p += gbstrlen ( s_p );
306			if ( gbstrlen(s_p) + gbstrlen(s_append) < MAX_URL_LEN )
307				gbmemcpy ( p, s_append, gbstrlen(s_append) );
308			p += gbstrlen(s_append);
309			//null end
310			*p ='\0';
311
312			// make into a url class
313			u.set ( url , gbstrlen(url) );
314			// set port if port switch is true
315			//if ( s_portSwitch ) {
316			//	int32_t r = rand() % 32;
317			//	u.setPort ( 8000 + r );
318			//}
319			// save s_p
320			t = s_p;
321			// skip to next url
322			s_p += gbstrlen ( s_p ) + 1;
323		}
324		// count it
325		s_launched++;
326		// get it
327		bool status = g_httpServer.getDoc ( u.getUrl() , // url
328						    0, // ip
329						    0 ,  // offset
330						    -1 ,  // size
331						    0 , // ifModifiedSince
332						    (void *)t ,  // state
333						    gotDocWrapper, // callback
334						    20*1000, // timeout
335						    0, // proxy ip
336						    0, // proxy port
337						    30*1024*1024, //maxLen
338						    30*1024*1024);//maxOtherLen
339		// continue if it blocked
340		if ( ! status ) continue;
341		// otherwise, got it right away
342		s_launched--;
343		// log msg
344		log("got doc1 %s: %s", u.getUrl() , mstrerror(g_errno) );
345		// we gotta wait
346		break;
347	}
348	// bail if not done yet
349	//if ( s_launched > 0 ) return;
350	if ( s_server || s_p < s_pend ) return;
351	// otherwise, we're all done
352	logf(LOG_INFO,"blaster: did %"INT32" urls in %f seconds. %f urls per "
353	     "second.",
354	    s_total ,  took , ((double)s_total) / took );
355	// exit now
356	exit ( 0 );
357}
358
359void gotDocWrapper ( void *state , TcpSocket *s ) {
360	// no longer launched
361	s_launched--;
362	char* url = (char*)state;
363	// bail if got cut off
364	if ( s->m_readOffset == 0 ) {
365		log("lost %s",(char *) state);
366		if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
367		return;
368	}
369	// got one more result page
370	s_total++;
371	// allow printing
372	s_printIt = true;
373	// get time now
374	int64_t now = gettimeofdayInMilliseconds();
375	// get hash
376	char *reply = s->m_readBuf ;
377	int32_t  size  = s->m_readOffset;
378	HttpMime mime;
379	mime.set ( reply , size , NULL );
380	char *content    = reply + mime.getMimeLen();
381	int32_t  contentLen = size  - mime.getMimeLen();
382	int32_t status      = mime.getHttpStatus();
383	uint32_t h = hash32 ( content , contentLen );
384	char *p = mime.getMime();
385	char *pend = p + mime.getMimeLen();
386	char message[256];
387	int32_t mlen = 0;
388
389	// parse status message out of response
390
391	// HTTP/1.0
392	while ( p < pend && !isspace(*p) ) p++;
393	// skip space
394	while ( p < pend &&  isspace(*p) ) p++;
395	// copy to end of line
396	while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
397		message[mlen++] = *p;
398	}
399	message[mlen] = '\0';
400
401	// log msg
402	if ( g_errno ) 
403		logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) %s : "
404		     "%s", status,
405		      s->m_readOffset      , 
406		      (int32_t)(now - s->m_startTime) , 
407		      (char *)state        , 
408		      mstrerror(g_errno)   );
409	else
410		logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) "
411		     "(hash=%"XINT32") %s", status,
412		      s->m_readOffset      , 
413		      (int32_t)(now - s->m_startTime) , 
414		      h ,
415		      (char *)state        );
416
417	if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
418	// try to launch another
419	startSpidering();
420}
421
422int32_t getRandomWords(char *buf, char *bufend, int32_t numWords) {
423	int32_t totalWords = s_windices.length() / sizeof(int32_t);
424	char *p = buf;
425	while(1) {
426		int32_t wordNum = rand() % totalWords;
427		int32_t windex = *(int32_t*)(&s_windices[wordNum*sizeof(int32_t)]);
428		int32_t wlen = gbstrlen(&s_words[windex]);
429		if(wlen + 1 + p >= bufend) return p - buf;
430		gbmemcpy(p, &s_words[windex], wlen);
431		p += wlen;
432		if(--numWords <= 0) return p - buf;
433		*p++ = '+';
434	}
435	return p - buf;
436}
437
438bool getWords() {
439	FILE *fd = fopen ( "/usr/share/dict/words" , "r" );
440	if ( ! fd ) {
441		log("blaster:: failed to open /usr/share/dict/words %s",
442		    mstrerror(errno)); 
443		return 1; 
444	}
445	char tmp[1024];
446	while ( fgets ( tmp , 1024 , fd ) ) {
447		int32_t len = gbstrlen(tmp);
448		if(len > 2 && tmp[len-2] == 's' && tmp[len-3] == '\'') continue;
449		s_windices += s_words.length();
450		s_words.safeMemcpy(tmp, len-1); //copy in data minus the newline
451		s_words += '\0';
452	}
453	fclose ( fd );
454	log("blaster: read %"INT32" words, "
455	    "%"INT32" bytes in from dictionary.", 
456	    (int32_t)(s_windices.length() / sizeof(int32_t)), 
457	    (int32_t)s_words.length());
458	return true;
459}