
/Msg13.h

https://github.com/gigablast/open-source-search-engine
Possible License(s): Apache-2.0
// Matt Wells, copyright Oct 2001

// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply then the host compressed the http reply before
//   sending it back to you via udp

#ifndef _MSG13_H_
#define _MSG13_H_

#include <string.h>      // memset(), used by Msg13Request::reset()
#include "Url.h"         // MAX_URL_LEN
#include "SpiderProxy.h" // MAXUSERNAMEPWD

// max crawl delay from proxy backoff of 1 minute (60,000 ms)
#define MAX_PROXYCRAWLDELAYMS 60000

class SafeBuf;

void resetMsg13Caches ( ) ;
bool printHammerQueueTable ( SafeBuf *sb ) ;

extern char *g_fakeReply;

class Msg13Request {
public:

	// the top portion of Msg13Request is sent to handleRequest54()
	// in SpiderProxy.cpp to get and return proxies, as well as to
	// ban proxies.
	int32_t getProxyRequestSize() { return (char *)&m_lastHack-(char *)this; }
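
	// For illustration only (a sketch -- the real transport call lives
	// in SpiderProxy.cpp and may differ): only the bytes before
	// m_lastHack are shipped when asking host #0 for a proxy, e.g.
	//   sendToHost0 ( (char *)&req , req.getProxyRequestSize() );
	// where sendToHost0() is a hypothetical helper, not a real API.
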
	int32_t  m_urlIp;
	int32_t  m_lbId; // loadbucket id
	// the http proxy to use to download
	int32_t  m_proxyIp;
	uint16_t m_proxyPort;
	int32_t  m_banProxyIp;
	uint16_t m_banProxyPort;
	char  m_opCode;
	char  m_lastHack;

	collnum_t m_collnum;

	// not part of the proxy request, but set from ProxyReply:
	int32_t  m_numBannedProxies;
	// . if using proxies, how many proxies have we tried to download
	//   this url through
	// . used internally in Msg13.cpp
	int32_t m_proxyTries;
	// if using proxies, did host #0 tell us there were more to try if
	// this one did not work out?
	bool m_hasMoreProxiesToTry;

	// we call this function after the imposed crawl-delay is over
	void (*m_hammerCallback)(class Msg13Request *r);
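	// A usage sketch (the callback name here is hypothetical, for
	// illustration only):
	//   static void resumeDownload ( Msg13Request *r ) {
	//           // crawl-delay wait is over, fetch the doc for real now
	//   }
	//   ...
	//   r->m_hammerCallback = resumeDownload;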

	int64_t m_urlHash48;
	int32_t  m_firstIp;

	// when it was stored in the hammer queue
	int64_t m_stored;

	// a tmp hack var referencing into the url buffer (ptr_url) below
	char *m_proxiedUrl;
	int32_t  m_proxiedUrlLen;

	int64_t m_downloadStartTimeMS;

	char  m_niceness;
	int32_t  m_ifModifiedSince;
	int32_t  m_maxCacheAge;
	int32_t  m_maxTextDocLen;
	int32_t  m_maxOtherDocLen;
	// in milliseconds. use -1 if none or unknown.
	int32_t  m_crawlDelayMS;
	// for linked list, this is the hammer queue
	class Msg13Request *m_nextLink;

	char m_proxyUsernamePwdAuth[MAXUSERNAMEPWD];

	// if doing spider compression, compute contentHash32 of the
	// downloaded document, and if it matches this then send back
	// EDOCUNCHANGED
	int32_t  m_contentHash32;
	// copy of CollectionRec::m_customCrawl: 0 or 1 for crawls, 2 for bulk jobs
	char m_isCustomCrawl;
	// send back error ENOGOODDATE if the doc does not have a good date.
	// but if harvestLinks is true, just send back a filtered list of links
	int32_t  m_requireGoodDate:1;
	int32_t  m_harvestLinksIfNoGoodDate:1;
	int32_t  m_compressReply:1;
	int32_t  m_useCompressionProxy:1;
	// if m_forwardDownloadRequest is true then we pick the host to
	// download this url based on the IP address, the idea being that
	// only one host is responsible for downloading from a particular
	// ip address. this keeps webmasters happier since they can block us
	// by blocking just one ip address, and it makes it easier for them
	// to analyze their web logs. (see the sketch below)
	int32_t  m_forwardDownloadRequest:1;
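	// A minimal sketch of that host selection (an assumption for
	// illustration -- the real logic lives in Msg13.cpp and may differ):
	// hash the url's ip so the same ip always maps to the same host.
	//   int32_t pickDownloadHost ( int32_t urlIp , int32_t numHosts ) {
	//           return (int32_t)((uint32_t)urlIp % (uint32_t)numHosts);
	//   }
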
	int32_t  m_isScraping:1;
	// does the url end in /robots.txt ?
	int32_t  m_isRobotsTxt:1;
	// should we call getTestDoc()/addTestDoc() like for the "test" coll
	// and for Test.cpp?
	int32_t  m_useTestCache:1;
	int32_t  m_addToTestCache:1;
	int32_t  m_skipHammerCheck:1;
	int32_t  m_attemptedIframeExpansion:1;
	int32_t  m_crawlDelayFromEnd:1;
	int32_t  m_forEvents:1;

	// does ptr_url represent a FULL http request mime and NOT just a url?
	// this happens when gigablast is being used like a squid proxy.
	int32_t  m_isSquidProxiedUrl:1;

	int32_t  m_foundInCache:1;
	int32_t  m_forceUseFloaters:1;

	int32_t  m_wasInTableBeforeStarting:1;
	int32_t  m_isRootSeedUrl:1;

	//int32_t  m_testParserEnabled:1;
	//int32_t  m_testSpiderEnabled:1;
	//int32_t  m_isPageParser:1;
	//int32_t  m_isPageInject:1;

	// if we just end up calling HttpServer::getDoc() via calling
	// downloadDoc() then we set this for callback purposes
	class Msg13 *m_parent;

	// on the other hand, if we are called indirectly by handleRequest13()
	// then we set m_udpSlot.
	class UdpSlot *m_udpSlot;

	class TcpSocket *m_tcpSocket;

	// used for addTestDoc() and caching. msg13 sets this
	int64_t m_urlHash64;
	int32_t m_spideredTime;
	// used for caching (and for the request table and wait-in-line table)
	int64_t m_cacheKey;
	char    m_testDir[32];
	// msg13 sets this too, so you don't have to worry about setting it
	//int32_t      m_urlLen;
	// includes \0 termination
	//char      m_url[MAX_URL_LEN+1];

	char *ptr_url;
	char *ptr_cookie;

	int32_t  size_url;
	int32_t  size_cookie;

	// variable data starts here

	int32_t getSize() {
		return ((char *)ptr_url-(char *)this) + size_url + size_cookie; }
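
	// Serialization sketch (an assumption about the buffer layout,
	// inferred from getSize() above, not confirmed by this header):
	// the fixed fields come first and the url/cookie bytes are appended
	// right after them, with ptr_url pointing into that tail, e.g.
	//   char buf[sizeof(Msg13Request)+MAX_URL_LEN+1];
	//   Msg13Request *r = (Msg13Request *)buf;
	//   r->reset();
	//   r->ptr_url = buf + sizeof(Msg13Request);
	//   memcpy ( r->ptr_url , url , urlLen + 1 ); // keep the \0
	//   r->size_url    = urlLen + 1;
	//   r->size_cookie = 0;
	//   // r->getSize() now returns the total bytes to transmit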

	// zero it all out
	void reset() {
		//memset (this,0,(char *)m_url - (char *)this + 1);
		memset ( this , 0 , sizeof(Msg13Request) );
		m_maxTextDocLen  = -1; // no limit
		m_maxOtherDocLen = -1; // no limit
		m_crawlDelayMS   = -1; // unknown or none
		m_collnum = (collnum_t)-1;
	}
};

class Msg13 {

 public:

	Msg13() ;
	~Msg13();
	void reset() ;

	// register our request handler with g_udpServer (called by main.cpp)
	static bool registerHandler();

	static class RdbCache *getHttpCacheRobots();
	static class RdbCache *getHttpCacheOthers();

	bool getDoc ( Msg13Request *r ,
		      bool isTestColl ,
		      void *state ,
		      void (*callback)(void *state) );
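
	// A usage sketch (hedged: this assumes the common gigablast
	// convention that a false return means the call blocked and the
	// callback fires later; true means the reply is ready now):
	//   static void gotDocWrapper ( void *state ) { /* use reply */ }
	//   ...
	//   if ( m_msg13.getDoc ( &r , false , this , gotDocWrapper ) )
	//           gotDocWrapper ( this ); // did not block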

	bool forwardRequest();

	bool gotForwardedReply ( class UdpSlot *slot );
	bool gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize );

	// keep public so wrappers can access
	void *m_state;
	void (* m_callback)( void *state );

	// we now store the uncompressed http reply in here
	char *m_replyBuf;
	int32_t  m_replyBufSize;
	int32_t  m_replyBufAllocSize;

	// points to the request we are serving
	Msg13Request *m_request;

	//char m_tmpBuf32[32];
};

bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , int32_t  spideredTime     , char *testDir ) ;

extern RdbCache s_hammerCache;

#endif // _MSG13_H_