PageRenderTime 46ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/Msg13.h

https://github.com/gigablast/open-source-search-engine
C Header | 212 lines | 105 code | 47 blank | 60 comment | 0 complexity | 5132b9a73f2c47ee935e02da0a443e70 MD5 | raw file
Possible License(s): Apache-2.0
  1. // Matt Wells, copyright Oct 2001
  2. // . ask another host to download a url for you
  3. // . the remote host will also use a cache if m_maxCacheAge > 0
  4. // . used for downloading and caching robots.txt
  5. // . if m_compressReply then the host compressed the http reply before
  6. // sending it back to you via udp
  7. #ifndef _MSG13_H_
  8. #define _MSG13_H_
  9. #include "Url.h" // MAX_URL_LEN
  10. #include "SpiderProxy.h" // MAXUSERNAMEPWD
  11. // max crawl delay from proxy backoff of 1 minute (60 seconds)
  12. #define MAX_PROXYCRAWLDELAYMS 60000
  13. void resetMsg13Caches ( ) ;
  14. bool printHammerQueueTable ( SafeBuf *sb ) ;
  15. extern char *g_fakeReply;
  16. class Msg13Request {
  17. public:
  18. // the top portion of Msg13Request is sent to handleRequest54()
  19. // in SpiderProxy.cpp to get and return proxies, as well as to
  20. // ban proxies.
  21. int32_t getProxyRequestSize() { return (char *)&m_lastHack-(char *)this;};
  22. int32_t m_urlIp;
  23. int32_t m_lbId; // loadbucket id
  24. // the http proxy to use to download
  25. int32_t m_proxyIp;
  26. uint16_t m_proxyPort;
  27. int32_t m_banProxyIp;
  28. uint16_t m_banProxyPort;
  29. char m_opCode;
  30. char m_lastHack;
  31. collnum_t m_collnum;
  32. // not part of the proxy request, but set from ProxyReply:
  33. int32_t m_numBannedProxies;
  34. // . if using proxies, how many proxies have we tried to download
  35. // this url through
  36. // . used internally in Msg13.cpp
  37. int32_t m_proxyTries;
  38. // if using proxies, did host #0 tell us there were more to try if
  39. // this one did not work out?
  40. bool m_hasMoreProxiesToTry;
  41. // we call this function after the imposed crawl-delay is over
  42. void (*m_hammerCallback)(class Msg13Request *r);
  43. int64_t m_urlHash48;
  44. int32_t m_firstIp;
  45. // when it was stored in the hammer queue
  46. int64_t m_stored;
  47. // a tmp hack var referencing into m_url[] below
  48. char *m_proxiedUrl;
  49. int32_t m_proxiedUrlLen;
  50. int64_t m_downloadStartTimeMS;
  51. char m_niceness;
  52. int32_t m_ifModifiedSince;
  53. int32_t m_maxCacheAge;
  54. int32_t m_maxTextDocLen;
  55. int32_t m_maxOtherDocLen;
  56. // in milliseconds. use -1 if none or unknown.
  57. int32_t m_crawlDelayMS;
  58. // for linked list, this is the hammer queue
  59. class Msg13Request *m_nextLink;
  60. char m_proxyUsernamePwdAuth[MAXUSERNAMEPWD];
  61. // if doing spider compression, compute contentHash32 of document
  62. // downloaded, and if it matches this then send back EDOCUNCHANGED
  63. int32_t m_contentHash32;
  64. // copy of CollectionRec::m_customCrawl, 0 1 for crawls or 2 for bulks
  65. char m_isCustomCrawl;
  66. // send back error ENOGOODDATE if it does not have one. but if
  67. // harvestLinks is true, just send back a filtered list of links
  68. int32_t m_requireGoodDate:1;
  69. int32_t m_harvestLinksIfNoGoodDate:1;
  70. int32_t m_compressReply:1;
  71. int32_t m_useCompressionProxy:1;
  72. // if m_forwardDownloadRequest is true then we pick the host to
  73. // download this url based on the IP address, the idea being that
  74. // only one host is responsible for downloading from a particular
  75. // ip address. this keeps webmasters happier so they can block us
  76. // by just blocking one ip address. and it makes it easier for them
  77. // to analyze their web logs.
  78. int32_t m_forwardDownloadRequest:1;
  79. int32_t m_isScraping:1;
  80. // does url end in /robots.txt ?
  81. int32_t m_isRobotsTxt:1;
  82. // should we call getTestDoc()/addTestDoc() like for the "test" coll
  83. // and for Test.cpp?
  84. int32_t m_useTestCache:1;
  85. int32_t m_addToTestCache:1;
  86. int32_t m_skipHammerCheck:1;
  87. int32_t m_attemptedIframeExpansion:1;
  88. int32_t m_crawlDelayFromEnd:1;
  89. int32_t m_forEvents:1;
  90. // does m_url represent a FULL http request mime and NOT just a url?
  91. // this happens when gigablast is being used like a squid proxy.
  92. int32_t m_isSquidProxiedUrl:1;
  93. int32_t m_foundInCache:1;
  94. int32_t m_forceUseFloaters:1;
  95. int32_t m_wasInTableBeforeStarting:1;
  96. int32_t m_isRootSeedUrl:1;
  97. //int32_t m_testParserEnabled:1;
  98. //int32_t m_testSpiderEnabled:1;
  99. //int32_t m_isPageParser:1;
  100. //int32_t m_isPageInject:1;
  101. // if we just end up calling HttpServer::getDoc() via calling
  102. // downloadDoc() then we set this for callback purposes
  103. class Msg13 *m_parent;
  104. // on the other hand, if we are called indirectly by handleRequest13()
  105. // then we set m_udpSlot.
  106. class UdpSlot *m_udpSlot;
  107. class TcpSocket *m_tcpSocket;
  108. // used for addTestDoc() and caching. msg13 sets this
  109. int64_t m_urlHash64;
  110. int32_t m_spideredTime;
  111. // used for caching (and for request table, wait in line table)
  112. int64_t m_cacheKey;
  113. char m_testDir[32];
  114. // msg13 sets this too, so you don't have to worry about setting it
  115. //int32_t m_urlLen;
  116. // includes \0 termination
  117. //char m_url[MAX_URL_LEN+1];
  118. char *ptr_url;
  119. char *ptr_cookie;
  120. int32_t size_url;
  121. int32_t size_cookie;
  122. // variable data starts here
  123. int32_t getSize() {
  124. return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};
  125. // zero it all out
  126. void reset() {
  127. //memset (this,0,(char *)m_url - (char *)this + 1);
  128. memset (this,0,sizeof(Msg13Request));
  129. m_maxTextDocLen = -1; // no limit
  130. m_maxOtherDocLen = -1; // no limit
  131. m_crawlDelayMS = -1; // unknown or none
  132. m_collnum = (collnum_t)-1;
  133. };
  134. };
  135. class Msg13 {
  136. public:
  137. Msg13() ;
  138. ~Msg13();
  139. void reset() ;
  140. // register our request handler with g_udpServer (called by main.cpp)
  141. static bool registerHandler();
  142. static class RdbCache *getHttpCacheRobots();
  143. static class RdbCache *getHttpCacheOthers();
  144. bool getDoc ( Msg13Request *r ,
  145. bool isTestColl ,
  146. void *state ,
  147. void (*callback)(void *state) );
  148. bool forwardRequest();
  149. bool gotForwardedReply ( class UdpSlot *slot );
  150. bool gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize);
  151. // keep public so wrappers can access
  152. void *m_state;
  153. void (* m_callback) (void *state );
  154. // we now store the uncompressed http reply in here
  155. char *m_replyBuf;
  156. int32_t m_replyBufSize;
  157. int32_t m_replyBufAllocSize;
  158. // point to it
  159. Msg13Request *m_request;
  160. //char m_tmpBuf32[32];
  161. };
  162. bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir ) ;
  163. bool addTestSpideredDate ( Url *u , int32_t spideredTime , char *testDir ) ;
  164. extern RdbCache s_hammerCache;
  165. #endif