/Msg13.h

https://github.com/acchou/open-source-search-engine

// Matt Wells, copyright Oct 2001

// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply then the host compressed the http reply before
//   sending it back to you via udp

#ifndef _MSG13_H_
#define _MSG13_H_

#include "Url.h" // MAX_URL_LEN

void resetMsg13Caches ( ) ;

extern char *g_fakeReply;

class Msg13Request {
public:

	// the top portion of Msg13Request is sent to handleRequest54()
	// in SpiderProxy.cpp to get and return proxies, as well as to
	// ban proxies.
	long getProxyRequestSize() { return (char *)&m_lastHack-(char *)this; };
	long  m_urlIp;
	long  m_lbId; // loadbucket id
	// the http proxy to use to download
	long  m_proxyIp;
	short m_proxyPort;
	long  m_banProxyIp;
	short m_banProxyPort;
	char  m_opCode;
	char  m_lastHack;

	// not part of the proxy request, but set from ProxyReply:
	long  m_numBannedProxies;
	// . if using proxies, how many proxies have we tried to download
	//   this url through
	// . used internally in Msg13.cpp
	long  m_proxyTries;
	// if using proxies, did host #0 tell us there were more to try if
	// this one did not work out?
	bool  m_hasMoreProxiesToTry;

	// we call this function after the imposed crawl-delay is over
	void (*m_hammerCallback)(class Msg13Request *r);

	long long m_urlHash48;
	long      m_firstIp;

	// a tmp hack var referencing into m_url[] below
	char *m_proxiedUrl;
	long  m_proxiedUrlLen;

	char  m_niceness;
	long  m_ifModifiedSince;
	long  m_maxCacheAge;
	long  m_maxTextDocLen;
	long  m_maxOtherDocLen;
	// in milliseconds. use -1 if none or unknown.
	long  m_crawlDelayMS;

	// for linked list, this is the hammer queue
	class Msg13Request *m_nextLink;

	// if doing spider compression, compute contentHash32 of document
	// downloaded, and if it matches this then send back EDOCUNCHANGED
	long  m_contentHash32;

	// copy of CollectionRec::m_customCrawl: 0, 1 for crawls or 2 for bulks
	char  m_isCustomCrawl;

	// send back error ENOGOODDATE if it does not have one. but if
	// harvestLinks is true, just send back a filtered list of links
	long  m_requireGoodDate:1;
	long  m_harvestLinksIfNoGoodDate:1;
	long  m_compressReply:1;
	long  m_useCompressionProxy:1;
	// if m_forwardDownloadRequest is true then we pick the host to
	// download this url based on the IP address, the idea being that
	// only one host is responsible for downloading from a particular
	// ip address. this keeps webmasters happier so they can block us
	// by just blocking one ip address. and it makes it easier for them
	// to analyze their web logs.
	long  m_forwardDownloadRequest:1;
	long  m_isScraping:1;
	// does url end in /robots.txt ?
	long  m_isRobotsTxt:1;
	// should we call getTestDoc()/addTestDoc() like for the "test" coll
	// and for Test.cpp?
	long  m_useTestCache:1;
	long  m_addToTestCache:1;
	long  m_skipHammerCheck:1;
	long  m_attemptedIframeExpansion:1;
	long  m_crawlDelayFromEnd:1;
	long  m_forEvents:1;
	// does m_url represent a FULL http request mime and NOT just a url?
	// this happens when gigablast is being used like a squid proxy.
	long  m_isSquidProxiedUrl:1;
	long  m_foundInCache:1;
	//long m_testParserEnabled:1;
	//long m_testSpiderEnabled:1;
	//long m_isPageParser:1;
	//long m_isPageInject:1;

	// if we just end up calling HttpServer::getDoc() via calling
	// downloadDoc() then we set this for callback purposes
	class Msg13 *m_parent;
	// on the other hand, if we are called indirectly by handleRequest13()
	// then we set m_udpSlot.
	class UdpSlot   *m_udpSlot;
	class TcpSocket *m_tcpSocket;

	// used for addTestDoc() and caching. msg13 sets this
	long long m_urlHash64;
	long      m_spideredTime;
	// used for caching (and for request table, wait in line table)
	long long m_cacheKey;
	char      m_testDir[32];

	// msg13 sets this too, so you don't have to worry about setting it
	//long m_urlLen;
	// includes \0 termination
	//char m_url[MAX_URL_LEN+1];
	char *ptr_url;
	char *ptr_cookie;
	long  size_url;
	long  size_cookie;
	// string buf for deserializeMsg() function
	char  m_buf[0];

	long getSize() {
		return ((char *)ptr_url-(char *)this) + size_url + size_cookie; };

	// zero it all out
	void reset() {
		//memset (this,0,(char *)m_url - (char *)this + 1);
		memset (this,0,sizeof(Msg13Request));
		m_maxTextDocLen  = -1; // no limit
		m_maxOtherDocLen = -1; // no limit
		m_crawlDelayMS   = -1; // unknown or none
	};
};
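// NOTE (added commentary, not in the original header): the ptr_/size_
// pairs above follow the usual gigablast message convention: the request
// travels over udp as one flat buffer with the fixed-size members first
// and the string bytes (url, then cookie) packed into m_buf[] at the end.
// deserializeMsg() presumably re-points ptr_url/ptr_cookie into m_buf[],
// which is why getSize() can add the pointer offset to size_url and
// size_cookie to get the total serialized length. A minimal sketch of
// filling in the variable-length part (variable names are illustrative):
//
//   Msg13Request r;
//   r.reset();
//   r.ptr_url     = urlStr;               // NUL-terminated url string
//   r.size_url    = gbstrlen(urlStr) + 1; // count the trailing \0
//   r.ptr_cookie  = NULL;
//   r.size_cookie = 0;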
class Msg13 {
public:
	Msg13() ;
	~Msg13();

	void reset() ;

	// register our request handler with g_udpServer (called by main.cpp)
	static bool registerHandler();

	static class RdbCache *getHttpCacheRobots();
	static class RdbCache *getHttpCacheOthers();

	bool getDoc ( Msg13Request *r ,
		      bool isTestColl ,
		      void *state ,
		      void (*callback)(void *state) );

	bool forwardRequest();

	bool gotForwardedReply ( class UdpSlot *slot );
	bool gotFinalReply ( char *reply , long replySize , long replyAllocSize );

	// keep public so wrappers can access
	void *m_state;
	void (*m_callback) (void *state );

	// we now store the uncompressed http reply in here
	char *m_replyBuf;
	long  m_replyBufSize;
	long  m_replyBufAllocSize;

	// point to it
	Msg13Request *m_request;

	//char m_tmpBuf32[32];
};
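// Example (sketch only, added for illustration): a typical caller fills
// in a Msg13Request and hands it to Msg13::getDoc() with a state pointer
// and a callback. The return-value convention assumed here (false means
// the call blocked and the callback fires later) follows the common
// gigablast style but is not spelled out in this header.
//
//   static void gotDocWrapper ( void *state ) {
//       Msg13 *m = (Msg13 *)state;
//       // on success m->m_replyBuf/m->m_replyBufSize hold the
//       // (uncompressed) http reply; on error g_errno is presumably set
//   }
//
//   Msg13        msg13;
//   Msg13Request req;
//   req.reset();
//   req.ptr_url       = url;               // NUL-terminated url
//   req.size_url      = gbstrlen(url) + 1; // include the \0
//   req.m_urlIp       = ip;
//   req.m_maxCacheAge = 3600;              // allow a cached reply
//   if ( ! msg13.getDoc ( &req , false , &msg13 , gotDocWrapper ) )
//       return false; // blocked; gotDocWrapper will be called when done
//   // getDoc() returned true: reply (or error) is available right away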
bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;

extern RdbCache s_hammerCache;

#endif