/Msg13.h

https://github.com/privacore/open-source-search-engine · C Header · 177 lines · 92 code · 45 blank · 40 comment · 0 complexity · 528cd0c4cc38a42907882c9f84e69e15 MD5 · raw file

  1. // Matt Wells, copyright Oct 2001
  2. // . ask another host to download a url for you
  3. // . the remote host will also use a cache if m_maxCacheAge > 0
  4. // . used for downloading and caching robots.txt
  5. // . if m_compressReply then the host compressed the http reply before
  6. // sending it back to you via udp
  7. #ifndef GB_MSG13_H
  8. #define GB_MSG13_H
  9. #include "SpiderProxy.h" // MAXUSERNAMEPWD
  10. #include "collnum_t.h"
  11. #include <stddef.h>
  12. // max crawl delay from proxy backoff of 1 minute (60 seconds)
  13. #define MAX_PROXYCRAWLDELAYMS 60000
  14. class RdbCache;
  15. void resetMsg13Caches ( ) ;
  16. bool printHammerQueueTable ( SafeBuf *sb ) ;
  17. class Msg13Request {
  18. public:
  19. // the top portion of Msg13Request is sent to handleRequest54()
  20. // in SpiderProxy.cpp to get and return proxies, as well as to
  21. // ban proxies.
  22. int32_t getProxyRequestSize() const { return offsetof(Msg13Request,m_lastHack); }
  23. int32_t m_urlIp;
  24. int32_t m_lbId; // loadbucket id
  25. // the http proxy to use to download
  26. int32_t m_proxyIp;
  27. uint16_t m_proxyPort;
  28. int32_t m_banProxyIp;
  29. uint16_t m_banProxyPort;
  30. char m_opCode;
  31. char m_lastHack;
  32. collnum_t m_collnum;
  33. // not part of the proxy request, but set from ProxyReply:
  34. int32_t m_numBannedProxies;
  35. // . if using proxies, how many proxies have we tried to download
  36. // this url through
  37. // . used internally in Msg13.cpp
  38. int32_t m_proxyTries;
  39. // if using proxies, did host #0 tell us there were more to try if
  40. // this one did not work out?
  41. bool m_hasMoreProxiesToTry;
  42. // we call this function after the imposed crawl-delay is over
  43. void (*m_hammerCallback)(class Msg13Request *r);
  44. int64_t m_urlHash48;
  45. int32_t m_firstIp;
  46. // when it was stored in the hammer queue
  47. int64_t m_stored;
  48. // a tmp hack var referencing into m_url[] below
  49. char *m_proxiedUrl;
  50. int32_t m_proxiedUrlLen;
  51. int64_t m_downloadStartTimeMS;
  52. char m_niceness;
  53. int32_t m_ifModifiedSince;
  54. int32_t m_maxCacheAge;
  55. int32_t m_maxTextDocLen;
  56. int32_t m_maxOtherDocLen;
  57. // in milliseconds. use -1 if none or unknown.
  58. int32_t m_crawlDelayMS;
  59. // for linked list, this is the hammer queue
  60. class Msg13Request *m_nextLink;
  61. char m_proxyUsernamePwdAuth[MAXUSERNAMEPWD];
  62. // if doing spider compression, compute contentHash32 of document
  63. // downloaded, and if it matches this then send back EDOCUNCHANGED
  64. int32_t m_contentHash32;
  65. unsigned m_compressReply:1;
  66. unsigned m_useCompressionProxy:1;
  67. // does url end in /robots.txt ?
  68. unsigned m_isRobotsTxt:1;
  69. unsigned m_skipHammerCheck:1;
  70. unsigned m_attemptedIframeExpansion:1;
  71. unsigned m_crawlDelayFromEnd:1;
  72. // does m_url represent a FULL http request mime and NOT just a url?
  73. // this happens when gigablast is being used like a squid proxy.
  74. unsigned m_isSquidProxiedUrl:1;
  75. unsigned m_forceUseFloaters:1;
  76. unsigned m_wasInTableBeforeStarting:1;
  77. // if we just end up calling HttpServer::getDoc() via calling
  78. // downloadDoc() then we set this for callback purposes
  79. class Msg13 *m_parent;
  80. // on the other hand, if we are called indirectly by handleRequest13()
  81. // then we set m_udpSlot.
  82. class UdpSlot *m_udpSlot;
  83. // used for addTestDoc() and caching. msg13 sets this
  84. int64_t m_urlHash64;
  85. int32_t m_spideredTime;
  86. // used for caching (and for request table, wait in line table)
  87. int64_t m_cacheKey;
  88. char *ptr_url;
  89. char *ptr_cookie;
  90. int32_t size_url;
  91. int32_t size_cookie;
  92. // variable data starts here
  93. int32_t getSize() const {
  94. return offsetof(Msg13Request,ptr_url) + size_url + size_cookie;
  95. }
  96. // zero it all out
  97. void reset() {
  98. //memset (this,0,(char *)m_url - (char *)this + 1);
  99. memset (this,0,sizeof(Msg13Request));
  100. m_maxTextDocLen = -1; // no limit
  101. m_maxOtherDocLen = -1; // no limit
  102. m_crawlDelayMS = -1; // unknown or none
  103. m_collnum = (collnum_t)-1;
  104. }
  105. };
  106. class Msg13 {
  107. public:
  108. Msg13() ;
  109. ~Msg13();
  110. void reset() ;
  111. // register our request handler with g_udpServer (called by main.cpp)
  112. static bool registerHandler();
  113. static RdbCache *getHttpCacheRobots();
  114. static RdbCache *getHttpCacheOthers();
  115. bool getDoc ( Msg13Request *r ,
  116. void *state ,
  117. void (*callback)(void *state) );
  118. bool forwardRequest();
  119. bool gotForwardedReply ( class UdpSlot *slot );
  120. bool gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize);
  121. // keep public so wrappers can access
  122. void *m_state;
  123. void (* m_callback) (void *state );
  124. // we now store the uncompressed http reply in here
  125. char *m_replyBuf;
  126. int32_t m_replyBufSize;
  127. int32_t m_replyBufAllocSize;
  128. // point to it
  129. Msg13Request *m_request;
  130. };
  131. extern RdbCache s_hammerCache;
  132. #endif // GB_MSG13_H