PageRenderTime 1028ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/Abot/Core/PageRequester.cs

https://gitlab.com/kokeiro001/abot
C# | 259 lines | 141 code | 41 blank | 77 comment | 21 complexity | 91599c5dbb38c15d7975a3412edf98ee MD5 | raw file
  1. using Abot.Poco;
  2. using log4net;
  3. using System;
  4. using System.CodeDom;
  5. using System.Linq;
  6. using System.Net;
  7. using System.Reflection;
  8. using System.Threading.Tasks;
  9. using log4net.Core;
  10. namespace Abot.Core
  11. {
  12. public interface IPageRequester : IDisposable
  13. {
  14. /// <summary>
  15. /// Make an http web request to the url and download its content
  16. /// </summary>
  17. CrawledPage MakeRequest(Uri uri);
  18. /// <summary>
  19. /// Make an http web request to the url and download its content based on the param func decision
  20. /// </summary>
  21. CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent);
  22. ///// <summary>
  23. ///// Asynchronously make an http web request to the url and download its content based on the param func decision
  24. ///// </summary>
  25. //Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent);
  26. }
  27. [Serializable]
  28. public class PageRequester : IPageRequester
  29. {
  30. static ILog _logger = LogManager.GetLogger("AbotLogger");
  31. protected CrawlConfiguration _config;
  32. protected IWebContentExtractor _extractor;
  33. protected CookieContainer _cookieContainer = new CookieContainer();
  34. public PageRequester(CrawlConfiguration config)
  35. : this(config, null)
  36. {
  37. }
  38. public PageRequester(CrawlConfiguration config, IWebContentExtractor contentExtractor)
  39. {
  40. if (config == null)
  41. throw new ArgumentNullException("config");
  42. _config = config;
  43. if (_config.HttpServicePointConnectionLimit > 0)
  44. ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit;
  45. if (!_config.IsSslCertificateValidationEnabled)
  46. ServicePointManager.ServerCertificateValidationCallback +=
  47. (sender, certificate, chain, sslPolicyErrors) => true;
  48. _extractor = contentExtractor ?? new WebContentExtractor();
  49. }
  50. /// <summary>
  51. /// Make an http web request to the url and download its content
  52. /// </summary>
  53. public virtual CrawledPage MakeRequest(Uri uri)
  54. {
  55. return MakeRequest(uri, (x) => new CrawlDecision { Allow = true });
  56. }
  57. /// <summary>
  58. /// Make an http web request to the url and download its content based on the param func decision
  59. /// </summary>
  60. public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
  61. {
  62. if (uri == null)
  63. throw new ArgumentNullException("uri");
  64. CrawledPage crawledPage = new CrawledPage(uri);
  65. HttpWebRequest request = null;
  66. HttpWebResponse response = null;
  67. try
  68. {
  69. request = BuildRequestObject(uri);
  70. crawledPage.RequestStarted = DateTime.Now;
  71. response = (HttpWebResponse)request.GetResponse();
  72. ProcessResponseObject(response);
  73. }
  74. catch (WebException e)
  75. {
  76. crawledPage.WebException = e;
  77. if (e.Response != null)
  78. response = (HttpWebResponse)e.Response;
  79. _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
  80. _logger.Debug(e);
  81. }
  82. catch (Exception e)
  83. {
  84. _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
  85. _logger.Debug(e);
  86. }
  87. finally
  88. {
  89. try
  90. {
  91. crawledPage.HttpWebRequest = request;
  92. crawledPage.RequestCompleted = DateTime.Now;
  93. if (response != null)
  94. {
  95. crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);
  96. CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
  97. if (shouldDownloadContentDecision.Allow)
  98. {
  99. crawledPage.DownloadContentStarted = DateTime.Now;
  100. crawledPage.Content = _extractor.GetContent(response);
  101. crawledPage.DownloadContentCompleted = DateTime.Now;
  102. }
  103. else
  104. {
  105. _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
  106. }
  107. response.Close();//Should already be closed by _extractor but just being safe
  108. }
  109. }
  110. catch (Exception e)
  111. {
  112. _logger.DebugFormat("Error occurred finalizing requesting url [{0}]", uri.AbsoluteUri);
  113. _logger.Debug(e);
  114. }
  115. }
  116. return crawledPage;
  117. }
  118. ///// <summary>
  119. ///// Asynchronously make an http web request to the url and download its content based on the param func decision
  120. ///// </summary>
  121. //public Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
  122. //{
  123. // if (uri == null)
  124. // throw new ArgumentNullException("uri");
  125. // CrawledPage crawledPage = new CrawledPage(uri);
  126. // crawledPage.RequestStarted = DateTime.Now;
  127. // HttpWebRequest request = BuildRequestObject(uri);
  128. // HttpWebResponse response = null;
  129. // crawledPage.HttpWebRequest = request;
  130. // crawledPage.RequestStarted = DateTime.Now;
  131. // Task<WebResponse> task = Task.Factory.FromAsync(
  132. // request.BeginGetResponse,
  133. // asyncResult => request.EndGetResponse(asyncResult),
  134. // null);
  135. // return task.ContinueWith((Task<WebResponse> t) =>
  136. // {
  137. // crawledPage.RequestCompleted = DateTime.Now;
  138. // if (t.IsFaulted)
  139. // {
  140. // //handle error
  141. // Exception firstException = t.Exception.InnerExceptions.First();
  142. // crawledPage.WebException = firstException as WebException;
  143. // if (crawledPage.WebException != null && crawledPage.WebException.Response != null)
  144. // response = (HttpWebResponse)crawledPage.WebException.Response;
  145. // _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
  146. // _logger.Debug(crawledPage.WebException);
  147. // }
  148. // else
  149. // {
  150. // ProcessResponseObject(response);
  151. // response = (HttpWebResponse)t.Result;
  152. // }
  153. // if (response != null)
  154. // {
  155. // crawledPage.HttpWebResponse = response;
  156. // CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
  157. // if (shouldDownloadContentDecision.Allow)
  158. // {
  159. // crawledPage.DownloadContentStarted = DateTime.Now;
  160. // crawledPage.Content = _extractor.GetContent(response);
  161. // crawledPage.DownloadContentCompleted = DateTime.Now;
  162. // }
  163. // else
  164. // {
  165. // _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri,
  166. // shouldDownloadContentDecision.Reason);
  167. // }
  168. // response.Close(); //Should already be closed by _extractor but just being safe
  169. // }
  170. // return crawledPage;
  171. // });
  172. //}
  173. protected virtual HttpWebRequest BuildRequestObject(Uri uri)
  174. {
  175. HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
  176. request.AllowAutoRedirect = _config.IsHttpRequestAutoRedirectsEnabled;
  177. request.UserAgent = _config.UserAgentString;
  178. request.Accept = "*/*";
  179. if (_config.HttpRequestMaxAutoRedirects > 0)
  180. request.MaximumAutomaticRedirections = _config.HttpRequestMaxAutoRedirects;
  181. if (_config.IsHttpRequestAutomaticDecompressionEnabled)
  182. request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
  183. if (_config.HttpRequestTimeoutInSeconds > 0)
  184. request.Timeout = _config.HttpRequestTimeoutInSeconds * 1000;
  185. if (_config.IsSendingCookiesEnabled)
  186. request.CookieContainer = _cookieContainer;
  187. //Supposedly this does not work... https://github.com/sjdirect/abot/issues/122
  188. //if (_config.IsAlwaysLogin)
  189. //{
  190. // request.Credentials = new NetworkCredential(_config.LoginUser, _config.LoginPassword);
  191. // request.UseDefaultCredentials = false;
  192. //}
  193. if (_config.IsAlwaysLogin)
  194. {
  195. string credentials = Convert.ToBase64String(System.Text.Encoding.ASCII.GetBytes(_config.LoginUser + ":" + _config.LoginPassword));
  196. request.Headers[HttpRequestHeader.Authorization] = "Basic " + credentials;
  197. }
  198. return request;
  199. }
  200. protected virtual void ProcessResponseObject(HttpWebResponse response)
  201. {
  202. if (response != null && _config.IsSendingCookiesEnabled)
  203. {
  204. CookieCollection cookies = response.Cookies;
  205. _cookieContainer.Add(cookies);
  206. }
  207. }
  208. public void Dispose()
  209. {
  210. if (_extractor != null)
  211. {
  212. _extractor.Dispose();
  213. }
  214. _cookieContainer = null;
  215. _config = null;
  216. }
  217. }
  218. }