/ScrapySharp/Network/WebPage.cs

https://bitbucket.org/rflechner/scrapysharp · C# · 335 lines · 283 code · 51 blank · 1 comment · 34 complexity · 24e3a350b9277818a8e987d4e22b2f55 MD5 · raw file

  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Net;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using HtmlAgilityPack;
  8. using ScrapySharp.Cache;
  9. using ScrapySharp.Extensions;
  10. using System.Linq;
  11. using ScrapySharp.Html;
  12. using ScrapySharp.Html.Forms;
  13. namespace ScrapySharp.Network
  14. {
  /// <summary>
  /// A page fetched by a <see cref="ScrapingBrowser"/>: the decoded text, its parsed HTML tree and,
  /// optionally, the resources (images, scripts, stylesheets) the page references.
  /// </summary>
  public class WebPage
  {
      // Tag name -> attribute that carries the URL of a resource to download.
      private static readonly Dictionary<string, string> resourceTags = new Dictionary<string, string>
      {
          {"img", "src"},
          {"script", "src"},
          {"link", "href"},
      };

      // Matches CSS url(...) tokens; the "url" group is everything between the parentheses,
      // which may still include surrounding quotes and trailing whitespace.
      private static readonly Regex urlInCssRegex = new Regex(@"url \s* [(] \s* (?<url>[^)\r\n]+) \s* [)]",
          RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

      private readonly ScrapingBrowser browser;
      private readonly Uri absoluteUrl;
      private readonly RawRequest rawRequest;
      private readonly RawResponse rawResponse;
      private readonly bool autoDetectCharsetEncoding;
      private readonly List<WebResource> resources;
      private string content;
      private HtmlNode html;
      private string baseUrl;

      /// <summary>
      /// Encoding used to decode the response body. Starts as the encoding passed to the constructor and
      /// is replaced when charset auto-detection finds a usable declaration in the page.
      /// </summary>
      public Encoding Encoding { get; private set; }

      /// <summary>
      /// Decodes and parses the response body and, when requested, downloads the referenced resources.
      /// </summary>
      /// <param name="browser">Browser used for resource downloads and handed to forms found on the page.</param>
      /// <param name="absoluteUrl">Address of the page; relative resource references are resolved against it.</param>
      /// <param name="autoDownloadPagesResources">When true, the base URL is computed and resources are downloaded immediately.</param>
      /// <param name="rawRequest">Request that produced the page.</param>
      /// <param name="rawResponse">Response whose body is decoded into <see cref="Content"/>.</param>
      /// <param name="encoding">Initial encoding used to decode the body.</param>
      /// <param name="autoDetectCharsetEncoding">When true, a charset declared in a meta tag overrides <paramref name="encoding"/>.</param>
      /// <exception cref="ArgumentNullException"><paramref name="rawResponse"/> or <paramref name="encoding"/> is null.</exception>
      public WebPage(ScrapingBrowser browser, Uri absoluteUrl, bool autoDownloadPagesResources, RawRequest rawRequest, RawResponse rawResponse,
          Encoding encoding, bool autoDetectCharsetEncoding)
      {
          // Both are dereferenced a few lines below; fail with a named argument instead of a bare NullReferenceException.
          if (rawResponse == null)
              throw new ArgumentNullException("rawResponse");
          if (encoding == null)
              throw new ArgumentNullException("encoding");

          this.browser = browser;
          this.absoluteUrl = absoluteUrl;
          this.rawRequest = rawRequest;
          this.rawResponse = rawResponse;
          this.autoDetectCharsetEncoding = autoDetectCharsetEncoding;

          Encoding = encoding;
          content = Encoding.GetString(rawResponse.Body);
          resources = new List<WebResource>();

          LoadHtml();

          if (autoDownloadPagesResources)
          {
              LoadBaseUrl();
              DownloadResources();
          }
      }

      /// <summary>Whether a charset declared in the page was allowed to override the initial encoding.</summary>
      public bool AutoDetectCharsetEncoding
      {
          get { return autoDetectCharsetEncoding; }
      }

      /// <summary>Request that produced this page.</summary>
      public RawRequest RawRequest
      {
          get { return rawRequest; }
      }

      /// <summary>Response this page was built from.</summary>
      public RawResponse RawResponse
      {
          get { return rawResponse; }
      }

      /// <summary>Browser this page was created with.</summary>
      public ScrapingBrowser Browser
      {
          get { return browser; }
      }

      /// <summary>Address of the page as passed to the constructor.</summary>
      public Uri AbsoluteUrl
      {
          get { return absoluteUrl; }
      }

      /// <summary>Response body decoded with <see cref="Encoding"/>.</summary>
      public string Content
      {
          get { return content; }
      }

      /// <summary>Resources downloaded for this page so far.</summary>
      public List<WebResource> Resources
      {
          get { return resources; }
      }

      /// <summary>Parsed HTML of the page. Remains null if parsing failed.</summary>
      public HtmlNode Html
      {
          get { return html; }
      }

      /// <summary>
      /// Base URL used when resolving resource references. It is only computed when resources are
      /// downloaded from the constructor; otherwise it stays null.
      /// </summary>
      public string BaseUrl
      {
          get { return baseUrl; }
      }

      /// <summary>Finds elements with the given tag using the supplied search strategy.</summary>
      /// <param name="tag">Tag name passed to the element finder.</param>
      /// <param name="by">Strategy that creates the element finder.</param>
      public IEnumerable<HtmlNode> Find(string tag, By by)
      {
          return @by.CreateElementFinder(html, tag).FindElements();
      }

      /// <summary>Finds anchor elements with the supplied search strategy and wraps each in a <see cref="HyperLink"/>.</summary>
      public IEnumerable<HyperLink> FindLinks(By by)
      {
          return Find("a", by).Select(a => new HyperLink(this, a));
      }

      /// <summary>Returns the first form whose <c>name</c> attribute equals <paramref name="name"/>, or null when there is none.</summary>
      public PageWebForm FindForm(string name)
      {
          var node = Html.Descendants("form")
              .FirstOrDefault(n => n.GetAttributeValue("name", string.Empty) == name);

          return node == null ? null : new PageWebForm(node, browser);
      }

      /// <summary>Returns the first form whose id equals <paramref name="id"/>, or null when there is none.</summary>
      public PageWebForm FindFormById(string id)
      {
          var node = Html.Descendants("form").FirstOrDefault(f => f.Id == id);

          return node == null ? null : new PageWebForm(node, browser);
      }

      /// <summary>Returns the decoded page content.</summary>
      public override string ToString()
      {
          return content;
      }

      /// <summary>Converts a page to its decoded content; a null page converts to null.</summary>
      public static implicit operator string(WebPage page)
      {
          return page == null ? null : page.content;
      }

      /// <summary>
      /// Collects the raw (unresolved) values of the <c>src</c>/<c>href</c> attributes of the img, script
      /// and link elements of the page.
      /// </summary>
      public List<string> GetResourceUrls()
      {
          var resourceUrls = new List<string>();

          foreach (var resourceTag in resourceTags)
          {
              var attributeName = resourceTag.Value;
              var sources = html.Descendants(resourceTag.Key)
                  .Where(e => e.Attributes.Any(a => a.Name == attributeName))
                  .Select(e => e.Attributes[attributeName].Value)
                  .ToArray();

              resourceUrls.AddRange(sources);
          }

          return resourceUrls;
      }

      /// <summary>
      /// Writes the page and its downloaded resources into <paramref name="path"/>. Existing files directly
      /// inside the directory are deleted first. Resources are stored under generated names and the
      /// page is written as <c>page.html</c>.
      /// </summary>
      /// <remarks>
      /// The in-memory HTML tree is modified: attributes that pointed at a saved resource are changed to
      /// the generated file name.
      /// </remarks>
      /// <param name="path">Target directory; created when it does not exist.</param>
      public void SaveSnapshot(string path)
      {
          if (!browser.AutoDownloadPagesResources)
              DownloadResources();

          if (!Directory.Exists(path))
              Directory.CreateDirectory(path);
          else
              Directory.GetFiles(path).ToList().ForEach(File.Delete);

          foreach (var resource in Resources)
          {
              var fileName = Guid.NewGuid().ToString("N");
              var targetFile = Path.Combine(path, fileName);

              resource.Content.Position = 0;
              RewriteHtml(resource, fileName);

              if (IsStylesheet(resource.ContentType))
              {
                  var textContent = resource.GetTextContent();
                  textContent = RewriteCssUrls(path, textContent, resource.AbsoluteUrl.ToString());
                  File.WriteAllText(targetFile, textContent);
              }
              else
              {
                  File.WriteAllBytes(targetFile, resource.Content.ToArray());
              }
          }

          var outerHtml = RewriteCssUrls(path, Html.OuterHtml, AbsoluteUrl.ToString());
          File.WriteAllText(Path.Combine(path, "page.html"), outerHtml);
      }

      /// <summary>
      /// Parses <see cref="Content"/> and, when auto-detection is enabled and the page declares a charset,
      /// re-decodes the body with that charset and parses it again.
      /// </summary>
      private void LoadHtml()
      {
          try
          {
              html = content.ToHtmlNode();

              if (!autoDetectCharsetEncoding)
                  return;

              var charset = DetectCharset(html);
              if (string.IsNullOrEmpty(charset))
                  return;

              // GetEncoding throws for an unknown charset name; the catch below then keeps the first parse.
              Encoding = Encoding.GetEncoding(charset);
              content = Encoding.GetString(rawResponse.Body);
              html = content.ToHtmlNode();
          }
          catch
          {
              // Deliberately best effort: a malformed document or an unknown charset must not prevent
              // the page from being created. Whatever was parsed before the failure is kept.
          }
      }

      /// <summary>
      /// Looks for a charset declaration in the meta elements: first a <c>charset</c> attribute, then a
      /// <c>charset=</c> parameter inside an <c>http-equiv="content-type"</c> declaration.
      /// </summary>
      /// <returns>The declared charset name, or null when none is declared.</returns>
      private static string DetectCharset(HtmlNode document)
      {
          const string charsetParameter = "charset=";

          var metas = document.Descendants("meta").ToArray();

          var declared = metas
              .Select(meta => meta.GetAttributeValue("charset", string.Empty).Trim())
              .FirstOrDefault(v => !string.IsNullOrEmpty(v));
          if (declared != null)
              return declared;

          foreach (var meta in metas)
          {
              // Attribute values are case-insensitive here; pages commonly write "Content-Type".
              var httpEquiv = meta.GetAttributeValue("http-equiv", string.Empty).Trim();
              if (!string.Equals(httpEquiv, "content-type", StringComparison.OrdinalIgnoreCase))
                  continue;

              var contentTypeContent = meta.GetAttributeValue("content", string.Empty);
              var position = contentTypeContent.IndexOf(charsetParameter, StringComparison.OrdinalIgnoreCase);
              if (position == -1)
                  continue;

              var candidate = contentTypeContent.Substring(position + charsetParameter.Length);

              // Drop any parameter that follows the charset, then surrounding whitespace and quotes.
              var end = candidate.IndexOf(';');
              if (end != -1)
                  candidate = candidate.Substring(0, end);
              candidate = candidate.Trim().Trim('"', '\'').Trim();

              if (candidate.Length > 0)
                  return candidate;
          }

          return null;
      }

      /// <summary>
      /// Sets <see cref="BaseUrl"/> from the first non-empty <c>&lt;base href&gt;</c> of the page, falling
      /// back to the scheme, host and (non-default) port of the page address.
      /// </summary>
      private void LoadBaseUrl()
      {
          var baseAttr = html.Descendants("base")
              .Where(e => e.Attributes.Any(a => a.Name == "href"))
              .Select(e => e.Attributes["href"].Value)
              // An empty href carries no base; using it would make every combined URL invalid.
              .FirstOrDefault(v => v != null && v.Trim().Length > 0);

          if (baseAttr != null)
          {
              baseUrl = baseAttr;
              return;
          }

          baseUrl = string.Format("{0}://{1}", absoluteUrl.Scheme, absoluteUrl.Host);
          if (!absoluteUrl.IsDefaultPort)
              baseUrl += ":" + absoluteUrl.Port;
      }

      /// <summary>
      /// Downloads every resource referenced by the page that is not already in the resource storage and
      /// adds it to <see cref="Resources"/>. Individual failures are ignored.
      /// </summary>
      private void DownloadResources()
      {
          foreach (var resourceUrl in GetResourceUrls())
          {
              var url = GetFullResourceUrl(resourceUrl, absoluteUrl);

              // A reference that cannot be resolved is skipped instead of aborting the whole page.
              if (url == null)
                  continue;

              if (WebResourceStorage.Current.Exists(url.ToString()))
                  continue;

              try
              {
                  WebResource resource = browser.DownloadWebResource(url);
                  resources.Add(resource);

                  if (!resource.ForceDownload || !string.IsNullOrEmpty(resource.LastModified))
                      WebResourceStorage.Current.Save(resource);
              }
              catch
              {
                  // Best effort: one resource that cannot be downloaded or stored must not stop the others.
              }
          }
      }

      /// <summary>
      /// Resolves a resource reference found in the page or in a stylesheet to an absolute URL.
      /// </summary>
      /// <param name="resourceUrl">Reference as written in the document; absolute, protocol-relative or relative.</param>
      /// <param name="root">Address the reference is relative to.</param>
      /// <returns>The absolute URL, or null when the reference is empty or cannot be resolved.</returns>
      private Uri GetFullResourceUrl(string resourceUrl, Uri root)
      {
          if (resourceUrl == null)
              return null;

          resourceUrl = resourceUrl.Trim();
          if (resourceUrl.Length == 0)
              return null;

          // "//host/path" inherits the scheme of the document it appears in.
          if (resourceUrl.StartsWith("//", StringComparison.Ordinal))
          {
              Uri protocolRelative;
              return Uri.TryCreate(root.Scheme + ":" + resourceUrl, UriKind.Absolute, out protocolRelative)
                  ? protocolRelative
                  : null;
          }

          Uri parsed;
          if (!Uri.TryCreate(resourceUrl, UriKind.RelativeOrAbsolute, out parsed))
              return null;

          if (parsed.IsAbsoluteUri)
              return parsed;

          try
          {
              if (baseUrl == null)
                  return root.Combine(resourceUrl);

              var isPathReference = resourceUrl.StartsWith("/", StringComparison.Ordinal)
                                    || resourceUrl.StartsWith("./", StringComparison.Ordinal)
                                    || resourceUrl.StartsWith("../", StringComparison.Ordinal);
              if (isPathReference)
                  return baseUrl.CombineUrl(resourceUrl);

              // Directory of the root document: all segments except the leading "/" and the final one.
              // Uri.Segments entries already end with '/', so they are concatenated rather than joined.
              var path = string.Concat(root.Segments.Take(root.Segments.Length - 1).Skip(1).ToArray());
              return baseUrl.CombineUrl(path).Combine(resourceUrl);
          }
          catch (UriFormatException)
          {
              // NOTE(review): assumes the Combine/CombineUrl helpers report invalid input with UriFormatException - confirm.
              return null;
          }
      }

      /// <summary>
      /// Tells whether a content type denotes a stylesheet, ignoring parameters such as
      /// <c>; charset=utf-8</c>.
      /// </summary>
      private static bool IsStylesheet(string contentType)
      {
          if (string.IsNullOrEmpty(contentType))
              return false;

          var parameters = contentType.IndexOf(';');
          var mediaType = (parameters == -1 ? contentType : contentType.Substring(0, parameters)).Trim();

          return mediaType.EndsWith("css", StringComparison.OrdinalIgnoreCase);
      }

      /// <summary>
      /// Points every img/script/link attribute that refers to <paramref name="resource"/> at
      /// <paramref name="fileName"/>. An attribute refers to the resource when the resource address ends
      /// with the attribute value.
      /// </summary>
      private void RewriteHtml(WebResource resource, string fileName)
      {
          var resourceAddress = resource.AbsoluteUrl.ToString();

          foreach (var resourceTag in resourceTags)
          {
              var attributeName = resourceTag.Value;
              var nodes = html.Descendants(resourceTag.Key)
                  .Where(e => e.Attributes.Any(a => a.Name == attributeName))
                  .Where(e =>
                  {
                      var reference = e.Attributes[attributeName].Value;

                      // Every string ends with the empty string, so an empty reference must not count as a match.
                      return !string.IsNullOrEmpty(reference)
                             && resourceAddress.EndsWith(reference, StringComparison.Ordinal);
                  })
                  .ToArray();

              foreach (var node in nodes)
              {
                  node.SetAttributeValue(attributeName, fileName);
              }
          }
      }

      /// <summary>
      /// Downloads the targets of the <c>url(...)</c> tokens in <paramref name="textContent"/> into
      /// <paramref name="path"/> and rewrites each token to the generated local file name.
      /// </summary>
      /// <remarks>
      /// Tokens whose target cannot be downloaded, and <c>data:</c> URIs, are left unchanged. Each distinct
      /// URL is downloaded once.
      /// </remarks>
      /// <param name="path">Directory the downloaded files are written to.</param>
      /// <param name="textContent">CSS or HTML text to rewrite.</param>
      /// <param name="rootUrl">Address of the document containing the text; references are resolved against its directory.</param>
      /// <returns>The rewritten text.</returns>
      private string RewriteCssUrls(string path, string textContent, string rootUrl)
      {
          if (string.IsNullOrEmpty(textContent))
              return textContent;

          // Everything up to the last '/' is the directory relative references are resolved against.
          var lastSlash = rootUrl.LastIndexOf('/');
          Uri root;
          if (lastSlash <= 0 || !Uri.TryCreate(rootUrl.Substring(0, lastSlash), UriKind.Absolute, out root))
              return textContent;

          // URL -> local file name; a null value records a failed download so it is not retried.
          var localNames = new Dictionary<string, string>(StringComparer.Ordinal);

          return urlInCssRegex.Replace(textContent, match =>
          {
              var urlGroup = match.Groups["url"];

              // The capture still contains optional quotes and trailing whitespace.
              var url = urlGroup.Value.Trim().Trim('"', '\'').Trim();
              if (url.Length == 0 || url.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
                  return match.Value;

              string localName;
              if (!localNames.TryGetValue(url, out localName))
              {
                  localName = SaveCssResource(path, url, root);
                  localNames[url] = localName;
              }

              if (localName == null)
                  return match.Value;

              // Replace only the captured reference, keeping the surrounding "url(" and ")" intact.
              var offset = urlGroup.Index - match.Index;
              return match.Value.Substring(0, offset) + localName + match.Value.Substring(offset + urlGroup.Length);
          });
      }

      /// <summary>
      /// Downloads one resource referenced from a stylesheet and writes it into <paramref name="path"/>
      /// under a generated name.
      /// </summary>
      /// <returns>The generated file name, or null when the resource could not be resolved, downloaded or written.</returns>
      private string SaveCssResource(string path, string url, Uri root)
      {
          try
          {
              var resourceUrl = GetFullResourceUrl(url, root);
              if (resourceUrl == null)
                  return null;

              var image = browser.DownloadWebResource(resourceUrl);
              var imageId = Guid.NewGuid().ToString("N");
              File.WriteAllBytes(Path.Combine(path, imageId), image.Content.ToArray());

              return imageId;
          }
          catch
          {
              // Best effort: a missing image must not abort the snapshot; the original reference is kept.
              return null;
          }
      }
  }
  284. }