WebPage.cs | searchcode

/ScrapySharp/Network/WebPage.cs

https://bitbucket.org/rflechner/scrapysharp · C# · 335 lines · 283 code · 51 blank · 1 comment · 34 complexity · 24e3a350b9277818a8e987d4e22b2f55 MD5 · raw file

using System;

using System.Collections.Generic;

using System.IO;

using System.Net;

using System.Text;

using System.Text.RegularExpressions;

using HtmlAgilityPack;

using ScrapySharp.Cache;

using ScrapySharp.Extensions;

using System.Linq;

using ScrapySharp.Html;

using ScrapySharp.Html.Forms;



namespace ScrapySharp.Network

{

    public class WebPage

    {

        // Browser associated with this page; it is handed to the forms created by
        // FindForm/FindFormById and performs the resource downloads.
        private readonly ScrapingBrowser browser;

        // URL of this page; used to resolve resource URLs and to build the fallback base URL.
        private readonly Uri absoluteUrl;

        // Request passed to the constructor; only exposed through the RawRequest property.
        private readonly RawRequest rawRequest;

        // Response passed to the constructor; its Body bytes are decoded into 'content'.
        private readonly RawResponse rawResponse;

        // When true, LoadHtml looks for a charset declared in <meta> tags and re-decodes the body with it.
        private readonly bool autoDetectCharsetEncoding;

        // Response body decoded to text with the current Encoding.
        private string content;

        // Resources downloaded for this page by DownloadResources.
        private readonly List<WebResource> resources;

        // Node produced by parsing 'content' (content.ToHtmlNode()).
        private HtmlNode html;

        // Base URL set by LoadBaseUrl; stays null until that method has run.
        private string baseUrl;



        // Maps a tag name to the attribute on that tag that holds a resource URL.
        private static readonly Dictionary<string, string> resourceTags = new Dictionary<string, string> 

            {

                {"img", "src"},

                {"script", "src"},

                {"link", "href"},

            };

        // Encoding used to decode the response body; LoadHtml may replace it with a charset declared by the page.
        public Encoding Encoding { get; private set; }



        /// <summary>
        /// Creates a page from a raw response: decodes the body with <paramref name="encoding"/>,
        /// parses it, and optionally downloads the resources referenced by the markup.
        /// </summary>
        /// <param name="browser">Browser associated with the page.</param>
        /// <param name="absoluteUrl">URL of the page.</param>
        /// <param name="autoDownloadPagesResources">When true, the base URL is computed and resources are downloaded immediately.</param>
        /// <param name="rawRequest">Request exposed through <see cref="RawRequest"/>.</param>
        /// <param name="rawResponse">Response whose <c>Body</c> is decoded into the page content.</param>
        /// <param name="encoding">Encoding used for the initial decoding of the body.</param>
        /// <param name="autoDetectCharsetEncoding">When true, a charset declared in the markup may replace <paramref name="encoding"/>.</param>
        public WebPage(ScrapingBrowser browser, Uri absoluteUrl, bool autoDownloadPagesResources, RawRequest rawRequest, RawResponse rawResponse,
            Encoding encoding, bool autoDetectCharsetEncoding)
        {
            // Plain state captured from the caller.
            this.autoDetectCharsetEncoding = autoDetectCharsetEncoding;
            this.rawResponse = rawResponse;
            this.rawRequest = rawRequest;
            this.absoluteUrl = absoluteUrl;
            this.browser = browser;
            Encoding = encoding;

            // Decode first, then parse; LoadHtml may re-decode with a detected charset.
            content = Encoding.GetString(rawResponse.Body);
            resources = new List<WebResource>();
            LoadHtml();

            if (!autoDownloadPagesResources)
                return;

            LoadBaseUrl();
            DownloadResources();
        }



        /// <summary>
        /// Parses <c>content</c> into <c>html</c>. When charset auto-detection is enabled and the
        /// markup declares a charset, the response body is decoded again with that charset and re-parsed.
        /// </summary>
        /// <remarks>
        /// Parsing is best effort: any failure (including an unknown charset name passed to
        /// <see cref="System.Text.Encoding.GetEncoding(string)"/>) is swallowed, leaving whatever was
        /// parsed so far in place.
        /// </remarks>
        private void LoadHtml()
        {
            try
            {
                html = content.ToHtmlNode();

                if (!autoDetectCharsetEncoding)
                    return;

                var charset = DetectDeclaredCharset(html);
                if (string.IsNullOrEmpty(charset))
                    return;

                // Re-decode the raw bytes with the declared charset and parse again.
                Encoding = Encoding.GetEncoding(charset);
                content = Encoding.GetString(rawResponse.Body);
                html = content.ToHtmlNode();
            }
            catch (Exception)
            {
                // Deliberately best effort: a page that cannot be parsed or declares an
                // unsupported charset must not make the construction of the page fail.
            }
        }

        /// <summary>
        /// Returns the charset declared by the document, or null when none is declared.
        /// Looks at <c>&lt;meta charset="..."&gt;</c> first, then at the <c>charset=</c> parameter of
        /// <c>&lt;meta http-equiv="content-type" content="..."&gt;</c>.
        /// </summary>
        private static string DetectDeclaredCharset(HtmlNode root)
        {
            const string charsetKey = "charset=";

            var direct = root.Descendants("meta")
                .Select(meta => meta.GetAttributeValue("charset", string.Empty).Trim())
                .FirstOrDefault(v => !string.IsNullOrEmpty(v));
            if (direct != null)
                return direct;

            // http-equiv values are matched case-insensitively ("Content-Type" is the usual spelling).
            var contentTypeMeta = root.Descendants("meta").FirstOrDefault(
                m => string.Equals(m.GetAttributeValue("http-equiv", string.Empty).Trim(), "content-type",
                                   StringComparison.OrdinalIgnoreCase));
            if (contentTypeMeta == null)
                return null;

            var contentValue = contentTypeMeta.GetAttributeValue("content", string.Empty);
            var position = contentValue.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
            if (position == -1)
                return null;

            // Keep only the charset token: drop following parameters, whitespace and quotes.
            var declared = contentValue.Substring(position + charsetKey.Length);
            var parameterEnd = declared.IndexOf(';');
            if (parameterEnd >= 0)
                declared = declared.Substring(0, parameterEnd);
            declared = declared.Trim().Trim('"', '\'').Trim();

            return declared.Length == 0 ? null : declared;
        }



        /// <summary>Gets whether charset auto-detection was requested for this page.</summary>
        public bool AutoDetectCharsetEncoding
        {
            get
            {
                return this.autoDetectCharsetEncoding;
            }
        }

        /// <summary>Gets the request supplied when the page was created.</summary>
        public RawRequest RawRequest
        {
            get
            {
                return this.rawRequest;
            }
        }

        /// <summary>Gets the response supplied when the page was created.</summary>
        public RawResponse RawResponse
        {
            get
            {
                return this.rawResponse;
            }
        }



        /// <summary>
        /// Finds elements named <paramref name="tag"/> in the parsed page using the element finder
        /// created by <paramref name="by"/>.
        /// </summary>
        public IEnumerable<HtmlNode> Find(string tag, By by)
        {
            var finder = by.CreateElementFinder(html, tag);
            return finder.FindElements();
        }



        /// <summary>
        /// Finds <c>a</c> elements selected by <paramref name="by"/> and wraps each one in a
        /// <see cref="HyperLink"/> bound to this page.
        /// </summary>
        public IEnumerable<HyperLink> FindLinks(By by)
        {
            return from anchor in Find("a", by)
                   select new HyperLink(this, anchor);
        }



        /// <summary>
        /// Returns the first <c>form</c> element whose <c>name</c> attribute equals
        /// <paramref name="name"/>, or null when there is none.
        /// </summary>
        public PageWebForm FindForm(string name)
        {
            var match = Html.Descendants("form")
                .FirstOrDefault(form => form.GetAttributeValue("name", string.Empty) == name);

            if (match == null)
                return null;

            return new PageWebForm(match, browser);
        }



        /// <summary>
        /// Returns the first <c>form</c> element whose id equals <paramref name="id"/>,
        /// or null when there is none.
        /// </summary>
        public PageWebForm FindFormById(string id)
        {
            foreach (var form in Html.Descendants("form"))
            {
                if (form.Id == id)
                    return new PageWebForm(form, browser);
            }

            return null;
        }



        /// <summary>
        /// Sets <c>baseUrl</c> to the <c>href</c> of the first <c>base</c> element that has one;
        /// otherwise to "scheme://host" of the page URL, with ":port" appended when the port is not
        /// the scheme's default.
        /// </summary>
        private void LoadBaseUrl()
        {
            var baseElement = html.Descendants("base")
                .FirstOrDefault(e => e.Attributes.Any(a => a.Name == "href"));
            var declared = baseElement == null ? null : baseElement.Attributes["href"].Value;

            if (declared != null)
            {
                baseUrl = declared;
                return;
            }

            var origin = absoluteUrl.Scheme + "://" + absoluteUrl.Host;
            baseUrl = absoluteUrl.IsDefaultPort
                ? origin
                : origin + ":" + absoluteUrl.Port;
        }



        /// <summary>Returns the decoded text content of the page.</summary>
        public override string ToString()
        {
            return this.content;
        }



        /// <summary>
        /// Converts a page to its decoded text content. A null page converts to a null string
        /// instead of throwing, because implicit conversions are not expected to fail.
        /// </summary>
        public static implicit operator string(WebPage page)
        {
            return ReferenceEquals(page, null) ? null : page.content;
        }



        /// <summary>
        /// Downloads every resource referenced by the page (see <see cref="GetResourceUrls"/>) that is
        /// not already present in <c>WebResourceStorage.Current</c>, adds it to <c>resources</c> and
        /// saves it to the storage when it is not force-downloaded or has a Last-Modified value.
        /// </summary>
        /// <remarks>
        /// Best effort per resource: an attribute value that is empty or cannot be resolved to a URL,
        /// or a download that fails, only skips that resource.
        /// </remarks>
        private void DownloadResources()
        {
            var resourceUrls = GetResourceUrls();

            foreach (var resourceUrl in resourceUrls)
            {
                // An empty src/href would resolve to the page itself; there is nothing to download.
                if (string.IsNullOrEmpty(resourceUrl) || resourceUrl.Trim().Length == 0)
                    continue;

                Uri url;
                try
                {
                    url = GetFullResourceUrl(resourceUrl, absoluteUrl);
                }
                catch (Exception)
                {
                    // A malformed attribute value must not abort the remaining downloads.
                    continue;
                }

                if (url == null)
                    continue;

                if (WebResourceStorage.Current.Exists(url.ToString()))
                    continue;

                try
                {
                    WebResource resource = browser.DownloadWebResource(url);
                    resources.Add(resource);

                    if (!resource.ForceDownload || !string.IsNullOrEmpty(resource.LastModified))
                        WebResourceStorage.Current.Save(resource);
                }
                catch (Exception)
                {
                    // Deliberately ignored: a resource that cannot be fetched or stored is skipped.
                }
            }
        }



        /// <summary>
        /// Resolves a resource reference found in markup or CSS to an absolute URL.
        /// </summary>
        /// <param name="resourceUrl">Reference as written in the document (absolute, protocol-relative or relative).</param>
        /// <param name="root">URL the reference is resolved against when no base URL has been loaded.</param>
        /// <returns>The resolved URL.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="resourceUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="resourceUrl"/> cannot be parsed as a URI reference.</exception>
        private Uri GetFullResourceUrl(string resourceUrl, Uri root)
        {
            if (resourceUrl == null)
                throw new ArgumentNullException("resourceUrl");

            // Protocol-relative reference ("//host/path"): it inherits the scheme of the root URL.
            // Parsing it directly can yield a file/UNC URI or a host-relative path instead.
            if (resourceUrl.StartsWith("//", StringComparison.Ordinal))
            {
                Uri networkPath;
                if (!Uri.TryCreate(root.Scheme + ":" + resourceUrl, UriKind.Absolute, out networkPath))
                    throw new UriFormatException(string.Format("Invalid resource URL: '{0}'", resourceUrl));
                return networkPath;
            }

            Uri parsed;
            if (!Uri.TryCreate(resourceUrl, UriKind.RelativeOrAbsolute, out parsed))
                throw new UriFormatException(string.Format("Invalid resource URL: '{0}'", resourceUrl));

            if (parsed.IsAbsoluteUri)
                return parsed;

            // Without a loaded base URL every relative reference is resolved against the root.
            if (baseUrl == null)
                return root.Combine(resourceUrl);

            var isPathQualified = resourceUrl.StartsWith("/", StringComparison.Ordinal)
                                  || resourceUrl.StartsWith("./", StringComparison.Ordinal)
                                  || resourceUrl.StartsWith("../", StringComparison.Ordinal);
            if (isPathQualified)
                return baseUrl.CombineUrl(resourceUrl);

            // Bare relative reference: prefix it with the root's segments, minus the first ("/")
            // and the last one.
            var path = string.Join("/", root.Segments.Take(root.Segments.Length - 1).Skip(1));
            return baseUrl.CombineUrl(path).Combine(resourceUrl);
        }



        /// <summary>
        /// Collects, for every tag/attribute pair in <c>resourceTags</c>, the attribute values of
        /// the matching elements that carry that attribute, exactly as written in the markup.
        /// </summary>
        /// <returns>A new list with the collected attribute values.</returns>
        public List<string> GetResourceUrls()
        {
            return resourceTags
                .SelectMany(pair => html.Descendants(pair.Key)
                    .Where(node => node.Attributes.Any(attribute => attribute.Name == pair.Value))
                    .Select(node => node.Attributes[pair.Value].Value))
                .ToList();
        }



        /// <summary>Gets the browser associated with this page.</summary>
        public ScrapingBrowser Browser
        {
            get
            {
                return this.browser;
            }
        }

        /// <summary>Gets the URL of this page.</summary>
        public Uri AbsoluteUrl
        {
            get
            {
                return this.absoluteUrl;
            }
        }

        /// <summary>Gets the decoded text content of the page.</summary>
        public string Content
        {
            get
            {
                return this.content;
            }
        }

        /// <summary>Gets the list of resources downloaded for this page.</summary>
        public List<WebResource> Resources
        {
            get
            {
                return this.resources;
            }
        }

        /// <summary>Gets the node produced by parsing the page content.</summary>
        public HtmlNode Html
        {
            get
            {
                return this.html;
            }
        }

        /// <summary>Gets the base URL; null until the base URL has been loaded.</summary>
        public string BaseUrl
        {
            get
            {
                return this.baseUrl;
            }
        }



        // Matches "url( ... )" references, case-insensitively. The named group "url" captures
        // everything between the parentheses up to the first ')' or line break, so surrounding
        // quotes and trailing whitespace are part of the capture. The spaces inside the pattern are
        // not significant because RegexOptions.IgnorePatternWhitespace is set.
        private static readonly Regex urlInCssRegex = new Regex(@"url \s* [(] \s* (?<url>[^)\r\n]+) \s* [)]", 

            RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);



        /// <summary>
        /// Writes a snapshot of the page into the directory <paramref name="path"/>: every downloaded
        /// resource is stored under a generated file name, the markup is rewritten to point at those
        /// files, and the resulting document is saved as <c>page.html</c>.
        /// </summary>
        /// <param name="path">
        /// Target directory. It is created when missing; when it already exists, all files directly
        /// inside it are deleted first.
        /// </param>
        public void SaveSnapshot(string path)
        {
            if (!browser.AutoDownloadPagesResources)
                DownloadResources();

            if (!Directory.Exists(path))
                Directory.CreateDirectory(path);
            else
                Directory.GetFiles(path).ToList().ForEach(File.Delete);

            foreach (var resource in Resources)
            {
                resource.Content.Position = 0;
                var fileName = Guid.NewGuid().ToString("N");
                var targetFile = Path.Combine(path, fileName);

                RewriteHtml(resource, fileName);

                if (IsStylesheet(resource.ContentType))
                {
                    // Stylesheets are saved as text so the url(...) references inside can be rewritten.
                    var textContent = resource.GetTextContent();
                    textContent = RewriteCssUrls(path, textContent, resource.AbsoluteUrl.ToString());
                    File.WriteAllText(targetFile, textContent);
                }
                else
                {
                    File.WriteAllBytes(targetFile, resource.Content.ToArray());
                }
            }

            var outerHtml = RewriteCssUrls(path, Html.OuterHtml, AbsoluteUrl.ToString());
            File.WriteAllText(Path.Combine(path, "page.html"), outerHtml);
        }

        /// <summary>
        /// Tells whether a content type denotes CSS. Only the media type is inspected, so a value
        /// carrying parameters such as "text/css; charset=utf-8" is recognised as well.
        /// </summary>
        private static bool IsStylesheet(string contentType)
        {
            if (string.IsNullOrEmpty(contentType))
                return false;

            var parametersStart = contentType.IndexOf(';');
            var mediaType = parametersStart < 0 ? contentType : contentType.Substring(0, parametersStart);

            return mediaType.Trim().EndsWith("css", StringComparison.OrdinalIgnoreCase);
        }



        /// <summary>
        /// Points every resource-carrying attribute (see <c>resourceTags</c>) whose value is a suffix
        /// of the resource's absolute URL at <paramref name="fileName"/>.
        /// </summary>
        /// <param name="resource">Downloaded resource whose references are rewritten.</param>
        /// <param name="fileName">Local file name written into the matching attributes.</param>
        private void RewriteHtml(WebResource resource, string fileName)
        {
            // Invariant for the whole rewrite; no need to rebuild it for every node.
            var resourceUrl = resource.AbsoluteUrl.ToString();

            foreach (var resourceTag in resourceTags)
            {
                var attributeName = resourceTag.Value;

                // Materialised before any attribute is modified.
                var nodes = html.Descendants(resourceTag.Key)
                    .Where(e => e.Attributes.Any(a => a.Name == attributeName))
                    .Where(e =>
                    {
                        var reference = e.Attributes[attributeName].Value;

                        // An empty value is a suffix of every URL and would match every resource.
                        return !string.IsNullOrEmpty(reference)
                               && resourceUrl.EndsWith(reference, StringComparison.Ordinal);
                    })
                    .ToArray();

                foreach (var node in nodes)
                {
                    node.SetAttributeValue(attributeName, fileName);
                }
            }
        }



        /// <summary>
        /// Downloads the resources referenced through <c>url(...)</c> in <paramref name="textContent"/>
        /// into <paramref name="path"/> and rewrites each reference to the generated local file name.
        /// </summary>
        /// <param name="path">Directory the downloaded files are written to.</param>
        /// <param name="textContent">CSS or HTML text containing <c>url(...)</c> references.</param>
        /// <param name="rootUrl">URL of the document containing the text; relative references are resolved against its parent path.</param>
        /// <returns>
        /// The text with every successfully downloaded reference rewritten. References that are empty,
        /// are <c>data:</c> URIs, or could not be downloaded are left exactly as they were.
        /// </returns>
        private string RewriteCssUrls(string path, string textContent, string rootUrl)
        {
            if (string.IsNullOrEmpty(textContent))
                return textContent;

            // Everything up to the last '/' of the containing document's URL.
            var parts = rootUrl.Split('/');
            var leftPart = string.Join("/", parts.Take(parts.Length - 1));

            Uri root;
            if (!Uri.TryCreate(leftPart, UriKind.Absolute, out root))
                root = null; // relative references then fail to resolve and are left untouched

            // Reference -> local file name (null when the download failed), so that a reference
            // used several times is downloaded once and always rewritten to the same file.
            var localNames = new Dictionary<string, string>(StringComparer.Ordinal);

            return urlInCssRegex.Replace(textContent, match =>
            {
                var captured = match.Groups["url"];

                // The capture includes optional quotes and trailing whitespace.
                var reference = captured.Value.Trim().Trim('"', '\'').Trim();

                // Embedded data must stay inline; there is nothing to download.
                if (reference.Length == 0 || reference.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
                    return match.Value;

                string localName;
                if (!localNames.TryGetValue(reference, out localName))
                {
                    localName = TryDownloadCssResource(path, reference, root);
                    localNames[reference] = localName;
                }

                if (localName == null)
                    return match.Value;

                // Replace only the captured span of this match, not every occurrence in the text.
                var offset = captured.Index - match.Index;
                return match.Value.Substring(0, offset) + localName + match.Value.Substring(offset + captured.Length);
            });
        }

        /// <summary>
        /// Downloads one referenced resource into <paramref name="path"/> under a generated name.
        /// Returns that name, or null when the reference cannot be resolved, downloaded or written.
        /// </summary>
        private string TryDownloadCssResource(string path, string reference, Uri root)
        {
            try
            {
                var localName = Guid.NewGuid().ToString("N");
                var image = browser.DownloadWebResource(GetFullResourceUrl(reference, root));
                File.WriteAllBytes(Path.Combine(path, localName), image.Content.ToArray());
                return localName;
            }
            catch (Exception)
            {
                // Best effort: the caller keeps the original reference when this fails.
                return null;
            }
        }

    }

}
Alerts (3)

Complexity hotspot; lines 70 to 71 (total complexity: 3)
70 71
Complexity hotspot; line 196 (total complexity: 3)
196