PageRenderTime 93ms CodeModel.GetById 74ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/NBoilerpipePortable/Util/MultiPageUtils.cs

https://github.com/hippiehunter/Baconography
C# | 571 lines | 414 code | 97 blank | 60 comment | 105 complexity | 0ab6917bdbfe8fec2b1dd2d4dfa7db41 MD5 | raw file
  1using Sgml;
  2using System;
  3using System.Collections.Generic;
  4using System.Diagnostics;
  5using System.IO;
  6using System.Linq;
  7using System.Text;
  8using System.Text.RegularExpressions;
  9using System.Threading.Tasks;
 10using System.Xml.Linq;
 11
 12namespace NBoilerpipePortable.Util
 13{
 14    public class MultiPageUtils
 15    {
 16        private class LinkData
 17        {
 18            public float Score;
 19            public string LinkText;
 20            public string LinkHref;
 21        }
 22
 23        private static readonly Regex _UnlikelyCandidatesRegex = new Regex("combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|side|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",  RegexOptions.IgnoreCase);
 24        private static readonly Regex _OkMaybeItsACandidateRegex = new Regex("and|article|body|column|main|shadow", RegexOptions.IgnoreCase);
 25        private static readonly Regex _PositiveWeightRegex = new Regex("article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",  RegexOptions.IgnoreCase);
 26        private static readonly Regex _NegativeWeightRegex = new Regex("combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|side|sponsor|shopping|tags|tool|widget",  RegexOptions.IgnoreCase);
 27        private static readonly Regex _NegativeLinkParentRegex = new Regex("(stories|articles|news|documents|posts|notes|series|historie|artykuly|artykuły|wpisy|dokumenty|serie|geschichten|erzählungen|erzahlungen)",  RegexOptions.IgnoreCase);
 28        private static readonly Regex _Extraneous = new Regex("print|archive|comment|discuss|e[-]?mail|share|reply|all|login|sign|single|also",  RegexOptions.IgnoreCase);
 29        private static readonly Regex _DivToPElementsRegex = new Regex("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)",  RegexOptions.IgnoreCase);
 30        private static readonly Regex _EndOfSentenceRegex = new Regex("\\.( |$)",  RegexOptions.Multiline);
 31        private static readonly Regex _BreakBeforeParagraphRegex = new Regex("<br[^>]*>\\s*<p", RegexOptions.None);
 32        private static readonly Regex _NormalizeSpacesRegex = new Regex("\\s{2,}", RegexOptions.None);
 33        private static readonly Regex _KillBreaksRegex = new Regex("(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}", RegexOptions.None);
 34        private static readonly Regex _VideoRegex = new Regex("http:\\/\\/(www\\.)?(youtube|vimeo)\\.com",  RegexOptions.IgnoreCase);
 35        private static readonly Regex _ReplaceDoubleBrsRegex = new Regex("(<br[^>]*>[ \\n\\r\\t]*){2,}",  RegexOptions.IgnoreCase);
 36        private static readonly Regex _ReplaceFontsRegex = new Regex("<(\\/?)font[^>]*>",  RegexOptions.IgnoreCase);
 37        private static readonly Regex _ArticleTitleDashRegex1 = new Regex(" [\\|\\-] ", RegexOptions.None);
 38        private static readonly Regex _ArticleTitleDashRegex2 = new Regex("(.*)[\\|\\-] .*", RegexOptions.None);
 39        private static readonly Regex _ArticleTitleDashRegex3 = new Regex("[^\\|\\-]*[\\|\\-](.*)", RegexOptions.None);
 40        private static readonly Regex _ArticleTitleColonRegex1 = new Regex(".*:(.*)", RegexOptions.None);
 41        private static readonly Regex _ArticleTitleColonRegex2 = new Regex("[^:]*[:](.*)", RegexOptions.None);
 42        private static readonly Regex _NextLink = new Regex(@"(next|weiter|continue|dalej|następna|nastepna>([^\|]|$)|�([^\|]|$))",  RegexOptions.IgnoreCase);
 43        private static readonly Regex _NextStoryLink = new Regex("(story|article|news|document|post|note|series|historia|artykul|artykuł|wpis|dokument|seria|geschichte|erzählung|erzahlung|artikel|serie)",  RegexOptions.IgnoreCase);
 44        private static readonly Regex _PrevLink = new Regex("(prev|earl|[^b]old|new|wstecz|poprzednia|<|�)",  RegexOptions.IgnoreCase);
 45        private static readonly Regex _PageRegex = new Regex("pag(e|ing|inat)|([^a-z]|^)pag([^a-z]|$)",  RegexOptions.IgnoreCase);
 46        private static readonly Regex _LikelyParagraphDivRegex = new Regex("text|para|parbase",  RegexOptions.IgnoreCase);
 47        private static readonly Regex _MailtoHrefRegex = new Regex("^\\s*mailto\\s*:", RegexOptions.IgnoreCase);
 48        private static readonly Regex _TitleWhitespacesCleanUpRegex = new Regex("\\s+", RegexOptions.None);
 49
 50        /// <summary>
 51        /// Looks for any paging links that may occur within the document
 52        /// </summary>
 53        /// <param name="body">Content body</param>
 54        /// <param name="url">Url of document</param>
 55        public static string FindNextPageLink(XElement body, string url)
 56        {
 57            try
 58            {
 59                Dictionary<string, LinkData> possiblePagesByLink = new Dictionary<string, LinkData>();
 60                IEnumerable<XElement> allLinks = GetElementsByTagName(body, "a");
 61                string articleBaseUrl = FindBaseUrl(url);
 62
 63                /* Loop through all links, looking for hints that they may be next-page links. 
 64                 * Things like having "page" in their textContent, className or id, or being a child
 65                 * of a node with a page-y className or id. 
 66                 * After we do that, assign each page a score.
 67                 */
 68                foreach (XElement linkElement in allLinks)
 69                {
 70                    string linkHref = (string)linkElement.Attribute("href");
 71
 72                    if (string.IsNullOrEmpty(linkHref)
 73                     || _MailtoHrefRegex.IsMatch(linkHref))
 74                    {
 75                        continue;
 76                    }
 77
 78                    linkHref = Regex.Replace(linkHref, "#.*$", "");
 79                    linkHref = Regex.Replace(linkHref, "/$", "");
 80
 81                    /* If we've already seen this page, then ignore it. */
 82                    // This leaves out an already-checked page check, because 
 83                    // the web transcoder is seperate from the original transcoder
 84                    if (linkHref == "" || linkHref == articleBaseUrl || linkHref == url)
 85                    {
 86                        continue;
 87                    }
 88
 89                    /* If it's on a different domain, skip it. */
 90                    Uri linkHrefUri;
 91
 92                    if (Uri.TryCreate(linkHref, UriKind.Absolute, out linkHrefUri) && linkHrefUri.Host != new Uri(articleBaseUrl).Host)
 93                    {
 94                        continue;
 95                    }
 96
 97                    string linkText = GetInnerText(linkElement);
 98
 99                    /* If the linktext looks like it's not the next page, then skip it */
100                    if (_Extraneous.IsMatch(linkText))
101                    {
102                        continue;
103                    }
104
105                    /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
106                    string linkHrefLeftover = linkHref.Replace(articleBaseUrl, "");
107
108                    if (!Regex.IsMatch(linkHrefLeftover, @"\d"))
109                    {
110                        continue;
111                    }
112
113                    if (!possiblePagesByLink.Keys.Contains(linkHref))
114                    {
115                        possiblePagesByLink[linkHref] = new LinkData { Score = 0, LinkHref = linkHref, LinkText = linkText };
116                    }
117                    else
118                    {
119                        possiblePagesByLink[linkHref].LinkText += " | " + linkText;
120                    }
121
122                    LinkData linkObj = possiblePagesByLink[linkHref];
123
124                    /*
125                     * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
126                     * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
127                     */
128                    if (linkHref.IndexOf(articleBaseUrl, StringComparison.OrdinalIgnoreCase) == -1)
129                    {
130                        linkObj.Score -= 50;
131                    }
132
133                    string linkData = linkText + " " + GetClass(linkElement) + " " + GetId(linkElement);
134
135                    if (_NextLink.IsMatch(linkData)
136                    && !_NextStoryLink.IsMatch(linkData))
137                    {
138                        linkObj.Score += 50;
139                    }
140
141                    if (_PageRegex.IsMatch(linkData))
142                    {
143                        linkObj.Score += 25;
144                    }
145
146                    /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
147                    /* -65 is enough to negate any bonuses gotten from a > or � in the text */
148                    if (Regex.IsMatch(linkData, "(first|last)", RegexOptions.IgnoreCase)
149                     && !_NextLink.IsMatch(linkObj.LinkText))
150                    {
151                        linkObj.Score -= 65;
152                    }
153
154                    if (_NegativeWeightRegex.IsMatch(linkData) || _Extraneous.IsMatch(linkData))
155                    {
156                        linkObj.Score -= 50;
157                    }
158
159                    if (_PrevLink.IsMatch(linkData))
160                    {
161                        linkObj.Score -= 200;
162                    }
163
164                    /* If any ancestor node contains page or paging or paginat */
165                    XElement parentNode = linkElement.Parent;
166                    bool positiveNodeMatch = false;
167                    bool negativeNodeMatch = false;
168
169                    while (parentNode != null)
170                    {
171                        string parentNodeClassAndId = GetClass(parentNode) + " " + GetId(parentNode);
172
173                        if (!positiveNodeMatch && (_PageRegex.IsMatch(parentNodeClassAndId) || _NextLink.IsMatch(parentNodeClassAndId)))
174                        {
175                            positiveNodeMatch = true;
176                            linkObj.Score += 25;
177                        }
178
179                        if (!negativeNodeMatch && (_NegativeWeightRegex.IsMatch(parentNodeClassAndId) || _NegativeLinkParentRegex.IsMatch(parentNodeClassAndId)))
180                        {
181                            if (!_PositiveWeightRegex.IsMatch(parentNodeClassAndId))
182                            {
183                                linkObj.Score -= 25;
184                                negativeNodeMatch = true;
185                            }
186                        }
187
188                        parentNode = parentNode.Parent;
189                    }
190
191                    /* If any descendant node contains 'next indicator' or 'prev indicator' - adjust the score */
192                    bool positiveDescendantMatch = false;
193                    bool negativeDescendantMatch = false;
194
195                    foreach (XElement descendantElement in linkElement.Descendants())
196                    {
197                        string descendantData = GetInnerText(descendantElement) + " " + GetClass(descendantElement) + " " + GetId(descendantElement) + " " + GetAttributeValue(descendantElement, "alt", "");
198
199                        if (!positiveDescendantMatch && _NextLink.IsMatch(descendantData))
200                        {
201                            linkObj.Score += 12.5f;
202                            positiveDescendantMatch = true;
203                        }
204
205                        if (!negativeDescendantMatch && _PrevLink.IsMatch(descendantData))
206                        {
207                            linkObj.Score -= 100;
208                            negativeDescendantMatch = true;
209                        }
210                    }
211
212                    /*
213                    * If the URL looks like it has paging in it, add to the score.
214                    * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
215                    */
216                    if (Regex.IsMatch(linkHref, @"p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}", RegexOptions.IgnoreCase)
217                     || Regex.IsMatch(linkHref, @"(page|paging)", RegexOptions.IgnoreCase)
218                     || Regex.IsMatch(linkHref, @"section", RegexOptions.IgnoreCase))
219                    {
220                        linkObj.Score += 25;
221                    }
222
223                    /* If the URL contains negative values, give a slight decrease. */
224                    if (_Extraneous.IsMatch(linkHref))
225                    {
226                        linkObj.Score -= 15;
227                    }
228
229                    /*
230                     * If the link text can be parsed as a number, give it a minor bonus, with a slight
231                     * bias towards lower numbered pages. This is so that pages that might not have 'next'
232                     * in their text can still get scored, and sorted properly by score.
233                     */
234                    int linkTextAsNumber;
235                    bool isInt = int.TryParse(linkText, out linkTextAsNumber);
236
237                    if (isInt)
238                    {
239                        /* Punish 1 since we're either already there, or it's probably before what we want anyways. */
240                        if (linkTextAsNumber == 1)
241                        {
242                            linkObj.Score -= 10;
243                        }
244                        else
245                        {
246                            linkObj.Score += Math.Max(0, 10 - linkTextAsNumber);
247                        }
248                    }
249                }
250
251                /*
252                * Loop through all of our possible pages from above and find our top candidate for the next page URL.
253                * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
254                */
255                LinkData topPage = null;
256
257                foreach (string page in possiblePagesByLink.Keys)
258                {
259                    if (possiblePagesByLink[page].Score >= 50 && (topPage == null || topPage.Score < possiblePagesByLink[page].Score))
260                    {
261                        topPage = possiblePagesByLink[page];
262                    }
263                }
264
265                if (topPage != null)
266                {
267                    string nextHref = Regex.Replace(topPage.LinkHref, @"\/$", "");
268                    var nextHrefUri = new Uri(new Uri(articleBaseUrl), nextHref);
269
270                    return nextHrefUri.OriginalString;
271                }
272            }
273            catch(Exception ex)
274            {
275                Debug.WriteLine(ex.ToString());
276            }
277            return null;
278        }
279
280        internal static string FindBaseUrl(string url)
281        {
282            Uri urlUri;
283
284            if (!Uri.TryCreate(url, UriKind.Absolute, out urlUri))
285            {
286                return url;
287            }
288
289            string protocol = urlUri.Scheme;
290            string hostname = urlUri.Host;
291            string noUrlParams = urlUri.AbsolutePath + "/";
292            List<string> urlSlashes = noUrlParams.Split('/').Reverse().ToList();
293            var cleanedSegments = new List<string>();
294            int slashLen = urlSlashes.Count();
295
296            for (int i = 0; i < slashLen; i++)
297            {
298                string segment = urlSlashes[i];
299
300                /* Split off and save anything that looks like a file type. */
301                if (segment.IndexOf('.') != -1)
302                {
303                    string possibleType = segment.Split('.')[1];
304
305                    /* If the type isn't alpha-only, it's probably not actually a file extension. */
306                    if (!Regex.IsMatch(possibleType, "[a-zA-Z]"))
307                    {
308                        segment = segment.Split('.')[0];
309                    }
310                }
311
312                /*
313                 * EW-CMS specific segment replacement. Ugly.
314                 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
315                */
316                if (segment.IndexOf(",00") != -1)
317                {
318                    segment = segment.Replace(",00", "");
319                }
320
321                /* If our first or second segment has anything looking like a page number, remove it. */
322                var pageNumRegex = new Regex("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$", RegexOptions.IgnoreCase);
323
324                if (pageNumRegex.IsMatch(segment) && ((i == 1) || (i == 0)))
325                {
326                    segment = pageNumRegex.Replace(segment, "");
327                }
328
329                /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
330                bool del = (i < 2 && Regex.IsMatch(segment, @"^[\d]{1,2}$"));
331
332                /* If this is the first segment and it's just "index," remove it. */
333                if (i == 0 && segment.ToLower() == "index")
334                {
335                    del = true;
336                }
337
338                /* If tour first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
339                // TODO: Check these "purely alpha" regexes.  They don't seem right.
340                if (i < 2 && segment.Length < 3 && !Regex.IsMatch(urlSlashes[0], "[a-z]", RegexOptions.IgnoreCase))
341                {
342                    del = true;
343                }
344
345                /* If it's not marked for deletion, push it to cleanedSegments */
346                if (!del)
347                {
348                    cleanedSegments.Add(segment);
349                }
350            }
351
352            /* This is our final, cleaned, base article URL. */
353            cleanedSegments.Reverse();
354
355            return string.Format("{0}://{1}{2}", protocol, hostname, String.Join("/", cleanedSegments.ToArray()));
356        }
357
358        public static IEnumerable<XElement> GetElementsByTagName(XContainer container, string tagName)
359        {
360            if (container == null)
361            {
362                throw new ArgumentNullException("container");
363            }
364
365            if (string.IsNullOrEmpty(tagName))
366            {
367                throw new ArgumentNullException("tagName");
368            }
369
370            return container.Descendants()
371              .Where(e => tagName.Equals(e.Name.LocalName, StringComparison.OrdinalIgnoreCase));
372        }
373
374        public static string GetClass(XElement element)
375        {
376            return GetAttributeValue(element, "class", "");
377        }
378
379        public static string GetId(XElement element)
380        {
381            return GetAttributeValue(element, "id", "");
382        }
383
384        public static string GetAttributeValue(XElement element, string attributeName, string defaultValue)
385        {
386            if (element == null)
387            {
388                throw new ArgumentNullException("element");
389            }
390
391            if (string.IsNullOrEmpty(attributeName))
392            {
393                throw new ArgumentNullException("attributeName");
394            }
395
396            var attribute = element.Attribute(attributeName);
397
398            return attribute != null
399                     ? (attribute.Value ?? defaultValue)
400                     : defaultValue;
401        }
402
403        internal static string GetInnerText(XNode node, bool dontNormalizeSpaces)
404        {
405            if (node == null)
406            {
407                throw new ArgumentNullException("node");
408            }
409
410            string result;
411
412            if (node is XElement)
413            {
414                result = ((XElement)node).Value;
415            }
416            else if (node is XText)
417            {
418                result = ((XText)node).Value;
419            }
420            else
421            {
422                throw new NotSupportedException(string.Format("Nodes of type '{0}' are not supported.", node.GetType()));
423            }
424
425            result = (result ?? "").Trim();
426
427            if (!dontNormalizeSpaces)
428            {
429                return _NormalizeSpacesRegex.Replace(result, " ");
430            }
431
432            return result;
433        }
434
435        internal static string GetInnerText(XNode node)
436        {
437            return GetInnerText(node, false);
438        }
439    }
440    public class SgmlDomBuilder
441    {
442        #region Public methods
443
444        public static XElement GetBody(XDocument document)
445        {
446            if (document == null)
447            {
448                throw new ArgumentNullException("document");
449            }
450
451            var documentRoot = document.Root;
452
453            if (documentRoot == null)
454            {
455                return null;
456            }
457
458            return MultiPageUtils.GetElementsByTagName(documentRoot, "body").FirstOrDefault();
459        }
460
461        /// <summary>
462        /// Constructs a DOM (System.Xml.Linq.XDocument) from HTML markup.
463        /// </summary>
464        /// <param name="htmlContent">HTML markup from which the DOM is to be constructed.</param>
465        /// <returns>System.Linq.Xml.XDocument instance which is a DOM of the provided HTML markup.</returns>
466        public static XDocument BuildDocument(string htmlContent)
467        {
468            if (htmlContent == null)
469            {
470                throw new ArgumentNullException("htmlContent");
471            }
472
473            if (htmlContent.Trim().Length == 0)
474            {
475                return new XDocument();
476            }
477
478            // "trim end" htmlContent to ...</html>$ (codinghorror.com puts some scripts after the </html> - sic!)
479            const string htmlEnd = "</html";
480            int indexOfHtmlEnd = htmlContent.LastIndexOf(htmlEnd);
481
482            if (indexOfHtmlEnd != -1)
483            {
484                int indexOfHtmlEndBracket = htmlContent.IndexOf('>', indexOfHtmlEnd);
485
486                if (indexOfHtmlEndBracket != -1)
487                {
488                    htmlContent = htmlContent.Substring(0, indexOfHtmlEndBracket + 1);
489                }
490            }
491
492            XDocument document;
493
494            try
495            {
496                document = LoadDocument(htmlContent);
497            }
498            catch (InvalidOperationException exc)
499            {
500                // sometimes SgmlReader doesn't handle <script> tags well and XDocument.Load() throws,
501                // so we can retry with the html content with <script> tags stripped off
502
503                if (!exc.Message.Contains("EndOfFile"))
504                {
505                    throw;
506                }
507
508                htmlContent = HtmlUtils.RemoveScriptTags(htmlContent);
509
510                document = LoadDocument(htmlContent);
511            }
512
513            return document;
514        }
515
516        private static XDocument LoadDocument(string htmlContent)
517        {
518            using (var sgmlReader = new SgmlReader())
519            {
520                sgmlReader.CaseFolding = CaseFolding.ToLower;
521                sgmlReader.DocType = "HTML";
522
523                using (var sr = new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(htmlContent))))
524                {
525                    sgmlReader.InputStream = sr;
526
527                    var document = XDocument.Load(sgmlReader);
528
529                    return document;
530                }
531            }
532        }
533
534        #endregion
535    }
536    public static class HtmlUtils
537    {
538        public static string RemoveScriptTags(string htmlContent)
539        {
540            if (htmlContent == null)
541            {
542                throw new ArgumentNullException("htmlContent");
543            }
544
545            if (htmlContent.Length == 0)
546            {
547                return "";
548            }
549
550            int indexOfScriptTagStart = htmlContent.IndexOf("<script", StringComparison.OrdinalIgnoreCase);
551
552            if (indexOfScriptTagStart == -1)
553            {
554                return htmlContent;
555            }
556
557            int indexOfScriptTagEnd = htmlContent.IndexOf("</script>", indexOfScriptTagStart, StringComparison.OrdinalIgnoreCase);
558
559            if (indexOfScriptTagEnd == -1)
560            {
561                return htmlContent.Substring(0, indexOfScriptTagStart);
562            }
563
564            string strippedHtmlContent =
565              htmlContent.Substring(0, indexOfScriptTagStart) +
566              htmlContent.Substring(indexOfScriptTagEnd + "</script>".Length);
567
568            return RemoveScriptTags(strippedHtmlContent);
569        }
570    }
571}