
/NBoilerpipePortable/Util/MultiPageUtils.cs

https://github.com/hippiehunter/Baconography
using Sgml;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Xml.Linq;

namespace NBoilerpipePortable.Util
{
    public class MultiPageUtils
    {
        private class LinkData
        {
            public float Score;
            public string LinkText;
            public string LinkHref;
        }

        private static readonly Regex _UnlikelyCandidatesRegex = new Regex("combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|side|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter", RegexOptions.IgnoreCase);
        private static readonly Regex _OkMaybeItsACandidateRegex = new Regex("and|article|body|column|main|shadow", RegexOptions.IgnoreCase);
        private static readonly Regex _PositiveWeightRegex = new Regex("article|body|content|entry|hentry|main|page|pagination|post|text|blog|story", RegexOptions.IgnoreCase);
        private static readonly Regex _NegativeWeightRegex = new Regex("combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|side|sponsor|shopping|tags|tool|widget", RegexOptions.IgnoreCase);
        private static readonly Regex _NegativeLinkParentRegex = new Regex("(stories|articles|news|documents|posts|notes|series|historie|artykuly|artykuły|wpisy|dokumenty|serie|geschichten|erzählungen|erzahlungen)", RegexOptions.IgnoreCase);
        private static readonly Regex _Extraneous = new Regex("print|archive|comment|discuss|e[-]?mail|share|reply|all|login|sign|single|also", RegexOptions.IgnoreCase);
        private static readonly Regex _DivToPElementsRegex = new Regex("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", RegexOptions.IgnoreCase);
        private static readonly Regex _EndOfSentenceRegex = new Regex("\\.( |$)", RegexOptions.Multiline);
        private static readonly Regex _BreakBeforeParagraphRegex = new Regex("<br[^>]*>\\s*<p", RegexOptions.None);
        private static readonly Regex _NormalizeSpacesRegex = new Regex("\\s{2,}", RegexOptions.None);
        private static readonly Regex _KillBreaksRegex = new Regex("(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}", RegexOptions.None);
        private static readonly Regex _VideoRegex = new Regex("http:\\/\\/(www\\.)?(youtube|vimeo)\\.com", RegexOptions.IgnoreCase);
        private static readonly Regex _ReplaceDoubleBrsRegex = new Regex("(<br[^>]*>[ \\n\\r\\t]*){2,}", RegexOptions.IgnoreCase);
        private static readonly Regex _ReplaceFontsRegex = new Regex("<(\\/?)font[^>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex _ArticleTitleDashRegex1 = new Regex(" [\\|\\-] ", RegexOptions.None);
        private static readonly Regex _ArticleTitleDashRegex2 = new Regex("(.*)[\\|\\-] .*", RegexOptions.None);
        private static readonly Regex _ArticleTitleDashRegex3 = new Regex("[^\\|\\-]*[\\|\\-](.*)", RegexOptions.None);
        private static readonly Regex _ArticleTitleColonRegex1 = new Regex(".*:(.*)", RegexOptions.None);
        private static readonly Regex _ArticleTitleColonRegex2 = new Regex("[^:]*[:](.*)", RegexOptions.None);
        private static readonly Regex _NextLink = new Regex(@"(next|weiter|continue|dalej|następna|nastepna>([^\|]|$)|»([^\|]|$))", RegexOptions.IgnoreCase);
        private static readonly Regex _NextStoryLink = new Regex("(story|article|news|document|post|note|series|historia|artykul|artykuł|wpis|dokument|seria|geschichte|erzählung|erzahlung|artikel|serie)", RegexOptions.IgnoreCase);
        private static readonly Regex _PrevLink = new Regex("(prev|earl|[^b]old|new|wstecz|poprzednia|<|«)", RegexOptions.IgnoreCase);
        private static readonly Regex _PageRegex = new Regex("pag(e|ing|inat)|([^a-z]|^)pag([^a-z]|$)", RegexOptions.IgnoreCase);
        private static readonly Regex _LikelyParagraphDivRegex = new Regex("text|para|parbase", RegexOptions.IgnoreCase);
        private static readonly Regex _MailtoHrefRegex = new Regex("^\\s*mailto\\s*:", RegexOptions.IgnoreCase);
        private static readonly Regex _TitleWhitespacesCleanUpRegex = new Regex("\\s+", RegexOptions.None);

        /// <summary>
        /// Looks for any paging links that may occur within the document
        /// </summary>
        /// <param name="body">Content body</param>
        /// <param name="url">Url of document</param>
        public static string FindNextPageLink(XElement body, string url)
        {
            try
            {
                Dictionary<string, LinkData> possiblePagesByLink = new Dictionary<string, LinkData>();
                IEnumerable<XElement> allLinks = GetElementsByTagName(body, "a");
                string articleBaseUrl = FindBaseUrl(url);
                /* Loop through all links, looking for hints that they may be next-page links.
                 * Things like having "page" in their textContent, className or id, or being a child
                 * of a node with a page-y className or id.
                 * After we do that, assign each page a score.
                 */
                foreach (XElement linkElement in allLinks)
                {
                    string linkHref = (string)linkElement.Attribute("href");
                    if (string.IsNullOrEmpty(linkHref)
                        || _MailtoHrefRegex.IsMatch(linkHref))
                    {
                        continue;
                    }
                    linkHref = Regex.Replace(linkHref, "#.*$", "");
                    linkHref = Regex.Replace(linkHref, "/$", "");
                    /* If we've already seen this page, then ignore it. */
                    // This leaves out an already-checked page check, because
                    // the web transcoder is separate from the original transcoder
                    if (linkHref == "" || linkHref == articleBaseUrl || linkHref == url)
                    {
                        continue;
                    }
                    /* If it's on a different domain, skip it. */
                    Uri linkHrefUri;
                    if (Uri.TryCreate(linkHref, UriKind.Absolute, out linkHrefUri) && linkHrefUri.Host != new Uri(articleBaseUrl).Host)
                    {
                        continue;
                    }
                    string linkText = GetInnerText(linkElement);
                    /* If the linktext looks like it's not the next page, then skip it */
                    if (_Extraneous.IsMatch(linkText))
                    {
                        continue;
                    }
                    /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
                    string linkHrefLeftover = linkHref.Replace(articleBaseUrl, "");
                    if (!Regex.IsMatch(linkHrefLeftover, @"\d"))
                    {
                        continue;
                    }
                    if (!possiblePagesByLink.Keys.Contains(linkHref))
                    {
                        possiblePagesByLink[linkHref] = new LinkData { Score = 0, LinkHref = linkHref, LinkText = linkText };
                    }
                    else
                    {
                        possiblePagesByLink[linkHref].LinkText += " | " + linkText;
                    }
                    LinkData linkObj = possiblePagesByLink[linkHref];
                    /*
                     * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
                     * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
                     */
                    if (linkHref.IndexOf(articleBaseUrl, StringComparison.OrdinalIgnoreCase) == -1)
                    {
                        linkObj.Score -= 50;
                    }
                    string linkData = linkText + " " + GetClass(linkElement) + " " + GetId(linkElement);
                    if (_NextLink.IsMatch(linkData)
                        && !_NextStoryLink.IsMatch(linkData))
                    {
                        linkObj.Score += 50;
                    }
                    if (_PageRegex.IsMatch(linkData))
                    {
                        linkObj.Score += 25;
                    }
                    /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
                    /* -65 is enough to negate any bonuses gotten from a > or » in the text */
                    if (Regex.IsMatch(linkData, "(first|last)", RegexOptions.IgnoreCase)
                        && !_NextLink.IsMatch(linkObj.LinkText))
                    {
                        linkObj.Score -= 65;
                    }
                    if (_NegativeWeightRegex.IsMatch(linkData) || _Extraneous.IsMatch(linkData))
                    {
                        linkObj.Score -= 50;
                    }
                    if (_PrevLink.IsMatch(linkData))
                    {
                        linkObj.Score -= 200;
                    }
                    /* If any ancestor node contains page or paging or paginat */
                    XElement parentNode = linkElement.Parent;
                    bool positiveNodeMatch = false;
                    bool negativeNodeMatch = false;
                    while (parentNode != null)
                    {
                        string parentNodeClassAndId = GetClass(parentNode) + " " + GetId(parentNode);
                        if (!positiveNodeMatch && (_PageRegex.IsMatch(parentNodeClassAndId) || _NextLink.IsMatch(parentNodeClassAndId)))
                        {
                            positiveNodeMatch = true;
                            linkObj.Score += 25;
                        }
                        if (!negativeNodeMatch && (_NegativeWeightRegex.IsMatch(parentNodeClassAndId) || _NegativeLinkParentRegex.IsMatch(parentNodeClassAndId)))
                        {
                            if (!_PositiveWeightRegex.IsMatch(parentNodeClassAndId))
                            {
                                linkObj.Score -= 25;
                                negativeNodeMatch = true;
                            }
                        }
                        parentNode = parentNode.Parent;
                    }
                    /* If any descendant node contains 'next indicator' or 'prev indicator' - adjust the score */
                    bool positiveDescendantMatch = false;
                    bool negativeDescendantMatch = false;
                    foreach (XElement descendantElement in linkElement.Descendants())
                    {
                        string descendantData = GetInnerText(descendantElement) + " " + GetClass(descendantElement) + " " + GetId(descendantElement) + " " + GetAttributeValue(descendantElement, "alt", "");
                        if (!positiveDescendantMatch && _NextLink.IsMatch(descendantData))
                        {
                            linkObj.Score += 12.5f;
                            positiveDescendantMatch = true;
                        }
                        if (!negativeDescendantMatch && _PrevLink.IsMatch(descendantData))
                        {
                            linkObj.Score -= 100;
                            negativeDescendantMatch = true;
                        }
                    }
                    /*
                     * If the URL looks like it has paging in it, add to the score.
                     * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
                     */
                    if (Regex.IsMatch(linkHref, @"p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}", RegexOptions.IgnoreCase)
                        || Regex.IsMatch(linkHref, @"(page|paging)", RegexOptions.IgnoreCase)
                        || Regex.IsMatch(linkHref, @"section", RegexOptions.IgnoreCase))
                    {
                        linkObj.Score += 25;
                    }
                    /* If the URL contains negative values, give a slight decrease. */
                    if (_Extraneous.IsMatch(linkHref))
                    {
                        linkObj.Score -= 15;
                    }
                    /*
                     * If the link text can be parsed as a number, give it a minor bonus, with a slight
                     * bias towards lower numbered pages. This is so that pages that might not have 'next'
                     * in their text can still get scored, and sorted properly by score.
                     */
                    int linkTextAsNumber;
                    bool isInt = int.TryParse(linkText, out linkTextAsNumber);
                    if (isInt)
                    {
                        /* Punish 1 since we're either already there, or it's probably before what we want anyways. */
                        if (linkTextAsNumber == 1)
                        {
                            linkObj.Score -= 10;
                        }
                        else
                        {
                            linkObj.Score += Math.Max(0, 10 - linkTextAsNumber);
                        }
                    }
                }
                /*
                 * Loop through all of our possible pages from above and find our top candidate for the next page URL.
                 * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
                 */
                LinkData topPage = null;
                foreach (string page in possiblePagesByLink.Keys)
                {
                    if (possiblePagesByLink[page].Score >= 50 && (topPage == null || topPage.Score < possiblePagesByLink[page].Score))
                    {
                        topPage = possiblePagesByLink[page];
                    }
                }
                if (topPage != null)
                {
                    string nextHref = Regex.Replace(topPage.LinkHref, @"\/$", "");
                    var nextHrefUri = new Uri(new Uri(articleBaseUrl), nextHref);
                    return nextHrefUri.OriginalString;
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.ToString());
            }
            return null;
        }
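        // Illustrative usage sketch only, not part of the original Baconography source: it shows
        // the expected shape of a FindNextPageLink call. The markup and example.com URLs below are
        // invented for the example; real pages would come from SgmlDomBuilder.BuildDocument/GetBody.
        private static string FindNextPageLinkExample()
        {
            XElement body = XElement.Parse(
                "<body>" +
                "<a href='http://example.com/story/2' class='pagination-next'>Next page 2</a>" +
                "<a href='http://example.com/about'>About</a>" +
                "</body>");
            // The pagination link scores well above the 50-point threshold (next-link text plus a
            // page-y class name), so the absolute URL of /story/2 is expected back; with no viable
            // candidate the method returns null.
            return FindNextPageLink(body, "http://example.com/story/1");
        }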
        internal static string FindBaseUrl(string url)
        {
            Uri urlUri;
            if (!Uri.TryCreate(url, UriKind.Absolute, out urlUri))
            {
                return url;
            }
            string protocol = urlUri.Scheme;
            string hostname = urlUri.Host;
            string noUrlParams = urlUri.AbsolutePath + "/";
            List<string> urlSlashes = noUrlParams.Split('/').Reverse().ToList();
            var cleanedSegments = new List<string>();
            int slashLen = urlSlashes.Count();
            for (int i = 0; i < slashLen; i++)
            {
                string segment = urlSlashes[i];
                /* Split off and save anything that looks like a file type. */
                if (segment.IndexOf('.') != -1)
                {
                    string possibleType = segment.Split('.')[1];
                    /* If the type isn't alpha-only, it's probably not actually a file extension. */
                    if (!Regex.IsMatch(possibleType, "[a-zA-Z]"))
                    {
                        segment = segment.Split('.')[0];
                    }
                }
                /*
                 * EW-CMS specific segment replacement. Ugly.
                 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
                 */
                if (segment.IndexOf(",00") != -1)
                {
                    segment = segment.Replace(",00", "");
                }
                /* If our first or second segment has anything looking like a page number, remove it. */
                var pageNumRegex = new Regex("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$", RegexOptions.IgnoreCase);
                if (pageNumRegex.IsMatch(segment) && ((i == 1) || (i == 0)))
                {
                    segment = pageNumRegex.Replace(segment, "");
                }
                /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
                bool del = (i < 2 && Regex.IsMatch(segment, @"^[\d]{1,2}$"));
                /* If this is the first segment and it's just "index," remove it. */
                if (i == 0 && segment.ToLower() == "index")
                {
                    del = true;
                }
                /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
                // TODO: Check these "purely alpha" regexes. They don't seem right.
                if (i < 2 && segment.Length < 3 && !Regex.IsMatch(urlSlashes[0], "[a-z]", RegexOptions.IgnoreCase))
                {
                    del = true;
                }
                /* If it's not marked for deletion, push it to cleanedSegments */
                if (!del)
                {
                    cleanedSegments.Add(segment);
                }
            }
            /* This is our final, cleaned, base article URL. */
            cleanedSegments.Reverse();
            return string.Format("{0}://{1}{2}", protocol, hostname, String.Join("/", cleanedSegments.ToArray()));
        }
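        // Illustrative sketch only, not part of the original source: a worked example of the URL
        // normalization FindBaseUrl performs. The URL below is hypothetical.
        private static void FindBaseUrlExample()
        {
            // The trailing numeric segment ("/2") is treated as a page number and dropped, so every
            // page of a multi-page article collapses to the same base article URL.
            string baseUrl = FindBaseUrl("http://example.com/articles/my-story/2");
            Debug.WriteLine(baseUrl); // expected: "http://example.com/articles/my-story"
        }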
        public static IEnumerable<XElement> GetElementsByTagName(XContainer container, string tagName)
        {
            if (container == null)
            {
                throw new ArgumentNullException("container");
            }
            if (string.IsNullOrEmpty(tagName))
            {
                throw new ArgumentNullException("tagName");
            }
            return container.Descendants()
                .Where(e => tagName.Equals(e.Name.LocalName, StringComparison.OrdinalIgnoreCase));
        }
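        // Illustrative sketch only, not part of the original source: GetElementsByTagName compares
        // local element names case-insensitively, so mixed-case or namespaced tags are still matched.
        // The fragment below is invented.
        private static int GetElementsByTagNameExample()
        {
            XElement fragment = XElement.Parse("<div><A href='#one'>one</A><p><a href='#two'>two</a></p></div>");
            return GetElementsByTagName(fragment, "a").Count(); // expected: 2
        }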
        public static string GetClass(XElement element)
        {
            return GetAttributeValue(element, "class", "");
        }
        public static string GetId(XElement element)
        {
            return GetAttributeValue(element, "id", "");
        }
        public static string GetAttributeValue(XElement element, string attributeName, string defaultValue)
        {
            if (element == null)
            {
                throw new ArgumentNullException("element");
            }
            if (string.IsNullOrEmpty(attributeName))
            {
                throw new ArgumentNullException("attributeName");
            }
            var attribute = element.Attribute(attributeName);
            return attribute != null
                ? (attribute.Value ?? defaultValue)
                : defaultValue;
        }
        internal static string GetInnerText(XNode node, bool dontNormalizeSpaces)
        {
            if (node == null)
            {
                throw new ArgumentNullException("node");
            }
            string result;
            if (node is XElement)
            {
                result = ((XElement)node).Value;
            }
            else if (node is XText)
            {
                result = ((XText)node).Value;
            }
            else
            {
                throw new NotSupportedException(string.Format("Nodes of type '{0}' are not supported.", node.GetType()));
            }
            result = (result ?? "").Trim();
            if (!dontNormalizeSpaces)
            {
                return _NormalizeSpacesRegex.Replace(result, " ");
            }
            return result;
        }
        internal static string GetInnerText(XNode node)
        {
            return GetInnerText(node, false);
        }
    }

    public class SgmlDomBuilder
    {
        #region Public methods

        public static XElement GetBody(XDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }
            var documentRoot = document.Root;
            if (documentRoot == null)
            {
                return null;
            }
            return MultiPageUtils.GetElementsByTagName(documentRoot, "body").FirstOrDefault();
        }
        /// <summary>
        /// Constructs a DOM (System.Xml.Linq.XDocument) from HTML markup.
        /// </summary>
        /// <param name="htmlContent">HTML markup from which the DOM is to be constructed.</param>
        /// <returns>System.Xml.Linq.XDocument instance which is a DOM of the provided HTML markup.</returns>
        public static XDocument BuildDocument(string htmlContent)
        {
            if (htmlContent == null)
            {
                throw new ArgumentNullException("htmlContent");
            }
            if (htmlContent.Trim().Length == 0)
            {
                return new XDocument();
            }
            // "trim end" htmlContent to ...</html>$ (codinghorror.com puts some scripts after the </html> - sic!)
            const string htmlEnd = "</html";
            int indexOfHtmlEnd = htmlContent.LastIndexOf(htmlEnd);
            if (indexOfHtmlEnd != -1)
            {
                int indexOfHtmlEndBracket = htmlContent.IndexOf('>', indexOfHtmlEnd);
                if (indexOfHtmlEndBracket != -1)
                {
                    htmlContent = htmlContent.Substring(0, indexOfHtmlEndBracket + 1);
                }
            }
            XDocument document;
            try
            {
                document = LoadDocument(htmlContent);
            }
            catch (InvalidOperationException exc)
            {
                // sometimes SgmlReader doesn't handle <script> tags well and XDocument.Load() throws,
                // so we can retry with the html content with <script> tags stripped off
                if (!exc.Message.Contains("EndOfFile"))
                {
                    throw;
                }
                htmlContent = HtmlUtils.RemoveScriptTags(htmlContent);
                document = LoadDocument(htmlContent);
            }
            return document;
        }
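        // Illustrative sketch only, not part of the original source: the typical flow from raw HTML
        // to the <body> element that MultiPageUtils.FindNextPageLink consumes. The markup is invented.
        private static XElement BuildBodyExample()
        {
            string html = "<html><body><p>Hello</p><a href=\"/page/2\">next</a></body></html>";
            XDocument document = BuildDocument(html); // lenient SgmlReader parse, tags folded to lower case
            return GetBody(document);                 // null when the document has no <body> element
        }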
        private static XDocument LoadDocument(string htmlContent)
        {
            using (var sgmlReader = new SgmlReader())
            {
                sgmlReader.CaseFolding = CaseFolding.ToLower;
                sgmlReader.DocType = "HTML";
                using (var sr = new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(htmlContent))))
                {
                    sgmlReader.InputStream = sr;
                    var document = XDocument.Load(sgmlReader);
                    return document;
                }
            }
        }
        #endregion
    }

    public static class HtmlUtils
    {
        public static string RemoveScriptTags(string htmlContent)
        {
            if (htmlContent == null)
            {
                throw new ArgumentNullException("htmlContent");
            }
            if (htmlContent.Length == 0)
            {
                return "";
            }
            int indexOfScriptTagStart = htmlContent.IndexOf("<script", StringComparison.OrdinalIgnoreCase);
            if (indexOfScriptTagStart == -1)
            {
                return htmlContent;
            }
            int indexOfScriptTagEnd = htmlContent.IndexOf("</script>", indexOfScriptTagStart, StringComparison.OrdinalIgnoreCase);
            if (indexOfScriptTagEnd == -1)
            {
                return htmlContent.Substring(0, indexOfScriptTagStart);
            }
            string strippedHtmlContent =
                htmlContent.Substring(0, indexOfScriptTagStart) +
                htmlContent.Substring(indexOfScriptTagEnd + "</script>".Length);
            return RemoveScriptTags(strippedHtmlContent);
        }
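        // Illustrative sketch only, not part of the original source: RemoveScriptTags strips every
        // <script>...</script> block (and truncates at an unterminated <script>). The markup is invented.
        private static void RemoveScriptTagsExample()
        {
            string html = "<p>before</p><script>var x = 1;</script><p>after</p>";
            string cleaned = RemoveScriptTags(html);
            Debug.WriteLine(cleaned); // expected: "<p>before</p><p>after</p>"
        }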
    }
}