/ScrapySharp/Extensions/HtmlParsingHelper.cs

https://bitbucket.org/rflechner/scrapysharp · C# · 270 lines · 134 code · 36 blank · 100 comment · 16 complexity · 861d7134561f1fb6a7c0bbc7b676d042 MD5 · raw file

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Globalization;
  4. using System.IO;
  5. using System.Net;
  6. using System.Text;
  7. using System.Text.RegularExpressions;
  8. using System.Linq;
  9. using System.Web;
  10. using HtmlAgilityPack;
  11. using ScrapySharp.Html.Dom;
  12. namespace ScrapySharp.Extensions
  13. {
  14. public static class HtmlParsingHelper
  15. {
  // Matches a run of one or more space characters; CleanInnerText uses it to
  // collapse such runs into a single space.
  private static readonly Regex spacesRegex = new Regex("[ ]+", RegexOptions.Compiled);

  // Matches either a run of '=' tokens, each followed by zero to two uppercase
  // hexadecimal digits (e.g. "=09", "=C3=B4", or a bare "="), or a run of spaces.
  // CleanInnerHtmlAscii replaces every match with a single space.
  private static readonly Regex asciiRegex = new Regex("(([=][0-9A-F]{0,2})+)|([ ]+)", RegexOptions.Compiled);
  /// <summary>
  /// Converts the text to a <see cref="DateTime"/> by handing it to
  /// <see cref="Convert.ToDateTime(string)"/>.
  /// </summary>
  /// <param name="value">The text to convert.</param>
  /// <returns>The date produced by <see cref="Convert.ToDateTime(string)"/>.</returns>
  public static DateTime ToDate(this string value)
  {
      DateTime converted = Convert.ToDateTime(value);
      return converted;
  }
  /// <summary>
  /// Converts the text to a <see cref="DateTime"/> using an exact format,
  /// forwarding to the three-argument overload with <see cref="CultureInfo.InvariantCulture"/>.
  /// </summary>
  /// <param name="value">The text to convert.</param>
  /// <param name="format">The exact date format the text is expected to follow.</param>
  /// <returns>The result of the culture-aware overload called with the invariant culture.</returns>
  public static DateTime ToDate(this string value, string format)
  {
      CultureInfo invariant = CultureInfo.InvariantCulture;
      return value.ToDate(format, invariant);
  }
  /// <summary>
  /// Converts the text to a <see cref="DateTime"/> using an exact format and culture.
  /// </summary>
  /// <param name="value">The text to convert.</param>
  /// <param name="format">The exact date format the text is expected to follow.</param>
  /// <param name="cultureInfo">The culture passed to the exact-format parser.</param>
  /// <returns>
  /// The parsed date, or <see cref="DateTime.MinValue"/> when the text does not match the format.
  /// </returns>
  public static DateTime ToDate(this string value, string format, CultureInfo cultureInfo)
  {
      DateTime parsed;
      bool succeeded = DateTime.TryParseExact(value, format, cultureInfo, DateTimeStyles.None, out parsed);
      return succeeded ? parsed : DateTime.MinValue;
  }
  /// <summary>
  /// Reads an attribute from the node, supplying an empty string as the default
  /// value to the node's two-argument <c>GetAttributeValue</c> method.
  /// </summary>
  /// <param name="node">The node to read the attribute from.</param>
  /// <param name="name">The attribute name.</param>
  /// <returns>Whatever the node's <c>GetAttributeValue(name, string.Empty)</c> call returns.</returns>
  public static string GetAttributeValue(this HtmlNode node, string name)
  {
      string fallback = string.Empty;
      return node.GetAttributeValue(name, fallback);
  }
  /// <summary>
  /// Loads the HTML text into a new <see cref="HtmlDocument"/> and returns its document node.
  /// </summary>
  /// <param name="content">The HTML text to load.</param>
  /// <returns>The <c>DocumentNode</c> of the freshly loaded document.</returns>
  public static HtmlNode ToHtmlNode(this string content)
  {
      var parsed = new HtmlDocument();
      parsed.LoadHtml(content);

      HtmlNode root = parsed.DocumentNode;
      return root;
  }
  /// <summary>
  /// Reads the whole response stream as text, loads it into a new
  /// <see cref="HtmlDocument"/> and returns its document node.
  /// </summary>
  /// <param name="response">The web response whose stream is read.</param>
  /// <returns>
  /// The <c>DocumentNode</c> of the loaded document; an empty string is loaded
  /// when the response has no stream.
  /// </returns>
  public static HtmlNode ToHtmlNode(this WebResponse response)
  {
      // Start from the "no stream" result and overwrite it only when there is a body to read.
      string html = string.Empty;

      var body = response.GetResponseStream();
      if (body != null)
      {
          using (var reader = new StreamReader(body))
          {
              html = reader.ReadToEnd();
          }
      }

      var parsed = new HtmlDocument();
      parsed.LoadHtml(html);
      return parsed.DocumentNode;
  }
  /// <summary>
  /// Parses the HTML text with <c>HDocument.Parse</c>.
  /// </summary>
  /// <param name="content">The HTML text to parse.</param>
  /// <returns>The value returned by <c>HDocument.Parse</c>.</returns>
  public static HDocument ToHDocument(this string content)
  {
      var parsed = HDocument.Parse(content);
      return parsed;
  }
  /// <summary>
  /// Reads the whole response stream as text and converts it with the string
  /// overload of <c>ToHDocument</c>.
  /// </summary>
  /// <param name="response">The web response whose stream is read.</param>
  /// <returns>
  /// The document built from the response text; an empty string is converted
  /// when the response has no stream.
  /// </returns>
  public static HDocument ToHDocument(this WebResponse response)
  {
      // Start from the "no stream" result and overwrite it only when there is a body to read.
      string html = string.Empty;

      var body = response.GetResponseStream();
      if (body != null)
      {
          using (var reader = new StreamReader(body))
          {
              html = reader.ReadToEnd();
          }
      }

      return ToHDocument(html);
  }
  /// <summary>
  /// Finds the first sibling after <paramref name="node"/> whose <c>Name</c> equals
  /// <paramref name="name"/>.
  /// </summary>
  /// <param name="node">The node whose following siblings are searched.</param>
  /// <param name="name">The tag name to look for (compared with <c>==</c>).</param>
  /// <returns>
  /// The first matching following sibling, or <c>null</c> when the node has no
  /// following sibling with that name (including when it has no following sibling at all).
  /// </returns>
  public static HtmlNode GetNextSibling(this HtmlNode node, string name)
  {
      // Walk the sibling chain; the loop condition also covers a node that is the
      // last child, where NextSibling is null from the start.
      for (var sibling = node.NextSibling; sibling != null; sibling = sibling.NextSibling)
      {
          if (sibling.Name == name)
              return sibling;
      }

      return null;
  }
  /// <summary>
  /// Gets the cleaned text of the "td" cell that follows a "td" cell whose cleaned
  /// text matches <paramref name="name"/>, using <c>NodeValueComparison.Equals</c>.
  /// </summary>
  /// <param name="node">The node whose descendants are searched.</param>
  /// <param name="name">The label text to look for.</param>
  /// <returns>
  /// The cleaned text of the last following cell found, or <c>null</c> when there is none.
  /// </returns>
  public static HtmlValue GetNextTableCellValue(this HtmlNode node, string name)
  {
      return GetNextTableCellValue(node, name, NodeValueComparison.Equals);
  }

  /// <summary>
  /// Gets the cleaned text of the "td" cell that follows a "td" cell whose cleaned
  /// text matches <paramref name="name"/> according to <paramref name="comparison"/>.
  /// </summary>
  /// <param name="node">The node whose descendants are searched.</param>
  /// <param name="name">The label text to look for.</param>
  /// <param name="comparison">How the cell text is compared with <paramref name="name"/>.</param>
  /// <returns>
  /// The cleaned text of the last following cell found, with one leading ":" removed,
  /// or <c>null</c> when there is none.
  /// </returns>
  public static HtmlValue GetNextTableCellValue(this HtmlNode node, string name, NodeValueComparison comparison)
  {
      // The lookup may yield null entries (a matching cell without a following "td"),
      // so pick the last real node instead of dereferencing a possible null.
      var cell = GetNodesFollowedByValue(node, "td", name, comparison)
          .LastOrDefault(candidate => candidate != null);
      if (cell == null)
          return null;

      var innerText = cell.InnerText.CleanInnerHtmlAscii().CleanInnerText();

      // Values are often rendered as ": value"; drop the separator and clean again.
      if (innerText.StartsWith(":", StringComparison.Ordinal))
          innerText = innerText.Substring(1).CleanInnerHtmlAscii().CleanInnerText();

      return innerText;
  }
  /// <summary>
  /// For every descendant named <paramref name="name"/> whose cleaned inner text matches
  /// <paramref name="value"/>, returns the next sibling that has the same tag name.
  /// </summary>
  /// <param name="node">The node whose descendants are searched.</param>
  /// <param name="name">The tag name of both the matched nodes and the returned siblings.</param>
  /// <param name="value">The text to match; it is cleaned with <c>CleanInnerText</c> before comparing.</param>
  /// <param name="comparison">How the node text is compared with <paramref name="value"/>.</param>
  /// <returns>
  /// The following siblings, in document order. Matching nodes that have no following
  /// sibling with that name contribute nothing, so the result never contains <c>null</c>.
  /// </returns>
  public static IEnumerable<HtmlNode> GetNodesFollowedByValue(this HtmlNode node, string name, string value, NodeValueComparison comparison = NodeValueComparison.Equals)
  {
      var comparer = new NodeValueComparer(comparison);
      var expectedText = value.CleanInnerText();

      return node.Descendants(name)
          .Where(candidate => comparer.Compare(candidate.InnerText.CleanInnerHtmlAscii().CleanInnerText(), expectedText))
          .Select(candidate => candidate.GetNextSibling(name))
          // GetNextSibling returns null when nothing follows; callers expect real nodes.
          .Where(sibling => sibling != null)
          .ToArray();
  }
  /// <summary>
  /// Applies the single-node <c>GetNodesFollowedByValue</c> lookup to each node and
  /// flattens the results into one sequence.
  /// </summary>
  /// <param name="nodes">The nodes to search.</param>
  /// <param name="name">The tag name passed to the single-node lookup.</param>
  /// <param name="value">The text passed to the single-node lookup.</param>
  /// <param name="comparison">The comparison passed to the single-node lookup.</param>
  /// <returns>A lazily evaluated sequence of all results, in the order of <paramref name="nodes"/>.</returns>
  public static IEnumerable<HtmlNode> GetNodesFollowedByValue(this IEnumerable<HtmlNode> nodes, string name, string value, NodeValueComparison comparison = NodeValueComparison.Equals)
  {
      return from current in nodes
             from follower in current.GetNodesFollowedByValue(name, value, comparison)
             select follower;
  }
  /// <summary>
  /// Gets the cleaned text of the "tr" row that follows a "tr" row whose cleaned
  /// text matches <paramref name="name"/> according to <paramref name="comparison"/>.
  /// </summary>
  /// <param name="node">The node whose descendants are searched.</param>
  /// <param name="name">The label text to look for.</param>
  /// <param name="comparison">How the row text is compared with <paramref name="name"/>.</param>
  /// <returns>
  /// The cleaned text of the first following row found, with one leading ":" removed,
  /// or <c>null</c> when there is none.
  /// </returns>
  public static HtmlValue GetNextTableLineValue(this HtmlNode node, string name, NodeValueComparison comparison = NodeValueComparison.Equals)
  {
      // The lookup may yield null entries (a matching row without a following "tr"),
      // so pick the first real node instead of dereferencing a possible null.
      var row = GetNodesFollowedByValue(node, "tr", name, comparison)
          .FirstOrDefault(candidate => candidate != null);
      if (row == null)
          return null;

      var innerText = row.InnerText.CleanInnerHtmlAscii().CleanInnerText();

      // Values are often rendered as ": value"; drop the separator and clean again.
      if (innerText.StartsWith(":", StringComparison.Ordinal))
          innerText = innerText.Substring(1).CleanInnerHtmlAscii().CleanInnerText();

      return innerText;
  }
  /// <summary>
  /// Replaces the sequence "=C3=B4" with "ô", then replaces every remaining match of
  /// the class's <c>asciiRegex</c> ("=" escape runs and runs of spaces) with a single space.
  /// </summary>
  /// <example>
  /// "text=09".CleanInnerHtmlAscii() returns "text "
  /// </example>
  /// <param name="expression">The text to clean.</param>
  /// <returns>The text with the escape sequences and space runs replaced.</returns>
  public static string CleanInnerHtmlAscii(this string expression)
  {
      // Restore this one known character first, before the regex blanks out all "=XX" runs.
      string withCircumflexO = expression.Replace("=C3=B4", "ô");
      return asciiRegex.Replace(withCircumflexO, " ");
  }
  /// <summary>
  /// Normalizes whitespace in a piece of inner text: tabs, carriage returns and line feeds
  /// become spaces, the text is HTML-decoded, runs of spaces are collapsed to one space,
  /// and the result is trimmed.
  /// </summary>
  /// <param name="expression">The text to clean.</param>
  /// <returns>The decoded, whitespace-normalized text.</returns>
  public static string CleanInnerText(this string expression)
  {
      string flattened = expression;
      foreach (char whitespace in new[] { '\t', '\r', '\n' })
          flattened = flattened.Replace(whitespace, ' ');

      string decoded = HttpUtility.HtmlDecode(flattened);
      string collapsed = spacesRegex.Replace(decoded, " ");
      return collapsed.Trim();
  }
  233. }
  234. }