/branches/v2.1/Ivony.Html.Parser/JumonyReader.cs

# · C# · 294 lines · 140 code · 79 blank · 75 comment · 21 complexity · e629797acd8cdb59d5b4b3ee51595a35 MD5 · raw file

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using Ivony.Html.Parser.ContentModels;
  6. using System.Text.RegularExpressions;
  7. namespace Ivony.Html.Parser
  8. {
  /// <summary>
  /// An implementation of the HTML document reader provided by Jumony.
  /// </summary>
  /// <remarks>
  /// The reader scans <see cref="HtmlText"/> with regular expressions and reports the
  /// document as a sequence of content fragments (tags, comments, special tags,
  /// doctype declarations and the text between them).
  /// </remarks>
  public class JumonyReader : IHtmlReader
  {

    /// <summary>
    /// Regular expression object used to match HTML tags.
    /// </summary>
    protected static readonly Regex tagRegex = new Regulars.HtmlTag();


    /// <summary>
    /// Creates a JumonyReader object.
    /// </summary>
    /// <param name="htmlText">The HTML text to analyze; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="htmlText"/> is null.</exception>
    public JumonyReader( string htmlText )
    {
      if ( htmlText == null )
        throw new ArgumentNullException( "htmlText" );

      HtmlText = htmlText;
      CDataElement = null;
    }


    /// <summary>
    /// The HTML text being analyzed.
    /// </summary>
    public string HtmlText
    {
      get;
      private set;
    }


    /// <summary>
    /// When the reader is currently inside a CData element, this property holds the
    /// element name; otherwise it is null.
    /// </summary>
    protected string CDataElement
    {
      get;
      private set;
    }


    /// <summary>
    /// Switches the reader into CData mode for the given element. The next step of
    /// <see cref="EnumerateContent"/> then searches for that element's end tag
    /// instead of the next ordinary content node.
    /// </summary>
    /// <param name="elementName">Name of the CData element that has just been entered.</param>
    void IHtmlReader.EnterCDataMode( string elementName )
    {
      CDataElement = elementName;
    }


    /// <summary>
    /// Enumerates every content fragment that is read from the document.
    /// </summary>
    /// <remarks>
    /// Any gap between the read pointer and the next node is reported as a text
    /// fragment before the node itself; text remaining after the last node is
    /// reported as a final text fragment. Evaluation is lazy, so a consumer may call
    /// <see cref="IHtmlReader.EnterCDataMode"/> between two items to influence how
    /// the following node is located.
    /// </remarks>
    /// <returns>The enumerated fragments.</returns>
    public IEnumerable<HtmlContentFragment> EnumerateContent()
    {

      var index = 0;// read pointer

      while ( true )
      {

        HtmlContentFragment contentNode;

        // CData handling: inside a CData element only its end tag is searched for.
        if ( CDataElement != null )
        {
          contentNode = FindEndTag( index, CDataElement );
          CDataElement = null;// CData mode is left automatically after a single lookup
        }
        else
          contentNode = NextContentNode( index );


        if ( contentNode == null )
        {
          // No further node: whatever is left is trailing text.
          if ( index != HtmlText.Length )
            yield return CreateText( index, HtmlText.Length );

          yield break;
        }

        // A node has been read; first report the text in front of it, if any.
        if ( index < contentNode.StartIndex )
          yield return CreateText( index, contentNode.StartIndex );

        yield return contentNode;

        index = contentNode.StartIndex + contentNode.Length;// advance the read pointer past the node
      }
    }


    /// <summary>
    /// Finds the end tag of the specified element (used to locate the end of a CData element).
    /// </summary>
    /// <param name="index">Position at which the search starts.</param>
    /// <param name="elementName">Name of the element.</param>
    /// <returns>The end tag that was found, or null when no end tag matches from <paramref name="index"/> onward.</returns>
    protected virtual HtmlEndTag FindEndTag( int index, string elementName )
    {
      Regex endTagRegex = HtmlSpecification.GetEndTagRegex( elementName );
      var endTagMatch = endTagRegex.Match( HtmlText, index );

      if ( !endTagMatch.Success )
        return null;

      return new HtmlEndTag( CreateFragment( endTagMatch ), elementName );
    }


    /// <summary>
    /// Reads the next HTML content node (begin tag, end tag, comment, special tag or doctype declaration).
    /// </summary>
    /// <param name="index">Position at which reading starts.</param>
    /// <returns>The next content node, or null when no further tag matches.</returns>
    /// <exception cref="InvalidOperationException">
    /// The tag expression matched, but none of the known named groups succeeded.
    /// </exception>
    protected virtual HtmlContentFragment NextContentNode( int index )
    {
      var match = tagRegex.Match( HtmlText, index );

      if ( !match.Success )// no more tags
        return null;


      // The named group that succeeded decides which kind of node is created.
      if ( match.Groups["beginTag"].Success )
        return CreateBeginTag( match );

      else if ( match.Groups["endTag"].Success )
        return CreateEndTag( match );

      else if ( match.Groups["comment"].Success )
        return CreateComment( match );

      else if ( match.Groups["special"].Success )
        return CreateSpacial( match );

      else if ( match.Groups["doctype"].Success )
        return CreateDoctypeDeclaration( match );

      else
        throw new InvalidOperationException( "The HTML tag expression matched at index " + match.Index + ", but none of the groups beginTag, endTag, comment, special or doctype succeeded." );
    }


    /// <summary>
    /// Creates a begin tag content object.
    /// </summary>
    /// <param name="match">The match of the begin tag.</param>
    /// <returns>The begin tag content object.</returns>
    protected virtual HtmlBeginTag CreateBeginTag( Match match )
    {
      string tagName = match.Groups["tagName"].Value;
      bool selfClosed = match.Groups["selfClosed"].Success;

      // process all attributes
      var attributes = CreateAttributes( match );

      var fragment = CreateFragment( match );

      return new HtmlBeginTag( fragment, tagName, selfClosed, attributes );
    }


    /// <summary>
    /// Creates the attribute setting content objects of a begin tag.
    /// </summary>
    /// <remarks>
    /// This is an iterator: the attribute objects are produced lazily, each time the
    /// returned sequence is enumerated.
    /// </remarks>
    /// <param name="match">The match of the begin tag that carries the attribute captures.</param>
    /// <returns>One HTML attribute setting content object per capture of the "attribute" group.</returns>
    protected virtual IEnumerable<HtmlAttributeSetting> CreateAttributes( Match match )
    {
      foreach ( Capture capture in match.Groups["attribute"].Captures )
      {
        // Exactly one name capture is required; the value capture is optional,
        // in which case the value is null.
        string name = capture.FindCaptures( match.Groups["attrName"] ).Single().Value;
        string value = capture.FindCaptures( match.Groups["attrValue"] ).Select( c => c.Value ).SingleOrDefault();

        yield return new HtmlAttributeSetting( CreateFragment( capture ), name, value );
      }
    }


    /// <summary>
    /// Creates an end tag from the match result.
    /// </summary>
    /// <param name="match">The regular expression match result.</param>
    /// <returns>The object describing the end tag content.</returns>
    protected virtual HtmlEndTag CreateEndTag( Match match )
    {
      string tagName = match.Groups["tagName"].Value;
      var fragment = CreateFragment( match );

      return new HtmlEndTag( fragment, tagName );
    }


    /// <summary>
    /// Creates a comment tag from the match result.
    /// </summary>
    /// <param name="match">The regular expression match result.</param>
    /// <returns>The object describing the comment tag content.</returns>
    protected virtual HtmlCommentContent CreateComment( Match match )
    {
      var commentText = match.Groups["commentText"].Value;
      var fragment = CreateFragment( match );

      return new HtmlCommentContent( fragment, commentText );
    }


    /// <summary>
    /// Creates a special tag from the match result.
    /// </summary>
    /// <param name="match">The regular expression match result.</param>
    /// <returns>The object describing the special tag content.</returns>
    protected virtual HtmlSpecialTag CreateSpacial( Match match )
    {
      var raw = match.ToString();
      // The second character of the matched text is used as the tag's symbol.
      var symbol = raw.Substring( 1, 1 );
      var content = match.Groups["specialText"].Value;
      var fragment = CreateFragment( match );

      return new HtmlSpecialTag( fragment, content, symbol );
    }


    /// <summary>
    /// Creates a document type declaration from the match result.
    /// </summary>
    /// <param name="match">The regular expression match result.</param>
    /// <returns>The object describing the document type declaration content.</returns>
    private HtmlContentFragment CreateDoctypeDeclaration( Match match )
    {
      var fragment = CreateFragment( match );

      return new HtmlDoctypeDeclaration( fragment );
    }


    /// <summary>
    /// Creates a text content object for a range of the document.
    /// </summary>
    /// <param name="startIndex">Position at which the text starts.</param>
    /// <param name="endIndex">Position at which the text ends (exclusive; the length is <paramref name="endIndex"/> minus <paramref name="startIndex"/>).</param>
    /// <returns>The object describing the text content.</returns>
    protected virtual HtmlTextContent CreateText( int startIndex, int endIndex )
    {
      var text = new HtmlTextContent( new HtmlContentFragment( this, startIndex, endIndex - startIndex ) );
      return text;
    }


    /// <summary>
    /// Creates a document content fragment object.
    /// </summary>
    /// <param name="capture">The captured string; its index and length define the fragment.</param>
    /// <returns>The document content fragment object.</returns>
    protected HtmlContentFragment CreateFragment( Capture capture )
    {
      return new HtmlContentFragment( this, capture.Index, capture.Length );
    }
  }
  215. }