/branches/Jumony 2 RC/Ivony.Html.Parser/JumonyReader.cs

# · C# · 279 lines · 150 code · 82 blank · 47 comment · 22 complexity · 0e67bc6380fa58fdb103cdfa4b48e36b MD5 · raw file

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using Ivony.Html.Parser.ContentModels;
  6. using System.Text.RegularExpressions;
  7. namespace Ivony.Html.Parser
  8. {
  /// <summary>
  /// An implementation of the HTML document reader provided by Jumony.
  /// </summary>
  /// <remarks>
  /// The reader scans <see cref="HtmlText"/> with a single regular expression and
  /// reports begin tags, end tags, comments, special tags and doctype declarations;
  /// any text lying between two matches is reported as a text fragment.
  /// </remarks>
  public class JumonyReader : IHtmlReader
  {

    // One alternative (named group) per kind of markup the reader recognises.
    // The name of the group that succeeded tells NextContentNode which fragment to build.
    private static readonly string tagPattern = string.Format( @"(?<beginTag>{0})|(?<endTag>{1})|(?<comment>{2})|(?<special>{3})|(?<doctype>{4})", Regulars.beginTagPattern, Regulars.endTagPattern, Regulars.commentPattern, Regulars.specialTagPattern, Regulars.doctypeDeclarationPattern );

    /// <summary>
    /// The regular expression object used to match HTML tags.
    /// </summary>
    protected static readonly Regex tagRegex = new Regex( tagPattern, RegexOptions.Compiled | RegexOptions.CultureInvariant );

    // Set once WarmUp has run. This is a plain static flag with no synchronization,
    // so concurrent first calls may each execute the (harmless) warm-up match.
    private static bool _isWarmedUp = false;


    /// <summary>
    /// Call this method to warm up JumonyReader.
    /// </summary>
    /// <remarks>
    /// Runs <see cref="tagRegex"/> once against an empty string, presumably so that the
    /// one-time cost of the first use of the compiled expression is paid ahead of the
    /// first real parse. Calls made after the flag has been set do nothing.
    /// </remarks>
    public static void WarmUp()
    {
      if ( !_isWarmedUp )
      {
        tagRegex.IsMatch( "" );
        _isWarmedUp = true;
      }
    }


    /// <summary>
    /// Creates a JumonyReader object.
    /// </summary>
    /// <param name="htmlText">The HTML text to be analysed; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="htmlText"/> is null.</exception>
    public JumonyReader( string htmlText )
    {
      if ( htmlText == null )
        throw new ArgumentNullException( "htmlText" );

      HtmlText = htmlText;
      CDataElement = null;//the reader starts outside of any CData element
    }


    /// <summary>
    /// The HTML text to be analysed.
    /// </summary>
    public string HtmlText
    {
      get;
      private set;
    }


    /// <summary>
    /// When the reader is currently inside a CData element, the name of that element; otherwise null.
    /// </summary>
    protected string CDataElement
    {
      get;
      private set;
    }


    /// <summary>
    /// Switches the reader into CData mode for the specified element.
    /// </summary>
    /// <remarks>
    /// The next step of <see cref="EnumerateContent"/> then searches only for the end tag
    /// of that element (via <see cref="FindEndTag"/>) instead of matching arbitrary markup.
    /// </remarks>
    /// <param name="elementName">The name of the CData element that has just been entered.</param>
    void IHtmlReader.EnterCDataMode( string elementName )
    {
      CDataElement = elementName;
    }


    /// <summary>
    /// Enumerates every content fragment that is read from <see cref="HtmlText"/>.
    /// </summary>
    /// <remarks>
    /// Enumeration is lazy. <see cref="CDataElement"/> is inspected again before each
    /// search, so a call to EnterCDataMode made while a fragment is being consumed
    /// affects how the following fragment is located.
    /// </remarks>
    /// <returns>The sequence of text fragments and markup fragments, in document order.</returns>
    public IEnumerable<HtmlContentFragment> EnumerateContent()
    {

      var index = 0;//read pointer

      while ( true )
      {

        HtmlContentFragment contentNode;

        //CData handling
        if ( CDataElement != null )//inside a CData element: only its end tag is searched for
        {
          contentNode = FindEndTag( index, CDataElement );
          CDataElement = null;//CData mode is left automatically after a single search
        }
        else
          contentNode = NextContentNode( index );


        if ( contentNode == null )
        {
          //nothing more was found: whatever remains is trailing text
          if ( index != HtmlText.Length )
            yield return CreateText( index, HtmlText.Length );

          yield break;
        }
        else//a node was read
        {
          //text lying between the read pointer and the node is reported first
          if ( index < contentNode.StartIndex )
            yield return CreateText( index, contentNode.StartIndex );

          yield return contentNode;
        }

        index = contentNode.StartIndex + contentNode.Length;//advance the read pointer past the node
      }
    }


    /// <summary>
    /// Finds the end tag of the specified element (used to locate the end of a CData element).
    /// </summary>
    /// <param name="index">The position at which the search starts.</param>
    /// <param name="elementName">The element name.</param>
    /// <returns>The end tag that was found, or null when no end tag matches from <paramref name="index"/> onwards.</returns>
    protected virtual HtmlEndTag FindEndTag( int index, string elementName )
    {
      Regex endTagRegex = HtmlSpecification.GetEndTagRegex( elementName );
      var endTagMatch = endTagRegex.Match( HtmlText, index );

      if ( !endTagMatch.Success )
        return null;

      return new HtmlEndTag( CreateFragment( endTagMatch ), elementName );
    }


    /// <summary>
    /// Reads the next HTML content node (begin tag, end tag, comment, special tag or doctype declaration).
    /// </summary>
    /// <param name="index">The position at which reading starts.</param>
    /// <returns>The next content node, or null when no further markup matches from <paramref name="index"/> onwards.</returns>
    /// <exception cref="InvalidOperationException">The expression matched but none of the expected named groups succeeded.</exception>
    protected virtual HtmlContentFragment NextContentNode( int index )
    {
      var match = tagRegex.Match( HtmlText, index );

      if ( !match.Success )//no more markup can be matched
        return null;


      //dispatch on the alternative of tagPattern that produced the match
      if ( match.Groups["beginTag"].Success )
        return CreateBeginTag( match );

      else if ( match.Groups["endTag"].Success )
        return CreateEndTag( match );

      else if ( match.Groups["comment"].Success )
        return CreateComment( match );

      else if ( match.Groups["special"].Success )
        return CreateSpacial( match );

      else if ( match.Groups["doctype"].Success )
        return CreateDoctypeDeclaration( match );

      else
        throw new InvalidOperationException();
    }


    /// <summary>
    /// Creates a begin tag content object.
    /// </summary>
    /// <param name="match">The match of the begin tag.</param>
    /// <returns>The begin tag content object.</returns>
    protected virtual HtmlBeginTag CreateBeginTag( Match match )
    {
      string tagName = match.Groups["tagName"].Value;
      bool selfClosed = match.Groups["selfClosed"].Success;

      //process all attributes; CreateAttributes is an iterator, so they are produced lazily
      var attributes = CreateAttributes( match );

      var fragment = CreateFragment( match );

      return new HtmlBeginTag( fragment, tagName, selfClosed, attributes );
    }


    /// <summary>
    /// Creates the attribute setting content objects of a begin tag.
    /// </summary>
    /// <param name="match">The match of the begin tag whose attributes are read.</param>
    /// <returns>One HTML attribute setting content object per capture of the "attribute" group.</returns>
    protected virtual IEnumerable<HtmlAttributeSetting> CreateAttributes( Match match )
    {
      foreach ( Capture capture in match.Groups["attribute"].Captures )
      {
        //exactly one name capture is required per attribute (Single throws otherwise);
        //the value is optional and is null when the attribute has no value capture
        string name = capture.FindCaptures( match.Groups["attrName"] ).Single().Value;
        string value = capture.FindCaptures( match.Groups["attrValue"] ).Select( c => c.Value ).SingleOrDefault();

        yield return new HtmlAttributeSetting( CreateFragment( capture ), name, value );
      }
    }


    /// <summary>
    /// Creates an end tag content object.
    /// </summary>
    /// <param name="match">The match of the end tag.</param>
    /// <returns>The end tag content object.</returns>
    protected virtual HtmlEndTag CreateEndTag( Match match )
    {
      string tagName = match.Groups["tagName"].Value;
      var fragment = CreateFragment( match );

      return new HtmlEndTag( fragment, tagName );
    }


    /// <summary>
    /// Creates a comment content object.
    /// </summary>
    /// <param name="match">The match of the comment.</param>
    /// <returns>The comment content object, carrying the value of the "commentText" group.</returns>
    protected virtual HtmlCommentContent CreateComment( Match match )
    {
      var commentText = match.Groups["commentText"].Value;
      var fragment = CreateFragment( match );

      return new HtmlCommentContent( fragment, commentText );
    }


    /// <summary>
    /// Creates a special tag content object.
    /// </summary>
    /// <param name="match">The match of the special tag.</param>
    /// <returns>The special tag content object.</returns>
    protected virtual HtmlSpecialTag CreateSpacial( Match match )
    {
      var raw = match.ToString();

      //the symbol is the second character of the matched text
      //NOTE(review): presumably the character following the opening '<' - confirm against Regulars.specialTagPattern
      var symbol = raw.Substring( 1, 1 );
      var content = match.Groups["specialText"].Value;

      var fragment = CreateFragment( match );

      return new HtmlSpecialTag( fragment, content, symbol );
    }


    /// <summary>
    /// Creates a doctype declaration content object.
    /// </summary>
    /// <param name="match">The match of the doctype declaration.</param>
    /// <returns>The doctype declaration content object covering the matched text.</returns>
    private HtmlContentFragment CreateDoctypeDeclaration( Match match )
    {
      var fragment = CreateFragment( match );

      return new HtmlDoctypeDeclaration( fragment );
    }


    /// <summary>
    /// Creates a text content object for a range of <see cref="HtmlText"/>.
    /// </summary>
    /// <param name="startIndex">The index at which the text starts.</param>
    /// <param name="endIndex">The index at which the text ends (exclusive: the fragment length is endIndex - startIndex).</param>
    /// <returns>The text content object.</returns>
    protected virtual HtmlTextContent CreateText( int startIndex, int endIndex )
    {
      var text = new HtmlTextContent( new HtmlContentFragment( this, startIndex, endIndex - startIndex ) );

      return text;
    }


    /// <summary>
    /// Creates a content fragment that covers the same range as a regular expression capture.
    /// </summary>
    /// <param name="capture">The capture (a whole match or a single group capture) to cover.</param>
    /// <returns>A fragment of this reader with the index and length of <paramref name="capture"/>.</returns>
    protected HtmlContentFragment CreateFragment( Capture capture )
    {
      return new HtmlContentFragment( this, capture.Index, capture.Length );
    }
  }
  197. }