PageRenderTime 51ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/CBR/CBR.Core/Helpers/HTML/HtmlParser.cs

#
C# | 565 lines | 294 code | 78 blank | 193 comment | 64 complexity | d6dc8162b8d8ba83fd4c7b9a447a1c44 MD5 | raw file
  1. //---------------------------------------------------------------------------
  2. //
  3. // File: HtmlParser.cs
  4. //
  5. // Copyright (C) Microsoft Corporation. All rights reserved.
  6. //
  7. // Description: Parser for Html-to-Xaml converter
  8. //
  9. //---------------------------------------------------------------------------
  10. using System;
  11. using System.Xml;
  12. using System.Diagnostics;
  13. using System.Collections;
  14. using System.Collections.Generic;
  15. using System.IO;
  16. using System.Text; // StringBuilder
  17. // important TODOS:
  18. // TODO 1. Start tags: The ParseXmlElement function has been modified to be called after both the
  19. // angle bracket < and element name have been read, instead of just the < bracket and some valid name character,
  20. // previously the case. This change was made so that elements with optional closing tags could read a new
  21. // element's start tag and decide whether they were required to close. However, there is a question of whether to
  22. // handle this in the parser or lexical analyzer. It is currently handled in the parser - the lexical analyzer still
  23. // recognizes a start tag opener as a '<' + valid name start char; it is the parser that reads the actual name.
  24. // this is correct behavior assuming that the name is a valid html name, because the lexical analyzer should not know anything
  25. // about optional closing tags, etc. UPDATED: 10/13/2004: I am updating this to read the whole start tag of something
  26. // that is not an HTML element, treat it as empty, and add it to the tree. That way the converter will know it's there, but
  27. // it will have no content. We could also partially recover by trying to look up and match names if they are similar
  28. // TODO 2. Invalid element names: However, it might make sense to give the lexical analyzer the ability to identify
  29. // a valid html element name and not return something as a start tag otherwise. For example, if we type <good>, should
  30. // the lexical analyzer return that it has found the start of an element when this is not the case in HTML? But this will
  31. // require implementing a lookahead token in the lexical analyzer so that it can treat an invalid element name as text. One
  32. // character of lookahead will not be enough.
  33. // TODO 3. Attributes: The attribute recovery is poor when reading attribute values in quotes - if no closing quotes are found,
  34. // the lexical analyzer just keeps reading and if it eventually reaches the end of file, it would have just skipped everything.
  35. // There are a couple of ways to deal with this: 1) stop reading attributes when we encounter a '>' character - this doesn't allow
  36. // the '>' character to be used in attribute values, but it can still be used as an entity. 2) Maintain a HTML-specific list
  37. // of attributes and their values that each html element can take, and if we find correct attribute names and values for an
  38. // element we use them regardless of the quotes, this way we could just ignore something invalid. One more option: 3) Read ahead
  39. // in the quoted value and if we find an end of file, we can return to where we were and process as text. However this requires
  40. // a lot of lookahead and a resettable reader.
  41. // TODO 4: elements with optional closing tags: For elements with optional closing tags, we always close the element if we find
  42. // that one of its ancestors has closed. This condition may be too broad and we should develop a better heuristic. We should also
  43. // improve the heuristics for closing certain elements when the next element starts
  44. // TODO 5. Nesting: Support for unbalanced nesting, e.g. <b> <i> </b> </i>: this is not presently supported. To support it we may need
  45. // to maintain two xml elements, one the element that represents what has already been read and another represents what we are presently reading.
  46. // Then if we encounter an unbalanced nesting tag we could close the element that was supposed to close, save the current element
  47. // and store it in the list of already-read content, and then open a new element to which all tags that are currently open
  48. // can be applied. Is there a better way to do this? Should we do it at all?
  49. // TODO 6. Elements with optional starting tags: there are 4 such elements in the HTML 4 specification - html, tbody, body and head.
  50. // The current recovery doesn't do anything for any of these elements except the html element, because it's not critical - head
  51. // and body elements can be contained within html element, and tbody is contained within table. To extend this for XHTML
  52. // extensions, and to recover in case other elements are missing start tags, we would need to insert an extra recursive call
  53. // to ParseXmlElement for the missing start tag. It is suggested to do this by giving ParseXmlElement an argument that specifies
  54. // a name to use. If this argument is null, it assumes its name is the next token from the lexical analyzer and continues
  55. // exactly as it does now. However, if the argument contains a valid html element name then it takes that value as its name
  56. // and continues as before. This way, if the next token is the element that should actually be its child, it will see
  57. // the name in the next step and initiate a recursive call. We would also need to add some logic in the loop for when a start tag
  58. // is found - if the start tag is not compatible with current context and indicates that a start tag has been missed, then we
  59. // can initiate the extra recursive call and give it the name of the missed start tag. The issues are when to insert this logic,
  60. // and if we want to support it over multiple missing start tags. If we insert it at the time a start tag is read in element
  61. // text, then we can support only one missing start tag, since the extra call will read the next start tag and make a recursive
  62. // call without checking the context. This is a conceptual problem, and the check should be made just before a recursive call,
  63. // with the choice being whether we should supply an element name as argument, or leave it as NULL and read from the input
  64. // TODO 7: Context: Is it appropriate to keep track of context here? For example, should we only expect td, tr elements when
  65. // reading a table and ignore them otherwise? This may be too much of a load on the parser, I think it's better if the converter
  66. // deals with it
  67. namespace HTMLConverter
  68. {
  69. /// <summary>
  70. /// HtmlParser class accepts a string of possibly badly formed Html, parses it and returns a string
  71. /// of well-formed Html that is as close to the original string in content as possible
  72. /// </summary>
  73. internal class HtmlParser
  74. {
  75. // ---------------------------------------------------------------------
  76. //
  77. // Constructors
  78. //
  79. // ---------------------------------------------------------------------
  80. #region Constructors
  /// <summary>
  /// Constructor. Allocates the output document and both element stacks,
  /// creates the lexical analyzer over the given input string and reads the
  /// first content token so that parsing can start right away.
  /// </summary>
  /// <param name="inputString">
  /// String to be parsed into well-formed Html
  /// </param>
  private HtmlParser(string inputString)
  {
      // Stack of elements that are currently open in the output tree,
      // and stack of inline elements seen but not yet attached to it.
      _openedElements = new Stack<XmlElement>();
      _pendingInlineElements = new Stack<XmlElement>();

      // Factory document for every node this parser creates.
      _document = new XmlDocument();

      // Tokenizer over the raw input; prime it with the first token,
      // which is requested in "content" mode.
      _htmlLexicalAnalyzer = new HtmlLexicalAnalyzer(inputString);
      _htmlLexicalAnalyzer.GetNextContentToken();
  }
  99. #endregion Constructors
  100. // ---------------------------------------------------------------------
  101. //
  102. // Internal Methods
  103. //
  104. // ---------------------------------------------------------------------
  105. #region Internal Methods
  /// <summary>
  /// Creates an HtmlParser over the given input string and runs the parse.
  /// </summary>
  /// <param name="htmlString">
  /// Input string of possibly badly-formed Html to be parsed into well-formed Html
  /// </param>
  /// <returns>
  /// The XmlElement produced by ParseHtmlContent, i.e. the root "html" element
  /// of the parsed tree
  /// </returns>
  internal static XmlElement ParseHtml(string htmlString)
  {
      return new HtmlParser(htmlString).ParseHtmlContent();
  }
  // .....................................................................
  //
  // Html Header on Clipboard
  //
  // .....................................................................

  // Format string for the clipboard header. Each placeholder is written as a
  // zero-padded 10-digit decimal number (format specifier D10), giving:
  //   Version:1.0
  //   StartHTML:0000000000
  //   EndHTML:0000000000
  //   StartFragment:0000000000
  //   EndFragment:0000000000
  //   StartSelection:0000000000
  //   EndSelection:0000000000
  internal const string HtmlHeader =
      "Version:1.0\r\n" +
      "StartHTML:{0:D10}\r\n" +
      "EndHTML:{1:D10}\r\n" +
      "StartFragment:{2:D10}\r\n" +
      "EndFragment:{3:D10}\r\n" +
      "StartSelection:{4:D10}\r\n" +
      "EndSelection:{5:D10}\r\n";

  // Marker comments that delimit the fragment inside the Html body.
  internal const string HtmlStartFragmentComment = "<!--StartFragment-->";
  internal const string HtmlEndFragmentComment = "<!--EndFragment-->";
  /// <summary>
  /// Extracts the Html string from clipboard data by reading the StartHTML and
  /// EndHTML offsets from the header in htmlDataString.
  /// </summary>
  /// <param name="htmlDataString">
  /// String representing Html clipboard data. This includes Html header
  /// </param>
  /// <returns>
  /// The part of htmlDataString between the StartHTML and EndHTML offsets
  /// (EndHTML is clamped to the end of the string), or the literal text
  /// "ERROR: Urecognized html header" when either offset is missing, is not a
  /// number, or the two offsets do not describe a valid range.
  /// </returns>
  /// <exception cref="ArgumentNullException">htmlDataString is null</exception>
  internal static string ExtractHtmlFromClipboardData(string htmlDataString)
  {
      if (htmlDataString == null)
      {
          throw new ArgumentNullException("htmlDataString");
      }

      // Kept byte-for-byte (including the spelling) because callers may compare against it.
      const string unrecognizedHeader = "ERROR: Urecognized html header";

      int startHtmlIndex;
      if (!TryReadHeaderOffset(htmlDataString, "StartHTML:", out startHtmlIndex)
          || startHtmlIndex < 0
          || startHtmlIndex > htmlDataString.Length)
      {
          return unrecognizedHeader;
      }

      int endHtmlIndex;
      if (!TryReadHeaderOffset(htmlDataString, "EndHTML:", out endHtmlIndex))
      {
          return unrecognizedHeader;
      }

      // An end offset past the data is tolerated and clamped.
      if (endHtmlIndex > htmlDataString.Length)
      {
          endHtmlIndex = htmlDataString.Length;
      }

      // An end offset before the start would make Substring throw; treat it
      // like any other malformed header.
      if (endHtmlIndex < startHtmlIndex)
      {
          return unrecognizedHeader;
      }

      return htmlDataString.Substring(startHtmlIndex, endHtmlIndex - startHtmlIndex);
  }

  // Finds the first occurrence of label in headerText and parses the integer
  // that follows it (optional blanks, optional sign, then any number of digits).
  // Returns false when the label is absent or no valid Int32 follows it.
  private static bool TryReadHeaderOffset(string headerText, string label, out int offset)
  {
      offset = 0;

      int labelIndex = headerText.IndexOf(label, StringComparison.Ordinal);
      if (labelIndex < 0)
      {
          return false;
      }

      int valueStart = labelIndex + label.Length;
      int valueEnd = valueStart;

      // Leading blanks are accepted, as Int32.Parse accepted them in a fixed-width field.
      while (valueEnd < headerText.Length && (headerText[valueEnd] == ' ' || headerText[valueEnd] == '\t'))
      {
          valueEnd++;
      }

      if (valueEnd < headerText.Length && (headerText[valueEnd] == '-' || headerText[valueEnd] == '+'))
      {
          valueEnd++;
      }

      // The number is not assumed to be exactly 10 digits wide.
      while (valueEnd < headerText.Length && headerText[valueEnd] >= '0' && headerText[valueEnd] <= '9')
      {
          valueEnd++;
      }

      return int.TryParse(
          headerText.Substring(valueStart, valueEnd - valueStart),
          System.Globalization.NumberStyles.Integer,
          System.Globalization.CultureInfo.InvariantCulture,
          out offset);
  }
  /// <summary>
  /// Adds Html clipboard header information to an Html data string so that it
  /// can be placed on clipboard
  /// </summary>
  /// <param name="htmlString">
  /// Html string to be placed on clipboard with appropriate header
  /// </param>
  /// <returns>
  /// The header (HtmlHeader filled in with the computed offsets) followed by
  /// htmlString. The selection offsets are written with the same values as
  /// the fragment offsets.
  /// </returns>
  /// <exception cref="ArgumentNullException">htmlString is null</exception>
  internal static string AddHtmlClipboardHeader(string htmlString)
  {
      if (htmlString == null)
      {
          throw new ArgumentNullException("htmlString");
      }

      // Each of the 6 "{n:D10}" placeholders (7 characters in the format
      // string) expands to exactly 10 digits, so the rendered header is
      // longer than the format string by 6 * (10 - 7) characters.
      // NOTE(review): offsets are counted in UTF-16 characters here; the
      // clipboard format is believed to expect byte offsets - confirm before
      // relying on this for non-ASCII content.
      int startHTML = HtmlHeader.Length + 6 * ("0123456789".Length - "{0:D10}".Length);
      int endHTML = startHTML + htmlString.Length;

      // The markers are fixed ASCII text, so search ordinally: a
      // culture-sensitive search can match at unexpected positions.
      int startMarkerIndex = htmlString.IndexOf(HtmlStartFragmentComment, StringComparison.Ordinal);
      int endMarkerIndex = htmlString.IndexOf(HtmlEndFragmentComment, StringComparison.Ordinal);

      // The fragment begins just after the start marker and ends just before
      // the end marker; without markers it spans the whole Html string.
      int startFragment = startMarkerIndex >= 0
          ? startHTML + startMarkerIndex + HtmlStartFragmentComment.Length
          : startHTML;
      int endFragment = endMarkerIndex >= 0
          ? startHTML + endMarkerIndex
          : endHTML;

      StringBuilder stringBuilder = new StringBuilder();
      // Create HTML clipboard header string
      stringBuilder.AppendFormat(HtmlHeader, startHTML, endHTML, startFragment, endFragment, startFragment, endFragment);
      // Append HTML body.
      stringBuilder.Append(htmlString);
      return stringBuilder.ToString();
  }
  214. #endregion Internal Methods
  215. // ---------------------------------------------------------------------
  216. //
  217. // Private methods
  218. //
  219. // ---------------------------------------------------------------------
  220. #region Private Methods
  /// <summary>
  /// Verifies an internal invariant of the parser.
  /// </summary>
  /// <param name="condition">Condition that is expected to hold</param>
  /// <param name="message">Text appended to "Assertion error: " when it does not</param>
  /// <exception cref="InvalidOperationException">condition is false</exception>
  private static void InvariantAssert(bool condition, string message)
  {
      if (condition)
      {
          return;
      }

      // A specific exception type instead of the bare System.Exception;
      // handlers written as catch (Exception) still receive it.
      throw new InvalidOperationException("Assertion error: " + message);
  }
  /// <summary>
  /// Parses the whole stream of html tokens into an element tree.
  /// All content is collected under an artificial "html" root element.
  /// </summary>
  /// <returns>
  /// The artificial root element, or - when that root contains exactly one
  /// child and the child is itself an "html" element - that child.
  /// </returns>
  private XmlElement ParseHtmlContent()
  {
      // Create artificial root element to be able to group multiple top-level elements.
      // It may duplicate a real HTML element from the input; that case is unwrapped below.
      XmlElement htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
      OpenStructuringElement(htmlRootElement);

      while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EOF)
      {
          HtmlTokenType tokenType = _htmlLexicalAnalyzer.NextTokenType;

          if (tokenType == HtmlTokenType.OpeningTagStart)
          {
              ParseOpeningTag();
          }
          else if (tokenType == HtmlTokenType.ClosingTagStart)
          {
              ParseClosingTag();
          }
          else if (tokenType == HtmlTokenType.Text)
          {
              AddTextContent(_htmlLexicalAnalyzer.NextToken);
          }
          else if (tokenType == HtmlTokenType.Comment)
          {
              AddComment(_htmlLexicalAnalyzer.NextToken);
          }
          // Any other token type is skipped.

          _htmlLexicalAnalyzer.GetNextContentToken();
      }

      // Get rid of the artificial root element when it merely wraps a real one
      XmlElement onlyChild = htmlRootElement.FirstChild as XmlElement;
      if (onlyChild != null
          && object.ReferenceEquals(onlyChild, htmlRootElement.LastChild)
          && string.Equals(onlyChild.LocalName, "html", StringComparison.OrdinalIgnoreCase))
      {
          htmlRootElement = onlyChild;
      }

      return htmlRootElement;
  }

  // Handles the tokens after an opening angle bracket: reads the element name
  // and attributes, then adds or opens the element according to HtmlSchema.
  private void ParseOpeningTag()
  {
      _htmlLexicalAnalyzer.GetNextTagToken();
      if (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.Name)
      {
          // The token following an opening angle bracket is expected to be a
          // name. If it is not, the bracket is dropped and parsing continues
          // with whatever content follows.
          return;
      }

      // Culture-invariant lowering: under some cultures (e.g. Turkish) ToLower()
      // maps 'I' to a dotless i, so names such as "DIV" or "LI" would no longer
      // match the schema lookups.
      string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();
      _htmlLexicalAnalyzer.GetNextTagToken();

      XmlElement htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);
      ParseAttributes(htmlElement);

      if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd || HtmlSchema.IsEmptyElement(htmlElementName))
      {
          // Element without content (explicit slash, or empty according to the schema)
          AddEmptyElement(htmlElement);
      }
      else if (HtmlSchema.IsInlineElement(htmlElementName))
      {
          // Inline elements go to the pending stack, which allows them to be
          // transferred over block tags - this turns overlapping tags into a
          // normal hierarchical element structure.
          OpenInlineElement(htmlElement);
      }
      else if (HtmlSchema.IsBlockElement(htmlElementName) || HtmlSchema.IsKnownOpenableElement(htmlElementName))
      {
          OpenStructuringElement(htmlElement);
      }
      // Otherwise the element is unknown: its opening tag is skipped entirely.
      // Because it is never pushed as opened, CloseElement will ignore its
      // closing tag as well.
  }

  // Handles the tokens after a closing-tag opener: reads the element name and
  // closes the matching element.
  private void ParseClosingTag()
  {
      _htmlLexicalAnalyzer.GetNextTagToken();
      if (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.Name)
      {
          return;
      }

      string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();

      // Skip the name token. The following token is assumed to be the end of
      // the tag but this is not checked; if it is something else, that one
      // token is simply ignored - this is the recovery from a bad closing tag.
      _htmlLexicalAnalyzer.GetNextTagToken();

      CloseElement(htmlElementName);
  }
  /// <summary>
  /// Creates a new element with the same local name and the same attributes
  /// as htmlElement. Child nodes are not copied.
  /// </summary>
  /// <param name="htmlElement">Element to copy the name and attributes from</param>
  /// <returns>The new element, created by _document and not attached to any parent</returns>
  private XmlElement CreateElementCopy(XmlElement htmlElement)
  {
      XmlElement clone = _document.CreateElement(htmlElement.LocalName, XhtmlNamespace);

      foreach (XmlAttribute sourceAttribute in htmlElement.Attributes)
      {
          clone.SetAttribute(sourceAttribute.Name, sourceAttribute.Value);
      }

      return clone;
  }
  /// <summary>
  /// Appends an element without content to the element on top of the
  /// opened-elements stack. The element itself is not pushed onto the stack.
  /// </summary>
  /// <param name="htmlEmptyElement">Element to append</param>
  private void AddEmptyElement(XmlElement htmlEmptyElement)
  {
      InvariantAssert(_openedElements.Count > 0, "AddEmptyElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");
      _openedElements.Peek().AppendChild(htmlEmptyElement);
  }
  /// <summary>
  /// Records an inline element as pending. It is not added to the tree here;
  /// OpenPendingInlineElements attaches pending elements later, and
  /// CloseElement attaches one directly if its closing tag arrives first.
  /// </summary>
  /// <param name="htmlInlineElement">Inline element whose start tag was just read</param>
  private void OpenInlineElement(XmlElement htmlInlineElement)
  {
      _pendingInlineElements.Push(htmlInlineElement);
  }
  /// <summary>
  /// Opens a structuring element such as Div or Table: attaches it to the
  /// current parent (if any) and pushes it onto the opened-elements stack.
  /// </summary>
  /// <param name="htmlElement">Element to open</param>
  private void OpenStructuringElement(XmlElement htmlElement)
  {
      string elementName = htmlElement.LocalName;

      // A block element delimits inline elements: every inline element on top
      // of the opened stack is closed here and a copy of it is queued as
      // pending, so it can be re-opened inside the following content.
      // This keeps inline elements inside the most nested blocks only.
      if (HtmlSchema.IsBlockElement(elementName))
      {
          while (_openedElements.Count > 0)
          {
              if (!HtmlSchema.IsInlineElement(_openedElements.Peek().LocalName))
              {
                  break;
              }

              XmlElement displacedInline = _openedElements.Pop();
              InvariantAssert(_openedElements.Count > 0, "OpenStructuringElement: stack of opened elements cannot become empty here");
              _pendingInlineElements.Push(CreateElementCopy(displacedInline));
          }
      }

      // Attach the new element to its parent
      if (_openedElements.Count > 0)
      {
          XmlElement container = _openedElements.Peek();

          // Some elements are closed implicitly by the start of the next
          // element (as decided by HtmlSchema); in that case attach to the
          // element below instead.
          if (HtmlSchema.ClosesOnNextElementStart(container.LocalName, elementName))
          {
              _openedElements.Pop();
              container = _openedElements.Count > 0 ? _openedElements.Peek() : null;
          }

          // A null container is not expected - it would mean the implicitly
          // closed element had no parent. In that case the new element is
          // left unattached.
          if (container != null)
          {
              container.AppendChild(htmlElement);
          }
      }

      // The new element becomes the current one
      _openedElements.Push(htmlElement);
  }
  /// <summary>
  /// Tells whether any element on the opened-elements stack has the given
  /// local name (exact, case-sensitive match).
  /// </summary>
  /// <param name="htmlElementName">Local name to look for</param>
  /// <returns>true if such an element is currently open; otherwise false</returns>
  private bool IsElementOpened(string htmlElementName)
  {
      bool found = false;

      foreach (XmlElement candidate in _openedElements)
      {
          if (string.Equals(candidate.LocalName, htmlElementName))
          {
              found = true;
              break;
          }
      }

      return found;
  }
  /// <summary>
  /// Processes a closing tag for the element with the given name.
  /// </summary>
  /// <remarks>
  /// Three cases are distinguished:
  /// the name matches the most recently pending inline element - that element
  /// is attached, empty, to the current parent;
  /// the name matches an opened element - elements are popped up to and
  /// including it, and copies of popped inline elements are queued as pending;
  /// otherwise the closing tag is ignored.
  /// </remarks>
  /// <param name="htmlElementName">Local name from the closing tag</param>
  private void CloseElement(string htmlElementName)
  {
      InvariantAssert(_openedElements.Count > 0, "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");

      // Case 1: the element was opened but is still waiting to be added to a parent
      bool closesPendingInline =
          _pendingInlineElements.Count > 0 &&
          _pendingInlineElements.Peek().LocalName == htmlElementName;

      if (closesPendingInline)
      {
          // Closing an empty inline element; it is kept in the tree at parser level.
          XmlElement emptyInline = _pendingInlineElements.Pop();
          _openedElements.Peek().AppendChild(emptyInline);
          return;
      }

      // Case 3: element was never opened - ignore the unbalanced closing tag
      if (!IsElementOpened(htmlElementName))
      {
          return;
      }

      // Case 2: unwind the stack; the bottom entry (the artificial root) is never popped
      while (_openedElements.Count > 1)
      {
          XmlElement unwound = _openedElements.Pop();
          if (unwound.LocalName == htmlElementName)
          {
              break;
          }

          if (HtmlSchema.IsInlineElement(unwound.LocalName))
          {
              // Unbalanced inline elements are carried over to the content that follows
              _pendingInlineElements.Push(CreateElementCopy(unwound));
          }
      }
  }
  /// <summary>
  /// Opens all pending inline elements and then appends a text node with the
  /// given content to the element on top of the opened-elements stack.
  /// </summary>
  /// <param name="textContent">Text for the new text node</param>
  private void AddTextContent(string textContent)
  {
      OpenPendingInlineElements();
      InvariantAssert(_openedElements.Count > 0, "AddTextContent: Stack of opened elements cannot be empty, as we have at least one artificial root element");

      _openedElements.Peek().AppendChild(_document.CreateTextNode(textContent));
  }
  /// <summary>
  /// Opens all pending inline elements and then appends a comment node with
  /// the given text to the element on top of the opened-elements stack.
  /// </summary>
  /// <param name="comment">Text for the new comment node</param>
  private void AddComment(string comment)
  {
      OpenPendingInlineElements();
      InvariantAssert(_openedElements.Count > 0, "AddComment: Stack of opened elements cannot be empty, as we have at least one artificial root element");

      _openedElements.Peek().AppendChild(_document.CreateComment(comment));
  }
  /// <summary>
  /// Moves all inline elements pending for opening to the actual document
  /// and adds them to the opened-elements stack. The element that has been
  /// pending longest is opened first, and each following one is nested
  /// inside the previous one.
  /// </summary>
  private void OpenPendingInlineElements()
  {
      if (_pendingInlineElements.Count == 0)
      {
          return;
      }

      // Iterative on purpose: the number of pending elements is controlled by
      // the input, so one recursive call per element could exhaust the call
      // stack on a long run of inline start tags.
      // ToArray returns the elements in pop order (top of the stack first).
      XmlElement[] pendingTopFirst = _pendingInlineElements.ToArray();
      _pendingInlineElements.Clear();

      // Walk from the bottom of the former stack towards its top
      for (int i = pendingTopFirst.Length - 1; i >= 0; i--)
      {
          XmlElement htmlInlineElement = pendingTopFirst[i];

          InvariantAssert(_openedElements.Count > 0, "OpenPendingInlineElements: Stack of opened elements cannot be empty, as we have at least one artificial root element");
          _openedElements.Peek().AppendChild(htmlInlineElement);
          _openedElements.Push(htmlInlineElement);
      }
  }
  /// <summary>
  /// Reads attribute tokens until the end of the current tag (or the end of
  /// input) and sets each attribute on xmlElement.
  /// </summary>
  /// <param name="xmlElement">Element that receives the attributes</param>
  private void ParseAttributes(XmlElement xmlElement)
  {
      while (true)
      {
          HtmlTokenType tokenType = _htmlLexicalAnalyzer.NextTokenType;
          if (tokenType == HtmlTokenType.EOF ||
              tokenType == HtmlTokenType.TagEnd ||
              tokenType == HtmlTokenType.EmptyTagEnd)
          {
              break;
          }

          // Read next attribute (name=value); tokens that are not names are skipped
          if (tokenType == HtmlTokenType.Name)
          {
              string attributeName = _htmlLexicalAnalyzer.NextToken;

              // The analyzer is asked for the equal sign and then for the value.
              // The resulting token types are not inspected here; whatever text
              // the analyzer reports after the second call is used as the value.
              _htmlLexicalAnalyzer.GetNextEqualSignToken();
              _htmlLexicalAnalyzer.GetNextAtomToken();

              xmlElement.SetAttribute(attributeName, _htmlLexicalAnalyzer.NextToken);
          }

          _htmlLexicalAnalyzer.GetNextTagToken();
      }
  }
  471. #endregion Private Methods
  472. // ---------------------------------------------------------------------
  473. //
  474. // Private Fields
  475. //
  476. // ---------------------------------------------------------------------
  477. #region Private Fields
  // Namespace URI passed to CreateElement for every element the parser creates
  internal const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";

  // Source of html tokens for the input string
  private readonly HtmlLexicalAnalyzer _htmlLexicalAnalyzer;

  // Document from which all elements are created
  private readonly XmlDocument _document;

  // Stack of elements that are currently open in the output tree
  private readonly Stack<XmlElement> _openedElements;

  // Stack of inline elements whose start tag was read but which are not yet added to the tree
  private readonly Stack<XmlElement> _pendingInlineElements;
  485. #endregion Private Fields
  486. }
  487. }