PageRenderTime 35ms CodeModel.GetById 18ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 1ms

/CBR/CBR.Core/Helpers/HTML/HtmlParser.cs

#
C# | 565 lines | 294 code | 78 blank | 193 comment | 64 complexity | d6dc8162b8d8ba83fd4c7b9a447a1c44 MD5 | raw file
  1//---------------------------------------------------------------------------
  2// 
  3// File: HtmlParser.cs
  4//
  5// Copyright (C) Microsoft Corporation.  All rights reserved.
  6//
  7// Description: Parser for Html-to-Xaml converter
  8//
  9//---------------------------------------------------------------------------
 10
 11using System;
 12using System.Xml;
 13using System.Diagnostics;
 14using System.Collections;
 15using System.Collections.Generic;
 16using System.IO;
 17using System.Text; // StringBuilder
 18
 19// important TODOS: 
 20// TODO 1. Start tags: The ParseXmlElement function has been modified to be called after both the 
 21// angle bracket < and element name have been read, instead of just the < bracket and some valid name character, 
 22// previously the case. This change was made so that elements with optional closing tags could read a new
 23// element's start tag and decide whether they were required to close. However, there is a question of whether to
 24// handle this in the parser or lexical analyzer. It is currently handled in the parser - the lexical analyzer still
 25// recognizes a start tag opener as a '<' + valid name start char; it is the parser that reads the actual name. 
 26// this is correct behavior assuming that the name is a valid html name, because the lexical analyzer should not know anything
 27// about optional closing tags, etc. UPDATED: 10/13/2004: I am updating this to read the whole start tag of something 
 28// that is not an HTML, treat it as empty, and add it to the tree. That way the converter will know it's there, but
 29// it will have no content. We could also partially recover by trying to look up and match names if they are similar.
 30// TODO 2. Invalid element names: However, it might make sense to give the lexical analyzer the ability to identify
 31// a valid html element name and not return something as a start tag otherwise. For example, if we type <good>, should
 32// the lexical analyzer return that it has found the start of an element when this is not the case in HTML? But this will
 33// require implementing a lookahead token in the lexical analyzer so that it can treat an invalid element name as text. One 
 34// character of lookahead will not be enough.
 35// TODO 3. Attributes: The attribute recovery is poor when reading attribute values in quotes - if no closing quotes are found,
 36// the lexical analyzer just keeps reading and if it eventually reaches the end of file, it would have just skipped everything.
 37// There are a couple of ways to deal with this: 1) stop reading attributes when we encounter a '>' character - this doesn't allow
 38// the '>' character to be used in attribute values, but it can still be used as an entity. 2) Maintain a HTML-specific list
 39// of attributes and their values that each html element can take, and if we find correct attribute names and values for an
 40// element we use them regardless of the quotes, this way we could just ignore something invalid. One more option: 3) Read ahead
 41// in the quoted value and if we find an end of file, we can return to where we were and process as text. However this requires
 42// a lot of lookahead and a resettable reader.
 43// TODO 4: elements with optional closing tags: For elements with optional closing tags, we always close the element if we find
 44// that one of its ancestors has closed. This condition may be too broad and we should develop a better heuristic. We should also
 45// improve the heuristics for closing certain elements when the next element starts
 46// TODO 5. Nesting: Support for unbalanced nesting, e.g. <b> <i> </b> </i>: this is not presently supported. To support it we may need
 47// to maintain two xml elements, one the element that represents what has already been read and another represents what we are presently reading.
 48// Then if we encounter an unbalanced nesting tag we could close the element that was supposed to close, save the current element
 49// and store it in the list of already-read content, and then open a new element to which all tags that are currently open
 50// can be applied. Is there a better way to do this? Should we do it at all?
 51// TODO 6. Elements with optional starting tags: there are 4 such elements in the HTML 4 specification - html, tbody, body and head.
 52// The current recovery doesn't do anything for any of these elements except the html element, because it's not critical - head
 53// and body elements can be contained within html element, and tbody is contained within table. To extend this for XHTML
 54// extensions, and to recover in case other elements are missing start tags, we would need to insert an extra recursive call
 55// to ParseXmlElement for the missing start tag. It is suggested to do this by giving ParseXmlElement an argument that specifies
 56// a name to use. If this argument is null, it  assumes its name is the next token from the lexical analyzer and continues
 57// exactly as it does now. However, if the argument contains a valid html element name then it takes that value as its name
 58// and continues as before. This way, if the next token is the element that should actually be its child, it will see
 59// the name in the next step and initiate a recursive call. We would also need to add some logic in the loop for when a start tag
 60// is found - if the start tag is not compatible with current context and indicates that a start tag has been missed, then we
 61// can initiate the extra recursive call and give it the name of the missed start tag. The issues are when to insert this logic,
 62// and if we want to support it over multiple missing start tags. If we insert it at the time a start tag is read in element
 63// text,  then we can support only one missing start tag, since the extra call will read the next start tag and make a recursive
 64// call without checking the context. This is a conceptual problem, and the check should be made just before a recursive call,
 65// with the choice being whether we should supply an element name as argument, or leave it as NULL and read from the input
 66// TODO 7: Context: Is it appropriate to keep track of context here? For example, should we only expect td, tr elements when
 67// reading a table and ignore them otherwise? This may be too much of a load on the parser, I think it's better if the converter
 68// deals with it
 69
 70
 71namespace HTMLConverter
 72{
 73    /// <summary>
 74    /// HtmlParser class accepts a string of possibly badly formed Html, parses it and returns an
 75    /// XmlElement tree of well-formed Html that is as close to the original string in content as possible
 76    /// </summary>
 77
 78    internal class HtmlParser
 79    {
 80        // ---------------------------------------------------------------------
 81        //
 82        // Constructors
 83        //
 84        // ---------------------------------------------------------------------
 85
 86        #region Constructors
 87
 88        /// <summary>
 89        /// Constructor. Initializes the _htmlLexicalAnalayzer element with the given input string
 90        /// </summary>
 91        /// <param name="inputString">
 92        /// string to parsed into well-formed Html
 93        /// </param>
 94        private HtmlParser(string inputString)
 95        {
 96            // Create an output xml document
 97            _document = new XmlDocument();
 98
 99            // initialize open tag stack
100            _openedElements = new Stack<XmlElement>();
101
102            _pendingInlineElements = new Stack<XmlElement>();
103
104            // initialize lexical analyzer
105            _htmlLexicalAnalyzer = new HtmlLexicalAnalyzer(inputString);
106
107            // get first token from input, expecting text
108            _htmlLexicalAnalyzer.GetNextContentToken();
109        }
110
111        #endregion Constructors
112
113        // ---------------------------------------------------------------------
114        //
115        // Internal Methods
116        //
117        // ---------------------------------------------------------------------
118
119        #region Internal Methods
120
121        /// <summary>
122        /// Instantiates an HtmlParser element and calls the parsing function on the given input string
123        /// </summary>
124        /// <param name="htmlString">
125        /// Input string of pssibly badly-formed Html to be parsed into well-formed Html
126        /// </param>
127        /// <returns>
128        /// XmlElement rep
129        /// </returns>
130        internal static XmlElement ParseHtml(string htmlString)
131        {
132            HtmlParser htmlParser = new HtmlParser(htmlString);
133
134            XmlElement htmlRootElement = htmlParser.ParseHtmlContent();
135
136            return htmlRootElement;
137        }
138
139        // .....................................................................
140        //
141        // Html Header on Clipboard
142        //
143        // .....................................................................
144
145        // Html header structure.
146        //      Version:1.0
147        //      StartHTML:000000000
148        //      EndHTML:000000000
149        //      StartFragment:000000000
150        //      EndFragment:000000000
151        //      StartSelection:000000000
152        //      EndSelection:000000000
153        internal const string HtmlHeader = "Version:1.0\r\nStartHTML:{0:D10}\r\nEndHTML:{1:D10}\r\nStartFragment:{2:D10}\r\nEndFragment:{3:D10}\r\nStartSelection:{4:D10}\r\nEndSelection:{5:D10}\r\n";
154        internal const string HtmlStartFragmentComment = "<!--StartFragment-->";
155        internal const string HtmlEndFragmentComment = "<!--EndFragment-->";
156
157        /// <summary>
158        /// Extracts Html string from clipboard data by parsing header information in htmlDataString
159        /// </summary>
160        /// <param name="htmlDataString">
161        /// String representing Html clipboard data. This includes Html header
162        /// </param>
163        /// <returns>
164        /// String containing only the Html data part of htmlDataString, without header
165        /// </returns>
166        internal static string ExtractHtmlFromClipboardData(string htmlDataString)
167        {
168            int startHtmlIndex = htmlDataString.IndexOf("StartHTML:");
169            if (startHtmlIndex < 0)
170            {
171                return "ERROR: Urecognized html header";
172            }
173            // TODO: We assume that indices represented by strictly 10 zeros ("0123456789".Length),
174            // which could be wrong assumption. We need to implement more flrxible parsing here
175            startHtmlIndex = Int32.Parse(htmlDataString.Substring(startHtmlIndex + "StartHTML:".Length, "0123456789".Length));
176            if (startHtmlIndex < 0 || startHtmlIndex > htmlDataString.Length)
177            {
178                return "ERROR: Urecognized html header";
179            }
180
181            int endHtmlIndex = htmlDataString.IndexOf("EndHTML:");
182            if (endHtmlIndex < 0)
183            {
184                return "ERROR: Urecognized html header";
185            }
186            // TODO: We assume that indices represented by strictly 10 zeros ("0123456789".Length),
187            // which could be wrong assumption. We need to implement more flrxible parsing here
188            endHtmlIndex = Int32.Parse(htmlDataString.Substring(endHtmlIndex + "EndHTML:".Length, "0123456789".Length));
189            if (endHtmlIndex > htmlDataString.Length)
190            {
191                endHtmlIndex = htmlDataString.Length;
192            }
193
194            return htmlDataString.Substring(startHtmlIndex, endHtmlIndex - startHtmlIndex);
195        }
196
197        /// <summary>
198        /// Adds Xhtml header information to Html data string so that it can be placed on clipboard
199        /// </summary>
200        /// <param name="htmlString">
201        /// Html string to be placed on clipboard with appropriate header
202        /// </param>
203        /// <returns>
204        /// String wrapping htmlString with appropriate Html header
205        /// </returns>
206        internal static string AddHtmlClipboardHeader(string htmlString)
207        {
208            StringBuilder stringBuilder = new StringBuilder();
209
210            // each of 6 numbers is represented by "{0:D10}" in the format string
211            // must actually occupy 10 digit positions ("0123456789")
212            int startHTML = HtmlHeader.Length + 6 * ("0123456789".Length - "{0:D10}".Length);
213            int endHTML = startHTML + htmlString.Length;
214            int startFragment = htmlString.IndexOf(HtmlStartFragmentComment, 0);
215            if (startFragment >= 0)
216            {
217                startFragment = startHTML + startFragment + HtmlStartFragmentComment.Length;
218            }
219            else
220            {
221                startFragment = startHTML;
222            }
223            int endFragment = htmlString.IndexOf(HtmlEndFragmentComment, 0);
224            if (endFragment >= 0)
225            {
226                endFragment = startHTML + endFragment;
227            }
228            else
229            {
230                endFragment = endHTML;
231            }
232
233            // Create HTML clipboard header string
234            stringBuilder.AppendFormat(HtmlHeader, startHTML, endHTML, startFragment, endFragment, startFragment, endFragment);
235
236            // Append HTML body.
237            stringBuilder.Append(htmlString);
238
239            return stringBuilder.ToString();
240        }
241
242        #endregion Internal Methods
243
244        // ---------------------------------------------------------------------
245        //
246        // Private methods
247        //
248        // ---------------------------------------------------------------------
249
250        #region Private Methods
251
252        private void InvariantAssert(bool condition, string message)
253        {
254            if (!condition)
255            {
256                throw new Exception("Assertion error: " + message);
257            }
258        }
259
260        /// <summary>
261        /// Parses the stream of html tokens starting
262        /// from the name of top-level element.
263        /// Returns XmlElement representing the top-level
264        /// html element
265        /// </summary>
266        private XmlElement ParseHtmlContent()
267        {
268            // Create artificial root elelemt to be able to group multiple top-level elements
269            // We create "html" element which may be a duplicate of real HTML element, which is ok, as HtmlConverter will swallow it painlessly..
270            XmlElement htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
271            OpenStructuringElement(htmlRootElement);
272
273            while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EOF)
274            {
275                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.OpeningTagStart)
276                {
277                    _htmlLexicalAnalyzer.GetNextTagToken();
278                    if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
279                    {
280                        string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLower();
281                        _htmlLexicalAnalyzer.GetNextTagToken();
282
283                        // Create an element
284                        XmlElement htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);
285
286                        // Parse element attributes
287                        ParseAttributes(htmlElement);
288
289                        if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd || HtmlSchema.IsEmptyElement(htmlElementName))
290                        {
291                            // It is an element without content (because of explicit slash or based on implicit knowledge aboout html)
292                            AddEmptyElement(htmlElement);
293                        }
294                        else if (HtmlSchema.IsInlineElement(htmlElementName))
295                        {
296                            // Elements known as formatting are pushed to some special
297                            // pending stack, which allows them to be transferred
298                            // over block tags - by doing this we convert
299                            // overlapping tags into normal heirarchical element structure.
300                            OpenInlineElement(htmlElement);
301                        }
302                        else if (HtmlSchema.IsBlockElement(htmlElementName) || HtmlSchema.IsKnownOpenableElement(htmlElementName))
303                        {
304                            // This includes no-scope elements
305                            OpenStructuringElement(htmlElement);
306                        }
307                        else
308                        {
309                            // Do nothing. Skip the whole opening tag.
310                            // Ignoring all unknown elements on their start tags.
311                            // Thus we will ignore them on closinng tag as well.
312                            // Anyway we don't know what to do withthem on conversion to Xaml.
313                        }
314                    }
315                    else
316                    {
317                        // Note that the token following opening angle bracket must be a name - lexical analyzer must guarantee that.
318                        // Otherwise - we skip the angle bracket and continue parsing the content as if it is just text.
319                        //  Add the following asserion here, right? or output "<" as a text run instead?:
320                        // InvariantAssert(false, "Angle bracket without a following name is not expected");
321                    }
322                }
323                else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.ClosingTagStart)
324                {
325                    _htmlLexicalAnalyzer.GetNextTagToken();
326                    if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
327                    {
328                        string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLower();
329
330                        // Skip the name token. Assume that the following token is end of tag,
331                        // but do not check this. If it is not true, we simply ignore one token
332                        // - this is our recovery from bad xml in this case.
333                        _htmlLexicalAnalyzer.GetNextTagToken();
334
335                        CloseElement(htmlElementName);
336                    }
337                }
338                else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Text)
339                {
340                    AddTextContent(_htmlLexicalAnalyzer.NextToken);
341                }
342                else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Comment)
343                {
344                    AddComment(_htmlLexicalAnalyzer.NextToken);
345                }
346
347                _htmlLexicalAnalyzer.GetNextContentToken();
348            }
349
350            // Get rid of the artificial root element
351            if (htmlRootElement.FirstChild is XmlElement &&
352                htmlRootElement.FirstChild == htmlRootElement.LastChild &&
353                htmlRootElement.FirstChild.LocalName.ToLower() == "html")
354            {
355                htmlRootElement = (XmlElement)htmlRootElement.FirstChild;
356            }
357
358            return htmlRootElement;
359        }
360
361        private XmlElement CreateElementCopy(XmlElement htmlElement)
362        {
363            XmlElement htmlElementCopy = _document.CreateElement(htmlElement.LocalName, XhtmlNamespace);
364            for (int i = 0; i < htmlElement.Attributes.Count; i++)
365            {
366                XmlAttribute attribute = htmlElement.Attributes[i];
367                htmlElementCopy.SetAttribute(attribute.Name, attribute.Value);
368            }
369            return htmlElementCopy;
370        }
371
372        private void AddEmptyElement(XmlElement htmlEmptyElement)
373        {
374            InvariantAssert(_openedElements.Count > 0, "AddEmptyElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");
375            XmlElement htmlParent = _openedElements.Peek();
376            htmlParent.AppendChild(htmlEmptyElement);
377        }
378
379        private void OpenInlineElement(XmlElement htmlInlineElement)
380        {
381            _pendingInlineElements.Push(htmlInlineElement);
382        }
383
384        // Opens structurig element such as Div or Table etc.
385        private void OpenStructuringElement(XmlElement htmlElement)
386        {
387            // Close all pending inline elements
388            // All block elements are considered as delimiters for inline elements
389            // which forces all inline elements to be closed and re-opened in the following
390            // structural element (if any).
391            // By doing that we guarantee that all inline elements appear only within most nested blocks
392            if (HtmlSchema.IsBlockElement(htmlElement.LocalName))
393            {
394                while (_openedElements.Count > 0 && HtmlSchema.IsInlineElement(_openedElements.Peek().LocalName))
395                {
396                    XmlElement htmlInlineElement = _openedElements.Pop();
397                    InvariantAssert(_openedElements.Count > 0, "OpenStructuringElement: stack of opened elements cannot become empty here");
398
399                    _pendingInlineElements.Push(CreateElementCopy(htmlInlineElement));
400                }
401            }
402
403            // Add this block element to its parent
404            if (_openedElements.Count > 0)
405            {
406                XmlElement htmlParent = _openedElements.Peek();
407
408                // Check some known block elements for auto-closing (LI and P)
409                if (HtmlSchema.ClosesOnNextElementStart(htmlParent.LocalName, htmlElement.LocalName))
410                {
411                    _openedElements.Pop();
412                    htmlParent = _openedElements.Count > 0 ? _openedElements.Peek() : null;
413                }
414
415                if (htmlParent != null)
416                {
417                    // NOTE:
418                    // Actually we never expect null - it would mean two top-level P or LI (without a parent).
419                    // In such weird case we will loose all paragraphs except the first one...
420                    htmlParent.AppendChild(htmlElement);
421                }
422            }
423
424            // Push it onto a stack
425            _openedElements.Push(htmlElement);
426        }
427
428        private bool IsElementOpened(string htmlElementName)
429        {
430            foreach (XmlElement openedElement in _openedElements)
431            {
432                if (openedElement.LocalName == htmlElementName)
433                {
434                    return true;
435                }
436            }
437            return false;
438        }
439
440        private void CloseElement(string htmlElementName)
441        {
442            // Check if the element is opened and already added to the parent
443            InvariantAssert(_openedElements.Count > 0, "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");
444
445            // Check if the element is opened and still waiting to be added to the parent
446            if (_pendingInlineElements.Count > 0 && _pendingInlineElements.Peek().LocalName == htmlElementName)
447            {
448                // Closing an empty inline element.
449                // Note that HtmlConverter will skip empty inlines, but for completeness we keep them here on parser level.
450                XmlElement htmlInlineElement = _pendingInlineElements.Pop();
451                InvariantAssert(_openedElements.Count > 0, "CloseElement: Stack of opened elements cannot be empty, as we have at least one artificial root element");
452                XmlElement htmlParent = _openedElements.Peek();
453                htmlParent.AppendChild(htmlInlineElement);
454                return;
455            }
456            else if (IsElementOpened(htmlElementName))
457            {
458                while (_openedElements.Count > 1) // we never pop the last element - the artificial root
459                {
460                    // Close all unbalanced elements.
461                    XmlElement htmlOpenedElement = _openedElements.Pop();
462
463                    if (htmlOpenedElement.LocalName == htmlElementName)
464                    {
465                        return;
466                    }
467
468                    if (HtmlSchema.IsInlineElement(htmlOpenedElement.LocalName))
469                    {
470                        // Unbalances Inlines will be transfered to the next element content
471                        _pendingInlineElements.Push(CreateElementCopy(htmlOpenedElement));
472                    }
473                }
474            }
475
476            // If element was not opened, we simply ignore the unbalanced closing tag
477            return;
478        }
479
480        private void AddTextContent(string textContent)
481        {
482            OpenPendingInlineElements();
483
484            InvariantAssert(_openedElements.Count > 0, "AddTextContent: Stack of opened elements cannot be empty, as we have at least one artificial root element");
485
486            XmlElement htmlParent = _openedElements.Peek();
487            XmlText textNode = _document.CreateTextNode(textContent);
488            htmlParent.AppendChild(textNode);
489        }
490
491        private void AddComment(string comment)
492        {
493            OpenPendingInlineElements();
494
495            InvariantAssert(_openedElements.Count > 0, "AddComment: Stack of opened elements cannot be empty, as we have at least one artificial root element");
496
497            XmlElement htmlParent = _openedElements.Peek();
498            XmlComment xmlComment = _document.CreateComment(comment);
499            htmlParent.AppendChild(xmlComment);
500        }
501
502        // Moves all inline elements pending for opening to actual document
503        // and adds them to current open stack.
504        private void OpenPendingInlineElements()
505        {
506            if (_pendingInlineElements.Count > 0)
507            {
508                XmlElement htmlInlineElement = _pendingInlineElements.Pop();
509
510                OpenPendingInlineElements();
511
512                InvariantAssert(_openedElements.Count > 0, "OpenPendingInlineElements: Stack of opened elements cannot be empty, as we have at least one artificial root element");
513
514                XmlElement htmlParent = _openedElements.Peek();
515                htmlParent.AppendChild(htmlInlineElement);
516                _openedElements.Push(htmlInlineElement);
517            }
518        }
519
520        private void ParseAttributes(XmlElement xmlElement)
521        {
522            while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EOF && //
523                _htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.TagEnd && //
524                _htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EmptyTagEnd)
525            {
526                // read next attribute (name=value)
527                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
528                {
529                    string attributeName = _htmlLexicalAnalyzer.NextToken;
530                    _htmlLexicalAnalyzer.GetNextEqualSignToken();
531
532                    _htmlLexicalAnalyzer.GetNextAtomToken();
533
534                    string attributeValue = _htmlLexicalAnalyzer.NextToken;
535                    xmlElement.SetAttribute(attributeName, attributeValue);
536                }
537                _htmlLexicalAnalyzer.GetNextTagToken();
538            }
539        }
540
541        #endregion Private Methods
542
543
544        // ---------------------------------------------------------------------
545        //
546        // Private Fields
547        //
548        // ---------------------------------------------------------------------
549
550        #region Private Fields
551
552        internal const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";
553
554        private HtmlLexicalAnalyzer _htmlLexicalAnalyzer;
555
556        // document from which all elements are created
557        private XmlDocument _document;
558
559        // stack for open elements
560        Stack<XmlElement> _openedElements;
561        Stack<XmlElement> _pendingInlineElements;
562
563        #endregion Private Fields
564    }
565}