PageRenderTime 338ms CodeModel.GetById 303ms app.highlight 30ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlParser.jj

#
Unknown | 669 lines | 607 code | 62 blank | 0 comment | 0 complexity | ddf1bd7dc765250076a03078b02b5da1 MD5 | raw file
  1/*
  2 * HtmlParser.jj -- JavaCC grammar for HTML.
  3 * Copyright (C) 1999 Quiotix Corporation.
  4 * Copyright (C) 2011 Eric Le Lay
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License, version 2, as
  8 * published by the Free Software Foundation.
  9 *
 10 * This program is distributed in the hope that it will be useful,
 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 14 * for more details.
 15 */
 16
 17
 18/*
 19 * JavaCC grammar file for HTML.
 20 *
 21 * Author: Brian Goetz, Quiotix
 22 * Version: 1.03
 23 * Revision: $Id: HtmlParser.jj 19677 2011-07-17 15:14:39Z kerik-sf $
 24 *
 25 * This grammar parses an HTML document and produces a (flat) parse "tree"
 26 * representing the document.  It preserves almost all information in the
 27 * source document, including carriage control and spacing (except inside
 28 * of tags.)  See the HtmlDocument and HtmlDocument.* classes for a
 29 * description of the parse tree.  The parse tree supports traversal using
 30 * the commonly used "Visitor" pattern.  The HtmlDumper class is a visitor
 31 * which dumps out the tree to an output stream.
 32 *
 33 * It does not require begin tags to be matched with end tags, or validate
 34 * the names or contents of the tags (this can easily be done post-parsing;
 35 * see the HtmlCollector class (which matches begin tags with end tags)
 36 * for an example.)
 37 *
 38 * Notable edge cases include:
 39 * - Quoted string processing.  Quoted strings are matched inside of comments, and
 40 *   as tag attribute values.  Quoted strings are matched in normal text only
 41 *   to the extent that they do not span line breaks.
 42 *
 43 * Please direct comments, questions, gripes or praise to
 44 * html-parser@quiotix.com.  If you like it/hate it/use it, please let us know!
 45 *
 46 * danson: Modified for HtmlSideKick plugin for jEdit, added ability to handle
 47 * jsp as well as html, added locations for tags, etc.  This parser works well
 48 * for most xml-based markup also.
 49 */
 50
 51options { IGNORE_CASE = true; STATIC = false; }
 52
 53PARSER_BEGIN(HtmlParser)
 54
 55package sidekick.html.parser.html;
 56
 57import java.text.MessageFormat;
 58import java.util.*;
 59import java.util.regex.*;
 60import sidekick.util.*;
 61import java.io.Reader;
 62
 63public class HtmlParser {
 64
  // Line separator appended to comment text for every COMMENT_EOL token (see
  // CommentTag).  Defaults to the platform "line.separator" property.
  // NOTE: the field is static, so the value is shared by all HtmlParser
  // instances even though it is assigned through an instance method below.
  65  static String NL = System.getProperty("line.separator");
  66
  // Errors recorded by addException() during parsing; returned by getParseErrors().
  67  private List<ParseError> parseErrors = new ArrayList<ParseError>();
  68
  /**
   * Replaces the line separator used when rebuilding comment text.
   * Because NL is static, the new value applies to every parser instance.
   *
   * @param ls the separator string to use; it is stored as given, without
   *           any validation
   */
  69  public void setLineSeparator(String ls) {
  70    NL = ls;
  71  }
 72
 73  private static String getTokenText(Token first, Token cur) {
 74    Token t;
 75    StringBuffer sb = new StringBuffer();
 76
 77    for (t=first; t != cur.next; t = t.next) {
 78      if (t.specialToken != null) {
 79        Token tt=t.specialToken;
 80        while (tt.specialToken != null)
 81          tt = tt.specialToken;
 82        for (; tt != null; tt = tt.next)
 83          sb.append(tt.image);
 84      };
 85      sb.append(t.image);
 86    };
 87    return sb.toString();
 88  }
 89
  90  /**
   * Creates a parser for HTML that is only part of a larger file.
   * The parser is first built with the single-argument constructor, then its
   * character stream is re-initialised on the same reader with the given
   * line and column (presumably so that reported positions are relative to
   * the enclosing file -- confirm against the character stream class).
   *
   * @param in Reader supplying the text to parse.
   * @param lineOffset The line number of the first line of the fragment.
   * @param columnOffset The column number of the first character of the fragment.
   */
  95  public HtmlParser(Reader in, int lineOffset, int columnOffset){
  96  	this(in);
  97  	jj_input_stream.ReInit(in,lineOffset,columnOffset);
  98  }
 99  
100  public static void main(String[] args) throws ParseException {
101    HtmlParser parser = new HtmlParser(System.in);
102    HtmlDocument doc = parser.HtmlDocument();
103    doc.accept(new HtmlDumper(System.out));
104    System.exit(0);
105  }
106
 /**
  * Sets the tab size on the parser's character stream (jj_input_stream).
  *
  * @param size the tab size passed through to the stream
  */
 107   public void setTabSize(int size) {
 108        jj_input_stream.setTabSize(size);
 109   }
 110
 /**
  * Returns the tab size reported by the parser's character stream.
  * NOTE(review): the meaning of the literal 0 argument is not visible in
  * this file -- confirm against the character stream class.
  */
 111   public int getTabSize() {
 112        return jj_input_stream.getTabSize(0);
 113   }
114
115    private void addException(ParseException pe) {
116        Range range = getExceptionLocation( pe );
117        parseErrors.add(new ParseError(pe.getMessage(), range));
118        pe.printStackTrace();
119    }
120
121    public List<ParseError> getParseErrors() {
122       System.out.println("getParserErrors, there are " + parseErrors.size() + " errors");
123       return parseErrors;
124    }
125
126    // regex to extract line and colun from a ParseException message
127    // ParseException message look like: "Parse error at line 116, column 5.  Encountered: }"
128    private Pattern pePattern = Pattern.compile( "(.*?)(\\d+)(.*?)(\\d+)(.*?)" );
129
130    /**
131     * @return attempts to return a Location indicating the location of a parser
132     * exception.  If the ParseException contains a Token reference, all is well,
133     * otherwise, this method attempts to parse the message string for the
134     * exception.
135     */
136    private Range getExceptionLocation( ParseException pe ) {
137        Token t = pe.currentToken;
138        if ( t != null ) {
139            return new Range( new Location( t.next.beginLine - 1, t.next.beginColumn ), new Location( t.next.endLine - 1, t.next.endColumn ) );
140        }
141
142        // ParseException message look like: "Parse error at line 116, column 5.  Encountered: }"
143        try {
144            Matcher m = pePattern.matcher( pe.getMessage() );
145            if ( m.matches() ) {
146                String ln = m.group( 2 );
147                String cn = m.group( 4 );
148                int line_number = -1;
149                int column_number = 0;
150                if ( ln != null )
151                    line_number = Integer.parseInt( ln );
152                if ( cn != null )
153                    column_number = Integer.parseInt( cn );
154                return line_number > -1 ? new Range( new Location( line_number - 1, column_number - 1 ), new Location( line_number - 1, column_number ) ) : null;
155            }
156            return new Range();
157        }
158        catch ( Exception e ) {
159            //e.printStackTrace();
160            return new Range();
161        }
162    }
163
164    // regex pattern for a valid non-quoted attribute.
165    // Attributes can be single or double quoted, or consist solely of
166    // letters in the range A-Z and a-z, digits (0-9), hyphens ("-"),
167    // and periods (".")
168    private Pattern attributePattern = Pattern.compile( "([a-zA-Z0-9.-])*" );
169    private boolean isProperAttribute(String s) {
170        // could have double quotes
171        if (s.startsWith("\"") && s.endsWith("\"")) {
172            return true;
173        }
174        // or single quotes
175        else if (s.startsWith("'") && s.endsWith("'")) {
176            return true;
177        }
178        // or might be jsp
179        else if (s.startsWith("<%") && (s.endsWith("%>") || s.endsWith("%")) ) {
180            return true;
181        }
182        boolean rtn = attributePattern.matcher(s).matches();
183        if (rtn == false) {
184            System.out.println("bad attribute: " + s);
185        }
186        return rtn;
187    }
188}
189
190PARSER_END(HtmlParser)
191
 // JSP scriptlet/expression handling.
 // In the DEFAULT state "<%" starts a MORE match and switches to IN_JSP_EXP.
 // Inside IN_JSP_EXP every character is accumulated (MORE) until "%>" closes
 // the match as the special token JSP_EXP_END and returns to DEFAULT.  Being a
 // special token, the whole "<% ... %>" text is not handed to the parser as a
 // regular token.  (The longer DEFAULT-state literals "<%@" and "<%--" are
 // separate tokens, defined with the other DEFAULT tokens.)
 192MORE:
 193{
 194    "<%" : IN_JSP_EXP
 195}
 196
 197<IN_JSP_EXP>
 198SPECIAL_TOKEN :
 199{
 200  <JSP_EXP_END: "%>" > : DEFAULT
 201}
 202<IN_JSP_EXP>
 203MORE :
 204{
 205  < ~[] >
 206}
207
 // Private ("#") regular expressions, declared for every lexical state (<*>).
 // They never produce tokens themselves; they are building blocks referenced
 // by the token definitions of the individual states.
 //  - ALPHA_CHAR / IDENTIFIER_CHAR / IDENTIFIER: characters and shape of tag
 //    and attribute names.
 //  - QUOTED_STRING_NB: a quoted string that may not span a line break;
 //    QUOTED_STRING: a quoted string that may.
 //  - WHITESPACE, NEWLINE, QUOTE: single-purpose helpers.
 // NOTE(review): NUM_CHAR, ALPHANUM_CHAR and STYLE_IDENTIFIER are not
 // referenced by any rule visible in this grammar.
 208<*> TOKEN :
 209{
 210  <#ALPHA_CHAR: [
 211       "\u0024",
 212       "\u0041"-"\u005a",
 213       "\u005f",
 214       "\u0061"-"\u007a",
 215       "\u00c0"-"\u00d6",       // Latin with diacritics
 216       "\u00d8"-"\u00f6",       // Latin with diacritics
 217       "\u00f8"-"\u00ff",       // Latin with diacritics
 218       "\u0100"-"\u1fff",       // Latin Extended-A through Greek Extended
 219       "\u3040"-"\u318f",       // Hiragana through Hangul Compatibility Jamo
 220       "\u3300"-"\u337f",       // CJK Compatibility
 221       "\u3400"-"\u3d2d",       // CJK Unified Ideographs Extension A
 222       "\u4e00"-"\u9fff",       // CJK Unified Ideographs
 223       "\uf900"-"\ufaff" ] >    // CJK Compatibility Ideographs
 224| <#NUM_CHAR:   ["0"-"9"] >
 225| <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
 226| <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
 227| <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
 228| <#STYLE_IDENTIFIER: (<ALPHA_CHAR>)+ >
 229| <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
 230                    | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
 231| <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
 232| <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
 233| <#NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
 234| <#QUOTE:      ( "'" | "\"" ) >
 235}
236
 // Tokens recognised in ordinary document text (DEFAULT state).
 //  - EOL: optional spaces/tabs followed by a line break.
 //  - COMMENT_START: opens an HTML ("<!--") or JSP ("<%--") comment.
 //  - ENDTAG_START / TAG_START: open an end tag or a start tag; the forms
 //    with an identifier and ":" carry a name prefix inside the token image,
 //    and "<%@" is accepted as a tag start too.
 //  - DECL_START: opens a "<!" declaration.
 //  - PCDATA: any run of characters other than "<" and line breaks.
 // Each opening token switches the lexer to the state named on its right.
 237<DEFAULT> TOKEN :
 238{
 239  <EOL:               ( " " | "\t" )* <NEWLINE> >
 240| <COMMENT_START:     "<!--" | "<%--"    >                 : LexComment
 241| <ENDTAG_START:      "</" | "</" <IDENTIFIER> ":" >       : LexStartTag
 242| <TAG_START:         "<" | "<%@" | "<" <IDENTIFIER> ":" > : LexStartTag
 243| <DECL_START:        "<!"      >                          : LexDecl
 244| <PCDATA:            ( ~["<", "\r", "\n"] )+ >
 245}
246
 // LexStartTag: directly after a tag opener.  Whitespace is kept as special
 // tokens; the next token is the tag name.
 247<LexStartTag> SPECIAL_TOKEN :
 248{
 249  < (<WHITESPACE>)+ >
 250}
 251
 // SCRIPT and STYLE get their own token kinds (matched in any letter case,
 // since the grammar sets IGNORE_CASE) so the parser can treat those blocks
 // specially.  Any character that cannot start a name becomes LST_ERROR and
 // sends the lexer back to DEFAULT.
 252<LexStartTag> TOKEN :
 253{
 254  <TAG_SCRIPT: "SCRIPT">    : LexInTag
 255| <TAG_STYLE:  "STYLE">     : LexInTag
 256| <TAG_NAME: <IDENTIFIER> > : LexInTag
 257| <LST_ERROR: ~[]>          : DEFAULT
 258}
 259
 // LexInTag: after the tag name, where attributes and the tag end are expected.
 260<LexInTag> SPECIAL_TOKEN :
 261{
 262  < (<WHITESPACE>)+ >
 263}
 264
 // IMPLICIT_TAG_END handles a "<" met while still inside a tag: the lexical
 // action rewrites the matched token into a TAG_END with image ">" and chains
 // a synthetic TAG_START token (image "<", same position) after it, then the
 // lexer continues in LexStartTag.  The parser therefore sees the unfinished
 // tag as closed and a new tag as opened.
 // LIT_ERROR catches any other character and leaves the state unchanged.
 265<LexInTag> TOKEN :
 266{
 267  <ATTR_NAME: <IDENTIFIER> >
 268| <TAG_END: ">" | "%>" >                : DEFAULT
 269| <TAG_SLASHEND: "/>" >                 : DEFAULT
 270| <ATTR_EQ: "=" >                       : LexAttrVal
 271| <IMPLICIT_TAG_END: "<">
 272  {
 273    Token t = new Token();
 274    t.image       = "<";
 275    t.kind        = TAG_START;
 276    t.next        = matchedToken.next;
 277    t.beginLine   = matchedToken.beginLine;
 278    t.beginColumn = matchedToken.beginColumn;
 279    t.endLine     = matchedToken.endLine;
 280    t.endColumn   = matchedToken.endColumn;
 281    matchedToken.next  = t;
 282    matchedToken.kind  = TAG_END;
 283    matchedToken.image = ">";
 284  }                                     : LexStartTag
 285| <LIT_ERROR: ~[]>
 286}
 287
 // LexAttrVal: after "=", where an attribute value is expected.
 288<LexAttrVal> SPECIAL_TOKEN :
 289{
 290  < <WHITESPACE> >
 291}
 292
 // ATTR_VAL is either a quoted string (which may span lines) or a run of
 // characters containing no ">", quote or whitespace; it returns the lexer to
 // LexInTag.  LAV_ERROR catches any other character and leaves the state
 // unchanged.
 293<LexAttrVal> TOKEN :
 294{
 295  <ATTR_VAL: <QUOTED_STRING>
 296| ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexInTag
 297| <LAV_ERROR: ~[]>
 298}
299
 // LexComment: inside a comment.  The text is split into single dashes, line
 // breaks and "words" (runs without line breaks, quotes or dashes; a quoted
 // string that stays on one line; or a lone quote).  COMMENT_END accepts
 // "--" plus optional spaces plus ">", as well as "->" and the JSP form "--%>".
 300<LexComment> TOKEN :
 301{
 302  < COMMENT_END:  ("--" (" ")* ">" | "->" | "--%>" ) > : DEFAULT
 303| < DASH:         "-" >
 304| < COMMENT_EOL:  <NEWLINE> >
 305| < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
 306                    | <QUOTED_STRING_NB>
 307                    | <QUOTE> ) >
 308}
 309
 // LexDecl: inside a "<!" declaration.  DECL_ANY is everything up to the
 // closing ">" (it needs at least one character); DECL_END is that ">".
 310<LexDecl> TOKEN :
 311{
 312  <DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
 313| <DECL_END: ">" > : DEFAULT
 314}
 315
 // LexScript / LexStyle: contents of a script or style block.  No lexer rule
 // switches into these states; the ScriptBlock and StyleBlock productions
 // enter them explicitly with token_source.SwitchTo().  Only the exact
 // closing tag (no inner whitespace) ends the block.
 316<LexScript> TOKEN : {
 317  <SCRIPT_END:   "</SCRIPT>" > : DEFAULT
 318}
 319
 320<LexStyle> TOKEN : {
 321  <STYLE_END:    "</STYLE>" > : DEFAULT
 322}
 323
 // Block contents shared by both states: line breaks, a lone "<", and words
 // (a quoted string on one line, a lone quote, or a run of other characters).
 324<LexScript, LexStyle> TOKEN :
 325{
 326  <BLOCK_EOL:    <NEWLINE> >
 327| <BLOCK_LBR:    "<" >
 328| <BLOCK_WORD:   ( <QUOTED_STRING_NB>
 329                   | <QUOTE>
 330                   | (~[ "\n", "\r", "'", "\"", "<"])+ ) >
 331}
332
/**
 * Start production: a sequence of elements followed by end of input.
 * The parsed sequence is wrapped in an HtmlDocument.
 */
HtmlDocument HtmlDocument() :
{
  HtmlDocument.ElementSequence elements;
}
{
  elements=ElementSequence()
  <EOF>
  {
    return new HtmlDocument(elements);
  }
}
341
/**
 * Parses zero or more elements and collects them, in document order, in a
 * new ElementSequence.
 */
HtmlDocument.ElementSequence ElementSequence() :
{
  HtmlDocument.ElementSequence sequence = new HtmlDocument.ElementSequence();
  HtmlDocument.HtmlElement element;
}
{
  (
    element=Element()
    {
      sequence.addElement(element);
    }
  )*
  {
    return sequence;
  }
}
351
/**
 * Parses one document element: a tag, end tag, comment, declaration, script
 * block, style block, a stray tag opener, plain text, or a line break.
 *
 * A tag opener followed by a character that cannot start a tag name is
 * returned as plain text.  The text is built from the actual image of the
 * opener token, so openers such as "<%@" or a prefixed "<ns:" are kept
 * intact instead of being reduced to "<".
 */
HtmlDocument.HtmlElement Element() :
{
  HtmlDocument.HtmlElement e;
  Token start;
  Token text;
}
{
(
    LOOKAHEAD(2)
         e = Tag()        { return e; }
  |      e = EndTag()     { return e; }
  |      e = CommentTag() { return e; }
  |      e = DeclTag()    { return e; }
  | LOOKAHEAD(2)
         e = ScriptBlock() { return e; }
  | LOOKAHEAD(2)
         e = StyleBlock()  { return e; }
  | LOOKAHEAD(2)
         start=<TAG_START> text=<LST_ERROR>
                          { return new HtmlDocument.Text(start.image + text.image); }
  |   text = <PCDATA>     { return new HtmlDocument.Text(text.image); }
  |          <EOL>        { return new HtmlDocument.Newline(); }
)
}
375
/**
 * Parses one attribute: a name, optionally followed by "=" and a value.
 *
 * The attribute's start is the start of the name token and its end is one
 * column past the end of its last token.  A value that isProperAttribute()
 * rejects is recorded as a parse error, but the attribute is still returned.
 * If the attribute itself cannot be parsed, the error is recorded and null
 * is returned.
 */
HtmlDocument.Attribute Attribute() :
{
  HtmlDocument.Attribute attr;
  Token name, value=null;
}
{
    try {
      name=<ATTR_NAME> [ <ATTR_EQ> value=<ATTR_VAL> ]
      {
        boolean hasValue = (value != null);
        attr = hasValue
            ? new HtmlDocument.Attribute(name.image, value.image)
            : new HtmlDocument.Attribute(name.image);
        attr.setStartLocation(name.beginLine, name.beginColumn);

        // the attribute ends with its value when there is one, else with its name
        Token last = name;
        if (hasValue) {
          attr.setValueStartLocation(value.beginLine, value.beginColumn);
          last = value;
        }
        attr.setEndLocation(last.endLine, last.endColumn + 1);

        if (hasValue && !isProperAttribute(value.image)) {
          addException(new ParseException("Parse error at line " + value.beginLine + ", column " + value.beginColumn + ".  Attribute is improperly quoted." ));
        }
        return attr;
      }
    }
    catch(ParseException e) {
      addException(e);
      return null;
    }
}
410
/**
 * Parses zero or more attributes into an AttributeList.
 *
 * Attribute() returns null after it has recorded a parse error; such results
 * are skipped so the list never contains a null entry.
 */
HtmlDocument.AttributeList AttributeList() :
{
  HtmlDocument.AttributeList alist = new HtmlDocument.AttributeList();
  HtmlDocument.Attribute a;
}
{
  (
    a=Attribute()
    {
      if (a != null) {
        alist.addAttribute(a);
      }
    }
  )*
  {
    return alist;
  }
}
422
/**
 * Parses a start tag: opener, name, attributes, then ">" / "%>" or "/>".
 *
 * When the opener token carries a name prefix (it starts with "<" and ends
 * with ":"), the prefix is prepended to the tag name.  The tag is flagged as
 * a JSP tag when the opener carries such a prefix or starts with "<%".  A tag
 * closed with "/>" is marked empty.
 *
 * On a parse error the error is recorded, the lexer is switched back to the
 * DEFAULT state, and the text from the first token of the tag through the
 * next token is returned as plain text.
 */
HtmlDocument.HtmlElement Tag() :
{
  Token name, end;
  HtmlDocument.AttributeList attributes;
  Token firstToken = getToken(1);
  Token start = null;
}
{
  try {
    start=<TAG_START>  name=<TAG_NAME> attributes=AttributeList()
    ( end=<TAG_END> | end=<TAG_SLASHEND> )
    {
      boolean prefixed = start.image.startsWith("<") && start.image.endsWith(":");
      String tagName = prefixed ? start.image.substring(1) + name.image : name.image;
      boolean jsp = prefixed || start.image.startsWith("<%");

      HtmlDocument.Tag tag = new HtmlDocument.Tag("<", tagName, attributes, end.image);
      if (end.kind == TAG_SLASHEND) {
        tag.setEmpty(true);
      }
      tag.setStartLocation(start.beginLine, start.beginColumn);
      tag.setEndLocation(end.endLine, end.endColumn + 1);
      tag.setIsJspTag(jsp);
      return tag;
    }
  }
  catch (ParseException ex) {
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}
466
467
/**
 * Collects the raw text of a style block (line breaks, words and "<"
 * characters) up to, but not including, the closing tag.
 *
 * The text is returned untrimmed; trimming would shift the position of
 * errors on the first line by the amount of whitespace removed.
 *
 * If a ParseException occurs, it is recorded and the text collected so far
 * is returned.  The return statement sits after the try/catch so that the
 * error path also returns a value instead of running into the
 * "Missing return statement in function" Error that JavaCC appends to the
 * generated method.
 */
String StyleBlockContents() :
{
    StringBuilder sb = new StringBuilder();
    Token t = null;
}
{
    try {
        ( t=<BLOCK_EOL> { sb.append(t.image); }
        | t=<BLOCK_WORD> { sb.append(t.image); }
        | t=<BLOCK_LBR> { sb.append(t.image); }
        )*
    }
    catch(ParseException e) {
        addException(e);
    }
    {
        return sb.toString();
    }
}
489
/**
 * Collects the text of a script block (line breaks, words and "<"
 * characters) up to, but not including, the closing tag.
 *
 * The text is trimmed, and an HTML comment wrapper around the script (a
 * leading "<!--" and a trailing "//-->") is removed, since it serves no
 * purpose for a jEdit SideKick plugin.
 *
 * If a ParseException occurs, it is recorded and the text collected so far
 * is processed and returned.  The return statement sits after the try/catch
 * so that the error path also returns a value instead of running into the
 * "Missing return statement in function" Error that JavaCC appends to the
 * generated method.
 */
String ScriptBlockContents() :
{
  StringBuilder sb = new StringBuilder();
  Token t = null;
}
{
    try {
      ( t=<BLOCK_EOL>    { sb.append(t.image); }
        | t=<BLOCK_WORD>  { sb.append(t.image); }
        | t=<BLOCK_LBR>  { sb.append(t.image); }
      )*
    }
    catch(ParseException e) {
      addException(e);
    }
    {
      String contents = sb.toString().trim();
      // sometimes people wrap the contents of script tags with html comments
      // to protect older browsers that don't understand script tags
      if (contents.startsWith("<!--")) {
        contents = contents.substring(4);
      }
      if (contents.endsWith("//-->")) {
        contents = contents.substring(0, contents.length() - 5);
      }
      return contents.trim();
    }
}
523
/**
 * Parses a complete script block: the opening script tag with its
 * attributes, the raw contents, and the closing script tag.
 *
 * After the opening tag's ">" the lexer is switched to the LexScript state
 * so the contents are read as raw text.  The result is a TagBlock holding
 * the start tag, a single Text element with the contents, and the end tag.
 *
 * End locations are one column past the last character of the token, as in
 * Tag(), EndTag() and StyleBlock(); the inner start tag and end tag
 * previously omitted the "+ 1".
 *
 * On a parse error the error is recorded, the lexer is switched back to the
 * DEFAULT state, and the text from the first token of the block through the
 * next token is returned as plain text.
 */
HtmlDocument.HtmlElement ScriptBlock() :
{
  HtmlDocument.AttributeList alist;
  Token firstToken = getToken(1);
  Token st, et, ts, est;
  String contents = "";
}
{
  try {
    st=<TAG_START> ts=<TAG_SCRIPT> alist=AttributeList() est=<TAG_END>
    {
      token_source.SwitchTo(LexScript);
    }
    contents=ScriptBlockContents()
    et=<SCRIPT_END>
    {
        HtmlDocument.Tag script = new HtmlDocument.Tag(ts.image, alist);
        script.setStartLocation(st.beginLine, st.beginColumn);
        script.setEndLocation(est.endLine, est.endColumn + 1);

        HtmlDocument.EndTag endScript = new HtmlDocument.EndTag( ts.image );
        endScript.setStartLocation(et.beginLine, et.beginColumn);
        endScript.setEndLocation(et.endLine, et.endColumn + 1);

        HtmlDocument.Text text = new HtmlDocument.Text(contents);
        HtmlDocument.ElementSequence seq = new HtmlDocument.ElementSequence();
        seq.addElement(text);
        HtmlDocument.TagBlock b = new HtmlDocument.TagBlock(script, seq, endScript);
        b.setStartLocation(st.beginLine, st.beginColumn);
        b.setEndLocation(et.endLine, et.endColumn + 1);
        return b;
    }
  }
  catch (ParseException ex) {
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}
565
566
/**
 * Parses a complete style block: the opening style tag with its attributes,
 * the raw contents, and the closing style tag.
 *
 * After the opening tag's ">" the lexer is switched to the LexStyle state so
 * the contents are read as raw text.  The result is a "STYLE" TagBlock whose
 * only child is a Text element with the contents; locations are set on the
 * block and on its start tag.
 *
 * On a parse error the error is recorded, the lexer is switched back to the
 * DEFAULT state, and the text from the first token of the block through the
 * next token is returned as plain text.
 */
HtmlDocument.HtmlElement StyleBlock() :
{
  HtmlDocument.AttributeList attributes;
  Token firstToken = getToken(1);
  Token open, openEnd, close;
  String css = "";
}
{
  try {
    open=<TAG_START> <TAG_STYLE> attributes=AttributeList() openEnd=<TAG_END>
    {
      token_source.SwitchTo(LexStyle);
    }
    css=StyleBlockContents()
    close=<STYLE_END>
    {
      HtmlDocument.ElementSequence children = new HtmlDocument.ElementSequence();
      children.addElement(new HtmlDocument.Text(css));

      HtmlDocument.TagBlock block = new HtmlDocument.TagBlock("STYLE", attributes, children);
      block.setStartLocation(open.beginLine, open.beginColumn);
      block.setEndLocation(close.endLine, close.endColumn + 1);

      // the start tag runs from the opener to just past its closing ">"
      block.startTag.setStartLocation(open.beginLine, open.beginColumn);
      block.startTag.setEndLocation(openEnd.endLine, openEnd.endColumn + 1);

      return block;
    }
  }
  catch (ParseException ex) {
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}
603
604
/**
 * Parses an end tag: opener, name and closing ">" / "%>".
 *
 * When the opener token carries a name prefix (it starts with "</" and ends
 * with ":"), the prefix is prepended to the tag name.
 *
 * On a parse error the error is recorded, the lexer is switched back to the
 * DEFAULT state, and the text from the first token of the tag through the
 * next token is returned as plain text.
 */
HtmlDocument.HtmlElement EndTag() :
{
  Token name;
  Token firstToken = getToken(1);
  Token open, close;
}
{
  try {
    open=<ENDTAG_START> name=<TAG_NAME> close=<TAG_END>
    {
      boolean prefixed = open.image.startsWith("</") && open.image.endsWith(":");
      String tagName = prefixed ? open.image.substring(2) + name.image : name.image;

      HtmlDocument.EndTag endTag = new HtmlDocument.EndTag(tagName);
      endTag.setStartLocation(open.beginLine, open.beginColumn);
      endTag.setEndLocation(close.endLine, close.endColumn + 1);
      return endTag;
    }
  }
  catch (ParseException ex) {
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}
634
/**
 * Parses a comment: the opener, any number of dashes, line breaks and
 * words, then either the comment end or end of input.
 *
 * The returned Comment holds the opener, the body (with each line break
 * replaced by NL) and, when present, the closing delimiter.
 *
 * If a ParseException occurs, it is recorded, the lexer is switched back to
 * the DEFAULT state (as the other productions do), and a Comment with the
 * text collected so far is returned.  The return statement sits after the
 * try/catch so that the error path also returns a value instead of running
 * into the "Missing return statement in function" Error that JavaCC appends
 * to the generated method.
 */
HtmlDocument.Comment CommentTag() :
{
  Token t, comment_start = null, comment_end = null;
  StringBuilder s = new StringBuilder();
}
{
    try {
          comment_start=<COMMENT_START>
          ( t=<DASH> { s.append(t.image); }
            | <COMMENT_EOL>  { s.append(NL); }
            | t=<COMMENT_WORD> { s.append(t.image); } )*
          (<EOF> | comment_end=<COMMENT_END>)
    }
    catch(ParseException e) {
        addException(e);
        token_source.SwitchTo(DEFAULT);
    }
    {
        String opener = (comment_start == null) ? "" : comment_start.image;
        String closer = (comment_end == null) ? "" : comment_end.image;
        return new HtmlDocument.Comment(opener + s.toString() + closer);
    }
}
654
/**
 * Parses a "<!" declaration and returns its body (the text between "<!" and
 * ">") as a Comment.
 *
 * A malformed declaration, such as an empty "<!>" or one that is cut off by
 * end of input, raises a ParseException.  It is recorded, the lexer is
 * switched back to the DEFAULT state, and the raw text from the opener
 * through the next token is returned as the Comment, mirroring the recovery
 * used by Tag() and EndTag().  Previously the catch block had no return, so
 * such input ended in the "Missing return statement in function" Error that
 * JavaCC appends to the generated method.
 */
HtmlDocument.Comment DeclTag() :
{
  Token t;
  Token firstToken = getToken(1);
}
{
    try {
          <DECL_START> t=<DECL_ANY> <DECL_END>
          {
            return new HtmlDocument.Comment(t.image);
          }
    }
    catch(ParseException e) {
        addException(e);
        token_source.SwitchTo(DEFAULT);
        return new HtmlDocument.Comment(getTokenText(firstToken, getNextToken()));
    }
}