PageRenderTime 79ms CodeModel.GetById 54ms app.highlight 21ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/xml/parser/javacc/XmlParser.jj

#
Unknown | 481 lines | 435 code | 46 blank | 0 comment | 0 complexity | 9a342d7aedefe532fe78eed13a3ec2c1 MD5 | raw file
/*
 * XmlParser.jj -- JavaCC grammar for XML.
 * Copyright (C) 1999 Quiotix Corporation.
 * Copyright (C) 2010 Eric Le Lay
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 * for more details.
 */
 16
 17
/*
 * JavaCC grammar file for XML.
 *
 * Author: Eric Le Lay
 * Version: 1.0
 * Revision: $Id$
 *
 * Adapted from sidekick/html/parser/html/HtmlParser.jj
 * This grammar parses an XML document and produces a (flat) parse "tree"
 * representing the document.  It preserves almost all information in the
 * source document, including carriage control and spacing (except inside
 * of tags).  See the XmlDocument and XmlDocument.* classes for a
 * description of the parse tree.  The parse tree supports traversal using
 * the commonly used "Visitor" pattern.  The XmlDumper class is a visitor
 * which dumps out the tree to an output stream.
 *
 * It does not require begin tags to be matched with end tags, or validate
 * the names or contents of the tags (this can easily be done post-parsing;
 * see the HtmlCollector class, which matches begin tags with end tags,
 * for an example).
 *
 * Notable edge cases include:
 * - Quoted string processing.  Quoted strings are matched inside of comments, and
 *   as tag attribute values.  Quoted strings are matched in normal text only
 *   to the extent that they do not span line breaks.
 *
 */
 45
 46options { IGNORE_CASE = true; STATIC = false; }
 47
 48PARSER_BEGIN(XmlParser)
 49
 50package xml.parser.javacc;
 51
 52import java.text.MessageFormat;
 53import java.util.*;
 54import java.util.regex.*;
 55import sidekick.util.*;
 56import java.io.Reader;
 57
 58public class XmlParser {
 59
 60  static String NL = System.getProperty("line.separator");
 61
 62  private List<ParseError> parseErrors = new ArrayList<ParseError>();
 63
 64  public void setLineSeparator(String ls) {
 65    NL = ls;
 66  }
 67
 68  private static String getTokenText(Token first, Token cur) {
 69    Token t;
 70    StringBuffer sb = new StringBuffer();
 71
 72    for (t=first; t != cur.next; t = t.next) {
 73      if (t.specialToken != null) {
 74        Token tt=t.specialToken;
 75        while (tt.specialToken != null)
 76          tt = tt.specialToken;
 77        for (; tt != null; tt = tt.next)
 78          sb.append(tt.image);
 79      };
 80      sb.append(t.image);
 81    };
 82    return sb.toString();
 83  }
 84
 85  /**
 86  * The line offset is used when the XML to be parsed is only part of a file,
 87  * @param lineOffset The line number of the first line of the fragment.
 88  * @param columnOffset The column number of the first character of the fragment.
 89  */
 90  public XmlParser(Reader in, int lineOffset, int columnOffset){
 91  	this(in);
 92  	jj_input_stream.ReInit(in,lineOffset,columnOffset);
 93  }
 94  
 95  public static void main(String[] args) throws ParseException {
 96    XmlParser parser = new XmlParser(System.in);
 97    XmlDocument doc = parser.XmlDocument();
 98    doc.accept(new XmlDebugDumper(System.out));
 99    System.exit(0);
100  }
101
102   public void setTabSize(int size) {
103        jj_input_stream.setTabSize(size);
104   }
105
106   public int getTabSize() {
107        return jj_input_stream.getTabSize(0);
108   }
109
110    private void addException(ParseException pe) {
111        Range range = getExceptionLocation( pe );
112        parseErrors.add(new ParseError(pe.getMessage(), range));
113        pe.printStackTrace();
114    }
115
116    public List<ParseError> getParseErrors() {
117       System.out.println("getParserErrors, there are " + parseErrors.size() + " errors");
118       return parseErrors;
119    }
120
121    // regex to extract line and colun from a ParseException message
122    // ParseException message look like: "Parse error at line 116, column 5.  Encountered: }"
123    private Pattern pePattern = Pattern.compile( "(.*?)(\\d+)(.*?)(\\d+)(.*?)" );
124
125    /**
126     * @return attempts to return a Location indicating the location of a parser
127     * exception.  If the ParseException contains a Token reference, all is well,
128     * otherwise, this method attempts to parse the message string for the
129     * exception.
130     */
131    private Range getExceptionLocation( ParseException pe ) {
132        Token t = pe.currentToken;
133        if ( t != null ) {
134            return new Range( new Location( t.next.beginLine - 1, t.next.beginColumn ), new Location( t.next.endLine - 1, t.next.endColumn ) );
135        }
136
137        // ParseException message look like: "Parse error at line 116, column 5.  Encountered: }"
138        try {
139            Matcher m = pePattern.matcher( pe.getMessage() );
140            if ( m.matches() ) {
141                String ln = m.group( 2 );
142                String cn = m.group( 4 );
143                int line_number = -1;
144                int column_number = 0;
145                if ( ln != null )
146                    line_number = Integer.parseInt( ln );
147                if ( cn != null )
148                    column_number = Integer.parseInt( cn );
149                return line_number > -1 ? new Range( new Location( line_number - 1, column_number - 1 ), new Location( line_number - 1, column_number ) ) : null;
150            }
151            return new Range();
152        }
153        catch ( Exception e ) {
154            //e.printStackTrace();
155            return new Range();
156        }
157    }
158
159    // regex pattern for a valid non-quoted attribute.
160    // Attributes can be single or double quoted, or consist solely of
161    // letters in the range A-Z and a-z, digits (0-9), hyphens ("-"),
162    // and periods (".")
163    private Pattern attributePattern = Pattern.compile( "([a-zA-Z0-9.-])*" );
164    private boolean isProperAttribute(String s) {
165        // could have double quotes
166        if (s.startsWith("\"") && s.endsWith("\"")) {
167            return true;
168        }
169        // or single quotes
170        else if (s.startsWith("'") && s.endsWith("'")) {
171            return true;
172        }
173        // or might be jsp
174        else if (s.startsWith("<%") && (s.endsWith("%>") || s.endsWith("%")) ) {
175            return true;
176        }
177        boolean rtn = attributePattern.matcher(s).matches();
178        if (rtn == false) {
179            System.out.println("bad attribute: " + s);
180        }
181        return rtn;
182    }
183}
184
185PARSER_END(XmlParser)
186
// Private (#) building blocks shared by every lexical state.
<*> TOKEN :
{
  // Characters allowed at the start of an identifier.
  <#ALPHA_CHAR: [
       "\u0024",
       "\u0041"-"\u005a",
       "\u005f",
       "\u0061"-"\u007a",
       "\u00c0"-"\u00d6",       // Latin with diacritics
       "\u00d8"-"\u00f6",       // Latin with diacritics
       "\u00f8"-"\u00ff",       // Latin with diacritics
       "\u0100"-"\u1fff",       // Latin Extended-A through Greek Extended
       "\u3040"-"\u318f",       // Hiragana through Hangul Compatibility Jamo
       "\u3300"-"\u337f",       // CJK Compatibility
       "\u3400"-"\u3d2d",       // CJK Unified Ideographs Extension A
       "\u4e00"-"\u9fff",       // CJK Unified Ideographs
       "\uf900"-"\ufaff" ] >    // CJK Compatibility Ideographs
| <#NUM_CHAR:   ["0"-"9"] >
| <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
  // Characters allowed after the first one.  Every start character is also
  // accepted here, so a name containing non-ASCII letters past its first
  // character is no longer cut short at that letter.
| <#IDENTIFIER_CHAR: ( <ALPHA_CHAR> | [ "0"-"9", "-", ".", ":" ] ) >
| <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
  // Quoted string that may not span a line break.
| <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
                    | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
  // Quoted string that may span line breaks.
| <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
| <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
| <#NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
| <#QUOTE:      ( "'" | "\"" ) >
}
214
// Tokens recognised outside of tags, comments and declarations.
<DEFAULT> TOKEN :
{
  // Optional blanks or tabs followed by one line break.
  <EOL: ( " " | "\t" )* <NEWLINE> >
  // Opening of a comment, in either of the two accepted spellings.
| <COMMENT_START: "<!--" | "<%--" > : LexComment
  // "</", optionally followed by an identifier and a colon.
| <ENDTAG_START: "</" ( <IDENTIFIER> ":" )? > : LexStartTag
  // "<", optionally followed by an identifier and a colon.
| <TAG_START: "<" ( <IDENTIFIER> ":" )? > : LexStartTag
| <DECL_START: "<!" > : LexDecl
  // Text up to the next "<" or line break.
| <PCDATA: ( ~[ "<", "\r", "\n" ] )+ >
}
224
// State entered right after a tag opener: whitespace is kept as special
// tokens, and the tag name is expected next.
<LexStartTag> SPECIAL_TOKEN :
{
  < ( <WHITESPACE> )+ >
}

<LexStartTag> TOKEN :
{
  <TAG_NAME: <IDENTIFIER> > : LexInTag
  // Any other single character: go back to the DEFAULT state.
| <LST_ERROR: ~[] > : DEFAULT
}
235
// State used after the tag name: whitespace between attributes is kept as
// special tokens.
<LexInTag> SPECIAL_TOKEN :
{
  < ( <WHITESPACE> )+ >
}

<LexInTag> TOKEN :
{
  <ATTR_NAME:    <IDENTIFIER> >
| <TAG_END:      ">"  > : DEFAULT
| <TAG_SLASHEND: "/>" > : DEFAULT
| <ATTR_EQ:      "="  > : LexAttrVal
| <IMPLICIT_TAG_END: "<" >
  {
    // A "<" met inside a tag: the matched token is turned into a TAG_END
    // with image ">", and a synthetic TAG_START token with image "<" and
    // the same position is chained right after it.
    Token impliedStart = new Token();
    impliedStart.image       = "<";
    impliedStart.kind        = TAG_START;
    impliedStart.next        = matchedToken.next;
    impliedStart.beginLine   = matchedToken.beginLine;
    impliedStart.beginColumn = matchedToken.beginColumn;
    impliedStart.endLine     = matchedToken.endLine;
    impliedStart.endColumn   = matchedToken.endColumn;
    matchedToken.next  = impliedStart;
    matchedToken.kind  = TAG_END;
    matchedToken.image = ">";
  } : LexStartTag
  // Any other single character; the lexical state is left unchanged.
| <LIT_ERROR: ~[] >
}
263
// State entered after "=": each whitespace character before the value is a
// special token.
<LexAttrVal> SPECIAL_TOKEN :
{
  < <WHITESPACE> >
}

<LexAttrVal> TOKEN :
{
  // Either a quoted string (line breaks allowed), or a run of characters
  // other than ">", quotes and whitespace.
  <ATTR_VAL: <QUOTED_STRING>
           | ( ~[ ">", "\"", "'", " ", "\t", "\n", "\r" ] )+ > : LexInTag
  // Any other single character; the lexical state is left unchanged.
| <LAV_ERROR: ~[] >
}
275
// Tokens recognised between a comment opener and its end.
<LexComment> TOKEN :
{
  <COMMENT_END: ( "-->" | "->" ) > : DEFAULT
| <DASH: "-" >
| <COMMENT_EOL: <NEWLINE> >
  // A run of ordinary characters, a quoted string confined to one line,
  // or a lone quote character.
| <COMMENT_WORD: ( ( ~[ "\n", "\r", "'", "\"", "-" ] )+
                 | <QUOTED_STRING_NB>
                 | <QUOTE> ) >
}
285
// Tokens recognised after "<!": the declaration body, then the closing ">".
<LexDecl> TOKEN :
{
  <DECL_ANY: ( <QUOTED_STRING_NB>
             | <QUOTE>
             | ~[ ">" ] )+ >
| <DECL_END: ">" > : DEFAULT
}
291
/** Start production: a sequence of elements followed by end of input. */
XmlDocument XmlDocument() :
{
  XmlDocument.ElementSequence elements;
}
{
  elements = ElementSequence()
  <EOF>
  {
    return new XmlDocument(elements);
  }
}
300
/** Collects zero or more elements, in document order, into one sequence. */
XmlDocument.ElementSequence ElementSequence() :
{
  XmlDocument.ElementSequence sequence = new XmlDocument.ElementSequence();
  XmlDocument.XmlElement element;
}
{
  (
    element = Element()
    { sequence.addElement(element); }
  )*
  {
    return sequence;
  }
}
310
/**
 * Parses one element: a tag, an end tag, a comment, a declaration, a tag
 * opener followed by an unexpected character (returned as text), a run of
 * text, or a line break.
 */
XmlDocument.XmlElement Element() :
{
  XmlDocument.XmlElement e;
  Token start, text;
}
{
(
    LOOKAHEAD(2)
         e = Tag()        { return e; }
  |      e = EndTag()     { return e; }
  |      e = CommentTag() { return e; }
  |      e = DeclTag()    { return e; }
  | LOOKAHEAD(2)
             start=<TAG_START> text=<LST_ERROR>
                          {
                            // TAG_START may carry an "identifier:" prefix
                            // after the "<"; use its image so that the
                            // prefix is not dropped from the text.
                            return new XmlDocument.Text(start.image + text.image);
                          }
  |   text = <PCDATA>     { return new XmlDocument.Text(text.image); }
  |          <EOL>        { return new XmlDocument.Newline(); }
)
}
330
/**
 * Parses an attribute name, optionally followed by "=" and a value.
 * An improperly quoted value is recorded as a parse error but the attribute
 * is still returned.  Returns null when a ParseException is caught.
 */
XmlDocument.Attribute Attribute() :
{
  XmlDocument.Attribute attr;
  Token name, value = null;
}
{
  try {
    name=<ATTR_NAME> [ <ATTR_EQ> value=<ATTR_VAL> ]
    {
      boolean hasValue = value != null;
      attr = hasValue
          ? new XmlDocument.Attribute(name.image, value.image)
          : new XmlDocument.Attribute(name.image);
      attr.setStartLocation(name.beginLine, name.beginColumn);

      if (!hasValue) {
        // Name only: the attribute ends where its name ends.
        attr.setEndLocation(name.endLine, name.endColumn + 1);
        return attr;
      }

      attr.setValueStartLocation(value.beginLine, value.beginColumn);
      attr.setEndLocation(value.endLine, value.endColumn + 1);
      if (!isProperAttribute(value.image)) {
        addException(new ParseException("Parse error at line " + value.beginLine + ", column " + value.beginColumn + ".  Attribute is improperly quoted." ));
      }
      return attr;
    }
  }
  catch (ParseException e) {
    addException(e);
    return null;
  }
}
365
/**
 * Parses zero or more attributes into a list.  Attribute() returns null
 * after a parse error; such results are skipped rather than stored.
 */
XmlDocument.AttributeList AttributeList() :
{
  XmlDocument.AttributeList alist = new XmlDocument.AttributeList();
  XmlDocument.Attribute a;
}
{
  (
    a=Attribute()
    {
      if (a != null) {
        alist.addAttribute(a);
      }
    }
  )*
  {
    return alist;
  }
}
377
/**
 * Parses a start tag or an empty-element tag.  On a parse error the error
 * is recorded, the lexer is switched back to DEFAULT, and the tokens read
 * so far plus the next one are returned as plain text.
 */
XmlDocument.XmlElement Tag() :
{
  Token firstToken = getToken(1);
  Token open = null;
  Token name, close;
  XmlDocument.AttributeList attributes;
}
{
  try {
    open=<TAG_START> name=<TAG_NAME> attributes=AttributeList()
    ( close=<TAG_END> | close=<TAG_SLASHEND> )
    {
      // When TAG_START carries an "identifier:" prefix, it becomes part of
      // the tag name.
      boolean prefixed = open.image.startsWith("<") && open.image.endsWith(":");
      String tagName = prefixed ? open.image.substring(1) + name.image : name.image;

      XmlDocument.Tag tag = new XmlDocument.Tag("<", tagName, attributes, close.image);
      if (close.kind == TAG_SLASHEND) {
        tag.setEmpty(true);
      }
      tag.setStartLocation(open.beginLine, open.beginColumn);
      tag.setEndLocation(close.endLine, close.endColumn + 1);
      return tag;
    }
  }
  catch (ParseException ex) {
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    return new XmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}
415
416
/**
 * Parses an end tag.  On a parse error the error is recorded, the lexer is
 * switched back to DEFAULT, and the tokens read so far plus the next one
 * are returned as plain text.
 */
XmlDocument.XmlElement EndTag() :
{
  Token firstToken = getToken(1);
  Token open, name, close;
}
{
  try {
    open=<ENDTAG_START> name=<TAG_NAME> close=<TAG_END>
    {
      // When ENDTAG_START carries an "identifier:" prefix, it becomes part
      // of the tag name.
      boolean prefixed = open.image.startsWith("</") && open.image.endsWith(":");
      String tagName = prefixed ? open.image.substring(2) + name.image : name.image;

      XmlDocument.EndTag endTag = new XmlDocument.EndTag(tagName);
      endTag.setStartLocation(open.beginLine, open.beginColumn);
      endTag.setEndLocation(close.endLine, close.endColumn + 1);
      return endTag;
    }
  }
  catch (ParseException ex) {
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    return new XmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}
446
/**
 * Parses a comment, which ends at a comment terminator or at end of input.
 * The returned text includes the opening and, when present, the closing
 * delimiter.  On a parse error the error is recorded, the lexer is switched
 * back to DEFAULT, and the text collected so far is returned.
 */
XmlDocument.Comment CommentTag() :
{
  Token t, comment_start = null, comment_end = null;
  StringBuilder s = new StringBuilder();
}
{
  try {
    comment_start=<COMMENT_START>
    ( t=<DASH>         { s.append(t.image); }
    | <COMMENT_EOL>    { s.append(NL); }
    | t=<COMMENT_WORD> { s.append(t.image); } )*
    ( <EOF> | comment_end=<COMMENT_END> )
    { return new XmlDocument.Comment(comment_start.image + s.toString() + (comment_end == null ? "" : comment_end.image)); }
  }
  catch (ParseException e) {
    addException(e);
    token_source.SwitchTo(DEFAULT);
    // Return what was collected: falling off the end of a non-void
    // production makes the generated method throw java.lang.Error.
    return new XmlDocument.Comment((comment_start == null ? "" : comment_start.image) + s.toString());
  }
}
466
/**
 * Parses a declaration ("<!" ... ">") and returns its body as a Comment.
 * On a parse error (for example an empty or unterminated declaration) the
 * error is recorded, the lexer is switched back to DEFAULT, and the raw
 * text of the tokens read so far plus the next one is returned.
 */
XmlDocument.Comment DeclTag() :
{
  Token t;
  Token firstToken = getToken(1);
}
{
  try {
    <DECL_START> t=<DECL_ANY> <DECL_END>
    {
      return new XmlDocument.Comment(t.image);
    }
  }
  catch (ParseException e) {
    addException(e);
    token_source.SwitchTo(DEFAULT);
    // Return a value here: falling off the end of a non-void production
    // makes the generated method throw java.lang.Error.
    return new XmlDocument.Comment(getTokenText(firstToken, getNextToken()));
  }
}