HtmlParser.jj | searchcode

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlParser.jj

#
Unknown | 669 lines | 607 code | 62 blank | 0 comment | 0 complexity | ddf1bd7dc765250076a03078b02b5da1 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0

/*
 * HtmlParser.jj -- JavaCC grammar for HTML.
 * Copyright (C) 1999 Quiotix Corporation.
 * Copyright (C) 2011 Eric Le Lay
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 * for more details.
 */


/*
 * JavaCC grammar file for HTML.
 *
 * Author: Brian Goetz, Quiotix
 * Version: 1.03
 * Revision: $Id: HtmlParser.jj 19677 2011-07-17 15:14:39Z kerik-sf $
 *
 * This grammar parses an HTML document and produces a (flat) parse "tree"
 * representing the document.  It preserves almost all information in the
 * source document, including carriage control and spacing (except inside
 * of tags.)  See the HtmlDocument and HtmlDocument.* classes for a
 * description of the parse tree.  The parse tree supports traversal using
 * the commonly used "Visitor" pattern.  The HtmlDumper class is a visitor
 * which dumps out the tree to an output stream.
 *
 * It does not require begin tags to be matched with end tags, or validate
 * the names or contents of the tags (this can easily be done post-parsing;
 * see the HtmlCollector class (which matches begin tags with end tags)
 * for an example.)
 *
 * Notable edge cases include:
 * - Quoted string processing.  Quoted strings are matched inside of comments, and
 *   as tag attribute values.  Quoted strings are matched in normal text only
 *   to the extent that they do not span line breaks.
 *
 * Please direct comments, questions, gripes or praise to
 * html-parser@quiotix.com.  If you like it/hate it/use it, please let us know!
 *
 * danson: Modified for HtmlSideKick plugin for jEdit, added ability to handle
 * jsp as well as html, added locations for tags, etc.  This parser works well
 * for most xml-based markup also.
 */

options { IGNORE_CASE = true; STATIC = false; }

PARSER_BEGIN(HtmlParser)

package sidekick.html.parser.html;

import java.text.MessageFormat;
import java.util.*;
import java.util.regex.*;
import sidekick.util.*;
import java.io.Reader;

public class HtmlParser {

  /**
   * Line separator appended to comment text for every end-of-line token met
   * inside a comment.  The field is static, so it is shared by all parsers.
   */
  static String NL = System.getProperty("line.separator");

  // Regex to extract line and column from a ParseException message.
  // ParseException messages look like:
  // "Parse error at line 116, column 5.  Encountered: }"
  // Group 2 is the first run of digits, group 4 the second one.
  private static final Pattern PE_PATTERN = Pattern.compile( "(.*?)(\\d+)(.*?)(\\d+)(.*?)" );

  // Regex pattern for a valid non-quoted attribute value.
  // Attribute values can be single or double quoted, or consist solely of
  // letters in the range A-Z and a-z, digits (0-9), hyphens ("-"),
  // and periods (".").
  private static final Pattern ATTRIBUTE_PATTERN = Pattern.compile( "([a-zA-Z0-9.-])*" );

  /** Errors collected while parsing; see {@link #getParseErrors()}. */
  private List<ParseError> parseErrors = new ArrayList<ParseError>();

  /**
   * Sets the line separator used when rebuilding comment text.
   * Note that the underlying field is static: the new value applies to
   * every HtmlParser instance, not only to this one.
   * @param ls the line separator to use
   */
  public void setLineSeparator(String ls) {
    NL = ls;
  }

  /**
   * Rebuilds the source text covered by a run of tokens, including the
   * special tokens (whitespace and the like) attached in front of each one.
   * @param first the first token of the run
   * @param cur the last token of the run (inclusive)
   * @return the concatenated images of the tokens and their special tokens
   */
  private static String getTokenText(Token first, Token cur) {
    StringBuilder sb = new StringBuilder();

    // The null check stops the walk at the end of the chain instead of
    // failing if cur is not reachable from first.
    for (Token t = first; t != null && t != cur.next; t = t.next) {
      if (t.specialToken != null) {
        // specialToken points at the special token closest to t; rewind to
        // the earliest one, then append them in source order.
        Token tt = t.specialToken;
        while (tt.specialToken != null) {
          tt = tt.specialToken;
        }
        for (; tt != null; tt = tt.next) {
          sb.append(tt.image);
        }
      }
      sb.append(t.image);
    }
    return sb.toString();
  }

  /**
   * Creates a parser for a fragment of a larger file.
   * The offsets are used when the HTML to be parsed is only part of a file.
   * @param in the reader to parse from
   * @param lineOffset The line number of the first line of the fragment.
   * @param columnOffset The column number of the first character of the fragment.
   */
  public HtmlParser(Reader in, int lineOffset, int columnOffset) {
    this(in);
    jj_input_stream.ReInit(in, lineOffset, columnOffset);
  }

  /**
   * Parses HTML from standard input and dumps the resulting document to
   * standard output.
   */
  public static void main(String[] args) throws ParseException {
    HtmlParser parser = new HtmlParser(System.in);
    HtmlDocument doc = parser.HtmlDocument();
    doc.accept(new HtmlDumper(System.out));
    System.exit(0);
  }

  /** Sets the tab size on the underlying character stream. */
  public void setTabSize(int size) {
    jj_input_stream.setTabSize(size);
  }

  /** @return the tab size reported by the underlying character stream */
  public int getTabSize() {
    return jj_input_stream.getTabSize(0);
  }

  /**
   * Records a parse exception, together with its location, in the list
   * returned by {@link #getParseErrors()} and prints its stack trace.
   */
  private void addException(ParseException pe) {
    Range range = getExceptionLocation( pe );
    parseErrors.add(new ParseError(pe.getMessage(), range));
    pe.printStackTrace();
  }

  /**
   * @return the (live) list of errors recorded so far; also prints the
   * number of errors to standard output
   */
  public List<ParseError> getParseErrors() {
    System.out.println("getParserErrors, there are " + parseErrors.size() + " errors");
    return parseErrors;
  }

  /**
   * Attempts to work out where a parser exception happened.
   * If the ParseException carries a token whose successor is known, the
   * successor's position is used; otherwise this method falls back to
   * parsing the line and column out of the exception message.
   * @return the range of the error, or an empty Range if it cannot be
   * determined
   */
  private Range getExceptionLocation( ParseException pe ) {
    Token t = pe.currentToken;
    // Guard t.next as well: dereferencing it blindly would throw a
    // NullPointerException for an exception whose token has no successor.
    if ( t != null && t.next != null ) {
      Token bad = t.next;
      return new Range( new Location( bad.beginLine - 1, bad.beginColumn ), new Location( bad.endLine - 1, bad.endColumn ) );
    }

    // ParseException messages look like: "Parse error at line 116, column 5.  Encountered: }"
    try {
      Matcher m = PE_PATTERN.matcher( pe.getMessage() );
      if ( m.matches() ) {
        // Both groups are \d+, so they are non-null and non-negative
        // whenever the pattern matches.
        int line_number = Integer.parseInt( m.group( 2 ) );
        int column_number = Integer.parseInt( m.group( 4 ) );
        return new Range( new Location( line_number - 1, column_number - 1 ), new Location( line_number - 1, column_number ) );
      }
      return new Range();
    }
    catch ( Exception e ) {
      // Deliberately best-effort: a null or unparsable message (or a
      // number too large for an int) yields an empty range.
      return new Range();
    }
  }

  /**
   * Tells whether an attribute value is acceptable: double quoted, single
   * quoted, a JSP expression, or made only of letters, digits, hyphens and
   * periods.  Rejected values are reported on standard output.
   * @param s the raw attribute value as it appears in the source
   * @return true if the value is properly formed
   */
  private boolean isProperAttribute(String s) {
    // could have double quotes
    if (s.startsWith("\"") && s.endsWith("\"")) {
      return true;
    }
    // or single quotes
    if (s.startsWith("'") && s.endsWith("'")) {
      return true;
    }
    // or might be jsp; the lexer stops an unquoted value before ">", so a
    // JSP expression may arrive here ending in "%" only
    if (s.startsWith("<%") && (s.endsWith("%>") || s.endsWith("%")) ) {
      return true;
    }
    boolean rtn = ATTRIBUTE_PATTERN.matcher(s).matches();
    if (!rtn) {
      System.out.println("bad attribute: " + s);
    }
    return rtn;
  }
}

PARSER_END(HtmlParser)

/*
 * JSP scriptlets/expressions.  This MORE rule has no lexical state list, so
 * it applies to the DEFAULT state: "<%" starts a match and switches to
 * IN_JSP_EXP.  In that state every character is accumulated (MORE) until
 * "%>" completes the match as the special token JSP_EXP_END and the lexer
 * goes back to DEFAULT.  Being a special token, the whole "<% ... %>" text is
 * not handed to the parser as a regular token.
 * Longer DEFAULT literals starting with "<%" ("<%@", "<%--") are separate
 * tokens declared further down.
 */
MORE:
{
    "<%" : IN_JSP_EXP
}

<IN_JSP_EXP>
SPECIAL_TOKEN :
{
  <JSP_EXP_END: "%>" > : DEFAULT
}
<IN_JSP_EXP>
MORE :
{
  < ~[] >
}

/*
 * Private (#) regular expressions, declared for every lexical state (<*>).
 * They never produce tokens themselves; they are building blocks referenced
 * by the token definitions below.
 */
<*> TOKEN :
{
  // Characters allowed at the start of an identifier: "$", "_", ASCII
  // letters and the Unicode ranges listed below.
  <#ALPHA_CHAR: [
       "\u0024",
       "\u0041"-"\u005a",
       "\u005f",
       "\u0061"-"\u007a",
       "\u00c0"-"\u00d6",       // Latin with diacritics
       "\u00d8"-"\u00f6",       // Latin with diacritics
       "\u00f8"-"\u00ff",       // Latin with diacritics
       "\u0100"-"\u1fff",       // Latin Extended-A through Greek Extended
       "\u3040"-"\u318f",       // Hiragana through Hangul Compatibility Jamo
       "\u3300"-"\u337f",       // CJK Compatibility
       "\u3400"-"\u3d2d",       // CJK Unified Ideographs Extension A
       "\u4e00"-"\u9fff",       // CJK Unified Ideographs
       "\uf900"-"\ufaff" ] >    // CJK Compatibility Ideographs
| <#NUM_CHAR:   ["0"-"9"] >
| <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
// Characters allowed after the first character of an identifier.
| <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
| <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
| <#STYLE_IDENTIFIER: (<ALPHA_CHAR>)+ >
// Quoted string that may not contain a line break.
| <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
                    | ( "\"" ( ~["\""

, "\r", "\n"] )* "\"" ) >
// Quoted string that may span several lines.
| <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
| <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
| <#NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
| <#QUOTE:      ( "'" | "\"" ) >
}

/*
 * Tokens of the DEFAULT state, i.e. document text outside of any markup.
 * Each markup opener switches the lexer to the state that understands what
 * follows it.
 */
<DEFAULT> TOKEN :
{
  // A line break, together with any spaces or tabs that precede it.
  <EOL:               ( " " | "\t" )* <NEWLINE> >
  // HTML ("<!--") or JSP ("<%--") comment opener.
| <COMMENT_START:     "<!--" | "<%--"    >                 : LexComment
  // "</", optionally followed by a namespace-style prefix such as "</c:".
| <ENDTAG_START:      "</" | "</" <IDENTIFIER> ":" >       : LexStartTag
  // "<", a JSP directive opener "<%@", or "<" with a prefix such as "<c:".
| <TAG_START:         "<" | "<%@" | "<" <IDENTIFIER> ":" > : LexStartTag
  // Declaration opener, e.g. the start of "<!DOCTYPE ...>".
| <DECL_START:        "<!"      >                          : LexDecl
  // A run of text that contains neither "<" nor a line break.
| <PCDATA:            ( ~["<", "\r", "\n"] )+ >
}

/*
 * LexStartTag: entered right after a tag opener; expects the tag name.
 * Whitespace is kept as special tokens rather than passed to the parser.
 */
<LexStartTag> SPECIAL_TOKEN :
{
  < (<WHITESPACE>)+ >
}

<LexStartTag> TOKEN :
{
  // SCRIPT and STYLE get their own token kinds so the parser can treat the
  // contents of those elements specially (IGNORE_CASE is on, so any case
  // matches).
  <TAG_SCRIPT: "SCRIPT">    : LexInTag
| <TAG_STYLE:  "STYLE">     : LexInTag
| <TAG_NAME: <IDENTIFIER> > : LexInTag
  // Anything else is not a tag name: emit an error token and go back to
  // plain document text.
| <LST_ERROR: ~[]>          : DEFAULT
}

/*
 * LexInTag: inside a tag, after its name; expects attributes or the end of
 * the tag.  Whitespace is kept as special tokens.
 */
<LexInTag> SPECIAL_TOKEN :
{
  < (<WHITESPACE>)+ >
}

<LexInTag> TOKEN :
{
  <ATTR_NAME: <IDENTIFIER> >
| <TAG_END: ">" | "%>" >                : DEFAULT
| <TAG_SLASHEND: "/>" >                 : DEFAULT
| <ATTR_EQ: "=" >                       : LexAttrVal
  // A "<" inside a tag means the current tag was never closed.  The lexical
  // action below turns the matched "<" into a TAG_END token with image ">"
  // and links a newly created TAG_START token ("<", same position) after it
  // through matchedToken.next, then the lexer continues in LexStartTag.
| <IMPLICIT_TAG_END: "<">
  {
    Token t = new Token();
    t.image       = "<";
    t.kind        = TAG_START;
    t.next        = matchedToken.next;
    t.beginLine   = matchedToken.beginLine;
    t.beginColumn = matchedToken.beginColumn;
    t.endLine     = matchedToken.endLine;
    t.endColumn   = matchedToken.endColumn;
    matchedToken.next  = t;
    matchedToken.kind  = TAG_END;
    matchedToken.image = ">";
  }                                     : LexStartTag
  // Any other character is an error token; the lexer stays in this state.
| <LIT_ERROR: ~[]>
}

/*
 * LexAttrVal: entered after the "=" of an attribute; expects its value.
 * Each whitespace character is kept as a special token.
 */
<LexAttrVal> SPECIAL_TOKEN :
{
  < <WHITESPACE> >
}

<LexAttrVal> TOKEN :
{
  // Either a quoted string (which may span lines) or a run of characters
  // containing no ">", quote or whitespace.  Back to LexInTag afterwards.
  <ATTR_VAL: <QUOTED_STRING>
| ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexInTag
  // Any other character is an error token; the lexer stays in this state.
| <LAV_ERROR: ~[]>
}

/*
 * LexComment: inside an HTML or JSP comment.  The comment text is cut into
 * dashes, line breaks and words until a comment terminator is found.
 */
<LexComment> TOKEN :
{
  // "--" followed by optional spaces and ">", or "->", or the JSP "--%>".
  < COMMENT_END:  ("--" (" ")* ">" | "->" | "--%>" ) > : DEFAULT
| < DASH:         "-" >
| < COMMENT_EOL:  <NEWLINE> >
  // A run of ordinary characters, a quoted string on a single line, or a
  // lone quote character.
| < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
                    | <QUOTED_STRING_NB>
                    | <QUOTE> ) >
}

/*
 * LexDecl: inside a "<!" declaration.  DECL_ANY is the body of the
 * declaration: one or more single-line quoted strings, lone quotes or
 * characters other than ">".  DECL_END is the closing ">".
 */
<LexDecl> TOKEN :
{
  <DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
| <DECL_END: ">" > : DEFAULT
}

/*
 * LexScript and LexStyle: contents of a SCRIPT or STYLE element.  No token
 * definition switches to these states; the ScriptBlock and StyleBlock
 * productions select them with token_source.SwitchTo().  Each state
 * recognizes only its own closing tag.
 */
<LexScript> TOKEN : {
  <SCRIPT_END:   "</SCRIPT>" > : DEFAULT
}

<LexStyle> TOKEN : {
  <STYLE_END:    "</STYLE>" > : DEFAULT
}

// Content tokens shared by both states.
<LexScript, LexStyle> TOKEN :
{
  <BLOCK_EOL:    <NEWLINE> >
  // A "<" that does not start the closing tag of the block.
| <BLOCK_LBR:    "<" >
  // A single-line quoted string, a lone quote, or a run of characters
  // containing no line break, quote or "<".
| <BLOCK_WORD:   ( <QUOTED_STRING_NB>
                   | <QUOTE>
                   | (~[ "\n", "\r", "'", "\"", "<"])+ ) >
}

/**
 * Start production: a sequence of elements followed by the end of input.
 * @return the parsed document wrapping that sequence
 */
HtmlDocument HtmlDocument() :
{
  HtmlDocument.ElementSequence elements;
}
{
  elements=ElementSequence() <EOF>
  {
    return new HtmlDocument(elements);
  }
}

/**
 * Zero or more elements, collected in source order.
 * @return the (possibly empty) sequence of parsed elements
 */
HtmlDocument.ElementSequence ElementSequence() :
{
  HtmlDocument.HtmlElement element;
  HtmlDocument.ElementSequence sequence = new HtmlDocument.ElementSequence();
}
{
  (
    element=Element()
    { sequence.addElement(element); }
  )*
  {
    return sequence;
  }
}

/**
 * A single element of the document.  The alternatives are tried in the order
 * written.  Tag, ScriptBlock, StyleBlock and the error alternative all begin
 * with TAG_START and differ in their second token (TAG_NAME, TAG_SCRIPT,
 * TAG_STYLE, LST_ERROR), hence the two-token lookahead on each of them.
 * @return the parsed element; a tag opener followed by an error token is
 * returned as plain text
 */
HtmlDocument.HtmlElement Element() :
{
  HtmlDocument.HtmlElement e;
  Token text;
}
{
(
    LOOKAHEAD(2)
         e = Tag()        { return e; }
  |      e = EndTag()     { return e; }
  |      e = CommentTag() { return e; }
  |      e = DeclTag()    { return e; }
  | LOOKAHEAD(2)
         e = ScriptBlock() { return e; }
  | LOOKAHEAD(2)
         e = StyleBlock()  { return e; }
  | LOOKAHEAD(2)
             // tag opener not followed by a tag name: keep it as text
             <TAG_START> text=<LST_ERROR>
                          { return new HtmlDocument.Text("<" + text.image); }
  |   text = <PCDATA>     { return new HtmlDocument.Text(text.image); }
  |          <EOL>        { return new HtmlDocument.Newline(); }
)
}

/**
 * One attribute: a name, optionally followed by "=" and a value.
 * A value that isProperAttribute() rejects is recorded as a parse error, but
 * the attribute is still returned.
 * @return the attribute with its locations set, or null if a ParseException
 * was caught (the exception is recorded)
 */
HtmlDocument.Attribute Attribute() :
{
  HtmlDocument.Attribute attr;
  Token name, value=null;
}
{
    try {
      name=<ATTR_NAME> [ <ATTR_EQ> value=<ATTR_VAL> ]
      {
        if (value != null) {
          // name="value" form
          attr = new HtmlDocument.Attribute(name.image, value.image);
          attr.setStartLocation(name.beginLine, name.beginColumn);
          attr.setValueStartLocation(value.beginLine,value.beginColumn);
          attr.setEndLocation(value.endLine, value.endColumn + 1);
          if (!isProperAttribute(value.image)) {
            // the message format is the one getExceptionLocation() can parse
            addException(new ParseException("Parse error at line " + value.beginLine + ", column " + value.beginColumn + ".  Attribute is improperly quoted." ));
          }
        }
        else {
          // bare attribute without a value
          attr = new HtmlDocument.Attribute(name.image);
          attr.setStartLocation(name.beginLine, name.beginColumn);
          attr.setEndLocation(name.endLine, name.endColumn + 1);
        }
        return attr;
      }
    }
    catch(ParseException e) {
      addException(e);
      return null;
    }
}

/**
 * Zero or more attributes.
 * Attribute() returns null when it catches a parse error; such results are
 * skipped so the list never contains a null entry.
 * @return the (possibly empty) list of attributes
 */
HtmlDocument.AttributeList AttributeList() :
{
  HtmlDocument.AttributeList alist = new HtmlDocument.AttributeList();
  HtmlDocument.Attribute a;
}
{
  (
    a=Attribute()
    {
      if (a != null) {
        alist.addAttribute(a);
      }
    }
  )*
  {
    return alist;
  }
}

/**
 * A start tag: opener, name, attributes, then ">" / "%>" or "/>".
 * When the opener carries a prefix ("<prefix:"), the prefix is put in front
 * of the tag name and the tag is flagged as a JSP tag; an opener starting
 * with "<%" is flagged as a JSP tag as well.
 * @return the tag, or, if a ParseException was caught, the text consumed so
 * far as a Text element (the exception is recorded)
 */
HtmlDocument.HtmlElement Tag() :
{
  Token firstToken = getToken(1);
  Token opener = null;
  Token nameTok, closer;
  HtmlDocument.AttributeList attrs;
  boolean isJspTag = false;
}
{
  try {
    opener=<TAG_START>  nameTok=<TAG_NAME> attrs=AttributeList()
    ( closer=<TAG_END> | closer=<TAG_SLASHEND> )
    {
      String openerText = opener.image;
      boolean prefixed = openerText.startsWith("<") && openerText.endsWith(":");
      // keep the prefix (everything after "<") as part of the name
      String tag_name = prefixed ? openerText.substring(1) + nameTok.image : nameTok.image;
      isJspTag = prefixed || openerText.startsWith("<%");

      HtmlDocument.Tag rtn_tag = new HtmlDocument.Tag("<", tag_name, attrs, closer.image);
      if (closer.kind == TAG_SLASHEND) {
        rtn_tag.setEmpty(true);
      }
      rtn_tag.setStartLocation(opener.beginLine, opener.beginColumn);
      rtn_tag.setEndLocation(closer.endLine, closer.endColumn + 1);
      rtn_tag.setIsJspTag(isJspTag);
      return rtn_tag;
    }
  }
  catch (ParseException ex) {
    // record the error, resume lexing as plain text and hand back what was
    // read, up to and including the next token, as text
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    String recovered = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(recovered);
  }
}


/**
 * The raw contents of a STYLE element: every line break, word and "<" token
 * up to (not including) the closing tag.
 * The return statement sits after the try/catch so that a caught
 * ParseException still yields the text gathered so far; without it the
 * generated method would fall through to JavaCC's "Missing return statement
 * in function" Error.
 * @return the untrimmed contents
 */
String StyleBlockContents() :
{
    StringBuilder sb = new StringBuilder();
    Token t = null;
}
{
    try {
        ( t=<BLOCK_EOL> { sb.append(t.image); }
        | t=<BLOCK_WORD> { sb.append(t.image); }
        | t=<BLOCK_LBR> { sb.append(t.image); }
        )*
    }
    catch(ParseException e) {
        addException(e);
    }
    {
        // don't trim content, otherwise errors in the first line will be off by
        // the amount of whitespace trimmed
        return sb.toString();
    }
}

/**
 * The contents of a SCRIPT element: every line break, word and "<" token up
 * to (not including) the closing tag, trimmed, with a leading "<!--" and a
 * trailing "//-->" removed.
 * The post-processing and return sit after the try/catch so that a caught
 * ParseException still yields the text gathered so far; without it the
 * generated method would fall through to JavaCC's "Missing return statement
 * in function" Error.
 * @return the cleaned-up script text
 */
String ScriptBlockContents() :
{
  StringBuilder sb = new StringBuilder();
  Token t = null;
}
{
    try {
      ( t=<BLOCK_EOL>    { sb.append(t.image); }
        | t=<BLOCK_WORD>  { sb.append(t.image); }
        | t=<BLOCK_LBR>  { sb.append(t.image); }
      )*
    }
    catch(ParseException e) {
      addException(e);
    }
    {
      String contents = sb.toString().trim();
      // sometimes people wrap the contents of script tags with html comments
      // to protect older browsers that don't understand script tags from puking.
      // I'm removing them here as they don't serve a purpose as far as a jEdit
      // SideKick plugin is concerned.
      if (contents.startsWith("<!--")) {
          contents = contents.substring(4);
      }
      if (contents.endsWith("//-->")) {
          contents = contents.substring(0, contents.length() - 5);
      }
      return contents.trim();
    }
}

/**
 * A complete SCRIPT element: start tag, raw contents, closing tag.
 * Once the start tag has been read the lexer is switched by hand to
 * LexScript so that the contents are not tokenized as markup.
 * End locations use endColumn + 1, like every other element built by this
 * grammar; the start tag and end tag of the block previously omitted the
 * + 1 and so ended one column short of their StyleBlock/Tag/EndTag
 * counterparts.
 * @return a TagBlock holding the start tag, the contents as a single Text
 * element and the end tag; or, if a ParseException was caught, the text
 * consumed so far as a Text element (the exception is recorded)
 */
HtmlDocument.HtmlElement ScriptBlock() :
{
  HtmlDocument.AttributeList alist;
  Token firstToken = getToken(1);
  Token st, et, ts, est;
  String contents = "";
}
{
  try {
    st=<TAG_START> ts=<TAG_SCRIPT> alist=AttributeList() est=<TAG_END>
    {
      token_source.SwitchTo(LexScript);
    }
    contents=ScriptBlockContents()
    et=<SCRIPT_END>
    {
        HtmlDocument.Tag script = new HtmlDocument.Tag(ts.image, alist);
        script.setStartLocation(st.beginLine, st.beginColumn);
        script.setEndLocation(est.endLine, est.endColumn + 1);

        HtmlDocument.EndTag endScript = new HtmlDocument.EndTag( ts.image );
        endScript.setStartLocation(et.beginLine, et.beginColumn);
        endScript.setEndLocation(et.endLine, et.endColumn + 1);

        HtmlDocument.Text text = new HtmlDocument.Text(contents);
        HtmlDocument.ElementSequence seq = new HtmlDocument.ElementSequence();
        seq.addElement(text);
        HtmlDocument.TagBlock b = new HtmlDocument.TagBlock(script, seq, endScript);
        b.setStartLocation(st.beginLine, st.beginColumn);
        b.setEndLocation(et.endLine, et.endColumn + 1);
        return b;
    }
  }
  catch (ParseException ex) {
    // record the error, resume lexing as plain text and hand back what was
    // read, up to and including the next token, as text
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}


/**
 * A complete STYLE element: start tag, raw contents, closing tag.
 * Once the start tag has been read the lexer is switched by hand to
 * LexStyle so that the contents are not tokenized as markup.
 * @return a TagBlock named "STYLE" holding the contents as a single Text
 * element; or, if a ParseException was caught, the text consumed so far as
 * a Text element (the exception is recorded)
 */
HtmlDocument.HtmlElement StyleBlock() :
{
  Token firstToken = getToken(1);
  Token opener, openerEnd, closer;
  HtmlDocument.AttributeList attrs;
  String contents = "";
}
{
  try {
    opener=<TAG_START> <TAG_STYLE> attrs=AttributeList() openerEnd=<TAG_END>
    {
      token_source.SwitchTo(LexStyle);
    }
    contents=StyleBlockContents()
    closer=<STYLE_END>
    {
      HtmlDocument.ElementSequence body = new HtmlDocument.ElementSequence();
      body.addElement(new HtmlDocument.Text(contents));

      HtmlDocument.TagBlock block = new HtmlDocument.TagBlock("STYLE", attrs, body);
      // the block spans from the opener to the end of the closing tag
      block.setStartLocation(opener.beginLine, opener.beginColumn);
      block.setEndLocation(closer.endLine, closer.endColumn + 1);

      // its start tag spans from the opener to the end of the first ">"
      block.startTag.setStartLocation(opener.beginLine, opener.beginColumn);
      block.startTag.setEndLocation(openerEnd.endLine, openerEnd.endColumn + 1);

      return block;
    }
  }
  catch (ParseException ex) {
    // record the error, resume lexing as plain text and hand back what was
    // read, up to and including the next token, as text
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    String recovered = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(recovered);
  }
}


/**
 * An end tag: "</" (optionally with a prefix), a tag name and ">".
 * When the opener carries a prefix ("</prefix:"), the prefix is put in front
 * of the tag name.
 * @return the end tag, or, if a ParseException was caught, the text consumed
 * so far as a Text element (the exception is recorded)
 */
HtmlDocument.HtmlElement EndTag() :
{
  Token firstToken = getToken(1);
  Token opener, nameTok, closer;
}
{
  try {
    opener=<ENDTAG_START> nameTok=<TAG_NAME> closer=<TAG_END>
    {
      String openerText = opener.image;
      boolean prefixed = openerText.startsWith("</") && openerText.endsWith(":");
      // keep the prefix (everything after "</") as part of the name
      String tag_name = prefixed ? openerText.substring(2) + nameTok.image : nameTok.image;

      HtmlDocument.EndTag endTag = new HtmlDocument.EndTag(tag_name);
      endTag.setStartLocation(opener.beginLine, opener.beginColumn);
      endTag.setEndLocation(closer.endLine, closer.endColumn + 1);
      return endTag;
    }
  }
  catch (ParseException ex) {
    // record the error, resume lexing as plain text and hand back what was
    // read, up to and including the next token, as text
    addException(ex);
    token_source.SwitchTo(DEFAULT);
    String recovered = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(recovered);
  }
}

/**
 * An HTML or JSP comment.  The comment text is rebuilt from its tokens, with
 * every line break replaced by NL.  A comment that runs to the end of input
 * without a terminator is accepted; it simply has no closing delimiter.
 * The return statement sits after the try/catch so that a caught
 * ParseException still yields the comment gathered so far; without it the
 * generated method would fall through to JavaCC's "Missing return statement
 * in function" Error.
 * @return the comment, delimiters included
 */
HtmlDocument.Comment CommentTag() :
{
  Token t, comment_start = null, comment_end = null;
  StringBuilder s = new StringBuilder();
}
{
    try {
          comment_start=<COMMENT_START>
          ( t=<DASH> { s.append(t.image); }
            | <COMMENT_EOL>  { s.append(NL); }
            | t=<COMMENT_WORD> { s.append(t.image); } )*
          (<EOF> | comment_end=<COMMENT_END>)
    }
    catch(ParseException e) {
        addException(e);
    }
    {
        // either delimiter may be missing: the opener if parsing failed on
        // it, the closer if the comment ran to the end of input
        return new HtmlDocument.Comment(
            (comment_start == null ? "" : comment_start.image)
            + s.toString()
            + (comment_end == null ? "" : comment_end.image));
    }
}

/**
 * A declaration such as "<!DOCTYPE ...>", returned as a Comment holding the
 * text between "<!" and ">".
 * A malformed declaration ("<!>" or one cut short by the end of input) is
 * recorded as a parse error and recovered from: the lexer is put back in the
 * DEFAULT state, the offending token is consumed, and a Comment holding
 * whatever body was read (possibly empty) is returned.  Without a return in
 * the catch block the generated method would fall through to JavaCC's
 * "Missing return statement in function" Error and abort the whole parse.
 * @return the declaration body wrapped in a Comment
 */
HtmlDocument.Comment DeclTag() :
{
  Token t = null;
}
{
    try {
          <DECL_START> t=<DECL_ANY> <DECL_END>
          {
            return new HtmlDocument.Comment(t.image);
          }
    }
    catch(ParseException e) {
        addException(e);
        // at end of input the lexer is still in LexDecl
        token_source.SwitchTo(DEFAULT);
        // skip the token that broke the declaration (">" or end of input)
        getNextToken();
        return new HtmlDocument.Comment(t == null ? "" : t.image);
    }
}