HtmlParser.jj | searchcode

/plugins/Beauty/tags/beauty-0.6.1/src/beauty/parsers/html/HtmlParser.jj

# · Unknown · 619 lines · 563 code · 56 blank · 0 comment · 0 complexity · 913d6a99fe515a910e1fd9d78d2669fe MD5 · raw file

/*
 * HtmlParser.jj -- JavaCC grammar for HTML.
 * Copyright (C) 1999 Quiotix Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 * for more details.
 */


/*
 * JavaCC grammar file for HTML.
 *
 * Author: Brian Goetz, Quiotix
 * Version: 1.03
 * Revision: $Id: HtmlParser.jj 18072 2010-06-15 04:41:57Z daleanson $
 *
 * This grammar parses an HTML document and produces a (flat) parse "tree"
 * representing the document.  It preserves almost all information in the
 * source document, including carriage control and spacing (except inside
 * of tags.)  See the HtmlDocument and HtmlDocument.* classes for a
 * description of the parse tree.  The parse tree supports traversal using
 * the commonly used "Visitor" pattern.  The HtmlDumper class is a visitor
 * which dumps out the tree to an output stream.
 *
 * It does not require begin tags to be matched with end tags, or validate
 * the names or contents of the tags (this can easily be done post-parsing;
 * see the HtmlCollector class (which matches begin tags with end tags)
 * for an example.)
 *
 * Notable edge cases include:
 * - Quoted string processing.  Quoted strings are matched inside of comments, and
 *   as tag attribute values.  Quoted strings are matched in normal text only
 *   to the extent that they do not span line breaks.
 *
 * Please direct comments, questions, gripes or praise to
 * html-parser@quiotix.com.  If you like it/hate it/use it, please let us know!
 */

// JavaCC generator options for this grammar.
options {
    // Token literals and character ranges are matched without regard to case.
    IGNORE_CASE = true;
    // Generate instance (non-static) parser members.
    STATIC = false;
    //DEBUG_PARSER = true;
}

PARSER_BEGIN(HtmlParser)

package beauty.parsers.html;

// TODO: need to support jsp markup and comments in script and style blocks.
/**
 * Hand-written members of the HtmlParser class.  The parsing methods
 * themselves are generated by JavaCC from the productions that follow
 * PARSER_END in this grammar file.
 */
public class HtmlParser {

  /** Platform line separator, used when rebuilding multi-line comment text. */
  final static String NL = System.getProperty("line.separator");

  /**
   * Rebuilds the source text covered by a run of tokens, including any
   * special tokens attached in front of each regular token.
   *
   * @param first the first token of the run
   * @param cur   the last token of the run (inclusive)
   * @return the concatenated images of all tokens from first through cur
   */
  private static String getTokenText(Token first, Token cur) {
    StringBuilder sb = new StringBuilder();

    // Walk up to, but not including, the token after cur.  The null check
    // ends the walk cleanly if the chain runs out before that token is
    // reached instead of failing with a NullPointerException.
    for (Token t = first; t != null && t != cur.next; t = t.next) {
      if (t.specialToken != null) {
        // specialToken points at the LAST special token before t; rewind to
        // the first one and then emit them in source order.
        Token tt = t.specialToken;
        while (tt.specialToken != null) {
          tt = tt.specialToken;
        }
        for (; tt != null; tt = tt.next) {
          sb.append(tt.image);
        }
      }
      sb.append(t.image);
    }
    return sb.toString();
  }

  /**
   * Command-line driver: parses the file named by the first argument, runs
   * the collector, scrubber and formatter over the document and prints the
   * formatted result to standard output.
   *
   * The process exits with status 0 on success and 1 if parsing or
   * formatting failed.
   *
   * @param args args[0] is the path of the file to format
   */
  public static void main(String[] args) throws ParseException {
      if (args.length == 0) {
        System.err.println("Usage: HtmlParser <file>");
        return;
      }
      int status = 0;
      java.io.Reader reader = null;
      try {
        // NOTE: FileReader decodes the file with the platform default charset.
        reader = new java.io.FileReader(args[0]);
        HtmlParser parser = new HtmlParser(reader);
        HtmlDocument document = parser.HtmlDocument();

        document.setLineSeparator("\n");
        document.accept(new HtmlCollector());
        document.accept(new HtmlScrubber(HtmlScrubber.DEFAULT_OPTIONS | HtmlScrubber.TRIM_SPACES));
        HtmlFormatter formatter = new HtmlFormatter();
        formatter.setRightMargin(1024);
        formatter.setLineSeparator("\n");
        formatter.setIndent(3);
        document.accept(formatter);
        System.out.println(formatter.toString());
      }
      catch (Exception e) {
        e.printStackTrace();
        status = 1;
      }
      finally {
        // Always release the file handle, whether or not parsing succeeded.
        if (reader != null) {
          try {
            reader.close();
          }
          catch (java.io.IOException ignored) {
            // nothing useful can be done about a failed close here
          }
        }
      }
      System.exit(status);
  }

}

PARSER_END(HtmlParser)


// Private helper expressions, available in every lexical state (<*>).
// The "#" prefix means none of these is ever returned as a token; they are
// only building blocks for the token definitions further down.
<*> TOKEN :
{
  <#ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
| <#NUM_CHAR:   ["0"-"9"] >
| <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
| <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
  // A name: one letter followed by identifier characters.  ":" is an
  // identifier character, so a prefixed name is matched as one identifier.
| <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
  // Quoted string that must close before the end of the line.
| <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
                    | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
  // Quoted string that may span line breaks.
| <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
| <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
| <#NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
| <#QUOTE:      ( "'" | "\"" )>
  // Backslash-escaped "${" or "#{": not the start of an expression.
| <#EL_ESCAPE: ("\\${" | "\\#{") >
  // Run of characters inside ${...} / #{...} that is neither a quote nor "}".
| <#TEXT_IN_EL: (~["}", "'", "\""])+ >
  // One or more characters that do not contain the sequence "%>".
| <#NO_JSP_TAG_END: ( ~["%"] | ("%" ~[">"]) )+ >
  // One or more characters other than ">".  The second alternative matches
  // "/" plus one non-">" character, which the first alternative already allows.
| <#NO_TAG_END: ( ~[">"] | ( "/" ~[">"]) )+ >
}

// Tokens recognised in ordinary document text.  Each markup opener switches
// the token manager to the lexical state that understands what follows.
<DEFAULT> TOKEN :
{
  // A line break together with any spaces/tabs directly in front of it.
  <EOL:            ( " " | "\t" )* <NEWLINE> >
  // "<" or the JSP directive opener "<%@", plus any whitespace after it.
| <TAG_START:      "<" (<WHITESPACE>)* | "<%@" (<WHITESPACE>)* > : LexStartTag
| <SCRIPTLET_TAG:  "<%"      > : LexScriptlet
| <ENDTAG_START:   "</"      > : LexStartTag
  // HTML comment opener or JSP comment opener.
| <COMMENT_START:     "<!--" | "<%--"    >   : LexComment
| <DECL_START:     "<!"      > : LexDecl
  // Plain text: anything up to the next "<" or line break.
| <PCDATA:         ( ~["<", "\r", "\n"] )+ >
  // Two or more consecutive EOLs.
| <BLANK_LINES: <EOL> (<EOL>)+ >
}

// State entered right after TAG_START or ENDTAG_START: the next token is
// the tag name.  SCRIPT and STYLE get their own token kinds and are listed
// before the general TAG_NAME rule, which would otherwise match the same text.
<LexStartTag> TOKEN :
{
  <TAG_SCRIPT: ("SCRIPT" | "script") >    : LexInTag
| <TAG_STYLE:  ("STYLE"  | "style" ) >    : LexInTag
| <TAG_NAME: <IDENTIFIER> > : LexInTag
  // Any other single character: this was not a tag after all; go back to
  // ordinary text.
| <LST_ERROR: ~[]>          : DEFAULT
}

// Whitespace between attributes is kept as a special token: it is not seen
// by the grammar but stays attached to the following regular token.
<LexInTag> SPECIAL_TOKEN :
{
  < (<WHITESPACE>)+ >
}

// Tokens inside a tag, after the tag name: attribute names, "=", and the
// three ways a tag can be closed.
<LexInTag> TOKEN :
{
  <ATTR_NAME:  <IDENTIFIER> >
| <TAG_END: ">" >                       : DEFAULT
| <TAG_PERCENTEND: "%>" >               : DEFAULT
| <TAG_SLASHEND: "/>" >                 : DEFAULT
| <ATTR_EQ: "=" >                       : LexAttrVal
  // A "<" while still inside a tag: the current tag was never closed.
  // The lexical action below turns the matched token into a TAG_END with
  // image ">" and links a new TAG_START token with image "<" after it, so
  // the parser sees the old tag being closed and a new one being opened.
| <IMPLICIT_TAG_END: "<">
  {
    Token t = new Token();
    t.image       = "<";
    t.kind        = TAG_START;
    t.next        = matchedToken.next;
    t.beginLine   = matchedToken.beginLine;
    t.beginColumn = matchedToken.beginColumn;
    t.endLine     = matchedToken.endLine;
    t.endColumn   = matchedToken.endColumn;
    matchedToken.next  = t;
    matchedToken.kind  = TAG_END;
    matchedToken.image = ">";
  }                                     : LexStartTag
  // Any other single character; the lexical state is left unchanged.
| <LIT_ERROR: ~[]>
}

// State entered after "=" in a tag: the opening quote of the attribute value
// selects the matching quoted-value state.
// NOTE(review): only quote characters (and whitespace, as a special token)
// are defined for this state, so an unquoted attribute value appears to have
// no matching token here -- confirm this is intended.
<LexAttrVal> TOKEN :
{
	<SINGLE_QUOTE: "'"> : AttrValueBetweenSingleQuotesState
|	<DOUBLE_QUOTE: "\""> : AttrValueBetweenDoubleQuotesState
}

// Whitespace between "=" and the opening quote is kept as a special token
// rather than being passed to the grammar.
<LexAttrVal> SPECIAL_TOKEN :
{
  < (<WHITESPACE>)+ >
}


// Contents of a single-quoted attribute value.
<AttrValueBetweenSingleQuotesState> TOKEN :
{
	// The closing quote returns to the in-tag state.
	<ENDING_SINGLE_QUOTE: "'"> : LexInTag
	// Literal text: any character except "$", "#" and the quote; or a "$"/"#"
	// followed by something other than "{" or the quote; or an escaped
	// expression opener.
|	<UNPARSED_TEXT_NO_SINGLE_QUOTES:
		( (~["$", "#", "'"]) | (["$", "#"] ~["{", "'"]) | <EL_ESCAPE> )+ >
	// A "$" or "#" directly before the closing quote; the token contains both
	// characters and ends the value.
|	<DOLLAR_OR_HASH_SINGLE_QUOTE: ["$", "#"] "'" > : LexInTag
}

// Contents of a double-quoted attribute value.
// Unlike the single-quote variant, the literal-text token here also excludes
// "<".
// NOTE(review): a "<" inside a double-quoted value that does not start one of
// the "<%= ... %>" / "< ... />" tokens defined for both quote states appears
// to have no matching token in this state -- confirm how such input should be
// handled.
<AttrValueBetweenDoubleQuotesState> TOKEN :
{
	<ENDING_DOUBLE_QUOTE: "\""> : LexInTag
|	<UNPARSED_TEXT_NO_DOUBLE_QUOTES: ( (~["<", "$", "#", "\""]) | (["$", "#"] ~["{", "\""]) | <EL_ESCAPE> )+ > : AttrValueBetweenDoubleQuotesState
	// A "$" or "#" directly before the closing quote; the token contains both
	// characters and ends the value.
|	<DOLLAR_OR_HASH_DOUBLE_QUOTE: ["$", "#"] "\"" > : LexInTag
}

// Embedded constructs that may appear in an attribute value regardless of
// which quote character surrounds it.  Each is matched as one token, so
// quotes inside them do not end the attribute value.
<AttrValueBetweenSingleQuotesState, AttrValueBetweenDoubleQuotesState> TOKEN:
{
	// ${ ... } -- may contain quoted strings.
	<EL_EXPRESSION_IN_ATTRIBUTE: "${" (<QUOTED_STRING> | <TEXT_IN_EL>)* "}" >
	// #{ ... } -- may contain quoted strings.
|	<VALUE_BINDING_IN_ATTRIBUTE: "#{" (<QUOTED_STRING> | <TEXT_IN_EL>)* "}" >
	// <%= ... %>
|	<JSP_EXPRESSION_IN_ATTRIBUTE: "<%=" <NO_JSP_TAG_END> "%>" >
	// A self-closing tag "< ... />" with no ">" before the closing "/>".
|   <JSP_TAG_IN_ATTRIBUTE: "<" <NO_TAG_END> "/>" >
}

// An unquoted attribute value: a run of characters up to whitespace, a quote
// or ">".
// NOTE(review): no token rule visible in this file switches to LexInAttrVal,
// so ATTR_VAL looks unreachable as written -- confirm before relying on it.
<LexInAttrVal> TOKEN :
{
  <ATTR_VAL: ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexAttrVal
}


// Tokens inside an HTML ("<!--") or JSP ("<%--") comment.
<LexComment> TOKEN :
{
  // Accepted terminators: "-->" (spaces allowed before ">"), "->" and "--%>".
  < COMMENT_END:  ("--" (" ")* ">" | "->" | "--%>" ) > : DEFAULT
  // A dash that is not part of a terminator.
| < DASH:         "-" >
| < COMMENT_EOL:  <NEWLINE> >
  // A run of ordinary characters, a quoted string that closes on the same
  // line, or a lone quote character.
| < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
                    | <QUOTED_STRING_NB>
                    | <QUOTE> ) >
}

// State entered after "<%".  The scriptlet body itself is read character by
// character in getScriptletBlock(), not through tokens.
// NOTE(review): SCRIPTLET_END is not referenced by any production visible in
// this file -- confirm whether it is still needed.
<LexScriptlet> TOKEN :
{
  <SCRIPTLET_END: (<WHITESPACE>)* "%>" > : DEFAULT
}

// Tokens inside a "<!" declaration.
<LexDecl> TOKEN :
{
  // The declaration body: any mix of same-line quoted strings, lone quotes
  // and characters other than ">".  At least one character is required.
  <DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
| <DECL_END: ">" > : DEFAULT
}

// Terminator of a script block.  ScriptBlock() switches to LexScript
// explicitly after the opening tag has been parsed.
<LexScript> TOKEN : {
  <SCRIPT_END:   ("</SCRIPT>" | "</script>") > : DEFAULT
}

// Terminator of a style block.  StyleBlock() switches to LexStyle
// explicitly after the opening tag has been parsed.
<LexStyle> TOKEN : {
  <STYLE_END:    ("</STYLE>" | "</style>") > : DEFAULT
}

// Content tokens for raw blocks (script and style bodies), consumed by
// BlockContents().
//
// BLOCK_WORD used to exclude "%" from its character run.  No other token in
// LexScript or LexStyle matches a bare "%", so ordinary content such as
// "width: 50%" in a style sheet or "a % b" in a script could not be
// tokenized and produced a lexical error.  "%" is now an ordinary word
// character.  Scriptlet bodies are not affected: they are read character by
// character in getScriptletBlock() rather than through these tokens.
<LexScript, LexStyle, LexScriptlet> TOKEN :
{
  <BLOCK_EOL:    <NEWLINE> >
  // A "<" that is not part of the block terminator.
| <BLOCK_LBR:    "<" >
  // A quoted string that closes on the same line, a lone quote, or a run of
  // characters up to the next line break, quote or "<".
| <BLOCK_WORD:   ( <QUOTED_STRING_NB>
                   | <QUOTE>
                   | (~[ "\n", "\r", "'", "\"", "<"])+ ) >
}

/**
 * Start production: parses a whole document, i.e. a sequence of elements
 * followed by end of input, and wraps that sequence in an HtmlDocument.
 */
HtmlDocument HtmlDocument() :
{
  HtmlDocument.ElementSequence elements;
}
{
  elements = ElementSequence()
  <EOF>
  {
    return new HtmlDocument(elements);
  }
}

/**
 * Parses zero or more elements and returns them, in source order, as one
 * ElementSequence.
 */
HtmlDocument.ElementSequence ElementSequence() :
{
  HtmlDocument.ElementSequence sequence = new HtmlDocument.ElementSequence();
  HtmlDocument.HtmlElement element;
}
{
  (
    element = Element()
    { sequence.addElement(element); }
  )*
  { return sequence; }
}

/**
 * Parses one document element.  The alternatives are tried in the order
 * written; several of them begin with the same TAG_START token, which is why
 * they carry an explicit LOOKAHEAD so that the token after it decides
 * between them.
 */
HtmlDocument.HtmlElement Element() :
{
  HtmlDocument.HtmlElement e;
  Token text;
}
{
(
    // TAG_START followed by an ordinary tag name.
    LOOKAHEAD(2)
         e = Tag()          { return e; }
  |      e = EndTag()       { return e; }
  |      e = CommentTag()   { return e; }
  |      e = DeclTag()      { return e; }
  | LOOKAHEAD(3)
       e = ScriptletBlock() { return e; }
  // TAG_START followed by TAG_SCRIPT.
  | LOOKAHEAD(2)
         e = ScriptBlock()  { return e; }
  // TAG_START followed by TAG_STYLE.
  | LOOKAHEAD(2)
         e = StyleBlock()   { return e; }
  // TAG_START followed by a character that cannot begin a tag name: treat
  // it as literal text.
  // NOTE(review): the text is rebuilt from a literal "<" rather than from
  // the TAG_START image, so whitespace (or "%@") matched as part of
  // TAG_START is not carried into the Text element -- confirm this is
  // intended.
  | LOOKAHEAD(2)
             <TAG_START> text=<LST_ERROR>
                            { return new HtmlDocument.Text("<" + text.image); }
  |   text = <PCDATA>       { return new HtmlDocument.Text(text.image); }
  |          <BLANK_LINES>  { return new HtmlDocument.BlankLines(); }
  |          <EOL>          { return new HtmlDocument.Newline(); }
)
}

/**
 * Parses one attribute: a name, optionally followed by "=" and a quoted
 * value.  An attribute without "=" is built with the one-argument
 * constructor; otherwise the name and value are both passed.
 */
HtmlDocument.Attribute Attribute() :
{
  Token name;
  String value = null;
}
{
  name=<ATTR_NAME>
  [ <ATTR_EQ> value=AttributeValue() ]
  {
    return (value == null)
        ? new HtmlDocument.Attribute(name.image)
        : new HtmlDocument.Attribute(name.image, value);
  }
}

/**
 * Parses a quoted attribute value and returns its text without the
 * surrounding quotes.  The value is assembled from literal text and embedded
 * expression tokens.
 *
 * A value may end with a "$" or "#" directly before the closing quote; that
 * pair arrives as one token, of which only the first character belongs to
 * the value.
 *
 * ParseExceptions are deliberately NOT caught here.  Catching them in this
 * production (as an earlier version did) makes the generated method fall
 * through to JavaCC's "Missing return statement in function" Error, so the
 * catch (ParseException) handler in Tag() could never turn the malformed tag
 * back into text.
 */
String AttributeValue() :
{
    StringBuilder content = new StringBuilder();
    Token t = null;
}
{
    (
        (   <DOUBLE_QUOTE>
            (
                t = QuoteIndependentAttributeValueContent()
                { content.append(t.image); }
            )*
            (   <ENDING_DOUBLE_QUOTE>
              | t = <DOLLAR_OR_HASH_DOUBLE_QUOTE> { content.append(t.image.substring(0, 1)); }
            )
            // Kept from the original grammar.  NOTE(review): ATTR_VAL is only
            // defined for a lexical state that no visible rule enters, so this
            // option looks unreachable -- confirm.
            [ t=<ATTR_VAL> { content.append( t.image ); } ]
        )
        |
        (   <SINGLE_QUOTE>
            (
                ( t = <UNPARSED_TEXT_NO_SINGLE_QUOTES>
                | t = QuoteIndependentAttributeValueContent()
                ) { content.append(t.image); }
            )*
            (   <ENDING_SINGLE_QUOTE>
              | t = <DOLLAR_OR_HASH_SINGLE_QUOTE>  { content.append(t.image.substring(0, 1)); }
            )
        )
    )
    {
        return content.toString();
    }
}

/**
 * Matches one piece of attribute-value content that is valid regardless of
 * the surrounding quote character: an EL expression, a value binding, a JSP
 * expression, an embedded self-closing tag, or literal text that contains no
 * double quote.
 *
 * Exceptions are not caught here: swallowing them would only lead to the
 * generated "Missing return statement in function" Error instead of the
 * original failure.
 *
 * @return the matched token
 */
Token QuoteIndependentAttributeValueContent() :
{
    Token t;
}
{
    (
        LOOKAHEAD(2) t = <EL_EXPRESSION_IN_ATTRIBUTE>
      | LOOKAHEAD(2) t = <VALUE_BINDING_IN_ATTRIBUTE>
      | LOOKAHEAD(2) t = <JSP_EXPRESSION_IN_ATTRIBUTE>
      | LOOKAHEAD(2) t = <JSP_TAG_IN_ATTRIBUTE>
      | LOOKAHEAD(2) t = <UNPARSED_TEXT_NO_DOUBLE_QUOTES>
    )
    { return t; }
}


/**
 * Parses zero or more attributes and collects them, in source order, into
 * an AttributeList.
 */
HtmlDocument.AttributeList AttributeList() :
{
  HtmlDocument.AttributeList attributes = new HtmlDocument.AttributeList();
  HtmlDocument.Attribute attribute;
}
{
  (
    attribute = Attribute()
    { attributes.addAttribute(attribute); }
  )*
  { return attributes; }
}

/**
 * Parses an opening tag: the tag start, a name, an attribute list and one of
 * the three tag terminators (">", "%>" or "/>").
 *
 * The tag is flagged as a JSP tag when its opener begins with "<%" or when
 * its name contains a ":" after the first character, and as empty when it
 * is closed with "/>".
 *
 * If the tag is malformed (ParseException), the lexer is reset to the
 * DEFAULT state and the source text consumed so far, plus the next token, is
 * returned as plain Text.
 *
 * Results are returned from the try and catch bodies rather than from a
 * finally block, so that failures other than ParseException propagate to
 * the caller instead of being silently replaced by a null element.
 */
HtmlDocument.HtmlElement Tag() :
{
  Token t, et;
  HtmlDocument.AttributeList alist = null;
  Token firstToken = getToken(1);
  Token st = null;
}
{
  try {
    st=<TAG_START>  t=<TAG_NAME> alist=AttributeList() ( et=<TAG_END> | et=<TAG_PERCENTEND> | et=<TAG_SLASHEND> )
    {
        HtmlDocument.Tag tag = new HtmlDocument.Tag(st.image, t.image, alist, et.image);
        if (st.image.startsWith("<%") || t.image.indexOf(":") > 0) {
            tag.setIsJspTag(true);
        }
        if (et.kind == TAG_SLASHEND) {
          tag.setEmpty(true);
        }
        return tag;
    }
  }
  catch (ParseException ex) {
    // Diagnostics go to stderr: stdout carries the formatted document when
    // the parser is run through main().
    System.err.println(ex.getMessage());
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}

/**
 * Parses the raw body of a script or style block.  Word and "<" tokens are
 * accumulated into the current line; each line break flushes the accumulated
 * text (if any) as a Text element and then adds a Newline element.  Text
 * remaining after the last line break is flushed at the end.
 */
HtmlDocument.ElementSequence BlockContents() :
{
  Token piece;
  StringBuffer line = new StringBuffer();
  HtmlDocument.ElementSequence contents = new HtmlDocument.ElementSequence();
}
{
  (
      <BLOCK_EOL>
      {
        if (line.length() > 0) {
          contents.addElement(new HtmlDocument.Text(line.toString()));
          line.setLength(0);
        }
        contents.addElement(new HtmlDocument.Newline());
      }
    | ( piece=<BLOCK_WORD> | piece=<BLOCK_LBR> )
      { line.append(piece.image); }
  )*
  {
    if (line.length() > 0) {
      contents.addElement(new HtmlDocument.Text(line.toString()));
    }
    // No trailing Newline is added here: doing so inserted an extra blank
    // line in script and style blocks.
    return contents;
  }
}

/**
 * Parses a complete script block: the opening script tag with its
 * attributes, the raw body, and the closing script tag.  The result is a
 * TagBlock named "SCRIPT".
 *
 * After the opening tag the lexer is switched to the LexScript state by
 * hand so that the body is tokenized as raw block content.  On a
 * ParseException the lexer is reset to DEFAULT and the text consumed so far,
 * plus the next token, is returned as plain Text.
 */
HtmlDocument.HtmlElement ScriptBlock() :
{
  HtmlDocument.AttributeList attributes;
  HtmlDocument.ElementSequence body;
  Token firstToken = getToken(1);
}
{
  try {
    <TAG_START> <TAG_SCRIPT> attributes=AttributeList() <TAG_END>
    { token_source.SwitchTo(LexScript); }
    body=BlockContents()
    <SCRIPT_END>
    { return new HtmlDocument.TagBlock("SCRIPT", attributes, body); }
  }
  catch (ParseException parseFailure) {
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}


/**
 * Parses a complete style block: the opening style tag with its attributes,
 * the raw body, and the closing style tag.  The result is a TagBlock named
 * "STYLE".
 *
 * After the opening tag the lexer is switched to the LexStyle state by hand
 * so that the body is tokenized as raw block content.  On a ParseException
 * the lexer is reset to DEFAULT and the text consumed so far, plus the next
 * token, is returned as plain Text.
 */
HtmlDocument.HtmlElement StyleBlock() :
{
  HtmlDocument.AttributeList attributes;
  HtmlDocument.ElementSequence body;
  Token firstToken = getToken(1);
}
{
  try {
    <TAG_START> <TAG_STYLE> attributes=AttributeList() <TAG_END>
    { token_source.SwitchTo(LexStyle); }
    body=BlockContents()
    <STYLE_END>
    { return new HtmlDocument.TagBlock("STYLE", attributes, body); }
  }
  catch (ParseException parseFailure) {
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}


/**
 * Parses a closing tag: "</", a tag name and ">".  The end tag is flagged as
 * a JSP tag when its name contains a ":" after the first character.
 *
 * On a ParseException the lexer is reset to DEFAULT and the text consumed so
 * far, plus the next token, is returned as plain Text.
 */
HtmlDocument.HtmlElement EndTag() :
{
  Token name;
  Token firstToken = getToken(1);
}
{
  try {
    <ENDTAG_START> name=<TAG_NAME> <TAG_END>
    {
      HtmlDocument.EndTag endTag = new HtmlDocument.EndTag(name.image);
      boolean hasPrefix = name.image.indexOf(":") > 0;
      if (hasPrefix) {
        endTag.setIsJspTag(true);
      }
      return endTag;
    }
  }
  catch (ParseException parseFailure) {
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}

/**
 * Parses an HTML or JSP comment.  The Comment text consists of the opening
 * delimiter, the comment body, and the closing delimiter when one is
 * present; a comment that runs to the end of the input has no closing
 * delimiter in its text.  Line breaks inside the comment are written with
 * the platform line separator.
 */
HtmlDocument.Comment CommentTag() :
{
  Token open, piece, close = null;
  StringBuffer text = new StringBuffer();
}
{
  open=<COMMENT_START>   { text.append(open.image); }
  (
      piece=<DASH>         { text.append(piece.image); }
    | <COMMENT_EOL>        { text.append(NL); }
    | piece=<COMMENT_WORD> { text.append(piece.image); }
  )*
  ( <EOF> | close=<COMMENT_END> )
  {
    if (close != null) {
      text.append(close.image);
    }
    return new HtmlDocument.Comment(text.toString());
  }
}


/**
 * Parses a "<!" declaration and returns its body as a Comment element.
 *
 * NOTE(review): only the body token is passed on; the "<!" and ">"
 * delimiters are not part of the Comment text, unlike in CommentTag() --
 * confirm that HtmlDocument.Comment accounts for this.
 */
HtmlDocument.Comment DeclTag() :
{
  Token body;
}
{
  <DECL_START>
  body=<DECL_ANY>
  <DECL_END>
  { return new HtmlDocument.Comment(body.image); }
}

/**
 * Parses a JSP scriptlet.  After the "<%" opener, the body is read verbatim
 * by getScriptletBlock() (which also consumes the closing "%>"), wrapped in
 * a single Text element, and returned as a JspScriptletBlock.  The lexer is
 * switched back to DEFAULT before returning.
 *
 * On a ParseException the lexer is reset to DEFAULT and the text consumed so
 * far, plus the next token, is returned as plain Text.
 */
HtmlDocument.HtmlElement ScriptletBlock() :
{
  HtmlDocument.ElementSequence contents;
  Token firstToken = getToken(1);
}
{
  try {
    <SCRIPTLET_TAG>
    {
      String verbatim = getScriptletBlock();
      contents = new HtmlDocument.ElementSequence();
      contents.addElement(new HtmlDocument.Text(verbatim));
      token_source.SwitchTo(DEFAULT);
      return new HtmlDocument.JspScriptletBlock(contents);
    }
  }
  catch (ParseException parseFailure) {
    token_source.SwitchTo(DEFAULT);
    return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
  }
}

/*
 Originally I tried to use BlockContents to get the contents of a jsp scriptlet,
 but the matching rules for a word trump the matching rules for the %> end tag.
 This works well anyway, since I really don't care about tokenizing the contents
 of a jsp scriptlet.  I just want the contents verbatim.

 This returns the contents of the jsp scriptlet block without the <% or %>.  The
 %> will be consumed.
*/
JAVACODE
/**
 * Reads characters straight from the input stream until the scriptlet
 * terminator "%>" has been read, and returns everything before it.  The
 * terminator itself is consumed but not returned.  If the input ends first,
 * whatever was read so far is returned.
 *
 * The terminator check applies as soon as two characters have been read, so
 * an empty scriptlet ("<%%>") yields an empty string instead of running on
 * to the next "%>" further down the document.
 */
String getScriptletBlock() {
    StringBuilder sb = new StringBuilder();
    try {
        while (true) {
            sb.append(jj_input_stream.readChar());
            int len = sb.length();
            if (len >= 2 && sb.charAt(len - 2) == '%' && sb.charAt(len - 1) == '>') {
                // trim the %> from the end of the string
                sb.setLength(len - 2);
                return sb.toString();
            }
        }
    }
    catch (java.io.IOException endOfInput) {
        // Input ended before "%>" was found: fall through and return the
        // unterminated scriptlet text as read.
    }
    return sb.toString();
}