/plugins/Beauty/tags/beauty-0.6.1/src/beauty/parsers/html/HtmlParser.jj
# · Unknown · 619 lines · 563 code · 56 blank · 0 comment · 0 complexity · 913d6a99fe515a910e1fd9d78d2669fe MD5 · raw file
- /*
- * HtmlParser.jj -- JavaCC grammar for HTML.
- * Copyright (C) 1999 Quiotix Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
- * for more details.
- */
- /*
- * JavaCC grammar file for HTML.
- *
- * Author: Brian Goetz, Quiotix
- * Version: 1.03
- * Revision: $Id: HtmlParser.jj 18072 2010-06-15 04:41:57Z daleanson $
- *
- * This grammar parses an HTML document and produces a (flat) parse "tree"
- * representing the document. It preserves almost all information in the
- * source document, including carriage control and spacing (except inside
- * of tags.) See the HtmlDocument and HtmlDocument.* classes for a
- * description of the parse tree. The parse tree supports traversal using
- * the commonly used "Visitor" pattern. The HtmlDumper class is a visitor
- * which dumps out the tree to an output stream.
- *
- * It does not require begin tags to be matched with end tags, or validate
- * the names or contents of the tags (this can easily be done post-parsing;
- * see the HtmlCollector class (which matches begin tags with end tags)
- * for an example.)
- *
- * Notable edge cases include:
- * - Quoted string processing. Quoted strings are matched inside of comments, and
- * as tag attribute values. Quoted strings are matched in normal text only
- * to the extent that they do not span line breaks.
- *
- * Please direct comments, questions, gripes or praise to
- * html-parser@quiotix.com. If you like it/hate it/use it, please let us know!
- */
- // JavaCC generator options for this grammar.
- options {
- // String literals in the token definitions are matched without regard to case.
- IGNORE_CASE = true;
- // Generate instance (non-static) parser members.
- STATIC = false;
- //DEBUG_PARSER = true;
- }
- PARSER_BEGIN(HtmlParser)
- package beauty.parsers.html;
- // TODO: need to support jsp markup and comments in script and style blocks.
- /**
-  * Parser class for this grammar. The hand-written members are a helper that
-  * rebuilds the raw text of a token range (used by the error-recovery branches
-  * of the productions) and a small command-line driver.
-  */
- public class HtmlParser {
-     final static String NL = System.getProperty("line.separator");
-     /**
-      * Rebuilds the source text covered by the tokens from first through cur
-      * (inclusive), including the images of any special tokens attached in
-      * front of each regular token.
-      *
-      * @param first first token of the range
-      * @param cur last token of the range
-      * @return the concatenated images; empty if either argument is null
-      */
-     private static String getTokenText(Token first, Token cur) {
-         StringBuilder sb = new StringBuilder();
-         if (first == null || cur == null) {
-             return sb.toString();
-         }
-         // Also stop at the end of the chain, so a range whose end is not
-         // reachable from first cannot dereference a null token.
-         for (Token t = first; t != null && t != cur.next; t = t.next) {
-             if (t.specialToken != null) {
-                 // specialToken refers to the closest preceding special token;
-                 // rewind to the earliest one, then walk forward in source order.
-                 Token tt = t.specialToken;
-                 while (tt.specialToken != null) {
-                     tt = tt.specialToken;
-                 }
-                 for (; tt != null; tt = tt.next) {
-                     sb.append(tt.image);
-                 }
-             }
-             sb.append(t.image);
-         }
-         return sb.toString();
-     }
-     /**
-      * Command-line driver: parses the file named by the first argument, runs
-      * the collector and scrubber visitors over the document, formats it and
-      * prints the result to standard output. Does nothing when no argument is
-      * given. Exits with status 0 on success and 1 if anything failed.
-      *
-      * @param args args[0] is the path of the file to parse
-      */
-     public static void main(String[] args) throws ParseException {
-         if (args.length == 0) {
-             return;
-         }
-         int status = 0;
-         java.io.Reader reader = null;
-         try {
-             reader = new java.io.FileReader(args[0]);
-             HtmlParser parser = new HtmlParser(reader);
-             HtmlDocument document = parser.HtmlDocument();
-             document.setLineSeparator("\n");
-             document.accept(new HtmlCollector());
-             document.accept(new HtmlScrubber(HtmlScrubber.DEFAULT_OPTIONS | HtmlScrubber.TRIM_SPACES));
-             HtmlFormatter formatter = new HtmlFormatter();
-             formatter.setRightMargin(1024);
-             formatter.setLineSeparator("\n");
-             formatter.setIndent(3);
-             document.accept(formatter);
-             System.out.println(formatter.toString());
-         }
-         catch (Exception e) {
-             e.printStackTrace();
-             // Report the failure to the caller instead of exiting with 0.
-             status = 1;
-         }
-         finally {
-             // The input file used to be left open; always release it.
-             if (reader != null) {
-                 try {
-                     reader.close();
-                 }
-                 catch (java.io.IOException ignored) {
-                     // Nothing useful can be done if closing the input fails.
-                 }
-             }
-         }
-         System.exit(status);
-     }
- }
- PARSER_END(HtmlParser)
- // Private helper expressions (marked with "#"), declared for every lexical state ("<*>").
- // They are building blocks for the token definitions below and are never returned as tokens themselves.
- <*> TOKEN :
- {
- <#ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
- | <#NUM_CHAR: ["0"-"9"] >
- | <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
- | <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
- // A letter followed by any number of identifier characters.
- | <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
- // Single- or double-quoted string that does not contain a line break.
- | <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
- | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
- // Single- or double-quoted string that may span lines.
- | <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
- | <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
- | <#NEWLINE: ( "\r\n" | "\r" | "\n" ) >
- | <#QUOTE: ( "'" | "\"" )>
- // Backslash-escaped openers of an EL expression or value binding.
- | <#EL_ESCAPE: ("\\${" | "\\#{") >
- | <#TEXT_IN_EL: (~["}", "'", "\""])+ >
- // Any text that does not contain "%>".
- | <#NO_JSP_TAG_END: ( ~["%"] | ("%" ~[">"]) )+ >
- // NOTE(review): the second alternative is already covered by ~[">"], so this is the same as (~[">"])+;
- // confirm whether excluding "/>" was intended.
- | <#NO_TAG_END: ( ~[">"] | ( "/" ~[">"]) )+ >
- }
- // Tokens recognized in ordinary document text (the DEFAULT lexical state).
- <DEFAULT> TOKEN :
- {
- // Optional trailing blanks/tabs followed by one line break.
- <EOL: ( " " | "\t" )* <NEWLINE> >
- // "<" or "<%@" plus any following whitespace; the tag name is then read in LexStartTag.
- | <TAG_START: "<" (<WHITESPACE>)* | "<%@" (<WHITESPACE>)* > : LexStartTag
- | <SCRIPTLET_TAG: "<%" > : LexScriptlet
- | <ENDTAG_START: "</" > : LexStartTag
- | <COMMENT_START: "<!--" | "<%--" > : LexComment
- | <DECL_START: "<!" > : LexDecl
- // A run of text on one line, up to the next "<" or line break.
- | <PCDATA: ( ~["<", "\r", "\n"] )+ >
- // Two or more consecutive EOLs.
- | <BLANK_LINES: <EOL> (<EOL>)+ >
- }
- // State entered after TAG_START or ENDTAG_START: classifies what follows as a tag name.
- <LexStartTag> TOKEN :
- {
- // SCRIPT and STYLE get their own token kinds so ScriptBlock()/StyleBlock() can be told apart from Tag().
- <TAG_SCRIPT: ("SCRIPT" | "script") > : LexInTag
- | <TAG_STYLE: ("STYLE" | "style" ) > : LexInTag
- | <TAG_NAME: <IDENTIFIER> > : LexInTag
- // Any single character that cannot start a tag name; the lexer goes back to DEFAULT.
- // Element() turns TAG_START followed by LST_ERROR into plain text.
- | <LST_ERROR: ~[]> : DEFAULT
- }
- // Whitespace inside a tag is not passed to the parser as a regular token; as a special token
- // it stays attached to the following token (see getTokenText()).
- <LexInTag> SPECIAL_TOKEN :
- {
- < (<WHITESPACE>)+ >
- }
- // Tokens inside a tag, after the tag name: attribute names, "=", and the tag terminators.
- <LexInTag> TOKEN :
- {
- <ATTR_NAME: <IDENTIFIER> >
- | <TAG_END: ">" > : DEFAULT
- | <TAG_PERCENTEND: "%>" > : DEFAULT
- | <TAG_SLASHEND: "/>" > : DEFAULT
- | <ATTR_EQ: "=" > : LexAttrVal
- // A "<" seen while still inside a tag. The lexical action below rewrites the matched
- // token into a TAG_END with image ">" and links a new TAG_START token with image "<"
- // (same position) directly after it, so the parser sees the current tag being closed
- // and a new one being opened. The lexer continues in LexStartTag.
- | <IMPLICIT_TAG_END: "<">
- {
- Token t = new Token();
- t.image = "<";
- t.kind = TAG_START;
- t.next = matchedToken.next;
- t.beginLine = matchedToken.beginLine;
- t.beginColumn = matchedToken.beginColumn;
- t.endLine = matchedToken.endLine;
- t.endColumn = matchedToken.endColumn;
- matchedToken.next = t;
- matchedToken.kind = TAG_END;
- matchedToken.image = ">";
- } : LexStartTag
- // Any other single character; the lexer stays in LexInTag.
- | <LIT_ERROR: ~[]>
- }
- // State entered after ATTR_EQ: the opening quote selects the state used for the value's content.
- // NOTE(review): only quote characters and whitespace are matched here, so an unquoted
- // attribute value has no matching token in this state — confirm whether that is intended.
- <LexAttrVal> TOKEN :
- {
- <SINGLE_QUOTE: "'"> : AttrValueBetweenSingleQuotesState
- | <DOUBLE_QUOTE: "\""> : AttrValueBetweenDoubleQuotesState
- }
- // Whitespace between "=" and the opening quote is kept as a special token.
- <LexAttrVal> SPECIAL_TOKEN :
- {
- < (<WHITESPACE>)+ >
- }
- // Content of a single-quoted attribute value.
- <AttrValueBetweenSingleQuotesState> TOKEN :
- {
- <ENDING_SINGLE_QUOTE: "'"> : LexInTag
- // Plain text: any character other than "$", "#" and "'"; a "$" or "#" that is not followed
- // by "{" or "'"; or an escaped "\${" / "\#{".
- | <UNPARSED_TEXT_NO_SINGLE_QUOTES:
- ( (~["$", "#", "'"]) | (["$", "#"] ~["{", "'"]) | <EL_ESCAPE> )+ >
- // A "$" or "#" directly before the closing quote; AttributeValue() keeps only that first character.
- | <DOLLAR_OR_HASH_SINGLE_QUOTE: ["$", "#"] "'" > : LexInTag
- }
- // Content of a double-quoted attribute value.
- <AttrValueBetweenDoubleQuotesState> TOKEN :
- {
- <ENDING_DOUBLE_QUOTE: "\""> : LexInTag
- // Plain text, as in the single-quote state, except that "<" is excluded as well
- // (presumably so that the "<%=...%>" and "<.../>" tokens shared with this state can match — confirm).
- | <UNPARSED_TEXT_NO_DOUBLE_QUOTES: ( (~["<", "$", "#", "\""]) | (["$", "#"] ~["{", "\""]) | <EL_ESCAPE> )+ > : AttrValueBetweenDoubleQuotesState
- // A "$" or "#" directly before the closing quote; AttributeValue() keeps only that first character.
- | <DOLLAR_OR_HASH_DOUBLE_QUOTE: ["$", "#"] "\"" > : LexInTag
- }
- // Embedded expressions that are recognized inside both kinds of quoted attribute value.
- // Each one is matched as a single token, delimiters included.
- <AttrValueBetweenSingleQuotesState, AttrValueBetweenDoubleQuotesState> TOKEN:
- {
- // "${" ... "}" whose body may contain quoted strings.
- <EL_EXPRESSION_IN_ATTRIBUTE: "${" (<QUOTED_STRING> | <TEXT_IN_EL>)* "}" >
- // "#{" ... "}" whose body may contain quoted strings.
- | <VALUE_BINDING_IN_ATTRIBUTE: "#{" (<QUOTED_STRING> | <TEXT_IN_EL>)* "}" >
- | <JSP_EXPRESSION_IN_ATTRIBUTE: "<%=" <NO_JSP_TAG_END> "%>" >
- | <JSP_TAG_IN_ATTRIBUTE: "<" <NO_TAG_END> "/>" >
- }
- // Unquoted attribute value: a run of characters other than ">", quotes and whitespace.
- // NOTE(review): no token definition visible in this file switches to LexInAttrVal, so
- // ATTR_VAL (referenced in AttributeValue()) looks unreachable — confirm.
- <LexInAttrVal> TOKEN :
- {
- <ATTR_VAL: ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexAttrVal
- }
- // Tokens inside a comment (entered on COMMENT_START).
- <LexComment> TOKEN :
- {
- // Accepted terminators: "--" optional spaces ">", the short form "->", and "--%>".
- < COMMENT_END: ("--" (" ")* ">" | "->" | "--%>" ) > : DEFAULT
- // A single dash that is not part of a terminator.
- | < DASH: "-" >
- | < COMMENT_EOL: <NEWLINE> >
- // A run of characters without line breaks, quotes or dashes; a quoted string on one line; or a lone quote.
- | < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
- | <QUOTED_STRING_NB>
- | <QUOTE> ) >
- }
- // State entered on SCRIPTLET_TAG ("<%").
- // NOTE(review): SCRIPTLET_END is not referenced by any production visible in this file;
- // ScriptletBlock() reads the scriptlet body through getScriptletBlock() instead — confirm it is still needed.
- <LexScriptlet> TOKEN :
- {
- <SCRIPTLET_END: (<WHITESPACE>)* "%>" > : DEFAULT
- }
- // Tokens inside a "<!" declaration (entered on DECL_START).
- <LexDecl> TOKEN :
- {
- // The declaration body; a quoted string on one line is matched as a unit, so it may contain ">".
- <DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
- | <DECL_END: ">" > : DEFAULT
- }
- // Terminators of script and style blocks. The LexScript and LexStyle states are entered
- // from parser actions in ScriptBlock() and StyleBlock() via token_source.SwitchTo().
- <LexScript> TOKEN : {
- <SCRIPT_END: ("</SCRIPT>" | "</script>") > : DEFAULT
- }
- <LexStyle> TOKEN : {
- <STYLE_END: ("</STYLE>" | "</style>") > : DEFAULT
- }
- // Coarse tokens for the body of script, style and scriptlet blocks; BlockContents() glues them back together.
- <LexScript, LexStyle, LexScriptlet> TOKEN :
- {
- <BLOCK_EOL: <NEWLINE> >
- // A "<" on its own.
- | <BLOCK_LBR: "<" >
- // A quoted string on one line, a lone quote, or a run of characters without line breaks, quotes, "<" or "%".
- | <BLOCK_WORD: ( <QUOTED_STRING_NB>
- | <QUOTE>
- | (~[ "\n", "\r", "'", "\"", "<", "%"])+ ) >
- }
- /**
-  * Start production: parses a sequence of elements up to the end of input.
-  *
-  * @return a document wrapping the parsed element sequence
-  */
- HtmlDocument HtmlDocument() :
- {
-     HtmlDocument.ElementSequence elements;
- }
- {
-     elements = ElementSequence()
-     <EOF>
-     {
-         return new HtmlDocument(elements);
-     }
- }
- /**
-  * Parses zero or more elements and collects them in document order.
-  *
-  * @return the (possibly empty) sequence of parsed elements
-  */
- HtmlDocument.ElementSequence ElementSequence() :
- {
-     HtmlDocument.HtmlElement element;
-     HtmlDocument.ElementSequence sequence = new HtmlDocument.ElementSequence();
- }
- {
-     (
-         element = Element()
-         { sequence.addElement(element); }
-     )*
-     { return sequence; }
- }
- /**
-  * Parses one document element: a tag, end tag, comment, declaration,
-  * scriptlet, script or style block, a piece of text, or line-break tokens.
-  * Several alternatives begin with TAG_START, which is why explicit
-  * lookahead is used to pick between them; the order of the alternatives
-  * matters.
-  */
- HtmlDocument.HtmlElement Element() :
- {
- HtmlDocument.HtmlElement e;
- Token text;
- }
- {
- (
- LOOKAHEAD(2)
- e = Tag() { return e; }
- | e = EndTag() { return e; }
- | e = CommentTag() { return e; }
- | e = DeclTag() { return e; }
- | LOOKAHEAD(3)
- e = ScriptletBlock() { return e; }
- | LOOKAHEAD(2)
- e = ScriptBlock() { return e; }
- | LOOKAHEAD(2)
- e = StyleBlock() { return e; }
- | LOOKAHEAD(2)
- // A "<" that does not start a tag name: emit it, plus the offending character, as text.
- <TAG_START> text=<LST_ERROR>
- { return new HtmlDocument.Text("<" + text.image); }
- | text = <PCDATA> { return new HtmlDocument.Text(text.image); }
- | <BLANK_LINES> { return new HtmlDocument.BlankLines(); }
- | <EOL> { return new HtmlDocument.Newline(); }
- )
- }
- /**
-  * Parses one attribute: a name, optionally followed by "=" and a value.
-  *
-  * @return a name-only attribute when no value is present, otherwise an
-  *         attribute carrying the text returned by AttributeValue()
-  */
- HtmlDocument.Attribute Attribute() :
- {
-     Token name;
-     String value = null;
- }
- {
-     name=<ATTR_NAME> [ <ATTR_EQ> value=AttributeValue() ]
-     {
-         if (value == null) {
-             return new HtmlDocument.Attribute(name.image);
-         }
-         return new HtmlDocument.Attribute(name.image, value);
-     }
- }
- /**
-  * Parses a quoted attribute value and returns its content without the
-  * surrounding quotes. A "$" or "#" that directly precedes the closing quote
-  * is lexed together with that quote, so only its first character is kept.
-  *
-  * A ParseException raised here is deliberately not caught: it propagates to
-  * the enclosing tag production, whose recovery branch handles it. (Catching
-  * it here made control fall through to the generated "Missing return
-  * statement" Error instead.)
-  *
-  * @return the text of the attribute value
-  */
- String AttributeValue() :
- {
-     StringBuilder content = new StringBuilder();
-     Token t = null;
- }
- {
-     (
-         ( <DOUBLE_QUOTE>
-             (
-                 t = QuoteIndependentAttributeValueContent() { content.append(t.image); }
-             )*
-             ( <ENDING_DOUBLE_QUOTE>
-             | t = <DOLLAR_OR_HASH_DOUBLE_QUOTE> { content.append(t.image.substring(0, 1)); }
-             )
-             // NOTE(review): ATTR_VAL is only defined in the LexInAttrVal state, which
-             // nothing in this file appears to enter; kept for compatibility.
-             [ t=<ATTR_VAL> { content.append( t.image ); } ]
-         )
-     |
-         ( <SINGLE_QUOTE>
-             (
-                 ( t = <UNPARSED_TEXT_NO_SINGLE_QUOTES>
-                 | t = QuoteIndependentAttributeValueContent()
-                 ) { content.append(t.image); }
-             )*
-             ( <ENDING_SINGLE_QUOTE>
-             | t = <DOLLAR_OR_HASH_SINGLE_QUOTE> { content.append(t.image.substring(0, 1)); }
-             )
-         )
-     )
-     {
-         return content.toString();
-     }
- }
- /**
-  * Partial content of an attribute value that can contain all quotes.
-  * This groups EL expressions, value bindings, JSP expressions, JSP tags and
-  * the plain-text token of double-quoted values.
-  *
-  * A ParseException is not caught here, so that it reaches the recovery
-  * branch of the enclosing tag production rather than being printed and
-  * replaced by the generated "Missing return statement" Error.
-  *
-  * @return the matched token
-  */
- Token QuoteIndependentAttributeValueContent() :
- { Token t; }
- {
-     (
-         LOOKAHEAD(2)
-         t = <EL_EXPRESSION_IN_ATTRIBUTE>
-     |   LOOKAHEAD(2) t = <VALUE_BINDING_IN_ATTRIBUTE>
-     |   LOOKAHEAD(2) t = <JSP_EXPRESSION_IN_ATTRIBUTE>
-     |   LOOKAHEAD(2) t = <JSP_TAG_IN_ATTRIBUTE>
-     |   LOOKAHEAD(2) t = <UNPARSED_TEXT_NO_DOUBLE_QUOTES>
-     )
-     { return t; }
- }
- /**
-  * Parses zero or more attributes.
-  *
-  * @return the attributes in the order they were parsed (possibly empty)
-  */
- HtmlDocument.AttributeList AttributeList() :
- {
-     HtmlDocument.Attribute attribute;
-     HtmlDocument.AttributeList attributes = new HtmlDocument.AttributeList();
- }
- {
-     (
-         attribute = Attribute()
-         { attributes.addAttribute(attribute); }
-     )*
-     {
-         return attributes;
-     }
- }
- /**
-  * Parses a start tag: TAG_START, a tag name, attributes and one of the three
-  * tag terminators. The tag is flagged as a JSP tag when it was opened with
-  * "<%" or its name contains ":" after the first character, and as empty
-  * when it ends with "/>".
-  *
-  * On a ParseException the message is printed, the lexer is reset to DEFAULT
-  * and the text from the first token of the tag through the next token is
-  * returned as plain text.
-  *
-  * NOTE(review): the return inside "finally" also discards any other
-  * exception or error raised in the try/catch bodies, in which case the
-  * result is null. Kept as is to preserve behavior; worth revisiting.
-  */
- HtmlDocument.HtmlElement Tag() :
- {
-     Token open = null;
-     Token name, close;
-     HtmlDocument.AttributeList attributes = null;
-     Token rangeStart = getToken(1);
-     HtmlDocument.HtmlElement result = null;
- }
- {
-     try {
-         open=<TAG_START> name=<TAG_NAME> attributes=AttributeList()
-         ( close=<TAG_END> | close=<TAG_PERCENTEND> | close=<TAG_SLASHEND> )
-         {
-             HtmlDocument.Tag tag = new HtmlDocument.Tag(open.image, name.image, attributes, close.image);
-             boolean jsp = open.image.startsWith("<%") || name.image.indexOf(":") > 0;
-             if (jsp) {
-                 tag.setIsJspTag(true);
-             }
-             if (close.kind == TAG_SLASHEND) {
-                 tag.setEmpty(true);
-             }
-             result = tag;
-         }
-     }
-     catch (ParseException ex) {
-         System.out.println(ex.getMessage());
-         token_source.SwitchTo(DEFAULT);
-         result = new HtmlDocument.Text(getTokenText(rangeStart, getNextToken()));
-     }
-     finally {
-         return result;
-     }
- }
- /**
-  * Collects the body of a script or style block. Word and "<" tokens are
-  * accumulated into one Text element per line; every line break becomes a
-  * Newline element.
-  *
-  * @return the block body as a sequence of Text and Newline elements
-  */
- HtmlDocument.ElementSequence BlockContents() :
- {
-     Token piece;
-     StringBuilder pending = new StringBuilder();
-     HtmlDocument.ElementSequence contents = new HtmlDocument.ElementSequence();
- }
- {
-     (
-         <BLOCK_EOL>
-         {
-             // Flush the text gathered for this line before recording the break.
-             if (pending.length() != 0) {
-                 contents.addElement(new HtmlDocument.Text(pending.toString()));
-                 pending.setLength(0);
-             }
-             contents.addElement(new HtmlDocument.Newline());
-         }
-     |   piece=<BLOCK_WORD> { pending.append(piece.image); }
-     |   piece=<BLOCK_LBR> { pending.append(piece.image); }
-     )*
-     {
-         if (pending.length() != 0) {
-             contents.addElement(new HtmlDocument.Text(pending.toString()));
-         }
-         // danson: deliberately no trailing Newline element here; adding one
-         // caused an extra blank line to be inserted in script and style blocks.
-         return contents;
-     }
- }
- /**
-  * Parses a script element: the opening tag with its attributes, the body,
-  * and the closing tag. The result is a TagBlock named "SCRIPT" whatever the
-  * case used in the source. On a ParseException the lexer is reset to
-  * DEFAULT and the text from the first token through the next token is
-  * returned as plain text.
-  */
- HtmlDocument.HtmlElement ScriptBlock() :
- {
- HtmlDocument.AttributeList alist;
- HtmlDocument.ElementSequence e;
- // First token of this production, used as the start of the recovered text.
- Token firstToken = getToken(1);
- }
- {
- try {
- <TAG_START> <TAG_SCRIPT> alist=AttributeList() <TAG_END>
- {
- // TAG_END sends the lexer back to DEFAULT; override that so the body is
- // tokenized with the block tokens until SCRIPT_END.
- token_source.SwitchTo(LexScript);
- }
- e=BlockContents()
- <SCRIPT_END>
- {
- return new HtmlDocument.TagBlock("SCRIPT", alist, e);
- }
- }
- catch (ParseException ex) {
- token_source.SwitchTo(DEFAULT);
- String s = getTokenText(firstToken, getNextToken());
- return new HtmlDocument.Text(s);
- }
- }
- /**
-  * Parses a style element: the opening tag with its attributes, the body,
-  * and the closing tag. The result is a TagBlock named "STYLE" whatever the
-  * case used in the source. On a ParseException the lexer is reset to
-  * DEFAULT and the text from the first token through the next token is
-  * returned as plain text.
-  */
- HtmlDocument.HtmlElement StyleBlock() :
- {
- HtmlDocument.AttributeList alist;
- HtmlDocument.ElementSequence e;
- // First token of this production, used as the start of the recovered text.
- Token firstToken = getToken(1);
- }
- {
- try {
- <TAG_START> <TAG_STYLE> alist=AttributeList() <TAG_END>
- {
- // TAG_END sends the lexer back to DEFAULT; override that so the body is
- // tokenized with the block tokens until STYLE_END.
- token_source.SwitchTo(LexStyle);
- }
- e=BlockContents()
- <STYLE_END>
- {
- return new HtmlDocument.TagBlock("STYLE", alist, e);
- }
- }
- catch (ParseException ex) {
- token_source.SwitchTo(DEFAULT);
- String s = getTokenText(firstToken, getNextToken());
- return new HtmlDocument.Text(s);
- }
- }
- /**
-  * Parses an end tag: "</", a tag name and ">". The tag is flagged as a JSP
-  * tag when its name contains ":" after the first character. On a
-  * ParseException the lexer is reset to DEFAULT and the text from the first
-  * token through the next token is returned as plain text.
-  */
- HtmlDocument.HtmlElement EndTag() :
- {
-     Token name;
-     Token rangeStart = getToken(1);
- }
- {
-     try {
-         <ENDTAG_START> name=<TAG_NAME> <TAG_END>
-         {
-             HtmlDocument.EndTag endTag = new HtmlDocument.EndTag(name.image);
-             boolean hasPrefix = name.image.indexOf(":") > 0;
-             if (hasPrefix) {
-                 endTag.setIsJspTag(true);
-             }
-             return endTag;
-         }
-     }
-     catch (ParseException ex) {
-         token_source.SwitchTo(DEFAULT);
-         return new HtmlDocument.Text(getTokenText(rangeStart, getNextToken()));
-     }
- }
- /**
-  * Parses a comment. The returned text is the opening delimiter, the body
-  * (with every line break replaced by the platform line separator) and the
-  * closing delimiter. A comment that runs to the end of input is accepted;
-  * it simply has no closing delimiter.
-  */
- HtmlDocument.Comment CommentTag() :
- {
-     Token piece, open, close = null;
-     StringBuffer body = new StringBuffer();
- }
- {
-     open=<COMMENT_START>
-     (
-         piece=<DASH> { body.append(piece.image); }
-     |   <COMMENT_EOL> { body.append(NL); }
-     |   piece=<COMMENT_WORD> { body.append(piece.image); }
-     )*
-     ( <EOF> | close=<COMMENT_END> )
-     {
-         String tail = (close != null) ? close.image : "";
-         return new HtmlDocument.Comment(open.image + body.toString() + tail);
-     }
- }
- /**
-  * Parses a "<!" declaration and returns its body, i.e. the text between
-  * "<!" and ">", as a Comment element. The delimiters themselves are not
-  * included in the text passed to the Comment.
-  */
- HtmlDocument.Comment DeclTag() :
- {
-     Token body;
- }
- {
-     <DECL_START>
-     body=<DECL_ANY>
-     <DECL_END>
-     { return new HtmlDocument.Comment(body.image); }
- }
- /**
-  * Parses a JSP scriptlet. After the opening "<%" token the body is read
-  * verbatim by getScriptletBlock(), which also consumes the closing "%>",
-  * and is wrapped as a single Text element inside a JspScriptletBlock.
-  * On a ParseException the lexer is reset to DEFAULT and the text from the
-  * first token through the next token is returned as plain text.
-  */
- HtmlDocument.HtmlElement ScriptletBlock() :
- {
- HtmlDocument.ElementSequence e;
- // First token of this production, used as the start of the recovered text.
- Token firstToken = getToken(1);
- }
- {
- try {
- <SCRIPTLET_TAG>
- {
- // Reads straight from the character stream, bypassing the token manager.
- String block = getScriptletBlock();
- e = new HtmlDocument.ElementSequence();
- e.addElement(new HtmlDocument.Text(block));
- // SCRIPTLET_TAG left the lexer in LexScriptlet; resume normal tokenizing.
- token_source.SwitchTo(DEFAULT);
- return new HtmlDocument.JspScriptletBlock(e);
- }
- }
- catch (ParseException ex) {
- // ex.printStackTrace();
- token_source.SwitchTo(DEFAULT);
- String s = getTokenText(firstToken, getNextToken());
- return new HtmlDocument.Text(s);
- }
- }
- /*
-  * Reads the body of a JSP scriptlet verbatim, straight from the character
-  * stream. BlockContents is not used for this because the matching rules for
-  * a word trump the matching rules for the %> end tag, and the contents of a
-  * scriptlet do not need to be tokenized anyway.
-  *
-  * Returns the contents of the scriptlet block without the <% or %>. The %>
-  * is consumed. If the input ends before a %> is found, everything read so
-  * far is returned.
-  */
- JAVACODE
- String getScriptletBlock() {
-     StringBuilder sb = new StringBuilder();
-     try {
-         while (true) {
-             sb.append(jj_input_stream.readChar());
-             int len = sb.length();
-             // ">= 2" (not "> 2"): an empty scriptlet, "<%%>", must stop at its own
-             // terminator instead of reading on to a later "%>" or the end of input.
-             if (len >= 2 && sb.charAt(len - 2) == '%' && sb.charAt(len - 1) == '>') {
-                 // trim the %> from the end of the string
-                 sb.setLength(len - 2);
-                 return sb.toString();
-             }
-         }
-     }
-     catch (java.io.IOException endOfInput) {
-         // The character stream reports end of input with an IOException; an
-         // unterminated scriptlet is returned as far as it was read.
-     }
-     return sb.toString();
- }