/plugins/Beauty/tags/beauty-0.6.1/src/beauty/parsers/html/HtmlParser.jj

# · Unknown · 619 lines · 563 code · 56 blank · 0 comment · 0 complexity · 913d6a99fe515a910e1fd9d78d2669fe MD5 · raw file

  1. /*
  2. * HtmlParser.jj -- JavaCC grammar for HTML.
  3. * Copyright (C) 1999 Quiotix Corporation.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License, version 2, as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  13. * for more details.
  14. */
  15. /*
  16. * JavaCC grammar file for HTML.
  17. *
  18. * Author: Brian Goetz, Quiotix
  19. * Version: 1.03
  20. * Revision: $Id: HtmlParser.jj 18072 2010-06-15 04:41:57Z daleanson $
  21. *
  22. * This grammar parses an HTML document and produces a (flat) parse "tree"
  23. * representing the document. It preserves almost all information in the
  24. * source document, including carriage control and spacing (except inside
  25. * of tags.) See the HtmlDocument and HtmlDocument.* classes for a
  26. * description of the parse tree. The parse tree supports traversal using
  27. * the commonly used "Visitor" pattern. The HtmlDumper class is a visitor
  28. * which dumps out the tree to an output stream.
  29. *
  30. * It does not require begin tags to be matched with end tags, or validate
  31. * the names or contents of the tags (this can easily be done post-parsing;
  32. * see the HtmlCollector class (which matches begin tags with end tags)
  33. * for an example.)
  34. *
  35. * Notable edge cases include:
  36. * - Quoted string processing. Quoted strings are matched inside of comments, and
  37. * as tag attribute values. Quoted strings are matched in normal text only
  38. * to the extent that they do not span line breaks.
  39. *
  40. * Please direct comments, questions, gripes or praise to
  41. * html-parser@quiotix.com. If you like it/hate it/use it, please let us know!
  42. */
// Generator options: IGNORE_CASE makes every token match case-insensitively,
// STATIC = false makes the generated parser use instance members rather than
// static ones. DEBUG_PARSER is left here, disabled, for tracing.
options {
IGNORE_CASE = true;
STATIC = false;
//DEBUG_PARSER = true;
}
  48. PARSER_BEGIN(HtmlParser)
  49. package beauty.parsers.html;
  50. // TODO: need to support jsp markup and comments in script and style blocks.
  51. public class HtmlParser {
  52. final static String NL = System.getProperty("line.separator");
  53. private static String getTokenText(Token first, Token cur) {
  54. Token t;
  55. StringBuffer sb = new StringBuffer();
  56. for (t=first; t != cur.next; t = t.next) {
  57. if (t.specialToken != null) {
  58. Token tt=t.specialToken;
  59. while (tt.specialToken != null)
  60. tt = tt.specialToken;
  61. for (; tt != null; tt = tt.next)
  62. sb.append(tt.image);
  63. }
  64. sb.append(t.image);
  65. }
  66. return sb.toString();
  67. }
  68. public static void main(String[] args) throws ParseException {
  69. if (args.length == 0) {
  70. return;
  71. }
  72. try {
  73. String filename = args[0];
  74. HtmlParser parser = new HtmlParser(new java.io.FileReader(filename));
  75. HtmlDocument document = parser.HtmlDocument();
  76. //doc.accept(new HtmlDumper(System.out));
  77. document.setLineSeparator("\n");
  78. document.accept(new HtmlCollector());
  79. document.accept(new HtmlScrubber(HtmlScrubber.DEFAULT_OPTIONS | HtmlScrubber.TRIM_SPACES));
  80. HtmlFormatter formatter = new HtmlFormatter();
  81. formatter.setRightMargin(1024);
  82. formatter.setLineSeparator("\n");
  83. formatter.setIndent(3);
  84. document.accept(formatter);
  85. System.out.println(formatter.toString());
  86. }
  87. catch(Exception e) {
  88. e.printStackTrace();
  89. }
  90. System.exit(0);
  91. }
  92. }
  93. PARSER_END(HtmlParser)
// Building-block regular expressions shared by every lexical state (<*>).
// The leading '#' marks each one as private: it can be referenced from other
// token definitions but is never itself returned as a token.
<*> TOKEN :
{
<#ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
| <#NUM_CHAR: ["0"-"9"] >
| <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
| <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
// An identifier starts with a letter; ':' is allowed later on, so prefixed
// names such as "c:forEach" match as a single identifier.
| <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
// "NB" variant: a quoted string that may not contain a line break.
| <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
| ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
// Quoted string that may span line breaks.
| <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
| <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
| <#NEWLINE: ( "\r\n" | "\r" | "\n" ) >
| <#QUOTE: ( "'" | "\"" )>
// Backslash-escaped "${" or "#{".
| <#EL_ESCAPE: ("\\${" | "\\#{") >
// Text inside an EL expression: anything except '}' and the quote characters.
| <#TEXT_IN_EL: (~["}", "'", "\""])+ >
// One or more of: any character other than '%', or '%' followed by a non-'>'.
| <#NO_JSP_TAG_END: ( ~["%"] | ("%" ~[">"]) )+ >
// One or more of: any character other than '>', or '/' followed by a non-'>'.
| <#NO_TAG_END: ( ~[">"] | ( "/" ~[">"]) )+ >
}
// Tokens recognised in ordinary document text. Each markup opener switches
// the token manager into the lexical state that tokenizes its contents.
<DEFAULT> TOKEN :
{
// Optional trailing blanks/tabs followed by one line terminator.
<EOL: ( " " | "\t" )* <NEWLINE> >
// "<" or the JSP directive opener "<%@"; trailing whitespace is part of the token.
| <TAG_START: "<" (<WHITESPACE>)* | "<%@" (<WHITESPACE>)* > : LexStartTag
| <SCRIPTLET_TAG: "<%" > : LexScriptlet
| <ENDTAG_START: "</" > : LexStartTag
// HTML comment or JSP comment opener.
| <COMMENT_START: "<!--" | "<%--" > : LexComment
| <DECL_START: "<!" > : LexDecl
// A run of text that contains neither '<' nor a line terminator.
| <PCDATA: ( ~["<", "\r", "\n"] )+ >
// Two or more consecutive EOLs.
| <BLANK_LINES: <EOL> (<EOL>)+ >
}
// State entered right after TAG_START or ENDTAG_START: expects the tag name.
// SCRIPT and STYLE are listed before TAG_NAME so they get their own token
// kinds (both alternatives of each are equivalent under IGNORE_CASE).
<LexStartTag> TOKEN :
{
<TAG_SCRIPT: ("SCRIPT" | "script") > : LexInTag
| <TAG_STYLE: ("STYLE" | "style" ) > : LexInTag
| <TAG_NAME: <IDENTIFIER> > : LexInTag
// Any other single character: not a tag after all, go back to DEFAULT.
| <LST_ERROR: ~[]> : DEFAULT
}
// Inside a tag, after its name. Whitespace is kept as special tokens so it is
// not seen by the grammar but stays attached to the following token.
<LexInTag> SPECIAL_TOKEN :
{
< (<WHITESPACE>)+ >
}
<LexInTag> TOKEN :
{
<ATTR_NAME: <IDENTIFIER> >
| <TAG_END: ">" > : DEFAULT
| <TAG_PERCENTEND: "%>" > : DEFAULT
| <TAG_SLASHEND: "/>" > : DEFAULT
| <ATTR_EQ: "=" > : LexAttrVal
// A '<' inside an unclosed tag is treated as if the tag had been closed:
// the matched token is rewritten into a TAG_END with image ">", and a
// synthetic TAG_START token with image "<" (same position) is chained after
// it, so the grammar sees "> <". The state then moves to LexStartTag.
| <IMPLICIT_TAG_END: "<">
{
Token t = new Token();
t.image = "<";
t.kind = TAG_START;
t.next = matchedToken.next;
t.beginLine = matchedToken.beginLine;
t.beginColumn = matchedToken.beginColumn;
t.endLine = matchedToken.endLine;
t.endColumn = matchedToken.endColumn;
matchedToken.next = t;
matchedToken.kind = TAG_END;
matchedToken.image = ">";
} : LexStartTag
// Any other single character; the state does not change.
| <LIT_ERROR: ~[]>
}
// After '=' in a tag: only an opening quote (or skipped whitespace) is
// recognised here, each quote kind selecting its own value state.
<LexAttrVal> TOKEN :
{
<SINGLE_QUOTE: "'"> : AttrValueBetweenSingleQuotesState
| <DOUBLE_QUOTE: "\""> : AttrValueBetweenDoubleQuotesState
}
<LexAttrVal> SPECIAL_TOKEN :
{
< (<WHITESPACE>)+ >
}
// Contents of a single-quoted attribute value.
<AttrValueBetweenSingleQuotesState> TOKEN :
{
<ENDING_SINGLE_QUOTE: "'"> : LexInTag
// Plain text: no quote, and '$'/'#' only when not starting "${" / "#{"
// (or when escaped with a backslash).
| <UNPARSED_TEXT_NO_SINGLE_QUOTES:
( (~["$", "#", "'"]) | (["$", "#"] ~["{", "'"]) | <EL_ESCAPE> )+ >
// A '$' or '#' immediately followed by the closing quote.
| <DOLLAR_OR_HASH_SINGLE_QUOTE: ["$", "#"] "'" > : LexInTag
}
// Contents of a double-quoted attribute value; unlike the single-quoted
// form, plain text here also stops at '<'.
<AttrValueBetweenDoubleQuotesState> TOKEN :
{
<ENDING_DOUBLE_QUOTE: "\""> : LexInTag
| <UNPARSED_TEXT_NO_DOUBLE_QUOTES: ( (~["<", "$", "#", "\""]) | (["$", "#"] ~["{", "\""]) | <EL_ESCAPE> )+ > : AttrValueBetweenDoubleQuotesState
| <DOLLAR_OR_HASH_DOUBLE_QUOTE: ["$", "#"] "\"" > : LexInTag
}
// Embedded expressions that may appear inside either kind of quotes.
<AttrValueBetweenSingleQuotesState, AttrValueBetweenDoubleQuotesState> TOKEN:
{
<EL_EXPRESSION_IN_ATTRIBUTE: "${" (<QUOTED_STRING> | <TEXT_IN_EL>)* "}" >
| <VALUE_BINDING_IN_ATTRIBUTE: "#{" (<QUOTED_STRING> | <TEXT_IN_EL>)* "}" >
| <JSP_EXPRESSION_IN_ATTRIBUTE: "<%=" <NO_JSP_TAG_END> "%>" >
| <JSP_TAG_IN_ATTRIBUTE: "<" <NO_TAG_END> "/>" >
}
// Unquoted attribute value.
// NOTE(review): no token in the visible grammar switches into LexInAttrVal,
// so ATTR_VAL looks unreachable from here -- confirm before relying on it.
<LexInAttrVal> TOKEN :
{
<ATTR_VAL: ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexAttrVal
}
// Inside an HTML ("<!--") or JSP ("<%--") comment.
<LexComment> TOKEN :
{
// Accepted terminators: "--" plus optional spaces plus ">", "->", or "--%>".
< COMMENT_END: ("--" (" ")* ">" | "->" | "--%>" ) > : DEFAULT
// A dash that is not part of a terminator.
| < DASH: "-" >
| < COMMENT_EOL: <NEWLINE> >
// A run of ordinary characters, a one-line quoted string, or a lone quote.
| < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
| <QUOTED_STRING_NB>
| <QUOTE> ) >
}
// End of a JSP scriptlet, with any whitespace in front of "%>" included.
<LexScriptlet> TOKEN :
{
<SCRIPTLET_END: (<WHITESPACE>)* "%>" > : DEFAULT
}
// Inside a "<!" declaration: everything up to the closing '>'.
<LexDecl> TOKEN :
{
<DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
| <DECL_END: ">" > : DEFAULT
}
// Closing tag of a script block; returns to DEFAULT. The grammar enters
// LexScript explicitly via token_source.SwitchTo in ScriptBlock().
<LexScript> TOKEN : {
<SCRIPT_END: ("</SCRIPT>" | "</script>") > : DEFAULT
}
// Closing tag of a style block; returns to DEFAULT. The grammar enters
// LexStyle explicitly via token_source.SwitchTo in StyleBlock().
<LexStyle> TOKEN : {
<STYLE_END: ("</STYLE>" | "</style>") > : DEFAULT
}
  214. <LexScript, LexStyle, LexScriptlet> TOKEN :
  215. {
  216. <BLOCK_EOL: <NEWLINE> >
  217. | <BLOCK_LBR: "<" >
  218. | <BLOCK_WORD: ( <QUOTED_STRING_NB>
  219. | <QUOTE>
  220. | (~[ "\n", "\r", "'", "\"", "<", "%"])+ ) >
  221. }
  222. HtmlDocument HtmlDocument() :
  223. {
  224. HtmlDocument.ElementSequence s;
  225. }
  226. {
  227. s=ElementSequence() <EOF>
  228. { return new HtmlDocument(s); }
  229. }
  230. HtmlDocument.ElementSequence ElementSequence() :
  231. {
  232. HtmlDocument.ElementSequence s = new HtmlDocument.ElementSequence();
  233. HtmlDocument.HtmlElement h;
  234. }
  235. {
  236. ( h=Element() { s.addElement(h); } ) *
  237. {
  238. return s;
  239. }
  240. }
  241. HtmlDocument.HtmlElement Element() :
  242. {
  243. HtmlDocument.HtmlElement e;
  244. Token text;
  245. }
  246. {
  247. (
  248. LOOKAHEAD(2)
  249. e = Tag() { return e; }
  250. | e = EndTag() { return e; }
  251. | e = CommentTag() { return e; }
  252. | e = DeclTag() { return e; }
  253. | LOOKAHEAD(3)
  254. e = ScriptletBlock() { return e; }
  255. | LOOKAHEAD(2)
  256. e = ScriptBlock() { return e; }
  257. | LOOKAHEAD(2)
  258. e = StyleBlock() { return e; }
  259. | LOOKAHEAD(2)
  260. <TAG_START> text=<LST_ERROR>
  261. { return new HtmlDocument.Text("<" + text.image); }
  262. | text = <PCDATA> { return new HtmlDocument.Text(text.image); }
  263. | <BLANK_LINES> { return new HtmlDocument.BlankLines(); }
  264. | <EOL> { return new HtmlDocument.Newline(); }
  265. )
  266. }
  267. HtmlDocument.Attribute Attribute() :
  268. {
  269. HtmlDocument.Attribute a;
  270. Token t;
  271. String value=null;
  272. }
  273. {
  274. //t1=<ATTR_NAME> [ <ATTR_EQ> t2=<ATTR_VAL> ]
  275. t=<ATTR_NAME> [ <ATTR_EQ> value=AttributeValue() ]
  276. { if (value == null)
  277. return new HtmlDocument.Attribute(t.image);
  278. else
  279. return new HtmlDocument.Attribute(t.image, value);
  280. }
  281. }
  282. String AttributeValue() :
  283. {
  284. StringBuffer content = new StringBuffer();
  285. Token t = null;
  286. HtmlDocument.HtmlElement inner_tag = null;
  287. }
  288. {
  289. try {
  290. (
  291. ( <DOUBLE_QUOTE>
  292. (
  293. (
  294. t = QuoteIndependentAttributeValueContent()
  295. )
  296. {
  297. if (t != null) {
  298. content.append(t.image);
  299. }
  300. else if (inner_tag != null) {
  301. content.append(inner_tag.toString());
  302. }
  303. }
  304. )*
  305. ( <ENDING_DOUBLE_QUOTE>
  306. | t = <DOLLAR_OR_HASH_DOUBLE_QUOTE> { content.append(t.image.substring(0, 1)); }
  307. )
  308. [ t=<ATTR_VAL> { content.append( t.image ); } ]
  309. )
  310. |
  311. ( <SINGLE_QUOTE>
  312. (
  313. ( t = <UNPARSED_TEXT_NO_SINGLE_QUOTES>
  314. | t = QuoteIndependentAttributeValueContent()
  315. ) { content.append(t.image); }
  316. )*
  317. ( <ENDING_SINGLE_QUOTE>
  318. | t = <DOLLAR_OR_HASH_SINGLE_QUOTE> { content.append(t.image.substring(0, 1)); }
  319. )
  320. )
  321. )
  322. {
  323. return content.toString();
  324. }
  325. }
  326. catch(Exception e) {
  327. e.printStackTrace();
  328. }
  329. }
  330. /**
  331. * Partial content of an attribute value that can contain all quotes.
  332. * This groups EL expressions, value bindings, and JSP expressions.
  333. */
  334. Token QuoteIndependentAttributeValueContent() :
  335. { Token t; }
  336. {
  337. try {
  338. (
  339. LOOKAHEAD(2)
  340. t = <EL_EXPRESSION_IN_ATTRIBUTE>
  341. | LOOKAHEAD(2) t = <VALUE_BINDING_IN_ATTRIBUTE>
  342. | LOOKAHEAD(2) t = <JSP_EXPRESSION_IN_ATTRIBUTE>
  343. | LOOKAHEAD(2) t = <JSP_TAG_IN_ATTRIBUTE>
  344. | LOOKAHEAD(2) t = <UNPARSED_TEXT_NO_DOUBLE_QUOTES>
  345. )
  346. { return t; }
  347. }
  348. catch(Exception e) {
  349. e.printStackTrace();
  350. }
  351. }
  352. HtmlDocument.AttributeList AttributeList() :
  353. {
  354. HtmlDocument.AttributeList alist = new HtmlDocument.AttributeList();
  355. HtmlDocument.Attribute a;
  356. }
  357. {
  358. ( a=Attribute() { alist.addAttribute(a); } )*
  359. { return alist; }
  360. }
  361. HtmlDocument.HtmlElement Tag() :
  362. {
  363. Token t, et;
  364. HtmlDocument.AttributeList alist = null;
  365. Token firstToken = getToken(1);
  366. HtmlDocument.HtmlElement rtn_tag = null;
  367. Token st = null;
  368. String tag_name;
  369. String tag_start;
  370. }
  371. {
  372. try {
  373. st=<TAG_START> t=<TAG_NAME> alist=AttributeList() ( et=<TAG_END> | et=<TAG_PERCENTEND> | et=<TAG_SLASHEND> )
  374. {
  375. HtmlDocument.Tag tag = new HtmlDocument.Tag(st.image, t.image, alist, et.image);
  376. if (st.image.startsWith("<%") || t.image.indexOf(":") > 0) {
  377. tag.setIsJspTag(true);
  378. }
  379. if (et.kind == TAG_SLASHEND) {
  380. tag.setEmpty(true);
  381. }
  382. rtn_tag = tag;
  383. }
  384. }
  385. catch (ParseException ex) {
  386. System.out.println(ex.getMessage());
  387. token_source.SwitchTo(DEFAULT);
  388. String s = getTokenText(firstToken, getNextToken());
  389. HtmlDocument.Text tag = new HtmlDocument.Text(s);
  390. rtn_tag = tag;
  391. }
  392. finally {
  393. return rtn_tag;
  394. }
  395. }
  396. HtmlDocument.ElementSequence BlockContents() : {
  397. Token t;
  398. StringBuffer s = new StringBuffer();
  399. HtmlDocument.ElementSequence e = new HtmlDocument.ElementSequence();
  400. }
  401. {
  402. ( <BLOCK_EOL> {
  403. if (s.length() > 0) {
  404. e.addElement(new HtmlDocument.Text(s.toString()));
  405. s.setLength(0);
  406. };
  407. e.addElement(new HtmlDocument.Newline());
  408. }
  409. | t=<BLOCK_WORD> { s.append(t.image); }
  410. | t=<BLOCK_LBR> { s.append(t.image); }
  411. )*
  412. {
  413. if (s.length() > 0)
  414. e.addElement(new HtmlDocument.Text(s.toString()));
  415. // danson, removed next line, it causes an extra blank line to be inserted
  416. // in script and style blocks
  417. //e.addElement(new HtmlDocument.Newline());
  418. return e;
  419. }
  420. }
  421. HtmlDocument.HtmlElement ScriptBlock() :
  422. {
  423. HtmlDocument.AttributeList alist;
  424. HtmlDocument.ElementSequence e;
  425. Token firstToken = getToken(1);
  426. }
  427. {
  428. try {
  429. <TAG_START> <TAG_SCRIPT> alist=AttributeList() <TAG_END>
  430. {
  431. token_source.SwitchTo(LexScript);
  432. }
  433. e=BlockContents()
  434. <SCRIPT_END>
  435. {
  436. return new HtmlDocument.TagBlock("SCRIPT", alist, e);
  437. }
  438. }
  439. catch (ParseException ex) {
  440. token_source.SwitchTo(DEFAULT);
  441. String s = getTokenText(firstToken, getNextToken());
  442. return new HtmlDocument.Text(s);
  443. }
  444. }
  445. HtmlDocument.HtmlElement StyleBlock() :
  446. {
  447. HtmlDocument.AttributeList alist;
  448. HtmlDocument.ElementSequence e;
  449. Token firstToken = getToken(1);
  450. }
  451. {
  452. try {
  453. <TAG_START> <TAG_STYLE> alist=AttributeList() <TAG_END>
  454. {
  455. token_source.SwitchTo(LexStyle);
  456. }
  457. e=BlockContents()
  458. <STYLE_END>
  459. {
  460. return new HtmlDocument.TagBlock("STYLE", alist, e);
  461. }
  462. }
  463. catch (ParseException ex) {
  464. token_source.SwitchTo(DEFAULT);
  465. String s = getTokenText(firstToken, getNextToken());
  466. return new HtmlDocument.Text(s);
  467. }
  468. }
  469. HtmlDocument.HtmlElement EndTag() :
  470. {
  471. Token t;
  472. Token firstToken = getToken(1);
  473. }
  474. {
  475. try {
  476. <ENDTAG_START> t=<TAG_NAME> <TAG_END>
  477. {
  478. HtmlDocument.EndTag tag = new HtmlDocument.EndTag(t.image);
  479. if (t.image.indexOf(":") > 0) {
  480. tag.setIsJspTag(true);
  481. }
  482. return tag;
  483. }
  484. }
  485. catch (ParseException ex) {
  486. token_source.SwitchTo(DEFAULT);
  487. String s = getTokenText(firstToken, getNextToken());
  488. return new HtmlDocument.Text(s);
  489. }
  490. }
  491. HtmlDocument.Comment CommentTag() :
  492. {
  493. Token t, comment_start, comment_end = null;
  494. StringBuffer s = new StringBuffer();
  495. }
  496. {
  497. comment_start=<COMMENT_START>
  498. ( t=<DASH> { s.append(t.image); }
  499. | <COMMENT_EOL> { s.append(NL); }
  500. | t=<COMMENT_WORD> { s.append(t.image); } )*
  501. (<EOF> | comment_end=<COMMENT_END>)
  502. {
  503. return new HtmlDocument.Comment(comment_start.image + s.toString() + (comment_end == null ? "" : comment_end.image));
  504. }
  505. }
  506. HtmlDocument.Comment DeclTag() :
  507. {
  508. Token t;
  509. }
  510. {
  511. <DECL_START> t=<DECL_ANY> <DECL_END>
  512. {
  513. return new HtmlDocument.Comment(t.image);
  514. }
  515. }
  516. HtmlDocument.HtmlElement ScriptletBlock() :
  517. {
  518. HtmlDocument.ElementSequence e;
  519. Token firstToken = getToken(1);
  520. }
  521. {
  522. try {
  523. <SCRIPTLET_TAG>
  524. {
  525. String block = getScriptletBlock();
  526. e = new HtmlDocument.ElementSequence();
  527. e.addElement(new HtmlDocument.Text(block));
  528. token_source.SwitchTo(DEFAULT);
  529. return new HtmlDocument.JspScriptletBlock(e);
  530. }
  531. }
  532. catch (ParseException ex) {
  533. // ex.printStackTrace();
  534. token_source.SwitchTo(DEFAULT);
  535. String s = getTokenText(firstToken, getNextToken());
  536. return new HtmlDocument.Text(s);
  537. }
  538. }
  539. /*
  540. Originally I tried to use BlockContents to get the contents of a jsp scriptlet,
  541. but the matching rules for a word trump the matching rules for the %> end tag.
  542. This works well anyway, since I really don't care about tokenizing the contents
  543. of a jsp scriptlet. I just want the contents verbatim.
  544. This returns the contents of the jsp scriptlet block without the <% or %>. The
  545. %> will be consumed.
  546. */
  547. JAVACODE
  548. String getScriptletBlock() {
  549. StringBuilder sb = new StringBuilder();
  550. try {
  551. while (true) {
  552. sb.append(jj_input_stream.readChar());
  553. if (sb.length() > 2 && sb.substring(sb.length() - 2).equals("%>")) {
  554. // trim the %> from the end of the string
  555. sb.setLength(sb.length() - 2);
  556. return sb.toString();
  557. }
  558. }
  559. }
  560. catch(Exception e) {
  561. }
  562. return sb.toString();
  563. }