/plugins/XML/tags/release-2-0-6/sidekick/html/parser/html/HtmlParser.jj

# · Unknown · 619 lines · 564 code · 55 blank · 0 comment · 0 complexity · 0aa7c4a05490008f8d90c0cd291648e4 MD5 · raw file

  1. /*
  2. * HtmlParser.jj -- JavaCC grammar for HTML.
  3. * Copyright (C) 1999 Quiotix Corporation.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License, version 2, as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  13. * for more details.
  14. */
  15. /*
  16. * JavaCC grammar file for HTML.
  17. *
  18. * Author: Brian Goetz, Quiotix
  19. * Version: 1.03
  20. * Revision: $Id: HtmlParser.jj 7069 2006-09-19 16:04:33Z daleanson $
  21. *
  22. * This grammar parses an HTML document and produces a (flat) parse "tree"
  23. * representing the document. It preserves almost all information in the
  24. * source document, including carriage control and spacing (except inside
  25. * of tags.) See the HtmlDocument and HtmlDocument.* classes for a
  26. * description of the parse tree. The parse tree supports traversal using
  27. * the commonly used "Visitor" pattern. The HtmlDumper class is a visitor
  28. * which dumps out the tree to an output stream.
  29. *
  30. * It does not require begin tags to be matched with end tags, or validate
  31. * the names or contents of the tags (this can easily be done post-parsing;
  32. * see the HtmlCollector class (which matches begin tags with end tags)
  33. * for an example.)
  34. *
  35. * Notable edge cases include:
  36. * - Quoted string processing. Quoted strings are matched inside of comments, and
  37. * as tag attribute values. Quoted strings are matched in normal text only
  38. * to the extent that they do not span line breaks.
  39. *
  40. * Please direct comments, questions, gripes or praise to
  41. * html-parser@quiotix.com. If you like it/hate it/use it, please let us know!
  42. *
  43. * danson: Modified for HtmlSideKick plugin for jEdit, added ability to handle
  44. * jsp as well as html, added locations for tags, etc. This parser works well
  45. * for most xml-based markup also.
  46. */
  // JavaCC generator options: IGNORE_CASE makes every token match case-insensitively
  // (so SCRIPT also matches script); STATIC = false generates instance (non-static)
  // parser members.
  47. options { IGNORE_CASE = true; STATIC = false; }
  48. PARSER_BEGIN(HtmlParser)
  49. package sidekick.html.parser.html;
  50. import java.text.MessageFormat;
  51. import java.util.*;
  52. import java.util.regex.*;
  53. import sidekick.util.*;
  54. public class HtmlParser {
  55. static String NL = System.getProperty("line.separator");
  56. private List<ParseError> parseErrors = new ArrayList<ParseError>();
  57. public void setLineSeparator(String ls) {
  58. NL = ls;
  59. }
  60. private static String getTokenText(Token first, Token cur) {
  61. Token t;
  62. StringBuffer sb = new StringBuffer();
  63. for (t=first; t != cur.next; t = t.next) {
  64. if (t.specialToken != null) {
  65. Token tt=t.specialToken;
  66. while (tt.specialToken != null)
  67. tt = tt.specialToken;
  68. for (; tt != null; tt = tt.next)
  69. sb.append(tt.image);
  70. };
  71. sb.append(t.image);
  72. };
  73. return sb.toString();
  74. }
  75. public static void main(String[] args) throws ParseException {
  76. HtmlParser parser = new HtmlParser(System.in);
  77. HtmlDocument doc = parser.HtmlDocument();
  78. doc.accept(new HtmlDumper(System.out));
  79. System.exit(0);
  80. }
  81. public void setTabSize(int size) {
  82. jj_input_stream.setTabSize(size);
  83. }
  84. public int getTabSize() {
  85. return jj_input_stream.getTabSize(0);
  86. }
  87. private void addException(ParseException pe) {
  88. Range range = getExceptionLocation( pe );
  89. parseErrors.add(new ParseError(pe.getMessage(), range));
  90. pe.printStackTrace();
  91. }
  92. public List<ParseError> getParseErrors() {
  93. System.out.println("getParserErrors, there are " + parseErrors.size() + " errors");
  94. return parseErrors;
  95. }
  96. // regex to extract line and colun from a ParseException message
  97. // ParseException message look like: "Parse error at line 116, column 5. Encountered: }"
  98. private Pattern pePattern = Pattern.compile( "(.*?)(\\d+)(.*?)(\\d+)(.*?)" );
  99. /**
  100. * @return attempts to return a Location indicating the location of a parser
  101. * exception. If the ParseException contains a Token reference, all is well,
  102. * otherwise, this method attempts to parse the message string for the
  103. * exception.
  104. */
  105. private Range getExceptionLocation( ParseException pe ) {
  106. Token t = pe.currentToken;
  107. if ( t != null ) {
  108. return new Range( new Location( t.next.beginLine - 1, t.next.beginColumn ), new Location( t.next.endLine - 1, t.next.endColumn ) );
  109. }
  110. // ParseException message look like: "Parse error at line 116, column 5. Encountered: }"
  111. try {
  112. Matcher m = pePattern.matcher( pe.getMessage() );
  113. if ( m.matches() ) {
  114. String ln = m.group( 2 );
  115. String cn = m.group( 4 );
  116. int line_number = -1;
  117. int column_number = 0;
  118. if ( ln != null )
  119. line_number = Integer.parseInt( ln );
  120. if ( cn != null )
  121. column_number = Integer.parseInt( cn );
  122. return line_number > -1 ? new Range( new Location( line_number - 1, column_number - 1 ), new Location( line_number - 1, column_number ) ) : null;
  123. }
  124. return new Range();
  125. }
  126. catch ( Exception e ) {
  127. //e.printStackTrace();
  128. return new Range();
  129. }
  130. }
  131. // regex pattern for a valid non-quoted attribute.
  132. // Attributes can be single or double quoted, or consist solely of
  133. // letters in the range A-Z and a-z, digits (0-9), hyphens ("-"),
  134. // and periods (".")
  135. private Pattern attributePattern = Pattern.compile( "([a-zA-Z0-9.-])*" );
  136. private boolean isProperAttribute(String s) {
  137. // could have double quotes
  138. if (s.startsWith("\"") && s.endsWith("\"")) {
  139. return true;
  140. }
  141. // or single quotes
  142. else if (s.startsWith("'") && s.endsWith("'")) {
  143. return true;
  144. }
  145. // or might be jsp
  146. else if (s.startsWith("<%") && (s.endsWith("%>") || s.endsWith("%")) ) {
  147. return true;
  148. }
  149. boolean rtn = attributePattern.matcher(s).matches();
  150. if (rtn == false) {
  151. System.out.println("bad attribute: " + s);
  152. }
  153. return rtn;
  154. }
  155. }
  156. PARSER_END(HtmlParser)
  // JSP scriptlets and expressions found in the DEFAULT state.
  // The opening <% starts a MORE match and enters IN_JSP_EXP; every following
  // character is accumulated by the catch-all MORE rule until %> completes the
  // match as the special token JSP_EXP_END and the lexer returns to DEFAULT.
  // Being a special token, the scriptlet is not seen by the productions; it is
  // attached to the next regular token through its specialToken field.
  157. MORE:
  158. {
  159. "<%" : IN_JSP_EXP
  160. }
  161. <IN_JSP_EXP>
  162. SPECIAL_TOKEN :
  163. {
  164. <JSP_EXP_END: "%>" > : DEFAULT
  165. }
  166. <IN_JSP_EXP>
  167. MORE :
  168. {
  // any single character: keep accumulating the scriptlet text
  169. < ~[] >
  170. }
  // Private (#) regular-expression fragments, declared for every lexical state.
  // They never produce tokens themselves; the token definitions below refer to them.
  // NUM_CHAR, ALPHANUM_CHAR and STYLE_IDENTIFIER are not referenced by any rule
  // visible in this grammar.
  171. <*> TOKEN :
  172. {
  // ALPHA_CHAR: a character that may start an identifier
  173. <#ALPHA_CHAR: [
  174. "\u0024",
  175. "\u0041"-"\u005a",
  176. "\u005f",
  177. "\u0061"-"\u007a",
  178. "\u00c0"-"\u00d6", // Latin with diacritics
  179. "\u00d8"-"\u00f6", // Latin with diacritics
  180. "\u00f8"-"\u00ff", // Latin with diacritics
  181. "\u0100"-"\u1fff", // Latin Extended-A through Greek Extended
  182. "\u3040"-"\u318f", // Hiragana through Hangul Compatibility Jamo
  183. "\u3300"-"\u337f", // CJK Compatibility
  184. "\u3400"-"\u3d2d", // CJK Unified Ideographs Extension A
  185. "\u4e00"-"\u9fff", // CJK Unified Ideographs
  186. "\uf900"-"\ufaff" ] > // CJK Compatibility Ideographs
  187. | <#NUM_CHAR: ["0"-"9"] >
  188. | <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
  // IDENTIFIER_CHAR: characters allowed after the first character of an identifier
  189. | <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
  // IDENTIFIER: used for tag names, attribute names and tag prefixes
  190. | <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
  191. | <#STYLE_IDENTIFIER: (<ALPHA_CHAR>)+ >
  // QUOTED_STRING_NB: single or double quoted string that does not span a line break
  192. | <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
  193. | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
  // QUOTED_STRING: single or double quoted string that may span line breaks
  194. | <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
  // WHITESPACE: exactly one blank, tab, line feed or carriage return
  195. | <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
  // NEWLINE: one line break in Windows, old Mac or Unix form
  196. | <#NEWLINE: ( "\r\n" | "\r" | "\n" ) >
  // QUOTE: a lone single or double quote character
  197. | <#QUOTE: ( "'" | "\"" ) >
  198. }
  // Tokens recognised in ordinary document text (the DEFAULT state).
  // Each markup opener switches the lexer to the state that tokenizes its interior.
  199. <DEFAULT> TOKEN :
  200. {
  // EOL: a line break together with any blanks or tabs directly in front of it
  201. <EOL: ( " " | "\t" )* <NEWLINE> >
  // COMMENT_START: opener of an HTML comment or a JSP comment
  202. | <COMMENT_START: "<!--" | "<%--" > : LexComment
  // ENDTAG_START: end-tag opener, optionally followed by an identifier and a colon
  203. | <ENDTAG_START: "</" | "</" <IDENTIFIER> ":" > : LexStartTag
  // TAG_START: start-tag opener, a JSP directive opener, or an opener followed
  // by an identifier and a colon
  204. | <TAG_START: "<" | "<%@" | "<" <IDENTIFIER> ":" > : LexStartTag
  // DECL_START: opener of a declaration such as a doctype
  205. | <DECL_START: "<!" > : LexDecl
  // PCDATA: a run of text on one line that contains no opening angle bracket
  206. | <PCDATA: ( ~["<", "\r", "\n"] )+ >
  207. }
  // LexStartTag: entered right after a tag opener; expects the tag name.
  // Whitespace here is kept as a special token rather than passed to the productions.
  208. <LexStartTag> SPECIAL_TOKEN :
  209. {
  210. < (<WHITESPACE>)+ >
  211. }
  212. <LexStartTag> TOKEN :
  213. {
  // SCRIPT and STYLE get their own token kinds so the ScriptBlock and StyleBlock
  // productions can recognise them; every other name is a TAG_NAME.
  214. <TAG_SCRIPT: "SCRIPT"> : LexInTag
  215. | <TAG_STYLE: "STYLE"> : LexInTag
  216. | <TAG_NAME: <IDENTIFIER> > : LexInTag
  // LST_ERROR: any other character means this was not a tag after all; return to
  // DEFAULT so the Element production can emit the text.
  217. | <LST_ERROR: ~[]> : DEFAULT
  218. }
  // LexInTag: inside a tag, after its name; tokenizes attributes and the tag terminator.
  // Whitespace between attributes is kept as a special token.
  219. <LexInTag> SPECIAL_TOKEN :
  220. {
  221. < (<WHITESPACE>)+ >
  222. }
  223. <LexInTag> TOKEN :
  224. {
  225. <ATTR_NAME: <IDENTIFIER> >
  // TAG_END and TAG_SLASHEND close the tag and return to ordinary text
  226. | <TAG_END: ">" | "%>" > : DEFAULT
  227. | <TAG_SLASHEND: "/>" > : DEFAULT
  // ATTR_EQ: an attribute value follows
  228. | <ATTR_EQ: "=" > : LexAttrVal
  // IMPLICIT_TAG_END: a new tag opener appears before the current tag was closed.
  // The action below turns the matched token into a TAG_END with the image of a
  // closing bracket and links a synthetic TAG_START token right behind it, then
  // the lexer continues in LexStartTag to read the new tag name.
  229. | <IMPLICIT_TAG_END: "<">
  230. {
  // synthetic opener for the tag that is starting; it reuses the position of the
  // matched character
  231. Token t = new Token();
  232. t.image = "<";
  233. t.kind = TAG_START;
  234. t.next = matchedToken.next;
  235. t.beginLine = matchedToken.beginLine;
  236. t.beginColumn = matchedToken.beginColumn;
  237. t.endLine = matchedToken.endLine;
  238. t.endColumn = matchedToken.endColumn;
  // splice the synthetic opener in after the matched token, which now poses as
  // the terminator of the unclosed tag
  239. matchedToken.next = t;
  240. matchedToken.kind = TAG_END;
  241. matchedToken.image = ">";
  242. } : LexStartTag
  // LIT_ERROR: any other single character; the lexer stays in LexInTag
  243. | <LIT_ERROR: ~[]>
  244. }
  // LexAttrVal: entered after the equals sign of an attribute; expects the value.
  // Each whitespace character in front of the value is kept as a special token.
  245. <LexAttrVal> SPECIAL_TOKEN :
  246. {
  247. < <WHITESPACE> >
  248. }
  249. <LexAttrVal> TOKEN :
  250. {
  // ATTR_VAL: either a quoted string, which may span line breaks, or a run of
  // characters containing no closing bracket, quote or whitespace. Afterwards
  // the lexer returns to LexInTag for the next attribute.
  251. <ATTR_VAL: <QUOTED_STRING>
  252. | ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexInTag
  // LAV_ERROR: any other single character; the lexer stays in LexAttrVal
  253. | <LAV_ERROR: ~[]>
  254. }
  // LexComment: interior of a comment, entered after COMMENT_START.
  255. <LexComment> TOKEN :
  256. {
  // COMMENT_END: two dashes, optional blanks and a closing bracket; or one dash
  // and a closing bracket; or the JSP comment terminator. Returns to DEFAULT.
  257. < COMMENT_END: ("--" (" ")* ">" | "->" | "--%>" ) > : DEFAULT
  // DASH: a dash that is not part of a comment terminator
  258. | < DASH: "-" >
  259. | < COMMENT_EOL: <NEWLINE> >
  // COMMENT_WORD: a run of characters without line break, quote or dash, or a
  // quoted string confined to one line, or a lone quote character
  260. | < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
  261. | <QUOTED_STRING_NB>
  262. | <QUOTE> ) >
  263. }
  // LexDecl: interior of a declaration, entered after DECL_START.
  264. <LexDecl> TOKEN :
  265. {
  // DECL_ANY: everything up to the closing bracket, built from one-line quoted
  // strings, lone quotes and any character other than the closing bracket
  266. <DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
  // DECL_END: the closing bracket; returns to DEFAULT
  267. | <DECL_END: ">" > : DEFAULT
  268. }
  // LexScript and LexStyle: raw contents of a script or style element. No rule
  // switches into these states; the ScriptBlock and StyleBlock productions enter
  // them explicitly with token_source.SwitchTo after the opening tag is parsed.
  // Each state only recognises its own end tag, which returns the lexer to DEFAULT.
  269. <LexScript> TOKEN : {
  270. <SCRIPT_END: "</SCRIPT>" > : DEFAULT
  271. }
  272. <LexStyle> TOKEN : {
  273. <STYLE_END: "</STYLE>" > : DEFAULT
  274. }
  // Content tokens shared by both states.
  275. <LexScript, LexStyle> TOKEN :
  276. {
  277. <BLOCK_EOL: <NEWLINE> >
  // BLOCK_LBR: an opening bracket that does not start the end tag of the block
  278. | <BLOCK_LBR: "<" >
  // BLOCK_WORD: a quoted string confined to one line, a lone quote, or a run of
  // characters without line break, quote or opening bracket
  279. | <BLOCK_WORD: ( <QUOTED_STRING_NB>
  280. | <QUOTE>
  281. | (~[ "\n", "\r", "'", "\"", "<"])+ ) >
  282. }
  /**
   * Start production: parses the whole input up to end of file.
   *
   * @return the document wrapping the parsed element sequence
   */
  HtmlDocument HtmlDocument() :
  {
    HtmlDocument.ElementSequence elements;
  }
  {
    elements = ElementSequence()
    <EOF>
    {
      return new HtmlDocument(elements);
    }
  }
  /**
   * Parses zero or more elements and collects them in source order.
   *
   * @return the (possibly empty) sequence of parsed elements
   */
  HtmlDocument.ElementSequence ElementSequence() :
  {
    HtmlDocument.ElementSequence sequence = new HtmlDocument.ElementSequence();
    HtmlDocument.HtmlElement element;
  }
  {
    (
      element = Element()
      {
        sequence.addElement(element);
      }
    )*
    {
      return sequence;
    }
  }
  // Parses one element of the document and returns it: a tag, end tag, comment,
  // declaration, script block, style block, plain text or newline.
  // Tag, ScriptBlock, StyleBlock and the error alternative all begin with
  // TAG_START, so each is guarded by LOOKAHEAD(2) and chosen by the token that
  // follows the opener.
  300. HtmlDocument.HtmlElement Element() :
  301. {
  302. HtmlDocument.HtmlElement e;
  303. Token text;
  304. }
  305. {
  306. (
  307. LOOKAHEAD(2)
  308. e = Tag() { return e; }
  309. | e = EndTag() { return e; }
  310. | e = CommentTag() { return e; }
  311. | e = DeclTag() { return e; }
  312. | LOOKAHEAD(2)
  313. e = ScriptBlock() { return e; }
  314. | LOOKAHEAD(2)
  315. e = StyleBlock() { return e; }
  // an opener followed by something that cannot start a tag name: emit it as text
  316. | LOOKAHEAD(2)
  317. <TAG_START> text=<LST_ERROR>
  318. { return new HtmlDocument.Text("<" + text.image); }
  319. | text = <PCDATA> { return new HtmlDocument.Text(text.image); }
  320. | <EOL> { return new HtmlDocument.Newline(); }
  321. )
  322. }
  /**
   * Parses one attribute: a name, optionally followed by "=" and a value.
   * A value that fails isProperAttribute is still accepted, but a parse
   * error is recorded for it.
   *
   * @return the parsed attribute, with or without a value
   */
  HtmlDocument.Attribute Attribute() :
  {
    Token name;
    Token value = null;
  }
  {
    name=<ATTR_NAME> [ <ATTR_EQ> value=<ATTR_VAL> ]
    {
      // attribute without a value
      if (value == null) {
        return new HtmlDocument.Attribute(name.image);
      }

      HtmlDocument.Attribute attribute = new HtmlDocument.Attribute(name.image, value.image);
      if (!isProperAttribute(value.image)) {
        addException(new ParseException("Parse error at line " + value.beginLine + ", column " + value.beginColumn + ". Attribute is improperly quoted." ));
      }
      return attribute;
    }
  }
  /**
   * Parses zero or more attributes in source order.
   *
   * @return the (possibly empty) list of parsed attributes
   */
  HtmlDocument.AttributeList AttributeList() :
  {
    HtmlDocument.AttributeList attributes = new HtmlDocument.AttributeList();
    HtmlDocument.Attribute attribute;
  }
  {
    (
      attribute = Attribute()
      {
        attributes.addAttribute(attribute);
      }
    )*
    {
      return attributes;
    }
  }
  /**
   * Parses a start tag (or an empty tag ending in "/>") with its attributes.
   * If the tag is malformed, the lexer is reset to DEFAULT and the text
   * consumed so far is returned as plain text instead.
   *
   * @return the parsed tag, or a Text element holding the raw source on error
   */
  HtmlDocument.HtmlElement Tag() :
  {
    Token firstToken = getToken(1);
    Token opener = null;
    Token name, closer;
    HtmlDocument.AttributeList attributes;
    boolean isJspTag = false;
  }
  {
    try {
      opener=<TAG_START> name=<TAG_NAME> attributes=AttributeList()
      ( closer=<TAG_END> | closer=<TAG_SLASHEND> )
      {
        // An opener of the form "<prefix:" contributes its prefix to the tag name.
        boolean hasPrefix = opener.image.startsWith("<") && opener.image.endsWith(":");
        String tagName = hasPrefix ? opener.image.substring(1) + name.image : name.image;
        isJspTag = hasPrefix || opener.image.startsWith("<%");

        HtmlDocument.Tag tag = new HtmlDocument.Tag("<", tagName, attributes, closer.image);
        if (closer.kind == TAG_SLASHEND) {
          tag.setEmpty(true);
        }
        tag.setStartLocation(opener.beginLine, opener.beginColumn);
        tag.setEndLocation(closer.endLine, closer.endColumn);
        tag.setIsJspTag(isJspTag);
        return tag;
      }
    }
    catch (ParseException ex) {
      // Recover: go back to ordinary text and hand the consumed source back as text.
      token_source.SwitchTo(DEFAULT);
      return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
    }
  }
  /**
   * Collects the raw text between a style start tag and its end tag.
   *
   * @return the trimmed contents, without a leading "<!--" or trailing "-->"
   */
  String StyleBlockContents() :
  {
    StringBuilder raw = new StringBuilder();
    Token piece = null;
  }
  {
    (
      ( piece=<BLOCK_EOL> | piece=<BLOCK_WORD> | piece=<BLOCK_LBR> )
      { raw.append(piece.image); }
    )*
    {
      // Style contents are sometimes wrapped in an html comment to hide them
      // from browsers that do not understand style tags. The wrapper serves
      // no purpose for a jEdit SideKick plugin, so it is dropped here.
      String css = raw.toString().trim();
      if (css.startsWith("<!--")) {
        css = css.substring(4);
      }
      if (css.endsWith("-->")) {
        css = css.substring(0, css.length() - 3);
      }
      return css.trim();
    }
  }
  /**
   * Collects the raw text between a script start tag and its end tag.
   *
   * @return the trimmed contents, without a leading "<!--" and without a
   *         trailing "-->", whether or not the latter is hidden behind "//"
   */
  String ScriptBlockContents() :
  {
    StringBuffer sb = new StringBuffer();
    Token t = null;
  }
  {
    ( t=<BLOCK_EOL> { sb.append(t.image); }
    | t=<BLOCK_WORD> { sb.append(t.image); }
    | t=<BLOCK_LBR> { sb.append(t.image); }
    )*
    {
      String contents = sb.toString().trim();
      // Script contents are sometimes wrapped in an html comment to hide them
      // from browsers that do not understand script tags. The wrapper serves
      // no purpose for a jEdit SideKick plugin, so it is dropped here.
      if (contents.startsWith("<!--")) {
        contents = contents.substring(4);
      }
      // The closing part of the wrapper is written as "-->", "//-->" or
      // "// -->"; remove the "-->" first, then the "//" that hid it from the
      // script engine, if there is one.
      if (contents.endsWith("-->")) {
        contents = contents.substring(0, contents.length() - 3).trim();
        if (contents.endsWith("//")) {
          contents = contents.substring(0, contents.length() - 2);
        }
      }
      return contents.trim();
    }
  }
  /**
   * Parses a complete script element: start tag, raw contents and end tag.
   * If the element is malformed, the lexer is reset to DEFAULT and the text
   * consumed so far is returned as plain text, silently, exactly as the
   * Tag, StyleBlock and EndTag productions do.
   *
   * @return a TagBlock named "SCRIPT" whose single child is the script text,
   *         or a Text element holding the raw source on error
   */
  HtmlDocument.HtmlElement ScriptBlock() :
  {
    HtmlDocument.AttributeList alist;
    Token firstToken = getToken(1);
    Token st, et;
    String contents = "";
  }
  {
    try {
      st=<TAG_START> <TAG_SCRIPT> alist=AttributeList() <TAG_END>
      {
        // No lexer rule enters LexScript; switch by hand so that the script
        // body is tokenized as raw text instead of markup.
        token_source.SwitchTo(LexScript);
      }
      contents=ScriptBlockContents()
      et=<SCRIPT_END>
      {
        HtmlDocument.Text text = new HtmlDocument.Text(contents);
        HtmlDocument.ElementSequence seq = new HtmlDocument.ElementSequence();
        seq.addElement(text);
        HtmlDocument.TagBlock b = new HtmlDocument.TagBlock("SCRIPT", alist, seq);
        b.setStartLocation(st.beginLine, st.beginColumn);
        b.setEndLocation(et.endLine, et.endColumn);
        return b;
      }
    }
    catch (ParseException ex) {
      // Recover: go back to ordinary text and hand the consumed source back as text.
      token_source.SwitchTo(DEFAULT);
      String s = getTokenText(firstToken, getNextToken());
      return new HtmlDocument.Text(s);
    }
  }
  /**
   * Parses a complete style element: start tag, raw contents and end tag.
   * If the element is malformed, the lexer is reset to DEFAULT and the text
   * consumed so far is returned as plain text instead.
   *
   * @return a TagBlock named "STYLE" whose single child is the style text,
   *         or a Text element holding the raw source on error
   */
  HtmlDocument.HtmlElement StyleBlock() :
  {
    Token firstToken = getToken(1);
    Token opener, endTag;
    HtmlDocument.AttributeList attributes;
    String css = "";
  }
  {
    try {
      opener=<TAG_START> <TAG_STYLE> attributes=AttributeList() <TAG_END>
      {
        // No lexer rule enters LexStyle; switch by hand so that the style
        // body is tokenized as raw text instead of markup.
        token_source.SwitchTo(LexStyle);
      }
      css=StyleBlockContents()
      endTag=<STYLE_END>
      {
        HtmlDocument.ElementSequence children = new HtmlDocument.ElementSequence();
        children.addElement(new HtmlDocument.Text(css));
        HtmlDocument.TagBlock block = new HtmlDocument.TagBlock("STYLE", attributes, children);
        block.setStartLocation(opener.beginLine, opener.beginColumn);
        block.setEndLocation(endTag.endLine, endTag.endColumn);
        return block;
      }
    }
    catch (ParseException ex) {
      // Recover: go back to ordinary text and hand the consumed source back as text.
      token_source.SwitchTo(DEFAULT);
      return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
    }
  }
  /**
   * Parses an end tag. If it is malformed, the lexer is reset to DEFAULT
   * and the text consumed so far is returned as plain text instead.
   *
   * @return the parsed end tag, or a Text element holding the raw source on error
   */
  HtmlDocument.HtmlElement EndTag() :
  {
    Token firstToken = getToken(1);
    Token opener, name, closer;
  }
  {
    try {
      opener=<ENDTAG_START> name=<TAG_NAME> closer=<TAG_END>
      {
        // An opener of the form "</prefix:" contributes its prefix to the tag name.
        boolean hasPrefix = opener.image.startsWith("</") && opener.image.endsWith(":");
        String tagName = hasPrefix ? opener.image.substring(2) + name.image : name.image;

        HtmlDocument.EndTag endTag = new HtmlDocument.EndTag(tagName);
        endTag.setStartLocation(opener.beginLine, opener.beginColumn);
        endTag.setEndLocation(closer.endLine, closer.endColumn);
        return endTag;
      }
    }
    catch (ParseException ex) {
      // Recover: go back to ordinary text and hand the consumed source back as text.
      token_source.SwitchTo(DEFAULT);
      return new HtmlDocument.Text(getTokenText(firstToken, getNextToken()));
    }
  }
  /**
   * Parses a comment, from its opener up to its terminator or the end of
   * the input, whichever comes first. Line breaks inside the comment are
   * replaced by the configured line separator NL.
   *
   * @return a Comment holding the opener, the body and, if present, the terminator
   */
  HtmlDocument.Comment CommentTag() :
  {
    Token opener, piece;
    Token terminator = null;
    StringBuffer body = new StringBuffer();
  }
  {
    opener=<COMMENT_START>
    (
      ( piece=<DASH> | piece=<COMMENT_WORD> ) { body.append(piece.image); }
    | <COMMENT_EOL> { body.append(NL); }
    )*
    ( <EOF> | terminator=<COMMENT_END> )
    {
      // An unterminated comment simply has no terminator text.
      String tail = (terminator == null) ? "" : terminator.image;
      return new HtmlDocument.Comment(opener.image + body.toString() + tail);
    }
  }
  /**
   * Parses a declaration: DECL_START, its contents and DECL_END.
   *
   * @return a Comment holding only the text between the opener and the
   *         closing bracket
   */
  HtmlDocument.Comment DeclTag() :
  {
    Token contents;
  }
  {
    <DECL_START>
    contents=<DECL_ANY>
    <DECL_END>
    {
      return new HtmlDocument.Comment(contents.image);
    }
  }