PageRenderTime 42ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/bundles/plugins-trunk/XML/xml/parser/javacc/XmlParser.jj

#
Unknown | 481 lines | 435 code | 46 blank | 0 comment | 0 complexity | 9a342d7aedefe532fe78eed13a3ec2c1 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * XmlParser.jj -- JavaCC grammar for XML.
  3. * Copyright (C) 1999 Quiotix Corporation.
  4. * Copyright (C) 2010 Eric Le Lay
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License, version 2, as
  8. * published by the Free Software Foundation.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  14. * for more details.
  15. */
  16. /*
  17. * JavaCC grammar file for XML.
  18. *
  19. * Author: Eric Le Lay
  20. * Version: 1.0
  21. * Revision: $Id$
  22. *
  23. * Adapted from sidekick/html/parser/html/HtmlParser.jj
  24. * This grammar parses an XML document and produces a (flat) parse "tree"
  25. * representing the document. It preserves almost all information in the
  26. * source document, including carriage control and spacing (except inside
  27. * of tags.) See the XmlDocument and XmlDocument.* classes for a
  28. * description of the parse tree. The parse tree supports traversal using
  29. * the commonly used "Visitor" pattern. The XmlDumper class is a visitor
  30. * which dumps out the tree to an output stream.
  31. *
  32. * It does not require begin tags to be matched with end tags, or validate
  33. * the names or contents of the tags (this can easily be done post-parsing;
  34. * see the HtmlCollector class (which matches begin tags with end tags)
  35. * for an example.)
  36. *
  37. * Notable edge cases include:
  38. * - Quoted string processing. Quoted strings are matched inside of comments, and
  39. * as tag attribute values. Quoted strings are matched in normal text only
  40. * to the extent that they do not span line breaks.
  41. *
  42. */
  43. options { IGNORE_CASE = true; STATIC = false; }
  44. PARSER_BEGIN(XmlParser)
  45. package xml.parser.javacc;
  46. import java.text.MessageFormat;
  47. import java.util.*;
  48. import java.util.regex.*;
  49. import sidekick.util.*;
  50. import java.io.Reader;
  51. public class XmlParser {
  52. static String NL = System.getProperty("line.separator");
  53. private List<ParseError> parseErrors = new ArrayList<ParseError>();
  54. public void setLineSeparator(String ls) {
  55. NL = ls;
  56. }
  57. private static String getTokenText(Token first, Token cur) {
  58. Token t;
  59. StringBuffer sb = new StringBuffer();
  60. for (t=first; t != cur.next; t = t.next) {
  61. if (t.specialToken != null) {
  62. Token tt=t.specialToken;
  63. while (tt.specialToken != null)
  64. tt = tt.specialToken;
  65. for (; tt != null; tt = tt.next)
  66. sb.append(tt.image);
  67. };
  68. sb.append(t.image);
  69. };
  70. return sb.toString();
  71. }
  72. /**
  73. * The line offset is used when the XML to be parsed is only part of a file,
  74. * @param lineOffset The line number of the first line of the fragment.
  75. * @param columnOffset The column number of the first character of the fragment.
  76. */
  77. public XmlParser(Reader in, int lineOffset, int columnOffset){
  78. this(in);
  79. jj_input_stream.ReInit(in,lineOffset,columnOffset);
  80. }
  81. public static void main(String[] args) throws ParseException {
  82. XmlParser parser = new XmlParser(System.in);
  83. XmlDocument doc = parser.XmlDocument();
  84. doc.accept(new XmlDebugDumper(System.out));
  85. System.exit(0);
  86. }
  87. public void setTabSize(int size) {
  88. jj_input_stream.setTabSize(size);
  89. }
  90. public int getTabSize() {
  91. return jj_input_stream.getTabSize(0);
  92. }
  93. private void addException(ParseException pe) {
  94. Range range = getExceptionLocation( pe );
  95. parseErrors.add(new ParseError(pe.getMessage(), range));
  96. pe.printStackTrace();
  97. }
  98. public List<ParseError> getParseErrors() {
  99. System.out.println("getParserErrors, there are " + parseErrors.size() + " errors");
  100. return parseErrors;
  101. }
  102. // regex to extract line and colun from a ParseException message
  103. // ParseException message look like: "Parse error at line 116, column 5. Encountered: }"
  104. private Pattern pePattern = Pattern.compile( "(.*?)(\\d+)(.*?)(\\d+)(.*?)" );
  105. /**
  106. * @return attempts to return a Location indicating the location of a parser
  107. * exception. If the ParseException contains a Token reference, all is well,
  108. * otherwise, this method attempts to parse the message string for the
  109. * exception.
  110. */
  111. private Range getExceptionLocation( ParseException pe ) {
  112. Token t = pe.currentToken;
  113. if ( t != null ) {
  114. return new Range( new Location( t.next.beginLine - 1, t.next.beginColumn ), new Location( t.next.endLine - 1, t.next.endColumn ) );
  115. }
  116. // ParseException message look like: "Parse error at line 116, column 5. Encountered: }"
  117. try {
  118. Matcher m = pePattern.matcher( pe.getMessage() );
  119. if ( m.matches() ) {
  120. String ln = m.group( 2 );
  121. String cn = m.group( 4 );
  122. int line_number = -1;
  123. int column_number = 0;
  124. if ( ln != null )
  125. line_number = Integer.parseInt( ln );
  126. if ( cn != null )
  127. column_number = Integer.parseInt( cn );
  128. return line_number > -1 ? new Range( new Location( line_number - 1, column_number - 1 ), new Location( line_number - 1, column_number ) ) : null;
  129. }
  130. return new Range();
  131. }
  132. catch ( Exception e ) {
  133. //e.printStackTrace();
  134. return new Range();
  135. }
  136. }
  137. // regex pattern for a valid non-quoted attribute.
  138. // Attributes can be single or double quoted, or consist solely of
  139. // letters in the range A-Z and a-z, digits (0-9), hyphens ("-"),
  140. // and periods (".")
  141. private Pattern attributePattern = Pattern.compile( "([a-zA-Z0-9.-])*" );
  142. private boolean isProperAttribute(String s) {
  143. // could have double quotes
  144. if (s.startsWith("\"") && s.endsWith("\"")) {
  145. return true;
  146. }
  147. // or single quotes
  148. else if (s.startsWith("'") && s.endsWith("'")) {
  149. return true;
  150. }
  151. // or might be jsp
  152. else if (s.startsWith("<%") && (s.endsWith("%>") || s.endsWith("%")) ) {
  153. return true;
  154. }
  155. boolean rtn = attributePattern.matcher(s).matches();
  156. if (rtn == false) {
  157. System.out.println("bad attribute: " + s);
  158. }
  159. return rtn;
  160. }
  161. }
  162. PARSER_END(XmlParser)
  /* Private regular-expression fragments, available in every lexical state. */
  <*> TOKEN :
  {
      // characters accepted as the first character of an IDENTIFIER
      <#ALPHA_CHAR: [
          "$",
          "A"-"Z",
          "_",
          "a"-"z",
          "\u00c0"-"\u00d6",   // Latin with diacritics
          "\u00d8"-"\u00f6",   // Latin with diacritics
          "\u00f8"-"\u00ff",   // Latin with diacritics
          "\u0100"-"\u1fff",   // Latin Extended-A through Greek Extended
          "\u3040"-"\u318f",   // Hiragana through Hangul Compatibility Jamo
          "\u3300"-"\u337f",   // CJK Compatibility
          "\u3400"-"\u3d2d",   // CJK Unified Ideographs Extension A
          "\u4e00"-"\u9fff",   // CJK Unified Ideographs
          "\uf900"-"\ufaff"    // CJK Compatibility Ideographs
      ] >
    | <#NUM_CHAR: ["0"-"9"] >
    | <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
      // characters accepted after the first character of an IDENTIFIER
    | <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", ".", ":" ] >
    | <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
      // quoted string that does not span a line break
    | <#QUOTED_STRING_NB:
          ( "'" ( ~["'", "\r", "\n"] )* "'" )
        | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
      // quoted string that may span line breaks
    | <#QUOTED_STRING:
          ( "'" ( ~["'"] )* "'" )
        | ( "\"" ( ~["\""] )* "\"" ) >
    | <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
    | <#NEWLINE: ( "\r\n" | "\r" | "\n" ) >
    | <#QUOTE: ( "'" | "\"" ) >
  }
  /* Tokens recognized outside of tags, comments and declarations. */
  <DEFAULT> TOKEN :
  {
      // a line break, together with any spaces or tabs in front of it
      <EOL: ( " " | "\t" )* <NEWLINE> >
    | <COMMENT_START: "<!--" | "<%--" >               : LexComment
      // the start of an end tag, optionally followed by a prefix ending in ":"
    | <ENDTAG_START: "</" | "</" <IDENTIFIER> ":" >   : LexStartTag
      // the start of a tag, optionally followed by a prefix ending in ":"
    | <TAG_START: "<" | "<" <IDENTIFIER> ":" >        : LexStartTag
    | <DECL_START: "<!" >                             : LexDecl
      // a run of text up to the next "<" or line break
    | <PCDATA: ( ~["<", "\r", "\n"] )+ >
  }
  /* Right after the start of a tag: whitespace is kept as special tokens. */
  <LexStartTag> SPECIAL_TOKEN :
  {
      < (<WHITESPACE>)+ >
  }

  /* Right after the start of a tag: a tag name is expected. */
  <LexStartTag> TOKEN :
  {
      <TAG_NAME: <IDENTIFIER> >   : LexInTag
      // any other single character: return to the DEFAULT state
    | <LST_ERROR: ~[]>            : DEFAULT
  }
  /* Inside a tag, after its name: whitespace is kept as special tokens. */
  <LexInTag> SPECIAL_TOKEN :
  {
      < (<WHITESPACE>)+ >
  }

  /* Inside a tag, after its name: attributes and the end of the tag. */
  <LexInTag> TOKEN :
  {
      <ATTR_NAME: <IDENTIFIER> >
    | <TAG_END: ">" >         : DEFAULT
    | <TAG_SLASHEND: "/>" >   : DEFAULT
    | <ATTR_EQ: "=" >         : LexAttrVal
    | <IMPLICIT_TAG_END: "<">
      {
          // A "<" inside a tag is turned into two tokens: the matched token
          // becomes a TAG_END with image ">", and a new TAG_START token with
          // the same position is linked in right after it.
          Token implicitStart = new Token();
          implicitStart.image = "<";
          implicitStart.kind = TAG_START;
          implicitStart.next = matchedToken.next;
          implicitStart.beginLine = matchedToken.beginLine;
          implicitStart.beginColumn = matchedToken.beginColumn;
          implicitStart.endLine = matchedToken.endLine;
          implicitStart.endColumn = matchedToken.endColumn;
          matchedToken.next = implicitStart;
          matchedToken.kind = TAG_END;
          matchedToken.image = ">";
      }                       : LexStartTag
      // any other single character; the lexical state is left unchanged
    | <LIT_ERROR: ~[]>
  }
  /* After the "=" of an attribute: each whitespace character is a special token. */
  <LexAttrVal> SPECIAL_TOKEN :
  {
      < <WHITESPACE> >
  }

  /* After the "=" of an attribute: the value is expected. */
  <LexAttrVal> TOKEN :
  {
      // either a quoted string, or a run of characters without ">", quotes
      // or whitespace
      <ATTR_VAL:
          <QUOTED_STRING>
        | ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ >   : LexInTag
      // any other single character; the lexical state is left unchanged
    | <LAV_ERROR: ~[]>
  }
  /* Inside a comment. */
  <LexComment> TOKEN :
  {
      < COMMENT_END: ( "-->" | "->" ) >   : DEFAULT
    | < DASH: "-" >
    | < COMMENT_EOL: <NEWLINE> >
      // a run of characters without line breaks, quotes or dashes, or a
      // quoted string on a single line, or a lone quote
    | < COMMENT_WORD:
          (   ( ~[ "\n", "\r", "'", "\"", "-" ] )+
            | <QUOTED_STRING_NB>
            | <QUOTE> ) >
  }
  /* Inside a declaration, after "<!". */
  <LexDecl> TOKEN :
  {
      // everything up to the closing ">"; single-line quoted strings are
      // matched as a whole
      <DECL_ANY: ( <QUOTED_STRING_NB> | <QUOTE> | ~[ ">" ] )+ >
    | <DECL_END: ">" >   : DEFAULT
  }
  /**
   * Entry point: a sequence of elements followed by the end of the input.
   * @return the document wrapping the parsed element sequence
   */
  XmlDocument XmlDocument() :
  {
      XmlDocument.ElementSequence elements;
  }
  {
      elements = ElementSequence() <EOF>
      {
          return new XmlDocument(elements);
      }
  }
  /**
   * Zero or more elements, collected in source order.
   * @return the (possibly empty) sequence of parsed elements
   */
  XmlDocument.ElementSequence ElementSequence() :
  {
      XmlDocument.ElementSequence sequence = new XmlDocument.ElementSequence();
      XmlDocument.XmlElement element;
  }
  {
      (
          element = Element()
          { sequence.addElement(element); }
      )*
      {
          return sequence;
      }
  }
  /**
   * A single element: a tag, an end tag, a comment, a declaration, a run of
   * text or a line break.
   * @return the parsed element
   */
  XmlDocument.XmlElement Element() :
  {
      XmlDocument.XmlElement e;
      Token start, text;
  }
  {
      (
          LOOKAHEAD(2)
          e = Tag()          { return e; }
        | e = EndTag()       { return e; }
        | e = CommentTag()   { return e; }
        | e = DeclTag()      { return e; }
        | LOOKAHEAD(2)
          // a tag start that is not followed by a tag name is kept as text;
          // the image of the start token is used so that a prefix such as
          // "<ns:" is not reduced to "<"
          start=<TAG_START> text=<LST_ERROR>
                             { return new XmlDocument.Text(start.image + text.image); }
        | text = <PCDATA>    { return new XmlDocument.Text(text.image); }
        | <EOL>              { return new XmlDocument.Newline(); }
      )
  }
  /**
   * An attribute name, optionally followed by "=" and a value. A value that
   * is not properly quoted is still accepted, but recorded as a parse error.
   * @return the parsed attribute, or null if a parse error was recorded
   * while reading it
   */
  XmlDocument.Attribute Attribute() :
  {
      XmlDocument.Attribute attr;
      Token name, value = null;
  }
  {
      try {
          name=<ATTR_NAME> [ <ATTR_EQ> value=<ATTR_VAL> ]
          {
              if (value != null) {
                  attr = new XmlDocument.Attribute(name.image, value.image);
                  attr.setStartLocation(name.beginLine, name.beginColumn);
                  attr.setValueStartLocation(value.beginLine, value.beginColumn);
                  attr.setEndLocation(value.endLine, value.endColumn + 1);
                  if (!isProperAttribute(value.image)) {
                      ParseException improper = new ParseException("Parse error at line " + value.beginLine + ", column " + value.beginColumn + ". Attribute is improperly quoted." );
                      addException(improper);
                  }
                  return attr;
              }
              // name only, no value
              attr = new XmlDocument.Attribute(name.image);
              attr.setStartLocation(name.beginLine, name.beginColumn);
              attr.setEndLocation(name.endLine, name.endColumn + 1);
              return attr;
          }
      }
      catch (ParseException e) {
          addException(e);
          return null;
      }
  }
  /**
   * Zero or more attributes, collected in source order.
   * @return the (possibly empty) list of parsed attributes
   */
  XmlDocument.AttributeList AttributeList() :
  {
      XmlDocument.AttributeList alist = new XmlDocument.AttributeList();
      XmlDocument.Attribute a;
  }
  {
      (
          a=Attribute()
          {
              // Attribute() returns null after a recovered parse error; the
              // error has already been recorded, so the entry is skipped
              if (a != null) {
                  alist.addAttribute(a);
              }
          }
      )*
      {
          return alist;
      }
  }
  /**
   * A start tag or an empty tag: tag start, name, attributes, then ">" or "/>".
   * @return the parsed tag; after a parse error, the text consumed so far
   * as a Text element
   */
  XmlDocument.XmlElement Tag() :
  {
      Token name, close;
      XmlDocument.AttributeList attributes;
      Token firstToken = getToken(1);
      Token open = null;
  }
  {
      try {
          open=<TAG_START> name=<TAG_NAME> attributes=AttributeList()
          ( close=<TAG_END> | close=<TAG_SLASHEND> )
          {
              // a start token of the form "<prefix:" carries the first part
              // of the tag name
              boolean prefixed = open.image.startsWith("<") && open.image.endsWith(":");
              String qualifiedName = prefixed
                  ? open.image.substring(1) + name.image
                  : name.image;
              XmlDocument.Tag tag = new XmlDocument.Tag("<", qualifiedName, attributes, close.image);
              if (close.kind == TAG_SLASHEND) {
                  tag.setEmpty(true);
              }
              tag.setStartLocation(open.beginLine, open.beginColumn);
              tag.setEndLocation(close.endLine, close.endColumn + 1);
              return tag;
          }
      }
      catch (ParseException ex) {
          // record the error, resume lexing in the DEFAULT state and keep
          // everything read so far, including the next token, as text
          addException(ex);
          token_source.SwitchTo(DEFAULT);
          return new XmlDocument.Text(getTokenText(firstToken, getNextToken()));
      }
  }
  /**
   * An end tag: end tag start, name, then ">".
   * @return the parsed end tag; after a parse error, the text consumed so
   * far as a Text element
   */
  XmlDocument.XmlElement EndTag() :
  {
      Token name;
      Token firstToken = getToken(1);
      Token open, close;
  }
  {
      try {
          open=<ENDTAG_START> name=<TAG_NAME> close=<TAG_END>
          {
              // a start token of the form "</prefix:" carries the first part
              // of the tag name
              boolean prefixed = open.image.startsWith("</") && open.image.endsWith(":");
              String qualifiedName = prefixed
                  ? open.image.substring(2) + name.image
                  : name.image;
              XmlDocument.EndTag endTag = new XmlDocument.EndTag(qualifiedName);
              endTag.setStartLocation(open.beginLine, open.beginColumn);
              endTag.setEndLocation(close.endLine, close.endColumn + 1);
              return endTag;
          }
      }
      catch (ParseException ex) {
          // record the error, resume lexing in the DEFAULT state and keep
          // everything read so far, including the next token, as text
          addException(ex);
          token_source.SwitchTo(DEFAULT);
          return new XmlDocument.Text(getTokenText(firstToken, getNextToken()));
      }
  }
  /**
   * A comment: comment start, any number of dashes, line breaks and words,
   * then the comment end or the end of the input. Line breaks inside the
   * comment are replaced by NL.
   * @return the comment, delimiters included; after a parse error, a comment
   * holding the text collected up to the error
   */
  XmlDocument.Comment CommentTag() :
  {
      Token t, comment_start = null, comment_end = null;
      StringBuilder s = new StringBuilder();
  }
  {
      try {
          comment_start=<COMMENT_START>
          (   t=<DASH>           { s.append(t.image); }
            | <COMMENT_EOL>      { s.append(NL); }
            | t=<COMMENT_WORD>   { s.append(t.image); }
          )*
          ( <EOF> | comment_end=<COMMENT_END> )
          {
              return new XmlDocument.Comment(comment_start.image + s.toString() + (comment_end == null ? "" : comment_end.image));
          }
      }
      catch (ParseException e) {
          // Without a return here the generated method would end in
          // "throw new Error(...)" and abort the whole parse. As in Tag(),
          // record the error, resume lexing in the DEFAULT state and skip
          // the offending token.
          addException(e);
          token_source.SwitchTo(DEFAULT);
          getNextToken();
          return new XmlDocument.Comment((comment_start == null ? "" : comment_start.image) + s.toString());
      }
  }
  /**
   * A declaration: "<!", its content, then ">".
   * @return a comment holding the content of the declaration, without the
   * delimiters; after a parse error, a comment holding the content read up
   * to the error (possibly empty)
   */
  XmlDocument.Comment DeclTag() :
  {
      Token t = null;
  }
  {
      try {
          <DECL_START> t=<DECL_ANY> <DECL_END>
          {
              return new XmlDocument.Comment(t.image);
          }
      }
      catch (ParseException e) {
          // Without a return here the generated method would end in
          // "throw new Error(...)" and abort the whole parse, e.g. on "<!>"
          // or on "<!" at the end of the input. As in Tag(), record the
          // error, resume lexing in the DEFAULT state and skip the
          // offending token.
          addException(e);
          token_source.SwitchTo(DEFAULT);
          getNextToken();
          return new XmlDocument.Comment(t == null ? "" : t.image);
      }
  }