PageRenderTime 48ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/projects/jena-2.6.3/com/hp/hpl/jena/graph/query/regexptrees/PerlPatternParser.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 412 lines | 305 code | 22 blank | 85 comment | 59 complexity | 02b6674ec1c23b0c8ed78e4a5b80fbdf MD5 | raw file
  1. /*
  2. (c) Copyright 2004, 2005, 2006, 2007, 2008, 2009 Hewlett-Packard Development Company, LP, all rights reserved.
  3. [See end of file]
  4. $Id: PerlPatternParser.java,v 1.1 2009/06/29 08:55:51 castagna Exp $
  5. */
  6. package com.hp.hpl.jena.graph.query.regexptrees;
  7. import java.util.*;
  8. /**
  9. Parse Perl5 patterns into RegexpTree structures, or throw an exception for
  10. cases that haven't been implemented.
  11. @author hedgehog
  12. */
  13. public class PerlPatternParser
  14. {
  15. /**
  16. The string being parsed, as supplied to the constructor(s).
  17. */
  18. protected final String toParse;
  19. /**
  20. The index into the string of the next undealt-with character, ie, it starts at 0.
  21. */
  22. protected int pointer;
  23. /**
  24. The length of the string to parse, used as a limit.
  25. */
  26. protected final int limit;
  27. /**
  28. The generator for the RegexpTree nodes to be used in the parse.
  29. */
  30. protected RegexpTreeGenerator generator;
  31. /**
  32. Count of how many back-references match-points seen so far.
  33. */
  34. protected int matchPointsSeen;
  35. /**
  36. The digits, in order.
  37. */
  38. public static final String digits = "0123456789";
  39. /**
  40. The characters that are (non-)matchable by \w[W].
  41. */
  42. public static final String wordChars =
  43. digits
  44. + "abcdefghijklmnopqrstuvwxyz"
  45. + "_"
  46. + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  47. ;
  48. /**
  49. Initialise this parser with the string to parse and with the default
  50. generator (SimpleGenerator).
  51. */
  52. public PerlPatternParser( String toParse )
  53. { this( toParse, new SimpleGenerator() ); }
  54. /**
  55. Initialise this parser with the string to parse and with the generator to
  56. use for node construction.
  57. */
  58. public PerlPatternParser( String toParse, RegexpTreeGenerator gen )
  59. { this.toParse = toParse;
  60. this.limit = toParse.length();
  61. this.generator = gen; }
  62. /**
  63. Answer the result of parsing the given string as a sequence of alternatives.
  64. */
  65. public static RegexpTree parse( String string )
  66. { return new PerlPatternParser( string ) .parseAlts(); }
  67. /**
  68. Answer the result of parsing the given string as a sequence of alternatives,
  69. using the supplied generator for the pattern nodes.
  70. */
  71. public static RegexpTree parse( String string, RegexpTreeGenerator gen )
  72. { return new PerlPatternParser( string, gen ) .parseAlts(); }
  73. /**
  74. Exception thrown if a syntax error is detected. Further details are in the
  75. error message - it doesn't seem worth worrying about having different
  76. classes for different errors. Possibly this should be a non-static class so
  77. that it can get at the current context?
  78. */
  79. public static class SyntaxException extends RuntimeException
  80. {
  81. public SyntaxException( String message )
  82. { super( message ); }
  83. }
  84. /**
  85. Answer the string that this parser is parsing.
  86. */
  87. public String getString()
  88. { return toParse; }
  89. /**
  90. Answer the current index into the parse string.
  91. */
  92. public int getPointer()
  93. { return pointer; }
  94. /**
  95. Answer the character under the pointer, and advance the pointer.
  96. */
  97. protected char nextChar()
  98. {
  99. return toParse.charAt( pointer++ );
  100. }
  101. /**
  102. Parse a single atom and return the tree for it, advancing the pointer. This
  103. does not deal with quantifiers, for which see parseQuantifier. Unmatched
  104. right parentheses, unexpected (hence unbound) quantifiers, and those things
  105. that aren't implemented, throw exceptions. An empty atom is permitted
  106. (at the end of a string or before a |).
  107. */
  108. public RegexpTree parseAtom()
  109. {
  110. if (pointer < limit)
  111. {
  112. char ch = nextChar();
  113. switch (ch)
  114. {
  115. case '.': return generator.getAnySingle();
  116. case '^': return generator.getStartOfLine();
  117. case '$': return generator.getEndOfLine();
  118. case '|': pointer -= 1; return generator.getNothing();
  119. case '[': return parseClass();
  120. case ')': pointer -= 1; return generator.getNothing();
  121. case '(': return parseParens();
  122. case '\\': return parseBackslash();
  123. case '*':
  124. case '+':
  125. case '?':
  126. case '{': throw new PerlPatternParser.SyntaxException( "unbound quantifier " + ch );
  127. case ']':
  128. case '}':
  129. default: return generator.getText( ch );
  130. }
  131. }
  132. return generator.getNothing();
  133. }
  134. /**
  135. Parse a class expression and answer an appropriate tree.
  136. */
  137. protected RegexpTree parseClass()
  138. {
  139. StringBuffer b = new StringBuffer();
  140. boolean negated = parseClassNegation();
  141. while (true)
  142. {
  143. int ch = nextClassChar();
  144. if (ch == ']') break;
  145. if (ch == '-' && b.length() > 0)
  146. {
  147. char begin = (char) (b.charAt( b.length() - 1 ) + 1);
  148. char end = (char) Math.abs( nextClassChar() );
  149. for (char i = begin; i <= end; i += 1) b.append( i );
  150. }
  151. else
  152. b.append( (char) Math.abs( ch ) );
  153. }
  154. pointer += 1;
  155. return generator.getClass( b.toString(), negated );
  156. }
  157. /**
  158. Answer the next character, if it's suitable for part of a class expression,
  159. negated if it's been escaped. Iffy.
  160. */
  161. private int nextClassChar()
  162. {
  163. char ch = nextChar();
  164. if (ch == '\\')
  165. {
  166. RegexpTree t = parseAtom();
  167. if (t instanceof Text) return -((Text) t).getString().charAt( 0 );
  168. throw new SyntaxException( "not allowed in class" );
  169. }
  170. else
  171. return ch;
  172. }
  173. protected boolean parseClassNegation()
  174. {
  175. if (toParse.charAt( pointer ) == '^')
  176. { pointer += 1; return true; }
  177. else
  178. return false;
  179. }
  180. /**
  181. Parse a parenthesised expression. Throw a SyntaxException if the closing
  182. bracket is missing. Answer the wrapped sub-expression. Does not cater
  183. for the (? ...) stuff.
  184. */
  185. protected RegexpTree parseParens()
  186. {
  187. RegexpTree operand = parseAlts();
  188. if (pointer < limit && toParse.charAt( pointer ) == ')') pointer += 1;
  189. else throw new SyntaxException( "missing closing bracket" );
  190. matchPointsSeen += 1;
  191. return generator.getParen( operand, matchPointsSeen );
  192. }
  193. /**
  194. Parse a backslash-escape and answer the appropriate regexp tree.
  195. Unhandled escapes throw an exception.
  196. */
  197. private RegexpTree parseBackslash()
  198. {
  199. char ch = nextChar();
  200. if ("bBAZnrtfdDwWSsxc0123456789".indexOf( ch ) < 0)
  201. return generator.getText( ch );
  202. else if (ch == 'n')
  203. return generator.getText( '\n' );
  204. else if (ch == 'r')
  205. return generator.getText( '\r' );
  206. else if (ch == 'f')
  207. return generator.getText( '\f' );
  208. else if (ch == 't')
  209. return generator.getText( '\t' );
  210. else if (ch == 's')
  211. return generator.getClass( " \r\n\t\f", false );
  212. else if (ch == 'S')
  213. return generator.getClass( " \r\n\t\f", true );
  214. else if (ch == 'd')
  215. return generator.getClass( digits, false );
  216. else if (ch == 'D')
  217. return generator.getClass( digits, true );
  218. else if (ch == 'w')
  219. return generator.getClass( wordChars, false );
  220. else if (ch == 'W')
  221. return generator.getClass( wordChars, true );
  222. else if ('0' <= ch && ch <= '9')
  223. return backReferenceOrOctalChar( ch );
  224. else if (ch == 'x')
  225. return hexEscape();
  226. else if (ch == 'c')
  227. return control( nextChar() );
  228. else
  229. throw new PerlPatternParser.SyntaxException( "can't do \\" + ch + " yet" );
  230. }
  231. /**
  232. Answer a RegexpTree representing the single character which is CTRL-ch.
  233. */
  234. protected RegexpTree control( char ch )
  235. { return Text.create( (char) (ch - 'A' + 1) ); }
  236. /**
  237. Answer a RegexpTree representing the single character whose value is
  238. given by the next two hexadecimal digits.
  239. */
  240. protected RegexpTree hexEscape()
  241. {
  242. char hi = nextChar(), lo = nextChar();
  243. return Text.create( (char) (deHex( hi ) * 16 + deHex( lo )) );
  244. }
  245. /**
  246. Answer the integer value corresponding to the hex digit <code>ch</code>.
  247. */
  248. private int deHex( char ch )
  249. {
  250. if (Character.isDigit( ch )) return ch - '0';
  251. if ('a' <= ch && ch <= 'f') return 10 + ch - 'a';
  252. if ('A' <= ch && ch <= 'F') return 10 + ch - 'A';
  253. throw new SyntaxException( "'" + ch + "' is not a hex digit" );
  254. }
  255. /**
  256. Answer the backreference or octal character described by \nnnn sequences.
  257. */
  258. protected RegexpTree backReferenceOrOctalChar( char ch )
  259. {
  260. char [] chars = new char[20];
  261. int index = 0;
  262. chars[index++] = ch;
  263. while (pointer < limit)
  264. {
  265. ch = nextChar();
  266. if (!Character.isDigit( ch )) break;
  267. chars[index++] = ch;
  268. }
  269. int n = numeric( chars, 10, index );
  270. return 0 < n && n <= matchPointsSeen
  271. ? generator.getBackReference( n )
  272. : generator.getText( numeric( chars, 8, index ) );
  273. }
  274. /**
  275. Answer the numeric value represented by chars[0..limit-1] in the given base.
  276. */
  277. protected char numeric( char [] chars, int base, int limit )
  278. {
  279. int result = 0;
  280. for (int i = 0; i < limit; i += 1) result = result * base + (chars[i] - '0');
  281. return (char) result;
  282. }
  283. /**
  284. Parse any quantifier and answer the quantified version of the argument
  285. tree <code>d</code>. TODO: handle non-greedy quantifiers. (These will
  286. currently generate syntax errors when their flagging ? is encountered by
  287. parseAtom.)
  288. */
  289. public RegexpTree parseQuantifier( RegexpTree d )
  290. {
  291. if (pointer < limit)
  292. {
  293. char ch = toParse.charAt( pointer );
  294. switch (ch)
  295. {
  296. case '*':
  297. pointer += 1;
  298. return generator.getZeroOrMore( d );
  299. case '+':
  300. pointer += 1;
  301. return generator.getOneOrMore( d );
  302. case '?':
  303. pointer += 1;
  304. return generator.getOptional( d );
  305. case '{':
  306. throw new SyntaxException( "numeric quantifiers not done yet" );
  307. }
  308. }
  309. return d;
  310. }
  311. /**
  312. Parse an element (an atom and any following quantifier) and answer the
  313. possibly-quantified tree.
  314. */
  315. public RegexpTree parseElement()
  316. { return parseQuantifier( parseAtom() ); }
  317. /**
  318. Parse a sequence of elements [possibly-quantified atoms] and answer the
  319. sequence (singular sequences may be reduced to its single element).
  320. */
  321. public RegexpTree parseSeq()
  322. {
  323. List<RegexpTree> operands = new ArrayList<RegexpTree>();
  324. while (true)
  325. {
  326. RegexpTree next = parseElement();
  327. if (next.equals( generator.getNothing() ) ) break;
  328. operands.add( next );
  329. }
  330. return generator.getSequence( operands );
  331. }
  332. /**
  333. Parse an alternation of sequences and answer an alternative tree (or the
  334. single component if there is just one alternative).
  335. */
  336. public RegexpTree parseAlts()
  337. {
  338. List<RegexpTree> operands = new ArrayList<RegexpTree>();
  339. while (true)
  340. {
  341. operands.add( parseSeq() );
  342. if (pointer < limit && toParse.charAt( pointer ) == '|') pointer += 1;
  343. else break;
  344. }
  345. return generator.getAlternatives( operands );
  346. }
  347. }
  348. /*
  349. (c) Copyright 2004, 2005, 2006, 2007, 2008, 2009 Hewlett-Packard Development Company, LP
  350. All rights reserved.
  351. Redistribution and use in source and binary forms, with or without
  352. modification, are permitted provided that the following conditions
  353. are met:
  354. 1. Redistributions of source code must retain the above copyright
  355. notice, this list of conditions and the following disclaimer.
  356. 2. Redistributions in binary form must reproduce the above copyright
  357. notice, this list of conditions and the following disclaimer in the
  358. documentation and/or other materials provided with the distribution.
  359. 3. The name of the author may not be used to endorse or promote products
  360. derived from this software without specific prior written permission.
  361. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  362. IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  363. OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  364. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  365. INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  366. NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  367. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  368. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  369. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  370. THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  371. */