PageRenderTime 112ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/jEdit/tags/jedit-4-0-pre3/gnu/regexp/RE.java

#
Java | 1343 lines | 676 code | 164 blank | 503 comment | 370 complexity | 4bf34a599d21693ac6c2994c4a081c25 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * gnu/regexp/RE.java
  3. * Copyright (C) 1998-2001 Wes Biggs
  4. *
  5. * This library is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published
  7. * by the Free Software Foundation; either version 2.1 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. package gnu.regexp;
  20. import java.io.InputStream;
  21. import java.io.Reader;
  22. import java.io.Serializable;
  23. import java.util.Locale;
  24. import java.util.PropertyResourceBundle;
  25. import java.util.ResourceBundle;
  26. import java.util.Vector;
  /**
   * Mutable pair of ints. The pattern parser uses one instance to carry the
   * lower and upper bound of an interval operator ({x,y}).
   */
  class IntPair implements Serializable {
    /** First value of the pair. */
    public int first;

    /** Second value of the pair. */
    public int second;
  }
  /**
   * One parsed unit of a pattern: a character plus a flag recording whether
   * it was preceded by a backslash.
   */
  class CharUnit implements Serializable {
    /** The character itself (the one after the backslash when escaped). */
    public char ch;

    /** True when the character was backslash-escaped in the pattern. */
    public boolean bk;
  }
  34. /**
  35. * RE provides the user interface for compiling and matching regular
  36. * expressions.
  37. * <P>
  38. * A regular expression object (class RE) is compiled by constructing it
  39. * from a String, StringBuffer or character array, with optional
  40. * compilation flags (below)
  41. * and an optional syntax specification (see RESyntax; if not specified,
  42. * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  43. * <P>
  44. * Various methods attempt to match input text against a compiled
  45. * regular expression. These methods are:
  46. * <LI><code>isMatch</code>: returns true if the input text in its entirety
  47. * matches the regular expression pattern.
  48. * <LI><code>getMatch</code>: returns the first match found in the input text,
  49. * or null if no match is found.
  50. * <LI><code>getAllMatches</code>: returns an array of all non-overlapping
  51. * matches found in the input text. If no matches are found, the array is
  52. * zero-length.
  53. * <LI><code>substitute</code>: substitute the first occurrence of the pattern
  54. * in the input text with a replacement string (which may include
  55. * metacharacters $0-$9, see REMatch.substituteInto).
  56. * <LI><code>substituteAll</code>: same as above, but repeat for each match
  57. * before returning.
  58. * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
  59. * that allows iteration over the matches (see REMatchEnumeration for some
  60. * reasons why you may want to do this instead of using <code>getAllMatches</code>).
  61. * <P>
  62. *
  63. * These methods all have similar argument lists. The input can be a
  64. * String, a character array, a StringBuffer, a Reader or an
  65. * InputStream of some sort. Note that when using a Reader or
  66. * InputStream, the stream read position cannot be guaranteed after
  67. * attempting a match (this is not a bug, but a consequence of the way
  68. * regular expressions work). Using an REMatchEnumeration can
  69. * eliminate most positioning problems.
  70. *
  71. * <P>
  72. *
  73. * The optional index argument specifies the offset from the beginning
  74. * of the text at which the search should start (see the descriptions
  75. * of some of the execution flags for how this can affect positional
  76. * pattern operators). For a Reader or InputStream, this means an
  77. * offset from the current read position, so subsequent calls with the
  78. * same index argument on a Reader or an InputStream will not
  79. * necessarily access the same position on the stream, whereas
  80. * repeated searches at a given index in a fixed string will return
  81. * consistent results.
  82. *
  83. * <P>
  84. * You can optionally affect the execution environment by using a
  85. * combination of execution flags (constants listed below).
  86. *
  87. * <P>
  88. * All operations on a regular expression are performed in a
  89. * thread-safe manner.
  90. *
  91. * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  92. * @version 1.1.4-dev, to be released
  93. */
  94. public class RE extends REToken {
  95. // This String will be returned by getVersion()
  96. private static final String VERSION = "1.1.4-dev";
  97. // The localized strings are kept in a separate file
  98. private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
  99. // These are, respectively, the first and last tokens in our linked list
  100. // If there is only one token, firstToken == lastToken
  101. private REToken firstToken, lastToken;
  102. // This is the number of subexpressions in this regular expression,
  103. // with a minimum value of zero. Returned by getNumSubs()
  104. private int numSubs;
  105. /** Minimum length, in characters, of any possible match. */
  106. private int minimumLength;
  107. /**
  108. * Compilation flag. Do not differentiate case. Subsequent
  109. * searches using this RE will be case insensitive.
  110. */
  111. public static final int REG_ICASE = 2;
  112. /**
  113. * Compilation flag. The match-any-character operator (dot)
  114. * will match a newline character. When set this overrides the syntax
  115. * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
  116. * the "/s" operator in Perl.
  117. */
  118. public static final int REG_DOT_NEWLINE = 4;
  119. /**
  120. * Compilation flag. Use multiline mode. In this mode, the ^ and $
  121. * anchors will match based on newlines within the input. This is
  122. * equivalent to the "/m" operator in Perl.
  123. */
  124. public static final int REG_MULTILINE = 8;
  125. /**
  126. * Execution flag.
  127. * The match-beginning operator (^) will not match at the beginning
  128. * of the input string. Useful for matching on a substring when you
  129. * know the context of the input is such that position zero of the
  130. * input to the match test is not actually position zero of the text.
  131. * <P>
  132. * This example demonstrates the results of various ways of matching on
  133. * a substring.
  134. * <P>
  135. * <CODE>
  136. * String s = "food bar fool";<BR>
  137. * RE exp = new RE("^foo.");<BR>
  138. * REMatch m0 = exp.getMatch(s);<BR>
  139. * REMatch m1 = exp.getMatch(s.substring(8));<BR>
  140. * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
  141. * REMatch m3 = exp.getMatch(s,8); <BR>
  142. * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
  143. * <P>
  144. * // Results:<BR>
  145. * // m0 = "food"<BR>
  146. * // m1 = "fool"<BR>
  147. * // m2 = null<BR>
  148. * // m3 = null<BR>
  149. * // m4 = "fool"<BR>
  150. * </CODE>
  151. */
  152. public static final int REG_NOTBOL = 16;
  153. /**
  154. * Execution flag.
  155. * The match-end operator ($) does not match at the end
  156. * of the input string. Useful for matching on substrings.
  157. */
  158. public static final int REG_NOTEOL = 32;
  159. /**
  160. * Execution flag.
  161. * When a match method is invoked that starts matching at a non-zero
  162. * index into the input, treat the input as if it begins at the index
  163. * given. The effect of this flag is that the engine does not "see"
  164. * any text in the input before the given index. This is useful so
  165. * that the match-beginning operator (^) matches not at position 0
  166. * in the input string, but at the position the search started at
  167. * (based on the index input given to the getMatch function). See
  168. * the example under REG_NOTBOL. It also affects the use of the \&lt;
  169. * and \b operators.
  170. */
  171. public static final int REG_ANCHORINDEX = 64;
  172. /**
  173. * Execution flag.
  174. * The substitute and substituteAll methods will not attempt to
  175. * interpolate occurrences of $1-$9 in the replacement text with
  176. * the corresponding subexpressions. For example, you may want to
  177. * replace all matches of "one dollar" with "$1".
  178. */
  179. public static final int REG_NO_INTERPOLATE = 128;
  180. /** Returns a string representing the version of the gnu.regexp package. */
  181. public static final String version() {
  182. return VERSION;
  183. }
  184. // Retrieves a message from the ResourceBundle
  185. static final String getLocalizedMessage(String key) {
  186. return messages.getString(key);
  187. }
  /**
   * Compiles a pattern with no compilation flags and the default syntax
   * (RESyntax.RE_SYNTAX_PERL5).
   *
   * @param pattern The regular expression, given as a String, StringBuffer
   *   or char[]. Any other object is converted with its toString() method.
   * @exception REException The input pattern could not be parsed.
   * @exception NullPointerException The pattern was null.
   */
  public RE(Object pattern) throws REException {
    this(pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
  }
  /**
   * Compiles a pattern with the given compilation flags and the default
   * syntax (RESyntax.RE_SYNTAX_PERL5).
   *
   * @param pattern The regular expression, given as a String, StringBuffer
   *   or char[]. Any other object is converted with its toString() method.
   * @param cflags The logical OR of any combination of the compilation
   *   flags defined in this class.
   * @exception REException The input pattern could not be parsed.
   * @exception NullPointerException The pattern was null.
   */
  public RE(Object pattern, int cflags) throws REException {
    this(pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
  }
  /**
   * Compiles a pattern with the given compilation flags and regular
   * expression syntax.
   *
   * @param pattern The regular expression, given as a String, StringBuffer
   *   or char[]. Any other object is converted with its toString() method.
   * @param cflags The logical OR of any combination of the compilation
   *   flags defined in this class.
   * @param syntax The type of regular expression syntax to use.
   * @exception REException The input pattern could not be parsed.
   * @exception NullPointerException The pattern was null.
   */
  public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
    this(pattern, cflags, syntax, 0, 0);
  }
  // Internal constructor used for alternation: wraps an already parsed run
  // of tokens as one branch and terminates it with an end-of-subexpression
  // marker.
  private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
    super(subIndex);
    this.numSubs = subs;
    this.minimumLength = minLength;
    this.firstToken = first;
    this.lastToken = last;
    // Appended last, once the existing token list is in place.
    addToken(new RETokenEndSub(subIndex));
  }
  // Common target of the public constructors; also used to compile nested
  // subexpressions. myIndex is the subexpression index of this token.
  private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
    super(myIndex);
    initialize(patternObj, cflags, syntax, myIndex, nextSub);
  }
  // Subclass hook: creates an RE with subexpression index 0 and no tokens.
  protected RE() {
    super(0);
  }
  245. // The meat of construction
  246. protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
  247. char[] pattern;
  248. if (patternObj instanceof String) {
  249. pattern = ((String) patternObj).toCharArray();
  250. } else if (patternObj instanceof char[]) {
  251. pattern = (char[]) patternObj;
  252. } else if (patternObj instanceof StringBuffer) {
  253. pattern = new char [((StringBuffer) patternObj).length()];
  254. ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
  255. } else {
  256. pattern = patternObj.toString().toCharArray();
  257. }
  258. int pLength = pattern.length;
  259. numSubs = 0; // Number of subexpressions in this token.
  260. Vector branches = null;
  261. // linked list of tokens (sort of -- some closed loops can exist)
  262. firstToken = lastToken = null;
  263. // Precalculate these so we don't pay for the math every time we
  264. // need to access them.
  265. boolean insens = ((cflags & REG_ICASE) > 0);
  266. // Parse pattern into tokens. Does anyone know if it's more efficient
  267. // to use char[] than a String.charAt()? I'm assuming so.
  268. // index tracks the position in the char array
  269. int index = 0;
  270. // this will be the current parse character (pattern[index])
  271. CharUnit unit = new CharUnit();
  272. // This is used for {x,y} calculations
  273. IntPair minMax = new IntPair();
  274. // Buffer a token so we can create a TokenRepeated, etc.
  275. REToken currentToken = null;
  276. char ch;
  277. while (index < pLength) {
  278. // read the next character unit (including backslash escapes)
  279. index = getCharUnit(pattern,index,unit);
  280. // ALTERNATION OPERATOR
  281. // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
  282. // not available if RE_LIMITED_OPS is set
  283. // TODO: the '\n' literal here should be a test against REToken.newline,
  284. // which unfortunately may be more than a single character.
  285. if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
  286. || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
  287. && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
  288. // make everything up to here be a branch. create vector if nec.
  289. addToken(currentToken);
  290. RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
  291. minimumLength = 0;
  292. if (branches == null) {
  293. branches = new Vector();
  294. }
  295. branches.addElement(theBranch);
  296. firstToken = lastToken = currentToken = null;
  297. }
  298. // INTERVAL OPERATOR:
  299. // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
  300. // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
  301. //
  302. // OPEN QUESTION:
  303. // what is proper interpretation of '{' at start of string?
  304. else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
  305. int newIndex = getMinMax(pattern,index,minMax,syntax);
  306. if (newIndex > index) {
  307. if (minMax.first > minMax.second)
  308. throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
  309. if (currentToken == null)
  310. throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
  311. if (currentToken instanceof RETokenRepeated)
  312. throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
  313. if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
  314. throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
  315. if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
  316. throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex);
  317. index = newIndex;
  318. currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
  319. }
  320. else {
  321. addToken(currentToken);
  322. currentToken = new RETokenChar(subIndex,unit.ch,insens);
  323. }
  324. }
  325. // LIST OPERATOR:
  326. // [...] | [^...]
  327. else if ((unit.ch == '[') && !unit.bk) {
  328. Vector options = new Vector();
  329. boolean negative = false;
  330. char lastChar = 0;
  331. if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
  332. // Check for initial caret, negation
  333. if ((ch = pattern[index]) == '^') {
  334. negative = true;
  335. if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
  336. ch = pattern[index];
  337. }
  338. // Check for leading right bracket literal
  339. if (ch == ']') {
  340. lastChar = ch;
  341. if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
  342. }
  343. while ((ch = pattern[index++]) != ']') {
  344. if ((ch == '-') && (lastChar != 0)) {
  345. if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
  346. if ((ch = pattern[index]) == ']') {
  347. options.addElement(new RETokenChar(subIndex,lastChar,insens));
  348. lastChar = '-';
  349. } else {
  350. options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
  351. lastChar = 0;
  352. index++;
  353. }
  354. } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
  355. if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
  356. int posixID = -1;
  357. boolean negate = false;
  358. char asciiEsc = 0;
  359. if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
  360. switch (pattern[index]) {
  361. case 'D':
  362. negate = true;
  363. case 'd':
  364. posixID = RETokenPOSIX.DIGIT;
  365. break;
  366. case 'S':
  367. negate = true;
  368. case 's':
  369. posixID = RETokenPOSIX.SPACE;
  370. break;
  371. case 'W':
  372. negate = true;
  373. case 'w':
  374. posixID = RETokenPOSIX.ALNUM;
  375. break;
  376. }
  377. }
  378. else if ("nrt".indexOf(pattern[index]) != -1) {
  379. switch (pattern[index]) {
  380. case 'n':
  381. asciiEsc = '\n';
  382. break;
  383. case 't':
  384. asciiEsc = '\t';
  385. break;
  386. case 'r':
  387. asciiEsc = '\r';
  388. break;
  389. }
  390. }
  391. if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
  392. if (posixID != -1) {
  393. options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
  394. } else if (asciiEsc != 0) {
  395. lastChar = asciiEsc;
  396. } else {
  397. lastChar = pattern[index];
  398. }
  399. ++index;
  400. } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
  401. StringBuffer posixSet = new StringBuffer();
  402. index = getPosixSet(pattern,index+1,posixSet);
  403. int posixId = RETokenPOSIX.intValue(posixSet.toString());
  404. if (posixId != -1)
  405. options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
  406. } else {
  407. if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
  408. lastChar = ch;
  409. }
  410. if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
  411. } // while in list
  412. // Out of list, index is one past ']'
  413. if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
  414. // Create a new RETokenOneOf
  415. addToken(currentToken);
  416. options.trimToSize();
  417. currentToken = new RETokenOneOf(subIndex,options,negative);
  418. }
  419. // SUBEXPRESSIONS
  420. // (...) | \(...\) depending on RE_NO_BK_PARENS
  421. else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
  422. boolean pure = false;
  423. boolean comment = false;
  424. boolean lookAhead = false;
  425. boolean negativelh = false;
  426. if ((index+1 < pLength) && (pattern[index] == '?')) {
  427. switch (pattern[index+1]) {
  428. case '!':
  429. if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
  430. pure = true;
  431. negativelh = true;
  432. lookAhead = true;
  433. index += 2;
  434. }
  435. break;
  436. case '=':
  437. if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
  438. pure = true;
  439. lookAhead = true;
  440. index += 2;
  441. }
  442. break;
  443. case ':':
  444. if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
  445. pure = true;
  446. index += 2;
  447. }
  448. break;
  449. case '#':
  450. if (syntax.get(RESyntax.RE_COMMENTS)) {
  451. comment = true;
  452. }
  453. break;
  454. default:
  455. throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
  456. }
  457. }
  458. if (index >= pLength) {
  459. throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
  460. }
  461. // find end of subexpression
  462. int endIndex = index;
  463. int nextIndex = index;
  464. int nested = 0;
  465. while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
  466. && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
  467. if ((endIndex = nextIndex) >= pLength)
  468. throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
  469. else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
  470. nested++;
  471. else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
  472. nested--;
  473. // endIndex is now position at a ')','\)'
  474. // nextIndex is end of string or position after ')' or '\)'
  475. if (comment) index = nextIndex;
  476. else { // not a comment
  477. // create RE subexpression as token.
  478. addToken(currentToken);
  479. if (!pure) {
  480. numSubs++;
  481. }
  482. int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
  483. currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
  484. numSubs += ((RE) currentToken).getNumSubs();
  485. if (lookAhead) {
  486. currentToken = new RETokenLookAhead(currentToken,negativelh);
  487. }
  488. index = nextIndex;
  489. } // not a comment
  490. } // subexpression
  491. // UNMATCHED RIGHT PAREN
  492. // ) or \) throw exception if
  493. // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
  494. else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
  495. throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
  496. }
  497. // START OF LINE OPERATOR
  498. // ^
  499. else if ((unit.ch == '^') && !unit.bk) {
  500. addToken(currentToken);
  501. currentToken = null;
  502. addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
  503. }
  504. // END OF LINE OPERATOR
  505. // $
  506. else if ((unit.ch == '$') && !unit.bk) {
  507. addToken(currentToken);
  508. currentToken = null;
  509. addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
  510. }
  511. // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
  512. // .
  513. else if ((unit.ch == '.') && !unit.bk) {
  514. addToken(currentToken);
  515. currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
  516. }
  517. // ZERO-OR-MORE REPEAT OPERATOR
  518. // *
  519. else if ((unit.ch == '*') && !unit.bk) {
  520. if (currentToken == null)
  521. throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
  522. if (currentToken instanceof RETokenRepeated)
  523. throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
  524. if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
  525. throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
  526. if (currentToken.getMinimumLength() == 0)
  527. throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
  528. currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
  529. }
  530. // ONE-OR-MORE REPEAT OPERATOR
  531. // + | \+ depending on RE_BK_PLUS_QM
  532. // not available if RE_LIMITED_OPS is set
  533. else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
  534. if (currentToken == null)
  535. throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
  536. if (currentToken instanceof RETokenRepeated)
  537. throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
  538. if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
  539. throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
  540. if (currentToken.getMinimumLength() == 0)
  541. throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
  542. currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
  543. }
  544. // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
  545. // ? | \? depending on RE_BK_PLUS_QM
  546. // not available if RE_LIMITED_OPS is set
  547. // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
  548. else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
  549. if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
  550. // Check for stingy matching on RETokenRepeated
  551. if (currentToken instanceof RETokenRepeated) {
  552. if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy())
  553. ((RETokenRepeated)currentToken).makeStingy();
  554. else
  555. throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
  556. }
  557. else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
  558. throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
  559. else
  560. currentToken = setRepeated(currentToken,0,1,index);
  561. }
  562. // BACKREFERENCE OPERATOR
  563. // \1 \2 ... \9
  564. // not available if RE_NO_BK_REFS is set
  565. else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
  566. addToken(currentToken);
  567. currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
  568. }
  569. // START OF STRING OPERATOR
  570. // \A if RE_STRING_ANCHORS is set
  571. else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
  572. addToken(currentToken);
  573. currentToken = new RETokenStart(subIndex,null);
  574. }
  575. // WORD BREAK OPERATOR
  576. // \b if ????
  577. else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
  578. addToken(currentToken);
  579. currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
  580. }
  581. // WORD BEGIN OPERATOR
  582. // \< if ????
  583. else if (unit.bk && (unit.ch == '<')) {
  584. addToken(currentToken);
  585. currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
  586. }
  587. // WORD END OPERATOR
  588. // \> if ????
  589. else if (unit.bk && (unit.ch == '>')) {
  590. addToken(currentToken);
  591. currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
  592. }
  593. // NON-WORD BREAK OPERATOR
  594. // \B if ????
  595. else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
  596. addToken(currentToken);
  597. currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
  598. }
  599. // DIGIT OPERATOR
  600. // \d if RE_CHAR_CLASS_ESCAPES is set
  601. else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
  602. addToken(currentToken);
  603. currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
  604. }
  605. // NON-DIGIT OPERATOR
  606. // \D
  607. else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
  608. addToken(currentToken);
  609. currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
  610. }
  611. // NEWLINE ESCAPE
  612. // \n
  613. else if (unit.bk && (unit.ch == 'n')) {
  614. addToken(currentToken);
  615. currentToken = new RETokenChar(subIndex,'\n',false);
  616. }
  617. // RETURN ESCAPE
  618. // \r
  619. else if (unit.bk && (unit.ch == 'r')) {
  620. addToken(currentToken);
  621. currentToken = new RETokenChar(subIndex,'\r',false);
  622. }
  623. // WHITESPACE OPERATOR
  624. // \s if RE_CHAR_CLASS_ESCAPES is set
  625. else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
  626. addToken(currentToken);
  627. currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
  628. }
  629. // NON-WHITESPACE OPERATOR
  630. // \S
  631. else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
  632. addToken(currentToken);
  633. currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
  634. }
  635. // TAB ESCAPE
  636. // \t
  637. else if (unit.bk && (unit.ch == 't')) {
  638. addToken(currentToken);
  639. currentToken = new RETokenChar(subIndex,'\t',false);
  640. }
  641. // ALPHANUMERIC OPERATOR
  642. // \w
  643. else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
  644. addToken(currentToken);
  645. currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
  646. }
  647. // NON-ALPHANUMERIC OPERATOR
  648. // \W
  649. else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
  650. addToken(currentToken);
  651. currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
  652. }
  653. // END OF STRING OPERATOR
  654. // \Z
  655. else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
  656. addToken(currentToken);
  657. currentToken = new RETokenEnd(subIndex,null);
  658. }
  659. // NON-SPECIAL CHARACTER (or escape to make literal)
  660. // c | \* for example
  661. else { // not a special character
  662. addToken(currentToken);
  663. currentToken = new RETokenChar(subIndex,unit.ch,insens);
  664. }
  665. } // end while
  666. // Add final buffered token and an EndSub marker
  667. addToken(currentToken);
  668. if (branches != null) {
  669. branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
  670. branches.trimToSize(); // compact the Vector
  671. minimumLength = 0;
  672. firstToken = lastToken = null;
  673. addToken(new RETokenOneOf(subIndex,branches,false));
  674. }
  675. else addToken(new RETokenEndSub(subIndex));
  676. }
  677. private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
  678. unit.ch = input[index++];
  679. if (unit.bk = (unit.ch == '\\'))
  680. if (index < input.length)
  681. unit.ch = input[index++];
  682. else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
  683. return index;
  684. }
  685. /**
  686. * Checks if the regular expression matches the input in its entirety.
  687. *
  688. * @param input The input text.
  689. */
  690. public boolean isMatch(Object input) {
  691. return isMatch(input,0,0);
  692. }
  693. /**
  694. * Checks if the input string, starting from index, is an exact match of
  695. * this regular expression.
  696. *
  697. * @param input The input text.
  698. * @param index The offset index at which the search should be begin.
  699. */
  700. public boolean isMatch(Object input,int index) {
  701. return isMatch(input,index,0);
  702. }
  703. /**
  704. * Checks if the input, starting from index and using the specified
  705. * execution flags, is an exact match of this regular expression.
  706. *
  707. * @param input The input text.
  708. * @param index The offset index at which the search should be begin.
  709. * @param eflags The logical OR of any execution flags above.
  710. */
  711. public boolean isMatch(Object input,int index,int eflags) {
  712. return isMatchImpl(makeCharIndexed(input,index),index,eflags);
  713. }
  714. private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
  715. if (firstToken == null) // Trivial case
  716. return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
  717. REMatch m = new REMatch(numSubs, index, eflags);
  718. if (firstToken.match(input, m)) {
  719. while (m != null) {
  720. if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
  721. return true;
  722. }
  723. m = m.next;
  724. }
  725. }
  726. return false;
  727. }
  728. /**
  729. * Returns the maximum number of subexpressions in this regular expression.
  730. * If the expression contains branches, the value returned will be the
  731. * maximum subexpressions in any of the branches.
  732. */
  733. public int getNumSubs() {
  734. return numSubs;
  735. }
  736. // Overrides REToken.setUncle
  737. void setUncle(REToken uncle) {
  738. if (lastToken != null) {
  739. lastToken.setUncle(uncle);
  740. } else super.setUncle(uncle); // to deal with empty subexpressions
  741. }
  742. // Overrides REToken.chain
  743. boolean chain(REToken next) {
  744. super.chain(next);
  745. setUncle(next);
  746. return true;
  747. }
  748. /**
  749. * Returns the minimum number of characters that could possibly
  750. * constitute a match of this regular expression.
  751. */
  752. public int getMinimumLength() {
  753. return minimumLength;
  754. }
  755. /**
  756. * Returns an array of all matches found in the input.
  757. *
  758. * If the regular expression allows the empty string to match, it will
  759. * substitute matches at all positions except the end of the input.
  760. *
  761. * @param input The input text.
  762. * @return a non-null (but possibly zero-length) array of matches
  763. */
  764. public REMatch[] getAllMatches(Object input) {
  765. return getAllMatches(input,0,0);
  766. }
  767. /**
  768. * Returns an array of all matches found in the input,
  769. * beginning at the specified index position.
  770. *
  771. * If the regular expression allows the empty string to match, it will
  772. * substitute matches at all positions except the end of the input.
  773. *
  774. * @param input The input text.
  775. * @param index The offset index at which the search should be begin.
  776. * @return a non-null (but possibly zero-length) array of matches
  777. */
  778. public REMatch[] getAllMatches(Object input, int index) {
  779. return getAllMatches(input,index,0);
  780. }
  781. /**
  782. * Returns an array of all matches found in the input string,
  783. * beginning at the specified index position and using the specified
  784. * execution flags.
  785. *
  786. * If the regular expression allows the empty string to match, it will
  787. * substitute matches at all positions except the end of the input.
  788. *
  789. * @param input The input text.
  790. * @param index The offset index at which the search should be begin.
  791. * @param eflags The logical OR of any execution flags above.
  792. * @return a non-null (but possibly zero-length) array of matches
  793. */
  794. public REMatch[] getAllMatches(Object input, int index, int eflags) {
  795. return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
  796. }
  797. // this has been changed since 1.03 to be non-overlapping matches
  798. private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
  799. Vector all = new Vector();
  800. REMatch m = null;
  801. while ((m = getMatchImpl(input,index,eflags,null)) != null) {
  802. all.addElement(m);
  803. index = m.getEndIndex();
  804. if (m.end[0] == 0) { // handle pathological case of zero-length match
  805. index++;
  806. input.move(1);
  807. } else {
  808. input.move(m.end[0]);
  809. }
  810. if (!input.isValid()) break;
  811. }
  812. REMatch[] mset = new REMatch[all.size()];
  813. all.copyInto(mset);
  814. return mset;
  815. }
  816. /* Implements abstract method REToken.match() */
  817. boolean match(CharIndexed input, REMatch mymatch) {
  818. if (firstToken == null) return next(input, mymatch);
  819. // Note the start of this subexpression
  820. mymatch.start[subIndex] = mymatch.index;
  821. return firstToken.match(input, mymatch);
  822. }
  823. /**
  824. * Returns the first match found in the input. If no match is found,
  825. * null is returned.
  826. *
  827. * @param input The input text.
  828. * @return An REMatch instance referencing the match, or null if none.
  829. */
  830. public REMatch getMatch(Object input) {
  831. return getMatch(input,0,0);
  832. }
  833. /**
  834. * Returns the first match found in the input, beginning
  835. * the search at the specified index. If no match is found,
  836. * returns null.
  837. *
  838. * @param input The input text.
  839. * @param index The offset within the text to begin looking for a match.
  840. * @return An REMatch instance referencing the match, or null if none.
  841. */
  842. public REMatch getMatch(Object input, int index) {
  843. return getMatch(input,index,0);
  844. }
  845. /**
  846. * Returns the first match found in the input, beginning
  847. * the search at the specified index, and using the specified
  848. * execution flags. If no match is found, returns null.
  849. *
  850. * @param input The input text.
  851. * @param index The offset index at which the search should be begin.
  852. * @param eflags The logical OR of any execution flags above.
  853. * @return An REMatch instance referencing the match, or null if none.
  854. */
  855. public REMatch getMatch(Object input, int index, int eflags) {
  856. return getMatch(input,index,eflags,null);
  857. }
  858. /**
  859. * Returns the first match found in the input, beginning the search
  860. * at the specified index, and using the specified execution flags.
  861. * If no match is found, returns null. If a StringBuffer is
  862. * provided and is non-null, the contents of the input text from the
  863. * index to the beginning of the match (or to the end of the input,
  864. * if there is no match) are appended to the StringBuffer.
  865. *
  866. * @param input The input text.
  867. * @param index The offset index at which the search should be begin.
  868. * @param eflags The logical OR of any execution flags above.
  869. * @param buffer The StringBuffer to save pre-match text in.
  870. * @return An REMatch instance referencing the match, or null if none. */
  871. public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
  872. return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
  873. }
  874. REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
  875. // Create a new REMatch to hold results
  876. REMatch mymatch = new REMatch(numSubs, anchor, eflags);
  877. do {
  878. // Optimization: check if anchor + minimumLength > length
  879. if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
  880. if (match(input, mymatch)) {
  881. // Find longest match of them all to observe leftmost longest
  882. REMatch longest = mymatch;
  883. while ((mymatch = mymatch.next) != null) {
  884. if (mymatch.index > longest.index) {
  885. longest = mymatch;
  886. }
  887. }
  888. longest.end[0] = longest.index;
  889. longest.finish(input);
  890. return longest;
  891. }
  892. }
  893. mymatch.clear(++anchor);
  894. // Append character to buffer if needed
  895. if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
  896. buffer.append(input.charAt(0));
  897. }
  898. } while (input.move(1));
  899. return null;
  900. }
  901. /**
  902. * Returns an REMatchEnumeration that can be used to iterate over the
  903. * matches found in the input text.
  904. *
  905. * @param input The input text.
  906. * @return A non-null REMatchEnumeration instance.
  907. */
  908. public REMatchEnumeration getMatchEnumeration(Object input) {
  909. return getMatchEnumeration(input,0,0);
  910. }
  911. /**
  912. * Returns an REMatchEnumeration that can be used to iterate over the
  913. * matches found in the input text.
  914. *
  915. * @param input The input text.
  916. * @param index The offset index at which the search should be begin.
  917. * @return A non-null REMatchEnumeration instance, with its input cursor
  918. * set to the index position specified.
  919. */
  920. public REMatchEnumeration getMatchEnumeration(Object input, int index) {
  921. return getMatchEnumeration(input,index,0);
  922. }
  923. /**
  924. * Returns an REMatchEnumeration that can be used to iterate over the
  925. * matches found in the input text.
  926. *
  927. * @param input The input text.
  928. * @param index The offset index at which the search should be begin.
  929. * @param eflags The logical OR of any execution flags above.
  930. * @return A non-null REMatchEnumeration instance, with its input cursor
  931. * set to the index position specified.
  932. */
  933. public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
  934. return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
  935. }
  936. /**
  937. * Substitutes the replacement text for the first match found in the input.
  938. *
  939. * @param input The input text.
  940. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
  941. * @return A String interpolating the substituted text.
  942. * @see REMatch#substituteInto
  943. */
  944. public String substitute(Object input,String replace) {
  945. return substitute(input,replace,0,0);
  946. }
  947. /**
  948. * Substitutes the replacement text for the first match found in the input
  949. * beginning at the specified index position. Specifying an index
  950. * effectively causes the regular expression engine to throw away the
  951. * specified number of characters.
  952. *
  953. * @param input The input text.
  954. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
  955. * @param index The offset index at which the search should be begin.
  956. * @return A String containing the substring of the input, starting
  957. * at the index position, and interpolating the substituted text.
  958. * @see REMatch#substituteInto
  959. */
  960. public String substitute(Object input,String replace,int index) {
  961. return substitute(input,replace,index,0);
  962. }
  963. /**
  964. * Substitutes the replacement text for the first match found in the input
  965. * string, beginning at the specified index position and using the
  966. * specified execution flags.
  967. *
  968. * @param input The input text.
  969. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
  970. * @param index The offset index at which the search should be begin.
  971. * @param eflags The logical OR of any execution flags above.
  972. * @return A String containing the substring of the input, starting
  973. * at the index position, and interpolating the substituted text.
  974. * @see REMatch#substituteInto
  975. */
  976. public String substitute(Object input,String replace,int index,int eflags) {
  977. return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
  978. }
  979. private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
  980. StringBuffer buffer = new StringBuffer();
  981. REMatch m = getMatchImpl(input,index,eflags,buffer);
  982. if (m==null) return buffer.toString();
  983. buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
  984. replace : m.substituteInto(replace) );
  985. if (input.move(m.end[0])) {
  986. do {
  987. buffer.append(input.charAt(0));
  988. } while (input.move(1));
  989. }
  990. return buffer.toString();
  991. }
  992. /**
  993. * Substitutes the replacement text for each non-overlapping match found
  994. * in the input text.
  995. *
  996. * @param input The input text.
  997. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
  998. * @return A String interpolating the substituted text.
  999. * @see REMatch#substituteInto
  1000. */
  1001. public String substituteAll(Object input,String replace) {
  1002. return substituteAll(input,replace,0,0);
  1003. }
  1004. /**
  1005. * Substitutes the replacement text for each non-overlapping match found
  1006. * in the input text, starting at the specified index.
  1007. *
  1008. * If the regular expression allows the empty string to match, it will
  1009. * substitute matches at all positions except the end of the input.
  1010. *
  1011. * @param input The input text.
  1012. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
  1013. * @param index The offset index at which the search should be begin.
  1014. * @return A String containing the substring of the input, starting
  1015. * at the index position, and interpolating the substituted text.
  1016. * @see REMatch#substituteInto
  1017. */
  1018. public String substituteAll(Object input,String replace,int index) {
  1019. return substituteAll(input,replace,index,0);
  1020. }
  1021. /**
  1022. * Substitutes the replacement text for each non-overlapping match found
  1023. * in the input text, starting at the specified index and using the
  1024. * specified execution flags.
  1025. *
  1026. * @param input The input text.
  1027. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
  1028. * @param index The offset index at which the search should be begin.
  1029. * @param eflags The logical OR of any execution flags above.
  1030. * @return A String containing the substring of the input, starting
  1031. * at the index position, and interpolating the substituted text.
  1032. * @see REMatch#substituteInto
  1033. */
  1034. public String substituteAll(Object input,String replace,int index,int eflags) {
  1035. return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
  1036. }
  1037. private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
  1038. StringBuffer buffer = new StringBuffer();
  1039. REMatch m;
  1040. while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
  1041. buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
  1042. replace : m.substituteInto(replace) );
  1043. index = m.getEndIndex();
  1044. if (m.end[0] == 0) {
  1045. char ch = input.charAt(0);
  1046. if (ch != CharIndexed.OUT_OF_BOUNDS)
  1047. buffer.append(ch);
  1048. input.move(1);
  1049. } else {
  1050. input.move(m.end[0]);
  1051. }
  1052. if (!input.isValid()) break;
  1053. }
  1054. return buffer.toString();
  1055. }
  1056. /* Helper function for constructor */
  1057. private void addToken(REToken next) {
  1058. if (next == null) return;
  1059. minimumLength += next.getMinimumLength();
  1060. if (firstToken == null) {
  1061. lastToken = firstToken = next;
  1062. } else {
  1063. // if chain returns false, it "rejected" the token due to
  1064. // an optimization, and next was combined with lastToken
  1065. if (lastToken.chain(next)) {
  1066. lastToken = next;
  1067. }
  1068. }
  1069. }
  1070. private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
  1071. if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
  1072. return new RETokenRepeated(current.subIndex,current,min,max);
  1073. }
  1074. private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
  1075. // Precondition: pattern[index-1] == ':'
  1076. // we will return pos of closing ']'.
  1077. int i;
  1078. for (i=index; i<(pattern.length-1); i++) {
  1079. if ((pattern[i] == ':') && (pattern[i+1] == ']'))
  1080. return i+2;
  1081. buf.append(pattern[i]);
  1082. }
  1083. return index; // didn't match up
  1084. }
  1085. private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
  1086. // Precondition: input[index-1] == '{', minMax != null
  1087. boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
  1088. int startIndex = index;
  1089. if (index == input.length) {
  1090. if (mustMatch)
  1091. throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
  1092. else
  1093. return startIndex;
  1094. }
  1095. int min,max=0;
  1096. CharUnit unit = new CharUnit();
  1097. StringBuffer buf = new StringBuffer();
  1098. // Read string of digits
  1099. do {
  1100. index = getCharUnit(input,index,unit);
  1101. if (Character.isDigit(unit.ch))
  1102. buf.append(unit.ch);
  1103. } while ((index != input.length) && Character.isDigit(unit.ch));
  1104. // Check for {} tomfoolery
  1105. if (buf.length() == 0) {
  1106. if (mustMatch)
  1107. throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
  1108. else
  1109. return startIndex;
  1110. }
  1111. min = Integer.parseInt(buf.toString());
  1112. if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
  1113. max = min;
  1114. else if (index == input.length)
  1115. if (mustMatch)
  1116. throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
  1117. else
  1118. return startIndex;
  1119. else if ((unit.ch == ',') && !unit.bk) {
  1120. buf = new StringBuffer();
  1121. // Read string of digits
  1122. while (((index = getCharUnit(input,index,unit)) != input.length) && Character.isDigit(unit.ch))
  1123. buf.append(unit.ch);
  1124. if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
  1125. if (mustMatch)
  1126. throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
  1127. else
  1128. return startIndex;
  1129. // This is the case of {x,}
  1130. if (buf.length() == 0) max = Integer.MAX_VALUE;
  1131. else max = Integer.parseInt(buf.toString());
  1132. } else
  1133. if (mustMatch)
  1134. throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
  1135. else
  1136. return startIndex;
  1137. // We know min and max now, and they are valid.
  1138. minMax.first = min;
  1139. minMax.second = max;
  1140. // return the index following the '}'
  1141. return index;
  1142. }
  1143. /**
  1144. * Return a human readable form of the compiled regular expression,
  1145. * useful for debugging.
  1146. */
  1147. public String toString() {
  1148. StringBuffer sb = new StringBuffer();
  1149. dump(sb);
  1150. return sb.toString();
  1151. }
  1152. void dump(StringBuffer os) {
  1153. os.append('(');
  1154. if (subIndex == 0)
  1155. os.append("?:");
  1156. if (firstToken != null)
  1157. firstToken.dumpAll(os);
  1158. os.append(')');
  1159. }
  1160. // Cast input appropriately or throw exception
  1161. private static CharIndexed makeCharIndexed(Object input, int index) {
  1162. // We could let a String fall through to final input, but since
  1163. // it's the most likely input type, we check it first.
  1164. if (input instanceof String)
  1165. return new CharIndexedString((String) input,index);
  1166. else if (input instanceof char[])
  1167. return new CharIndexedCharArray((char[]) input,index);
  1168. else if (input instanceof StringBuffer)
  1169. return new CharIndexedStringBuffer((StringBuffer) input,index);
  1170. else if (input instanceof InputStream)
  1171. return new CharIndexedInputStream((InputStream) input,index);
  1172. else if (input instanceof Reader)
  1173. return new CharIndexedReader((Reader) input, index);
  1174. else if (input instanceof CharIndexed)
  1175. return (CharIndexed) input; // do we lose index info?
  1176. else
  1177. return new CharIndexedString(input.toString(), index);
  1178. }
  1179. }