PageRenderTime 137ms CodeModel.GetById 69ms app.highlight 59ms RepoModel.GetById 1ms app.codeStats 1ms

/jEdit/tags/jedit-4-0-pre3/gnu/regexp/RE.java

#
Java | 1343 lines | 676 code | 164 blank | 503 comment | 370 complexity | 4bf34a599d21693ac6c2994c4a081c25 MD5 | raw file
   1/*
   2 *  gnu/regexp/RE.java
   3 *  Copyright (C) 1998-2001 Wes Biggs
   4 *
   5 *  This library is free software; you can redistribute it and/or modify
   6 *  it under the terms of the GNU Lesser General Public License as published
   7 *  by the Free Software Foundation; either version 2.1 of the License, or
   8 *  (at your option) any later version.
   9 *
  10 *  This library is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 *  GNU Lesser General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU Lesser General Public License
  16 *  along with this program; if not, write to the Free Software
  17 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18 */
  19
  20package gnu.regexp;
  21import java.io.InputStream;
  22import java.io.Reader;
  23import java.io.Serializable;
  24import java.util.Locale;
  25import java.util.PropertyResourceBundle;
  26import java.util.ResourceBundle;
  27import java.util.Vector;
  28
  29class IntPair implements Serializable {
  30  public int first, second;
  31}
  32
  33class CharUnit implements Serializable {
  34  public char ch;
  35  public boolean bk;
  36}
  37
  38/**
  39 * RE provides the user interface for compiling and matching regular
  40 * expressions.
  41 * <P>
  42 * A regular expression object (class RE) is compiled by constructing it
  43 * from a String, StringBuffer or character array, with optional 
  44 * compilation flags (below)
  45 * and an optional syntax specification (see RESyntax; if not specified,
  46 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  47 * <P>
  48 * Various methods attempt to match input text against a compiled
  49 * regular expression.  These methods are:
  50 * <LI><code>isMatch</code>: returns true if the input text in its entirety
  51 * matches the regular expression pattern.
  52 * <LI><code>getMatch</code>: returns the first match found in the input text,
  53 * or null if no match is found.
  54 * <LI><code>getAllMatches</code>: returns an array of all non-overlapping 
  55 * matches found in the input text.  If no matches are found, the array is
  56 * zero-length.
  57 * <LI><code>substitute</code>: substitute the first occurence of the pattern
  58 * in the input text with a replacement string (which may include
  59 * metacharacters $0-$9, see REMatch.substituteInto).
  60 * <LI><code>substituteAll</code>: same as above, but repeat for each match
  61 * before returning.
  62 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
  63 * that allows iteration over the matches (see REMatchEnumeration for some
  64 * reasons why you may want to do this instead of using <code>getAllMatches</code>.
  65 * <P>
  66 *
  67 * These methods all have similar argument lists.  The input can be a
  68 * String, a character array, a StringBuffer, a Reader or an
  69 * InputStream of some sort.  Note that when using a Reader or
  70 * InputStream, the stream read position cannot be guaranteed after
  71 * attempting a match (this is not a bug, but a consequence of the way
  72 * regular expressions work).  Using an REMatchEnumeration can
  73 * eliminate most positioning problems.
  74 *
  75 * <P>
  76 *
  77 * The optional index argument specifies the offset from the beginning
  78 * of the text at which the search should start (see the descriptions
  79 * of some of the execution flags for how this can affect positional
  80 * pattern operators).  For a Reader or InputStream, this means an
  81 * offset from the current read position, so subsequent calls with the
  82 * same index argument on a Reader or an InputStream will not
  83 * necessarily access the same position on the stream, whereas
  84 * repeated searches at a given index in a fixed string will return
  85 * consistent results.
  86 *
  87 * <P>
  88 * You can optionally affect the execution environment by using a
  89 * combination of execution flags (constants listed below).
  90 * 
  91 * <P>
  92 * All operations on a regular expression are performed in a
  93 * thread-safe manner.
  94 *
  95 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  96 * @version 1.1.4-dev, to be released
  97 */
  98
  99public class RE extends REToken {
 100  // This String will be returned by getVersion()
 101  private static final String VERSION = "1.1.4-dev";
 102
 103  // The localized strings are kept in a separate file
 104  private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
 105
 106  // These are, respectively, the first and last tokens in our linked list
 107  // If there is only one token, firstToken == lastToken
 108  private REToken firstToken, lastToken;
 109
 110  // This is the number of subexpressions in this regular expression,
 111  // with a minimum value of zero.  Returned by getNumSubs()
 112  private int numSubs;
 113
 114    /** Minimum length, in characters, of any possible match. */
 115    private int minimumLength;
 116
 117  /**
 118   * Compilation flag. Do  not  differentiate  case.   Subsequent
 119   * searches  using  this  RE will be case insensitive.
 120   */
 121  public static final int REG_ICASE = 2;
 122
 123  /**
 124   * Compilation flag. The match-any-character operator (dot)
 125   * will match a newline character.  When set this overrides the syntax
 126   * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 127   * the "/s" operator in Perl.
 128   */
 129  public static final int REG_DOT_NEWLINE = 4;
 130
 131  /**
 132   * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 133   * anchors will match based on newlines within the input. This is
 134   * equivalent to the "/m" operator in Perl.
 135   */
 136  public static final int REG_MULTILINE = 8;
 137
 138  /**
 139   * Execution flag.
 140   * The match-beginning operator (^) will not match at the beginning
 141   * of the input string. Useful for matching on a substring when you
 142   * know the context of the input is such that position zero of the
 143   * input to the match test is not actually position zero of the text.
 144   * <P>
 145   * This example demonstrates the results of various ways of matching on
 146   * a substring.
 147   * <P>
 148   * <CODE>
 149   * String s = "food bar fool";<BR>
 150   * RE exp = new RE("^foo.");<BR>
 151   * REMatch m0 = exp.getMatch(s);<BR>
 152   * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 153   * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 154   * REMatch m3 = exp.getMatch(s,8);                            <BR>
 155   * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 156   * <P>
 157   * // Results:<BR>
 158   * //  m0 = "food"<BR>
 159   * //  m1 = "fool"<BR>
 160   * //  m2 = null<BR>
 161   * //  m3 = null<BR>
 162   * //  m4 = "fool"<BR>
 163   * </CODE>
 164   */
 165  public static final int REG_NOTBOL = 16;
 166
 167  /**
 168   * Execution flag.
 169   * The match-end operator ($) does not match at the end
 170   * of the input string. Useful for matching on substrings.
 171   */
 172  public static final int REG_NOTEOL = 32;
 173
 174  /**
 175   * Execution flag.
 176   * When a match method is invoked that starts matching at a non-zero
 177   * index into the input, treat the input as if it begins at the index
 178   * given.  The effect of this flag is that the engine does not "see"
 179   * any text in the input before the given index.  This is useful so
 180   * that the match-beginning operator (^) matches not at position 0
 181   * in the input string, but at the position the search started at
 182   * (based on the index input given to the getMatch function).  See
 183   * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 184   * and \b operators.
 185   */
 186  public static final int REG_ANCHORINDEX = 64;
 187
 188  /**
 189   * Execution flag.
 190   * The substitute and substituteAll methods will not attempt to
 191   * interpolate occurrences of $1-$9 in the replacement text with
 192   * the corresponding subexpressions.  For example, you may want to
 193   * replace all matches of "one dollar" with "$1".
 194   */
 195  public static final int REG_NO_INTERPOLATE = 128;
 196
 197  /** Returns a string representing the version of the gnu.regexp package. */
 198  public static final String version() {
 199    return VERSION;
 200  }
 201
 202  // Retrieves a message from the ResourceBundle
 203  static final String getLocalizedMessage(String key) {
 204    return messages.getString(key);
 205  }
 206
 207  /**
 208   * Constructs a regular expression pattern buffer without any compilation
 209   * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 210   *
 211   * @param pattern A regular expression pattern, in the form of a String,
 212   *   StringBuffer or char[].  Other input types will be converted to
 213   *   strings using the toString() method.
 214   * @exception REException The input pattern could not be parsed.
 215   * @exception NullPointerException The pattern was null.
 216   */
 217  public RE(Object pattern) throws REException {
 218    this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
 219  }
 220
 221  /**
 222   * Constructs a regular expression pattern buffer using the specified
 223   * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 224   *
 225   * @param pattern A regular expression pattern, in the form of a String,
 226   *   StringBuffer, or char[].  Other input types will be converted to
 227   *   strings using the toString() method.
 228   * @param cflags The logical OR of any combination of the compilation flags listed above.
 229   * @exception REException The input pattern could not be parsed.
 230   * @exception NullPointerException The pattern was null.
 231   */
 232  public RE(Object pattern, int cflags) throws REException {
 233    this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
 234  }
 235
 236  /**
 237   * Constructs a regular expression pattern buffer using the specified
 238   * compilation flags and regular expression syntax.
 239   *
 240   * @param pattern A regular expression pattern, in the form of a String,
 241   *   StringBuffer, or char[].  Other input types will be converted to
 242   *   strings using the toString() method.
 243   * @param cflags The logical OR of any combination of the compilation flags listed above.
 244   * @param syntax The type of regular expression syntax to use.
 245   * @exception REException The input pattern could not be parsed.
 246   * @exception NullPointerException The pattern was null.
 247   */
 248  public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
 249    this(pattern,cflags,syntax,0,0);
 250  }
 251
 252  // internal constructor used for alternation
 253  private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
 254    super(subIndex);
 255    firstToken = first;
 256    lastToken = last;
 257    numSubs = subs;
 258    minimumLength = minLength;
 259    addToken(new RETokenEndSub(subIndex));
 260  }
 261
 262  private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 263    super(myIndex); // Subexpression index of this token.
 264    initialize(patternObj, cflags, syntax, myIndex, nextSub);
 265  }
 266
 267    // For use by subclasses
 268    protected RE() { super(0); }
 269
 270    // The meat of construction
 271  protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 272      char[] pattern;
 273    if (patternObj instanceof String) {
 274      pattern = ((String) patternObj).toCharArray();
 275    } else if (patternObj instanceof char[]) {
 276      pattern = (char[]) patternObj;
 277    } else if (patternObj instanceof StringBuffer) {
 278      pattern = new char [((StringBuffer) patternObj).length()];
 279      ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 280    } else {
 281	pattern = patternObj.toString().toCharArray();
 282    }
 283
 284    int pLength = pattern.length;
 285
 286    numSubs = 0; // Number of subexpressions in this token.
 287    Vector branches = null;
 288
 289    // linked list of tokens (sort of -- some closed loops can exist)
 290    firstToken = lastToken = null;
 291
 292    // Precalculate these so we don't pay for the math every time we
 293    // need to access them.
 294    boolean insens = ((cflags & REG_ICASE) > 0);
 295
 296    // Parse pattern into tokens.  Does anyone know if it's more efficient
 297    // to use char[] than a String.charAt()?  I'm assuming so.
 298
 299    // index tracks the position in the char array
 300    int index = 0;
 301
 302    // this will be the current parse character (pattern[index])
 303    CharUnit unit = new CharUnit();
 304
 305    // This is used for {x,y} calculations
 306    IntPair minMax = new IntPair();
 307
 308    // Buffer a token so we can create a TokenRepeated, etc.
 309    REToken currentToken = null;
 310    char ch;
 311
 312    while (index < pLength) {
 313      // read the next character unit (including backslash escapes)
 314      index = getCharUnit(pattern,index,unit);
 315
 316      // ALTERNATION OPERATOR
 317      //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 318      //  not available if RE_LIMITED_OPS is set
 319
 320      // TODO: the '\n' literal here should be a test against REToken.newline,
 321      // which unfortunately may be more than a single character.
 322      if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
 323	     || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
 324	   && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 325	// make everything up to here be a branch. create vector if nec.
 326	addToken(currentToken);
 327	RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
 328	minimumLength = 0;
 329	if (branches == null) {
 330	    branches = new Vector();
 331	}
 332	branches.addElement(theBranch);
 333	firstToken = lastToken = currentToken = null;
 334      }
 335      
 336      // INTERVAL OPERATOR:
 337      //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 338      //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 339      //
 340      // OPEN QUESTION: 
 341      //  what is proper interpretation of '{' at start of string?
 342
 343      else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
 344	int newIndex = getMinMax(pattern,index,minMax,syntax);
 345        if (newIndex > index) {
 346          if (minMax.first > minMax.second)
 347            throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
 348          if (currentToken == null)
 349            throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
 350          if (currentToken instanceof RETokenRepeated) 
 351            throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
 352          if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 353            throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
 354          if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
 355            throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex);
 356          index = newIndex;
 357          currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); 
 358        }
 359        else {
 360          addToken(currentToken);
 361          currentToken = new RETokenChar(subIndex,unit.ch,insens);
 362        } 
 363      }
 364      
 365      // LIST OPERATOR:
 366      //  [...] | [^...]
 367
 368      else if ((unit.ch == '[') && !unit.bk) {
 369	Vector options = new Vector();
 370	boolean negative = false;
 371	char lastChar = 0;
 372	if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
 373	
 374	// Check for initial caret, negation
 375	if ((ch = pattern[index]) == '^') {
 376	  negative = true;
 377	  if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 378	  ch = pattern[index];
 379	}
 380
 381	// Check for leading right bracket literal
 382	if (ch == ']') {
 383	  lastChar = ch;
 384	  if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 385	}
 386
 387	while ((ch = pattern[index++]) != ']') {
 388	  if ((ch == '-') && (lastChar != 0)) {
 389	    if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 390	    if ((ch = pattern[index]) == ']') {
 391	      options.addElement(new RETokenChar(subIndex,lastChar,insens));
 392	      lastChar = '-';
 393	    } else {
 394	      options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
 395	      lastChar = 0;
 396	      index++;
 397	    }
 398          } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
 399            if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 400	    int posixID = -1;
 401	    boolean negate = false;
 402            char asciiEsc = 0;
 403	    if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
 404	      switch (pattern[index]) {
 405	      case 'D':
 406		negate = true;
 407	      case 'd':
 408		posixID = RETokenPOSIX.DIGIT;
 409		break;
 410	      case 'S':
 411		negate = true;
 412	      case 's':
 413		posixID = RETokenPOSIX.SPACE;
 414		break;
 415	      case 'W':
 416		negate = true;
 417	      case 'w':
 418		posixID = RETokenPOSIX.ALNUM;
 419		break;
 420	      }
 421	    }
 422            else if ("nrt".indexOf(pattern[index]) != -1) {
 423              switch (pattern[index]) {
 424                case 'n':
 425                  asciiEsc = '\n';
 426                  break;
 427                case 't':
 428                  asciiEsc = '\t';
 429                  break;
 430                case 'r':
 431                  asciiEsc = '\r';
 432                  break;
 433              }
 434            }
 435	    if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 436	    
 437	    if (posixID != -1) {
 438	      options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
 439	    } else if (asciiEsc != 0) {
 440	      lastChar = asciiEsc;
 441	    } else {
 442	      lastChar = pattern[index];
 443	    }
 444	    ++index;
 445	  } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
 446	    StringBuffer posixSet = new StringBuffer();
 447	    index = getPosixSet(pattern,index+1,posixSet);
 448	    int posixId = RETokenPOSIX.intValue(posixSet.toString());
 449	    if (posixId != -1)
 450	      options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
 451	  } else {
 452	    if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 453	    lastChar = ch;
 454	  }
 455	  if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 456	} // while in list
 457	// Out of list, index is one past ']'
 458	    
 459	if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 460	    
 461	// Create a new RETokenOneOf
 462	addToken(currentToken);
 463	options.trimToSize();
 464	currentToken = new RETokenOneOf(subIndex,options,negative);
 465      }
 466
 467      // SUBEXPRESSIONS
 468      //  (...) | \(...\) depending on RE_NO_BK_PARENS
 469
 470      else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
 471	boolean pure = false;
 472	boolean comment = false;
 473        boolean lookAhead = false;
 474        boolean negativelh = false;
 475	if ((index+1 < pLength) && (pattern[index] == '?')) {
 476	  switch (pattern[index+1]) {
 477          case '!':
 478            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 479              pure = true;
 480              negativelh = true;
 481              lookAhead = true;
 482              index += 2;
 483            }
 484            break;
 485          case '=':
 486            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 487              pure = true;
 488              lookAhead = true;
 489              index += 2;
 490            }
 491            break;
 492	  case ':':
 493	    if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 494	      pure = true;
 495	      index += 2;
 496	    }
 497	    break;
 498	  case '#':
 499	    if (syntax.get(RESyntax.RE_COMMENTS)) {
 500	      comment = true;
 501	    }
 502	    break;
 503          default:
 504            throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 505	  }
 506	}
 507
 508	if (index >= pLength) {
 509	    throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 510	}
 511
 512	// find end of subexpression
 513	int endIndex = index;
 514	int nextIndex = index;
 515	int nested = 0;
 516
 517	while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
 518		&& !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
 519	  if ((endIndex = nextIndex) >= pLength)
 520	    throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 521	  else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
 522	    nested++;
 523	  else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
 524	    nested--;
 525
 526	// endIndex is now position at a ')','\)' 
 527	// nextIndex is end of string or position after ')' or '\)'
 528
 529	if (comment) index = nextIndex;
 530	else { // not a comment
 531	  // create RE subexpression as token.
 532	  addToken(currentToken);
 533	  if (!pure) {
 534	    numSubs++;
 535	  }
 536
 537	  int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
 538	  currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 539	  numSubs += ((RE) currentToken).getNumSubs();
 540
 541          if (lookAhead) {
 542	      currentToken = new RETokenLookAhead(currentToken,negativelh);
 543	  }
 544
 545	  index = nextIndex;
 546	} // not a comment
 547      } // subexpression
 548    
 549      // UNMATCHED RIGHT PAREN
 550      // ) or \) throw exception if
 551      // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 552      else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
 553	throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
 554      }
 555
 556      // START OF LINE OPERATOR
 557      //  ^
 558
 559      else if ((unit.ch == '^') && !unit.bk) {
 560	addToken(currentToken);
 561	currentToken = null;
 562	addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 563      }
 564
 565      // END OF LINE OPERATOR
 566      //  $
 567
 568      else if ((unit.ch == '$') && !unit.bk) {
 569	addToken(currentToken);
 570	currentToken = null;
 571	addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 572      }
 573
 574      // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 575      //  .
 576
 577      else if ((unit.ch == '.') && !unit.bk) {
 578	addToken(currentToken);
 579	currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 580      }
 581
 582      // ZERO-OR-MORE REPEAT OPERATOR
 583      //  *
 584
 585      else if ((unit.ch == '*') && !unit.bk) {
 586	if (currentToken == null)
 587          throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 588	if (currentToken instanceof RETokenRepeated)
 589          throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 590	if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 591	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 592	if (currentToken.getMinimumLength() == 0)
 593	  throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
 594	currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 595      }
 596
 597      // ONE-OR-MORE REPEAT OPERATOR
 598      //  + | \+ depending on RE_BK_PLUS_QM
 599      //  not available if RE_LIMITED_OPS is set
 600
 601      else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
 602	if (currentToken == null)
 603          throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 604	if (currentToken instanceof RETokenRepeated)
 605          throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 606	if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 607	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 608	if (currentToken.getMinimumLength() == 0)
 609	  throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
 610	currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 611      }
 612
 613      // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 614      //  ? | \? depending on RE_BK_PLUS_QM
 615      //  not available if RE_LIMITED_OPS is set
 616      //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 617
 618      else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
 619	if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 620
 621	// Check for stingy matching on RETokenRepeated
 622	if (currentToken instanceof RETokenRepeated) {
 623          if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy())
 624            ((RETokenRepeated)currentToken).makeStingy();
 625          else
 626            throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 627        }
 628        else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 629          throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 630	else
 631	  currentToken = setRepeated(currentToken,0,1,index);
 632      }
 633	
 634      // BACKREFERENCE OPERATOR
 635      //  \1 \2 ... \9
 636      // not available if RE_NO_BK_REFS is set
 637
 638      else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 639	addToken(currentToken);
 640	currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
 641      }
 642
 643      // START OF STRING OPERATOR
 644      //  \A if RE_STRING_ANCHORS is set
 645      
 646      else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 647	addToken(currentToken);
 648	currentToken = new RETokenStart(subIndex,null);
 649      }
 650
 651      // WORD BREAK OPERATOR
 652      //  \b if ????
 653
 654      else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 655	  addToken(currentToken);
 656	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 657      } 
 658
 659      // WORD BEGIN OPERATOR 
 660      //  \< if ????
 661      else if (unit.bk && (unit.ch == '<')) {
 662	  addToken(currentToken);
 663	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 664      } 
 665
 666      // WORD END OPERATOR 
 667      //  \> if ????
 668      else if (unit.bk && (unit.ch == '>')) {
 669	  addToken(currentToken);
 670	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 671      } 
 672
 673      // NON-WORD BREAK OPERATOR
 674      // \B if ????
 675
 676      else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 677	  addToken(currentToken);
 678	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 679      } 
 680
 681      
 682      // DIGIT OPERATOR
 683      //  \d if RE_CHAR_CLASS_ESCAPES is set
 684      
 685      else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 686	addToken(currentToken);
 687	currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 688      }
 689
 690      // NON-DIGIT OPERATOR
 691      //  \D
 692
 693	else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 694	  addToken(currentToken);
 695	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 696	}
 697
 698	// NEWLINE ESCAPE
 699        //  \n
 700
 701	else if (unit.bk && (unit.ch == 'n')) {
 702	  addToken(currentToken);
 703	  currentToken = new RETokenChar(subIndex,'\n',false);
 704	}
 705
 706	// RETURN ESCAPE
 707        //  \r
 708
 709	else if (unit.bk && (unit.ch == 'r')) {
 710	  addToken(currentToken);
 711	  currentToken = new RETokenChar(subIndex,'\r',false);
 712	}
 713
 714	// WHITESPACE OPERATOR
 715        //  \s if RE_CHAR_CLASS_ESCAPES is set
 716
 717	else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 718	  addToken(currentToken);
 719	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 720	}
 721
 722	// NON-WHITESPACE OPERATOR
 723        //  \S
 724
 725	else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 726	  addToken(currentToken);
 727	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
 728	}
 729
 730	// TAB ESCAPE
 731        //  \t
 732
 733	else if (unit.bk && (unit.ch == 't')) {
 734	  addToken(currentToken);
 735	  currentToken = new RETokenChar(subIndex,'\t',false);
 736	}
 737
 738	// ALPHANUMERIC OPERATOR
 739        //  \w
 740
 741	else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 742	  addToken(currentToken);
 743	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
 744	}
 745
 746	// NON-ALPHANUMERIC OPERATOR
 747        //  \W
 748
 749	else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 750	  addToken(currentToken);
 751	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
 752	}
 753
 754	// END OF STRING OPERATOR
 755        //  \Z
 756
 757	else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 758	  addToken(currentToken);
 759	  currentToken = new RETokenEnd(subIndex,null);
 760	}
 761
 762	// NON-SPECIAL CHARACTER (or escape to make literal)
 763        //  c | \* for example
 764
 765	else {  // not a special character
 766	  addToken(currentToken);
 767	  currentToken = new RETokenChar(subIndex,unit.ch,insens);
 768	} 
 769      } // end while
 770
 771    // Add final buffered token and an EndSub marker
 772    addToken(currentToken);
 773      
 774    if (branches != null) {
 775	branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
 776	branches.trimToSize(); // compact the Vector
 777	minimumLength = 0;
 778	firstToken = lastToken = null;
 779	addToken(new RETokenOneOf(subIndex,branches,false));
 780    } 
 781    else addToken(new RETokenEndSub(subIndex));
 782
 783  }
 784
 785  private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
 786    unit.ch = input[index++];
 787    if (unit.bk = (unit.ch == '\\'))
 788      if (index < input.length)
 789	unit.ch = input[index++];
 790      else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
 791    return index;
 792  }
 793
 794  /**
 795   * Checks if the regular expression matches the input in its entirety.
 796   *
 797   * @param input The input text.
 798   */
 799  public boolean isMatch(Object input) {
 800    return isMatch(input,0,0);
 801  }
 802  
 803  /**
 804   * Checks if the input string, starting from index, is an exact match of
 805   * this regular expression.
 806   *
 807   * @param input The input text.
 808   * @param index The offset index at which the search should be begin.
 809   */
 810  public boolean isMatch(Object input,int index) {
 811    return isMatch(input,index,0);
 812  }
 813  
 814
 815  /**
 816   * Checks if the input, starting from index and using the specified
 817   * execution flags, is an exact match of this regular expression.
 818   *
 819   * @param input The input text.
 820   * @param index The offset index at which the search should be begin.
 821   * @param eflags The logical OR of any execution flags above.
 822   */
 823  public boolean isMatch(Object input,int index,int eflags) {
 824    return isMatchImpl(makeCharIndexed(input,index),index,eflags);
 825  }
 826
 827  private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
 828    if (firstToken == null)  // Trivial case
 829      return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
 830    REMatch m = new REMatch(numSubs, index, eflags);
 831    if (firstToken.match(input, m)) {
 832	while (m != null) {
 833	    if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
 834		return true;
 835	    }
 836	    m = m.next;
 837	}
 838    }
 839    return false;
 840  }
 841    
 842  /**
 843   * Returns the maximum number of subexpressions in this regular expression.
 844   * If the expression contains branches, the value returned will be the
 845   * maximum subexpressions in any of the branches.
 846   */
 847  public int getNumSubs() {
 848    return numSubs;
 849  }
 850
 851  // Overrides REToken.setUncle
 852  void setUncle(REToken uncle) {
 853      if (lastToken != null) {
 854	  lastToken.setUncle(uncle);
 855      } else super.setUncle(uncle); // to deal with empty subexpressions
 856  }
 857
 858  // Overrides REToken.chain
 859
 860  boolean chain(REToken next) {
 861    super.chain(next);
 862    setUncle(next);
 863    return true;
 864  }
 865
 866  /**
 867   * Returns the minimum number of characters that could possibly
 868   * constitute a match of this regular expression.
 869   */
 870  public int getMinimumLength() {
 871      return minimumLength;
 872  }
 873
 874  /**
 875   * Returns an array of all matches found in the input.
 876   *
 877   * If the regular expression allows the empty string to match, it will
 878   * substitute matches at all positions except the end of the input.
 879   *
 880   * @param input The input text.
 881   * @return a non-null (but possibly zero-length) array of matches
 882   */
 883  public REMatch[] getAllMatches(Object input) {
 884    return getAllMatches(input,0,0);
 885  }
 886
 887  /**
 888   * Returns an array of all matches found in the input,
 889   * beginning at the specified index position.
 890   *
 891   * If the regular expression allows the empty string to match, it will
 892   * substitute matches at all positions except the end of the input.
 893   *
 894   * @param input The input text.
 895   * @param index The offset index at which the search should be begin.
 896   * @return a non-null (but possibly zero-length) array of matches
 897   */
 898  public REMatch[] getAllMatches(Object input, int index) {
 899    return getAllMatches(input,index,0);
 900  }
 901
 902  /**
 903   * Returns an array of all matches found in the input string,
 904   * beginning at the specified index position and using the specified
 905   * execution flags.
 906   *
 907   * If the regular expression allows the empty string to match, it will
 908   * substitute matches at all positions except the end of the input.
 909   *
 910   * @param input The input text.
 911   * @param index The offset index at which the search should be begin.
 912   * @param eflags The logical OR of any execution flags above.
 913   * @return a non-null (but possibly zero-length) array of matches
 914   */
 915  public REMatch[] getAllMatches(Object input, int index, int eflags) {
 916    return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
 917  }
 918
 919  // this has been changed since 1.03 to be non-overlapping matches
 920  private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
 921    Vector all = new Vector();
 922    REMatch m = null;
 923    while ((m = getMatchImpl(input,index,eflags,null)) != null) {
 924      all.addElement(m);
 925      index = m.getEndIndex();
 926      if (m.end[0] == 0) {   // handle pathological case of zero-length match
 927	index++;
 928	input.move(1);
 929      } else {
 930	input.move(m.end[0]);
 931      }
 932      if (!input.isValid()) break;
 933    }
 934    REMatch[] mset = new REMatch[all.size()];
 935    all.copyInto(mset);
 936    return mset;
 937  }
 938  
 939    /* Implements abstract method REToken.match() */
 940    boolean match(CharIndexed input, REMatch mymatch) { 
 941	if (firstToken == null) return next(input, mymatch);
 942
 943	// Note the start of this subexpression
 944	mymatch.start[subIndex] = mymatch.index;
 945
 946	return firstToken.match(input, mymatch);
 947    }
 948  
 949  /**
 950   * Returns the first match found in the input.  If no match is found,
 951   * null is returned.
 952   *
 953   * @param input The input text.
 954   * @return An REMatch instance referencing the match, or null if none.
 955   */
 956  public REMatch getMatch(Object input) {
 957    return getMatch(input,0,0);
 958  }
 959  
 960  /**
 961   * Returns the first match found in the input, beginning
 962   * the search at the specified index.  If no match is found,
 963   * returns null.
 964   *
 965   * @param input The input text.
 966   * @param index The offset within the text to begin looking for a match.
 967   * @return An REMatch instance referencing the match, or null if none.
 968   */
 969  public REMatch getMatch(Object input, int index) {
 970    return getMatch(input,index,0);
 971  }
 972  
 973  /**
 974   * Returns the first match found in the input, beginning
 975   * the search at the specified index, and using the specified
 976   * execution flags.  If no match is found, returns null.
 977   *
 978   * @param input The input text.
 979   * @param index The offset index at which the search should be begin.
 980   * @param eflags The logical OR of any execution flags above.
 981   * @return An REMatch instance referencing the match, or null if none.
 982   */
 983  public REMatch getMatch(Object input, int index, int eflags) {
 984    return getMatch(input,index,eflags,null);
 985  }
 986
 987  /**
 988   * Returns the first match found in the input, beginning the search
 989   * at the specified index, and using the specified execution flags.
 990   * If no match is found, returns null.  If a StringBuffer is
 991   * provided and is non-null, the contents of the input text from the
 992   * index to the beginning of the match (or to the end of the input,
 993   * if there is no match) are appended to the StringBuffer.
 994   *
 995   * @param input The input text.
 996   * @param index The offset index at which the search should be begin.
 997   * @param eflags The logical OR of any execution flags above.
 998   * @param buffer The StringBuffer to save pre-match text in.
 999   * @return An REMatch instance referencing the match, or null if none.  */
1000  public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1001    return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1002  }
1003
1004  REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1005      // Create a new REMatch to hold results
1006      REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1007      do {
1008	  // Optimization: check if anchor + minimumLength > length
1009	  if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1010	      if (match(input, mymatch)) {
1011		  // Find longest match of them all to observe leftmost longest
1012		  REMatch longest = mymatch;
1013		  while ((mymatch = mymatch.next) != null) {
1014		      if (mymatch.index > longest.index) {
1015			  longest = mymatch;
1016		      }
1017		  }
1018		  
1019		  longest.end[0] = longest.index;
1020		  longest.finish(input);
1021		  return longest;
1022	      }
1023	  }
1024	  mymatch.clear(++anchor);
1025	  // Append character to buffer if needed
1026	  if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1027	      buffer.append(input.charAt(0));
1028	  }
1029      } while (input.move(1));
1030      
1031      return null;
1032  }
1033
1034  /**
1035   * Returns an REMatchEnumeration that can be used to iterate over the
1036   * matches found in the input text.
1037   *
1038   * @param input The input text.
1039   * @return A non-null REMatchEnumeration instance.
1040   */
1041  public REMatchEnumeration getMatchEnumeration(Object input) {
1042    return getMatchEnumeration(input,0,0);
1043  }
1044
1045
1046  /**
1047   * Returns an REMatchEnumeration that can be used to iterate over the
1048   * matches found in the input text.
1049   *
1050   * @param input The input text.
1051   * @param index The offset index at which the search should be begin.
1052   * @return A non-null REMatchEnumeration instance, with its input cursor
1053   *  set to the index position specified.
1054   */
1055  public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1056    return getMatchEnumeration(input,index,0);
1057  }
1058
1059  /**
1060   * Returns an REMatchEnumeration that can be used to iterate over the
1061   * matches found in the input text.
1062   *
1063   * @param input The input text.
1064   * @param index The offset index at which the search should be begin.
1065   * @param eflags The logical OR of any execution flags above.
1066   * @return A non-null REMatchEnumeration instance, with its input cursor
1067   *  set to the index position specified.
1068   */
1069  public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1070    return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1071  }
1072
1073
1074  /**
1075   * Substitutes the replacement text for the first match found in the input.
1076   *
1077   * @param input The input text.
1078   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1079   * @return A String interpolating the substituted text.
1080   * @see REMatch#substituteInto
1081   */
1082  public String substitute(Object input,String replace) {
1083    return substitute(input,replace,0,0);
1084  }
1085
1086  /**
1087   * Substitutes the replacement text for the first match found in the input
1088   * beginning at the specified index position.  Specifying an index
1089   * effectively causes the regular expression engine to throw away the
1090   * specified number of characters. 
1091   *
1092   * @param input The input text.
1093   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1094   * @param index The offset index at which the search should be begin.
1095   * @return A String containing the substring of the input, starting
1096   *   at the index position, and interpolating the substituted text.
1097   * @see REMatch#substituteInto
1098   */
1099  public String substitute(Object input,String replace,int index) {
1100    return substitute(input,replace,index,0);
1101  }
1102
1103  /**
1104   * Substitutes the replacement text for the first match found in the input
1105   * string, beginning at the specified index position and using the
1106   * specified execution flags.
1107   *
1108   * @param input The input text.
1109   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1110   * @param index The offset index at which the search should be begin.
1111   * @param eflags The logical OR of any execution flags above.
1112   * @return A String containing the substring of the input, starting
1113   *   at the index position, and interpolating the substituted text.
1114   * @see REMatch#substituteInto
1115   */
1116  public String substitute(Object input,String replace,int index,int eflags) {
1117    return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1118  }
1119
1120  private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1121    StringBuffer buffer = new StringBuffer();
1122    REMatch m = getMatchImpl(input,index,eflags,buffer);
1123    if (m==null) return buffer.toString();
1124    buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1125		   replace : m.substituteInto(replace) );
1126    if (input.move(m.end[0])) {
1127      do {
1128	buffer.append(input.charAt(0));
1129      } while (input.move(1));
1130    }
1131    return buffer.toString();
1132  }
1133  
1134  /**
1135   * Substitutes the replacement text for each non-overlapping match found 
1136   * in the input text.
1137   *
1138   * @param input The input text.
1139   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1140   * @return A String interpolating the substituted text.
1141   * @see REMatch#substituteInto
1142   */
1143  public String substituteAll(Object input,String replace) {
1144    return substituteAll(input,replace,0,0);
1145  }
1146
1147  /**
1148   * Substitutes the replacement text for each non-overlapping match found 
1149   * in the input text, starting at the specified index.
1150   *
1151   * If the regular expression allows the empty string to match, it will
1152   * substitute matches at all positions except the end of the input.
1153   *
1154   * @param input The input text.
1155   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1156   * @param index The offset index at which the search should be begin.
1157   * @return A String containing the substring of the input, starting
1158   *   at the index position, and interpolating the substituted text.
1159   * @see REMatch#substituteInto
1160   */
1161  public String substituteAll(Object input,String replace,int index) {
1162    return substituteAll(input,replace,index,0);
1163  }
1164 
1165  /**
1166   * Substitutes the replacement text for each non-overlapping match found 
1167   * in the input text, starting at the specified index and using the
1168   * specified execution flags.
1169   *
1170   * @param input The input text.
1171   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1172   * @param index The offset index at which the search should be begin.
1173   * @param eflags The logical OR of any execution flags above.
1174   * @return A String containing the substring of the input, starting
1175   *   at the index position, and interpolating the substituted text.
1176   * @see REMatch#substituteInto
1177   */
1178  public String substituteAll(Object input,String replace,int index,int eflags) {
1179    return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1180  }
1181
1182  private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1183    StringBuffer buffer = new StringBuffer();
1184    REMatch m;
1185    while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1186	buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1187		       replace : m.substituteInto(replace) );
1188      index = m.getEndIndex();
1189      if (m.end[0] == 0) {
1190	char ch = input.charAt(0);
1191	if (ch != CharIndexed.OUT_OF_BOUNDS) 
1192	    buffer.append(ch);
1193	input.move(1);
1194      } else {
1195	  input.move(m.end[0]);
1196      }
1197
1198      if (!input.isValid()) break;
1199    }
1200    return buffer.toString();
1201  }
1202  
1203  /* Helper function for constructor */
1204  private void addToken(REToken next) {
1205    if (next == null) return;
1206    minimumLength += next.getMinimumLength();
1207    if (firstToken == null) {
1208	lastToken = firstToken = next;
1209    } else {
1210      // if chain returns false, it "rejected" the token due to
1211      // an optimization, and next was combined with lastToken
1212      if (lastToken.chain(next)) {
1213	  lastToken = next;
1214      }
1215    }
1216  }
1217
1218  private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1219    if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1220    return new RETokenRepeated(current.subIndex,current,min,max);
1221  }
1222
1223  private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1224    // Precondition: pattern[index-1] == ':'
1225    // we will return pos of closing ']'.
1226    int i;
1227    for (i=index; i<(pattern.length-1); i++) {
1228      if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1229	return i+2;
1230      buf.append(pattern[i]);
1231    }
1232    return index; // didn't match up
1233  }
1234
1235  private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1236    // Precondition: input[index-1] == '{', minMax != null
1237
1238    boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1239    int startIndex = index;
1240    if (index == input.length) {
1241      if (mustMatch)
1242        throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1243      else
1244        return startIndex;
1245    }
1246    
1247    int min,max=0;
1248    CharUnit unit = new CharUnit();
1249    StringBuffer buf = new StringBuffer();
1250    
1251    // Read string of digits
1252    do {
1253      index = getCharUnit(input,index,unit);
1254      if (Character.isDigit(unit.ch))
1255        buf.append(unit.ch);
1256    } while ((index != input.length) && Character.isDigit(unit.ch));
1257
1258    // Check for {} tomfoolery
1259    if (buf.length() == 0) {
1260      if (mustMatch)
1261        throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1262      else
1263        return startIndex;
1264    }
1265
1266    min = Integer.parseInt(buf.toString());
1267	
1268    if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1269      max = min;
1270    else if (index == input.length)
1271      if (mustMatch)
1272        throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1273      else
1274        return startIndex;
1275    else if ((unit.ch == ',') && !unit.bk) {
1276      buf = new StringBuffer();
1277      // Read string of digits
1278      while (((index = getCharUnit(input,index,unit)) != input.length) && Character.isDigit(unit.ch))
1279	buf.append(unit.ch);
1280
1281      if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1282        if (mustMatch)
1283          throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1284        else
1285          return startIndex;
1286
1287      // This is the case of {x,}
1288      if (buf.length() == 0) max = Integer.MAX_VALUE;
1289      else max = Integer.parseInt(buf.toString());
1290    } else
1291      if (mustMatch)
1292        throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1293      else
1294        return startIndex;
1295
1296    // We know min and max now, and they are valid.
1297
1298    minMax.first = min;
1299    minMax.second = max;
1300
1301    // return the index following the '}'
1302    return index;
1303  }
1304
1305   /**
1306    * Return a human readable form of the compiled regular expression,
1307    * useful for debugging.
1308    */
1309   public String toString() {
1310     StringBuffer sb = new StringBuffer();
1311     dump(sb);
1312     return sb.toString();
1313   }
1314
1315  void dump(StringBuffer os) {
1316    os.append('(');
1317    if (subIndex == 0)
1318      os.append("?:");
1319    if (firstToken != null)
1320      firstToken.dumpAll(os);
1321    os.append(')');
1322  }
1323
1324  // Cast input appropriately or throw exception
1325  private static CharIndexed makeCharIndexed(Object input, int index) {
1326      // We could let a String fall through to final input, but since
1327      // it's the most likely input type, we check it first.
1328    if (input instanceof String)
1329      return new CharIndexedString((String) input,index);
1330    else if (input instanceof char[])
1331      return new CharIndexedCharArray((char[]) input,index);
1332    else if (input instanceof StringBuffer)
1333      return new CharIndexedStringBuffer((StringBuffer) input,index);
1334    else if (input instanceof InputStream)
1335      return new CharIndexedInputStream((InputStream) input,index);
1336    else if (input instanceof Reader)
1337	return new CharIndexedReader((Reader) input, index);
1338    else if (input instanceof CharIndexed)
1339	return (CharIndexed) input; // do we lose index info?
1340    else 
1341	return new CharIndexedString(input.toString(), index);
1342  }
1343}