PageRenderTime 86ms CodeModel.GetById 11ms app.highlight 62ms RepoModel.GetById 1ms app.codeStats 0ms

/interpreter/tags/at2-build270707/src/edu/vub/util/regexp/RE.java

http://ambienttalk.googlecode.com/
Java | 1959 lines | 1127 code | 190 blank | 642 comment | 544 complexity | 21580fc622e8e962bff57ce154485c7c MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/* gnu/regexp/RE.java
   2   Copyright (C) 2006 Free Software Foundation, Inc.
   3
   4This file is part of GNU Classpath.
   5
   6GNU Classpath is free software; you can redistribute it and/or modify
   7it under the terms of the GNU General Public License as published by
   8the Free Software Foundation; either version 2, or (at your option)
   9any later version.
  10
  11GNU Classpath is distributed in the hope that it will be useful, but
  12WITHOUT ANY WARRANTY; without even the implied warranty of
  13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14General Public License for more details.
  15
  16You should have received a copy of the GNU General Public License
  17along with GNU Classpath; see the file COPYING.  If not, write to the
  18Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  1902110-1301 USA.
  20
  21Linking this library statically or dynamically with other modules is
  22making a combined work based on this library.  Thus, the terms and
  23conditions of the GNU General Public License cover the whole
  24combination.
  25
  26As a special exception, the copyright holders of this library give you
  27permission to link this library with independent modules to produce an
  28executable, regardless of the license terms of these independent
  29modules, and to copy and distribute the resulting executable under
  30terms of your choice, provided that you also meet, for each linked
  31independent module, the terms and conditions of the license of that
  32module.  An independent module is a module which is not derived from
  33or based on this library.  If you modify this library, you may extend
  34this exception to your version of the library, but you are not
  35obligated to do so.  If you do not wish to do so, delete this
  36exception statement from your version. */
  37
  38package edu.vub.util.regexp;
  39import java.io.InputStream;
  40import java.io.Serializable;
  41import java.util.Vector;
  42
  43/**
  44 * RE provides the user interface for compiling and matching regular
  45 * expressions.
  46 * <P>
  47 * A regular expression object (class RE) is compiled by constructing it
  48 * from a String, StringBuffer or character array, with optional 
  49 * compilation flags (below)
  50 * and an optional syntax specification (see RESyntax; if not specified,
  51 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  52 * <P>
  53 * Once compiled, a regular expression object is reusable as well as
  54 * threadsafe: multiple threads can use the RE instance simultaneously
  55 * to match against different input text.
  56 * <P>
  57 * Various methods attempt to match input text against a compiled
  58 * regular expression.  These methods are:
  59 * <LI><code>isMatch</code>: returns true if the input text in its
  60 * entirety matches the regular expression pattern.
  61 * <LI><code>getMatch</code>: returns the first match found in the
  62 * input text, or null if no match is found.
  63 * <LI><code>getAllMatches</code>: returns an array of all
  64 * non-overlapping matches found in the input text.  If no matches are
  65 * found, the array is zero-length.
   66 * <LI><code>substitute</code>: substitute the first occurrence of the
  67 * pattern in the input text with a replacement string (which may
  68 * include metacharacters $0-$9, see REMatch.substituteInto).
  69 * <LI><code>substituteAll</code>: same as above, but repeat for each
  70 * match before returning.
  71 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
  72 * object that allows iteration over the matches (see
  73 * REMatchEnumeration for some reasons why you may want to do this
   74 * instead of using <code>getAllMatches</code>).
  75 * <P>
  76 *
  77 * These methods all have similar argument lists.  The input can be a
  78 * String, a character array, a StringBuffer, or an
  79 * InputStream of some sort.  Note that when using an
  80 * InputStream, the stream read position cannot be guaranteed after
  81 * attempting a match (this is not a bug, but a consequence of the way
  82 * regular expressions work).  Using an REMatchEnumeration can
  83 * eliminate most positioning problems.
  84 *
  85 * <P>
  86 *
  87 * The optional index argument specifies the offset from the beginning
  88 * of the text at which the search should start (see the descriptions
  89 * of some of the execution flags for how this can affect positional
  90 * pattern operators).  For an InputStream, this means an
  91 * offset from the current read position, so subsequent calls with the
  92 * same index argument on an InputStream will not
  93 * necessarily access the same position on the stream, whereas
  94 * repeated searches at a given index in a fixed string will return
  95 * consistent results.
  96 *
  97 * <P>
  98 * You can optionally affect the execution environment by using a
  99 * combination of execution flags (constants listed below).
 100 * 
 101 * <P>
 102 * All operations on a regular expression are performed in a
 103 * thread-safe manner.
 104 *
 105 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 106 * @version 1.1.5-dev, to be released
 107 */
 108
 109public class RE extends REToken {
 110
 111  private static final class IntPair implements Serializable {
 112    public int first, second;
 113  }
 114
 115  private static final class CharUnit implements Serializable {
 116    public char ch;
 117    public boolean bk;
 118  }
 119
  120  // This String will be returned by version()
 121  private static final String VERSION = "1.1.5-dev";
 122
 123  // These are, respectively, the first and last tokens in our linked list
 124  // If there is only one token, firstToken == lastToken
 125  private REToken firstToken, lastToken;
 126
 127  // This is the number of subexpressions in this regular expression,
 128  // with a minimum value of zero.  Returned by getNumSubs()
 129  private int numSubs;
 130
 131    /** Minimum length, in characters, of any possible match. */
 132    private int minimumLength;
 133    private int maximumLength;
 134
 135  /**
  136   * Compilation flag. Do not differentiate case.  Subsequent
  137   * searches using this RE will be case insensitive.
 138   */
 139  public static final int REG_ICASE = 0x02;
 140
 141  /**
 142   * Compilation flag. The match-any-character operator (dot)
 143   * will match a newline character.  When set this overrides the syntax
 144   * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 145   * the "/s" operator in Perl.
 146   */
 147  public static final int REG_DOT_NEWLINE = 0x04;
 148
 149  /**
 150   * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 151   * anchors will match based on newlines within the input. This is
 152   * equivalent to the "/m" operator in Perl.
 153   */
 154  public static final int REG_MULTILINE = 0x08;
 155
 156  /**
 157   * Execution flag.
 158   * The match-beginning operator (^) will not match at the beginning
 159   * of the input string. Useful for matching on a substring when you
 160   * know the context of the input is such that position zero of the
 161   * input to the match test is not actually position zero of the text.
 162   * <P>
 163   * This example demonstrates the results of various ways of matching on
 164   * a substring.
 165   * <P>
 166   * <CODE>
 167   * String s = "food bar fool";<BR>
 168   * RE exp = new RE("^foo.");<BR>
 169   * REMatch m0 = exp.getMatch(s);<BR>
 170   * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 171   * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 172   * REMatch m3 = exp.getMatch(s,8);                            <BR>
 173   * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 174   * <P>
 175   * // Results:<BR>
 176   * //  m0.toString(): "food"<BR>
 177   * //  m1.toString(): "fool"<BR>
 178   * //  m2.toString(): null<BR>
 179   * //  m3.toString(): null<BR>
 180   * //  m4.toString(): "fool"<BR>
 181   * </CODE>
 182   */
 183  public static final int REG_NOTBOL = 0x10;
 184
 185  /**
 186   * Execution flag.
 187   * The match-end operator ($) does not match at the end
 188   * of the input string. Useful for matching on substrings.
 189   */
 190  public static final int REG_NOTEOL = 0x20;
 191
 192  /**
 193   * Execution flag.
 194   * When a match method is invoked that starts matching at a non-zero
 195   * index into the input, treat the input as if it begins at the index
 196   * given.  The effect of this flag is that the engine does not "see"
 197   * any text in the input before the given index.  This is useful so
 198   * that the match-beginning operator (^) matches not at position 0
 199   * in the input string, but at the position the search started at
 200   * (based on the index input given to the getMatch function).  See
 201   * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 202   * and \b operators.
 203   */
 204  public static final int REG_ANCHORINDEX = 0x40;
 205
 206  /**
 207   * Execution flag.
 208   * The substitute and substituteAll methods will not attempt to
 209   * interpolate occurrences of $1-$9 in the replacement text with
 210   * the corresponding subexpressions.  For example, you may want to
 211   * replace all matches of "one dollar" with "$1".
 212   */
 213  public static final int REG_NO_INTERPOLATE = 0x80;
 214
 215  /**
 216   * Execution flag.
 217   * Try to match the whole input string. An implicit match-end operator
 218   * is added to this regexp.
 219   */
 220  public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
 221
 222  /**
 223   * Execution flag.
 224   * The substitute and substituteAll methods will treat the
 225   * character '\' in the replacement as an escape to a literal
 226   * character. In this case "\n", "\$", "\\", "\x40" and "\012"
 227   * will become "n", "$", "\", "x40" and "012" respectively.
 228   * This flag has no effect if REG_NO_INTERPOLATE is set on.
 229   */
 230  public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 231
 232  /** Returns a string representing the version of the edu.vub.util.regexp package. */
 233  public static final String version() {
 234    return VERSION;
 235  }
 236
 237  /**
 238   * Constructs a regular expression pattern buffer without any compilation
 239   * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 240   *
 241   * @param pattern A regular expression pattern, in the form of a String,
 242   *   StringBuffer or char[].  Other input types will be converted to
 243   *   strings using the toString() method.
 244   * @exception REException The input pattern could not be parsed.
 245   * @exception NullPointerException The pattern was null.
 246   */
 247  public RE(Object pattern) throws REException {
 248    this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
 249  }
 250
 251  /**
 252   * Constructs a regular expression pattern buffer using the specified
 253   * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 254   *
 255   * @param pattern A regular expression pattern, in the form of a String,
 256   *   StringBuffer, or char[].  Other input types will be converted to
 257   *   strings using the toString() method.
 258   * @param cflags The logical OR of any combination of the compilation flags listed above.
 259   * @exception REException The input pattern could not be parsed.
 260   * @exception NullPointerException The pattern was null.
 261   */
 262  public RE(Object pattern, int cflags) throws REException {
 263    this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
 264  }
 265
 266  /**
 267   * Constructs a regular expression pattern buffer using the specified
 268   * compilation flags and regular expression syntax.
 269   *
 270   * @param pattern A regular expression pattern, in the form of a String,
 271   *   StringBuffer, or char[].  Other input types will be converted to
 272   *   strings using the toString() method.
 273   * @param cflags The logical OR of any combination of the compilation flags listed above.
 274   * @param syntax The type of regular expression syntax to use.
 275   * @exception REException The input pattern could not be parsed.
 276   * @exception NullPointerException The pattern was null.
 277   */
 278  public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
 279    this(pattern,cflags,syntax,0,0);
 280  }
 281
 282  // internal constructor used for alternation
 283  private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
 284    super(subIndex);
 285    firstToken = first;
 286    lastToken = last;
 287    numSubs = subs;
 288    minimumLength = minLength;
 289    maximumLength = maxLength;
 290    addToken(new RETokenEndSub(subIndex));
 291  }
 292
 293  private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 294    super(myIndex); // Subexpression index of this token.
 295    initialize(patternObj, cflags, syntax, myIndex, nextSub);
 296  }
 297
 298    // For use by subclasses
 299    protected RE() { super(0); }
 300
 301    // The meat of construction
 302  protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 303      char[] pattern;
 304    if (patternObj instanceof String) {
 305      pattern = ((String) patternObj).toCharArray();
 306    } else if (patternObj instanceof char[]) {
 307      pattern = (char[]) patternObj;
 308    } else if (patternObj instanceof StringBuffer) {
 309      pattern = new char [((StringBuffer) patternObj).length()];
 310      ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 311    } else {
 312	pattern = patternObj.toString().toCharArray();
 313    }
 314
 315    int pLength = pattern.length;
 316
 317    numSubs = 0; // Number of subexpressions in this token.
 318    Vector branches = null;
 319
 320    // linked list of tokens (sort of -- some closed loops can exist)
 321    firstToken = lastToken = null;
 322
 323    // Precalculate these so we don't pay for the math every time we
 324    // need to access them.
 325    boolean insens = ((cflags & REG_ICASE) > 0);
 326
 327    // Parse pattern into tokens.  Does anyone know if it's more efficient
 328    // to use char[] than a String.charAt()?  I'm assuming so.
 329
 330    // index tracks the position in the char array
 331    int index = 0;
 332
 333    // this will be the current parse character (pattern[index])
 334    CharUnit unit = new CharUnit();
 335
 336    // This is used for {x,y} calculations
 337    IntPair minMax = new IntPair();
 338
 339    // Buffer a token so we can create a TokenRepeated, etc.
 340    REToken currentToken = null;
 341    char ch;
 342    boolean quot = false;
 343
 344    // Saved syntax and flags.
 345    RESyntax savedSyntax = null;
 346    int savedCflags = 0;
 347    boolean flagsSaved = false;
 348
 349    while (index < pLength) {
 350      // read the next character unit (including backslash escapes)
 351      index = getCharUnit(pattern,index,unit,quot);
 352
 353      if (unit.bk)
 354        if (unit.ch == 'Q') {
 355          quot = true;
 356          continue;
 357        } else if (unit.ch == 'E') {
 358          quot = false;
 359          continue;
 360        }
 361      if (quot)
 362      	unit.bk = false;
 363
 364      // ALTERNATION OPERATOR
 365      //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 366      //  not available if RE_LIMITED_OPS is set
 367
 368      // TODO: the '\n' literal here should be a test against REToken.newline,
 369      // which unfortunately may be more than a single character.
 370      if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
 371	     || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
 372	   && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 373	// make everything up to here be a branch. create vector if nec.
 374	addToken(currentToken);
 375	RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
 376	minimumLength = 0;
 377	maximumLength = 0;
 378	if (branches == null) {
 379	    branches = new Vector();
 380	}
 381	branches.addElement(theBranch);
 382	firstToken = lastToken = currentToken = null;
 383      }
 384      
 385      // INTERVAL OPERATOR:
 386      //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 387      //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 388      //
 389      // OPEN QUESTION: 
 390      //  what is proper interpretation of '{' at start of string?
 391      //
 392      // This method used to check "repeat.empty.token" to avoid such regexp
 393      // as "(a*){2,}", but now "repeat.empty.token" is allowed.
 394
 395      else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
 396	int newIndex = getMinMax(pattern,index,minMax,syntax);
 397        if (newIndex > index) {
 398          if (minMax.first > minMax.second)
 399            throw new REException("an interval''s minimum is greater than its maximum",REException.REG_BADRPT,newIndex);
 400          if (currentToken == null)
 401            throw new REException("quantifier (?*+{}) without preceding token",REException.REG_BADRPT,newIndex);
 402          if (currentToken instanceof RETokenRepeated) 
 403            throw new REException("attempted to repeat a token that is already repeated",REException.REG_BADRPT,newIndex);
 404          if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 405            throw new REException("repeated token is zero-width assertion",REException.REG_BADRPT,newIndex);
 406          index = newIndex;
 407          currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); 
 408        }
 409        else {
 410          addToken(currentToken);
 411          currentToken = new RETokenChar(subIndex,unit.ch,insens);
 412        } 
 413      }
 414      
 415      // LIST OPERATOR:
 416      //  [...] | [^...]
 417
 418      else if ((unit.ch == '[') && !(unit.bk || quot)) {
 419	// Create a new RETokenOneOf
 420	ParseCharClassResult result = parseCharClass(
 421		subIndex, pattern, index, pLength, cflags, syntax, 0);
 422	addToken(currentToken);
 423	currentToken = result.token;
 424	index = result.index;
 425      }
 426
 427      // SUBEXPRESSIONS
 428      //  (...) | \(...\) depending on RE_NO_BK_PARENS
 429
 430      else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
 431	boolean pure = false;
 432	boolean comment = false;
 433        boolean lookAhead = false;
 434        boolean lookBehind = false;
 435        boolean independent = false;
 436        boolean negativelh = false;
 437        boolean negativelb = false;
 438	if ((index+1 < pLength) && (pattern[index] == '?')) {
 439	  switch (pattern[index+1]) {
 440          case '!':
 441            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 442              pure = true;
 443              negativelh = true;
 444              lookAhead = true;
 445              index += 2;
 446            }
 447            break;
 448          case '=':
 449            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 450              pure = true;
 451              lookAhead = true;
 452              index += 2;
 453            }
 454            break;
 455	  case '<':
 456	    // We assume that if the syntax supports look-ahead,
 457	    // it also supports look-behind.
 458	    if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 459		index++;
 460		switch (pattern[index +1]) {
 461		case '!':
 462		  pure = true;
 463		  negativelb = true;
 464		  lookBehind = true;
 465		  index += 2;
 466		  break;
 467		case '=':
 468		  pure = true;
 469		  lookBehind = true;
 470		  index += 2;
 471		}
 472	    }
 473	    break;
 474	  case '>':
 475	    // We assume that if the syntax supports look-ahead,
 476	    // it also supports independent group.
 477            if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 478              pure = true;
 479              independent = true;
 480              index += 2;
 481            }
 482            break;
 483	  case 'i':
 484	  case 'd':
 485	  case 'm':
 486	  case 's':
 487	  // case 'u':  not supported
 488	  // case 'x':  not supported
 489	  case '-':
 490            if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
 491	    // Set or reset syntax flags.
 492	    int flagIndex = index + 1;
 493	    int endFlag = -1;
 494	    RESyntax newSyntax = new RESyntax(syntax);
 495	    int newCflags = cflags;
 496	    boolean negate = false;
 497	    while (flagIndex < pLength && endFlag < 0) {
 498	        switch(pattern[flagIndex]) {
 499	  	case 'i':
 500		  if (negate)
 501		    newCflags &= ~REG_ICASE;
 502		  else
 503		    newCflags |= REG_ICASE;
 504		  flagIndex++;
 505		  break;
 506	  	case 'd':
 507		  if (negate)
 508		    newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
 509		  else
 510		    newSyntax.setLineSeparator("\n");
 511		  flagIndex++;
 512		  break;
 513	  	case 'm':
 514		  if (negate)
 515		    newCflags &= ~REG_MULTILINE;
 516		  else
 517		    newCflags |= REG_MULTILINE;
 518		  flagIndex++;
 519		  break;
 520	  	case 's':
 521		  if (negate)
 522		    newCflags &= ~REG_DOT_NEWLINE;
 523		  else
 524		    newCflags |= REG_DOT_NEWLINE;
 525		  flagIndex++;
 526		  break;
 527	  	// case 'u': not supported
 528	  	// case 'x': not supported
 529	  	case '-':
 530		  negate = true;
 531		  flagIndex++;
 532		  break;
 533		case ':':
 534		case ')':
 535		  endFlag = pattern[flagIndex];
 536		  break;
 537		default:
 538            	  throw new REException("quantifier (?*+{}) without preceding token", REException.REG_BADRPT, index);
 539		}
 540	    }
 541	    if (endFlag == ')') {
 542		syntax = newSyntax;
 543		cflags = newCflags;
 544		insens = ((cflags & REG_ICASE) > 0);
 545		// This can be treated as though it were a comment.
 546		comment = true;
 547		index = flagIndex - 1;
 548		break;
 549	    }
 550	    if (endFlag == ':') {
 551		savedSyntax = syntax;
 552		savedCflags = cflags;
 553		flagsSaved = true;
 554		syntax = newSyntax;
 555		cflags = newCflags;
 556		insens = ((cflags & REG_ICASE) > 0);
 557		index = flagIndex -1;
 558		// Fall through to the next case.
 559	    }
 560	    else {
 561	        throw new REException("unmatched parenthesis", REException.REG_ESUBREG,index);
 562	    }
 563	  case ':':
 564	    if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 565	      pure = true;
 566	      index += 2;
 567	    }
 568	    break;
 569	  case '#':
 570	    if (syntax.get(RESyntax.RE_COMMENTS)) {
 571	      comment = true;
 572	    }
 573	    break;
 574          default:
 575            throw new REException("quantifier (?*+{}) without preceding token", REException.REG_BADRPT, index);
 576	  }
 577	}
 578
 579	if (index >= pLength) {
 580	    throw new REException("unmatched parenthesis", REException.REG_ESUBREG,index);
 581	}
 582
 583	// find end of subexpression
 584	int endIndex = index;
 585	int nextIndex = index;
 586	int nested = 0;
 587
 588	while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
 589		&& !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) {
 590	  if ((endIndex = nextIndex) >= pLength)
 591	    throw new REException("unexpected end of subexpression",REException.REG_ESUBREG,nextIndex);
 592	  else if ((unit.ch == '[') && !(unit.bk || quot)) {
 593	    // I hate to do something similar to the LIST OPERATOR matters
 594	    // above, but ...
 595	    int listIndex = nextIndex;
 596	    if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 597	    if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 598	    int listEndIndex = -1;
 599	    int listNest = 0;
 600	    while (listIndex < pLength && listEndIndex < 0) {
 601	      switch(pattern[listIndex++]) {
 602		case '\\':
 603		  listIndex++;
 604		  break;
 605		case '[':
 606		  // Sun's API document says that regexp like "[a-d[m-p]]"
 607		  // is legal. Even something like "[[[^]]]]" is accepted.
 608		  listNest++;
 609		  if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 610		  if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 611		  break;
 612		case ']':
 613		  if (listNest == 0)
 614		    listEndIndex = listIndex;
 615		  listNest--;
 616		  break;
 617	      }
 618	    }
 619	    if (listEndIndex >= 0) {
 620	      nextIndex = listEndIndex;
 621	      if ((endIndex = nextIndex) >= pLength)
 622	        throw new REException("unexpected end of subexpression",REException.REG_ESUBREG,nextIndex);
 623	      else
 624	        continue;
 625	    }
 626	    throw new REException("unexpected end of subexpression",REException.REG_ESUBREG,nextIndex);
 627	  }
 628	  else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 629	    nested++;
 630	  else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 631	    nested--;
 632	}
 633
 634	// endIndex is now position at a ')','\)' 
 635	// nextIndex is end of string or position after ')' or '\)'
 636
 637	if (comment) index = nextIndex;
 638	else { // not a comment
 639	  // create RE subexpression as token.
 640	  addToken(currentToken);
 641	  if (!pure) {
 642	    numSubs++;
 643	  }
 644
 645	  int useIndex = (pure || lookAhead || lookBehind || independent) ?
 646			 0 : nextSub + numSubs;
 647	  currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 648	  numSubs += ((RE) currentToken).getNumSubs();
 649
 650          if (lookAhead) {
 651	      currentToken = new RETokenLookAhead(currentToken,negativelh);
 652	  }
 653          else if (lookBehind) {
 654	      currentToken = new RETokenLookBehind(currentToken,negativelb);
 655	  }
 656          else if (independent) {
 657	      currentToken = new RETokenIndependent(currentToken);
 658	  }
 659
 660	  index = nextIndex;
 661	  if (flagsSaved) {
 662	      syntax = savedSyntax;
 663	      cflags = savedCflags;
 664	      insens = ((cflags & REG_ICASE) > 0);
 665	      flagsSaved = false;
 666	  }
 667	} // not a comment
 668      } // subexpression
 669    
 670      // UNMATCHED RIGHT PAREN
 671      // ) or \) throw exception if
 672      // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 673      else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
 674	throw new REException("unmatched parenthesis",REException.REG_EPAREN,index);
 675      }
 676
 677      // START OF LINE OPERATOR
 678      //  ^
 679
 680      else if ((unit.ch == '^') && !(unit.bk || quot)) {
 681	addToken(currentToken);
 682	currentToken = null;
 683	addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 684      }
 685
 686      // END OF LINE OPERATOR
 687      //  $
 688
 689      else if ((unit.ch == '$') && !(unit.bk || quot)) {
 690	addToken(currentToken);
 691	currentToken = null;
 692	addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 693      }
 694
 695      // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 696      //  .
 697
 698      else if ((unit.ch == '.') && !(unit.bk || quot)) {
 699	addToken(currentToken);
 700	currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 701      }
 702
 703      // ZERO-OR-MORE REPEAT OPERATOR
 704      //  *
 705      //
 706      // This method used to check "repeat.empty.token" to avoid such regexp
 707      // as "(a*)*", but now "repeat.empty.token" is allowed.
 708
 709      else if ((unit.ch == '*') && !(unit.bk || quot)) {
 710	if (currentToken == null)
 711          throw new REException("quantifier (?*+{}) without preceding token",REException.REG_BADRPT,index);
 712	if (currentToken instanceof RETokenRepeated)
 713          throw new REException("attempted to repeat a token that is already repeated",REException.REG_BADRPT,index);
 714	if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 715	  throw new REException("repeated token is zero-width assertion",REException.REG_BADRPT,index);
 716	currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 717      }
 718
 719      // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
 720      //  + | \+ depending on RE_BK_PLUS_QM
 721      //  not available if RE_LIMITED_OPS is set
 722      //
 723      // This method used to check "repeat.empty.token" to avoid such regexp
 724      // as "(a*)+", but now "repeat.empty.token" is allowed.
 725
 726      else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 727	if (currentToken == null)
 728          throw new REException("quantifier (?*+{}) without preceding token",REException.REG_BADRPT,index);
 729	
 730	// Check for possessive matching on RETokenRepeated
 731	if (currentToken instanceof RETokenRepeated) {
 732	  RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 733	  if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
 734	    tokenRep.makePossessive();
 735	  else
 736	    throw new REException("attempted to repeat a token that is already repeated",REException.REG_BADRPT,index);
 737
 738	}
 739	else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 740	  throw new REException("repeated token is zero-width assertion",REException.REG_BADRPT,index);
 741	else
 742	  currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 743      }
 744
 745      // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 746      //  ? | \? depending on RE_BK_PLUS_QM
 747      //  not available if RE_LIMITED_OPS is set
 748      //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 749
 750      else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 751	if (currentToken == null) throw new REException("quantifier (?*+{}) without preceding token",REException.REG_BADRPT,index);
 752
 753	// Check for stingy matching on RETokenRepeated
 754	if (currentToken instanceof RETokenRepeated) {
 755	  RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 756	  if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
 757	    tokenRep.makeStingy();
 758	  else
 759	    throw new REException("attempted to repeat a token that is already repeated",REException.REG_BADRPT,index);
 760	}
 761	else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 762	  throw new REException("repeated token is zero-width assertion",REException.REG_BADRPT,index);
 763	else
 764	  currentToken = setRepeated(currentToken,0,1,index);
 765      }
 766
 767      // OCTAL CHARACTER
 768      //  \0377
 769	
 770      else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
 771	CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
 772	if (ce == null)
 773	  throw new REException("invalid octal character", REException.REG_ESCAPE, index);
 774	index = index - 2 + ce.len;
 775	addToken(currentToken);
 776	currentToken = new RETokenChar(subIndex,ce.ch,insens);
 777      }
 778
 779      // BACKREFERENCE OPERATOR
 780      //  \1 \2 ... \9 and \10 \11 \12 ...
 781      // not available if RE_NO_BK_REFS is set
 782      // Perl recognizes \10, \11, and so on only if enough number of
 783      // parentheses have opened before it, otherwise they are treated
 784      // as aliases of \010, \011, ... (octal characters).  In case of
 785      // Sun's JDK, octal character expression must always begin with \0.
 786      // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
 787      // JDK treats \2 as a back reference to the 2nd group because
 788      // there are only two groups. But in our poor implementation,
 789      // we cannot help but treat \29 as a back reference to the 29th group.
 790
 791      else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 792	addToken(currentToken);
 793	int numBegin = index - 1;
 794	int numEnd = pLength;
 795	for (int i = index; i < pLength; i++) {
 796	    if (! Character.isDigit(pattern[i])) {
 797		numEnd = i;
 798		break;
 799	    }
 800	}
 801	int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
 802
 803	currentToken = new RETokenBackRef(subIndex,num,insens);
 804	index = numEnd;
 805      }
 806
 807      // START OF STRING OPERATOR
 808      //  \A if RE_STRING_ANCHORS is set
 809      
 810      else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 811	addToken(currentToken);
 812	currentToken = new RETokenStart(subIndex,null);
 813      }
 814
 815      // WORD BREAK OPERATOR
 816      //  \b if ????
 817
 818      else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 819	  addToken(currentToken);
 820	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 821      } 
 822
 823      // WORD BEGIN OPERATOR 
 824      //  \< if ????
 825      else if (unit.bk && (unit.ch == '<')) {
 826	  addToken(currentToken);
 827	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 828      } 
 829
 830      // WORD END OPERATOR 
 831      //  \> if ????
 832      else if (unit.bk && (unit.ch == '>')) {
 833	  addToken(currentToken);
 834	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 835      } 
 836
 837      // NON-WORD BREAK OPERATOR
 838      // \B if ????
 839
 840      else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 841	  addToken(currentToken);
 842	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 843      } 
 844
 845      
 846      // DIGIT OPERATOR
 847      //  \d if RE_CHAR_CLASS_ESCAPES is set
 848      
 849      else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 850	addToken(currentToken);
 851	currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 852      }
 853
 854      // NON-DIGIT OPERATOR
 855      //  \D
 856
 857	else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 858	  addToken(currentToken);
 859	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 860	}
 861
 862	// NEWLINE ESCAPE
 863        //  \n
 864
 865	else if (unit.bk && (unit.ch == 'n')) {
 866	  addToken(currentToken);
 867	  currentToken = new RETokenChar(subIndex,'\n',false);
 868	}
 869
 870	// RETURN ESCAPE
 871        //  \r
 872
 873	else if (unit.bk && (unit.ch == 'r')) {
 874	  addToken(currentToken);
 875	  currentToken = new RETokenChar(subIndex,'\r',false);
 876	}
 877
 878	// WHITESPACE OPERATOR
 879        //  \s if RE_CHAR_CLASS_ESCAPES is set
 880
 881	else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 882	  addToken(currentToken);
 883	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 884	}
 885
 886	// NON-WHITESPACE OPERATOR
 887        //  \S
 888
 889	else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 890	  addToken(currentToken);
 891	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
 892	}
 893
 894	// TAB ESCAPE
 895        //  \t
 896
 897	else if (unit.bk && (unit.ch == 't')) {
 898	  addToken(currentToken);
 899	  currentToken = new RETokenChar(subIndex,'\t',false);
 900	}
 901
 902	// ALPHANUMERIC OPERATOR
 903        //  \w
 904
 905	else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 906	  addToken(currentToken);
 907	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
 908	}
 909
 910	// NON-ALPHANUMERIC OPERATOR
 911        //  \W
 912
 913	else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 914	  addToken(currentToken);
 915	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
 916	}
 917
 918	// END OF STRING OPERATOR
 919        //  \Z
 920
 921	else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 922	  addToken(currentToken);
 923	  currentToken = new RETokenEnd(subIndex,null);
 924	}
 925
 926        // HEX CHARACTER, UNICODE CHARACTER
 927        //  \x1B, \u1234
 928	
 929	else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
 930		 (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
 931	  CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
 932	  if (ce == null)
 933	    throw new REException("invalid hex character", REException.REG_ESCAPE, index);
 934	  index = index - 2 + ce.len;
 935	  addToken(currentToken);
 936	  currentToken = new RETokenChar(subIndex,ce.ch,insens);
 937	}
 938
 939	// NAMED PROPERTY
 940	// \p{prop}, \P{prop}
 941
 942	else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
 943	         (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
 944	  NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
 945	  if (np == null)
 946	      throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
 947	  index = index - 2 + np.len;
 948	  addToken(currentToken);
 949	  currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
 950	}
 951
 952	// NON-SPECIAL CHARACTER (or escape to make literal)
 953        //  c | \* for example
 954
 955	else {  // not a special character
 956	  addToken(currentToken);
 957	  currentToken = new RETokenChar(subIndex,unit.ch,insens);
 958	} 
 959      } // end while
 960
 961    // Add final buffered token and an EndSub marker
 962    addToken(currentToken);
 963      
 964    if (branches != null) {
 965	branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
 966	branches.trimToSize(); // compact the Vector
 967	minimumLength = 0;
 968	maximumLength = 0;
 969	firstToken = lastToken = null;
 970	addToken(new RETokenOneOf(subIndex,branches,false));
 971    } 
 972    else addToken(new RETokenEndSub(subIndex));
 973
 974  }
 975
 976  private static class ParseCharClassResult {
 977      RETokenOneOf token;
 978      int index;
 979      boolean returnAtAndOperator = false;
 980  }
 981
 982  /**
 983   * Parse [...] or [^...] and make an RETokenOneOf instance.
 984   * @param subIndex subIndex to be given to the created RETokenOneOf instance.
 985   * @param pattern Input array of characters to be parsed.
 986   * @param index Index pointing to the character next to the beginning '['.
 987   * @param pLength Limit of the input array.
 988   * @param cflags Compilation flags used to parse the pattern.
 989   * @param pflags Flags that affect the behavior of this method.
 990   * @param syntax Syntax used to parse the pattern.
 991   */
 992  private static ParseCharClassResult parseCharClass(int subIndex,
 993		char[] pattern, int index,
 994		int pLength, int cflags, RESyntax syntax, int pflags)
 995		throws REException {
 996
 997	boolean insens = ((cflags & REG_ICASE) > 0);
 998	Vector options = new Vector();
 999	Vector addition = new Vector();
1000	boolean additionAndAppeared = false;
1001	final int RETURN_AT_AND = 0x01;
1002	boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1003	boolean negative = false;
1004	char ch;
1005
1006	char lastChar = 0;
1007	boolean lastCharIsSet = false;
1008	if (index == pLength) throw new REException("unmatched bracket",REException.REG_EBRACK,index);
1009	
1010	// Check for initial caret, negation
1011	if ((ch = pattern[index]) == '^') {
1012	  negative = true;
1013	  if (++index == pLength) throw new REException("unexpected end of character class",REException.REG_EBRACK,index);
1014	  ch = pattern[index];
1015	}
1016
1017	// Check for leading right bracket literal
1018	if (ch == ']') {
1019	  lastChar = ch; lastCharIsSet = true;
1020	  if (++index == pLength) throw new REException("unexpected end of character class",REException.REG_EBRACK,index);
1021	}
1022
1023	while ((ch = pattern[index++]) != ']') {
1024	  if ((ch == '-') && (lastCharIsSet)) {
1025	    if (index == pLength) throw new REException("unexpected end of character class",REException.REG_EBRACK,index);
1026	    if ((ch = pattern[index]) == ']') {
1027	      options.addElement(new RETokenChar(subIndex,lastChar,insens));
1028	      lastChar = '-';
1029	    } else {
1030	      if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1031	        CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
1032	        if (ce == null)
1033		  throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1034		ch = ce.ch;
1035		index = index + ce.len - 1;
1036	      }
1037	      options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
1038	      lastChar = 0; lastCharIsSet = false;
1039	      index++;
1040	    }
1041          } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1042            if (index == pLength) throw new REException("unexpected end of character class",REException.REG_EBRACK,index);
1043	    int posixID = -1;
1044	    boolean negate = false;
1045            char asciiEsc = 0;
1046	    boolean asciiEscIsSet = false;
1047	    NamedProperty np = null;
1048	    if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
1049	      switch (pattern[index]) {
1050	      case 'D':
1051		negate = true;
1052	      case 'd':
1053		posixID = RETokenPOSIX.DIGIT;
1054		break;
1055	      case 'S':
1056		negate = true;
1057	      case 's':
1058		posixID = RETokenPOSIX.SPACE;
1059		break;
1060	      case 'W':
1061		negate = true;
1062	      case 'w':
1063		posixID = RETokenPOSIX.ALNUM;
1064		break;
1065	      }
1066	    }
1067	    if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
1068	      np = getNamedProperty(pattern, index - 1, pLength);
1069	      if (np == null)
1070		throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1071	      index = index - 1 + np.len - 1;
1072	    }
1073	    else {
1074	      CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
1075	      if (ce == null)
1076		throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1077	      asciiEsc = ce.ch; asciiEscIsSet = true;
1078	      index = index - 1 + ce.len - 1;
1079	    }
1080	    if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1081	    
1082	    if (posixID != -1) {
1083	      options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
1084	    } else if (np != null) {
1085	      options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
1086	    } else if (asciiEscIsSet) {
1087	      lastChar = asciiEsc; lastCharIsSet = true;
1088	    } else {
1089	      lastChar = pattern[index]; lastCharIsSet = true;
1090	    }
1091	    ++index;
1092	  } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
1093	    StringBuffer posixSet = new StringBuffer();
1094	    index = getPosixSet(pattern,index+1,posixSet);
1095	    int posixId = RETokenPOSIX.intValue(posixSet.toString());
1096	    if (posixId != -1)
1097	      options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
1098	  } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
1099		ParseCharClassResult result = parseCharClass(
1100		    subIndex, pattern, index, pLength, cflags, syntax, 0);
1101		addition.addElement(result.token);
1102		addition.addElement("|");
1103		index = result.index;
1104	  } else if ((ch == '&') &&
1105		     (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
1106		     (index < pLength) && (pattern[index] == '&')) {
1107		if (returnAtAndOperator) {
1108		    ParseCharClassResult result = new ParseCharClassResult(); 
1109		    options.trimToSize();
1110		    if (additionAndAppeared) addition.addElement("&");
1111		    if (addition.size() == 0) addition = null;
1112		    result.token = new RETokenOneOf(subIndex,
1113			options, addition, negative);
1114		    result.index = index - 1;
1115		    result.returnAtAndOperator = true;
1116		    return result;
1117		}
1118		// The precedence of the operator "&&" is the lowest.
1119		// So we postpone adding "&" until other elements
1120		// are added. And we insert Boolean.FALSE at the
1121		// beginning of the list of tokens following "&&".
1122		// So, "&&[a-b][k-m]" will be stored in the Vecter
1123		// addition in this order:
1124		//     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1125		if (additionAndAppeared) addition.addElement("&");
1126		addition.addElement(Boolean.FALSE);
1127		additionAndAppeared = true;
1128
1129		// The part on which "&&" operates may be either
1130		//   (1) explicitly enclosed by []
1131		//   or
1132		//   (2) not enclosed by [] and terminated by the
1133		//       next "&&" or the end of the character list.
1134	        //  Let the preceding else if block do the case (1).
1135		//  We must do something in case of (2).
1136		if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
1137		    ParseCharClassResult result = parseCharClass(
1138			subIndex, pattern, index+1, pLength, cflags, syntax,
1139			RETURN_AT_AND);
1140		    addition.addElement(result.token);
1141		    addition.addElement("|");
1142		    // If the method returned at the next "&&", it is OK.
1143		    // Otherwise we have eaten the mark of the end of this
1144		    // character list "]".  In this case we must give back
1145		    // the end mark.
1146		    index = (result.returnAtAndOperator ?
1147			result.index: result.index - 1);
1148		}
1149	  } else {
1150	    if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1151	    lastChar = ch; lastCharIsSet = true;
1152	  }
1153	  if (index == pLength) throw new REException("unexpected end of character class",REException.REG_EBRACK,index);
1154	} // while in list
1155	// Out of list, index is one past ']'
1156	    
1157	if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
1158	   
1159	ParseCharClassResult result = new ParseCharClassResult(); 
1160	// Create a new RETokenOneOf
1161	options.trimToSize();
1162	if (additionAndAppeared) addition.addElement("&");
1163	if (addition.size() == 0) addition = null;
1164	result.token = new RETokenOneOf(subIndex,options, addition, negative);
1165	result.index = index;
1166	return result;
1167  }
1168
1169  private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
1170    unit.ch = input[index++];
1171    unit.bk = (unit.ch == '\\'
1172	       && (!quot || index >= input.length || input[index] == 'E'));
1173    if (unit.bk)
1174      if (index < input.length)
1175	unit.ch = input[index++];
1176      else throw new REException("backslash at end of pattern",REException.REG_ESCAPE,index);
1177    return index;
1178  }
1179
1180  private static int parseInt(char[] input, int pos, int len, int radix) {
1181    int ret = 0;
1182    for (int i = pos; i < pos + len; i++) {
1183	ret = ret * radix + Character.digit(input[i], radix);
1184    }
1185    return ret;
1186  }
1187
1188  /**
1189   * This class represents various expressions for a character.
1190   * "a"      : 'a' itself.
1191   * "\0123"  : Octal char 0123
1192   * "\x1b"   : Hex char 0x1b
1193   * "\u1234" : Unicode char \u1234
1194   */
1195  private static class CharExpression {
1196    /** character represented by this expression */
1197    char ch;
1198    /** String expression */
1199    String expr;
1200    /** length of this expression */
1201    int len;
1202    public String toString() { return expr; }
1203  }
1204
1205  private static CharExpression getCharExpression(char[] input, int pos, int lim,
1206        RESyntax syntax) {
1207    CharExpression ce = new CharExpression();
1208    char c = input[pos];
1209    if (c == '\\') {
1210      if (pos + 1 >= lim) return null;
1211      c = input[pos + 1];
1212      switch(c) {
1213      case 't':
1214        ce.ch = '\t';
1215        ce.len = 2;
1216        break;
1217      case 'n':
1218        ce.ch = '\n';
1219        ce.len = 2;
1220        break;
1221      case 'r':
1222        ce.ch = '\r';
1223        ce.len = 2;
1224        break;
1225      case 'x':
1226      case 'u':
1227        if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1228            (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1229          int l = 0;
1230          int expectedLength = (c == 'x' ? 2 : 4);
1231          for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
1232            if (i >= lim) break;
1233            if (!((input[i] >= '0' && input[i] <= '9') ||
1234                  (input[i] >= 'A' && input[i] <= 'F') ||
1235                  (input[i] >= 'a' && input[i] <= 'f')))
1236                break;
1237	    l++;
1238          }
1239          if (l != expectedLength) return null;
1240          ce.ch = (char)(parseInt(input, pos + 2, l, 16));
1241	  ce.len = l + 2;
1242        }
1243        else {
1244          ce.ch = c;
1245          ce.len = 2;
1246        }
1247        break;
1248      case '0':
1249        if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
1250          int l = 0;
1251          for (int i = pos + 2; i < pos + 2 + 3; i++) {
1252            if (i >= lim) break;
1253	    if (input[i] < '0' || input[i] > '7') break;
1254            l++;
1255          }
1256          if (l == 3 && input[pos + 2] > '3') l--;
1257          if (l <= 0) return null;
1258          ce.ch = (char)(parseInt(input, pos + 2, l, 8));
1259          ce.len = l + 2;
1260        }
1261        else {
1262          ce.ch = c;
1263          ce.len = 2;
1264        }
1265        break;
1266      default:
1267        ce.ch = c;
1268        ce.len = 2;
1269        break;
1270      }
1271    }
1272    else {
1273      ce.ch = input[pos];
1274      ce.len = 1;
1275    }
1276    ce.expr = new String(input, pos, ce.len);
1277    return ce;
1278  }
1279
1280  /**
1281   * This class represents a substring in a pattern string expressing
1282   * a named property.
1283   * "\pA"      : Property named "A"
1284   * "\p{prop}" : Property named "prop"
1285   * "\PA"      : Property named "A" (Negated)
1286   * "\P{prop}" : Property named "prop" (Negated)
1287   */
1288  private static class NamedProperty {
1289    /** Property name */
1290    String name;
1291    /** Negated or not */
1292    boolean negate;
1293    /** length of this expression */
1294    int len;
1295  }
1296
1297  private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
1298    NamedProperty np = new NamedProperty();
1299    char c = input[pos];
1300    if (c == '\\') {
1301      if (++pos >= lim) return null;
1302      c = input[pos++];
1303      switch(c) {
1304      case 'p':
1305        np.negate = false;
1306        break;
1307      case 'P':
1308        np.negate = true;
1309        break;
1310      default:
1311	return null;
1312      }
1313      c = input[pos++];
1314      if (c == '{') {
1315          int p = -1;
1316	  for (int i = pos; i < lim; i++) {
1317	      if (input[i] == '}') {
1318		  p = i;
1319		  break;
1320	      }
1321	  }
1322	  if (p < 0) return null;
1323	  int len = p - pos;
1324          np.name = new String(input, pos, len);
1325	  np.len = len + 4;
1326      }
1327      else {
1328          np.name = new String(input, pos - 1, 1);
1329	  np.len = 3;
1330      }
1331      return np;
1332    }
1333    else return null;
1334  }
1335
1336  private static RETokenNamedProperty getRETokenNamedProperty(
1337      int subIndex, NamedProperty np, boolean insens, int index)
1338      throws REException {
1339    try {
1340	return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
1341    }
1342    catch (REException e) {
1343	REException ree;
1344	ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
1345	// ree.initCause(e);
1346	throw ree;
1347    }
1348  }
1349
1350  /**
1351   * Checks if the regular expression matches the input in its entirety.
1352   *
1353   * @param input The input text.
1354   */
1355  public boolean isMatch(Object input) {
1356    return isMatch(input,0,0);
1357  }
1358  
1359  /**
1360   * Checks if the input string, starting from index, is an exact match of
1361   * this regular expression.
1362   *
1363   * @param input The input text.
1364   * @param index The offset index at which the search should be begin.
1365   */
1366  public boolean isMatch(Object input,int index) {
1367    return isMatch(input,index,0);
1368  }
1369  
1370
1371  /**
1372   * Checks if the input, starting from index and using the specified
1373   * execution flags, is an exact match of this regular expression.
1374   *
1375   * @param input The input text.
1376   * @param index The offset index at which the search should be begin.
1377   * @param eflags The logical OR of any execution flags above.
1378   */
1379  public boolean isMatch(Object input,int index,int eflags) {
1380    return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1381  }
1382
1383  private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1384    if (firstToken == null)  // Trivial case
1385      return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1386    REMatch m = new REMatch(numSubs, index, eflags);
1387    if (firstToken.match(input, m)) {
1388	while (m != null) {
1389	    if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1390		return true;
1391	    }
1392	    m = m.next;
1393	}
1394    }
1395    return false;
1396  }
1397    
1398  /**
1399   * Returns the maximum number of subexpressions in this regular expression.
1400   * If the expression contains branches, the value returned will be the
1401   * maximum subexpressions in any of the branches.
1402   */
1403  public int getNumSubs() {
1404    return numSubs;
1405  }
1406
1407  // Overrides REToken.setUncle
1408  void setUncle(REToken uncle) {
1409      if (lastToken != null) {
1410	  lastToken.setUncle(uncle);
1411      } else super.setUncle(uncle); // to deal with empty subexpressions
1412  }
1413
1414  // Overrides REToken.chain
1415
1416  boolean chain(REToken next) {
1417    super.chain(next);
1418    setUncle(next);
1419    return true;
1420  }
1421
1422  /**
1423   * Returns the minimum number of characters that could possibly
1424   * constitute a match of this regular expression.
1425   */
1426  public int getMinimumLength() {
1427      return minimumLength;
1428  }
1429
1430  public int getMaximumLength() {
1431      return maximumLength;
1432  }
1433
1434  /**
1435   * Returns an array of all matches found in the input.
1436   *
1437   * If the regular expression allows the empty string to match, it w

Large files are truncated, but you can click here to view the full file