/interpreter/tags/at2-build190607/src/edu/vub/util/regexp/RESyntax.java
Java | 563 lines | 194 code | 89 blank | 280 comment | 3 complexity | d10bad2dd3cfee27b9948752dff54bf1 MD5 | raw file
1/* gnu/regexp/RESyntax.java 2 Copyright (C) 2006 Free Software Foundation, Inc. 3 4This file is part of GNU Classpath. 5 6GNU Classpath is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 2, or (at your option) 9any later version. 10 11GNU Classpath is distributed in the hope that it will be useful, but 12WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14General Public License for more details. 15 16You should have received a copy of the GNU General Public License 17along with GNU Classpath; see the file COPYING. If not, write to the 18Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 1902110-1301 USA. 20 21Linking this library statically or dynamically with other modules is 22making a combined work based on this library. Thus, the terms and 23conditions of the GNU General Public License cover the whole 24combination. 25 26As a special exception, the copyright holders of this library give you 27permission to link this library with independent modules to produce an 28executable, regardless of the license terms of these independent 29modules, and to copy and distribute the resulting executable under 30terms of your choice, provided that you also meet, for each linked 31independent module, the terms and conditions of the license of that 32module. An independent module is a module which is not derived from 33or based on this library. If you modify this library, you may extend 34this exception to your version of the library, but you are not 35obligated to do so. If you do not wish to do so, delete this 36exception statement from your version. */ 37 38 39package edu.vub.util.regexp; 40import java.io.Serializable; 41import java.util.BitSet; 42 43/** 44 * An RESyntax specifies the way a regular expression will be compiled. 45 * This class provides a number of predefined useful constants for 46 * emulating popular regular expression syntaxes. Additionally the 47 * user may construct his or her own syntax, using any combination of the 48 * syntax bit constants. The syntax is an optional argument to any of the 49 * matching methods on class RE. 50 * 51 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 52 */ 53 54public final class RESyntax implements Serializable { 55 static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator"); 56 57 private static final String SYNTAX_IS_FINAL = "Syntax has been declared final and cannot be modified"; 58 59 private BitSet bits; 60 61 // true for the constant defined syntaxes 62 private boolean isFinal = false; 63 64 private String lineSeparator = DEFAULT_LINE_SEPARATOR; 65 66 // Values for constants are bit indexes 67 68 /** 69 * Syntax bit. Backslash is an escape character in lists. 70 */ 71 public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0; 72 73 /** 74 * Syntax bit. Use \? instead of ? and \+ instead of +. 75 */ 76 public static final int RE_BK_PLUS_QM = 1; 77 78 /** 79 * Syntax bit. POSIX character classes ([:...:]) in lists are allowed. 80 */ 81 public static final int RE_CHAR_CLASSES = 2; 82 83 /** 84 * Syntax bit. ^ and $ are special everywhere. 85 * <B>Not implemented.</B> 86 */ 87 public static final int RE_CONTEXT_INDEP_ANCHORS = 3; 88 89 /** 90 * Syntax bit. Repetition operators are only special in valid positions. 91 * <B>Not implemented.</B> 92 */ 93 public static final int RE_CONTEXT_INDEP_OPS = 4; 94 95 /** 96 * Syntax bit. Repetition and alternation operators are invalid 97 * at start and end of pattern and other places. 98 * <B>Not implemented</B>. 99 */ 100 public static final int RE_CONTEXT_INVALID_OPS = 5; 101 102 /** 103 * Syntax bit. Match-any-character operator (.) matches a newline. 104 */ 105 public static final int RE_DOT_NEWLINE = 6; 106 107 /** 108 * Syntax bit. Match-any-character operator (.) does not match a null. 109 */ 110 public static final int RE_DOT_NOT_NULL = 7; 111 112 /** 113 * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed. 114 */ 115 public static final int RE_INTERVALS = 8; 116 117 /** 118 * Syntax bit. No alternation (|), match one-or-more (+), or 119 * match zero-or-one (?) operators. 120 */ 121 public static final int RE_LIMITED_OPS = 9; 122 123 /** 124 * Syntax bit. Newline is an alternation operator. 125 */ 126 public static final int RE_NEWLINE_ALT = 10; // impl. 127 128 /** 129 * Syntax bit. Intervals use { } instead of \{ \} 130 */ 131 public static final int RE_NO_BK_BRACES = 11; 132 133 /** 134 * Syntax bit. Grouping uses ( ) instead of \( \). 135 */ 136 public static final int RE_NO_BK_PARENS = 12; 137 138 /** 139 * Syntax bit. Backreferences not allowed. 140 */ 141 public static final int RE_NO_BK_REFS = 13; 142 143 /** 144 * Syntax bit. Alternation uses | instead of \| 145 */ 146 public static final int RE_NO_BK_VBAR = 14; 147 148 /** 149 * Syntax bit. <B>Not implemented</B>. 150 */ 151 public static final int RE_NO_EMPTY_RANGES = 15; 152 153 /** 154 * Syntax bit. An unmatched right parenthesis (')' or '\)', depending 155 * on RE_NO_BK_PARENS) will throw an exception when compiling. 156 */ 157 public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16; 158 159 /** 160 * Syntax bit. <B>Not implemented.</B> 161 */ 162 public static final int RE_HAT_LISTS_NOT_NEWLINE = 17; 163 164 /** 165 * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?). 166 */ 167 public static final int RE_STINGY_OPS = 18; 168 169 /** 170 * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W). 171 */ 172 public static final int RE_CHAR_CLASS_ESCAPES = 19; 173 174 /** 175 * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved). 176 */ 177 public static final int RE_PURE_GROUPING = 20; 178 179 /** 180 * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression 181 * to the text following the current position without consuming that text. 182 */ 183 public static final int RE_LOOKAHEAD = 21; 184 185 /** 186 * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z). 187 */ 188 public static final int RE_STRING_ANCHORS = 22; 189 190 /** 191 * Syntax bit. Allow embedded comments, (?#comment), as in Perl5. 192 */ 193 public static final int RE_COMMENTS = 23; 194 195 /** 196 * Syntax bit. Allow character class escapes within lists, as in Perl5. 197 */ 198 public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24; 199 200 /** 201 * Syntax bit. Possessive matching is allowed (++, *+, ?+, {x,y}+). 202 */ 203 public static final int RE_POSSESSIVE_OPS = 25; 204 205 /** 206 * Syntax bit. Allow embedded flags, (?is-x), as in Perl5. 207 */ 208 public static final int RE_EMBEDDED_FLAGS = 26; 209 210 /** 211 * Syntax bit. Allow octal char (\0377), as in Perl5. 212 */ 213 public static final int RE_OCTAL_CHAR = 27; 214 215 /** 216 * Syntax bit. Allow hex char (\x1b), as in Perl5. 217 */ 218 public static final int RE_HEX_CHAR = 28; 219 220 /** 221 * Syntax bit. Allow Unicode char (\u1234), as in Java 1.4. 222 */ 223 public static final int RE_UNICODE_CHAR = 29; 224 225 /** 226 * Syntax bit. Allow named property (\p{P}, \P{p}), as in Perl5. 227 */ 228 public static final int RE_NAMED_PROPERTY = 30; 229 230 /** 231 * Syntax bit. Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4. 232 */ 233 public static final int RE_NESTED_CHARCLASS = 31; 234 235 private static final int BIT_TOTAL = 32; 236 237 /** 238 * Predefined syntax. 239 * Emulates regular expression support in the awk utility. 240 */ 241 public static final RESyntax RE_SYNTAX_AWK; 242 243 /** 244 * Predefined syntax. 245 * Emulates regular expression support in the ed utility. 246 */ 247 public static final RESyntax RE_SYNTAX_ED; 248 249 /** 250 * Predefined syntax. 251 * Emulates regular expression support in the egrep utility. 252 */ 253 public static final RESyntax RE_SYNTAX_EGREP; 254 255 /** 256 * Predefined syntax. 257 * Emulates regular expression support in the GNU Emacs editor. 258 */ 259 public static final RESyntax RE_SYNTAX_EMACS; 260 261 /** 262 * Predefined syntax. 263 * Emulates regular expression support in the grep utility. 264 */ 265 public static final RESyntax RE_SYNTAX_GREP; 266 267 /** 268 * Predefined syntax. 269 * Emulates regular expression support in the POSIX awk specification. 270 */ 271 public static final RESyntax RE_SYNTAX_POSIX_AWK; 272 273 /** 274 * Predefined syntax. 275 * Emulates POSIX basic regular expression support. 276 */ 277 public static final RESyntax RE_SYNTAX_POSIX_BASIC; 278 279 /** 280 * Predefined syntax. 281 * Emulates regular expression support in the POSIX egrep specification. 282 */ 283 public static final RESyntax RE_SYNTAX_POSIX_EGREP; 284 285 /** 286 * Predefined syntax. 287 * Emulates POSIX extended regular expression support. 288 */ 289 public static final RESyntax RE_SYNTAX_POSIX_EXTENDED; 290 291 /** 292 * Predefined syntax. 293 * Emulates POSIX basic minimal regular expressions. 294 */ 295 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC; 296 297 /** 298 * Predefined syntax. 299 * Emulates POSIX extended minimal regular expressions. 300 */ 301 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED; 302 303 /** 304 * Predefined syntax. 305 * Emulates regular expression support in the sed utility. 306 */ 307 public static final RESyntax RE_SYNTAX_SED; 308 309 /** 310 * Predefined syntax. 311 * Emulates regular expression support in Larry Wall's perl, version 4, 312 */ 313 public static final RESyntax RE_SYNTAX_PERL4; 314 315 /** 316 * Predefined syntax. 317 * Emulates regular expression support in Larry Wall's perl, version 4, 318 * using single line mode (/s modifier). 319 */ 320 public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s) 321 322 /** 323 * Predefined syntax. 324 * Emulates regular expression support in Larry Wall's perl, version 5. 325 */ 326 public static final RESyntax RE_SYNTAX_PERL5; 327 328 /** 329 * Predefined syntax. 330 * Emulates regular expression support in Larry Wall's perl, version 5, 331 * using single line mode (/s modifier). 332 */ 333 public static final RESyntax RE_SYNTAX_PERL5_S; 334 335 /** 336 * Predefined syntax. 337 * Emulates regular expression support in Java 1.4's java.util.regex 338 * package. 339 */ 340 public static final RESyntax RE_SYNTAX_JAVA_1_4; 341 342 static { 343 // Define syntaxes 344 345 RE_SYNTAX_EMACS = new RESyntax().makeFinal(); 346 347 RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax() 348 .set(RE_CHAR_CLASSES) 349 .set(RE_DOT_NEWLINE) 350 .set(RE_DOT_NOT_NULL) 351 .set(RE_INTERVALS) 352 .set(RE_NO_EMPTY_RANGES) 353 .makeFinal(); 354 355 RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON) 356 .set(RE_BK_PLUS_QM) 357 .makeFinal(); 358 359 RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON) 360 .set(RE_CONTEXT_INDEP_ANCHORS) 361 .set(RE_CONTEXT_INDEP_OPS) 362 .set(RE_NO_BK_BRACES) 363 .set(RE_NO_BK_PARENS) 364 .set(RE_NO_BK_VBAR) 365 .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 366 .makeFinal(); 367 368 RE_SYNTAX_AWK = new RESyntax() 369 .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 370 .set(RE_DOT_NOT_NULL) 371 .set(RE_NO_BK_PARENS) 372 .set(RE_NO_BK_REFS) 373 .set(RE_NO_BK_VBAR) 374 .set(RE_NO_EMPTY_RANGES) 375 .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 376 .makeFinal(); 377 378 RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED) 379 .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 380 .makeFinal(); 381 382 RE_SYNTAX_GREP = new RESyntax() 383 .set(RE_BK_PLUS_QM) 384 .set(RE_CHAR_CLASSES) 385 .set(RE_HAT_LISTS_NOT_NEWLINE) 386 .set(RE_INTERVALS) 387 .set(RE_NEWLINE_ALT) 388 .makeFinal(); 389 390 RE_SYNTAX_EGREP = new RESyntax() 391 .set(RE_CHAR_CLASSES) 392 .set(RE_CONTEXT_INDEP_ANCHORS) 393 .set(RE_CONTEXT_INDEP_OPS) 394 .set(RE_HAT_LISTS_NOT_NEWLINE) 395 .set(RE_NEWLINE_ALT) 396 .set(RE_NO_BK_PARENS) 397 .set(RE_NO_BK_VBAR) 398 .makeFinal(); 399 400 RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP) 401 .set(RE_INTERVALS) 402 .set(RE_NO_BK_BRACES) 403 .makeFinal(); 404 405 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ 406 407 RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC) 408 .makeFinal(); 409 410 RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC) 411 .makeFinal(); 412 413 RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON) 414 .set(RE_LIMITED_OPS) 415 .makeFinal(); 416 417 /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 418 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 419 420 RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON) 421 .set(RE_CONTEXT_INDEP_ANCHORS) 422 .set(RE_CONTEXT_INVALID_OPS) 423 .set(RE_NO_BK_BRACES) 424 .set(RE_NO_BK_PARENS) 425 .set(RE_NO_BK_REFS) 426 .set(RE_NO_BK_VBAR) 427 .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 428 .makeFinal(); 429 430 /* There is no official Perl spec, but here's a "best guess" */ 431 432 RE_SYNTAX_PERL4 = new RESyntax() 433 .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 434 .set(RE_CONTEXT_INDEP_ANCHORS) 435 .set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently 436 .set(RE_INTERVALS) 437 .set(RE_NO_BK_BRACES) 438 .set(RE_NO_BK_PARENS) 439 .set(RE_NO_BK_VBAR) 440 .set(RE_NO_EMPTY_RANGES) 441 .set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S 442 .makeFinal(); 443 444 RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4) 445 .set(RE_DOT_NEWLINE) 446 .makeFinal(); 447 448 RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4) 449 .set(RE_PURE_GROUPING) // (?:) 450 .set(RE_STINGY_OPS) // *?,??,+?,{}? 451 .set(RE_LOOKAHEAD) // (?=)(?!) 452 .set(RE_STRING_ANCHORS) // \A,\Z 453 .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within [] 454 .set(RE_COMMENTS) // (?#) 455 .set(RE_EMBEDDED_FLAGS) // (?imsx-imsx) 456 .set(RE_OCTAL_CHAR) // \0377 457 .set(RE_HEX_CHAR) // \x1b 458 .set(RE_NAMED_PROPERTY) // \p{prop}, \P{prop} 459 .makeFinal(); 460 461 RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5) 462 .set(RE_DOT_NEWLINE) 463 .makeFinal(); 464 465 RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5) 466 // XXX 467 .set(RE_POSSESSIVE_OPS) // *+,?+,++,{}+ 468 .set(RE_UNICODE_CHAR) // \u1234 469 .set(RE_NESTED_CHARCLASS) // [a-z&&[^p-r]] 470 .makeFinal(); 471 } 472 473 /** 474 * Construct a new syntax object with all bits turned off. 475 * This is equivalent to RE_SYNTAX_EMACS. 476 */ 477 public RESyntax() { 478 bits = new BitSet(BIT_TOTAL); 479 } 480 481 /** 482 * Called internally when constructing predefined syntaxes 483 * so their interpretation cannot vary. Conceivably useful 484 * for your syntaxes as well. Causes IllegalAccessError to 485 * be thrown if any attempt to modify the syntax is made. 486 * 487 * @return this object for convenient chaining 488 */ 489 public RESyntax makeFinal() { 490 isFinal = true; 491 return this; 492 } 493 494 /** 495 * Construct a new syntax object with all bits set the same 496 * as the other syntax. 497 */ 498 public RESyntax(RESyntax other) { 499 bits = (BitSet) other.bits.clone(); 500 } 501 502 /** 503 * Check if a given bit is set in this syntax. 504 */ 505 public boolean get(int index) { 506 return bits.get(index); 507 } 508 509 /** 510 * Set a given bit in this syntax. 511 * 512 * @param index the constant (RESyntax.RE_xxx) bit to set. 513 * @return a reference to this object for easy chaining. 514 */ 515 public RESyntax set(int index) { 516 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL); 517 bits.set(index); 518 return this; 519 } 520 521 /** 522 * Clear a given bit in this syntax. 523 * 524 * @param index the constant (RESyntax.RE_xxx) bit to clear. 525 * @return a reference to this object for easy chaining. 526 */ 527 public RESyntax clear(int index) { 528 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL); 529 bits.clear(index); 530 return this; 531 } 532 533 /** 534 * Changes the line separator string for regular expressions 535 * created using this RESyntax. The default separator is the 536 * value returned by the system property "line.separator", which 537 * should be correct when reading platform-specific files from a 538 * filesystem. However, many programs may collect input from 539 * sources where the line separator is differently specified (for 540 * example, in the applet environment, the text box widget 541 * interprets line breaks as single-character newlines, 542 * regardless of the host platform. 543 * 544 * Note that setting the line separator to a character or 545 * characters that have specific meaning within the current syntax 546 * can cause unexpected chronosynclastic infundibula. 547 * 548 * @return this object for convenient chaining 549 */ 550 public RESyntax setLineSeparator(String aSeparator) { 551 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL); 552 lineSeparator = aSeparator; 553 return this; 554 } 555 556 /** 557 * Returns the currently active line separator string. The default 558 * is the platform-dependent system property "line.separator". 559 */ 560 public String getLineSeparator() { 561 return lineSeparator; 562 } 563}