PageRenderTime 60ms CodeModel.GetById 15ms app.highlight 39ms RepoModel.GetById 1ms app.codeStats 0ms

/interpreter/tags/at2-build190607/src/edu/vub/util/regexp/RESyntax.java

http://ambienttalk.googlecode.com/
Java | 563 lines | 194 code | 89 blank | 280 comment | 3 complexity | d10bad2dd3cfee27b9948752dff54bf1 MD5 | raw file
  1/* gnu/regexp/RESyntax.java
  2   Copyright (C) 2006 Free Software Foundation, Inc.
  3
  4This file is part of GNU Classpath.
  5
  6GNU Classpath is free software; you can redistribute it and/or modify
  7it under the terms of the GNU General Public License as published by
  8the Free Software Foundation; either version 2, or (at your option)
  9any later version.
 10
 11GNU Classpath is distributed in the hope that it will be useful, but
 12WITHOUT ANY WARRANTY; without even the implied warranty of
 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14General Public License for more details.
 15
 16You should have received a copy of the GNU General Public License
 17along with GNU Classpath; see the file COPYING.  If not, write to the
 18Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 1902110-1301 USA.
 20
 21Linking this library statically or dynamically with other modules is
 22making a combined work based on this library.  Thus, the terms and
 23conditions of the GNU General Public License cover the whole
 24combination.
 25
 26As a special exception, the copyright holders of this library give you
 27permission to link this library with independent modules to produce an
 28executable, regardless of the license terms of these independent
 29modules, and to copy and distribute the resulting executable under
 30terms of your choice, provided that you also meet, for each linked
 31independent module, the terms and conditions of the license of that
 32module.  An independent module is a module which is not derived from
 33or based on this library.  If you modify this library, you may extend
 34this exception to your version of the library, but you are not
 35obligated to do so.  If you do not wish to do so, delete this
 36exception statement from your version. */
 37
 38
 39package edu.vub.util.regexp;
 40import java.io.Serializable;
 41import java.util.BitSet;
 42
 43/**
 44 * An RESyntax specifies the way a regular expression will be compiled.
 45 * This class provides a number of predefined useful constants for
 46 * emulating popular regular expression syntaxes.  Additionally the
 47 * user may construct his or her own syntax, using any combination of the
 48 * syntax bit constants.  The syntax is an optional argument to any of the
 49 * matching methods on class RE.
 50 *
 51 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 52 */
 53
 54public final class RESyntax implements Serializable {
 55    static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator");
 56
 57    private static final String SYNTAX_IS_FINAL = "Syntax has been declared final and cannot be modified";
 58
 59    private BitSet bits;
 60
 61    // true for the constant defined syntaxes
 62    private boolean isFinal = false;
 63
 64    private String lineSeparator = DEFAULT_LINE_SEPARATOR;
 65
 66  // Values for constants are bit indexes
 67
 68  /**
 69   * Syntax bit. Backslash is an escape character in lists.
 70   */
 71  public static final int RE_BACKSLASH_ESCAPE_IN_LISTS =  0;
 72
 73  /**
 74   * Syntax bit. Use \? instead of ? and \+ instead of +.
 75   */
 76  public static final int RE_BK_PLUS_QM                =  1;
 77
 78  /**
 79   * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
 80   */
 81  public static final int RE_CHAR_CLASSES              =  2;
 82
 83  /**
 84   * Syntax bit. ^ and $ are special everywhere.
 85   * <B>Not implemented.</B>
 86   */
 87  public static final int RE_CONTEXT_INDEP_ANCHORS     =  3; 
 88
 89  /**
 90   * Syntax bit. Repetition operators are only special in valid positions.
 91   * <B>Not implemented.</B>
 92   */
 93  public static final int RE_CONTEXT_INDEP_OPS         =  4; 
 94
 95  /**
 96   * Syntax bit. Repetition and alternation operators are invalid
 97   * at start and end of pattern and other places. 
 98   * <B>Not implemented</B>.
 99   */
100  public static final int RE_CONTEXT_INVALID_OPS       =  5; 
101
102  /**
103   * Syntax bit. Match-any-character operator (.) matches a newline.
104   */
105  public static final int RE_DOT_NEWLINE               =  6;
106
107  /**
108   * Syntax bit. Match-any-character operator (.) does not match a null.
109   */
110  public static final int RE_DOT_NOT_NULL              =  7;
111
112  /**
113   * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
114   */
115  public static final int RE_INTERVALS                 =  8;
116
117  /**
118   * Syntax bit. No alternation (|), match one-or-more (+), or 
119   * match zero-or-one (?) operators.
120   */
121  public static final int RE_LIMITED_OPS               =  9;
122
123  /**
124   * Syntax bit. Newline is an alternation operator.
125   */
126  public static final int RE_NEWLINE_ALT               = 10; // impl.
127
128  /**
129   * Syntax bit. Intervals use { } instead of \{ \}
130   */
131  public static final int RE_NO_BK_BRACES              = 11; 
132
133  /**
134   * Syntax bit. Grouping uses ( ) instead of \( \).
135   */
136  public static final int RE_NO_BK_PARENS              = 12;
137
138  /**
139   * Syntax bit. Backreferences not allowed.
140   */
141  public static final int RE_NO_BK_REFS                = 13;
142
143  /**
144   * Syntax bit. Alternation uses | instead of \|
145   */
146  public static final int RE_NO_BK_VBAR                = 14;
147
148  /**
149   * Syntax bit. <B>Not implemented</B>.
150   */
151  public static final int RE_NO_EMPTY_RANGES           = 15;
152
153  /**
154   * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
155   * on RE_NO_BK_PARENS) will throw an exception when compiling.
156   */
157  public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
158
159  /**
160   * Syntax bit. <B>Not implemented.</B>
161   */
162  public static final int RE_HAT_LISTS_NOT_NEWLINE     = 17;
163
164  /**
165   * Syntax bit.  Stingy matching is allowed (+?, *?, ??, {x,y}?).
166   */
167  public static final int RE_STINGY_OPS                = 18;
168
169  /**
170   * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
171   */
172  public static final int RE_CHAR_CLASS_ESCAPES        = 19;
173
174  /**
175   * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
176   */
177  public static final int RE_PURE_GROUPING             = 20;
178
179  /**
180   * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
181   * to the text following the current position without consuming that text.
182   */
183  public static final int RE_LOOKAHEAD                 = 21;
184
185  /**
186   * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
187   */
188  public static final int RE_STRING_ANCHORS            = 22;
189
190  /**
191   * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
192   */
193  public static final int RE_COMMENTS                  = 23;
194
195  /**
196   * Syntax bit. Allow character class escapes within lists, as in Perl5.
197   */
198  public static final int RE_CHAR_CLASS_ESC_IN_LISTS   = 24;
199
200  /**
201   * Syntax bit.  Possessive matching is allowed (++, *+, ?+, {x,y}+).
202   */
203  public static final int RE_POSSESSIVE_OPS            = 25;
204
205  /**
206   * Syntax bit.  Allow embedded flags, (?is-x), as in Perl5.
207   */
208  public static final int RE_EMBEDDED_FLAGS            = 26;
209
210  /**
211   * Syntax bit.  Allow octal char (\0377), as in Perl5.
212   */
213  public static final int RE_OCTAL_CHAR                = 27;
214
215  /**
216   * Syntax bit.  Allow hex char (\x1b), as in Perl5.
217   */
218  public static final int RE_HEX_CHAR                  = 28;
219
220  /**
221   * Syntax bit.  Allow Unicode char (\u1234), as in Java 1.4.
222   */
223  public static final int RE_UNICODE_CHAR              = 29;
224
225  /**
226   * Syntax bit.  Allow named property (\p{P}, \P{p}), as in Perl5.
227   */
228  public static final int RE_NAMED_PROPERTY            = 30;
229
230  /**
231   * Syntax bit.  Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4.
232   */
233  public static final int RE_NESTED_CHARCLASS          = 31;
234
235  private static final int BIT_TOTAL                   = 32;
236
237  /**
238   * Predefined syntax.
239   * Emulates regular expression support in the awk utility.
240   */
241  public static final RESyntax RE_SYNTAX_AWK;
242
243  /**
244   * Predefined syntax.
245   * Emulates regular expression support in the ed utility.
246   */
247  public static final RESyntax RE_SYNTAX_ED;
248
249  /**
250   * Predefined syntax.
251   * Emulates regular expression support in the egrep utility.
252   */
253  public static final RESyntax RE_SYNTAX_EGREP;
254
255  /**
256   * Predefined syntax.
257   * Emulates regular expression support in the GNU Emacs editor.
258   */
259  public static final RESyntax RE_SYNTAX_EMACS;
260
261  /**
262   * Predefined syntax.
263   * Emulates regular expression support in the grep utility.
264   */
265  public static final RESyntax RE_SYNTAX_GREP;
266
267  /**
268   * Predefined syntax.
269   * Emulates regular expression support in the POSIX awk specification.
270   */
271  public static final RESyntax RE_SYNTAX_POSIX_AWK;
272
273  /**
274   * Predefined syntax.
275   * Emulates POSIX basic regular expression support.
276   */
277  public static final RESyntax RE_SYNTAX_POSIX_BASIC;
278
279  /**
280   * Predefined syntax.
281   * Emulates regular expression support in the POSIX egrep specification.
282   */
283  public static final RESyntax RE_SYNTAX_POSIX_EGREP;
284
285  /**
286   * Predefined syntax.
287   * Emulates POSIX extended regular expression support.
288   */
289  public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
290
291  /**
292   * Predefined syntax.
293   * Emulates POSIX basic minimal regular expressions.
294   */
295  public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
296
297  /**
298   * Predefined syntax.
299   * Emulates POSIX extended minimal regular expressions.
300   */
301  public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
302
303  /**
304   * Predefined syntax.
305   * Emulates regular expression support in the sed utility.
306   */
307  public static final RESyntax RE_SYNTAX_SED;
308
309  /**
310   * Predefined syntax.
311   * Emulates regular expression support in Larry Wall's perl, version 4,
312   */
313  public static final RESyntax RE_SYNTAX_PERL4;
314
315  /**
316   * Predefined syntax.
317   * Emulates regular expression support in Larry Wall's perl, version 4,
318   * using single line mode (/s modifier).
319   */
320  public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
321
322  /**
323   * Predefined syntax.
324   * Emulates regular expression support in Larry Wall's perl, version 5.
325   */
326  public static final RESyntax RE_SYNTAX_PERL5;  
327
328  /**
329   * Predefined syntax.
330   * Emulates regular expression support in Larry Wall's perl, version 5,
331   * using single line mode (/s modifier).
332   */
333  public static final RESyntax RE_SYNTAX_PERL5_S;
334
335    /**
336     * Predefined syntax.
337     * Emulates regular expression support in Java 1.4's java.util.regex
338     * package.
339     */
340    public static final RESyntax RE_SYNTAX_JAVA_1_4;
341
342  static {
343      // Define syntaxes
344      
345      RE_SYNTAX_EMACS = new RESyntax().makeFinal();
346      
347      RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax()
348	  .set(RE_CHAR_CLASSES)
349	  .set(RE_DOT_NEWLINE)
350	  .set(RE_DOT_NOT_NULL)
351	  .set(RE_INTERVALS)
352	  .set(RE_NO_EMPTY_RANGES)
353	  .makeFinal();
354      
355      RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
356	  .set(RE_BK_PLUS_QM)
357	  .makeFinal();
358      
359      RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
360	  .set(RE_CONTEXT_INDEP_ANCHORS)
361	  .set(RE_CONTEXT_INDEP_OPS)
362	  .set(RE_NO_BK_BRACES)
363	  .set(RE_NO_BK_PARENS)
364	  .set(RE_NO_BK_VBAR)
365	  .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
366	  .makeFinal();
367
368      RE_SYNTAX_AWK = new RESyntax()
369	  .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
370	  .set(RE_DOT_NOT_NULL)
371	  .set(RE_NO_BK_PARENS)
372	  .set(RE_NO_BK_REFS)
373	  .set(RE_NO_BK_VBAR)
374	  .set(RE_NO_EMPTY_RANGES)
375	  .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
376	  .makeFinal();
377      
378      RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
379	  .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
380	  .makeFinal();
381      
382      RE_SYNTAX_GREP = new RESyntax()
383	  .set(RE_BK_PLUS_QM)
384	  .set(RE_CHAR_CLASSES)
385	  .set(RE_HAT_LISTS_NOT_NEWLINE)
386	  .set(RE_INTERVALS)
387	  .set(RE_NEWLINE_ALT)
388	  .makeFinal();
389      
390      RE_SYNTAX_EGREP = new RESyntax()
391	  .set(RE_CHAR_CLASSES)
392	  .set(RE_CONTEXT_INDEP_ANCHORS)
393	  .set(RE_CONTEXT_INDEP_OPS)
394	  .set(RE_HAT_LISTS_NOT_NEWLINE)
395	  .set(RE_NEWLINE_ALT)
396	  .set(RE_NO_BK_PARENS)
397	  .set(RE_NO_BK_VBAR)
398	  .makeFinal();
399    
400      RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
401	  .set(RE_INTERVALS)
402	  .set(RE_NO_BK_BRACES)
403	  .makeFinal();
404    
405      /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
406    
407      RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
408	  .makeFinal();
409    
410      RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
411	  .makeFinal();
412      
413      RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
414	  .set(RE_LIMITED_OPS)
415	  .makeFinal();
416      
417      /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
418	 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
419      
420      RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
421	  .set(RE_CONTEXT_INDEP_ANCHORS)
422	  .set(RE_CONTEXT_INVALID_OPS)
423	  .set(RE_NO_BK_BRACES)
424	  .set(RE_NO_BK_PARENS)
425	  .set(RE_NO_BK_REFS)
426	  .set(RE_NO_BK_VBAR)
427	  .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
428	  .makeFinal();
429      
430      /* There is no official Perl spec, but here's a "best guess" */
431      
432      RE_SYNTAX_PERL4 = new RESyntax()
433	  .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
434	  .set(RE_CONTEXT_INDEP_ANCHORS)
435	  .set(RE_CONTEXT_INDEP_OPS)          // except for '{', apparently
436	  .set(RE_INTERVALS)
437	  .set(RE_NO_BK_BRACES)
438	  .set(RE_NO_BK_PARENS)
439	  .set(RE_NO_BK_VBAR)
440	  .set(RE_NO_EMPTY_RANGES)
441	  .set(RE_CHAR_CLASS_ESCAPES)    // \d,\D,\w,\W,\s,\S
442	  .makeFinal();
443      
444      RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
445	  .set(RE_DOT_NEWLINE)
446	  .makeFinal();
447      
448      RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
449	  .set(RE_PURE_GROUPING)          // (?:)
450	  .set(RE_STINGY_OPS)             // *?,??,+?,{}?
451	  .set(RE_LOOKAHEAD)              // (?=)(?!)
452	  .set(RE_STRING_ANCHORS)         // \A,\Z
453	  .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
454	  .set(RE_COMMENTS)              // (?#)
455	  .set(RE_EMBEDDED_FLAGS)         // (?imsx-imsx)
456	  .set(RE_OCTAL_CHAR)             // \0377
457	  .set(RE_HEX_CHAR)               // \x1b
458	  .set(RE_NAMED_PROPERTY)         // \p{prop}, \P{prop}
459	  .makeFinal();
460      
461      RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
462	  .set(RE_DOT_NEWLINE)
463	  .makeFinal();
464
465      RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
466	  // XXX
467	  .set(RE_POSSESSIVE_OPS)         // *+,?+,++,{}+
468	  .set(RE_UNICODE_CHAR)           // \u1234
469	  .set(RE_NESTED_CHARCLASS)       // [a-z&&[^p-r]]
470	  .makeFinal();
471  }
472
473  /**
474   * Construct a new syntax object with all bits turned off.
475   * This is equivalent to RE_SYNTAX_EMACS.
476   */
477  public RESyntax() {
478    bits = new BitSet(BIT_TOTAL);
479  }
480
481    /**
482     * Called internally when constructing predefined syntaxes
483     * so their interpretation cannot vary.  Conceivably useful
484     * for your syntaxes as well.  Causes IllegalAccessError to
485     * be thrown if any attempt to modify the syntax is made.
486     *
487     * @return this object for convenient chaining
488     */
489    public RESyntax makeFinal() {
490	isFinal = true;
491	return this;
492    }
493
494  /**
495   * Construct a new syntax object with all bits set the same 
496   * as the other syntax.
497   */
498  public RESyntax(RESyntax other) {
499    bits = (BitSet) other.bits.clone();
500  }
501
502  /**
503   * Check if a given bit is set in this syntax.
504   */
505  public boolean get(int index) {
506    return bits.get(index);
507  }
508
509  /**
510   * Set a given bit in this syntax. 
511   *
512   * @param index the constant (RESyntax.RE_xxx) bit to set.
513   * @return a reference to this object for easy chaining.
514   */
515  public RESyntax set(int index) {
516      if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
517    bits.set(index);
518    return this;
519  }
520
521  /**
522   * Clear a given bit in this syntax. 
523   *
524   * @param index the constant (RESyntax.RE_xxx) bit to clear.
525   * @return a reference to this object for easy chaining.
526   */
527  public RESyntax clear(int index) {
528      if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
529      bits.clear(index);
530      return this;
531  }
532
533    /**
534     * Changes the line separator string for regular expressions
535     * created using this RESyntax.  The default separator is the
536     * value returned by the system property "line.separator", which
537     * should be correct when reading platform-specific files from a
538     * filesystem.  However, many programs may collect input from
539     * sources where the line separator is differently specified (for
540     * example, in the applet environment, the text box widget
541     * interprets line breaks as single-character newlines,
542     * regardless of the host platform.
543     *
544     * Note that setting the line separator to a character or
545     * characters that have specific meaning within the current syntax
546     * can cause unexpected chronosynclastic infundibula.
547     *
548     * @return this object for convenient chaining 
549     */
550    public RESyntax setLineSeparator(String aSeparator) {
551	if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
552	lineSeparator = aSeparator;
553	return this;
554    }
555
556    /**
557     * Returns the currently active line separator string.  The default
558     * is the platform-dependent system property "line.separator".
559     */
560    public String getLineSeparator() {
561	return lineSeparator;
562    }
563}