/bianca/src/main/java/com/clevercloud/bianca/lib/regexp/Regexp.java
Java | 306 lines | 216 code | 51 blank | 39 comment | 41 complexity | 0063f71ab31348a18d3a8e9c6e4f0d38 MD5 | raw file
Possible License(s): GPL-2.0, MPL-2.0-no-copyleft-exception
- /*
- * Copyright (c) 1998-2010 Caucho Technology -- all rights reserved
- * Copyright (c) 2011-2012 Clever Cloud SAS -- all rights reserved
- *
- * This file is part of Bianca(R) Open Source
- *
- * Each copy or derived work must preserve the copyright notice and this
- * notice unmodified.
- *
- * Bianca Open Source is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Bianca Open Source is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
- * of NON-INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Bianca Open Source; if not, write to the
- *
- * Free Software Foundation, Inc.
- * 59 Temple Place, Suite 330
- * Boston, MA 02111-1307 USA
- *
- * @author Scott Ferguson
- * @author Marc-Antoine Perennou <Marc-Antoine@Perennou.com>
- */
- package com.clevercloud.bianca.lib.regexp;
- import com.clevercloud.bianca.BiancaException;
- import com.clevercloud.bianca.env.StringValue;
- import com.clevercloud.util.CharBuffer;
- import com.clevercloud.util.L10N;
- import java.util.Map;
- import java.util.logging.Logger;
- public class Regexp {
- private static final Logger log = Logger.getLogger(Regexp.class.getName());
- private static final L10N L = new L10N(Regexp.class);
- public static final int FAIL = -1;
- public static final int SUCCESS = 0;
- final StringValue _rawRegexp;
- StringValue _pattern;
- int _flags;
- RegexpNode _prog;
- boolean _ignoreCase;
- boolean _isGlobal;
- int _nLoop;
- int _nGroup;
- // optim stuff
- CharBuffer _prefix; // initial string
- int _minLength; // minimum length possible for this regexp
- int _firstChar;
- boolean[] _firstSet;
- boolean _isAnchorBegin;
- StringValue[] _groupNames;
- boolean _isUnicode;
- boolean _isPHP5String;
- boolean _isUtf8;
- boolean _isEval;
- public Regexp(StringValue rawRegexp)
- throws IllegalRegexpException {
- _rawRegexp = rawRegexp;
- _pattern = rawRegexp;
- init();
- Regcomp comp = new Regcomp(_flags);
- _prog = comp.parse(new PeekString(_pattern));
- compile(_prog, comp);
- }
- protected void init() {
- StringValue rawRegexp = _rawRegexp;
- if (rawRegexp.length() < 2) {
- throw new IllegalStateException(L.l(
- "Can't find delimiters in regexp '{0}'.",
- rawRegexp));
- }
- int head = 0;
- char delim = '/';
- for (;
- head < rawRegexp.length()
- && Character.isWhitespace((delim = rawRegexp.charAt(head)));
- head++) {
- }
- if (delim == '{') {
- delim = '}';
- } else if (delim == '[') {
- delim = ']';
- } else if (delim == '(') {
- delim = ')';
- } else if (delim == '<') {
- delim = '>';
- } else if (delim == '\\' || Character.isLetterOrDigit(delim)) {
- throw new BiancaException(L.l(
- "Delimiter {0} in regexp '{1}' must "
- + "not be backslash or alphanumeric.",
- String.valueOf(delim),
- rawRegexp));
- }
- int tail = rawRegexp.lastIndexOf(delim);
- if (tail <= 0) {
- throw new BiancaException(L.l(
- "Can't find second {0} in regexp '{1}'.",
- String.valueOf(delim),
- rawRegexp));
- }
- StringValue sflags = rawRegexp.substring(tail + 1);
- StringValue pattern = rawRegexp.substring(head + 1, tail);
- _pattern = pattern;
- int flags = 0;
- for (int i = 0; sflags != null && i < sflags.length(); i++) {
- switch (sflags.charAt(i)) {
- case 'm':
- flags |= Regcomp.MULTILINE;
- break;
- case 's':
- flags |= Regcomp.SINGLE_LINE;
- break;
- case 'i':
- flags |= Regcomp.IGNORE_CASE;
- break;
- case 'x':
- flags |= Regcomp.IGNORE_WS;
- break;
- case 'g':
- flags |= Regcomp.GLOBAL;
- break;
- case 'A':
- flags |= Regcomp.ANCHORED;
- break;
- case 'D':
- flags |= Regcomp.END_ONLY;
- break;
- case 'U':
- flags |= Regcomp.UNGREEDY;
- break;
- case 'X':
- flags |= Regcomp.STRICT;
- break;
- case 'S': /* speedup */
- ;
- break;
- case 'u':
- flags |= Regcomp.UTF8;
- break;
- case 'e':
- _isEval = true;
- break;
- default:
- throw new BiancaException(L.l("'{0}' is an unknown regexp flag in {1}",
- (char) sflags.charAt(i), rawRegexp));
- }
- }
- _flags = flags;
- _pattern = pattern;
- }
- public StringValue getRawRegexp() {
- return _rawRegexp;
- }
- public StringValue getPattern() {
- return _pattern;
- }
- public boolean isUTF8() {
- return _isUtf8;
- }
- public boolean isEval() {
- return _isEval;
- }
- private void compile(RegexpNode prog, Regcomp comp) {
- _ignoreCase = (comp._flags & Regcomp.IGNORE_CASE) != 0;
- _isGlobal = (comp._flags & Regcomp.GLOBAL) != 0;
- _isAnchorBegin = (comp._flags & Regcomp.ANCHORED) != 0;
- _isUtf8 = (comp._flags & Regcomp.UTF8) != 0;
- if (prog.isAnchorBegin()) {
- _isAnchorBegin = true;
- }
- /*
- if (_ignoreCase)
- RegOptim.ignoreCase(prog);
- if (! _ignoreCase)
- RegOptim.eliminateBacktrack(prog, null);
- */
- _minLength = prog.minLength();
- _firstChar = prog.firstChar();
- _firstSet = prog.firstSet(new boolean[256]);
- _prefix = new CharBuffer(prog.prefix());
- //this._prog = RegOptim.linkLoops(prog);
- _nGroup = comp._maxGroup;
- _nLoop = comp._nLoop;
- _groupNames = new StringValue[_nGroup + 1];
- for (Map.Entry<Integer, StringValue> entry : comp._groupNameMap.entrySet()) {
- StringValue groupName = entry.getValue();
- _groupNames[entry.getKey().intValue()] = groupName;
- }
- }
- public StringValue getGroupName(int i) {
- return _groupNames[i];
- }
- public boolean isGlobal() {
- return _isGlobal;
- }
- public boolean ignoreCase() {
- return _ignoreCase;
- }
- static StringValue fromUtf8(StringValue source) {
- StringValue target = new StringValue();
- int len = source.length();
- for (int i = 0; i < len; i++) {
- char ch = source.charAt(i);
- if (ch < 0x80) {
- target.append(ch);
- } else if ((ch & 0xe0) == 0xc0) {
- if (len <= i + 1) {
- log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF"));
- return null;
- }
- char ch2 = source.charAt(++i);
- target.append((char) (((ch & 0x1f) << 6)
- + (ch2 & 0x3f)));
- } else if ((ch & 0xf0) == 0xe0) {
- if (len <= i + 2) {
- log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF"));
- return null;
- }
- char ch2 = source.charAt(++i);
- char ch3 = source.charAt(++i);
- target.append((char) (((ch & 0x0f) << 12)
- + ((ch2 & 0x3f) << 6)
- + (ch3 & 0x3f)));
- } else {
- if (i + 3 >= len) {
- log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF"));
- return null;
- }
- char ch2 = source.charAt(++i);
- char ch3 = source.charAt(++i);
- char ch4 = source.charAt(++i);
- int codePoint = ((ch & 0x07) << 18)
- + ((ch2 & 0x3F) << 12)
- + ((ch3 & 0x3F) << 6)
- + (ch4 & 0x3F);
- int high = ((codePoint - 0x10000) >> 10) + 0xD800;
- int low = (codePoint & 0x3FF) + 0xDC00;
- target.append((char) high);
- target.append((char) low);
- }
- }
- return target;
- }
- @Override
- public String toString() {
- return getClass().getSimpleName() + "[" + _pattern + "]";
- }
- }