PageRenderTime 38ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/bianca/src/main/java/com/clevercloud/bianca/lib/regexp/Regexp.java

http://github.com/CleverCloud/Bianca
Java | 306 lines | 216 code | 51 blank | 39 comment | 41 complexity | 0063f71ab31348a18d3a8e9c6e4f0d38 MD5 | raw file
Possible License(s): GPL-2.0, MPL-2.0-no-copyleft-exception
  1. /*
  2. * Copyright (c) 1998-2010 Caucho Technology -- all rights reserved
  3. * Copyright (c) 2011-2012 Clever Cloud SAS -- all rights reserved
  4. *
  5. * This file is part of Bianca(R) Open Source
  6. *
  7. * Each copy or derived work must preserve the copyright notice and this
  8. * notice unmodified.
  9. *
  10. * Bianca Open Source is free software; you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License as published by
  12. * the Free Software Foundation; either version 2 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * Bianca Open Source is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
  18. * of NON-INFRINGEMENT. See the GNU General Public License for more
  19. * details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with Bianca Open Source; if not, write to the
  23. *
  24. * Free Software Foundation, Inc.
  25. * 59 Temple Place, Suite 330
  26. * Boston, MA 02111-1307 USA
  27. *
  28. * @author Scott Ferguson
  29. * @author Marc-Antoine Perennou <Marc-Antoine@Perennou.com>
  30. */
  31. package com.clevercloud.bianca.lib.regexp;
  32. import com.clevercloud.bianca.BiancaException;
  33. import com.clevercloud.bianca.env.StringValue;
  34. import com.clevercloud.util.CharBuffer;
  35. import com.clevercloud.util.L10N;
  36. import java.util.Map;
  37. import java.util.logging.Logger;
  38. public class Regexp {
  39. private static final Logger log = Logger.getLogger(Regexp.class.getName());
  40. private static final L10N L = new L10N(Regexp.class);
  41. public static final int FAIL = -1;
  42. public static final int SUCCESS = 0;
  43. final StringValue _rawRegexp;
  44. StringValue _pattern;
  45. int _flags;
  46. RegexpNode _prog;
  47. boolean _ignoreCase;
  48. boolean _isGlobal;
  49. int _nLoop;
  50. int _nGroup;
  51. // optim stuff
  52. CharBuffer _prefix; // initial string
  53. int _minLength; // minimum length possible for this regexp
  54. int _firstChar;
  55. boolean[] _firstSet;
  56. boolean _isAnchorBegin;
  57. StringValue[] _groupNames;
  58. boolean _isUnicode;
  59. boolean _isPHP5String;
  60. boolean _isUtf8;
  61. boolean _isEval;
  62. public Regexp(StringValue rawRegexp)
  63. throws IllegalRegexpException {
  64. _rawRegexp = rawRegexp;
  65. _pattern = rawRegexp;
  66. init();
  67. Regcomp comp = new Regcomp(_flags);
  68. _prog = comp.parse(new PeekString(_pattern));
  69. compile(_prog, comp);
  70. }
  71. protected void init() {
  72. StringValue rawRegexp = _rawRegexp;
  73. if (rawRegexp.length() < 2) {
  74. throw new IllegalStateException(L.l(
  75. "Can't find delimiters in regexp '{0}'.",
  76. rawRegexp));
  77. }
  78. int head = 0;
  79. char delim = '/';
  80. for (;
  81. head < rawRegexp.length()
  82. && Character.isWhitespace((delim = rawRegexp.charAt(head)));
  83. head++) {
  84. }
  85. if (delim == '{') {
  86. delim = '}';
  87. } else if (delim == '[') {
  88. delim = ']';
  89. } else if (delim == '(') {
  90. delim = ')';
  91. } else if (delim == '<') {
  92. delim = '>';
  93. } else if (delim == '\\' || Character.isLetterOrDigit(delim)) {
  94. throw new BiancaException(L.l(
  95. "Delimiter {0} in regexp '{1}' must "
  96. + "not be backslash or alphanumeric.",
  97. String.valueOf(delim),
  98. rawRegexp));
  99. }
  100. int tail = rawRegexp.lastIndexOf(delim);
  101. if (tail <= 0) {
  102. throw new BiancaException(L.l(
  103. "Can't find second {0} in regexp '{1}'.",
  104. String.valueOf(delim),
  105. rawRegexp));
  106. }
  107. StringValue sflags = rawRegexp.substring(tail + 1);
  108. StringValue pattern = rawRegexp.substring(head + 1, tail);
  109. _pattern = pattern;
  110. int flags = 0;
  111. for (int i = 0; sflags != null && i < sflags.length(); i++) {
  112. switch (sflags.charAt(i)) {
  113. case 'm':
  114. flags |= Regcomp.MULTILINE;
  115. break;
  116. case 's':
  117. flags |= Regcomp.SINGLE_LINE;
  118. break;
  119. case 'i':
  120. flags |= Regcomp.IGNORE_CASE;
  121. break;
  122. case 'x':
  123. flags |= Regcomp.IGNORE_WS;
  124. break;
  125. case 'g':
  126. flags |= Regcomp.GLOBAL;
  127. break;
  128. case 'A':
  129. flags |= Regcomp.ANCHORED;
  130. break;
  131. case 'D':
  132. flags |= Regcomp.END_ONLY;
  133. break;
  134. case 'U':
  135. flags |= Regcomp.UNGREEDY;
  136. break;
  137. case 'X':
  138. flags |= Regcomp.STRICT;
  139. break;
  140. case 'S': /* speedup */
  141. ;
  142. break;
  143. case 'u':
  144. flags |= Regcomp.UTF8;
  145. break;
  146. case 'e':
  147. _isEval = true;
  148. break;
  149. default:
  150. throw new BiancaException(L.l("'{0}' is an unknown regexp flag in {1}",
  151. (char) sflags.charAt(i), rawRegexp));
  152. }
  153. }
  154. _flags = flags;
  155. _pattern = pattern;
  156. }
  157. public StringValue getRawRegexp() {
  158. return _rawRegexp;
  159. }
  160. public StringValue getPattern() {
  161. return _pattern;
  162. }
  163. public boolean isUTF8() {
  164. return _isUtf8;
  165. }
  166. public boolean isEval() {
  167. return _isEval;
  168. }
  169. private void compile(RegexpNode prog, Regcomp comp) {
  170. _ignoreCase = (comp._flags & Regcomp.IGNORE_CASE) != 0;
  171. _isGlobal = (comp._flags & Regcomp.GLOBAL) != 0;
  172. _isAnchorBegin = (comp._flags & Regcomp.ANCHORED) != 0;
  173. _isUtf8 = (comp._flags & Regcomp.UTF8) != 0;
  174. if (prog.isAnchorBegin()) {
  175. _isAnchorBegin = true;
  176. }
  177. /*
  178. if (_ignoreCase)
  179. RegOptim.ignoreCase(prog);
  180. if (! _ignoreCase)
  181. RegOptim.eliminateBacktrack(prog, null);
  182. */
  183. _minLength = prog.minLength();
  184. _firstChar = prog.firstChar();
  185. _firstSet = prog.firstSet(new boolean[256]);
  186. _prefix = new CharBuffer(prog.prefix());
  187. //this._prog = RegOptim.linkLoops(prog);
  188. _nGroup = comp._maxGroup;
  189. _nLoop = comp._nLoop;
  190. _groupNames = new StringValue[_nGroup + 1];
  191. for (Map.Entry<Integer, StringValue> entry : comp._groupNameMap.entrySet()) {
  192. StringValue groupName = entry.getValue();
  193. _groupNames[entry.getKey().intValue()] = groupName;
  194. }
  195. }
  196. public StringValue getGroupName(int i) {
  197. return _groupNames[i];
  198. }
  199. public boolean isGlobal() {
  200. return _isGlobal;
  201. }
  202. public boolean ignoreCase() {
  203. return _ignoreCase;
  204. }
  205. static StringValue fromUtf8(StringValue source) {
  206. StringValue target = new StringValue();
  207. int len = source.length();
  208. for (int i = 0; i < len; i++) {
  209. char ch = source.charAt(i);
  210. if (ch < 0x80) {
  211. target.append(ch);
  212. } else if ((ch & 0xe0) == 0xc0) {
  213. if (len <= i + 1) {
  214. log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF"));
  215. return null;
  216. }
  217. char ch2 = source.charAt(++i);
  218. target.append((char) (((ch & 0x1f) << 6)
  219. + (ch2 & 0x3f)));
  220. } else if ((ch & 0xf0) == 0xe0) {
  221. if (len <= i + 2) {
  222. log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF"));
  223. return null;
  224. }
  225. char ch2 = source.charAt(++i);
  226. char ch3 = source.charAt(++i);
  227. target.append((char) (((ch & 0x0f) << 12)
  228. + ((ch2 & 0x3f) << 6)
  229. + (ch3 & 0x3f)));
  230. } else {
  231. if (i + 3 >= len) {
  232. log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF"));
  233. return null;
  234. }
  235. char ch2 = source.charAt(++i);
  236. char ch3 = source.charAt(++i);
  237. char ch4 = source.charAt(++i);
  238. int codePoint = ((ch & 0x07) << 18)
  239. + ((ch2 & 0x3F) << 12)
  240. + ((ch3 & 0x3F) << 6)
  241. + (ch4 & 0x3F);
  242. int high = ((codePoint - 0x10000) >> 10) + 0xD800;
  243. int low = (codePoint & 0x3FF) + 0xDC00;
  244. target.append((char) high);
  245. target.append((char) low);
  246. }
  247. }
  248. return target;
  249. }
  250. @Override
  251. public String toString() {
  252. return getClass().getSimpleName() + "[" + _pattern + "]";
  253. }
  254. }