PageRenderTime 46ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/saxonB/net/sf/saxon/dotnet/DotNetRegularExpression.java

https://bitbucket.org/dmwelch/phdxnat_pipeline
Java | 264 lines | 137 code | 28 blank | 99 comment | 35 complexity | c32f0b0bc96c2345840ea15c4bf5408d MD5 | raw file
  1. package net.sf.saxon.dotnet;
  2. import cli.System.ArgumentException;
  3. import cli.System.Text.RegularExpressions.Regex;
  4. import cli.System.Text.RegularExpressions.RegexOptions;
  5. import net.sf.saxon.om.FastStringBuffer;
  6. import net.sf.saxon.om.SequenceIterator;
  7. import net.sf.saxon.regex.RegexIterator;
  8. import net.sf.saxon.regex.RegexSyntaxException;
  9. import net.sf.saxon.regex.RegularExpression;
  10. import net.sf.saxon.trans.XPathException;
  11. /**
  12. * A compiled regular expression implemented using the .NET regex package
  13. */
  14. public class DotNetRegularExpression implements RegularExpression {
  15. Regex pattern;
  16. int groupCount;
  17. /**
  18. * Create (compile) a regular expression
  19. * @param regex the source text of the regular expression, in XML Schema or XPath syntax
  20. * @param xmlVersion indicates whether XML 1.0 or XML 1.1 is in use
  21. * @param syntax indicates whether this is an XPath regular expression, an XML Schema regular expression,
  22. * or a regex in .NET native regex syntax
  23. * @param flags the flags argument as supplied to functions such as fn:matches(), in string form
  24. * @throws net.sf.saxon.trans.XPathException if the syntax of the regular expression or flags is incorrect
  25. */
  26. public DotNetRegularExpression(CharSequence regex, int xmlVersion, int syntax, CharSequence flags)
  27. throws XPathException {
  28. String translated = "";
  29. try {
  30. if (syntax == NATIVE_SYNTAX) {
  31. groupCount = 9;
  32. pattern = new Regex(regex.toString(), setFlags(flags));
  33. } else {
  34. DotNetRegexTranslator translator = new DotNetRegexTranslator();
  35. translated = translator.translate(
  36. regex, xmlVersion, syntax==XPATH_SYNTAX, isIgnoreWhitespace(flags), isCaseBlind(flags));
  37. groupCount = translator.getNumberOfCapturedGroups();
  38. pattern = new Regex(translated, setFlags(flags));
  39. }
  40. //noinspection ConstantIfStatement
  41. if (false) {
  42. // to keep the compiler happy
  43. throw new ArgumentException();
  44. }
  45. } catch (RegexSyntaxException e) {
  46. throw new XPathException(e.getMessage());
  47. } catch (ArgumentException e) {
  48. throw new XPathException("Error in translated regular expression. Input regex = " +
  49. FastStringBuffer.diagnosticPrint(regex) + ". Translated regex = " +
  50. FastStringBuffer.diagnosticPrint(translated) + ". Message = " + e.getMessage());
  51. }
  52. }
  53. /**
  54. * Use this regular expression to analyze an input string, in support of the XSLT
  55. * analyze-string instruction. The resulting RegexIterator provides both the matching and
  56. * non-matching substrings, and allows them to be distinguished. It also provides access
  57. * to matched subgroups.
  58. */
  59. public RegexIterator analyze(CharSequence input) {
  60. return new DotNetRegexIterator(input.toString(), pattern);
  61. }
  62. /**
  63. * Determine whether the regular expression contains a match of a given string
  64. *
  65. * @param input the string to match
  66. * @return true if the string matches, false otherwise
  67. */
  68. public boolean containsMatch(CharSequence input) {
  69. return pattern.IsMatch(input.toString());
  70. }
  71. /**
  72. * Determine whether the regular expression matches (the whole of) a given string
  73. *
  74. * @param input the string to match
  75. * @return true if the regular expression matches the whole input string, false otherwise
  76. */
  77. public boolean matches(CharSequence input) {
  78. // We rely on the fact that this method is only used for the XML Schema pattern facet, and
  79. // the regular expressions are preprocessed in that case to add implicit anchoring. The method
  80. // is also used to test if the pattern matches an empty string, which is OK.
  81. return pattern.IsMatch(input.toString());
  82. }
  83. /**
  84. * Replace all substrings of a supplied input string that match the regular expression
  85. * with a replacement string.
  86. *
  87. * @param input the input string on which replacements are to be performed
  88. * @param replacement the replacement string in the format of the XPath replace() function
  89. * @return the result of performing the replacement
  90. * @throws net.sf.saxon.trans.XPathException
  91. * if the replacement string is invalid
  92. */
  93. public CharSequence replace(CharSequence input, CharSequence replacement) throws XPathException {
  94. // preprocess the replacement string: .NET uses $$ to represent $, and doesn't treat \ specially
  95. // The calling code will already have validated the replacement string, so we can assume for example
  96. // that "\" will be followed by "\" or "$".
  97. FastStringBuffer sb = new FastStringBuffer(replacement.length() + 4);
  98. for (int i=0; i<replacement.length(); i++) {
  99. final char ch = replacement.charAt(i);
  100. if (ch == '\\') {
  101. if (replacement.charAt(i+1) == '\\') {
  102. sb.append('\\');
  103. } else if (replacement.charAt(i+1) == '$') {
  104. sb.append("$$");
  105. } else {
  106. throw new IllegalArgumentException("bad replacement string");
  107. }
  108. i++;
  109. } else if (ch == '$') {
  110. int n = 0;
  111. while (true) {
  112. if (i+1 >= replacement.length()) {
  113. break;
  114. }
  115. char d = replacement.charAt(i+1);
  116. int dval = "0123456789".indexOf(d);
  117. if (dval < 0) {
  118. break;
  119. }
  120. i++;
  121. n = n*10 + dval;
  122. }
  123. processGroupReference(n, sb);
  124. } else {
  125. sb.append(ch);
  126. }
  127. }
  128. //System.err.println("original replacement string: " + replacement);
  129. //System.err.println("processed replacement string: " + sb);
  130. return pattern.Replace(input.toString(), sb.toString());
  131. }
  132. /**
  133. * Translate a group reference in the replacement string from XPath notation into .NET notation
  134. * This closely follows the algorithm in F+O section 7.6.3 fn:replace
  135. * @param n the consecutive sequence of digits following a "$" sign
  136. * @param sb teh buffer to contain the replacement string in .NET notation
  137. */
  138. private void processGroupReference(int n, FastStringBuffer sb) {
  139. if (n == 0) {
  140. sb.append("$0");
  141. } else if (n <= groupCount) {
  142. sb.append("${" + n + '}');
  143. } else if (n <= 9) {
  144. // no-op - group reference is replaced by zero-length string
  145. } else {
  146. // try replacing $67 by ${6}7
  147. int n0 = n / 10;
  148. int n1 = n % 10;
  149. processGroupReference(n0, sb);
  150. sb.append("" + n1);
  151. }
  152. }
  153. /**
  154. * Use this regular expression to tokenize an input string.
  155. *
  156. * @param input the string to be tokenized
  157. * @return a SequenceIterator containing the resulting tokens, as objects of type StringValue
  158. */
  159. public SequenceIterator tokenize(CharSequence input) {
  160. return new DotNetTokenIterator(input, pattern);
  161. }
  162. /**
  163. * Set the Java flags from the supplied XPath flags.
  164. * @param inFlags the flags as a string, e.g. "im"
  165. * @return the flags as a RegexOptions FlagsAttribute
  166. * @throws XPathException if the supplied value is invalid
  167. */
  168. public static RegexOptions setFlags(CharSequence inFlags) throws XPathException {
  169. int flags = 0;
  170. for (int i=0; i<inFlags.length(); i++) {
  171. char c = inFlags.charAt(i);
  172. switch (c) {
  173. case'm':
  174. flags |= RegexOptions.Multiline;
  175. break;
  176. case'i':
  177. // flags |= RegexOptions.IgnoreCase;
  178. break;
  179. case's':
  180. flags |= RegexOptions.Singleline;
  181. break;
  182. case'x':
  183. //flags |= RegexOptions.IgnorePatternWhitespace;
  184. break;
  185. default:
  186. XPathException err = new XPathException("Invalid character '" + c + "' in regular expression flags");
  187. err.setErrorCode("FORX0001");
  188. throw err;
  189. }
  190. }
  191. return RegexOptions.wrap(flags);
  192. }
  193. /**
  194. * Test whether the 'x' flag is set.
  195. * @param inFlags the flags as a string, e.g. "im"
  196. * @return true if the 'x' flag is set
  197. */
  198. public static boolean isIgnoreWhitespace(CharSequence inFlags) {
  199. for (int i=0; i<inFlags.length(); i++) {
  200. if (inFlags.charAt(i) == 'x') {
  201. return true;
  202. }
  203. }
  204. return false;
  205. }
  206. /**
  207. * Test whether the 'i' flag is set.
  208. * @param inFlags the flags as a string, e.g. "im"
  209. * @return true if the 'i' flag is set
  210. */
  211. public static boolean isCaseBlind(CharSequence inFlags) {
  212. for (int i=0; i<inFlags.length(); i++) {
  213. if (inFlags.charAt(i) == 'i') {
  214. return true;
  215. }
  216. }
  217. return false;
  218. }
  219. }
  220. //
  221. // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
  222. // you may not use this file except in compliance with the License. You may obtain a copy of the
  223. // License at http://www.mozilla.org/MPL/
  224. //
  225. // Software distributed under the License is distributed on an "AS IS" basis,
  226. // WITHOUT WARRANTY OF ANY KIND, either express or implied.
  227. // See the License for the specific language governing rights and limitations under the License.
  228. //
  229. // The Original Code is: all this file.
  230. //
  231. // The Initial Developer of the Original Code is Michael H. Kay
  232. //
  233. // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
  234. //
  235. // Contributor(s):
  236. //