DotNetRegularExpression.java

/lib/saxonB/net/sf/saxon/dotnet/DotNetRegularExpression.java

https://bitbucket.org/dmwelch/phdxnat_pipeline · Java · 264 lines · 137 code · 28 blank · 99 comment · 35 complexity · c32f0b0bc96c2345840ea15c4bf5408d MD5 · raw file

package net.sf.saxon.dotnet;



import cli.System.ArgumentException;

import cli.System.Text.RegularExpressions.Regex;

import cli.System.Text.RegularExpressions.RegexOptions;

import net.sf.saxon.om.FastStringBuffer;

import net.sf.saxon.om.SequenceIterator;

import net.sf.saxon.regex.RegexIterator;

import net.sf.saxon.regex.RegexSyntaxException;

import net.sf.saxon.regex.RegularExpression;

import net.sf.saxon.trans.XPathException;



/**

 * A compiled regular expression implemented using the .NET regex package

 */

public class DotNetRegularExpression implements RegularExpression {



    Regex pattern;

    int groupCount;



    /**

     * Create (compile) a regular expression

     * @param regex the source text of the regular expression, in XML Schema or XPath syntax

     * @param xmlVersion indicates whether XML 1.0 or XML 1.1 is in use

     * @param syntax indicates whether this is an XPath regular expression, an XML Schema regular expression,

     * or a regex in .NET native regex syntax

     * @param flags the flags argument as supplied to functions such as fn:matches(), in string form

     * @throws net.sf.saxon.trans.XPathException if the syntax of the regular expression or flags is incorrect

     */



    public DotNetRegularExpression(CharSequence regex, int xmlVersion, int syntax, CharSequence flags)

            throws XPathException {

        String translated = "";

        try {

            if (syntax == NATIVE_SYNTAX) {

                groupCount = 9;

                pattern = new Regex(regex.toString(), setFlags(flags));

            } else {

                DotNetRegexTranslator translator = new DotNetRegexTranslator();

                translated = translator.translate(

                        regex, xmlVersion, syntax==XPATH_SYNTAX, isIgnoreWhitespace(flags), isCaseBlind(flags));

                groupCount = translator.getNumberOfCapturedGroups();

                pattern = new Regex(translated, setFlags(flags));

            }

            //noinspection ConstantIfStatement

            if (false) {

                // to keep the compiler happy

                throw new ArgumentException();

            }

        } catch (RegexSyntaxException e) {

            throw new XPathException(e.getMessage());

        } catch (ArgumentException e) {

            throw new XPathException("Error in translated regular expression. Input regex = " +

                    FastStringBuffer.diagnosticPrint(regex) + ". Translated regex = " + 

                    FastStringBuffer.diagnosticPrint(translated) + ". Message = " + e.getMessage());

        }

    }



    /**

     * Use this regular expression to analyze an input string, in support of the XSLT

     * analyze-string instruction. The resulting RegexIterator provides both the matching and

     * non-matching substrings, and allows them to be distinguished. It also provides access

     * to matched subgroups.

     */



    public RegexIterator analyze(CharSequence input) {

        return new DotNetRegexIterator(input.toString(), pattern);

    }



    /**

     * Determine whether the regular expression contains a match of a given string

     *

     * @param input the string to match

     * @return true if the string matches, false otherwise

     */



    public boolean containsMatch(CharSequence input) {

        return pattern.IsMatch(input.toString());

    }



    /**

     * Determine whether the regular expression matches (the whole of) a given string

     *

     * @param input the string to match

     * @return true if the regular expression matches the whole input string, false otherwise

     */



    public boolean matches(CharSequence input) {

        // We rely on the fact that this method is only used for the XML Schema pattern facet, and

        // the regular expressions are preprocessed in that case to add implicit anchoring. The method

        // is also used to test if the pattern matches an empty string, which is OK.

        return pattern.IsMatch(input.toString());

    }



    /**

     * Replace all substrings of a supplied input string that match the regular expression

     * with a replacement string.

     *

     * @param input       the input string on which replacements are to be performed

     * @param replacement the replacement string in the format of the XPath replace() function

     * @return the result of performing the replacement

     * @throws net.sf.saxon.trans.XPathException

     *          if the replacement string is invalid

     */



    public CharSequence replace(CharSequence input, CharSequence replacement) throws XPathException {

        // preprocess the replacement string: .NET uses $$ to represent $, and doesn't treat \ specially

        // The calling code will already have validated the replacement string, so we can assume for example

        // that "\" will be followed by "\" or "$".

        FastStringBuffer sb = new FastStringBuffer(replacement.length() + 4);

        for (int i=0; i<replacement.length(); i++) {

            final char ch = replacement.charAt(i);

            if (ch == '\\') {

                if (replacement.charAt(i+1) == '\\') {

                    sb.append('\\');

                } else if (replacement.charAt(i+1) == '$') {

                    sb.append("$$");

                } else {

                    throw new IllegalArgumentException("bad replacement string");

                }

                i++;

            } else if (ch == '$') {

                int n = 0;

                while (true) {

                    if (i+1 >= replacement.length()) {

                        break;

                    }

                    char d = replacement.charAt(i+1);

                    int dval = "0123456789".indexOf(d);

                    if (dval < 0) {

                        break;

                    }

                    i++;

                    n = n*10 + dval;

                }

                processGroupReference(n, sb);

            } else {

                sb.append(ch);

            }

        }

        //System.err.println("original replacement string: " + replacement);

        //System.err.println("processed replacement string: " + sb);

        return pattern.Replace(input.toString(), sb.toString());

    }



    /**

     * Translate a group reference in the replacement string from XPath notation into .NET notation

     * This closely follows the algorithm in F+O section 7.6.3 fn:replace

     * @param n the consecutive sequence of digits following a "$" sign

     * @param sb teh buffer to contain the replacement string in .NET notation

     */



    private void processGroupReference(int n, FastStringBuffer sb) {

        if (n == 0) {

            sb.append("$0");

        } else if (n <= groupCount) {

            sb.append("${" + n + '}');

        } else if (n <= 9) {

            // no-op - group reference is replaced by zero-length string

        } else {

            // try replacing $67 by ${6}7

            int n0 = n / 10;

            int n1 = n % 10;

            processGroupReference(n0, sb);

            sb.append("" + n1);

        }

    }



    /**

     * Use this regular expression to tokenize an input string.

     *

     * @param input the string to be tokenized

     * @return a SequenceIterator containing the resulting tokens, as objects of type StringValue

     */



    public SequenceIterator tokenize(CharSequence input) {

        return new DotNetTokenIterator(input, pattern);

    }



    /**

     * Set the Java flags from the supplied XPath flags.

     * @param inFlags the flags as a string, e.g. "im"

     * @return the flags as a RegexOptions FlagsAttribute

     * @throws XPathException if the supplied value is invalid

     */



    public static RegexOptions setFlags(CharSequence inFlags) throws XPathException {

        int flags = 0;

        for (int i=0; i<inFlags.length(); i++) {

            char c = inFlags.charAt(i);

            switch (c) {

            case'm':

                flags |= RegexOptions.Multiline;

                break;

            case'i':

                // flags |= RegexOptions.IgnoreCase;

                break;

            case's':

                flags |= RegexOptions.Singleline;

                break;

            case'x':

                //flags |= RegexOptions.IgnorePatternWhitespace;

                break;

            default:

                XPathException err = new XPathException("Invalid character '" + c + "' in regular expression flags");

                err.setErrorCode("FORX0001");

                throw err;

            }

        }

        return RegexOptions.wrap(flags);

    }



    /**

     * Test whether the 'x' flag is set.

     * @param inFlags the flags as a string, e.g. "im"

     * @return true if the 'x' flag is set

     */



    public static boolean isIgnoreWhitespace(CharSequence inFlags) {

        for (int i=0; i<inFlags.length(); i++) {

            if (inFlags.charAt(i) == 'x') {

                return true;

            }

        }

        return false;

    }



    /**

     * Test whether the 'i' flag is set.

     * @param inFlags the flags as a string, e.g. "im"

     * @return true if the 'i' flag is set

     */



    public static boolean isCaseBlind(CharSequence inFlags) {

        for (int i=0; i<inFlags.length(); i++) {

            if (inFlags.charAt(i) == 'i') {

                return true;

            }

        }

        return false;

    }





}



//

// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");

// you may not use this file except in compliance with the License. You may obtain a copy of the

// License at http://www.mozilla.org/MPL/

//

// Software distributed under the License is distributed on an "AS IS" basis,

// WITHOUT WARRANTY OF ANY KIND, either express or implied.

// See the License for the specific language governing rights and limitations under the License.

//

// The Original Code is: all this file.

//

// The Initial Developer of the Original Code is Michael H. Kay

//

// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.

//

// Contributor(s):

//
Alerts (9)

'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
36 110 162 163
'if (false)' Redundant condition: This 'if' statement's condition always evaluates to the same value, indicating potentially dead or unnecessary code.
46
Complexity hotspot; lines 113 to 114 (total complexity: 4)
113 114
'<=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
158
'+' Performance Info: Using string concatenation ('+' or '+=') inside loops can be inefficient due to repeated String object creation. Use StringBuilder (or StringBuffer for thread-safety) instead.
205