codecs.java | searchcode

/lib/jars-src/jython2.2.1/src/org/python/core/codecs.java

https://bitbucket.org/wrapman/frostwire.desktop.translations.pirate · Java · 541 lines · 405 code · 83 blank · 53 comment · 126 complexity · d6a60e29e95e5d10b553254a096400f8 MD5 · raw file


/*
 * Copyright 2000 Finn Bock
 *
 * This program contains material copyrighted by:
 * Copyright (c) Corporation for National Research Initiatives.
 * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
 */

package org.python.core;

/**
 * Contains the implementation of the builtin codecs.
 * @since Jython 2.0
 */

public class codecs {
    /* Character appended by decoding_error() when the error mode is
       "replace" (U+FFFD). */
    private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;

    /* Codec search functions; register() appends to this list and
       lookup() iterates over it. */
    private static PyList searchPath = new PyList();
    /* Results of earlier successful lookups, keyed by the normalized
       encoding name (see lookup()). */
    private static PyStringMap searchCache = new PyStringMap();

    /* Encoding used by decode()/encode() when the caller passes null. */
    private static String default_encoding = "ascii";

    /**
     * Returns the name of the encoding that decode() and encode() fall
     * back to when they are called without an explicit encoding.
     *
     * @return the current default encoding name
     */
    public static String getDefaultEncoding() {
        final String current = default_encoding;
        return current;
    }

    /**
     * Replaces the default encoding.
     *
     * The name is resolved through lookup() before it is stored, so an
     * exception raised by lookup() leaves the previous default in place.
     *
     * @param encoding name of the new default encoding
     */
    public static void setDefaultEncoding(String encoding) {
        // Only the side effect matters here: lookup() throws for a name
        // that cannot be resolved; its return value is not needed.
        lookup(encoding);

        default_encoding = encoding;
    }

    /**
     * Adds a codec search function to the end of the search path.
     *
     * @param search_function callable consulted by lookup()
     * @throws PyException TypeError when the argument is not callable
     */
    public static void register(PyObject search_function) {
        if (search_function.isCallable()) {
            searchPath.append(search_function);
            return;
        }
        throw Py.TypeError("argument must be callable");
    }


    /**
     * Resolves an encoding name to its codec 4-tuple.
     *
     * The normalized name is looked up in the cache first. On a miss,
     * each registered search function is called in turn until one
     * returns something other than None; that result must be a 4-tuple
     * and is cached before being returned.
     *
     * @param encoding the encoding name as supplied by the caller
     * @return the 4-tuple produced by a search function (or the cache)
     * @throws PyException LookupError when no search function is
     *         registered or none recognizes the name; TypeError when a
     *         search function returns something other than a 4-tuple
     */
    public static PyTuple lookup(String encoding) {
        import_encodings();

        PyString key = new PyString(normalizestring(encoding));
        PyObject cached = searchCache.__finditem__(key);
        if (cached != null) {
            return (PyTuple) cached;
        }

        if (searchPath.__len__() == 0) {
            throw new PyException(Py.LookupError,
                "no codec search functions registered: can't find encoding");
        }

        PyObject found = null;
        PyObject functions = searchPath.__iter__();
        for (PyObject fn = functions.__iternext__();
             fn != null;
             fn = functions.__iternext__())
        {
            PyObject candidate = fn.__call__(key);
            if (candidate == Py.None) {
                // This search function does not know the encoding.
                continue;
            }
            boolean isFourTuple = (candidate instanceof PyTuple)
                                  && candidate.__len__() == 4;
            if (!isFourTuple) {
                throw Py.TypeError(
                    "codec search functions must return 4-tuples");
            }
            found = candidate;
            break;
        }

        if (found == null) {
            throw new PyException(Py.LookupError,
                                  "unknown encoding " + encoding);
        }

        searchCache.__setitem__(key, found);
        return (PyTuple) found;
    }

    /**
     * Normalizes an encoding name for use as a registry key: the name
     * is lower-cased and spaces are replaced by hyphens.
     *
     * Lower-casing uses a fixed locale. The no-argument
     * String.toLowerCase() follows the JVM default locale, so under a
     * Turkish locale "ISO-8859-1" would become a name with a dotless i
     * and fail to match any registered codec.
     *
     * @param string the encoding name as supplied by the caller
     * @return the normalized name
     */
    private static String normalizestring(String string) {
        return string.toLowerCase(java.util.Locale.ENGLISH)
                     .replace(' ', '-');
    }


    /* Set as soon as the first import attempt starts, so the import is
       tried at most once. */
    private static boolean import_encodings_called = false;

    /**
     * Imports the "encodings" package the first time it is called.
     * An ImportError from that import is ignored; any other exception
     * is passed on to the caller.
     */
    private static void import_encodings() {
        if (import_encodings_called) {
            return;
        }
        import_encodings_called = true;

        try {
            __builtin__.__import__("encodings");
        } catch (PyException exc) {
            if (exc.type == Py.ImportError) {
                // A missing "encodings" package is tolerated.
                return;
            }
            throw exc;
        }
    }



    /**
     * Decodes a string with the named codec.
     *
     * @param v        the object to decode
     * @param encoding codec name, or null for the default encoding
     * @param errors   error handling mode, or null
     * @return the string form of the first item of the decoder's result
     * @throws PyException TypeError when the decoder does not return a
     *         2-tuple; anything raised by lookup() or the decoder
     */
    public static String decode(PyString v, String encoding,
                                  String errors)
    {
        String codecName = (encoding == null)
            ? getDefaultEncoding()
            : normalizestring(encoding);

        // The mode name is interned before it is handed to the helpers.
        String errorMode = (errors == null) ? null : errors.intern();

        /* Shortcut for the built-in ASCII codec */
        if (codecName.equals("ascii")) {
            return PyUnicode_DecodeASCII(v.toString(), v.__len__(),
                                         errorMode);
        }

        /* Everything else is decoded via the codec registry */
        PyObject decoder = getDecoder(codecName);
        PyObject result = (errorMode == null)
            ? decoder.__call__(v)
            : decoder.__call__(v, new PyString(errorMode));

        boolean isPair = (result instanceof PyTuple)
                         && result.__len__() == 2;
        if (!isPair) {
            throw Py.TypeError(
                "decoder must return a tuple (object,integer)");
        }
        return result.__getitem__(0).toString();
    }


    /**
     * Returns the decoder of the named codec, i.e. item 1 of the tuple
     * produced by lookup().
     */
    private static PyObject getDecoder(String encoding) {
        PyObject codecTuple = lookup(encoding);
        final int decoderIndex = 1;
        return codecTuple.__getitem__(decoderIndex);
    }



    /**
     * Encodes a string with the named codec.
     *
     * @param v        the object to encode
     * @param encoding codec name, or null for the default encoding
     * @param errors   error handling mode, or null
     * @return the string form of the first item of the encoder's result
     * @throws PyException TypeError when the encoder does not return a
     *         2-tuple; anything raised by lookup() or the encoder
     */
    public static String encode(PyString v, String encoding,
                                  String errors)
    {
        String codecName = (encoding == null)
            ? getDefaultEncoding()
            : normalizestring(encoding);

        // The mode name is interned before it is handed to the helpers.
        String errorMode = (errors == null) ? null : errors.intern();

        /* Shortcut for the built-in ASCII codec */
        if (codecName.equals("ascii")) {
            return PyUnicode_EncodeASCII(v.toString(), v.__len__(),
                                         errorMode);
        }

        /* Everything else is encoded via the codec registry */
        PyObject encoder = getEncoder(codecName);
        PyObject result = (errorMode == null)
            ? encoder.__call__(v)
            : encoder.__call__(v, new PyString(errorMode));

        boolean isPair = (result instanceof PyTuple)
                         && result.__len__() == 2;
        if (!isPair) {
            throw Py.TypeError(
                "encoder must return a tuple (object,integer)");
        }
        return result.__getitem__(0).toString();
    }

    /**
     * Returns the encoder of the named codec, i.e. item 0 of the tuple
     * produced by lookup().
     */
    private static PyObject getEncoder(String encoding) {
        PyObject codecTuple = lookup(encoding);
        final int encoderIndex = 0;
        return codecTuple.__getitem__(encoderIndex);
    }


    /* --- UTF-8 Codec ---------------------------------------------------- */
    /* Lookup table indexed by the value (0..255) of the first byte of a
       UTF-8 sequence. PyUnicode_DecodeUTF8 reads it for bytes >= 0x80;
       lengths above 4 are rejected there as an unsupported range. */
    private static byte utf8_code_length[] = {
       /* Map UTF-8 encoded prefix byte to sequence length.  zero means
           illegal prefix.  see RFC 2279 for details */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
    };


    /**
     * Decodes UTF-8 data held in a String whose chars each carry one
     * byte value (0..255) into a UTF-16 Java string.
     *
     * Malformed input is reported through decoding_error(): depending
     * on the error mode this throws, drops the offending unit, or
     * appends the replacement character. After a reported error the
     * decoder always resumes at the next input unit.
     *
     * @param str    the UTF-8 data, one byte value per char
     * @param errors error handling mode passed to decoding_error()
     * @return the decoded string; code points above U+FFFF are
     *         returned as surrogate pairs
     */
    public static String PyUnicode_DecodeUTF8(String str, String errors) {
        int size = str.length();
        StringBuffer unicode = new StringBuffer(size);

        /* Unpack UTF-8 encoded data */
        for (int i = 0; i < size; ) {
            int ch = str.charAt(i);
            if (ch > 0xFF) {
                /* not a byte value at all */
                codecs.decoding_error("utf-8", unicode, errors,
                                      "ordinal not in range(255)");
                i++;
                continue;
            }

            if (ch < 0x80) {
                unicode.append((char) ch);
                i++;
                continue;
            }

            int n = utf8_code_length[ch];

            if (i + n > size) {
                codecs.decoding_error("utf-8", unicode, errors,
                                      "unexpected end of data");
                i++;
                continue;
            }


            switch (n) {
            case 0:
                codecs.decoding_error("utf-8", unicode, errors,
                                      "unexpected code byte");
                i++;
                continue;
            case 1:
                /* single-byte values were already handled above */
                codecs.decoding_error("utf-8", unicode, errors,
                                      "internal error");
                i++;
                continue;
            case 2:
                char ch1 = str.charAt(i+1);
                if ((ch1 & 0xc0) != 0x80) {
                    codecs.decoding_error("utf-8", unicode, errors,
                                          "invalid data");
                    i++;
                    continue;
                }
                ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
                if (ch < 0x80) {
                    /* overlong form of a 1-byte value */
                    codecs.decoding_error("utf-8", unicode, errors,
                                          "illegal encoding");
                    i++;
                    continue;
                } else
                    unicode.append((char) ch);
                break;

            case 3:
                ch1 = str.charAt(i+1);
                char ch2 = str.charAt(i+2);
                if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
                    codecs.decoding_error("utf-8", unicode, errors,
                                          "invalid data");
                    i++;
                    continue;
                }
                ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
                /* reject overlong forms and encoded surrogates */
                if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
                    codecs.decoding_error("utf-8", unicode, errors,
                                          "illegal encoding");
                    i++;
                    continue;
                } else
                   unicode.append((char) ch);
                break;

            case 4:
                ch1 = str.charAt(i+1);
                ch2 = str.charAt(i+2);
                char ch3 = str.charAt(i+3);
                if ((ch1 & 0xc0) != 0x80 ||
                    (ch2 & 0xc0) != 0x80 ||
                    (ch3 & 0xc0) != 0x80) {
                    codecs.decoding_error("utf-8", unicode, errors,
                                          "invalid data");
                    i++;
                    continue;
                }
                ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
                     ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
                /* validate and convert to UTF-16 */
                if ((ch < 0x10000) ||   /* minimum value allowed for 4
                                           byte encoding */
                    (ch > 0x10ffff)) {  /* maximum value allowed for
                                           UTF-16 */
                    codecs.decoding_error("utf-8", unicode, errors,
                                          "illegal encoding");
                    i++;
                    continue;
                }
                /*  compute and append the two surrogates: */

                /*  translate from 10000..10FFFF to 0..FFFFF */
                ch -= 0x10000;

                /*  high surrogate = top 10 bits added to D800 */
                unicode.append((char) (0xD800 + (ch >> 10)));

                /*  low surrogate = bottom 10 bits added to DC00 */
                unicode.append((char) (0xDC00 + (ch & 0x3FF)));
                break;

            default:
                /* Other sizes are only needed for UCS-4 */
                codecs.decoding_error("utf-8", unicode, errors,
                                      "unsupported Unicode code range");
                i++;
                /* Resume at the next unit like every other error
                   branch. Without this the loop also added n and
                   skipped valid data after the bad lead byte. */
                continue;
            }
            i += n;
        }

        return unicode.toString();
    }


    /**
     * Encodes a UTF-16 Java string as UTF-8, returning the bytes as a
     * String with one byte value (0..255) per char.
     *
     * A high surrogate (D800..DBFF) immediately followed by a low
     * surrogate (DC00..DFFF) is combined into one 4-byte sequence. Any
     * other surrogate is unpaired and is encoded on its own as a
     * 3-byte sequence, so every output sequence has a lead byte.
     *
     * @param str    the text to encode
     * @param errors unused by this method
     * @return the UTF-8 bytes, one per char
     */
    public static String PyUnicode_EncodeUTF8(String str, String errors) {
        int size = str.length();
        StringBuffer v = new StringBuffer(size * 3);

        for (int i = 0; i < size; ) {
            int ch = str.charAt(i++);
            if (ch < 0x80) {
                v.append((char) ch);
            } else if (ch < 0x0800) {
                v.append((char) (0xc0 | (ch >> 6)));
                v.append((char) (0x80 | (ch & 0x3f)));
            } else {
                /* Only a HIGH surrogate may start a pair. */
                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                    int ch2 = str.charAt(i);
                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        /* combine the two values */
                        int cp = (((ch - 0xD800) << 10)
                                  | (ch2 - 0xDC00)) + 0x10000;

                        v.append((char) (0xf0 | (cp >> 18)));
                        v.append((char) (0x80 | ((cp >> 12) & 0x3f)));
                        v.append((char) (0x80 | ((cp >> 6) & 0x3f)));
                        v.append((char) (0x80 | (cp & 0x3f)));
                        i++;
                        continue;
                    }
                }
                /* BMP character or unpaired surrogate: 3 bytes */
                v.append((char) (0xe0 | (ch >> 12)));
                v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
                v.append((char) (0x80 | (ch & 0x3f)));
            }
        }
        return v.toString();
    }



    /* --- 7-bit ASCII Codec -------------------------------------------- */

    /**
     * Decodes 7-bit ASCII data.
     *
     * Values below 128 are copied unchanged; every other value is
     * reported through decoding_error(), which throws or adjusts the
     * output depending on the error mode.
     *
     * @param str    the data to decode
     * @param size   number of leading chars of str to process
     * @param errors error handling mode passed to decoding_error()
     * @return the decoded string
     */
    public static String PyUnicode_DecodeASCII(String str, int size,
                                               String errors)
    {
        StringBuffer decoded = new StringBuffer(size);

        int pos = 0;
        while (pos < size) {
            char c = str.charAt(pos);
            pos++;
            if (c >= 128) {
                decoding_error("ascii", decoded, errors,
                               "ordinal not in range(128)");
            } else {
                decoded.append(c);
            }
        }

        return decoded.toString();
    }


    /**
     * Encodes text as 7-bit ASCII.
     *
     * Chars below 128 are copied unchanged; every other char is
     * reported through encoding_error(), which throws or adjusts the
     * output depending on the error mode.
     *
     * @param str    the text to encode
     * @param size   number of leading chars of str to process
     * @param errors error handling mode passed to encoding_error()
     * @return the encoded string
     */
    public static String PyUnicode_EncodeASCII(String str, int size,
                                               String errors)
    {
        StringBuffer encoded = new StringBuffer(size);

        int pos = 0;
        while (pos < size) {
            char c = str.charAt(pos);
            pos++;
            if (c < 128) {
                encoded.append(c);
                continue;
            }
            encoding_error("ascii", encoded, errors,
                           "ordinal not in range(128)");
        }
        return encoded.toString();
    }



    /* --- RawUnicodeEscape Codec ---------------------------------------- */

    /* Upper-case hex digits, indexed by nibble value. */
    private static char[] hexdigit = "0123456789ABCDEF".toCharArray();

    /**
     * Encodes text with the raw-unicode-escape codec: every char with a
     * value of 256 or more is written as a backslash, the letter u and
     * four upper-case hex digits; all other chars are copied as is.
     *
     * The modified flag is used by cPickle. When it is set, newline and
     * backslash are written in the same escaped form.
     *
     * @param str     the text to encode
     * @param errors  unused by this method
     * @param modifed whether newline and backslash are escaped too
     * @return the escaped string
     */
    public static String PyUnicode_EncodeRawUnicodeEscape(String str,
                                                          String errors,
                                                          boolean modifed)
    {
        int size = str.length();
        StringBuffer out = new StringBuffer(size);

        for (int pos = 0; pos < size; pos++) {
            char c = str.charAt(pos);
            boolean needsEscape =
                c >= 256 || (modifed && (c == '\n' || c == '\\'));
            if (!needsEscape) {
                out.append(c);
                continue;
            }
            out.append("\\u");
            // Emit the four nibbles, most significant first.
            for (int shift = 12; shift >= 0; shift -= 4) {
                out.append(hexdigit[(c >>> shift) & 0xF]);
            }
        }

        return out.toString();
    }


    /**
     * Decodes text written with the raw-unicode-escape codec.
     *
     * Chars other than backslash are copied unchanged. A run of
     * backslashes is copied too; when the run has odd length and is
     * followed by the letter u, the last backslash, the u and the next
     * four hex digits are replaced by the char they denote.
     *
     * An escape with fewer than four hex digits is reported through
     * decoding_error() as "truncated \\uXXXX". This includes an escape
     * cut off by the end of the input, which used to fail with a
     * StringIndexOutOfBoundsException instead.
     *
     * @param str    the escaped text
     * @param errors error handling mode passed to decoding_error()
     * @return the decoded string
     */
    public static String PyUnicode_DecodeRawUnicodeEscape(String str,
                                                          String errors)
    {
        int size = str.length();
        StringBuffer v = new StringBuffer(size);

        for (int i = 0; i < size; ) {
            char ch = str.charAt(i);

            /* Non-escape characters are interpreted as Unicode ordinals */
            if (ch != '\\') {
                v.append(ch);
                i++;
                continue;
            }

            /* \\u-escapes are only interpreted iff the number of leading
               backslashes is odd */
            int bs = i;
            while (i < size) {
                ch = str.charAt(i);
                if (ch != '\\')
                    break;
                v.append(ch);
                i++;
            }
            if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
                continue;
            }
            /* drop the backslash that introduces the escape, skip 'u' */
            v.setLength(v.length() - 1);
            i++;

            /* \\uXXXX with 4 hex digits */
            int x = 0;
            for (int j = 0; j < 4; j++) {
                /* Running off the end of the input counts as a missing
                   digit instead of indexing past the string. */
                int d = -1;
                if (i + j < size) {
                    d = Character.digit(str.charAt(i + j), 16);
                }
                if (d == -1) {
                    codecs.decoding_error("unicode escape", v, errors,
                                          "truncated \\uXXXX");
                    break;
                }
                x = (x << 4) + d;
            }
            /* NOTE(review): in the lenient error modes the value
               accumulated so far is still appended and four positions
               are skipped, as for malformed escapes inside the input;
               confirm whether that is the intended recovery. */
            i += 4;
            v.append((char) x);
       }
       return v.toString();
    }


    /* --- Utility methods -------------------------------------------- */

    /**
     * Applies the requested error handling mode to an encoding error.
     *
     * Mode names are compared by value, so callers need not pass
     * interned strings.
     *
     * @param type    codec name used in the error message
     * @param dest    output buffer; receives '?' in "replace" mode
     * @param errors  null or "strict" to throw, "ignore" to do nothing,
     *                "replace" to append a substitute
     * @param details description of the problem for the error message
     * @throws PyException UnicodeError in strict mode; ValueError for
     *         an unknown mode name
     */
    public static void encoding_error(String type, StringBuffer dest,
                                      String errors, String details)
    {
        if (errors == null || "strict".equals(errors)) {
            throw Py.UnicodeError(type + " encoding error: " + details);
        } else if ("ignore".equals(errors)) {
            //ignore
        } else if ("replace".equals(errors)) {
            dest.append('?');
        } else {
            throw Py.ValueError(type + " encoding error; "+
                                "unknown error handling code: " + errors);
        }
    }


    /**
     * Applies the requested error handling mode to a decoding error.
     *
     * Mode names are compared by value, so callers need not pass
     * interned strings.
     *
     * @param type    codec name used in the error message
     * @param dest    output buffer, may be null; when non-null it
     *                receives the replacement character in "replace"
     *                mode
     * @param errors  null or "strict" to throw, "ignore" to do nothing,
     *                "replace" to append a substitute
     * @param details description of the problem for the error message
     * @throws PyException UnicodeError in strict mode; ValueError for
     *         an unknown mode name
     */
    public static void decoding_error(String type, StringBuffer dest,
                                      String errors, String details)
    {
        if (errors == null || "strict".equals(errors)) {
            throw Py.UnicodeError(type + " decoding error: " + details);
        }
        else if ("ignore".equals(errors)) {
            //ignore
        } else if ("replace".equals(errors)) {
            if (dest != null) {
                dest.append(Py_UNICODE_REPLACEMENT_CHARACTER);
            }
        } else {
            throw Py.ValueError(type + " decoding error; "+
                                "unknown error handling code: " + errors);
        }
    }
}

Alerts (22)

'List' Raw collection type used. Specify generic type arguments (e.g., List<String>, Map<Integer, Client>) for type safety and clarity. Avoid raw types unless interacting with legacy code.
19
'Map' Raw collection type used. Specify generic type arguments (e.g., List<String>, Map<Integer, Client>) for type safety and clarity. Avoid raw types unless interacting with legacy code.
20
'instanceof' Frequent 'instanceof' checks can indicate a need for better polymorphism (using overridden methods in subclasses) or visitor pattern. Consider if the design can be improved.
62 134 185
'!=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
62 134 185
'switch (' Ensure switch statements on enums or non-trivial types cover all cases or include a 'default:' label to handle unexpected values.
251
'case' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
262 280 299
'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
350 491
'<' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
393
'>=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
413
'==' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
440
Complexity hotspot; line 440 (total complexity: 6)
440
'==' Correctness Warning: Comparing String objects using '==' or '!=' checks for reference equality, not value equality. Use '.equals()' to compare String content.
513 515 530 532