rstr.py - Concrete implementation of RPython strings:

/rpython/rtyper/lltypesystem/rstr.py

https://bitbucket.org/pypy/pypy/ · Python · 1344 lines · 1127 code · 139 blank · 78 comment · 252 complexity · 00a3aadf0b3727bbf2d431e4c6dafbca MD5 · raw file

from weakref import WeakValueDictionary

from rpython.annotator import model as annmodel
from rpython.rlib import jit, types
from rpython.rlib.objectmodel import (malloc_zero_filled, we_are_translated,
    _hash_string, keepalive_until_here, specialize, enforceargs)
from rpython.rlib.signature import signature
from rpython.rlib.rarithmetic import ovfcheck
from rpython.rtyper.error import TyperError
from rpython.rtyper.debug import ll_assert
from rpython.rtyper.lltypesystem import ll_str, llmemory
from rpython.rtyper.lltypesystem.lltype import (GcStruct, Signed, Array, Char,
    UniChar, Ptr, malloc, Bool, Void, GcArray, nullptr, cast_primitive,
    typeOf, staticAdtMethod, GcForwardReference)
from rpython.rtyper.rmodel import inputconst, Repr
from rpython.rtyper.rint import IntegerRepr
from rpython.rtyper.rstr import (AbstractStringRepr, AbstractCharRepr,
    AbstractUniCharRepr, AbstractStringIteratorRepr, AbstractLLHelpers,
    AbstractUnicodeRepr)
from rpython.tool.sourcetools import func_with_new_name

# ____________________________________________________________
#
#  Concrete implementation of RPython strings:
#
#    struct str {
#        hash: Signed
#        chars: array of Char
#    }

STR = GcForwardReference()
UNICODE = GcForwardReference()

def new_malloc(TP, name):
    @enforceargs(int)
    def mallocstr(length):
        ll_assert(length >= 0, "negative string length")
        r = malloc(TP, length)
        if not we_are_translated() or not malloc_zero_filled:
            r.hash = 0
        return r
    return func_with_new_name(mallocstr, name)

mallocstr = new_malloc(STR, 'mallocstr')
mallocunicode = new_malloc(UNICODE, 'mallocunicode')

def emptystrfun():
    return emptystr

def emptyunicodefun():
    return emptyunicode

def _new_copy_contents_fun(SRC_TP, DST_TP, CHAR_TP, name):
    @specialize.arg(0)
    def _str_ofs(TP, item):
        return (llmemory.offsetof(TP, 'chars') +
                llmemory.itemoffsetof(TP.chars, 0) +
                llmemory.sizeof(CHAR_TP) * item)

    @signature(types.any(), types.any(), types.int(), returns=types.any())
    @specialize.arg(0)
    def _get_raw_buf(TP, src, ofs):
        """
        WARNING: dragons ahead.
        Return the address of the internal char* buffer of the low level
        string. The return value is valid as long as no GC operation occur, so
        you must ensure that it will be used inside a "GC safe" section, for
        example by marking your function with @rgc.no_collect
        """
        assert typeOf(src).TO == TP
        assert ofs >= 0
        return llmemory.cast_ptr_to_adr(src) + _str_ofs(TP, ofs)
    _get_raw_buf._always_inline_ = True

    @jit.oopspec('stroruni.copy_contents(src, dst, srcstart, dststart, length)')
    @signature(types.any(), types.any(), types.int(), types.int(), types.int(), returns=types.none())
    def copy_string_contents(src, dst, srcstart, dststart, length):
        """Copies 'length' characters from the 'src' string to the 'dst'
        string, starting at position 'srcstart' and 'dststart'."""
        # xxx Warning: don't try to do this at home.  It relies on a lot
        # of details to be sure that it works correctly in all cases.
        # Notably: no GC operation at all from the first cast_ptr_to_adr()
        # because it might move the strings.  The keepalive_until_here()
        # are obscurely essential to make sure that the strings stay alive
        # longer than the raw_memcopy().
        assert length >= 0
        ll_assert(srcstart >= 0, "copystrc: negative srcstart")
        ll_assert(srcstart + length <= len(src.chars), "copystrc: src ovf")
        ll_assert(dststart >= 0, "copystrc: negative dststart")
        ll_assert(dststart + length <= len(dst.chars), "copystrc: dst ovf")
        # from here, no GC operations can happen
        asrc = _get_raw_buf(SRC_TP, src, srcstart)
        adst = _get_raw_buf(DST_TP, dst, dststart)
        llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(CHAR_TP) * length)
        # end of "no GC" section
        keepalive_until_here(src)
        keepalive_until_here(dst)
    copy_string_contents._always_inline_ = True
    copy_string_contents = func_with_new_name(copy_string_contents,
                                              'copy_%s_contents' % name)

    @jit.oopspec('stroruni.copy_string_to_raw(src, ptrdst, srcstart, length)')
    def copy_string_to_raw(src, ptrdst, srcstart, length):
        """
        Copies 'length' characters from the 'src' string to the 'ptrdst'
        buffer, starting at position 'srcstart'.
        'ptrdst' must be a non-gc Array of Char.
        """
        # xxx Warning: same note as above apply: don't do this at home
        assert length >= 0
        # from here, no GC operations can happen
        asrc = _get_raw_buf(SRC_TP, src, srcstart)
        adst = llmemory.cast_ptr_to_adr(ptrdst)
        adst = adst + llmemory.itemoffsetof(typeOf(ptrdst).TO, 0)
        llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(CHAR_TP) * length)
        # end of "no GC" section
        keepalive_until_here(src)
    copy_string_to_raw._always_inline_ = True
    copy_string_to_raw = func_with_new_name(copy_string_to_raw, 'copy_%s_to_raw' % name)

    @jit.dont_look_inside
    @signature(types.any(), types.any(), types.int(), types.int(),
               returns=types.none())
    def copy_raw_to_string(ptrsrc, dst, dststart, length):
        # xxx Warning: same note as above apply: don't do this at home
        assert length >= 0
        # from here, no GC operations can happen
        adst = _get_raw_buf(SRC_TP, dst, dststart)
        asrc = llmemory.cast_ptr_to_adr(ptrsrc)

        asrc = asrc + llmemory.itemoffsetof(typeOf(ptrsrc).TO, 0)
        llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(CHAR_TP) * length)
        # end of "no GC" section
        keepalive_until_here(dst)
    copy_raw_to_string._always_inline_ = True
    copy_raw_to_string = func_with_new_name(copy_raw_to_string,
                                              'copy_raw_to_%s' % name)

    return copy_string_to_raw, copy_raw_to_string, copy_string_contents

(copy_string_to_raw,
 copy_raw_to_string,
 copy_string_contents) = _new_copy_contents_fun(STR, STR, Char, 'string')

(copy_unicode_to_raw,
 copy_raw_to_unicode,
 copy_unicode_contents) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar, 'unicode')

CONST_STR_CACHE = WeakValueDictionary()
CONST_UNICODE_CACHE = WeakValueDictionary()

class BaseLLStringRepr(Repr):
    def convert_const(self, value):
        if value is None:
            return nullptr(self.lowleveltype.TO)
        #value = getattr(value, '__self__', value)  # for bound string methods
        if not isinstance(value, self.basetype):
            raise TyperError("not a str: %r" % (value,))
        try:
            return self.CACHE[value]
        except KeyError:
            p = self.malloc(len(value))
            for i in range(len(value)):
                p.chars[i] = cast_primitive(self.base, value[i])
            p.hash = 0
            self.ll.ll_strhash(p)   # precompute the hash
            self.CACHE[value] = p
            return p

    def make_iterator_repr(self, variant=None):
        if variant is not None:
            raise TyperError("unsupported %r iterator over a str/unicode" %
                             (variant,))
        return self.repr.iterator_repr

    def can_ll_be_null(self, s_value):
        # XXX unicode
        if self is string_repr:
            return s_value.can_be_none()
        else:
            return True     # for CharRepr/UniCharRepr subclasses,
                            # where NULL is always valid: it is chr(0)


    def _list_length_items(self, hop, v_lst, LIST):
        LIST = LIST.TO
        v_length = hop.gendirectcall(LIST.ll_length, v_lst)
        v_items = hop.gendirectcall(LIST.ll_items, v_lst)
        return v_length, v_items

class StringRepr(BaseLLStringRepr, AbstractStringRepr):
    lowleveltype = Ptr(STR)
    basetype = str
    base = Char
    CACHE = CONST_STR_CACHE

    def __init__(self, *args):
        AbstractStringRepr.__init__(self, *args)
        self.ll = LLHelpers
        self.malloc = mallocstr

    def ll_decode_latin1(self, value):
        lgt = len(value.chars)
        s = mallocunicode(lgt)
        for i in range(lgt):
            s.chars[i] = cast_primitive(UniChar, value.chars[i])
        return s

class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
    lowleveltype = Ptr(UNICODE)
    basetype = basestring
    base = UniChar
    CACHE = CONST_UNICODE_CACHE

    def __init__(self, *args):
        AbstractUnicodeRepr.__init__(self, *args)
        self.ll = LLHelpers
        self.malloc = mallocunicode

    @jit.elidable
    def ll_str(self, s):
        # XXX crazy that this is here, but I don't want to break
        #     rmodel logic
        if not s:
            return self.ll.ll_constant('None')
        lgt = len(s.chars)
        result = mallocstr(lgt)
        for i in range(lgt):
            c = s.chars[i]
            if ord(c) > 127:
                raise UnicodeEncodeError("character not in ascii range")
            result.chars[i] = cast_primitive(Char, c)
        return result

    @jit.elidable
    def ll_unicode(self, s):
        if s:
            return s
        else:
            return self.ll.ll_constant_unicode(u'None')

    @jit.elidable
    def ll_encode_latin1(self, s):
        length = len(s.chars)
        result = mallocstr(length)
        for i in range(length):
            c = s.chars[i]
            if ord(c) > 255:
                raise UnicodeEncodeError("character not in latin1 range")
            result.chars[i] = cast_primitive(Char, c)
        return result

class CharRepr(AbstractCharRepr, StringRepr):
    lowleveltype = Char

class UniCharRepr(AbstractUniCharRepr, UnicodeRepr):
    lowleveltype = UniChar



# ____________________________________________________________
#
#  Low-level methods.  These can be run for testing, but are meant to
#  be direct_call'ed from rtyped flow graphs, which means that they will
#  get flowed and annotated, mostly with SomePtr.
#

FAST_COUNT = 0
FAST_FIND = 1
FAST_RFIND = 2


from rpython.rlib.rarithmetic import LONG_BIT as BLOOM_WIDTH


def bloom_add(mask, c):
    return mask | (1 << (ord(c) & (BLOOM_WIDTH - 1)))


def bloom(mask, c):
    return mask & (1 << (ord(c) & (BLOOM_WIDTH - 1)))


class LLHelpers(AbstractLLHelpers):
    from rpython.rtyper.annlowlevel import llstr, llunicode

    @staticmethod
    @jit.elidable
    def ll_str_mul(s, times):
        if times < 0:
            times = 0
        try:
            size = ovfcheck(len(s.chars) * times)
        except OverflowError:
            raise MemoryError
        newstr = s.malloc(size)
        i = 0
        if i < size:
            s.copy_contents(s, newstr, 0, 0, len(s.chars))
            i += len(s.chars)
        while i < size:
            if i <= size - i:
                j = i
            else:
                j = size - i
            s.copy_contents(newstr, newstr, 0, i, j)
            i += j
        return newstr

    @staticmethod
    @jit.elidable
    def ll_char_mul(ch, times):
        if typeOf(ch) is Char:
            malloc = mallocstr
        else:
            malloc = mallocunicode
        if times < 0:
            times = 0
        newstr = malloc(times)
        j = 0
        # XXX we can use memset here, not sure how useful this is
        while j < times:
            newstr.chars[j] = ch
            j += 1
        return newstr

    @staticmethod
    def ll_strlen(s):
        return len(s.chars)

    @staticmethod
    @signature(types.any(), types.int(), returns=types.any())
    def ll_stritem_nonneg(s, i):
        chars = s.chars
        ll_assert(i >= 0, "negative str getitem index")
        ll_assert(i < len(chars), "str getitem index out of bound")
        return chars[i]

    @staticmethod
    def ll_chr2str(ch):
        if typeOf(ch) is Char:
            malloc = mallocstr
        else:
            malloc = mallocunicode
        s = malloc(1)
        s.chars[0] = ch
        return s

    # @jit.look_inside_iff(lambda str: jit.isconstant(len(str.chars)) and len(str.chars) == 1)
    @staticmethod
    @jit.oopspec("str.str2unicode(str)")
    def ll_str2unicode(str):
        lgt = len(str.chars)
        s = mallocunicode(lgt)
        for i in range(lgt):
            if ord(str.chars[i]) > 127:
                raise UnicodeDecodeError
            s.chars[i] = cast_primitive(UniChar, str.chars[i])
        return s

    @staticmethod
    def ll_str2bytearray(str):
        from rpython.rtyper.lltypesystem.rbytearray import BYTEARRAY

        lgt = len(str.chars)
        b = malloc(BYTEARRAY, lgt)
        for i in range(lgt):
            b.chars[i] = str.chars[i]
        return b

    @staticmethod
    @jit.elidable
    def ll_strhash(s):
        # unlike CPython, there is no reason to avoid to return -1
        # but our malloc initializes the memory to zero, so we use zero as the
        # special non-computed-yet value.
        if not s:
            return 0
        x = s.hash
        if x == 0:
            x = _hash_string(s.chars)
            if x == 0:
                x = 29872897
            s.hash = x
        return x

    @staticmethod
    def ll_length(s):
        return len(s.chars)

    @staticmethod
    def ll_strfasthash(s):
        return s.hash     # assumes that the hash is already computed

    @staticmethod
    @jit.elidable
    @jit.oopspec('stroruni.concat(s1, s2)')
    def ll_strconcat(s1, s2):
        len1 = s1.length()
        len2 = s2.length()
        # a single '+' like this is allowed to overflow: it gets
        # a negative result, and the gc will complain
        # the typechecks below are if TP == BYTEARRAY
        if typeOf(s1) == Ptr(STR):
            newstr = s2.malloc(len1 + len2)
            newstr.copy_contents_from_str(s1, newstr, 0, 0, len1)
        else:
            newstr = s1.malloc(len1 + len2)
            newstr.copy_contents(s1, newstr, 0, 0, len1)
        if typeOf(s2) == Ptr(STR):
            newstr.copy_contents_from_str(s2, newstr, 0, len1, len2)
        else:
            newstr.copy_contents(s2, newstr, 0, len1, len2)
        return newstr

    @staticmethod
    @jit.elidable
    def ll_strip(s, ch, left, right):
        s_len = len(s.chars)
        if s_len == 0:
            return s.empty()
        lpos = 0
        rpos = s_len - 1
        if left:
            while lpos < rpos and s.chars[lpos] == ch:
                lpos += 1
        if right:
            while lpos < rpos + 1 and s.chars[rpos] == ch:
                rpos -= 1
        if rpos < lpos:
            return s.empty()
        r_len = rpos - lpos + 1
        result = s.malloc(r_len)
        s.copy_contents(s, result, lpos, 0, r_len)
        return result

    @staticmethod
    @jit.elidable
    def ll_strip_default(s, left, right):
        s_len = len(s.chars)
        if s_len == 0:
            return s.empty()
        lpos = 0
        rpos = s_len - 1
        if left:
            while lpos < rpos and s.chars[lpos].isspace():
                lpos += 1
        if right:
            while lpos < rpos + 1 and s.chars[rpos].isspace():
                rpos -= 1
        if rpos < lpos:
            return s.empty()
        r_len = rpos - lpos + 1
        result = s.malloc(r_len)
        s.copy_contents(s, result, lpos, 0, r_len)
        return result

    @staticmethod
    @jit.elidable
    def ll_strip_multiple(s, s2, left, right):
        s_len = len(s.chars)
        if s_len == 0:
            return s.empty()
        lpos = 0
        rpos = s_len - 1
        if left:
            while lpos < rpos and LLHelpers.ll_contains(s2, s.chars[lpos]):
                lpos += 1
        if right:
            while lpos < rpos + 1 and LLHelpers.ll_contains(s2, s.chars[rpos]):
                rpos -= 1
        if rpos < lpos:
            return s.empty()
        r_len = rpos - lpos + 1
        result = s.malloc(r_len)
        s.copy_contents(s, result, lpos, 0, r_len)
        return result

    @staticmethod
    @jit.elidable
    def ll_upper(s):
        s_chars = s.chars
        s_len = len(s_chars)
        if s_len == 0:
            return s.empty()
        i = 0
        result = mallocstr(s_len)
        #        ^^^^^^^^^ specifically to explode on unicode
        while i < s_len:
            result.chars[i] = LLHelpers.ll_upper_char(s_chars[i])
            i += 1
        return result

    @staticmethod
    @jit.elidable
    def ll_lower(s):
        s_chars = s.chars
        s_len = len(s_chars)
        if s_len == 0:
            return s.empty()
        i = 0
        result = mallocstr(s_len)
        #        ^^^^^^^^^ specifically to explode on unicode
        while i < s_len:
            result.chars[i] = LLHelpers.ll_lower_char(s_chars[i])
            i += 1
        return result

    @staticmethod
    def ll_join(s, length, items):
        s_chars = s.chars
        s_len = len(s_chars)
        num_items = length
        if num_items == 0:
            return s.empty()
        itemslen = 0
        i = 0
        while i < num_items:
            try:
                itemslen = ovfcheck(itemslen + len(items[i].chars))
            except OverflowError:
                raise MemoryError
            i += 1
        try:
            seplen = ovfcheck(s_len * (num_items - 1))
        except OverflowError:
            raise MemoryError
        # a single '+' at the end is allowed to overflow: it gets
        # a negative result, and the gc will complain
        result = s.malloc(itemslen + seplen)
        res_index = len(items[0].chars)
        s.copy_contents(items[0], result, 0, 0, res_index)
        i = 1
        while i < num_items:
            s.copy_contents(s, result, 0, res_index, s_len)
            res_index += s_len
            lgt = len(items[i].chars)
            s.copy_contents(items[i], result, 0, res_index, lgt)
            res_index += lgt
            i += 1
        return result

    @staticmethod
    @jit.elidable
    @jit.oopspec('stroruni.cmp(s1, s2)')
    def ll_strcmp(s1, s2):
        if not s1 and not s2:
            return True
        if not s1 or not s2:
            return False
        chars1 = s1.chars
        chars2 = s2.chars
        len1 = len(chars1)
        len2 = len(chars2)

        if len1 < len2:
            cmplen = len1
        else:
            cmplen = len2
        i = 0
        while i < cmplen:
            diff = ord(chars1[i]) - ord(chars2[i])
            if diff != 0:
                return diff
            i += 1
        return len1 - len2

    @staticmethod
    @jit.elidable
    @jit.oopspec('stroruni.equal(s1, s2)')
    def ll_streq(s1, s2):
        if s1 == s2:       # also if both are NULLs
            return True
        if not s1 or not s2:
            return False
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        if len1 != len2:
            return False
        j = 0
        chars1 = s1.chars
        chars2 = s2.chars
        while j < len1:
            if chars1[j] != chars2[j]:
                return False
            j += 1
        return True

    @staticmethod
    @jit.elidable
    def ll_startswith(s1, s2):
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        if len1 < len2:
            return False
        j = 0
        chars1 = s1.chars
        chars2 = s2.chars
        while j < len2:
            if chars1[j] != chars2[j]:
                return False
            j += 1

        return True

    @staticmethod
    def ll_startswith_char(s, ch):
        if not len(s.chars):
            return False
        return s.chars[0] == ch

    @staticmethod
    @jit.elidable
    def ll_endswith(s1, s2):
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        if len1 < len2:
            return False
        j = 0
        chars1 = s1.chars
        chars2 = s2.chars
        offset = len1 - len2
        while j < len2:
            if chars1[offset + j] != chars2[j]:
                return False
            j += 1

        return True

    @staticmethod
    def ll_endswith_char(s, ch):
        if not len(s.chars):
            return False
        return s.chars[len(s.chars) - 1] == ch

    @staticmethod
    @jit.elidable
    @signature(types.any(), types.any(), types.int(), types.int(), returns=types.int())
    def ll_find_char(s, ch, start, end):
        i = start
        if end > len(s.chars):
            end = len(s.chars)
        while i < end:
            if s.chars[i] == ch:
                return i
            i += 1
        return -1

    @staticmethod
    @jit.elidable
    def ll_rfind_char(s, ch, start, end):
        if end > len(s.chars):
            end = len(s.chars)
        i = end
        while i > start:
            i -= 1
            if s.chars[i] == ch:
                return i
        return -1

    @staticmethod
    @jit.elidable
    def ll_count_char(s, ch, start, end):
        count = 0
        i = start
        if end > len(s.chars):
            end = len(s.chars)
        while i < end:
            if s.chars[i] == ch:
                count += 1
            i += 1
        return count

    @staticmethod
    @signature(types.any(), types.any(), types.int(), types.int(), returns=types.int())
    def ll_find(s1, s2, start, end):
        if start < 0:
            start = 0
        if end > len(s1.chars):
            end = len(s1.chars)
        if end - start < 0:
            return -1

        m = len(s2.chars)
        if m == 1:
            return LLHelpers.ll_find_char(s1, s2.chars[0], start, end)

        return LLHelpers.ll_search(s1, s2, start, end, FAST_FIND)

    @staticmethod
    @signature(types.any(), types.any(), types.int(), types.int(), returns=types.int())
    def ll_rfind(s1, s2, start, end):
        if start < 0:
            start = 0
        if end > len(s1.chars):
            end = len(s1.chars)
        if end - start < 0:
            return -1

        m = len(s2.chars)
        if m == 1:
            return LLHelpers.ll_rfind_char(s1, s2.chars[0], start, end)

        return LLHelpers.ll_search(s1, s2, start, end, FAST_RFIND)

    @classmethod
    def ll_count(cls, s1, s2, start, end):
        if start < 0:
            start = 0
        if end > len(s1.chars):
            end = len(s1.chars)
        if end - start < 0:
            return 0

        m = len(s2.chars)
        if m == 1:
            return cls.ll_count_char(s1, s2.chars[0], start, end)

        res = cls.ll_search(s1, s2, start, end, FAST_COUNT)
        assert res >= 0
        return res

    @staticmethod
    @jit.elidable
    def ll_search(s1, s2, start, end, mode):
        count = 0
        n = end - start
        m = len(s2.chars)

        if m == 0:
            if mode == FAST_COUNT:
                return end - start + 1
            elif mode == FAST_RFIND:
                return end
            else:
                return start

        w = n - m

        if w < 0:
            if mode == FAST_COUNT:
                return 0
            return -1

        mlast = m - 1
        skip = mlast - 1
        mask = 0

        if mode != FAST_RFIND:
            for i in range(mlast):
                mask = bloom_add(mask, s2.chars[i])
                if s2.chars[i] == s2.chars[mlast]:
                    skip = mlast - i - 1
            mask = bloom_add(mask, s2.chars[mlast])

            i = start - 1
            while i + 1 <= start + w:
                i += 1
                if s1.chars[i + m - 1] == s2.chars[m - 1]:
                    for j in range(mlast):
                        if s1.chars[i + j] != s2.chars[j]:
                            break
                    else:
                        if mode != FAST_COUNT:
                            return i
                        count += 1
                        i += mlast
                        continue

                    if i + m < len(s1.chars):
                        c = s1.chars[i + m]
                    else:
                        c = '\0'
                    if not bloom(mask, c):
                        i += m
                    else:
                        i += skip
                else:
                    if i + m < len(s1.chars):
                        c = s1.chars[i + m]
                    else:
                        c = '\0'
                    if not bloom(mask, c):
                        i += m
        else:
            mask = bloom_add(mask, s2.chars[0])
            for i in range(mlast, 0, -1):
                mask = bloom_add(mask, s2.chars[i])
                if s2.chars[i] == s2.chars[0]:
                    skip = i - 1

            i = start + w + 1
            while i - 1 >= start:
                i -= 1
                if s1.chars[i] == s2.chars[0]:
                    for j in xrange(mlast, 0, -1):
                        if s1.chars[i + j] != s2.chars[j]:
                            break
                    else:
                        return i
                    if i - 1 >= 0 and not bloom(mask, s1.chars[i - 1]):
                        i -= m
                    else:
                        i -= skip
                else:
                    if i - 1 >= 0 and not bloom(mask, s1.chars[i - 1]):
                        i -= m

        if mode != FAST_COUNT:
            return -1
        return count

    @staticmethod
    @signature(types.int(), types.any(), returns=types.any())
    @jit.look_inside_iff(lambda length, items: jit.loop_unrolling_heuristic(
        items, length))
    def ll_join_strs(length, items):
        # Special case for length 1 items, helps both the JIT and other code
        if length == 1:
            return items[0]

        num_items = length
        itemslen = 0
        i = 0
        while i < num_items:
            try:
                itemslen = ovfcheck(itemslen + len(items[i].chars))
            except OverflowError:
                raise MemoryError
            i += 1
        if typeOf(items).TO.OF.TO == STR:
            malloc = mallocstr
            copy_contents = copy_string_contents
        else:
            malloc = mallocunicode
            copy_contents = copy_unicode_contents
        result = malloc(itemslen)
        res_index = 0
        i = 0
        while i < num_items:
            item_chars = items[i].chars
            item_len = len(item_chars)
            copy_contents(items[i], result, 0, res_index, item_len)
            res_index += item_len
            i += 1
        return result

    @staticmethod
    @jit.look_inside_iff(lambda length, chars, RES: jit.isconstant(length) and jit.isvirtual(chars))
    def ll_join_chars(length, chars, RES):
        # no need to optimize this, will be replaced by string builder
        # at some point soon
        num_chars = length
        if RES is StringRepr.lowleveltype:
            target = Char
            malloc = mallocstr
        else:
            target = UniChar
            malloc = mallocunicode
        result = malloc(num_chars)
        res_chars = result.chars
        i = 0
        while i < num_chars:
            res_chars[i] = cast_primitive(target, chars[i])
            i += 1
        return result

    @staticmethod
    @jit.oopspec('stroruni.slice(s1, start, stop)')
    @signature(types.any(), types.int(), types.int(), returns=types.any())
    @jit.elidable
    def _ll_stringslice(s1, start, stop):
        lgt = stop - start
        assert start >= 0
        # If start > stop, return a empty string. This can happen if the start
        # is greater than the length of the string. Use < instead of <= to avoid
        # creating another path for the JIT when start == stop.
        if lgt < 0:
            return s1.empty()
        newstr = s1.malloc(lgt)
        s1.copy_contents(s1, newstr, start, 0, lgt)
        return newstr

    @staticmethod
    def ll_stringslice_startonly(s1, start):
        return LLHelpers._ll_stringslice(s1, start, len(s1.chars))

    @staticmethod
    @signature(types.any(), types.int(), types.int(), returns=types.any())
    def ll_stringslice_startstop(s1, start, stop):
        if jit.we_are_jitted():
            if stop > len(s1.chars):
                stop = len(s1.chars)
        else:
            if stop >= len(s1.chars):
                if start == 0:
                    return s1
                stop = len(s1.chars)
        return LLHelpers._ll_stringslice(s1, start, stop)

    @staticmethod
    def ll_stringslice_minusone(s1):
        newlen = len(s1.chars) - 1
        return LLHelpers._ll_stringslice(s1, 0, newlen)

    @staticmethod
    def ll_split_chr(LIST, s, c, max):
        chars = s.chars
        strlen = len(chars)
        count = 1
        i = 0
        if max == 0:
            i = strlen
        while i < strlen:
            if chars[i] == c:
                count += 1
                if max >= 0 and count > max:
                    break
            i += 1
        res = LIST.ll_newlist(count)
        items = res.ll_items()
        i = 0
        j = 0
        resindex = 0
        if max == 0:
            j = strlen
        while j < strlen:
            if chars[j] == c:
                item = items[resindex] = s.malloc(j - i)
                item.copy_contents(s, item, i, 0, j - i)
                resindex += 1
                i = j + 1
                if max >= 0 and resindex >= max:
                    j = strlen
                    break
            j += 1
        item = items[resindex] = s.malloc(j - i)
        item.copy_contents(s, item, i, 0, j - i)
        return res

    @staticmethod
    def ll_split(LIST, s, c, max):
        count = 1
        if max == -1:
            max = len(s.chars)
        pos = 0
        last = len(s.chars)
        markerlen = len(c.chars)
        pos = s.find(c, 0, last)
        while pos >= 0 and count <= max:
            pos = s.find(c, pos + markerlen, last)
            count += 1
        res = LIST.ll_newlist(count)
        items = res.ll_items()
        pos = 0
        count = 0
        pos = s.find(c, 0, last)
        prev_pos = 0
        if pos < 0:
            items[0] = s
            return res
        while pos >= 0 and count < max:
            item = items[count] = s.malloc(pos - prev_pos)
            item.copy_contents(s, item, prev_pos, 0, pos -
                               prev_pos)
            count += 1
            prev_pos = pos + markerlen
            pos = s.find(c, pos + markerlen, last)
        item = items[count] = s.malloc(last - prev_pos)
        item.copy_contents(s, item, prev_pos, 0, last - prev_pos)
        return res

    @staticmethod
    def ll_rsplit_chr(LIST, s, c, max):
        chars = s.chars
        strlen = len(chars)
        count = 1
        i = 0
        if max == 0:
            i = strlen
        while i < strlen:
            if chars[i] == c:
                count += 1
                if max >= 0 and count > max:
                    break
            i += 1
        res = LIST.ll_newlist(count)
        items = res.ll_items()
        i = strlen
        j = strlen
        resindex = count - 1
        assert resindex >= 0
        if max == 0:
            j = 0
        while j > 0:
            j -= 1
            if chars[j] == c:
                item = items[resindex] = s.malloc(i - j - 1)
                item.copy_contents(s, item, j + 1, 0, i - j - 1)
                resindex -= 1
                i = j
                if resindex == 0:
                    j = 0
                    break
        item = items[resindex] = s.malloc(i - j)
        item.copy_contents(s, item, j, 0, i - j)
        return res

    @staticmethod
    def ll_rsplit(LIST, s, c, max):
        count = 1
        if max == -1:
            max = len(s.chars)
        pos = len(s.chars)
        markerlen = len(c.chars)
        pos = s.rfind(c, 0, pos)
        while pos >= 0 and count <= max:
            pos = s.rfind(c, 0, pos - markerlen)
            count += 1
        res = LIST.ll_newlist(count)
        items = res.ll_items()
        pos = 0
        pos = len(s.chars)
        prev_pos = pos
        pos = s.rfind(c, 0, pos)
        if pos < 0:
            items[0] = s
            return res
        count -= 1
        while pos >= 0 and count > 0:
            item = items[count] = s.malloc(prev_pos - pos - markerlen)
            item.copy_contents(s, item, pos + markerlen, 0,
                               prev_pos - pos - markerlen)
            count -= 1
            prev_pos = pos
            pos = s.rfind(c, 0, pos)
        item = items[count] = s.malloc(prev_pos)
        item.copy_contents(s, item, 0, 0, prev_pos)
        return res

    @staticmethod
    @jit.elidable
    def ll_replace_chr_chr(s, c1, c2):
        length = len(s.chars)
        newstr = s.malloc(length)
        src = s.chars
        dst = newstr.chars
        j = 0
        while j < length:
            c = src[j]
            if c == c1:
                c = c2
            dst[j] = c
            j += 1
        return newstr

    @staticmethod
    @jit.elidable
    def ll_contains(s, c):
        chars = s.chars
        strlen = len(chars)
        i = 0
        while i < strlen:
            if chars[i] == c:
                return True
            i += 1
        return False

    @staticmethod
    @jit.elidable
    def ll_int(s, base):
        if not 2 <= base <= 36:
            raise ValueError
        chars = s.chars
        strlen = len(chars)
        i = 0
        #XXX: only space is allowed as white space for now
        while i < strlen and chars[i] == ' ':
            i += 1
        if not i < strlen:
            raise ValueError
        #check sign
        sign = 1
        if chars[i] == '-':
            sign = -1
            i += 1
        elif chars[i] == '+':
            i += 1
        # skip whitespaces between sign and digits
        while i < strlen and chars[i] == ' ':
            i += 1
        #now get digits
        val = 0
        oldpos = i
        while i < strlen:
            c = ord(chars[i])
            if ord('a') <= c <= ord('z'):
                digit = c - ord('a') + 10
            elif ord('A') <= c <= ord('Z'):
                digit = c - ord('A') + 10
            elif ord('0') <= c <= ord('9'):
                digit = c - ord('0')
            else:
                break
            if digit >= base:
                break
            val = val * base + digit
            i += 1
        if i == oldpos:
            raise ValueError # catch strings like '+' and '+  '
        #skip trailing whitespace
        while i < strlen and chars[i] == ' ':
            i += 1
        if not i == strlen:
            raise ValueError
        return sign * val

    # interface to build strings:
    #   x = ll_build_start(n)
    #   ll_build_push(x, next_string, 0)
    #   ll_build_push(x, next_string, 1)
    #   ...
    #   ll_build_push(x, next_string, n-1)
    #   s = ll_build_finish(x)

    @staticmethod
    def ll_build_start(parts_count):
        return malloc(TEMP, parts_count)

    @staticmethod
    def ll_build_push(builder, next_string, index):
        builder[index] = next_string

    @staticmethod
    def ll_build_finish(builder):
        return LLHelpers.ll_join_strs(len(builder), builder)

    @staticmethod
    @specialize.memo()
    def ll_constant(s):
        return string_repr.convert_const(s)

    @staticmethod
    @specialize.memo()
    def ll_constant_unicode(s):
        return unicode_repr.convert_const(s)

    @classmethod
    def do_stringformat(cls, hop, sourcevarsrepr):
        s_str = hop.args_s[0]
        assert s_str.is_constant()
        is_unicode = isinstance(s_str, annmodel.SomeUnicodeString)
        if is_unicode:
            TEMPBUF = TEMP_UNICODE
        else:
            TEMPBUF = TEMP
        s = s_str.const
        things = cls.parse_fmt_string(s)
        size = inputconst(Signed, len(things)) # could be unsigned?
        cTEMP = inputconst(Void, TEMPBUF)
        cflags = inputconst(Void, {'flavor': 'gc'})
        vtemp = hop.genop("malloc_varsize", [cTEMP, cflags, size],
                          resulttype=Ptr(TEMPBUF))

        argsiter = iter(sourcevarsrepr)

        from rpython.rtyper.rclass import InstanceRepr
        for i, thing in enumerate(things):
            if isinstance(thing, tuple):
                code = thing[0]
                vitem, r_arg = argsiter.next()
                if not hasattr(r_arg, 'll_str'):
                    raise TyperError("ll_str unsupported for: %r" % r_arg)
                if code == 's':
                    if is_unicode:
                        # only UniCharRepr and UnicodeRepr has it so far
                        vchunk = hop.gendirectcall(r_arg.ll_unicode, vitem)
                    else:
                        vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                elif code == 'r' and isinstance(r_arg, InstanceRepr):
                    vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                elif code == 'd':
                    assert isinstance(r_arg, IntegerRepr)
                    #vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                    vchunk = hop.gendirectcall(ll_str.ll_int2dec, vitem)
                elif code == 'f':
                    #assert isinstance(r_arg, FloatRepr)
                    vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                elif code == 'x':
                    assert isinstance(r_arg, IntegerRepr)
                    vchunk = hop.gendirectcall(ll_str.ll_int2hex, vitem,
                                               inputconst(Bool, False))
                elif code == 'o':
                    assert isinstance(r_arg, IntegerRepr)
                    vchunk = hop.gendirectcall(ll_str.ll_int2oct, vitem,
                                               inputconst(Bool, False))
                else:
                    raise TyperError("%%%s is not RPython" % (code,))
            else:
                if is_unicode:
                    vchunk = inputconst(unicode_repr, thing)
                else:
                    vchunk = inputconst(string_repr, thing)
            i = inputconst(Signed, i)
            if is_unicode and vchunk.concretetype != Ptr(UNICODE):
                # if we are here, one of the ll_str.* functions returned some
                # STR, so we convert it to unicode. It's a bit suboptimal
                # because we do one extra copy.
                vchunk = hop.gendirectcall(cls.ll_str2unicode, vchunk)
            hop.genop('setarrayitem', [vtemp, i, vchunk])

        hop.exception_cannot_occur()   # to ignore the ZeroDivisionError of '%'
        return hop.gendirectcall(cls.ll_join_strs, size, vtemp)

    @staticmethod
    @jit.dont_look_inside
    def ll_string2list(RESLIST, src):
        length = len(src.chars)
        lst = RESLIST.ll_newlist(length)
        dst = lst.ll_items()
        SRC = typeOf(src).TO     # STR or UNICODE
        DST = typeOf(dst).TO     # GcArray
        assert DST.OF is SRC.chars.OF
        # from here, no GC operations can happen
        asrc = llmemory.cast_ptr_to_adr(src) + (
            llmemory.offsetof(SRC, 'chars') +
            llmemory.itemoffsetof(SRC.chars, 0))
        adst = llmemory.cast_ptr_to_adr(dst) + llmemory.itemoffsetof(DST, 0)
        llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(DST.OF) * length)
        # end of "no GC" section
        keepalive_until_here(src)
        keepalive_until_here(dst)
        return lst

TEMP = GcArray(Ptr(STR))
TEMP_UNICODE = GcArray(Ptr(UNICODE))

# ____________________________________________________________

STR.become(GcStruct('rpy_string', ('hash',  Signed),
                    ('chars', Array(Char, hints={'immutable': True,
                                    'extra_item_after_alloc': 1})),
                    adtmeths={'malloc' : staticAdtMethod(mallocstr),
                              'empty'  : staticAdtMethod(emptystrfun),
                              'copy_contents' : staticAdtMethod(copy_string_contents),
                              'copy_contents_from_str' : staticAdtMethod(copy_string_contents),
                              'gethash': LLHelpers.ll_strhash,
                              'length': LLHelpers.ll_length,
                              'find': LLHelpers.ll_find,
                              'rfind': LLHelpers.ll_rfind}))
UNICODE.become(GcStruct('rpy_unicode', ('hash', Signed),
                        ('chars', Array(UniChar, hints={'immutable': True})),
                        adtmeths={'malloc' : staticAdtMethod(mallocunicode),
                                  'empty'  : staticAdtMethod(emptyunicodefun),
                                  'copy_contents' : staticAdtMethod(copy_unicode_contents),
                                  'copy_contents_from_str' : staticAdtMethod(copy_unicode_contents),
                                  'gethash': LLHelpers.ll_strhash,
                                  'length': LLHelpers.ll_length}
                        ))


# TODO: make the public interface of the rstr module cleaner
ll_strconcat = LLHelpers.ll_strconcat
ll_join = LLHelpers.ll_join
ll_str2unicode = LLHelpers.ll_str2unicode
do_stringformat = LLHelpers.do_stringformat

string_repr = StringRepr()
char_repr = CharRepr()
unichar_repr = UniCharRepr()
char_repr.ll = LLHelpers
unichar_repr.ll = LLHelpers
unicode_repr = UnicodeRepr()
emptystr = string_repr.convert_const("")
emptyunicode = unicode_repr.convert_const(u'')

StringRepr.repr = string_repr
UnicodeRepr.repr = unicode_repr
UniCharRepr.repr = unicode_repr
UniCharRepr.char_repr = unichar_repr
UnicodeRepr.char_repr = unichar_repr
CharRepr.char_repr = char_repr
StringRepr.char_repr = char_repr

class BaseStringIteratorRepr(AbstractStringIteratorRepr):

    def __init__(self):
        self.ll_striter = ll_striter
        self.ll_strnext = ll_strnext
        self.ll_getnextindex = ll_getnextindex

class StringIteratorRepr(BaseStringIteratorRepr):

    external_item_repr = char_repr
    lowleveltype = Ptr(GcStruct('stringiter',
                                ('string', string_repr.lowleveltype),
                                ('length', Signed),
                                ('index', Signed)))

class UnicodeIteratorRepr(BaseStringIteratorRepr):

    external_item_repr = unichar_repr
    lowleveltype = Ptr(GcStruct('unicodeiter',
                                ('string', unicode_repr.lowleveltype),
                                ('length', Signed),
                                ('index', Signed)))

def ll_striter(string):
    if typeOf(string) == string_repr.lowleveltype:
        TP = string_repr.iterator_repr.lowleveltype.TO
    elif typeOf(string) == unicode_repr.lowleveltype:
        TP = unicode_repr.iterator_repr.lowleveltype.TO
    else:
        raise TypeError("Unknown string type %s" % (typeOf(string),))
    iter = malloc(TP)
    iter.string = string
    iter.length = len(string.chars)    # load this value only once
    iter.index = 0
    return iter

def ll_strnext(iter):
    index = iter.index
    if index >= iter.length:
        raise StopIteration
    iter.index = index + 1
    return iter.string.chars[index]

def ll_getnextindex(iter):
    return iter.index

string_repr.iterator_repr = StringIteratorRepr()
unicode_repr.iterator_repr = UnicodeIteratorRepr()

# these should be in rclass, but circular imports prevent (also it's
# not that insane that a string constant is built in this file).

instance_str_prefix = string_repr.convert_const("<")
instance_str_infix  = string_repr.convert_const(" object at 0x")
instance_str_suffix = string_repr.convert_const(">")

null_str = string_repr.convert_const("NULL")

unboxed_instance_str_prefix = string_repr.convert_const("<unboxed ")
unboxed_instance_str_suffix = string_repr.convert_const(">")
Tech Fingerprint

Alerts (97)

'def' Ensure functions have docstrings for documentation
34 36 47 50 124 153 170 176 202 221 236 243 276 280 289 312 328 333 340 352 362 373 388 392 398 418 439 460 481 496 510 546 571 591 607 614 631 639 651 663 676 692 707 725 817 850 885 890 902 907 942 974 1010 1043 1059 1071 1127 1131 1135 1140 1145 1149 1217 1308 1321 1328
'isinstance(' Overuse may indicate design issues; consider polymorphism
157 1152 1169 1180 1183 1187 1190 1194
Complexity hotspot; lines 424 to 425 (total complexity: 3)
424 425
Complexity hotspot; lines 427 to 428 (total complexity: 3)
427 428
Complexity hotspot; lines 445 to 446 (total complexity: 3)
445 446
Complexity hotspot; lines 448 to 449 (total complexity: 3)
448 449
Complexity hotspot; lines 466 to 467 (total complexity: 3)
466 467
Complexity hotspot; lines 469 to 470 (total complexity: 3)
469 470
Complexity hotspot; lines 759 to 761 (total complexity: 3)
759 760 761
Complexity hotspot; lines 795 to 797 (total complexity: 3)
795 796 797
Complexity hotspot; lines 805 to 806 (total complexity: 3)
805 806
Complexity hotspot; lines 894 to 896 (total complexity: 3)
894 895 896