rstr.py | searchcode

/pypy/rpython/lltypesystem/rstr.py

https://bitbucket.org/pypy/pypy/
Python | 1091 lines | 934 code | 106 blank | 51 comment | 208 complexity | 56e3e7da9fc72bca43724cd845ad14e3 MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0

from weakref import WeakValueDictionary
from pypy.tool.pairtype import pairtype
from pypy.rpython.error import TyperError
from pypy.rlib.objectmodel import malloc_zero_filled, we_are_translated
from pypy.rlib.objectmodel import _hash_string, enforceargs
from pypy.rlib.objectmodel import keepalive_until_here
from pypy.rlib.debug import ll_assert
from pypy.rlib import jit
from pypy.rlib.rarithmetic import ovfcheck
from pypy.rpython.robject import PyObjRepr, pyobj_repr
from pypy.rpython.rmodel import inputconst, IntegerRepr
from pypy.rpython.rstr import AbstractStringRepr,AbstractCharRepr,\
     AbstractUniCharRepr, AbstractStringIteratorRepr,\
     AbstractLLHelpers, AbstractUnicodeRepr
from pypy.rpython.lltypesystem import ll_str
from pypy.rpython.lltypesystem.lltype import \
     GcStruct, Signed, Array, Char, UniChar, Ptr, malloc, \
     Bool, Void, GcArray, nullptr, pyobjectptr, cast_primitive, typeOf,\
     staticAdtMethod, GcForwardReference
from pypy.rpython.rmodel import Repr
from pypy.rpython.lltypesystem import llmemory
from pypy.tool.sourcetools import func_with_new_name
from pypy.rpython.lltypesystem.lloperation import llop

# ____________________________________________________________
#
#  Concrete implementation of RPython strings:
#
#    struct str {
#        hash: Signed
#        chars: array of Char
#    }

STR = GcForwardReference()
UNICODE = GcForwardReference()

def new_malloc(TP, name):
    def mallocstr(length):
        ll_assert(length >= 0, "negative string length")
        r = malloc(TP, length)
        if not we_are_translated() or not malloc_zero_filled:
            r.hash = 0
        return r
    mallocstr._annspecialcase_ = 'specialize:semierased'
    return func_with_new_name(mallocstr, name)

mallocstr = new_malloc(STR, 'mallocstr')
mallocunicode = new_malloc(UNICODE, 'mallocunicode')

def emptystrfun():
    return emptystr

def emptyunicodefun():
    return emptyunicode

def _new_copy_contents_fun(TP, CHAR_TP, name):
    def _str_ofs(item):
        return (llmemory.offsetof(TP, 'chars') +
                llmemory.itemoffsetof(TP.chars, 0) +
                llmemory.sizeof(CHAR_TP) * item)

    @jit.oopspec('stroruni.copy_contents(src, dst, srcstart, dststart, length)')
    @enforceargs(None, None, int, int, int)
    def copy_string_contents(src, dst, srcstart, dststart, length):
        assert srcstart >= 0
        assert dststart >= 0
        assert length >= 0
        src = llmemory.cast_ptr_to_adr(src) + _str_ofs(srcstart)
        dst = llmemory.cast_ptr_to_adr(dst) + _str_ofs(dststart)
        llmemory.raw_memcopy(src, dst, llmemory.sizeof(CHAR_TP) * length)
        keepalive_until_here(src)
        keepalive_until_here(dst)
    copy_string_contents._always_inline_ = True
    return func_with_new_name(copy_string_contents, 'copy_%s_contents' % name)

copy_string_contents = _new_copy_contents_fun(STR, Char, 'string')
copy_unicode_contents = _new_copy_contents_fun(UNICODE, UniChar, 'unicode')

SIGNED_ARRAY = GcArray(Signed)
CONST_STR_CACHE = WeakValueDictionary()
CONST_UNICODE_CACHE = WeakValueDictionary()

class BaseLLStringRepr(Repr):
    def convert_const(self, value):
        if value is None:
            return nullptr(self.lowleveltype.TO)
        #value = getattr(value, '__self__', value)  # for bound string methods
        if not isinstance(value, self.basetype):
            raise TyperError("not a str: %r" % (value,))
        try:
            return self.CACHE[value]
        except KeyError:
            p = self.malloc(len(value))
            for i in range(len(value)):
                p.chars[i] = cast_primitive(self.base, value[i])
            p.hash = 0
            self.ll.ll_strhash(p)   # precompute the hash
            self.CACHE[value] = p
            return p

    def make_iterator_repr(self):
        return self.repr.iterator_repr

    def can_ll_be_null(self, s_value):
        # XXX unicode
        if self is string_repr:
            return s_value.can_be_none()
        else:
            return True     # for CharRepr/UniCharRepr subclasses,
                            # where NULL is always valid: it is chr(0)


    def _list_length_items(self, hop, v_lst, LIST):
        LIST = LIST.TO
        v_length = hop.gendirectcall(LIST.ll_length, v_lst)
        v_items = hop.gendirectcall(LIST.ll_items, v_lst)
        return v_length, v_items

class StringRepr(BaseLLStringRepr, AbstractStringRepr):
    lowleveltype = Ptr(STR)
    basetype = str
    base = Char
    CACHE = CONST_STR_CACHE

    def __init__(self, *args):
        AbstractStringRepr.__init__(self, *args)
        self.ll = LLHelpers
        self.malloc = mallocstr

    def ll_decode_latin1(self, value):
        lgt = len(value.chars)
        s = mallocunicode(lgt)
        for i in range(lgt):
            s.chars[i] = cast_primitive(UniChar, value.chars[i])
        return s

class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
    lowleveltype = Ptr(UNICODE)
    basetype = basestring
    base = UniChar
    CACHE = CONST_UNICODE_CACHE

    def __init__(self, *args):
        AbstractUnicodeRepr.__init__(self, *args)
        self.ll = LLHelpers
        self.malloc = mallocunicode

    @jit.elidable
    def ll_str(self, s):
        # XXX crazy that this is here, but I don't want to break
        #     rmodel logic
        if not s:
            return self.ll.ll_constant('None')
        lgt = len(s.chars)
        result = mallocstr(lgt)
        for i in range(lgt):
            c = s.chars[i]
            if ord(c) > 127:
                raise UnicodeEncodeError("character not in ascii range")
            result.chars[i] = cast_primitive(Char, c)
        return result

    @jit.elidable
    def ll_encode_latin1(self, s):
        length = len(s.chars)
        result = mallocstr(length)
        for i in range(length):
            c = s.chars[i]
            if ord(c) > 255:
                raise UnicodeEncodeError("character not in latin1 range")
            result.chars[i] = cast_primitive(Char, c)
        return result

class CharRepr(AbstractCharRepr, StringRepr):
    lowleveltype = Char

class UniCharRepr(AbstractUniCharRepr, UnicodeRepr):
    lowleveltype = UniChar

class __extend__(pairtype(PyObjRepr, AbstractStringRepr)):
    def convert_from_to((r_from, r_to), v, llops):
        v_len = llops.gencapicall('PyString_Size', [v], resulttype=Signed)
        cstr = inputconst(Void, STR)
        cflags = inputconst(Void, {'flavor': 'gc'})
        v_result = llops.genop('malloc_varsize', [cstr, cflags, v_len],
                               resulttype=Ptr(STR))
        llops.gencapicall('PyString_ToRPyString', [v, v_result])
        string_repr = llops.rtyper.type_system.rstr.string_repr
        v_result = llops.convertvar(v_result, string_repr, r_to)
        return v_result


class __extend__(pairtype(AbstractStringRepr, PyObjRepr)):
    def convert_from_to((r_from, r_to), v, llops):
        string_repr = llops.rtyper.type_system.rstr.string_repr
        v = llops.convertvar(v, r_from, string_repr)
        cchars = inputconst(Void, "chars")
        # xxx put in table
        return llops.gencapicall(
            'PyString_FromRPyString',
            [v],
            resulttype=pyobj_repr,
            _callable=lambda v: pyobjectptr(''.join(v.chars)))

class __extend__(pairtype(AbstractUnicodeRepr, PyObjRepr)):
    def convert_from_to((r_from, r_to), v, llops):
        unicode_repr = llops.rtyper.type_system.rstr.unicode_repr
        v = llops.convertvar(v, r_from, unicode_repr)
        cchars = inputconst(Void, "chars")
        # xxx put in table
        return llops.gencapicall(
            'PyUnicode_FromRPyUnicode',
            [v],
            resulttype=pyobj_repr,
            _callable=lambda v: pyobjectptr(u''.join(v.chars)))

# ____________________________________________________________
#
#  Low-level methods.  These can be run for testing, but are meant to
#  be direct_call'ed from rtyped flow graphs, which means that they will
#  get flowed and annotated, mostly with SomePtr.
#

def ll_construct_restart_positions(s, l):
    # Construct the array of possible restarting positions
    # T = Array_of_ints [-1..len2]
    # T[-1] = -1 s2.chars[-1] is supposed to be unequal to everything else
    T = malloc( SIGNED_ARRAY, l)
    T[0] = 0
    i = 1
    j = 0
    while i<l:
        if s.chars[i] == s.chars[j]:
            j += 1
            T[i] = j
            i += 1
        elif j>0:
            j = T[j-1]
        else:
            T[i] = 0
            i += 1
            j = 0
    return T


FAST_COUNT = 0
FAST_FIND = 1
FAST_RFIND = 2


from pypy.rlib.rarithmetic import LONG_BIT as BLOOM_WIDTH


def bloom_add(mask, c):
    return mask | (1 << (ord(c) & (BLOOM_WIDTH - 1)))

def bloom(mask, c):
    return mask & (1 << (ord(c) & (BLOOM_WIDTH - 1)))


class LLHelpers(AbstractLLHelpers):
    @jit.elidable
    def ll_str_mul(s, times):
        if times < 0:
            times = 0
        try:
            size = ovfcheck(len(s.chars) * times)
        except OverflowError:
            raise MemoryError
        newstr = s.malloc(size)
        i = 0
        if i < size:
            s.copy_contents(s, newstr, 0, 0, len(s.chars))
            i += len(s.chars)
        while i < size:
            if i <= size - i:
                j = i
            else:
                j = size - i
            s.copy_contents(newstr, newstr, 0, i, j)
            i += j
        return newstr

    @jit.elidable
    def ll_char_mul(ch, times):
        if typeOf(ch) is Char:
            malloc = mallocstr
        else:
            malloc = mallocunicode
        if times < 0:
            times = 0
        newstr = malloc(times)
        j = 0
        # XXX we can use memset here, not sure how useful this is
        while j < times:
            newstr.chars[j] = ch
            j += 1
        return newstr

    def ll_strlen(s):
        return len(s.chars)

    def ll_stritem_nonneg(s, i):
        chars = s.chars
        ll_assert(i>=0, "negative str getitem index")
        ll_assert(i<len(chars), "str getitem index out of bound")
        return chars[i]
    ll_stritem_nonneg._annenforceargs_ = [None, int]

    def ll_chr2str(ch):
        if typeOf(ch) is Char:
            malloc = mallocstr
        else:
            malloc = mallocunicode
        s = malloc(1)
        s.chars[0] = ch
        return s

    # @jit.look_inside_iff(lambda str: jit.isconstant(len(str.chars)) and len(str.chars) == 1)
    @jit.oopspec("str.str2unicode(str)")
    def ll_str2unicode(str):
        lgt = len(str.chars)
        s = mallocunicode(lgt)
        for i in range(lgt):
            if ord(str.chars[i]) > 127:
                raise UnicodeDecodeError
            s.chars[i] = cast_primitive(UniChar, str.chars[i])
        return s

    @jit.elidable
    def ll_strhash(s):
        # unlike CPython, there is no reason to avoid to return -1
        # but our malloc initializes the memory to zero, so we use zero as the
        # special non-computed-yet value.
        if not s:
            return 0
        x = s.hash
        if x == 0:
            x = _hash_string(s.chars)
            if x == 0:
                x = 29872897
            s.hash = x
        return x

    def ll_strfasthash(s):
        return s.hash     # assumes that the hash is already computed

    @jit.elidable
    def ll_strconcat(s1, s2):
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        # a single '+' like this is allowed to overflow: it gets
        # a negative result, and the gc will complain
        newstr = s1.malloc(len1 + len2)
        s1.copy_contents(s1, newstr, 0, 0, len1)
        s1.copy_contents(s2, newstr, 0, len1, len2)
        return newstr
    ll_strconcat.oopspec = 'stroruni.concat(s1, s2)'

    @jit.elidable
    def ll_strip(s, ch, left, right):
        s_len = len(s.chars)
        if s_len == 0:
            return s.empty()
        lpos = 0
        rpos = s_len - 1
        if left:
            while lpos < rpos and s.chars[lpos] == ch:
                lpos += 1
        if right:
            while lpos < rpos + 1 and s.chars[rpos] == ch:
                rpos -= 1
        if rpos < lpos:
            return s.empty()
        r_len = rpos - lpos + 1
        result = s.malloc(r_len)
        s.copy_contents(s, result, lpos, 0, r_len)
        return result

    @jit.elidable
    def ll_upper(s):
        s_chars = s.chars
        s_len = len(s_chars)
        if s_len == 0:
            return s.empty()
        i = 0
        result = mallocstr(s_len)
        #        ^^^^^^^^^ specifically to explode on unicode
        while i < s_len:
            ch = s_chars[i]
            if 'a' <= ch <= 'z':
                ch = chr(ord(ch) - 32)
            result.chars[i] = ch
            i += 1
        return result

    @jit.elidable
    def ll_lower(s):
        s_chars = s.chars
        s_len = len(s_chars)
        if s_len == 0:
            return s.empty()
        i = 0
        result = mallocstr(s_len)
        #        ^^^^^^^^^ specifically to explode on unicode
        while i < s_len:
            ch = s_chars[i]
            if 'A' <= ch <= 'Z':
                ch = chr(ord(ch) + 32)
            result.chars[i] = ch
            i += 1
        return result

    def ll_join(s, length, items):
        s_chars = s.chars
        s_len = len(s_chars)
        num_items = length
        if num_items == 0:
            return s.empty()
        itemslen = 0
        i = 0
        while i < num_items:
            try:
                itemslen = ovfcheck(itemslen + len(items[i].chars))
            except OverflowError:
                raise MemoryError
            i += 1
        try:
            seplen = ovfcheck(s_len * (num_items - 1))
        except OverflowError:
            raise MemoryError
        # a single '+' at the end is allowed to overflow: it gets
        # a negative result, and the gc will complain
        result = s.malloc(itemslen + seplen)
        res_index = len(items[0].chars)
        s.copy_contents(items[0], result, 0, 0, res_index)
        i = 1
        while i < num_items:
            s.copy_contents(s, result, 0, res_index, s_len)
            res_index += s_len
            lgt = len(items[i].chars)
            s.copy_contents(items[i], result, 0, res_index, lgt)
            res_index += lgt
            i += 1
        return result

    @jit.elidable
    def ll_strcmp(s1, s2):
        if not s1 and not s2:
            return True
        if not s1 or not s2:
            return False
        chars1 = s1.chars
        chars2 = s2.chars
        len1 = len(chars1)
        len2 = len(chars2)

        if len1 < len2:
            cmplen = len1
        else:
            cmplen = len2
        i = 0
        while i < cmplen:
            diff = ord(chars1[i]) - ord(chars2[i])
            if diff != 0:
                return diff
            i += 1
        return len1 - len2

    @jit.elidable
    def ll_streq(s1, s2):
        if s1 == s2:       # also if both are NULLs
            return True
        if not s1 or not s2:
            return False
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        if len1 != len2:
            return False
        j = 0
        chars1 = s1.chars
        chars2 = s2.chars
        while j < len1:
            if chars1[j] != chars2[j]:
                return False
            j += 1
        return True
    ll_streq.oopspec = 'stroruni.equal(s1, s2)'

    @jit.elidable
    def ll_startswith(s1, s2):
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        if len1 < len2:
            return False
        j = 0
        chars1 = s1.chars
        chars2 = s2.chars
        while j < len2:
            if chars1[j] != chars2[j]:
                return False
            j += 1

        return True

    def ll_startswith_char(s, ch):
        if not len(s.chars):
            return False
        return s.chars[0] == ch

    @jit.elidable
    def ll_endswith(s1, s2):
        len1 = len(s1.chars)
        len2 = len(s2.chars)
        if len1 < len2:
            return False
        j = 0
        chars1 = s1.chars
        chars2 = s2.chars
        offset = len1 - len2
        while j < len2:
            if chars1[offset + j] != chars2[j]:
                return False
            j += 1

        return True

    def ll_endswith_char(s, ch):
        if not len(s.chars):
            return False
        return s.chars[len(s.chars) - 1] == ch

    @jit.elidable
    def ll_find_char(s, ch, start, end):
        i = start
        if end > len(s.chars):
            end = len(s.chars)
        while i < end:
            if s.chars[i] == ch:
                return i
            i += 1
        return -1
    ll_find_char._annenforceargs_ = [None, None, int, int]

    @jit.elidable
    def ll_rfind_char(s, ch, start, end):
        if end > len(s.chars):
            end = len(s.chars)
        i = end
        while i > start:
            i -= 1
            if s.chars[i] == ch:
                return i
        return -1

    @jit.elidable
    def ll_count_char(s, ch, start, end):
        count = 0
        i = start
        if end > len(s.chars):
            end = len(s.chars)
        while i < end:
            if s.chars[i] == ch:
                count += 1
            i += 1
        return count

    @classmethod
    def ll_find(cls, s1, s2, start, end):
        if start < 0:
            start = 0
        if end > len(s1.chars):
            end = len(s1.chars)
        if end - start < 0:
            return -1

        m = len(s2.chars)
        if m == 0:
            return start
        elif m == 1:
            return cls.ll_find_char(s1, s2.chars[0], start, end)

        return cls.ll_search(s1, s2, start, end, FAST_FIND)

    @classmethod
    def ll_rfind(cls, s1, s2, start, end):
        if start < 0:
            start = 0
        if end > len(s1.chars):
            end = len(s1.chars)
        if end - start < 0:
            return -1

        m = len(s2.chars)
        if m == 0:
            return end
        elif m == 1:
            return cls.ll_rfind_char(s1, s2.chars[0], start, end)

        return cls.ll_search(s1, s2, start, end, FAST_RFIND)

    @classmethod
    def ll_count(cls, s1, s2, start, end):
        if start < 0:
            start = 0
        if end > len(s1.chars):
            end = len(s1.chars)
        if end - start < 0:
            return 0

        m = len(s2.chars)
        if m == 0:
            return end - start + 1
        elif m == 1:
            return cls.ll_count_char(s1, s2.chars[0], start, end)

        res = cls.ll_search(s1, s2, start, end, FAST_COUNT)
        # For a few cases ll_search can return -1 to indicate an "impossible"
        # condition for a string match, count just returns 0 in these cases.
        if res < 0:
            res = 0
        return res

    @jit.elidable
    def ll_search(s1, s2, start, end, mode):
        count = 0
        n = end - start
        m = len(s2.chars)

        w = n - m

        if w < 0:
            return -1

        mlast = m - 1
        skip = mlast - 1
        mask = 0

        if mode != FAST_RFIND:
            for i in range(mlast):
                mask = bloom_add(mask, s2.chars[i])
                if s2.chars[i] == s2.chars[mlast]:
                    skip = mlast - i - 1
            mask = bloom_add(mask, s2.chars[mlast])

            i = start - 1
            while i + 1 <= start + w:
                i += 1
                if s1.chars[i+m-1] == s2.chars[m-1]:
                    for j in range(mlast):
                        if s1.chars[i+j] != s2.chars[j]:
                            break
                    else:
                        if mode != FAST_COUNT:
                            return i
                        count += 1
                        i += mlast
                        continue

                    if i + m < len(s1.chars):
                        c = s1.chars[i + m]
                    else:
                        c = '\0'
                    if not bloom(mask, c):
                        i += m
                    else:
                        i += skip
                else:
                    if i + m < len(s1.chars):
                        c = s1.chars[i + m]
                    else:
                        c = '\0'
                    if not bloom(mask, c):
                        i += m
        else:
            mask = bloom_add(mask, s2.chars[0])
            for i in range(mlast, 0, -1):
                mask = bloom_add(mask, s2.chars[i])
                if s2.chars[i] == s2.chars[0]:
                    skip = i - 1

            i = start + w + 1
            while i - 1 >= start:
                i -= 1
                if s1.chars[i] == s2.chars[0]:
                    for j in xrange(mlast, 0, -1):
                        if s1.chars[i+j] != s2.chars[j]:
                            break
                    else:
                        return i
                    if i-1 >= 0 and not bloom(mask, s1.chars[i-1]):
                        i -= m
                    else:
                        i -= skip
                else:
                    if i-1 >= 0 and not bloom(mask, s1.chars[i-1]):
                        i -= m

        if mode != FAST_COUNT:
            return -1
        return count

    @enforceargs(int, None)
    @jit.look_inside_iff(lambda length, items: jit.isconstant(length) and length <= 2)
    def ll_join_strs(length, items):
        # Special case for length 1 items, helps both the JIT and other code
        if length == 1:
            return items[0]

        num_items = length
        itemslen = 0
        i = 0
        while i < num_items:
            try:
                itemslen = ovfcheck(itemslen + len(items[i].chars))
            except OverflowError:
                raise MemoryError
            i += 1
        if typeOf(items).TO.OF.TO == STR:
            malloc = mallocstr
            copy_contents = copy_string_contents
        else:
            malloc = mallocunicode
            copy_contents = copy_unicode_contents
        result = malloc(itemslen)
        res_chars = result.chars
        res_index = 0
        i = 0
        while i < num_items:
            item_chars = items[i].chars
            item_len = len(item_chars)
            copy_contents(items[i], result, 0, res_index, item_len)
            res_index += item_len
            i += 1
        return result

    @jit.look_inside_iff(lambda length, chars, RES: jit.isconstant(length) and jit.isvirtual(chars))
    def ll_join_chars(length, chars, RES):
        # no need to optimize this, will be replaced by string builder
        # at some point soon
        num_chars = length
        if RES is StringRepr.lowleveltype:
            target = Char
            malloc = mallocstr
        else:
            target = UniChar
            malloc = mallocunicode
        result = malloc(num_chars)
        res_chars = result.chars
        i = 0
        while i < num_chars:
            res_chars[i] = cast_primitive(target, chars[i])
            i += 1
        return result

    @jit.elidable
    def _ll_stringslice(s1, start, stop):
        lgt = stop - start
        assert start >= 0
        assert lgt >= 0
        newstr = s1.malloc(lgt)
        s1.copy_contents(s1, newstr, start, 0, lgt)
        return newstr
    _ll_stringslice.oopspec = 'stroruni.slice(s1, start, stop)'
    _ll_stringslice._annenforceargs_ = [None, int, int]

    def ll_stringslice_startonly(s1, start):
        return LLHelpers._ll_stringslice(s1, start, len(s1.chars))

    def ll_stringslice_startstop(s1, start, stop):
        if jit.we_are_jitted():
            if stop > len(s1.chars):
                stop = len(s1.chars)
        else:
            if stop >= len(s1.chars):
                if start == 0:
                    return s1
                stop = len(s1.chars)
        return LLHelpers._ll_stringslice(s1, start, stop)

    def ll_stringslice_minusone(s1):
        newlen = len(s1.chars) - 1
        return LLHelpers._ll_stringslice(s1, 0, newlen)

    def ll_split_chr(LIST, s, c, max):
        chars = s.chars
        strlen = len(chars)
        count = 1
        i = 0
        if max == 0:
            i = strlen
        while i < strlen:
            if chars[i] == c:
                count += 1
                if max >= 0 and count > max:
                    break
            i += 1
        res = LIST.ll_newlist(count)
        items = res.ll_items()
        i = 0
        j = 0
        resindex = 0
        if max == 0:
            j = strlen
        while j < strlen:
            if chars[j] == c:
                item = items[resindex] = s.malloc(j - i)
                item.copy_contents(s, item, i, 0, j - i)
                resindex += 1
                i = j + 1
                if max >= 0 and resindex >= max:
                    j = strlen
                    break
            j += 1
        item = items[resindex] = s.malloc(j - i)
        item.copy_contents(s, item, i, 0, j - i)
        return res

    def ll_rsplit_chr(LIST, s, c, max):
        chars = s.chars
        strlen = len(chars)
        count = 1
        i = 0
        if max == 0:
            i = strlen
        while i < strlen:
            if chars[i] == c:
                count += 1
                if max >= 0 and count > max:
                    break
            i += 1
        res = LIST.ll_newlist(count)
        items = res.ll_items()
        i = strlen
        j = strlen
        resindex = count - 1
        assert resindex >= 0
        if max == 0:
            j = 0
        while j > 0:
            j -= 1
            if chars[j] == c:
                item = items[resindex] = s.malloc(i - j - 1)
                item.copy_contents(s, item, j + 1, 0, i - j - 1)
                resindex -= 1
                i = j
                if resindex == 0:
                    j = 0
                    break
        item = items[resindex] = s.malloc(i - j)
        item.copy_contents(s, item, j, 0, i - j)
        return res

    @jit.elidable
    def ll_replace_chr_chr(s, c1, c2):
        length = len(s.chars)
        newstr = s.malloc(length)
        src = s.chars
        dst = newstr.chars
        j = 0
        while j < length:
            c = src[j]
            if c == c1:
                c = c2
            dst[j] = c
            j += 1
        return newstr

    @jit.elidable
    def ll_contains(s, c):
        chars = s.chars
        strlen = len(chars)
        i = 0
        while i < strlen:
            if chars[i] == c:
                return True
            i += 1
        return False

    @jit.elidable
    def ll_int(s, base):
        if not 2 <= base <= 36:
            raise ValueError
        chars = s.chars
        strlen = len(chars)
        i = 0
        #XXX: only space is allowed as white space for now
        while i < strlen and chars[i] == ' ':
            i += 1
        if not i < strlen:
            raise ValueError
        #check sign
        sign = 1
        if chars[i] == '-':
            sign = -1
            i += 1
        elif chars[i] == '+':
            i += 1;
        # skip whitespaces between sign and digits
        while i < strlen and chars[i] == ' ':
            i += 1
        #now get digits
        val = 0
        oldpos = i
        while i < strlen:
            c = ord(chars[i])
            if ord('a') <= c <= ord('z'):
                digit = c - ord('a') + 10
            elif ord('A') <= c <= ord('Z'):
                digit = c - ord('A') + 10
            elif ord('0') <= c <= ord('9'):
                digit = c - ord('0')
            else:
                break
            if digit >= base:
                break
            val = val * base + digit
            i += 1
        if i == oldpos:
            raise ValueError # catch strings like '+' and '+  '
        #skip trailing whitespace
        while i < strlen and chars[i] == ' ':
            i += 1
        if not i == strlen:
            raise ValueError
        return sign * val

    # interface to build strings:
    #   x = ll_build_start(n)
    #   ll_build_push(x, next_string, 0)
    #   ll_build_push(x, next_string, 1)
    #   ...
    #   ll_build_push(x, next_string, n-1)
    #   s = ll_build_finish(x)

    def ll_build_start(parts_count):
        return malloc(TEMP, parts_count)

    def ll_build_push(builder, next_string, index):
        builder[index] = next_string

    def ll_build_finish(builder):
        return LLHelpers.ll_join_strs(len(builder), builder)

    def ll_constant(s):
        return string_repr.convert_const(s)
    ll_constant._annspecialcase_ = 'specialize:memo'

    def do_stringformat(cls, hop, sourcevarsrepr):
        s_str = hop.args_s[0]
        assert s_str.is_constant()
        s = s_str.const
        things = cls.parse_fmt_string(s)
        size = inputconst(Signed, len(things)) # could be unsigned?
        cTEMP = inputconst(Void, TEMP)
        cflags = inputconst(Void, {'flavor': 'gc'})
        vtemp = hop.genop("malloc_varsize", [cTEMP, cflags, size],
                          resulttype=Ptr(TEMP))

        argsiter = iter(sourcevarsrepr)

        InstanceRepr = hop.rtyper.type_system.rclass.InstanceRepr
        for i, thing in enumerate(things):
            if isinstance(thing, tuple):
                code = thing[0]
                vitem, r_arg = argsiter.next()
                if not hasattr(r_arg, 'll_str'):
                    raise TyperError("ll_str unsupported for: %r" % r_arg)
                if code == 's' or (code == 'r' and isinstance(r_arg, InstanceRepr)):
                    vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                elif code == 'd':
                    assert isinstance(r_arg, IntegerRepr)
                    #vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                    vchunk = hop.gendirectcall(ll_str.ll_int2dec, vitem)
                elif code == 'f':
                    #assert isinstance(r_arg, FloatRepr)
                    vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                elif code == 'x':
                    assert isinstance(r_arg, IntegerRepr)
                    vchunk = hop.gendirectcall(ll_str.ll_int2hex, vitem,
                                               inputconst(Bool, False))
                elif code == 'o':
                    assert isinstance(r_arg, IntegerRepr)
                    vchunk = hop.gendirectcall(ll_str.ll_int2oct, vitem,
                                               inputconst(Bool, False))
                else:
                    raise TyperError, "%%%s is not RPython" % (code, )
            else:
                from pypy.rpython.lltypesystem.rstr import string_repr
                vchunk = inputconst(string_repr, thing)
            i = inputconst(Signed, i)
            hop.genop('setarrayitem', [vtemp, i, vchunk])

        hop.exception_cannot_occur()   # to ignore the ZeroDivisionError of '%'
        return hop.gendirectcall(cls.ll_join_strs, size, vtemp)
    do_stringformat = classmethod(do_stringformat)

TEMP = GcArray(Ptr(STR))

# ____________________________________________________________

STR.become(GcStruct('rpy_string', ('hash',  Signed),
                    ('chars', Array(Char, hints={'immutable': True})),
                    adtmeths={'malloc' : staticAdtMethod(mallocstr),
                              'empty'  : staticAdtMethod(emptystrfun),
                              'copy_contents' : staticAdtMethod(copy_string_contents),
                              'gethash': LLHelpers.ll_strhash}))
UNICODE.become(GcStruct('rpy_unicode', ('hash', Signed),
                        ('chars', Array(UniChar, hints={'immutable': True})),
                        adtmeths={'malloc' : staticAdtMethod(mallocunicode),
                                  'empty'  : staticAdtMethod(emptyunicodefun),
                                  'copy_contents' : staticAdtMethod(copy_unicode_contents),
                                  'gethash': LLHelpers.ll_strhash}
                        ))


# TODO: make the public interface of the rstr module cleaner
ll_strconcat = LLHelpers.ll_strconcat
ll_join = LLHelpers.ll_join
ll_str2unicode = LLHelpers.ll_str2unicode
do_stringformat = LLHelpers.do_stringformat

string_repr = StringRepr()
char_repr = CharRepr()
unichar_repr = UniCharRepr()
char_repr.ll = LLHelpers
unichar_repr.ll = LLHelpers
unicode_repr = UnicodeRepr()
emptystr = string_repr.convert_const("")
emptyunicode = unicode_repr.convert_const(u'')

StringRepr.repr = string_repr
UnicodeRepr.repr = unicode_repr
UniCharRepr.repr = unicode_repr
UniCharRepr.char_repr = unichar_repr
UnicodeRepr.char_repr = unichar_repr
CharRepr.char_repr = char_repr
StringRepr.char_repr = char_repr

class BaseStringIteratorRepr(AbstractStringIteratorRepr):

    def __init__(self):
        self.ll_striter = ll_striter
        self.ll_strnext = ll_strnext

class StringIteratorRepr(BaseStringIteratorRepr):

    lowleveltype = Ptr(GcStruct('stringiter',
                                ('string', string_repr.lowleveltype),
                                ('index', Signed)))

class UnicodeIteratorRepr(BaseStringIteratorRepr):

    lowleveltype = Ptr(GcStruct('unicodeiter',
                                ('string', unicode_repr.lowleveltype),
                                ('index', Signed)))

def ll_striter(string):
    if typeOf(string) == string_repr.lowleveltype:
        TP = string_repr.iterator_repr.lowleveltype.TO
    elif typeOf(string) == unicode_repr.lowleveltype:
        TP = unicode_repr.iterator_repr.lowleveltype.TO
    else:
        raise TypeError("Unknown string type %s" % (typeOf(string),))
    iter = malloc(TP)
    iter.string = string
    iter.index = 0
    return iter

def ll_strnext(iter):
    chars = iter.string.chars
    index = iter.index
    if index >= len(chars):
        raise StopIteration
    iter.index = index + 1
    return chars[index]

string_repr.iterator_repr = StringIteratorRepr()
unicode_repr.iterator_repr = UnicodeIteratorRepr()

# these should be in rclass, but circular imports prevent (also it's
# not that insane that a string constant is built in this file).

instance_str_prefix = string_repr.convert_const("<")
instance_str_infix  = string_repr.convert_const(" object at 0x")
instance_str_suffix = string_repr.convert_const(">")

null_str = string_repr.convert_const("NULL")

unboxed_instance_str_prefix = string_repr.convert_const("<unboxed ")
unboxed_instance_str_suffix = string_repr.convert_const(">")