pypy /rpython/rtyper/lltypesystem/rbuilder.py

Language Python Lines 454
MD5 Hash baa488bba315457b3aa0f7cde44cfafc Estimated Cost $8,086 (why?)
Repository https://bitbucket.org/pypy/pypy/ View Raw File View Project SPDX
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
from rpython.rlib import rgc, jit
from rpython.rlib.objectmodel import enforceargs, dont_inline, always_inline
from rpython.rlib.rarithmetic import ovfcheck, r_uint, intmask
from rpython.rtyper.debug import ll_assert
from rpython.rlib.unroll import unrolling_iterable
from rpython.rtyper.rptr import PtrRepr
from rpython.rtyper.lltypesystem import lltype, rffi, rstr
from rpython.rtyper.lltypesystem.lltype import staticAdtMethod, nullptr
from rpython.rtyper.lltypesystem.rstr import (STR, UNICODE, char_repr,
    string_repr, unichar_repr, unicode_repr)
from rpython.rtyper.rbuilder import AbstractStringBuilderRepr
from rpython.tool.sourcetools import func_with_new_name
from rpython.rtyper.annlowlevel import llstr, llunicode



# ------------------------------------------------------------
# Basic idea:
#
# - A StringBuilder has a rstr.STR of the specified initial size
#   (100 by default), which is filled gradually.
#
# - When it is full, we allocate extra buffers as an extra rstr.STR,
#   and the already-filled one is added to a chained list of STRINGPIECE
#   objects.
#
# - At build() time, we consolidate all these pieces into a single
#   rstr.STR, which is both returned and re-attached to the StringBuilder,
#   replacing the STRINGPIECEs.
#
# - The data is copied at most twice, and only once in case it fits
#   into the initial size (and the GC supports shrinking the STR).
#
# XXX in build(), we could try keeping around a global weakref to the
# chain of STRINGPIECEs and reuse them the next time.
#
# ------------------------------------------------------------


STRINGPIECE = lltype.GcStruct('stringpiece',
    ('buf', lltype.Ptr(STR)),
    ('prev_piece', lltype.Ptr(lltype.GcForwardReference())))
STRINGPIECE.prev_piece.TO.become(STRINGPIECE)

STRINGBUILDER = lltype.GcStruct('stringbuilder',
    ('current_buf', lltype.Ptr(STR)),
    ('current_pos', lltype.Signed),
    ('current_end', lltype.Signed),
    ('total_size', lltype.Signed),
    ('extra_pieces', lltype.Ptr(STRINGPIECE)),
    adtmeths={
        'copy_string_contents': staticAdtMethod(rstr.copy_string_contents),
        'copy_raw_to_string': staticAdtMethod(rstr.copy_raw_to_string),
        'mallocfn': staticAdtMethod(rstr.mallocstr),
    }
)

UNICODEPIECE = lltype.GcStruct('unicodepiece',
    ('buf', lltype.Ptr(UNICODE)),
    ('prev_piece', lltype.Ptr(lltype.GcForwardReference())))
UNICODEPIECE.prev_piece.TO.become(UNICODEPIECE)

UNICODEBUILDER = lltype.GcStruct('unicodebuilder',
    ('current_buf', lltype.Ptr(UNICODE)),
    ('current_pos', lltype.Signed),
    ('current_end', lltype.Signed),
    ('total_size', lltype.Signed),
    ('extra_pieces', lltype.Ptr(UNICODEPIECE)),
    adtmeths={
        'copy_string_contents': staticAdtMethod(rstr.copy_unicode_contents),
        'copy_raw_to_string': staticAdtMethod(rstr.copy_raw_to_unicode),
        'mallocfn': staticAdtMethod(rstr.mallocunicode),
    }
)

# ------------------------------------------------------------
# The generic piece of code to append a string (or a slice of it)
# to a builder; it is inlined inside various functions below

@always_inline
def _ll_append(ll_builder, ll_str, start, size):
    pos = ll_builder.current_pos
    end = ll_builder.current_end
    if (end - pos) < size:
        ll_grow_and_append(ll_builder, ll_str, start, size)
    else:
        ll_builder.current_pos = pos + size
        ll_builder.copy_string_contents(ll_str, ll_builder.current_buf,
                                        start, pos, size)

# ------------------------------------------------------------
# Logic to grow a builder (by adding a new string to it)

@dont_inline
@enforceargs(None, int)
def ll_grow_by(ll_builder, needed):
    try:
        needed = ovfcheck(needed + ll_builder.total_size)
        needed = ovfcheck(needed + 63) & ~63
        total_size = ll_builder.total_size + needed
    except OverflowError:
        raise MemoryError
    #
    new_string = ll_builder.mallocfn(needed)
    #
    PIECE = lltype.typeOf(ll_builder.extra_pieces).TO
    old_piece = lltype.malloc(PIECE)
    old_piece.buf = ll_builder.current_buf
    old_piece.prev_piece = ll_builder.extra_pieces
    ll_assert(bool(old_piece.buf), "no buf??")
    ll_builder.current_buf = new_string
    ll_builder.current_pos = 0
    ll_builder.current_end = needed
    ll_builder.total_size = total_size
    ll_builder.extra_pieces = old_piece

@dont_inline
def ll_grow_and_append(ll_builder, ll_str, start, size):
    # First, the part that still fits in the current piece
    part1 = ll_builder.current_end - ll_builder.current_pos
    ll_assert(part1 < size, "part1 >= size")
    ll_builder.copy_string_contents(ll_str, ll_builder.current_buf,
                                    start, ll_builder.current_pos,
                                    part1)
    start += part1
    size -= part1
    # Allocate the new piece
    ll_grow_by(ll_builder, size)
    ll_assert(ll_builder.current_pos == 0, "current_pos must be 0 after grow()")
    # Finally, the second part of the string
    ll_builder.current_pos = size
    ll_builder.copy_string_contents(ll_str, ll_builder.current_buf,
                                    start, 0, size)

# ------------------------------------------------------------
# builder.append()

@always_inline
def ll_append(ll_builder, ll_str):
    if jit.we_are_jitted():
        ll_jit_append(ll_builder, ll_str)
    else:
        # no-jit case: inline the logic of _ll_append() in the caller
        _ll_append(ll_builder, ll_str, 0, len(ll_str.chars))

@dont_inline
def ll_jit_append(ll_builder, ll_str):
    # jit case: first try special cases for known small lengths
    if ll_jit_try_append_slice(ll_builder, ll_str, 0, len(ll_str.chars)):
        return
    # fall-back to do a residual call to ll_append_res0
    ll_append_res0(ll_builder, ll_str)

@jit.dont_look_inside
def ll_append_res0(ll_builder, ll_str):
    _ll_append(ll_builder, ll_str, 0, len(ll_str.chars))

# ------------------------------------------------------------
# builder.append_char()

@always_inline
def ll_append_char(ll_builder, char):
    jit.conditional_call(ll_builder.current_pos == ll_builder.current_end,
                         ll_grow_by, ll_builder, 1)
    pos = ll_builder.current_pos
    ll_builder.current_pos = pos + 1
    ll_builder.current_buf.chars[pos] = char

# ------------------------------------------------------------
# builder.append_slice()

@always_inline
def ll_append_slice(ll_builder, ll_str, start, end):
    if jit.we_are_jitted():
        ll_jit_append_slice(ll_builder, ll_str, start, end)
    else:
        # no-jit case: inline the logic of _ll_append() in the caller
        _ll_append(ll_builder, ll_str, start, end - start)

@dont_inline
def ll_jit_append_slice(ll_builder, ll_str, start, end):
    # jit case: first try special cases for known small lengths
    if ll_jit_try_append_slice(ll_builder, ll_str, start, end - start):
        return
    # fall-back to do a residual call to ll_append_res_slice
    ll_append_res_slice(ll_builder, ll_str, start, end)

@jit.dont_look_inside
def ll_append_res_slice(ll_builder, ll_str, start, end):
    _ll_append(ll_builder, ll_str, start, end - start)

# ------------------------------------------------------------
# Special-casing for the JIT: appending strings (or slices) of
# a known length up to MAX_N.  These functions all contain an
# inlined copy of _ll_append(), but with a known small N, gcc
# will compile the copy_string_contents() efficiently.

MAX_N = 10

def make_func_for_size(N):
    @jit.dont_look_inside
    def ll_append_0(ll_builder, ll_str):
        _ll_append(ll_builder, ll_str, 0, N)
    ll_append_0 = func_with_new_name(ll_append_0, "ll_append_0_%d" % N)
    #
    @jit.dont_look_inside
    def ll_append_start(ll_builder, ll_str, start):
        _ll_append(ll_builder, ll_str, start, N)
    ll_append_start = func_with_new_name(ll_append_start,
                                                  "ll_append_start_%d" % N)
    return ll_append_0, ll_append_start, N

unroll_func_for_size = unrolling_iterable([make_func_for_size(_n)
                                           for _n in range(2, MAX_N + 1)])

@jit.unroll_safe
def ll_jit_try_append_slice(ll_builder, ll_str, start, size):
    if jit.isconstant(size):
        if size == 0:
            return True
        # a special case: if the builder's pos and end are still contants
        # (typically if the builder is still virtual), and if 'size' fits,
        # then we don't need any reallocation and can just set the
        # characters in the buffer, in a way that won't force anything.
        if (jit.isconstant(ll_builder.current_pos) and
            jit.isconstant(ll_builder.current_end) and
            size <= (ll_builder.current_end - ll_builder.current_pos) and
            size <= 16):
            pos = ll_builder.current_pos
            buf = ll_builder.current_buf
            stop = pos + size
            ll_builder.current_pos = stop
            while pos < stop:
                buf.chars[pos] = ll_str.chars[start]
                pos += 1
                start += 1
            return True
        # turn appends of length 1 into ll_append_char().
        if size == 1:
            ll_append_char(ll_builder, ll_str.chars[start])
            return True
        # turn appends of length 2 to 10 into residual calls to
        # specialized functions, for the lengths 2 to 10, where
        # gcc will optimize the known-length copy_string_contents()
        # as much as possible.
        for func0, funcstart, for_size in unroll_func_for_size:
            if size == for_size:
                if jit.isconstant(start) and start == 0:
                    func0(ll_builder, ll_str)
                else:
                    funcstart(ll_builder, ll_str, start)
                return True
    return False     # use the fall-back path

# ------------------------------------------------------------
# builder.append_multiple_char()

@always_inline
def ll_append_multiple_char(ll_builder, char, times):
    if jit.we_are_jitted():
        if ll_jit_try_append_multiple_char(ll_builder, char, times):
            return
    _ll_append_multiple_char(ll_builder, char, times)

@jit.dont_look_inside
def _ll_append_multiple_char(ll_builder, char, times):
    part1 = ll_builder.current_end - ll_builder.current_pos
    if times > part1:
        times -= part1
        buf = ll_builder.current_buf
        for i in xrange(ll_builder.current_pos, ll_builder.current_end):
            buf.chars[i] = char
        ll_grow_by(ll_builder, times)
    #
    buf = ll_builder.current_buf
    pos = ll_builder.current_pos
    end = pos + times
    ll_builder.current_pos = end
    for i in xrange(pos, end):
        buf.chars[i] = char

@jit.unroll_safe
def ll_jit_try_append_multiple_char(ll_builder, char, size):
    if jit.isconstant(size):
        if size == 0:
            return True
        # a special case: if the builder's pos and end are still contants
        # (typically if the builder is still virtual), and if 'size' fits,
        # then we don't need any reallocation and can just set the
        # characters in the buffer, in a way that won't force anything.
        if (jit.isconstant(ll_builder.current_pos) and
            jit.isconstant(ll_builder.current_end) and
            size <= (ll_builder.current_end - ll_builder.current_pos) and
            size <= 16):
            pos = ll_builder.current_pos
            buf = ll_builder.current_buf
            stop = pos + size
            ll_builder.current_pos = stop
            while pos < stop:
                buf.chars[pos] = char
                pos += 1
            return True
        if size == 1:
            ll_append_char(ll_builder, char)
            return True
    return False     # use the fall-back path

# ------------------------------------------------------------
# builder.append_charpsize()

@jit.dont_look_inside
def ll_append_charpsize(ll_builder, charp, size):
    part1 = ll_builder.current_end - ll_builder.current_pos
    if size > part1:
        # First, the part that still fits
        ll_builder.copy_raw_to_string(charp, ll_builder.current_buf,
                                      ll_builder.current_pos, part1)
        charp = rffi.ptradd(charp, part1)
        size -= part1
        ll_grow_by(ll_builder, size)
    #
    pos = ll_builder.current_pos
    ll_builder.current_pos = pos + size
    ll_builder.copy_raw_to_string(charp, ll_builder.current_buf, pos, size)

# ------------------------------------------------------------
# builder.getlength()

@always_inline
def ll_getlength(ll_builder):
    num_chars_missing_from_last_piece = (
        ll_builder.current_end - ll_builder.current_pos)
    return ll_builder.total_size - num_chars_missing_from_last_piece

# ------------------------------------------------------------
# builder.build()

@jit.look_inside_iff(lambda ll_builder: jit.isvirtual(ll_builder))
def ll_build(ll_builder):
    # NB. usually the JIT doesn't look inside this function; it does
    # so only in the simplest example where it could virtualize everything
    if ll_builder.extra_pieces:
        ll_fold_pieces(ll_builder)
    elif ll_builder.current_pos != ll_builder.total_size:
        ll_shrink_final(ll_builder)
    return ll_builder.current_buf

def ll_shrink_final(ll_builder):
    final_size = ll_builder.current_pos
    ll_assert(final_size <= ll_builder.total_size,
              "final_size > ll_builder.total_size?")
    buf = rgc.ll_shrink_array(ll_builder.current_buf, final_size)
    ll_builder.current_buf = buf
    ll_builder.current_end = final_size
    ll_builder.total_size = final_size

def ll_fold_pieces(ll_builder):
    final_size = BaseStringBuilderRepr.ll_getlength(ll_builder)
    ll_assert(final_size >= 0, "negative final_size")
    extra = ll_builder.extra_pieces
    ll_builder.extra_pieces = lltype.nullptr(lltype.typeOf(extra).TO)
    #
    result = ll_builder.mallocfn(final_size)
    piece = ll_builder.current_buf
    piece_lgt = ll_builder.current_pos
    ll_assert(ll_builder.current_end == len(piece.chars),
              "bogus last piece_lgt")
    ll_builder.total_size = final_size
    ll_builder.current_buf = result
    ll_builder.current_pos = final_size
    ll_builder.current_end = final_size

    dst = final_size
    while True:
        dst -= piece_lgt
        ll_assert(dst >= 0, "rbuilder build: overflow")
        ll_builder.copy_string_contents(piece, result, 0, dst, piece_lgt)
        if not extra:
            break
        piece = extra.buf
        piece_lgt = len(piece.chars)
        extra = extra.prev_piece
    ll_assert(dst == 0, "rbuilder build: underflow")

# ------------------------------------------------------------
# bool(builder)

def ll_bool(ll_builder):
    return ll_builder != nullptr(lltype.typeOf(ll_builder).TO)

# ------------------------------------------------------------

class BaseStringBuilderRepr(AbstractStringBuilderRepr):
    def empty(self):
        return nullptr(self.lowleveltype.TO)

    ll_append               = staticmethod(ll_append)
    ll_append_char          = staticmethod(ll_append_char)
    ll_append_slice         = staticmethod(ll_append_slice)
    ll_append_multiple_char = staticmethod(ll_append_multiple_char)
    ll_append_charpsize     = staticmethod(ll_append_charpsize)
    ll_getlength            = staticmethod(ll_getlength)
    ll_build                = staticmethod(ll_build)
    ll_bool                 = staticmethod(ll_bool)

class StringBuilderRepr(BaseStringBuilderRepr):
    lowleveltype = lltype.Ptr(STRINGBUILDER)
    basetp = STR
    convert_to_ll = staticmethod(llstr)
    string_repr = string_repr
    char_repr = char_repr
    raw_ptr_repr = PtrRepr(
        lltype.Ptr(lltype.Array(lltype.Char, hints={'nolength': True}))
    )

    @staticmethod
    def ll_new(init_size):
        # Clamp 'init_size' to be a value between 0 and 1280.
        # Negative values are mapped to 1280.
        init_size = intmask(min(r_uint(init_size), r_uint(1280)))
        ll_builder = lltype.malloc(STRINGBUILDER)
        ll_builder.current_buf = ll_builder.mallocfn(init_size)
        ll_builder.current_pos = 0
        ll_builder.current_end = init_size
        ll_builder.total_size = init_size
        return ll_builder


class UnicodeBuilderRepr(BaseStringBuilderRepr):
    lowleveltype = lltype.Ptr(UNICODEBUILDER)
    basetp = UNICODE
    convert_to_ll = staticmethod(llunicode)
    string_repr = unicode_repr
    char_repr = unichar_repr
    raw_ptr_repr = PtrRepr(
        lltype.Ptr(lltype.Array(lltype.UniChar, hints={'nolength': True}))
    )

    @staticmethod
    def ll_new(init_size):
        # Clamp 'init_size' to be a value between 0 and 1280.
        # Negative values are mapped to 1280.
        init_size = intmask(min(r_uint(init_size), r_uint(1280)))
        ll_builder = lltype.malloc(UNICODEBUILDER)
        ll_builder.current_buf = ll_builder.mallocfn(init_size)
        ll_builder.current_pos = 0
        ll_builder.current_end = init_size
        ll_builder.total_size = init_size
        return ll_builder


unicodebuilder_repr = UnicodeBuilderRepr()
stringbuilder_repr = StringBuilderRepr()
Back to Top