PageRenderTime 329ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/rpython/rtyper/lltypesystem/rstr.py

https://bitbucket.org/pypy/pypy/
Python | 1344 lines | 1323 code | 9 blank | 12 comment | 2 complexity | 00a3aadf0b3727bbf2d431e4c6dafbca MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. from weakref import WeakValueDictionary
  2. from rpython.annotator import model as annmodel
  3. from rpython.rlib import jit, types
  4. from rpython.rlib.objectmodel import (malloc_zero_filled, we_are_translated,
  5. _hash_string, keepalive_until_here, specialize, enforceargs)
  6. from rpython.rlib.signature import signature
  7. from rpython.rlib.rarithmetic import ovfcheck
  8. from rpython.rtyper.error import TyperError
  9. from rpython.rtyper.debug import ll_assert
  10. from rpython.rtyper.lltypesystem import ll_str, llmemory
  11. from rpython.rtyper.lltypesystem.lltype import (GcStruct, Signed, Array, Char,
  12. UniChar, Ptr, malloc, Bool, Void, GcArray, nullptr, cast_primitive,
  13. typeOf, staticAdtMethod, GcForwardReference)
  14. from rpython.rtyper.rmodel import inputconst, Repr
  15. from rpython.rtyper.rint import IntegerRepr
  16. from rpython.rtyper.rstr import (AbstractStringRepr, AbstractCharRepr,
  17. AbstractUniCharRepr, AbstractStringIteratorRepr, AbstractLLHelpers,
  18. AbstractUnicodeRepr)
  19. from rpython.tool.sourcetools import func_with_new_name
  20. # ____________________________________________________________
  21. #
  22. # Concrete implementation of RPython strings:
  23. #
  24. # struct str {
  25. # hash: Signed
  26. # chars: array of Char
  27. # }
  STR = GcForwardReference()
  UNICODE = GcForwardReference()


  def new_malloc(TP, name):
      """Build an allocator for the low-level string type 'TP'.

      The returned function takes a length, mallocs a 'TP' of that length
      and is renamed to 'name' with func_with_new_name().
      """
      @enforceargs(int)
      def mallocstr(length):
          ll_assert(length >= 0, "negative string length")
          result = malloc(TP, length)
          # The hash field is reset by hand unless we are translated on top
          # of a malloc that is flagged as zero-filling.
          if not (we_are_translated() and malloc_zero_filled):
              result.hash = 0
          return result
      return func_with_new_name(mallocstr, name)


  mallocstr = new_malloc(STR, 'mallocstr')
  mallocunicode = new_malloc(UNICODE, 'mallocunicode')
  def emptystrfun():
      """Return the module-level 'emptystr' object."""
      return emptystr
  def emptyunicodefun():
      """Return the module-level 'emptyunicode' object."""
      return emptyunicode
  def _new_copy_contents_fun(SRC_TP, DST_TP, CHAR_TP, name):
      """Build the raw-memory copy helpers for one kind of low-level string.

      SRC_TP and DST_TP are the low-level string types that are read from
      and written to, CHAR_TP is the item type of their 'chars' arrays and
      'name' is interpolated into the names of the generated functions.

      Returns the tuple
      (copy_string_to_raw, copy_raw_to_string, copy_string_contents).
      """
      @specialize.arg(0)
      def _str_ofs(TP, item):
          # offset of chars[item] measured from the start of a TP structure
          return (llmemory.offsetof(TP, 'chars') +
                  llmemory.itemoffsetof(TP.chars, 0) +
                  llmemory.sizeof(CHAR_TP) * item)

      @signature(types.any(), types.any(), types.int(), returns=types.any())
      @specialize.arg(0)
      def _get_raw_buf(TP, src, ofs):
          """
          WARNING: dragons ahead.
          Return the address of the internal char* buffer of the low level
          string. The return value is valid as long as no GC operation occurs,
          so you must ensure that it will be used inside a "GC safe" section,
          for example by marking your function with @rgc.no_collect
          """
          assert typeOf(src).TO == TP
          assert ofs >= 0
          return llmemory.cast_ptr_to_adr(src) + _str_ofs(TP, ofs)
      _get_raw_buf._always_inline_ = True

      @jit.oopspec('stroruni.copy_contents(src, dst, srcstart, dststart, length)')
      @signature(types.any(), types.any(), types.int(), types.int(), types.int(), returns=types.none())
      def copy_string_contents(src, dst, srcstart, dststart, length):
          """Copies 'length' characters from the 'src' string to the 'dst'
          string, starting at position 'srcstart' and 'dststart'."""
          # xxx Warning: don't try to do this at home.  It relies on a lot
          # of details to be sure that it works correctly in all cases.
          # Notably: no GC operation at all from the first cast_ptr_to_adr()
          # because it might move the strings.  The keepalive_until_here()
          # are obscurely essential to make sure that the strings stay alive
          # longer than the raw_memcopy().
          assert length >= 0
          ll_assert(srcstart >= 0, "copystrc: negative srcstart")
          ll_assert(srcstart + length <= len(src.chars), "copystrc: src ovf")
          ll_assert(dststart >= 0, "copystrc: negative dststart")
          ll_assert(dststart + length <= len(dst.chars), "copystrc: dst ovf")
          # from here, no GC operations can happen
          asrc = _get_raw_buf(SRC_TP, src, srcstart)
          adst = _get_raw_buf(DST_TP, dst, dststart)
          llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(CHAR_TP) * length)
          # end of "no GC" section
          keepalive_until_here(src)
          keepalive_until_here(dst)
      copy_string_contents._always_inline_ = True
      copy_string_contents = func_with_new_name(copy_string_contents,
                                                'copy_%s_contents' % name)

      @jit.oopspec('stroruni.copy_string_to_raw(src, ptrdst, srcstart, length)')
      def copy_string_to_raw(src, ptrdst, srcstart, length):
          """
          Copies 'length' characters from the 'src' string to the 'ptrdst'
          buffer, starting at position 'srcstart'.
          'ptrdst' must be a non-gc Array of Char.
          """
          # xxx Warning: same note as above apply: don't do this at home
          assert length >= 0
          # from here, no GC operations can happen
          asrc = _get_raw_buf(SRC_TP, src, srcstart)
          adst = llmemory.cast_ptr_to_adr(ptrdst)
          adst = adst + llmemory.itemoffsetof(typeOf(ptrdst).TO, 0)
          llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(CHAR_TP) * length)
          # end of "no GC" section
          keepalive_until_here(src)
      copy_string_to_raw._always_inline_ = True
      copy_string_to_raw = func_with_new_name(copy_string_to_raw,
                                              'copy_%s_to_raw' % name)

      @jit.dont_look_inside
      @signature(types.any(), types.any(), types.int(), types.int(),
                 returns=types.none())
      def copy_raw_to_string(ptrsrc, dst, dststart, length):
          """
          Copies 'length' characters from the 'ptrsrc' raw buffer into the
          'dst' string, starting at position 'dststart' of 'dst'.
          """
          # xxx Warning: same note as above apply: don't do this at home
          assert length >= 0
          # from here, no GC operations can happen
          # 'dst' is a DST_TP string: resolving it with SRC_TP only worked
          # because both types happen to be equal for the current callers.
          adst = _get_raw_buf(DST_TP, dst, dststart)
          asrc = llmemory.cast_ptr_to_adr(ptrsrc)
          asrc = asrc + llmemory.itemoffsetof(typeOf(ptrsrc).TO, 0)
          llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(CHAR_TP) * length)
          # end of "no GC" section
          keepalive_until_here(dst)
      copy_raw_to_string._always_inline_ = True
      copy_raw_to_string = func_with_new_name(copy_raw_to_string,
                                              'copy_raw_to_%s' % name)

      return copy_string_to_raw, copy_raw_to_string, copy_string_contents


  (copy_string_to_raw,
   copy_raw_to_string,
   copy_string_contents) = _new_copy_contents_fun(STR, STR, Char, 'string')

  (copy_unicode_to_raw,
   copy_raw_to_unicode,
   copy_unicode_contents) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar,
                                                   'unicode')
  CONST_STR_CACHE = WeakValueDictionary()
  CONST_UNICODE_CACHE = WeakValueDictionary()


  class BaseLLStringRepr(Repr):
      """Behaviour shared by the low-level str and unicode reprs."""

      def convert_const(self, value):
          """Return the prebuilt low-level string for the constant 'value'.

          None maps to a NULL pointer of the repr's type; a value that is
          not an instance of self.basetype raises TyperError.  Converted
          constants are memoized in self.CACHE.
          """
          if value is None:
              return nullptr(self.lowleveltype.TO)
          #value = getattr(value, '__self__', value)  # for bound string methods
          if not isinstance(value, self.basetype):
              raise TyperError("not a str: %r" % (value,))
          try:
              return self.CACHE[value]
          except KeyError:
              pass
          size = len(value)
          ll_value = self.malloc(size)
          for index in range(size):
              ll_value.chars[index] = cast_primitive(self.base, value[index])
          ll_value.hash = 0
          self.ll.ll_strhash(ll_value)   # precompute the hash
          self.CACHE[value] = ll_value
          return ll_value

      def make_iterator_repr(self, variant=None):
          """Return the iterator repr; no iteration 'variant' is supported."""
          if variant is None:
              return self.repr.iterator_repr
          raise TyperError("unsupported %r iterator over a str/unicode" %
                           (variant,))

      def can_ll_be_null(self, s_value):
          """Tell whether the low-level value may be NULL."""
          # XXX unicode
          if self is not string_repr:
              # for CharRepr/UniCharRepr subclasses,
              # where NULL is always valid: it is chr(0)
              return True
          return s_value.can_be_none()

      def _list_length_items(self, hop, v_lst, LIST):
          """Emit direct calls fetching the length and the items of 'v_lst'."""
          LIST_TO = LIST.TO
          v_length = hop.gendirectcall(LIST_TO.ll_length, v_lst)
          v_items = hop.gendirectcall(LIST_TO.ll_items, v_lst)
          return v_length, v_items
  class StringRepr(BaseLLStringRepr, AbstractStringRepr):
      """Repr whose low-level type is Ptr(STR), with Char items."""
      lowleveltype = Ptr(STR)
      basetype = str
      base = Char
      CACHE = CONST_STR_CACHE

      def __init__(self, *args):
          AbstractStringRepr.__init__(self, *args)
          self.ll = LLHelpers
          self.malloc = mallocstr

      def ll_decode_latin1(self, value):
          """Return a new UNICODE holding every Char of 'value' cast to UniChar."""
          size = len(value.chars)
          result = mallocunicode(size)
          index = 0
          while index < size:
              result.chars[index] = cast_primitive(UniChar, value.chars[index])
              index += 1
          return result
  class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
      """Repr whose low-level type is Ptr(UNICODE), with UniChar items."""
      lowleveltype = Ptr(UNICODE)
      basetype = basestring
      base = UniChar
      CACHE = CONST_UNICODE_CACHE

      def __init__(self, *args):
          AbstractUnicodeRepr.__init__(self, *args)
          self.ll = LLHelpers
          self.malloc = mallocunicode

      @jit.elidable
      def ll_str(self, s):
          """Convert 's' to a STR, or to the constant 'None' if 's' is NULL.

          Raises UnicodeEncodeError on any character above 127.
          """
          # XXX crazy that this is here, but I don't want to break
          # rmodel logic
          if not s:
              return self.ll.ll_constant('None')
          size = len(s.chars)
          result = mallocstr(size)
          index = 0
          while index < size:
              uchar = s.chars[index]
              if ord(uchar) > 127:
                  raise UnicodeEncodeError("character not in ascii range")
              result.chars[index] = cast_primitive(Char, uchar)
              index += 1
          return result

      @jit.elidable
      def ll_unicode(self, s):
          """Return 's' itself, or the unicode constant u'None' if 's' is NULL."""
          if not s:
              return self.ll.ll_constant_unicode(u'None')
          return s

      @jit.elidable
      def ll_encode_latin1(self, s):
          """Convert 's' to a STR of the same length.

          Raises UnicodeEncodeError on any character above 255.
          """
          size = len(s.chars)
          result = mallocstr(size)
          index = 0
          while index < size:
              uchar = s.chars[index]
              if ord(uchar) > 255:
                  raise UnicodeEncodeError("character not in latin1 range")
              result.chars[index] = cast_primitive(Char, uchar)
              index += 1
          return result
  class CharRepr(AbstractCharRepr, StringRepr):
      """String repr variant whose low-level type is a bare Char."""
      lowleveltype = Char
  class UniCharRepr(AbstractUniCharRepr, UnicodeRepr):
      """Unicode repr variant whose low-level type is a bare UniChar."""
      lowleveltype = UniChar
  226. # ____________________________________________________________
  227. #
  228. # Low-level methods. These can be run for testing, but are meant to
  229. # be direct_call'ed from rtyped flow graphs, which means that they will
  230. # get flowed and annotated, mostly with SomePtr.
  231. #
  # Values of the 'mode' argument of LLHelpers.ll_search()
  FAST_COUNT = 0     # return the number of occurrences
  FAST_FIND = 1      # return the index of a match, scanning forwards
  FAST_RFIND = 2     # return the index of a match, scanning backwards
  235. from rpython.rlib.rarithmetic import LONG_BIT as BLOOM_WIDTH
  def bloom_add(mask, c):
      """Return 'mask' with the bit selected by the character 'c' set."""
      bit = ord(c) & (BLOOM_WIDTH - 1)
      return mask | (1 << bit)
  def bloom(mask, c):
      """Return a non-zero value iff the bit selected by 'c' is set in 'mask'."""
      bit = ord(c) & (BLOOM_WIDTH - 1)
      return mask & (1 << bit)
  240. class LLHelpers(AbstractLLHelpers):
  241. from rpython.rtyper.annlowlevel import llstr, llunicode
  @staticmethod
  @jit.elidable
  def ll_str_mul(s, times):
      """Return a new string made of 's' repeated 'times' times.

      A negative 'times' counts as 0.  Raises MemoryError if the length of
      the result overflows.
      """
      if times < 0:
          times = 0
      try:
          total = ovfcheck(len(s.chars) * times)
      except OverflowError:
          raise MemoryError
      result = s.malloc(total)
      filled = 0
      if filled < total:
          # seed the result with one copy of 's' ...
          s.copy_contents(s, result, 0, 0, len(s.chars))
          filled += len(s.chars)
          # ... then keep copying the already-filled prefix onto the end,
          # clamping the last chunk to what is left
          while filled < total:
              chunk = total - filled
              if filled <= chunk:
                  chunk = filled
              s.copy_contents(result, result, 0, filled, chunk)
              filled += chunk
      return result
  @staticmethod
  @jit.elidable
  def ll_char_mul(ch, times):
      """Return a string made of 'ch' repeated 'times' times.

      The result is a STR when 'ch' is a Char and a UNICODE otherwise; a
      negative 'times' counts as 0.
      """
      if typeOf(ch) is Char:
          allocate = mallocstr
      else:
          allocate = mallocunicode
      if times < 0:
          times = 0
      result = allocate(times)
      # XXX we can use memset here, not sure how useful this is
      for index in range(times):
          result.chars[index] = ch
      return result
  @staticmethod
  def ll_strlen(s):
      """Return the number of items in s.chars."""
      chars = s.chars
      return len(chars)
  @staticmethod
  @signature(types.any(), types.int(), returns=types.any())
  def ll_stritem_nonneg(s, i):
      """Return s.chars[i]; 'i' is ll_assert'ed to be a valid index >= 0."""
      ll_assert(i >= 0, "negative str getitem index")
      items = s.chars
      ll_assert(i < len(items), "str getitem index out of bound")
      return items[i]
  @staticmethod
  def ll_chr2str(ch):
      """Return a new one-character string holding 'ch'.

      The result is a STR when 'ch' is a Char and a UNICODE otherwise.
      """
      if typeOf(ch) is Char:
          allocate = mallocstr
      else:
          allocate = mallocunicode
      result = allocate(1)
      result.chars[0] = ch
      return result
  # @jit.look_inside_iff(lambda str: jit.isconstant(len(str.chars)) and len(str.chars) == 1)
  @staticmethod
  @jit.oopspec("str.str2unicode(str)")
  def ll_str2unicode(str):
      """Return a new UNICODE with the characters of the STR 'str'.

      Raises UnicodeDecodeError on any character above 127.
      """
      size = len(str.chars)
      result = mallocunicode(size)
      index = 0
      while index < size:
          ch = str.chars[index]
          if ord(ch) > 127:
              raise UnicodeDecodeError
          result.chars[index] = cast_primitive(UniChar, ch)
          index += 1
      return result
  @staticmethod
  def ll_str2bytearray(str):
      """Return a new BYTEARRAY holding a copy of the characters of 'str'."""
      from rpython.rtyper.lltypesystem.rbytearray import BYTEARRAY
      size = len(str.chars)
      result = malloc(BYTEARRAY, size)
      index = 0
      while index < size:
          result.chars[index] = str.chars[index]
          index += 1
      return result
  @staticmethod
  @jit.elidable
  def ll_strhash(s):
      """Return the hash of 's' (0 for NULL), caching it in s.hash."""
      # unlike CPython, there is no reason to avoid to return -1
      # but our malloc initializes the memory to zero, so we use zero as the
      # special non-computed-yet value.
      if not s:
          return 0
      cached = s.hash
      if cached != 0:
          return cached
      computed = _hash_string(s.chars)
      if computed == 0:
          # 0 means "not computed yet", so a real hash of 0 is replaced
          computed = 29872897
      s.hash = computed
      return computed
  @staticmethod
  def ll_length(s):
      """Return the number of items in s.chars."""
      chars = s.chars
      return len(chars)
  @staticmethod
  def ll_strfasthash(s):
      """Return s.hash as is; assumes that the hash is already computed."""
      stored = s.hash
      return stored
  @staticmethod
  @jit.elidable
  @jit.oopspec('stroruni.concat(s1, s2)')
  def ll_strconcat(s1, s2):
      """Return a new string holding the contents of 's1' followed by 's2'."""
      len1 = s1.length()
      len2 = s2.length()
      # a single '+' like this is allowed to overflow: it gets
      # a negative result, and the gc will complain
      total = len1 + len2
      # the typechecks below are if TP == BYTEARRAY
      if typeOf(s1) == Ptr(STR):
          result = s2.malloc(total)
          result.copy_contents_from_str(s1, result, 0, 0, len1)
      else:
          result = s1.malloc(total)
          result.copy_contents(s1, result, 0, 0, len1)
      if typeOf(s2) == Ptr(STR):
          result.copy_contents_from_str(s2, result, 0, len1, len2)
      else:
          result.copy_contents(s2, result, 0, len1, len2)
      return result
  @staticmethod
  @jit.elidable
  def ll_strip(s, ch, left, right):
      """Return a copy of 's' with the character 'ch' stripped from its ends.

      'left' and 'right' select which ends are stripped.  The result is
      s.empty() when 's' is empty or nothing is left after stripping.
      """
      s_len = len(s.chars)
      if s_len == 0:
          return s.empty()
      lpos = 0
      rpos = s_len - 1
      if left:
          # '<=' and not '<': the last remaining character has to be looked
          # at too, otherwise a left-only strip of 'aaa' would keep one 'a'.
          while lpos <= rpos and s.chars[lpos] == ch:
              lpos += 1
      if right:
          while lpos <= rpos and s.chars[rpos] == ch:
              rpos -= 1
      if rpos < lpos:
          return s.empty()
      r_len = rpos - lpos + 1
      result = s.malloc(r_len)
      s.copy_contents(s, result, lpos, 0, r_len)
      return result
  @staticmethod
  @jit.elidable
  def ll_strip_default(s, left, right):
      """Return a copy of 's' with isspace() characters stripped from its ends.

      'left' and 'right' select which ends are stripped.  The result is
      s.empty() when 's' is empty or nothing is left after stripping.
      """
      s_len = len(s.chars)
      if s_len == 0:
          return s.empty()
      lpos = 0
      rpos = s_len - 1
      if left:
          # '<=' and not '<': the last remaining character has to be looked
          # at too, otherwise a left-only strip of '   ' would keep one ' '.
          while lpos <= rpos and s.chars[lpos].isspace():
              lpos += 1
      if right:
          while lpos <= rpos and s.chars[rpos].isspace():
              rpos -= 1
      if rpos < lpos:
          return s.empty()
      r_len = rpos - lpos + 1
      result = s.malloc(r_len)
      s.copy_contents(s, result, lpos, 0, r_len)
      return result
  @staticmethod
  @jit.elidable
  def ll_strip_multiple(s, s2, left, right):
      """Return a copy of 's' with any character of 's2' stripped from its ends.

      'left' and 'right' select which ends are stripped.  The result is
      s.empty() when 's' is empty or nothing is left after stripping.
      """
      s_len = len(s.chars)
      if s_len == 0:
          return s.empty()
      lpos = 0
      rpos = s_len - 1
      if left:
          # '<=' and not '<': the last remaining character has to be looked
          # at too, otherwise a left-only strip could never return ''.
          while lpos <= rpos and LLHelpers.ll_contains(s2, s.chars[lpos]):
              lpos += 1
      if right:
          while lpos <= rpos and LLHelpers.ll_contains(s2, s.chars[rpos]):
              rpos -= 1
      if rpos < lpos:
          return s.empty()
      r_len = rpos - lpos + 1
      result = s.malloc(r_len)
      s.copy_contents(s, result, lpos, 0, r_len)
      return result
  @staticmethod
  @jit.elidable
  def ll_upper(s):
      """Return a new STR with ll_upper_char() applied to every char of 's'."""
      source = s.chars
      size = len(source)
      if size == 0:
          return s.empty()
      result = mallocstr(size)
      #        ^^^^^^^^^ specifically to explode on unicode
      for index in range(size):
          result.chars[index] = LLHelpers.ll_upper_char(source[index])
      return result
  @staticmethod
  @jit.elidable
  def ll_lower(s):
      """Return a new STR with ll_lower_char() applied to every char of 's'."""
      source = s.chars
      size = len(source)
      if size == 0:
          return s.empty()
      result = mallocstr(size)
      #        ^^^^^^^^^ specifically to explode on unicode
      for index in range(size):
          result.chars[index] = LLHelpers.ll_lower_char(source[index])
      return result
  @staticmethod
  def ll_join(s, length, items):
      """Join the first 'length' strings of 'items' with the separator 's'.

      Returns s.empty() when 'length' is 0.  Raises MemoryError when the
      summed item lengths or the summed separator lengths overflow.
      """
      if length == 0:
          return s.empty()
      sep_len = len(s.chars)

      # total size of the items
      items_size = 0
      for index in range(length):
          try:
              items_size = ovfcheck(items_size + len(items[index].chars))
          except OverflowError:
              raise MemoryError
      # total size of the separators
      try:
          seps_size = ovfcheck(sep_len * (length - 1))
      except OverflowError:
          raise MemoryError

      # a single '+' at the end is allowed to overflow: it gets
      # a negative result, and the gc will complain
      result = s.malloc(items_size + seps_size)

      # first item, then (separator, item) pairs
      write_pos = len(items[0].chars)
      s.copy_contents(items[0], result, 0, 0, write_pos)
      for index in range(1, length):
          s.copy_contents(s, result, 0, write_pos, sep_len)
          write_pos += sep_len
          item_len = len(items[index].chars)
          s.copy_contents(items[index], result, 0, write_pos, item_len)
          write_pos += item_len
      return result
  @staticmethod
  @jit.elidable
  @jit.oopspec('stroruni.cmp(s1, s2)')
  def ll_strcmp(s1, s2):
      """Three-way comparison of 's1' and 's2'.

      Returns a negative number, zero or a positive number depending on
      whether 's1' sorts before, equal to or after 's2'.  Characters are
      compared by ord(); when one string is a prefix of the other the
      shorter one sorts first.  A NULL string compares equal to another
      NULL and sorts before any non-NULL string.
      """
      # The NULL cases must follow the same negative/zero/positive contract
      # as the rest of the function: returning True/False here would make
      # two NULLs compare as "greater" and NULL-vs-string as "equal".
      if not s1:
          if not s2:
              return 0
          return -1
      if not s2:
          return 1
      chars1 = s1.chars
      chars2 = s2.chars
      len1 = len(chars1)
      len2 = len(chars2)
      if len1 < len2:
          cmplen = len1
      else:
          cmplen = len2
      i = 0
      while i < cmplen:
          diff = ord(chars1[i]) - ord(chars2[i])
          if diff != 0:
              return diff
          i += 1
      return len1 - len2
  @staticmethod
  @jit.elidable
  @jit.oopspec('stroruni.equal(s1, s2)')
  def ll_streq(s1, s2):
      """Return True iff 's1' and 's2' are both NULL or hold equal characters."""
      if s1 == s2:       # also if both are NULLs
          return True
      if not s1 or not s2:
          return False
      size = len(s1.chars)
      if size != len(s2.chars):
          return False
      chars1 = s1.chars
      chars2 = s2.chars
      for index in range(size):
          if chars1[index] != chars2[index]:
              return False
      return True
  @staticmethod
  @jit.elidable
  def ll_startswith(s1, s2):
      """Return True iff the characters of 's2' are a prefix of 's1'."""
      prefix_len = len(s2.chars)
      if len(s1.chars) < prefix_len:
          return False
      chars1 = s1.chars
      chars2 = s2.chars
      for index in range(prefix_len):
          if chars1[index] != chars2[index]:
              return False
      return True
  @staticmethod
  def ll_startswith_char(s, ch):
      """Return True iff 's' is not empty and its first character is 'ch'."""
      if len(s.chars) == 0:
          return False
      return s.chars[0] == ch
  @staticmethod
  @jit.elidable
  def ll_endswith(s1, s2):
      """Return True iff the characters of 's2' are a suffix of 's1'."""
      suffix_len = len(s2.chars)
      offset = len(s1.chars) - suffix_len
      if offset < 0:
          return False
      chars1 = s1.chars
      chars2 = s2.chars
      for index in range(suffix_len):
          if chars1[offset + index] != chars2[index]:
              return False
      return True
  @staticmethod
  def ll_endswith_char(s, ch):
      """Return True iff 's' is not empty and its last character is 'ch'."""
      size = len(s.chars)
      if size == 0:
          return False
      return s.chars[size - 1] == ch
  @staticmethod
  @jit.elidable
  @signature(types.any(), types.any(), types.int(), types.int(), returns=types.int())
  def ll_find_char(s, ch, start, end):
      """Return the lowest index in s[start:end] holding 'ch', or -1.

      'end' is clamped to the length of 's'.
      """
      size = len(s.chars)
      if end > size:
          end = size
      pos = start
      while pos < end:
          if s.chars[pos] == ch:
              return pos
          pos += 1
      return -1
  @staticmethod
  @jit.elidable
  def ll_rfind_char(s, ch, start, end):
      """Return the highest index in s[start:end] holding 'ch', or -1.

      'end' is clamped to the length of 's'.
      """
      size = len(s.chars)
      if end > size:
          end = size
      pos = end - 1
      while pos >= start:
          if s.chars[pos] == ch:
              return pos
          pos -= 1
      return -1
  @staticmethod
  @jit.elidable
  def ll_count_char(s, ch, start, end):
      """Return how many items of s[start:end] are equal to 'ch'.

      'end' is clamped to the length of 's'.
      """
      size = len(s.chars)
      if end > size:
          end = size
      found = 0
      pos = start
      while pos < end:
          if s.chars[pos] == ch:
              found += 1
          pos += 1
      return found
  @staticmethod
  @signature(types.any(), types.any(), types.int(), types.int(), returns=types.int())
  def ll_find(s1, s2, start, end):
      """Find 's2' in s1[start:end]; return -1 if 'end' is before 'start'.

      'start' is clamped to 0 and 'end' to the length of 's1'.  A one
      character 's2' is delegated to ll_find_char(), anything else to
      ll_search() in FAST_FIND mode.
      """
      if start < 0:
          start = 0
      s1_len = len(s1.chars)
      if end > s1_len:
          end = s1_len
      if end - start < 0:
          return -1
      if len(s2.chars) == 1:
          return LLHelpers.ll_find_char(s1, s2.chars[0], start, end)
      return LLHelpers.ll_search(s1, s2, start, end, FAST_FIND)
  @staticmethod
  @signature(types.any(), types.any(), types.int(), types.int(), returns=types.int())
  def ll_rfind(s1, s2, start, end):
      """Reverse-find 's2' in s1[start:end]; return -1 if 'end' is before 'start'.

      'start' is clamped to 0 and 'end' to the length of 's1'.  A one
      character 's2' is delegated to ll_rfind_char(), anything else to
      ll_search() in FAST_RFIND mode.
      """
      if start < 0:
          start = 0
      s1_len = len(s1.chars)
      if end > s1_len:
          end = s1_len
      if end - start < 0:
          return -1
      if len(s2.chars) == 1:
          return LLHelpers.ll_rfind_char(s1, s2.chars[0], start, end)
      return LLHelpers.ll_search(s1, s2, start, end, FAST_RFIND)
  @classmethod
  def ll_count(cls, s1, s2, start, end):
      """Count 's2' in s1[start:end]; return 0 if 'end' is before 'start'.

      'start' is clamped to 0 and 'end' to the length of 's1'.  A one
      character 's2' is delegated to cls.ll_count_char(), anything else to
      cls.ll_search() in FAST_COUNT mode.
      """
      if start < 0:
          start = 0
      s1_len = len(s1.chars)
      if end > s1_len:
          end = s1_len
      if end - start < 0:
          return 0
      if len(s2.chars) == 1:
          return cls.ll_count_char(s1, s2.chars[0], start, end)
      res = cls.ll_search(s1, s2, start, end, FAST_COUNT)
      assert res >= 0
      return res
  @staticmethod
  @jit.elidable
  def ll_search(s1, s2, start, end, mode):
      """Search s1[start:end] for 's2', the result depending on 'mode'.

      FAST_COUNT returns the number of matches found, FAST_FIND the index
      of the first match met while scanning forwards and FAST_RFIND the
      index of the first match met while scanning backwards.  The two find
      modes return -1 when there is no match.
      """
      count = 0
      n = end - start
      m = len(s2.chars)

      if m == 0:
          # empty 's2'
          if mode == FAST_COUNT:
              return end - start + 1
          if mode == FAST_RFIND:
              return end
          return start

      w = n - m
      if w < 0:
          # 's2' is longer than the searched window
          if mode == FAST_COUNT:
              return 0
          return -1

      mlast = m - 1
      skip = mlast - 1
      mask = 0

      if mode != FAST_RFIND:
          # forward scan: build the bloom mask of 's2' and compute 'skip'
          for k in range(mlast):
              mask = bloom_add(mask, s2.chars[k])
              if s2.chars[k] == s2.chars[mlast]:
                  skip = mlast - k - 1
          mask = bloom_add(mask, s2.chars[mlast])

          i = start - 1
          while i + 1 <= start + w:
              i += 1
              tail_matches = s1.chars[i + m - 1] == s2.chars[m - 1]
              if tail_matches:
                  for j in range(mlast):
                      if s1.chars[i + j] != s2.chars[j]:
                          break
                  else:
                      # full match at 'i'
                      if mode != FAST_COUNT:
                          return i
                      count += 1
                      i += mlast
                      continue
              # no match at 'i': look at the character after the window
              if i + m < len(s1.chars):
                  c = s1.chars[i + m]
              else:
                  c = '\0'
              if not bloom(mask, c):
                  i += m
              elif tail_matches:
                  i += skip
      else:
          # backward scan: same idea, mirrored
          mask = bloom_add(mask, s2.chars[0])
          for k in range(mlast, 0, -1):
              mask = bloom_add(mask, s2.chars[k])
              if s2.chars[k] == s2.chars[0]:
                  skip = k - 1

          i = start + w + 1
          while i - 1 >= start:
              i -= 1
              head_matches = s1.chars[i] == s2.chars[0]
              if head_matches:
                  for j in range(mlast, 0, -1):
                      if s1.chars[i + j] != s2.chars[j]:
                          break
                  else:
                      return i
              # no match at 'i': look at the character before the window
              if i - 1 >= 0 and not bloom(mask, s1.chars[i - 1]):
                  i -= m
              elif head_matches:
                  i -= skip

      if mode != FAST_COUNT:
          return -1
      return count
  @staticmethod
  @signature(types.int(), types.any(), returns=types.any())
  @jit.look_inside_iff(lambda length, items: jit.loop_unrolling_heuristic(
      items, length))
  def ll_join_strs(length, items):
      """Concatenate the first 'length' strings of 'items'.

      With exactly one item, that item itself is returned.  Raises
      MemoryError when the summed lengths overflow.
      """
      # Special case for length 1 items, helps both the JIT and other code
      if length == 1:
          return items[0]

      total = 0
      idx = 0
      while idx < length:
          try:
              total = ovfcheck(total + len(items[idx].chars))
          except OverflowError:
              raise MemoryError
          idx += 1

      if typeOf(items).TO.OF.TO == STR:
          allocate = mallocstr
          copy_contents = copy_string_contents
      else:
          allocate = mallocunicode
          copy_contents = copy_unicode_contents

      result = allocate(total)
      write_pos = 0
      idx = 0
      while idx < length:
          piece_len = len(items[idx].chars)
          copy_contents(items[idx], result, 0, write_pos, piece_len)
          write_pos += piece_len
          idx += 1
      return result
  @staticmethod
  @jit.look_inside_iff(lambda length, chars, RES: jit.isconstant(length) and jit.isvirtual(chars))
  def ll_join_chars(length, chars, RES):
      """Build a string from the first 'length' items of 'chars'.

      The result is a STR when 'RES' is StringRepr.lowleveltype and a
      UNICODE otherwise; every item is cast to the matching char type.
      """
      # no need to optimize this, will be replaced by string builder
      # at some point soon
      if RES is StringRepr.lowleveltype:
          target = Char
          allocate = mallocstr
      else:
          target = UniChar
          allocate = mallocunicode
      result = allocate(length)
      out = result.chars
      idx = 0
      while idx < length:
          out[idx] = cast_primitive(target, chars[idx])
          idx += 1
      return result
  @staticmethod
  @jit.oopspec('stroruni.slice(s1, start, stop)')
  @signature(types.any(), types.int(), types.int(), returns=types.any())
  @jit.elidable
  def _ll_stringslice(s1, start, stop):
      """Return a new string holding s1[start:stop]; 'start' must be >= 0."""
      assert start >= 0
      size = stop - start
      # If start > stop, return a empty string. This can happen if the start
      # is greater than the length of the string. Use < instead of <= to avoid
      # creating another path for the JIT when start == stop.
      if size < 0:
          return s1.empty()
      result = s1.malloc(size)
      s1.copy_contents(s1, result, start, 0, size)
      return result
  @staticmethod
  def ll_stringslice_startonly(s1, start):
      """Return the slice of 's1' running from 'start' to its end."""
      stop = len(s1.chars)
      return LLHelpers._ll_stringslice(s1, start, stop)
  @staticmethod
  @signature(types.any(), types.int(), types.int(), returns=types.any())
  def ll_stringslice_startstop(s1, start, stop):
      """Return s1[start:stop], with 'stop' clamped to the length of 's1'.

      When not jitted, a slice that covers the whole of 's1' returns 's1'
      itself instead of a copy.
      """
      s1_len = len(s1.chars)
      if jit.we_are_jitted():
          if stop > s1_len:
              stop = s1_len
      elif stop >= s1_len:
          if start == 0:
              return s1
          stop = s1_len
      return LLHelpers._ll_stringslice(s1, start, stop)
  @staticmethod
  def ll_stringslice_minusone(s1):
      """Return the slice of 's1' that leaves out its last character."""
      return LLHelpers._ll_stringslice(s1, 0, len(s1.chars) - 1)
  @staticmethod
  def ll_split_chr(LIST, s, c, max):
      """Split 's' at the character 'c' into a new LIST of strings.

      At most 'max' splits are done, scanning from the left; a negative
      'max' means no limit.
      """
      chars = s.chars
      strlen = len(chars)

      # first pass: count the pieces
      count = 1
      scan = 0
      if max == 0:
          scan = strlen
      while scan < strlen:
          if chars[scan] == c:
              count += 1
              if max >= 0 and count > max:
                  break
          scan += 1
      res = LIST.ll_newlist(count)
      items = res.ll_items()

      # second pass: copy the pieces out
      piece_start = 0
      resindex = 0
      scan = 0
      if max == 0:
          scan = strlen
      while scan < strlen:
          if chars[scan] == c:
              item = items[resindex] = s.malloc(scan - piece_start)
              item.copy_contents(s, item, piece_start, 0, scan - piece_start)
              resindex += 1
              piece_start = scan + 1
              if max >= 0 and resindex >= max:
                  # split limit reached: the rest goes into the last piece
                  scan = strlen
                  break
          scan += 1
      item = items[resindex] = s.malloc(scan - piece_start)
      item.copy_contents(s, item, piece_start, 0, scan - piece_start)
      return res
  @staticmethod
  def ll_split(LIST, s, c, max):
      """Split 's' at the separator string 'c' into a new LIST of strings.

      At most 'max' splits are done, scanning from the left; a 'max' of -1
      means no limit.  When 'c' does not occur, the only item is 's' itself.
      """
      last = len(s.chars)
      markerlen = len(c.chars)
      if max == -1:
          max = last

      # first pass: count the pieces
      count = 1
      pos = s.find(c, 0, last)
      while pos >= 0 and count <= max:
          pos = s.find(c, pos + markerlen, last)
          count += 1
      res = LIST.ll_newlist(count)
      items = res.ll_items()

      # second pass: copy the pieces out
      pos = s.find(c, 0, last)
      if pos < 0:
          items[0] = s
          return res
      filled = 0
      prev_pos = 0
      while pos >= 0 and filled < max:
          item = items[filled] = s.malloc(pos - prev_pos)
          item.copy_contents(s, item, prev_pos, 0, pos - prev_pos)
          filled += 1
          prev_pos = pos + markerlen
          pos = s.find(c, pos + markerlen, last)
      item = items[filled] = s.malloc(last - prev_pos)
      item.copy_contents(s, item, prev_pos, 0, last - prev_pos)
      return res
  @staticmethod
  def ll_rsplit_chr(LIST, s, c, max):
      """Split 's' at the character 'c' into a new LIST of strings.

      At most 'max' splits are done, scanning from the right; a negative
      'max' means no limit.
      """
      chars = s.chars
      strlen = len(chars)

      # first pass: count the pieces
      count = 1
      scan = 0
      if max == 0:
          scan = strlen
      while scan < strlen:
          if chars[scan] == c:
              count += 1
              if max >= 0 and count > max:
                  break
          scan += 1
      res = LIST.ll_newlist(count)
      items = res.ll_items()

      # second pass: fill the list from its last item backwards
      piece_end = strlen
      scan = strlen
      resindex = count - 1
      assert resindex >= 0
      if max == 0:
          scan = 0
      while scan > 0:
          scan -= 1
          if chars[scan] == c:
              item = items[resindex] = s.malloc(piece_end - scan - 1)
              item.copy_contents(s, item, scan + 1, 0, piece_end - scan - 1)
              resindex -= 1
              piece_end = scan
              if resindex == 0:
                  # only the first item is left: it takes the whole rest
                  scan = 0
                  break
      item = items[resindex] = s.malloc(piece_end - scan)
      item.copy_contents(s, item, scan, 0, piece_end - scan)
      return res
  @staticmethod
  def ll_rsplit(LIST, s, c, max):
      """Split 's' at the separator string 'c' into a new LIST of strings.

      At most 'max' splits are done, scanning from the right; a 'max' of -1
      means no limit.  When 'c' does not occur, the only item is 's' itself.
      """
      strlen = len(s.chars)
      markerlen = len(c.chars)
      if max == -1:
          max = strlen

      # First pass: count the pieces.  Each search must end at the start of
      # the previous match ('pos'), exactly like the second pass below does.
      # Ending at 'pos - markerlen' misses a separator that sits right next
      # to the previous one, so the list is too short and the leftover
      # separators end up inside the first item.
      count = 1
      pos = s.rfind(c, 0, strlen)
      while pos >= 0 and count <= max:
          pos = s.rfind(c, 0, pos)
          count += 1
      res = LIST.ll_newlist(count)
      items = res.ll_items()

      # Second pass: fill the list from its last item backwards.
      prev_pos = strlen
      pos = s.rfind(c, 0, strlen)
      if pos < 0:
          items[0] = s
          return res
      count -= 1
      while pos >= 0 and count > 0:
          item = items[count] = s.malloc(prev_pos - pos - markerlen)
          item.copy_contents(s, item, pos + markerlen, 0,
                             prev_pos - pos - markerlen)
          count -= 1
          prev_pos = pos
          pos = s.rfind(c, 0, pos)
      item = items[count] = s.malloc(prev_pos)
      item.copy_contents(s, item, 0, 0, prev_pos)
      return res
  @staticmethod
  @jit.elidable
  def ll_replace_chr_chr(s, c1, c2):
      """Return a copy of 's' in which every 'c1' is replaced by 'c2'."""
      size = len(s.chars)
      result = s.malloc(size)
      source = s.chars
      target = result.chars
      for index in range(size):
          ch = source[index]
          if ch == c1:
              ch = c2
          target[index] = ch
      return result
  @staticmethod
  @jit.elidable
  def ll_contains(s, c):
      """Return True iff the character 'c' occurs in 's'."""
      chars = s.chars
      for index in range(len(chars)):
          if chars[index] == c:
              return True
      return False
  @staticmethod
  @jit.elidable
  def ll_int(s, base):
      """Parse 's' as an integer written in 'base' (2 to 36).

      Accepted layout: spaces, an optional '+' or '-', spaces, at least one
      digit, spaces.  Raises ValueError for a bad base or any other layout.
      """
      if not 2 <= base <= 36:
          raise ValueError
      chars = s.chars
      strlen = len(chars)
      pos = 0
      #XXX: only space is allowed as white space for now
      while pos < strlen and chars[pos] == ' ':
          pos += 1
      if not pos < strlen:
          raise ValueError
      #check sign
      sign = 1
      if chars[pos] == '-':
          sign = -1
          pos += 1
      elif chars[pos] == '+':
          pos += 1
      # skip whitespaces between sign and digits
      while pos < strlen and chars[pos] == ' ':
          pos += 1
      #now get digits
      val = 0
      digits_start = pos
      while pos < strlen:
          code = ord(chars[pos])
          if ord('a') <= code <= ord('z'):
              digit = code - ord('a') + 10
          elif ord('A') <= code <= ord('Z'):
              digit = code - ord('A') + 10
          elif ord('0') <= code <= ord('9'):
              digit = code - ord('0')
          else:
              break
          if digit >= base:
              break
          val = val * base + digit
          pos += 1
      if pos == digits_start:
          raise ValueError # catch strings like '+' and '+  '
      #skip trailing whitespace
      while pos < strlen and chars[pos] == ' ':
          pos += 1
      if not pos == strlen:
          raise ValueError
      return sign * val
  1010. # interface to build strings:
  1011. # x = ll_build_start(n)
  1012. # ll_build_push(x, next_string, 0)
  1013. # ll_build_push(x, next_string, 1)
  1014. # ...
  1015. # ll_build_push(x, next_string, n-1)
  1016. # s = ll_build_finish(x)
@staticmethod
def ll_build_start(parts_count):
    """Allocate the TEMP array that will receive `parts_count` string
    parts (first step of the ll_build_* interface)."""
    return malloc(TEMP, parts_count)
@staticmethod
def ll_build_push(builder, next_string, index):
    """Store `next_string` at position `index` of the `builder` array."""
    builder[index] = next_string
@staticmethod
def ll_build_finish(builder):
    """Produce the final string from `builder` by handing its length and
    the array itself to LLHelpers.ll_join_strs()."""
    return LLHelpers.ll_join_strs(len(builder), builder)
@staticmethod
@specialize.memo()
def ll_constant(s):
    """Return the result of string_repr.convert_const(s).

    NOTE(review): with specialize.memo() this is presumably evaluated
    once per constant `s` at translation time -- confirm against the
    rpython.rlib.objectmodel documentation.
    """
    return string_repr.convert_const(s)
@staticmethod
@specialize.memo()
def ll_constant_unicode(s):
    """Return the result of unicode_repr.convert_const(s).

    NOTE(review): with specialize.memo() this is presumably evaluated
    once per constant `s` at translation time -- confirm against the
    rpython.rlib.objectmodel documentation.
    """
    return unicode_repr.convert_const(s)
@classmethod
def do_stringformat(cls, hop, sourcevarsrepr):
    """Generate the low-level operations for `fmt % args`.

    The format string (hop.args_s[0]) must be a constant.  It is split by
    cls.parse_fmt_string() into "things": tuples describe a format
    specifier (the conversion code is thing[0]), anything else is a
    literal piece of text.  Every thing is turned into one string chunk,
    the chunks are stored in a freshly allocated GC array, and the result
    is a call to cls.ll_join_strs() on that array.

    `sourcevarsrepr` is iterated once and must yield one
    (variable, repr) pair per format specifier.

    Raises TyperError if an argument's repr has no `ll_str` attribute or
    if the conversion code is not one of the supported ones
    (s, d, f, x, o, and r for instances only).
    """
    s_str = hop.args_s[0]
    assert s_str.is_constant()
    is_unicode = isinstance(s_str, annmodel.SomeUnicodeString)
    # pick the array type matching the kind of string being built
    if is_unicode:
        TEMPBUF = TEMP_UNICODE
    else:
        TEMPBUF = TEMP
    s = s_str.const
    things = cls.parse_fmt_string(s)
    size = inputconst(Signed, len(things)) # could be unsigned?
    cTEMP = inputconst(Void, TEMPBUF)
    cflags = inputconst(Void, {'flavor': 'gc'})
    # array with one slot per thing, filled in by the loop below
    vtemp = hop.genop("malloc_varsize", [cTEMP, cflags, size],
                      resulttype=Ptr(TEMPBUF))

    argsiter = iter(sourcevarsrepr)

    from rpython.rtyper.rclass import InstanceRepr
    for i, thing in enumerate(things):
        if isinstance(thing, tuple):
            # a format specifier: consume the next argument
            code = thing[0]
            vitem, r_arg = argsiter.next()
            if not hasattr(r_arg, 'll_str'):
                raise TyperError("ll_str unsupported for: %r" % r_arg)
            if code == 's':
                if is_unicode:
                    # only UniCharRepr and UnicodeRepr has it so far
                    vchunk = hop.gendirectcall(r_arg.ll_unicode, vitem)
                else:
                    vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
            elif code == 'r' and isinstance(r_arg, InstanceRepr):
                # %r is only accepted for instances; for any other repr
                # the code falls through to the TyperError below
                vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
            elif code == 'd':
                assert isinstance(r_arg, IntegerRepr)
                #vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
                vchunk = hop.gendirectcall(ll_str.ll_int2dec, vitem)
            elif code == 'f':
                #assert isinstance(r_arg, FloatRepr)
                vchunk = hop.gendirectcall(r_arg.ll_str, vitem)
            elif code == 'x':
                assert isinstance(r_arg, IntegerRepr)
                vchunk = hop.gendirectcall(ll_str.ll_int2hex, vitem,
                                           inputconst(Bool, False))
            elif code == 'o':
                assert isinstance(r_arg, IntegerRepr)
                vchunk = hop.gendirectcall(ll_str.ll_int2oct, vitem,
                                           inputconst(Bool, False))
            else:
                raise TyperError("%%%s is not RPython" % (code,))
        else:
            # a literal piece of the format string
            if is_unicode:
                vchunk = inputconst(unicode_repr, thing)
            else:
                vchunk = inputconst(string_repr, thing)
        # from here on `i` is the low-level constant for the array index
        i = inputconst(Signed, i)
        if is_unicode and vchunk.concretetype != Ptr(UNICODE):
            # if we are here, one of the ll_str.* functions returned some
            # STR, so we convert it to unicode. It's a bit suboptimal
            # because we do one extra copy.
            vchunk = hop.gendirectcall(cls.ll_str2unicode, vchunk)
        hop.genop('setarrayitem', [vtemp, i, vchunk])

    hop.exception_cannot_occur()   # to ignore the ZeroDivisionError of '%'
    return hop.gendirectcall(cls.ll_join_strs, size, vtemp)
@staticmethod
@jit.dont_look_inside
def ll_string2list(RESLIST, src):
    """Build a new RESLIST list whose items are the characters of `src`.

    The characters are copied with a single raw_memcopy() from the
    `chars` array of `src` into the items array of the new list.
    """
    length = len(src.chars)
    lst = RESLIST.ll_newlist(length)
    dst = lst.ll_items()
    SRC = typeOf(src).TO     # STR or UNICODE
    DST = typeOf(dst).TO     # GcArray
    # the byte count below is computed from DST.OF, so source and
    # destination must have the very same item type
    assert DST.OF is SRC.chars.OF
    # from here, no GC operations can happen
    # address of the first character of src
    asrc = llmemory.cast_ptr_to_adr(src) + (
        llmemory.offsetof(SRC, 'chars') +
        llmemory.itemoffsetof(SRC.chars, 0))
    # address of the first item of dst
    adst = llmemory.cast_ptr_to_adr(dst) + llmemory.itemoffsetof(DST, 0)
    llmemory.raw_memcopy(asrc, adst, llmemory.sizeof(DST.OF) * length)
    # end of "no GC" section
    # NOTE(review): presumably these keep src and dst alive while only
    # their raw addresses are in use above -- confirm against the
    # keepalive_until_here documentation
    keepalive_until_here(src)
    keepalive_until_here(dst)
    return lst
# Arrays of string pointers, used as buffers for the parts that are about
# to be joined (see ll_build_start() and do_stringformat()).
TEMP = GcArray(Ptr(STR))
TEMP_UNICODE = GcArray(Ptr(UNICODE))

# ____________________________________________________________

# Resolve the STR forward reference declared at the top of the file:
# a 'hash' field followed by the array of characters.
# NOTE(review): 'extra_item_after_alloc' presumably reserves room for one
# extra character after the array (not set for UNICODE) -- confirm.
STR.become(GcStruct('rpy_string', ('hash', Signed),
                    ('chars', Array(Char, hints={'immutable': True,
                                    'extra_item_after_alloc': 1})),
                    adtmeths={'malloc' : staticAdtMethod(mallocstr),
                              'empty'  : staticAdtMethod(emptystrfun),
                              'copy_contents' : staticAdtMethod(copy_string_contents),
                              'copy_contents_from_str' : staticAdtMethod(copy_string_contents),
                              'gethash': LLHelpers.ll_strhash,
                              'length': LLHelpers.ll_length,
                              'find': LLHelpers.ll_find,
                              'rfind': LLHelpers.ll_rfind}))
# Same layout for UNICODE, with UniChar items.  Unlike STR, no 'find' and
# 'rfind' ADT methods are attached here.
UNICODE.become(GcStruct('rpy_unicode', ('hash', Signed),
                        ('chars', Array(UniChar, hints={'immutable': True})),
                        adtmeths={'malloc' : staticAdtMethod(mallocunicode),
                                  'empty'  : staticAdtMethod(emptyunicodefun),
                                  'copy_contents' : staticAdtMethod(copy_unicode_contents),
                                  'copy_contents_from_str' : staticAdtMethod(copy_unicode_contents),
                                  'gethash': LLHelpers.ll_strhash,
                                  'length': LLHelpers.ll_length}
                        ))


# TODO: make the public interface of the rstr module cleaner
# module-level aliases for some LLHelpers entry points
ll_strconcat = LLHelpers.ll_strconcat
ll_join = LLHelpers.ll_join
ll_str2unicode = LLHelpers.ll_str2unicode
do_stringformat = LLHelpers.do_stringformat

# the repr instances created by this module
string_repr = StringRepr()
char_repr = CharRepr()
unichar_repr = UniCharRepr()
char_repr.ll = LLHelpers
unichar_repr.ll = LLHelpers
unicode_repr = UnicodeRepr()
emptystr = string_repr.convert_const("")
emptyunicode = unicode_repr.convert_const(u'')

# Cross-links between the repr classes and the instances above.
# Note that UniCharRepr.repr is the unicode *string* repr.
StringRepr.repr = string_repr
UnicodeRepr.repr = unicode_repr
UniCharRepr.repr = unicode_repr
UniCharRepr.char_repr = unichar_repr
UnicodeRepr.char_repr = unichar_repr
CharRepr.char_repr = char_repr
StringRepr.char_repr = char_repr
class BaseStringIteratorRepr(AbstractStringIteratorRepr):
    """Common base of the string and unicode iterator reprs.

    Exposes the module-level helpers ll_striter, ll_strnext and
    ll_getnextindex as attributes of the instance.
    """

    def __init__(self):
        self.ll_striter = ll_striter
        self.ll_strnext = ll_strnext
        self.ll_getnextindex = ll_getnextindex
class StringIteratorRepr(BaseStringIteratorRepr):
    """Iterator repr for STR strings; the items are typed by char_repr."""

    external_item_repr = char_repr
    # Iterator state: the string being walked, its length (stored once by
    # ll_striter) and the index of the next character to return.
    lowleveltype = Ptr(GcStruct('stringiter',
                                ('string', string_repr.lowleveltype),
                                ('length', Signed),
                                ('index', Signed)))
class UnicodeIteratorRepr(BaseStringIteratorRepr):
    """Iterator repr for UNICODE strings; the items are typed by
    unichar_repr."""

    external_item_repr = unichar_repr
    # Iterator state: the string being walked, its length (stored once by
    # ll_striter) and the index of the next character to return.
    lowleveltype = Ptr(GcStruct('unicodeiter',
                                ('string', unicode_repr.lowleveltype),
                                ('length', Signed),
                                ('index', Signed)))
  1176. def ll_striter(string):
  1177. if typeOf(string) == string_repr.lowleveltype:
  1178. TP = string_repr.iterator_repr.lowleveltype.TO
  1179. elif typeOf(string) == unicode_repr.lowleveltype:
  1180. TP = unicode_repr.iterator_repr.lowleveltype.TO
  1181. else:
  1182. raise TypeError("Unknown string type %s" % (typeOf(string),))
  1183. iter = malloc(TP)
  1184. iter.string = string
  1185. iter.length = len(string.chars) # load this value only once
  1186. iter.index = 0
  1187. return iter
  1188. def ll_strnext(iter):
  1189. index = iter.index
  1190. if index >= iter.length:
  1191. raise StopIteration
  1192. iter.index = index + 1
  1193. return iter.string.chars[index]
def ll_getnextindex(iter):
    """Return `iter.index`, the position of the character that the next
    ll_strnext() call would return."""
    return iter.index
# Attach the iterator reprs to the string reprs.  This has to happen after
# the iterator classes are defined, and those in turn need string_repr
# and unicode_repr to exist for their lowleveltype.
string_repr.iterator_repr = StringIteratorRepr()
unicode_repr.iterator_repr = UnicodeIteratorRepr()

# Prebuilt string constants.
# These should be in rclass, but circular imports prevent it (also, it's
# not that insane that a string constant is built in this file).
instance_str_prefix = string_repr.convert_const("<")
instance_str_infix  = string_repr.convert_const(" object at 0x")
instance_str_suffix = string_repr.convert_const(">")

null_str = string_repr.convert_const("NULL")

unboxed_instance_str_prefix = string_repr.convert_const("<unboxed ")
unboxed_instance_str_suffix = string_repr.convert_const(">")