PageRenderTime 62ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/rpython/rlib/rstring.py

https://bitbucket.org/pypy/pypy/
Python | 778 lines | 675 code | 79 blank | 24 comment | 155 complexity | 89c897309f0d8be59cf1f1f020c9018f MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. """ String builder interface and string functions
  2. """
  3. import sys
  4. from rpython.annotator.model import (SomeObject, SomeString, s_None, SomeChar,
  5. SomeInteger, SomeUnicodeCodePoint, SomeUnicodeString, SomePBC)
  6. from rpython.rtyper.llannotation import SomePtr
  7. from rpython.rlib import jit
  8. from rpython.rlib.objectmodel import newlist_hint, resizelist_hint, specialize
  9. from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH
  10. from rpython.rlib.buffer import Buffer
  11. from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
  12. from rpython.rtyper.extregistry import ExtRegistryEntry
  13. from rpython.tool.pairtype import pairtype
  14. # -------------- public API for string functions -----------------------
  15. @specialize.argtype(0)
  16. def _isspace(char):
  17. if isinstance(char, str):
  18. return char.isspace()
  19. else:
  20. assert isinstance(char, unicode)
  21. return unicodedb.isspace(ord(char))
  22. @specialize.argtype(0, 1)
  23. def split(value, by=None, maxsplit=-1):
  24. if by is None:
  25. length = len(value)
  26. i = 0
  27. res = []
  28. while True:
  29. # find the beginning of the next word
  30. while i < length:
  31. if not _isspace(value[i]):
  32. break # found
  33. i += 1
  34. else:
  35. break # end of string, finished
  36. # find the end of the word
  37. if maxsplit == 0:
  38. j = length # take all the rest of the string
  39. else:
  40. j = i + 1
  41. while j < length and not _isspace(value[j]):
  42. j += 1
  43. maxsplit -= 1 # NB. if it's already < 0, it stays < 0
  44. # the word is value[i:j]
  45. res.append(value[i:j])
  46. # continue to look from the character following the space after the word
  47. i = j + 1
  48. return res
  49. if isinstance(value, unicode):
  50. assert isinstance(by, unicode)
  51. if isinstance(value, str):
  52. assert isinstance(by, str)
  53. if isinstance(value, list):
  54. assert isinstance(by, str)
  55. bylen = len(by)
  56. if bylen == 0:
  57. raise ValueError("empty separator")
  58. start = 0
  59. if bylen == 1:
  60. # fast path: uses str.rfind(character) and str.count(character)
  61. by = by[0] # annotator hack: string -> char
  62. cnt = count(value, by, 0, len(value))
  63. if 0 <= maxsplit < cnt:
  64. cnt = maxsplit
  65. res = newlist_hint(cnt + 1)
  66. while cnt > 0:
  67. next = find(value, by, start, len(value))
  68. assert next >= 0 # cannot fail due to the value.count above
  69. res.append(value[start:next])
  70. start = next + bylen
  71. cnt -= 1
  72. res.append(value[start:len(value)])
  73. return res
  74. if maxsplit > 0:
  75. res = newlist_hint(min(maxsplit + 1, len(value)))
  76. else:
  77. res = []
  78. while maxsplit != 0:
  79. next = find(value, by, start, len(value))
  80. if next < 0:
  81. break
  82. assert start >= 0
  83. res.append(value[start:next])
  84. start = next + bylen
  85. maxsplit -= 1 # NB. if it's already < 0, it stays < 0
  86. res.append(value[start:len(value)])
  87. return res
  88. @specialize.argtype(0, 1)
  89. def rsplit(value, by=None, maxsplit=-1):
  90. if by is None:
  91. res = []
  92. i = len(value) - 1
  93. while True:
  94. # starting from the end, find the end of the next word
  95. while i >= 0:
  96. if not _isspace(value[i]):
  97. break # found
  98. i -= 1
  99. else:
  100. break # end of string, finished
  101. # find the start of the word
  102. # (more precisely, 'j' will be the space character before the word)
  103. if maxsplit == 0:
  104. j = -1 # take all the rest of the string
  105. else:
  106. j = i - 1
  107. while j >= 0 and not _isspace(value[j]):
  108. j -= 1
  109. maxsplit -= 1 # NB. if it's already < 0, it stays < 0
  110. # the word is value[j+1:i+1]
  111. j1 = j + 1
  112. assert j1 >= 0
  113. res.append(value[j1:i+1])
  114. # continue to look from the character before the space before the word
  115. i = j - 1
  116. res.reverse()
  117. return res
  118. if isinstance(value, unicode):
  119. assert isinstance(by, unicode)
  120. if isinstance(value, str):
  121. assert isinstance(by, str)
  122. if isinstance(value, list):
  123. assert isinstance(by, str)
  124. if maxsplit > 0:
  125. res = newlist_hint(min(maxsplit + 1, len(value)))
  126. else:
  127. res = []
  128. end = len(value)
  129. bylen = len(by)
  130. if bylen == 0:
  131. raise ValueError("empty separator")
  132. while maxsplit != 0:
  133. next = rfind(value, by, 0, end)
  134. if next < 0:
  135. break
  136. res.append(value[next + bylen:end])
  137. end = next
  138. maxsplit -= 1 # NB. if it's already < 0, it stays < 0
  139. res.append(value[:end])
  140. res.reverse()
  141. return res
@specialize.argtype(0, 1)
@jit.elidable
def replace(input, sub, by, maxsplit=-1):
    """Return `input` with occurrences of `sub` replaced by `by`.

    input    -- str, unicode or list; selects StringBuilder, UnicodeBuilder
                or ByteListBuilder respectively for building the result.
    sub      -- the sequence to look for.  When it is empty, `by` is
                inserted in front of the leading items of `input` and once
                more after the last of them.
    by       -- the replacement.
    maxsplit -- maximum number of replacements; 0 returns `input` itself
                unchanged, a negative value means no limit.

    The result size is computed up front with ovfcheck(); an OverflowError
    raised by that computation propagates to the caller.
    """
    # Pick the builder that matches the type of `input`.
    if isinstance(input, str):
        Builder = StringBuilder
    elif isinstance(input, unicode):
        Builder = UnicodeBuilder
    else:
        assert isinstance(input, list)
        Builder = ByteListBuilder
    if maxsplit == 0:
        return input

    if not sub:
        # Empty `sub`: `upper` is the number of leading items of `input`
        # that get a copy of `by` inserted in front of them.
        upper = len(input)
        if maxsplit > 0 and maxsplit < upper + 2:
            # one replacement is kept for the `by` appended after the loop
            upper = maxsplit - 1
            assert upper >= 0

        # NOTE(review): the "except OverflowError: raise" looks redundant in
        # plain Python; presumably it is there because the RPython toolchain
        # wants ovfcheck() to appear inside a try block -- confirm before
        # simplifying.
        try:
            result_size = ovfcheck(upper * len(by))
            result_size = ovfcheck(result_size + upper)
            result_size = ovfcheck(result_size + len(by))
            remaining_size = len(input) - upper
            result_size = ovfcheck(result_size + remaining_size)
        except OverflowError:
            raise
        builder = Builder(result_size)
        for i in range(upper):
            builder.append(by)
            builder.append(input[i])
        builder.append(by)
        builder.append_slice(input, upper, len(input))
    else:
        # First compute the exact result size
        cnt = count(input, sub, 0, len(input))
        if cnt > maxsplit and maxsplit > 0:
            cnt = maxsplit
        diff_len = len(by) - len(sub)
        # NOTE(review): same try/except-raise pattern as above -- confirm.
        try:
            result_size = ovfcheck(diff_len * cnt)
            result_size = ovfcheck(result_size + len(input))
        except OverflowError:
            raise

        builder = Builder(result_size)
        start = 0
        sublen = len(sub)

        # Copy the text between matches, emitting `by` in place of each match.
        while maxsplit != 0:
            next = find(input, sub, start, len(input))
            if next < 0:
                break
            builder.append_slice(input, start, next)
            builder.append(by)
            start = next + sublen
            maxsplit -= 1 # NB. if it's already < 0, it stays < 0

        # whatever follows the last replaced match is copied unchanged
        builder.append_slice(input, start, len(input))

    return builder.build()
  197. def _normalize_start_end(length, start, end):
  198. if start < 0:
  199. start += length
  200. if start < 0:
  201. start = 0
  202. if end < 0:
  203. end += length
  204. if end < 0:
  205. end = 0
  206. elif end > length:
  207. end = length
  208. return start, end
  209. @specialize.argtype(0, 1)
  210. @jit.elidable
  211. def startswith(u_self, prefix, start=0, end=sys.maxint):
  212. length = len(u_self)
  213. start, end = _normalize_start_end(length, start, end)
  214. stop = start + len(prefix)
  215. if stop > end:
  216. return False
  217. for i in range(len(prefix)):
  218. if u_self[start+i] != prefix[i]:
  219. return False
  220. return True
  221. @specialize.argtype(0, 1)
  222. @jit.elidable
  223. def endswith(u_self, suffix, start=0, end=sys.maxint):
  224. length = len(u_self)
  225. start, end = _normalize_start_end(length, start, end)
  226. begin = end - len(suffix)
  227. if begin < start:
  228. return False
  229. for i in range(len(suffix)):
  230. if u_self[begin+i] != suffix[i]:
  231. return False
  232. return True
  233. @specialize.argtype(0, 1)
  234. def find(value, other, start, end):
  235. if ((isinstance(value, str) and isinstance(other, str)) or
  236. (isinstance(value, unicode) and isinstance(other, unicode))):
  237. return value.find(other, start, end)
  238. return _search(value, other, start, end, SEARCH_FIND)
  239. @specialize.argtype(0, 1)
  240. def rfind(value, other, start, end):
  241. if ((isinstance(value, str) and isinstance(other, str)) or
  242. (isinstance(value, unicode) and isinstance(other, unicode))):
  243. return value.rfind(other, start, end)
  244. return _search(value, other, start, end, SEARCH_RFIND)
  245. @specialize.argtype(0, 1)
  246. def count(value, other, start, end):
  247. if ((isinstance(value, str) and isinstance(other, str)) or
  248. (isinstance(value, unicode) and isinstance(other, unicode))):
  249. return value.count(other, start, end)
  250. return _search(value, other, start, end, SEARCH_COUNT)
  251. # -------------- substring searching helper ----------------
  252. # XXX a lot of code duplication with lltypesystem.rstr :-(
# Values for the `mode` argument of _search().
SEARCH_COUNT = 0    # return the number of matches found
SEARCH_FIND = 1     # scan left to right, return the first match index or -1
SEARCH_RFIND = 2    # scan right to left, return the first match index or -1
  256. def bloom_add(mask, c):
  257. return mask | (1 << (ord(c) & (BLOOM_WIDTH - 1)))
  258. def bloom(mask, c):
  259. return mask & (1 << (ord(c) & (BLOOM_WIDTH - 1)))
@specialize.argtype(0, 1)
def _search(value, other, start, end, mode):
    """Find or count occurrences of `other` inside value[start:end].

    This is the fallback used by find(), rfind() and count() when they do
    not delegate to the built-in string methods.

    start, end -- search window; `start` is clamped to be >= 0 and `end` to
                  be <= len(value).  Negative values are NOT interpreted
                  relative to the end of `value` here.
    mode       -- SEARCH_COUNT, SEARCH_FIND or SEARCH_RFIND.

    Returns the number of matches in SEARCH_COUNT mode; otherwise the index
    of the match that was found, or -1 if there is none.  For an empty
    `other` the result is `end - start + 1` (count), `end` (rfind) or
    `start` (find).
    """
    if start < 0:
        start = 0
    if end > len(value):
        end = len(value)
    if start > end:
        # empty window: nothing can match
        if mode == SEARCH_COUNT:
            return 0
        return -1

    # NB. this local shadows the module-level count() function
    count = 0
    n = end - start          # size of the search window
    m = len(other)           # size of the pattern

    if m == 0:
        if mode == SEARCH_COUNT:
            return end - start + 1
        elif mode == SEARCH_RFIND:
            return end
        else:
            return start

    # number of extra candidate positions beyond the first one; negative
    # when the pattern is longer than the window
    w = n - m

    if w < 0:
        if mode == SEARCH_COUNT:
            return 0
        return -1

    mlast = m - 1
    skip = mlast - 1
    mask = 0

    if mode != SEARCH_RFIND:
        # Forward scan.  Build the bit mask of all pattern characters, and
        # compute `skip`, the shift applied after a candidate whose last
        # character matched but whose full comparison failed.
        for i in range(mlast):
            mask = bloom_add(mask, other[i])
            if other[i] == other[mlast]:
                skip = mlast - i - 1
        mask = bloom_add(mask, other[mlast])

        i = start - 1
        while i + 1 <= start + w:
            i += 1
            # compare the last character of the candidate first
            if value[i + m - 1] == other[m - 1]:
                for j in range(mlast):
                    if value[i + j] != other[j]:
                        break
                else:
                    # full match at index i
                    if mode != SEARCH_COUNT:
                        return i
                    count += 1
                    # jump over the match (the loop adds 1 more)
                    i += mlast
                    continue

                # mismatch: look at the character just after the candidate
                # ('\0' when there is none) to decide how far to shift
                if i + m < len(value):
                    c = value[i + m]
                else:
                    c = '\0'
                if not bloom(mask, c):
                    # that character is not in the pattern at all
                    i += m
                else:
                    i += skip
            else:
                if i + m < len(value):
                    c = value[i + m]
                else:
                    c = '\0'
                if not bloom(mask, c):
                    i += m
    else:
        # Backward scan: same idea as above, mirrored, anchored on the
        # first character of the pattern.
        mask = bloom_add(mask, other[0])
        for i in range(mlast, 0, -1):
            mask = bloom_add(mask, other[i])
            if other[i] == other[0]:
                skip = i - 1
        i = start + w + 1
        while i - 1 >= start:
            i -= 1
            if value[i] == other[0]:
                for j in xrange(mlast, 0, -1):
                    if value[i + j] != other[j]:
                        break
                else:
                    # full match at index i
                    return i
                # mismatch: the character just before the candidate decides
                # how far to shift
                if i - 1 >= 0 and not bloom(mask, value[i - 1]):
                    i -= m
                else:
                    i -= skip
            else:
                if i - 1 >= 0 and not bloom(mask, value[i - 1]):
                    i -= m

    if mode != SEARCH_COUNT:
        return -1
    return count
  347. # -------------- numeric parsing support --------------------
  348. def strip_spaces(s):
  349. # XXX this is not locale-dependent
  350. p = 0
  351. q = len(s)
  352. while p < q and s[p] in ' \f\n\r\t\v':
  353. p += 1
  354. while p < q and s[q-1] in ' \f\n\r\t\v':
  355. q -= 1
  356. assert q >= p # annotator hint, don't remove
  357. return s[p:q]
class ParseStringError(Exception):
    """Error raised by the number-parsing helpers of this module.

    The message given at construction is stored in the `msg` attribute.
    """
    def __init__(self, msg):
        self.msg = msg
class InvalidBaseError(ParseStringError):
    """Signals an invalid base argument.

    Raised by NumberStringParser when a non-zero base is outside 2..36.
    """
class ParseStringOverflowError(Exception):
    """Exception carrying a parser object in its `parser` attribute.

    NOTE(review): nothing in the visible part of this module raises it;
    presumably callers raise it when the parsed number overflows -- confirm.
    """
    def __init__(self, parser):
        self.parser = parser
  366. # iterator-like class
  367. class NumberStringParser:
  368. def error(self):
  369. raise ParseStringError("invalid literal for %s() with base %d" %
  370. (self.fname, self.original_base))
  371. def __init__(self, s, literal, base, fname):
  372. self.fname = fname
  373. sign = 1
  374. if s.startswith('-'):
  375. sign = -1
  376. s = strip_spaces(s[1:])
  377. elif s.startswith('+'):
  378. s = strip_spaces(s[1:])
  379. self.sign = sign
  380. self.original_base = base
  381. if base == 0:
  382. if s.startswith('0x') or s.startswith('0X'):
  383. base = 16
  384. elif s.startswith('0b') or s.startswith('0B'):
  385. base = 2
  386. elif s.startswith('0'): # also covers the '0o' case
  387. base = 8
  388. else:
  389. base = 10
  390. elif base < 2 or base > 36:
  391. raise InvalidBaseError("%s() base must be >= 2 and <= 36" % fname)
  392. self.base = base
  393. if base == 16 and (s.startswith('0x') or s.startswith('0X')):
  394. s = s[2:]
  395. if base == 8 and (s.startswith('0o') or s.startswith('0O')):
  396. s = s[2:]
  397. if base == 2 and (s.startswith('0b') or s.startswith('0B')):
  398. s = s[2:]
  399. if not s:
  400. self.error()
  401. self.s = s
  402. self.n = len(s)
  403. self.i = 0
  404. def rewind(self):
  405. self.i = 0
  406. def next_digit(self): # -1 => exhausted
  407. if self.i < self.n:
  408. c = self.s[self.i]
  409. digit = ord(c)
  410. if '0' <= c <= '9':
  411. digit -= ord('0')
  412. elif 'A' <= c <= 'Z':
  413. digit = (digit - ord('A')) + 10
  414. elif 'a' <= c <= 'z':
  415. digit = (digit - ord('a')) + 10
  416. else:
  417. self.error()
  418. if digit >= self.base:
  419. self.error()
  420. self.i += 1
  421. return digit
  422. else:
  423. return -1
  424. def prev_digit(self):
  425. # After exhausting all n digits in next_digit(), you can walk them
  426. # again in reverse order by calling prev_digit() exactly n times
  427. i = self.i - 1
  428. assert i >= 0
  429. self.i = i
  430. c = self.s[i]
  431. digit = ord(c)
  432. if '0' <= c <= '9':
  433. digit -= ord('0')
  434. elif 'A' <= c <= 'Z':
  435. digit = (digit - ord('A')) + 10
  436. elif 'a' <= c <= 'z':
  437. digit = (digit - ord('a')) + 10
  438. else:
  439. raise AssertionError
  440. return digit
  441. # -------------- public API ---------------------------------
  442. INIT_SIZE = 100 # XXX tweak
  443. class AbstractStringBuilder(object):
  444. # This is not the real implementation!
  445. def __init__(self, init_size=INIT_SIZE):
  446. "NOT_RPYTHON"
  447. self._l = []
  448. self._size = 0
  449. def _grow(self, size):
  450. "NOT_RPYTHON"
  451. self._size += size
  452. def append(self, s):
  453. "NOT_RPYTHON"
  454. assert isinstance(s, self._tp)
  455. self._l.append(s)
  456. self._grow(len(s))
  457. def append_slice(self, s, start, end):
  458. "NOT_RPYTHON"
  459. assert isinstance(s, self._tp)
  460. assert 0 <= start <= end <= len(s)
  461. s = s[start:end]
  462. self._l.append(s)
  463. self._grow(len(s))
  464. def append_multiple_char(self, c, times):
  465. "NOT_RPYTHON"
  466. assert isinstance(c, self._tp)
  467. self._l.append(c * times)
  468. self._grow(times)
  469. def append_charpsize(self, s, size):
  470. "NOT_RPYTHON"
  471. assert size >= 0
  472. l = []
  473. for i in xrange(size):
  474. l.append(s[i])
  475. self._l.append(self._tp("").join(l))
  476. self._grow(size)
  477. def build(self):
  478. "NOT_RPYTHON"
  479. result = self._tp("").join(self._l)
  480. assert len(result) == self._size
  481. self._l = [result]
  482. return result
  483. def getlength(self):
  484. "NOT_RPYTHON"
  485. return self._size
class StringBuilder(AbstractStringBuilder):
    # Builder for `str`: `_tp` is the type that AbstractStringBuilder
    # checks appended pieces against and uses to join them in build().
    _tp = str
class UnicodeBuilder(AbstractStringBuilder):
    # Builder for `unicode`: `_tp` is the type that AbstractStringBuilder
    # checks appended pieces against and uses to join them in build().
    _tp = unicode
  490. class ByteListBuilder(object):
  491. def __init__(self, init_size=INIT_SIZE):
  492. assert init_size >= 0
  493. self.l = newlist_hint(init_size)
  494. @specialize.argtype(1)
  495. def append(self, s):
  496. l = self.l
  497. for c in s:
  498. l.append(c)
  499. @specialize.argtype(1)
  500. def append_slice(self, s, start, end):
  501. l = self.l
  502. for i in xrange(start, end):
  503. l.append(s[i])
  504. def append_multiple_char(self, c, times):
  505. assert isinstance(c, str)
  506. self.l.extend([c[0]] * times)
  507. def append_charpsize(self, s, size):
  508. assert size >= 0
  509. l = self.l
  510. for i in xrange(size):
  511. l.append(s[i])
  512. def build(self):
  513. return self.l
  514. def getlength(self):
  515. return len(self.l)
  516. # ------------------------------------------------------------
  517. # ----------------- implementation details -------------------
  518. # ------------------------------------------------------------
  519. class SomeStringBuilder(SomeObject):
  520. def method_append(self, s_str):
  521. if s_str != s_None:
  522. assert isinstance(s_str, (SomeString, SomeChar))
  523. return s_None
  524. def method_append_slice(self, s_str, s_start, s_end):
  525. if s_str != s_None:
  526. assert isinstance(s_str, SomeString)
  527. assert isinstance(s_start, SomeInteger)
  528. assert isinstance(s_end, SomeInteger)
  529. return s_None
  530. def method_append_multiple_char(self, s_char, s_times):
  531. assert isinstance(s_char, SomeChar)
  532. assert isinstance(s_times, SomeInteger)
  533. return s_None
  534. def method_append_charpsize(self, s_ptr, s_size):
  535. assert isinstance(s_ptr, SomePtr)
  536. assert isinstance(s_size, SomeInteger)
  537. return s_None
  538. def method_getlength(self):
  539. return SomeInteger(nonneg=True)
  540. def method_build(self):
  541. return SomeString()
  542. def rtyper_makerepr(self, rtyper):
  543. from rpython.rtyper.lltypesystem.rbuilder import stringbuilder_repr
  544. return stringbuilder_repr
  545. def rtyper_makekey(self):
  546. return self.__class__,
  547. def noneify(self):
  548. return self
  549. class SomeUnicodeBuilder(SomeObject):
  550. def method_append(self, s_str):
  551. if s_str != s_None:
  552. assert isinstance(s_str, (SomeUnicodeCodePoint, SomeUnicodeString))
  553. return s_None
  554. def method_append_slice(self, s_str, s_start, s_end):
  555. if s_str != s_None:
  556. assert isinstance(s_str, SomeUnicodeString)
  557. assert isinstance(s_start, SomeInteger)
  558. assert isinstance(s_end, SomeInteger)
  559. return s_None
  560. def method_append_multiple_char(self, s_char, s_times):
  561. assert isinstance(s_char, SomeUnicodeCodePoint)
  562. assert isinstance(s_times, SomeInteger)
  563. return s_None
  564. def method_append_charpsize(self, s_ptr, s_size):
  565. assert isinstance(s_ptr, SomePtr)
  566. assert isinstance(s_size, SomeInteger)
  567. return s_None
  568. def method_getlength(self):
  569. return SomeInteger(nonneg=True)
  570. def method_build(self):
  571. return SomeUnicodeString()
  572. def rtyper_makerepr(self, rtyper):
  573. from rpython.rtyper.lltypesystem.rbuilder import unicodebuilder_repr
  574. return unicodebuilder_repr
  575. def rtyper_makekey(self):
  576. return self.__class__,
  577. def noneify(self):
  578. return self
  579. class BaseEntry(object):
  580. def compute_result_annotation(self, s_init_size=None):
  581. if s_init_size is not None:
  582. assert isinstance(s_init_size, SomeInteger)
  583. if self.use_unicode:
  584. return SomeUnicodeBuilder()
  585. return SomeStringBuilder()
  586. def specialize_call(self, hop):
  587. return hop.r_result.rtyper_new(hop)
class StringBuilderEntry(BaseEntry, ExtRegistryEntry):
    # Registry entry about StringBuilder itself; with use_unicode False,
    # BaseEntry.compute_result_annotation() yields SomeStringBuilder.
    _about_ = StringBuilder
    use_unicode = False
class UnicodeBuilderEntry(BaseEntry, ExtRegistryEntry):
    # Registry entry about UnicodeBuilder itself; with use_unicode True,
    # BaseEntry.compute_result_annotation() yields SomeUnicodeBuilder.
    _about_ = UnicodeBuilder
    use_unicode = True
class __extend__(pairtype(SomeStringBuilder, SomeStringBuilder)):
    # Union of two SomeStringBuilder annotations: the first one is
    # returned as is.  (The tuple parameter is Python 2 syntax.)
    def union((obj1, obj2)):
        return obj1
class __extend__(pairtype(SomeUnicodeBuilder, SomeUnicodeBuilder)):
    # Union of two SomeUnicodeBuilder annotations: the first one is
    # returned as is.  (The tuple parameter is Python 2 syntax.)
    def union((obj1, obj2)):
        return obj1
class PrebuiltStringBuilderEntry(ExtRegistryEntry):
    # Registered by type (`_type_`) for StringBuilder; such objects are
    # annotated as SomeStringBuilder.
    # NOTE(review): going by the class name this covers prebuilt
    # instances -- confirm against ExtRegistryEntry.
    _type_ = StringBuilder

    def compute_annotation(self):
        return SomeStringBuilder()
class PrebuiltUnicodeBuilderEntry(ExtRegistryEntry):
    # Registered by type (`_type_`) for UnicodeBuilder; such objects are
    # annotated as SomeUnicodeBuilder.
    # NOTE(review): going by the class name this covers prebuilt
    # instances -- confirm against ExtRegistryEntry.
    _type_ = UnicodeBuilder

    def compute_annotation(self):
        return SomeUnicodeBuilder()
  608. #___________________________________________________________________
  609. # Support functions for SomeString.no_nul
  610. def assert_str0(fname):
  611. assert '\x00' not in fname, "NUL byte in string"
  612. return fname
  613. class Entry(ExtRegistryEntry):
  614. _about_ = assert_str0
  615. def compute_result_annotation(self, s_obj):
  616. if s_None.contains(s_obj):
  617. return s_obj
  618. assert isinstance(s_obj, (SomeString, SomeUnicodeString))
  619. if s_obj.no_nul:
  620. return s_obj
  621. new_s_obj = SomeObject.__new__(s_obj.__class__)
  622. new_s_obj.__dict__ = s_obj.__dict__.copy()
  623. new_s_obj.no_nul = True
  624. return new_s_obj
  625. def specialize_call(self, hop):
  626. hop.exception_cannot_occur()
  627. return hop.inputarg(hop.args_r[0], arg=0)
  628. def check_str0(fname):
  629. """A 'probe' to trigger a failure at translation time, if the
  630. string was not proved to not contain NUL characters."""
  631. assert '\x00' not in fname, "NUL byte in string"
  632. class Entry(ExtRegistryEntry):
  633. _about_ = check_str0
  634. def compute_result_annotation(self, s_obj):
  635. if not isinstance(s_obj, (SomeString, SomeUnicodeString)):
  636. return s_obj
  637. if not s_obj.no_nul:
  638. raise ValueError("Value is not no_nul")
  639. def specialize_call(self, hop):
  640. hop.exception_cannot_occur()