PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/pypy/module/_codecs/interp_codecs.py

https://bitbucket.org/pypy/pypy/
Python | 701 lines | 635 code | 35 blank | 31 comment | 42 complexity | be99c439bcaff3af53948a92e935c3e7 MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. from rpython.rlib import jit
  2. from rpython.rlib.objectmodel import we_are_translated
  3. from rpython.rlib.rstring import UnicodeBuilder
  4. from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
  5. from pypy.interpreter.error import OperationError, oefmt
  6. from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
  7. class VersionTag(object):
  8. pass
  9. class CodecState(object):
  10. _immutable_fields_ = ["version?"]
  11. def __init__(self, space):
  12. self.codec_search_path = []
  13. self.codec_search_cache = {}
  14. self.codec_error_registry = {}
  15. self.codec_need_encodings = True
  16. self.decode_error_handler = self.make_decode_errorhandler(space)
  17. self.encode_error_handler = self.make_encode_errorhandler(space)
  18. self.unicodedata_handler = None
  19. self.modified()
  20. def _make_errorhandler(self, space, decode):
  21. def call_errorhandler(errors, encoding, reason, input, startpos,
  22. endpos):
  23. """Generic wrapper for calling into error handlers.
  24. Returns (unicode_or_none, str_or_none, newpos) as error
  25. handlers may return unicode or on Python 3, bytes.
  26. """
  27. w_errorhandler = lookup_error(space, errors)
  28. if decode:
  29. w_cls = space.w_UnicodeDecodeError
  30. w_input = space.newbytes(input)
  31. else:
  32. w_cls = space.w_UnicodeEncodeError
  33. w_input = space.newunicode(input)
  34. w_exc = space.call_function(
  35. w_cls,
  36. space.wrap(encoding),
  37. w_input,
  38. space.wrap(startpos),
  39. space.wrap(endpos),
  40. space.wrap(reason))
  41. w_res = space.call_function(w_errorhandler, w_exc)
  42. if (not space.isinstance_w(w_res, space.w_tuple)
  43. or space.len_w(w_res) != 2
  44. or not space.isinstance_w(
  45. space.getitem(w_res, space.wrap(0)),
  46. space.w_unicode)):
  47. raise oefmt(space.w_TypeError,
  48. "%s error handler must return (unicode, int) "
  49. "tuple, not %R",
  50. "decoding" if decode else "encoding", w_res)
  51. w_replace, w_newpos = space.fixedview(w_res, 2)
  52. newpos = space.int_w(w_newpos)
  53. if newpos < 0:
  54. newpos = len(input) + newpos
  55. if newpos < 0 or newpos > len(input):
  56. raise oefmt(space.w_IndexError,
  57. "position %d from error handler out of bounds",
  58. newpos)
  59. replace = space.unicode_w(w_replace)
  60. return replace, newpos
  61. return call_errorhandler
  62. def make_decode_errorhandler(self, space):
  63. return self._make_errorhandler(space, True)
  64. def make_encode_errorhandler(self, space):
  65. errorhandler = self._make_errorhandler(space, False)
  66. def encode_call_errorhandler(errors, encoding, reason, input, startpos,
  67. endpos):
  68. replace, newpos = errorhandler(errors, encoding, reason, input,
  69. startpos, endpos)
  70. return replace, None, newpos
  71. return encode_call_errorhandler
  72. def get_unicodedata_handler(self, space):
  73. if self.unicodedata_handler:
  74. return self.unicodedata_handler
  75. try:
  76. w_unicodedata = space.getbuiltinmodule("unicodedata")
  77. w_getcode = space.getattr(w_unicodedata, space.wrap("_get_code"))
  78. except OperationError:
  79. return None
  80. else:
  81. self.unicodedata_handler = UnicodeData_Handler(space, w_getcode)
  82. return self.unicodedata_handler
  83. def modified(self):
  84. self.version = VersionTag()
  85. def get_codec_from_cache(self, key):
  86. return self._get_codec_with_version(key, self.version)
  87. @jit.elidable
  88. def _get_codec_with_version(self, key, version):
  89. return self.codec_search_cache.get(key, None)
  90. def _cleanup_(self):
  91. assert not self.codec_search_path
  92. def register_codec(space, w_search_function):
  93. """register(search_function)
  94. Register a codec search function. Search functions are expected to take
  95. one argument, the encoding name in all lower case letters, and return
  96. a tuple of functions (encoder, decoder, stream_reader, stream_writer).
  97. """
  98. state = space.fromcache(CodecState)
  99. if space.is_true(space.callable(w_search_function)):
  100. state.codec_search_path.append(w_search_function)
  101. else:
  102. raise oefmt(space.w_TypeError, "argument must be callable")
  103. @unwrap_spec(encoding=str)
  104. def lookup_codec(space, encoding):
  105. """lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
  106. Looks up a codec tuple in the Python codec registry and returns
  107. a tuple of functions.
  108. """
  109. assert not (space.config.translating and not we_are_translated()), \
  110. "lookup_codec() should not be called during translation"
  111. state = space.fromcache(CodecState)
  112. normalized_encoding = encoding.replace(" ", "-").lower()
  113. w_result = state.get_codec_from_cache(normalized_encoding)
  114. if w_result is not None:
  115. return w_result
  116. return _lookup_codec_loop(space, encoding, normalized_encoding)
  117. def _lookup_codec_loop(space, encoding, normalized_encoding):
  118. state = space.fromcache(CodecState)
  119. if state.codec_need_encodings:
  120. w_import = space.getattr(space.builtin, space.wrap("__import__"))
  121. # registers new codecs
  122. space.call_function(w_import, space.wrap("encodings"))
  123. state.codec_need_encodings = False
  124. if len(state.codec_search_path) == 0:
  125. raise oefmt(space.w_LookupError,
  126. "no codec search functions registered: can't find "
  127. "encoding")
  128. for w_search in state.codec_search_path:
  129. w_result = space.call_function(w_search,
  130. space.wrap(normalized_encoding))
  131. if not space.is_w(w_result, space.w_None):
  132. if not (space.isinstance_w(w_result, space.w_tuple) and
  133. space.len_w(w_result) == 4):
  134. raise oefmt(space.w_TypeError,
  135. "codec search functions must return 4-tuples")
  136. else:
  137. state.codec_search_cache[normalized_encoding] = w_result
  138. state.modified()
  139. return w_result
  140. raise oefmt(space.w_LookupError, "unknown encoding: %s", encoding)
  141. # ____________________________________________________________
  142. # Register standard error handlers
  143. def check_exception(space, w_exc):
  144. try:
  145. w_start = space.getattr(w_exc, space.wrap('start'))
  146. w_end = space.getattr(w_exc, space.wrap('end'))
  147. w_obj = space.getattr(w_exc, space.wrap('object'))
  148. except OperationError as e:
  149. if not e.match(space, space.w_AttributeError):
  150. raise
  151. raise oefmt(space.w_TypeError, "wrong exception")
  152. delta = space.int_w(w_end) - space.int_w(w_start)
  153. if delta < 0 or not (space.isinstance_w(w_obj, space.w_str) or
  154. space.isinstance_w(w_obj, space.w_unicode)):
  155. raise oefmt(space.w_TypeError, "wrong exception")
  156. def strict_errors(space, w_exc):
  157. check_exception(space, w_exc)
  158. if space.isinstance_w(w_exc, space.w_BaseException):
  159. raise OperationError(space.type(w_exc), w_exc)
  160. else:
  161. raise oefmt(space.w_TypeError, "codec must pass exception instance")
  162. def ignore_errors(space, w_exc):
  163. check_exception(space, w_exc)
  164. w_end = space.getattr(w_exc, space.wrap('end'))
  165. return space.newtuple([space.wrap(u''), w_end])
  166. def replace_errors(space, w_exc):
  167. check_exception(space, w_exc)
  168. w_start = space.getattr(w_exc, space.wrap('start'))
  169. w_end = space.getattr(w_exc, space.wrap('end'))
  170. size = space.int_w(w_end) - space.int_w(w_start)
  171. if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
  172. text = u'?' * size
  173. return space.newtuple([space.wrap(text), w_end])
  174. elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
  175. text = u'\ufffd'
  176. return space.newtuple([space.wrap(text), w_end])
  177. elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
  178. text = u'\ufffd' * size
  179. return space.newtuple([space.wrap(text), w_end])
  180. else:
  181. raise oefmt(space.w_TypeError,
  182. "don't know how to handle %T in error callback", w_exc)
  183. def xmlcharrefreplace_errors(space, w_exc):
  184. check_exception(space, w_exc)
  185. if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
  186. obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
  187. start = space.int_w(space.getattr(w_exc, space.wrap('start')))
  188. w_end = space.getattr(w_exc, space.wrap('end'))
  189. end = space.int_w(w_end)
  190. builder = UnicodeBuilder()
  191. pos = start
  192. while pos < end:
  193. code = ord(obj[pos])
  194. if (MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and
  195. pos + 1 < end and 0xDC00 <= ord(obj[pos+1]) <= 0xDFFF):
  196. code = (code & 0x03FF) << 10
  197. code |= ord(obj[pos+1]) & 0x03FF
  198. code += 0x10000
  199. pos += 1
  200. builder.append(u"&#")
  201. builder.append(unicode(str(code)))
  202. builder.append(u";")
  203. pos += 1
  204. return space.newtuple([space.wrap(builder.build()), w_end])
  205. else:
  206. raise oefmt(space.w_TypeError,
  207. "don't know how to handle %T in error callback", w_exc)
  208. def backslashreplace_errors(space, w_exc):
  209. check_exception(space, w_exc)
  210. if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
  211. obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
  212. start = space.int_w(space.getattr(w_exc, space.wrap('start')))
  213. w_end = space.getattr(w_exc, space.wrap('end'))
  214. end = space.int_w(w_end)
  215. builder = UnicodeBuilder()
  216. pos = start
  217. while pos < end:
  218. oc = ord(obj[pos])
  219. num = hex(oc)
  220. if (oc >= 0x10000):
  221. builder.append(u"\\U")
  222. zeros = 8
  223. elif (oc >= 0x100):
  224. builder.append(u"\\u")
  225. zeros = 4
  226. else:
  227. builder.append(u"\\x")
  228. zeros = 2
  229. lnum = len(num)
  230. nb = zeros + 2 - lnum # num starts with '0x'
  231. if nb > 0:
  232. builder.append_multiple_char(u'0', nb)
  233. builder.append_slice(unicode(num), 2, lnum)
  234. pos += 1
  235. return space.newtuple([space.wrap(builder.build()), w_end])
  236. else:
  237. raise oefmt(space.w_TypeError,
  238. "don't know how to handle %T in error callback", w_exc)
  239. def register_builtin_error_handlers(space):
  240. "NOT_RPYTHON"
  241. state = space.fromcache(CodecState)
  242. for error in ("strict", "ignore", "replace", "xmlcharrefreplace",
  243. "backslashreplace"):
  244. name = error + "_errors"
  245. state.codec_error_registry[error] = space.wrap(interp2app(globals()[name]))
  246. @unwrap_spec(errors=str)
  247. def lookup_error(space, errors):
  248. """lookup_error(errors) -> handler
  249. Return the error handler for the specified error handling name
  250. or raise a LookupError, if no handler exists under this name.
  251. """
  252. state = space.fromcache(CodecState)
  253. try:
  254. w_err_handler = state.codec_error_registry[errors]
  255. except KeyError:
  256. raise oefmt(space.w_LookupError,
  257. "unknown error handler name %s", errors)
  258. return w_err_handler
  259. @unwrap_spec(errors=str)
  260. def encode(space, w_obj, w_encoding=None, errors='strict'):
  261. """encode(obj, [encoding[,errors]]) -> object
  262. Encodes obj using the codec registered for encoding. encoding defaults
  263. to the default encoding. errors may be given to set a different error
  264. handling scheme. Default is 'strict' meaning that encoding errors raise
  265. a ValueError. Other possible values are 'ignore', 'replace' and
  266. 'xmlcharrefreplace' as well as any other name registered with
  267. codecs.register_error that can handle ValueErrors.
  268. """
  269. if w_encoding is None:
  270. encoding = space.sys.defaultencoding
  271. else:
  272. encoding = space.str_w(w_encoding)
  273. w_encoder = space.getitem(lookup_codec(space, encoding), space.wrap(0))
  274. w_res = space.call_function(w_encoder, w_obj, space.wrap(errors))
  275. return space.getitem(w_res, space.wrap(0))
  276. @unwrap_spec(errors='str_or_None')
  277. def readbuffer_encode(space, w_data, errors='strict'):
  278. s = space.getarg_w('s#', w_data)
  279. return space.newtuple([space.newbytes(s), space.wrap(len(s))])
  280. @unwrap_spec(errors='str_or_None')
  281. def charbuffer_encode(space, w_data, errors='strict'):
  282. s = space.getarg_w('t#', w_data)
  283. return space.newtuple([space.wrap(s), space.wrap(len(s))])
  284. @unwrap_spec(errors=str)
  285. def decode(space, w_obj, w_encoding=None, errors='strict'):
  286. """decode(obj, [encoding[,errors]]) -> object
  287. Decodes obj using the codec registered for encoding. encoding defaults
  288. to the default encoding. errors may be given to set a different error
  289. handling scheme. Default is 'strict' meaning that encoding errors raise
  290. a ValueError. Other possible values are 'ignore' and 'replace'
  291. as well as any other name registerd with codecs.register_error that is
  292. able to handle ValueErrors.
  293. """
  294. if w_encoding is None:
  295. encoding = space.sys.defaultencoding
  296. else:
  297. encoding = space.str_w(w_encoding)
  298. w_decoder = space.getitem(lookup_codec(space, encoding), space.wrap(1))
  299. if space.is_true(w_decoder):
  300. w_res = space.call_function(w_decoder, w_obj, space.wrap(errors))
  301. if (not space.isinstance_w(w_res, space.w_tuple) or space.len_w(w_res) != 2):
  302. raise oefmt(space.w_TypeError,
  303. "encoder must return a tuple (object, integer)")
  304. return space.getitem(w_res, space.wrap(0))
  305. else:
  306. assert 0, "XXX, what to do here?"
  307. @unwrap_spec(errors=str)
  308. def register_error(space, errors, w_handler):
  309. """register_error(errors, handler)
  310. Register the specified error handler under the name
  311. errors. handler must be a callable object, that
  312. will be called with an exception instance containing
  313. information about the location of the encoding/decoding
  314. error and must return a (replacement, new position) tuple.
  315. """
  316. state = space.fromcache(CodecState)
  317. if space.is_true(space.callable(w_handler)):
  318. state.codec_error_registry[errors] = w_handler
  319. else:
  320. raise oefmt(space.w_TypeError, "handler must be callable")
  321. # ____________________________________________________________
  322. # delegation to runicode
  323. from rpython.rlib import runicode
  324. def make_encoder_wrapper(name):
  325. rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
  326. assert hasattr(runicode, rname)
  327. @unwrap_spec(uni=unicode, errors='str_or_None')
  328. def wrap_encoder(space, uni, errors="strict"):
  329. if errors is None:
  330. errors = 'strict'
  331. state = space.fromcache(CodecState)
  332. func = getattr(runicode, rname)
  333. result = func(uni, len(uni), errors, state.encode_error_handler)
  334. return space.newtuple([space.newbytes(result), space.wrap(len(uni))])
  335. wrap_encoder.func_name = rname
  336. globals()[name] = wrap_encoder
  337. def make_decoder_wrapper(name):
  338. rname = "str_decode_%s" % (name.replace("_decode", ""), )
  339. assert hasattr(runicode, rname)
  340. @unwrap_spec(string='bufferstr', errors='str_or_None',
  341. w_final=WrappedDefault(False))
  342. def wrap_decoder(space, string, errors="strict", w_final=None):
  343. if errors is None:
  344. errors = 'strict'
  345. final = space.is_true(w_final)
  346. state = space.fromcache(CodecState)
  347. func = getattr(runicode, rname)
  348. result, consumed = func(string, len(string), errors,
  349. final, state.decode_error_handler)
  350. return space.newtuple([space.wrap(result), space.wrap(consumed)])
  351. wrap_decoder.func_name = rname
  352. globals()[name] = wrap_decoder
# Generate the module-level "<codec>_encode" functions; each call stores
# the generated wrapper in globals() under the given name.
for encoder in [
         "ascii_encode",
         "latin_1_encode",
         "utf_7_encode",
         "utf_16_encode",
         "utf_16_be_encode",
         "utf_16_le_encode",
         "utf_32_encode",
         "utf_32_be_encode",
         "utf_32_le_encode",
         "unicode_escape_encode",
         "raw_unicode_escape_encode",
         "unicode_internal_encode",
        ]:
    make_encoder_wrapper(encoder)

# Same for the "<codec>_decode" functions.
for decoder in [
         "ascii_decode",
         "latin_1_decode",
         "utf_7_decode",
         "utf_16_decode",
         "utf_16_be_decode",
         "utf_16_le_decode",
         "utf_32_decode",
         "utf_32_be_decode",
         "utf_32_le_decode",
         "raw_unicode_escape_decode",
         ]:
    make_decoder_wrapper(decoder)

# The mbcs wrappers are only generated when runicode provides
# str_decode_mbcs.
if hasattr(runicode, 'str_decode_mbcs'):
    make_encoder_wrapper('mbcs_encode')
    make_decoder_wrapper('mbcs_decode')
  384. # utf-8 functions are not regular, because we have to pass
  385. # "allow_surrogates=True"
  386. @unwrap_spec(uni=unicode, errors='str_or_None')
  387. def utf_8_encode(space, uni, errors="strict"):
  388. if errors is None:
  389. errors = 'strict'
  390. state = space.fromcache(CodecState)
  391. result = runicode.unicode_encode_utf_8(
  392. uni, len(uni), errors, state.encode_error_handler,
  393. allow_surrogates=True)
  394. return space.newtuple([space.wrap(result), space.wrap(len(uni))])
  395. @unwrap_spec(string='bufferstr', errors='str_or_None',
  396. w_final = WrappedDefault(False))
  397. def utf_8_decode(space, string, errors="strict", w_final=None):
  398. if errors is None:
  399. errors = 'strict'
  400. final = space.is_true(w_final)
  401. state = space.fromcache(CodecState)
  402. result, consumed = runicode.str_decode_utf_8(
  403. string, len(string), errors,
  404. final, state.decode_error_handler,
  405. allow_surrogates=True)
  406. return space.newtuple([space.wrap(result), space.wrap(consumed)])
  407. @unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
  408. w_final=WrappedDefault(False))
  409. def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
  410. if errors is None:
  411. errors = 'strict'
  412. final = space.is_true(w_final)
  413. state = space.fromcache(CodecState)
  414. if byteorder == 0:
  415. byteorder = 'native'
  416. elif byteorder == -1:
  417. byteorder = 'little'
  418. else:
  419. byteorder = 'big'
  420. consumed = len(data)
  421. if final:
  422. consumed = 0
  423. res, consumed, byteorder = runicode.str_decode_utf_16_helper(
  424. data, len(data), errors, final, state.decode_error_handler, byteorder)
  425. return space.newtuple([space.wrap(res), space.wrap(consumed),
  426. space.wrap(byteorder)])
  427. @unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
  428. w_final=WrappedDefault(False))
  429. def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
  430. final = space.is_true(w_final)
  431. state = space.fromcache(CodecState)
  432. if byteorder == 0:
  433. byteorder = 'native'
  434. elif byteorder == -1:
  435. byteorder = 'little'
  436. else:
  437. byteorder = 'big'
  438. consumed = len(data)
  439. if final:
  440. consumed = 0
  441. res, consumed, byteorder = runicode.str_decode_utf_32_helper(
  442. data, len(data), errors, final, state.decode_error_handler, byteorder)
  443. return space.newtuple([space.wrap(res), space.wrap(consumed),
  444. space.wrap(byteorder)])
  445. # ____________________________________________________________
  446. # Charmap
  447. class Charmap_Decode:
  448. def __init__(self, space, w_mapping):
  449. self.space = space
  450. self.w_mapping = w_mapping
  451. # fast path for all the stuff in the encodings module
  452. if space.isinstance_w(w_mapping, space.w_tuple):
  453. self.mapping_w = space.fixedview(w_mapping)
  454. else:
  455. self.mapping_w = None
  456. def get(self, ch, errorchar):
  457. space = self.space
  458. # get the character from the mapping
  459. if self.mapping_w is not None:
  460. w_ch = self.mapping_w[ord(ch)]
  461. else:
  462. try:
  463. w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
  464. except OperationError as e:
  465. if not e.match(space, space.w_LookupError):
  466. raise
  467. return errorchar
  468. if space.isinstance_w(w_ch, space.w_unicode):
  469. # Charmap may return a unicode string
  470. return space.unicode_w(w_ch)
  471. elif space.isinstance_w(w_ch, space.w_int):
  472. # Charmap may return a number
  473. x = space.int_w(w_ch)
  474. if not 0 <= x <= 0x10FFFF:
  475. raise oefmt(space.w_TypeError,
  476. "character mapping must be in range(0x110000)")
  477. return code_to_unichr(x)
  478. elif space.is_w(w_ch, space.w_None):
  479. # Charmap may return None
  480. return errorchar
  481. raise oefmt(space.w_TypeError,
  482. "character mapping must return integer, None or unicode")
  483. class Charmap_Encode:
  484. def __init__(self, space, w_mapping):
  485. self.space = space
  486. self.w_mapping = w_mapping
  487. def get(self, ch, errorchar):
  488. space = self.space
  489. # get the character from the mapping
  490. try:
  491. w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
  492. except OperationError as e:
  493. if not e.match(space, space.w_LookupError):
  494. raise
  495. return errorchar
  496. if space.isinstance_w(w_ch, space.w_str):
  497. # Charmap may return a string
  498. return space.bytes_w(w_ch)
  499. elif space.isinstance_w(w_ch, space.w_int):
  500. # Charmap may return a number
  501. x = space.int_w(w_ch)
  502. if not 0 <= x < 256:
  503. raise oefmt(space.w_TypeError,
  504. "character mapping must be in range(256)")
  505. return chr(x)
  506. elif space.is_w(w_ch, space.w_None):
  507. # Charmap may return None
  508. return errorchar
  509. raise oefmt(space.w_TypeError,
  510. "character mapping must return integer, None or str")
  511. @unwrap_spec(string='bufferstr', errors='str_or_None')
  512. def charmap_decode(space, string, errors="strict", w_mapping=None):
  513. if errors is None:
  514. errors = 'strict'
  515. if len(string) == 0:
  516. return space.newtuple([space.wrap(u''), space.wrap(0)])
  517. if space.is_none(w_mapping):
  518. mapping = None
  519. else:
  520. mapping = Charmap_Decode(space, w_mapping)
  521. final = True
  522. state = space.fromcache(CodecState)
  523. result, consumed = runicode.str_decode_charmap(
  524. string, len(string), errors,
  525. final, state.decode_error_handler, mapping)
  526. return space.newtuple([space.wrap(result), space.wrap(consumed)])
  527. @unwrap_spec(uni=unicode, errors='str_or_None')
  528. def charmap_encode(space, uni, errors="strict", w_mapping=None):
  529. if errors is None:
  530. errors = 'strict'
  531. if space.is_none(w_mapping):
  532. mapping = None
  533. else:
  534. mapping = Charmap_Encode(space, w_mapping)
  535. state = space.fromcache(CodecState)
  536. result = runicode.unicode_encode_charmap(
  537. uni, len(uni), errors,
  538. state.encode_error_handler, mapping)
  539. return space.newtuple([space.newbytes(result), space.wrap(len(uni))])
  540. @unwrap_spec(chars=unicode)
  541. def charmap_build(space, chars):
  542. # XXX CPython sometimes uses a three-level trie
  543. w_charmap = space.newdict()
  544. for num in range(len(chars)):
  545. elem = chars[num]
  546. space.setitem(w_charmap, space.newint(ord(elem)), space.newint(num))
  547. return w_charmap
  548. # ____________________________________________________________
  549. # Unicode escape
  550. class UnicodeData_Handler:
  551. def __init__(self, space, w_getcode):
  552. self.space = space
  553. self.w_getcode = w_getcode
  554. def call(self, name):
  555. space = self.space
  556. try:
  557. w_code = space.call_function(self.w_getcode, space.wrap(name))
  558. except OperationError as e:
  559. if not e.match(space, space.w_KeyError):
  560. raise
  561. return -1
  562. return space.int_w(w_code)
  563. @unwrap_spec(string='bufferstr', errors='str_or_None',
  564. w_final=WrappedDefault(False))
  565. def unicode_escape_decode(space, string, errors="strict", w_final=None):
  566. if errors is None:
  567. errors = 'strict'
  568. final = space.is_true(w_final)
  569. state = space.fromcache(CodecState)
  570. unicode_name_handler = state.get_unicodedata_handler(space)
  571. result, consumed = runicode.str_decode_unicode_escape(
  572. string, len(string), errors,
  573. final, state.decode_error_handler,
  574. unicode_name_handler)
  575. return space.newtuple([space.wrap(result), space.wrap(consumed)])
  576. # ____________________________________________________________
  577. # Unicode-internal
  578. @unwrap_spec(errors='str_or_None')
  579. def unicode_internal_decode(space, w_string, errors="strict"):
  580. if errors is None:
  581. errors = 'strict'
  582. # special case for this codec: unicodes are returned as is
  583. if space.isinstance_w(w_string, space.w_unicode):
  584. return space.newtuple([w_string, space.len(w_string)])
  585. string = space.readbuf_w(w_string).as_str()
  586. if len(string) == 0:
  587. return space.newtuple([space.wrap(u''), space.wrap(0)])
  588. final = True
  589. state = space.fromcache(CodecState)
  590. result, consumed = runicode.str_decode_unicode_internal(
  591. string, len(string), errors,
  592. final, state.decode_error_handler)
  593. return space.newtuple([space.wrap(result), space.wrap(consumed)])
  594. # ____________________________________________________________
  595. # support for the "string escape" codec
  596. # This is a bytes-to bytes transformation
  597. @unwrap_spec(data=str, errors='str_or_None')
  598. def escape_encode(space, data, errors='strict'):
  599. from pypy.objspace.std.bytesobject import string_escape_encode
  600. result = string_escape_encode(data, quote="'")
  601. start = 1
  602. end = len(result) - 1
  603. assert end >= 0
  604. w_result = space.wrap(result[start:end])
  605. return space.newtuple([w_result, space.wrap(len(data))])
  606. @unwrap_spec(data=str, errors='str_or_None')
  607. def escape_decode(space, data, errors='strict'):
  608. from pypy.interpreter.pyparser.parsestring import PyString_DecodeEscape
  609. result = PyString_DecodeEscape(space, data, errors, None)
  610. return space.newtuple([space.newbytes(result), space.wrap(len(data))])