PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/pypy/objspace/std/unicodeobject.py

https://bitbucket.org/pypy/pypy/
Python | 1145 lines | 1122 code | 20 blank | 3 comment | 14 complexity | 96150e2b429e92309e2a554d1d76733c MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. """The builtin unicode implementation"""
  2. from rpython.rlib.objectmodel import (
  3. compute_hash, compute_unique_id, import_from_mixin)
  4. from rpython.rlib.buffer import StringBuffer
  5. from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
  6. from rpython.rlib.runicode import (
  7. make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
  8. unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
  9. from pypy.interpreter import unicodehelper
  10. from pypy.interpreter.baseobjspace import W_Root
  11. from pypy.interpreter.error import OperationError, oefmt
  12. from pypy.interpreter.gateway import WrappedDefault, interp2app, unwrap_spec
  13. from pypy.interpreter.typedef import TypeDef
  14. from pypy.module.unicodedata import unicodedb
  15. from pypy.objspace.std import newformat
  16. from pypy.objspace.std.basestringtype import basestring_typedef
  17. from pypy.objspace.std.formatting import mod_format
  18. from pypy.objspace.std.stringmethods import StringMethods
  19. from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
  20. __all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode',
  21. 'encode_object', 'decode_object', 'unicode_from_object',
  22. 'unicode_from_string', 'unicode_to_decimal_w']
  23. class W_UnicodeObject(W_Root):
  24. import_from_mixin(StringMethods)
  25. _immutable_fields_ = ['_value']
  26. def __init__(self, unistr):
  27. assert isinstance(unistr, unicode)
  28. self._value = unistr
  29. def __repr__(self):
  30. """representation for debugging purposes"""
  31. return "%s(%r)" % (self.__class__.__name__, self._value)
  32. def unwrap(self, space):
  33. # for testing
  34. return self._value
  35. def create_if_subclassed(self):
  36. if type(self) is W_UnicodeObject:
  37. return self
  38. return W_UnicodeObject(self._value)
  39. def is_w(self, space, w_other):
  40. if not isinstance(w_other, W_UnicodeObject):
  41. return False
  42. if self is w_other:
  43. return True
  44. if self.user_overridden_class or w_other.user_overridden_class:
  45. return False
  46. s1 = space.unicode_w(self)
  47. s2 = space.unicode_w(w_other)
  48. if len(s2) > 1:
  49. return s1 is s2
  50. else: # strings of len <= 1 are unique-ified
  51. return s1 == s2
  52. def immutable_unique_id(self, space):
  53. if self.user_overridden_class:
  54. return None
  55. s = space.unicode_w(self)
  56. if len(s) > 1:
  57. uid = compute_unique_id(s)
  58. else: # strings of len <= 1 are unique-ified
  59. if len(s) == 1:
  60. base = ~ord(s[0]) # negative base values
  61. else:
  62. base = 257 # empty unicode string: base value 257
  63. uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
  64. return space.wrap(uid)
  65. def str_w(self, space):
  66. return space.str_w(space.str(self))
  67. def unicode_w(self, space):
  68. return self._value
  69. def readbuf_w(self, space):
  70. from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
  71. builder = StringBuilder(len(self._value) * UNICODE_SIZE)
  72. for unich in self._value:
  73. pack_unichar(unich, builder)
  74. return StringBuffer(builder.build())
  75. def writebuf_w(self, space):
  76. raise oefmt(space.w_TypeError,
  77. "cannot use unicode as modifiable buffer")
  78. charbuf_w = str_w
  79. def listview_unicode(self):
  80. return _create_list_from_unicode(self._value)
  81. def ord(self, space):
  82. if len(self._value) != 1:
  83. raise oefmt(space.w_TypeError,
  84. "ord() expected a character, but string of length %d "
  85. "found", len(self._value))
  86. return space.wrap(ord(self._value[0]))
  87. def _new(self, value):
  88. return W_UnicodeObject(value)
  89. def _new_from_list(self, value):
  90. return W_UnicodeObject(u''.join(value))
  91. def _empty(self):
  92. return W_UnicodeObject.EMPTY
  93. def _len(self):
  94. return len(self._value)
  95. _val = unicode_w
  96. @staticmethod
  97. def _use_rstr_ops(space, w_other):
  98. # Always return true because we always need to copy the other
  99. # operand(s) before we can do comparisons
  100. return True
  101. @staticmethod
  102. def _op_val(space, w_other):
  103. if isinstance(w_other, W_UnicodeObject):
  104. return w_other._value
  105. if space.isinstance_w(w_other, space.w_str):
  106. return unicode_from_string(space, w_other)._value
  107. return unicode_from_encoded_object(
  108. space, w_other, None, "strict")._value
  109. def _chr(self, char):
  110. assert len(char) == 1
  111. return unicode(char)[0]
  112. _builder = UnicodeBuilder
  113. def _isupper(self, ch):
  114. return unicodedb.isupper(ord(ch))
  115. def _islower(self, ch):
  116. return unicodedb.islower(ord(ch))
  117. def _isnumeric(self, ch):
  118. return unicodedb.isnumeric(ord(ch))
  119. def _istitle(self, ch):
  120. return unicodedb.isupper(ord(ch)) or unicodedb.istitle(ord(ch))
  121. def _isspace(self, ch):
  122. return unicodedb.isspace(ord(ch))
  123. def _isalpha(self, ch):
  124. return unicodedb.isalpha(ord(ch))
  125. def _isalnum(self, ch):
  126. return unicodedb.isalnum(ord(ch))
  127. def _isdigit(self, ch):
  128. return unicodedb.isdigit(ord(ch))
  129. def _isdecimal(self, ch):
  130. return unicodedb.isdecimal(ord(ch))
  131. def _iscased(self, ch):
  132. return unicodedb.iscased(ord(ch))
  133. def _islinebreak(self, ch):
  134. return unicodedb.islinebreak(ord(ch))
  135. def _upper(self, ch):
  136. return unichr(unicodedb.toupper(ord(ch)))
  137. def _lower(self, ch):
  138. return unichr(unicodedb.tolower(ord(ch)))
  139. def _title(self, ch):
  140. return unichr(unicodedb.totitle(ord(ch)))
  141. def _newlist_unwrapped(self, space, lst):
  142. return space.newlist_unicode(lst)
  143. @staticmethod
  144. @unwrap_spec(w_string=WrappedDefault(""))
  145. def descr_new(space, w_unicodetype, w_string, w_encoding=None,
  146. w_errors=None):
  147. # NB. the default value of w_obj is really a *wrapped* empty string:
  148. # there is gateway magic at work
  149. w_obj = w_string
  150. encoding, errors = _get_encoding_and_errors(space, w_encoding,
  151. w_errors)
  152. # convoluted logic for the case when unicode subclass has a __unicode__
  153. # method, we need to call this method
  154. is_precisely_unicode = space.is_w(space.type(w_obj), space.w_unicode)
  155. if (is_precisely_unicode or
  156. (space.isinstance_w(w_obj, space.w_unicode) and
  157. space.findattr(w_obj, space.wrap('__unicode__')) is None)):
  158. if encoding is not None or errors is not None:
  159. raise oefmt(space.w_TypeError,
  160. "decoding Unicode is not supported")
  161. if (is_precisely_unicode and
  162. space.is_w(w_unicodetype, space.w_unicode)):
  163. return w_obj
  164. w_value = w_obj
  165. else:
  166. if encoding is None and errors is None:
  167. w_value = unicode_from_object(space, w_obj)
  168. else:
  169. w_value = unicode_from_encoded_object(space, w_obj,
  170. encoding, errors)
  171. if space.is_w(w_unicodetype, space.w_unicode):
  172. return w_value
  173. assert isinstance(w_value, W_UnicodeObject)
  174. w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
  175. W_UnicodeObject.__init__(w_newobj, w_value._value)
  176. return w_newobj
  177. def descr_repr(self, space):
  178. chars = self._value
  179. size = len(chars)
  180. s = _repr_function(chars, size, "strict")
  181. return space.wrap(s)
  182. def descr_str(self, space):
  183. return encode_object(space, self, None, None)
  184. def descr_hash(self, space):
  185. x = compute_hash(self._value)
  186. return space.wrap(x)
  187. def descr_eq(self, space, w_other):
  188. try:
  189. res = self._val(space) == self._op_val(space, w_other)
  190. except OperationError as e:
  191. if e.match(space, space.w_TypeError):
  192. return space.w_NotImplemented
  193. if (e.match(space, space.w_UnicodeDecodeError) or
  194. e.match(space, space.w_UnicodeEncodeError)):
  195. msg = ("Unicode equal comparison failed to convert both "
  196. "arguments to Unicode - interpreting them as being "
  197. "unequal")
  198. space.warn(space.wrap(msg), space.w_UnicodeWarning)
  199. return space.w_False
  200. raise
  201. return space.newbool(res)
  202. def descr_ne(self, space, w_other):
  203. try:
  204. res = self._val(space) != self._op_val(space, w_other)
  205. except OperationError as e:
  206. if e.match(space, space.w_TypeError):
  207. return space.w_NotImplemented
  208. if (e.match(space, space.w_UnicodeDecodeError) or
  209. e.match(space, space.w_UnicodeEncodeError)):
  210. msg = ("Unicode unequal comparison failed to convert both "
  211. "arguments to Unicode - interpreting them as being "
  212. "unequal")
  213. space.warn(space.wrap(msg), space.w_UnicodeWarning)
  214. return space.w_True
  215. raise
  216. return space.newbool(res)
  217. def descr_lt(self, space, w_other):
  218. try:
  219. res = self._val(space) < self._op_val(space, w_other)
  220. except OperationError as e:
  221. if e.match(space, space.w_TypeError):
  222. return space.w_NotImplemented
  223. raise
  224. return space.newbool(res)
  225. def descr_le(self, space, w_other):
  226. try:
  227. res = self._val(space) <= self._op_val(space, w_other)
  228. except OperationError as e:
  229. if e.match(space, space.w_TypeError):
  230. return space.w_NotImplemented
  231. raise
  232. return space.newbool(res)
  233. def descr_gt(self, space, w_other):
  234. try:
  235. res = self._val(space) > self._op_val(space, w_other)
  236. except OperationError as e:
  237. if e.match(space, space.w_TypeError):
  238. return space.w_NotImplemented
  239. raise
  240. return space.newbool(res)
  241. def descr_ge(self, space, w_other):
  242. try:
  243. res = self._val(space) >= self._op_val(space, w_other)
  244. except OperationError as e:
  245. if e.match(space, space.w_TypeError):
  246. return space.w_NotImplemented
  247. raise
  248. return space.newbool(res)
  249. def descr_format(self, space, __args__):
  250. return newformat.format_method(space, self, __args__, is_unicode=True)
  251. def descr__format__(self, space, w_format_spec):
  252. if not space.isinstance_w(w_format_spec, space.w_unicode):
  253. w_format_spec = space.call_function(space.w_unicode, w_format_spec)
  254. spec = space.unicode_w(w_format_spec)
  255. formatter = newformat.unicode_formatter(space, spec)
  256. self2 = unicode_from_object(space, self)
  257. assert isinstance(self2, W_UnicodeObject)
  258. return formatter.format_string(self2._value)
  259. def descr_mod(self, space, w_values):
  260. return mod_format(space, self, w_values, do_unicode=True)
  261. def descr_translate(self, space, w_table):
  262. selfvalue = self._value
  263. w_sys = space.getbuiltinmodule('sys')
  264. maxunicode = space.int_w(space.getattr(w_sys,
  265. space.wrap("maxunicode")))
  266. result = []
  267. for unichar in selfvalue:
  268. try:
  269. w_newval = space.getitem(w_table, space.wrap(ord(unichar)))
  270. except OperationError as e:
  271. if e.match(space, space.w_LookupError):
  272. result.append(unichar)
  273. else:
  274. raise
  275. else:
  276. if space.is_w(w_newval, space.w_None):
  277. continue
  278. elif space.isinstance_w(w_newval, space.w_int):
  279. newval = space.int_w(w_newval)
  280. if newval < 0 or newval > maxunicode:
  281. raise oefmt(space.w_TypeError,
  282. "character mapping must be in range(%s)",
  283. hex(maxunicode + 1))
  284. result.append(unichr(newval))
  285. elif space.isinstance_w(w_newval, space.w_unicode):
  286. result.append(space.unicode_w(w_newval))
  287. else:
  288. raise oefmt(space.w_TypeError,
  289. "character mapping must return integer, None "
  290. "or unicode")
  291. return W_UnicodeObject(u''.join(result))
  292. def descr_encode(self, space, w_encoding=None, w_errors=None):
  293. encoding, errors = _get_encoding_and_errors(space, w_encoding,
  294. w_errors)
  295. return encode_object(space, self, encoding, errors)
  296. _StringMethods_descr_join = descr_join
  297. def descr_join(self, space, w_list):
  298. l = space.listview_unicode(w_list)
  299. if l is not None:
  300. if len(l) == 1:
  301. return space.wrap(l[0])
  302. return space.wrap(self._val(space).join(l))
  303. return self._StringMethods_descr_join(space, w_list)
  304. def _join_return_one(self, space, w_obj):
  305. return space.is_w(space.type(w_obj), space.w_unicode)
  306. def _join_check_item(self, space, w_obj):
  307. if (space.isinstance_w(w_obj, space.w_str) or
  308. space.isinstance_w(w_obj, space.w_unicode)):
  309. return 0
  310. return 1
  311. def descr_formatter_parser(self, space):
  312. from pypy.objspace.std.newformat import unicode_template_formatter
  313. tformat = unicode_template_formatter(space, space.unicode_w(self))
  314. return tformat.formatter_parser()
  315. def descr_formatter_field_name_split(self, space):
  316. from pypy.objspace.std.newformat import unicode_template_formatter
  317. tformat = unicode_template_formatter(space, space.unicode_w(self))
  318. return tformat.formatter_field_name_split()
  319. def descr_isdecimal(self, space):
  320. return self._is_generic(space, '_isdecimal')
  321. def descr_isnumeric(self, space):
  322. return self._is_generic(space, '_isnumeric')
  323. def descr_islower(self, space):
  324. cased = False
  325. for uchar in self._value:
  326. if (unicodedb.isupper(ord(uchar)) or
  327. unicodedb.istitle(ord(uchar))):
  328. return space.w_False
  329. if not cased and unicodedb.islower(ord(uchar)):
  330. cased = True
  331. return space.newbool(cased)
  332. def descr_isupper(self, space):
  333. cased = False
  334. for uchar in self._value:
  335. if (unicodedb.islower(ord(uchar)) or
  336. unicodedb.istitle(ord(uchar))):
  337. return space.w_False
  338. if not cased and unicodedb.isupper(ord(uchar)):
  339. cased = True
  340. return space.newbool(cased)
  341. def _starts_ends_overflow(self, prefix):
  342. return len(prefix) == 0
  343. def wrapunicode(space, uni):
  344. return W_UnicodeObject(uni)
  345. def plain_str2unicode(space, s):
  346. try:
  347. return unicode(s)
  348. except UnicodeDecodeError:
  349. for i in range(len(s)):
  350. if ord(s[i]) > 127:
  351. raise OperationError(
  352. space.w_UnicodeDecodeError,
  353. space.newtuple([
  354. space.wrap('ascii'),
  355. space.wrap(s),
  356. space.wrap(i),
  357. space.wrap(i+1),
  358. space.wrap("ordinal not in range(128)")]))
  359. assert False, "unreachable"
  360. # stuff imported from bytesobject for interoperability
  361. # ____________________________________________________________
  362. def getdefaultencoding(space):
  363. return space.sys.defaultencoding
  364. def _get_encoding_and_errors(space, w_encoding, w_errors):
  365. encoding = None if w_encoding is None else space.str_w(w_encoding)
  366. errors = None if w_errors is None else space.str_w(w_errors)
  367. return encoding, errors
  368. def encode_object(space, w_object, encoding, errors):
  369. if encoding is None:
  370. # Get the encoder functions as a wrapped object.
  371. # This lookup is cached.
  372. w_encoder = space.sys.get_w_default_encoder()
  373. else:
  374. if errors is None or errors == 'strict':
  375. try:
  376. if encoding == 'ascii':
  377. u = space.unicode_w(w_object)
  378. eh = unicodehelper.raise_unicode_exception_encode
  379. return space.wrap(unicode_encode_ascii(
  380. u, len(u), None, errorhandler=eh))
  381. if encoding == 'utf-8':
  382. u = space.unicode_w(w_object)
  383. eh = unicodehelper.raise_unicode_exception_encode
  384. return space.wrap(unicode_encode_utf_8(
  385. u, len(u), None, errorhandler=eh,
  386. allow_surrogates=True))
  387. except unicodehelper.RUnicodeEncodeError as ue:
  388. raise OperationError(space.w_UnicodeEncodeError,
  389. space.newtuple([
  390. space.wrap(ue.encoding),
  391. space.wrap(ue.object),
  392. space.wrap(ue.start),
  393. space.wrap(ue.end),
  394. space.wrap(ue.reason)]))
  395. from pypy.module._codecs.interp_codecs import lookup_codec
  396. w_encoder = space.getitem(lookup_codec(space, encoding), space.wrap(0))
  397. if errors is None:
  398. w_errors = space.wrap('strict')
  399. else:
  400. w_errors = space.wrap(errors)
  401. w_restuple = space.call_function(w_encoder, w_object, w_errors)
  402. w_retval = space.getitem(w_restuple, space.wrap(0))
  403. if not space.isinstance_w(w_retval, space.w_str):
  404. raise oefmt(space.w_TypeError,
  405. "encoder did not return an string object (type '%T')",
  406. w_retval)
  407. return w_retval
  408. def decode_object(space, w_obj, encoding, errors):
  409. if encoding is None:
  410. encoding = getdefaultencoding(space)
  411. if errors is None or errors == 'strict':
  412. if encoding == 'ascii':
  413. # XXX error handling
  414. s = space.charbuf_w(w_obj)
  415. try:
  416. u = fast_str_decode_ascii(s)
  417. except ValueError:
  418. eh = unicodehelper.decode_error_handler(space)
  419. u = str_decode_ascii( # try again, to get the error right
  420. s, len(s), None, final=True, errorhandler=eh)[0]
  421. return space.wrap(u)
  422. if encoding == 'utf-8':
  423. s = space.charbuf_w(w_obj)
  424. eh = unicodehelper.decode_error_handler(space)
  425. return space.wrap(str_decode_utf_8(
  426. s, len(s), None, final=True, errorhandler=eh,
  427. allow_surrogates=True)[0])
  428. w_codecs = space.getbuiltinmodule("_codecs")
  429. w_decode = space.getattr(w_codecs, space.wrap("decode"))
  430. if errors is None:
  431. w_retval = space.call_function(w_decode, w_obj, space.wrap(encoding))
  432. else:
  433. w_retval = space.call_function(w_decode, w_obj, space.wrap(encoding),
  434. space.wrap(errors))
  435. return w_retval
  436. def unicode_from_encoded_object(space, w_obj, encoding, errors):
  437. # explicitly block bytearray on 2.7
  438. from .bytearrayobject import W_BytearrayObject
  439. if isinstance(w_obj, W_BytearrayObject):
  440. raise oefmt(space.w_TypeError, "decoding bytearray is not supported")
  441. w_retval = decode_object(space, w_obj, encoding, errors)
  442. if not space.isinstance_w(w_retval, space.w_unicode):
  443. raise oefmt(space.w_TypeError,
  444. "decoder did not return an unicode object (type '%T')",
  445. w_retval)
  446. assert isinstance(w_retval, W_UnicodeObject)
  447. return w_retval
  448. def unicode_from_object(space, w_obj):
  449. if space.is_w(space.type(w_obj), space.w_unicode):
  450. return w_obj
  451. elif space.is_w(space.type(w_obj), space.w_str):
  452. w_res = w_obj
  453. else:
  454. w_unicode_method = space.lookup(w_obj, "__unicode__")
  455. # obscure workaround: for the next two lines see
  456. # test_unicode_conversion_with__str__
  457. if w_unicode_method is None:
  458. if space.isinstance_w(w_obj, space.w_unicode):
  459. return space.wrap(space.unicode_w(w_obj))
  460. w_unicode_method = space.lookup(w_obj, "__str__")
  461. if w_unicode_method is not None:
  462. w_res = space.get_and_call_function(w_unicode_method, w_obj)
  463. else:
  464. w_res = space.str(w_obj)
  465. if space.isinstance_w(w_res, space.w_unicode):
  466. return w_res
  467. return unicode_from_encoded_object(space, w_res, None, "strict")
  468. def unicode_from_string(space, w_str):
  469. # this is a performance and bootstrapping hack
  470. encoding = getdefaultencoding(space)
  471. if encoding != 'ascii':
  472. return unicode_from_encoded_object(space, w_str, encoding, "strict")
  473. s = space.str_w(w_str)
  474. try:
  475. return W_UnicodeObject(s.decode("ascii"))
  476. except UnicodeDecodeError:
  477. # raising UnicodeDecodeError is messy, "please crash for me"
  478. return unicode_from_encoded_object(space, w_str, "ascii", "strict")
  479. class UnicodeDocstrings:
  480. """unicode(object='') -> unicode object
  481. unicode(string[, encoding[, errors]]) -> unicode object
  482. Create a new Unicode object from the given encoded string.
  483. encoding defaults to the current default string encoding.
  484. errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.
  485. """
  486. def __add__():
  487. """x.__add__(y) <==> x+y"""
  488. def __contains__():
  489. """x.__contains__(y) <==> y in x"""
  490. def __eq__():
  491. """x.__eq__(y) <==> x==y"""
  492. def __format__():
  493. """S.__format__(format_spec) -> unicode
  494. Return a formatted version of S as described by format_spec.
  495. """
  496. def __ge__():
  497. """x.__ge__(y) <==> x>=y"""
  498. def __getattribute__():
  499. """x.__getattribute__('name') <==> x.name"""
  500. def __getitem__():
  501. """x.__getitem__(y) <==> x[y]"""
  502. def __getnewargs__():
  503. ""
  504. def __getslice__():
  505. """x.__getslice__(i, j) <==> x[i:j]
  506. Use of negative indices is not supported.
  507. """
  508. def __gt__():
  509. """x.__gt__(y) <==> x>y"""
  510. def __hash__():
  511. """x.__hash__() <==> hash(x)"""
  512. def __le__():
  513. """x.__le__(y) <==> x<=y"""
  514. def __len__():
  515. """x.__len__() <==> len(x)"""
  516. def __lt__():
  517. """x.__lt__(y) <==> x<y"""
  518. def __mod__():
  519. """x.__mod__(y) <==> x%y"""
  520. def __mul__():
  521. """x.__mul__(n) <==> x*n"""
  522. def __ne__():
  523. """x.__ne__(y) <==> x!=y"""
  524. def __repr__():
  525. """x.__repr__() <==> repr(x)"""
  526. def __rmod__():
  527. """x.__rmod__(y) <==> y%x"""
  528. def __rmul__():
  529. """x.__rmul__(n) <==> n*x"""
  530. def __sizeof__():
  531. """S.__sizeof__() -> size of S in memory, in bytes"""
  532. def __str__():
  533. """x.__str__() <==> str(x)"""
  534. def capitalize():
  535. """S.capitalize() -> unicode
  536. Return a capitalized version of S, i.e. make the first character
  537. have upper case and the rest lower case.
  538. """
  539. def center():
  540. """S.center(width[, fillchar]) -> unicode
  541. Return S centered in a Unicode string of length width. Padding is
  542. done using the specified fill character (default is a space).
  543. """
  544. def count():
  545. """S.count(sub[, start[, end]]) -> int
  546. Return the number of non-overlapping occurrences of substring sub in
  547. Unicode string S[start:end]. Optional arguments start and end are
  548. interpreted as in slice notation.
  549. """
  550. def decode():
  551. """S.decode(encoding=None, errors='strict') -> string or unicode
  552. Decode S using the codec registered for encoding. encoding defaults
  553. to the default encoding. errors may be given to set a different error
  554. handling scheme. Default is 'strict' meaning that encoding errors raise
  555. a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'
  556. as well as any other name registered with codecs.register_error that is
  557. able to handle UnicodeDecodeErrors.
  558. """
  559. def encode():
  560. """S.encode(encoding=None, errors='strict') -> string or unicode
  561. Encode S using the codec registered for encoding. encoding defaults
  562. to the default encoding. errors may be given to set a different error
  563. handling scheme. Default is 'strict' meaning that encoding errors raise
  564. a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
  565. 'xmlcharrefreplace' as well as any other name registered with
  566. codecs.register_error that can handle UnicodeEncodeErrors.
  567. """
  568. def endswith():
  569. """S.endswith(suffix[, start[, end]]) -> bool
  570. Return True if S ends with the specified suffix, False otherwise.
  571. With optional start, test S beginning at that position.
  572. With optional end, stop comparing S at that position.
  573. suffix can also be a tuple of strings to try.
  574. """
  575. def expandtabs():
  576. """S.expandtabs([tabsize]) -> unicode
  577. Return a copy of S where all tab characters are expanded using spaces.
  578. If tabsize is not given, a tab size of 8 characters is assumed.
  579. """
  580. def find():
  581. """S.find(sub[, start[, end]]) -> int
  582. Return the lowest index in S where substring sub is found,
  583. such that sub is contained within S[start:end]. Optional
  584. arguments start and end are interpreted as in slice notation.
  585. Return -1 on failure.
  586. """
  587. def format():
  588. """S.format(*args, **kwargs) -> unicode
  589. Return a formatted version of S, using substitutions from args and
  590. kwargs. The substitutions are identified by braces ('{' and '}').
  591. """
  592. def index():
  593. """S.index(sub[, start[, end]]) -> int
  594. Like S.find() but raise ValueError when the substring is not found.
  595. """
  596. def isalnum():
  597. """S.isalnum() -> bool
  598. Return True if all characters in S are alphanumeric
  599. and there is at least one character in S, False otherwise.
  600. """
  601. def isalpha():
  602. """S.isalpha() -> bool
  603. Return True if all characters in S are alphabetic
  604. and there is at least one character in S, False otherwise.
  605. """
  606. def isdecimal():
  607. """S.isdecimal() -> bool
  608. Return True if there are only decimal characters in S,
  609. False otherwise.
  610. """
  611. def isdigit():
  612. """S.isdigit() -> bool
  613. Return True if all characters in S are digits
  614. and there is at least one character in S, False otherwise.
  615. """
  616. def islower():
  617. """S.islower() -> bool
  618. Return True if all cased characters in S are lowercase and there is
  619. at least one cased character in S, False otherwise.
  620. """
  621. def isnumeric():
  622. """S.isnumeric() -> bool
  623. Return True if there are only numeric characters in S,
  624. False otherwise.
  625. """
  626. def isspace():
  627. """S.isspace() -> bool
  628. Return True if all characters in S are whitespace
  629. and there is at least one character in S, False otherwise.
  630. """
  631. def istitle():
  632. """S.istitle() -> bool
  633. Return True if S is a titlecased string and there is at least one
  634. character in S, i.e. upper- and titlecase characters may only
  635. follow uncased characters and lowercase characters only cased ones.
  636. Return False otherwise.
  637. """
  638. def isupper():
  639. """S.isupper() -> bool
  640. Return True if all cased characters in S are uppercase and there is
  641. at least one cased character in S, False otherwise.
  642. """
  643. def join():
  644. """S.join(iterable) -> unicode
  645. Return a string which is the concatenation of the strings in the
  646. iterable. The separator between elements is S.
  647. """
  648. def ljust():
  649. """S.ljust(width[, fillchar]) -> int
  650. Return S left-justified in a Unicode string of length width. Padding is
  651. done using the specified fill character (default is a space).
  652. """
  653. def lower():
  654. """S.lower() -> unicode
  655. Return a copy of the string S converted to lowercase.
  656. """
  657. def lstrip():
  658. """S.lstrip([chars]) -> unicode
  659. Return a copy of the string S with leading whitespace removed.
  660. If chars is given and not None, remove characters in chars instead.
  661. If chars is a str, it will be converted to unicode before stripping
  662. """
  663. def partition():
  664. """S.partition(sep) -> (head, sep, tail)
  665. Search for the separator sep in S, and return the part before it,
  666. the separator itself, and the part after it. If the separator is not
  667. found, return S and two empty strings.
  668. """
  669. def replace():
  670. """S.replace(old, new[, count]) -> unicode
  671. Return a copy of S with all occurrences of substring
  672. old replaced by new. If the optional argument count is
  673. given, only the first count occurrences are replaced.
  674. """
  675. def rfind():
  676. """S.rfind(sub[, start[, end]]) -> int
  677. Return the highest index in S where substring sub is found,
  678. such that sub is contained within S[start:end]. Optional
  679. arguments start and end are interpreted as in slice notation.
  680. Return -1 on failure.
  681. """
  682. def rindex():
  683. """S.rindex(sub[, start[, end]]) -> int
  684. Like S.rfind() but raise ValueError when the substring is not found.
  685. """
  686. def rjust():
  687. """S.rjust(width[, fillchar]) -> unicode
  688. Return S right-justified in a Unicode string of length width. Padding
  689. is done using the specified fill character (default is a space).
  690. """
  691. def rpartition():
  692. """S.rpartition(sep) -> (head, sep, tail)
  693. Search for the separator sep in S, starting at the end of S, and return
  694. the part before it, the separator itself, and the part after it. If
  695. the separator is not found, return two empty strings and S.
  696. """
  697. def rsplit():
  698. """S.rsplit(sep=None, maxsplit=-1) -> list of strings
  699. Return a list of the words in S, using sep as the
  700. delimiter string, starting at the end of the string and
  701. working to the front. If maxsplit is given, at most maxsplit
  702. splits are done. If sep is not specified, any whitespace string
  703. is a separator.
  704. """
  705. def rstrip():
  706. """S.rstrip([chars]) -> unicode
  707. Return a copy of the string S with trailing whitespace removed.
  708. If chars is given and not None, remove characters in chars instead.
  709. If chars is a str, it will be converted to unicode before stripping
  710. """
  711. def split():
  712. """S.split(sep=None, maxsplit=-1) -> list of strings
  713. Return a list of the words in S, using sep as the
  714. delimiter string. If maxsplit is given, at most maxsplit
  715. splits are done. If sep is not specified or is None, any
  716. whitespace string is a separator and empty strings are
  717. removed from the result.
  718. """
  719. def splitlines():
  720. """S.splitlines(keepends=False) -> list of strings
  721. Return a list of the lines in S, breaking at line boundaries.
  722. Line breaks are not included in the resulting list unless keepends
  723. is given and true.
  724. """
  725. def startswith():
  726. """S.startswith(prefix[, start[, end]]) -> bool
  727. Return True if S starts with the specified prefix, False otherwise.
  728. With optional start, test S beginning at that position.
  729. With optional end, stop comparing S at that position.
  730. prefix can also be a tuple of strings to try.
  731. """
  732. def strip():
  733. """S.strip([chars]) -> unicode
  734. Return a copy of the string S with leading and trailing
  735. whitespace removed.
  736. If chars is given and not None, remove characters in chars instead.
  737. If chars is a str, it will be converted to unicode before stripping
  738. """
  739. def swapcase():
  740. """S.swapcase() -> unicode
  741. Return a copy of S with uppercase characters converted to lowercase
  742. and vice versa.
  743. """
  744. def title():
  745. """S.title() -> unicode
  746. Return a titlecased version of S, i.e. words start with title case
  747. characters, all remaining cased characters have lower case.
  748. """
  749. def translate():
  750. """S.translate(table) -> unicode
  751. Return a copy of the string S, where all characters have been mapped
  752. through the given translation table, which must be a mapping of
  753. Unicode ordinals to Unicode ordinals, Unicode strings or None.
  754. Unmapped characters are left untouched. Characters mapped to None
  755. are deleted.
  756. """
  757. def upper():
  758. """S.upper() -> unicode
  759. Return a copy of S converted to uppercase.
  760. """
  # Docstring carrier only (no body): its __doc__ is passed as the ``doc``
  # of the ``zfill`` entry in W_UnicodeObject.typedef.
  def zfill():
      """S.zfill(width) -> unicode
      Pad a numeric string S with zeros on the left, to fill a field
      of the specified width. The string S is never truncated.
      """
  # Type definition registered under the name "unicode".
  # basestring_typedef is passed as the second positional argument
  # (presumably the base typedef -- confirm against TypeDef).
  # Each keyword maps an attribute name to an interp-level
  # W_UnicodeObject.descr_* method wrapped with interp2app; where a ``doc``
  # is supplied it is the docstring of the same-named UnicodeDocstrings stub.
  W_UnicodeObject.typedef = TypeDef(
      "unicode", basestring_typedef,
      __new__ = interp2app(W_UnicodeObject.descr_new),
      __doc__ = UnicodeDocstrings.__doc__,

      # --- representation, hashing and comparisons ---
      __repr__ = interp2app(W_UnicodeObject.descr_repr,
          doc=UnicodeDocstrings.__repr__.__doc__),
      __str__ = interp2app(W_UnicodeObject.descr_str,
          doc=UnicodeDocstrings.__str__.__doc__),
      __hash__ = interp2app(W_UnicodeObject.descr_hash,
          doc=UnicodeDocstrings.__hash__.__doc__),
      __eq__ = interp2app(W_UnicodeObject.descr_eq,
          doc=UnicodeDocstrings.__eq__.__doc__),
      __ne__ = interp2app(W_UnicodeObject.descr_ne,
          doc=UnicodeDocstrings.__ne__.__doc__),
      __lt__ = interp2app(W_UnicodeObject.descr_lt,
          doc=UnicodeDocstrings.__lt__.__doc__),
      __le__ = interp2app(W_UnicodeObject.descr_le,
          doc=UnicodeDocstrings.__le__.__doc__),
      __gt__ = interp2app(W_UnicodeObject.descr_gt,
          doc=UnicodeDocstrings.__gt__.__doc__),
      __ge__ = interp2app(W_UnicodeObject.descr_ge,
          doc=UnicodeDocstrings.__ge__.__doc__),

      # --- sequence protocol ---
      __len__ = interp2app(W_UnicodeObject.descr_len,
          doc=UnicodeDocstrings.__len__.__doc__),
      __contains__ = interp2app(W_UnicodeObject.descr_contains,
          doc=UnicodeDocstrings.__contains__.__doc__),
      __add__ = interp2app(W_UnicodeObject.descr_add,
          doc=UnicodeDocstrings.__add__.__doc__),
      __mul__ = interp2app(W_UnicodeObject.descr_mul,
          doc=UnicodeDocstrings.__mul__.__doc__),
      # __rmul__ reuses the same implementation as __mul__ (descr_mul);
      # only the docstring differs.
      __rmul__ = interp2app(W_UnicodeObject.descr_mul,
          doc=UnicodeDocstrings.__rmul__.__doc__),
      __getitem__ = interp2app(W_UnicodeObject.descr_getitem,
          doc=UnicodeDocstrings.__getitem__.__doc__),
      __getslice__ = interp2app(W_UnicodeObject.descr_getslice,
          doc=UnicodeDocstrings.__getslice__.__doc__),

      # --- public string methods ---
      capitalize = interp2app(W_UnicodeObject.descr_capitalize,
          doc=UnicodeDocstrings.capitalize.__doc__),
      center = interp2app(W_UnicodeObject.descr_center,
          doc=UnicodeDocstrings.center.__doc__),
      count = interp2app(W_UnicodeObject.descr_count,
          doc=UnicodeDocstrings.count.__doc__),
      decode = interp2app(W_UnicodeObject.descr_decode,
          doc=UnicodeDocstrings.decode.__doc__),
      encode = interp2app(W_UnicodeObject.descr_encode,
          doc=UnicodeDocstrings.encode.__doc__),
      expandtabs = interp2app(W_UnicodeObject.descr_expandtabs,
          doc=UnicodeDocstrings.expandtabs.__doc__),
      find = interp2app(W_UnicodeObject.descr_find,
          doc=UnicodeDocstrings.find.__doc__),
      rfind = interp2app(W_UnicodeObject.descr_rfind,
          doc=UnicodeDocstrings.rfind.__doc__),
      index = interp2app(W_UnicodeObject.descr_index,
          doc=UnicodeDocstrings.index.__doc__),
      rindex = interp2app(W_UnicodeObject.descr_rindex,
          doc=UnicodeDocstrings.rindex.__doc__),
      isalnum = interp2app(W_UnicodeObject.descr_isalnum,
          doc=UnicodeDocstrings.isalnum.__doc__),
      isalpha = interp2app(W_UnicodeObject.descr_isalpha,
          doc=UnicodeDocstrings.isalpha.__doc__),
      isdecimal = interp2app(W_UnicodeObject.descr_isdecimal,
          doc=UnicodeDocstrings.isdecimal.__doc__),
      isdigit = interp2app(W_UnicodeObject.descr_isdigit,
          doc=UnicodeDocstrings.isdigit.__doc__),
      islower = interp2app(W_UnicodeObject.descr_islower,
          doc=UnicodeDocstrings.islower.__doc__),
      isnumeric = interp2app(W_UnicodeObject.descr_isnumeric,
          doc=UnicodeDocstrings.isnumeric.__doc__),
      isspace = interp2app(W_UnicodeObject.descr_isspace,
          doc=UnicodeDocstrings.isspace.__doc__),
      istitle = interp2app(W_UnicodeObject.descr_istitle,
          doc=UnicodeDocstrings.istitle.__doc__),
      isupper = interp2app(W_UnicodeObject.descr_isupper,
          doc=UnicodeDocstrings.isupper.__doc__),
      join = interp2app(W_UnicodeObject.descr_join,
          doc=UnicodeDocstrings.join.__doc__),
      ljust = interp2app(W_UnicodeObject.descr_ljust,
          doc=UnicodeDocstrings.ljust.__doc__),
      rjust = interp2app(W_UnicodeObject.descr_rjust,
          doc=UnicodeDocstrings.rjust.__doc__),
      lower = interp2app(W_UnicodeObject.descr_lower,
          doc=UnicodeDocstrings.lower.__doc__),
      partition = interp2app(W_UnicodeObject.descr_partition,
          doc=UnicodeDocstrings.partition.__doc__),
      rpartition = interp2app(W_UnicodeObject.descr_rpartition,
          doc=UnicodeDocstrings.rpartition.__doc__),
      replace = interp2app(W_UnicodeObject.descr_replace,
          doc=UnicodeDocstrings.replace.__doc__),
      split = interp2app(W_UnicodeObject.descr_split,
          doc=UnicodeDocstrings.split.__doc__),
      rsplit = interp2app(W_UnicodeObject.descr_rsplit,
          doc=UnicodeDocstrings.rsplit.__doc__),
      splitlines = interp2app(W_UnicodeObject.descr_splitlines,
          doc=UnicodeDocstrings.splitlines.__doc__),
      startswith = interp2app(W_UnicodeObject.descr_startswith,
          doc=UnicodeDocstrings.startswith.__doc__),
      endswith = interp2app(W_UnicodeObject.descr_endswith,
          doc=UnicodeDocstrings.endswith.__doc__),
      strip = interp2app(W_UnicodeObject.descr_strip,
          doc=UnicodeDocstrings.strip.__doc__),
      lstrip = interp2app(W_UnicodeObject.descr_lstrip,
          doc=UnicodeDocstrings.lstrip.__doc__),
      rstrip = interp2app(W_UnicodeObject.descr_rstrip,
          doc=UnicodeDocstrings.rstrip.__doc__),
      swapcase = interp2app(W_UnicodeObject.descr_swapcase,
          doc=UnicodeDocstrings.swapcase.__doc__),
      title = interp2app(W_UnicodeObject.descr_title,
          doc=UnicodeDocstrings.title.__doc__),
      translate = interp2app(W_UnicodeObject.descr_translate,
          doc=UnicodeDocstrings.translate.__doc__),
      upper = interp2app(W_UnicodeObject.descr_upper,
          doc=UnicodeDocstrings.upper.__doc__),
      zfill = interp2app(W_UnicodeObject.descr_zfill,
          doc=UnicodeDocstrings.zfill.__doc__),

      # --- formatting and pickling support ---
      format = interp2app(W_UnicodeObject.descr_format,
          doc=UnicodeDocstrings.format.__doc__),
      __format__ = interp2app(W_UnicodeObject.descr__format__,
          doc=UnicodeDocstrings.__format__.__doc__),
      __mod__ = interp2app(W_UnicodeObject.descr_mod,
          doc=UnicodeDocstrings.__mod__.__doc__),
      __getnewargs__ = interp2app(W_UnicodeObject.descr_getnewargs,
          doc=UnicodeDocstrings.__getnewargs__.__doc__),
      # The two underscore-prefixed formatter helpers are registered
      # without a ``doc`` argument.
      _formatter_parser = interp2app(W_UnicodeObject.descr_formatter_parser),
      _formatter_field_name_split =
          interp2app(W_UnicodeObject.descr_formatter_field_name_split),
  )
  # NOTE(review): the meaning of this flag is not visible in this file;
  # presumably a CPython-compatibility switch for sequence operations --
  # confirm against TypeDef.
  W_UnicodeObject.typedef.flag_sequence_bug_compat = True
  893. def _create_list_from_unicode(value):
  894. # need this helper function to allow the jit to look inside and inline
  895. # listview_unicode
  896. return [s for s in value]
  # Module-level instance wrapping the empty unicode string u'', stored as
  # a class attribute of W_UnicodeObject.
  W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
  # Helper for converting int/long
  def unicode_to_decimal_w(space, w_unistr):
      """Translate a wrapped unicode string into a plain byte string.

      Per character of ``w_unistr._value``:
        * whitespace (unicodedb.isspace) becomes a single ' ';
        * a character with a decimal value (unicodedb.decimal) becomes the
          matching ASCII digit '0'..'9';
        * any other code point in the range 1..255 is copied via chr();
        * everything else raises an app-level UnicodeEncodeError whose
          arguments are ('decimal', w_unistr, index, index + 1,
          'invalid decimal Unicode string').

      Raises an app-level TypeError when ``w_unistr`` is not a
      W_UnicodeObject.
      """
      if not isinstance(w_unistr, W_UnicodeObject):
          raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
      value = w_unistr._value
      length = len(value)
      # One output character per input character, filled in by index.
      chars = ['\0'] * length
      ascii_digits = ['0', '1', '2', '3', '4',
                      '5', '6', '7', '8', '9']
      for pos in xrange(length):
          code = ord(value[pos])
          if unicodedb.isspace(code):
              chars[pos] = ' '
              continue
          try:
              chars[pos] = ascii_digits[unicodedb.decimal(code)]
          except KeyError:
              # No decimal value for this code point: only 1..255 may pass
              # through unchanged (code point 0 is rejected as well).
              if code <= 0 or code >= 256:
                  raise OperationError(
                      space.w_UnicodeEncodeError,
                      space.newtuple([
                          space.wrap('decimal'),
                          w_unistr,
                          space.wrap(pos),
                          space.wrap(pos + 1),
                          space.wrap('invalid decimal Unicode string')]))
              chars[pos] = chr(code)
      return ''.join(chars)
  # Instantiate an escape function from make_unicode_escape_function with
  # the options below; the call returns a pair and only its first element
  # is kept (the second is discarded into ``_``).
  # NOTE(review): judging by the name and the quotes=True / prefix='u'
  # options this builds the quoted u'...' repr text -- confirm against
  # rpython.rlib.runicode.
  _repr_function, _ = make_unicode_escape_function(
      pass_printable=False, unicode_output=False, quotes=True, prefix='u')