PageRenderTime 59ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/pypy/rlib/test/test_runicode.py

https://bitbucket.org/dac_io/pypy
Python | 730 lines | 719 code | 10 blank | 1 comment | 14 complexity | 61208340aa6083d8abd2d8101b38afdc MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import py
  3. import sys, random
  4. from pypy.rlib import runicode
  5. def test_unichr():
  6. a = runicode.UNICHR(0xffff)
  7. assert a == u'\uffff'
  8. if runicode.MAXUNICODE > 0xffff:
  9. a = runicode.UNICHR(0x10000)
  10. if sys.maxunicode < 0x10000:
  11. assert len(a) == 2 # surrogates
  12. else:
  13. assert len(a) == 1
  14. else:
  15. py.test.raises(ValueError, runicode.UNICHR, 0x10000)
  16. class UnicodeTests(object):
  17. def typeequals(self, x, y):
  18. assert x == y
  19. assert type(x) is type(y)
  20. def getdecoder(self, encoding):
  21. return getattr(runicode, "str_decode_%s" % encoding.replace("-", "_"))
  22. def getencoder(self, encoding):
  23. return getattr(runicode,
  24. "unicode_encode_%s" % encoding.replace("-", "_"))
  25. def checkdecode(self, s, encoding):
  26. decoder = self.getdecoder(encoding)
  27. try:
  28. if isinstance(s, str):
  29. trueresult = s.decode(encoding)
  30. else:
  31. trueresult = s
  32. s = s.encode(encoding)
  33. except LookupError, e:
  34. py.test.skip(e)
  35. result, consumed = decoder(s, len(s), True)
  36. assert consumed == len(s)
  37. self.typeequals(trueresult, result)
  38. def checkencode(self, s, encoding):
  39. encoder = self.getencoder(encoding)
  40. try:
  41. if isinstance(s, unicode):
  42. trueresult = s.encode(encoding)
  43. else:
  44. trueresult = s
  45. s = s.decode(encoding)
  46. except LookupError, e:
  47. py.test.skip(e)
  48. result = encoder(s, len(s), True)
  49. self.typeequals(trueresult, result)
  50. def checkencodeerror(self, s, encoding, start, stop):
  51. called = [False]
  52. def errorhandler(errors, enc, msg, t, startingpos,
  53. endingpos):
  54. called[0] = True
  55. assert errors == "foo!"
  56. assert enc == encoding
  57. assert t is s
  58. assert start == startingpos
  59. assert stop == endingpos
  60. return "42424242", stop
  61. encoder = self.getencoder(encoding)
  62. result = encoder(s, len(s), "foo!", errorhandler)
  63. assert called[0]
  64. assert "42424242" in result
  65. def checkdecodeerror(self, s, encoding, start, stop,
  66. addstuff=True, msg=None):
  67. called = [0]
  68. def errorhandler(errors, enc, errmsg, t, startingpos,
  69. endingpos):
  70. called[0] += 1
  71. if called[0] == 1:
  72. assert errors == "foo!"
  73. assert enc == encoding
  74. assert t is s
  75. assert start == startingpos
  76. assert stop == endingpos
  77. if msg is not None:
  78. assert errmsg == msg
  79. return u"42424242", stop
  80. return u"", endingpos
  81. decoder = self.getdecoder(encoding)
  82. if addstuff:
  83. s += "some rest in ascii"
  84. result, _ = decoder(s, len(s), "foo!", True, errorhandler)
  85. assert called[0] > 0
  86. assert "42424242" in result
  87. if addstuff:
  88. assert result.endswith(u"some rest in ascii")
  89. class TestDecoding(UnicodeTests):
  90. # XXX test bom recognition in utf-16
  91. # XXX test proper error handling
  92. def test_all_ascii(self):
  93. for i in range(128):
  94. for encoding in "utf-8 latin-1 ascii".split():
  95. self.checkdecode(chr(i), encoding)
  96. def test_all_first_256(self):
  97. for i in range(256):
  98. for encoding in ("utf-7 utf-8 latin-1 utf-16 utf-16-be utf-16-le "
  99. "utf-32 utf-32-be utf-32-le").split():
  100. self.checkdecode(unichr(i), encoding)
  101. def test_first_10000(self):
  102. for i in range(10000):
  103. for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
  104. "utf-32 utf-32-be utf-32-le").split():
  105. self.checkdecode(unichr(i), encoding)
  106. def test_random(self):
  107. for i in range(10000):
  108. v = random.randrange(sys.maxunicode)
  109. if 0xd800 <= v <= 0xdfff:
  110. continue
  111. uni = unichr(v)
  112. if sys.version >= "2.7":
  113. self.checkdecode(uni, "utf-7")
  114. for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
  115. "utf-32 utf-32-be utf-32-le").split():
  116. self.checkdecode(uni, encoding)
  117. def test_maxunicode(self):
  118. uni = unichr(sys.maxunicode)
  119. if sys.version >= "2.7":
  120. self.checkdecode(uni, "utf-7")
  121. for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
  122. "utf-32 utf-32-be utf-32-le").split():
  123. self.checkdecode(uni, encoding)
  124. def test_ascii_error(self):
  125. self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
  126. def test_utf16_errors(self):
  127. # trunkated BOM
  128. for s in ["\xff", "\xfe"]:
  129. self.checkdecodeerror(s, "utf-16", 0, len(s), addstuff=False)
  130. for s in [
  131. # unexpected end of data ascii
  132. "\xff\xfeF",
  133. # unexpected end of data
  134. '\xff\xfe\xc0\xdb\x00', '\xff\xfe\xc0\xdb', '\xff\xfe\xc0',
  135. ]:
  136. self.checkdecodeerror(s, "utf-16", 2, len(s), addstuff=False)
  137. for s in [
  138. # illegal surrogate
  139. "\xff\xfe\xff\xdb\xff\xff",
  140. ]:
  141. self.checkdecodeerror(s, "utf-16", 2, 4, addstuff=False)
  142. def test_utf16_bugs(self):
  143. s = '\x80-\xe9\xdeL\xa3\x9b'
  144. py.test.raises(UnicodeDecodeError, runicode.str_decode_utf_16_le,
  145. s, len(s), True)
  146. def test_utf7_bugs(self):
  147. u = u'A\u2262\u0391.'
  148. assert runicode.unicode_encode_utf_7(u, len(u), None) == 'A+ImIDkQ.'
  149. def test_utf7_tofrom_utf8_bug(self):
  150. def _assert_decu7(input, expected):
  151. assert runicode.str_decode_utf_7(input, len(input), None) == (expected, len(input))
  152. _assert_decu7('+-', u'+')
  153. _assert_decu7('+-+-', u'++')
  154. _assert_decu7('+-+AOQ-', u'+\xe4')
  155. _assert_decu7('+AOQ-', u'\xe4')
  156. _assert_decu7('+AOQ-', u'\xe4')
  157. _assert_decu7('+AOQ- ', u'\xe4 ')
  158. _assert_decu7(' +AOQ-', u' \xe4')
  159. _assert_decu7(' +AOQ- ', u' \xe4 ')
  160. _assert_decu7('+AOQ-+AOQ-', u'\xe4\xe4')
  161. s_utf7 = 'Die M+AOQ-nner +AOQ-rgen sich!'
  162. s_utf8 = u'Die Männer ärgen sich!'
  163. s_utf8_esc = u'Die M\xe4nner \xe4rgen sich!'
  164. _assert_decu7(s_utf7, s_utf8_esc)
  165. _assert_decu7(s_utf7, s_utf8)
  166. assert runicode.unicode_encode_utf_7(s_utf8_esc, len(s_utf8_esc), None) == s_utf7
  167. assert runicode.unicode_encode_utf_7(s_utf8, len(s_utf8_esc), None) == s_utf7
  168. def test_utf7_partial(self):
  169. s = u"a+-b".encode('utf-7')
  170. assert s == "a+--b"
  171. decode = self.getdecoder('utf-7')
  172. assert decode(s, 1, None) == (u'a', 1)
  173. assert decode(s, 2, None) == (u'a', 1)
  174. assert decode(s, 3, None) == (u'a+', 3)
  175. assert decode(s, 4, None) == (u'a+-', 4)
  176. assert decode(s, 5, None) == (u'a+-b', 5)
  177. def test_utf7_surrogates(self):
  178. encode = self.getencoder('utf-7')
  179. u = u'\U000abcde'
  180. assert encode(u, len(u), None) == '+2m/c3g-'
  181. decode = self.getdecoder('utf-7')
  182. s = '+3ADYAA-'
  183. raises(UnicodeError, decode, s, len(s), None)
  184. def replace_handler(errors, codec, message, input, start, end):
  185. return u'?', end
  186. assert decode(s, len(s), None, final=True,
  187. errorhandler = replace_handler) == (u'??', len(s))
  188. class TestUTF8Decoding(UnicodeTests):
  189. def __init__(self):
  190. self.decoder = self.getdecoder('utf-8')
  191. def replace_handler(self, errors, codec, message, input, start, end):
  192. return u'\ufffd', end
  193. def ignore_handler(self, errors, codec, message, input, start, end):
  194. return u'', end
  195. def to_bytestring(self, bytes):
  196. return ''.join(chr(int(c, 16)) for c in bytes.split())
  197. def test_single_chars_utf8(self):
  198. for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
  199. self.checkdecode(s, "utf-8")
  200. def test_utf8_surrogate(self):
  201. # A surrogate should not be valid utf-8, but python 2.x accepts them.
  202. # This test will raise an error with python 3.x
  203. self.checkdecode(u"\ud800", "utf-8")
  204. def test_invalid_start_byte(self):
  205. """
  206. Test that an 'invalid start byte' error is raised when the first byte
  207. is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
  208. 4-bytes sequence. The invalid start byte is replaced with a single
  209. U+FFFD when errors='replace'.
  210. E.g. <80> is a continuation byte and can appear only after a start byte.
  211. """
  212. FFFD = u'\ufffd'
  213. for byte in '\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
  214. raises(UnicodeDecodeError, self.decoder, byte, 1, None, final=True)
  215. self.checkdecodeerror(byte, 'utf-8', 0, 1, addstuff=False,
  216. msg='invalid start byte')
  217. assert self.decoder(byte, 1, None, final=True,
  218. errorhandler=self.replace_handler) == (FFFD, 1)
  219. assert (self.decoder('aaaa' + byte + 'bbbb', 9, None,
  220. final=True, errorhandler=self.replace_handler) ==
  221. (u'aaaa'+ FFFD + u'bbbb', 9))
  222. assert self.decoder(byte, 1, None, final=True,
  223. errorhandler=self.ignore_handler) == (u'', 1)
  224. assert (self.decoder('aaaa' + byte + 'bbbb', 9, None,
  225. final=True, errorhandler=self.ignore_handler) ==
  226. (u'aaaabbbb', 9))
  227. def test_unexpected_end_of_data(self):
  228. """
  229. Test that an 'unexpected end of data' error is raised when the string
  230. ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
  231. enough continuation bytes. The incomplete sequence is replaced with a
  232. single U+FFFD when errors='replace'.
  233. E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
  234. sequence, but it's followed by only 2 valid continuation bytes and the
  235. last continuation bytes is missing.
  236. Note: the continuation bytes must be all valid, if one of them is
  237. invalid another error will be raised.
  238. """
  239. sequences = [
  240. 'C2', 'DF',
  241. 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
  242. 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
  243. 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
  244. 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
  245. 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
  246. 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
  247. ]
  248. FFFD = u'\ufffd'
  249. for seq in sequences:
  250. seq = self.to_bytestring(seq)
  251. raises(UnicodeDecodeError, self.decoder, seq, len(seq),
  252. None, final=True)
  253. self.checkdecodeerror(seq, 'utf-8', 0, len(seq), addstuff=False,
  254. msg='unexpected end of data')
  255. assert self.decoder(seq, len(seq), None, final=True,
  256. errorhandler=self.replace_handler) == (FFFD, len(seq))
  257. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  258. final=True, errorhandler=self.replace_handler) ==
  259. (u'aaaa'+ FFFD + u'bbbb', len(seq) + 8))
  260. assert self.decoder(seq, len(seq), None, final=True,
  261. errorhandler=self.ignore_handler) == (u'', len(seq))
  262. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  263. final=True, errorhandler=self.ignore_handler) ==
  264. (u'aaaabbbb', len(seq) + 8))
  265. def test_invalid_cb_for_2bytes_seq(self):
  266. """
  267. Test that an 'invalid continuation byte' error is raised when the
  268. continuation byte of a 2-bytes sequence is invalid. The start byte
  269. is replaced by a single U+FFFD and the second byte is handled
  270. separately when errors='replace'.
  271. E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
  272. sequence, but 41 is not a valid continuation byte because it's the
  273. ASCII letter 'A'.
  274. """
  275. FFFD = u'\ufffd'
  276. FFFDx2 = FFFD * 2
  277. sequences = [
  278. ('C2 00', FFFD+u'\x00'), ('C2 7F', FFFD+u'\x7f'),
  279. ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
  280. ('DF 00', FFFD+u'\x00'), ('DF 7F', FFFD+u'\x7f'),
  281. ('DF C0', FFFDx2), ('DF FF', FFFDx2),
  282. ]
  283. for seq, res in sequences:
  284. seq = self.to_bytestring(seq)
  285. raises(UnicodeDecodeError, self.decoder, seq, len(seq),
  286. None, final=True)
  287. self.checkdecodeerror(seq, 'utf-8', 0, 1, addstuff=False,
  288. msg='invalid continuation byte')
  289. assert self.decoder(seq, len(seq), None, final=True,
  290. errorhandler=self.replace_handler) == (res, len(seq))
  291. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  292. final=True, errorhandler=self.replace_handler) ==
  293. (u'aaaa' + res + u'bbbb', len(seq) + 8))
  294. res = res.replace(FFFD, u'')
  295. assert self.decoder(seq, len(seq), None, final=True,
  296. errorhandler=self.ignore_handler) == (res, len(seq))
  297. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  298. final=True, errorhandler=self.ignore_handler) ==
  299. (u'aaaa' + res + u'bbbb', len(seq) + 8))
  300. def test_invalid_cb_for_3bytes_seq(self):
  301. """
  302. Test that an 'invalid continuation byte' error is raised when the
  303. continuation byte(s) of a 3-bytes sequence are invalid. When
  304. errors='replace', if the first continuation byte is valid, the first
  305. two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
  306. third byte is handled separately, otherwise only the start byte is
  307. replaced with a U+FFFD and the other continuation bytes are handled
  308. separately.
  309. E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
  310. sequence, 80 is a valid continuation byte, but 41 is not a valid cb
  311. because it's the ASCII letter 'A'.
  312. Note: when the start byte is E0 or ED, the valid ranges for the first
  313. continuation byte are limited to A0..BF and 80..9F respectively.
  314. However, when the start byte is ED, Python 2 considers all the bytes
  315. in range 80..BF valid. This is fixed in Python 3.
  316. """
  317. FFFD = u'\ufffd'
  318. FFFDx2 = FFFD * 2
  319. sequences = [
  320. ('E0 00', FFFD+u'\x00'), ('E0 7F', FFFD+u'\x7f'), ('E0 80', FFFDx2),
  321. ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
  322. ('E0 A0 00', FFFD+u'\x00'), ('E0 A0 7F', FFFD+u'\x7f'),
  323. ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
  324. ('E0 BF 00', FFFD+u'\x00'), ('E0 BF 7F', FFFD+u'\x7f'),
  325. ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+u'\x00'),
  326. ('E1 7F', FFFD+u'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
  327. ('E1 80 00', FFFD+u'\x00'), ('E1 80 7F', FFFD+u'\x7f'),
  328. ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
  329. ('E1 BF 00', FFFD+u'\x00'), ('E1 BF 7F', FFFD+u'\x7f'),
  330. ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+u'\x00'),
  331. ('EC 7F', FFFD+u'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
  332. ('EC 80 00', FFFD+u'\x00'), ('EC 80 7F', FFFD+u'\x7f'),
  333. ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
  334. ('EC BF 00', FFFD+u'\x00'), ('EC BF 7F', FFFD+u'\x7f'),
  335. ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+u'\x00'),
  336. ('ED 7F', FFFD+u'\x7f'),
  337. # ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
  338. ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+u'\x00'),
  339. ('ED 80 7F', FFFD+u'\x7f'), ('ED 80 C0', FFFDx2),
  340. ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+u'\x00'),
  341. ('ED 9F 7F', FFFD+u'\x7f'), ('ED 9F C0', FFFDx2),
  342. ('ED 9F FF', FFFDx2), ('EE 00', FFFD+u'\x00'),
  343. ('EE 7F', FFFD+u'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
  344. ('EE 80 00', FFFD+u'\x00'), ('EE 80 7F', FFFD+u'\x7f'),
  345. ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
  346. ('EE BF 00', FFFD+u'\x00'), ('EE BF 7F', FFFD+u'\x7f'),
  347. ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+u'\x00'),
  348. ('EF 7F', FFFD+u'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
  349. ('EF 80 00', FFFD+u'\x00'), ('EF 80 7F', FFFD+u'\x7f'),
  350. ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
  351. ('EF BF 00', FFFD+u'\x00'), ('EF BF 7F', FFFD+u'\x7f'),
  352. ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
  353. ]
  354. for seq, res in sequences:
  355. seq = self.to_bytestring(seq)
  356. raises(UnicodeDecodeError, self.decoder, seq, len(seq),
  357. None, final=True)
  358. self.checkdecodeerror(seq, 'utf-8', 0, len(seq)-1, addstuff=False,
  359. msg='invalid continuation byte')
  360. assert self.decoder(seq, len(seq), None, final=True,
  361. errorhandler=self.replace_handler) == (res, len(seq))
  362. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  363. final=True, errorhandler=self.replace_handler) ==
  364. (u'aaaa' + res + u'bbbb', len(seq) + 8))
  365. res = res.replace(FFFD, u'')
  366. assert self.decoder(seq, len(seq), None, final=True,
  367. errorhandler=self.ignore_handler) == (res, len(seq))
  368. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  369. final=True, errorhandler=self.ignore_handler) ==
  370. (u'aaaa' + res + u'bbbb', len(seq) + 8))
  371. def test_invalid_cb_for_4bytes_seq(self):
  372. """
  373. Test that an 'invalid continuation byte' error is raised when the
  374. continuation byte(s) of a 4-bytes sequence are invalid. When
  375. errors='replace',the start byte and all the following valid
  376. continuation bytes are replaced with a single U+FFFD, and all the bytes
  377. starting from the first invalid continuation bytes (included) are
  378. handled separately.
  379. E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
  380. sequence, 80 is a valid continuation byte, but 41 is not a valid cb
  381. because it's the ASCII letter 'A'.
  382. Note: when the start byte is E0 or ED, the valid ranges for the first
  383. continuation byte are limited to A0..BF and 80..9F respectively.
  384. However, when the start byte is ED, Python 2 considers all the bytes
  385. in range 80..BF valid. This is fixed in Python 3.
  386. """
  387. FFFD = u'\ufffd'
  388. FFFDx2 = FFFD * 2
  389. sequences = [
  390. ('F0 00', FFFD+u'\x00'), ('F0 7F', FFFD+u'\x7f'), ('F0 80', FFFDx2),
  391. ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
  392. ('F0 90 00', FFFD+u'\x00'), ('F0 90 7F', FFFD+u'\x7f'),
  393. ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
  394. ('F0 BF 00', FFFD+u'\x00'), ('F0 BF 7F', FFFD+u'\x7f'),
  395. ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
  396. ('F0 90 80 00', FFFD+u'\x00'), ('F0 90 80 7F', FFFD+u'\x7f'),
  397. ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
  398. ('F0 90 BF 00', FFFD+u'\x00'), ('F0 90 BF 7F', FFFD+u'\x7f'),
  399. ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
  400. ('F0 BF 80 00', FFFD+u'\x00'), ('F0 BF 80 7F', FFFD+u'\x7f'),
  401. ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
  402. ('F0 BF BF 00', FFFD+u'\x00'), ('F0 BF BF 7F', FFFD+u'\x7f'),
  403. ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
  404. ('F1 00', FFFD+u'\x00'), ('F1 7F', FFFD+u'\x7f'), ('F1 C0', FFFDx2),
  405. ('F1 FF', FFFDx2), ('F1 80 00', FFFD+u'\x00'),
  406. ('F1 80 7F', FFFD+u'\x7f'), ('F1 80 C0', FFFDx2),
  407. ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+u'\x00'),
  408. ('F1 BF 7F', FFFD+u'\x7f'), ('F1 BF C0', FFFDx2),
  409. ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+u'\x00'),
  410. ('F1 80 80 7F', FFFD+u'\x7f'), ('F1 80 80 C0', FFFDx2),
  411. ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+u'\x00'),
  412. ('F1 80 BF 7F', FFFD+u'\x7f'), ('F1 80 BF C0', FFFDx2),
  413. ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+u'\x00'),
  414. ('F1 BF 80 7F', FFFD+u'\x7f'), ('F1 BF 80 C0', FFFDx2),
  415. ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+u'\x00'),
  416. ('F1 BF BF 7F', FFFD+u'\x7f'), ('F1 BF BF C0', FFFDx2),
  417. ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+u'\x00'),
  418. ('F3 7F', FFFD+u'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
  419. ('F3 80 00', FFFD+u'\x00'), ('F3 80 7F', FFFD+u'\x7f'),
  420. ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
  421. ('F3 BF 00', FFFD+u'\x00'), ('F3 BF 7F', FFFD+u'\x7f'),
  422. ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
  423. ('F3 80 80 00', FFFD+u'\x00'), ('F3 80 80 7F', FFFD+u'\x7f'),
  424. ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
  425. ('F3 80 BF 00', FFFD+u'\x00'), ('F3 80 BF 7F', FFFD+u'\x7f'),
  426. ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
  427. ('F3 BF 80 00', FFFD+u'\x00'), ('F3 BF 80 7F', FFFD+u'\x7f'),
  428. ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
  429. ('F3 BF BF 00', FFFD+u'\x00'), ('F3 BF BF 7F', FFFD+u'\x7f'),
  430. ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
  431. ('F4 00', FFFD+u'\x00'), ('F4 7F', FFFD+u'\x7f'), ('F4 90', FFFDx2),
  432. ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
  433. ('F4 80 00', FFFD+u'\x00'), ('F4 80 7F', FFFD+u'\x7f'),
  434. ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
  435. ('F4 8F 00', FFFD+u'\x00'), ('F4 8F 7F', FFFD+u'\x7f'),
  436. ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
  437. ('F4 80 80 00', FFFD+u'\x00'), ('F4 80 80 7F', FFFD+u'\x7f'),
  438. ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
  439. ('F4 80 BF 00', FFFD+u'\x00'), ('F4 80 BF 7F', FFFD+u'\x7f'),
  440. ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
  441. ('F4 8F 80 00', FFFD+u'\x00'), ('F4 8F 80 7F', FFFD+u'\x7f'),
  442. ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
  443. ('F4 8F BF 00', FFFD+u'\x00'), ('F4 8F BF 7F', FFFD+u'\x7f'),
  444. ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
  445. ]
  446. for seq, res in sequences:
  447. seq = self.to_bytestring(seq)
  448. raises(UnicodeDecodeError, self.decoder, seq, len(seq),
  449. None, final=True)
  450. self.checkdecodeerror(seq, 'utf-8', 0, len(seq)-1, addstuff=False,
  451. msg='invalid continuation byte')
  452. assert self.decoder(seq, len(seq), None, final=True,
  453. errorhandler=self.replace_handler) == (res, len(seq))
  454. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  455. final=True, errorhandler=self.replace_handler) ==
  456. (u'aaaa' + res + u'bbbb', len(seq) + 8))
  457. res = res.replace(FFFD, u'')
  458. assert self.decoder(seq, len(seq), None, final=True,
  459. errorhandler=self.ignore_handler) == (res, len(seq))
  460. assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, None,
  461. final=True, errorhandler=self.ignore_handler) ==
  462. (u'aaaa' + res + u'bbbb', len(seq) + 8))
  463. def test_utf8_errors(self):
  464. # unexpected end of data
  465. for s in ['\xd7', '\xd6', '\xeb\x96', '\xf0\x90\x91', '\xc2', '\xdf']:
  466. self.checkdecodeerror(s, 'utf-8', 0, len(s), addstuff=False,
  467. msg='unexpected end of data')
  468. # invalid data 2 byte
  469. for s in ["\xd7\x50", "\xd6\x06", "\xd6\xD6"]:
  470. self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True,
  471. msg='invalid continuation byte')
  472. # invalid data 3 byte
  473. for s in ["\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95"]:
  474. self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True,
  475. msg='invalid continuation byte')
  476. for s in ["\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5"]:
  477. self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True,
  478. msg='invalid continuation byte')
  479. # invalid data 4 byte
  480. for s in ["\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93"]:
  481. self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True,
  482. msg='invalid continuation byte')
  483. for s in ["\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93"]:
  484. self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True,
  485. msg='invalid continuation byte')
  486. for s in ["\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3"]:
  487. self.checkdecodeerror(s, "utf-8", 0, 3, addstuff=True,
  488. msg='invalid continuation byte')
  489. def test_issue8271(self):
  490. # From CPython
  491. # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
  492. # only the start byte and the continuation byte(s) are now considered
  493. # invalid, instead of the number of bytes specified by the start byte.
  494. # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
  495. # table 3-8, Row 2) for more information about the algorithm used.
  496. FFFD = u'\ufffd'
  497. sequences = [
  498. # invalid start bytes
  499. ('\x80', FFFD), # continuation byte
  500. ('\x80\x80', FFFD*2), # 2 continuation bytes
  501. ('\xc0', FFFD),
  502. ('\xc0\xc0', FFFD*2),
  503. ('\xc1', FFFD),
  504. ('\xc1\xc0', FFFD*2),
  505. ('\xc0\xc1', FFFD*2),
  506. # with start byte of a 2-byte sequence
  507. ('\xc2', FFFD), # only the start byte
  508. ('\xc2\xc2', FFFD*2), # 2 start bytes
  509. ('\xc2\xc2\xc2', FFFD*3), # 2 start bytes
  510. ('\xc2\x41', FFFD+'A'), # invalid continuation byte
  511. # with start byte of a 3-byte sequence
  512. ('\xe1', FFFD), # only the start byte
  513. ('\xe1\xe1', FFFD*2), # 2 start bytes
  514. ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
  515. ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
  516. ('\xe1\x80', FFFD), # only 1 continuation byte
  517. ('\xe1\x41', FFFD+'A'), # invalid continuation byte
  518. ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
  519. ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
  520. ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
  521. ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
  522. ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
  523. # with start byte of a 4-byte sequence
  524. ('\xf1', FFFD), # only the start byte
  525. ('\xf1\xf1', FFFD*2), # 2 start bytes
  526. ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
  527. ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
  528. ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
  529. ('\xf1\x80', FFFD), # only 1 continuation bytes
  530. ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
  531. ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
  532. ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
  533. ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
  534. ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
  535. ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
  536. ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
  537. ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
  538. ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
  539. ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
  540. ('\xf1\xf1\x80\x41', FFFD*2+'A'),
  541. ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
  542. # with invalid start byte of a 4-byte sequence (rfc2279)
  543. ('\xf5', FFFD), # only the start byte
  544. ('\xf5\xf5', FFFD*2), # 2 start bytes
  545. ('\xf5\x80', FFFD*2), # only 1 continuation byte
  546. ('\xf5\x80\x80', FFFD*3), # only 2 continuation byte
  547. ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
  548. ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
  549. ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
  550. ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
  551. # with invalid start byte of a 5-byte sequence (rfc2279)
  552. ('\xf8', FFFD), # only the start byte
  553. ('\xf8\xf8', FFFD*2), # 2 start bytes
  554. ('\xf8\x80', FFFD*2), # only one continuation byte
  555. ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
  556. ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
  557. # with invalid start byte of a 6-byte sequence (rfc2279)
  558. ('\xfc', FFFD), # only the start byte
  559. ('\xfc\xfc', FFFD*2), # 2 start bytes
  560. ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
  561. ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
  562. # invalid start byte
  563. ('\xfe', FFFD),
  564. ('\xfe\x80\x80', FFFD*3),
  565. # other sequences
  566. ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
  567. ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
  568. ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
  569. ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
  570. u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
  571. ]
  572. for n, (seq, res) in enumerate(sequences):
  573. decoder = self.getdecoder('utf-8')
  574. raises(UnicodeDecodeError, decoder, seq, len(seq), None, final=True)
  575. assert decoder(seq, len(seq), None, final=True,
  576. errorhandler=self.replace_handler) == (res, len(seq))
  577. assert decoder(seq + 'b', len(seq) + 1, None, final=True,
  578. errorhandler=self.replace_handler) == (res + u'b',
  579. len(seq) + 1)
  580. res = res.replace(FFFD, u'')
  581. assert decoder(seq, len(seq), None, final=True,
  582. errorhandler=self.ignore_handler) == (res, len(seq))
  583. class TestEncoding(UnicodeTests):
  584. def test_all_ascii(self):
  585. for i in range(128):
  586. if sys.version >= "2.7":
  587. self.checkencode(unichr(i), "utf-7")
  588. for encoding in "utf-8 latin-1 ascii".split():
  589. self.checkencode(unichr(i), encoding)
  590. def test_all_first_256(self):
  591. for i in range(256):
  592. if sys.version >= "2.7":
  593. self.checkencode(unichr(i), "utf-7")
  594. for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
  595. "utf-32 utf-32-be utf-32-le").split():
  596. self.checkencode(unichr(i), encoding)
  597. def test_first_10000(self):
  598. for i in range(10000):
  599. if sys.version >= "2.7":
  600. self.checkencode(unichr(i), "utf-7")
  601. for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
  602. "utf-32 utf-32-be utf-32-le").split():
  603. self.checkencode(unichr(i), encoding)
  604. def test_random(self):
  605. for i in range(10000):
  606. v = random.randrange(sys.maxunicode)
  607. if 0xd800 <= v <= 0xdfff:
  608. continue
  609. uni = unichr(v)
  610. if sys.version >= "2.7":
  611. self.checkencode(uni, "utf-7")
  612. for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
  613. "utf-32 utf-32-be utf-32-le").split():
  614. self.checkencode(uni, encoding)
  615. def test_maxunicode(self):
  616. uni = unichr(sys.maxunicode)
  617. if sys.version >= "2.7":
  618. self.checkencode(uni, "utf-7")
  619. for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
  620. "utf-32 utf-32-be utf-32-le").split():
  621. self.checkencode(uni, encoding)
  622. def test_single_chars_utf8(self):
  623. # check every number of bytes per char
  624. for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
  625. self.checkencode(s, "utf-8")
  626. def test_utf8_surrogates(self):
  627. # check replacing of two surrogates by single char while encoding
  628. # make sure that the string itself is not marshalled
  629. u = u"\ud800"
  630. for i in range(4):
  631. u += u"\udc00"
  632. self.checkencode(u, "utf-8")
  633. def test_ascii_error(self):
  634. self.checkencodeerror(u"abc\xFF\xFF\xFFcde", "ascii", 3, 6)
  635. def test_latin1_error(self):
  636. self.checkencodeerror(u"abc\uffff\uffff\uffffcde", "latin-1", 3, 6)
  637. def test_mbcs(self):
  638. if sys.platform != 'win32':
  639. py.test.skip("mbcs encoding is win32-specific")
  640. self.checkencode(u'encoding test', "mbcs")
  641. self.checkdecode('decoding test', "mbcs")
  642. # XXX test this on a non-western Windows installation
  643. self.checkencode(u"\N{GREEK CAPITAL LETTER PHI}", "mbcs") # a F
  644. self.checkencode(u"\N{GREEK CAPITAL LETTER PSI}", "mbcs") # a ?
  645. class TestTranslation(object):
  646. def setup_class(cls):
  647. if runicode.MAXUNICODE != sys.maxunicode:
  648. py.test.skip("these tests cannot run on the llinterp")
  649. def test_utf8(self):
  650. from pypy.rpython.test.test_llinterp import interpret
  651. def f(x):
  652. s1 = "".join(["\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"] * x)
  653. u, consumed = runicode.str_decode_utf_8(s1, len(s1), True)
  654. s2 = runicode.unicode_encode_utf_8(u, len(u), True)
  655. return s1 == s2
  656. res = interpret(f, [2])
  657. assert res
  658. def test_surrogates(self):
  659. if runicode.MAXUNICODE < 65536:
  660. py.test.skip("Narrow unicode build")
  661. from pypy.rpython.test.test_llinterp import interpret
  662. def f(x):
  663. u = runicode.UNICHR(x)
  664. t = runicode.ORD(u)
  665. return t
  666. res = interpret(f, [0x10140])
  667. assert res == 0x10140