PageRenderTime 56ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/Lib/test/test_codecs.py

http://unladen-swallow.googlecode.com/
Python | 1508 lines | 1503 code | 1 blank | 4 comment | 1 complexity | d265623b8edd6da1e13fd97d261ee5d8 MD5 | raw file
Possible License(s): 0BSD, BSD-3-Clause
  1. from test import test_support
  2. import unittest
  3. import codecs
  4. import sys, StringIO, _testcapi
  5. class Queue(object):
  6. """
  7. queue: write bytes at one end, read bytes from the other end
  8. """
  9. def __init__(self):
  10. self._buffer = ""
  11. def write(self, chars):
  12. self._buffer += chars
  13. def read(self, size=-1):
  14. if size<0:
  15. s = self._buffer
  16. self._buffer = ""
  17. return s
  18. else:
  19. s = self._buffer[:size]
  20. self._buffer = self._buffer[size:]
  21. return s
  22. class ReadTest(unittest.TestCase):
  23. def check_partial(self, input, partialresults):
  24. # get a StreamReader for the encoding and feed the bytestring version
  25. # of input to the reader byte by byte. Read everything available from
  26. # the StreamReader and check that the results equal the appropriate
  27. # entries from partialresults.
  28. q = Queue()
  29. r = codecs.getreader(self.encoding)(q)
  30. result = u""
  31. for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  32. q.write(c)
  33. result += r.read()
  34. self.assertEqual(result, partialresult)
  35. # check that there's nothing left in the buffers
  36. self.assertEqual(r.read(), u"")
  37. self.assertEqual(r.bytebuffer, "")
  38. self.assertEqual(r.charbuffer, u"")
  39. # do the check again, this time using a incremental decoder
  40. d = codecs.getincrementaldecoder(self.encoding)()
  41. result = u""
  42. for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  43. result += d.decode(c)
  44. self.assertEqual(result, partialresult)
  45. # check that there's nothing left in the buffers
  46. self.assertEqual(d.decode("", True), u"")
  47. self.assertEqual(d.buffer, "")
  48. # Check whether the reset method works properly
  49. d.reset()
  50. result = u""
  51. for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  52. result += d.decode(c)
  53. self.assertEqual(result, partialresult)
  54. # check that there's nothing left in the buffers
  55. self.assertEqual(d.decode("", True), u"")
  56. self.assertEqual(d.buffer, "")
  57. # check iterdecode()
  58. encoded = input.encode(self.encoding)
  59. self.assertEqual(
  60. input,
  61. u"".join(codecs.iterdecode(encoded, self.encoding))
  62. )
  63. def test_readline(self):
  64. def getreader(input):
  65. stream = StringIO.StringIO(input.encode(self.encoding))
  66. return codecs.getreader(self.encoding)(stream)
  67. def readalllines(input, keepends=True, size=None):
  68. reader = getreader(input)
  69. lines = []
  70. while True:
  71. line = reader.readline(size=size, keepends=keepends)
  72. if not line:
  73. break
  74. lines.append(line)
  75. return "|".join(lines)
  76. s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
  77. sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
  78. sexpectednoends = u"foo|bar|baz|spam|eggs"
  79. self.assertEqual(readalllines(s, True), sexpected)
  80. self.assertEqual(readalllines(s, False), sexpectednoends)
  81. self.assertEqual(readalllines(s, True, 10), sexpected)
  82. self.assertEqual(readalllines(s, False, 10), sexpectednoends)
  83. # Test long lines (multiple calls to read() in readline())
  84. vw = []
  85. vwo = []
  86. for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
  87. vw.append((i*200)*u"\3042" + lineend)
  88. vwo.append((i*200)*u"\3042")
  89. self.assertEqual(readalllines("".join(vw), True), "".join(vw))
  90. self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
  91. # Test lines where the first read might end with \r, so the
  92. # reader has to look ahead whether this is a lone \r or a \r\n
  93. for size in xrange(80):
  94. for lineend in u"\n \r\n \r \u2028".split():
  95. s = 10*(size*u"a" + lineend + u"xxx\n")
  96. reader = getreader(s)
  97. for i in xrange(10):
  98. self.assertEqual(
  99. reader.readline(keepends=True),
  100. size*u"a" + lineend,
  101. )
  102. reader = getreader(s)
  103. for i in xrange(10):
  104. self.assertEqual(
  105. reader.readline(keepends=False),
  106. size*u"a",
  107. )
  108. def test_bug1175396(self):
  109. s = [
  110. '<%!--===================================================\r\n',
  111. ' BLOG index page: show recent articles,\r\n',
  112. ' today\'s articles, or articles of a specific date.\r\n',
  113. '========================================================--%>\r\n',
  114. '<%@inputencoding="ISO-8859-1"%>\r\n',
  115. '<%@pagetemplate=TEMPLATE.y%>\r\n',
  116. '<%@import=import frog.util, frog%>\r\n',
  117. '<%@import=import frog.objects%>\r\n',
  118. '<%@import=from frog.storageerrors import StorageError%>\r\n',
  119. '<%\r\n',
  120. '\r\n',
  121. 'import logging\r\n',
  122. 'log=logging.getLogger("Snakelets.logger")\r\n',
  123. '\r\n',
  124. '\r\n',
  125. 'user=self.SessionCtx.user\r\n',
  126. 'storageEngine=self.SessionCtx.storageEngine\r\n',
  127. '\r\n',
  128. '\r\n',
  129. 'def readArticlesFromDate(date, count=None):\r\n',
  130. ' entryids=storageEngine.listBlogEntries(date)\r\n',
  131. ' entryids.reverse() # descending\r\n',
  132. ' if count:\r\n',
  133. ' entryids=entryids[:count]\r\n',
  134. ' try:\r\n',
  135. ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
  136. ' except StorageError,x:\r\n',
  137. ' log.error("Error loading articles: "+str(x))\r\n',
  138. ' self.abort("cannot load articles")\r\n',
  139. '\r\n',
  140. 'showdate=None\r\n',
  141. '\r\n',
  142. 'arg=self.Request.getArg()\r\n',
  143. 'if arg=="today":\r\n',
  144. ' #-------------------- TODAY\'S ARTICLES\r\n',
  145. ' self.write("<h2>Today\'s articles</h2>")\r\n',
  146. ' showdate = frog.util.isodatestr() \r\n',
  147. ' entries = readArticlesFromDate(showdate)\r\n',
  148. 'elif arg=="active":\r\n',
  149. ' #-------------------- ACTIVE ARTICLES redirect\r\n',
  150. ' self.Yredirect("active.y")\r\n',
  151. 'elif arg=="login":\r\n',
  152. ' #-------------------- LOGIN PAGE redirect\r\n',
  153. ' self.Yredirect("login.y")\r\n',
  154. 'elif arg=="date":\r\n',
  155. ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
  156. ' showdate = self.Request.getParameter("date")\r\n',
  157. ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
  158. ' entries = readArticlesFromDate(showdate)\r\n',
  159. 'else:\r\n',
  160. ' #-------------------- RECENT ARTICLES\r\n',
  161. ' self.write("<h2>Recent articles</h2>")\r\n',
  162. ' dates=storageEngine.listBlogEntryDates()\r\n',
  163. ' if dates:\r\n',
  164. ' entries=[]\r\n',
  165. ' SHOWAMOUNT=10\r\n',
  166. ' for showdate in dates:\r\n',
  167. ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
  168. ' if len(entries)>=SHOWAMOUNT:\r\n',
  169. ' break\r\n',
  170. ' \r\n',
  171. ]
  172. stream = StringIO.StringIO("".join(s).encode(self.encoding))
  173. reader = codecs.getreader(self.encoding)(stream)
  174. for (i, line) in enumerate(reader):
  175. self.assertEqual(line, s[i])
  176. def test_readlinequeue(self):
  177. q = Queue()
  178. writer = codecs.getwriter(self.encoding)(q)
  179. reader = codecs.getreader(self.encoding)(q)
  180. # No lineends
  181. writer.write(u"foo\r")
  182. self.assertEqual(reader.readline(keepends=False), u"foo")
  183. writer.write(u"\nbar\r")
  184. self.assertEqual(reader.readline(keepends=False), u"")
  185. self.assertEqual(reader.readline(keepends=False), u"bar")
  186. writer.write(u"baz")
  187. self.assertEqual(reader.readline(keepends=False), u"baz")
  188. self.assertEqual(reader.readline(keepends=False), u"")
  189. # Lineends
  190. writer.write(u"foo\r")
  191. self.assertEqual(reader.readline(keepends=True), u"foo\r")
  192. writer.write(u"\nbar\r")
  193. self.assertEqual(reader.readline(keepends=True), u"\n")
  194. self.assertEqual(reader.readline(keepends=True), u"bar\r")
  195. writer.write(u"baz")
  196. self.assertEqual(reader.readline(keepends=True), u"baz")
  197. self.assertEqual(reader.readline(keepends=True), u"")
  198. writer.write(u"foo\r\n")
  199. self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
  200. def test_bug1098990_a(self):
  201. s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
  202. s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
  203. s3 = u"next line.\r\n"
  204. s = (s1+s2+s3).encode(self.encoding)
  205. stream = StringIO.StringIO(s)
  206. reader = codecs.getreader(self.encoding)(stream)
  207. self.assertEqual(reader.readline(), s1)
  208. self.assertEqual(reader.readline(), s2)
  209. self.assertEqual(reader.readline(), s3)
  210. self.assertEqual(reader.readline(), u"")
  211. def test_bug1098990_b(self):
  212. s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
  213. s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
  214. s3 = u"stillokay:bbbbxx\r\n"
  215. s4 = u"broken!!!!badbad\r\n"
  216. s5 = u"againokay.\r\n"
  217. s = (s1+s2+s3+s4+s5).encode(self.encoding)
  218. stream = StringIO.StringIO(s)
  219. reader = codecs.getreader(self.encoding)(stream)
  220. self.assertEqual(reader.readline(), s1)
  221. self.assertEqual(reader.readline(), s2)
  222. self.assertEqual(reader.readline(), s3)
  223. self.assertEqual(reader.readline(), s4)
  224. self.assertEqual(reader.readline(), s5)
  225. self.assertEqual(reader.readline(), u"")
class UTF32Test(ReadTest):
    """Tests for the byte-order-autodetecting "utf-32" codec."""
    encoding = "utf-32"

    # u"spamspam" encoded with a single little-/big-endian BOM prefix.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # The StreamWriter must emit the BOM once, not on every write().
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        # A stream starting with an invalid BOM must raise, for both a
        # truncated (4-byte) and a longer (8-byte) bogus prefix.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # Expected output after each single byte is fed (4 BOM bytes first).
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # 'replace' and 'ignore' error handlers on a truncated input unit.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
class UTF32LETest(ReadTest):
    """Tests for the fixed-byte-order "utf-32-le" codec (no BOM handling)."""
    encoding = "utf-32-le"

    def test_partial(self):
        # Expected output after each single byte is fed; one character per
        # four bytes, no BOM.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        # A non-BMP character encodes little-endian, least significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)
class UTF32BETest(ReadTest):
    """Tests for the fixed-byte-order "utf-32-be" codec (no BOM handling)."""
    encoding = "utf-32-be"

    def test_partial(self):
        # Expected output after each single byte is fed; one character per
        # four bytes, no BOM.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        # A non-BMP character encodes big-endian, most significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)
class UTF16Test(ReadTest):
    """Tests for the byte-order-autodetecting "utf-16" codec."""
    encoding = "utf-16"

    # u"spamspam" encoded with a single little-/big-endian BOM prefix.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # The StreamWriter must emit the BOM once, not on every write().
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        # A stream starting with an invalid BOM must raise, for both a 2-byte
        # and a 4-byte bogus prefix.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # Expected output after each single byte is fed (2 BOM bytes first).
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # 'replace' and 'ignore' error handlers on a truncated input unit.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
class UTF16LETest(ReadTest):
    """Tests for the fixed-byte-order "utf-16-le" codec (no BOM handling)."""
    encoding = "utf-16-le"

    def test_partial(self):
        # Expected output after each single byte is fed; one BMP character per
        # two bytes, no BOM.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
class UTF16BETest(ReadTest):
    """Tests for the fixed-byte-order "utf-16-be" codec (no BOM handling)."""
    encoding = "utf-16-be"

    def test_partial(self):
        # Expected output after each single byte is fed; one BMP character per
        # two bytes, no BOM.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # Expected output after each single byte is fed; characters become
        # available only once their final UTF-8 byte (1, 2 or 3 bytes) arrives.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # "+-" is the UTF-7 escape for a literal "+"; the decoder must hold
        # back output while a shift sequence is still open.
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() C API."""

    def test_errors(self):
        # A lone byte is a truncated code unit and must fail in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without arguments is a usage error, not a decode error.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode(), which accepts any read buffer."""

    def test_array(self):
        # array objects expose the buffer interface, so they must be accepted.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode(), which requires a char buffer."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
class UTF8SigTest(ReadTest):
    """Tests for "utf-8-sig": UTF-8 that skips exactly one leading BOM."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # Only the first BOM is stripped; a second one decodes as U+FEFF.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # Input without a BOM must decode unchanged.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # BOM-prefixed stream must decode to the bare string for any read size.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Stream without a BOM must also decode correctly for any read size.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # Empty input decodes to an empty string with zero bytes consumed.
        self.assertEquals(codecs.escape_decode(""), ("", 0))
class RecodingTest(unittest.TestCase):
    """Regression test for transcoding through codecs.EncodedFile."""

    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
# From RFC 3492
# (unicode string, expected punycode encoding) pairs, samples A-S from the RFC.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),
    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),
    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),
    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),
    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),
    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),
    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),
    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check: every entry must be a (unicode, punycode) pair.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
class PunycodeTest(unittest.TestCase):
    """Round-trip tests for the "punycode" codec against the RFC 3492 samples."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec (only meaningful on UCS-4 builds)."""

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    # The byte sequences above are big-endian; flip for LE hosts.
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must carry the full error context.
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        # A registered 'ignore'-style error handler must skip the bad unit.
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# (UTF-8 input, expected nameprep output) pairs; an expected value of None
# means the input is prohibited, and a (None, None) entry is a skipped test.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
  869. class NameprepTest(unittest.TestCase):
  870. def test_nameprep(self):
  871. from encodings.idna import nameprep
  872. for pos, (orig, prepped) in enumerate(nameprep_tests):
  873. if orig is None:
  874. # Skipped
  875. continue
  876. # The Unicode strings are given in UTF-8
  877. orig = unicode(orig, "utf-8")
  878. if prepped is None:
  879. # Input contains prohibited characters
  880. self.assertRaises(UnicodeError, nameprep, orig)
  881. else:
  882. prepped = unicode(prepped, "utf-8")
  883. try:
  884. self.assertEquals(nameprep(orig), prepped)
  885. except Exception,e:
  886. raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
  887. class IDNACodecTest(unittest.TestCase):
  888. def test_builtin_decode(self):
  889. self.assertEquals(unicode("python.org", "idna"), u"python.org")
  890. self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
  891. self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
  892. self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
  893. def test_builtin_encode(self):
  894. self.assertEquals(u"python.org".encode("idna"), "python.org")
  895. self.assertEquals("python.org.".encode("idna"), "python.org.")
  896. self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
  897. self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
  898. def test_stream(self):
  899. import StringIO
  900. r = codecs.getreader("idna")(StringIO.StringIO("abc"))
  901. r.read(3)
  902. self.assertEquals(r.read(), u"")
  903. def test_incremental_decode(self):
  904. self.assertEquals(
  905. "".join(codecs.iterdecode("python.org", "idna")),
  906. u"python.org"
  907. )
  908. self.assertEquals(
  909. "".join(codecs.iterdecode("python.org.", "idna")),
  910. u"python.org."
  911. )
  912. self.assertEquals(
  913. "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
  914. u"pyth\xf6n.org."
  915. )
  916. self.assertEquals(
  917. "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
  918. u"pyth\xf6n.org."
  919. )
  920. decoder = codecs.getincrementaldecoder("idna")()
  921. self.assertEquals(decoder.decode("xn--xam", ), u"")
  922. self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
  923. self.assertEquals(decoder.decode(u"rg"), u"")
  924. self.assertEquals(decoder.decode(u"", True), u"org")
  925. decoder.reset()
  926. self.assertEquals(decoder.decode("xn--xam", ), u"")
  927. self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
  928. self.assertEquals(decoder.decode("rg."), u"org.")
  929. self.assertEquals(decoder.decode("", True), u"")
  930. def test_incremental_encode(self):
  931. self.assertEquals(
  932. "".join(codecs.iterencode(u"python.org", "idna")),
  933. "python.org"
  934. )
  935. self.assertEquals(
  936. "".join(codecs.iterencode(u"python.org.", "idna")),
  937. "python.org."
  938. )
  939. self.assertEquals(
  940. "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
  941. "xn--pythn-mua.org."
  942. )
  943. self.assertEquals(
  944. "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
  945. "xn--pythn-mua.org."
  946. )
  947. encoder = codecs.getincrementalencoder("idna")()
  948. self.assertEquals(encoder.encode(u"\xe4x"), "")
  949. self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
  950. self.assertEquals(encoder.encode(u"", True), "org")
  951. encoder.reset()
  952. self.assertEquals(encoder.encode(u"\xe4x"), "")
  953. self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
  954. self.assertEquals(encoder.encode(u"", True), "")
  955. class CodecsModuleTest(unittest.TestCase):
  956. def test_decode(self):
  957. self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
  958. u'\xe4\xf6\xfc')
  959. self.assertRaises(TypeError, codecs.decode)
  960. self.assertEquals(codecs.decode('abc'), u'abc')
  961. self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
  962. def test_encode(self):
  963. self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
  964. '\xe4\xf6\xfc')
  965. self.assertRaises(TypeError, codecs.encode)
  966. self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
  967. self.assertEquals(codecs.encode(u'abc'), 'abc')
  968. self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
  969. def test_register(self):
  970. self.assertRaises(TypeError, codecs.register)
  971. self.assertRaises(TypeError, codecs.register, 42)
  972. def test_lookup(self):
  973. self.assertRaises(TypeError, codecs.lookup)
  974. self.assertRaises(LookupError, codecs.lookup, "__spam__")
  975. self.assertRaises(LookupError, codecs.lookup, " ")
  976. def test_getencoder(self):
  977. self.assertRaises(TypeError, codecs.getencoder)
  978. self.assertRaises(LookupError, codecs.getencoder, "__spam__")
  979. def test_getdecoder(self):
  980. self.assertRaises(TypeError, codecs.getdecoder)
  981. self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
  982. def test_getreader(self):
  983. self.assertRaises(TypeError, codecs.getreader)
  984. self.assertRaises(LookupError, codecs.getreader, "__spam__")
  985. def test_getwriter(self):
  986. self.assertRaises(TypeError, codecs.getwriter)
  987. self.assertRaises(LookupError, codecs.getwriter, "__spam__")
  988. class StreamReaderTest(unittest.TestCase):
  989. def setUp(self):
  990. self.reader = codecs.getreader('utf-8')
  991. self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
  992. def test_readlines(self):
  993. f = self.reader(self.stream)
  994. self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
  995. class EncodedFileTest(unittest.TestCase):
  996. def test_basic(self):
  997. f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
  998. ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
  999. self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
  1000. f = StringIO.StringIO()
  1001. ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
  1002. ef.write('\xc3\xbc')
  1003. self.assertEquals(f.getvalue(), '\xfc')
  1004. class Str2StrTest(unittest.TestCase):
  1005. def test_read(self):
  1006. sin = "\x80".encode("base64_codec")
  1007. reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
  1008. sout = reader.read()
  1009. self.assertEqual(sout, "\x80")
  1010. self.assert_(isinstance(sout, str))
  1011. def test_readline(self):
  1012. sin = "\x80".encode("base64_codec")
  1013. reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
  1014. sout = reader.readline()
  1015. self.assertEqual(sout, "\x80")
  1016. self.assert_(isinstance(sout, str))
# Encodings that must round-trip unicode text (exercised by BasicUnicodeTest).
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    # mbcs is only compiled in on Windows builds.
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copy, not alias: bz2/zlib are appended only to the stream list below.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The bz2/zlib codecs are tested only when the backing modules are available.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        """Round-trip a short ASCII string through every unicode codec via
        the stateless, stream, incremental (Python and C API) and
        iterencode()/iterdecode() interfaces."""
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # lookup() strips the "_codec" suffix and normalizes latin_1;
            # undo that so the list entry and the canonical name compare equal.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            # unicode_internal reports a byte count, not a character count.
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                # Feed one character at a time to exercise codec buffering.
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes any state the encoder still holds.
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0) must reset the reader's codec state and buffers."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders must reject missing and (mostly) non-string arguments."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are excluded — presumably they don't
            # type-check their argument eagerly; TODO confirm.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must reject a missing argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
  1270. class BasicStrTest(unittest.TestCase):
  1271. def test_basics(self):
  1272. s = "abc123"
  1273. for encoding in all_string_encodings:
  1274. (bytes, size) = codecs.getencoder(encoding)(s)
  1275. self.assertEqual(size, len(s))
  1276. (chars, size) = codecs.getdecoder(encoding)(bytes)
  1277. self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
  1278. class CharmapTest(unittest.TestCase):
  1279. def test_decode_with_string_map(self):
  1280. self.assertEquals(
  1281. codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
  1282. (u"abc", 3)
  1283. )
  1284. self.assertEquals(
  1285. codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
  1286. (u"ab\ufffd", 3)
  1287. )
  1288. self.assertEquals(
  1289. codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
  1290. (u"ab\ufffd", 3)
  1291. )
  1292. self.assertEquals(
  1293. codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
  1294. (u"ab", 3)
  1295. )
  1296. self.assertEquals(
  1297. codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
  1298. (u"ab", 3)
  1299. )
  1300. allbytes = "".join(chr(i) for i in xrange(256))
  1301. self.assertEquals(
  1302. codecs.charmap_decode(allbytes, "ignore", u""),
  1303. (u"", len(allbytes))
  1304. )
  1305. class WithStmtTest(unittest.TestCase):
  1306. def test_encodedfile(self):
  1307. f = StringIO.StringIO("\xc3\xbc")
  1308. with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
  1309. self.assertEquals(ef.read(), "\xfc")
  1310. def test_streamreaderwriter(self):
  1311. f = StringIO.StringIO("\xc3\xbc")
  1312. info = codecs.lookup("utf-8")
  1313. with codecs.StreamReaderWriter(f, info.streamreader,
  1314. info.streamwriter, 'strict') as srw:
  1315. self.assertEquals(srw.read(), u"\xfc")
def test_main():
    # Run every TestCase in this module (several of the classes listed
    # here are defined earlier in the file, above this chunk).
    test_support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )

if __name__ == "__main__":
    test_main()