/Lib/test/test_codecs.py

Source: the Unladen Swallow branch of CPython
(http://unladen-swallow.googlecode.com/)

Note: large files are truncated on this page, so this may not be the
complete file.
   1from test import test_support
   2import unittest
   3import codecs
   4import sys, StringIO, _testcapi
   5
   6class Queue(object):
   7    """
   8    queue: write bytes at one end, read bytes from the other end
   9    """
  10    def __init__(self):
  11        self._buffer = ""
  12
  13    def write(self, chars):
  14        self._buffer += chars
  15
  16    def read(self, size=-1):
  17        if size<0:
  18            s = self._buffer
  19            self._buffer = ""
  20            return s
  21        else:
  22            s = self._buffer[:size]
  23            self._buffer = self._buffer[size:]
  24            return s
  25
  26class ReadTest(unittest.TestCase):
  27    def check_partial(self, input, partialresults):
  28        # get a StreamReader for the encoding and feed the bytestring version
  29        # of input to the reader byte by byte. Read everything available from
  30        # the StreamReader and check that the results equal the appropriate
  31        # entries from partialresults.
  32        q = Queue()
  33        r = codecs.getreader(self.encoding)(q)
  34        result = u""
  35        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  36            q.write(c)
  37            result += r.read()
  38            self.assertEqual(result, partialresult)
  39        # check that there's nothing left in the buffers
  40        self.assertEqual(r.read(), u"")
  41        self.assertEqual(r.bytebuffer, "")
  42        self.assertEqual(r.charbuffer, u"")
  43
  44        # do the check again, this time using a incremental decoder
  45        d = codecs.getincrementaldecoder(self.encoding)()
  46        result = u""
  47        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  48            result += d.decode(c)
  49            self.assertEqual(result, partialresult)
  50        # check that there's nothing left in the buffers
  51        self.assertEqual(d.decode("", True), u"")
  52        self.assertEqual(d.buffer, "")
  53
  54        # Check whether the reset method works properly
  55        d.reset()
  56        result = u""
  57        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
  58            result += d.decode(c)
  59            self.assertEqual(result, partialresult)
  60        # check that there's nothing left in the buffers
  61        self.assertEqual(d.decode("", True), u"")
  62        self.assertEqual(d.buffer, "")
  63
  64        # check iterdecode()
  65        encoded = input.encode(self.encoding)
  66        self.assertEqual(
  67            input,
  68            u"".join(codecs.iterdecode(encoded, self.encoding))
  69        )
  70
  71    def test_readline(self):
  72        def getreader(input):
  73            stream = StringIO.StringIO(input.encode(self.encoding))
  74            return codecs.getreader(self.encoding)(stream)
  75
  76        def readalllines(input, keepends=True, size=None):
  77            reader = getreader(input)
  78            lines = []
  79            while True:
  80                line = reader.readline(size=size, keepends=keepends)
  81                if not line:
  82                    break
  83                lines.append(line)
  84            return "|".join(lines)
  85
  86        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
  87        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
  88        sexpectednoends = u"foo|bar|baz|spam|eggs"
  89        self.assertEqual(readalllines(s, True), sexpected)
  90        self.assertEqual(readalllines(s, False), sexpectednoends)
  91        self.assertEqual(readalllines(s, True, 10), sexpected)
  92        self.assertEqual(readalllines(s, False, 10), sexpectednoends)
  93
  94        # Test long lines (multiple calls to read() in readline())
  95        vw = []
  96        vwo = []
  97        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
  98            vw.append((i*200)*u"\3042" + lineend)
  99            vwo.append((i*200)*u"\3042")
 100        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
 101        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
 102
 103        # Test lines where the first read might end with \r, so the
 104        # reader has to look ahead whether this is a lone \r or a \r\n
 105        for size in xrange(80):
 106            for lineend in u"\n \r\n \r \u2028".split():
 107                s = 10*(size*u"a" + lineend + u"xxx\n")
 108                reader = getreader(s)
 109                for i in xrange(10):
 110                    self.assertEqual(
 111                        reader.readline(keepends=True),
 112                        size*u"a" + lineend,
 113                    )
 114                reader = getreader(s)
 115                for i in xrange(10):
 116                    self.assertEqual(
 117                        reader.readline(keepends=False),
 118                        size*u"a",
 119                    )
 120
 121    def test_bug1175396(self):
 122        s = [
 123            '<%!--===================================================\r\n',
 124            '    BLOG index page: show recent articles,\r\n',
 125            '    today\'s articles, or articles of a specific date.\r\n',
 126            '========================================================--%>\r\n',
 127            '<%@inputencoding="ISO-8859-1"%>\r\n',
 128            '<%@pagetemplate=TEMPLATE.y%>\r\n',
 129            '<%@import=import frog.util, frog%>\r\n',
 130            '<%@import=import frog.objects%>\r\n',
 131            '<%@import=from frog.storageerrors import StorageError%>\r\n',
 132            '<%\r\n',
 133            '\r\n',
 134            'import logging\r\n',
 135            'log=logging.getLogger("Snakelets.logger")\r\n',
 136            '\r\n',
 137            '\r\n',
 138            'user=self.SessionCtx.user\r\n',
 139            'storageEngine=self.SessionCtx.storageEngine\r\n',
 140            '\r\n',
 141            '\r\n',
 142            'def readArticlesFromDate(date, count=None):\r\n',
 143            '    entryids=storageEngine.listBlogEntries(date)\r\n',
 144            '    entryids.reverse() # descending\r\n',
 145            '    if count:\r\n',
 146            '        entryids=entryids[:count]\r\n',
 147            '    try:\r\n',
 148            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
 149            '    except StorageError,x:\r\n',
 150            '        log.error("Error loading articles: "+str(x))\r\n',
 151            '        self.abort("cannot load articles")\r\n',
 152            '\r\n',
 153            'showdate=None\r\n',
 154            '\r\n',
 155            'arg=self.Request.getArg()\r\n',
 156            'if arg=="today":\r\n',
 157            '    #-------------------- TODAY\'S ARTICLES\r\n',
 158            '    self.write("<h2>Today\'s articles</h2>")\r\n',
 159            '    showdate = frog.util.isodatestr() \r\n',
 160            '    entries = readArticlesFromDate(showdate)\r\n',
 161            'elif arg=="active":\r\n',
 162            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
 163            '    self.Yredirect("active.y")\r\n',
 164            'elif arg=="login":\r\n',
 165            '    #-------------------- LOGIN PAGE redirect\r\n',
 166            '    self.Yredirect("login.y")\r\n',
 167            'elif arg=="date":\r\n',
 168            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
 169            '    showdate = self.Request.getParameter("date")\r\n',
 170            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
 171            '    entries = readArticlesFromDate(showdate)\r\n',
 172            'else:\r\n',
 173            '    #-------------------- RECENT ARTICLES\r\n',
 174            '    self.write("<h2>Recent articles</h2>")\r\n',
 175            '    dates=storageEngine.listBlogEntryDates()\r\n',
 176            '    if dates:\r\n',
 177            '        entries=[]\r\n',
 178            '        SHOWAMOUNT=10\r\n',
 179            '        for showdate in dates:\r\n',
 180            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
 181            '            if len(entries)>=SHOWAMOUNT:\r\n',
 182            '                break\r\n',
 183            '                \r\n',
 184        ]
 185        stream = StringIO.StringIO("".join(s).encode(self.encoding))
 186        reader = codecs.getreader(self.encoding)(stream)
 187        for (i, line) in enumerate(reader):
 188            self.assertEqual(line, s[i])
 189
 190    def test_readlinequeue(self):
 191        q = Queue()
 192        writer = codecs.getwriter(self.encoding)(q)
 193        reader = codecs.getreader(self.encoding)(q)
 194
 195        # No lineends
 196        writer.write(u"foo\r")
 197        self.assertEqual(reader.readline(keepends=False), u"foo")
 198        writer.write(u"\nbar\r")
 199        self.assertEqual(reader.readline(keepends=False), u"")
 200        self.assertEqual(reader.readline(keepends=False), u"bar")
 201        writer.write(u"baz")
 202        self.assertEqual(reader.readline(keepends=False), u"baz")
 203        self.assertEqual(reader.readline(keepends=False), u"")
 204
 205        # Lineends
 206        writer.write(u"foo\r")
 207        self.assertEqual(reader.readline(keepends=True), u"foo\r")
 208        writer.write(u"\nbar\r")
 209        self.assertEqual(reader.readline(keepends=True), u"\n")
 210        self.assertEqual(reader.readline(keepends=True), u"bar\r")
 211        writer.write(u"baz")
 212        self.assertEqual(reader.readline(keepends=True), u"baz")
 213        self.assertEqual(reader.readline(keepends=True), u"")
 214        writer.write(u"foo\r\n")
 215        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
 216
 217    def test_bug1098990_a(self):
 218        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
 219        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
 220        s3 = u"next line.\r\n"
 221
 222        s = (s1+s2+s3).encode(self.encoding)
 223        stream = StringIO.StringIO(s)
 224        reader = codecs.getreader(self.encoding)(stream)
 225        self.assertEqual(reader.readline(), s1)
 226        self.assertEqual(reader.readline(), s2)
 227        self.assertEqual(reader.readline(), s3)
 228        self.assertEqual(reader.readline(), u"")
 229
 230    def test_bug1098990_b(self):
 231        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
 232        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
 233        s3 = u"stillokay:bbbbxx\r\n"
 234        s4 = u"broken!!!!badbad\r\n"
 235        s5 = u"againokay.\r\n"
 236
 237        s = (s1+s2+s3+s4+s5).encode(self.encoding)
 238        stream = StringIO.StringIO(s)
 239        reader = codecs.getreader(self.encoding)(stream)
 240        self.assertEqual(reader.readline(), s1)
 241        self.assertEqual(reader.readline(), s2)
 242        self.assertEqual(reader.readline(), s3)
 243        self.assertEqual(reader.readline(), s4)
 244        self.assertEqual(reader.readline(), s5)
 245        self.assertEqual(reader.readline(), u"")
 246
 247class UTF32Test(ReadTest):
 248    encoding = "utf-32"
 249
 250    spamle = ('\xff\xfe\x00\x00'
 251              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
 252              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
 253    spambe = ('\x00\x00\xfe\xff'
 254              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
 255              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
 256
 257    def test_only_one_bom(self):
 258        _,_,reader,writer = codecs.lookup(self.encoding)
 259        # encode some stream
 260        s = StringIO.StringIO()
 261        f = writer(s)
 262        f.write(u"spam")
 263        f.write(u"spam")
 264        d = s.getvalue()
 265        # check whether there is exactly one BOM in it
 266        self.assert_(d == self.spamle or d == self.spambe)
 267        # try to read it back
 268        s = StringIO.StringIO(d)
 269        f = reader(s)
 270        self.assertEquals(f.read(), u"spamspam")
 271
 272    def test_badbom(self):
 273        s = StringIO.StringIO(4*"\xff")
 274        f = codecs.getreader(self.encoding)(s)
 275        self.assertRaises(UnicodeError, f.read)
 276
 277        s = StringIO.StringIO(8*"\xff")
 278        f = codecs.getreader(self.encoding)(s)
 279        self.assertRaises(UnicodeError, f.read)
 280
 281    def test_partial(self):
 282        self.check_partial(
 283            u"\x00\xff\u0100\uffff",
 284            [
 285                u"", # first byte of BOM read
 286                u"", # second byte of BOM read
 287                u"", # third byte of BOM read
 288                u"", # fourth byte of BOM read => byteorder known
 289                u"",
 290                u"",
 291                u"",
 292                u"\x00",
 293                u"\x00",
 294                u"\x00",
 295                u"\x00",
 296                u"\x00\xff",
 297                u"\x00\xff",
 298                u"\x00\xff",
 299                u"\x00\xff",
 300                u"\x00\xff\u0100",
 301                u"\x00\xff\u0100",
 302                u"\x00\xff\u0100",
 303                u"\x00\xff\u0100",
 304                u"\x00\xff\u0100\uffff",
 305            ]
 306        )
 307
 308    def test_handlers(self):
 309        self.assertEqual((u'\ufffd', 1),
 310                         codecs.utf_32_decode('\x01', 'replace', True))
 311        self.assertEqual((u'', 1),
 312                         codecs.utf_32_decode('\x01', 'ignore', True))
 313
 314    def test_errors(self):
 315        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
 316                          "\xff", "strict", True)
 317
 318class UTF32LETest(ReadTest):
 319    encoding = "utf-32-le"
 320
 321    def test_partial(self):
 322        self.check_partial(
 323            u"\x00\xff\u0100\uffff",
 324            [
 325                u"",
 326                u"",
 327                u"",
 328                u"\x00",
 329                u"\x00",
 330                u"\x00",
 331                u"\x00",
 332                u"\x00\xff",
 333                u"\x00\xff",
 334                u"\x00\xff",
 335                u"\x00\xff",
 336                u"\x00\xff\u0100",
 337                u"\x00\xff\u0100",
 338                u"\x00\xff\u0100",
 339                u"\x00\xff\u0100",
 340                u"\x00\xff\u0100\uffff",
 341            ]
 342        )
 343
 344    def test_simple(self):
 345        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
 346
 347    def test_errors(self):
 348        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
 349                          "\xff", "strict", True)
 350
 351class UTF32BETest(ReadTest):
 352    encoding = "utf-32-be"
 353
 354    def test_partial(self):
 355        self.check_partial(
 356            u"\x00\xff\u0100\uffff",
 357            [
 358                u"",
 359                u"",
 360                u"",
 361                u"\x00",
 362                u"\x00",
 363                u"\x00",
 364                u"\x00",
 365                u"\x00\xff",
 366                u"\x00\xff",
 367                u"\x00\xff",
 368                u"\x00\xff",
 369                u"\x00\xff\u0100",
 370                u"\x00\xff\u0100",
 371                u"\x00\xff\u0100",
 372                u"\x00\xff\u0100",
 373                u"\x00\xff\u0100\uffff",
 374            ]
 375        )
 376
 377    def test_simple(self):
 378        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
 379
 380    def test_errors(self):
 381        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
 382                          "\xff", "strict", True)
 383
 384class UTF16Test(ReadTest):
 385    encoding = "utf-16"
 386
 387    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
 388    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
 389
 390    def test_only_one_bom(self):
 391        _,_,reader,writer = codecs.lookup(self.encoding)
 392        # encode some stream
 393        s = StringIO.StringIO()
 394        f = writer(s)
 395        f.write(u"spam")
 396        f.write(u"spam")
 397        d = s.getvalue()
 398        # check whether there is exactly one BOM in it
 399        self.assert_(d == self.spamle or d == self.spambe)
 400        # try to read it back
 401        s = StringIO.StringIO(d)
 402        f = reader(s)
 403        self.assertEquals(f.read(), u"spamspam")
 404
 405    def test_badbom(self):
 406        s = StringIO.StringIO("\xff\xff")
 407        f = codecs.getreader(self.encoding)(s)
 408        self.assertRaises(UnicodeError, f.read)
 409
 410        s = StringIO.StringIO("\xff\xff\xff\xff")
 411        f = codecs.getreader(self.encoding)(s)
 412        self.assertRaises(UnicodeError, f.read)
 413
 414    def test_partial(self):
 415        self.check_partial(
 416            u"\x00\xff\u0100\uffff",
 417            [
 418                u"", # first byte of BOM read
 419                u"", # second byte of BOM read => byteorder known
 420                u"",
 421                u"\x00",
 422                u"\x00",
 423                u"\x00\xff",
 424                u"\x00\xff",
 425                u"\x00\xff\u0100",
 426                u"\x00\xff\u0100",
 427                u"\x00\xff\u0100\uffff",
 428            ]
 429        )
 430
 431    def test_handlers(self):
 432        self.assertEqual((u'\ufffd', 1),
 433                         codecs.utf_16_decode('\x01', 'replace', True))
 434        self.assertEqual((u'', 1),
 435                         codecs.utf_16_decode('\x01', 'ignore', True))
 436
 437    def test_errors(self):
 438        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
 439
 440class UTF16LETest(ReadTest):
 441    encoding = "utf-16-le"
 442
 443    def test_partial(self):
 444        self.check_partial(
 445            u"\x00\xff\u0100\uffff",
 446            [
 447                u"",
 448                u"\x00",
 449                u"\x00",
 450                u"\x00\xff",
 451                u"\x00\xff",
 452                u"\x00\xff\u0100",
 453                u"\x00\xff\u0100",
 454                u"\x00\xff\u0100\uffff",
 455            ]
 456        )
 457
 458    def test_errors(self):
 459        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
 460
 461class UTF16BETest(ReadTest):
 462    encoding = "utf-16-be"
 463
 464    def test_partial(self):
 465        self.check_partial(
 466            u"\x00\xff\u0100\uffff",
 467            [
 468                u"",
 469                u"\x00",
 470                u"\x00",
 471                u"\x00\xff",
 472                u"\x00\xff",
 473                u"\x00\xff\u0100",
 474                u"\x00\xff\u0100",
 475                u"\x00\xff\u0100\uffff",
 476            ]
 477        )
 478
 479    def test_errors(self):
 480        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
 481
 482class UTF8Test(ReadTest):
 483    encoding = "utf-8"
 484
 485    def test_partial(self):
 486        self.check_partial(
 487            u"\x00\xff\u07ff\u0800\uffff",
 488            [
 489                u"\x00",
 490                u"\x00",
 491                u"\x00\xff",
 492                u"\x00\xff",
 493                u"\x00\xff\u07ff",
 494                u"\x00\xff\u07ff",
 495                u"\x00\xff\u07ff",
 496                u"\x00\xff\u07ff\u0800",
 497                u"\x00\xff\u07ff\u0800",
 498                u"\x00\xff\u07ff\u0800",
 499                u"\x00\xff\u07ff\u0800\uffff",
 500            ]
 501        )
 502
 503class UTF7Test(ReadTest):
 504    encoding = "utf-7"
 505
 506    def test_partial(self):
 507        self.check_partial(
 508            u"a+-b",
 509            [
 510                u"a",
 511                u"a",
 512                u"a+",
 513                u"a+-",
 514                u"a+-b",
 515            ]
 516        )
 517
 518class UTF16ExTest(unittest.TestCase):
 519
 520    def test_errors(self):
 521        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
 522
 523    def test_bad_args(self):
 524        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
 525
 526class ReadBufferTest(unittest.TestCase):
 527
 528    def test_array(self):
 529        import array
 530        self.assertEqual(
 531            codecs.readbuffer_encode(array.array("c", "spam")),
 532            ("spam", 4)
 533        )
 534
 535    def test_empty(self):
 536        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
 537
 538    def test_bad_args(self):
 539        self.assertRaises(TypeError, codecs.readbuffer_encode)
 540        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
 541
 542class CharBufferTest(unittest.TestCase):
 543
 544    def test_string(self):
 545        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
 546
 547    def test_empty(self):
 548        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
 549
 550    def test_bad_args(self):
 551        self.assertRaises(TypeError, codecs.charbuffer_encode)
 552        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
 553
 554class UTF8SigTest(ReadTest):
 555    encoding = "utf-8-sig"
 556
 557    def test_partial(self):
 558        self.check_partial(
 559            u"\ufeff\x00\xff\u07ff\u0800\uffff",
 560            [
 561                u"",
 562                u"",
 563                u"", # First BOM has been read and skipped
 564                u"",
 565                u"",
 566                u"\ufeff", # Second BOM has been read and emitted
 567                u"\ufeff\x00", # "\x00" read and emitted
 568                u"\ufeff\x00", # First byte of encoded u"\xff" read
 569                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
 570                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
 571                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
 572                u"\ufeff\x00\xff\u07ff",
 573                u"\ufeff\x00\xff\u07ff",
 574                u"\ufeff\x00\xff\u07ff\u0800",
 575                u"\ufeff\x00\xff\u07ff\u0800",
 576                u"\ufeff\x00\xff\u07ff\u0800",
 577                u"\ufeff\x00\xff\u07ff\u0800\uffff",
 578            ]
 579        )
 580
 581    def test_bug1601501(self):
 582        # SF bug #1601501: check that the codec works with a buffer
 583        unicode("\xef\xbb\xbf", "utf-8-sig")
 584
 585    def test_bom(self):
 586        d = codecs.getincrementaldecoder("utf-8-sig")()
 587        s = u"spam"
 588        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
 589
 590    def test_stream_bom(self):
 591        unistring = u"ABC\u00A1\u2200XYZ"
 592        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
 593
 594        reader = codecs.getreader("utf-8-sig")
 595        for sizehint in [None] + range(1, 11) + \
 596                        [64, 128, 256, 512, 1024]:
 597            istream = reader(StringIO.StringIO(bytestring))
 598            ostream = StringIO.StringIO()
 599            while 1:
 600                if sizehint is not None:
 601                    data = istream.read(sizehint)
 602                else:
 603                    data = istream.read()
 604
 605                if not data:
 606                    break
 607                ostream.write(data)
 608
 609            got = ostream.getvalue()
 610            self.assertEqual(got, unistring)
 611
 612    def test_stream_bare(self):
 613        unistring = u"ABC\u00A1\u2200XYZ"
 614        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
 615
 616        reader = codecs.getreader("utf-8-sig")
 617        for sizehint in [None] + range(1, 11) + \
 618                        [64, 128, 256, 512, 1024]:
 619            istream = reader(StringIO.StringIO(bytestring))
 620            ostream = StringIO.StringIO()
 621            while 1:
 622                if sizehint is not None:
 623                    data = istream.read(sizehint)
 624                else:
 625                    data = istream.read()
 626
 627                if not data:
 628                    break
 629                ostream.write(data)
 630
 631            got = ostream.getvalue()
 632            self.assertEqual(got, unistring)
 633
 634class EscapeDecodeTest(unittest.TestCase):
 635    def test_empty(self):
 636        self.assertEquals(codecs.escape_decode(""), ("", 0))
 637
 638class RecodingTest(unittest.TestCase):
 639    def test_recoding(self):
 640        f = StringIO.StringIO()
 641        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
 642        f2.write(u"a")
 643        f2.close()
 644        # Python used to crash on this at exit because of a refcount
 645        # bug in _codecsmodule.c
 646
 647# From RFC 3492
 648punycode_testcases = [
 649    # A Arabic (Egyptian):
 650    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
 651     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
 652     "egbpdaj6bu4bxfgehfvwxn"),
 653    # B Chinese (simplified):
 654    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
 655     "ihqwcrb4cv8a8dqg056pqjye"),
 656    # C Chinese (traditional):
 657    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
 658     "ihqwctvzc91f659drss3x8bo0yb"),
 659    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
 660    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
 661     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
 662     u"\u0065\u0073\u006B\u0079",
 663     "Proprostnemluvesky-uyb24dma41a"),
 664    # E Hebrew:
 665    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
 666     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
 667     u"\u05D1\u05E8\u05D9\u05EA",
 668     "4dbcagdahymbxekheh6e0a7fei0b"),
 669    # F Hindi (Devanagari):
 670    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
 671    u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
 672    u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
 673    u"\u0939\u0948\u0902",
 674    "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
 675
 676    #(G) Japanese (kanji and hiragana):
 677    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
 678    u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
 679     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
 680
 681    # (H) Korean (Hangul syllables):
 682    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
 683     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
 684     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
 685     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
 686     "psd879ccm6fea98c"),
 687
 688    # (I) Russian (Cyrillic):
 689    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
 690     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
 691     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
 692     u"\u0438",
 693     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
 694
 695    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
 696    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
 697     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
 698     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
 699     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
 700     u"\u0061\u00F1\u006F\u006C",
 701     "PorqunopuedensimplementehablarenEspaol-fmd56a"),
 702
 703    # (K) Vietnamese:
 704    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
 705    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
 706    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
 707     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
 708     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
 709     u"\u0056\u0069\u1EC7\u0074",
 710     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
 711
 712    #(L) 3<nen>B<gumi><kinpachi><sensei>
 713    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
 714     "3B-ww4c5e180e575a65lsy2b"),
 715
 716    # (M) <amuro><namie>-with-SUPER-MONKEYS
 717    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
 718     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
 719     u"\u004F\u004E\u004B\u0045\u0059\u0053",
 720     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
 721
 722    # (N) Hello-Another-Way-<sorezore><no><basho>
 723    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
 724     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
 725     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
 726     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
 727
 728    # (O) <hitotsu><yane><no><shita>2
 729    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
 730     "2-u9tlzr9756bt3uc0v"),
 731
 732    # (P) Maji<de>Koi<suru>5<byou><mae>
 733    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
 734     u"\u308B\u0035\u79D2\u524D",
 735     "MajiKoi5-783gue6qz075azm5e"),
 736
 737     # (Q) <pafii>de<runba>
 738    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
 739     "de-jg4avhby1noc0d"),
 740
 741    # (R) <sono><supiido><de>
 742    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
 743     "d9juau41awczczp"),
 744
 745    # (S) -> $1.00 <-
 746    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
 747     u"\u003C\u002D",
 748     "-> $1.00 <--")
 749    ]
 750
 751for i in punycode_testcases:
 752    if len(i)!=2:
 753        print repr(i)
 754
 755class PunycodeTest(unittest.TestCase):
 756    def test_encode(self):
 757        for uni, puny in punycode_testcases:
 758            # Need to convert both strings to lower case, since
 759            # some of the extended encodings use upper case, but our
 760            # code produces only lower case. Converting just puny to
 761            # lower is also insufficient, since some of the input characters
 762            # are upper case.
 763            self.assertEquals(uni.encode("punycode").lower(), puny.lower())
 764
 765    def test_decode(self):
 766        for uni, puny in punycode_testcases:
 767            self.assertEquals(uni, puny.decode("punycode"))
 768
class UnicodeInternalTest(unittest.TestCase):
    """Regression tests for the "unicode_internal" codec.

    All three tests only run on wide (UCS-4) builds, i.e. when
    sys.maxunicode > 0xffff.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            # Byte strings below are written big-endian and reversed for
            # little-endian hosts before decoding (see the byteorder checks).
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            # Values above 0x10ffff, plus inputs whose length is not a
            # multiple of 4, must raise UnicodeDecodeError.
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        # A decode failure must report the codec name, the full input and
        # the exact 4-byte span (start/end) of the offending code unit.
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        # A registered "ignore" error handler must drop the four bad bytes
        # spliced into the middle of the encoding of u"ab"; the reported
        # length (12) still covers the whole input.
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)
 816
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected_output) pair of UTF-8 encoded byte
# strings.  expected_output is None when nameprep() must reject the input
# (prohibited characters); an entry of (None, None) marks a vector that is
# skipped entirely (see the comments on 3.6 and 3.43 below).
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
 969
 970
 971class NameprepTest(unittest.TestCase):
 972    def test_nameprep(self):
 973        from encodings.idna import nameprep
 974        for pos, (orig, prepped) in enumerate(nameprep_tests):
 975            if orig is None:
 976                # Skipped
 977                continue
 978            # The Unicode strings are given in UTF-8
 979            orig = unicode(orig, "utf-8")
 980            if prepped is None:
 981                # Input contains prohibited characters
 982                self.assertRaises(UnicodeError, nameprep, orig)
 983            else:
 984                prepped = unicode(prepped, "utf-8")
 985                try:
 986                    self.assertEquals(nameprep(orig), prepped)
 987                except Exception,e:
 988                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
 989
 990class IDNACodecTest(unittest.TestCase):
 991    def test_builtin_decode(self):
 992        self.assertEquals(unicode("python.org", "idna"), u"python.org")
 993        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
 994        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
 995        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
 996
 997    def test_builtin_encode(self):
 998        self.assertEquals(u"python.org".encode("idna"), "python.org")
 999        self.assertEquals("python.org.".encode("idna"), "python.org.")
1000        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1001        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
1002
1003    def test_stream(self):
1004        import StringIO
1005        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1006        r.read(3)
1007        self.assertEquals(r.read(), u"")
1008
1009    def test_incremental_decode(self):
1010        self.assertEquals(
1011            "".join(codecs.iterdecode("python.org", "idna")),
1012            u"python.org"
1013        )
1014        self.assertEquals(
1015            "".join(codecs.iterdecode("python.org.", "idna")),
1016            u"python.org."
1017        )
1018        self.assertEquals(
1019            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1020            u"pyth\xf6n.org."
1021        )
1022        self.assertEquals(
1023            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1024            u"pyth\xf6n.org."
1025        )
1026
1027        decoder = codecs.getincrementaldecoder("idna")()
1028        self.assertEquals(decoder.decode("xn--xam", ), u"")
1029        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1030        self.assertEquals(decoder.decode(u"rg"), u"")
1031        self.assertEquals(decoder.decode(u"", True), u"org")
1032
1033        decoder.reset()
1034        self.assertEquals(decoder.decode("xn--xam", ), u"")
1035        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1036        self.assertEquals(decoder.decode("rg."), u"org.")
1037        self.assertEquals(decoder.decode("", True), u"")
1038
1039    def test_incremental_encode(self):
1040        self.assertEquals(
1041            "".join(codecs.iterencode(u"python.org", "idna")),
1042            "python.org"
1043        )
1044        self.assertEquals(
1045            "".join(codecs.iterencode(u"python.org.", "idna")),
1046            "python.org."
1047        )
1048        self.assertEquals(
1049            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1050            "xn--pythn-mua.org."
1051        )
1052        self.assertEquals(
1053            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1054            "xn--pythn-mua.org."
1055        )
1056
1057        encoder = codecs.getincrementalencoder("idna")()
1058        self.assertEquals(encoder.encode(u"\xe4x"), "")
1059        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1060        self.assertEquals(encoder.encode(u"", True), "org")
1061
1062        encoder.reset()
1063        self.assertEquals(encoder.encode(u"\xe4x"), "")
1064        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1065        self.assertEquals(encoder.encode(u"", True), "")
1066
1067class CodecsModuleTest(unittest.TestCase):
1068
1069    def test_decode(self):
1070        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1071                          u'\xe4\xf6\xfc')
1072        self.assertRaises(TypeError, codecs.decode)
1073        self.assertEquals(codecs.decode('abc'), u'abc')
1074        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1075
1076    def test_encode(self):
1077        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1078                          '\xe4\xf6\xfc')
1079        self.assertRaises(TypeError, codecs.encode)
1080        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
1081        self.assertEquals(codecs.encode(u'abc'), 'abc')
1082        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1083
1084    def test_register(self):
1085        self.assertRaises(TypeError, codecs.register)
1086        self.assertRaises(TypeError, codecs.register, 42)
1087
1088    def test_lookup(self):
1089        self.assertRaises(TypeError, codecs.lookup)
1090        self.assertRaises(LookupError, codecs.lookup, "__spam__")
1091        self.assertRaises(LookupError, codecs.lookup, " ")
1092
1093    def test_getencoder(self):
1094        self.assertRaises(TypeError, codecs.getencoder)
1095        self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1096
1097    def test_getdecoder(self):
1098        self.assertRaises(TypeError, codecs.getdecoder)
1099        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1100
1101    def test_getreader(self):
1102        self.assertRaises(TypeError, codecs.getreader)
1103        self.assertRaises(LookupError, codecs.getreader, "__spam__")
1104
1105    def test_getwriter(self):
1106        self.assertRaises(TypeError, codecs.getwriter)
1107        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
1108
class StreamReaderTest(unittest.TestCase):
    """Basic StreamReader.readlines() check on multi-byte UTF-8 input."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        stream_reader = self.reader(self.stream)
        self.assertEquals(stream_reader.readlines(), [u'\ud55c\n', u'\uae00'])
1118
class EncodedFileTest(unittest.TestCase):
    """Tests for the codecs.EncodedFile transcoding wrapper."""

    def test_basic(self):
        # Reading: the stream holds utf-8 bytes; the wrapper hands them to
        # the caller re-encoded as utf-16-le.
        stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        wrapped = codecs.EncodedFile(stream, 'utf-16-le', 'utf-8')
        self.assertEquals(wrapped.read(), '\\\xd5\n\x00\x00\xae')

        # Writing goes the other way: the caller supplies utf-8 and the
        # underlying stream receives latin1.
        stream = StringIO.StringIO()
        wrapped = codecs.EncodedFile(stream, 'utf-8', 'latin1')
        wrapped.write('\xc3\xbc')
        self.assertEquals(stream.getvalue(), '\xfc')
1130
class Str2StrTest(unittest.TestCase):
    """Stream reads through a str-to-str codec must return str, not unicode."""

    def _roundtrip(self, method_name):
        # Encode one byte, read it back through a stream reader and check
        # both the value and the concrete type of the result.
        encoded = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(encoded))
        decoded = getattr(reader, method_name)()
        self.assertEqual(decoded, "\x80")
        self.assert_(isinstance(decoded, str))

    def test_read(self):
        self._roundtrip("read")

    def test_readline(self):
        self._roundtrip("readline")
1146
# Unicode codecs exercised by BasicUnicodeTest below (spelled with "_",
# the canonical form returned by codecs.lookup()).
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
1249
# mbcs exists only on Windows builds; test it only where available.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# The same codecs are also skipped for the incremental-coder checks.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2/zlib support is optional; register their codecs only when the
# underlying module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1295
1296class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        """Round-trip u"abc123" through every codec in all_unicode_encodings
        via the stateless, stream, incremental (Python and C API) and
        iterencode()/iterdecode() interfaces, skipping the interfaces each
        codec is known not to support.
        """
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # codecs.lookup() strips the "_codec" suffix and maps latin_1
            # to its alias, so normalize before comparing the names.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless encode/decode round trip.
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: feed one character/byte at a
                # time through a Queue and expect the full round trip.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1386
1387    def test_seek(self):
1388        # all codecs should be able to encode these
1389        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1390        for encoding in all_unicode_encodings:
1391            if encoding == "idna": # FIXME: See SF bug #1163178
1392                continue
1393            if encoding in broken_unicode_with_streams:
1394                continue
1395            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1396            for t in xrange(5):
1397                # Test that calling seek resets the internal codec state and buffers
1398                reader.seek(0, 0)
1399                line = reader.readline()
1400                self.assertEqual(s[:len(line)], line)
1401
1402    def test_bad_decode_args(self):
1403        for encoding in all_unicode_encodings:
1404            decoder = codecs.getdecoder(encoding)
1405            self.assertRaises(TypeError, decoder)
1406            if encoding not in ("idna", "punycode"):
1407                self.assertRaises(TypeError, decoder, 42)
1408
1409    def test_bad_encode_args(self):
1410        for encoding in all_unicode_encodings:
1411            encoder = codecs.geten

Large files files are truncated, but you can click here to view the full file