PageRenderTime 316ms CodeModel.GetById 91ms app.highlight 65ms RepoModel.GetById 156ms app.codeStats 0ms

/Lib/encodings/punycode.py

http://unladen-swallow.googlecode.com/
Python | 238 lines | 228 code | 3 blank | 7 comment | 3 complexity | f705f5149ea985c4bb1004bcdc05a84e MD5 | raw file
  1# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492
  3
  4Written by Martin v. Löwis.
  5"""
  6
  7import codecs
  8
  9##################### Encoding #####################################
 10
 11def segregate(str):
 12    """3.1 Basic code point segregation"""
 13    base = []
 14    extended = {}
 15    for c in str:
 16        if ord(c) < 128:
 17            base.append(c)
 18        else:
 19            extended[c] = 1
 20    extended = extended.keys()
 21    extended.sort()
 22    return "".join(base).encode("ascii"),extended
 23
 24def selective_len(str, max):
 25    """Return the length of str, considering only characters below max."""
 26    res = 0
 27    for c in str:
 28        if ord(c) < max:
 29            res += 1
 30    return res
 31
 32def selective_find(str, char, index, pos):
 33    """Return a pair (index, pos), indicating the next occurrence of
 34    char in str. index is the position of the character considering
 35    only ordinals up to and including char, and pos is the position in
 36    the full string. index/pos is the starting position in the full
 37    string."""
 38
 39    l = len(str)
 40    while 1:
 41        pos += 1
 42        if pos == l:
 43            return (-1, -1)
 44        c = str[pos]
 45        if c == char:
 46            return index+1, pos
 47        elif c < char:
 48            index += 1
 49
 50def insertion_unsort(str, extended):
 51    """3.2 Insertion unsort coding"""
 52    oldchar = 0x80
 53    result = []
 54    oldindex = -1
 55    for c in extended:
 56        index = pos = -1
 57        char = ord(c)
 58        curlen = selective_len(str, char)
 59        delta = (curlen+1) * (char - oldchar)
 60        while 1:
 61            index,pos = selective_find(str,c,index,pos)
 62            if index == -1:
 63                break
 64            delta += index - oldindex
 65            result.append(delta-1)
 66            oldindex = index
 67            delta = 0
 68        oldchar = char
 69
 70    return result
 71
 72def T(j, bias):
 73    # Punycode parameters: tmin = 1, tmax = 26, base = 36
 74    res = 36 * (j + 1) - bias
 75    if res < 1: return 1
 76    if res > 26: return 26
 77    return res
 78
# Digit alphabet used when encoding: indexes 0-25 map to 'a'-'z' and
# indexes 26-35 map to '0'-'9'.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
 80def generate_generalized_integer(N, bias):
 81    """3.3 Generalized variable-length integers"""
 82    result = []
 83    j = 0
 84    while 1:
 85        t = T(j, bias)
 86        if N < t:
 87            result.append(digits[N])
 88            return result
 89        result.append(digits[t + ((N - t) % (36 - t))])
 90        N = (N - t) // (36 - t)
 91        j += 1
 92
 93def adapt(delta, first, numchars):
 94    if first:
 95        delta //= 700
 96    else:
 97        delta //= 2
 98    delta += delta // numchars
 99    # ((base - tmin) * tmax) // 2 == 455
100    divisions = 0
101    while delta > 455:
102        delta = delta // 35 # base - tmin
103        divisions += 36
104    bias = divisions + (36 * delta // (delta + 38))
105    return bias
106
107
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer,
    re-adapting the bias after each one, and return the concatenated
    digits as a single string.  baselen is the number of basic code
    points, used when adapting the bias.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    points = 0
    for delta in deltas:
        pieces.extend(generate_generalized_integer(delta, bias))
        bias = adapt(delta, points == 0, baselen + points + 1)
        points += 1
    return "".join(pieces)
118
def punycode_encode(text):
    """Encode text with Punycode and return the encoded string.

    The result is the basic (ASCII) code points of text, followed by a
    "-" delimiter and the encoded extended code points.  When text has
    no basic code points, only the encoded part is returned.
    """
    # segregate() already returns base encoded as ASCII, so it is not
    # encoded a second time here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
127
128##################### Decoding #####################################
129
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from extended,
    starting at index extpos.  Only the digits A-Z and 0-9 are
    recognised here.

    Returns a pair (newpos, value) where newpos is the index just past
    the last character consumed.  If the input ends in the middle of a
    number or contains an unrecognised character, value is None when
    errors is not "strict".

    Raises UnicodeError for those two conditions when errors is
    "strict".
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending
            # character, so step back one to report the right one (and
            # to avoid an IndexError when it is the last character).
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1
158
159
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas stored in extended one at a time and insert the
    character each one describes into base.

    Returns the resulting string.  If a delta cannot be decoded and
    errors is not "strict", the string built so far is returned.

    Raises UnicodeError when errors is "strict" and a decoded character
    lies above U+10FFFF; with any other errors value that character is
    replaced by '?'.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # pos counts insertion slots across successive code points:
        # every full pass over the len(base) + 1 slots advances char.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        # Only the first delta (extpos == 0) uses the "first" damping.
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
184
def punycode_decode(text, errors):
    """Decode the Punycode string text and return the decoded string.

    Everything before the last "-" is taken as the basic code points
    and everything after it as the encoded extended part.  Without any
    "-", the whole of text is the extended part.  errors is passed on
    to the ASCII decoding of the basic part and to insertion_sort().
    """
    # rpartition() gives ("", "", text) when there is no delimiter,
    # which is exactly the "no basic code points" case.
    head, _delimiter, tail = text.rpartition("-")
    base = unicode(head, "ascii", errors)
    # The digit decoder only recognises upper-case letters.
    return insertion_sort(base, tail.upper(), errors)
196
197### Codec APIs
198
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self,input,errors='strict'):
        """Encode input; return (encoded, length of input consumed).

        The errors argument is accepted but not used by the encoder.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self,input,errors='strict'):
        """Decode input; return (decoded, length of input consumed).

        Raises UnicodeError if errors is not one of 'strict', 'replace'
        or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)
210
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input chunk on its own."""

    def encode(self, input, final=False):
        """Return the Punycode encoding of input; final is not used."""
        encoded = punycode_encode(input)
        return encoded
214
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input chunk on its own."""

    def decode(self, input, final=False):
        """Return the decoding of input using self.errors; final is not used.

        Raises UnicodeError if self.errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)
220
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter."""
223
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader."""
226
227### encodings module API
228
def getregentry():
    """Return the codecs.CodecInfo entry describing the punycode codec."""
    # Codec keeps no per-instance state, so one instance can supply
    # both the encode and the decode callables.
    codec = Codec()
    return codecs.CodecInfo(
        name='punycode',
        encode=codec.encode,
        decode=codec.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )