PageRenderTime 40ms CodeModel.GetById 25ms app.highlight 12ms RepoModel.GetById 1ms app.codeStats 0ms

/Lib/encodings/idna.py

http://unladen-swallow.googlecode.com/
Python | 288 lines | 278 code | 4 blank | 6 comment | 8 complexity | dd03680d8a6cc50394b4da3bfd37ed80 MD5 | raw file
  1# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
  2
  3import stringprep, re, codecs
  4from unicodedata import ucd_3_2_0 as unicodedata
  5
# IDNA section 3.1: any of these four code points (U+002E, U+3002,
# U+FF0E, U+FF61) is treated as a label separator.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix that ToASCII prepends to
# Punycode-encoded labels, kept both as a byte string and as a
# unicode string.
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
 12
 13# This assumes query strings, so AllowUnassigned is true
 14def nameprep(label):
 15    # Map
 16    newlabel = []
 17    for c in label:
 18        if stringprep.in_table_b1(c):
 19            # Map to nothing
 20            continue
 21        newlabel.append(stringprep.map_table_b2(c))
 22    label = u"".join(newlabel)
 23
 24    # Normalize
 25    label = unicodedata.normalize("NFKC", label)
 26
 27    # Prohibit
 28    for c in label:
 29        if stringprep.in_table_c12(c) or \
 30           stringprep.in_table_c22(c) or \
 31           stringprep.in_table_c3(c) or \
 32           stringprep.in_table_c4(c) or \
 33           stringprep.in_table_c5(c) or \
 34           stringprep.in_table_c6(c) or \
 35           stringprep.in_table_c7(c) or \
 36           stringprep.in_table_c8(c) or \
 37           stringprep.in_table_c9(c):
 38            raise UnicodeError("Invalid character %r" % c)
 39
 40    # Check bidi
 41    RandAL = map(stringprep.in_table_d1, label)
 42    for c in RandAL:
 43        if c:
 44            # There is a RandAL char in the string. Must perform further
 45            # tests:
 46            # 1) The characters in section 5.8 MUST be prohibited.
 47            # This is table C.8, which was already checked
 48            # 2) If a string contains any RandALCat character, the string
 49            # MUST NOT contain any LCat character.
 50            if filter(stringprep.in_table_d2, label):
 51                raise UnicodeError("Violation of BIDI requirement 2")
 52
 53            # 3) If a string contains any RandALCat character, a
 54            # RandALCat character MUST be the first character of the
 55            # string, and a RandALCat character MUST be the last
 56            # character of the string.
 57            if not RandAL[0] or not RandAL[-1]:
 58                raise UnicodeError("Violation of BIDI requirement 3")
 59
 60    return label
 61
 62def ToASCII(label):
 63    try:
 64        # Step 1: try ASCII
 65        label = label.encode("ascii")
 66    except UnicodeError:
 67        pass
 68    else:
 69        # Skip to step 3: UseSTD3ASCIIRules is false, so
 70        # Skip to step 8.
 71        if 0 < len(label) < 64:
 72            return label
 73        raise UnicodeError("label empty or too long")
 74
 75    # Step 2: nameprep
 76    label = nameprep(label)
 77
 78    # Step 3: UseSTD3ASCIIRules is false
 79    # Step 4: try ASCII
 80    try:
 81        label = label.encode("ascii")
 82    except UnicodeError:
 83        pass
 84    else:
 85        # Skip to step 8.
 86        if 0 < len(label) < 64:
 87            return label
 88        raise UnicodeError("label empty or too long")
 89
 90    # Step 5: Check ACE prefix
 91    if label.startswith(uace_prefix):
 92        raise UnicodeError("Label starts with ACE prefix")
 93
 94    # Step 6: Encode with PUNYCODE
 95    label = label.encode("punycode")
 96
 97    # Step 7: Prepend ACE prefix
 98    label = ace_prefix + label
 99
100    # Step 8: Check size
101    if 0 < len(label) < 64:
102        return label
103    raise UnicodeError("label empty or too long")
104
def ToUnicode(label):
    """Perform the IDNA ToUnicode operation on a single label.

    Labels without the ACE prefix are returned unchanged as unicode.
    ACE labels are Punycode-decoded and the result is verified by
    converting it back with ToASCII.

    Arguments:
        label: one label, either a byte string (taken to be ASCII
            without checking) or a unicode string.

    Returns:
        The unicode form of the label.

    Raises:
        UnicodeError: if a non-ASCII label is not ASCII after nameprep,
            if an ACE label is too long to be valid, if Punycode
            decoding fails, or if the decoded label does not convert
            back to the input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # ToASCII only ever returns labels shorter than 64 characters, so a
    # longer ACE label can never pass the comparison in step 7.  Reject
    # it here instead of first running a potentially expensive Punycode
    # decode on oversized input.
    if len(label) >= 64:
        raise UnicodeError("label too long")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
143
144### Codec APIs
145
class Codec(codecs.Codec):
    """Stateless IDNA codec working on complete, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode a name label by label with ToASCII.

        Returns a tuple (encoded name, number of input characters
        consumed).  Only errors='strict' is accepted; anything else
        raises UnicodeError.
        """
        # IDNA is quite clear that implementations must be strict
        if errors != 'strict':
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)

        # An empty last label means the name ended with a separator;
        # remember it so it can be put back after encoding.
        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]

        # Join with U+002E
        return ".".join(encoded)+trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a name label by label with ToUnicode.

        Returns a tuple (decoded unicode name, number of input
        characters consumed).  Only errors='strict' is accepted;
        anything else raises UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion is done only for the
            # error it raises on non-ASCII data.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]

        return u".".join(decoded)+trailing_dot, len(input)
196
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only encodes complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of input as possible.

        Arguments:
            input: the buffered, not yet encoded text.
            errors: error handling scheme; only 'strict' is supported.
            final: true if no more input will follow, in which case the
                last label is encoded as well.

        Returns:
            A tuple (encoded byte string, number of input characters
            consumed).

        Raises:
            UnicodeError: for any errors value other than 'strict', or
                if ToASCII rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Must be a byte string: adding a unicode '' to the joined
        # result would silently turn the encoder output into unicode.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            # One separator was consumed before every label but the first
            if size:
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
230
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only decodes complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of input as possible.

        Arguments:
            input: the buffered, not yet decoded data; a unicode string
                or something str() turns into an ASCII byte string.
            errors: error handling scheme; only 'strict' is supported.
            final: true if no more input will follow, in which case the
                last label is decoded as well.

        Returns:
            A tuple (decoded unicode string, number of input characters
            consumed).

        Raises:
            UnicodeError: for any errors value other than 'strict', for
                non-ASCII byte input, or if ToUnicode rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = [ToUnicode(label) for label in labels]

        # Consumed input: every decoded label, one separator between
        # each pair of them, and the trailing separator if any.  The
        # separators are counted from the number of labels because an
        # empty leading label decodes fine but contributes no length,
        # so a "running total is non-zero" test would miss the
        # separator that follows it.
        size = sum([len(label) for label in labels])
        if labels:
            size += len(labels) - 1
        size += len(trailing_dot)

        result = u".".join(result) + trailing_dot
        return (result, size)
270
class StreamWriter(Codec,codecs.StreamWriter):
    # Stream writer for IDNA: the encode/decode methods come from this
    # module's Codec, everything else from codecs.StreamWriter.  Nothing
    # is overridden here.
    pass
273
class StreamReader(Codec,codecs.StreamReader):
    # Stream reader for IDNA: the encode/decode methods come from this
    # module's Codec, everything else from codecs.StreamReader.  Nothing
    # is overridden here.
    pass
276
277### encodings module API
278
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec.

    The stateless encode and decode entries are bound methods of two
    separate Codec instances; the incremental and stream entries are
    the classes defined in this module.
    """
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )