PageRenderTime 44ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/lib-python/2.7/encodings/idna.py

https://bitbucket.org/kkris/pypy
Python | 288 lines | 278 code | 4 blank | 6 comment | 6 complexity | dd03680d8a6cc50394b4da3bfd37ed80 MD5 | raw file
  1. # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
  2. import stringprep, re, codecs
  3. from unicodedata import ucd_3_2_0 as unicodedata
  4. # IDNA section 3.1
  5. dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
  6. # IDNA section 5
  7. ace_prefix = "xn--"
  8. uace_prefix = unicode(ace_prefix, "ascii")
  9. # This assumes query strings, so AllowUnassigned is true
  10. def nameprep(label):
  11. # Map
  12. newlabel = []
  13. for c in label:
  14. if stringprep.in_table_b1(c):
  15. # Map to nothing
  16. continue
  17. newlabel.append(stringprep.map_table_b2(c))
  18. label = u"".join(newlabel)
  19. # Normalize
  20. label = unicodedata.normalize("NFKC", label)
  21. # Prohibit
  22. for c in label:
  23. if stringprep.in_table_c12(c) or \
  24. stringprep.in_table_c22(c) or \
  25. stringprep.in_table_c3(c) or \
  26. stringprep.in_table_c4(c) or \
  27. stringprep.in_table_c5(c) or \
  28. stringprep.in_table_c6(c) or \
  29. stringprep.in_table_c7(c) or \
  30. stringprep.in_table_c8(c) or \
  31. stringprep.in_table_c9(c):
  32. raise UnicodeError("Invalid character %r" % c)
  33. # Check bidi
  34. RandAL = map(stringprep.in_table_d1, label)
  35. for c in RandAL:
  36. if c:
  37. # There is a RandAL char in the string. Must perform further
  38. # tests:
  39. # 1) The characters in section 5.8 MUST be prohibited.
  40. # This is table C.8, which was already checked
  41. # 2) If a string contains any RandALCat character, the string
  42. # MUST NOT contain any LCat character.
  43. if filter(stringprep.in_table_d2, label):
  44. raise UnicodeError("Violation of BIDI requirement 2")
  45. # 3) If a string contains any RandALCat character, a
  46. # RandALCat character MUST be the first character of the
  47. # string, and a RandALCat character MUST be the last
  48. # character of the string.
  49. if not RandAL[0] or not RandAL[-1]:
  50. raise UnicodeError("Violation of BIDI requirement 3")
  51. return label
  52. def ToASCII(label):
  53. try:
  54. # Step 1: try ASCII
  55. label = label.encode("ascii")
  56. except UnicodeError:
  57. pass
  58. else:
  59. # Skip to step 3: UseSTD3ASCIIRules is false, so
  60. # Skip to step 8.
  61. if 0 < len(label) < 64:
  62. return label
  63. raise UnicodeError("label empty or too long")
  64. # Step 2: nameprep
  65. label = nameprep(label)
  66. # Step 3: UseSTD3ASCIIRules is false
  67. # Step 4: try ASCII
  68. try:
  69. label = label.encode("ascii")
  70. except UnicodeError:
  71. pass
  72. else:
  73. # Skip to step 8.
  74. if 0 < len(label) < 64:
  75. return label
  76. raise UnicodeError("label empty or too long")
  77. # Step 5: Check ACE prefix
  78. if label.startswith(uace_prefix):
  79. raise UnicodeError("Label starts with ACE prefix")
  80. # Step 6: Encode with PUNYCODE
  81. label = label.encode("punycode")
  82. # Step 7: Prepend ACE prefix
  83. label = ace_prefix + label
  84. # Step 8: Check size
  85. if 0 < len(label) < 64:
  86. return label
  87. raise UnicodeError("label empty or too long")
  88. def ToUnicode(label):
  89. # Step 1: Check for ASCII
  90. if isinstance(label, str):
  91. pure_ascii = True
  92. else:
  93. try:
  94. label = label.encode("ascii")
  95. pure_ascii = True
  96. except UnicodeError:
  97. pure_ascii = False
  98. if not pure_ascii:
  99. # Step 2: Perform nameprep
  100. label = nameprep(label)
  101. # It doesn't say this, but apparently, it should be ASCII now
  102. try:
  103. label = label.encode("ascii")
  104. except UnicodeError:
  105. raise UnicodeError("Invalid character in IDN label")
  106. # Step 3: Check for ACE prefix
  107. if not label.startswith(ace_prefix):
  108. return unicode(label, "ascii")
  109. # Step 4: Remove ACE prefix
  110. label1 = label[len(ace_prefix):]
  111. # Step 5: Decode using PUNYCODE
  112. result = label1.decode("punycode")
  113. # Step 6: Apply ToASCII
  114. label2 = ToASCII(result)
  115. # Step 7: Compare the result of step 6 with the one of step 3
  116. # label2 will already be in lower case.
  117. if label.lower() != label2:
  118. raise UnicodeError("IDNA does not round-trip", label, label2)
  119. # Step 8: return the result of step 5
  120. return result
  121. ### Codec APIs
  122. class Codec(codecs.Codec):
  123. def encode(self,input,errors='strict'):
  124. if errors != 'strict':
  125. # IDNA is quite clear that implementations must be strict
  126. raise UnicodeError("unsupported error handling "+errors)
  127. if not input:
  128. return "", 0
  129. result = []
  130. labels = dots.split(input)
  131. if labels and len(labels[-1])==0:
  132. trailing_dot = '.'
  133. del labels[-1]
  134. else:
  135. trailing_dot = ''
  136. for label in labels:
  137. result.append(ToASCII(label))
  138. # Join with U+002E
  139. return ".".join(result)+trailing_dot, len(input)
  140. def decode(self,input,errors='strict'):
  141. if errors != 'strict':
  142. raise UnicodeError("Unsupported error handling "+errors)
  143. if not input:
  144. return u"", 0
  145. # IDNA allows decoding to operate on Unicode strings, too.
  146. if isinstance(input, unicode):
  147. labels = dots.split(input)
  148. else:
  149. # Must be ASCII string
  150. input = str(input)
  151. unicode(input, "ascii")
  152. labels = input.split(".")
  153. if labels and len(labels[-1]) == 0:
  154. trailing_dot = u'.'
  155. del labels[-1]
  156. else:
  157. trailing_dot = u''
  158. result = []
  159. for label in labels:
  160. result.append(ToUnicode(label))
  161. return u".".join(result)+trailing_dot, len(input)
  162. class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
  163. def _buffer_encode(self, input, errors, final):
  164. if errors != 'strict':
  165. # IDNA is quite clear that implementations must be strict
  166. raise UnicodeError("unsupported error handling "+errors)
  167. if not input:
  168. return ("", 0)
  169. labels = dots.split(input)
  170. trailing_dot = u''
  171. if labels:
  172. if not labels[-1]:
  173. trailing_dot = '.'
  174. del labels[-1]
  175. elif not final:
  176. # Keep potentially unfinished label until the next call
  177. del labels[-1]
  178. if labels:
  179. trailing_dot = '.'
  180. result = []
  181. size = 0
  182. for label in labels:
  183. result.append(ToASCII(label))
  184. if size:
  185. size += 1
  186. size += len(label)
  187. # Join with U+002E
  188. result = ".".join(result) + trailing_dot
  189. size += len(trailing_dot)
  190. return (result, size)
  191. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  192. def _buffer_decode(self, input, errors, final):
  193. if errors != 'strict':
  194. raise UnicodeError("Unsupported error handling "+errors)
  195. if not input:
  196. return (u"", 0)
  197. # IDNA allows decoding to operate on Unicode strings, too.
  198. if isinstance(input, unicode):
  199. labels = dots.split(input)
  200. else:
  201. # Must be ASCII string
  202. input = str(input)
  203. unicode(input, "ascii")
  204. labels = input.split(".")
  205. trailing_dot = u''
  206. if labels:
  207. if not labels[-1]:
  208. trailing_dot = u'.'
  209. del labels[-1]
  210. elif not final:
  211. # Keep potentially unfinished label until the next call
  212. del labels[-1]
  213. if labels:
  214. trailing_dot = u'.'
  215. result = []
  216. size = 0
  217. for label in labels:
  218. result.append(ToUnicode(label))
  219. if size:
  220. size += 1
  221. size += len(label)
  222. result = u".".join(result) + trailing_dot
  223. size += len(trailing_dot)
  224. return (result, size)
  225. class StreamWriter(Codec,codecs.StreamWriter):
  226. pass
  227. class StreamReader(Codec,codecs.StreamReader):
  228. pass
  229. ### encodings module API
  230. def getregentry():
  231. return codecs.CodecInfo(
  232. name='idna',
  233. encode=Codec().encode,
  234. decode=Codec().decode,
  235. incrementalencoder=IncrementalEncoder,
  236. incrementaldecoder=IncrementalDecoder,
  237. streamwriter=StreamWriter,
  238. streamreader=StreamReader,
  239. )