PageRenderTime 59ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/code/default/python27/1.0/lib/noarch/idna/core.py

https://gitlab.com/Mirros/XX-net
Python | 387 lines | 359 code | 23 blank | 5 comment | 15 complexity | 3890241f1c5e77c65192b615dafd2dd0 MD5 | raw file
  1. from . import idnadata
  2. import bisect
  3. import unicodedata
  4. import re
  5. import sys
  6. from .intranges import intranges_contain
  7. _virama_combining_class = 9
  8. _alabel_prefix = b'xn--'
  9. _unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')
  10. if sys.version_info[0] == 3:
  11. unicode = str
  12. unichr = chr
  class IDNAError(UnicodeError):
      """Root of the IDNA exception hierarchy.

      Derives from ``UnicodeError`` so callers that already handle codec
      failures also catch IDNA encoding/decoding problems.
      """
  class IDNABidiError(IDNAError):
      """Raised when a label violates the bidirectional-text requirements."""
  class InvalidCodepoint(IDNAError):
      """Raised when a label contains a disallowed or unallocated codepoint."""
  class InvalidCodepointContext(IDNAError):
      """Raised when a codepoint is not permitted in the position where it occurs."""
  def _combining_class(cp):
      """Return the canonical combining class of the integer codepoint ``cp``."""
      character = unichr(cp)
      return unicodedata.combining(character)
  def _is_script(cp, script):
      """Return whether the character ``cp`` lies in the ranges stored for ``script``.

      ``cp`` is a single character (its ordinal is taken here) and ``script``
      is a key of ``idnadata.scripts``.
      """
      codepoint = ord(cp)
      script_ranges = idnadata.scripts[script]
      return intranges_contain(codepoint, script_ranges)
  29. def _punycode(s):
  30. return s.encode('punycode')
  31. def _unot(s):
  32. return 'U+{0:04X}'.format(s)
  def valid_label_length(label):
      """Return True when ``label`` is at most 63 items long, False otherwise."""
      return len(label) <= 63
  def valid_string_length(label, trailing_dot):
      """Return True when the whole name fits the length limit.

      The limit is 253, or 254 when ``trailing_dot`` is truthy.
      """
      length = len(label)
      limit = 254 if trailing_dot else 253
      return length <= limit
  def check_bidi(label, check_ltr=False):
      """Validate ``label`` against the six Bidi rules.

      The rules are only enforced when the label contains a character of
      bidirectional class R, AL or AN, unless ``check_ltr`` is true, in which
      case they are applied to every label.

      Returns True when the label passes (or when no check was needed).
      Raises IDNABidiError when a character has no known directionality or
      when any of the rules is violated.
      """
      # Look for the first right-to-left character; that is what switches
      # the Bidi rules on.
      has_rtl = False
      for position, char in enumerate(label, 1):
          bidi_class = unicodedata.bidirectional(char)
          if bidi_class == '':
              # String likely comes from a newer version of Unicode
              raise IDNABidiError(
                  'Unknown directionality in label {0} at position {1}'.format(
                      repr(label), position))
          if bidi_class in ('R', 'AL', 'AN'):
              has_rtl = True
              break
      if not (has_rtl or check_ltr):
          return True

      # Bidi rule 1: the first character decides the direction of the label.
      first_class = unicodedata.bidirectional(label[0])
      if first_class == 'L':
          rtl = False
      elif first_class in ('R', 'AL'):
          rtl = True
      else:
          raise IDNABidiError(
              'First codepoint in label {0} must be directionality L, R or AL'.format(
                  repr(label)))

      ends_ok = False           # whether the last non-NSM character may end the label
      seen_number_class = None  # first of 'AN'/'EN' met in a right-to-left label
      for position, char in enumerate(label, 1):
          bidi_class = unicodedata.bidirectional(char)
          if rtl:
              # Bidi rule 2
              if bidi_class not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                  raise IDNABidiError(
                      'Invalid direction for codepoint at position {0} in a right-to-left label'.format(
                          position))
              # Bidi rule 3: NSM characters leave the ending state untouched.
              if bidi_class != 'NSM':
                  ends_ok = bidi_class in ('R', 'AL', 'EN', 'AN')
              # Bidi rule 4
              if bidi_class in ('AN', 'EN'):
                  if seen_number_class is None:
                      seen_number_class = bidi_class
                  elif seen_number_class != bidi_class:
                      raise IDNABidiError('Can not mix numeral types in a right-to-left label')
          else:
              # Bidi rule 5
              if bidi_class not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                  raise IDNABidiError(
                      'Invalid direction for codepoint at position {0} in a left-to-right label'.format(
                          position))
              # Bidi rule 6: NSM characters leave the ending state untouched.
              if bidi_class != 'NSM':
                  ends_ok = bidi_class in ('L', 'EN')

      if not ends_ok:
          raise IDNABidiError('Label ends with illegal codepoint directionality')
      return True
  def check_initial_combiner(label):
      """Reject labels whose first character is a combining mark.

      Returns True when the first character's general category does not
      start with ``M``; raises IDNAError otherwise.
      """
      leading_category = unicodedata.category(label[0])
      if leading_category[0] != 'M':
          return True
      raise IDNAError('Label begins with an illegal combining character')
  def check_hyphen_ok(label):
      """Enforce the hyphen restrictions on a label.

      Raises IDNAError when the 3rd and 4th characters are both hyphens, or
      when the label starts or ends with a hyphen. Returns True otherwise.
      """
      third_and_fourth = label[2:4]
      if third_and_fourth == '--':
          raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
      if '-' in (label[0], label[-1]):
          raise IDNAError('Label must not start or end with a hyphen')
      return True
  def check_nfc(label):
      """Raise IDNAError unless ``label`` is already in Normalization Form C.

      Returns None when the label is unchanged by NFC normalization.
      """
      normalized = unicodedata.normalize('NFC', label)
      if normalized != label:
          raise IDNAError('Label must be in Normalization Form C')
  def valid_contextj(label, pos):
      """Check the CONTEXTJ rule for the joiner at ``label[pos]``.

      ``label`` is the text label and ``pos`` the zero-based index of the
      character being checked.

      * U+200C (ZERO WIDTH NON-JOINER) is valid when it directly follows a
        character whose combining class is the virama class, or when it sits
        between a joining-type L/D character on the left and a joining-type
        R/D character on the right, with only joining-type T (transparent)
        characters allowed in between.
      * U+200D (ZERO WIDTH JOINER) is valid only directly after a character
        whose combining class is the virama class.

      Returns True when the rule is satisfied and False otherwise, including
      for any codepoint other than the two joiners.
      """
      cp_value = ord(label[pos])

      if cp_value == 0x200c:
          if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
              return True

          # Walk left over transparent characters. The first non-transparent
          # character decides the outcome: scanning must stop there rather
          # than go on to find an L/D character further away.
          ok = False
          for i in range(pos - 1, -1, -1):
              joining_type = idnadata.joining_types.get(ord(label[i]))
              if joining_type == 'T':
                  continue
              ok = joining_type in ('L', 'D')
              break
          if not ok:
              return False

          # Same walk to the right, this time requiring an R/D character.
          ok = False
          for i in range(pos + 1, len(label)):
              joining_type = idnadata.joining_types.get(ord(label[i]))
              if joining_type == 'T':
                  continue
              ok = joining_type in ('R', 'D')
              break
          return ok

      if cp_value == 0x200d:
          return pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class

      return False
  def valid_contexto(label, pos, exception=False):
      """Check the CONTEXTO rule for the character at ``label[pos]``.

      ``label`` is the text label and ``pos`` the zero-based index of the
      character being checked. ``exception`` is accepted for interface
      compatibility and is not used.

      Rules applied, by codepoint:

      * U+00B7 MIDDLE DOT: must sit between two ``l`` (U+006C) characters.
      * U+0375 GREEK LOWER NUMERAL SIGN: next character must be Greek.
      * U+05F3 / U+05F4 HEBREW GERESH / GERSHAYIM: previous character must
        be Hebrew.
      * U+30FB KATAKANA MIDDLE DOT: at least one character of the label must
        be Hiragana, Katakana or Han.
      * U+0660..U+0669 ARABIC-INDIC DIGITS: the label must not contain any
        extended Arabic-Indic digit (U+06F0..U+06F9), and vice versa.

      Returns True when the rule is satisfied, False otherwise. Codepoints
      with no rule here yield False.
      """
      cp_value = ord(label[pos])

      if cp_value == 0x00b7:
          if 0 < pos < len(label) - 1:
              if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
                  return True
          return False

      if cp_value == 0x0375:
          if pos < len(label) - 1 and len(label) > 1:
              return _is_script(label[pos + 1], 'Greek')
          return False

      if cp_value == 0x05f3 or cp_value == 0x05f4:
          if pos > 0:
              return _is_script(label[pos - 1], 'Hebrew')
          return False

      if cp_value == 0x30fb:
          # The middle dot itself belongs to none of these scripts, so the
          # rule amounts to "some other character is Hiragana, Katakana or
          # Han". A label made only of middle dots is therefore rejected.
          for cp in label:
              if cp == u'\u30fb':
                  continue
              if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
                  return True
          return False

      if 0x660 <= cp_value <= 0x669:
          for cp in label:
              if 0x6f0 <= ord(cp) <= 0x06f9:
                  return False
          return True

      if 0x6f0 <= cp_value <= 0x6f9:
          for cp in label:
              if 0x660 <= ord(cp) <= 0x0669:
                  return False
          return True

      # No contextual rule is defined for this codepoint; return an explicit
      # boolean rather than falling off the end with None.
      return False
  def check_label(label):
      """Run every validity check on a single U-label.

      Byte input is decoded as UTF-8 first. The label must be non-empty, in
      NFC, free of disallowed hyphens and must not start with a combining
      mark. Each codepoint must then be PVALID, or be a CONTEXTJ/CONTEXTO
      codepoint whose contextual rule holds. The Bidi check runs last.

      Returns None. Raises IDNAError (or one of its subclasses
      InvalidCodepoint / InvalidCodepointContext) on the first failure.
      """
      if isinstance(label, (bytes, bytearray)):
          label = label.decode('utf-8')
      if not len(label):
          raise IDNAError('Empty Label')

      check_nfc(label)
      check_hyphen_ok(label)
      check_initial_combiner(label)

      classes = idnadata.codepoint_classes
      for index, char in enumerate(label):
          cp_value = ord(char)
          if intranges_contain(cp_value, classes['PVALID']):
              continue
          if intranges_contain(cp_value, classes['CONTEXTJ']):
              if valid_contextj(label, index):
                  continue
              raise InvalidCodepointContext(
                  'Joiner {0} not allowed at position {1} in {2}'.format(
                      _unot(cp_value), index + 1, repr(label)))
          if intranges_contain(cp_value, classes['CONTEXTO']):
              if valid_contexto(label, index):
                  continue
              raise InvalidCodepointContext(
                  'Codepoint {0} not allowed at position {1} in {2}'.format(
                      _unot(cp_value), index + 1, repr(label)))
          raise InvalidCodepoint(
              'Codepoint {0} at position {1} of {2} not allowed'.format(
                  _unot(cp_value), index + 1, repr(label)))

      check_bidi(label)
  def alabel(label):
      """Convert a single label to its A-label (ASCII, bytes) form.

      An all-ASCII label is validated by round-tripping it through
      ``ulabel`` and returned as ASCII bytes. Any other label is validated
      with ``check_label``, Punycode-encoded and prefixed with ``xn--``.

      Raises IDNAError (or a subclass) when the label is empty, fails
      validation, or exceeds the label length limit.
      """
      if not label:
          raise IDNAError('No Input')

      try:
          ascii_label = label.encode('ascii')
      except UnicodeError:
          ascii_label = None

      if ascii_label is not None:
          # IDNAError derives from UnicodeError, so the validation below must
          # stay outside the ``try`` above or its errors would be swallowed.
          try:
              ulabel(ascii_label)
          except IDNAError:
              raise
          except Exception:
              # Anything else (for example a Punycode decoding failure) means
              # the input is not a well-formed A-label.
              raise IDNAError('The label {0} is not a valid A-label'.format(ascii_label))
          if not valid_label_length(ascii_label):
              raise IDNAError('Label too long')
          return ascii_label

      label = unicode(label)
      check_label(label)
      label = _alabel_prefix + _punycode(label)
      if not valid_label_length(label):
          raise IDNAError('Label too long')
      return label
  def ulabel(label):
      """Convert a single label to its U-label (text) form.

      Text that cannot be encoded as ASCII is validated with ``check_label``
      and returned unchanged. ASCII input (text or bytes) is lower-cased;
      with the ``xn--`` prefix the remainder is Punycode-decoded, validated
      and returned, otherwise the label is validated and returned decoded
      as ASCII.

      Raises whatever ``check_label`` or the codecs raise on invalid input.
      """
      if isinstance(label, (bytes, bytearray)):
          raw = label
      else:
          try:
              raw = label.encode('ascii')
          except UnicodeError:
              # Already a non-ASCII label: it only needs validating.
              check_label(label)
              return label

      lowered = raw.lower()
      if not lowered.startswith(_alabel_prefix):
          check_label(lowered)
          return lowered.decode('ascii')

      payload = lowered[len(_alabel_prefix):]
      decoded = payload.decode('punycode')
      check_label(decoded)
      return decoded
  def uts46_remap(domain, std3_rules=True, transitional=False):
      """Re-map the characters in the string according to UTS46 processing.

      Each character of ``domain`` is looked up in the ``uts46data`` table
      and, depending on the row's status code, kept, replaced by the row's
      replacement text, or dropped (status ``"I"``). ``std3_rules`` and
      ``transitional`` select how status ``"3"`` and ``"D"`` rows are
      treated.

      Returns the remapped text normalized to NFC. Raises InvalidCodepoint
      for a character that is neither kept, replaced nor dropped, or whose
      table lookup fails.
      """
      from .uts46data import uts46data

      pieces = []  # collected output fragments, joined once at the end
      for pos, char in enumerate(domain):
          code_point = ord(char)
          try:
              # Codepoints below 256 are indexed directly; others are found
              # by bisecting the table.
              uts46row = uts46data[code_point if code_point < 256 else
                                   bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
              status = uts46row[1]
              replacement = uts46row[2] if len(uts46row) == 3 else None
          except IndexError:
              # A failed lookup is reported like a disallowed codepoint.
              status = replacement = None

          if (status == "V" or
                  (status == "D" and not transitional) or
                  (status == "3" and std3_rules and replacement is None)):
              pieces.append(char)
          elif replacement is not None and (status == "M" or
                                            (status == "3" and std3_rules) or
                                            (status == "D" and transitional)):
              pieces.append(replacement)
          elif status != "I":
              raise InvalidCodepoint(
                  "Codepoint {0} not allowed at position {1} in {2}".format(
                      _unot(code_point), pos + 1, repr(domain)))

      return unicodedata.normalize("NFC", u"".join(pieces))
  def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
      """Encode a domain name into its ASCII (A-label) form, returned as bytes.

      Byte input is decoded as ASCII first. With ``uts46`` the name is
      remapped through ``uts46_remap`` (using ``std3_rules`` and
      ``transitional``). With ``strict`` only ``.`` separates labels;
      otherwise the other Unicode dot characters are accepted as well.
      Leading empty labels are discarded and a single trailing empty label
      is kept as a trailing dot.

      Raises IDNAError when no label remains, when a label is invalid, or
      when the encoded name is too long.
      """
      if isinstance(s, (bytes, bytearray)):
          s = s.decode("ascii")
      if uts46:
          s = uts46_remap(s, std3_rules, transitional)

      labels = s.split('.') if strict else _unicode_dots_re.split(s)

      # Skip past any leading empty labels.
      first = 0
      while first < len(labels) and not labels[first]:
          first += 1
      labels = labels[first:]
      if not labels:
          raise IDNAError('Empty domain')

      trailing_dot = labels[-1] == ''
      if trailing_dot:
          labels.pop()

      encoded = [alabel(label) for label in labels]
      if trailing_dot:
          encoded.append(b'')

      s = b'.'.join(encoded)
      if not valid_string_length(s, trailing_dot):
          raise IDNAError('Domain too long')
      return s
  def decode(s, strict=False, uts46=False, std3_rules=False):
      """Decode a domain name into its Unicode (U-label) form, returned as text.

      Byte input is decoded as ASCII first. With ``uts46`` the name is
      remapped through ``uts46_remap`` in non-transitional mode. With
      ``strict`` only ``.`` separates labels; otherwise the other Unicode dot
      characters are accepted as well. Leading empty labels are discarded
      and a single trailing empty label is kept as a trailing dot.

      Raises IDNAError when no label remains or a label is invalid.
      """
      if isinstance(s, (bytes, bytearray)):
          s = s.decode("ascii")
      if uts46:
          s = uts46_remap(s, std3_rules, False)

      labels = s.split(u'.') if strict else _unicode_dots_re.split(s)

      # Skip past any leading empty labels.
      first = 0
      while first < len(labels) and not labels[first]:
          first += 1
      labels = labels[first:]
      if not labels:
          raise IDNAError('Empty domain')

      trailing_dot = not labels[-1]
      if trailing_dot:
          labels.pop()

      decoded = [ulabel(label) for label in labels]
      if trailing_dot:
          decoded.append(u'')
      return u'.'.join(decoded)
  298. trailing_dot = True
  299. for label in labels:
  300. result.append(ulabel(label))
  301. if trailing_dot:
  302. result.append(u'')
  303. return u'.'.join(result)