PageRenderTime 58ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/Lib/site-packages/pip/_vendor/idna/core.py

https://gitlab.com/phongphans61/machine-learning-tictactoe
Python | 396 lines | 369 code | 22 blank | 5 comment | 17 complexity | fa713bc551aff5430b9a31e6b1de5669 MD5 | raw file
  1. from . import idnadata
  2. import bisect
  3. import unicodedata
  4. import re
  5. import sys
  6. from .intranges import intranges_contain
  # Canonical combining class value that identifies a virama character.
  _virama_combining_class = 9
  # ASCII prefix that marks a label as Punycode-encoded (an A-label).
  _alabel_prefix = b'xn--'
  # Characters accepted as label separators: full stop, ideographic full stop,
  # fullwidth full stop and halfwidth ideographic full stop.
  _unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
  class IDNAError(UnicodeError):
      """Base exception for all IDNA-encoding related problems."""
  class IDNABidiError(IDNAError):
      """Exception when bidirectional requirements are not satisfied."""
  class InvalidCodepoint(IDNAError):
      """Exception when a disallowed or unallocated codepoint is used."""
  class InvalidCodepointContext(IDNAError):
      """Exception when the codepoint is not valid in the context it is used."""
  def _combining_class(cp):
      """Return the canonical combining class of the code point *cp*.

      Args:
          cp: Integer code point.

      Raises:
          ValueError: if the combining class is 0 and ``unicodedata`` has no
              name for the character.
      """
      char = chr(cp)
      combining = unicodedata.combining(char)
      # A class of 0 may just mean "no data", so also require a known name.
      # The explicit default keeps name() from raising its own error, which
      # would make the message below unreachable.
      if combining == 0 and unicodedata.name(char, None) is None:
          raise ValueError('Unknown character in unicodedata')
      return combining
  def _is_script(cp, script):
      """Return whether character *cp* falls in the ranges stored for *script*.

      Args:
          cp: A single-character string (``ord`` is applied to it).
          script: Key into ``idnadata.scripts``.
      """
      return intranges_contain(ord(cp), idnadata.scripts[script])
  def _punycode(s):
      """Return the string *s* encoded to bytes with the ``punycode`` codec."""
      return s.encode('punycode')
  def _unot(s):
      """Format the integer code point *s* as ``U+XXXX`` (at least four hex digits)."""
      return 'U+{:04X}'.format(s)
  def valid_label_length(label):
      """Return True when *label* is at most 63 items long, otherwise False."""
      return len(label) <= 63
  def valid_string_length(label, trailing_dot):
      """Return True when *label* fits the overall length limit.

      The limit is 254 when *trailing_dot* is true and 253 otherwise.
      """
      limit = 254 if trailing_dot else 253
      return len(label) <= limit
  def check_bidi(label, check_ltr=False):
      """Check *label* against the Bidi rules and return True when it passes.

      A label containing no R, AL or AN character is accepted immediately
      unless *check_ltr* is true, in which case the left-to-right rules are
      applied to it as well.

      Args:
          label: The label as a string.
          check_ltr: Apply the rules even when no right-to-left character is
              present.

      Raises:
          IDNABidiError: if a character has no known directionality or one of
              Bidi rules 1-6 is violated.
      """
      # Look each character's bidirectional class up once; both passes use it.
      directions = [unicodedata.bidirectional(cp) for cp in label]

      # Bidi rules should only be applied if string contains RTL characters
      bidi_label = False
      for idx, direction in enumerate(directions, 1):
          if direction == '':
              # String likely comes from a newer version of Unicode
              raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), idx))
          if direction in ('R', 'AL', 'AN'):
              bidi_label = True
      if not bidi_label and not check_ltr:
          return True

      # Bidi rule 1
      first_direction = directions[0]
      if first_direction in ('R', 'AL'):
          rtl = True
      elif first_direction == 'L':
          rtl = False
      else:
          raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

      valid_ending = False
      number_type = None
      for idx, direction in enumerate(directions, 1):
          if rtl:
              # Bidi rule 2
              if direction not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                  raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(idx))
              # Bidi rule 3
              if direction in ('R', 'AL', 'EN', 'AN'):
                  valid_ending = True
              elif direction != 'NSM':
                  valid_ending = False
              # Bidi rule 4
              if direction in ('AN', 'EN'):
                  if number_type is None:
                      number_type = direction
                  elif number_type != direction:
                      raise IDNABidiError('Can not mix numeral types in a right-to-left label')
          else:
              # Bidi rule 5
              if direction not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                  raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(idx))
              # Bidi rule 6
              if direction in ('L', 'EN'):
                  valid_ending = True
              elif direction != 'NSM':
                  valid_ending = False

      if not valid_ending:
          raise IDNABidiError('Label ends with illegal codepoint directionality')
      return True
  def check_initial_combiner(label):
      """Return True unless *label* starts with a combining mark.

      Raises:
          IDNAError: if the first character's general category starts with 'M'.
      """
      # Guard on emptiness so an empty label does not raise IndexError.
      if label and unicodedata.category(label[0])[0] == 'M':
          raise IDNAError('Label begins with an illegal combining character')
      return True
  def check_hyphen_ok(label):
      """Return True when the hyphens in *label* are acceptably placed.

      Raises:
          IDNAError: if the 3rd and 4th characters are both hyphens, or the
              label starts or ends with a hyphen.
      """
      if label[2:4] == '--':
          raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
      # Slices instead of indexing, so an empty label does not raise IndexError.
      if label[:1] == '-' or label[-1:] == '-':
          raise IDNAError('Label must not start or end with a hyphen')
      return True
  def check_nfc(label):
      """Require *label* to be in Unicode Normalization Form C.

      Returns None on success.

      Raises:
          IDNAError: if NFC-normalizing *label* changes it.
      """
      if unicodedata.normalize('NFC', label) != label:
          raise IDNAError('Label must be in Normalization Form C')
  def valid_contextj(label, pos):
      """Return whether the joiner at ``label[pos]`` is valid in its context.

      Only U+200C and U+200D are handled; any other code point yields False.

      * Either joiner is valid directly after a character whose combining
        class is the virama class.
      * U+200C is also valid when the nearest non-transparent character
        before it has joining type L or D and the nearest non-transparent
        character after it has joining type R or D.

      Args:
          label: The label as a string.
          pos: Index of the joiner within *label*.

      Raises:
          ValueError: propagated from ``_combining_class`` for an unknown
              preceding character.
      """
      cp_value = ord(label[pos])

      if cp_value == 0x200c:
          if pos > 0:
              if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                  return True

          transparent = ord('T')

          # Scan backwards, skipping transparent characters. The first
          # non-transparent character decides the result: without stopping
          # there, a joining character further away would wrongly validate
          # the label.
          ok = False
          for i in range(pos - 1, -1, -1):
              joining_type = idnadata.joining_types.get(ord(label[i]))
              if joining_type == transparent:
                  continue
              ok = joining_type in (ord('L'), ord('D'))
              break
          if not ok:
              return False

          # Same rule scanning forwards, this time requiring type R or D.
          ok = False
          for i in range(pos + 1, len(label)):
              joining_type = idnadata.joining_types.get(ord(label[i]))
              if joining_type == transparent:
                  continue
              ok = joining_type in (ord('R'), ord('D'))
              break
          return ok

      if cp_value == 0x200d:
          if pos > 0:
              if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                  return True
          return False

      return False
  def valid_contexto(label, pos, exception=False):
      """Return whether the code point at ``label[pos]`` is valid in its context.

      Rules implemented, keyed by the code point at *pos*:

      * U+00B7: must sit between two U+006C characters.
      * U+0375: the following character must be in the Greek script.
      * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
      * U+30FB: some other character in the label must be Hiragana, Katakana
        or Han.
      * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
      * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.

      Any other code point yields False.

      Args:
          label: The label as a string.
          pos: Index of the character to check.
          exception: Unused; kept so existing callers keep working.
      """
      cp_value = ord(label[pos])

      if cp_value == 0x00b7:
          if 0 < pos < len(label) - 1:
              return ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c
          return False

      if cp_value == 0x0375:
          if pos < len(label) - 1 and len(label) > 1:
              return _is_script(label[pos + 1], 'Greek')
          return False

      if cp_value == 0x05f3 or cp_value == 0x05f4:
          if pos > 0:
              return _is_script(label[pos - 1], 'Hebrew')
          return False

      if cp_value == 0x30fb:
          for cp in label:
              if cp == '\u30fb':
                  continue
              if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
                  return True
          return False

      if 0x660 <= cp_value <= 0x669:
          return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

      if 0x6f0 <= cp_value <= 0x6f9:
          return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

      # No rule applies: return False explicitly rather than an implicit None.
      return False
  def check_label(label):
      """Validate a single label, raising on the first problem found.

      Bytes input is decoded as UTF-8 first. The label must be non-empty, in
      NFC, have acceptable hyphen placement and not start with a combining
      mark. Every code point must then be PVALID, or be a CONTEXTJ/CONTEXTO
      code point that passes its contextual rule. The Bidi check runs last.

      Raises:
          IDNAError: for an empty label, a failed NFC, hyphen or
              initial-combiner check, or an unknown character next to a joiner.
          InvalidCodepointContext: if a CONTEXTJ or CONTEXTO code point fails
              its contextual rule.
          InvalidCodepoint: if a code point is in none of the allowed classes.
          IDNABidiError: if the Bidi check fails.
      """
      if isinstance(label, (bytes, bytearray)):
          label = label.decode('utf-8')
      if len(label) == 0:
          raise IDNAError('Empty Label')

      check_nfc(label)
      check_hyphen_ok(label)
      check_initial_combiner(label)

      for (pos, cp) in enumerate(label):
          cp_value = ord(cp)
          if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
              continue
          elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
              # Keep only the lookup inside the try. InvalidCodepointContext
              # is itself a ValueError subclass (via UnicodeError), so raising
              # it inside the try would be caught and re-reported with the
              # wrong message.
              try:
                  joiner_ok = valid_contextj(label, pos)
              except ValueError:
                  raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
                      _unot(cp_value), pos+1, repr(label)))
              if not joiner_ok:
                  raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                      _unot(cp_value), pos+1, repr(label)))
          elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
              if not valid_contexto(label, pos):
                  raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
          else:
              raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

      check_bidi(label)
  def alabel(label):
      """Convert a single label to its ASCII (A-label) form as bytes.

      A label that is already pure ASCII is validated through ``ulabel`` and
      returned as ASCII bytes. Any other label is validated with
      ``check_label``, Punycode-encoded and prefixed with ``xn--``.

      Raises:
          IDNAError: if the label is invalid or the result exceeds the label
              length limit.
      """
      # Only the encode step belongs in the try. The ASCII path sits in the
      # else clause so its own errors are never mistaken for "not ASCII".
      try:
          ascii_label = label.encode('ascii')
      except UnicodeEncodeError:
          pass
      else:
          ulabel(ascii_label)
          if not valid_label_length(ascii_label):
              raise IDNAError('Label too long')
          return ascii_label

      if not label:
          raise IDNAError('No Input')

      label = str(label)
      check_label(label)
      encoded = _alabel_prefix + _punycode(label)

      if not valid_label_length(encoded):
          raise IDNAError('Label too long')
      return encoded
  def ulabel(label):
      """Convert a single label to its Unicode (U-label) form as a string.

      * A non-ASCII string is validated and returned unchanged.
      * ASCII input without the ``xn--`` prefix is lowercased, validated and
        returned as a string.
      * ASCII input with the prefix is lowercased, Punycode-decoded, validated
        and returned.

      Args:
          label: The label as ``str``, ``bytes`` or ``bytearray``.

      Raises:
          IDNAError: if the A-label is empty after the prefix, ends with a
              hyphen, is not valid Punycode, or fails ``check_label``.
      """
      if isinstance(label, (bytes, bytearray)):
          raw = label
      else:
          try:
              raw = label.encode('ascii')
          except UnicodeEncodeError:
              check_label(label)
              return label

      raw = raw.lower()
      if not raw.startswith(_alabel_prefix):
          check_label(raw)
          return raw.decode('ascii')

      raw = raw[len(_alabel_prefix):]
      if not raw:
          raise IDNAError('Malformed A-label, no Punycode eligible content found')
      if raw.decode('ascii')[-1] == '-':
          raise IDNAError('A-label must not end with a hyphen')

      # Report malformed Punycode as an IDNAError rather than leaking the
      # codec's own error. IDNAError is still a UnicodeError, so callers that
      # catch UnicodeError keep working.
      try:
          decoded = raw.decode('punycode')
      except UnicodeError as exc:
          raise IDNAError('Invalid A-label') from exc
      check_label(decoded)
      return decoded
  def uts46_remap(domain, std3_rules=True, transitional=False):
      """Re-map the characters in the string according to UTS46 processing.

      Each character is looked up in ``uts46data``. Depending on its status
      and the two flags it is kept, replaced by its mapping, or dropped
      (status 'I'). The result is returned NFC-normalized.

      Args:
          domain: The input string.
          std3_rules: When false, characters with status '3' are kept, or
              mapped if they have a replacement, instead of rejected.
          transitional: When true, characters with status 'D' are mapped
              instead of kept.

      Raises:
          InvalidCodepoint: if a character is not allowed, or the table lookup
              fails with IndexError.
      """
      from .uts46data import uts46data

      # Collect pieces and join once, instead of repeated string concatenation.
      pieces = []
      for pos, char in enumerate(domain):
          code_point = ord(char)
          try:
              uts46row = uts46data[code_point if code_point < 256 else
                  bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
              status = uts46row[1]
              replacement = uts46row[2] if len(uts46row) == 3 else None
              if (status == 'V' or
                      (status == 'D' and not transitional) or
                      (status == '3' and not std3_rules and replacement is None)):
                  pieces.append(char)
              elif replacement is not None and (status == 'M' or
                      (status == '3' and not std3_rules) or
                      (status == 'D' and transitional)):
                  pieces.append(replacement)
              elif status != 'I':
                  raise IndexError()
          except IndexError:
              # The IndexError is an internal signal, so keep it out of the
              # reported exception chain.
              raise InvalidCodepoint(
                  'Codepoint {} not allowed at position {} in {}'.format(
                  _unot(code_point), pos + 1, repr(domain))) from None
      return unicodedata.normalize('NFC', ''.join(pieces))
  def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
      """Encode a domain name to its ASCII form and return it as bytes.

      Args:
          s: The domain as ``str``; ``bytes``/``bytearray`` are decoded as
              ASCII first.
          strict: Split labels on '.' only. Otherwise the other Unicode dot
              characters matched by ``_unicode_dots_re`` also separate labels.
          uts46: Run the input through ``uts46_remap`` before splitting.
          std3_rules: Passed to ``uts46_remap``.
          transitional: Passed to ``uts46_remap``.

      Raises:
          IDNAError: for an empty domain, an empty or invalid label, or a
              result that exceeds the domain length limit.
      """
      if isinstance(s, (bytes, bytearray)):
          s = s.decode('ascii')
      if uts46:
          s = uts46_remap(s, std3_rules, transitional)

      labels = s.split('.') if strict else _unicode_dots_re.split(s)
      if not labels or labels == ['']:
          raise IDNAError('Empty domain')

      # An empty final piece means the input ended with a dot; remember that
      # so the dot can be restored on output.
      trailing_dot = labels[-1] == ''
      if trailing_dot:
          del labels[-1]

      encoded_labels = []
      for label in labels:
          encoded = alabel(label)
          if not encoded:
              raise IDNAError('Empty label')
          encoded_labels.append(encoded)
      if trailing_dot:
          encoded_labels.append(b'')

      domain = b'.'.join(encoded_labels)
      if not valid_string_length(domain, trailing_dot):
          raise IDNAError('Domain too long')
      return domain
  def decode(s, strict=False, uts46=False, std3_rules=False):
      """Decode a domain name to its Unicode form and return it as a string.

      Args:
          s: The domain as ``str``; ``bytes``/``bytearray`` are decoded as
              ASCII first.
          strict: Split labels on '.' only. Otherwise the other Unicode dot
              characters matched by ``_unicode_dots_re`` also separate labels.
          uts46: Run the input through ``uts46_remap`` (non-transitional)
              before splitting.
          std3_rules: Passed to ``uts46_remap``.

      Raises:
          IDNAError: for an empty domain or an empty or invalid label.
      """
      if isinstance(s, (bytes, bytearray)):
          s = s.decode('ascii')
      if uts46:
          s = uts46_remap(s, std3_rules, False)

      labels = s.split('.') if strict else _unicode_dots_re.split(s)
      if not labels or labels == ['']:
          raise IDNAError('Empty domain')

      # An empty final piece means the input ended with a dot; remember that
      # so the dot can be restored on output.
      trailing_dot = not labels[-1]
      if trailing_dot:
          del labels[-1]

      decoded_labels = []
      for label in labels:
          decoded = ulabel(label)
          if not decoded:
              raise IDNAError('Empty label')
          decoded_labels.append(decoded)
      if trailing_dot:
          decoded_labels.append('')
      return '.'.join(decoded_labels)
  308. result.append(s)
  309. else:
  310. raise IDNAError('Empty label')
  311. if trailing_dot:
  312. result.append('')
  313. return '.'.join(result)