PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/pypy/module/unicodedata/interp_ucd.py

https://bitbucket.org/pypy/pypy/
Python | 330 lines | 284 code | 30 blank | 16 comment | 55 complexity | 1c302c32655bda5830d8c337421a239d MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. """
  2. Implementation of the interpreter-level functions in the module unicodedata.
  3. """
  4. from pypy.interpreter.gateway import interp2app, unwrap_spec
  5. from pypy.interpreter.baseobjspace import W_Root
  6. from pypy.interpreter.error import OperationError, oefmt
  7. from pypy.interpreter.typedef import TypeDef, interp_attrproperty
  8. from rpython.rlib.rarithmetic import r_longlong
  9. from rpython.rlib.objectmodel import we_are_translated
  10. from rpython.rlib.runicode import MAXUNICODE
  11. from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
  12. from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
  13. import sys
  14. # Contants for Hangul characters
  15. SBase = 0xAC00
  16. LBase = 0x1100
  17. VBase = 0x1161
  18. TBase = 0x11A7
  19. LCount = 19
  20. VCount = 21
  21. TCount = 28
  22. NCount = (VCount*TCount)
  23. SCount = (LCount*NCount)
  24. # Since Python2.7, the unicodedata module gives a preview of Python3 character
  25. # handling: on narrow unicode builds, a surrogate pair is considered as one
  26. # unicode code point.
  27. if MAXUNICODE > 0xFFFF:
  28. # Target is wide build
  29. def unichr_to_code_w(space, w_unichr):
  30. if not space.isinstance_w(w_unichr, space.w_unicode):
  31. raise oefmt(
  32. space.w_TypeError, 'argument 1 must be unicode, not %T',
  33. w_unichr)
  34. if not we_are_translated() and sys.maxunicode == 0xFFFF:
  35. # Host CPython is narrow build, accept surrogates
  36. try:
  37. return ord_accepts_surrogate(space.unicode_w(w_unichr))
  38. except TypeError:
  39. raise oefmt(space.w_TypeError,
  40. "need a single Unicode character as parameter")
  41. else:
  42. if not space.len_w(w_unichr) == 1:
  43. raise oefmt(space.w_TypeError,
  44. "need a single Unicode character as parameter")
  45. return space.int_w(space.ord(w_unichr))
  46. else:
  47. # Target is narrow build
  48. def unichr_to_code_w(space, w_unichr):
  49. if not space.isinstance_w(w_unichr, space.w_unicode):
  50. raise oefmt(
  51. space.w_TypeError, 'argument 1 must be unicode, not %T',
  52. w_unichr)
  53. if not we_are_translated() and sys.maxunicode > 0xFFFF:
  54. # Host CPython is wide build, forbid surrogates
  55. if not space.len_w(w_unichr) == 1:
  56. raise oefmt(space.w_TypeError,
  57. "need a single Unicode character as parameter")
  58. return space.int_w(space.ord(w_unichr))
  59. else:
  60. # Accept surrogates
  61. try:
  62. return ord_accepts_surrogate(space.unicode_w(w_unichr))
  63. except TypeError:
  64. raise oefmt(space.w_TypeError,
  65. "need a single Unicode character as parameter")
  66. class UCD(W_Root):
  67. def __init__(self, unicodedb):
  68. self._lookup = unicodedb.lookup
  69. self._name = unicodedb.name
  70. self._decimal = unicodedb.decimal
  71. self._digit = unicodedb.digit
  72. self._numeric = unicodedb.numeric
  73. self._category = unicodedb.category
  74. self._east_asian_width = unicodedb.east_asian_width
  75. self._bidirectional = unicodedb.bidirectional
  76. self._combining = unicodedb.combining
  77. self._mirrored = unicodedb.mirrored
  78. self._decomposition = unicodedb.decomposition
  79. self._canon_decomposition = unicodedb.canon_decomposition
  80. self._compat_decomposition = unicodedb.compat_decomposition
  81. self._composition = unicodedb._composition
  82. self.version = unicodedb.version
  83. @unwrap_spec(name=str)
  84. def _get_code(self, space, name):
  85. try:
  86. code = self._lookup(name.upper())
  87. except KeyError:
  88. msg = space.mod(space.wrap("undefined character name '%s'"), space.wrap(name))
  89. raise OperationError(space.w_KeyError, msg)
  90. return space.wrap(code)
  91. @unwrap_spec(name=str)
  92. def lookup(self, space, name):
  93. try:
  94. code = self._lookup(name.upper())
  95. except KeyError:
  96. msg = space.mod(space.wrap("undefined character name '%s'"), space.wrap(name))
  97. raise OperationError(space.w_KeyError, msg)
  98. return space.wrap(code_to_unichr(code))
  99. def name(self, space, w_unichr, w_default=None):
  100. code = unichr_to_code_w(space, w_unichr)
  101. try:
  102. name = self._name(code)
  103. except KeyError:
  104. if w_default is not None:
  105. return w_default
  106. raise oefmt(space.w_ValueError, "no such name")
  107. return space.wrap(name)
  108. def decimal(self, space, w_unichr, w_default=None):
  109. code = unichr_to_code_w(space, w_unichr)
  110. try:
  111. return space.wrap(self._decimal(code))
  112. except KeyError:
  113. pass
  114. if w_default is not None:
  115. return w_default
  116. raise oefmt(space.w_ValueError, "not a decimal")
  117. def digit(self, space, w_unichr, w_default=None):
  118. code = unichr_to_code_w(space, w_unichr)
  119. try:
  120. return space.wrap(self._digit(code))
  121. except KeyError:
  122. pass
  123. if w_default is not None:
  124. return w_default
  125. raise oefmt(space.w_ValueError, "not a digit")
  126. def numeric(self, space, w_unichr, w_default=None):
  127. code = unichr_to_code_w(space, w_unichr)
  128. try:
  129. return space.wrap(self._numeric(code))
  130. except KeyError:
  131. pass
  132. if w_default is not None:
  133. return w_default
  134. raise oefmt(space.w_ValueError, "not a numeric character")
  135. def category(self, space, w_unichr):
  136. code = unichr_to_code_w(space, w_unichr)
  137. return space.wrap(self._category(code))
  138. def east_asian_width(self, space, w_unichr):
  139. code = unichr_to_code_w(space, w_unichr)
  140. return space.wrap(self._east_asian_width(code))
  141. def bidirectional(self, space, w_unichr):
  142. code = unichr_to_code_w(space, w_unichr)
  143. return space.wrap(self._bidirectional(code))
  144. def combining(self, space, w_unichr):
  145. code = unichr_to_code_w(space, w_unichr)
  146. return space.wrap(self._combining(code))
  147. def mirrored(self, space, w_unichr):
  148. code = unichr_to_code_w(space, w_unichr)
  149. # For no reason, unicodedata.mirrored() returns an int, not a bool
  150. return space.wrap(int(self._mirrored(code)))
  151. def decomposition(self, space, w_unichr):
  152. code = unichr_to_code_w(space, w_unichr)
  153. return space.wrap(self._decomposition(code))
  154. @unwrap_spec(form=str)
  155. def normalize(self, space, form, w_unistr):
  156. if not space.isinstance_w(w_unistr, space.w_unicode):
  157. raise oefmt(
  158. space.w_TypeError, 'argument 2 must be unicode, not %T',
  159. w_unistr)
  160. if form == 'NFC':
  161. composed = True
  162. decomposition = self._canon_decomposition
  163. elif form == 'NFD':
  164. composed = False
  165. decomposition = self._canon_decomposition
  166. elif form == 'NFKC':
  167. composed = True
  168. decomposition = self._compat_decomposition
  169. elif form == 'NFKD':
  170. composed = False
  171. decomposition = self._compat_decomposition
  172. else:
  173. raise oefmt(space.w_ValueError, "invalid normalization form")
  174. strlen = space.len_w(w_unistr)
  175. result = [0] * (strlen + strlen / 10 + 10)
  176. j = 0
  177. resultlen = len(result)
  178. # Expand the character
  179. for i in range(strlen):
  180. ch = space.int_w(space.ord(space.getitem(w_unistr, space.wrap(i))))
  181. # Do Hangul decomposition
  182. if SBase <= ch < SBase + SCount:
  183. SIndex = ch - SBase
  184. L = LBase + SIndex / NCount
  185. V = VBase + (SIndex % NCount) / TCount
  186. T = TBase + SIndex % TCount
  187. if T == TBase:
  188. if j + 2 > resultlen:
  189. result.extend([0] * (j + 2 - resultlen + 10))
  190. resultlen = len(result)
  191. result[j] = L
  192. result[j + 1] = V
  193. j += 2
  194. else:
  195. if j + 3 > resultlen:
  196. result.extend([0] * (j + 3 - resultlen + 10))
  197. resultlen = len(result)
  198. result[j] = L
  199. result[j + 1] = V
  200. result[j + 2] = T
  201. j += 3
  202. continue
  203. decomp = decomposition(ch)
  204. if decomp:
  205. decomplen = len(decomp)
  206. if j + decomplen > resultlen:
  207. result.extend([0] * (j + decomplen - resultlen + 10))
  208. resultlen = len(result)
  209. for ch in decomp:
  210. result[j] = ch
  211. j += 1
  212. else:
  213. if j + 1 > resultlen:
  214. result.extend([0] * (j + 1 - resultlen + 10))
  215. resultlen = len(result)
  216. result[j] = ch
  217. j += 1
  218. # Sort all combining marks
  219. for i in range(j):
  220. ch = result[i]
  221. comb = self._combining(ch)
  222. if comb == 0:
  223. continue
  224. for k in range(i, 0, -1):
  225. if self._combining(result[k - 1]) <= comb:
  226. result[k] = ch
  227. break
  228. result[k] = result[k - 1]
  229. else:
  230. result[0] = ch
  231. if not composed: # If decomposed normalization we are done
  232. return space.wrap(u''.join([unichr(i) for i in result[:j]]))
  233. if j <= 1:
  234. return space.wrap(u''.join([unichr(i) for i in result[:j]]))
  235. current = result[0]
  236. starter_pos = 0
  237. next_insert = 1
  238. prev_combining = 0
  239. if self._combining(current):
  240. prev_combining = 256
  241. for k in range(1, j):
  242. next = result[k]
  243. next_combining = self._combining(next)
  244. if next_insert == starter_pos + 1 or prev_combining < next_combining:
  245. # Combine if not blocked
  246. if (LBase <= current < LBase + LCount and
  247. VBase <= next < VBase + VCount):
  248. # If L, V -> LV
  249. current = SBase + ((current - LBase)*VCount + (next - VBase)) * TCount
  250. continue
  251. if (SBase <= current < SBase + SCount and
  252. TBase <= next < TBase + TCount and
  253. (current - SBase) % TCount == 0):
  254. # If LV, T -> LVT
  255. current = current + (next - TBase)
  256. continue
  257. key = r_longlong(current) << 32 | next
  258. try:
  259. current = self._composition[key]
  260. continue
  261. except KeyError:
  262. pass
  263. if next_combining == 0:
  264. # New starter symbol
  265. result[starter_pos] = current
  266. starter_pos = next_insert
  267. next_insert += 1
  268. prev_combining = 0
  269. current = next
  270. continue
  271. result[next_insert] = next
  272. next_insert += 1
  273. if next_combining > prev_combining:
  274. prev_combining = next_combining
  275. result[starter_pos] = current
  276. return space.wrap(u''.join([unichr(i) for i in result[:next_insert]]))
  277. methods = {}
  278. for methodname in """
  279. _get_code lookup name decimal digit numeric category east_asian_width
  280. bidirectional combining mirrored decomposition normalize
  281. """.split():
  282. methods[methodname] = interp2app(getattr(UCD, methodname))
  283. UCD.typedef = TypeDef("unicodedata.UCD",
  284. __doc__ = "",
  285. unidata_version = interp_attrproperty('version', UCD),
  286. **methods)
  287. ucd_3_2_0 = UCD(unicodedb_3_2_0)
  288. ucd_5_2_0 = UCD(unicodedb_5_2_0)
  289. ucd = ucd_5_2_0