/Tools/unicode/mkstringprep.py

http://unladen-swallow.googlecode.com/ · Python · 431 lines · 319 code · 69 blank · 43 comment · 47 complexity · a011d583bcbed76b7efff5f1070a4f6a MD5 · raw file

  1. import re, unicodedata, sys
  2. if sys.maxunicode == 65535:
  3. raise RuntimeError, "need UCS-4 Python"
  4. def gen_category(cats):
  5. for i in range(0, 0x110000):
  6. if unicodedata.category(unichr(i)) in cats:
  7. yield(i)
  8. def gen_bidirectional(cats):
  9. for i in range(0, 0x110000):
  10. if unicodedata.bidirectional(unichr(i)) in cats:
  11. yield(i)
  12. def compact_set(l):
  13. single = []
  14. tuple = []
  15. prev = None
  16. span = 0
  17. for e in l:
  18. if prev is None:
  19. prev = e
  20. span = 0
  21. continue
  22. if prev+span+1 != e:
  23. if span > 2:
  24. tuple.append((prev,prev+span+1))
  25. else:
  26. for i in range(prev, prev+span+1):
  27. single.append(i)
  28. prev = e
  29. span = 0
  30. else:
  31. span += 1
  32. if span:
  33. tuple.append((prev,prev+span+1))
  34. else:
  35. single.append(prev)
  36. tuple = " + ".join(["range(%d,%d)" % t for t in tuple])
  37. if not single:
  38. return "set(%s)" % tuple
  39. if not tuple:
  40. return "set(%s)" % repr(single)
  41. return "set(%s + %s)" % (repr(single),tuple)
  42. ############## Read the tables in the RFC #######################
  43. data = open("rfc3454.txt").readlines()
  44. tables = []
  45. curname = None
  46. for l in data:
  47. l = l.strip()
  48. if not l:
  49. continue
  50. # Skip RFC page breaks
  51. if l.startswith("Hoffman & Blanchet") or\
  52. l.startswith("RFC 3454"):
  53. continue
  54. # Find start/end lines
  55. m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
  56. if m:
  57. if m.group(1) == "Start":
  58. if curname:
  59. raise "Double Start",(curname, l)
  60. curname = m.group(2)
  61. table = {}
  62. tables.append((curname, table))
  63. continue
  64. else:
  65. if not curname:
  66. raise "End without start", l
  67. curname = None
  68. continue
  69. if not curname:
  70. continue
  71. # Now we are in a table
  72. fields = l.split(";")
  73. if len(fields) > 1:
  74. # Drop comment field
  75. fields = fields[:-1]
  76. if len(fields) == 1:
  77. fields = fields[0].split("-")
  78. if len(fields) > 1:
  79. # range
  80. try:
  81. start, end = fields
  82. except ValueError:
  83. raise "Unpacking problem", l
  84. else:
  85. start = end = fields[0]
  86. start = int(start, 16)
  87. end = int(end, 16)
  88. for i in range(start, end+1):
  89. table[i] = i
  90. else:
  91. code, value = fields
  92. value = value.strip()
  93. if value:
  94. value = [int(v, 16) for v in value.split(" ")]
  95. else:
  96. # table B.1
  97. value = None
  98. table[int(code, 16)] = value
  99. ########### Generate compact Python versions of the tables #############
  100. print """# This file is generated by mkstringprep.py. DO NOT EDIT.
  101. \"\"\"Library that exposes various tables found in the StringPrep RFC 3454.
  102. There are two kinds of tables: sets, for which a member test is provided,
  103. and mappings, for which a mapping function is provided.
  104. \"\"\"
  105. import unicodedata
  106. """
  107. print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)
  108. # A.1 is the table of unassigned characters
  109. # XXX Plane 15 PUA is listed as unassigned in Python.
  110. name, table = tables[0]
  111. del tables[0]
  112. assert name == "A.1"
  113. table = set(table.keys())
  114. Cn = set(gen_category(["Cn"]))
  115. # FDD0..FDEF are process internal codes
  116. Cn -= set(range(0xFDD0, 0xFDF0))
  117. # not a character
  118. Cn -= set(range(0xFFFE, 0x110000, 0x10000))
  119. Cn -= set(range(0xFFFF, 0x110000, 0x10000))
  120. # assert table == Cn
  121. print """
  122. def in_table_a1(code):
  123. if unicodedata.category(code) != 'Cn': return False
  124. c = ord(code)
  125. if 0xFDD0 <= c < 0xFDF0: return False
  126. return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
  127. """
  128. # B.1 cannot easily be derived
  129. name, table = tables[0]
  130. del tables[0]
  131. assert name == "B.1"
  132. table = table.keys()
  133. table.sort()
  134. print """
  135. b1_set = """ + compact_set(table) + """
  136. def in_table_b1(code):
  137. return ord(code) in b1_set
  138. """
  139. # B.2 and B.3 is case folding.
  140. # It takes CaseFolding.txt into account, which is
  141. # not available in the Python database. Since
  142. # B.2 is derived from B.3, we process B.3 first.
  143. # B.3 supposedly *is* CaseFolding-3.2.0.txt.
  144. name, table_b2 = tables[0]
  145. del tables[0]
  146. assert name == "B.2"
  147. name, table_b3 = tables[0]
  148. del tables[0]
  149. assert name == "B.3"
  150. # B.3 is mostly Python's .lower, except for a number
  151. # of special cases, e.g. considering canonical forms.
  152. b3_exceptions = {}
  153. for k,v in table_b2.items():
  154. if map(ord, unichr(k).lower()) != v:
  155. b3_exceptions[k] = u"".join(map(unichr,v))
  156. b3 = b3_exceptions.items()
  157. b3.sort()
  158. print """
  159. b3_exceptions = {"""
  160. for i,(k,v) in enumerate(b3):
  161. print "0x%x:%s," % (k, repr(v)),
  162. if i % 4 == 3:
  163. print
  164. print "}"
  165. print """
  166. def map_table_b3(code):
  167. r = b3_exceptions.get(ord(code))
  168. if r is not None: return r
  169. return code.lower()
  170. """
  171. def map_table_b3(code):
  172. r = b3_exceptions.get(ord(code))
  173. if r is not None: return r
  174. return code.lower()
  175. # B.2 is case folding for NFKC. This is the same as B.3,
  176. # except where NormalizeWithKC(Fold(a)) !=
  177. # NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))
  178. def map_table_b2(a):
  179. al = map_table_b3(a)
  180. b = unicodedata.normalize("NFKC", al)
  181. bl = u"".join([map_table_b3(ch) for ch in b])
  182. c = unicodedata.normalize("NFKC", bl)
  183. if b != c:
  184. return c
  185. else:
  186. return al
  187. specials = {}
  188. for k,v in table_b2.items():
  189. if map(ord, map_table_b2(unichr(k))) != v:
  190. specials[k] = v
  191. # B.3 should not add any additional special cases
  192. assert specials == {}
  193. print """
  194. def map_table_b2(a):
  195. al = map_table_b3(a)
  196. b = unicodedata.normalize("NFKC", al)
  197. bl = u"".join([map_table_b3(ch) for ch in b])
  198. c = unicodedata.normalize("NFKC", bl)
  199. if b != c:
  200. return c
  201. else:
  202. return al
  203. """
  204. # C.1.1 is a table with a single character
  205. name, table = tables[0]
  206. del tables[0]
  207. assert name == "C.1.1"
  208. assert table == {0x20:0x20}
  209. print """
  210. def in_table_c11(code):
  211. return code == u" "
  212. """
  213. # C.1.2 is the rest of all space characters
  214. name, table = tables[0]
  215. del tables[0]
  216. assert name == "C.1.2"
  217. # table = set(table.keys())
  218. # Zs = set(gen_category(["Zs"])) - set([0x20])
  219. # assert Zs == table
  220. print """
  221. def in_table_c12(code):
  222. return unicodedata.category(code) == "Zs" and code != u" "
  223. def in_table_c11_c12(code):
  224. return unicodedata.category(code) == "Zs"
  225. """
  226. # C.2.1 ASCII control characters
  227. name, table_c21 = tables[0]
  228. del tables[0]
  229. assert name == "C.2.1"
  230. Cc = set(gen_category(["Cc"]))
  231. Cc_ascii = Cc & set(range(128))
  232. table_c21 = set(table_c21.keys())
  233. assert Cc_ascii == table_c21
  234. print """
  235. def in_table_c21(code):
  236. return ord(code) < 128 and unicodedata.category(code) == "Cc"
  237. """
  238. # C.2.2 Non-ASCII control characters. It also includes
  239. # a number of characters in category Cf.
  240. name, table_c22 = tables[0]
  241. del tables[0]
  242. assert name == "C.2.2"
  243. Cc_nonascii = Cc - Cc_ascii
  244. table_c22 = set(table_c22.keys())
  245. assert len(Cc_nonascii - table_c22) == 0
  246. specials = list(table_c22 - Cc_nonascii)
  247. specials.sort()
  248. print """c22_specials = """ + compact_set(specials) + """
  249. def in_table_c22(code):
  250. c = ord(code)
  251. if c < 128: return False
  252. if unicodedata.category(code) == "Cc": return True
  253. return c in c22_specials
  254. def in_table_c21_c22(code):
  255. return unicodedata.category(code) == "Cc" or \\
  256. ord(code) in c22_specials
  257. """
  258. # C.3 Private use
  259. name, table = tables[0]
  260. del tables[0]
  261. assert name == "C.3"
  262. Co = set(gen_category(["Co"]))
  263. assert set(table.keys()) == Co
  264. print """
  265. def in_table_c3(code):
  266. return unicodedata.category(code) == "Co"
  267. """
  268. # C.4 Non-character code points, xFFFE, xFFFF
  269. # plus process internal codes
  270. name, table = tables[0]
  271. del tables[0]
  272. assert name == "C.4"
  273. nonchar = set(range(0xFDD0,0xFDF0) +
  274. range(0xFFFE,0x110000,0x10000) +
  275. range(0xFFFF,0x110000,0x10000))
  276. table = set(table.keys())
  277. assert table == nonchar
  278. print """
  279. def in_table_c4(code):
  280. c = ord(code)
  281. if c < 0xFDD0: return False
  282. if c < 0xFDF0: return True
  283. return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
  284. """
  285. # C.5 Surrogate codes
  286. name, table = tables[0]
  287. del tables[0]
  288. assert name == "C.5"
  289. Cs = set(gen_category(["Cs"]))
  290. assert set(table.keys()) == Cs
  291. print """
  292. def in_table_c5(code):
  293. return unicodedata.category(code) == "Cs"
  294. """
  295. # C.6 Inappropriate for plain text
  296. name, table = tables[0]
  297. del tables[0]
  298. assert name == "C.6"
  299. table = table.keys()
  300. table.sort()
  301. print """
  302. c6_set = """ + compact_set(table) + """
  303. def in_table_c6(code):
  304. return ord(code) in c6_set
  305. """
  306. # C.7 Inappropriate for canonical representation
  307. name, table = tables[0]
  308. del tables[0]
  309. assert name == "C.7"
  310. table = table.keys()
  311. table.sort()
  312. print """
  313. c7_set = """ + compact_set(table) + """
  314. def in_table_c7(code):
  315. return ord(code) in c7_set
  316. """
  317. # C.8 Change display properties or are deprecated
  318. name, table = tables[0]
  319. del tables[0]
  320. assert name == "C.8"
  321. table = table.keys()
  322. table.sort()
  323. print """
  324. c8_set = """ + compact_set(table) + """
  325. def in_table_c8(code):
  326. return ord(code) in c8_set
  327. """
  328. # C.9 Tagging characters
  329. name, table = tables[0]
  330. del tables[0]
  331. assert name == "C.9"
  332. table = table.keys()
  333. table.sort()
  334. print """
  335. c9_set = """ + compact_set(table) + """
  336. def in_table_c9(code):
  337. return ord(code) in c9_set
  338. """
  339. # D.1 Characters with bidirectional property "R" or "AL"
  340. name, table = tables[0]
  341. del tables[0]
  342. assert name == "D.1"
  343. RandAL = set(gen_bidirectional(["R","AL"]))
  344. assert set(table.keys()) == RandAL
  345. print """
  346. def in_table_d1(code):
  347. return unicodedata.bidirectional(code) in ("R","AL")
  348. """
  349. # D.2 Characters with bidirectional property "L"
  350. name, table = tables[0]
  351. del tables[0]
  352. assert name == "D.2"
  353. L = set(gen_bidirectional(["L"]))
  354. assert set(table.keys()) == L
  355. print """
  356. def in_table_d2(code):
  357. return unicodedata.bidirectional(code) == "L"
  358. """