/Tools/unicode/gencodec.py

http://unladen-swallow.googlecode.com/ · Python · 426 lines · 304 code · 53 blank · 69 comment · 78 complexity · a8b517f940bcd69b7ed1bea77a3d43e3 MD5 · raw file

  1. """ Unicode Mapping Parser and Codec Generator.
  2. This script parses Unicode mapping files as available from the Unicode
  3. site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
  4. modules from them. The codecs use the standard character mapping codec
  5. to actually apply the mapping.
  6. Synopsis: gencodec.py dir codec_prefix
  7. All files in dir are scanned and those producing non-empty mappings
  8. will be written to <codec_prefix><mapname>.py with <mapname> being the
  9. first part of the map's filename ('a' in a.b.c.txt) converted to
  10. lowercase with hyphens replaced by underscores.
  11. The tool also writes marshalled versions of the mapping tables to the
  12. same location (with .mapping extension).
  13. Written by Marc-Andre Lemburg (mal@lemburg.com).
  14. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  15. (c) Copyright Guido van Rossum, 2000.
  16. Table generation:
  17. (c) Copyright Marc-Andre Lemburg, 2005.
  18. Licensed to PSF under a Contributor Agreement.
  19. """#"
  20. import re, os, marshal, codecs
  # Largest key a decoding table may contain; python_tabledef_code() gives up
  # (returns None) when a mapping uses a bigger key.
  MAX_TABLE_SIZE = 8192

  # Code point written into decoding tables for undefined entries (U+FFFE)
  UNI_UNDEFINED = u'\ufffe'
  # One line of a Unicode mapping file:
  #   group 1: the encoded value, one or more 0x.. codes joined by '+'
  #   group 2: the Unicode value, zero or more 0x.. codes or <meta> codes
  #            joined by '+' (may be empty)
  #   group 3: optional trailing '#' comment
  # Raw strings are used so that '\+' and '\s' reach the regex engine
  # without relying on Python passing unknown string escapes through.
  # NOTE(review): the 'A-Z' upper bound in group 2 looks like a typo for
  # 'A-F'; it is left unchanged so the set of accepted lines stays the same.
  mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                     r'\s+'
                     r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                     r'\s*'
                     r'(#.+)?')
  def parsecodes(codes,
                 len=len, filter=filter, range=range):

      """ Converts code combinations to either a single code integer
          or a tuple of integers.

          codes is a string of hexadecimal codes joined by '+', e.g.
          '0x41' or '0x0041+0x0300'.

          meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
          other piece that is not a valid hexadecimal number are ignored.

          Empty codes or illegal ones are returned as None; this includes
          the case where nothing but ignored pieces remain.

          The len, filter and range parameters are kept only so that
          existing callers using the old signature keep working.

      """
      if not codes:
          return None
      values = []
      for piece in codes.split('+'):
          try:
              values.append(int(piece, 16))
          except ValueError:
              # Meta-code, or the empty piece left by a trailing '+'
              continue
      if not values:
          return None
      if len(values) == 1:
          return values[0]
      return tuple(values)
  def readmap(filename):

      """ Reads the mapping file filename and returns the decoding map.

          The result is a dictionary mapping each encoded value to a
          tuple (unicode value, comment).  The unicode value is whatever
          parsecodes() returns for the second column, so it is None for
          entries without a Unicode value.

          If at least as many single-byte codes map to themselves as are
          left unmapped, the unmapped ones are added as (None, "") and
          the extra entry 'IDENTITY': 256 is stored in the dictionary.

          Lines that are empty, start with '#' or do not match mapRE are
          skipped.

      """
      # The with block closes the file even if reading fails
      with open(filename, 'r') as f:
          lines = f.readlines()
      enc2uni = {}
      identity = []
      unmapped = list(range(256))

      # UTC mapping tables per convention don't include the identity
      # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
      # explicitly mapped to different characters or undefined
      for i in list(range(32)) + [127]:
          identity.append(i)
          unmapped.remove(i)
          enc2uni[i] = (i, 'CONTROL CHARACTER')

      for line in lines:
          line = line.strip()
          if not line or line[0] == '#':
              continue
          m = mapRE.match(line)
          if not m:
              continue
          enc, uni, comment = m.groups()
          enc = parsecodes(enc)
          uni = parsecodes(uni)
          if comment is None:
              comment = ''
          else:
              # Drop the leading '#'
              comment = comment[1:].strip()
          # Only single codes below 256 take part in the identity
          # bookkeeping; multi-code (tuple) keys are just stored.
          if not isinstance(enc, tuple) and enc < 256:
              if enc in unmapped:
                  unmapped.remove(enc)
              if enc == uni:
                  identity.append(enc)
          enc2uni[enc] = (uni, comment)

      # If there are more identity-mapped entries than unmapped entries,
      # it pays to generate an identity dictionary first, and add explicit
      # mappings to None for the rest
      if len(identity) >= len(unmapped):
          for enc in unmapped:
              enc2uni[enc] = (None, "")
          enc2uni['IDENTITY'] = 256

      return enc2uni
  def hexrepr(t, precision=4):

      """ Returns t formatted as hexadecimal source text.

          None gives 'None'.  A value without a length is formatted as
          one 0x... number, zero-padded to precision digits.  Anything
          with a length is formatted item by item as '(0x.., 0x..)'.

          If an item cannot be formatted, a message is printed and the
          TypeError is re-raised.

      """
      if t is None:
          return 'None'
      try:
          len(t)
      except TypeError:
          # No length: treat as a single number
          return '0x%0*X' % (precision, t)
      try:
          return '(' + ', '.join(['0x%0*X' % (precision, item)
                                  for item in t]) + ')'
      except TypeError as why:
          print('* failed to convert %r: %s' % (t, why))
          raise
  def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

      """ Returns a list of source lines defining the dictionary varname.

          map maps keys to values; a key or value may also be a tuple
          (code, comment), in which case the comment is used for the
          trailing source comment.  precisions gives the number of hex
          digits for keys and values.

          If map contains the key "IDENTITY", the generated code starts
          from codecs.make_identity_dict() and entries below 256 that map
          to themselves are left out.  NOTE: the "IDENTITY" key is
          deleted from map, i.e. the caller's dictionary is modified.

          Comments are written only if comments is true.

      """
      l = []
      append = l.append
      identity = "IDENTITY" in map
      if identity:
          append("%s = codecs.make_identity_dict(range(%d))" %
                 (varname, map["IDENTITY"]))
          append("%s.update({" % varname)
          splits = 1
          del map["IDENTITY"]
      else:
          append("%s = {" % varname)
          splits = 0

      key_precision, value_precision = precisions
      i = 0
      for mapkey, mapvalue in sorted(map.items()):
          mapcomment = ''
          if isinstance(mapkey, tuple):
              (mapkey, mapcomment) = mapkey
          if isinstance(mapvalue, tuple):
              (mapvalue, mapcomment) = mapvalue
          if mapkey is None:
              continue
          if (identity and
              mapkey == mapvalue and
              mapkey < 256):
              # No need to include identity mappings, since these
              # are already set for the first 256 code points.
              continue
          key = hexrepr(mapkey, key_precision)
          value = hexrepr(mapvalue, value_precision)
          if mapcomment and comments:
              append(' %s: %s,\t# %s' % (key, value, mapcomment))
          else:
              append(' %s: %s,' % (key, value))
          i += 1
          if i == 4096:
              # Split the definition into parts so that the Python
              # parser doesn't dump core
              if splits == 0:
                  append('}')
              else:
                  append('})')
              append('%s.update({' % varname)
              i = 0
              splits = splits + 1
      # Close the dictionary literal or the last update() call
      if splits == 0:
          append('}')
      else:
          append('})')

      return l
  def python_tabledef_code(varname, map, comments=1, key_precision=2):

      """ Returns a list of source lines defining the table varname, or
          None if map cannot be written as a table.

          The table has one entry per key from 0 up to the largest key in
          map.  Keys missing from map and keys mapped to None get
          UNI_UNDEFINED.  A key or value in map may be a tuple
          (code, comment).

          None is returned if the largest key exceeds MAX_TABLE_SIZE or
          if an entry maps to more than one code (1-n mapping).

          If map contains the key 'IDENTITY', keys 0-255 start out mapped
          to themselves.  NOTE: the 'IDENTITY' key is deleted from map,
          i.e. the caller's dictionary is modified.

      """
      l = []
      append = l.append
      append('%s = (' % varname)

      # Analyze map and create table dict
      table = {}
      maxkey = 0
      if 'IDENTITY' in map:
          for key in range(256):
              table[key] = (key, '')
          maxkey = 255
          del map['IDENTITY']
      # Take the items only after the 'IDENTITY' marker is gone, so the
      # marker is never treated as a table entry.
      for mapkey, mapvalue in sorted(map.items()):
          mapcomment = ''
          if isinstance(mapkey, tuple):
              (mapkey, mapcomment) = mapkey
          if isinstance(mapvalue, tuple):
              (mapvalue, mapcomment) = mapvalue
          if mapkey is None:
              continue
          table[mapkey] = (mapvalue, mapcomment)
          if mapkey > maxkey:
              maxkey = mapkey
      if maxkey > MAX_TABLE_SIZE:
          # Table too large
          return None

      # Create table code
      for key in range(maxkey + 1):
          if key not in table:
              mapvalue = None
              mapcomment = 'UNDEFINED'
          else:
              mapvalue, mapcomment = table[key]
          if mapvalue is None:
              mapchar = UNI_UNDEFINED
          else:
              if isinstance(mapvalue, tuple):
                  # 1-n mappings not supported
                  return None
              else:
                  mapchar = unichr(mapvalue)
          if mapcomment and comments:
              append(' %r\t# %s -> %s' % (mapchar,
                                          hexrepr(key, key_precision),
                                          mapcomment))
          else:
              append(' %r' % mapchar)

      append(')')
      return l
  def codegen(name, map, encodingname, comments=1):

      """ Returns Python source for the given map.

          name is inserted into the generated module docstring as the
          source of the mapping, encodingname as the codec name (with
          '_' replaced by '-' in the CodecInfo entry).

          The generated module uses a decoding table plus
          codecs.charmap_build() if python_tabledef_code() can produce a
          table, and decoding/encoding dictionaries otherwise.

          Comments are included in the source, if comments is true (default).

          NOTE: python_mapdef_code() removes the 'IDENTITY' key from map,
          so map is modified by this call.

      """
      # Generate code
      decoding_map_code = python_mapdef_code(
          'decoding_map',
          map,
          comments=comments)
      decoding_table_code = python_tabledef_code(
          'decoding_table',
          map,
          comments=comments)
      encoding_map_code = python_mapdef_code(
          'encoding_map',
          codecs.make_encoding_map(map),
          comments=comments,
          precisions=(4, 2))

      if decoding_table_code:
          suffix = 'table'
      else:
          suffix = 'map'

      # The templates are assembled from individual lines so that the
      # indentation of the generated source does not depend on how this
      # function itself is indented.
      codec_template = '\n'.join([
          '""" Python Character Mapping Codec %s generated from \'%s\' with gencodec.py.',
          '"""#"',
          'import codecs',
          '### Codec APIs',
          'class Codec(codecs.Codec):',
          "    def encode(self,input,errors='strict'):",
          '        return codecs.charmap_encode(input,errors,encoding_%s)',
          "    def decode(self,input,errors='strict'):",
          '        return codecs.charmap_decode(input,errors,decoding_%s)',
          '',
      ])
      incremental_template = '\n'.join([
          'class IncrementalEncoder(codecs.IncrementalEncoder):',
          '    def encode(self, input, final=False):',
          '        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]',
          'class IncrementalDecoder(codecs.IncrementalDecoder):',
          '    def decode(self, input, final=False):',
          '        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]',
      ])
      stream_template = '\n'.join([
          '',
          'class StreamWriter(Codec,codecs.StreamWriter):',
          '    pass',
          'class StreamReader(Codec,codecs.StreamReader):',
          '    pass',
          '### encodings module API',
          'def getregentry():',
          '    return codecs.CodecInfo(',
          '        name=%r,',
          '        encode=Codec().encode,',
          '        decode=Codec().decode,',
          '        incrementalencoder=IncrementalEncoder,',
          '        incrementaldecoder=IncrementalDecoder,',
          '        streamreader=StreamReader,',
          '        streamwriter=StreamWriter,',
          '    )',
          '',
      ])

      l = [codec_template % (encodingname, name, suffix, suffix)]
      l.append(incremental_template % (suffix, suffix))
      l.append(stream_template % encodingname.replace('_', '-'))

      # Add decoding table or map (with preference to the table)
      if not decoding_table_code:
          l.append('\n### Decoding Map\n')
          l.extend(decoding_map_code)
      else:
          l.append('\n### Decoding Table\n')
          l.extend(decoding_table_code)

      # Add encoding map
      if decoding_table_code:
          l.append('\n### Encoding table\n'
                   'encoding_table=codecs.charmap_build(decoding_table)\n')
      else:
          l.append('\n### Encoding Map\n')
          l.extend(encoding_map_code)

      # Final new-line
      l.append('')

      return '\n'.join(l).expandtabs()
  def pymap(name,map,pyfile,encodingname,comments=1):

      """ Writes the codec source generated by codegen() for map to the
          file pyfile.

          name, map, encodingname and comments are passed on to codegen().

      """
      code = codegen(name,map,encodingname,comments)
      # The with block closes the file even if the write fails
      with open(pyfile,'w') as f:
          f.write(code)
  def marshalmap(name,map,marshalfile):

      """ Writes a marshalled copy of map to the file marshalfile.

          Every value in map must unpack into two items; the copy stores
          them as 2-tuples.  name is not used.

      """
      d = dict((e, (u, c)) for e, (u, c) in map.items())
      # The with block closes the file even if marshal.dump() fails
      with open(marshalfile,'wb') as f:
          marshal.dump(d,f)
  def convertdir(dir, dirprefix='', nameprefix='', comments=1):

      """ Converts every regular file in dir into a codec module and a
          marshalled mapping file.

          The output names are nameprefix plus the part of the file name
          before the first '.', lowercased and with '-' replaced by '_',
          followed by '.py' and '.mapping'.  dirprefix is prepended to
          both output names.

          A ValueError raised during conversion is reported and then
          re-raised.

      """
      for mapname in os.listdir(dir):
          mappathname = os.path.join(dir, mapname)
          if not os.path.isfile(mappathname):
              continue
          basename = os.path.split(mapname)[1]
          stem = basename.replace('-','_').split('.')[0]
          name = nameprefix + stem.lower()
          codefile = name + '.py'
          marshalfile = name + '.mapping'
          targets = (mapname,
                     dirprefix + codefile,
                     dirprefix + marshalfile)
          print('converting %s to %s and %s' % targets)
          try:
              map = readmap(os.path.join(dir,mapname))
              if map:
                  pymap(mappathname, map, dirprefix + codefile,name,comments)
                  marshalmap(mappathname, map, dirprefix + marshalfile)
              else:
                  print('* map is empty; skipping')
          except ValueError as why:
              print('* conversion failed: %s' % why)
              raise
  def rewritepythondir(dir, dirprefix='', comments=1):

      """ Regenerates the codec modules from the '.mapping' files in dir.

          For every file in dir whose name ends in '.mapping', the
          marshalled map is loaded and written with pymap() to dirprefix
          plus the file name with '.mapping' replaced by '.py'.

          A ValueError raised during conversion is reported and the
          remaining files are still processed.

      """
      for mapname in os.listdir(dir):
          if not mapname.endswith('.mapping'):
              continue
          name = mapname[:-len('.mapping')]
          codefile = name + '.py'
          print('converting %s to %s' % (mapname,
                                         dirprefix + codefile))
          try:
              # The with block closes the mapping file after loading
              with open(os.path.join(dir,mapname), 'rb') as f:
                  map = marshal.load(f)
              if not map:
                  print('* map is empty; skipping')
              else:
                  pymap(mapname, map, dirprefix + codefile,name,comments)
          except ValueError as why:
              print('* conversion failed: %s' % why)
  if __name__ == '__main__':

      import sys
      # The command line arguments are passed on positionally.  Change
      # the condition to run rewritepythondir() instead of convertdir().
      if 1:
          convertdir(*sys.argv[1:])
      else:
          rewritepythondir(*sys.argv[1:])
  361. apply(rewritepythondir,tuple(sys.argv[1:]))