
/includes/zhtable/Makefile.py

https://github.com/daevid/MWFork
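Build script for MediaWiki's Chinese variant converter: it downloads Unihan variant data and several input-method tables (scim-tables, scim-pinyin, libtabe), merges them with the manual rule files kept in includes/zhtable/, and generates the PHP conversion tables in ZhConversion.php.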
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    # make open() encoding-aware, like Python 3's built-in open
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # narrow build: compose a UTF-16 surrogate pair for astral code points
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i >> 10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr

# parse 'U+XXXX'-style tokens (optionally tagged, e.g. 'U+XXXX<kSource') into
# characters; defined at module level so both the Python 2 and the Python 3
# branches above can use them
def unichr2( *args ):
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
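# Example: on a narrow (UCS-2) Python 2 build, unichr( 0x20000 ) returns the
# surrogate pair u'\ud840\udc00'; unichr2( 'U+4E00' ) and unichr3( 'U+4E00' )
# both yield the character for code point U+4E00.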
# DEFINE
SF_MIRROR = 'cdnetworks-kr-2'
SCIM_TABLES_VER = '0.5.10'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
# END OF DEFINE
def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    global islinux
    if islinux:
        # we use wget instead of urlretrieve under Linux,
        # because wget can display details like download progress
        os.system( 'wget %s' % url )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
    return
def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    # keyword arguments keep this call compatible with both codecs.open
    # (Python 2) and the built-in open (Python 3)
    return open( name, 'r', encoding = encoding, errors = 'ignore' )

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )
untargz = lambda path, member, encoding = 'U8': \
        uncompress( tf.open( path, 'r:gz' ), member, encoding )
def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1:
                mlist.add( elems[pos] )
    return mlist
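# The data files parsed here are whitespace-separated; parserCore() keeps
# column `pos` of every entry whose first column is longer than one
# character. Judging from the callers below, the phrase sits in the second
# column of the scim-tables files (pos = 1) and in the first column of
# phrase_lib.txt and tsi.src (pos = 0).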
def tablesParser( path, name ):
    """ Read a file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )

def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )

def tsiParser( path ):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )
def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        else:
            elems = line.split()
            if len( elems ) < 3:
                continue
            type = elems.pop( 1 )
            elems = unichr2( *elems )
            if type == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif type == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    fp.close()
    return ( t2s, s2t )
def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist. """
    excludes = open( path, 'r', encoding = 'U8', errors = 'ignore' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    excludes = '|'.join( excludes )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    return mlist
def charManualTable( path ):
    fp = open( path, 'r', encoding = 'U8', errors = 'ignore' )
    ret = {}
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1:]
    return ret
def toManyRules( src_table ):
    """ Collect all variants beyond the first of every one-to-many rule. """
    tomany = set()
    for ( f, t ) in src_table.items():
        for i in range( 1, len( t ) ):
            tomany.add( t[i] )
    return tomany
def removeRules( path, table ):
    fp = open( path, 'r', encoding = 'U8', errors = 'ignore' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop( f )
            except KeyError:
                pass
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    for ( tmp_f, tmp_t ) in table.copy().items():
        if texcptn.match( tmp_t ):
            table.pop( tmp_f )
    return table
def customRules( path ):
    fp = open( path, 'r', encoding = 'U8', errors = 'ignore' )
    ret = dict()
    for line in fp:
        elems = line.split( '#' )[0].split()
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret
def dictToSortedList( src_table, pos ):
    return sorted( src_table.items(), key = lambda m: m[pos] )
def translate( text, conv_table ):
    i = 0
    while i < len( text ):
        # try the longest possible match at position i first
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len( t ) - 1
                break
        i += 1
    return text
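# Example: translate() scans left to right and always replaces the longest
# matching key, so with conv_table = { u'AB': u'X', u'A': u'Y' } the text
# u'AB' becomes u'X', while with conv_table = { u'A': u'Y' } it becomes u'YB'.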
def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'r', encoding = 'U8', errors = 'ignore' )
    # note: the passed-in reconv_table is discarded and rebuilt from scratch
    reconv_table = {}
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            # add a word rule only if the converted form is not already
            # mapped back, and either the reverse conversion alters the word
            # or the word contains a one-to-many character and does not
            # round-trip cleanly
            if not reconv_table.get( new_word ) \
                    and ( test_word != word \
                    or ( tomanyptn.search( word ) \
                    and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len( word )
    return word_reconv_table
def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % ( f, t ) for ( f, t ) in table if f and t]
    return '\n'.join( lines )
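# Example: a pair such as ( u'体', u'體' ) is rendered as the PHP array
# entry '体' => '體', one entry per line.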
def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    download( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )
    # Unihan_Variants.txt
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )
    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )
    t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
    s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )
    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # the super set for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    # word-to-word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word-to-word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # apply exclusion lists
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # derive the word-to-word tables from the word lists
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toSG
    toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )
    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray( toHant ) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray( toHans ) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray( toTW ) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray( toHK ) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray( toCN ) \
        + '\n);\n\n$zh2SG = array(\n' \
        + PHPArray( toSG ) \
        + '\n);'

    # text mode plus an explicit encoding works with both codecs.open
    # (Python 2) and the built-in open (Python 3)
    f = open( 'ZhConversion.php', 'w', encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()
    # Remove temp files
    print( 'Deleting temp files ... ' )
    os.remove( 'EZ-Big.txt.in' )
    os.remove( 'phrase_lib.txt' )
    os.remove( 'tsi.src' )
    os.remove( 'Unihan_Variants.txt' )
    os.remove( 'Wubi.txt.in' )
    os.remove( 'Ziranma.txt.in' )

if __name__ == '__main__':
    main()
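Usage sketch (assuming the *.manual rule files sit alongside the script, as in includes/zhtable/): run `python Makefile.py`; the script fetches the four upstream archives into the working directory, builds the conversion tables, and writes ZhConversion.php next to them.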