PageRenderTime 92ms CodeModel.GetById 3ms RepoModel.GetById 0ms app.codeStats 0ms

/python/engine/XingMa/XMCreateDB.py

http://scim-python.googlecode.com/
Python | 286 lines | 219 code | 33 blank | 34 comment | 44 complexity | f2c49672fefaa1953352bc29e216debd MD5 | raw file
  1. #! /usr/bin/python
  2. # vim: set noet ts=4:
  3. #
  4. # scim-python
  5. #
  6. # Copyright (c) 2007-2008 Yu Yuwei <acevery@gmail.com>
  7. #
  8. #
  9. # This library is free software; you can redistribute it and/or
  10. # modify it under the terms of the GNU Lesser General Public
  11. # License as published by the Free Software Foundation; either
  12. # version 2 of the License, or (at your option) any later version.
  13. #
  14. # This library is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU Lesser General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Lesser General Public
  20. # License along with this program; if not, write to the
  21. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  22. # Boston, MA 02111-1307 USA
  23. #
  24. # $Id: $
  25. #
  26. import os
  27. import sys
  28. sys.path.append( os.path.dirname(os.path.abspath(__file__)) )
  29. import XMSQLiteDB
  30. import bz2
  31. import re
  32. from optparse import OptionParser
  33. # we use OptionParser to parse the cmd arguments :)
  34. opt_parser = OptionParser()
  35. opt_parser.add_option( '-n', '--name',
  36. action = 'store', dest='name',default = None,
  37. help = 'set the database name we will use, default is %default')
  38. opt_parser.add_option( '-s', '--source',
  39. action = 'store', dest='source', default = 'xingma.txt.bz2',
  40. help = 'tell me which file is the source file of IME, default is %default')
  41. opt_parser.add_option( '-p', '--pinyin',
  42. action = 'store', dest='pinyin', default = '/usr/share/scim-python/data/pinyin_table.txt',
  43. help = 'tell me which file is the source file of pinyin, default is %default')
  44. opt_parser.add_option( '-o', '--no-create-index',
  45. action = 'store_false', dest='index', default = True,
  46. help = 'do not create index on database, only for distrubution purpose, normal user should not invoke this flag!')
  47. opt_parser.add_option( '-i', '--create-index-only',
  48. action = 'store_true', dest='only_index', default = False,
  49. help = 'only create index on exist database')
  50. opt_parser.add_option( '-d', '--debug',
  51. action = 'store_true', dest='debug', default = False,
  52. help = 'print extra debug messages')
  53. opt_parser.add_option( '-e', '--extra',
  54. action = 'store', dest='extra', default = '',
  55. help = 'tell me which file is the extra words file for IME, default is %default')
  56. opts,args = opt_parser.parse_args()
  57. if not opts.name and opts.only_index:
  58. print 'Please give me the database you want to create index on'
  59. sys.exit(2)
  60. if not opts.name:
  61. opts.name = os.path.basename(opts.source).split('.')[0] + '.db'
  62. def main ():
  63. def debug_print ( message ):
  64. if opts.debug:
  65. print message
  66. if not opts.only_index:
  67. try:
  68. os.unlink (opts.name)
  69. except:
  70. pass
  71. debug_print ("Processing Database")
  72. db = XMSQLiteDB.XMSQLiteDB ( filename = opts.name)
  73. #db.db.execute( 'PRAGMA synchronous = FULL; ' )
  74. def parse_source (f):
  75. _attri = []
  76. _table = []
  77. _gouci = []
  78. patt_com = re.compile(r'^###.*')
  79. patt_blank = re.compile(r'^[ \t]*$')
  80. patt_conf = re.compile(r'.*=.*')
  81. patt_table = re.compile(r'(.*)\t(.*)\t.*')
  82. patt_gouci = re.compile(r'.*\t.*')
  83. patt_s = re.compile(r'(.*)\t([\x00-\xff]{3})\t.*')
  84. for l in f:
  85. if ( not patt_com.match(l) ) and ( not patt_blank.match(l) ):
  86. for _patt, _list in ( (patt_conf,_attri),(patt_table,_table),(patt_gouci,_gouci) ):
  87. if _patt.match(l):
  88. _list.append(l)
  89. break
  90. if not _gouci:
  91. #user didn't provide goucima, so we use the longest single character encode as the goucima.
  92. gouci_dict = {}
  93. for line in _table:
  94. res = patt_s.match(line)
  95. if res:
  96. if gouci_dict.has_key(res.group(2)):
  97. if len(res.group(1)) > len(gouci_dict[res.group(2)]):
  98. gouci_dict[res.group(2)] = res.group(1)
  99. else:
  100. gouci_dict[res.group(2)] = res.group(1)
  101. for key in gouci_dict:
  102. _gouci.append('%s\t%s' %(key,gouci_dict[key] ) )
  103. _gouci.sort()
  104. return (_attri, _table, _gouci)
  105. def parse_pinyin (f):
  106. _pinyins = []
  107. patt_com = re.compile(r'^#.*')
  108. patt_blank = re.compile(r'^[ \t]*$')
  109. patt_py = re.compile(r'(.*)\t(.*)\t.*')
  110. for l in f:
  111. if ( not patt_com.match(l) ) and ( not patt_blank.match(l) ):
  112. if patt_py.match(l):
  113. _pinyins.append(l)
  114. return _pinyins[:]
  115. def parse_extra (f):
  116. _extra = []
  117. patt_com = re.compile(r'^###.*')
  118. patt_blank = re.compile(r'^[ \t]*$')
  119. patt_extra = re.compile(r'(.*)\t(.*)')
  120. patt_s = re.compile(r'(.*)\t([\x00-\xff]{3})\t.*')
  121. for l in f:
  122. if ( not patt_com.match(l) ) and ( not patt_blank.match(l) ):
  123. if patt_extra.match(l):
  124. _extra.append(l)
  125. return _extra
  126. def pinyin_parser (f):
  127. for py in f:
  128. _zi, _pinyin, _freq = unicode (py,'utf-8').strip ().split()
  129. yield (_pinyin, _zi, _freq)
  130. def phrase_parser (f):
  131. list=[]
  132. for l in f:
  133. xingma, phrase, freq = unicode (l, "utf-8").strip ().split ('\t')
  134. list.append ( (xingma, phrase, int(freq), 0) )
  135. return list
  136. def goucima_parser (f):
  137. for l in f:
  138. zi,gcm = unicode (l, "utf-8").strip ().split ()
  139. yield (zi, gcm)
  140. def attribute_parser (f):
  141. for l in f:
  142. try:
  143. attr,val = unicode (l,"utf-8").strip().split ('=')
  144. except:
  145. attr,val = unicode (l,"utf-8").strip().split ('==')
  146. attr = attr.strip().lower()
  147. val = val.strip()
  148. yield (attr,val)
  149. def extra_parser (f):
  150. list = []
  151. for l in f:
  152. phrase, freq = unicode (l, "utf-8").strip ().split ()
  153. try:
  154. _key = db.parse_phrase_to_xm(phrase)
  155. list.append( (_key,phrase,freq,0) )
  156. except:
  157. print '\"%s\" would not been added' % phrase.encode('utf-8')
  158. return list
  159. if opts.only_index:
  160. debug_print ('Only create Indexes')
  161. debug_print ( "Optimizing database " )
  162. db.optimize_database ()
  163. debug_print ('Create Indexes ')
  164. db.create_indexes ('main')
  165. debug_print ('Done! :D')
  166. return 0
  167. # now we parse the ime source file
  168. debug_print ("\tLoad sources %s" % opts.source)
  169. patt_s = re.compile( r'.*\.bz2' )
  170. _bz2s = patt_s.match(opts.source)
  171. if _bz2s:
  172. source = bz2.BZ2File ( opts.source, "r" )
  173. else:
  174. source = file ( opts.source, 'r' )
  175. # first get config line and table line and goucima line respectively
  176. debug_print ('\tParsing xingma source file ')
  177. attri,table,gouci = parse_source ( source )
  178. debug_print ('\t get attribute of IME :)')
  179. attributes = attribute_parser ( attri )
  180. debug_print ('\t add attributes into DB ')
  181. db.update_ime ( attributes )
  182. db.create_tables ('main')
  183. # second, we use generators for database generating:
  184. debug_print ('\t get phrases of IME :)')
  185. phrases = phrase_parser ( table)
  186. # now we add things into db
  187. debug_print ('\t add phrases into DB ')
  188. db.add_phrases ( phrases )
  189. if db.get_ime_property ('user_can_define_phrase').lower() == u'true':
  190. debug_print ('\t get goucima of IME :)')
  191. goucima = goucima_parser (gouci)
  192. debug_print ('\t add goucima into DB ')
  193. db.add_goucima ( goucima )
  194. if db.get_ime_property ('pinyin_mode').lower() == u'true':
  195. debug_print ('\tLoad pinyin source %s' % opts.pinyin)
  196. _bz2p = patt_s.match(opts.pinyin)
  197. if _bz2p:
  198. pinyin_s = bz2.BZ2File ( opts.pinyin, "r" )
  199. else:
  200. pinyin_s = file ( opts.pinyin, 'r' )
  201. debug_print ('\tParsing pinyin source file ')
  202. pyline = parse_pinyin (pinyin_s)
  203. debug_print ('\tParsing pinyin source file')
  204. pinyin = pinyin_parser (pyline)
  205. debug_print ('\t add pinyin into DB ')
  206. db.add_pinyin ( pinyin )
  207. debug_print ("Optimizing database ")
  208. db.optimize_database ()
  209. if db.get_ime_property ('user_can_define_phrase').lower() == u'true' and opts.extra:
  210. debug_print( '\tPreparing for adding extra words' )
  211. db.create_indexes ('main')
  212. debug_print ('\tLoad extra words source \"%s\"' % opts.extra)
  213. _bz2p = patt_s.match(opts.extra)
  214. if _bz2p:
  215. extra_s = bz2.BZ2File ( opts.extra, "r" )
  216. else:
  217. extra_s = file ( opts.extra, 'r' )
  218. debug_print ('\tParsing extra words source file ')
  219. extraline = parse_extra (extra_s)
  220. debug_print ('\tPreparing extra words lines')
  221. db.cache_goucima()
  222. extrawds = extra_parser (extraline)
  223. debug_print( '\t we have %d extra phrases from source' % len(extrawds))
  224. # first get the entry of original phrases from
  225. # phrases-[(xingma, phrase, int(freq), 0)]
  226. orig_phrases = {}
  227. map (lambda x: orig_phrases.update({"%s\t%s"%(x[0],x[1]):x}), phrases )
  228. debug_print( '\t the len of orig_phrases is: %d' % len(orig_phrases) )
  229. extra_phrases = {}
  230. map (lambda x: extra_phrases.update({"%s\t%s" %(x[0],x[1]):x}), extrawds )
  231. debug_print ( '\t the len of extra_phrases is: %d' % len(extra_phrases) )
  232. # pop duplicated keys
  233. map (lambda x: extra_phrases.pop(x) if orig_phrases.has_key(x) else 0, extra_phrases.keys() )
  234. debug_print( '\t %d extra phrases will be added' % len(extra_phrases))
  235. new_phrases = extra_phrases.values()
  236. debug_print ('\tAdding extra words into DB ')
  237. db.add_phrases (new_phrases)
  238. debug_print ("Optimizing database ")
  239. db.optimize_database ()
  240. if opts.index:
  241. debug_print ('Create Indexes ')
  242. db.create_indexes ('main')
  243. else:
  244. debug_print ("We don't create index on database, you should only active this function only for distribution purpose")
  245. debug_print ('Done! :D')
  246. if __name__ == "__main__":
  247. main ()