PageRenderTime 32ms CodeModel.GetById 1ms app.highlight 26ms RepoModel.GetById 1ms app.codeStats 0ms

/python/engine/XingMa/XMCreateDB.py

http://scim-python.googlecode.com/
Python | 286 lines | 219 code | 33 blank | 34 comment | 55 complexity | f2c49672fefaa1953352bc29e216debd MD5 | raw file
  1#! /usr/bin/python
  2# vim: set noet ts=4:
  3#
  4# scim-python
  5#
  6# Copyright (c) 2007-2008 Yu Yuwei <acevery@gmail.com>
  7#
  8#
  9# This library is free software; you can redistribute it and/or
 10# modify it under the terms of the GNU Lesser General Public
 11# License as published by the Free Software Foundation; either
 12# version 2 of the License, or (at your option) any later version.
 13#
 14# This library is distributed in the hope that it will be useful,
 15# but WITHOUT ANY WARRANTY; without even the implied warranty of
 16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17# GNU Lesser General Public License for more details.
 18#
 19# You should have received a copy of the GNU Lesser General Public
 20# License along with this program; if not, write to the
 21# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 22# Boston, MA  02111-1307  USA
 23#
 24# $Id: $
 25#
 26
 27import os
 28import sys
 29sys.path.append( os.path.dirname(os.path.abspath(__file__)) )
 30import XMSQLiteDB
 31import bz2
 32import re
 33
 34from optparse import OptionParser
 35# we use OptionParser to parse the cmd arguments :)
 36opt_parser = OptionParser()
 37
 38opt_parser.add_option( '-n', '--name',
 39		action = 'store', dest='name',default = None,
 40		help = 'set the database name we will use, default is %default')
 41opt_parser.add_option( '-s', '--source',
 42		action = 'store', dest='source', default = 'xingma.txt.bz2',
 43		help = 'tell me which file is the source file of IME, default is %default')
 44
 45opt_parser.add_option( '-p', '--pinyin',
 46		action = 'store', dest='pinyin', default = '/usr/share/scim-python/data/pinyin_table.txt',
 47		help = 'tell me which file is the source file of pinyin, default is %default')
 48
 49opt_parser.add_option( '-o', '--no-create-index',
 50		action = 'store_false', dest='index', default = True,
 51		help = 'do not create index on database, only for distrubution purpose, normal user should not invoke this flag!')
 52
 53opt_parser.add_option( '-i', '--create-index-only',
 54		action = 'store_true', dest='only_index', default = False,
 55		help = 'only create index on exist database')
 56
 57opt_parser.add_option( '-d', '--debug',
 58		action = 'store_true', dest='debug', default = False,
 59		help = 'print extra debug messages')
 60
 61opt_parser.add_option( '-e', '--extra',
 62		action = 'store', dest='extra', default = '',
 63		help = 'tell me which file is the extra words file for IME, default is %default')
 64
 65opts,args = opt_parser.parse_args()
 66if not opts.name and opts.only_index:
 67	print 'Please give me the database you want to create index on'
 68	sys.exit(2)
 69
 70if not opts.name:
 71	opts.name = os.path.basename(opts.source).split('.')[0] + '.db'
 72
 73def main ():
 74	def debug_print ( message ):
 75		if opts.debug:
 76			print message
 77	
 78	if not opts.only_index:
 79		try:
 80			os.unlink (opts.name)
 81		except:
 82			pass
 83	
 84	debug_print ("Processing Database")
 85	db = XMSQLiteDB.XMSQLiteDB ( filename = opts.name)
 86	#db.db.execute( 'PRAGMA synchronous = FULL; ' )
 87	
	def parse_source (f):
		"""Sort the lines of an IME source into three lists.

		f is any iterable of lines. Lines starting with '###' and lines
		holding only spaces/tabs are skipped. Every other line goes into the
		first list whose pattern matches it, tried in this order:
		  attributes - the line contains '='
		  table      - the line contains at least two tabs
		  goucima    - the line contains at least one tab
		Lines matching none of them are dropped.

		When the source has no goucima lines, they are derived from the
		table: for every table line whose second field is exactly three
		characters in the range \\x00-\\xff (presumably the three UTF-8 bytes
		of one character -- TODO confirm), the longest first field seen for
		that second field is kept, and 'second\\tfirst' strings are returned
		in sorted order.

		Returns the tuple (attribute_lines, table_lines, goucima_lines).
		"""
		_attri = []
		_table = []
		_gouci = []
		patt_com = re.compile(r'^###.*')
		patt_blank = re.compile(r'^[ \t]*$')
		patt_conf = re.compile(r'.*=.*')
		patt_table = re.compile(r'(.*)\t(.*)\t.*')
		patt_gouci = re.compile(r'.*\t.*')
		patt_s = re.compile(r'(.*)\t([\x00-\xff]{3})\t.*')

		for l in f:
			if patt_com.match(l) or patt_blank.match(l):
				continue
			# order matters: the first matching pattern claims the line
			for _patt, _list in ( (patt_conf,_attri),(patt_table,_table),(patt_gouci,_gouci) ):
				if _patt.match(l):
					_list.append(l)
					break
		if not _gouci:
			#user didn't provide goucima, so we use the longest single character encode as the goucima.
			gouci_dict = {}
			for line in _table:
				res = patt_s.match(line)
				if not res:
					continue
				code, zi = res.group(1), res.group(2)
				# strictly longer wins, so on a tie the first code seen is kept
				if zi not in gouci_dict or len(code) > len(gouci_dict[zi]):
					gouci_dict[zi] = code
			_gouci = sorted('%s\t%s' % (zi, code) for zi, code in gouci_dict.items())

		return (_attri, _table, _gouci)
121
122	def parse_pinyin (f):
123		_pinyins = []
124		patt_com = re.compile(r'^#.*')
125		patt_blank = re.compile(r'^[ \t]*$')
126		patt_py = re.compile(r'(.*)\t(.*)\t.*')
127
128		for l in f:
129			if ( not patt_com.match(l) ) and ( not patt_blank.match(l) ):
130				if patt_py.match(l):
131					_pinyins.append(l)
132		return _pinyins[:]
133
134	def parse_extra (f):
135		_extra = []
136		patt_com = re.compile(r'^###.*')
137		patt_blank = re.compile(r'^[ \t]*$')
138		patt_extra = re.compile(r'(.*)\t(.*)')
139		patt_s = re.compile(r'(.*)\t([\x00-\xff]{3})\t.*')
140		
141		for l in f:
142			if ( not patt_com.match(l) ) and ( not patt_blank.match(l) ):
143				if patt_extra.match(l):
144					_extra.append(l)
145		
146		return _extra
147	
148	def pinyin_parser (f):
149		for py in f:
150			_zi, _pinyin, _freq = unicode (py,'utf-8').strip ().split()
151			yield (_pinyin, _zi, _freq)
152
153	def phrase_parser (f):
154		list=[]
155		for l in f:
156			xingma, phrase, freq = unicode (l, "utf-8").strip ().split ('\t')
157			list.append ( (xingma, phrase, int(freq), 0) )
158		return list
159
160	def goucima_parser (f):
161		for l in f:
162			zi,gcm = unicode (l, "utf-8").strip ().split ()
163			yield (zi, gcm)
164	
165	def attribute_parser (f):
166		for l in f:
167			try:
168				attr,val = unicode (l,"utf-8").strip().split ('=')
169			except:
170				attr,val = unicode (l,"utf-8").strip().split ('==')
171
172			attr = attr.strip().lower()
173			val = val.strip()
174			yield (attr,val)
175	
176	def extra_parser (f):
177		list = []
178		for l in f:
179			phrase, freq = unicode (l, "utf-8").strip ().split ()
180			try:
181				_key = db.parse_phrase_to_xm(phrase)
182				list.append( (_key,phrase,freq,0) )
183			except:
184				print '\"%s\" would not been added' % phrase.encode('utf-8')
185		return list
186
187	if opts.only_index:
188		debug_print ('Only create Indexes')
189		debug_print ( "Optimizing database " )
190		db.optimize_database ()
191	
192		debug_print ('Create Indexes ')
193		db.create_indexes ('main')
194		debug_print ('Done! :D')
195		return 0
196
197	# now we parse the ime source file
198	debug_print ("\tLoad sources %s" % opts.source)
199	patt_s = re.compile( r'.*\.bz2' )
200	_bz2s = patt_s.match(opts.source)
201	if _bz2s:
202		source = bz2.BZ2File ( opts.source, "r" )
203	else:
204		source = file ( opts.source, 'r' )
205	# first get config line and table line and goucima line respectively
206	debug_print ('\tParsing xingma source file ')
207	attri,table,gouci =  parse_source ( source )
208	
209	debug_print ('\t  get attribute of IME :)')
210	attributes = attribute_parser ( attri )
211	debug_print ('\t  add attributes into DB ')
212	db.update_ime ( attributes )
213	db.create_tables ('main')
214
215	# second, we use generators for database generating:
216	debug_print ('\t  get phrases of IME :)')
217	phrases = phrase_parser ( table)
218	
219	# now we add things into db
220	debug_print ('\t  add phrases into DB ')
221	db.add_phrases ( phrases )
222	
223	if db.get_ime_property ('user_can_define_phrase').lower() == u'true':
224		debug_print ('\t  get goucima of IME :)')
225		goucima = goucima_parser (gouci)
226		debug_print ('\t  add goucima into DB ')
227		db.add_goucima ( goucima )
228	
229	if db.get_ime_property ('pinyin_mode').lower() == u'true':
230		debug_print ('\tLoad pinyin source %s' % opts.pinyin)
231		_bz2p = patt_s.match(opts.pinyin)
232		if _bz2p:
233			pinyin_s = bz2.BZ2File ( opts.pinyin, "r" )
234		else:
235			pinyin_s = file ( opts.pinyin, 'r' )
236		debug_print ('\tParsing pinyin source file ')
237		pyline = parse_pinyin (pinyin_s)
238		debug_print ('\tParsing pinyin source file')
239		pinyin = pinyin_parser (pyline)
240		debug_print ('\t  add pinyin into DB ')
241		db.add_pinyin ( pinyin )
242
243	debug_print ("Optimizing database ")
244	db.optimize_database ()
245	
246	if db.get_ime_property ('user_can_define_phrase').lower() == u'true' and opts.extra:
247		debug_print( '\tPreparing for adding extra words' )
248		db.create_indexes ('main')
249		debug_print ('\tLoad extra words source \"%s\"' % opts.extra)
250		_bz2p = patt_s.match(opts.extra)
251		if _bz2p:
252			extra_s = bz2.BZ2File ( opts.extra, "r" )
253		else:
254			extra_s = file ( opts.extra, 'r' )
255		debug_print ('\tParsing extra words source file ')
256		extraline = parse_extra (extra_s)
257		debug_print ('\tPreparing extra words lines')
258		db.cache_goucima()
259		extrawds = extra_parser (extraline)
260		debug_print( '\t  we have %d extra phrases from source' % len(extrawds))
261		# first get the entry of original phrases from
262		# phrases-[(xingma, phrase, int(freq), 0)]
263		orig_phrases = {}
264		map (lambda x: orig_phrases.update({"%s\t%s"%(x[0],x[1]):x}), phrases )
265		debug_print( '\t  the len of orig_phrases is: %d' % len(orig_phrases) )
266		extra_phrases = {}
267		map (lambda x: extra_phrases.update({"%s\t%s" %(x[0],x[1]):x}), extrawds )
268		debug_print ( '\t  the len of extra_phrases is: %d' % len(extra_phrases) )
269		# pop duplicated keys
270		map (lambda x: extra_phrases.pop(x) if orig_phrases.has_key(x) else 0, extra_phrases.keys() )
271		debug_print( '\t  %d extra phrases will be added' % len(extra_phrases))
272		new_phrases = extra_phrases.values()
273		debug_print ('\tAdding extra words into DB ')
274		db.add_phrases (new_phrases)
275		debug_print ("Optimizing database ")
276		db.optimize_database ()
277	
278	if opts.index:
279		debug_print ('Create Indexes ')
280		db.create_indexes ('main')
281	else:
282		debug_print ("We don't create index on database, you should only active this function only for distribution purpose")
283	debug_print ('Done! :D')
284	
if __name__ == "__main__":
	# main() returns 0 on the index-only path and None otherwise; sys.exit
	# maps both to a successful exit status.
	sys.exit (main ())