XMSQLiteDB.py - This Python code is part of a natural langu…

/python/engine/XingMa/XMSQLiteDB.py

http://scim-python.googlecode.com/ · Python · 914 lines · 708 code · 48 blank · 158 comment · 95 complexity · f7a859d700e8564df830669c483322d7 MD5 · raw file

# -*- coding: utf-8 -*-
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2008-2008 Yu Yuwei <acevery@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#

import os
import os.path as path
import sqlite3
import XMDict
import uuid
import time
import re

patt_r = re.compile(r'c([ea])(\d):(.*)')
patt_p = re.compile(r'p(-{0,1}\d)(\d)')

# first make some number index we will used :)
#(MLEN, CLEN, M0, M1, M2, M3, M4, PHRASE, FREQ, USER_FREQ) = range (0,10)


class XMSQLiteDB:
	'''Phrase database for XingMa'''
	def __init__(self, name = 'xm.db', user_db = None, filename = None ):
		# first we use the Parse in XMDict, which transform the char(a,b,c,...) to int(1,2,3,...) to fasten the sql enquiry
		self.parse = XMDict.Parse
		self.deparse = XMDict.Deparse
		if filename:
			self.db = sqlite3.connect( filename )
		else:
			try:
				os.system('cat %s > /dev/null' % name)
			except:
				pass
			# open system phrase db
			self.db = sqlite3.connect(  name )
		try:
			self.db.execute( 'PRAGMA page_size = 4096; ' )
			self.db.execute( 'PRAGMA cache_size = 40000; ' )
			# increase the cache size to speedup sqlite enquiry
			self.db.execute( 'PRAGMA temp_store = MEMORY; ' )
			self.db.execute( 'PRAGMA synchronous = OFF; ' )
		except:
			print 'encountering error when init db'
			pass
		# create IME property table
		sqlstr = 'CREATE TABLE IF NOT EXISTS main.ime (attr TEXT, val TEXT);' 
		self.db.executescript( sqlstr )
		# make sure we have values in ime table.
		if not self.db.execute('SELECT * FROM main.ime;').fetchall():
			ime_keys={'name':'',
					  'name.zh_cn':'',
					  'name.zh_hk':'',
					  'name.zh_tw':'',
					  'author':'somebody', 
					  'uuid':'%s' % uuid.uuid4(),
					  'serial_number':'%s' % time.strftime('%Y%m%d'),
					  'icon':'/usr/share/scim/icons/scim-python.png',
					  'credit':'GPL',
					  'languages':'zh',
					  'valid_input_chars':'abcdefghijklmnopqrstuvwxyz',
					  'max_key_length':'4',
			#		  'commit_keys':'space',
			#		  'forward_keys':'Return',
			#		  'select_keys':'1,2,3,4,5,6,7,8,9,0',
			#		  'page_up_keys':'Page_Up,minus',
			#		  'page_down_keys':'Page_Down,equal',
					  'status_prompt':'CN',
					  'def_full_width_punct':'TRUE',
					  'def_full_width_letter':'FALSE',
					  'user_can_define_phrase':'FALSE',
					  'pinyin_mode':'FALSE',
					  'dynamic_adjust':'FALSE',
					  'no_check_chars':u'',
					  'rules':''}
					  #'rules':'ce2:p11+p12+p21+p22;ce3:p11+p21+p22+p31;ca4:p11+p21+p31+p41'}
			# inital the attribute in ime table, which should be updated from mabiao
			for _name in ime_keys:
				sqlstr = 'INSERT INTO main.ime (attr,val) VALUES (?,?);'
				self.db.execute( sqlstr, (_name,ime_keys[_name]) )
		# share variables in this class:
		self._mlen = int ( self.get_ime_property ("max_key_length") )
		#(MLEN, CLEN, M0, M1, M2, M3, M4, PHRASE, FREQ, USER_FREQ) = range (0,10)
		self._pt_index = ['mlen','clen']
		for i in range(self._mlen):
			self._pt_index.append ('m%d' %i)
		self._pt_index += ['phrase','freq','user_freq']
		self.user_can_define_phrase = self.get_ime_property('user_can_define_phrase')
		if self.user_can_define_phrase:
			if self.user_can_define_phrase.lower() == u'true' :
				self.user_can_define_phrase = True
				# cache goucima to memory to speedup gouci
				try:
					self.cache_goucima()
				except:
					pass
			else:
				self.user_can_define_phrase = False
		else:
			print 'Could not find "user_can_define_phrase" entry from database, is it a outdated database?'
			self.user_can_define_phrase = False
		
		self.dynamic_adjust = self.get_ime_property('dynamic_adjust')
		if self.dynamic_adjust:
			if self.dynamic_adjust.lower() == u'true' :
				self.dynamic_adjust = True
			else:
				self.dynamic_adjust = False
		else:
			print 'Could not find "dynamic_adjust" entry from database, is it a outdated database?'
			self.dynamic_adjust = False

		self._no_check_chars = self.get_no_check_chars()	
		self.rules = self.get_rules ()
		self._goucima = {}
		
		if filename:
			return
		
		# user database:
		if user_db != None:
			home_path = os.getenv ("HOME")
			xingma_path = path.join (home_path, ".scim", "scim-python", "xingma")
			user_db = path.join (xingma_path, user_db)
			if not path.isdir (xingma_path):
				os.makedirs (xingma_path)
			try:
				desc = self.get_database_desc (user_db)
				if desc == None :
					self.init_user_db (user_db)
				elif desc["version"] != "0.1":
					new_name = "%s.%d" %(user_db, os.getpid())
					print >> sys.stderr, "Can not support the user db. We will rename it to %s" % new_name
					os.rename (user_db, new_name)
					self.init_user_db (user_db)
			except:
				import traceback
				traceback.print_exc()
		else:
			user_db = ":memory:"
		
		# open user phrase database
		try:
			self.db.execute ('ATTACH DATABASE "%s" AS user_db;' % user_db)
		except:
			print >> sys.stderr, "The user database was damaged. We will recreate it!"
			os.rename (user_db, "%s.%d" % (user_db, os.getpid ()))
			self.init_user_db (user_db)
			self.db.execute ('ATTACH DATABASE "%s" AS user_db;' % user_db)

		# try create all tables in user database
		self.create_tables ("user_db")
		self.create_indexes ("user_db")
		self.generate_userdb_desc ()
		
		# attach mudb for working process
		mudb = ":memory:"  
		self.db.execute ('ATTACH DATABASE "%s" AS mudb;' % mudb )
		self.create_tables ("mudb")
	
	def update_phrase (self, entry, database='user_db'):
		'''update phrase freqs'''
		#print entry
		_con = [ entry[-1] ] + list(entry[0:2+entry[0]]) + [entry[-3]]
		#print _con
		_condition = u''.join( map(lambda x: 'AND m%d = ? ' % x, range(entry[0]) )	)
		#print _condition
		sqlstr = 'UPDATE %s.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND phrase = ?;' % (database, _condition)
		#print sqlstr
		self.db.execute ( sqlstr , _con )
		# because we may update different db, we'd better commit every time.
		self.db.commit()

	def sync_usrdb (self):
		# we need to update the user_db
		#print 'sync userdb'
		mudata = self.db.execute ('SELECT * FROM mudb.phrases;').fetchall()
		data_u = filter ( lambda x: x[-2] in [1,-3], mudata)
		data_a = filter ( lambda x: x[-2]==2, mudata)
		data_n = filter ( lambda x: x[-2]==-2, mudata)
		#print data_a
		data_a = map (lambda x: (u''.join ( map(self.deparse, x[2:2+x[0]])),x[-3],0,x[-1] ), data_a)
		data_n = map (lambda x: (u''.join ( map(self.deparse, x[2:2+x[0]])),x[-3],-1,x[-1] ), data_n)
		#print data_u
		map (self.update_phrase, data_u)
		#print self.db.execute('select * from user_db.phrases;').fetchall()
		map (self.u_add_phrase,data_a)
		map (self.u_add_phrase,data_n)

	def create_tables (self, database):
		'''Create tables that contain all phrase'''

		try:
			self.db.execute( 'PRAGMA cache_size = 20000; ' )
			# increase the cache size to speedup sqlite enquiry
		except:
			pass
		if database == 'main':
			# create  ikeys table
			sqlstr = 'CREATE TABLE IF NOT EXISTS %s.ikeys (ikey TEXT PRIMARY KEY, id INTEGER);' % database
			self.db.execute ( sqlstr )
		
			# create goucima table, this table is used in construct new phrases
			sqlstr = 'CREATE TABLE IF NOT EXISTS %s.goucima (zi TEXT PRIMARY KEY' % database
			#for i in range(self._mlen):
			#	sqlstr += ', g%d INTEGER' % i 
			sqlstr += ''.join(map (lambda x: ', g%d INTEGER' % x, range(self._mlen)) )
			sqlstr += ');'
			self.db.execute ( sqlstr )

			# create pinyin table, this table is used in search single character for user handly
			sqlstr = 'CREATE TABLE IF NOT EXISTS %s.pinyin ( plen INTEGER, ' % database
			#for i in range(6):
			#	sqlstr += 'p%d INTEGER, ' % i 
			sqlstr += ''.join( map (lambda x: 'p%d INTEGER, ' % x, range(6) ) )
			sqlstr += 'zi TEXT, freq INTEGER);'
			self.db.execute ( sqlstr )

		# create phrase table (mabiao)
		sqlstr = 'CREATE TABLE IF NOT EXISTS %s.phrases ( mlen INTEGER, clen INTEGER, ' % database
		#for i in range(self._mlen):
		#	sqlstr += 'm%d INTEGER, ' % i 
		sqlstr += ''.join ( map (lambda x: 'm%d INTEGER, ' % x, range(self._mlen)) )
		sqlstr += 'phrase TEXT, freq INTEGER, user_freq INTEGER);'
		self.db.execute ( sqlstr )
		self.db.commit()
	
	def update_ime (self, attrs):
		'''Update attributes in ime table, attrs is a iterable object
		Like [(attr,val), (attr,val), ...]
		'''
		sqlstr = 'UPDATE main.ime SET val = ? WHERE attr = ?;' 
		for attr,val in attrs:
			_sqlstr = 'SELECT * from main.ime WHERE attr = ?' 
			res = self.db.execute( _sqlstr, (attr,) ).fetchall()
			if res:
				self.db.execute(sqlstr,(val,attr))
			else:
				#print '"',attr,'"'," didn't in ime property now!"
				pass
		# we need to update some self variables now.
		self._mlen = int (self.get_ime_property ('max_key_length' ))
		self._pt_index = ['mlen','clen']
		for i in range(self._mlen):
			self._pt_index.append ('m%d' %i)
		self._pt_index += ['phrase','freq','user_freq']
		self.user_can_define_phrase = self.get_ime_property('user_can_define_phrase')
		if self.user_can_define_phrase:
			if self.user_can_define_phrase.lower() == u'true' :
				self.user_can_define_phrase = True
			else:
				self.user_can_define_phrase = False
		else:
			print 'Could not find "user_can_define_phrase" entry from database, is it a outdated database?'
			self.user_can_define_phrase = False
		self.rules = self.get_rules ()

		self.db.commit()

	def get_rules (self):
		'''Get phrase construct rules'''
		rules={'above':4}
		if self.user_can_define_phrase:
			try:
				_rules = self.get_ime_property ('rules')
				if _rules:
					_rules = _rules.strip().split(';')
				for rule in _rules:
					res = patt_r.match (rule)
					if res:
						cms = []
						if res.group(1) == 'a':
							rules['above'] = int(res.group(2))
						_cms = res.group(3).split('+')
						if len(_cms) > int(self.get_ime_property('max_key_length')):
							print 'rule: "%s" over max key length' % rule
							break
						for _cm in _cms:
							cm_res = patt_p.match(_cm)
							cms.append(( int(cm_res.group(1)),int(cm_res.group(2)) ))
						rules[int(res.group(2))]=cms
					else:
						print 'not a legal rule: "%s"'  % rule 
			except Exception:
				import traceback
				traceback.print_exc ()
			return rules
		else:
			return ""

	def get_no_check_chars (self):
		'''Get the characters which engine should not change freq'''
		_chars= self.get_ime_property('no_check_chars')
		try:
			_chars = _chars.decode('utf-8')
		except:
			pass
		return _chars

	def add_phrases (self, phrases, database = 'main'):
		'''Add phrases to database, phrases is a iterable object
		Like: [(xingma, phrase, freq ,user_freq), (xingma, phrase, freq, user_freq), ...]
		'''
		if database == 'main':
			map (self.add_phrase, phrases)
		else:
			map (self.add_phrase, phrases, [database]*len(phrases),[False]*len(phrases) )
		self.db.commit()
	
	def add_new_phrases (self, nphrases, database='main'):
		'''Add new phrases into db, new phrases is a object
		of [(phrase,freq), (phrase,freq),...]'''
		n_phrases=[]
		for _ph, _freq in nphrases:
			try:
				_xm = self.parse_phrase_to_xm (_ph)
				if not self.check_phrase_internal (_ph, _xm, database):
					# we don't have this phrase
					n_phrases.append ( (_xm, _ph, _freq, 0) )
			except:
				print '\"%s\" would not been added' % _ph
		if n_phrases:
			self.add_phrases ( n_phrases, database )
	

	def u_add_phrase (self,nphrase):
		'''Add a phrase to userdb'''
		self.add_phrase (nphrase,database='user_db')

	def add_phrase (self, aphrase, database = 'main', commit=True):
		'''Add phrase to database, phrase is a object of
		(xingma, phrase, freq ,user_freq)
		'''
		sqlstr = 'INSERT INTO %s.phrases ( mlen, clen, '
		sql_suffix = 'VALUES ( ?, ?, '
		mmlen = range(self._mlen)
		sqlstr += ''.join ( map(lambda x: 'm%d, ' %x , mmlen) )
		sql_suffix += ''.join ( map (lambda x: '?, ' , mmlen) )   
		sqlstr += 'phrase, freq, user_freq) '
		sql_suffix += '?, ?, ? );'
		sqlstr += sql_suffix
		
		try:
			xingma,phrase,freq,user_freq = aphrase
		except:
			xingma,phrase,freq = aphrase
			user_freq = 0
		try:
			xm = self.parse(xingma)
			if len(xm) != len(xingma):
				print 'In %s %s: we parse xingma fail' % (phrase, xingma )
				return
			record = [None, None, None, None, None]
			map( lambda x: record.append(None), range(self._mlen))
			record [0] = len (xingma)
			record [1] = len (phrase)
			record [2: 2+len(xingma)] = map (lambda x: xm[x].get_key_id(), range(0,len(xingma)))
			record [2+self._mlen] = phrase
			record [2+self._mlen+1] = freq
			record [2+self._mlen+2] = user_freq
			self.db.execute (sqlstr % database, record)
			if commit:
				self.db.commit()	
		except Exception:
			import traceback

	def add_goucima (self, gcms):
		'''Add goucima into database, gcms is iterable object
		Like gcms = [(zi,goucima),(zi,goucima), ...]
		'''
		count = 1
		for zi,gcm in gcms:
			_con = ''
			_val = ''
			_len = min ( len(gcm),self._mlen)
			for i in range( _len ):
				_con += ', g%d' % i
				_val += ', ?' 
			sqlstr = '''INSERT INTO main.goucima ( zi %s )
			VALUES ( ? %s );''' % (_con, _val)
			try:
				gc = self.parse(gcm)
				if len(gc) != len(gcm):
					error_m = u'%s %s: Can not parse goucima' % (zi, gcm )
					raise Exception ( error_m.encode ('utf8') )
				record = [zi]
				for i in range(_len):
					record.append( gc[i].get_key_id())
				self.db.execute (sqlstr , record)
			
			except Exception:
				import traceback
				traceback.print_exc()
			count += 1
		self.db.commit()

	def add_pinyin (self, pinyins, database = 'main'):
		'''Add pinyin to database, pinyins is a iterable object
		Like: [(zi,pinyin, freq), (zi, pinyin, freq), ...]
		'''
		sqlstr = 'INSERT INTO %s.pinyin ( plen, '
		sql_suffix = 'VALUES ( ?, '
		for i in range(6):
			sqlstr += 'p%d, ' % i
			sql_suffix += '?, '
		sqlstr += 'zi, freq ) '
		sql_suffix += '?, ? );'
		sqlstr += sql_suffix
		
		count = 1
		for pinyin,zi,freq in pinyins:
			try:
				py = self.parse(pinyin)
				if len(py) != len(pinyin):
					error_m = u'%s %s: Can not parse pinyin' % (zi, pinyin )
					raise Exception ( error_m.encode ('utf8') )
				record = [None, None, None, None, None, None, None, None, None]
				record [0] = len (pinyin)
				for i in range(0,len(pinyin)):
					record [ 1+i ] = py[i].get_key_id()
				record [-2] = zi
				record [-1] = freq
				self.db.execute (sqlstr % database, record)
			except Exception:
				print count, ': ', zi.encode('utf8'), ' ', pinyin
				import traceback
				traceback.print_exc()
			count += 1

		self.db.commit()	
	
	def optimize_database (self, database='main'):
		sqlstr = '''
			CREATE TABLE tmp AS SELECT * FROM %(database)s.phrases;
			DELETE FROM %(database)s.phrases;
			INSERT INTO %(database)s.phrases SELECT * FROM tmp ORDER BY %(xmstr)s mlen ASC, freq DESC;
			DROP TABLE tmp;
			CREATE TABLE tmp AS SELECT * FROM %(database)s.goucima;
			DELETE FROM %(database)s.goucima;
			INSERT INTO %(database)s.goucima SELECT * FROM tmp ORDER BY zi,g0,g1;
			DROP TABLE tmp;
			CREATE TABLE tmp AS SELECT * FROM %(database)s.pinyin;
			DELETE FROM %(database)s.pinyin;
			INSERT INTO %(database)s.pinyin SELECT * FROM tmp ORDER BY p0,p1,p2,p3,p4,p5,plen ASC;
			DROP TABLE tmp;
			'''
		xmstr = ''
		for i in range(self._mlen):
			xmstr +='m%d, ' % i
		self.db.executescript (sqlstr % {'database':database,'xmstr':xmstr })
		self.db.executescript ("VACUUM;")
		self.db.commit()
	
	def drop_indexes(self, database):
		'''Drop the index in database to reduce it's size'''
		sqlstr = '''
			DROP INDEX IF EXISTS %(database)s.goucima_index_z;
			DROP INDEX IF EXISTS %(database)s.pinyin_index_i;
			DROP INDEX IF EXISTS %(database)s.phrases_index_p;
			DROP INDEX IF EXISTS %(database)s.phrases_index_i;
			VACUUM; 
			''' % { 'database':database }
		
		self.db.executescript (sqlstr)
		self.db.commit()
	
	def create_indexes(self, database):
		sqlstr = '''
			DROP INDEX IF EXISTS %(database)s.goucima_index_z;
			CREATE INDEX IF NOT EXISTS %(database)s.goucima_index_z ON goucima (zi);
			DROP INDEX IF EXISTS %(database)s.pinyin_index_i;
			CREATE INDEX IF NOT EXISTS %(database)s.pinyin_index_i ON pinyin (p0,p1,p2,p3,p4,p5,plen ASC, freq DESC);
			VACUUM; 
			''' % { 'database':database }

		sqlstr_t = '''
			DROP INDEX IF EXISTS %(database)s.phrases_index_p;
			CREATE INDEX IF NOT EXISTS %(database)s.phrases_index_p ON phrases (%(xmstr)s mlen ASC, freq DESC);
			DROP INDEX IF EXISTS %(database)s.phrases_index_i;
			CREATE INDEX IF NOT EXISTS %(database)s.phrases_index_i ON phrases (phrase, mlen ASC);
			''' 
		xmstr = ''
		for i in range(self._mlen):
			xmstr +='m%d,' % i
		if database == 'main':
			sqlstr = sqlstr_t % {'database':database,'xmstr':xmstr } + sqlstr
		else:
			sqlstr = sqlstr_t % {'database':database,'xmstr':xmstr }
		self.db.executescript (sqlstr)
		self.db.commit()
	
	def compare (self,x,y):
		return cmp (x[0],y[0]) or -(cmp (x[-1],y[-1])) or -(cmp (x[-2],y[-2]))

	def select_words( self, xms, onechar=False ):
		'''
		Get phrases from database by XingMa_Key objects
		( which should be equal or less than the max key length)
		This method is called in XingMa by passing UserInput held data
		Return result[:] 
		'''
		# firstly, we make sure the len we used is equal or less than the max key length
		_len = min( len(xms),self._mlen )
		_condition = ''
		_condition += ''.join ( map (lambda x: 'AND m%d = ? ' %x, range(_len) ) )
		if onechar:
			# for some users really like to select only single characters
			_condition += 'AND clen=1 '
		# you can increase the x in _len + x to include more result, but in the most case, we only need one more key result, so we don't need the extra overhead :)
		# we start search for 1 key more, if nothing, then 2 key more and so on
		# this is the max len we need to add into the select cause.
		w_len = self._mlen - _len +1
		# we start from 2, because it is < in the sqlite select, which need 1 more.
		x_len = 2
		while x_len <= w_len + 1:
			sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE mlen < %(mk)d  %(condition)s 
			UNION ALL
			SELECT * FROM user_db.phrases WHERE mlen < %(mk)d %(condition)s 
			UNION ALL
			SELECT * FROM mudb.phrases WHERE mlen < %(mk)d %(condition)s )
			ORDER BY mlen ASC, user_freq DESC, freq DESC;''' % { 'mk':_len+x_len, 'condition':_condition}
			# we have redefine the __int__(self) in class XMDict.XingMa_key to return the key id, so we can use map to got key id :)
			_xms = map(int,xms[:_len])
			_xms += _xms + _xms
			result = self.db.execute(sqlstr, _xms).fetchall()
			#self.db.commit()
			# if we find word, we stop this while, 
			if len(result) >0:
				break
			x_len += 1
		# here in order to get high speed, I use complicated map
		# to subtitute for
		sysdb={}
		usrdb={}
		mudb={}
		_cand = []
		#searchres = map ( lambda res: res[-2] and [ True, [(res[:-2],[res[:-1],res[-1:]])] ]\
		#		or [ False, [(res[:-2] , [res[:-1],res[-1:]])] ] \
		#		, result )
		searchres = map ( lambda res: [ int(res[-2]), int(res[-1]), [(res[:-2],[res[:-1],res[-1:]])] ], result)
		# for sysdb
		reslist=filter( lambda x: not x[1], searchres )
		map (lambda x: sysdb.update(x[2]), reslist)
		# for usrdb
		reslist=filter( lambda x: ( x[0] in [0,-1] ) and x[1], searchres )
		map (lambda x: usrdb.update(x[2]), reslist)
		# for mudb
		reslist=filter( lambda x: ( x[0] not in [0,-1] ) and x[1], searchres )
		map (lambda x: mudb.update(x[2]), reslist)

		# first process mudb
		searchres = map ( lambda key: mudb[key][0] + mudb[key][1], mudb )
		#print searchres
		map (_cand.append, searchres)

		# now process usrdb and sysdb
		searchres = map ( lambda key:  (not mudb.has_key(key))  and usrdb[key][0] + usrdb[key][1]\
				or None , usrdb )
		searchres = filter(lambda x: bool(x), searchres )
		#print searchres
		map (_cand.append, searchres)
		searchres = map ( lambda key: ((not mudb.has_key(key)) and (not usrdb.has_key(key)) )and sysdb[key][0] + sysdb[key][1]\
				or None, sysdb )
		searchres = filter (lambda x: bool(x), searchres)
		map (_cand.append, searchres)
		#for key in usrdb:
		#	if not sysdb.has_key (key):
		#		_cand.append( usrdb[key][0] + usrdb[key][1] )
		#	else:
		#		_cand.append( sysdb[key][0] + usrdb[key][1] )
		#for key in sysdb:
		#	if not usrdb.has_key (key):
		#		_cand.append( sysdb[key][0] + sysdb[key][1] )
		_cand.sort(cmp=self.compare)
		return _cand[:]

	def select_zi( self, xms ):
		'''
		Get zi from database by XingMa_Key objects
		( which should be equal or less than 6)
		This method is called in XingMa by passing UserInput held data
		Return  result[:] 
		'''
		# firstly, we make sure the len we used is equal or less than the max pinyin length 6
		_len = min( len(xms), 6 )
		_condition = ''
		#for i in range(_len):
		#	_condition += 'AND p%d = ? ' % i
		_condition += ''.join ( map (lambda x: 'AND p%d = ? ' %x, range(_len)) )
		# you can increase the x in _len + x to include more result, but in the most case, we only need one more key result, so we don't need the extra overhead :)
		sqlstr = '''SELECT * FROM main.pinyin WHERE plen < %(mk)d  %(condition)s 
		ORDER BY plen ASC, freq DESC;''' % { 'mk':_len+2, 'condition':_condition}
		# we have redefine the __int__(self) in class XMDict.XingMa_key to return the key id, so we can use map to got key id :)
		_xms = map(int,xms[:_len])
		result = self.db.execute(sqlstr, _xms).fetchall()
		#self.db.commit()
		return result[:]

	def get_ime_property( self, attr ):
		'''get IME property from database, attr is the string of property,
		which should be str.lower() :)
		'''
		sqlstr = 'SELECT val FROM main.ime WHERE attr = ?' 
		_result = self.db.execute( sqlstr, (attr,)).fetchall()
		#self.db.commit()
		if _result:
			return _result[0][0]
		else:
			return None

	def get_phrase_table_index (self):
		'''get a list of phrase table columns name'''
		return self._pt_index[:]

	def generate_userdb_desc (self):
		try:
			sqlstring = 'CREATE TABLE IF NOT EXISTS user_db.desc (name PRIMARY KEY, value);'
			self.db.executescript (sqlstring)
			sqlstring = 'INSERT OR IGNORE INTO user_db.desc  VALUES (?, ?);'
			self.db.execute (sqlstring, ('version', '0.1'))
			sqlstring = 'INSERT OR IGNORE INTO user_db.desc  VALUES (?, DATETIME("now", "localtime"));'
			self.db.execute (sqlstring, ("create-time", ))
			self.db.commit ()
		except:
			import traceback
			traceback.print_exc ()

	def init_user_db (self,db_file):
		if not path.exists (db_file):
			db = sqlite3.connect (db_file)
			db.execute('PRAGMA page_size = 4096;')
			db.execute( 'PRAGMA cache_size = 20000;' )
			db.execute( 'PRAGMA temp_store = MEMORY; ' )
			db.execute( 'PRAGMA synchronous = OFF; ' )
			db.commit()
	
	def get_database_desc (self, db_file):
		if not path.exists (db_file):
			return None
		try:
			db = sqlite3.connect (db_file)
			db.execute('PRAGMA page_size = 4096;')
			db.execute( 'PRAGMA cache_size = 20000;' )
			db.execute( 'PRAGMA temp_store = MEMORY; ' )
			db.execute( 'PRAGMA synchronous = OFF; ' )
			desc = {}
			for row in db.execute ("SELECT * FROM desc;").fetchall():
				desc [row[0]] = row[1]
			self.db.commit()
			return desc
		except:
			return None
	
	def cache_goucima (self):
		self._goucima = {}
		goucima = self.db.execute('SELECT * FROM main.goucima;').fetchall()
		map(lambda x: self._goucima.update({x[0]:x[1:]}), goucima)
	
	def get_gcm_id (self, zi):
		'''Get goucima of given character'''
		if self._goucima:
			# we already cache the goucima
			if not isinstance(zi,unicode):
				zi = zi.decode('utf-8')
			try:
				gcds = self._goucima[zi]
				return gcds
			except:
				pass

		sqlstr = 'SELECT %s FROM main.goucima WHERE zi =?;' % ','.join( map (lambda x: 'g%d' % x, range(self._mlen) ) )
		return self.db.execute(sqlstr,(zi,)).fetchall()[0]

	def parse_phrase (self, phrase):
		'''Parse phrase to get its XingMa code'''
		# first we make sure that we are parsing unicode string
		try:
			phrase = unicode(phrase)
		except:
			phrase = phrase.decode('utf8')
		p_len = len(phrase) 
		xmlist = []
		if p_len < 2:
			# phrase should not be shorter than 2
			return []
		try:
			if p_len >= self.rules['above']:
				rule = self.rules[ self.rules['above'] ]
			elif p_len in self.rules:
				rule = self.rules[p_len]
			else:
				raise Exception ('unsupport len of phrase')
			if len(rule) > self._mlen:
				raise Exception ('fault rule: %s' % rule)
			#for (zi,ma) in rule:
			#	if zi > 0:
			#		zi -= 1
			#	gcm = self.get_gcm_id (phrase[zi])
			#	xmlist.append(gcm[ma-1])
			xmlist = map (lambda x: self.get_gcm_id ( phrase[x[0]-1] )[ x[1]-1 ], rule )
			return [len( xmlist)] + [p_len]  + xmlist[:] + [phrase]

		except Exception:
			import traceback
			traceback.print_exc ()

	def parse_phrase_to_xm (self,phrase):
		'''Get the XingMa encoding of the phrase in string form'''
		xmres = self.parse_phrase (phrase) [2:-1]
		xms= u''.join ( map(self.deparse, xmres) )
		return xms

	def check_phrase (self,phrase,xmkey=None,database='main'):
		# if IME didn't support user define phrase,
		# we divide user input phrase into characters,
		# and then check its frequence
		if type(phrase) != type(u''):
			phrase = phrase.decode('utf8')
		if self.user_can_define_phrase:
			self.check_phrase_internal (phrase, xmkey,database)
		else:
			map(self.check_phrase_internal, phrase)
	
	def check_phrase_internal (self,phrase,xmkey=None,database='main'):
		'''Check word freq and user_freq
		'''
		if type(phrase) != type(u''):
			phrase = phrase.decode('utf8')
		try:
			if phrase in self._no_check_chars:
				# if the phrase is a single char, and in no_check_chars, we skip it.
				return
		except:
			print 'you are using old format of database, please regenerate your database.'
		if len(phrase) >=2:
			wordattr = self.parse_phrase ( phrase )
			_len = len (wordattr) -3
		if xmkey == None:
			sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE phrase = ?
			UNION ALL SELECT * FROM user_db.phrases WHERE phrase = ?
			UNION ALL SELECT * FROM mudb.phrases WHERE phrase = ?)
			ORDER BY user_freq DESC, freq DESC
			''' 
			result = self.db.execute(sqlstr, (phrase,phrase,phrase)).fetchall()
		else:
			# we are using this to check whether the tab-key and phrase is in db
			xms = self.parse (xmkey)
			xmkids = tuple( map(int,xms) )
			condition = ' and '.join( map(lambda x: 'm%d = ?' % x, range( len(xms) )) )
			sqlstr = '''SELECT * FROM %(database)s.phrases WHERE phrase = ? and %(cond)s;''' % {'database':database, 'cond':condition}
			result = self.db.execute(sqlstr, (phrase,)+xmkids ).fetchall()
			return bool(result)

		sysdb = {}
		usrdb = {}
		mudb = {}
		searchres = map ( lambda res: [ int(res[-2]), int(res[-1]), [(res[:-2],[res[:-1],res[-1]])] ], result)
		# for sysdb
		reslist=filter( lambda x: not x[1], searchres )
		map (lambda x: sysdb.update(x[2]), reslist)
		# for usrdb
		reslist=filter( lambda x: ( x[0] in [0,-1] ) and x[1], searchres )
		map (lambda x: usrdb.update(x[2]), reslist)
		# for mudb
		reslist=filter( lambda x: (x[0] not in [0,-1])  and x[1], searchres )
		map (lambda x: mudb.update(x[2]), reslist)
		
		xmkey = ''
		if len(phrase) >=2:
			xmkey = u''.join ( map(self.deparse,wordattr[2:2+_len]) )
			#for k in wordattr[2:2+_len]:
			#	xmkey += self.deparse (k)
		
		sqlstr = 'UPDATE mudb.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND phrase = ?;'
		
		try:
			if len(phrase) == 1:
				if not self.dynamic_adjust:
					# we should change the frequency of words
					return
				# this is a character
				# we remove the keys contained in mudb from usrdb
				keyout = filter (lambda k: mudb.has_key(k), usrdb.keys() )
				map (usrdb.pop, keyout)
				# we remove the keys contained in mudb and usrdb from sysdb
				keyout = filter (lambda k: mudb.has_key(k) or usrdb.has_key(k) , sysdb.keys() )
				map (sysdb.pop, keyout)
				# first mudb
				map (lambda res: self.db.execute ( sqlstr % ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) ) ,  [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:]) ) , mudb.keys())
				self.db.commit()
				# -----original for loop of above map: 
				#for res in mudb.keys ():
				#	_con = [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:])
				#	_condition = ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) )	
				#	self.db.execute ( sqlstr % _condition, _con )
				
				# then usrdb
				map ( lambda res: self.add_phrase ( (''.join ( map(self.deparse,res[2:2+int(res[0])] ) ),phrase, 1,usrdb[res][1]+1  ), database = 'mudb') , usrdb.keys() )				
				# -----original for loop of above map: 
				#for res in usrdb.keys ():
				#	#if mudb.has_key (res):
				#	#	continue
				#	xmkey = ''.join ( map(self.deparse,res[2:2+int(res[0])] ) )
				#	# here we use freq 1 to denote the phrase needed update in user_db
				#	self.add_phrase ((xmkey,phrase,1,usrdb[res][1]+1 ), database = 'mudb')
				# last sysdb
				map ( lambda res: self.add_phrase ( ( ''.join ( map(self.deparse,res[2:2+int(res[0])]) ),phrase,2,1 ), database = 'mudb'), sysdb.keys() )
				# -----original for loop of above map: 
				#for res in sysdb.keys ():
				#	xmkey = ''.join ( map(self.deparse,res[2:2+int(res[0])]) )
				#	# here we use freq 2 to denote the word needed addition to user_db
				#	self.add_phrase ((xmkey,phrase,2,1), database = 'mudb')
			else:
				# this is a phrase
				if len (result) == 0 and self.user_can_define_phrase:
					# this is a new phrase, we add it into user_db
					self.add_phrase ( (xmkey,phrase,-2,1), database = 'mudb')
				elif len (result) > 0:
					if not self.dynamic_adjust:
						# we should change the frequency of words
						return
					# we remove the keys contained in mudb from usrdb
					keyout = filter (lambda k: mudb.has_key(k), usrdb.keys() )
					map (usrdb.pop, keyout)
					# we remove the keys contained in mudb and usrdb from sysdb
					keyout = filter (lambda k: mudb.has_key(k) or usrdb.has_key(k) , sysdb.keys() )
					map (sysdb.pop, keyout)
					
					# first we process mudb
					# the original for loop can be found above in 'len==1'
					map (lambda res: self.db.execute ( sqlstr % ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) ) ,  [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:]) ) , mudb.keys())
					self.db.commit()
					# then usrdb
					map ( lambda res: self.add_phrase ( (''.join ( map(self.deparse,res[2:2+int(res[0])] ) ),phrase,(-3 if usrdb[res][0][-1] == -1 else 1),usrdb[res][1]+1  ), database = 'mudb') , usrdb.keys() )				
					#print self.db.execute('select * from mudb.phrases;').fetchall()
					# last sysdb
					map ( lambda res: self.add_phrase ( ( ''.join ( map(self.deparse,res[2:2+int(res[0])]) ),phrase,2,1 ), database = 'mudb'), sysdb.keys() )

				else:
					# we come to here when the ime dosen't support user phrase define
					pass
			
			#self.db.commit()
		except:
			import traceback
			traceback.print_exc ()

	def find_zi_code (self,zi):
		'''Check word freq and user_freq
		'''
		zi = zi.decode('utf8')
		sqlstr = '''SELECT * FROM main.phrases WHERE phrase = ?
		ORDER BY mlen ASC;
''' 
		result = self.db.execute(sqlstr, (zi,)).fetchall()
		#self.db.commit()
		codes = []
		try:
			if result:
				for _res in result:
					xmkey = u''
					for i in range ( int ( _res[0] ) ):
						xmkey += self.deparse ( _res[2+i] )
					codes.append(xmkey)
		except:
			import traceback
			traceback.print_exc ()
		return codes[:]

	def remove_phrase (self,phrase,database='user_db'):
		'''Remove phrase from database, default is from user_db
		phrase should be the a row of select * result from database
		Like (mlen,clen,m0,m1,m2,m3,phrase,freq,user_freq)
		'''
		_ph = list(phrase[:-2])
		_condition = ''	
		for i in range(_ph[0]):
			_condition += 'AND m%d = ? ' % i
		nn =_ph.count(None)
		if nn:
			for i in range(nn):
				_ph.remove(None)
		msqlstr= 'SELECT * FROM %(database)s.phrases WHERE mlen = ? and clen = ? %(condition)s AND phrase = ? ;' % { 'database':database, 'condition':_condition }
		if self.db.execute(msqlstr, _ph).fetchall():
			sqlstr = 'DELETE FROM %(database)s.phrases WHERE mlen = ? AND clen =? %(condition)s AND phrase = ?  ;' % { 'database':database, 'condition':_condition }
			self.db.execute(sqlstr,_ph)
			self.db.commit()

		msqlstr= 'SELECT * FROM mudb.phrases WHERE mlen = ? and clen = ? %(condition)s AND phrase = ? ;' % { 'condition':_condition }
		if self.db.execute(msqlstr, _ph).fetchall():
			sqlstr = 'DELETE FROM mudb.phrases WHERE mlen = ? AND clen =? %(condition)s AND phrase = ?  ;' % {  'condition':_condition }
			self.db.execute(sqlstr,_ph)
			self.db.commit()
Summary ✨

This Python code is part of a natural language processing system, specifically for managing and updating word frequencies and user interactions with phrases. It provides methods for adding, removing, and finding phrases in two databases: main.phrases and mudb.phrases. The code handles various edge cases, such as phrase definition, frequency updates, and database queries.
Tech Fingerprint

Alerts (53)

'sqlite3.connect(' Use 'with sqlite3.connect()' for automatic connection closure
49 56
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
53 63 115 156 165 216 316 340 644 671 688 699 753 864 885
'== None' Use 'is' for None comparisons (e.g., x is None)
149 758
'list(' Avoid unnecessary list conversions; use generators where possible
184 809 813 851 895
'def' Ensure functions have docstrings for documentation
194 512 635 648 657 674 733
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
198 557 560 565 568 572 577 582 777 782 785 818 827 854 857
'try:' Ensure try blocks have corresponding except or finally blocks
284 796
'except Exception:' Catch specific exceptions instead of Exception to avoid masking bugs
304 723
'isinstance(' Overuse may indicate design issues; consider polymorphism
683
'type(' Use isinstance() for type checking instead of type()
737 747