PageRenderTime 104ms CodeModel.GetById 40ms app.highlight 24ms RepoModel.GetById 36ms app.codeStats 0ms

/python/engine/PinYin/ZhengJuDB.py

http://scim-python.googlecode.com/
Python | 231 lines | 205 code | 1 blank | 25 comment | 0 complexity | c91691f4511c7b2c20e1e376199f185d MD5 | raw file
  1# -*- coding: utf-8 -*-
  2# vim: set noet ts=4:
  3#
  4# scim-python
  5#
  6# Copyright (c) 2007-2008 Yu Fan <yufanyufan@gmail.com>
  7#
  8#
  9# This library is free software; you can redistribute it and/or
 10# modify it under the terms of the GNU Lesser General Public
 11# License as published by the Free Software Foundation; either
 12# version 2 of the License, or (at your option) any later version.
 13#
 14# This library is distributed in the hope that it will be useful,
 15# but WITHOUT ANY WARRANTY; without even the implied warranty of
 16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17# GNU Lesser General Public License for more details.
 18#
 19# You should have received a copy of the GNU Lesser General Public
 20# License along with this program; if not, write to the
 21# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 22# Boston, MA  02111-1307  USA
 23#
 24# $Id: $
 25#
 26import os
 27import os.path as path
 28import sys
 29import sqlite3 as sqlite
 30import traceback
 31import PYDict
 32from PYSQLiteDB import PYSQLiteDB, Cache
 33import re
 34import time
 35(YLEN, Y0, Y1, Y2, Y3, YX, S0, S1, S2, S3, PHRASE, FREQ, USER_FREQ) = range (0, 13)
 36USER_WORD = 119771
 37USER_PHRASE = 119769
 38
 39class ZhengJuDB (PYSQLiteDB):
 40	"""phrase database that contains all phrases and phrases' pinyin"""
	def __init__ (self, config):
		"""Initialise the PYSQLiteDB base, the query caches and the settings.

		The base class is initialised with "user_zhengju.db" as its user_db
		argument.  config is handed to load_config ().
		"""
		PYSQLiteDB.__init__ (self, user_db = "user_zhengju.db")
		# One cache per kind of lookup; clear_cache () empties all of them.
		self.longest_length_cache = Cache ()
		self.select_all_cache = Cache ()
		self.select_cache = Cache ()
		self.load_config (config)
 49
	def load_config(self, config):
		"""Read the settings used by this class from config.

		Stores the value of the LargeCharset key in self.large_charset;
		False is passed as the second argument of config.read (presumably
		the default used when the key is not set).
		"""
		key = "/IMEngine/Python/ZhengJu/LargeCharset"
		self.large_charset = config.read (key, False)
 52		
	def clear_cache(self):
		"""Empty the three query caches kept by this object."""
		caches = (self.select_cache, self.select_all_cache, self.longest_length_cache)
		for cache in caches:
			cache.clear ()
 58
	def add_phrase (self, pinyin, freq, database = "user_db", user_freq = 32):
		"""Insert a phrase row into <database>.py_phrase and commit.

		pinyin is a sequence of objects that provide a char attribute,
		get_pinyin_id (), get_sheng_mu_id () and a str () form.  The ids of
		the first four items fill the y0..y3 / s0..s3 columns; the str ()
		forms of any further items are joined with "'" into the yx column.
		freq and user_freq are stored as given.

		Afterwards the select_cache entry for this pinyin sequence is deleted.
		"""
		sqlstring = """INSERT INTO %s.py_phrase (ylen, y0, y1, y2, y3, yx, s0, s1, s2, s3, phrase, freq, user_freq)
			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
		# One slot per column; columns that are not filled in stay NULL.
		record = [None] * 13
		record[YLEN] = len (pinyin)
		for offset, py in enumerate (pinyin[:4]):
			record[Y0 + offset] = py.get_pinyin_id ()
			record[S0 + offset] = py.get_sheng_mu_id ()
		if len (pinyin) > 4:
			record[YX] = "'".join (map (str, pinyin[4:]))
		record[PHRASE] = u"".join ([py.char for py in pinyin])
		record[FREQ] = freq
		record[USER_FREQ] = user_freq
		self.db.execute (sqlstring % (database, ), record)
		self.db.commit ()
		cache_key = u"'".join (map (str, pinyin))
		del self.select_cache[cache_key]
 80
	def clean_useless_phrase (self):
		"""Delete every user_db phrase whose user_freq is 0, commit, and clear the caches."""
		self.db.execute ("DELETE FROM user_db.py_phrase WHERE user_freq = 0")
		self.db.commit ()
		self.clear_cache ()
 88
	def remove_phrase (self, record, database = "user_db"):
		"""Delete the phrase described by record from <database>.py_phrase.

		record is a row in the layout produced by the select_* methods
		(ylen, y0, y1, y2, y3, yx, phrase, ...).  Rows are matched on ylen,
		y0 and phrase.  database names the attached database to delete from;
		it used to be ignored, the default keeps the previous behaviour of
		deleting from user_db.  All caches are cleared afterwards.
		"""
		sql = "DELETE FROM %s.py_phrase WHERE ylen = ? AND y0 = ? AND phrase = ?" % (database, )
		# Select rows carry no s0..s3 columns, so the phrase text sits at
		# index 6 there (not at the PHRASE index of a full table row).
		self.db.execute (sql, (record[YLEN], record[Y0], record[6]))
		self.db.commit ()
		self.clear_cache ()
 96	
	def build_pinyin_condition(self, pys, length = None):
		"""Return the list of SQL conditions that match the pinyin sequence pys.

		Each item of pys must provide get_sheng_mu_id (), get_pinyin_id (),
		get_pinyin () and is_complete ().

		For each of the first four items an "s<i> = <sheng mu id>" condition
		is produced, plus "y<i> = <pinyin id>" when the item is complete.
		Items after the fourth are matched against the yx column with LIKE:
		their pinyin strings are joined with "'", and an incomplete item
		gets a trailing "%" wildcard.  When length is None the whole pattern
		also ends in "%", so longer phrases match as well; otherwise the
		pattern has to match the whole yx value.

		The returned strings are meant to be joined with " AND ".
		"""
		sql_conditions = []
		for i, py in enumerate (pys[:4]):
			sql_conditions.append ("s%d = %d" % (i, py.get_sheng_mu_id ()))
			if py.is_complete ():
				sql_conditions.append ("y%d = %d" % (i, py.get_pinyin_id ()))
		if len (pys) > 4:
			parts = []
			for py in pys[4:]:
				if py.is_complete ():
					parts.append (py.get_pinyin ())
				else:
					parts.append (py.get_pinyin () + "%")
			pattern = "'".join (parts)
			if length is None:
				pattern += "%"
			# Emit a standard single-quoted SQL string literal; the "'"
			# separators inside the pattern are escaped by doubling them.
			sql_conditions.append ("yx LIKE '" + pattern.replace ("'", "''") + "'")
		return sql_conditions
113
114	def get_longest_phrase_length(self , pylist):
115		"""return the longest word starting with first two pinyin of pylist"""
116		if len(pylist)<5:
117			return len(pylist)
118		pys=pylist[:4]
119		pinyin_string = u"'".join (map (str, pys))
120		result = self.longest_length_cache [pinyin_string]
121		if result != None:
122			return result
123		candidates = self.select_words_by_pinyin_list_all(pys)
124		if candidates:
125			result = max([i[YLEN] for i in candidates])
126		else:
127			result = 3
128		#~ tables_union = """( SELECT ylen FROM main.py_phrase WHERE %(conditions)s UNION ALL
129		#~ SELECT ylen FROM user_db.py_phrase WHERE %(conditions)s )"""
130		#~ sql_conditions = self.build_pinyin_condition(pys)
131		#~ tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
132		#~ sql_string = "SELECT Max(ylen) FROM " + tables_union + " ;"
133		#~ print "start", time.time()
134		#~ result = list (self.db.execute (sql_string).fetchall ())
135		#~ if not result[0][0]:
136			#~ result =  1
137		#~ else:
138			#~ result = result[0][0]
139		self.longest_length_cache[pinyin_string] = result
140		#~ print "longest",pinyin_string, result, time.time()
141		if result > len(pylist):
142			return len(pylist)
143		return result
144
145	def select_words_by_phrase (self, pys, database = None):
146		return self.select_words_by_pinyin(pys, phrase=u"".join([i.char for i in pys]),database=database)
147	
148	def select_phrase (self, phrase, database = None):
149		sql_conditions = "ylen =%d AND phrase = \"%s\"" % (len(phrase), phrase)
150		if database:
151			tables_union = "(SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM " + database+ ".py_phrase WHERE %(conditions)s) "
152		else:
153			tables_union = """ (SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM user_db.py_phrase WHERE %(conditions)s UNION ALL
154							SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM main.py_phrase WHERE %(conditions)s) """		
155		sql_prefix = "SELECT * FROM " + tables_union % { "conditions" : sql_conditions }
156		sql_string = sql_prefix + " GROUP BY phrase ORDER BY freq;"
157		result = list (self.db.execute (sql_string).fetchall ())
158		return result
159
160	def select_words_by_pinyin (self, pys, length=None, phrase=None, database = None):
161		sql_conditions = []
162		if length:
163			sql_conditions = ["ylen = %d" % length ]
164		if not self.large_charset:
165			sql_conditions.append("freq > 0")
166		if phrase:
167			sql_conditions.append("phrase = \"%s" % phrase +"\"")
168		if database:
169			tables_union = "( SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM " + database+ ".py_phrase WHERE %(conditions)s )"
170		else:
171			tables_union = """( SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM user_db.py_phrase WHERE %(conditions)s UNION ALL
172							SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM main.py_phrase WHERE %(conditions)s )"""		
173		sql_conditions.extend (self.build_pinyin_condition (pys, length))
174		tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
175		sql_prefix = "SELECT ylen, y0, y1, y2, y3, yx, phrase, MAX(freq*ifnull(user_freq,1)) as adj_freq FROM " + tables_union
176		sql_string = sql_prefix + " GROUP BY phrase ORDER BY adj_freq DESC;"
177		#~ print "by pinyin"
178		result = list (self.db.execute (sql_string).fetchall ())
179		#~ print result
180		return result
181
182	def select_words_by_pinyin_list_all (self, pys, length = None):
183		"""select words from database by list that contains PYDict.PinYinWord objects"""
184		pinyin_string = u"'".join (map (str, pys))
185		result = self.select_all_cache [pinyin_string]
186		if result == None:
187			result = self.select_words_by_pinyin(pys)
188			self.select_all_cache[pinyin_string] = result
189			#~ print "list_all",pinyin_string
190		return result
191	def remove_all_user_words(self):
192		sql = "DELETE FROM user_db.py_phrase WHERE freq = " + str(USER_WORD)
193		self.db.execute (sql)
194		self.db.commit ()
195	def remove_all_user_phrase(self):
196		sql = "DELETE FROM user_db.py_phrase WHERE freq = " + str(USER_PHRASE)
197		self.db.execute (sql)
198		self.db.commit ()
199	def remvoe_all_user_freq(self):
200		sql = "DELETE FROM user_db.py_phrase WHERE freq != %d and freq != %d" (USER_PHRASE, USER_WORD)
201		self.db.execute (sql)
202		self.db.commit ()
203	def select_words_by_pinyin_list (self, pys, database = None):
204		"""select words from database by list that contains PYDict.PinYinWord objects"""
205		pinyin_string = u"'".join (map (str, pys))
206		result = self.select_cache [pinyin_string]
207		if result == None:
208			result = self.select_words_by_pinyin(pys, len(pys), database)
209			self.select_cache[pinyin_string] = result
210			#~ print "list",pinyin_string
211		return result
212	
213	def adjust_phrase_freq (self, pylist):
214		"""this function adjusts frequence of phrase in user database."""
215		p = self.select_words_by_phrase(pylist,"user_db")
216		if len(p)>0:
217			q = self.select_words_by_pinyin(pylist, length = len(pylist), database = "main")
218			if len(q) == 0 or p[0][7] > q[0][7]:
219				sql_conditions = ["ylen = %d" % len(pylist) ]
220				sql_conditions.extend (self.build_pinyin_condition (pylist, len(pylist)))
221				sql_conditions.append ( "phrase != ?" )
222				sql_conditions.append ( "user_freq * freq > " + str(p[0][7]) )
223				sql = "UPDATE user_db.py_phrase SET user_freq = user_freq / 2 WHERE %(conditions)s" % { "conditions" : " AND ".join (sql_conditions) }
224				self.db.execute (sql, [u"".join([i.char for i in pylist])])
225				self.db.commit ()
226			sql = "UPDATE user_db.py_phrase SET user_freq = user_freq * 2 WHERE phrase = ?"
227			self.db.execute (sql,[u"".join([i.char for i in pylist])])
228			self.db.commit ()
229		else:
230			p = self.select_words_by_phrase(pylist,"main")
231			self.add_phrase(pylist,p[0][7],user_freq = 10)