/python/engine/PinYin/ZhengJuDB.py
Python | 231 lines | 205 code | 1 blank | 25 comment | 0 complexity | c91691f4511c7b2c20e1e376199f185d MD5 | raw file
- # -*- coding: utf-8 -*-
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Yu Fan <yufanyufan@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- import os
- import os.path as path
- import sys
- import sqlite3 as sqlite
- import traceback
- import PYDict
- from PYSQLiteDB import PYSQLiteDB, Cache
- import re
- import time
- (YLEN, Y0, Y1, Y2, Y3, YX, S0, S1, S2, S3, PHRASE, FREQ, USER_FREQ) = range (0, 13)
- USER_WORD = 119771
- USER_PHRASE = 119769
- class ZhengJuDB (PYSQLiteDB):
- """phrase database that contains all phrases and phrases' pinyin"""
- def __init__ (self, config):
- PYSQLiteDB.__init__ (self, user_db = "user_zhengju.db")
- #~ print "db init"
- #~ Init caches
- self.select_cache = Cache ()
- self.select_all_cache = Cache()
- self.longest_length_cache = Cache()
- self.load_config(config)
- def load_config(self, config):
- self.large_charset = config.read ("/IMEngine/Python/ZhengJu/LargeCharset", False)
-
- def clear_cache(self):
- #~ pass
- self.select_cache.clear()
- self.select_all_cache.clear()
- self.longest_length_cache.clear()
- def add_phrase (self, pinyin, freq, database = "user_db", user_freq = 32):
- """ add phrase to database"""
- sqlstring = """INSERT INTO %s.py_phrase (ylen, y0, y1, y2, y3, yx, s0, s1, s2, s3, phrase, freq, user_freq)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
- phrase = u"".join([i.char for i in pinyin])
- #~ print "commit", phrase
- record = [None, None, None, None, None, None, None, None, None, None, None, 0, None]
- record [YLEN] = len (pinyin)
- i = 0
- for p in pinyin[:4]:
- record[Y0 + i] = p.get_pinyin_id ()
- record[S0 + i] = p.get_sheng_mu_id ()
- i += 1
- if len(pinyin) > 4:
- record[YX] = "'".join (map (str, pinyin[4:]))
- record[PHRASE] = phrase
- record[FREQ]= freq
- record[USER_FREQ] = user_freq
- self.db.execute (sqlstring % (database, ), record)
- self.db.commit ()
- del self.select_cache[u"'".join (map (str, pinyin))]
- def clean_useless_phrase (self):
- "Remove phrase from user database"
- #~ print "remove", record[6]
- sql = "DELETE FROM user_db.py_phrase WHERE user_freq = 0"
- self.db.execute (sql)
- self.db.commit ()
- self.clear_cache()
- def remove_phrase (self, record, database = "user_db"):
- "Remove phrase from user database"
- #~ print "remove", record[6]
- sql = "DELETE FROM user_db.py_phrase WHERE ylen = ? AND y0 = ? AND phrase = ?"
- self.db.execute (sql, (record[YLEN], record[Y0], record[6]))
- self.db.commit ()
- self.clear_cache()
-
- def build_pinyin_condition(self, pys, length = None):
- sql_conditions = []
- i = 0
- for py in pys[:4]:
- sql_conditions.append ("s%d = %d" % (i, py.get_sheng_mu_id ()))
- if py.is_complete ():
- sql_conditions.append ("y%d = %d" % (i, py.get_pinyin_id ()))
- i += 1
- if len (pys) > 4:
- pp = lambda (x): x.get_pinyin() if x.is_complete () else x.get_pinyin() + "%"
- pattern = "'".join (map (pp, pys[4:]))
- if length == None:
- sql_conditions.append ("yx LIKE \"" + pattern + "%\"")
- else:
- sql_conditions.append ("yx LIKE \"" + pattern + "\"")
- return sql_conditions
- def get_longest_phrase_length(self , pylist):
- """return the longest word starting with first two pinyin of pylist"""
- if len(pylist)<5:
- return len(pylist)
- pys=pylist[:4]
- pinyin_string = u"'".join (map (str, pys))
- result = self.longest_length_cache [pinyin_string]
- if result != None:
- return result
- candidates = self.select_words_by_pinyin_list_all(pys)
- if candidates:
- result = max([i[YLEN] for i in candidates])
- else:
- result = 3
- #~ tables_union = """( SELECT ylen FROM main.py_phrase WHERE %(conditions)s UNION ALL
- #~ SELECT ylen FROM user_db.py_phrase WHERE %(conditions)s )"""
- #~ sql_conditions = self.build_pinyin_condition(pys)
- #~ tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
- #~ sql_string = "SELECT Max(ylen) FROM " + tables_union + " ;"
- #~ print "start", time.time()
- #~ result = list (self.db.execute (sql_string).fetchall ())
- #~ if not result[0][0]:
- #~ result = 1
- #~ else:
- #~ result = result[0][0]
- self.longest_length_cache[pinyin_string] = result
- #~ print "longest",pinyin_string, result, time.time()
- if result > len(pylist):
- return len(pylist)
- return result
- def select_words_by_phrase (self, pys, database = None):
- return self.select_words_by_pinyin(pys, phrase=u"".join([i.char for i in pys]),database=database)
-
- def select_phrase (self, phrase, database = None):
- sql_conditions = "ylen =%d AND phrase = \"%s\"" % (len(phrase), phrase)
- if database:
- tables_union = "(SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM " + database+ ".py_phrase WHERE %(conditions)s) "
- else:
- tables_union = """ (SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM user_db.py_phrase WHERE %(conditions)s UNION ALL
- SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM main.py_phrase WHERE %(conditions)s) """
- sql_prefix = "SELECT * FROM " + tables_union % { "conditions" : sql_conditions }
- sql_string = sql_prefix + " GROUP BY phrase ORDER BY freq;"
- result = list (self.db.execute (sql_string).fetchall ())
- return result
- def select_words_by_pinyin (self, pys, length=None, phrase=None, database = None):
- sql_conditions = []
- if length:
- sql_conditions = ["ylen = %d" % length ]
- if not self.large_charset:
- sql_conditions.append("freq > 0")
- if phrase:
- sql_conditions.append("phrase = \"%s" % phrase +"\"")
- if database:
- tables_union = "( SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM " + database+ ".py_phrase WHERE %(conditions)s )"
- else:
- tables_union = """( SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM user_db.py_phrase WHERE %(conditions)s UNION ALL
- SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM main.py_phrase WHERE %(conditions)s )"""
- sql_conditions.extend (self.build_pinyin_condition (pys, length))
- tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
- sql_prefix = "SELECT ylen, y0, y1, y2, y3, yx, phrase, MAX(freq*ifnull(user_freq,1)) as adj_freq FROM " + tables_union
- sql_string = sql_prefix + " GROUP BY phrase ORDER BY adj_freq DESC;"
- #~ print "by pinyin"
- result = list (self.db.execute (sql_string).fetchall ())
- #~ print result
- return result
- def select_words_by_pinyin_list_all (self, pys, length = None):
- """select words from database by list that contains PYDict.PinYinWord objects"""
- pinyin_string = u"'".join (map (str, pys))
- result = self.select_all_cache [pinyin_string]
- if result == None:
- result = self.select_words_by_pinyin(pys)
- self.select_all_cache[pinyin_string] = result
- #~ print "list_all",pinyin_string
- return result
- def remove_all_user_words(self):
- sql = "DELETE FROM user_db.py_phrase WHERE freq = " + str(USER_WORD)
- self.db.execute (sql)
- self.db.commit ()
- def remove_all_user_phrase(self):
- sql = "DELETE FROM user_db.py_phrase WHERE freq = " + str(USER_PHRASE)
- self.db.execute (sql)
- self.db.commit ()
- def remvoe_all_user_freq(self):
- sql = "DELETE FROM user_db.py_phrase WHERE freq != %d and freq != %d" (USER_PHRASE, USER_WORD)
- self.db.execute (sql)
- self.db.commit ()
- def select_words_by_pinyin_list (self, pys, database = None):
- """select words from database by list that contains PYDict.PinYinWord objects"""
- pinyin_string = u"'".join (map (str, pys))
- result = self.select_cache [pinyin_string]
- if result == None:
- result = self.select_words_by_pinyin(pys, len(pys), database)
- self.select_cache[pinyin_string] = result
- #~ print "list",pinyin_string
- return result
-
- def adjust_phrase_freq (self, pylist):
- """this function adjusts frequence of phrase in user database."""
- p = self.select_words_by_phrase(pylist,"user_db")
- if len(p)>0:
- q = self.select_words_by_pinyin(pylist, length = len(pylist), database = "main")
- if len(q) == 0 or p[0][7] > q[0][7]:
- sql_conditions = ["ylen = %d" % len(pylist) ]
- sql_conditions.extend (self.build_pinyin_condition (pylist, len(pylist)))
- sql_conditions.append ( "phrase != ?" )
- sql_conditions.append ( "user_freq * freq > " + str(p[0][7]) )
- sql = "UPDATE user_db.py_phrase SET user_freq = user_freq / 2 WHERE %(conditions)s" % { "conditions" : " AND ".join (sql_conditions) }
- self.db.execute (sql, [u"".join([i.char for i in pylist])])
- self.db.commit ()
- sql = "UPDATE user_db.py_phrase SET user_freq = user_freq * 2 WHERE phrase = ?"
- self.db.execute (sql,[u"".join([i.char for i in pylist])])
- self.db.commit ()
- else:
- p = self.select_words_by_phrase(pylist,"main")
- self.add_phrase(pylist,p[0][7],user_freq = 10)