/python/engine/PinYin/ZhengJu.py
http://scim-python.googlecode.com/ · Python · 1268 lines · 1097 code · 83 blank · 88 comment · 403 complexity · 46afeedb53a388189acf7cb3c399f7df MD5 · raw file
- # -*- coding: utf-8 -*-
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Yu Fan <yufanyufan@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- import scim
- import scim.Log
- import os
- from scim import KeyCode
- from scim import KeyMask
- from scim import Property
- import traceback
- from PYDict import *
- from gettext import dgettext
- from ZhengJuDB import *
- import scim.ascii as ascii
- from sets import Set
- import popen2
def _(a):
    """Translate *a* through the "scim-python" gettext domain."""
    return dgettext("scim-python", a)


def RGB(r, g, b):
    """Pack three 8-bit colour channels into one 0xRRGGBB integer."""
    return ((r & 0xff) << 16) | ((g & 0xff) << 8) | (b & 0xff)


IMEngine = scim.IMEngine
IMEngineFactory = scim.IMEngineFactory

# Column indexes of a candidate row as used throughout this module:
# YLEN is the number of syllables, Y0..Y3 hold the pinyin ids of the first
# four characters, YX holds the "'"-joined pinyin of any further characters,
# PHRASE is the text and ADJ_FREQ the frequency used for ranking.
YLEN, Y0, Y1, Y2, Y3, YX, PHRASE, ADJ_FREQ = range(0, 8)


def candidate_sort(x, y):
    """cmp-style comparator: longer candidates first, then higher ADJ_FREQ."""
    if x[YLEN] != y[YLEN]:
        return cmp(y[YLEN], x[YLEN])
    return cmp(y[ADJ_FREQ], x[ADJ_FREQ])
class InputException(Exception):
    """Raised when the editor cannot carry out a request in its current state.

    Engine.process_key_event catches it and answers with a beep.
    """

    def __init__(self):
        super(InputException, self).__init__()
class PinYinWord:
    """One pinyin syllable of the input buffer plus the character chosen for it.

    The syllable is stored as an initial (``shengmu``) and a final (``yunmu``).
    ``spliter`` is "'" when the syllable was given with a leading apostrophe.
    ``_pinyin_id`` is the PINYIN_DICT id of the syllable, or None while the
    syllable is not a complete pinyin.  ``real_pinyin_id`` is not set here;
    Editor assigns it when a character is chosen for the word.
    """

    def __init__(self, shengmu="", yunmu="", pinyin=""):
        """Build the word from *pinyin*, or from *shengmu* + *yunmu* if *pinyin* is empty."""
        self.char = ""
        self._pinyin_id = None
        self.manual = None
        self.spliter = ""
        if pinyin:
            self.set_pinyin(pinyin)
        else:
            self.set_pinyin(shengmu + yunmu)

    def _split(self, pinyin):
        """Return (shengmu, yunmu) for *pinyin*, trying two-letter initials first."""
        if pinyin[:2] in SHENGMU_LIST:
            return pinyin[:2], pinyin[2:]
        if pinyin[:1] in SHENGMU_LIST:
            return pinyin[:1], pinyin[1:]
        return "", pinyin

    def set_pinyin(self, pinyin):
        """Replace the syllable with *pinyin* (optionally prefixed with "'").

        An existing ``spliter`` is kept when *pinyin* carries no apostrophe.
        """
        # Slice instead of indexing so that an empty string is accepted.
        if pinyin[:1] == "'":
            self.spliter = "'"
            pinyin = pinyin[1:]
        self.shengmu, self.yunmu = self._split(pinyin)
        full = self.get_pinyin()
        if full in PINYIN_LIST:
            self._pinyin_id = PINYIN_DICT[full]
        else:
            # Do not keep the id of a previous, complete syllable around.
            self._pinyin_id = None
        self._sheng_mu_id = SHENGMU_DICT[self.get_shengmu()]

    def mohuyin(self):
        """Replace the typed (possibly fuzzy) syllable by the one in ``real_pinyin_id``.

        The initial is always taken from the real pinyin; the final only when
        the user had typed one.
        """
        shengmu, yunmu = self._split(ID_PINYIN_DICT[self.real_pinyin_id])
        self.shengmu = shengmu
        if self.yunmu != "":
            self.yunmu = yunmu
        self.set_pinyin(self.get_pinyin())

    def get_sheng_mu_id(self):
        """Return the SHENGMU_DICT id of the initial."""
        return self._sheng_mu_id

    def get_pinyin_id(self):
        """Return the PINYIN_DICT id of the syllable, or None if it is incomplete."""
        return self._pinyin_id

    def set_pinyin_id(self, id):
        """Set the syllable from its pinyin id."""
        self.set_pinyin(ID_PINYIN_DICT[id])

    def get_shengmu(self):
        """Return the initial ("" if the syllable has none)."""
        return self.shengmu

    def set_yunmu(self, yunmu):
        """Replace the final; raises KeyError if the result is not a known pinyin."""
        self.yunmu = yunmu
        if yunmu != "":
            self._pinyin_id = PINYIN_DICT[self.get_pinyin()]
        else:
            self._pinyin_id = None

    def set_char(self, char):
        """Record the character chosen for this syllable."""
        self.char = char

    def get_pinyin(self):
        """Return the syllable without the apostrophe."""
        return self.shengmu + self.yunmu

    def get_screen_pinyin(self):
        """Return the syllable as displayed, including a leading apostrophe if any."""
        return self.spliter + self.shengmu + self.yunmu

    def __str__(self):
        return self.get_pinyin()

    def is_complete(self):
        """Tell whether the syllable is a complete pinyin."""
        return self._pinyin_id is not None
-
- class Editor:
- database = None
def __init__(self, config=None):
    """Create an editor, opening the shared phrase database on first use.

    config -- configuration object with a ``read(key, default)`` method;
              a PseudoConfig is used when omitted.
    """
    if config is None:
        config = PseudoConfig()
    # The database is shared by all editors through the class attribute.
    if Editor.database is None:
        Editor.database = ZhengJuDB(config)
    self.lookup_table = scim.LookupTable(9)
    self.lookup_table.fix_page_size(True)
    # move_cursor() reads this attribute, which is otherwise only assigned by
    # update() and delete_cursor_phrase(); give it a defined initial value.
    self.candidate_cursor = None
    self.clear()
    self.config = config
    self.load_config(config)
def clear(self):
    """Drop all pending input, candidates and predictions and reset the lookup table."""
    self.cursor = 0
    self.wordlist, self.pinyinlist = [], []
    self.candidates, self.predict = [], []
    table = self.lookup_table
    table.clear()
    table.show_cursor(False)
    Editor.database.clear_cache()
def load_config(self, config):
    """Read every editor option from *config* and rebuild the fuzzy-pinyin tables."""
    Editor.database.load_config(config)
    # (attribute, configuration key, default) in the order they are read.
    options = (
        ("userword", "/IMEngine/Python/ZhengJu/CreateUserWords", True),
        ("userphrase", "/IMEngine/Python/ZhengJu/CreateUserPhrases", True),
        ("adjustfreq", "/IMEngine/Python/ZhengJu/AdjustWordFreq", True),
        ("logconverror", "/IMEngine/Python/ZhengJu/LogConvError", True),
        ("splitpinyin", "/IMEngine/Python/ZhengJu/SplitPinyin", True),
        ("enable_mohuyin", "/IMEngine/Python/ZhengJu/FuzzyPinyin", False),
        ("mohuyin_s_sh", "/IMEngine/Python/ZhengJu/FuzzyS_Sh", True),
        ("mohuyin_c_ch", "/IMEngine/Python/ZhengJu/FuzzyC_Ch", True),
        ("mohuyin_z_zh", "/IMEngine/Python/ZhengJu/FuzzyZ_Zh", True),
        ("mohuyin_l_n", "/IMEngine/Python/ZhengJu/FuzzyL_N", True),
        ("mohuyin_in_ing", "/IMEngine/Python/ZhengJu/FuzzyIn_Ing", True),
        ("mohuyin_en_eng", "/IMEngine/Python/ZhengJu/FuzzyEn_Eng", True),
        ("mohuyin_an_ang", "/IMEngine/Python/ZhengJu/FuzzyAn_Ang", True),
    )
    for attribute, key, default in options:
        setattr(self, attribute, config.read(key, default))
    self.build_mohuyin()
def build_mohuyin(self):
    """Build ``shengmu_mohu`` and ``yunmu_mohu`` from the enabled fuzzy options.

    Each enabled option copies the entries for both members of its pair
    from MOHU_SHENGMU / MOHU_YUNMU.
    """
    shengmu_pairs = (
        (self.mohuyin_s_sh, ("s", "sh")),
        (self.mohuyin_z_zh, ("z", "zh")),
        (self.mohuyin_c_ch, ("c", "ch")),
        (self.mohuyin_l_n, ("l", "n")),
    )
    yunmu_pairs = (
        (self.mohuyin_an_ang, ("an", "ang")),
        (self.mohuyin_en_eng, ("en", "eng")),
        # The original stored MOHU_YUNMU["ing"] under the key "in", so "ing"
        # was never registered and "in" got the wrong mapping.
        (self.mohuyin_in_ing, ("in", "ing")),
    )
    self.shengmu_mohu = {}
    for enabled, keys in shengmu_pairs:
        if enabled:
            for key in keys:
                self.shengmu_mohu[key] = MOHU_SHENGMU[key]
    self.yunmu_mohu = {}
    for enabled, keys in yunmu_pairs:
        if enabled:
            for key in keys:
                self.yunmu_mohu[key] = MOHU_YUNMU[key]
def current(self):
    """Return the last entry of ``pinyinlist``, or None when it is empty."""
    return self.pinyinlist[-1] if self.pinyinlist else None
def is_empty(self):
    """Tell whether there is neither unconverted pinyin nor converted text."""
    return not (self.pinyinlist or self.wordlist)
def is_end(self):
    """Tell whether the editor is empty, or has no pending pinyin and the cursor after the last word."""
    if self.is_empty():
        return True
    return not self.pinyinlist and self.cursor == len(self.wordlist)
def get_aux(self):
    """Return the phrases of the current prediction joined into one string."""
    phrases = [row[PHRASE] for row in self.predict]
    return "".join(phrases)
def get_screen_pinyin(self):
    """Return the unconverted pinyin as it should be displayed.

    With ``splitpinyin`` enabled an apostrophe is inserted after a syllable
    whenever that syllable followed by the first letter of the next one
    would itself be a complete or partial pinyin.
    """
    shown = [word.get_screen_pinyin() for word in self.pinyinlist]
    if not self.splitpinyin:
        return u"".join(shown)
    if not shown:
        return ""
    parts = []
    for this, following in zip(shown, shown[1:]):
        parts.append(this)
        merged = this + following[0]
        if merged in PINYIN_LIST or merged in PINYIN_PARTIAL_LIST:
            parts.append("'")
    parts.append(shown[-1])
    return "".join(parts)
def get_preedit(self):
    """Return the preedit text: words before the cursor, pending pinyin, words after."""
    before = u"".join([word.char for word in self.wordlist[:self.cursor]])
    after = u"".join([word.char for word in self.wordlist[self.cursor:]])
    return before + self.get_screen_pinyin() + after
def get_screen_cursor(self):
    """Return the caret position in the preedit text: word cursor plus displayed pinyin length."""
    return self.cursor + len(self.get_screen_pinyin())
-
def pinyin_select(self, candidate, manual=False):
    """Convert the leading pending syllables into the characters of *candidate*.

    Each converted syllable gets its character, the pinyin id recorded in
    the candidate row and the *manual* flag, then moves from ``pinyinlist``
    into ``wordlist`` at the cursor.  A manual selection refreshes
    predictions and candidates.
    """
    phrase = candidate[PHRASE]
    count = len(phrase)
    for pos, char in enumerate(phrase):
        word = self.pinyinlist[pos]
        word.set_char(char)
        if pos < 4:
            # The first four pinyin ids are stored as columns of the row.
            word.real_pinyin_id = candidate[pos + 1]
        else:
            # Further syllables are stored as "'"-joined text in column YX.
            word.real_pinyin_id = PINYIN_DICT[candidate[YX].split("'")[pos - 4]]
        if self.enable_mohuyin:
            word.mohuyin()
        word.manual = manual
    self.wordlist[self.cursor:self.cursor] = self.pinyinlist[:count]
    del self.pinyinlist[:count]
    self.cursor += count
    if manual:
        self.update()
def reparse_backtrace(self):
    """Re-predict the words from just after the nearest manually chosen word at or before the cursor.

    Does nothing when the cursor is at the end of ``wordlist``.
    """
    if self.cursor >= len(self.wordlist):
        return
    pos = self.cursor
    while pos >= 0 and not self.wordlist[pos].manual:
        pos -= 1
    self.reparse(pos + 1)
-
def convert_all(self):
    """Convert all pending pinyin using the current prediction, then re-predict and refresh."""
    for row in self.predict:
        self.pinyin_select(row)
    self.reparse_backtrace()
    self.update()
def jump_to_next_word(self):
    """Move the cursor to the end of the phrase that starts at or spans the cursor."""
    target = 0
    for entry in self.split_phrase(self.get_preedit()):
        if target > self.cursor:
            break
        # entry is (start, length, phrase)
        target += entry[1]
    self.cursor = target
    self.update()
def predict_len(self, predicts):
    """Return the total number of syllables covered by the rows in *predicts*."""
    total = 0
    for row in predicts:
        total += row[YLEN]
    return total
def auto_convert(self):
    """Convert leading syllables until the first two predicted phrases cover all pending pinyin."""
    self.update_predict()
    while True:
        if self.predict_len(self.predict[:2]) >= len(self.pinyinlist):
            break
        self.pinyin_select(self.predict[0])
        self.update_predict()
    self.update_candidates()
def auto_convert_quanpin(self):
    """Like auto_convert, but uses the existing prediction and may hold back one syllable.

    When the last syllable is neither a bare initial nor a partial pinyin
    and is a proper prefix of some longer pinyin, one more syllable is left
    unconverted.
    """
    last = self.pinyinlist[-1].get_pinyin()
    extendable = (
        last not in SHENGMU_LIST
        and last not in PINYIN_PARTIAL_LIST
        and any(last != full and full[:len(last)] == last for full in PINYIN_LIST)
    )
    reserve = 1 if extendable else 0
    while self.predict_len(self.predict[:2]) + reserve < len(self.pinyinlist):
        self.pinyin_select(self.predict[0])
        self.update_predict()
    self.update_candidates()
def update(self):
    """Forget the candidate cursor and recompute the prediction and the candidate list."""
    self.candidate_cursor = None
    self.update_predict()
    self.update_candidates()
def update_predict(self):
    """Recompute ``predict`` for the pending pinyin (an empty list when there is none)."""
    pending = self.pinyinlist
    self.predict = self.get_predict_pinyinlist(pending) if pending else []
def reverse(self, phrase):
    """Load existing text into the editor as already-converted words.

    The text is consumed from the left, each time taking the longest prefix
    known to ``select_phrase``.  Raises InputException when not even the
    first character of the remaining text is known.
    """
    self.clear()
    while phrase:
        for end in range(len(phrase), 0, -1):
            rows = self.database.select_phrase(phrase[:end])
            if rows:
                result = rows[0]
                break
        else:
            raise InputException()
        for offset in range(result[YLEN]):
            word = PinYinWord("'", "")
            word.set_char(phrase[offset])
            if offset < 4:
                pinyin_id = result[offset + 1]
            else:
                pinyin_id = PINYIN_DICT[result[YX].split("'")[offset - 4]]
            word.set_pinyin_id(pinyin_id)
            word.real_pinyin_id = word._pinyin_id
            self.wordlist.append(word)
        phrase = phrase[result[YLEN]:]
def split_phrase(self, string):
    """Cut the converted text *string* into known phrases.

    Returns a list of (start, length, phrase) triples.  At each position
    phrases beginning with the pinyin of the next three words are tried
    first, then the two-word phrase lookup; otherwise a single character is
    emitted.
    """
    total = len(self.wordlist)
    pieces = []
    start = 0
    while start < total - 1:
        match = None
        if total - start >= 3:
            rows = Editor.database.select_words_by_pinyin_list_all(self.wordlist[start:start + 3])
            for row in rows:
                text = row[PHRASE]
                if text == string[start:start + len(text)]:
                    if not match or match[PHRASE] < text:
                        match = row
        if match is None:
            pairs = Editor.database.select_words_by_phrase(self.wordlist[start:start + 2])
            if pairs:
                match = pairs[0]
        if match is None:
            pieces.append((start, 1, string[start]))
            start += 1
        else:
            pieces.append((start, len(match[PHRASE]), match[PHRASE]))
            start += len(match[PHRASE])
    if start < total:
        pieces.append((start, 1, string[-1]))
    return pieces
def split_phrasev2(self, string):
    """Variant of split_phrase that only looks up phrases by the next two words.

    Returns a list of (start, length, phrase) triples.
    """
    total = len(self.wordlist)
    pieces = []
    start = 0
    while start < total - 1:
        rows = Editor.database.select_words_by_pinyin_list_all(self.wordlist[start:start + 2])
        match = None
        for row in rows:
            text = row[PHRASE]
            if text == string[start:start + len(text)]:
                if not match or match[PHRASE] < text:
                    match = row
        if match is None:
            pieces.append((start, 1, string[start]))
            start += 1
        else:
            pieces.append((start, len(match[PHRASE]), match[PHRASE]))
            start += len(match[PHRASE])
    if start < total:
        pieces.append((start, 1, string[-1]))
    return pieces
def learn_user_words(self, phrase_list, string, sentence):
    """Store runs of corrected single characters as user words.

    phrase_list -- (start, length, phrase) triples for *string*; modified in
                   place: each run of single characters that differ from
                   *sentence* is merged into one entry.
    string      -- the text the user ended up with.
    sentence    -- the text the engine predicted for the same pinyin.

    Returns *string* with every merged run replaced by the corresponding
    part of *sentence*.  When user words are disabled *string* is returned
    unchanged (the original returned None here, which the caller then tried
    to slice).
    """
    if not self.userword:
        return string
    start = 0
    while start < len(phrase_list):
        run_start = phrase_list[start][0]
        run = ""
        while (start < len(phrase_list) and phrase_list[start][1] == 1
               and string[phrase_list[start][0]] != sentence[phrase_list[start][0]]):
            run += phrase_list[start][2]
            del phrase_list[start]
        if run:
            run_end = run_start + len(run)
            phrase_list.insert(start, (run_start, len(run), run))
            if len(run) > 1:
                Editor.database.add_phrase(self.wordlist[run_start:run_end], USER_WORD)
                # log_conv_error expects phrase_list indexes; the merged
                # entry now sits at index `start` (the original passed the
                # character offset instead).
                self.log_conv_error(sentence, string, phrase_list, start, start, 0)
            string = string[:run_start] + sentence[run_start:run_end] + string[run_end:]
        start += 1
    return string
-
def split_predict(self):
    """Return the engine's own segmentation of ``wordlist`` as (start, length, phrase) triples."""
    pieces = []
    start = 0
    while start < len(self.wordlist):
        phrase = self.get_predict(self.wordlist[start:])[0][PHRASE]
        pieces.append((start, len(phrase), phrase))
        start += len(phrase)
    return pieces
def addphrase(self, phrase_list, pstart, pend, freq):
    """Add the words spanned by phrase_list[pstart..pend] to the database with *freq*.

    Silently does nothing when either index is outside *phrase_list*.
    """
    if pstart < 0 or pend >= len(phrase_list):
        return
    first = phrase_list[pstart][0]
    last = phrase_list[pend][0] + phrase_list[pend][1]
    Editor.database.add_phrase(self.wordlist[first:last], freq)
def adjust_all_freq(self, phrase_list):
    """Adjust the frequency of every phrase in *phrase_list*; a no-op unless ``userphrase`` is set."""
    if not self.userphrase:
        return
    spans = []
    for entry in phrase_list:
        spans.append(self.wordlist[entry[0]:entry[0] + entry[1]])
    for words in spans:
        Editor.database.adjust_phrase_freq(words)
def adjust_freq(self, phrase_list, phrase_begin):
    """Adjust the frequency of phrase_list[phrase_begin]; a no-op unless ``adjustfreq`` is set."""
    if not self.adjustfreq:
        return
    start, length = phrase_list[phrase_begin][0], phrase_list[phrase_begin][1]
    Editor.database.adjust_phrase_freq(self.wordlist[start:start + length])
-
def delete_phrase(self, n):
    """Remove the *n*-th candidate of the current lookup page from the database.

    Raises InputException when *n* is beyond the page or the candidate's
    frequency is 0 or a multiple of neither USER_PHRASE nor USER_WORD.
    """
    table = self.lookup_table
    if n >= table.get_current_page_size():
        raise InputException()
    candidate = self.candidates[table.get_current_page_start() + n]
    freq = candidate[ADJ_FREQ]
    if freq == 0 or (freq % USER_PHRASE and freq % USER_WORD):
        raise InputException()
    Editor.database.remove_phrase(candidate)
    self.update()
-
def delete_cursor_phrase(self):
    """Remove the candidate under the lookup-table cursor from the database.

    For a candidate whose frequency is 0 or a multiple of neither
    USER_PHRASE nor USER_WORD the candidate cursor is reset and
    InputException is raised.
    """
    candidate = self.candidates[self.lookup_table.get_cursor_pos()]
    freq = candidate[ADJ_FREQ]
    if freq == 0 or (freq % USER_PHRASE and freq % USER_WORD):
        self.candidate_cursor = None
        raise InputException()
    Editor.database.remove_phrase(candidate)
    self.update()
def log_conv_error(self, predict, sentence, phrase_list, pstart, pend, type):
    """Append one conversion-error record to ~/.scim/zhengju-conv-error.log.

    The record is the part of *sentence* and of *predict* spanned by
    phrase_list[pstart..pend], UTF-8 encoded, followed by *type*.  Nothing
    is written when either index is outside *phrase_list* or logging is
    disabled.  Errors from opening or writing the file propagate.
    """
    if pstart < 0 or pend >= len(phrase_list):
        return
    if not self.logconverror:
        return
    begin = phrase_list[pstart][0]
    end = phrase_list[pend][0] + phrase_list[pend][1]
    # Encode before opening so that an encoding error cannot leak the handle.
    fields = (sentence[begin:end].encode("utf-8"), predict[begin:end].encode("utf-8"), type)
    log = open(os.path.expanduser("~/.scim/zhengju-conv-error.log"), 'a')
    try:
        log.write("%s %s %s\n" % fields)
    finally:
        # Close the file even when the write fails.
        log.close()
-
- def learn (self):
# Compare the text about to be committed with the engine's own prediction
# for the same words and feed the differences back into the phrase database
# (new user words / user phrases, frequency adjustment, conversion-error log).
# Does nothing when all four learning options are disabled.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# implied by the comments below is inferred from the statements themselves
# and should be confirmed against a pristine copy.
- if not self.userword and not self.userphrase and not self.adjustfreq and not self.logconverror:
- return
# "predict" is the engine's unaided segmentation of wordlist as
# (start, length, phrase) triples; "sentence" is that prediction as one string.
- predict = self.split_predict ()
- sentence = u"".join ([ i[2] for i in predict])
# From here on every word is identified by the pinyin id recorded when its
# character was chosen.
- for i in self.wordlist:
- i._pinyin_id = i.real_pinyin_id
# "string" is the text the user ended up with; "phrase_list" is that text cut
# into (start, length, phrase) triples.
- string = self.get_preedit ()
- phrase_list = self.split_phrase (string)
- string = self.learn_user_words(phrase_list, string, sentence)
- #~ print "out"
- #~ for i in phrase_list:
- #~ print i[1],i[2]
- #~ for i in predict:
- #~ print i[1],i[2]
# Neither user phrases nor frequency adjustment are wanted: nothing left to do.
- if not self.userphrase and not self.adjustfreq:
- return
# Walk both segmentations in step.  cur_phrase / cur_predict index the segment
# being examined; phrase_begin / predict_begin index the first segment since
# the two segmentations last ended at the same character offset.
- cur_phrase = 0
- cur_predict = 0
- phrase_begin = 0
- predict_begin = 0
- while cur_phrase < len(phrase_list):
# Advance the prediction until it ends at or after the end of the current
# user segment.
- while predict[cur_predict][0]+ predict[cur_predict][1] < phrase_list[cur_phrase][0] + phrase_list[cur_phrase][1]:
- cur_predict += 1
# The predicted segment overruns the user segment: take in the next user segment.
- if predict[cur_predict][0]+ predict[cur_predict][1] > phrase_list[cur_phrase][0] + phrase_list[cur_phrase][1]:
- cur_phrase += 1
- else:
# Both segmentations end at the same offset: compare the text of the two
# aligned spans and learn from a mismatch.
- #~ print string[phrase_list[phrase_begin][0]:phrase_list[cur_phrase][0]+phrase_list[cur_phrase][1]]
- #~ print sentence[predict[predict_begin][0]:predict[cur_predict][0]+predict[cur_predict][1]]
- if string[phrase_list[phrase_begin][0]:phrase_list[cur_phrase][0] + phrase_list[cur_phrase][1]]!=\
- sentence[predict[predict_begin][0]:predict[cur_predict][0] + predict[cur_predict][1]]:
- if cur_phrase - phrase_begin == 0:
- if cur_predict - predict_begin == 0:
# The phrase is joined with its left neighbour and with its right neighbour and
# both combinations are stored as user phrases (addphrase ignores
# out-of-range indexes).  These records are logged with type 1.
- self.addphrase(phrase_list, phrase_begin - 1, cur_phrase, USER_PHRASE)
- self.log_conv_error(sentence, string, phrase_list, phrase_begin - 1, cur_phrase, 1)
- self.addphrase(phrase_list, phrase_begin, cur_phrase + 1, USER_PHRASE)
- self.log_conv_error(sentence, string, phrase_list, phrase_begin, cur_phrase + 1, 1)
- self.adjust_freq (phrase_list, phrase_begin)
- else:
# The whole span phrase_begin..cur_phrase is stored as one user phrase and
# logged with type 2.
- self.addphrase (phrase_list, phrase_begin, cur_phrase, USER_PHRASE)
- self.log_conv_error(sentence, string, phrase_list, phrase_begin, cur_phrase, 2)
# Start a new aligned span after the common boundary.
- phrase_begin = cur_phrase + 1
- predict_begin = cur_predict + 1
- cur_phrase += 1
- Editor.database.clean_useless_phrase()
-
def freq_alg(self, phrase1, phrase2):
    """Score a two-phrase split: the sum of both phrases' length-weighted frequencies.

    A phrase's ADJ_FREQ is multiplied by 10 for a single character, by its
    length for lengths below four and by the square of its length otherwise.
    (The unreachable alternative formulas that followed the first return in
    the original have been removed.)
    """
    def weighted(row):
        size = len(row[PHRASE])
        if size == 1:
            factor = 10
        elif size < 4:
            factor = size
        else:
            factor = pow(size, 2)
        return row[ADJ_FREQ] * factor

    return weighted(phrase1) + weighted(phrase2)
- def get_predict_pinyinlist (self, pinyinlist):
# Predict a conversion for the pending pinyin: returns a list of candidate
# rows that together cover pinyinlist.
#  1. If the database has a phrase for exactly these syllables, return it alone.
#  2. Otherwise, if select_words_by_pinyin_list_all finds a phrase, return
#     a copy of the first one cut down to len(pinyinlist) characters.
#  3. Otherwise look for two phrases covering a prefix of the syllables, keep
#     the pair with the highest freq_alg score, and predict the rest recursively.
# NOTE(review): indentation was lost in this copy of the file, so the exact
# scope of "if predict: break" below cannot be read off the text; it is
# presumably at the level of the outer "for length" loop (stop at the longest
# prefix for which any split was found) - confirm.
# NOTE(review): if no split is found at all, predict stays empty and the
# recursive call at the end receives the same list again - this looks
# unbounded; confirm that the database always returns a row for a single
# syllable.
- #~ print "Dd", u" ".join( i.get_screen_pinyin() for i in pinyinlist), len(pinyinlist)
- candidates = Editor.database.select_words_by_pinyin_list (pinyinlist)
- if candidates:
- #~ print "phrase1",candidates[0][PHRASE],candidates[0][ADJ_FREQ]
- return [candidates[0]]
- else:
- candidates = Editor.database.select_words_by_pinyin_list_all(pinyinlist)
- if candidates:
- #~ print candidates[0][PHRASE]
# Work on a copy so that the row returned by the database is not modified.
- p = list (candidates[0]);
- p[YLEN] = len (pinyinlist)
- p[PHRASE] = p[PHRASE][:p[YLEN]]
- return [p]
- max_freq = 0
- predict = []
# length: number of leading syllables the two phrases must cover (longest first);
# i: where the first phrase ends and the second begins.
- for length in range(len (pinyinlist), 1, -1):
- for i in range (1, length):
- candidates = Editor.database.select_words_by_pinyin_list (pinyinlist[:i])
- if not candidates:
- continue
- candidates2 = Editor.database.select_words_by_pinyin_list(pinyinlist[i:length])
- if not candidates2:
# No exact second phrase: fall back to a truncated copy of a phrase found by
# select_words_by_pinyin_list_all.
- candidates2 = Editor.database.select_words_by_pinyin_list_all(pinyinlist[i:length])
- if candidates2:
- p = list (candidates2[0]);
- p[YLEN] = length - i
- p[PHRASE] = p[PHRASE][:p[YLEN]]
- tmp_phrase = candidates[0]
- tmp_phrase2 = p
- else:
- continue
- else:
- tmp_phrase = candidates[0]
- tmp_phrase2 = candidates2[0]
- new_freq = self.freq_alg(tmp_phrase, tmp_phrase2)
- #~ print tmp_phrase[PHRASE].encode ("utf-8"),tmp_phrase2[PHRASE].encode ("utf-8"), tmp_phrase[ADJ_FREQ],tmp_phrase2[ADJ_FREQ], new_freq
- #~ if tmp_phrase[ADJ_FREQ] + tmp_phrase2[ADJ_FREQ] >= max_freq:
# ">=" means a later split wins a tie.
- if new_freq >= max_freq:
- predict = [tmp_phrase, tmp_phrase2]
- max_freq = new_freq
- if predict:
- break
# The pair found covers only a prefix: predict the remaining syllables the same way.
- if self.predict_len(predict) < len (pinyinlist):
- #~ return
- #~ for i in range(1, predict[0][YLEN]):
- #~ candidates = Editor.database.select_words_by_pinyin_list(pinyinlist[:i])
- #~ if candidates and candidates[0][PHRASE] == predict[0][PHRASE][:i]:
- #~ print "try", i, candidates[0][PHRASE]
- #~ temp = self.get_predict_pinyinlist(pinyinlist[i:self.predict_len(predict)])
- #~ print "resule", temp[0][PHRASE] + temp[1][PHRASE]
- #~ print "match", predict[0][PHRASE][i:] + predict[1][PHRASE]
- #~ if predict[0][PHRASE][i:] + predict[1][PHRASE] == (temp[0][PHRASE] + temp[1][PHRASE]):
- #~ print "go", candidates[0][PHRASE]
- #~ return [candidates[0]] + temp
- return predict + self.get_predict_pinyinlist(pinyinlist[self.predict_len(predict):])
- else:
- return predict
def get_predict(self, pinyinlist):
    """Predict the leading phrase(s) for *pinyinlist*.

    Returns one row when the database has a phrase for exactly these
    syllables (or a longer phrase, which is returned as a truncated copy).
    Otherwise returns the two-phrase split [head, tail] that reaches
    furthest into the list, ties being decided by the freq_alg score.
    Returns an empty list for empty input or when no split is found (the
    original left ``predict`` unbound in the latter case).
    """
    if not pinyinlist:
        return []
    database = Editor.database
    exact = database.select_words_by_pinyin_list(pinyinlist)
    if exact:
        return [exact[0]]
    longer = database.select_words_by_pinyin_list_all(pinyinlist)
    if longer:
        # Copy before truncating so that the database row is not modified.
        row = list(longer[0])
        row[YLEN] = len(pinyinlist)
        row[PHRASE] = row[PHRASE][:row[YLEN]]
        return [row]
    predict = []
    max_freq = 0
    max_length = 0
    for i in range(1, len(pinyinlist)):
        heads = database.select_words_by_pinyin_list(pinyinlist[:i])
        if not heads:
            continue
        head = heads[0]
        longest2 = database.get_longest_phrase_length(pinyinlist[i:])
        # p is the index of the last syllable covered by the second phrase.
        for p in range(i + longest2, i - 1, -1):
            if p < max_length:
                continue
            tails = database.select_words_by_pinyin_list(pinyinlist[i:p + 1])
            if not tails:
                continue
            tail = tails[0]
            new_freq = self.freq_alg(head, tail)
            if p > max_length or (new_freq >= max_freq and p == max_length):
                predict = [head, tail]
                max_freq = new_freq
                max_length = p
    return predict
def reparse(self, start):
    """Re-predict the characters of ``wordlist`` from index *start* on.

    Works phrase by phrase and stops at the first manually chosen word.
    """
    if start == len(self.wordlist):
        return
    phrase = self.get_predict(self.wordlist[start:])[0][PHRASE]
    for offset, char in enumerate(phrase):
        word = self.wordlist[start + offset]
        if word.manual:
            return
        word.set_char(char)
    self.reparse(start + len(phrase))
def wordlist_manual_select(self, candidate):
    """Replace the converted words at the cursor with the characters of *candidate*.

    The words are marked as manually chosen, the cursor moves past them,
    the words after the cursor are re-predicted and the editor state is
    refreshed.
    """
    phrase = candidate[PHRASE]
    for offset, char in enumerate(phrase):
        word = self.wordlist[self.cursor + offset]
        if offset < 4:
            # The first four pinyin ids are stored as columns of the row.
            word.real_pinyin_id = candidate[offset + 1]
        else:
            # Further syllables are stored as "'"-joined text in column YX.
            word.real_pinyin_id = PINYIN_DICT[candidate[YX].split("'")[offset - 4]]
        if self.enable_mohuyin:
            word.mohuyin()
        word.set_char(char)
        word.manual = True
    self.cursor += len(phrase)
    if self.cursor < len(self.wordlist):
        self.reparse(self.cursor)
    self.update()
def commit(self):
    """Convert any pending pinyin, learn from the result, clear the editor and return the text."""
    if self.pinyinlist:
        self.convert_all()
    text = self.get_preedit()
    self.learn()
    self.clear()
    return text
def del_current(self):
    """Delete the converted word before the cursor.

    Raises InputException while pinyin is pending, or when the cursor is at
    position 0 of a non-empty word list.
    """
    if self.pinyinlist:
        raise InputException()
    if self.cursor > 0:
        self.cursor -= 1
        del self.wordlist[self.cursor]
        if not self.wordlist:
            self.clear()
        self.reparse_backtrace()
        self.update()
        return
    if self.wordlist and self.cursor == 0:
        raise InputException()
-
def del_next(self):
    """Delete the converted word after the cursor.

    Raises InputException while pinyin is pending or when the cursor is at
    the end of the word list.
    """
    at_end = self.cursor == len(self.wordlist)
    if self.pinyinlist or at_end:
        raise InputException()
    del self.wordlist[self.cursor]
    if not self.wordlist:
        self.clear()
    self.reparse_backtrace()
    self.update()
def move_cursor(self, move):
    """Move the word cursor by *move*, wrapping around at both ends.

    A forward move while a candidate cursor is set selects that candidate
    instead.  Raises InputException when the editor is empty, or when pinyin
    is pending and the move is backward or no candidate cursor is set.
    """
    if self.is_empty():
        raise InputException()
    has_candidate_cursor = self.candidate_cursor is not None
    if self.pinyinlist and (move < 0 or not has_candidate_cursor):
        raise InputException()
    if move > 0 and has_candidate_cursor:
        self.select_cursor()
    else:
        self.cursor += move
        limit = len(self.wordlist)
        if self.cursor < 0:
            self.cursor += limit + 1
        elif self.cursor > limit:
            self.cursor = 0
    self.update()
def move_cursor_to(self, pos):
    """Place the word cursor by 1-based position; 0 means after the last word.

    Pending pinyin is converted first.  Raises InputException when the
    editor is empty or *pos* is greater than len(wordlist) + 1.
    """
    if self.is_empty():
        raise InputException()
    if self.pinyinlist:
        self.convert_all()
    count = len(self.wordlist)
    if pos == 0:
        target = count
    elif pos > count + 1:
        raise InputException()
    else:
        target = pos - 1
    self.cursor = target
    self.update()
def select(self, n):
    """Choose the *n*-th candidate of the current lookup page.

    Raises InputException when *n* is beyond the current page.
    """
    table = self.lookup_table
    if n >= table.get_current_page_size():
        raise InputException()
    candidate = self.candidates[table.get_current_page_start() + n]
    if not self.pinyinlist:
        self.wordlist_manual_select(candidate)
        return
    self.pinyin_select(candidate, True)
def select_cursor(self):
    """Choose the candidate under the lookup-table cursor."""
    candidate = self.candidates[self.lookup_table.get_cursor_pos()]
    if not self.pinyinlist:
        self.wordlist_manual_select(candidate)
        return
    self.pinyin_select(candidate, True)
def recursive_mohuyin_pinyinlist(self, pinyinlist):
    """Yield every fuzzy variant of *pinyinlist* as a list of new PinYinWord objects.

    The variants of each syllable come from self.mohuyin(); the result is
    their cartesian product, with the variants of the first syllable
    changing slowest.
    """
    rest = pinyinlist[1:]
    for variant in self.mohuyin(pinyinlist[0].get_screen_pinyin()):
        if not rest:
            yield [PinYinWord(pinyin=variant)]
            continue
        for tail in self.recursive_mohuyin_pinyinlist(rest):
            yield [PinYinWord(pinyin=variant)] + tail
def recursive_mohuyin(self, strl):
    """Yield every fuzzy variant of the list of pinyin strings *strl*.

    Each result is a list of strings: the cartesian product of the variants
    self.mohuyin() produces for every element, with the variants of the
    first element changing slowest.
    """
    rest = strl[1:]
    for variant in self.mohuyin(strl[0]):
        if rest:
            # The original recursed into self.recursive_mohu, which does
            # not exist, so any list longer than one element failed.
            for tail in self.recursive_mohuyin(rest):
                yield [variant] + tail
        else:
            yield [variant]
-
def mohuyin(self, pinyin):
    """Yield the fuzzy variants of one pinyin syllable.

    The syllable is split into initial and final, each is replaced by its
    list of alternatives from ``shengmu_mohu`` / ``yunmu_mohu`` (or kept
    as is), and every combination that is a known initial or a complete
    pinyin is yielded.  When the syllable itself is a partial pinyin,
    combinations that are partial pinyin are accepted as well.  A leading
    apostrophe is preserved on every variant.
    """
    # Slice instead of indexing so that an empty string yields nothing.
    if pinyin[:1] == "'":
        spliter = "'"
        pinyin = pinyin[1:]
    else:
        spliter = ""
    if pinyin[:2] in SHENGMU_LIST:
        shengmu, yunmu = pinyin[:2], pinyin[2:]
    elif pinyin[:1] in SHENGMU_LIST:
        shengmu, yunmu = pinyin[:1], pinyin[1:]
    else:
        shengmu, yunmu = "", pinyin
    shengmu_choices = self.shengmu_mohu.get(shengmu, [shengmu])
    yunmu_choices = self.yunmu_mohu.get(yunmu, [yunmu])
    allow_partial = pinyin in PINYIN_PARTIAL_LIST
    # The original partial-pinyin branch iterated over the finals only and
    # used the initial's loop variable without defining it.
    for i in shengmu_choices:
        for q in yunmu_choices:
            combined = i + q
            if (combined in SHENGMU_LIST or combined in PINYIN_LIST
                    or (allow_partial and combined in PINYIN_PARTIAL_LIST)):
                yield spliter + combined
def parsr_mohuyin(self, pinyinlist):
    """Return the candidate rows for every prefix of *pinyinlist*.

    Without fuzzy pinyin the rows are concatenated, longest prefix first, in
    database order.  With fuzzy pinyin the rows for all fuzzy variants are
    merged without duplicates and ordered by syllable count, then by
    frequency, both descending (the ordering of candidate_sort).
    """
    database = Editor.database
    if not self.enable_mohuyin:
        candidates = []
        for length in range(len(pinyinlist), 0, -1):
            candidates += database.select_words_by_pinyin_list(pinyinlist[:length])
        return candidates
    # Built-in set instead of the deprecated sets.Set.
    unique = set()
    for variant in self.recursive_mohuyin_pinyinlist(pinyinlist):
        for length in range(len(variant), 0, -1):
            unique.update(database.select_words_by_pinyin_list(variant[:length]))
    # key/reverse gives the same order as sort(cmp=candidate_sort) and does
    # not depend on the cmp argument.
    return sorted(unique, key=lambda row: (row[YLEN], row[ADJ_FREQ]), reverse=True)
-
def update_candidates(self):
    """Recompute ``candidates`` for the pending pinyin, or for the words from the cursor on, and refill the lookup table."""
    if self.is_empty():
        found = []
    elif self.pinyinlist:
        found = self.parsr_mohuyin(self.pinyinlist)
    elif len(self.wordlist) > self.cursor:
        found = self.parsr_mohuyin(self.wordlist[self.cursor:])
    else:
        found = []
    self.candidates = found
    self.update_lookup_table()
def update_lookup_table(self):
    """Refill the lookup table from ``candidates``.

    Candidates whose frequency is non-zero and a multiple of USER_PHRASE or
    USER_WORD are appended with a foreground colour attribute.
    """
    table = self.lookup_table
    table.clear()
    table.show_cursor(False)
    for row in self.candidates:
        freq, phrase = row[ADJ_FREQ], row[PHRASE]
        if freq == 0 or (freq % USER_PHRASE and freq % USER_WORD):
            table.append_candidate(phrase)
            continue
        attrs = [scim.Attribute(0, len(phrase), scim.ATTR_FOREGROUND, RGB(0, 0, 0xef))]
        table.append_candidate(phrase, attrs)
- class Engine (IMEngine):
def __init__(self, factory, config, encoding, id):
    """Set up the engine: its editor, lookup table, panel properties, options and log."""
    IMEngine.__init__(self, factory, config, encoding, id)
    self._editor = Editor()
    self._lookup_table = scim.LookupTable(9)
    # Panel properties: the CN/EN mode indicator and the setup button.
    self._status_property = Property("chinese", "CN")
    self._setup_property = Property("setup", "", "/usr/share/scim/icons/setup.png")
    self._chinese_mode = True
    self.reload_config(config)
    self._log = scim.Log.Log("ZhengJu")
def clear(self):
    """Do nothing; called by reset()."""
    return None
- def reset(self):
# Return the engine to its idle state: commit already converted text (or
# discard the editor's pending input), forget per-session key state and
# re-register the panel properties.  Called from focus_in() and focus_out().
- #~ print "reset"
# Converted text is committed to the client rather than thrown away.
- if self._editor.wordlist:
- self.commit_string (self._editor.commit())
- else:
- self._editor.clear()
# NOTE(review): indentation was lost in this copy of the file; self.clear()
# presumably runs unconditionally rather than only in the else branch -
# confirm against a pristine copy.
- self.clear()
# State used by process_key_event: text read back from the helper process,
# quotation toggles, the previous key, the pending Shift press and the
# helper process itself.
- self.origin_string = None
- self._double_quotation_state = False
- self._single_quotation_state = False
- self._prev_key = None
- self._shift_key = None
- self.pipe = None
- self.update ()
- props = [self._status_property, self._setup_property]
- self.register_properties (props)
- self.update_properties ()
- def update_preedit (self):
# Show the editor's preedit text followed by get_extra_string() in the client,
# or hide the preedit area when that text is empty.
- string = self._editor.get_preedit () + self.get_extra_string()
- if (string == u""):
- self.hide_preedit_string ()
- else:
- self.show_preedit_string ()
# NOTE(review): indentation was lost in this copy of the file; it cannot be
# told from here whether the two calls below belong to the else branch or
# run for an empty string as well - confirm against a pristine copy.
- self.update_preedit_string (string , [])
- self.update_preedit_caret (self._editor.get_screen_cursor())
def focus_out(self):
    """Reset the engine, then let the base class handle the loss of focus."""
    self.reset()
    IMEngine.focus_out(self)
-
def focus_in(self):
    """Reset the engine, then let the base class handle the gain of focus."""
    self.reset()
    IMEngine.focus_in(self)
-
def trigger_property(self, property):
    """React to a click on a panel property: toggle the mode or start the setup helper."""
    if property == "chinese":
        self.change_mode()
        return
    if property == "setup":
        self.start_helper("61af6de6-c29d-421e-9e1b-e34a29c68c76")
def update_candidate(self):
    """Show the editor's lookup table when it has candidates, hide it otherwise."""
    if not self._editor.candidates:
        self.hide_lookup_table()
        return
    self.update_lookup_table(self._editor.lookup_table)
    self.show_lookup_table()
def update_aux(self):
    """Show the editor's prediction in the auxiliary string, or hide and empty it when there is none."""
    if not self._editor.predict:
        self.hide_aux_string()
        self.update_aux_string(u"")
        return
    self.show_aux_string()
    text = self._editor.get_aux()
    highlight = scim.Attribute(0, len(text), scim.ATTR_FOREGROUND, RGB(0, 0, 0xef))
    self.update_aux_string(text, [highlight])
def update(self):
    """Refresh the preedit string, the auxiliary string and the lookup table, in that order."""
    self.update_preedit()
    self.update_aux()
    self.update_candidate()
def update_properties(self):
    """Set the status property's label to the translated "CN" or "EN" and publish it."""
    label = _("CN") if self._chinese_mode else _("EN")
    self._status_property.label = label
    self.update_property(self._status_property)
def change_mode(self):
    """Toggle between Chinese and English mode.

    When leaving Chinese mode the editor's text is committed first.
    """
    leaving_chinese = self._chinese_mode
    if leaving_chinese:
        self.commit_string(self._editor.commit())
        self.update()
    self._chinese_mode = not leaving_chinese
    self.update_properties()
def reload_config(self, config):
    """Reload the editor's options and the engine's own ProgressivePrompt option from *config*."""
    self._editor.load_config(config)
    key = "/IMEngine/Python/ZhengJu/ProgressivePrompt"
    self.progresivepromp = config.read(key, False)
def lookup_table_page_down(self):
    """Turn the editor's lookup table to the next page and refresh the display; always returns True."""
    table = self._editor.lookup_table
    table.page_down()
    self.update()
    return True
def lookup_table_page_up(self):
    """Turn the editor's lookup table to the previous page and refresh the display; always returns True."""
    table = self._editor.lookup_table
    table.page_up()
    self.update()
    return True
-
- def process_key_event (self, key):
# Entry point for every key event.  In order: handle the mode switches
# (Shift tap, Shift+letter, CapsLock), collect the output of a pending helper
# process (self.pipe), ignore key releases, then dispatch the key to
# chinese_process_key_event or english_process_key_event.
# Returns True when the key was consumed, False to pass it on to the client.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# implied by the comments below is inferred and should be confirmed against
# a pristine copy.
- #~ print key.code
# Chinese mode with nothing left to edit and no extra string.
- if self._chinese_mode and self._editor.is_end() and not self.get_extra_string():
- if (key.code == KeyCode.KEY_Shift_L or key.code == KeyCode.KEY_Shift_R) \
- and key.mask & KeyMask.ReleaseMask:
# Shift released right after a bare Shift press (see _shift_key below): toggle mode.
- if self._shift_key:
- self.change_mode()
- return True
# Shift + A..Z, or any key while CapsLock is on, also changes the mode.
- if key.mask == KeyMask.ShiftMask and (key.code >= KeyCode.KEY_A and key.code <= KeyCode.KEY_Z) \
- or key.mask & KeyMask.CapsLockMask:
- self.change_mode()
- elif not self.get_extra_string():
- if (key.code == KeyCode.KEY_Shift_L or key.code == KeyCode.KEY_Shift_R)\
- and key.mask & KeyMask.ReleaseMask:
- if self._shift_key:
- self.change_mode()
- return True
# Remember whether this event is a Shift press with no other modifier, so that
# the matching release can be recognised as a Shift tap by the checks above.
- if (key.code == KeyCode.KEY_Shift_L or key.code == KeyCode.KEY_Shift_R) \
- and key.mask == KeyMask.NullMask:
- self._shift_key = True;
- else:
- self._shift_key = None;
# A helper process is pending and poll() no longer returns -1 (presumably
# popen2's "still running" value - confirm): read its output, drop the last
# character, and load the text into the editor through reverse().
- if self.pipe and self.pipe.poll() != -1:
- try:
- self.origin_string = unicode(self.pipe.fromchild.read()[:-1],"utf8")
- self._editor.reverse(self.origin_string)
- self._editor.move_cursor_to (1)
# NOTE(review): bare except - it also catches KeyboardInterrupt and SystemExit.
- except:
- self._editor.clear()
- self.beep ()
- else:
- self.commit_string(u"")
- self.update()
- finally:
- self.pipe = None
- return True
# Key releases are not handled beyond this point.
- if key.mask & KeyMask.ReleaseMask:
- return False
- try:
- if self._chinese_mode:
- result = self.chinese_process_key_event (key)
- else:
- result = self.english_process_key_event (key)
- self._prev_key = key
# The editor rejected the key: beep and report the key as consumed.
- except InputException, e:
- self.beep ()
- return True
# Unexpected failure: beep, log the traceback and the editor state, and still
# report the key as consumed.
- except Exception, e:
- self.beep ()
- self._log.print_exc()
- self._log.log("DEBUG", self._editor.cursor)
- self._log.log("DEBUG", [i.char.encode("utf-8") for i in self._editor.wordlist] )
- self._log.log("DEBUG", [i.get_screen_pinyin() for i in self._editor.pinyinlist] )
- self._log.log("DEBUG", self._editor.get_preedit().encode ("utf-8"))
- return True
- else:
- return result
# Refresh the display whatever the outcome.
- finally:
- self.update()
def english_process_key_event(self, key):
    """Handle a key press in English mode.

    Nothing is handled here; every key is reported as unprocessed
    by returning False.
    """
    return False
def chinese_process_key_event(self, key):
    """Handle one key press while the engine is in Chinese mode.

    Returns True when the key was handled here and False when it was not
    (only possible while the editor is empty and there is no extra string).

    Raises InputException for keys that cannot be applied in the current
    state, including every key not recognised by this method.
    """
    code = key.code
    mask = key.mask

    if self._editor.is_empty() and not self.get_extra_string():
        # Nothing is being composed: only punctuation and Ctrl+R act here.
        if code <= 127 and ascii.ispunct(chr(code)):
            self.commit_string(self.convert_to_full_width(unichr(code)))
            return True
        elif code == KeyCode.KEY_r and mask == KeyMask.ControlMask:
            # Fetch the PRIMARY selection in a helper process; its output
            # is consumed by process_key_event once the helper has exited.
            if not self.pipe:
                self.pipe = popen2.Popen3("python -c" +'"import gtk; print gtk.clipboard_get(selection=\\"PRIMARY\\").wait_for_text()"')
            return True
        else:
            return False

    if code in (KeyCode.KEY_Control_L, KeyCode.KEY_Control_R,
                KeyCode.KEY_Alt_L, KeyCode.KEY_Alt_R):
        # Bare modifier presses are swallowed while composing.
        return True
    elif code in (KeyCode.KEY_KP_Space, KeyCode.KEY_space):
        if self._editor.candidates and self._editor.lookup_table.is_cursor_visible():
            self._editor.select_cursor()
            return True
        elif self._editor.pinyinlist:
            self._editor.convert_all()
            return True
        elif self._editor.cursor < len(self._editor.wordlist):
            self._editor.jump_to_next_word()
            return True
        else:
            self.commit_string(self._editor.commit())
            return True
    elif code == KeyCode.KEY_BackSpace:
        if not self._editor.pinyinlist and self.get_extra_string():
            raise InputException()
        self._editor.del_current()
        return True
    elif code == KeyCode.KEY_Delete:
        if self._editor.lookup_table.is_cursor_visible():
            self._editor.delete_cursor_phrase()
        else:
            self._editor.del_next()
        return True
    elif KeyCode.KEY_1 <= code <= KeyCode.KEY_9 and mask & KeyMask.ControlMask:
        # Ctrl+1..9 deletes the phrase at that candidate position. The range
        # used to start at KEY_0, so Ctrl+0 produced index -1; it now starts
        # at KEY_1, matching the plain-digit select branch below.
        self._editor.delete_phrase(code - KeyCode.KEY_1)
        return True
    elif KeyCode.KEY_0 <= code <= KeyCode.KEY_9 and mask & KeyMask.AltMask:
        self._editor.move_cursor_to(code - KeyCode.KEY_0)
        return True
    elif KeyCode.KEY_1 <= code <= KeyCode.KEY_9:
        self._editor.select(code - KeyCode.KEY_1)
        return True
    elif KeyCode.KEY_KP_1 <= code <= KeyCode.KEY_KP_9:
        self._editor.select(code - KeyCode.KEY_KP_1)
        return True
    elif code == KeyCode.KEY_Shift_L:
        # Left Shift picks the first candidate. Clearing _shift_key stops
        # the release of this Shift from toggling the input mode.
        if not self._editor.is_end():
            self._editor.select(0)
            self._shift_key = None
        return True
    elif code == KeyCode.KEY_Shift_R:
        # Right Shift picks the second candidate.
        if not self._editor.is_end():
            self._editor.select(1)
            self._shift_key = None
        return True
    elif code in (KeyCode.KEY_equal, KeyCode.KEY_bracketright, KeyCode.KEY_Page_Down):
        if self._editor.candidates:
            self._editor.lookup_table.page_down()
            return True
        else:
            raise InputException()
    elif code in (KeyCode.KEY_minus, KeyCode.KEY_bracketleft, KeyCode.KEY_Page_Up):
        if self._editor.candidates:
            self._editor.lookup_table.page_up()
            return True
        else:
            raise InputException()
    elif code == KeyCode.KEY_Up:
        if self._editor.candidates:
            self._editor.lookup_table.cursor_up()
            self._editor.lookup_table.show_cursor(True)
            return True
        else:
            raise InputException()
    elif code == KeyCode.KEY_Down:
        if self._editor.candidates:
            self._editor.lookup_table.cursor_down()
            self._editor.lookup_table.show_cursor(True)
            return True
        else:
            raise InputException()
    elif code == KeyCode.KEY_Left or (code == KeyCode.KEY_b and mask & KeyMask.ControlMask):
        self._editor.move_cursor(-1)
        return True
    elif code == KeyCode.KEY_Right or (code == KeyCode.KEY_f and mask & KeyMask.ControlMask):
        if self.get_extra_string():
            raise InputException()
        self._editor.move_cursor(1)
        return True
    elif (code == KeyCode.KEY_h and mask & KeyMask.ControlMask) or code == KeyCode.KEY_Home:
        if self.get_extra_string():
            raise InputException()
        self._editor.move_cursor_to(1)
        return True
    elif (code == KeyCode.KEY_e and mask & KeyMask.ControlMask) or code == KeyCode.KEY_End:
        if self.get_extra_string():
            raise InputException()
        self._editor.move_cursor_to(0)
        return True
    elif code in (KeyCode.KEY_Return, KeyCode.KEY_KP_Enter):
        self.commit_string(self._editor.commit() + self.get_extra_string())
        self.clear()
        return True
    elif code == KeyCode.KEY_Escape or (code == KeyCode.KEY_c and mask & KeyMask.ControlMask):
        if self.origin_string:
            # Cancel a clipboard import: put the original text back.
            self.commit_string(self.origin_string)
            self._editor.clear()
            self.origin_string = None
        elif self._editor.lookup_table.is_cursor_visible():
            # First Escape only hides the candidate cursor.
            self._editor.lookup_table.show_cursor(False)
            self._editor.update()
        else:
            self.clear()
            self._editor.clear()
        return True
    elif code <= 127 and ascii.ispunct(chr(code)) and not self.get_extra_string():
        # Punctuation commits the pending text, then the full-width mark.
        if not self._editor.is_empty():
            self.commit_string(self._editor.commit())
        self.commit_string(self.convert_to_full_width(unichr(code)))
        return True
    else:
        raise InputException()
def convert_to_full_width(self, c):
    """Map an ASCII punctuation character to its Chinese counterpart.

    Context-sensitive cases:
      * "." stays an ASCII period when the previously handled key was a
        digit 0-9, otherwise it becomes the ideographic full stop.
      * Double and single quotes alternate between opening and closing
        forms; each call flips the corresponding state flag.

    Characters with no special mapping are passed to
    scim.unichar_half_to_full.
    """
    if c == u".":
        prev = self._prev_key
        after_digit = prev and KeyCode.KEY_0 <= prev.code <= KeyCode.KEY_9
        return u"." if after_digit else u"\u3002"

    if c == u"\"":
        self._double_quotation_state = not self._double_quotation_state
        return u"\u201c" if self._double_quotation_state else u"\u201d"

    if c == u"'":
        self._single_quotation_state = not self._single_quotation_state
        return u"\u2018" if self._single_quotation_state else u"\u2019"

    # Stateless one-to-one (or one-to-two) replacements.
    fixed = {
        u"\\": u"\u3001",
        u"^": u"\u2026\u2026",
        u"_": u"\u2014\u2014",
        u"$": u"\uffe5",
        u"<": u"\u300a",
        u">": u"\u300b",
    }
    if c in fixed:
        return fixed[c]
    return scim.unichar_half_to_full(c)
class ZhengJuFactory(IMEngineFactory):
    """Factory that builds ZhengJu engines for the configured pinyin schema."""

    def __init__(self, config):
        """Set the factory metadata and remember `config`."""
        IMEngineFactory.__init__(self, config)
        self.name = _(u"ZhengJu")
        self.uuid = "59e29ad8-3c95-4cd0-b02f-e21bf1317f7a"
        self.authors = u"Yu Fan <yufanyufan@gmail.com>"
        self.icon_file = "/usr/share/scim/icons/scim-python.png"
        self.credits = u"GPL"
        self.help = _(u"Help For ZhengJu")
        self.set_languages("zh")
        self._config = config

    def create_instance(self, encoding, id):
        """Create an engine for the schema named in the configuration.

        "QuanPin" and "ShuangPin" select their engines; "JianPin" (the
        default) and any unrecognised value select the JianPin engine.
        The schema module is imported only when it is needed.
        """
        schema = self._config.read("/IMEngine/Python/ZhengJu/PinYinSchema", "JianPin")
        if schema == "QuanPin":
            import QuanPin
            engine_class = QuanPin.QuanPinEngine
        elif schema == "ShuangPin":
            import ShuangPin
            engine_class = ShuangPin.ShuangPinEngine
        else:
            import JianPin
            engine_class = JianPin.JianPinEngine
        return engine_class(self, self._config, encoding, id)

    def reload_config(self, config):
        """Replace the stored configuration used for later instances."""
        self._config = config
-
class PseudoConfig:
    """Stand-in configuration object that stores nothing."""

    def read(self, string, default):
        """Ignore the key `string` and hand back `default` unchanged."""
        return default
def train(file_name):
    """Train a fresh Editor on the text of a UTF-8 encoded file.

    Each line is split on digits, ASCII letters and non-word characters;
    every non-empty fragment is passed to editor.reverse() and then
    editor.learn().

    If a fragment fails, the file name, the fragment and the traceback are
    printed and the original exception is re-raised.
    """
    import re

    print("Training by " + file_name)
    editor = Editor()
    # Same pattern as the former ur"[\da-zA-Z\W]" literal, written so that
    # it also parses on interpreters without the ur"" prefix.
    splitter = re.compile(u"[\\da-zA-Z\\W]", re.UNICODE)

    source = open(file_name)
    try:
        for raw_line in source:
            # Strip only the line terminator; slicing off the last character
            # lost real text on a final line with no trailing newline.
            line = unicode(raw_line, "utf8").rstrip(u"\r\n")
            for fragment in splitter.split(line):
                if not fragment:
                    continue
                try:
                    editor.reverse(fragment)
                    editor.learn()
                except Exception:
                    # This used to print the builtin `file` type rather than
                    # anything identifying the failure.
                    print("%s: failed to learn %r" % (file_name, fragment))
                    traceback.print_exc()
                    raise
    finally:
        source.close()
def print_usage():
    """Print the two-line command-line help to standard output."""
    usage_lines = (
        "ZhengJu -f FILE\tRead Sentenc from file",
        "ZhengJu \tConvert parameter to pinyin",
    )
    print("\n".join(usage_lines))
if __name__ == "__main__":
    # Command-line entry point:
    #   ZhengJu -f FILE   train on the text in FILE
    #   ZhengJu TEXT      print the pinyin and character for each word of TEXT
    # Exit status is 0 on success and 1 on bad arguments or any failure.
    import sys

    status = 0
    try:
        if len(sys.argv) == 3 and sys.argv[1] == "-f":
            train(sys.argv[2])
        elif len(sys.argv) == 2:
            # The editor is only needed on this path; train() builds its own.
            editor = Editor()
            try:
                editor.reverse(unicode(sys.argv[1], "utf8"))
                print(u" ".join(
                    u"%s %s" % (ID_PINYIN_DICT[word.get_pinyin_id()], word.char)
                    for word in editor.wordlist))
            except Exception:
                print("Can't convert this to pinyin")
                # Re-raise the real error so the traceback shows the cause
                # instead of an empty Exception().
                raise
        else:
            # Wrong arguments: show the usage text, without a traceback.
            print_usage()
            status = 1
    except Exception:
        traceback.print_exc()
        print_usage()
        status = 1
    sys.exit(status)