/python/engine/PinYin/PYUtil.py
Python | 152 lines | 118 code | 9 blank | 25 comment | 16 complexity | cb6d73db2f7ec72a979c1bd4c7e6046e MD5 | raw file
- # -*- coding: utf-8 -*-
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- from PYDict import *
class PinYinWord:
    """A single pinyin syllable, or an incomplete prefix of one.

    The spelling is looked up in the tables star-imported from ``PYDict``:
    ``PINYIN_DICT`` (pinyin -> id), ``SHENGMU_DICT`` (shengmu/initial -> id)
    and ``ID_SHENGMU_DICT`` (id -> shengmu).

    A spelling found in ``PINYIN_DICT`` is treated as complete; anything
    else is treated as a bare shengmu and must be a key of
    ``SHENGMU_DICT``, otherwise the constructor raises ``KeyError``.
    """

    # Spellings that are rewritten before any table lookup is made.
    correct_dict = {"nve": "nue", "lve": "lue"}

    # Shengmu and yunmu that get_pattern() loosens when fuzzy matching is on.
    _FUZZY_SHENGMU = ("zh", "ch", "sh", "z", "c", "s")
    _FUZZY_YUNMU = ("ing", "in", "en", "eng", "an", "ang")

    def __init__(self, pinyin):
        """Build a word from *pinyin*.

        Raises:
            KeyError: if *pinyin* is neither a ``PINYIN_DICT`` key nor a
                ``SHENGMU_DICT`` key.
        """
        self._pinyin = self.correct_dict.get(pinyin, pinyin)
        self._is_completed = self.is_valid_pinyin()
        # Always define the attribute so get_pinyin_id() cannot fail with
        # AttributeError on an incomplete word.
        self._pinyin_id = None
        if self._is_completed:
            sheng_mu = self.split()[0]
            self._pinyin_id = PINYIN_DICT[self._pinyin]
            self._sheng_mu_id = SHENGMU_DICT[sheng_mu]
        else:
            self._sheng_mu_id = SHENGMU_DICT[self._pinyin]

    def is_valid_pinyin(self):
        """Return True when the spelling is a key of ``PINYIN_DICT``."""
        return self._pinyin in PINYIN_DICT

    def get_sheng_mu_id(self):
        """Return the ``SHENGMU_DICT`` id of this word's shengmu."""
        return self._sheng_mu_id

    def get_shengmu(self):
        """Return the shengmu string, resolved through ``ID_SHENGMU_DICT``."""
        return ID_SHENGMU_DICT[self._sheng_mu_id]

    def get_pinyin_id(self):
        """Return the ``PINYIN_DICT`` id, or None for an incomplete word."""
        return self._pinyin_id

    def get_pinyin(self):
        """Return the (possibly corrected) spelling."""
        return self._pinyin

    def get_pattern(self, mohu=False):
        """Return a match pattern for this word.

        ``%`` is appended as a wildcard marker (presumably for SQL ``LIKE``
        matching -- confirm against the callers).

        Args:
            mohu: when true, build a fuzzy pattern: the shengmu
                zh/ch/sh/z/c/s are reduced to their first letter plus
                ``%`` and the yunmu in/ing, en/eng, an/ang are reduced to
                their first two letters plus ``%``.  Any falsy value
                selects exact matching.
        """
        complete = self.is_valid_pinyin()

        if not mohu:
            if complete:
                return self._pinyin
            return self._pinyin + "%"

        if not complete:
            if self._pinyin in ("zh", "ch", "sh"):
                return self._pinyin[0] + "%"
            return self._pinyin + "%"

        shengmu = self.get_shengmu()
        # Take the yunmu before the shengmu is rewritten below: the slice
        # depends on the length of the unmodified shengmu.
        yunmu = self._pinyin[len(shengmu):]
        if shengmu in self._FUZZY_SHENGMU:
            shengmu = shengmu[0] + "%"
        if yunmu in self._FUZZY_YUNMU:
            yunmu = yunmu[0:2] + "%"
        return shengmu + yunmu

    def split(self):
        """Return ``(shengmu, yunmu)`` for a complete pinyin.

        A two-letter shengmu is preferred over a one-letter one; when
        neither prefix is a ``SHENGMU_DICT`` key the shengmu is ``""``.

        Raises:
            ValueError: if the spelling is not a key of ``PINYIN_DICT``.
        """
        if not self.is_valid_pinyin():
            raise ValueError(
                "Pinyin '%s' is not a valid pinyin!" % self._pinyin)
        for size in (2, 1):
            if self._pinyin[:size] in SHENGMU_DICT:
                return self._pinyin[:size], self._pinyin[size:]
        return "", self._pinyin[:]

    def __str__(self):
        return self._pinyin
class PinYinString:
    """Placeholder for a pinyin string; currently carries no state."""

    def __init__(self, string):
        """Accept *string* and ignore it; nothing is stored."""
def load_pinyin_table(_file):
    """Load a ``hanzi pinyin freq`` table into a nested dictionary.

    Args:
        _file: iterable of lines, either UTF-8 encoded bytes or already
            decoded text.  Each non-blank line holds exactly three
            whitespace-separated fields: hanzi, pinyin and an integer
            frequency.

    Returns:
        ``{hanzi: {pinyin: freq}}``; frequencies of repeated
        (hanzi, pinyin) pairs are summed.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            fields or its frequency is not an integer.
    """

    def pinyin_table_parser(f):
        for line in f:
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            hanzi, pinyin, freq = fields
            yield hanzi, pinyin, int(freq)

    hanzi_dic = {}
    for hanzi, pinyin, freq in pinyin_table_parser(_file):
        readings = hanzi_dic.setdefault(hanzi, {})
        readings[pinyin] = readings.get(pinyin, 0) + freq
    return hanzi_dic
def load_phrase_pinyin_freq(_file):
    """Load ``phrase pinyin freq`` lines, grouped by phrase.

    Every ``u:`` in the pinyin field is rewritten to ``v``.

    Args:
        _file: iterable of lines, either UTF-8 encoded bytes or already
            decoded text.  Each non-blank line holds exactly three
            whitespace-separated fields: phrase, pinyin and an integer
            frequency.

    Returns:
        ``{phrase: [(phrase, pinyin, freq), ...]}`` with entries kept in
        input order.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            fields or its frequency is not an integer.
    """

    def phrase_pinyin_parser(f):
        for line in f:
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            phrase, pinyin, freq = fields
            yield phrase, pinyin.replace(u"u:", u"v"), int(freq)

    phrases_dic = {}
    for entry in phrase_pinyin_parser(_file):
        phrases_dic.setdefault(entry[0], []).append(entry)
    return phrases_dic
def load_phrase_pinyin(_file):
    """Load ``phrase pinyin`` lines, grouped by phrase.

    Every ``u:`` in the pinyin field is rewritten to ``v``.  The input
    carries no frequency, so each entry is given a frequency of 0.

    Args:
        _file: iterable of lines, either UTF-8 encoded bytes or already
            decoded text.  Each non-blank line holds exactly two
            whitespace-separated fields: phrase and pinyin.

    Returns:
        ``{phrase: [(phrase, pinyin, 0), ...]}`` with entries kept in
        input order.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """

    def phrase_pinyin_parser(f):
        for line in f:
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            phrase, pinyin = fields
            yield phrase, pinyin.replace(u"u:", u"v"), 0

    phrases_dic = {}
    for entry in phrase_pinyin_parser(_file):
        phrases_dic.setdefault(entry[0], []).append(entry)
    return phrases_dic
def load_sogou_phrases(_file):
    """Load tab-separated ``phrase<TAB>count`` lines into a dictionary.

    Fields are separated by runs of one or more tabs; any field after the
    second is ignored.  When a phrase occurs more than once the last
    occurrence wins.

    Args:
        _file: iterable of lines, either UTF-8 encoded bytes or already
            decoded text.

    Returns:
        ``{phrase: (phrase, count)}`` with ``count`` converted to int.

    Raises:
        IndexError: if a non-blank line contains no tab-separated second
            field.
        ValueError: if the second field is not an integer.
    """
    import re

    # Compiled once; a plain raw string is valid on Python 2 and 3 alike.
    tab_run = re.compile(r"\t+")

    dic = {}
    for line in _file:
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        if not line.strip():
            # Blank line: nothing to parse.
            continue
        fields = tab_run.split(line)
        phrase = fields[0]
        dic[phrase] = (phrase, int(fields[1]))
    return dic