PageRenderTime 354ms CodeModel.GetById 161ms app.highlight 14ms RepoModel.GetById 177ms app.codeStats 0ms

/python/engine/PinYin/PYUtil.py

http://scim-python.googlecode.com/
Python | 152 lines | 118 code | 9 blank | 25 comment | 18 complexity | cb6d73db2f7ec72a979c1bd4c7e6046e MD5 | raw file
# -*- coding: utf-8 -*-
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#
 26from PYDict import *
 27
 28class PinYinWord:
 29	correct_dict = {"nve" : "nue", "lve" : "lue"}
 30	def __init__ (self, pinyin):
 31		if pinyin in self.correct_dict:
 32			pinyin = self.correct_dict [pinyin]
 33
 34		self._pinyin = pinyin
 35		self._is_completed = self.is_valid_pinyin ()
 36		if self._is_completed:
 37			sheng_mu, yun_mu = self.split ()
 38			self._pinyin_id = PINYIN_DICT [self._pinyin]
 39			self._sheng_mu_id = SHENGMU_DICT [sheng_mu]
 40		else:
 41			self._sheng_mu_id = SHENGMU_DICT [self._pinyin]
 42	
 43	def is_valid_pinyin (self):
 44		return PINYIN_DICT.has_key (self._pinyin)
 45
 46	def get_sheng_mu_id (self):
 47		return self._sheng_mu_id
 48
 49	def get_shengmu (self):
 50		return ID_SHENGMU_DICT[self._sheng_mu_id]
 51
 52	def get_pinyin_id (self):
 53		return self._pinyin_id
 54
 55	def get_pinyin (self):
 56		return self._pinyin
 57
 58	def get_pattern (self, mohu = False):
 59		if mohu == False:
 60			if self.is_valid_pinyin ():
 61				return self._pinyin
 62			else:
 63				return self._pinyin + "%"
 64		else:
 65			if not self.is_valid_pinyin ():
 66				if self._pinyin in ("zh", "ch", "sh"):
 67					return self._pinyin[0] + "%"
 68				return self._pinyin + "%"
 69			else:
 70				shengmu = self.get_shengmu ()
 71				yunmu = self._pinyin [len (shengmu):]
 72				if shengmu in ("zh", "ch", "sh", "z", "c", "s"):
 73					shengmu = shengmu[0] + "%"
 74				if yunmu in ("ing", "in", "en", "eng", "an", "ang"):
 75					yunmu = yunmu[0:2] + "%"
 76				return shengmu + yunmu
 77
 78	def split (self):
 79		if not self.is_valid_pinyin ():
 80			raise Exception ("Pinyin '%s' is not a valid pinyin!" % py)
 81		if self._pinyin[:2] in SHENGMU_DICT.keys ():
 82			return self._pinyin[:2], self._pinyin[2:]
 83		elif self._pinyin[:1] in SHENGMU_DICT.keys ():
 84			return self._pinyin[:1], self._pinyin[1:]
 85		else:
 86			return "", self._pinyin[:]
 87
 88	def __str__ (self):
 89		return self._pinyin
 90
 91class PinYinString:
 92	def __init__ (self, string):
 93		pass
 94
 95def load_pinyin_table (_file):
 96	
 97	def pinyin_table_parser (f):
 98		for l in f:
 99			a = unicode (l, "utf-8").strip ().split ()
100			hanzi, pinyin, freq = a 
101			yield (hanzi, pinyin, int (freq))
102	# db.add_phrases (pinyin_table_parser (bzf))
103	
104	hanzi_dic = {}
105	for hanzi, pinyin, freq in pinyin_table_parser (_file):
106		if not hanzi_dic.has_key (hanzi):
107			hanzi_dic[hanzi] = {}
108		
109		if hanzi_dic[hanzi].has_key (pinyin):
110			hanzi_dic[hanzi][pinyin] += freq
111		else:
112			hanzi_dic[hanzi][pinyin] = freq
113
114	return hanzi_dic
115
def load_phrase_pinyin_freq(_file):
	"""Group ``phrase pinyin freq`` records by phrase.

	Every non-blank line must contain exactly three whitespace-separated
	fields.  Each ``u:`` in the pinyin field is rewritten to ``v``.

	Args:
		_file: iterable of lines, either UTF-8 encoded bytes or text.

	Returns:
		dict mapping phrase -> list of ``(phrase, pinyin, freq)`` tuples in
		input order, with ``freq`` converted to int.

	Raises:
		ValueError: if a non-blank line does not have three fields or its
			frequency is not an integer.
	"""
	phrases_dic = {}
	for line in _file:
		# Decode only byte strings so text-mode files are accepted as well.
		if isinstance(line, bytes):
			line = line.decode("utf-8")
		fields = line.split()
		if not fields:
			# Skip blank lines instead of failing on the unpack below.
			continue
		phrase, pinyin, freq = fields
		pinyin = pinyin.replace(u"u:", u"v")
		phrases_dic.setdefault(phrase, []).append((phrase, pinyin, int(freq)))
	return phrases_dic
129
def load_phrase_pinyin(_file):
	"""Group ``phrase pinyin`` records by phrase, with a frequency of 0.

	Every non-blank line must contain exactly two whitespace-separated
	fields.  Each ``u:`` in the pinyin field is rewritten to ``v``.

	Args:
		_file: iterable of lines, either UTF-8 encoded bytes or text.

	Returns:
		dict mapping phrase -> list of ``(phrase, pinyin, 0)`` tuples in
		input order.

	Raises:
		ValueError: if a non-blank line does not have exactly two fields.
	"""
	phrases_dic = {}
	for line in _file:
		# Decode only byte strings so text-mode files are accepted as well.
		if isinstance(line, bytes):
			line = line.decode("utf-8")
		fields = line.split()
		if not fields:
			# Skip blank lines instead of failing on the unpack below.
			continue
		phrase, pinyin = fields
		pinyin = pinyin.replace(u"u:", u"v")
		phrases_dic.setdefault(phrase, []).append((phrase, pinyin, 0))
	return phrases_dic
143
def load_sogou_phrases(_file):
	"""Load tab-separated ``phrase<TAB>freq`` records into a dict.

	Fields are separated by one or more tab characters; only the first two
	fields of each line are used.  A later line with the same phrase
	overwrites the earlier entry.

	Args:
		_file: iterable of lines, either UTF-8 encoded bytes or text.

	Returns:
		dict mapping phrase -> ``(phrase, freq)`` with ``freq`` as int.

	Raises:
		IndexError: if a non-blank line contains no tab-separated second field.
		ValueError: if the second field is not an integer.
	"""
	import re
	# Compiled once; a plain raw string replaces the Python-2-only ur"" literal.
	tab_run = re.compile(r"\t+")
	dic = {}
	for line in _file:
		# Decode only byte strings so text-mode files are accepted as well.
		if isinstance(line, bytes):
			line = line.decode("utf-8")
		if not line.strip():
			# Blank line: there is no record to index into.
			continue
		fields = tab_run.split(line)
		dic[fields[0]] = (fields[0], int(fields[1]))
	return dic
152