PageRenderTime 73ms CodeModel.GetById 40ms app.highlight 16ms RepoModel.GetById 14ms app.codeStats 1ms

/python/engine/PinYin/PYParser.py

http://scim-python.googlecode.com/
Python | 157 lines | 121 code | 11 blank | 25 comment | 41 complexity | bf14f191e2e2375441f023cce7ed20dd MD5 | raw file
  1# -*- coding: utf-8 -*-
  2# vim: set noet ts=4:
  3#
  4# scim-python
  5#
  6# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  7#
  8#
  9# This library is free software; you can redistribute it and/or
 10# modify it under the terms of the GNU Lesser General Public
 11# License as published by the Free Software Foundation; either
 12# version 2 of the License, or (at your option) any later version.
 13#
 14# This library is distributed in the hope that it will be useful,
 15# but WITHOUT ANY WARRANTY; without even the implied warranty of
 16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17# GNU Lesser General Public License for more details.
 18#
 19# You should have received a copy of the GNU Lesser General Public
 20# License along with this program; if not, write to the
 21# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 22# Boston, MA  02111-1307  USA
 23#
 24# $Id: $
 25#
 26import sys
 27import PYUtil
 28import PYDict
 29
 30class PinYinParser:
 31	pinyin_dict = set (PYDict.PINYIN_DICT.keys () + PYDict.SHENGMU_DICT.keys ())
 32	gb2312_pinyin_dict = pinyin_dict - set (["eng", "chua", "fe", "fiao", "liong"])
 33	correct_yunmu = { 
 34		"ing" : ("ign", "img"), "ui" : ("uei", "iu", "i"), 
 35		"un" : ("uen",), "iu" : ("iou", "ui"),
 36		"ao" : ("au", ), "ei" : ("i", ),
 37		"iao" : ("ioa", "ia", "i"), "ian" : ("ia", "i"), "iang" : ("ian", "ia", "i")}
 38
 39	correct_table = {}
 40	
 41	def __init__ (self):
 42		self.init_corrent_table ()
 43
 44	def init_corrent_table (self):
 45		if PinYinParser.correct_table:
 46			return
 47		for key, id in PYDict.PINYIN_DICT.items ():
 48			if key[-3:] in PinYinParser.correct_yunmu:
 49				for yunmu in PinYinParser.correct_yunmu[key[-3:]]:
 50					pinyin = key[:-3] + yunmu
 51					if pinyin not in PYDict.PINYIN_DICT:
 52						PinYinParser.correct_table [pinyin] = key
 53			if key[-2:] in PinYinParser.correct_yunmu:
 54				for yunmu in PinYinParser.correct_yunmu[key[-2:]]:
 55					pinyin = key[:-2] + yunmu
 56					if pinyin not in PYDict.PINYIN_DICT:
 57						PinYinParser.correct_table [pinyin] = key
 58
 59	def parse_recursive (self, string, auto_correct = True, gbk = True):
 60		l = min (6, len (string))
 61		if l == 0:
 62			return []
 63		p = None
 64		for i in range (l, 0, -1):
 65			py = string[-i:]
 66			
 67			if gbk:
 68				if py in self.pinyin_dict:
 69					p = PYUtil.PinYinWord (py)
 70					break
 71			else:
 72				if py in self.gb2312_pinyin_dict:
 73					p = PYUtil.PinYinWord (py)
 74					break
 75			
 76			if p == None and auto_correct and py in PinYinParser.correct_table:
 77				py = PinYinParser.correct_table[py]
 78				if gbk:
 79					if py in self.pinyin_dict:
 80						p = PYUtil.PinYinWord (py)
 81						break
 82				else:
 83					if py in self.gb2312_pinyin_dict:
 84						p = PYUtil.PinYinWord (py)
 85						break
 86		if p == None:
 87			raise Exception ("can not parse '%s'" % string.encode ("utf-8"))
 88		pys = self.parse_recursive (string[:-i], auto_correct, gbk)
 89		pys.append (p)
 90		return pys
 91
 92	def parse (self, string, auto_correct = True, gbk = True):
 93		try:
 94			pys = []
 95			for py in string.split (u"'"):
 96				pys += self.parse_recursive (py, auto_correct, gbk)
 97			return pys
 98		except Exception, e:
 99			import traceback
100			traceback.print_exc ()
101			raise e
102
class ShuangPinParser:
	"""Parse double-pinyin (shuangpin) key sequences.

	Keys are consumed in pairs: the first key of a pair is looked up in the
	schema's shengmu table, the second in its yunmu table.
	"""

	def __init__ (self, schema = "MSPY"):
		"""Load the two key tables of `schema` from PYDict.SHUANGPIN_SCHEMAS.

		Raises KeyError if the schema name is unknown.
		"""
		self._gbk = True
		self._schema = schema
		self._shengmu_dict, self._yunmu_dict = PYDict.SHUANGPIN_SCHEMAS[self._schema]

	def _lookup_shengmu (self, string):
		"""Return the shengmu mapped to the first key of `string`.

		A table entry of "'" is returned as the empty string.  Raises
		Exception ("can not parse '...'") if the key is not in the table.
		"""
		try:
			shengmu = self._shengmu_dict[string[0]]
		except KeyError:
			raise Exception ("can not parse '%s'" % string.encode ("utf-8"))
		if shengmu == "'":
			return ""
		return shengmu

	def parse_shuangpin_recursive (self, pys, string, auto_correct = True, gbk = True):
		"""Parse `string`, APPENDING every complete key pair to `pys`.

		pys          -- list that receives one PYUtil.PinYinWord per pair;
		                it is modified in place.
		string       -- the key sequence.
		auto_correct -- accepted for interface compatibility; not used.
		gbk          -- accepted for interface compatibility; not used
		                (lookups always go through PinYinParser.pinyin_dict).

		Returns only what is NOT appended to `pys`: a one-element list
		holding the word for a trailing unpaired key, or [] when the length
		of `string` is even.  Callers are expected to extend `pys` with the
		return value, as parse () does.

		Raises Exception ("can not parse '...'") naming the unparsed rest
		when a key is missing from its table or no candidate yunmu gives a
		known syllable.  Pairs parsed before the failure stay in `pys`.

		The name is historical: the work is done in a loop.
		"""
		rest = string
		while len (rest) >= 2:
			shengmu = self._lookup_shengmu (rest)
			try:
				yunmu = self._yunmu_dict[rest[1]]
			except KeyError:
				raise Exception ("can not parse '%s'" % rest.encode ("utf-8"))

			# One key can stand for several yunmu; take the first that
			# forms a known syllable with this shengmu.
			p = None
			for candidate in yunmu:
				pinyin = shengmu + candidate
				if pinyin in PinYinParser.pinyin_dict:
					p = PYUtil.PinYinWord (pinyin)
					break

			if p is None:
				raise Exception ("can not parse '%s'" % rest.encode ("utf-8"))

			pys.append (p)
			rest = rest[2:]

		if rest:
			# Exactly one key is left: it is treated as a bare shengmu.
			return [PYUtil.PinYinWord (self._lookup_shengmu (rest))]
		return []

	def parse (self, string, auto_correct = True, gbk = True):
		"""Return the list of PYUtil.PinYinWord objects for `string`.

		See parse_shuangpin_recursive () for the parameters and the
		exception that is raised for unparsable input.
		"""
		pys = []
		# The helper fills pys with the complete pairs and returns the
		# trailing part (if any), which is appended here.
		pys += self.parse_shuangpin_recursive (pys, string, auto_correct, gbk)
		return pys
151
if __name__ == "__main__":
	# Command-line smoke test: parse the first argument and print the
	# resulting syllables joined by "'".
	if len (sys.argv) < 2:
		sys.stderr.write ("usage: %s STRING\n" % sys.argv[0])
		sys.exit (1)
	# Swap in PinYinParser () here to exercise the full-pinyin parser.
	parser = ShuangPinParser ()
	pys = parser.parse (sys.argv[1])
	# Parenthesised single argument: valid as a Python 2 print statement
	# and as a Python 3 print () call.
	print ("'".join (map (str, pys)))
157