/python/engine/PinYin/PYParser.py
http://scim-python.googlecode.com/ · Python · 157 lines · 110 code · 21 blank · 26 comment · 35 complexity · bf14f191e2e2375441f023cce7ed20dd MD5 · raw file
- # -*- coding: utf-8 -*-
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- import sys
- import PYUtil
- import PYDict
class PinYinParser:
    """Split a run of pinyin letters into a list of PYUtil.PinYinWord.

    Matching is greedy and runs from the END of the input towards its
    start: at each step the longest tail (at most 6 characters) that is a
    known syllable is consumed.

    Class attributes (shared by every instance):
      pinyin_dict        -- all keys of PYDict.PINYIN_DICT and
                            PYDict.SHENGMU_DICT.
      gb2312_pinyin_dict -- pinyin_dict without "eng", "chua", "fe",
                            "fiao" and "liong"; used when gbk is false.
      correct_yunmu      -- maps a yunmu (syllable ending) to the
                            alternative spellings accepted for it.
      correct_table      -- maps an alternative spelling of a whole
                            syllable to the PINYIN_DICT key it stands
                            for; filled once by init_corrent_table().
    """

    # Set union instead of list concatenation: it does not depend on
    # keys() returning a list.
    pinyin_dict = set(PYDict.PINYIN_DICT.keys()) | set(PYDict.SHENGMU_DICT.keys())
    gb2312_pinyin_dict = pinyin_dict - set(["eng", "chua", "fe", "fiao", "liong"])
    correct_yunmu = {
        "ing": ("ign", "img"), "ui": ("uei", "iu", "i"),
        "un": ("uen",), "iu": ("iou", "ui"),
        "ao": ("au",), "ei": ("i",),
        "iao": ("ioa", "ia", "i"), "ian": ("ia", "i"), "iang": ("ian", "ia", "i")}
    correct_table = {}

    def __init__(self):
        self.init_corrent_table()

    def init_corrent_table(self):
        """Fill the class-wide correct_table, unless it is already filled.

        For every key of PYDict.PINYIN_DICT, its last three and then its
        last two characters are looked up in correct_yunmu. Each listed
        alternative ending is substituted, and the resulting spelling is
        recorded as a correction for the key unless that spelling is
        itself a PINYIN_DICT key. When two keys yield the same spelling,
        the one processed later wins.

        (The method name keeps its historical spelling so existing
        callers continue to work.)
        """
        table = PinYinParser.correct_table
        if table:
            return
        for key in PYDict.PINYIN_DICT:
            # Three-character ending first, then two, so that the order
            # of writes into the table is unchanged.
            for width in (3, 2):
                ending = key[-width:]
                if ending not in PinYinParser.correct_yunmu:
                    continue
                stem = key[:-width]
                for yunmu in PinYinParser.correct_yunmu[ending]:
                    pinyin = stem + yunmu
                    if pinyin not in PYDict.PINYIN_DICT:
                        table[pinyin] = key

    def parse_recursive(self, string, auto_correct=True, gbk=True):
        """Parse one apostrophe-free run of letters into PinYinWord objects.

        string       -- text to split; an empty string gives [].
        auto_correct -- when true, a tail that is not a valid syllable is
                        also looked up in correct_table and the corrected
                        syllable is used if it is valid.
        gbk          -- true: validate against pinyin_dict;
                        false: validate against gb2312_pinyin_dict.

        Returns the words in input order. Raises Exception when no tail
        of the still-unparsed prefix can be matched; the message quotes
        that prefix.

        The name is historical: the work is done in a loop, so the length
        of the input is not limited by the interpreter's recursion limit.
        """
        if gbk:
            valid = self.pinyin_dict
        else:
            valid = self.gb2312_pinyin_dict

        words = []      # collected last syllable first
        rest = string   # prefix that is still unparsed
        while rest:
            # 6 is the longest tail ever tried (presumably the length of
            # the longest syllable -- confirm against PYDict).
            for size in range(min(6, len(rest)), 0, -1):
                py = rest[-size:]
                if py not in valid and auto_correct:
                    # Unknown spellings map to themselves, so the test
                    # below simply fails for them.
                    py = PinYinParser.correct_table.get(py, py)
                if py in valid:
                    break
            else:
                raise Exception("can not parse '%s'" % rest.encode("utf-8"))
            words.append(PYUtil.PinYinWord(py))
            rest = rest[:-size]
        words.reverse()
        return words

    def parse(self, string, auto_correct=True, gbk=True):
        """Parse text in which "'" may separate syllables explicitly.

        The text is split on "'" and every piece is handed to
        parse_recursive(); the resulting words are concatenated in order.

        On failure the traceback is printed to stderr and the original
        exception is re-raised unchanged.
        """
        try:
            pys = []
            for py in string.split(u"'"):
                pys += self.parse_recursive(py, auto_correct, gbk)
            return pys
        except Exception:
            import traceback
            traceback.print_exc()
            # Bare raise keeps the original traceback; "raise e" would
            # restart it at this line on Python 2.
            raise
class ShuangPinParser:
    """Decode shuangpin key sequences into PYUtil.PinYinWord objects.

    Keys are consumed two at a time: the first key is looked up in the
    schema's shengmu table and the second in its yunmu table. The yunmu
    entry is an iterable of candidate endings; the first candidate that
    forms a member of PinYinParser.pinyin_dict together with the shengmu
    is used.
    """

    def __init__(self, schema="MSPY"):
        """Load the two key tables of *schema* from PYDict.SHUANGPIN_SCHEMAS."""
        self._gbk = True
        self._schema = schema
        self._shengmu_dict, self._yunmu_dict = PYDict.SHUANGPIN_SCHEMAS[self._schema]

    def _shengmu_for(self, key):
        """Return the shengmu for *key*; the table's "'" entry means none.

        Raises KeyError when the key is not in the shengmu table.
        """
        shengmu = self._shengmu_dict[key]
        if shengmu == "'":
            return ""
        return shengmu

    def parse_shuangpin_recursive(self, pys, string, auto_correct=True, gbk=True):
        """Decode *string*, appending one word per complete key pair to *pys*.

        pys          -- list that is extended IN PLACE with the word of
                        every complete two-key pair, in input order.
        string       -- the shuangpin keys to decode.
        auto_correct -- accepted for signature parity with
        gbk             PinYinParser; neither flag is used here.

        Returns only what is NOT already in *pys*: an empty list when the
        input has even length, or a one-element list holding the word for
        the lone trailing shengmu key when the length is odd. Callers
        therefore have to combine *pys* with the return value, as parse()
        does.

        Raises Exception when a key is missing from its table or when no
        candidate ending yields a known syllable; the message quotes the
        part of the input that was still unparsed. Words appended to
        *pys* before the failure stay there.

        The name is historical: the work is done in a loop, so the input
        length is not limited by the interpreter's recursion limit.
        """
        rest = string
        while len(rest) >= 2:
            try:
                shengmu = self._shengmu_for(rest[0])
                yunmu = self._yunmu_dict[rest[1]]
            except KeyError:
                # Only a failed table lookup means "unparsable"; anything
                # else (e.g. KeyboardInterrupt) must propagate untouched.
                raise Exception("can not parse '%s'" % rest.encode("utf-8"))

            for ending in yunmu:
                pinyin = shengmu + ending
                if pinyin in PinYinParser.pinyin_dict:
                    break
            else:
                raise Exception("can not parse '%s'" % rest.encode("utf-8"))

            pys.append(PYUtil.PinYinWord(pinyin))
            rest = rest[2:]

        if len(rest) == 0:
            return []

        # One key left over: it is decoded as a shengmu on its own.
        try:
            shengmu = self._shengmu_for(rest[0])
        except KeyError:
            raise Exception("can not parse '%s'" % rest.encode("utf-8"))
        return [PYUtil.PinYinWord(shengmu)]

    def parse(self, string, auto_correct=True, gbk=True):
        """Return the complete list of words decoded from *string*.

        Raises Exception (from parse_shuangpin_recursive) when the input
        cannot be decoded.
        """
        pys = []
        # The helper fills pys with the complete pairs and hands back the
        # trailing lone-key word (if any) separately.
        tail = self.parse_shuangpin_recursive(pys, string, auto_correct, gbk)
        pys.extend(tail)
        return pys
if __name__ == "__main__":
    # Command-line smoke test: decode the first argument and print the
    # resulting words joined by "'".
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <keys>\n" % sys.argv[0])
        sys.exit(1)
    # To exercise the full-pinyin parser instead, use PinYinParser() here.
    parser = ShuangPinParser()
    pys = parser.parse(sys.argv[1])
    # Single-argument call form: behaves the same as the print statement
    # on Python 2 and is also valid on Python 3.
    print("'".join(map(str, pys)))