/python/engine/PinYin/PYParser.py

http://scim-python.googlecode.com/ · Python · 157 lines · 110 code · 21 blank · 26 comment · 35 complexity · bf14f191e2e2375441f023cce7ed20dd MD5 · raw file

  1. # -*- coding: utf-8 -*-
  2. # vim: set noet ts=4:
  3. #
  4. # scim-python
  5. #
  6. # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  7. #
  8. #
  9. # This library is free software; you can redistribute it and/or
  10. # modify it under the terms of the GNU Lesser General Public
  11. # License as published by the Free Software Foundation; either
  12. # version 2 of the License, or (at your option) any later version.
  13. #
  14. # This library is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU Lesser General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Lesser General Public
  20. # License along with this program; if not, write to the
  21. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  22. # Boston, MA 02111-1307 USA
  23. #
  24. # $Id: $
  25. #
  26. import sys
  27. import PYUtil
  28. import PYDict
  29. class PinYinParser:
  30. pinyin_dict = set (PYDict.PINYIN_DICT.keys () + PYDict.SHENGMU_DICT.keys ())
  31. gb2312_pinyin_dict = pinyin_dict - set (["eng", "chua", "fe", "fiao", "liong"])
  32. correct_yunmu = {
  33. "ing" : ("ign", "img"), "ui" : ("uei", "iu", "i"),
  34. "un" : ("uen",), "iu" : ("iou", "ui"),
  35. "ao" : ("au", ), "ei" : ("i", ),
  36. "iao" : ("ioa", "ia", "i"), "ian" : ("ia", "i"), "iang" : ("ian", "ia", "i")}
  37. correct_table = {}
  38. def __init__ (self):
  39. self.init_corrent_table ()
  40. def init_corrent_table (self):
  41. if PinYinParser.correct_table:
  42. return
  43. for key, id in PYDict.PINYIN_DICT.items ():
  44. if key[-3:] in PinYinParser.correct_yunmu:
  45. for yunmu in PinYinParser.correct_yunmu[key[-3:]]:
  46. pinyin = key[:-3] + yunmu
  47. if pinyin not in PYDict.PINYIN_DICT:
  48. PinYinParser.correct_table [pinyin] = key
  49. if key[-2:] in PinYinParser.correct_yunmu:
  50. for yunmu in PinYinParser.correct_yunmu[key[-2:]]:
  51. pinyin = key[:-2] + yunmu
  52. if pinyin not in PYDict.PINYIN_DICT:
  53. PinYinParser.correct_table [pinyin] = key
  54. def parse_recursive (self, string, auto_correct = True, gbk = True):
  55. l = min (6, len (string))
  56. if l == 0:
  57. return []
  58. p = None
  59. for i in range (l, 0, -1):
  60. py = string[-i:]
  61. if gbk:
  62. if py in self.pinyin_dict:
  63. p = PYUtil.PinYinWord (py)
  64. break
  65. else:
  66. if py in self.gb2312_pinyin_dict:
  67. p = PYUtil.PinYinWord (py)
  68. break
  69. if p == None and auto_correct and py in PinYinParser.correct_table:
  70. py = PinYinParser.correct_table[py]
  71. if gbk:
  72. if py in self.pinyin_dict:
  73. p = PYUtil.PinYinWord (py)
  74. break
  75. else:
  76. if py in self.gb2312_pinyin_dict:
  77. p = PYUtil.PinYinWord (py)
  78. break
  79. if p == None:
  80. raise Exception ("can not parse '%s'" % string.encode ("utf-8"))
  81. pys = self.parse_recursive (string[:-i], auto_correct, gbk)
  82. pys.append (p)
  83. return pys
  84. def parse (self, string, auto_correct = True, gbk = True):
  85. try:
  86. pys = []
  87. for py in string.split (u"'"):
  88. pys += self.parse_recursive (py, auto_correct, gbk)
  89. return pys
  90. except Exception, e:
  91. import traceback
  92. traceback.print_exc ()
  93. raise e
  94. class ShuangPinParser:
  95. def __init__ (self, schema = "MSPY"):
  96. self._gbk = True
  97. self._schema = schema
  98. self._shengmu_dict, self._yunmu_dict = PYDict.SHUANGPIN_SCHEMAS[self._schema]
  99. def parse_shuangpin_recursive (self, pys, string, auto_correct = True, gbk = True):
  100. if len (string) == 0:
  101. return []
  102. if len (string) == 1:
  103. try:
  104. shengmu = self._shengmu_dict[string[0]]
  105. if shengmu == "'":
  106. shengmu = ""
  107. except:
  108. raise Exception ("can not parse '%s'" % string.encode ("utf-8"))
  109. return [PYUtil.PinYinWord (shengmu)]
  110. try:
  111. shengmu = self._shengmu_dict[string[0]]
  112. if shengmu == "'":
  113. shengmu = ""
  114. yunmu = self._yunmu_dict[string[1]]
  115. except:
  116. raise Exception ("can not parse '%s'" % string.encode ("utf-8"))
  117. p = None
  118. for i in yunmu:
  119. pinyin = shengmu + i
  120. if pinyin in PinYinParser.pinyin_dict:
  121. p = PYUtil.PinYinWord (pinyin)
  122. break
  123. if p == None:
  124. raise Exception ("can not parse '%s'" % string.encode ("utf-8"))
  125. pys.append (p)
  126. pys = self.parse_shuangpin_recursive (pys, string[2:], auto_correct, gbk)
  127. return pys
  128. def parse (self, string, auto_correct = True, gbk = True):
  129. pys = []
  130. pys += self.parse_shuangpin_recursive (pys, string, auto_correct, gbk)
  131. return pys
  132. if __name__ == "__main__":
  133. # parser = PinYinParser ()
  134. parser = ShuangPinParser ()
  135. pys = parser.parse (sys.argv[1])
  136. print "'".join (map (str, pys))