PageRenderTime 23ms CodeModel.GetById 11ms RepoModel.GetById 1ms app.codeStats 0ms

/python/engine/PinYin/PYUtil.py

http://scim-python.googlecode.com/
Python | 152 lines | 118 code | 9 blank | 25 comment | 16 complexity | cb6d73db2f7ec72a979c1bd4c7e6046e MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. # vim: set noet ts=4:
  3. #
  4. # scim-python
  5. #
  6. # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  7. #
  8. #
  9. # This library is free software; you can redistribute it and/or
  10. # modify it under the terms of the GNU Lesser General Public
  11. # License as published by the Free Software Foundation; either
  12. # version 2 of the License, or (at your option) any later version.
  13. #
  14. # This library is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU Lesser General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Lesser General Public
  20. # License along with this program; if not, write to the
  21. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  22. # Boston, MA 02111-1307 USA
  23. #
  24. # $Id: $
  25. #
  26. from PYDict import *
  27. class PinYinWord:
  28. correct_dict = {"nve" : "nue", "lve" : "lue"}
  29. def __init__ (self, pinyin):
  30. if pinyin in self.correct_dict:
  31. pinyin = self.correct_dict [pinyin]
  32. self._pinyin = pinyin
  33. self._is_completed = self.is_valid_pinyin ()
  34. if self._is_completed:
  35. sheng_mu, yun_mu = self.split ()
  36. self._pinyin_id = PINYIN_DICT [self._pinyin]
  37. self._sheng_mu_id = SHENGMU_DICT [sheng_mu]
  38. else:
  39. self._sheng_mu_id = SHENGMU_DICT [self._pinyin]
  40. def is_valid_pinyin (self):
  41. return PINYIN_DICT.has_key (self._pinyin)
  42. def get_sheng_mu_id (self):
  43. return self._sheng_mu_id
  44. def get_shengmu (self):
  45. return ID_SHENGMU_DICT[self._sheng_mu_id]
  46. def get_pinyin_id (self):
  47. return self._pinyin_id
  48. def get_pinyin (self):
  49. return self._pinyin
  50. def get_pattern (self, mohu = False):
  51. if mohu == False:
  52. if self.is_valid_pinyin ():
  53. return self._pinyin
  54. else:
  55. return self._pinyin + "%"
  56. else:
  57. if not self.is_valid_pinyin ():
  58. if self._pinyin in ("zh", "ch", "sh"):
  59. return self._pinyin[0] + "%"
  60. return self._pinyin + "%"
  61. else:
  62. shengmu = self.get_shengmu ()
  63. yunmu = self._pinyin [len (shengmu):]
  64. if shengmu in ("zh", "ch", "sh", "z", "c", "s"):
  65. shengmu = shengmu[0] + "%"
  66. if yunmu in ("ing", "in", "en", "eng", "an", "ang"):
  67. yunmu = yunmu[0:2] + "%"
  68. return shengmu + yunmu
  69. def split (self):
  70. if not self.is_valid_pinyin ():
  71. raise Exception ("Pinyin '%s' is not a valid pinyin!" % py)
  72. if self._pinyin[:2] in SHENGMU_DICT.keys ():
  73. return self._pinyin[:2], self._pinyin[2:]
  74. elif self._pinyin[:1] in SHENGMU_DICT.keys ():
  75. return self._pinyin[:1], self._pinyin[1:]
  76. else:
  77. return "", self._pinyin[:]
  78. def __str__ (self):
  79. return self._pinyin
  80. class PinYinString:
  81. def __init__ (self, string):
  82. pass
  83. def load_pinyin_table (_file):
  84. def pinyin_table_parser (f):
  85. for l in f:
  86. a = unicode (l, "utf-8").strip ().split ()
  87. hanzi, pinyin, freq = a
  88. yield (hanzi, pinyin, int (freq))
  89. # db.add_phrases (pinyin_table_parser (bzf))
  90. hanzi_dic = {}
  91. for hanzi, pinyin, freq in pinyin_table_parser (_file):
  92. if not hanzi_dic.has_key (hanzi):
  93. hanzi_dic[hanzi] = {}
  94. if hanzi_dic[hanzi].has_key (pinyin):
  95. hanzi_dic[hanzi][pinyin] += freq
  96. else:
  97. hanzi_dic[hanzi][pinyin] = freq
  98. return hanzi_dic
  99. def load_phrase_pinyin_freq (_file):
  100. def phrase_pinyin_parser (f):
  101. for l in f:
  102. phrase, pinyin, freq = unicode (l, "utf-8").strip ().split ()
  103. pinyin = pinyin.replace (u"u:", u"v")
  104. yield (phrase, pinyin, int (freq))
  105. phrases_dic = {}
  106. for phrase, pinyin, freq in phrase_pinyin_parser (_file):
  107. if not phrases_dic.has_key (phrase):
  108. phrases_dic[phrase] = []
  109. phrases_dic[phrase].append ((phrase, pinyin, freq))
  110. return phrases_dic
  111. def load_phrase_pinyin (_file):
  112. def phrase_pinyin_parser (f):
  113. for l in f:
  114. phrase, pinyin = unicode (l, "utf-8").strip ().split ()
  115. pinyin = pinyin.replace (u"u:", u"v")
  116. yield (phrase, pinyin, 0)
  117. phrases_dic = {}
  118. for phrase, pinyin, freq in phrase_pinyin_parser (_file):
  119. if not phrases_dic.has_key (phrase):
  120. phrases_dic[phrase] = []
  121. phrases_dic[phrase].append ((phrase, pinyin, freq))
  122. return phrases_dic
  123. def load_sogou_phrases (_file):
  124. import re
  125. dic = {}
  126. for l in _file:
  127. w = unicode (l, "utf8")
  128. w = re.split (ur"\t+", w)
  129. dic [w[0]] = (w[0], int (w[1]))
  130. return dic