/python/engine/PinYin/tools/ann.py

http://scim-python.googlecode.com/ · Python · 121 lines · 104 code · 16 blank · 1 comment · 38 complexity · 1d74f84cd4ef90474de31b61e9c18ada MD5 · raw file

  1. import sys
  2. import bz2
  3. sys.path.append ("..")
  4. import PYUtil
# Memo of sub-phrase annotations, shared by every call to annotate ():
# maps a phrase to the list of (pinyin, ok) pairs that annotate () produced
# for it, so the same fragment is not re-derived at every split point.
phrase_cache = {}
  6. def load_pinyin_table ():
  7. hanzi_dict = PYUtil.load_pinyin_table (open ("../../../../data/pinyin_table.txt"))
  8. tmp = {}
  9. for key, value in hanzi_dict.items ():
  10. pinyins = []
  11. for pinyin, freq in value.items ():
  12. pinyins.append ((pinyin, freq))
  13. pinyins.sort (key = lambda v: v[1], reverse = True)
  14. tmp[key] = pinyins
  15. return tmp
  16. def load_duoyin_phrases ():
  17. tmp = {}
  18. for line in bz2.BZ2File ("duoyin_phrase.txt.bz2", "r"):
  19. # for line in bz2.BZ2File ("qq_pinyin_1.0.1.txt.bz2", "r"):
  20. line = unicode (line, "utf8").strip ()
  21. phrase, pinyin = line.split ()
  22. if phrase not in tmp:
  23. tmp[phrase] = []
  24. tmp[phrase].append (pinyin)
  25. for line in file ("duoyin_phrase_manual.txt"):
  26. line = unicode (line, "utf8").strip ()
  27. phrase, pinyin = line.split ()
  28. if phrase not in tmp:
  29. tmp[phrase] = []
  30. tmp[phrase].append (pinyin)
  31. return tmp
  32. def annotate_by_hanzi (phrase, hanzi_dict):
  33. if len (phrase) == 1:
  34. for p in hanzi_dict[phrase[0]]:
  35. yield p[0]
  36. else:
  37. for p in hanzi_dict[phrase[0]]:
  38. for q in annotate_by_hanzi (phrase[1:], hanzi_dict):
  39. yield u"'".join ([p[0], q])
  40. def annotate (phrase, hanzi_dict, phrase_dict):
  41. if phrase in phrase_dict:
  42. for p in phrase_dict[phrase]:
  43. yield p, True
  44. return
  45. if phrase in phrase_cache:
  46. pinyins= phrase_cache[phrase]
  47. for p, ok in pinyins:
  48. yield p, ok
  49. return
  50. pinyins = list (annotate_by_hanzi (phrase, hanzi_dict))
  51. if len (pinyins) == 1:
  52. yield pinyins[0], True
  53. elif len (phrase) <= 2:
  54. for p in pinyins:
  55. yield p, False
  56. else:
  57. pp1 = None
  58. pp2 = None
  59. ll = len (pinyins)
  60. for l in range (len(phrase) - 1, 0 , -1):
  61. phrase_tmp = phrase[:l]
  62. p1 = list (annotate (phrase_tmp, hanzi_dict, phrase_dict))
  63. if phrase_tmp not in phrase_cache:
  64. phrase_cache[phrase_tmp] = p1
  65. phrase_tmp = phrase[l:]
  66. p2 = list (annotate (phrase_tmp, hanzi_dict, phrase_dict))
  67. if phrase_tmp not in phrase_cache:
  68. phrase_cache[phrase_tmp] = p2
  69. if len (p1) == 1 and len (p2) == 1:
  70. yield u"'".join ([p1[0][0], p2[0][0]]), True
  71. return
  72. lln = len (p1) * len (p2)
  73. if lln < ll:
  74. ll = lln
  75. pp1 = p1
  76. pp2 = p2
  77. if pp1 == None and pp2 == None:
  78. for p in pinyins:
  79. yield p, False
  80. else:
  81. for p1, ok1 in pp1:
  82. for p2, ok2 in pp2:
  83. yield u"'".join ([p1, p2]), ok1 and ok2
  84. def main ():
  85. hanzi_dict = load_pinyin_table ()
  86. phrase_dict = load_duoyin_phrases ()
  87. lineno = -1
  88. for line in sys.stdin:
  89. lineno += 1
  90. line = unicode (line, "utf8").strip ()
  91. phrase, freq = line.split ()
  92. freq = int (freq)
  93. try:
  94. pinyins = list (annotate (phrase, hanzi_dict, phrase_dict))
  95. except:
  96. output = u"ERROR %d: %s\t%d" % (lineno, phrase, freq)
  97. print >> sys.stderr, output.encode ("utf8")
  98. for p, ok in pinyins:
  99. output = u"%s\t%s\t%d" % (phrase, p, freq)
  100. if ok:
  101. print >> sys.stdout, output.encode ("utf8")
  102. else:
  103. print >> sys.stderr, output.encode ("utf8")
# Script entry point: run main () only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main ()