/python/engine/PinYin/tools/ann.py
http://scim-python.googlecode.com/ · Python · 121 lines · 104 code · 16 blank · 1 comment · 38 complexity · 1d74f84cd4ef90474de31b61e9c18ada MD5 · raw file
- import sys
- import bz2
- sys.path.append ("..")
- import PYUtil
# Memo shared by every annotate() call: maps a sub-phrase to the list of
# (pinyin, trusted) pairs annotate() produced for it, so a fragment that
# recurs in many phrases is only resolved once.
phrase_cache = dict()
def load_pinyin_table():
    """Load the per-character pinyin table, most frequent reading first.

    Reads ``../../../../data/pinyin_table.txt`` (resolved against the current
    working directory) through ``PYUtil.load_pinyin_table`` and reshapes the
    mapping it returns.

    Returns:
        dict: maps every key of the loaded table to a list of
        ``(pinyin, freq)`` tuples sorted by descending ``freq``.  Readings
        with equal ``freq`` keep the order in which the loaded mapping
        listed them.
    """
    # Close the handle as soon as the table is parsed instead of leaking it
    # for the lifetime of the process.
    # NOTE(review): relies on PYUtil.load_pinyin_table consuming the file
    # before it returns -- confirm against PYUtil.
    with open("../../../../data/pinyin_table.txt") as table_file:
        hanzi_dict = PYUtil.load_pinyin_table(table_file)

    table = {}
    for hanzi, freq_by_pinyin in hanzi_dict.items():
        # sorted() is stable, so ties keep the mapping's own order.
        table[hanzi] = sorted(
            freq_by_pinyin.items(), key=lambda item: item[1], reverse=True
        )
    return table
def _collect_phrase_readings(lines, table):
    """Add the ``phrase pinyin`` pairs found in *lines* (UTF-8 bytes) to *table*."""
    for raw_line in lines:
        fields = raw_line.decode("utf8").split()
        if not fields:
            # A blank line (e.g. a trailing newline in a hand-edited list)
            # carries no data; skip it instead of failing on the unpack.
            continue
        # Anything other than exactly two fields is a corrupt entry and
        # still raises ValueError here.
        phrase, pinyin = fields
        table.setdefault(phrase, []).append(pinyin)


def load_duoyin_phrases(bz2_path="duoyin_phrase.txt.bz2",
                        manual_path="duoyin_phrase_manual.txt"):
    """Load the known readings of phrases from the two phrase lists.

    Both files are UTF-8 text with one whitespace-separated
    ``phrase pinyin`` pair per line.  The bzip2-compressed list is read
    first, then the plain-text manual list, so readings from the manual list
    are appended after those from the compressed one.

    Args:
        bz2_path: path of the bzip2-compressed phrase list.
        manual_path: path of the uncompressed, hand-maintained phrase list.

    Returns:
        dict: maps each phrase to the list of its readings in file order.
        Duplicate readings are kept.

    Raises:
        ValueError: a non-blank line does not consist of exactly two fields.
        IOError/OSError: one of the files cannot be opened or read.
    """
    phrases = {}
    # Both handles are closed deterministically, including on a parse error.
    with bz2.BZ2File(bz2_path, "r") as packed:
        _collect_phrase_readings(packed, phrases)
    # Binary mode yields byte strings on every Python version, so the same
    # decoding path serves both files.
    with open(manual_path, "rb") as manual:
        _collect_phrase_readings(manual, phrases)
    return phrases
def annotate_by_hanzi(phrase, hanzi_dict):
    """Yield every reading of *phrase* built from per-character readings.

    Each candidate takes one reading for every character of *phrase* and
    joins them with ``'``.  Candidates come out in Cartesian-product order:
    the reading of the first character varies slowest, that of the last
    character fastest, each following the order of its ``hanzi_dict`` entry.

    Args:
        phrase: sequence of characters to annotate.
        hanzi_dict: maps a character to a sequence of entries whose item 0
            is a pinyin string (the shape load_pinyin_table() builds).

    Yields:
        The joined reading of each combination.  An empty *phrase* yields
        nothing.

    Raises:
        KeyError: a character of *phrase* is missing from *hanzi_dict*
            (raised when the generator is first advanced).
    """
    # Local import keeps this block self-contained with respect to the
    # file's import list.
    import itertools

    if not phrase:
        return

    # One pool of candidate readings per character; itertools.product walks
    # the combinations iteratively, without recursing once per character or
    # re-slicing the phrase at every level.
    reading_pools = [
        [entry[0] for entry in hanzi_dict[char]] for char in phrase
    ]
    for combination in itertools.product(*reading_pools):
        yield u"'".join(combination)
def annotate(phrase, hanzi_dict, phrase_dict):
    """Yield ``(pinyin, trusted)`` candidates for *phrase*.

    Resolution order:

    1. A phrase listed in *phrase_dict* yields each listed reading, flagged
       ``True``.
    2. A phrase found in the module-level ``phrase_cache`` replays the
       cached pairs.
    3. Otherwise the per-character combinations from annotate_by_hanzi()
       are examined:

       * exactly one combination: yielded flagged ``True``;
       * a phrase of at most two characters: every combination, flagged
         ``False``;
       * a longer phrase is cut in two at every position, longest prefix
         first, and both halves are annotated recursively (their results
         are stored in ``phrase_cache`` when not already present).  The
         first cut whose halves each have exactly one candidate yields the
         joined reading flagged ``True`` and stops.  Failing that, the cut
         with the fewest combined candidates (strictly fewer than the raw
         combinations) is expanded, each pair flagged with the conjunction
         of its halves' flags; if no cut improves on the raw combinations
         they are all yielded flagged ``False``.
    """
    # 1. Curated phrase readings win outright.
    if phrase in phrase_dict:
        for reading in phrase_dict[phrase]:
            yield reading, True
        return

    # 2. Replay an earlier result for this exact fragment.
    if phrase in phrase_cache:
        for reading, trusted in phrase_cache[phrase]:
            yield reading, trusted
        return

    # 3. Fall back to combining per-character readings.
    candidates = list(annotate_by_hanzi(phrase, hanzi_dict))

    if len(candidates) == 1:
        yield candidates[0], True
        return

    if len(phrase) <= 2:
        for reading in candidates:
            yield reading, False
        return

    best_left = None
    best_right = None
    fewest = len(candidates)
    for cut in range(len(phrase) - 1, 0, -1):
        prefix = phrase[:cut]
        suffix = phrase[cut:]

        # The prefix is resolved and cached before the suffix is looked at,
        # so the suffix lookup can already benefit from the new entry.
        left = list(annotate(prefix, hanzi_dict, phrase_dict))
        phrase_cache.setdefault(prefix, left)
        right = list(annotate(suffix, hanzi_dict, phrase_dict))
        phrase_cache.setdefault(suffix, right)

        if len(left) == 1 and len(right) == 1:
            # Both halves are unambiguous: accept this split immediately.
            yield u"'".join([left[0][0], right[0][0]]), True
            return

        combined = len(left) * len(right)
        if combined < fewest:
            fewest = combined
            best_left = left
            best_right = right

    if best_left is None and best_right is None:
        # No split narrowed the field; report the raw combinations.
        for reading in candidates:
            yield reading, False
        return

    for left_reading, left_ok in best_left:
        for right_reading, right_ok in best_right:
            yield u"'".join([left_reading, right_reading]), left_ok and right_ok
def main():
    """Annotate ``phrase freq`` lines from stdin with pinyin readings.

    Every non-blank input line is UTF-8 text holding a phrase and an integer
    frequency separated by whitespace.  For each candidate reading a line
    ``phrase<TAB>pinyin<TAB>freq`` is written: to stdout when the reading is
    flagged as trusted by annotate(), to stderr otherwise.  When annotation
    of a phrase fails, ``ERROR <lineno>: phrase<TAB>freq`` is written to
    stderr and processing moves on to the next line.  Line numbers are
    zero-based and count blank lines too.

    Raises:
        ValueError: a non-blank line does not hold exactly two fields, or
            its frequency is not an integer.
    """
    hanzi_dict = load_pinyin_table()
    phrase_dict = load_duoyin_phrases()
    for lineno, line in enumerate(sys.stdin):
        line = unicode(line, "utf8").strip()
        if not line:
            # Nothing to annotate on a blank line (e.g. trailing newline).
            continue
        phrase, freq = line.split()
        freq = int(freq)
        try:
            pinyins = list(annotate(phrase, hanzi_dict, phrase_dict))
        except Exception:
            # "except Exception" rather than a bare except, so Ctrl-C and
            # SystemExit still stop the run.
            output = u"ERROR %d: %s\t%d" % (lineno, phrase, freq)
            print >> sys.stderr, output.encode("utf8")
            # Skip the output loop: without this, the readings of the
            # previous phrase would be printed under this phrase (or
            # "pinyins" would be undefined on the very first line).
            continue

        for p, ok in pinyins:
            output = u"%s\t%s\t%d" % (phrase, p, freq)
            if ok:
                print >> sys.stdout, output.encode("utf8")
            else:
                print >> sys.stderr, output.encode("utf8")


if __name__ == "__main__":
    main()