/python/engine/PinYin/tools/ann.py
Python | 121 lines | 104 code | 16 blank | 1 comment | 38 complexity | 1d74f84cd4ef90474de31b61e9c18ada MD5 | raw file
1import sys
2import bz2
3sys.path.append ("..")
4import PYUtil
5
# Memo shared by annotate (): maps an already-annotated sub-phrase to the
# list of (pinyin, ok) pairs that annotate () produced for it.
phrase_cache = {}
7
def load_pinyin_table():
    """Load the per-character pinyin table, ordered by frequency.

    Reads ``../../../../data/pinyin_table.txt`` (relative to the current
    working directory) through ``PYUtil.load_pinyin_table`` and converts
    each entry's ``{pinyin: freq}`` mapping into a list.

    Returns:
        dict: every key of the parsed table mapped to a list of
        ``(pinyin, freq)`` tuples sorted by ``freq``, highest first.
        Entries with equal frequency keep the order in which the
        underlying mapping iterates them.
    """
    # Use a context manager so the table file is closed as soon as it has
    # been parsed; the original left the handle open for the whole run.
    with open("../../../../data/pinyin_table.txt") as table_file:
        hanzi_dict = PYUtil.load_pinyin_table(table_file)

    table = {}
    for hanzi, readings in hanzi_dict.items():
        # sorted() is stable, so this matches the original
        # append-then-sort ordering exactly.
        table[hanzi] = sorted(
            readings.items(), key=lambda item: item[1], reverse=True
        )
    return table
19
def _collect_phrase_pinyins(stream, table):
    """Add every ``phrase pinyin`` line of *stream* to *table* in place.

    *stream* must yield UTF-8 encoded byte strings. Each line has to hold
    exactly two whitespace-separated fields; anything else raises
    ``ValueError`` from the tuple unpacking.
    """
    for raw_line in stream:
        phrase, pinyin = raw_line.decode("utf8").strip().split()
        table.setdefault(phrase, []).append(pinyin)


def load_duoyin_phrases():
    """Load the known readings of phrases containing polyphonic characters.

    Reads ``duoyin_phrase.txt.bz2`` and then ``duoyin_phrase_manual.txt``
    from the current working directory. Both are UTF-8 text with one
    ``phrase pinyin`` pair per line.

    Returns:
        dict: phrase (unicode) mapped to the list of its pinyin strings in
        the order they were read. Duplicates are kept.

    Raises:
        IOError / OSError: if either file cannot be opened.
        ValueError: if a line does not contain exactly two fields.
    """
    table = {}

    # Both inputs are opened with a context manager so the handles are
    # released promptly (BZ2File supports this from Python 2.7 on).
    with bz2.BZ2File("duoyin_phrase.txt.bz2", "r") as stream:
        _collect_phrase_pinyins(stream, table)

    # Binary mode keeps the lines as bytes, so the explicit UTF-8 decode in
    # the helper behaves the same on Python 2 and Python 3.
    with open("duoyin_phrase_manual.txt", "rb") as stream:
        _collect_phrase_pinyins(stream, table)

    return table
38
def annotate_by_hanzi(phrase, hanzi_dict):
    """Yield every character-by-character pinyin spelling of *phrase*.

    *hanzi_dict* maps a character to a sequence of entries whose first
    item is a pinyin string. For each character the entries are visited in
    their stored order, and the readings of successive characters are
    joined with an apostrophe. A single-character phrase yields its
    readings unchanged.

    A character missing from *hanzi_dict* raises ``KeyError`` and an empty
    phrase raises ``IndexError``, both when the generator is first
    advanced.
    """
    if len(phrase) != 1:
        remainder = phrase[1:]
        for head in hanzi_dict[phrase[0]]:
            # Combine this reading of the first character with every
            # spelling of the rest of the phrase.
            for tail_pinyin in annotate_by_hanzi(remainder, hanzi_dict):
                yield u"'".join((head[0], tail_pinyin))
        return

    for reading in hanzi_dict[phrase[0]]:
        yield reading[0]
47
def annotate(phrase, hanzi_dict, phrase_dict):
    """Yield ``(pinyin, ok)`` pairs for *phrase*.

    Sources are consulted in this order:

    1. If *phrase* is a key of *phrase_dict*, each listed reading is
       yielded with ``ok`` set to True.
    2. If *phrase* is in the module-level ``phrase_cache``, the cached
       pairs are replayed.
    3. Otherwise the per-character spellings from ``annotate_by_hanzi``
       are the candidates:

       * exactly one candidate is yielded with ``ok`` True;
       * for phrases of at most two characters every candidate is yielded
         with ``ok`` False;
       * longer phrases are split at every position, longest prefix
         first, and both halves are annotated recursively (results are
         stored in ``phrase_cache``). The first split whose halves each
         have exactly one reading yields their join with ``ok`` True and
         stops. Failing that, the split with the fewest combinations is
         used, provided it has strictly fewer than the candidate list,
         and ``ok`` is the conjunction of both halves' flags. If no split
         qualifies, all candidates are yielded with ``ok`` False.
    """
    # Explicit phrase entries take precedence over everything else.
    if phrase in phrase_dict:
        for reading in phrase_dict[phrase]:
            yield reading, True
        return

    # Replay a result memoised while splitting some longer phrase.
    if phrase in phrase_cache:
        for reading, trusted in phrase_cache[phrase]:
            yield reading, trusted
        return

    candidates = list(annotate_by_hanzi(phrase, hanzi_dict))

    if len(candidates) == 1:
        yield candidates[0], True
        return

    if len(phrase) <= 2:
        for reading in candidates:
            yield reading, False
        return

    best_heads = None
    best_tails = None
    best_count = len(candidates)
    for cut in range(len(phrase) - 1, 0, -1):
        head = phrase[:cut]
        head_readings = list(annotate(head, hanzi_dict, phrase_dict))
        phrase_cache.setdefault(head, head_readings)

        tail = phrase[cut:]
        tail_readings = list(annotate(tail, hanzi_dict, phrase_dict))
        phrase_cache.setdefault(tail, tail_readings)

        # Both halves resolve to a single reading: accept immediately.
        if len(head_readings) == 1 and len(tail_readings) == 1:
            yield u"'".join([head_readings[0][0], tail_readings[0][0]]), True
            return

        # Otherwise remember the split with the fewest combinations.
        combinations = len(head_readings) * len(tail_readings)
        if combinations < best_count:
            best_count = combinations
            best_heads = head_readings
            best_tails = tail_readings

    if best_heads is None and best_tails is None:
        # No split was narrower than the plain per-character expansion.
        for reading in candidates:
            yield reading, False
        return

    for head_pinyin, head_ok in best_heads:
        for tail_pinyin, tail_ok in best_tails:
            yield u"'".join([head_pinyin, tail_pinyin]), head_ok and tail_ok
96
def main():
    """Annotate ``phrase freq`` lines read from stdin with pinyin.

    Each input line is UTF-8 text holding a phrase and an integer
    frequency separated by whitespace. For every reading produced by
    ``annotate`` a ``phrase<TAB>pinyin<TAB>freq`` line is written, to
    stdout when the reading's ``ok`` flag is true and to stderr
    otherwise. If annotation raises, an ``ERROR <lineno>: ...`` line
    (0-based line number) goes to stderr and the phrase is skipped.

    A line that does not split into exactly two fields, or whose second
    field is not an integer, raises ``ValueError`` and aborts the run.
    """
    hanzi_dict = load_pinyin_table()
    phrase_dict = load_duoyin_phrases()
    for lineno, line in enumerate(sys.stdin):
        line = unicode(line, "utf8").strip()
        phrase, freq = line.split()
        freq = int(freq)
        try:
            pinyins = list(annotate(phrase, hanzi_dict, phrase_dict))
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C and
            # SystemExit still stop the program.
            output = u"ERROR %d: %s\t%d" % (lineno, phrase, freq)
            print >> sys.stderr, output.encode("utf8")
            # Move on to the next line. Without this the loop below would
            # run with the previous phrase's readings (or fail with
            # NameError on the first line).
            continue

        for p, ok in pinyins:
            output = u"%s\t%s\t%d" % (phrase, p, freq)
            stream = sys.stdout if ok else sys.stderr
            print >> stream, output.encode("utf8")
118
119
# Run the annotator only when executed as a script, not when imported.
if __name__ == "__main__":
    main ()