PageRenderTime 20ms CodeModel.GetById 1ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/python/engine/PinYin/tools/ann.py

http://scim-python.googlecode.com/
Python | 121 lines | 104 code | 16 blank | 1 comment | 58 complexity | 1d74f84cd4ef90474de31b61e9c18ada MD5 | raw file
  1import sys
  2import bz2
  3sys.path.append ("..")
  4import PYUtil
  5
# Memo table filled by annotate (): maps a sub-phrase to the list of
# (pinyin, ok) pairs that annotate () produced for it.
phrase_cache = {}
  7
  8def load_pinyin_table ():
  9	hanzi_dict = PYUtil.load_pinyin_table (open ("../../../../data/pinyin_table.txt"))
 10	tmp = {}
 11	for key, value in hanzi_dict.items ():
 12		pinyins = []
 13		for pinyin, freq in value.items ():
 14			pinyins.append ((pinyin, freq))
 15		pinyins.sort (key = lambda v: v[1], reverse = True)
 16		tmp[key] = pinyins
 17
 18	return tmp
 19
 20def load_duoyin_phrases ():
 21	tmp = {}
 22	for line in bz2.BZ2File ("duoyin_phrase.txt.bz2", "r"):
 23	# for line in bz2.BZ2File ("qq_pinyin_1.0.1.txt.bz2", "r"):
 24		line = unicode (line, "utf8").strip ()
 25		phrase, pinyin = line.split ()
 26		if phrase not in tmp:
 27			tmp[phrase] = []
 28		tmp[phrase].append (pinyin)
 29	
 30	for line in file ("duoyin_phrase_manual.txt"):
 31		line = unicode (line, "utf8").strip ()
 32		phrase, pinyin = line.split ()
 33		if phrase not in tmp:
 34			tmp[phrase] = []
 35		tmp[phrase].append (pinyin)
 36		
 37	return tmp
 38
 39def annotate_by_hanzi (phrase, hanzi_dict):
 40	if len (phrase) == 1:
 41		for p in hanzi_dict[phrase[0]]:
 42			yield p[0]
 43	else:
 44		for p in hanzi_dict[phrase[0]]:
 45			for q in annotate_by_hanzi (phrase[1:], hanzi_dict):
 46				yield u"'".join ([p[0], q])
 47
 48def annotate (phrase, hanzi_dict, phrase_dict):
 49	if phrase in phrase_dict:
 50		for p in phrase_dict[phrase]:
 51			yield p, True
 52		return
 53	if phrase in phrase_cache:
 54		pinyins= phrase_cache[phrase]
 55		for p, ok in pinyins:
 56			yield p, ok
 57		return
 58	
 59	pinyins = list (annotate_by_hanzi (phrase, hanzi_dict))
 60	
 61	if len (pinyins) == 1:
 62		yield pinyins[0], True
 63	elif len (phrase) <= 2:
 64		for p in pinyins:
 65			yield p, False
 66	else:
 67		pp1 = None
 68		pp2 = None
 69		ll = len (pinyins)
 70		for l in range (len(phrase) - 1, 0 , -1):
 71			phrase_tmp = phrase[:l]
 72			p1 = list (annotate (phrase_tmp, hanzi_dict, phrase_dict))
 73			if phrase_tmp not in phrase_cache:
 74				phrase_cache[phrase_tmp] = p1
 75			phrase_tmp = phrase[l:]
 76			p2 = list (annotate (phrase_tmp, hanzi_dict, phrase_dict))
 77			if phrase_tmp not in phrase_cache:
 78				phrase_cache[phrase_tmp] = p2
 79				
 80			if len (p1) == 1 and len (p2) == 1:
 81				yield u"'".join ([p1[0][0], p2[0][0]]), True
 82				return
 83			
 84			lln = len (p1) * len (p2)
 85			if lln < ll:
 86				ll = lln
 87				pp1 = p1
 88				pp2 = p2
 89		if pp1 == None and pp2 == None:
 90			for p in pinyins:
 91				yield p, False
 92		else:
 93			for p1, ok1 in pp1:
 94				for p2, ok2 in pp2:
 95					yield u"'".join ([p1, p2]), ok1 and ok2
 96
 97def main ():
 98	hanzi_dict = load_pinyin_table ()
 99	phrase_dict = load_duoyin_phrases ()
100	lineno = -1
101	for line in sys.stdin:
102		lineno += 1
103		line = unicode (line, "utf8").strip ()
104		phrase, freq = line.split ()
105		freq = int (freq)
106		try:
107			pinyins = list (annotate (phrase, hanzi_dict, phrase_dict))
108		except:
109			output = u"ERROR %d: %s\t%d" % (lineno, phrase, freq)
110			print >> sys.stderr, output.encode ("utf8")
111			
112		for p, ok in pinyins:
113			output = u"%s\t%s\t%d" % (phrase, p, freq)
114			if ok:
115				print >> sys.stdout, output.encode ("utf8")
116			else:
117				print >> sys.stderr, output.encode ("utf8")
118
119
# Run the annotator only when this file is executed as a script.
if __name__ == "__main__":
	main ()