ann.py - The code annotates phrases with their correspondin…

/python/engine/PinYin/tools/ann.py

http://scim-python.googlecode.com/ · Python · 121 lines · 104 code · 16 blank · 1 comment · 38 complexity · 1d74f84cd4ef90474de31b61e9c18ada MD5 · raw file


import sys
import bz2
sys.path.append ("..")
import PYUtil

# In-memory memo used by annotate (): maps a sub-phrase to the list of
# (pinyin, ok) tuples that annotate () produced for it, so the same
# sub-phrase is not re-annotated when it shows up in another split.
phrase_cache = {}

def load_pinyin_table ():
	"""Load the per-hanzi pinyin table and rank each entry's readings.

	Reads ../../../../data/pinyin_table.txt through
	PYUtil.load_pinyin_table and returns a dict that maps every key of
	that table to a list of (pinyin, freq) tuples, sorted by freq with the
	highest value first.
	"""
	table_file = open ("../../../../data/pinyin_table.txt")
	try:
		# NOTE(review): assumes PYUtil.load_pinyin_table consumes the file
		# eagerly and returns a plain mapping -- confirm against PYUtil.
		hanzi_dict = PYUtil.load_pinyin_table (table_file)
	finally:
		# Release the handle deterministically instead of leaving it to
		# the garbage collector.
		table_file.close ()

	tmp = {}
	for key, value in hanzi_dict.items ():
		# sorted () is stable, so readings with equal freq keep the order
		# in which value.items () produced them.
		tmp[key] = sorted (value.items (), key = lambda v: v[1], reverse = True)

	return tmp

def _add_duoyin_lines (lines, table):
	"""Parse UTF-8 "phrase pinyin" lines and append each pinyin to table[phrase]."""
	for line in lines:
		line = unicode (line, "utf8").strip ()
		# Each line must hold exactly two whitespace-separated fields;
		# anything else raises ValueError here.
		phrase, pinyin = line.split ()
		table.setdefault (phrase, []).append (pinyin)

def load_duoyin_phrases ():
	"""Load the phrase -> pinyin dictionary from the two duoyin data files.

	Reads duoyin_phrase.txt.bz2 first and duoyin_phrase_manual.txt second
	(both relative to the current directory).  Returns a dict mapping each
	phrase to the list of pinyin strings found for it, in file order, with
	entries from the manual file appended after those from the bz2 file.
	"""
	tmp = {}

	# NOTE: qq_pinyin_1.0.1.txt.bz2 appears in the history of this function
	# as an alternative to the bz2 source below.
	bz2_file = bz2.BZ2File ("duoyin_phrase.txt.bz2", "r")
	try:
		_add_duoyin_lines (bz2_file, tmp)
	finally:
		bz2_file.close ()

	manual_file = open ("duoyin_phrase_manual.txt")
	try:
		_add_duoyin_lines (manual_file, tmp)
	finally:
		manual_file.close ()

	return tmp

def annotate_by_hanzi (phrase, hanzi_dict):
	"""Yield every character-by-character pinyin spelling of phrase.

	hanzi_dict maps a single character to a sequence of entries whose
	first item is a pinyin string.  The readings of consecutive characters
	are joined with an apostrophe; combinations are produced with the
	first character's readings varying slowest.  A character missing from
	hanzi_dict raises KeyError once the generator is consumed.
	"""
	first, rest = phrase[0], phrase[1:]

	# Last character: its readings are the results themselves.
	if not rest:
		for reading in hanzi_dict[first]:
			yield reading[0]
		return

	# Otherwise prefix each reading of the first character to every
	# spelling of the remaining characters.
	for reading in hanzi_dict[first]:
		for suffix in annotate_by_hanzi (rest, hanzi_dict):
			yield u"'".join ((reading[0], suffix))

def annotate (phrase, hanzi_dict, phrase_dict):
	"""Yield (pinyin, ok) annotations for phrase.

	ok is True when the annotation is considered unambiguous and False
	when it is only one of several candidates.  Lookup order:

	1. phrase_dict: every listed pinyin is yielded with ok = True.
	2. phrase_cache: a previously stored result is replayed.
	3. Per-character expansion through annotate_by_hanzi:
	   - exactly one combination: yielded with ok = True;
	   - phrases of at most two characters: all combinations, ok = False;
	   - longer phrases: the phrase is split in two at every position,
	     longest left part first, and both halves are annotated
	     recursively (and stored in phrase_cache).  The first split whose
	     halves each have a single annotation wins with ok = True.
	     Otherwise the split with the fewest combined candidates is used,
	     provided it beats the plain per-character expansion.

	Exceptions raised by annotate_by_hanzi (e.g. KeyError for a character
	absent from hanzi_dict) propagate to the caller.
	"""
	if phrase in phrase_dict:
		for p in phrase_dict[phrase]:
			yield p, True
		return
	if phrase in phrase_cache:
		for p, ok in phrase_cache[phrase]:
			yield p, ok
		return

	pinyins = list (annotate_by_hanzi (phrase, hanzi_dict))

	if len (pinyins) == 1:
		yield pinyins[0], True
		return
	if len (phrase) <= 2:
		for p in pinyins:
			yield p, False
		return

	# Best split found so far; stays None while no split produces fewer
	# candidates than the per-character expansion.
	best_left = None
	best_right = None
	best_count = len (pinyins)
	for split in range (len (phrase) - 1, 0, -1):
		left = phrase[:split]
		left_pinyins = list (annotate (left, hanzi_dict, phrase_dict))
		if left not in phrase_cache:
			phrase_cache[left] = left_pinyins
		right = phrase[split:]
		right_pinyins = list (annotate (right, hanzi_dict, phrase_dict))
		if right not in phrase_cache:
			phrase_cache[right] = right_pinyins

		# Both halves resolved to a single reading: accept immediately.
		if len (left_pinyins) == 1 and len (right_pinyins) == 1:
			yield u"'".join ([left_pinyins[0][0], right_pinyins[0][0]]), True
			return

		count = len (left_pinyins) * len (right_pinyins)
		if count < best_count:
			best_count = count
			best_left = left_pinyins
			best_right = right_pinyins

	# best_left and best_right are always assigned together, so testing
	# one of them is sufficient.
	if best_left is None:
		for p in pinyins:
			yield p, False
	else:
		for p1, ok1 in best_left:
			for p2, ok2 in best_right:
				yield u"'".join ([p1, p2]), ok1 and ok2

def main ():
	"""Annotate "phrase freq" lines from stdin with pinyin.

	Each input line is UTF-8 text holding a phrase and an integer
	frequency separated by whitespace.  For every annotation produced by
	annotate (), a "phrase<TAB>pinyin<TAB>freq" line is written to stdout
	when the annotation is flagged ok and to stderr otherwise.  When
	annotation fails, an "ERROR <lineno>: ..." line (lineno is 0-based) is
	written to stderr and processing moves on to the next line.
	"""
	hanzi_dict = load_pinyin_table ()
	phrase_dict = load_duoyin_phrases ()
	for lineno, line in enumerate (sys.stdin):
		line = unicode (line, "utf8").strip ()
		phrase, freq = line.split ()
		freq = int (freq)
		try:
			pinyins = list (annotate (phrase, hanzi_dict, phrase_dict))
		except Exception:
			# Exception (not a bare except) so KeyboardInterrupt and
			# SystemExit still stop the run.
			output = u"ERROR %d: %s\t%d" % (lineno, phrase, freq)
			print >> sys.stderr, output.encode ("utf8")
			# Skip the output loop: without this the previous line's
			# pinyins would be emitted under the current phrase.
			continue

		for p, ok in pinyins:
			output = u"%s\t%s\t%d" % (phrase, p, freq)
			if ok:
				print >> sys.stdout, output.encode ("utf8")
			else:
				print >> sys.stderr, output.encode ("utf8")


# Run the annotator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
	main ()

Summary ✨

The code annotates Chinese phrases with their pinyin readings, combining a phrase dictionary of known multi-reading phrases (including a manually maintained list) with automatic per-character lookup. It reads phrases and frequencies from standard input, writes unambiguous annotations to standard output, and writes ambiguous annotations and error messages to standard error. Annotations of sub-phrases are cached in memory for reuse within the same run.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (12)

'def' Ensure functions have docstrings for documentation
8 20 39 48 97
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
15
'== None' Use 'is' for None comparisons (e.g., x is None)
89
Complexity hotspot; lines 92 to 95 (total complexity: 4)
92 93 94 95
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
108