PageRenderTime 224ms CodeModel.GetById 101ms app.highlight 12ms RepoModel.GetById 109ms app.codeStats 0ms

/python/engine/PinYin/tools/MergeQQPhrases.py

http://scim-python.googlecode.com/
Python | 150 lines | 122 code | 4 blank | 24 comment | 11 complexity | 70524409edd9cda53fb607a82fddba4e MD5 | raw file
  1# vim: set noet ts=4:
  2#
  3# scim-python
  4#
  5# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  6#
  7#
  8# This library is free software; you can redistribute it and/or
  9# modify it under the terms of the GNU Lesser General Public
 10# License as published by the Free Software Foundation; either
 11# version 2 of the License, or (at your option) any later version.
 12#
 13# This library is distributed in the hope that it will be useful,
 14# but WITHOUT ANY WARRANTY; without even the implied warranty of
 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16# GNU Lesser General Public License for more details.
 17#
 18# You should have received a copy of the GNU Lesser General Public
 19# License along with this program; if not, write to the
 20# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 21# Boston, MA  02111-1307  USA
 22#
 23# $Id: $
 24#
 25import sys, os, re
 26import bz2
 27
 28try:
 29	import PYUtil
 30	import PYSQLiteDB
 31except:
 32	path = os.path.dirname (__file__)
 33	path = os.path.join (path, "..")
 34	path = os.path.abspath (path)
 35	sys.path.append (path)
 36	import PYUtil
 37	import PYSQLiteDB
 38
 39
 40def load_qq_phrases (filename, hanzi_dic, sogou_phrase = {}):
 41	bzf = bz2.BZ2File (filename, "r")
 42	
 43	def parse_qq_phrase (f):	
 44		is_pinyin = lambda x: (x >= u"a" and x <= u"z")
 45		is_hanzi = lambda x: not (is_pinyin (x) or x == u"0")
 46		for l in f:
 47			l = unicode (l, "utf8").strip () + "0"
 48			phrase = []
 49			pinyin = []
 50			current_pinyin = None
 51			state = 0
 52			for c in l:
 53				if state == 0: # expect a hanzi
 54					if is_hanzi (c):
 55						phrase.append (c)
 56						state = 1
 57					else:
 58						raise Exception (l)
 59				elif state == 1: # expect a pinyin start or a hanzi
 60					if is_pinyin (c): # a pinyin start
 61						current_pinyin = [c]
 62						state = 2
 63					elif is_hanzi (c): # an hanzi
 64						pinyin.append (None)
 65						phrase.append (c)
 66					else: # finished
 67						pinyin.append (None)
 68						state = 3
 69				elif state == 2: # expect pinyin continue or hanzi
 70					if is_pinyin (c): # pinyin continue
 71						current_pinyin.append (c)
 72					elif is_hanzi (c): # hanzi
 73						pinyin.append (u"".join (current_pinyin))
 74						current_pinyin = None
 75						phrase.append (c)
 76						state = 1
 77					else: # finished
 78						pinyin.append (u"".join (current_pinyin))
 79						state = 3
 80				else: # finished
 81					continue
 82			i = 0
 83			for hanzi in phrase:
 84				if pinyin [i] == None:
 85					pys = hanzi_dic[hanzi]
 86					if len (pys) != 1:
 87						raise Exception (l)
 88					pinyin[i] = pys.keys ()[0]
 89				else:
 90					if pinyin[i] not in hanzi_dic[hanzi]:
 91						yield (u"".join (phrase), None)
 92						break
 93				i += 1
 94			yield (u"".join (phrase), u"'".join (map (str, pinyin)))
 95	
 96	i = 1
 97	try:
 98		for phrase, pinyin in parse_qq_phrase (bzf):
 99			if pinyin != None:
100				line = u"%s\t%s\t%d" % (phrase, pinyin, sogou_phrase.get (phrase, [0,0])[1])
101				print line.encode ("utf8")
102			i += 1
103	except Exception, e:
104		print u"%d : %s" % (i, e.message)
105	
106def main ():
107	srcdir = "."
108	if len (sys.argv) == 2:
109		srcdir = sys.argv[1]
110
111	# filename = "py.db"
112	# try:
113	# 	os.unlink (filename)
114	# except:
115	# 	pass
116	# print "Load phrase freq data"
117	#	freq_dict = {}
118	#	for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
119	#		l = unicode (l, "utf8")
120	#		l = re.split (ur"\t+", l)
121	#		freq_dict [l[0]] = int (l[1])
122	#	
123	#	print "Load char freq data"
124	#	for l in file (os.path.join (srcdir, "CharFreq-Modern_utf8.txt")):
125	#		l = unicode (l, "utf8")
126	#		l = re.split (ur"\t+", l)
127	#		freq_dict [l[0]] = int (l[2])
128	# 
129	# print "Create DB"
130	# db = PYSQLiteDB.PYSQLiteDB (filename)
131	# db.create_tables ()
132	# db.init_pinyin_table ()
133	# db.init_shengmu_table ()
134
135	print "Load pinyin_table.txt.bz2"
136	filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
137	bzf = bz2.BZ2File (filename, "r") 
138	hanzi_dic = PYUtil.load_pinyin_table (bzf)
139
140	print "Load SogouLabDic-utf8.dic"
141	filename = os.path.join (srcdir, "SogouLabDic-utf8.dic")
142	sogou_phrase = PYUtil.load_sogou_phrases (file (filename));
143
144	print "Load qq_pinyin_1.0.txt.bz2"
145	filename = os.path.join (srcdir, "qq_pinyin_1.0.txt.bz2")
146	qq_phrases = load_qq_phrases (filename, hanzi_dic, sogou_phrase)
147	
148
149if __name__ == "__main__":
150	main ()