PageRenderTime 26ms CodeModel.GetById 1ms app.highlight 18ms RepoModel.GetById 2ms app.codeStats 0ms

/python/engine/PinYin/tools/MergePhrases.py

http://scim-python.googlecode.com/
Python | 112 lines | 56 code | 16 blank | 40 comment | 13 complexity | 6579a33ff475be0d500f5cdc2db70e43 MD5 | raw file
  1# vim: set noet ts=4:
  2#
  3# scim-python
  4#
  5# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  6#
  7#
  8# This library is free software; you can redistribute it and/or
  9# modify it under the terms of the GNU Lesser General Public
 10# License as published by the Free Software Foundation; either
 11# version 2 of the License, or (at your option) any later version.
 12#
 13# This library is distributed in the hope that it will be useful,
 14# but WITHOUT ANY WARRANTY; without even the implied warranty of
 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16# GNU Lesser General Public License for more details.
 17#
 18# You should have received a copy of the GNU Lesser General Public
 19# License along with this program; if not, write to the
 20# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 21# Boston, MA  02111-1307  USA
 22#
 23# $Id: $
 24#
 25import sys, os, re
 26import bz2
 27
 28try:
 29	import PYUtil
 30	import PYSQLiteDB
 31except:
 32	path = os.path.dirname (__file__)
 33	path = os.path.join (path, "..")
 34	path = os.path.abspath (path)
 35	sys.path.append (path)
 36	import PYUtil
 37	import PYSQLiteDB
 38
 39def main ():
 40	srcdir = "."
 41	if len (sys.argv) == 2:
 42		srcdir = sys.argv[1]
 43
 44	filename = "py.db"
 45	try:
 46		os.unlink (filename)
 47	except:
 48		pass
 49	# print "Load phrase freq data"
 50	#	freq_dict = {}
 51	#	for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
 52	#		l = unicode (l, "utf8")
 53	#		l = re.split (ur"\t+", l)
 54	#		freq_dict [l[0]] = int (l[1])
 55	#	
 56	#	print "Load char freq data"
 57	#	for l in file (os.path.join (srcdir, "CharFreq-Modern_utf8.txt")):
 58	#		l = unicode (l, "utf8")
 59	#		l = re.split (ur"\t+", l)
 60	#		freq_dict [l[0]] = int (l[2])
 61	
 62	print "Create DB"
 63	db = PYSQLiteDB.PYSQLiteDB (filename)
 64	db.create_tables ()
 65	db.init_pinyin_table ()
 66	db.init_shengmu_table ()
 67
 68	print "Load phrase_pinyin.txt.bz2"
 69	filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
 70	bzf = bz2.BZ2File (filename, "r")
 71	phrases_dic = PYUtil.load_phrase_pinyin (bzf)
 72	
 73	# db.add_phrases (phrase_pinyin_parser (bzf))
 74	
 75	print "Load pinyin_table.txt.bz2"
 76	filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
 77	bzf = bz2.BZ2File (filename, "r")
 78	hanzi_dic = PYUtil.load_pinyin_table (bzf)
 79		
 80	def print_phrase (phrase, phrase_orig, pinyins, freq):
 81		if not phrase:
 82			line = u"%s\t%s\t%d" % (phrase_orig, u"'".join (pinyins), freq)
 83			print line.encode ("utf-8")
 84			return
 85		if not hanzi_dic.has_key (phrase[0]):
 86			return
 87		for pinyin, f in hanzi_dic[phrase[0]].items ():
 88			print_phrase (phrase[1:], phrase_orig, pinyins + [pinyin], freq)
 89			
 90
 91	for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
 92		w = unicode (l, "utf8")
 93		w = re.split (ur"\t+", w)
 94		if phrases_dic.has_key (w[0]):
 95			continue
 96			for phrase, pinyin, freq in phrases_dic [w[0]]:
 97				line = u"%s\t%s\t%d" % (phrase, pinyin, freq)
 98				print line.encode ("utf8")
 99			continue
100		print_phrase (w[0], w[0], [], int (w[1]))
101
102	return 0
103		
104		
105		
106	# 
107	# print "Load utf8pyPhrase.org"
108	# db.add_phrases_from_file (os.path.join (srcdir, "utf8pyPhrase.org"), freq_dict)
109
110
if __name__ == "__main__":
	# Use main's return value as the process exit status instead of dropping it.
	sys.exit (main ())