PageRenderTime 82ms CodeModel.GetById 40ms app.highlight 6ms RepoModel.GetById 34ms app.codeStats 1ms

/python/engine/PinYin/PYCreatePinYinDB.py

http://scim-python.googlecode.com/
Python | 75 lines | 41 code | 10 blank | 24 comment | 6 complexity | bbe8a035f69fee31379e6b3c5d455f09 MD5 | raw file
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#
import bz2
import os
import re
import sys

import PYSQLiteDB
28
def main ():
	"""Build the pinyin phrase database file ``py.db`` in the current directory.

	If exactly one command-line argument is given it is used as the source
	directory for the phrase data files; otherwise "." is used.  Any existing
	``py.db`` is removed first.  The tables are then created and initialised,
	phrases are loaded from pinyin_table.txt, phrase_pinyin.txt.bz2 and
	phrase_pinyin_duoyin.txt (in that order), and the database is optimized.

	Raises:
		ValueError: if a non-blank data line does not consist of exactly
			three whitespace-separated fields, or its last field is not
			an integer.
		IOError/OSError: if one of the data files cannot be opened.
	"""
	srcdir = "."
	if len (sys.argv) == 2:
		srcdir = sys.argv[1]

	db_filename = "py.db"
	try:
		os.unlink (db_filename)
	except OSError:
		# Best effort: usually the database simply does not exist yet.
		# Only OS-level failures are ignored, so Ctrl-C and programming
		# errors are no longer swallowed.
		pass

	# print with a single parenthesised argument produces the same output
	# under the Python 2 print statement and the Python 3 print function.
	print ("Create DB")
	db = PYSQLiteDB.PYSQLiteDB (filename = db_filename)
	db.create_tables ()
	db.init_pinyin_table ()
	db.init_shengmu_table ()

	def phrase_pinyin_parser (f, freq_scale = 1):
		"""Yield (phrase, pinyin, freq * freq_scale) tuples parsed from f.

		f is an iterable of UTF-8 encoded byte lines, each holding three
		whitespace-separated fields: phrase, pinyin and an integer
		frequency.  "u:" in the pinyin is rewritten to "v".  Blank lines
		are skipped; any other malformed line raises ValueError.
		"""
		for l in f:
			fields = l.decode ("utf-8").split ()
			if not fields:
				continue
			phrase, pinyin, freq = fields
			pinyin = pinyin.replace (u"u:", u"v")
			yield (phrase, pinyin, int (freq) * freq_scale)

	def load_phrases (f, freq_scale = 1):
		"""Feed every phrase parsed from the open file f to the database.

		f is always closed afterwards, even if parsing or insertion fails.
		NOTE(review): assumes db.add_phrases consumes the iterator before
		returning -- confirm against PYSQLiteDB.
		"""
		try:
			db.add_phrases (phrase_pinyin_parser (f, freq_scale))
		finally:
			f.close ()

	# Files are opened in binary mode so that lines are byte strings on both
	# Python 2 and Python 3 and can be decoded explicitly as UTF-8.
	print ("Load pinyin_table.txt")
	filename = os.path.join (srcdir, "../../../data/pinyin_table.txt")
	# Frequencies from pinyin_table.txt are multiplied by 1300, as in the
	# original script.
	load_phrases (open (filename, "rb"), 1300)

	print ("Load phrase_pinyin.txt.bz2")
	filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
	load_phrases (bz2.BZ2File (filename, "r"))

	print ("Load phrase_pinyin_duoyin.txt")
	filename = os.path.join (srcdir, "phrase_pinyin_duoyin.txt")
	load_phrases (open (filename, "rb"))

	print ("Optimizing database")
	db.optimize_database ()

# Build the database only when this file is run as a script, not on import.
if __name__ == "__main__":
	main ()