/python/engine/PinYin/PYCreatePinYinDB.py
Python | 75 lines | 41 code | 10 blank | 24 comment | 5 complexity | bbe8a035f69fee31379e6b3c5d455f09 MD5 | raw file
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- import sys, os, re
- import PYSQLiteDB
- import bz2
def _parse_phrase_lines (lines, freq_scale = 1):
    """Yield (phrase, pinyin, freq) tuples parsed from UTF-8 encoded lines.

    Each non-blank line must hold exactly three whitespace-separated
    fields: phrase, pinyin and an integer frequency.  Every "u:" in the
    pinyin field is rewritten to "v", and the frequency is multiplied by
    freq_scale.

    Blank lines are skipped.  A line with any other number of fields, a
    non-integer frequency, or bytes that are not valid UTF-8 raises
    ValueError.
    """
    for raw in lines:
        # split () without arguments already discards leading/trailing
        # whitespace, so no separate strip () is needed.
        fields = raw.decode ("utf-8").split ()
        if not fields:
            continue
        phrase, pinyin, freq = fields
        yield (phrase, pinyin.replace (u"u:", u"v"), int (freq) * freq_scale)


def _add_phrases_from (db, stream, freq_scale = 1):
    """Pass the phrases parsed from stream to db.add_phrases, then close stream.

    The stream is closed even when parsing or insertion raises.
    NOTE(review): this assumes db.add_phrases consumes the iterator before
    it returns -- confirm against PYSQLiteDB.
    """
    try:
        db.add_phrases (_parse_phrase_lines (stream, freq_scale))
    finally:
        stream.close ()


def main ():
    """Rebuild "py.db" in the current directory from the pinyin data files.

    The source directory defaults to "." and is taken from sys.argv[1]
    when exactly one command-line argument is given.  Any existing
    "py.db" is removed first; then the tables are created and filled
    from pinyin_table.txt, phrase_pinyin.txt.bz2 and
    phrase_pinyin_duoyin.txt, and finally the database is optimized.
    """
    srcdir = "."
    if len (sys.argv) == 2:
        srcdir = sys.argv[1]

    filename = "py.db"
    try:
        os.unlink (filename)
    except OSError:
        # Best effort: typically the file simply does not exist yet.
        pass

    print ("Create DB")
    db = PYSQLiteDB.PYSQLiteDB (filename = filename)
    db.create_tables ()
    db.init_pinyin_table ()
    db.init_shengmu_table ()

    print ("Load pinyin_table.txt")
    filename = os.path.join (srcdir, "../../../data/pinyin_table.txt")
    # Frequencies from pinyin_table.txt are scaled by 1300; the factor is
    # kept unchanged from the original script.
    _add_phrases_from (db, open (filename, "rb"), 1300)

    print ("Load phrase_pinyin.txt.bz2")
    filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
    _add_phrases_from (db, bz2.BZ2File (filename, "r"))

    print ("Load phrase_pinyin_duoyin.txt")
    filename = os.path.join (srcdir, "phrase_pinyin_duoyin.txt")
    _add_phrases_from (db, open (filename, "rb"))

    print ("Optimizing database")
    db.optimize_database ()
-
# Build the database only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main ()