/python/engine/PinYin/tools/MergePhrases.py
Python | 112 lines | 56 code | 16 blank | 40 comment | 10 complexity | 6579a33ff475be0d500f5cdc2db70e43 MD5 | raw file
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- import sys, os, re
- import bz2
# PYUtil and PYSQLiteDB are project modules. Try the normal import path
# first; if they are not importable, add the parent of this script's
# directory to sys.path and retry.
try:
    import PYUtil
    import PYSQLiteDB
except ImportError:
    # Only a failed import triggers the fallback; any other error raised
    # while importing (e.g. a SyntaxError inside the module) propagates.
    path = os.path.abspath (os.path.join (os.path.dirname (__file__), ".."))
    sys.path.append (path)
    import PYUtil
    import PYSQLiteDB
def main ():
    """Recreate py.db and print pinyin lines for Sogou phrases not yet known.

    The source directory is sys.argv[1] when exactly one command-line
    argument is given, otherwise the current directory. It must contain
    phrase_pinyin.txt.bz2, pinyin_table.txt.bz2 and SogouLabDic-utf8.dic.

    Steps:
      1. Remove any existing py.db and create a new one through
         PYSQLiteDB (tables, pinyin table and shengmu table).
      2. Load the phrase and per-character pinyin tables via PYUtil.
      3. For every tab-separated line of SogouLabDic-utf8.dic whose first
         field is not already a key of the phrase table, print one UTF-8
         line "<phrase>\\t<pinyin'pinyin'...>\\t<freq>" for every
         combination of the readings listed for its characters. A phrase
         containing a character missing from the pinyin table prints
         nothing.

    Progress messages and result lines are both written to stdout.

    Returns:
        0 on completion.

    Raises:
        ValueError: if the second field of a dictionary line is not an
            integer.
    """
    srcdir = "."
    if len (sys.argv) == 2:
        srcdir = sys.argv[1]

    filename = "py.db"
    try:
        os.unlink (filename)
    except OSError:
        # Best effort: normally the database simply does not exist yet.
        pass

    # Single-argument print (...) parses as the same statement on Python 2.
    print ("Create DB")
    # NOTE(review): db is not used again below and is never explicitly
    # closed -- confirm PYSQLiteDB persists its tables on its own.
    db = PYSQLiteDB.PYSQLiteDB (filename)
    db.create_tables ()
    db.init_pinyin_table ()
    db.init_shengmu_table ()

    print ("Load phrase_pinyin.txt.bz2")
    bzf = bz2.BZ2File (os.path.join (srcdir, "phrase_pinyin.txt.bz2"), "r")
    try:
        phrases_dic = PYUtil.load_phrase_pinyin (bzf)
    finally:
        bzf.close ()

    print ("Load pinyin_table.txt.bz2")
    bzf = bz2.BZ2File (os.path.join (srcdir, "pinyin_table.txt.bz2"), "r")
    try:
        hanzi_dic = PYUtil.load_pinyin_table (bzf)
    finally:
        bzf.close ()

    def print_phrase (phrase, phrase_orig, pinyins, freq):
        """Print phrase_orig once per combination of character readings.

        phrase is the part still to be expanded, pinyins the readings
        chosen so far for the characters already consumed.
        """
        if not phrase:
            line = u"%s\t%s\t%d" % (phrase_orig, u"'".join (pinyins), freq)
            print (line.encode ("utf-8"))
            return
        if phrase[0] not in hanzi_dic:
            # Unknown character: this phrase cannot be spelled, drop it.
            return
        for pinyin, _unused in hanzi_dic[phrase[0]].items ():
            print_phrase (phrase[1:], phrase_orig, pinyins + [pinyin], freq)

    tab_run = re.compile (r"\t+")
    # Read bytes and decode explicitly; line endings are stripped below.
    dic = open (os.path.join (srcdir, "SogouLabDic-utf8.dic"), "rb")
    try:
        for raw in dic:
            fields = tab_run.split (raw.decode ("utf8").rstrip (u"\r\n"))
            word = fields[0]
            if word in phrases_dic:
                # Already covered by phrase_pinyin.txt.bz2.
                continue
            if len (fields) < 2:
                # Blank line or missing frequency column: nothing to emit.
                continue
            print_phrase (word, word, [], int (fields[1]))
    finally:
        dic.close ()
    return 0
-
-
-
- #
- # print "Load utf8pyPhrase.org"
- # db.add_phrases_from_file (os.path.join (srcdir, "utf8pyPhrase.org"), freq_dict)
if __name__ == "__main__":
    # Hand main ()'s return value to the shell as the exit status.
    sys.exit (main ())