/python/engine/PinYin/tools/MergeQQPhrases.py
http://scim-python.googlecode.com/ · Python · 150 lines · 91 code · 12 blank · 47 comment · 28 complexity · 70524409edd9cda53fb607a82fddba4e MD5 · raw file
- # vim: set noet ts=4:
- #
- # scim-python
- #
- # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
- #
- #
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this program; if not, write to the
- # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
- #
- # $Id: $
- #
- import sys, os, re
- import bz2
try:
    import PYUtil
    import PYSQLiteDB
except ImportError:
    # The modules were not found on the default search path: add the parent
    # directory of this script to sys.path and try once more.  Only
    # ImportError triggers the retry, so any other failure raised while
    # importing the modules is reported directly instead of being masked.
    path = os.path.dirname (__file__)
    path = os.path.join (path, "..")
    path = os.path.abspath (path)
    sys.path.append (path)
    import PYUtil
    import PYSQLiteDB
def _print_utf8 (text):
    """Print text to stdout, encoding it as UTF-8 when it is not a native str."""
    if not isinstance (text, str):
        # Python 2 unicode object: encode explicitly, because printing raw
        # unicode fails when stdout has no usable encoding (e.g. a pipe).
        text = text.encode ("utf8")
    print (text)


def _split_qq_line (line):
    """Split one decoded line into its hanzi and the pinyin written after each.

    A line is a sequence of hanzi, each optionally followed by lowercase
    a-z letters giving its pinyin.  The character u"0" ends the line; the
    caller appends one, and anything after the first u"0" is ignored.

    Returns (phrase, pinyin): two lists of equal length, where pinyin[k] is
    the letters that followed phrase[k], or None when there were none.

    Raises Exception (carrying the line) when the line does not start with
    a hanzi, which includes the empty line.
    """
    phrase = []
    pinyin = []
    letters = []    # pinyin letters collected for the most recent hanzi
    for ch in line:
        if ch == u"0":
            break
        if u"a" <= ch <= u"z":
            if not phrase:
                # pinyin letters before any hanzi
                raise Exception (line)
            letters.append (ch)
        else:
            # Any other character counts as a hanzi; it closes the pinyin
            # of the previous hanzi, if there is one.
            if phrase:
                pinyin.append (u"".join (letters) or None)
            phrase.append (ch)
            letters = []
    if not phrase:
        raise Exception (line)
    pinyin.append (u"".join (letters) or None)
    return phrase, pinyin


def _resolve_pinyin (phrase, pinyin, hanzi_dic, line):
    """Check the inline pinyin against hanzi_dic and fill in the missing ones.

    Returns the complete pinyin list, or None when some inline pinyin is not
    listed for its hanzi.  Raises Exception (carrying the line) when a hanzi
    has no inline pinyin and hanzi_dic does not list exactly one candidate;
    an unknown hanzi raises KeyError from the lookup.
    """
    resolved = []
    for hanzi, py in zip (phrase, pinyin):
        candidates = hanzi_dic[hanzi]
        if py is None:
            if len (candidates) != 1:
                raise Exception (line)
            py = list (candidates.keys ())[0]
        elif py not in candidates:
            return None
        resolved.append (py)
    return resolved


def load_qq_phrases (filename, hanzi_dic, sogou_phrase = None):
    """Print every valid phrase of a bz2-compressed QQ phrase file.

    Each input line is decoded as UTF-8 and parsed (see _split_qq_line).
    For every phrase whose pinyin can be resolved, one UTF-8 line
    "phrase<TAB>py1'py2'...<TAB>number" is printed to stdout.  A phrase
    whose inline pinyin is not listed in hanzi_dic is skipped and produces
    no output at all.

    filename     -- path of the bz2-compressed phrase file.
    hanzi_dic    -- mapping hanzi -> mapping whose keys are the valid pinyin.
    sogou_phrase -- optional mapping phrase -> sequence; the element at
                    index 1 is printed as the third column (0 when the
                    phrase is absent).  Presumably a frequency -- confirm
                    against PYUtil.load_sogou_phrases.

    Returns None.  The first error met while reading or parsing stops the
    processing; it is reported on stdout as "<line number> : <message>"
    instead of being raised.
    """
    if sogou_phrase is None:
        sogou_phrase = {}

    bzf = bz2.BZ2File (filename, "r")
    lineno = 1    # 1-based number of the input line being processed
    try:
        for raw in bzf:
            # u"0" is the end-of-line marker expected by the parser.
            text = raw.decode ("utf8").strip () + u"0"
            phrase, pinyin = _split_qq_line (text)
            resolved = _resolve_pinyin (phrase, pinyin, hanzi_dic, text)
            if resolved is not None:
                word = u"".join (phrase)
                number = sogou_phrase.get (word, [0, 0])[1]
                _print_utf8 (u"%s\t%s\t%d" % (word, u"'".join (resolved), number))
            lineno += 1
    except Exception as e:
        # Same value the deprecated e.message attribute used to carry.
        message = e.args[0] if len (e.args) == 1 else u""
        _print_utf8 (u"%d : %s" % (lineno, message))
    finally:
        bzf.close ()
-
def main ():
    """Load the pinyin table and the Sogou dictionary, then print the QQ phrases.

    The input files pinyin_table.txt.bz2, SogouLabDic-utf8.dic and
    qq_pinyin_1.0.txt.bz2 are read from the directory given as the single
    command line argument, or from the current directory when the script
    is not called with exactly one argument.
    """
    srcdir = "."
    if len (sys.argv) == 2:
        srcdir = sys.argv[1]

    print ("Load pinyin_table.txt.bz2")
    bzf = bz2.BZ2File (os.path.join (srcdir, "pinyin_table.txt.bz2"), "r")
    try:
        # NOTE(review): closing right after the call assumes the loader reads
        # the whole file before returning -- confirm in PYUtil.
        hanzi_dic = PYUtil.load_pinyin_table (bzf)
    finally:
        bzf.close ()

    print ("Load SogouLabDic-utf8.dic")
    dic_file = open (os.path.join (srcdir, "SogouLabDic-utf8.dic"))
    try:
        # NOTE(review): same assumption as above for load_sogou_phrases.
        sogou_phrase = PYUtil.load_sogou_phrases (dic_file)
    finally:
        dic_file.close ()

    print ("Load qq_pinyin_1.0.txt.bz2")
    filename = os.path.join (srcdir, "qq_pinyin_1.0.txt.bz2")
    # load_qq_phrases prints its result itself; there is nothing to keep.
    load_qq_phrases (filename, hanzi_dic, sogou_phrase)
-
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main ()