/python/engine/PinYin/tools/MergeQQPhrases.py

http://scim-python.googlecode.com/ · Python · 150 lines · 91 code · 12 blank · 47 comment · 28 complexity · 70524409edd9cda53fb607a82fddba4e MD5 · raw file

  1. # vim: set noet ts=4:
  2. #
  3. # scim-python
  4. #
  5. # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  6. #
  7. #
  8. # This library is free software; you can redistribute it and/or
  9. # modify it under the terms of the GNU Lesser General Public
  10. # License as published by the Free Software Foundation; either
  11. # version 2 of the License, or (at your option) any later version.
  12. #
  13. # This library is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU Lesser General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU Lesser General Public
  19. # License along with this program; if not, write to the
  20. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  21. # Boston, MA 02111-1307 USA
  22. #
  23. # $Id: $
  24. #
  25. import sys, os, re
  26. import bz2
  27. try:
  28. import PYUtil
  29. import PYSQLiteDB
  30. except:
  31. path = os.path.dirname (__file__)
  32. path = os.path.join (path, "..")
  33. path = os.path.abspath (path)
  34. sys.path.append (path)
  35. import PYUtil
  36. import PYSQLiteDB
  37. def load_qq_phrases (filename, hanzi_dic, sogou_phrase = {}):
  38. bzf = bz2.BZ2File (filename, "r")
  39. def parse_qq_phrase (f):
  40. is_pinyin = lambda x: (x >= u"a" and x <= u"z")
  41. is_hanzi = lambda x: not (is_pinyin (x) or x == u"0")
  42. for l in f:
  43. l = unicode (l, "utf8").strip () + "0"
  44. phrase = []
  45. pinyin = []
  46. current_pinyin = None
  47. state = 0
  48. for c in l:
  49. if state == 0: # expect a hanzi
  50. if is_hanzi (c):
  51. phrase.append (c)
  52. state = 1
  53. else:
  54. raise Exception (l)
  55. elif state == 1: # expect a pinyin start or a hanzi
  56. if is_pinyin (c): # a pinyin start
  57. current_pinyin = [c]
  58. state = 2
  59. elif is_hanzi (c): # an hanzi
  60. pinyin.append (None)
  61. phrase.append (c)
  62. else: # finished
  63. pinyin.append (None)
  64. state = 3
  65. elif state == 2: # expect pinyin continue or hanzi
  66. if is_pinyin (c): # pinyin continue
  67. current_pinyin.append (c)
  68. elif is_hanzi (c): # hanzi
  69. pinyin.append (u"".join (current_pinyin))
  70. current_pinyin = None
  71. phrase.append (c)
  72. state = 1
  73. else: # finished
  74. pinyin.append (u"".join (current_pinyin))
  75. state = 3
  76. else: # finished
  77. continue
  78. i = 0
  79. for hanzi in phrase:
  80. if pinyin [i] == None:
  81. pys = hanzi_dic[hanzi]
  82. if len (pys) != 1:
  83. raise Exception (l)
  84. pinyin[i] = pys.keys ()[0]
  85. else:
  86. if pinyin[i] not in hanzi_dic[hanzi]:
  87. yield (u"".join (phrase), None)
  88. break
  89. i += 1
  90. yield (u"".join (phrase), u"'".join (map (str, pinyin)))
  91. i = 1
  92. try:
  93. for phrase, pinyin in parse_qq_phrase (bzf):
  94. if pinyin != None:
  95. line = u"%s\t%s\t%d" % (phrase, pinyin, sogou_phrase.get (phrase, [0,0])[1])
  96. print line.encode ("utf8")
  97. i += 1
  98. except Exception, e:
  99. print u"%d : %s" % (i, e.message)
  100. def main ():
  101. srcdir = "."
  102. if len (sys.argv) == 2:
  103. srcdir = sys.argv[1]
  104. # filename = "py.db"
  105. # try:
  106. # os.unlink (filename)
  107. # except:
  108. # pass
  109. # print "Load phrase freq data"
  110. # freq_dict = {}
  111. # for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
  112. # l = unicode (l, "utf8")
  113. # l = re.split (ur"\t+", l)
  114. # freq_dict [l[0]] = int (l[1])
  115. #
  116. # print "Load char freq data"
  117. # for l in file (os.path.join (srcdir, "CharFreq-Modern_utf8.txt")):
  118. # l = unicode (l, "utf8")
  119. # l = re.split (ur"\t+", l)
  120. # freq_dict [l[0]] = int (l[2])
  121. #
  122. # print "Create DB"
  123. # db = PYSQLiteDB.PYSQLiteDB (filename)
  124. # db.create_tables ()
  125. # db.init_pinyin_table ()
  126. # db.init_shengmu_table ()
  127. print "Load pinyin_table.txt.bz2"
  128. filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
  129. bzf = bz2.BZ2File (filename, "r")
  130. hanzi_dic = PYUtil.load_pinyin_table (bzf)
  131. print "Load SogouLabDic-utf8.dic"
  132. filename = os.path.join (srcdir, "SogouLabDic-utf8.dic")
  133. sogou_phrase = PYUtil.load_sogou_phrases (file (filename));
  134. print "Load qq_pinyin_1.0.txt.bz2"
  135. filename = os.path.join (srcdir, "qq_pinyin_1.0.txt.bz2")
  136. qq_phrases = load_qq_phrases (filename, hanzi_dic, sogou_phrase)
  137. if __name__ == "__main__":
  138. main ()