PageRenderTime 137ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/python/engine/PinYin/tools/MergePhrases.py

http://scim-python.googlecode.com/
Python | 112 lines | 56 code | 16 blank | 40 comment | 10 complexity | 6579a33ff475be0d500f5cdc2db70e43 MD5 | raw file
  1. # vim: set noet ts=4:
  2. #
  3. # scim-python
  4. #
  5. # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  6. #
  7. #
  8. # This library is free software; you can redistribute it and/or
  9. # modify it under the terms of the GNU Lesser General Public
  10. # License as published by the Free Software Foundation; either
  11. # version 2 of the License, or (at your option) any later version.
  12. #
  13. # This library is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU Lesser General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU Lesser General Public
  19. # License along with this program; if not, write to the
  20. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  21. # Boston, MA 02111-1307 USA
  22. #
  23. # $Id: $
  24. #
  25. import sys, os, re
  26. import bz2
  27. try:
  28. import PYUtil
  29. import PYSQLiteDB
  30. except:
  31. path = os.path.dirname (__file__)
  32. path = os.path.join (path, "..")
  33. path = os.path.abspath (path)
  34. sys.path.append (path)
  35. import PYUtil
  36. import PYSQLiteDB
  37. def main ():
  38. srcdir = "."
  39. if len (sys.argv) == 2:
  40. srcdir = sys.argv[1]
  41. filename = "py.db"
  42. try:
  43. os.unlink (filename)
  44. except:
  45. pass
  46. # print "Load phrase freq data"
  47. # freq_dict = {}
  48. # for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
  49. # l = unicode (l, "utf8")
  50. # l = re.split (ur"\t+", l)
  51. # freq_dict [l[0]] = int (l[1])
  52. #
  53. # print "Load char freq data"
  54. # for l in file (os.path.join (srcdir, "CharFreq-Modern_utf8.txt")):
  55. # l = unicode (l, "utf8")
  56. # l = re.split (ur"\t+", l)
  57. # freq_dict [l[0]] = int (l[2])
  58. print "Create DB"
  59. db = PYSQLiteDB.PYSQLiteDB (filename)
  60. db.create_tables ()
  61. db.init_pinyin_table ()
  62. db.init_shengmu_table ()
  63. print "Load phrase_pinyin.txt.bz2"
  64. filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
  65. bzf = bz2.BZ2File (filename, "r")
  66. phrases_dic = PYUtil.load_phrase_pinyin (bzf)
  67. # db.add_phrases (phrase_pinyin_parser (bzf))
  68. print "Load pinyin_table.txt.bz2"
  69. filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
  70. bzf = bz2.BZ2File (filename, "r")
  71. hanzi_dic = PYUtil.load_pinyin_table (bzf)
  72. def print_phrase (phrase, phrase_orig, pinyins, freq):
  73. if not phrase:
  74. line = u"%s\t%s\t%d" % (phrase_orig, u"'".join (pinyins), freq)
  75. print line.encode ("utf-8")
  76. return
  77. if not hanzi_dic.has_key (phrase[0]):
  78. return
  79. for pinyin, f in hanzi_dic[phrase[0]].items ():
  80. print_phrase (phrase[1:], phrase_orig, pinyins + [pinyin], freq)
  81. for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
  82. w = unicode (l, "utf8")
  83. w = re.split (ur"\t+", w)
  84. if phrases_dic.has_key (w[0]):
  85. continue
  86. for phrase, pinyin, freq in phrases_dic [w[0]]:
  87. line = u"%s\t%s\t%d" % (phrase, pinyin, freq)
  88. print line.encode ("utf8")
  89. continue
  90. print_phrase (w[0], w[0], [], int (w[1]))
  91. return 0
  92. #
  93. # print "Load utf8pyPhrase.org"
  94. # db.add_phrases_from_file (os.path.join (srcdir, "utf8pyPhrase.org"), freq_dict)
  95. if __name__ == "__main__":
  96. main ()