PageRenderTime 33ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/python/engine/PinYin/PYCreatePinYinDB.py

http://scim-python.googlecode.com/
Python | 75 lines | 41 code | 10 blank | 24 comment | 5 complexity | bbe8a035f69fee31379e6b3c5d455f09 MD5 | raw file
  1. # vim: set noet ts=4:
  2. #
  3. # scim-python
  4. #
  5. # Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
  6. #
  7. #
  8. # This library is free software; you can redistribute it and/or
  9. # modify it under the terms of the GNU Lesser General Public
  10. # License as published by the Free Software Foundation; either
  11. # version 2 of the License, or (at your option) any later version.
  12. #
  13. # This library is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU Lesser General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU Lesser General Public
  19. # License along with this program; if not, write to the
  20. # Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  21. # Boston, MA 02111-1307 USA
  22. #
  23. # $Id: $
  24. #
  25. import sys, os, re
  26. import PYSQLiteDB
  27. import bz2
  28. def main ():
  29. srcdir = "."
  30. if len (sys.argv) == 2:
  31. srcdir = sys.argv[1]
  32. filename = "py.db"
  33. try:
  34. os.unlink (filename)
  35. except:
  36. pass
  37. print "Create DB"
  38. db = PYSQLiteDB.PYSQLiteDB (filename = filename)
  39. db.create_tables ()
  40. db.init_pinyin_table ()
  41. db.init_shengmu_table ()
  42. def phrase_pinyin_parser (f):
  43. for l in f:
  44. phrase, pinyin, freq = unicode (l, "utf-8").strip ().split ()
  45. pinyin = pinyin.replace (u"u:", u"v")
  46. yield (phrase, pinyin, int (freq))
  47. def phrase_pinyin_parser_pinyin (f):
  48. for l in f:
  49. phrase, pinyin, freq = unicode (l, "utf-8").strip ().split ()
  50. pinyin = pinyin.replace (u"u:", u"v")
  51. yield (phrase, pinyin, int (freq)*1300)
  52. print "Load pinyin_table.txt"
  53. filename = os.path.join (srcdir, "../../../data/pinyin_table.txt")
  54. db.add_phrases (phrase_pinyin_parser_pinyin (file (filename)))
  55. print "Load phrase_pinyin.txt.bz2"
  56. filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
  57. bzf = bz2.BZ2File (filename, "r")
  58. db.add_phrases (phrase_pinyin_parser (bzf))
  59. print "Load phrase_pinyin_duoyin.txt"
  60. filename = os.path.join (srcdir, "phrase_pinyin_duoyin.txt")
  61. db.add_phrases (phrase_pinyin_parser (file (filename)))
  62. print "Optimizing database"
  63. db.optimize_database ()
  64. if __name__ == "__main__":
  65. main ()