/script/m2m2format.py

https://code.google.com/p/phonetisaurus/ · Python · 105 lines · 67 code · 15 blank · 23 comment · 18 complexity · 09a206db9c5d3d994fcb3e0b673add30 MD5 · raw file

  1. #!/usr/bin/python
  2. # Copyright 2011-2012 Josef Robert Novak
  3. #
  4. # This file is part of Phonetisaurus.
  5. #
  6. # Phonetisaurus is free software: you can redistribute it
  7. # and/or modify it under the terms of the GNU General Public
  8. # License as published by the Free Software Foundation, either
  9. # version 3 of the License, or (at your option) any later version.
  10. #
  11. # Phonetisaurus is distributed in the hope that it will be useful, but
  12. # WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. # General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public
  17. # License along with Phonetisaurus. If not, see http://www.gnu.org/licenses/.
  18. import re, sys, os
  19. def m2m2Format( dict_file, prefix="test", graph_sep="", phon_sep=" ", entry_sep="\t", reverse=False, swap=False, unify_case=False ):
  20. """
  21. Format the raw dictionary file for the m2m alignment tool.
  22. """
  23. if graph_sep==entry_sep:
  24. raise ValueError("graph_sep and entry_sep share the same value!")
  25. elif phon_sep==entry_sep:
  26. raise ValueError("phon_sep and entry_sep share the same value!")
  27. dict_file_fp = open( dict_file, "r" )
  28. train_file_fp = open( "PREFIX.train".replace("PREFIX",prefix), "w" )
  29. chars = set([])
  30. for line in dict_file_fp:
  31. line = line.strip()
  32. line = line.decode("utf8")
  33. word = None; pron = None
  34. graphs = []; phons = [];
  35. word, pron = line.split(entry_sep)
  36. if unify_case==True:
  37. word = word.lower()
  38. #We will map spaces, dashes, underscores, etc. categorically
  39. # to "" during training. These will be auto-mapped to <eps>
  40. # during decoding.
  41. word = word.replace(" ","").replace("_","").replace("-","")
  42. if graph_sep=="":
  43. graphs = list(word)
  44. else:
  45. graphs = word.split(graph_sep)
  46. for g in graphs:
  47. chars.update( [ x for x in list(g) ])
  48. if phon_sep=="":
  49. phons = list(pron)
  50. else:
  51. phons = pron.split(phon_sep)
  52. for p in phons:
  53. chars.update( [ x for x in list(p) ])
  54. if reverse==True:
  55. graphs.reverse()
  56. phons.reverse()
  57. if swap==True:
  58. outline = "%s\t%s\n" % (" ".join(phons), " ".join(graphs))
  59. else:
  60. outline = "%s\t%s\n" % (" ".join(graphs), " ".join(phons))
  61. outline = outline.encode("utf8")
  62. train_file_fp.write(outline)
  63. dict_file_fp.close()
  64. train_file_fp.close()
  65. return chars
  66. if __name__=="__main__":
  67. import sys, argparse
  68. example = """./m2m2format.py --dict training.dic --prefix test"""
  69. parser = argparse.ArgumentParser(description=example)
  70. parser.add_argument('--dict', "-d", help="The input pronunciation dictionary. This will be used to build the G2P/P2G model.", required=True )
  71. parser.add_argument('--graphsep', "-g", help="The grapheme separator for the raw dictionary file. Defaults to ''.", default="" )
  72. parser.add_argument('--phonsep', "-e", help="The phoneme separator for the raw dictionary file. Defaults to ' '.", default=" " )
  73. parser.add_argument('--reverse', "-r", help="Reverse the training data. 'word'->'drow'.", default=False, action="store_true")
  74. parser.add_argument('--entrysep', "-y", help="The word/pronunciation separator for the raw dictionary file. Defaults to '\t'.", default="\t" )
  75. parser.add_argument('--swap', "-w", help="Swap the grapheme/phomeme input. G2P vs. P2G model.", default=False, action="store_true" )
  76. parser.add_argument('--prefix', "-p", help="A file prefix. Will be prepended to all model files created during cascade generation.", default="test" )
  77. parser.add_argument('--verbose', "-v", help='Verbose mode.', default=False, action="store_true")
  78. args = parser.parse_args()
  79. if args.verbose:
  80. for attr, value in args.__dict__.iteritems():
  81. print attr, value
  82. m2m2Format(
  83. args.dict,
  84. prefix=args.prefix,
  85. graph_sep=args.graphsep,
  86. phon_sep=args.phonsep,
  87. entry_sep=args.entrysep,
  88. reverse=args.reverse,
  89. swap=args.swap
  90. )