/m2m-aligner.py

https://code.google.com/p/phonetisaurus/ · Python · 164 lines · 123 code · 14 blank · 27 comment · 18 complexity · 0ba77f713756efbf02ba11e0f47f8c54 MD5 · raw file

  1. #!/usr/bin/python
  2. # Copyright 2011-2012 Josef Robert Novak
  3. #
  4. # This file is part of Phonetisaurus.
  5. #
  6. # Phonetisaurus is free software: you can redistribute it
  7. # and/or modify it under the terms of the GNU General Public
  8. # License as published by the Free Software Foundation, either
  9. # version 3 of the License, or (at your option) any later version.
  10. #
  11. # Phonetisaurus is distributed in the hope that it will be useful, but
  12. # WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. # General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public
  17. # License along with Phonetisaurus. If not, see http://www.gnu.org/licenses/.
  18. from M2MFstAligner import M2MFstAligner
  19. import sys
  20. def splitter( part, sep ):
  21. if sep=="":
  22. return list(part)
  23. else:
  24. return part.split(sep)
  25. def train_aligner( aligner, training_file, max_iter, s1in_sep="", s2in_sep=" ", s1s2in_sep="\t" ):
  26. print "Building WFST-based alignment corpus from training data..."
  27. for line in open(training_file,"r"):
  28. part1, part2 = line.strip().split(s1s2in_sep)
  29. seq1 = splitter( part1, s1in_sep )
  30. seq2 = splitter( part2, s2in_sep )
  31. aligner.entry2alignfst( seq1, seq2 )
  32. print "Finished adding entries..."
  33. print "Starting EM training..."
  34. change = aligner.maximization(False)
  35. print "Finished initialization..."
  36. for i in xrange(max_iter):
  37. print "Iter:", i
  38. aligner.expectation()
  39. change = aligner.maximization(False)
  40. print "Change:", change
  41. print "Last iteration..."
  42. aligner.expectation()
  43. aligner.maximization(True)
  44. return aligner
  45. def write_aligned_training_data( aligner, aligned_file, nbest=1 ):
  46. print "Writing %d-best alignments to file: %s." % (nbest, aligned_file)
  47. ofp = open(aligned_file,"w")
  48. for i in xrange(aligner.num_fsas()):
  49. results = aligner.write_alignment_wrapper( i, nbest )
  50. for result in results:
  51. ofp.write("%s\n"%" ".join(result.path))
  52. ofp.close()
  53. return
  54. def align_sequences( aligner, part1, part2, s1in_sep="", s2in_sep=" ", nbest=1, write_lattice=None ):
  55. """Convenience access to perform a single alignment. Mainly for testing purposes."""
  56. seq1 = splitter( part1, s1in_sep )
  57. seq2 = splitter( part2, s2in_sep )
  58. if write_lattice:
  59. print "Writing weighted alignment lattice to file: %s" % write_lattice
  60. results = aligner.entry2alignfstnoinit( seq1, seq2, nbest, write_lattice )
  61. else:
  62. results = aligner.entry2alignfstnoinit( seq1, seq2, nbest )
  63. for result in results:
  64. print "%0.4f\t%s" % (result.pathcost, " ".join(result.path))
  65. return
  66. if __name__=="__main__":
  67. import sys, argparse
  68. from argparse import RawTextHelpFormatter
  69. example2 = """Train a model:\n\t%s --align train.txt -s2 --write_model train.model.fst --write_align train.aligned""" % sys.argv[0]
  70. example1 = """Align a sequence:\n\t%s --model model.fst --string1 "aback" --string2 "x b @ k" --nbest 2""" % sys.argv[0]
  71. examples = example1+"\n\n"+example2
  72. parser = argparse.ArgumentParser(description=examples, formatter_class=RawTextHelpFormatter)
  73. parser.add_argument('--seq1_del', "-s1", help="Allow deletions in sequence 1.", default=False, action="store_true" )
  74. parser.add_argument('--seq2_del', "-s2", help="Allow deletions in sequence 2.", default=False, action="store_true" )
  75. parser.add_argument('--seq1_max', "-m1", help="Maximum subsequence length for sequence 1. Defaults to 2.", default=2, type=int, required=False )
  76. parser.add_argument('--seq2_max', "-m2", help="Maximum subsequence length for sequence 2. Defaults to 2.", default=2, type=int, required=False )
  77. parser.add_argument('--seq1_sep', "-p1", help="Separator token for sequence 1. Defaults to '|'.", default="|", required=False )
  78. parser.add_argument('--seq2_sep', "-p2", help="Separator token for sequence 2. Defaults to '|'.", default="|", required=False )
  79. parser.add_argument('--s1s2_sep', "-ss", help="Separator token for seq1 and seq2 alignments. Defaults to '}'.", default="}", required=False )
  80. parser.add_argument('--eps', "-e", help="Epsilon symbol. Defaults to '<eps>'.", default="<eps>", required=False )
  81. parser.add_argument('--skip', "-s", help="Skip/null symbol. Defaults to '_'.", default="_", required=False )
  82. parser.add_argument('--write_model', "-wm", help="Write the model to 'file'.", default=None, required=False )
  83. parser.add_argument('--write_align', "-wa", help="Write the alignments to 'file'.", default=None, required=False )
  84. parser.add_argument('--s1in_sep', "-i1", help="Separator for seq1 in the input training file. Defaults to ''.", default="", required=False )
  85. parser.add_argument('--s2in_sep', "-i2", help="Separator for seq2 in the input training file. Defaults to ' '.", default=" ", required=False )
  86. parser.add_argument('--s1s2_delim', "-d", help="Separator for seq1/seq2 in the input training file. Defaults to '\\\t'.", default="\t", required=False )
  87. parser.add_argument('--model', "-m", help="Input FST-based alignment model.", default=None, required=False )
  88. parser.add_argument('--align', "-a", help="File containing sequences to be aligned.", default=None, required=False )
  89. parser.add_argument('--nbest', "-n", help="n-best. Defaults to 1.", default=1, type=int, required=False )
  90. parser.add_argument('--iter', "-i", help="Maximum number of iterations for EM. Defaults to 11.", default=11, type=int, required=False )
  91. parser.add_argument('--threshold', "-t", help="Threshold for EM training. Defaults to '.001'.", default=.001, type=float, required=False )
  92. parser.add_argument('--string1', "-r1", help="Input string1 to align.", required=False )
  93. parser.add_argument('--string2', "-r2", help="Input string2 to align.", required=False )
  94. parser.add_argument('--no_penalty', "-np", help="By default multi-subsequences are penalized during decoding. Not needed for large training corpora. Defaults to 'False'.", required=False, default=False, action="store_true" )
  95. parser.add_argument('--write_lattice', "-l", help="Write out the union of the weighted alignment lattices from the training corpus.", default=None, required=False )
  96. parser.add_argument('--prefix', "-p", help="Prefix used to generate the model and alignment files. Defaults to 'test'.", required=False, default="test" )
  97. parser.add_argument('--verbose', "-v", help='Verbose mode.', default=False, action="store_true")
  98. args = parser.parse_args()
  99. if args.verbose:
  100. for attr, value in args.__dict__.iteritems():
  101. print attr, value
  102. #aligner = M2MFstAligner( False, True, 2, 2, "|", "|", "}", "<eps>","_")
  103. if args.align:
  104. #Initialize a new aligner object for training
  105. aligner = M2MFstAligner(
  106. args.seq1_del,
  107. args.seq2_del,
  108. args.seq1_max,
  109. args.seq2_max,
  110. args.seq1_sep,
  111. args.seq2_sep,
  112. args.s1s2_sep,
  113. args.eps,
  114. args.skip,
  115. args.no_penalty
  116. )
  117. #Run the EM training
  118. train_aligner(
  119. aligner,
  120. args.align,
  121. args.iter,
  122. s1in_sep=args.s1in_sep,
  123. s2in_sep=args.s2in_sep,
  124. s1s2in_sep=args.s1s2_delim
  125. )
  126. #Optionally write the model to disk
  127. if args.write_model:
  128. aligner.write_model( args.write_model )
  129. #Optionally write the n-best alignments for the training corpus
  130. if args.write_align:
  131. write_aligned_training_data( aligner, args.write_align, nbest=args.nbest )
  132. #Optionally write out the union of the weighted alignment lattices
  133. # useful for building better joint n-gram models and MBR decoding.
  134. if args.write_lattice:
  135. aligner.write_lattice( args.write_lattice )
  136. elif args.model:
  137. #Iinitialize a new aligner object for testing
  138. aligner = M2MFstAligner( args.model )
  139. #Align a pair of sequences supplied via the command line
  140. align_sequences(
  141. aligner,
  142. args.string1,
  143. args.string2,
  144. s1in_sep=args.s1in_sep,
  145. s2in_sep=args.s2in_sep,
  146. nbest=args.nbest,
  147. write_lattice=args.write_lattice
  148. )
  149. else:
  150. print "You need to specify a valid command sequence..."