PageRenderTime 30ms CodeModel.GetById 15ms app.highlight 12ms RepoModel.GetById 1ms app.codeStats 0ms

/m2m-aligner.py

https://code.google.com/p/phonetisaurus/
Python | 164 lines | 146 code | 1 blank | 17 comment | 11 complexity | 0ba77f713756efbf02ba11e0f47f8c54 MD5 | raw file
  1#!/usr/bin/python
  2#    Copyright 2011-2012 Josef Robert Novak
  3#
  4#    This file is part of Phonetisaurus.
  5#
  6#    Phonetisaurus is free software: you can redistribute it 
  7#    and/or modify it under the terms of the GNU General Public 
  8#    License as published by the Free Software Foundation, either 
  9#    version 3 of the License, or (at your option) any later version.
 10#
 11#    Phonetisaurus is distributed in the hope that it will be useful, but 
 12#    WITHOUT ANY WARRANTY; without even the implied warranty of 
 13#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 14#    General Public License for more details.
 15#
 16#    You should have received a copy of the GNU General Public 
 17#    License along with Phonetisaurus. If not, see http://www.gnu.org/licenses/.
 18from M2MFstAligner import M2MFstAligner
 19import sys
 20
 21def splitter( part, sep ):
 22    if sep=="":
 23        return list(part)
 24    else:
 25        return part.split(sep)
 26
 27def train_aligner( aligner, training_file, max_iter, s1in_sep="", s2in_sep=" ", s1s2in_sep="\t" ):
 28    print "Building WFST-based alignment corpus from training data..."
 29    for line in open(training_file,"r"):
 30        part1, part2 = line.strip().split(s1s2in_sep)
 31        seq1 = splitter( part1, s1in_sep )
 32        seq2 = splitter( part2, s2in_sep )
 33        aligner.entry2alignfst( seq1, seq2 )
 34    print "Finished adding entries..."
 35    print "Starting EM training..."
 36
 37    change = aligner.maximization(False)
 38    print "Finished initialization..."
 39    for i in xrange(max_iter):
 40        print "Iter:", i
 41        aligner.expectation()
 42        change = aligner.maximization(False)
 43        print "Change:", change
 44
 45    print "Last iteration..."
 46    aligner.expectation()
 47    aligner.maximization(True)
 48    return aligner
 49
 50def write_aligned_training_data( aligner, aligned_file, nbest=1 ):
 51    print "Writing %d-best alignments to file: %s." % (nbest, aligned_file)
 52    ofp = open(aligned_file,"w")
 53    for i in xrange(aligner.num_fsas()):
 54        results = aligner.write_alignment_wrapper( i, nbest )
 55        for result in results:
 56            ofp.write("%s\n"%" ".join(result.path))
 57    ofp.close()
 58    return
 59
 60def align_sequences( aligner, part1, part2, s1in_sep="", s2in_sep=" ", nbest=1, write_lattice=None ):
 61    """Convenience access to perform a single alignment.  Mainly for testing purposes."""
 62    seq1 = splitter( part1, s1in_sep )
 63    seq2 = splitter( part2, s2in_sep )
 64
 65    if write_lattice:
 66        print "Writing weighted alignment lattice to file: %s" % write_lattice
 67        results = aligner.entry2alignfstnoinit( seq1, seq2, nbest, write_lattice )
 68    else:
 69        results = aligner.entry2alignfstnoinit( seq1, seq2, nbest )
 70
 71    for result in results:
 72        print "%0.4f\t%s" % (result.pathcost, " ".join(result.path))
 73
 74    return
 75
 76
 77if __name__=="__main__":
 78    import sys, argparse
 79    from argparse import RawTextHelpFormatter
 80
 81    example2 = """Train a model:\n\t%s --align train.txt -s2 --write_model train.model.fst --write_align train.aligned""" % sys.argv[0]
 82    example1 = """Align a sequence:\n\t%s --model model.fst --string1 "aback" --string2 "x b @ k" --nbest 2""" % sys.argv[0]
 83    examples = example1+"\n\n"+example2
 84    parser = argparse.ArgumentParser(description=examples, formatter_class=RawTextHelpFormatter)
 85    parser.add_argument('--seq1_del',      "-s1", help="Allow deletions in sequence 1.", default=False, action="store_true" )
 86    parser.add_argument('--seq2_del',      "-s2", help="Allow deletions in sequence 2.", default=False, action="store_true" )
 87    parser.add_argument('--seq1_max',      "-m1", help="Maximum subsequence length for sequence 1. Defaults to 2.", default=2, type=int, required=False )
 88    parser.add_argument('--seq2_max',      "-m2", help="Maximum subsequence length for sequence 2. Defaults to 2.", default=2, type=int, required=False )
 89    parser.add_argument('--seq1_sep',      "-p1", help="Separator token for sequence 1. Defaults to '|'.", default="|", required=False )
 90    parser.add_argument('--seq2_sep',      "-p2", help="Separator token for sequence 2. Defaults to '|'.", default="|", required=False )
 91    parser.add_argument('--s1s2_sep',      "-ss", help="Separator token for seq1 and seq2 alignments. Defaults to '}'.", default="}", required=False )
 92    parser.add_argument('--eps',           "-e",  help="Epsilon symbol.  Defaults to '<eps>'.", default="<eps>", required=False )
 93    parser.add_argument('--skip',          "-s",  help="Skip/null symbol.  Defaults to '_'.", default="_", required=False )
 94    parser.add_argument('--write_model',   "-wm", help="Write the model to 'file'.", default=None, required=False )
 95    parser.add_argument('--write_align',   "-wa", help="Write the alignments to 'file'.", default=None, required=False )
 96    parser.add_argument('--s1in_sep',      "-i1", help="Separator for seq1 in the input training file. Defaults to ''.", default="", required=False )
 97    parser.add_argument('--s2in_sep',      "-i2", help="Separator for seq2 in the input training file. Defaults to ' '.", default=" ", required=False )
 98    parser.add_argument('--s1s2_delim',     "-d", help="Separator for seq1/seq2 in the input training file. Defaults to '\\\t'.", default="\t", required=False )
 99    parser.add_argument('--model',          "-m", help="Input FST-based alignment model.", default=None, required=False )
100    parser.add_argument('--align',          "-a", help="File containing sequences to be aligned.", default=None, required=False )
101    parser.add_argument('--nbest',          "-n", help="n-best. Defaults to 1.", default=1, type=int, required=False )
102    parser.add_argument('--iter',           "-i", help="Maximum number of iterations for EM. Defaults to 11.", default=11, type=int, required=False )
103    parser.add_argument('--threshold',      "-t", help="Threshold for EM training. Defaults to '.001'.", default=.001, type=float, required=False )
104    parser.add_argument('--string1',       "-r1", help="Input string1 to align.", required=False )
105    parser.add_argument('--string2',       "-r2", help="Input string2 to align.", required=False )
106    parser.add_argument('--no_penalty',    "-np", help="By default multi-subsequences are penalized during decoding.  Not needed for large training corpora.  Defaults to 'False'.", required=False, default=False, action="store_true" )
107    parser.add_argument('--write_lattice', "-l",  help="Write out the union of the weighted alignment lattices from the training corpus.", default=None, required=False )
108    parser.add_argument('--prefix',    "-p", help="Prefix used to generate the model and alignment files.  Defaults to 'test'.", required=False, default="test" )
109    parser.add_argument('--verbose',   "-v", help='Verbose mode.', default=False, action="store_true")
110    args = parser.parse_args()
111
112    if args.verbose:
113        for attr, value in args.__dict__.iteritems():
114            print attr, value
115
116    #aligner = M2MFstAligner( False, True, 2, 2, "|", "|", "}", "<eps>","_")
117    if args.align:
118        #Initialize a new aligner object for training
119        aligner = M2MFstAligner( 
120            args.seq1_del, 
121            args.seq2_del, 
122            args.seq1_max, 
123            args.seq2_max, 
124            args.seq1_sep, 
125            args.seq2_sep, 
126            args.s1s2_sep, 
127            args.eps, 
128            args.skip,
129            args.no_penalty
130        )
131        #Run the EM training
132        train_aligner( 
133            aligner, 
134            args.align, 
135            args.iter, 
136            s1in_sep=args.s1in_sep, 
137            s2in_sep=args.s2in_sep, 
138            s1s2in_sep=args.s1s2_delim
139            )
140        #Optionally write the model to disk
141        if args.write_model:
142            aligner.write_model( args.write_model )
143        #Optionally write the n-best alignments for the training corpus
144        if args.write_align:
145            write_aligned_training_data( aligner, args.write_align, nbest=args.nbest )
146        #Optionally write out the union of the weighted alignment lattices
147        # useful for building better joint n-gram models and MBR decoding.
148        if args.write_lattice:
149            aligner.write_lattice( args.write_lattice )
150    elif args.model:
151        #Iinitialize a new aligner object for testing
152        aligner = M2MFstAligner( args.model )
153        #Align a pair of sequences supplied via the command line
154        align_sequences( 
155            aligner, 
156            args.string1, 
157            args.string2, 
158            s1in_sep=args.s1in_sep, 
159            s2in_sep=args.s2in_sep, 
160            nbest=args.nbest,
161            write_lattice=args.write_lattice
162            )
163    else:
164        print "You need to specify a valid command sequence..."