PageRenderTime 43ms CodeModel.GetById 10ms app.highlight 26ms RepoModel.GetById 1ms app.codeStats 0ms

/script/m2m2format.py

https://code.google.com/p/phonetisaurus/
Python | 105 lines | 87 code | 1 blank | 17 comment | 3 complexity | 09a206db9c5d3d994fcb3e0b673add30 MD5 | raw file
  1#!/usr/bin/python
  2#    Copyright 2011-2012 Josef Robert Novak
  3#
  4#    This file is part of Phonetisaurus.
  5#
  6#    Phonetisaurus is free software: you can redistribute it 
  7#    and/or modify it under the terms of the GNU General Public 
  8#    License as published by the Free Software Foundation, either 
  9#    version 3 of the License, or (at your option) any later version.
 10#
 11#    Phonetisaurus is distributed in the hope that it will be useful, but 
 12#    WITHOUT ANY WARRANTY; without even the implied warranty of 
 13#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 14#    General Public License for more details.
 15#
 16#    You should have received a copy of the GNU General Public 
 17#    License along with Phonetisaurus. If not, see http://www.gnu.org/licenses/.
 18import re, sys, os
 19
 20def m2m2Format( dict_file, prefix="test", graph_sep="", phon_sep=" ", entry_sep="\t", reverse=False, swap=False, unify_case=False ):
 21    """
 22      Format the raw dictionary file for the m2m alignment tool.
 23    """
 24
 25    if graph_sep==entry_sep:
 26        raise ValueError("graph_sep and entry_sep share the same value!")
 27    elif phon_sep==entry_sep:
 28        raise ValueError("phon_sep and entry_sep share the same value!")
 29
 30    dict_file_fp = open( dict_file, "r" )
 31    train_file_fp = open( "PREFIX.train".replace("PREFIX",prefix), "w" )
 32    chars = set([])
 33    for line in dict_file_fp:
 34        line = line.strip()
 35        line = line.decode("utf8")
 36        word = None; pron = None
 37        graphs = []; phons = [];
 38        
 39        word, pron = line.split(entry_sep)
 40        if unify_case==True:
 41            word = word.lower()
 42
 43        #We will map spaces, dashes, underscores, etc. categorically
 44        # to "" during training.  These will be auto-mapped to <eps>
 45        # during decoding.
 46        word = word.replace(" ","").replace("_","").replace("-","")
 47        if graph_sep=="":
 48            graphs = list(word)
 49        else:
 50            graphs = word.split(graph_sep)
 51        for g in graphs:
 52            chars.update( [ x for x in list(g) ])
 53
 54        if phon_sep=="":
 55            phons = list(pron)
 56        else:
 57            phons = pron.split(phon_sep)
 58        for p in phons:
 59            chars.update( [ x for x in list(p) ])
 60
 61        if reverse==True:
 62            graphs.reverse()
 63            phons.reverse()
 64
 65        if swap==True:
 66            outline = "%s\t%s\n" % (" ".join(phons), " ".join(graphs))
 67        else:
 68            outline = "%s\t%s\n" % (" ".join(graphs), " ".join(phons))
 69        outline = outline.encode("utf8")
 70        train_file_fp.write(outline)
 71    dict_file_fp.close()
 72    train_file_fp.close()
 73
 74    return chars
 75
 76
 77if __name__=="__main__":
 78    import sys, argparse
 79
 80    example = """./m2m2format.py --dict training.dic --prefix test"""
 81    parser = argparse.ArgumentParser(description=example)
 82    parser.add_argument('--dict',     "-d", help="The input pronunciation dictionary.  This will be used to build the G2P/P2G model.", required=True )
 83    parser.add_argument('--graphsep', "-g", help="The grapheme separator for the raw dictionary file. Defaults to ''.", default="" )
 84    parser.add_argument('--phonsep',  "-e", help="The phoneme separator for the raw dictionary file. Defaults to ' '.", default=" " )
 85    parser.add_argument('--reverse',  "-r", help="Reverse the training data. 'word'->'drow'.", default=False, action="store_true")
 86    parser.add_argument('--entrysep', "-y", help="The word/pronunciation separator for the raw dictionary file. Defaults to '\t'.", default="\t" )
 87    parser.add_argument('--swap',     "-w", help="Swap the grapheme/phomeme input.  G2P vs. P2G model.", default=False, action="store_true" )
 88    parser.add_argument('--prefix',   "-p", help="A file prefix.  Will be prepended to all model files created during cascade generation.", default="test" )
 89    parser.add_argument('--verbose',  "-v", help='Verbose mode.', default=False, action="store_true")
 90    args = parser.parse_args()
 91
 92    if args.verbose:
 93        for attr, value in args.__dict__.iteritems():
 94            print attr, value
 95        
 96    m2m2Format( 
 97        args.dict, 
 98        prefix=args.prefix, 
 99        graph_sep=args.graphsep, 
100        phon_sep=args.phonsep, 
101        entry_sep=args.entrysep, 
102        reverse=args.reverse, 
103        swap=args.swap 
104        )
105