m2m2format.py - Copyright 2011-2012 Josef Robert Novak This…

/script/m2m2format.py

https://code.google.com/p/phonetisaurus/ · Python · 105 lines · 67 code · 15 blank · 23 comment · 18 complexity · 09a206db9c5d3d994fcb3e0b673add30 MD5 · raw file


#!/usr/bin/python
#    Copyright 2011-2012 Josef Robert Novak
#
#    This file is part of Phonetisaurus.
#
#    Phonetisaurus is free software: you can redistribute it 
#    and/or modify it under the terms of the GNU General Public 
#    License as published by the Free Software Foundation, either 
#    version 3 of the License, or (at your option) any later version.
#
#    Phonetisaurus is distributed in the hope that it will be useful, but 
#    WITHOUT ANY WARRANTY; without even the implied warranty of 
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
#    General Public License for more details.
#
#    You should have received a copy of the GNU General Public 
#    License along with Phonetisaurus. If not, see http://www.gnu.org/licenses/.
import re, sys, os

def m2m2Format( dict_file, prefix="test", graph_sep="", phon_sep=" ", entry_sep="\t", reverse=False, swap=False, unify_case=False ):
    """
      Format the raw dictionary file for the m2m alignment tool.

      Reads ``dict_file`` as UTF-8, one ``word<entry_sep>pronunciation``
      entry per line, and writes ``<prefix>.train`` as UTF-8.  Every output
      line holds the space-joined grapheme tokens, a tab, and the
      space-joined phoneme tokens.  Blank input lines are skipped.

      Args:
          dict_file: Path of the raw pronunciation dictionary.
          prefix: Output path prefix; the result is written to
              ``prefix + ".train"``.
          graph_sep: Separator between graphemes inside the word.  The empty
              string means "one grapheme per character".
          phon_sep: Separator between phonemes inside the pronunciation.  The
              empty string means "one phoneme per character".
          entry_sep: Separator between the word and its pronunciation.
          reverse: If true, reverse both token sequences.
          swap: If true, write the phonemes first and the graphemes second.
          unify_case: If true, lowercase the word before tokenizing.

      Returns:
          The set of all individual characters occurring in any grapheme or
          phoneme token.

      Raises:
          ValueError: If ``graph_sep`` or ``phon_sep`` equals ``entry_sep``,
              or if a non-blank line does not split into exactly two fields
              on ``entry_sep``.
    """
    # io.open yields text (unicode) lines on both Python 2 and Python 3, so
    # decoding/encoding happens once at the file boundary.
    import io

    if graph_sep==entry_sep:
        raise ValueError("graph_sep and entry_sep share the same value!")
    elif phon_sep==entry_sep:
        raise ValueError("phon_sep and entry_sep share the same value!")

    def _drop_ignored(text):
        #We will map spaces, dashes, underscores, etc. categorically
        # to "" during training.  These will be auto-mapped to <eps>
        # during decoding.
        return text.replace(" ","").replace("_","").replace("-","")

    chars = set()
    # The context managers guarantee both files are closed even when a
    # malformed line aborts the conversion part-way through.
    with io.open( dict_file, "r", encoding="utf8" ) as dict_file_fp, \
         io.open( prefix + ".train", "w", encoding="utf8" ) as train_file_fp:
        for line_no, line in enumerate(dict_file_fp, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue

            fields = line.split(entry_sep)
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected exactly 2 fields separated by %r, got %d"
                    % (dict_file, line_no, entry_sep, len(fields))
                )
            word, pron = fields
            if unify_case:
                word = word.lower()

            if graph_sep=="":
                graphs = list(_drop_ignored(word))
            else:
                # Split BEFORE removing the ignored characters; otherwise a
                # graph_sep of " ", "_" or "-" would be erased from the word
                # and the whole word would collapse into a single token.
                graphs = [ _drop_ignored(g) for g in word.split(graph_sep) ]
            for g in graphs:
                chars.update(g)

            if phon_sep=="":
                phons = list(pron)
            else:
                phons = pron.split(phon_sep)
            for p in phons:
                chars.update(p)

            if reverse:
                graphs.reverse()
                phons.reverse()

            if swap:
                first, second = phons, graphs
            else:
                first, second = graphs, phons
            # Unicode literals keep the line a text object on Python 2 even
            # when both token lists are empty.
            train_file_fp.write(u"%s\t%s\n" % (u" ".join(first), u" ".join(second)))

    return chars


if __name__=="__main__":
    # Command-line entry point: parse the options and convert the dictionary
    # given by --dict into "<prefix>.train" via m2m2Format().
    import argparse

    example = """./m2m2format.py --dict training.dic --prefix test"""
    parser = argparse.ArgumentParser(description=example)
    parser.add_argument('--dict',     "-d", help="The input pronunciation dictionary.  This will be used to build the G2P/P2G model.", required=True )
    parser.add_argument('--graphsep', "-g", help="The grapheme separator for the raw dictionary file. Defaults to ''.", default="" )
    parser.add_argument('--phonsep',  "-e", help="The phoneme separator for the raw dictionary file. Defaults to ' '.", default=" " )
    parser.add_argument('--reverse',  "-r", help="Reverse the training data. 'word'->'drow'.", default=False, action="store_true")
    parser.add_argument('--entrysep', "-y", help="The word/pronunciation separator for the raw dictionary file. Defaults to '\t'.", default="\t" )
    parser.add_argument('--swap',     "-w", help="Swap the grapheme/phomeme input.  G2P vs. P2G model.", default=False, action="store_true" )
    parser.add_argument('--prefix',   "-p", help="A file prefix.  Will be prepended to all model files created during cascade generation.", default="test" )
    parser.add_argument('--verbose',  "-v", help='Verbose mode.', default=False, action="store_true")
    args = parser.parse_args()

    if args.verbose:
        # A single pre-formatted argument makes print behave identically as
        # a Python 2 statement and a Python 3 function; items() replaces the
        # Python-2-only iteritems().
        for attr, value in vars(args).items():
            print("%s %s" % (attr, value))

    m2m2Format(
        args.dict,
        prefix=args.prefix,
        graph_sep=args.graphsep,
        phon_sep=args.phonsep,
        entry_sep=args.entrysep,
        reverse=args.reverse,
        swap=args.swap
        )

Tech Fingerprint

Standard Library: OS Interaction

Alerts (5)

'def' Avoid long function definitions; keep signatures concise for readability
20
'list(' Avoid unnecessary list conversions; use generators where possible
48 52 55 59