textgrid.py | searchcode

/nltk_contrib/nltk_contrib/textgrid.py

http://nltk.googlecode.com/
Python | 653 lines | 562 code | 5 blank | 86 comment | 1 complexity | 37522ac4b7b58e12160790e6d88560db MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0

# Natural Language Toolkit: TextGrid analysis
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Margaret Mitchell <itallow@gmail.com>
#         Steven Bird <sb@csse.unimelb.edu.au> (revisions)
# URL: <http://www.nltk.org>
# For license information, see LICENSE.TXT
#

"""
Tools for reading TextGrid files, the format used by Praat.

Module contents
===============

The textgrid corpus reader provides 4 data items and 1 function
for each textgrid file.  For each tier in the file, the reader
provides 10 data items and 2 functions.
 
For the full textgrid file: 

  - size
    The number of tiers in the file.

  - xmin
    First marked time of the file.

  - xmax
    Last marked time of the file.

  - t_time
    xmax - xmin.

  - text_type
    The style of TextGrid format:
        - ooTextFile:  Organized by tier.
        - ChronTextFile:  Organized by time.
        - OldooTextFile:  Similar to ooTextFile.

  - to_chron()
    Convert given file to a ChronTextFile format.

  - to_oo()
    Convert given file to an ooTextFile format.

For each tier:

  - text_type
    The style of TextGrid format, as above.

  - classid
    The style of transcription on this tier:
        - IntervalTier:  Transcription is marked as intervals.
        - TextTier:  Transcription is marked as single points.

  - nameid
    The name of the tier.

  - xmin
    First marked time of the tier.

  - xmax
    Last marked time of the tier.

  - size
    Number of entries in the tier.

  - transcript
    The raw transcript for the tier.

  - simple_transcript
    The transcript formatted as a list of tuples: (time1, time2, utterance).

  - tier_info
    List of (classid, nameid, xmin, xmax, size, transcript).

  - min_max()
    A tuple of (xmin, xmax).  

  - time(non_speech_marker)
    Returns the utterance time of a given tier.
    Excludes entries that begin with a non-speech marker.

"""

# needs more cleanup, subclassing, epydoc docstrings

import sys
import re

TEXTTIER = "TextTier"
INTERVALTIER = "IntervalTier"

OOTEXTFILE = re.compile(r"""(?x)
            xmin\ =\ (.*)[\r\n]+
            xmax\ =\ (.*)[\r\n]+
            [\s\S]+?size\ =\ (.*)[\r\n]+ 
""")

CHRONTEXTFILE = re.compile(r"""(?x)
            [\r\n]+(\S+)\ 
            (\S+)\ +!\ Time\ domain.\ *[\r\n]+
            (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+"
""")

OLDOOTEXTFILE = re.compile(r"""(?x)
            [\r\n]+(\S+)
            [\r\n]+(\S+)
            [\r\n]+.+[\r\n]+(\S+)
""")



#################################################################
# TextGrid Class
#################################################################

class TextGrid(object):
    """
    Class to manipulate the TextGrid format used by Praat.
    Separates each tier within this file into its own Tier
    object.  Each TextGrid object has
    a number of tiers (size), xmin, xmax, a text type to help
    with the different styles of TextGrid format, and tiers with their
    own attributes.
    """

    def __init__(self, read_file):
        """
        Takes open read file as input, initializes attributes 
        of the TextGrid file.
        @type read_file: An open TextGrid file, mode "r".
        @param size:  Number of tiers.
        @param xmin: xmin.
        @param xmax: xmax.
        @param t_time:  Total time of TextGrid file.
        @param text_type:  TextGrid format.
        @type tiers:  A list of tier objects.
        """

        self.read_file = read_file
        self.size = 0
        self.xmin = 0
        self.xmax = 0
        self.t_time = 0
        self.text_type = self._check_type()
        self.tiers = self._find_tiers()

    def __iter__(self):
        for tier in self.tiers:
            yield tier

    def next(self):
        if self.idx == (self.size - 1):
            raise StopIteration
        self.idx += 1
        return self.tiers[self.idx]

    @staticmethod
    def load(file):
        """
        @param file: a file in TextGrid format
        """

        return TextGrid(open(file).read())

    def _load_tiers(self, header):
        """
        Iterates over each tier and grabs tier information.
        """ 

        tiers = []
        if self.text_type == "ChronTextFile":
            m = re.compile(header)
            tier_headers = m.findall(self.read_file)
            tier_re = " \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\""
            for i in range(0, self.size):
                tier_info = [tier_headers[i]] + \
                re.findall(str(i + 1) + tier_re, self.read_file)
                tier_info = "\n".join(tier_info)
                tiers.append(Tier(tier_info, self.text_type, self.t_time))
            return tiers

        tier_re = header + "[\s\S]+?(?=" + header + "|$$)"
        m = re.compile(tier_re)
        tier_iter = m.finditer(self.read_file)
        for iterator in tier_iter:
            (begin, end) = iterator.span()
            tier_info = self.read_file[begin:end]
            tiers.append(Tier(tier_info, self.text_type, self.t_time))
        return tiers
    
    def _check_type(self):
        """
        Figures out the TextGrid format.
        """

        m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)
        try:
            type_id = m.group(1).strip()
        except AttributeError:
            raise TypeError("Cannot read file -- try TextGrid.load()")
        xmin = m.group(4)
        if type_id == "File type = \"ooTextFile\"":
            if "xmin" not in xmin:
                text_type = "OldooTextFile"
            else:
                text_type = "ooTextFile"
        elif type_id == "\"Praat chronological TextGrid text file\"":
            text_type = "ChronTextFile"
        else: 
            raise TypeError("Unknown format '(%s)'", (type_id))
        return text_type
        
    def _find_tiers(self):
        """
        Splits the textgrid file into substrings corresponding to tiers. 
        """

        if self.text_type == "ooTextFile":
            m = OOTEXTFILE
            header = " +item \["
        elif self.text_type == "ChronTextFile":
            m = CHRONTEXTFILE
            header = "\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*"
        elif self.text_type == "OldooTextFile":
            m = OLDOOTEXTFILE
            header = "\".*\"[\r\n]+\".*\""

        file_info = m.findall(self.read_file)[0]
        self.xmin = float(file_info[0])
        self.xmax = float(file_info[1])
        self.t_time = self.xmax - self.xmin
        self.size = int(file_info[2])
        tiers = self._load_tiers(header)
        return tiers

    def to_chron(self):
        """ 
        @return:  String in Chronological TextGrid file format.
        """

        chron_file = ""
        chron_file += "\"Praat chronological TextGrid text file\"\n"
        chron_file += str(self.xmin) + " " + str(self.xmax)
        chron_file += "   ! Time domain.\n"
        chron_file += str(self.size) + "   ! Number of tiers.\n"
        for tier in self.tiers:
            idx = (self.tiers.index(tier)) + 1
            tier_header = "\"" + tier.classid + "\" \"" \
                          + tier.nameid + "\" " + str(tier.xmin) \
                          + " " + str(tier.xmax)
            chron_file += tier_header + "\n"
            transcript = tier.simple_transcript
            for (xmin, xmax, utt) in transcript:
                chron_file += str(idx) + " " + str(xmin) 
                chron_file += " " + str(xmax) +"\n"
                chron_file += "\"" + utt + "\"\n"
        return chron_file

    def to_oo(self):
        """ 
        @return:  A string in OoTextGrid file format.
        """
   
        oo_file = ""
        oo_file += "File type = \"ooTextFile\"\n"
        oo_file += "Object class = \"TextGrid\"\n\n"
        oo_file += "xmin = ", self.xmin, "\n"
        oo_file += "xmax = ", self.xmax, "\n"
        oo_file += "tiers? <exists>\n"
        oo_file += "size = ", self.size, "\n"
        oo_file += "item []:\n"
        for i in range(len(self.tiers)):
            oo_file += "%4s%s [%s]" % ("", "item", i + 1)
            _curr_tier = self.tiers[i]
            for (x, y) in _curr_tier.header:
                oo_file += "%8s%s = \"%s\"" % ("", x, y)
            if _curr_tier.classid != TEXTTIER:
                for (xmin, xmax, text) in _curr_tier.simple_transcript:
                    oo_file += "%12s%s = %s" % ("", "xmin", xmin)
                    oo_file += "%12s%s = %s" % ("", "xmax", xmax)
                    oo_file += "%12s%s = \"%s\"" % ("", "text", text)
            else:
                for (time, mark) in _curr_tier.simple_transcript:
                    oo_file += "%12s%s = %s" % ("", "time", time)
                    oo_file += "%12s%s = %s" % ("", "mark", mark)
        return oo_file


#################################################################
# Tier Class
#################################################################

class Tier(object):
    """ 
    A container for each tier.
    """

    def __init__(self, tier, text_type, t_time):
        """
        Initializes attributes of the tier: class, name, xmin, xmax
        size, transcript, total time.  
        Utilizes text_type to guide how to parse the file.
        @type tier: a tier object; single item in the TextGrid list.
        @param text_type:  TextGrid format
        @param t_time:  Total time of TextGrid file.
        @param classid:  Type of tier (point or interval).
        @param nameid:  Name of tier.
        @param xmin:  xmin of the tier.
        @param xmax:  xmax of the tier.
        @param size:  Number of entries in the tier
        @param transcript:  The raw transcript for the tier.
        """

        self.tier = tier
        self.text_type = text_type
        self.t_time = t_time
        self.classid = ""
        self.nameid = ""
        self.xmin = 0
        self.xmax = 0
        self.size = 0
        self.transcript = ""
        self.tier_info = ""
        self._make_info()
        self.simple_transcript = self.make_simple_transcript()
        if self.classid != TEXTTIER:
            self.mark_type = "intervals"
        else:
            self.mark_type = "points"
            self.header = [("class", self.classid), ("name", self.nameid), \
            ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)]

    def __iter__(self):
        return self
  
    def _make_info(self):
        """
        Figures out most attributes of the tier object:
        class, name, xmin, xmax, transcript.
        """

        trans = "([\S\s]*)"
        if self.text_type == "ChronTextFile":
            classid = "\"(.*)\" +"
            nameid = "\"(.*)\" +"
            xmin = "(\d+\.?\d*) +"
            xmax = "(\d+\.?\d*) *[\r\n]+"
            # No size values are given in the Chronological Text File format.
            self.size = None
            size = ""
        elif self.text_type == "ooTextFile":
            classid = " +class = \"(.*)\" *[\r\n]+"
            nameid = " +name = \"(.*)\" *[\r\n]+"
            xmin = " +xmin = (\d+\.?\d*) *[\r\n]+"
            xmax = " +xmax = (\d+\.?\d*) *[\r\n]+"
            size = " +\S+: size = (\d+) *[\r\n]+"
        elif self.text_type == "OldooTextFile":
            classid = "\"(.*)\" *[\r\n]+"
            nameid = "\"(.*)\" *[\r\n]+"
            xmin = "(\d+\.?\d*) *[\r\n]+"
            xmax = "(\d+\.?\d*) *[\r\n]+"
            size = "(\d+) *[\r\n]+"
        m = re.compile(classid + nameid + xmin + xmax + size + trans)
        self.tier_info = m.findall(self.tier)[0]
        self.classid = self.tier_info[0]
        self.nameid = self.tier_info[1]
        self.xmin = float(self.tier_info[2])
        self.xmax = float(self.tier_info[3])
        if self.size != None:
            self.size = int(self.tier_info[4])
        self.transcript = self.tier_info[-1]
            
    def make_simple_transcript(self):
        """ 
        @return:  Transcript of the tier, in form [(start_time end_time label)]
        """

        if self.text_type == "ChronTextFile":
            trans_head = ""
            trans_xmin = " (\S+)"
            trans_xmax = " (\S+)[\r\n]+"
            trans_text = "\"([\S\s]*?)\""
        elif self.text_type == "ooTextFile":
            trans_head = " +\S+ \[\d+\]: *[\r\n]+"
            trans_xmin = " +\S+ = (\S+) *[\r\n]+"
            trans_xmax = " +\S+ = (\S+) *[\r\n]+"
            trans_text = " +\S+ = \"([^\"]*?)\""    
        elif self.text_type == "OldooTextFile":
            trans_head = ""
            trans_xmin = "(.*)[\r\n]+"
            trans_xmax = "(.*)[\r\n]+"
            trans_text = "\"([\S\s]*?)\""
        if self.classid == TEXTTIER:
            trans_xmin = ""
        trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)
        self.simple_transcript = trans_m.findall(self.transcript)
        return self.simple_transcript

    def transcript(self):
        """
        @return:  Transcript of the tier, as it appears in the file.
        """
       
        return self.transcript

    def time(self, non_speech_char="."):
        """
        @return: Utterance time of a given tier.
        Screens out entries that begin with a non-speech marker.        
        """

        total = 0.0
        if self.classid != TEXTTIER:
            for (time1, time2, utt) in self.simple_transcript:
                utt = utt.strip()
                if utt and not utt[0] == ".":
                    total += (float(time2) - float(time1))
        return total
                    
    def tier_name(self):
        """
        @return:  Tier name of a given tier.
        """

        return self.nameid

    def classid(self):
        """
        @return:  Type of transcription on tier.
        """

        return self.classid

    def min_max(self):
        """
        @return:  (xmin, xmax) tuple for a given tier.
        """

        return (self.xmin, self.xmax)

    def __repr__(self):
        return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (self.classid, self.nameid, self.xmin, self.xmax, 100*self.time()/self.t_time)

    def __str__(self):
        return self.__repr__() + "\n  " + "\n  ".join(" ".join(row) for row in self.simple_transcript)

def demo_TextGrid(demo_data):
    print "** Demo of the TextGrid class. **"

    fid = TextGrid(demo_data)
    print "Tiers:", fid.size

    for i, tier in enumerate(fid):
        print "\n***"
        print "Tier:", i + 1
        print tier

def demo():
    # Each demo demonstrates different TextGrid formats.
    print "Format 1"
    demo_TextGrid(demo_data1)
    print "\nFormat 2"
    demo_TextGrid(demo_data2)
    print "\nFormat 3"
    demo_TextGrid(demo_data3)


demo_data1 = """File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0 
xmax = 2045.144149659864
tiers? <exists> 
size = 3 
item []: 
    item [1]:
        class = "IntervalTier" 
        name = "utterances" 
        xmin = 0 
        xmax = 2045.144149659864 
        intervals: size = 5 
        intervals [1]:
            xmin = 0 
            xmax = 2041.4217474125382 
            text = "" 
        intervals [2]:
            xmin = 2041.4217474125382 
            xmax = 2041.968276643991 
            text = "this" 
        intervals [3]:
            xmin = 2041.968276643991 
            xmax = 2042.5281632653062 
            text = "is" 
        intervals [4]:
            xmin = 2042.5281632653062 
            xmax = 2044.0487352585324 
            text = "a" 
        intervals [5]:
            xmin = 2044.0487352585324 
            xmax = 2045.144149659864 
            text = "demo" 
    item [2]:
        class = "TextTier" 
        name = "notes" 
        xmin = 0 
        xmax = 2045.144149659864 
        points: size = 3 
        points [1]:
            time = 2041.4217474125382 
            mark = ".begin_demo"
        points [2]:
            time = 2043.8338291031832
            mark = "voice gets quiet here" 
        points [3]:
            time = 2045.144149659864
            mark = ".end_demo" 
    item [3]:
        class = "IntervalTier" 
        name = "phones" 
        xmin = 0 
        xmax = 2045.144149659864
        intervals: size = 12
        intervals [1]:
            xmin = 0 
            xmax = 2041.4217474125382 
            text = "" 
        intervals [2]:
            xmin = 2041.4217474125382 
            xmax = 2041.5438290324326 
            text = "D"
        intervals [3]:
            xmin = 2041.5438290324326
            xmax = 2041.7321032910372
            text = "I"
        intervals [4]:
            xmin = 2041.7321032910372            
            xmax = 2041.968276643991 
            text = "s" 
        intervals [5]:
            xmin = 2041.968276643991 
            xmax = 2042.232189031843
            text = "I"
        intervals [6]:
            xmin = 2042.232189031843
            xmax = 2042.5281632653062 
            text = "z" 
        intervals [7]:
            xmin = 2042.5281632653062 
            xmax = 2044.0487352585324 
            text = "eI" 
        intervals [8]:
            xmin = 2044.0487352585324 
            xmax = 2044.2487352585324
            text = "dc"
        intervals [9]:
            xmin = 2044.2487352585324
            xmax = 2044.3102321849011
            text = "d"
        intervals [10]:
            xmin = 2044.3102321849011
            xmax = 2044.5748932104329
            text = "E"
        intervals [11]:
            xmin = 2044.5748932104329
            xmax = 2044.8329108578437
            text = "m"
        intervals [12]:
            xmin = 2044.8329108578437
            xmax = 2045.144149659864 
            text = "oU" 
"""

demo_data2 = """File type = "ooTextFile"
Object class = "TextGrid"

0
2.8
<exists>
2
"IntervalTier"
"utterances"
0
2.8
3
0
1.6229213249309031
""
1.6229213249309031
2.341428074708195
"demo"
2.341428074708195
2.8
""
"IntervalTier"
"phones"
0
2.8
6
0
1.6229213249309031
""
1.6229213249309031
1.6428291382019483
"dc"
1.6428291382019483
1.65372183721983721
"d"
1.65372183721983721
1.94372874328943728
"E"
1.94372874328943728
2.13821938291038210
"m"
2.13821938291038210
2.341428074708195
"oU"
2.341428074708195
2.8
""
"""

demo_data3 = """"Praat chronological TextGrid text file"
0 2.8   ! Time domain.
2   ! Number of tiers.
"IntervalTier" "utterances" 0 2.8
"IntervalTier" "utterances" 0 2.8
1 0 1.6229213249309031
""
2 0 1.6229213249309031
""
2 1.6229213249309031 1.6428291382019483
"dc"
2 1.6428291382019483 1.65372183721983721
"d"
2 1.65372183721983721 1.94372874328943728
"E"
2 1.94372874328943728 2.13821938291038210
"m"
2 2.13821938291038210 2.341428074708195
"oU"
1 1.6229213249309031 2.341428074708195
"demo"
1 2.341428074708195 2.8
""
2 2.341428074708195 2.8
""
"""

if __name__ == "__main__":
    demo()