marshal.py - Marshaling code, contributed by Tiago Tresoldi…

/nltk_contrib/misc/marshal.py

https://github.com/nltk/nltk_contrib · Python · 206 lines · 81 code · 41 blank · 84 comment · 8 complexity · 53e4ba76d36ff9e89b2fa67e6febdc6e MD5 · raw file

# Marshaling code, contributed by Tiago Tresoldi
# This saves/loads models to/from plain text files.
# Unlike Python's shelve and pickle utilities,
# this is useful for inspecting or tweaking the models.
# We may incorporate this as a marshal method in each model.

# TODO: describe each tagger marshal format in the epydocs?

from itertools import islice
import re

from nltk import tag
from nltk.corpus import brown

# marshal-classes

class MarshalDefault (tag.Default):
    _classname = "DefaultTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.
       
        @param filename: Name of the file to which save the model (will
                         be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = file(filename, "w")
        handler.write(self._tag)
        handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).

        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        handler = file(filename, "r")
        self._tag = handler.read()
        handler.close()

class MarshalUnigram (tag.Unigram):
    _classname = "UnigramTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.

        @param filename: Name of the file to which save the model (will
                         be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = file(filename, "w")
        
        for text, tag in self._model.iteritems():
            handler.write("%s:%s\n" % (text, tag))

        handler.close()
        
    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).
       
        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        handler = file(filename, "r")
        
        pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
        for line in handler.readlines():
            m = re.match(pattern, line)
            text, tag = m.groups()
            self._model[text] = tag
        
        handler.close()

class MarshalAffix (tag.Affix):
    _classname = "AffixTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.
        
        @param filename: Name of the file to which save the model (will
                         be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = file(filename, "w")
        
        handler.write("length %i\n" % self._length)
        handler.write("minlength %i\n" % self._minlength)
        
        for text, tag in self._model.iteritems():
            handler.write("%s:%s\n" % (text, tag))

        handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).
        
        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        handler = file(filename, "r")
        
        lines = handler.readlines()
        # will fail if "length " and "minlength " are not present
        self._length = int(lines[0].split("length ")[1])
        self._minlength = int(lines[1].split("minlength ")[1])
        
        pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
        for line in lines[2:]:
            m = re.match(pattern, line)
            text, tag = m.groups()
            self._model[text] = tag
        
        handler.close()

class MarshalNgram (tag.Ngram):
    _classname = "NgramTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.
        
        @param filename: Name of the file to which save the model (will
                         be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = file(filename, "w")
        
        handler.write("n %i\n" % self._n)

        for entry in self._model:
            context, text, tag = entry[0], entry[1], self._model[entry]
            
            try:
                entry_str = "[%s]:%s:%s\n" % (":".join(context), text, tag)
                handler.write(entry_str)
            except TypeError:
                # None found in 'context', pass silently
                pass
        
        handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).
        
        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        handler = file(filename, "r")
        
        lines = handler.readlines()
        # will fail if "n " is not present
        self._n = int(lines[0].split("n ")[1])
        
        
        pattern = re.compile(r'^\[(.+)\]:(.+):(.+?)$', re.UNICODE)
        
        # As the separator-char ":" can be used as a tag or as a text,
        # 'context_pattern' is built based on the context's size (self._n),
        # for example:
        #   self._n = 2 -> r'^(.+?)$', like 'tag1'
        #   self._n = 3 -> r'^(.+?):(.+?)$', like 'tag1:tag2'
        #   self._n = 4 -> r'^(.+?):(.+?):(.+?)$', like 'tag1:tag2:tag3'
        context_pattern_str = r'^(.+?)%s$' % ( r':(.+?)' * (self._n-2) )
        context_pattern = re.compile(context_pattern_str, re.UNICODE)
        
        for line in lines[1:]:
            m = re.match(pattern, line)
            context, text, tag = m.groups()
            
            c_m = re.match(context_pattern, context)
            key = (c_m.groups(), text)
            self._model[key] = tag
        
        handler.close()

def demo ():
    # load train corpus
    train_sents = brown.tagged('a')[:500]

    # create taggers
    tagger = MarshalNgram(3)

    #tagger.train(train_sents)
    #tagger.marshal("ngram.test")

    tagger.unmarshal("ngram.test")
    print tagger._model