PageRenderTime 39ms CodeModel.GetById 8ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk_contrib/nltk_contrib/misc/marshal.py

http://nltk.googlecode.com/
Python | 206 lines | 81 code | 41 blank | 84 comment | 8 complexity | 53e4ba76d36ff9e89b2fa67e6febdc6e MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # Marshaling code, contributed by Tiago Tresoldi
  2. # This saves/loads models to/from plain text files.
  3. # Unlike Python's shelve and pickle utilities,
  4. # this is useful for inspecting or tweaking the models.
  5. # We may incorporate this as a marshal method in each model.
  6. # TODO: describe each tagger marshal format in the epydocs?
  7. from itertools import islice
  8. import re
  9. from nltk import tag
  10. from nltk.corpus import brown
  11. # marshal-classes
  class MarshalDefault(tag.Default):
      """
      Default tagger whose model can be saved to and loaded from a plain
      text file.

      The file contains nothing but the tag string stored in C{self._tag}.
      """
      _classname = "DefaultTagger"

      def marshal(self, filename):
          """
          Marshals (saves to a plain text file) the tagger model.

          @param filename: Name of the file to which the model is saved
              (will be overwritten if it already exists).
          @type filename: C{string}
          """
          # 'with' guarantees the file is closed even if the write fails;
          # open() replaces file(), which no longer exists on Python 3.
          with open(filename, "w") as handler:
              handler.write(self._tag)

      def unmarshal(self, filename):
          """
          Unmarshals (loads from a plain text file) the tagger model. For
          safety, this operation is intended to be performed only on
          newly created taggers (i.e., without any previous model).

          Trailing line breaks are removed from the value read, so a file
          that was saved by a text editor still yields the bare tag.

          @param filename: Name of the file from which the model will
              be read.
          @type filename: C{string}
          """
          with open(filename, "r") as handler:
              content = handler.read()
          # marshal() writes no line break, but editors usually append one
          # when the file is tweaked by hand; it must not become part of
          # the tag.
          self._tag = content.rstrip("\r\n")
  class MarshalUnigram(tag.Unigram):
      """
      Unigram tagger whose model can be saved to and loaded from a plain
      text file.

      The file holds one C{text:tag} entry per line, taken from the
      C{self._model} mapping.
      """
      _classname = "UnigramTagger"

      def marshal(self, filename):
          """
          Marshals (saves to a plain text file) the tagger model.

          @param filename: Name of the file to which the model is saved
              (will be overwritten if it already exists).
          @type filename: C{string}
          """
          # 'with' guarantees the file is closed even if a write fails.
          with open(filename, "w") as handler:
              # items() works on both Python 2 and 3 (iteritems() does not).
              for text, label in self._model.items():
                  handler.write("%s:%s\n" % (text, label))

      def unmarshal(self, filename):
          """
          Unmarshals (loads from a plain text file) the tagger model. For
          safety, this operation is intended to be performed only on
          newly created taggers (i.e., without any previous model).

          Blank lines are ignored. The model is only modified once the
          whole file has been parsed successfully.

          @param filename: Name of the file from which the model will
              be read.
          @type filename: C{string}
          @raise ValueError: If a non-blank line is not of the form
              C{text:tag}.
          """
          # The first group is greedy, so an entry is split at its LAST
          # colon; this lets the text itself contain ':'.
          pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
          entries = []
          with open(filename, "r") as handler:
              for number, line in enumerate(handler, 1):
                  if not line.strip():
                      # e.g. a trailing empty line left by an editor
                      continue
                  m = pattern.match(line)
                  if m is None:
                      raise ValueError("%s, line %d: expected 'text:tag', "
                                       "got %r" % (filename, number, line))
                  entries.append(m.groups())
          for text, label in entries:
              self._model[text] = label
  class MarshalAffix(tag.Affix):
      """
      Affix tagger whose model can be saved to and loaded from a plain
      text file.

      The file starts with the two header lines C{length <int>} and
      C{minlength <int>}, followed by one C{text:tag} entry per line
      taken from the C{self._model} mapping.
      """
      _classname = "AffixTagger"

      def marshal(self, filename):
          """
          Marshals (saves to a plain text file) the tagger model.

          @param filename: Name of the file to which the model is saved
              (will be overwritten if it already exists).
          @type filename: C{string}
          """
          # 'with' guarantees the file is closed even if a write fails.
          with open(filename, "w") as handler:
              handler.write("length %i\n" % self._length)
              handler.write("minlength %i\n" % self._minlength)
              # items() works on both Python 2 and 3 (iteritems() does not).
              for text, label in self._model.items():
                  handler.write("%s:%s\n" % (text, label))

      def unmarshal(self, filename):
          """
          Unmarshals (loads from a plain text file) the tagger model. For
          safety, this operation is intended to be performed only on
          newly created taggers (i.e., without any previous model).

          Blank lines after the header are ignored. The tagger is only
          modified once the whole file has been parsed successfully.

          @param filename: Name of the file from which the model will
              be read.
          @type filename: C{string}
          @raise ValueError: If either header line is missing or malformed,
              or if a non-blank entry line is not of the form C{text:tag}.
          """
          with open(filename, "r") as handler:
              lines = handler.readlines()

          def header_value(index, key):
              # Header lines look like '<key> <integer>'; a missing line is
              # reported the same way as a malformed one.
              fields = lines[index].split() if index < len(lines) else []
              if len(fields) != 2 or fields[0] != key:
                  raise ValueError("%s, line %d: expected '%s <integer>'"
                                   % (filename, index + 1, key))
              return int(fields[1])

          length = header_value(0, "length")
          minlength = header_value(1, "minlength")

          # The first group is greedy, so an entry is split at its LAST
          # colon; this lets the text itself contain ':'.
          pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
          entries = []
          # Entries start on the third line of the file.
          for number, line in enumerate(lines[2:], 3):
              if not line.strip():
                  continue
              m = pattern.match(line)
              if m is None:
                  raise ValueError("%s, line %d: expected 'text:tag', "
                                   "got %r" % (filename, number, line))
              entries.append(m.groups())

          self._length = length
          self._minlength = minlength
          for text, label in entries:
              self._model[text] = label
  class MarshalNgram(tag.Ngram):
      """
      N-gram tagger whose model can be saved to and loaded from a plain
      text file.

      The file starts with the header line C{n <int>}, followed by one
      entry per line of the form C{[tag1:tag2:...]:text:tag}, where the
      bracketed part is the context. Model keys are C{(context, text)}
      pairs.
      """
      _classname = "NgramTagger"

      def marshal(self, filename):
          """
          Marshals (saves to a plain text file) the tagger model.

          Entries whose context cannot be joined into a string (e.g. a
          context containing C{None}) are skipped silently and are
          therefore missing from the file.

          @param filename: Name of the file to which the model is saved
              (will be overwritten if it already exists).
          @type filename: C{string}
          """
          # 'with' guarantees the file is closed even if a write fails.
          with open(filename, "w") as handler:
              handler.write("n %i\n" % self._n)
              for entry, label in self._model.items():
                  context, text = entry[0], entry[1]
                  # Only the join is guarded, so that an unrelated error
                  # while formatting or writing is not swallowed as well.
                  try:
                      joined = ":".join(context)
                  except TypeError:
                      # None found in 'context', pass silently
                      continue
                  handler.write("[%s]:%s:%s\n" % (joined, text, label))

      def unmarshal(self, filename):
          """
          Unmarshals (loads from a plain text file) the tagger model. For
          safety, this operation is intended to be performed only on
          newly created taggers (i.e., without any previous model).

          Blank lines after the header are ignored. The tagger is only
          modified once the whole file has been parsed successfully.

          @param filename: Name of the file from which the model will
              be read.
          @type filename: C{string}
          @raise ValueError: If the C{n} header is missing or malformed,
              or if a non-blank entry line does not match the expected
              format for that C{n}.
          """
          with open(filename, "r") as handler:
              lines = handler.readlines()

          fields = lines[0].split() if lines else []
          if len(fields) != 2 or fields[0] != "n":
              raise ValueError("%s, line 1: expected 'n <integer>'"
                               % filename)
          n = int(fields[1])

          pattern = re.compile(r'^\[(.+)\]:(.+):(.+?)$', re.UNICODE)
          # As the separator-char ":" can be used as a tag or as a text,
          # 'context_pattern' is built based on the context's size (n),
          # for example:
          # n = 2 -> r'^(.+?)$', like 'tag1'
          # n = 3 -> r'^(.+?):(.+?)$', like 'tag1:tag2'
          # n = 4 -> r'^(.+?):(.+?):(.+?)$', like 'tag1:tag2:tag3'
          context_pattern_str = r'^(.+?)%s$' % (r':(.+?)' * (n - 2))
          context_pattern = re.compile(context_pattern_str, re.UNICODE)

          entries = []
          # Entries start on the second line of the file.
          for number, line in enumerate(lines[1:], 2):
              if not line.strip():
                  continue
              m = pattern.match(line)
              if m is None:
                  raise ValueError("%s, line %d: expected "
                                   "'[context]:text:tag', got %r"
                                   % (filename, number, line))
              context, text, label = m.groups()
              c_m = context_pattern.match(context)
              if c_m is None:
                  raise ValueError("%s, line %d: context %r does not fit "
                                   "n = %d" % (filename, number, context, n))
              entries.append(((c_m.groups(), text), label))

          self._n = n
          for key, label in entries:
              self._model[key] = label
  def demo():
      """
      Loads a trigram tagger model from the file "ngram.test" in the
      current directory and prints it.

      The file must already exist. The two commented-out lines below show
      how it can be produced from the training sentences.
      """
      # load train corpus
      train_sents = brown.tagged('a')[:500]
      # create taggers
      tagger = MarshalNgram(3)
      #tagger.train(train_sents)
      #tagger.marshal("ngram.test")
      tagger.unmarshal("ngram.test")
      # Parenthesised form: prints the same on Python 2 and is valid
      # syntax on Python 3.
      print(tagger._model)