PageRenderTime 91ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/sem/util.py

https://github.com/BrucePHill/nltk
Python | 317 lines | 249 code | 13 blank | 55 comment | 13 complexity | 5ec4c3380d68bc83fb38f19db1a0f620 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Semantic Interpretation
  2. #
  3. # Author: Ewan Klein <ewan@inf.ed.ac.uk>
  4. #
  5. # Copyright (C) 2001-2013 NLTK Project
  6. # URL: <http://www.nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Utility functions for batch-processing sentences: parsing and
  10. extraction of the semantic representation of the root node of the
  11. syntax tree, followed by evaluation of the semantic representation in
  12. a first-order model.
  13. """
  14. from __future__ import print_function, unicode_literals
  15. import re
  16. import codecs
  17. from . import evaluate
  18. ##############################################################
  19. ## Utility functions for connecting parse output to semantics
  20. ##############################################################
  21. def batch_parse(inputs, grammar, trace=0):
  22. """
  23. Convert input sentences into syntactic trees.
  24. :param inputs: sentences to be parsed
  25. :type inputs: list of str
  26. :param grammar: ``FeatureGrammar`` or name of feature-based grammar
  27. :rtype: dict
  28. :return: a mapping from input sentences to a list of ``Tree``s
  29. """
  30. # put imports here to avoid circult dependencies
  31. from nltk.grammar import FeatureGrammar
  32. from nltk.parse import FeatureChartParser, load_parser
  33. if isinstance(grammar, FeatureGrammar):
  34. cp = FeatureChartParser(grammar)
  35. else:
  36. cp = load_parser(grammar, trace=trace)
  37. parses = []
  38. for sent in inputs:
  39. tokens = sent.split() # use a tokenizer?
  40. syntrees = cp.nbest_parse(tokens)
  41. parses.append(syntrees)
  42. return parses
  43. def root_semrep(syntree, semkey='SEM'):
  44. """
  45. Find the semantic representation at the root of a tree.
  46. :param syntree: a parse ``Tree``
  47. :param semkey: the feature label to use for the root semantics in the tree
  48. :return: the semantic representation at the root of a ``Tree``
  49. :rtype: sem.Expression
  50. """
  51. from nltk.grammar import FeatStructNonterminal
  52. node = syntree.node
  53. assert isinstance(node, FeatStructNonterminal)
  54. try:
  55. return node[semkey]
  56. except KeyError:
  57. print(node, end=' ')
  58. print("has no specification for the feature %s" % semkey)
  59. raise
  60. def batch_interpret(inputs, grammar, semkey='SEM', trace=0):
  61. """
  62. Add the semantic representation to each syntactic parse tree
  63. of each input sentence.
  64. :param inputs: a list of sentences
  65. :param grammar: ``FeatureGrammar`` or name of feature-based grammar
  66. :return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations)
  67. :rtype: dict
  68. """
  69. return [[(syn, root_semrep(syn, semkey)) for syn in syntrees]
  70. for syntrees in batch_parse(inputs, grammar, trace=trace)]
  71. def batch_evaluate(inputs, grammar, model, assignment, trace=0):
  72. """
  73. Add the truth-in-a-model value to each semantic representation
  74. for each syntactic parse of each input sentences.
  75. :param inputs: a list of sentences
  76. :param grammar: ``FeatureGrammar`` or name of feature-based grammar
  77. :return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model)
  78. :rtype: dict
  79. """
  80. return [[(syn, sem, model.evaluate("%s" % sem, assignment, trace=trace))
  81. for (syn, sem) in interpretations]
  82. for interpretations in batch_interpret(inputs, grammar)]
  83. ##########################################
  84. # REs used by the parse_valuation function
  85. ##########################################
  86. _VAL_SPLIT_RE = re.compile(r'\s*=+>\s*')
  87. _ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*')
  88. _TUPLES_RE = re.compile(r"""\s*
  89. (\([^)]+\)) # tuple-expression
  90. \s*""", re.VERBOSE)
  91. def parse_valuation_line(s, encoding=None):
  92. """
  93. Parse a line in a valuation file.
  94. Lines are expected to be of the form::
  95. noosa => n
  96. girl => {g1, g2}
  97. chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)}
  98. :param s: input line
  99. :type s: str
  100. :param encoding: the encoding of the input string, if it is binary
  101. :type encoding: str
  102. :return: a pair (symbol, value)
  103. :rtype: tuple
  104. """
  105. if encoding is not None:
  106. s = s.decode(encoding)
  107. pieces = _VAL_SPLIT_RE.split(s)
  108. symbol = pieces[0]
  109. value = pieces[1]
  110. # check whether the value is meant to be a set
  111. if value.startswith('{'):
  112. value = value[1:-1]
  113. tuple_strings = _TUPLES_RE.findall(value)
  114. # are the set elements tuples?
  115. if tuple_strings:
  116. set_elements = []
  117. for ts in tuple_strings:
  118. ts = ts[1:-1]
  119. element = tuple(_ELEMENT_SPLIT_RE.split(ts))
  120. set_elements.append(element)
  121. else:
  122. set_elements = _ELEMENT_SPLIT_RE.split(value)
  123. value = set(set_elements)
  124. return symbol, value
  125. def parse_valuation(s, encoding=None):
  126. """
  127. Convert a valuation file into a valuation.
  128. :param s: the contents of a valuation file
  129. :type s: str
  130. :param encoding: the encoding of the input string, if it is binary
  131. :type encoding: str
  132. :return: a ``nltk.sem`` valuation
  133. :rtype: Valuation
  134. """
  135. if encoding is not None:
  136. s = s.decode(encoding)
  137. statements = []
  138. for linenum, line in enumerate(s.splitlines()):
  139. line = line.strip()
  140. if line.startswith('#') or line=='': continue
  141. try: statements.append(parse_valuation_line(line))
  142. except ValueError:
  143. raise ValueError('Unable to parse line %s: %s' % (linenum, line))
  144. val = evaluate.Valuation(statements)
  145. return val
  146. def demo_model0():
  147. global m0, g0
  148. #Initialize a valuation of non-logical constants."""
  149. v = [('john', 'b1'),
  150. ('mary', 'g1'),
  151. ('suzie', 'g2'),
  152. ('fido', 'd1'),
  153. ('tess', 'd2'),
  154. ('noosa', 'n'),
  155. ('girl', set(['g1', 'g2'])),
  156. ('boy', set(['b1', 'b2'])),
  157. ('dog', set(['d1', 'd2'])),
  158. ('bark', set(['d1', 'd2'])),
  159. ('walk', set(['b1', 'g2', 'd1'])),
  160. ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
  161. ('see', set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'),('d2', 'b1'), ('g2', 'n')])),
  162. ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])),
  163. ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')]))
  164. ]
  165. #Read in the data from ``v``
  166. val = evaluate.Valuation(v)
  167. #Bind ``dom`` to the ``domain`` property of ``val``
  168. dom = val.domain
  169. #Initialize a model with parameters ``dom`` and ``val``.
  170. m0 = evaluate.Model(dom, val)
  171. #Initialize a variable assignment with parameter ``dom``
  172. g0 = evaluate.Assignment(dom)
  173. def read_sents(filename, encoding='utf8'):
  174. with codecs.open(filename, 'r', encoding) as fp:
  175. sents = [l.rstrip() for l in fp]
  176. # get rid of blank lines
  177. sents = [l for l in sents if len(l) > 0]
  178. sents = [l for l in sents if not l[0] == '#']
  179. return sents
  180. def demo_legacy_grammar():
  181. """
  182. Check that batch_interpret() is compatible with legacy grammars that use
  183. a lowercase 'sem' feature.
  184. Define 'test.fcfg' to be the following
  185. """
  186. from nltk.grammar import parse_fcfg
  187. g = parse_fcfg("""
  188. % start S
  189. S[sem=<hello>] -> 'hello'
  190. """)
  191. print("Reading grammar: %s" % g)
  192. print("*" * 20)
  193. for reading in batch_interpret(['hello'], g, semkey='sem'):
  194. syn, sem = reading[0]
  195. print()
  196. print("output: ", sem)
  197. def demo():
  198. import sys
  199. from optparse import OptionParser
  200. description = \
  201. """
  202. Parse and evaluate some sentences.
  203. """
  204. opts = OptionParser(description=description)
  205. opts.set_defaults(evaluate=True, beta=True, syntrace=0,
  206. semtrace=0, demo='default', grammar='', sentences='')
  207. opts.add_option("-d", "--demo", dest="demo",
  208. help="choose demo D; omit this for the default demo, or specify 'chat80'", metavar="D")
  209. opts.add_option("-g", "--gram", dest="grammar",
  210. help="read in grammar G", metavar="G")
  211. opts.add_option("-m", "--model", dest="model",
  212. help="import model M (omit '.py' suffix)", metavar="M")
  213. opts.add_option("-s", "--sentences", dest="sentences",
  214. help="read in a file of test sentences S", metavar="S")
  215. opts.add_option("-e", "--no-eval", action="store_false", dest="evaluate",
  216. help="just do a syntactic analysis")
  217. opts.add_option("-b", "--no-beta-reduction", action="store_false",
  218. dest="beta", help="don't carry out beta-reduction")
  219. opts.add_option("-t", "--syntrace", action="count", dest="syntrace",
  220. help="set syntactic tracing on; requires '-e' option")
  221. opts.add_option("-T", "--semtrace", action="count", dest="semtrace",
  222. help="set semantic tracing on")
  223. (options, args) = opts.parse_args()
  224. SPACER = '-' * 30
  225. demo_model0()
  226. sents = [
  227. 'Fido sees a boy with Mary',
  228. 'John sees Mary',
  229. 'every girl chases a dog',
  230. 'every boy chases a girl',
  231. 'John walks with a girl in Noosa',
  232. 'who walks']
  233. gramfile = 'grammars/sample_grammars/sem2.fcfg'
  234. if options.sentences:
  235. sentsfile = options.sentences
  236. if options.grammar:
  237. gramfile = options.grammar
  238. if options.model:
  239. exec("import %s as model" % options.model)
  240. if sents is None:
  241. sents = read_sents(sentsfile)
  242. # Set model and assignment
  243. model = m0
  244. g = g0
  245. if options.evaluate:
  246. evaluations = \
  247. batch_evaluate(sents, gramfile, model, g, trace=options.semtrace)
  248. else:
  249. semreps = \
  250. batch_interpret(sents, gramfile, trace=options.syntrace)
  251. for i, sent in enumerate(sents):
  252. n = 1
  253. print('\nSentence: %s' % sent)
  254. print(SPACER)
  255. if options.evaluate:
  256. for (syntree, semrep, value) in evaluations[i]:
  257. if isinstance(value, dict):
  258. value = set(value.keys())
  259. print('%d: %s' % (n, semrep))
  260. print(value)
  261. n += 1
  262. else:
  263. for (syntree, semrep) in semreps[i]:
  264. print('%d: %s' % (n, semrep))
  265. n += 1
# Script entry point: runs the legacy-grammar compatibility demo.
# The full command-line demo() is currently disabled.
if __name__ == "__main__":
    #demo()
    demo_legacy_grammar()