# Source: /nltk/nltk/sem/util.py
# From: http://nltk.googlecode.com/
  1. # Natural Language Toolkit: Semantic Interpretation
  2. #
  3. # Author: Ewan Klein <ewan@inf.ed.ac.uk>
  4. #
  5. # Copyright (C) 2001-2011 NLTK Project
  6. # URL: <http://www.nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Utility functions for batch-processing sentences: parsing and
extraction of the semantic representation of the root node of the
  11. syntax tree, followed by evaluation of the semantic representation in
  12. a first-order model.
  13. """
  14. import evaluate
  15. import re
  16. import nltk
  17. from nltk.sem.logic import *
  18. ##############################################################
  19. ## Utility functions for connecting parse output to semantics
  20. ##############################################################
  21. def batch_parse(inputs, grammar, trace=0):
  22. """
  23. Convert input sentences into syntactic trees.
  24. @parameter inputs: sentences to be parsed
  25. @type inputs: C{list} of C{str}
  26. @parameter grammar: L{FeatureGrammar} or name of feature-based grammar
  27. @rtype: C{dict}
  28. @return: a mapping from input sentences to a list of L{Tree}s
  29. """
  30. if isinstance(grammar, nltk.grammar.FeatureGrammar):
  31. cp = nltk.parse.FeatureChartParser(grammar)
  32. else:
  33. cp = nltk.parse.load_parser(grammar, trace=trace)
  34. parses = []
  35. for sent in inputs:
  36. tokens = sent.split() # use a tokenizer?
  37. syntrees = cp.nbest_parse(tokens)
  38. parses.append(syntrees)
  39. return parses
  40. def root_semrep(syntree, semkey='SEM'):
  41. """
  42. Find the semantic representation at the root of a tree.
  43. @parameter syntree: a parse L{Tree}
  44. @parameter semkey: the feature label to use for the root semantics in the tree
  45. @return: the semantic representation at the root of a L{Tree}
  46. @rtype: L{logic.Expression}
  47. """
  48. node = syntree.node
  49. assert isinstance(node, nltk.grammar.FeatStructNonterminal)
  50. try:
  51. return node[semkey]
  52. except KeyError:
  53. print node,
  54. print "has no specification for the feature %s" % semkey
  55. raise
  56. def batch_interpret(inputs, grammar, semkey='SEM', trace=0):
  57. """
  58. Add the semantic representation to each syntactic parse tree
  59. of each input sentence.
  60. @parameter inputs: a list of sentences
  61. @parameter grammar: L{FeatureGrammar} or name of feature-based grammar
  62. @return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations)
  63. @rtype: C{dict}
  64. """
  65. return [[(syn, root_semrep(syn, semkey)) for syn in syntrees]
  66. for syntrees in batch_parse(inputs, grammar, trace=trace)]
  67. def batch_evaluate(inputs, grammar, model, assignment, trace=0):
  68. """
  69. Add the truth-in-a-model value to each semantic representation
  70. for each syntactic parse of each input sentences.
  71. @parameter inputs: a list of sentences
  72. @parameter grammar: L{FeatureGrammar} or name of feature-based grammar
  73. @return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model)
  74. @rtype: C{dict}
  75. """
  76. return [[(syn, sem, model.evaluate(str(sem), assignment, trace=trace))
  77. for (syn, sem) in interpretations]
  78. for interpretations in batch_interpret(inputs, grammar)]
##########################################
# REs used by the parse_valuation function
##########################################
# Splits "symbol => value" on the arrow (tolerates '==>', '===>', ...).
_VAL_SPLIT_RE = re.compile(r'\s*=+>\s*')
# Splits comma-separated elements of a set or tuple.
_ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*')
# Matches one parenthesized tuple, e.g. "(b1, g1)".
_TUPLES_RE = re.compile(r"""\s*
(\([^)]+\)) # tuple-expression
\s*""", re.VERBOSE)
  87. def parse_valuation_line(s):
  88. """
  89. Parse a line in a valuation file.
  90. Lines are expected to be of the form::
  91. noosa => n
  92. girl => {g1, g2}
  93. chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)}
  94. @parameter s: input line
  95. @type s: C{str}
  96. @return: a pair (symbol, value)
  97. @rtype: C{tuple}
  98. """
  99. pieces = _VAL_SPLIT_RE.split(s)
  100. symbol = pieces[0]
  101. value = pieces[1]
  102. # check whether the value is meant to be a set
  103. if value.startswith('{'):
  104. value = value[1:-1]
  105. tuple_strings = _TUPLES_RE.findall(value)
  106. # are the set elements tuples?
  107. if tuple_strings:
  108. set_elements = []
  109. for ts in tuple_strings:
  110. ts = ts[1:-1]
  111. element = tuple(_ELEMENT_SPLIT_RE.split(ts))
  112. set_elements.append(element)
  113. else:
  114. set_elements = _ELEMENT_SPLIT_RE.split(value)
  115. value = set(set_elements)
  116. return symbol, value
  117. def parse_valuation(s):
  118. """
  119. Convert a valuation file into a valuation.
  120. @parameter s: the contents of a valuation file
  121. @type s: C{str}
  122. @return: a L{nltk.sem} valuation
  123. @rtype: L{Valuation}
  124. """
  125. statements = []
  126. for linenum, line in enumerate(s.splitlines()):
  127. line = line.strip()
  128. if line.startswith('#') or line=='': continue
  129. try: statements.append(parse_valuation_line(line))
  130. except ValueError:
  131. raise ValueError, 'Unable to parse line %s: %s' % (linenum, line)
  132. val = evaluate.Valuation(statements)
  133. return val
  134. def parse_logic(s, logic_parser=None):
  135. """
  136. Convert a file of First Order Formulas into a list of {Expression}s.
  137. @param s: the contents of the file
  138. @type s: C{str}
  139. @param logic_parser: The parser to be used to parse the logical expression
  140. @type logic_parser: C{LogicParser}
  141. @return: a list of parsed formulas.
  142. @rtype: C{list} of L{Expression}
  143. """
  144. if logic_parser is None:
  145. logic_parser = LogicParser()
  146. statements = []
  147. for linenum, line in enumerate(s.splitlines()):
  148. line = line.strip()
  149. if line.startswith('#') or line=='': continue
  150. try:
  151. statements.append(logic_parser.parse(line))
  152. except ParseException:
  153. raise ValueError, 'Unable to parse line %s: %s' % (linenum, line)
  154. return statements
def skolemize(expression, univ_scope=None, used_variables=None):
    """
    Skolemize the expression and convert to conjunctive normal form (CNF)

    @param expression: the L{Expression} to rewrite
    @param univ_scope: variables bound by universal quantifiers enclosing
        the current subexpression; existentials under them become Skolem
        functions of these variables
    @param used_variables: variables already in use, so that fresh
        variables can be chosen to avoid them
    """
    if univ_scope is None:
        univ_scope = set()
    if used_variables is None:
        used_variables = set()
    if isinstance(expression, AllExpression):
        # Universal: skolemize the body with this variable in scope, then
        # rename the bound variable to a fresh one.
        term = skolemize(expression.term, univ_scope|set([expression.variable]), used_variables|set([expression.variable]))
        return term.replace(expression.variable, VariableExpression(unique_variable(ignore=used_variables)))
    elif isinstance(expression, AndExpression):
        return skolemize(expression.first, univ_scope, used_variables) &\
               skolemize(expression.second, univ_scope, used_variables)
    elif isinstance(expression, OrExpression):
        # Distribute the disjunction over any conjunctions (CNF).
        return to_cnf(skolemize(expression.first, univ_scope, used_variables),
                      skolemize(expression.second, univ_scope, used_variables))
    elif isinstance(expression, ImpExpression):
        # (P -> Q)  ==  (-P | Q)
        return to_cnf(skolemize(-expression.first, univ_scope, used_variables),
                      skolemize(expression.second, univ_scope, used_variables))
    elif isinstance(expression, IffExpression):
        # (P <-> Q)  ==  (-P | Q) & (P | -Q)
        return to_cnf(skolemize(-expression.first, univ_scope, used_variables),
                      skolemize(expression.second, univ_scope, used_variables)) &\
               to_cnf(skolemize(expression.first, univ_scope, used_variables),
                      skolemize(-expression.second, univ_scope, used_variables))
    elif isinstance(expression, EqualityExpression):
        # Equalities are atomic: leave unchanged.
        return expression
    elif isinstance(expression, NegatedExpression):
        # Push the negation inward over the negated subexpression.
        negated = expression.term
        if isinstance(negated, AllExpression):
            # -all x.P  ==  exists x.-P : eliminate the existential.
            term = skolemize(-negated.term, univ_scope, used_variables|set([negated.variable]))
            if univ_scope:
                # Under universals: replace with a Skolem function of them.
                return term.replace(negated.variable, skolem_function(univ_scope))
            else:
                # No enclosing universals: a fresh Skolem constant suffices.
                skolem_constant = VariableExpression(unique_variable(ignore=used_variables))
                return term.replace(negated.variable, skolem_constant)
        elif isinstance(negated, AndExpression):
            # -(P & Q)  ==  (-P | -Q), distributed into CNF.
            return to_cnf(skolemize(-negated.first, univ_scope, used_variables),
                          skolemize(-negated.second, univ_scope, used_variables))
        elif isinstance(negated, OrExpression):
            # -(P | Q)  ==  (-P & -Q)
            return skolemize(-negated.first, univ_scope, used_variables) &\
                   skolemize(-negated.second, univ_scope, used_variables)
        elif isinstance(negated, ImpExpression):
            # -(P -> Q)  ==  (P & -Q)
            return skolemize(negated.first, univ_scope, used_variables) &\
                   skolemize(-negated.second, univ_scope, used_variables)
        elif isinstance(negated, IffExpression):
            # -(P <-> Q)  ==  (-P | -Q) & (P | Q)
            return to_cnf(skolemize(-negated.first, univ_scope, used_variables),
                          skolemize(-negated.second, univ_scope, used_variables)) &\
                   to_cnf(skolemize(negated.first, univ_scope, used_variables),
                          skolemize(negated.second, univ_scope, used_variables))
        elif isinstance(negated, EqualityExpression):
            # Negated equality is atomic: leave unchanged.
            return expression
        elif isinstance(negated, NegatedExpression):
            # Double negation: drop both.
            return skolemize(negated.term, univ_scope, used_variables)
        elif isinstance(negated, ExistsExpression):
            # -exists x.P  ==  all x.-P : treat like a universal.
            term = skolemize(-negated.term, univ_scope|set([negated.variable]), used_variables|set([negated.variable]))
            return term.replace(negated.variable, VariableExpression(unique_variable(ignore=used_variables)))
        elif isinstance(negated, ApplicationExpression):
            # Negated atom: leave unchanged.
            return expression
        else:
            raise Exception('\'%s\' cannot be skolemized' % expression)
    elif isinstance(expression, ExistsExpression):
        # Existential: replace the variable by a Skolem function of the
        # enclosing universals (or a Skolem constant if there are none).
        term = skolemize(expression.term, univ_scope, used_variables|set([expression.variable]))
        if univ_scope:
            return term.replace(expression.variable, skolem_function(univ_scope))
        else:
            skolem_constant = VariableExpression(unique_variable(ignore=used_variables))
            return term.replace(expression.variable, skolem_constant)
    elif isinstance(expression, ApplicationExpression):
        # Atomic formula: leave unchanged.
        return expression
    else:
        raise Exception('\'%s\' cannot be skolemized' % expression)
  227. def to_cnf(first, second):
  228. """
  229. Convert this split disjunction to conjunctive normal form (CNF)
  230. """
  231. if isinstance(first, AndExpression):
  232. r_first = to_cnf(first.first, second)
  233. r_second = to_cnf(first.second, second)
  234. return r_first & r_second
  235. elif isinstance(second, AndExpression):
  236. r_first = to_cnf(first, second.first)
  237. r_second = to_cnf(first, second.second)
  238. return r_first & r_second
  239. else:
  240. return first | second
  241. def demo_model0():
  242. global m0, g0
  243. #Initialize a valuation of non-logical constants."""
  244. v = [('john', 'b1'),
  245. ('mary', 'g1'),
  246. ('suzie', 'g2'),
  247. ('fido', 'd1'),
  248. ('tess', 'd2'),
  249. ('noosa', 'n'),
  250. ('girl', set(['g1', 'g2'])),
  251. ('boy', set(['b1', 'b2'])),
  252. ('dog', set(['d1', 'd2'])),
  253. ('bark', set(['d1', 'd2'])),
  254. ('walk', set(['b1', 'g2', 'd1'])),
  255. ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
  256. ('see', set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'),('d2', 'b1'), ('g2', 'n')])),
  257. ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])),
  258. ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')]))
  259. ]
  260. #Read in the data from C{v}
  261. val = evaluate.Valuation(v)
  262. #Bind C{dom} to the C{domain} property of C{val}
  263. dom = val.domain
  264. #Initialize a model with parameters C{dom} and C{val}.
  265. m0 = evaluate.Model(dom, val)
  266. #Initialize a variable assignment with parameter C{dom}
  267. g0 = evaluate.Assignment(dom)
  268. def read_sents(file):
  269. sents = [l.rstrip() for l in open(file)]
  270. # get rid of blank lines
  271. sents = [l for l in sents if len(l) > 0]
  272. sents = [l for l in sents if not l[0] == '#']
  273. return sents
  274. def demo_legacy_grammar():
  275. """
  276. Check that batch_interpret() is compatible with legacy grammars that use
  277. a lowercase 'sem' feature.
  278. Define 'test.fcfg' to be the following
  279. """
  280. g = nltk.parse_fcfg("""
  281. % start S
  282. S[sem=<hello>] -> 'hello'
  283. """)
  284. print "Reading grammar: %s" % g
  285. print "*" * 20
  286. for reading in batch_interpret(['hello'], g, semkey='sem'):
  287. syn, sem = reading[0]
  288. print
  289. print "output: ", sem
def demo():
    """
    Command-line demo: parse a batch of sentences with a feature grammar
    and (optionally) evaluate them in the demo model.
    """
    import sys
    from optparse import OptionParser
    description = \
"""
Parse and evaluate some sentences.
"""
    opts = OptionParser(description=description)
    # Defaults: evaluate in the model, beta-reduce, no tracing.
    opts.set_defaults(evaluate=True, beta=True, syntrace=0,
                      semtrace=0, demo='default', grammar='', sentences='')
    opts.add_option("-d", "--demo", dest="demo",
                    help="choose demo D; omit this for the default demo, or specify 'chat80'", metavar="D")
    opts.add_option("-g", "--gram", dest="grammar",
                    help="read in grammar G", metavar="G")
    opts.add_option("-m", "--model", dest="model",
                    help="import model M (omit '.py' suffix)", metavar="M")
    opts.add_option("-s", "--sentences", dest="sentences",
                    help="read in a file of test sentences S", metavar="S")
    opts.add_option("-e", "--no-eval", action="store_false", dest="evaluate",
                    help="just do a syntactic analysis")
    opts.add_option("-b", "--no-beta-reduction", action="store_false",
                    dest="beta", help="don't carry out beta-reduction")
    opts.add_option("-t", "--syntrace", action="count", dest="syntrace",
                    help="set syntactic tracing on; requires '-e' option")
    opts.add_option("-T", "--semtrace", action="count", dest="semtrace",
                    help="set semantic tracing on")
    (options, args) = opts.parse_args()
    SPACER = '-' * 30
    # Build the default model m0 and assignment g0 (module globals).
    demo_model0()
    # Built-in test sentences, used unless a sentence file is supplied.
    sents = [
        'Fido sees a boy with Mary',
        'John sees Mary',
        'every girl chases a dog',
        'every boy chases a girl',
        'John walks with a girl in Noosa',
        'who walks']
    gramfile = 'grammars/sample_grammars/sem2.fcfg'
    if options.sentences:
        sentsfile = options.sentences
    if options.grammar:
        gramfile = options.grammar
    if options.model:
        exec "import %s as model" % options.model
    # NOTE(review): sents is always bound to the list above, so this
    # branch can never fire and the '-s' sentences file is silently
    # ignored; presumably this should test options.sentences -- confirm.
    if sents is None:
        sents = read_sents(sentsfile)
    # Set model and assignment
    # NOTE(review): this rebinding discards any model imported via '-m'
    # just above -- confirm whether that is intended.
    model = m0
    g = g0
    if options.evaluate:
        evaluations = \
            batch_evaluate(sents, gramfile, model, g, trace=options.semtrace)
    else:
        semreps = \
            batch_interpret(sents, gramfile, trace=options.syntrace)
    # Print the numbered readings (and truth values) per sentence.
    for i, sent in enumerate(sents):
        n = 1
        print '\nSentence: %s' % sent
        print SPACER
        if options.evaluate:
            for (syntree, semrep, value) in evaluations[i]:
                if isinstance(value, dict):
                    # Satisfier dicts are displayed as plain sets.
                    value = set(value.keys())
                print '%d: %s' % (n, semrep)
                print value
                n += 1
        else:
            for (syntree, semrep) in semreps[i]:
                print '%d: %s' % (n, semrep)
                n += 1
if __name__ == "__main__":
    # demo() is the full command-line driver; by default only run the
    # legacy lowercase-'sem' grammar check.
    #demo()
    demo_legacy_grammar()