PageRenderTime 42ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/corpus/reader/propbank.py

https://github.com/BrucePHill/nltk
Python | 481 lines | 467 code | 4 blank | 10 comment | 0 complexity | 36dd9ec72ab8a9cf32fcc3d1140dc2ac MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: PropBank Corpus Reader
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from __future__ import unicode_literals
  8. import re
  9. from xml.etree import ElementTree
  10. from nltk import compat
  11. from nltk.tree import Tree
  12. from nltk.internals import raise_unorderable_types
  13. from nltk.compat import total_ordering
  14. from .util import *
  15. from .api import *
  class PropbankCorpusReader(CorpusReader):
      """
      Corpus reader for the propbank corpus, which augments the Penn
      Treebank with information about the predicate argument structure
      of every verb instance.  The corpus consists of two parts: the
      predicate-argument annotations themselves, and a set of "frameset
      files" which define the argument labels used by the annotations,
      on a per-verb basis.  Each "frameset file" contains one or more
      predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
      divided into coarse-grained word senses called "rolesets".  For
      each "roleset", the frameset file provides descriptions of the
      argument roles, along with examples.
      """

      def __init__(self, root, propfile, framefiles='',
                   verbsfile=None, parse_fileid_xform=None,
                   parse_corpus=None, encoding='utf8'):
          """
          :param root: The root directory for this corpus.
          :param propfile: The name of the file containing the predicate-
              argument annotations (relative to ``root``).
          :param framefiles: A list or regexp specifying the frameset
              fileids for this corpus.
          :param verbsfile: The name of the file listing the verb lemmas
              (read by ``verbs()``), or None if there is no such file.
          :param parse_fileid_xform: A transform that should be applied
              to the fileids in this corpus.  This should be a function
              of one argument (a fileid) that returns a string (the new
              fileid).
          :param parse_corpus: The corpus containing the parse trees
              corresponding to this corpus.  These parse trees are
              necessary to resolve the tree pointers used by propbank.
          :param encoding: The encoding passed on to ``CorpusReader``.
          """
          # If framefiles is specified as a regexp, expand it.
          if isinstance(framefiles, compat.string_types):
              framefiles = find_corpus_fileids(root, framefiles)
          framefiles = list(framefiles)

          # Initialize the corpus reader.  ``verbsfile`` is optional, so it
          # is only registered as a fileid when one was actually given;
          # otherwise a ``None`` entry would end up in the fileid list.
          fileids = [propfile]
          if verbsfile is not None:
              fileids.append(verbsfile)
          CorpusReader.__init__(self, root, fileids + framefiles, encoding)

          # Record our frame fileids & prop file.
          self._propfile = propfile
          self._framefiles = framefiles
          self._verbsfile = verbsfile
          self._parse_fileid_xform = parse_fileid_xform
          self._parse_corpus = parse_corpus

      def raw(self, fileids=None):
          """
          :param fileids: A fileid, a list of fileids, or None for every
              fileid known to this reader.
          :return: the text contents of the given fileids, as a single string.
          """
          if fileids is None:
              fileids = self._fileids
          elif isinstance(fileids, compat.string_types):
              fileids = [fileids]
          contents = []
          for fileid in fileids:
              stream = self.open(fileid)
              # Close each stream as soon as it has been read, rather than
              # leaving that to the garbage collector.
              try:
                  contents.append(stream.read())
              finally:
                  stream.close()
          return concat(contents)

      def instances(self, baseform=None):
          """
          :param baseform: If given, only instances whose ``baseform``
              equals this value are included.
          :return: a corpus view that acts as a list of
              ``PropbankInstance`` objects, one for each annotation line
              of the predicate-argument file (i.e. each annotated verb
              instance).
          """
          kwargs = {}
          if baseform is not None:
              kwargs['instance_filter'] = lambda inst: inst.baseform==baseform
          return StreamBackedCorpusView(self.abspath(self._propfile),
                                        lambda stream: self._read_instance_block(stream, **kwargs),
                                        encoding=self.encoding(self._propfile))

      def lines(self):
          """
          :return: a corpus view that acts as a list of strings, one for
              each line in the predicate-argument annotation file.
          """
          return StreamBackedCorpusView(self.abspath(self._propfile),
                                        read_line_block,
                                        encoding=self.encoding(self._propfile))

      def roleset(self, roleset_id):
          """
          :param roleset_id: A roleset identifier; the part before the
              first ``'.'`` names the frameset file to look in.
          :return: the xml description for the given roleset.
          :raise ValueError: if the frameset file is not part of this
              corpus, or does not contain the requested roleset.
          """
          baseform = roleset_id.split('.')[0]
          framefile = 'frames/%s.xml' % baseform
          if framefile not in self._framefiles:
              raise ValueError('Frameset file for %s not found' %
                               roleset_id)

          etree = self._parse_framefile(framefile)
          for roleset in etree.findall('predicate/roleset'):
              if roleset.attrib['id'] == roleset_id:
                  return roleset
          raise ValueError('Roleset %s not found in %s' %
                           (roleset_id, framefile))

      def rolesets(self, baseform=None):
          """
          :param baseform: If given, only the frameset file for this
              baseform is consulted; otherwise all frameset files are.
          :return: list of xml descriptions for rolesets.
          :raise ValueError: if ``baseform`` is given but its frameset
              file is not part of this corpus.
          """
          if baseform is not None:
              framefile = 'frames/%s.xml' % baseform
              if framefile not in self._framefiles:
                  raise ValueError('Frameset file for %s not found' %
                                   baseform)
              framefiles = [framefile]
          else:
              framefiles = self._framefiles

          rsets = []
          for framefile in framefiles:
              etree = self._parse_framefile(framefile)
              rsets.append(etree.findall('predicate/roleset'))
          return LazyConcatenation(rsets)

      def verbs(self):
          """
          :return: a corpus view that acts as a list of all verb lemmas
              in this corpus (from the verbs.txt file).
          """
          return StreamBackedCorpusView(self.abspath(self._verbsfile),
                                        read_line_block,
                                        encoding=self.encoding(self._verbsfile))

      def _parse_framefile(self, framefile):
          """
          Parse the given frameset file and return the root element of
          its XML tree.  The stream opened for parsing is always closed
          before returning.
          """
          # n.b.: The encoding for XML fileids is specified by the file
          # itself; so we ignore self._encoding here.
          stream = self.abspath(framefile).open()
          try:
              return ElementTree.parse(stream).getroot()
          finally:
              stream.close()

      def _read_instance_block(self, stream, instance_filter=lambda inst: True):
          """
          Read up to 100 lines from ``stream`` and return the list of
          ``PropbankInstance`` objects parsed from the non-blank ones
          that are accepted by ``instance_filter``.
          """
          block = []

          # Read 100 at a time.
          for i in range(100):
              line = stream.readline().strip()
              if line:
                  inst = PropbankInstance.parse(
                      line, self._parse_fileid_xform,
                      self._parse_corpus)
                  if instance_filter(inst):
                      block.append(inst)

          return block
  142. ######################################################################
  143. #{ Propbank Instance & related datatypes
  144. ######################################################################
  @compat.python_2_unicode_compatible
  class PropbankInstance(object):
      """
      A single predicate-argument annotation: the location of one
      predicate in a parsed sentence, the roleset and inflection it was
      tagged with, and the locations and labels of its arguments.
      Instances are normally created from a line of the annotation file
      with ``PropbankInstance.parse()``.
      """

      def __init__(self, fileid, sentnum, wordnum, tagger, roleset,
                   inflection, predicate, arguments, parse_corpus=None):

          self.fileid = fileid
          """The name of the file containing the parse tree for this
          instance's sentence."""

          self.sentnum = sentnum
          """The sentence number of this sentence within ``fileid``.
          Indexing starts from zero."""

          self.wordnum = wordnum
          """The word number of this instance's predicate within its
          containing sentence.  Word numbers are indexed starting from
          zero, and include traces and other empty parse elements."""

          self.tagger = tagger
          """An identifier for the tagger who tagged this instance; or
          ``'gold'`` if this is an adjudicated instance."""

          self.roleset = roleset
          """The name of the roleset used by this instance's predicate.
          Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
          look up information about the roleset."""

          self.inflection = inflection
          """A ``PropbankInflection`` object describing the inflection of
          this instance's predicate."""

          self.predicate = predicate
          """A ``PropbankTreePointer`` indicating the position of this
          instance's predicate within its containing sentence."""

          self.arguments = tuple(arguments)
          """A tuple of pairs (argloc, argid), specifying the location
          and identifier for each of the predicate's argument in the
          containing sentence.  Argument identifiers are strings such as
          ``'ARG0'`` or ``'ARGM-TMP'``.  This tuple does *not* contain
          the predicate."""

          self.parse_corpus = parse_corpus
          """A corpus reader for the parse trees corresponding to the
          instances in this propbank corpus."""

      @property
      def baseform(self):
          """The baseform of the predicate."""
          return self.roleset.split('.')[0]

      @property
      def sensenumber(self):
          """The sense number of the predicate."""
          return self.roleset.split('.')[1]

      @property
      def predid(self):
          """Identifier of the predicate."""
          return 'rel'

      def __repr__(self):
          return ('<PropbankInstance: %s, sent %s, word %s>' %
                  (self.fileid, self.sentnum, self.wordnum))

      @staticmethod
      def _sort_key(item):
          """
          Return the key used to order an ``(argloc, argid)`` pair when
          an instance is printed.  Chain and split pointers are ordered
          by their first simple pointer, and simple pointers by
          ``(wordnum, -height)``, which is the ordering that
          ``PropbankTreePointer`` itself implements.
          """
          argloc, argid = item
          while isinstance(argloc, (PropbankChainTreePointer,
                                    PropbankSplitTreePointer)):
              argloc = argloc.pieces[0]
          return (argloc.wordnum, -argloc.height, argid)

      def __str__(self):
          s = '%s %s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
                                     self.tagger, self.roleset, self.inflection)
          items = self.arguments + ((self.predicate, 'rel'),)
          # Sort with an explicit key: chain and split pointers define no
          # ordering of their own, so comparing the pairs directly raises
          # TypeError on Python 3 whenever two such pointers meet.
          for (argloc, argid) in sorted(items, key=self._sort_key):
              s += ' %s-%s' % (argloc, argid)
          return s

      def _get_tree(self):
          # No parse corpus, or a parse corpus that lacks this file,
          # means the tree is simply unavailable.
          if self.parse_corpus is None: return None
          if self.fileid not in self.parse_corpus.fileids(): return None
          return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
      tree = property(_get_tree, doc="""
          The parse tree corresponding to this instance, or None if
          the corresponding tree is not available.""")

      @staticmethod
      def parse(s, parse_fileid_xform=None, parse_corpus=None):
          """
          Build a ``PropbankInstance`` from one whitespace-separated
          annotation line of the form ``fileid sentnum wordnum tagger
          roleset inflection loc-label ...``, where exactly one of the
          trailing fields ends in ``-rel`` and marks the predicate.

          :param s: The annotation line.
          :param parse_fileid_xform: Optional function applied to the
              fileid read from the line.
          :param parse_corpus: Stored on the new instance as its
              ``parse_corpus``.
          :raise ValueError: if the line has fewer than seven fields or
              does not contain exactly one ``-rel`` field.
          """
          pieces = s.split()
          if len(pieces) < 7:
              raise ValueError('Badly formatted propbank line: %r' % s)

          # Divide the line into its basic pieces.
          (fileid, sentnum, wordnum,
           tagger, roleset, inflection) = pieces[:6]
          rel = [p for p in pieces[6:] if p.endswith('-rel')]
          args = [p for p in pieces[6:] if not p.endswith('-rel')]
          if len(rel) != 1:
              raise ValueError('Badly formatted propbank line: %r' % s)

          # Apply the fileid selector, if any.
          if parse_fileid_xform is not None:
              fileid = parse_fileid_xform(fileid)

          # Convert sentence & word numbers to ints.
          sentnum = int(sentnum)
          wordnum = int(wordnum)

          # Parse the inflection
          inflection = PropbankInflection.parse(inflection)

          # Parse the predicate location (dropping the '-rel' suffix).
          predicate = PropbankTreePointer.parse(rel[0][:-4])

          # Parse the arguments.  Only the first '-' separates location
          # from label, since labels such as 'ARGM-TMP' contain one too.
          arguments = []
          for arg in args:
              argloc, argid = arg.split('-', 1)
              arguments.append( (PropbankTreePointer.parse(argloc), argid) )

          # Put it all together.
          return PropbankInstance(fileid, sentnum, wordnum, tagger,
                                  roleset, inflection, predicate,
                                  arguments, parse_corpus)
  class PropbankPointer(object):
      """
      Abstract base class for the pointers propbank uses to pick out
      one or more constituents of a parse tree.  It cannot be
      instantiated directly; use one of its three concrete subclasses:

        - ``PropbankTreePointer`` points to a single constituent.
        - ``PropbankSplitTreePointer`` points to a 'split' constituent,
          made up of a sequence of two or more ``PropbankTreePointer``
          pointers.
        - ``PropbankChainTreePointer`` points to an entire trace chain
          in a tree.  Its pieces can be ``PropbankTreePointer`` or
          ``PropbankSplitTreePointer`` pointers.
      """

      def __init__(self):
          # Subclasses may be created freely; the base class itself may not.
          instantiated_directly = self.__class__ == PropbankPointer
          if instantiated_directly:
              raise NotImplementedError()
  @compat.python_2_unicode_compatible
  class PropbankChainTreePointer(PropbankPointer):
      """
      A pointer to a chain of constituents.  Its string form is the
      string forms of its pieces joined by ``'*'``.
      """

      def __init__(self, pieces):
          self.pieces = pieces
          """A list of the pieces that make up this chain.  Elements may
             be either ``PropbankSplitTreePointer`` or
             ``PropbankTreePointer`` pointers."""

      def __str__(self):
          return '*'.join('%s' % p for p in self.pieces)

      def __repr__(self):
          return '<PropbankChainTreePointer: %s>' % self

      def select(self, tree):
          """
          Return a ``'*CHAIN*'`` tree whose children are the results of
          selecting each piece of this chain from ``tree``.

          :raise ValueError: if ``tree`` is None.
          """
          if tree is None: raise ValueError('Parse tree not available')
          return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
  @compat.python_2_unicode_compatible
  class PropbankSplitTreePointer(PropbankPointer):
      """
      A pointer to a 'split' constituent.  Its string form is the
      string forms of its pieces joined by ``','``.
      """

      def __init__(self, pieces):
          self.pieces = pieces
          """A list of the pieces that make up this split constituent.
             Elements are all ``PropbankTreePointer`` pointers."""

      def __str__(self):
          return ','.join('%s' % p for p in self.pieces)

      def __repr__(self):
          return '<PropbankSplitTreePointer: %s>' % self

      def select(self, tree):
          """
          Return a ``'*SPLIT*'`` tree whose children are the results of
          selecting each piece of this pointer from ``tree``.

          :raise ValueError: if ``tree`` is None.
          """
          if tree is None: raise ValueError('Parse tree not available')
          return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
  @total_ordering
  @compat.python_2_unicode_compatible
  class PropbankTreePointer(PropbankPointer):
      """
      A pointer to a single constituent, written ``wordnum:height``.
      ``wordnum`` is the zero-based index of a leaf of the parse tree
      and ``height`` is the number of extra levels to climb from that
      leaf's parent, so ``height`` 0 selects the node directly above
      the leaf (see ``treepos()``).

      The textual forms accepted by ``parse()`` are::

          wordnum:height                       (a simple pointer)
          wordnum:height,wordnum:height,...    (a split pointer)
          wordnum:height*wordnum:height*...    (a chain pointer)

      Simple pointers are ordered by ``(wordnum, -height)``.  When
      compared with a chain or split pointer, the first simple pointer
      inside that pointer is used.
      """

      def __init__(self, wordnum, height):
          self.wordnum = wordnum
          self.height = height

      @staticmethod
      def parse(s):
          """
          Parse the textual form of a pointer.

          :return: a ``PropbankChainTreePointer`` if ``s`` contains
              ``'*'``, else a ``PropbankSplitTreePointer`` if it contains
              ``','``, else a ``PropbankTreePointer``.
          :raise ValueError: if a simple pointer is not of the form
              ``wordnum:height`` with integer parts.
          """
          # Deal with chains (xx*yy*zz)
          pieces = s.split('*')
          if len(pieces) > 1:
              return PropbankChainTreePointer([PropbankTreePointer.parse(elt)
                                                for elt in pieces])

          # Deal with split args (xx,yy,zz)
          pieces = s.split(',')
          if len(pieces) > 1:
              return PropbankSplitTreePointer([PropbankTreePointer.parse(elt)
                                               for elt in pieces])

          # Deal with normal pointers.
          pieces = s.split(':')
          if len(pieces) != 2: raise ValueError('bad propbank pointer %r' % s)
          return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

      def __str__(self):
          return '%s:%s' % (self.wordnum, self.height)

      def __repr__(self):
          return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)

      def __eq__(self, other):
          # Compare against the first simple pointer of a chain/split.
          while isinstance(other, (PropbankChainTreePointer,
                                   PropbankSplitTreePointer)):
              other = other.pieces[0]

          if not isinstance(other, PropbankTreePointer):
              return self is other

          return (self.wordnum == other.wordnum and self.height == other.height)

      def __ne__(self, other):
          return not self == other

      def __lt__(self, other):
          # Compare against the first simple pointer of a chain/split.
          while isinstance(other, (PropbankChainTreePointer,
                                   PropbankSplitTreePointer)):
              other = other.pieces[0]

          # Objects that are not pointers get an arbitrary, id-based order.
          if not isinstance(other, PropbankTreePointer):
              return id(self) < id(other)

          return (self.wordnum, -self.height) < (other.wordnum, -other.height)

      def select(self, tree):
          """
          Return the subtree of ``tree`` that this pointer points to.

          :raise ValueError: if ``tree`` is None.
          """
          if tree is None: raise ValueError('Parse tree not available')
          return tree[self.treepos(tree)]

      def treepos(self, tree):
          """
          Convert this pointer to a standard 'tree position' pointer,
          given that it points to the given tree.

          :raise ValueError: if ``tree`` is None.
          """
          if tree is None: raise ValueError('Parse tree not available')
          # ``stack`` holds the nodes on the path from the root to the
          # node being visited; ``treepos`` holds the child indices
          # followed so far along that path.
          stack = [tree]
          treepos = []

          wordnum = 0
          while True:
              # tree node:
              if isinstance(stack[-1], Tree):
                  # Select the next child.
                  if len(treepos) < len(stack):
                      treepos.append(0)
                  else:
                      treepos[-1] += 1
                  # Update the stack.
                  if treepos[-1] < len(stack[-1]):
                      stack.append(stack[-1][treepos[-1]])
                  else:
                      # End of node's child list: pop up a level.
                      stack.pop()
                      treepos.pop()
              # word node:
              else:
                  if wordnum == self.wordnum:
                      # Drop the leaf's own index plus ``height`` more
                      # levels from the end of the path.
                      return tuple(treepos[:len(treepos)-self.height-1])
                  else:
                      wordnum += 1
                      stack.pop()
  @compat.python_2_unicode_compatible
  class PropbankInflection(object):
      """
      The inflection of a predicate, stored as five one-character
      codes: form, tense, aspect, person and voice.  ``'-'`` stands
      for "none" in every position.
      """

      # Pattern a five-character inflection string has to match: one
      # character per field, in the order form, tense, aspect, person,
      # voice.
      _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')

      #{ Inflection Form
      INFINITIVE = 'i'
      GERUND = 'g'
      PARTICIPLE = 'p'
      FINITE = 'v'
      #{ Inflection Tense
      FUTURE = 'f'
      PAST = 'p'
      PRESENT = 'n'
      #{ Inflection Aspect
      PERFECT = 'p'
      PROGRESSIVE = 'o'
      PERFECT_AND_PROGRESSIVE = 'b'
      #{ Inflection Person
      THIRD_PERSON = '3'
      #{ Inflection Voice
      ACTIVE = 'a'
      PASSIVE = 'p'
      #{ Inflection
      NONE = '-'
      #}

      def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
          self.form, self.tense, self.aspect = form, tense, aspect
          self.person, self.voice = person, voice

      def __str__(self):
          # The five codes concatenated in field order.
          return (self.form + self.tense + self.aspect +
                  self.person + self.voice)

      def __repr__(self):
          return '<PropbankInflection: %s>' % self

      @staticmethod
      def parse(s):
          """
          Build a ``PropbankInflection`` from its five-character string
          form.

          :raise TypeError: if ``s`` is not a string.
          :raise ValueError: if ``s`` is not exactly five characters
              long or contains a code that is not valid for its position.
          """
          if not isinstance(s, compat.string_types):
              raise TypeError('expected a string')
          well_formed = (len(s) == 5 and
                         PropbankInflection._VALIDATE.match(s))
          if not well_formed:
              raise ValueError('Bad propbank inflection string %r' % s)
          form, tense, aspect, person, voice = s
          return PropbankInflection(form, tense, aspect, person, voice)