PageRenderTime 63ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/corpus/reader/verbnet.py

https://github.com/BrucePHill/nltk
Python | 389 lines | 309 code | 22 blank | 58 comment | 17 complexity | f371dccb15962bb5ac29273d1ac43162 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Verbnet Corpus Reader
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from __future__ import unicode_literals
  8. import re
  9. import textwrap
  10. from collections import defaultdict
  11. from nltk import compat
  12. from .xmldocs import XMLCorpusReader
  13. class VerbnetCorpusReader(XMLCorpusReader):
  14. # No unicode encoding param, since the data files are all XML.
  15. def __init__(self, root, fileids, wrap_etree=False):
  16. XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
  17. self._lemma_to_class = defaultdict(list)
  18. """A dictionary mapping from verb lemma strings to lists of
  19. verbnet class identifiers."""
  20. self._wordnet_to_class = defaultdict(list)
  21. """A dictionary mapping from wordnet identifier strings to
  22. lists of verbnet class identifiers."""
  23. self._class_to_fileid = {}
  24. """A dictionary mapping from class identifiers to
  25. corresponding file identifiers. The keys of this dictionary
  26. provide a complete list of all classes and subclasses."""
  27. self._shortid_to_longid = {}
  28. # Initialize the dictionaries. Use the quick (regexp-based)
  29. # method instead of the slow (xml-based) method, because it
  30. # runs 2-30 times faster.
  31. self._quick_index()
  32. _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
  33. """Regular expression that matches (and decomposes) longids"""
  34. _SHORTID_RE = re.compile(r'[\d+.\-]+$')
  35. """Regular expression that matches shortids"""
  36. _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
  37. r'<VNSUBCLASS ID="([^"]+)"/?>')
  38. """Regular expression used by ``_index()`` to quickly scan the corpus
  39. for basic information."""
  40. def lemmas(self, classid=None):
  41. """
  42. Return a list of all verb lemmas that appear in any class, or
  43. in the ``classid`` if specified.
  44. """
  45. if classid is None:
  46. return sorted(self._lemma_to_class.keys())
  47. else:
  48. # [xx] should this include subclass members?
  49. vnclass = self.vnclass(classid)
  50. return [member.get('name') for member in
  51. vnclass.findall('MEMBERS/MEMBER')]
  52. def wordnetids(self, classid=None):
  53. """
  54. Return a list of all wordnet identifiers that appear in any
  55. class, or in ``classid`` if specified.
  56. """
  57. if classid is None:
  58. return sorted(self._wordnet_to_class.keys())
  59. else:
  60. # [xx] should this include subclass members?
  61. vnclass = self.vnclass(classid)
  62. return sum([member.get('wn','').split() for member in
  63. vnclass.findall('MEMBERS/MEMBER')], [])
  64. def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
  65. """
  66. Return a list of the verbnet class identifiers. If a file
  67. identifier is specified, then return only the verbnet class
  68. identifiers for classes (and subclasses) defined by that file.
  69. If a lemma is specified, then return only verbnet class
  70. identifiers for classes that contain that lemma as a member.
  71. If a wordnetid is specified, then return only identifiers for
  72. classes that contain that wordnetid as a member. If a classid
  73. is specified, then return only identifiers for subclasses of
  74. the specified verbnet class.
  75. """
  76. if len([x for x in [lemma, wordnetid, fileid, classid]
  77. if x is not None]) > 1:
  78. raise ValueError('Specify at most one of: fileid, wordnetid, '
  79. 'fileid, classid')
  80. if fileid is not None:
  81. return [c for (c,f) in self._class_to_fileid.items()
  82. if f == fileid]
  83. elif lemma is not None:
  84. return self._lemma_to_class[lemma]
  85. elif wordnetid is not None:
  86. return self._wordnet_to_class[wordnetid]
  87. elif classid is not None:
  88. xmltree = self.vnclass(classid)
  89. return [subclass.get('ID') for subclass in
  90. xmltree.findall('SUBCLASSES/VNSUBCLASS')]
  91. else:
  92. return sorted(self._class_to_fileid.keys())
  93. def vnclass(self, fileid_or_classid):
  94. """
  95. Return an ElementTree containing the xml for the specified
  96. verbnet class.
  97. :param fileid_or_classid: An identifier specifying which class
  98. should be returned. Can be a file identifier (such as
  99. ``'put-9.1.xml'``), or a verbnet class identifier (such as
  100. ``'put-9.1'``) or a short verbnet class identifier (such as
  101. ``'9.1'``).
  102. """
  103. # File identifier: just return the xml.
  104. if fileid_or_classid in self._fileids:
  105. return self.xml(fileid_or_classid)
  106. # Class identifier: get the xml, and find the right elt.
  107. classid = self.longid(fileid_or_classid)
  108. if classid in self._class_to_fileid:
  109. fileid = self._class_to_fileid[self.longid(classid)]
  110. tree = self.xml(fileid)
  111. if classid == tree.get('ID'):
  112. return tree
  113. else:
  114. for subclass in tree.findall('.//VNSUBCLASS'):
  115. if classid == subclass.get('ID'):
  116. return subclass
  117. else:
  118. assert False # we saw it during _index()!
  119. else:
  120. raise ValueError('Unknown identifier %s' % fileid_or_classid)
  121. def fileids(self, vnclass_ids=None):
  122. """
  123. Return a list of fileids that make up this corpus. If
  124. ``vnclass_ids`` is specified, then return the fileids that make
  125. up the specified verbnet class(es).
  126. """
  127. if vnclass_ids is None:
  128. return self._fileids
  129. elif isinstance(vnclass_ids, compat.string_types):
  130. return [self._class_to_fileid[self.longid(vnclass_ids)]]
  131. else:
  132. return [self._class_to_fileid[self.longid(vnclass_id)]
  133. for vnclass_id in vnclass_ids]
  134. ######################################################################
  135. #{ Index Initialization
  136. ######################################################################
  137. def _index(self):
  138. """
  139. Initialize the indexes ``_lemma_to_class``,
  140. ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
  141. through the corpus fileids. This is fast with cElementTree
  142. (<0.1 secs), but quite slow (>10 secs) with the python
  143. implementation of ElementTree.
  144. """
  145. for fileid in self._fileids:
  146. self._index_helper(self.xml(fileid), fileid)
  147. def _index_helper(self, xmltree, fileid):
  148. """Helper for ``_index()``"""
  149. vnclass = xmltree.get('ID')
  150. self._class_to_fileid[vnclass] = fileid
  151. self._shortid_to_longid[self.shortid(vnclass)] = vnclass
  152. for member in xmltree.findall('MEMBERS/MEMBER'):
  153. self._lemma_to_class[member.get('name')].append(vnclass)
  154. for wn in member.get('wn', '').split():
  155. self._wordnet_to_class[wn].append(vnclass)
  156. for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
  157. self._index_helper(subclass, fileid)
  158. def _quick_index(self):
  159. """
  160. Initialize the indexes ``_lemma_to_class``,
  161. ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
  162. through the corpus fileids. This doesn't do proper xml parsing,
  163. but is good enough to find everything in the standard verbnet
  164. corpus -- and it runs about 30 times faster than xml parsing
  165. (with the python ElementTree; only 2-3 times faster with
  166. cElementTree).
  167. """
  168. # nb: if we got rid of wordnet_to_class, this would run 2-3
  169. # times faster.
  170. for fileid in self._fileids:
  171. vnclass = fileid[:-4] # strip the '.xml'
  172. self._class_to_fileid[vnclass] = fileid
  173. self._shortid_to_longid[self.shortid(vnclass)] = vnclass
  174. for m in self._INDEX_RE.finditer(self.open(fileid).read()):
  175. groups = m.groups()
  176. if groups[0] is not None:
  177. self._lemma_to_class[groups[0]].append(vnclass)
  178. for wn in groups[1].split():
  179. self._wordnet_to_class[wn].append(vnclass)
  180. elif groups[2] is not None:
  181. self._class_to_fileid[groups[2]] = fileid
  182. vnclass = groups[2] # for <MEMBER> elts.
  183. self._shortid_to_longid[self.shortid(vnclass)] = vnclass
  184. else:
  185. assert False, 'unexpected match condition'
  186. ######################################################################
  187. #{ Identifier conversion
  188. ######################################################################
  189. def longid(self, shortid):
  190. """Given a short verbnet class identifier (eg '37.10'), map it
  191. to a long id (eg 'confess-37.10'). If ``shortid`` is already a
  192. long id, then return it as-is"""
  193. if self._LONGID_RE.match(shortid):
  194. return shortid # it's already a longid.
  195. elif not self._SHORTID_RE.match(shortid):
  196. raise ValueError('vnclass identifier %r not found' % shortid)
  197. try:
  198. return self._shortid_to_longid[shortid]
  199. except KeyError:
  200. raise ValueError('vnclass identifier %r not found' % shortid)
  201. def shortid(self, longid):
  202. """Given a long verbnet class identifier (eg 'confess-37.10'),
  203. map it to a short id (eg '37.10'). If ``longid`` is already a
  204. short id, then return it as-is."""
  205. if self._SHORTID_RE.match(longid):
  206. return longid # it's already a shortid.
  207. m = self._LONGID_RE.match(longid)
  208. if m:
  209. return m.group(2)
  210. else:
  211. raise ValueError('vnclass identifier %r not found' % longid)
  212. ######################################################################
  213. #{ Pretty Printing
  214. ######################################################################
  215. def pprint(self, vnclass):
  216. """
  217. Return a string containing a pretty-printed representation of
  218. the given verbnet class.
  219. :param vnclass: A verbnet class identifier; or an ElementTree
  220. containing the xml contents of a verbnet class.
  221. """
  222. if isinstance(vnclass, compat.string_types):
  223. vnclass = self.vnclass(vnclass)
  224. s = vnclass.get('ID') + '\n'
  225. s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
  226. s += self.pprint_members(vnclass, indent=' ') + '\n'
  227. s += ' Thematic roles:\n'
  228. s += self.pprint_themroles(vnclass, indent=' ') + '\n'
  229. s += ' Frames:\n'
  230. s += '\n'.join(self.pprint_frame(vnframe, indent=' ')
  231. for vnframe in vnclass.findall('FRAMES/FRAME'))
  232. return s
  233. def pprint_subclasses(self, vnclass, indent=''):
  234. """
  235. Return a string containing a pretty-printed representation of
  236. the given verbnet class's subclasses.
  237. :param vnclass: A verbnet class identifier; or an ElementTree
  238. containing the xml contents of a verbnet class.
  239. """
  240. if isinstance(vnclass, compat.string_types):
  241. vnclass = self.vnclass(vnclass)
  242. subclasses = [subclass.get('ID') for subclass in
  243. vnclass.findall('SUBCLASSES/VNSUBCLASS')]
  244. if not subclasses: subclasses = ['(none)']
  245. s = 'Subclasses: ' + ' '.join(subclasses)
  246. return textwrap.fill(s, 70, initial_indent=indent,
  247. subsequent_indent=indent+' ')
  248. def pprint_members(self, vnclass, indent=''):
  249. """
  250. Return a string containing a pretty-printed representation of
  251. the given verbnet class's member verbs.
  252. :param vnclass: A verbnet class identifier; or an ElementTree
  253. containing the xml contents of a verbnet class.
  254. """
  255. if isinstance(vnclass, compat.string_types):
  256. vnclass = self.vnclass(vnclass)
  257. members = [member.get('name') for member in
  258. vnclass.findall('MEMBERS/MEMBER')]
  259. if not members: members = ['(none)']
  260. s = 'Members: ' + ' '.join(members)
  261. return textwrap.fill(s, 70, initial_indent=indent,
  262. subsequent_indent=indent+' ')
  263. def pprint_themroles(self, vnclass, indent=''):
  264. """
  265. Return a string containing a pretty-printed representation of
  266. the given verbnet class's thematic roles.
  267. :param vnclass: A verbnet class identifier; or an ElementTree
  268. containing the xml contents of a verbnet class.
  269. """
  270. if isinstance(vnclass, compat.string_types):
  271. vnclass = self.vnclass(vnclass)
  272. pieces = []
  273. for themrole in vnclass.findall('THEMROLES/THEMROLE'):
  274. piece = indent + '* ' + themrole.get('type')
  275. modifiers = ['%(Value)s%(type)s' % restr.attrib
  276. for restr in themrole.findall('SELRESTRS/SELRESTR')]
  277. if modifiers:
  278. piece += '[%s]' % ' '.join(modifiers)
  279. pieces.append(piece)
  280. return '\n'.join(pieces)
  281. def pprint_frame(self, vnframe, indent=''):
  282. """
  283. Return a string containing a pretty-printed representation of
  284. the given verbnet frame.
  285. :param vnframe: An ElementTree containing the xml contents of
  286. a verbnet frame.
  287. """
  288. s = self.pprint_description(vnframe, indent) + '\n'
  289. s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
  290. s += indent + ' Semantics:\n'
  291. s += self.pprint_semantics(vnframe, indent+' ')
  292. return s
  293. def pprint_description(self, vnframe, indent=''):
  294. """
  295. Return a string containing a pretty-printed representation of
  296. the given verbnet frame description.
  297. :param vnframe: An ElementTree containing the xml contents of
  298. a verbnet frame.
  299. """
  300. descr = vnframe.find('DESCRIPTION')
  301. s = indent + descr.attrib['primary']
  302. if descr.get('secondary', ''):
  303. s += ' (%s)' % descr.get('secondary')
  304. return s
  305. def pprint_syntax(self, vnframe, indent=''):
  306. """
  307. Return a string containing a pretty-printed representation of
  308. the given verbnet frame syntax.
  309. :param vnframe: An ElementTree containing the xml contents of
  310. a verbnet frame.
  311. """
  312. pieces = []
  313. for elt in vnframe.find('SYNTAX'):
  314. piece = elt.tag
  315. modifiers = []
  316. if 'value' in elt.attrib:
  317. modifiers.append(elt.get('value'))
  318. modifiers += ['%(Value)s%(type)s' % restr.attrib
  319. for restr in (elt.findall('SELRESTRS/SELRESTR') +
  320. elt.findall('SYNRESTRS/SYNRESTR'))]
  321. if modifiers:
  322. piece += '[%s]' % ' '.join(modifiers)
  323. pieces.append(piece)
  324. return indent + ' '.join(pieces)
  325. def pprint_semantics(self, vnframe, indent=''):
  326. """
  327. Return a string containing a pretty-printed representation of
  328. the given verbnet frame semantics.
  329. :param vnframe: An ElementTree containing the xml contents of
  330. a verbnet frame.
  331. """
  332. pieces = []
  333. for pred in vnframe.findall('SEMANTICS/PRED'):
  334. args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
  335. pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
  336. return '\n'.join('%s* %s' % (indent, piece) for piece in pieces)