/nltk/corpus/reader/verbnet.py
Python | 389 lines | 309 code | 22 blank | 58 comment | 17 complexity | 7ef6f6c7bed72b3a2c7f89d6200c8ab0 MD5 | raw file
Possible License(s): Apache-2.0
- # Natural Language Toolkit: Verbnet Corpus Reader
- #
- # Copyright (C) 2001-2012 NLTK Project
- # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
- # URL: <http://www.nltk.org/>
- # For license information, see LICENSE.TXT
- import re
- import textwrap
- from collections import defaultdict
- from util import *
- from api import *
- from xmldocs import *
- class VerbnetCorpusReader(XMLCorpusReader):
- # No unicode encoding param, since the data files are all XML.
def __init__(self, root, fileids, wrap_etree=False):
    """
    Set up the reader and build its lookup indexes.

    :param root: Forwarded unchanged to ``XMLCorpusReader.__init__``.
    :param fileids: Forwarded unchanged to ``XMLCorpusReader.__init__``.
    :param wrap_etree: Forwarded unchanged to
        ``XMLCorpusReader.__init__``.
    """
    XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

    # Maps verb lemma strings to lists of verbnet class identifiers.
    self._lemma_to_class = defaultdict(list)

    # Maps wordnet identifier strings to lists of verbnet class
    # identifiers.
    self._wordnet_to_class = defaultdict(list)

    # Maps class identifiers to the corresponding file identifiers.
    # The keys of this dictionary provide a complete list of all
    # classes and subclasses.
    self._class_to_fileid = {}

    # Maps short class identifiers to long class identifiers; filled
    # in by the indexing methods and read by ``longid()``.
    self._shortid_to_longid = {}

    # Populate the dictionaries with the quick (regexp-based) method
    # rather than the slow (xml-based) one; the original author
    # reports that it runs 2-30 times faster.
    self._quick_index()
- _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
- """Regular expression that matches (and decomposes) longids"""
- _SHORTID_RE = re.compile(r'[\d+.\-]+$')
- """Regular expression that matches shortids"""
- _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
- r'<VNSUBCLASS ID="([^"]+)"/?>')
- """Regular expression used by ``_index()`` to quickly scan the corpus
- for basic information."""
- def lemmas(self, classid=None):
- """
- Return a list of all verb lemmas that appear in any class, or
- in the ``classid`` if specified.
- """
- if classid is None:
- return sorted(self._lemma_to_class.keys())
- else:
- # [xx] should this include subclass members?
- vnclass = self.vnclass(classid)
- return [member.get('name') for member in
- vnclass.findall('MEMBERS/MEMBER')]
- def wordnetids(self, classid=None):
- """
- Return a list of all wordnet identifiers that appear in any
- class, or in ``classid`` if specified.
- """
- if classid is None:
- return sorted(self._wordnet_to_class.keys())
- else:
- # [xx] should this include subclass members?
- vnclass = self.vnclass(classid)
- return sum([member.get('wn','').split() for member in
- vnclass.findall('MEMBERS/MEMBER')], [])
- def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
- """
- Return a list of the verbnet class identifiers. If a file
- identifier is specified, then return only the verbnet class
- identifiers for classes (and subclasses) defined by that file.
- If a lemma is specified, then return only verbnet class
- identifiers for classes that contain that lemma as a member.
- If a wordnetid is specified, then return only identifiers for
- classes that contain that wordnetid as a member. If a classid
- is specified, then return only identifiers for subclasses of
- the specified verbnet class.
- """
- if len([x for x in [lemma, wordnetid, fileid, classid]
- if x is not None]) > 1:
- raise ValueError('Specify at most one of: fileid, wordnetid, '
- 'fileid, classid')
- if fileid is not None:
- return [c for (c,f) in self._class_to_fileid.items()
- if f == fileid]
- elif lemma is not None:
- return self._lemma_to_class[lemma]
- elif wordnetid is not None:
- return self._wordnet_to_class[wordnetid]
- elif classid is not None:
- xmltree = self.vnclass(classid)
- return [subclass.get('ID') for subclass in
- xmltree.findall('SUBCLASSES/VNSUBCLASS')]
- else:
- return sorted(self._class_to_fileid.keys())
- def vnclass(self, fileid_or_classid):
- """
- Return an ElementTree containing the xml for the specified
- verbnet class.
- :param fileid_or_classid: An identifier specifying which class
- should be returned. Can be a file identifier (such as
- ``'put-9.1.xml'``), or a verbnet class identifier (such as
- ``'put-9.1'``) or a short verbnet class identifier (such as
- ``'9.1'``).
- """
- # File identifier: just return the xml.
- if fileid_or_classid in self._fileids:
- return self.xml(fileid_or_classid)
- # Class identifier: get the xml, and find the right elt.
- classid = self.longid(fileid_or_classid)
- if classid in self._class_to_fileid:
- fileid = self._class_to_fileid[self.longid(classid)]
- tree = self.xml(fileid)
- if classid == tree.get('ID'):
- return tree
- else:
- for subclass in tree.findall('.//VNSUBCLASS'):
- if classid == subclass.get('ID'):
- return subclass
- else:
- assert False # we saw it during _index()!
- else:
- raise ValueError('Unknown identifier %s' % fileid_or_classid)
- def fileids(self, vnclass_ids=None):
- """
- Return a list of fileids that make up this corpus. If
- ``vnclass_ids`` is specified, then return the fileids that make
- up the specified verbnet class(es).
- """
- if vnclass_ids is None:
- return self._fileids
- elif isinstance(vnclass_ids, basestring):
- return [self._class_to_fileid[self.longid(vnclass_ids)]]
- else:
- return [self._class_to_fileid[self.longid(vnclass_id)]
- for vnclass_id in vnclass_ids]
- ######################################################################
- #{ Index Initialization
- ######################################################################
- def _index(self):
- """
- Initialize the indexes ``_lemma_to_class``,
- ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
- through the corpus fileids. This is fast with cElementTree
- (<0.1 secs), but quite slow (>10 secs) with the python
- implementation of ElementTree.
- """
- for fileid in self._fileids:
- self._index_helper(self.xml(fileid), fileid)
- def _index_helper(self, xmltree, fileid):
- """Helper for ``_index()``"""
- vnclass = xmltree.get('ID')
- self._class_to_fileid[vnclass] = fileid
- self._shortid_to_longid[self.shortid(vnclass)] = vnclass
- for member in xmltree.findall('MEMBERS/MEMBER'):
- self._lemma_to_class[member.get('name')].append(vnclass)
- for wn in member.get('wn', '').split():
- self._wordnet_to_class[wn].append(vnclass)
- for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
- self._index_helper(subclass, fileid)
- def _quick_index(self):
- """
- Initialize the indexes ``_lemma_to_class``,
- ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
- through the corpus fileids. This doesn't do proper xml parsing,
- but is good enough to find everything in the standard verbnet
- corpus -- and it runs about 30 times faster than xml parsing
- (with the python ElementTree; only 2-3 times faster with
- cElementTree).
- """
- # nb: if we got rid of wordnet_to_class, this would run 2-3
- # times faster.
- for fileid in self._fileids:
- vnclass = fileid[:-4] # strip the '.xml'
- self._class_to_fileid[vnclass] = fileid
- self._shortid_to_longid[self.shortid(vnclass)] = vnclass
- for m in self._INDEX_RE.finditer(self.open(fileid).read()):
- groups = m.groups()
- if groups[0] is not None:
- self._lemma_to_class[groups[0]].append(vnclass)
- for wn in groups[1].split():
- self._wordnet_to_class[wn].append(vnclass)
- elif groups[2] is not None:
- self._class_to_fileid[groups[2]] = fileid
- vnclass = groups[2] # for <MEMBER> elts.
- self._shortid_to_longid[self.shortid(vnclass)] = vnclass
- else:
- assert False, 'unexpected match condition'
- ######################################################################
- #{ Identifier conversion
- ######################################################################
- def longid(self, shortid):
- """Given a short verbnet class identifier (eg '37.10'), map it
- to a long id (eg 'confess-37.10'). If ``shortid`` is already a
- long id, then return it as-is"""
- if self._LONGID_RE.match(shortid):
- return shortid # it's already a longid.
- elif not self._SHORTID_RE.match(shortid):
- raise ValueError('vnclass identifier %r not found' % shortid)
- try:
- return self._shortid_to_longid[shortid]
- except KeyError:
- raise ValueError('vnclass identifier %r not found' % shortid)
- def shortid(self, longid):
- """Given a long verbnet class identifier (eg 'confess-37.10'),
- map it to a short id (eg '37.10'). If ``longid`` is already a
- short id, then return it as-is."""
- if self._SHORTID_RE.match(longid):
- return longid # it's already a shortid.
- m = self._LONGID_RE.match(longid)
- if m:
- return m.group(2)
- else:
- raise ValueError('vnclass identifier %r not found' % longid)
- ######################################################################
- #{ Pretty Printing
- ######################################################################
def pprint(self, vnclass):
    """
    Return a string containing a pretty-printed representation of
    the given verbnet class: its ID, subclasses, members, thematic
    roles and frames, in that order.

    :param vnclass: A verbnet class identifier; or an ElementTree
        containing the xml contents of a verbnet class.
    :return: The multi-line description.  It has no trailing newline
        unless the class has no ``FRAMES/FRAME`` elements.
    """
    # ``basestring`` only exists on Python 2.  (str, type(u'')) is
    # (str, unicode) there and (str, str) on Python 3, so the same
    # check works on both.
    if isinstance(vnclass, (str, type(u''))):
        vnclass = self.vnclass(vnclass)

    frames = '\n'.join(self.pprint_frame(vnframe, indent=' ')
                       for vnframe in vnclass.findall('FRAMES/FRAME'))
    # Collect the pieces and join once instead of repeated ``+=``.
    return ''.join([
        vnclass.get('ID'), '\n',
        self.pprint_subclasses(vnclass, indent=' '), '\n',
        self.pprint_members(vnclass, indent=' '), '\n',
        ' Thematic roles:\n',
        self.pprint_themroles(vnclass, indent=' '), '\n',
        ' Frames:\n',
        frames,
    ])
- def pprint_subclasses(self, vnclass, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet class's subclasses.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
- """
- if isinstance(vnclass, basestring):
- vnclass = self.vnclass(vnclass)
- subclasses = [subclass.get('ID') for subclass in
- vnclass.findall('SUBCLASSES/VNSUBCLASS')]
- if not subclasses: subclasses = ['(none)']
- s = 'Subclasses: ' + ' '.join(subclasses)
- return textwrap.fill(s, 70, initial_indent=indent,
- subsequent_indent=indent+' ')
- def pprint_members(self, vnclass, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet class's member verbs.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
- """
- if isinstance(vnclass, basestring):
- vnclass = self.vnclass(vnclass)
- members = [member.get('name') for member in
- vnclass.findall('MEMBERS/MEMBER')]
- if not members: members = ['(none)']
- s = 'Members: ' + ' '.join(members)
- return textwrap.fill(s, 70, initial_indent=indent,
- subsequent_indent=indent+' ')
- def pprint_themroles(self, vnclass, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet class's thematic roles.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
- """
- if isinstance(vnclass, basestring):
- vnclass = self.vnclass(vnclass)
- pieces = []
- for themrole in vnclass.findall('THEMROLES/THEMROLE'):
- piece = indent + '* ' + themrole.get('type')
- modifiers = ['%(Value)s%(type)s' % restr.attrib
- for restr in themrole.findall('SELRESTRS/SELRESTR')]
- if modifiers:
- piece += '[%s]' % ' '.join(modifiers)
- pieces.append(piece)
- return '\n'.join(pieces)
- def pprint_frame(self, vnframe, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet frame.
- :param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
- """
- s = self.pprint_description(vnframe, indent) + '\n'
- s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
- s += indent + ' Semantics:\n'
- s += self.pprint_semantics(vnframe, indent+' ')
- return s
- def pprint_description(self, vnframe, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet frame description.
- :param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
- """
- descr = vnframe.find('DESCRIPTION')
- s = indent + descr.attrib['primary']
- if descr.get('secondary', ''):
- s += ' (%s)' % descr.get('secondary')
- return s
- def pprint_syntax(self, vnframe, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet frame syntax.
- :param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
- """
- pieces = []
- for elt in vnframe.find('SYNTAX'):
- piece = elt.tag
- modifiers = []
- if 'value' in elt.attrib:
- modifiers.append(elt.get('value'))
- modifiers += ['%(Value)s%(type)s' % restr.attrib
- for restr in (elt.findall('SELRESTRS/SELRESTR') +
- elt.findall('SYNRESTRS/SYNRESTR'))]
- if modifiers:
- piece += '[%s]' % ' '.join(modifiers)
- pieces.append(piece)
- return indent + ' '.join(pieces)
- def pprint_semantics(self, vnframe, indent=''):
- """
- Return a string containing a pretty-printed representation of
- the given verbnet frame semantics.
- :param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
- """
- pieces = []
- for pred in vnframe.findall('SEMANTICS/PRED'):
- args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
- pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
- return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])