/nltk/ccg/lexicon.py
Python | 244 lines | 219 code | 13 blank | 12 comment | 0 complexity | c581f43b7bd16ec24977bb7f2a5c8621 MD5 | raw file
Possible License(s): Apache-2.0
- # Natural Language Toolkit: Combinatory Categorial Grammar
- #
- # Copyright (C) 2001-2013 NLTK Project
- # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
- # URL: <http://www.nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import unicode_literals
- import re
- from collections import defaultdict
- from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
- from nltk.compat import python_2_unicode_compatible
- #------------
- # Regular expressions used for parsing components of the lexicon
- #------------
- # Parses a primitive category and subscripts
- rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
- # Separates the next primitive category from the remainder of the
- # string
- reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
- # Separates the next application operator from the remainder
- reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
- # Parses the definition of the category of either a word or a family
- reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')
- # Strips comments from a line
- reComm = re.compile('''([^#]*)(?:#.*)?''')
- #----------
- # Lexicons
- #----------
- @python_2_unicode_compatible
- class CCGLexicon(object):
- '''
- Class representing a lexicon for CCG grammars.
- primitives - The list of primitive categories for the lexicon
- families - Families of categories
- entries - A mapping of words to possible categories
- '''
- def __init__(self,start,primitives,families,entries):
- self._start = PrimitiveCategory(start)
- self._primitives = primitives
- self._families = families
- self._entries = entries
- # Returns all the possible categories for a word
- def categories(self,word):
- return self._entries[word]
- # Returns the target category for the parser
- def start(self):
- return self._start
- # String representation of the lexicon
- # Used for debugging
- def __str__(self):
- st = ""
- first = True
- for ident in self._entries:
- if not first:
- st = st + "\n"
- st = st + ident + " => "
- first = True
- for cat in self._entries[ident]:
- if not first:
- st = st + " | "
- else:
- first = False
- st = st + "%s" % cat
- return st
- #-----------
- # Parsing lexicons
- #-----------
- # Separates the contents matching the first set of brackets
- # from the rest of the input.
- def matchBrackets(string):
- rest = string[1:]
- inside = "("
- while rest != "" and not rest.startswith(')'):
- if rest.startswith('('):
- (part,rest) = matchBrackets(rest)
- inside = inside + part
- else:
- inside = inside + rest[0]
- rest = rest[1:]
- if rest.startswith(')'):
- return (inside + ')',rest[1:])
- raise AssertionError('Unmatched bracket in string \'' + string + '\'')
- # Separates the string for the next portion of the category
- # from the rest of the string
- def nextCategory(string):
- if string.startswith('('):
- return matchBrackets(string)
- return reNextPrim.match(string).groups()
- # Parses an application operator
- def parseApplication(app):
- return Direction(app[0],app[1:])
- # Parses the subscripts for a primitive category
- def parseSubscripts(subscr):
- if subscr:
- return subscr[1:-1].split(',')
- return []
- # Parse a primitive category
- def parsePrimitiveCategory(chunks,primitives,families,var):
- # If the primitive is the special category 'var',
- # replace it with the correct CCGVar
- if chunks[0] == "var":
- if chunks[1] is None:
- if var is None:
- var = CCGVar()
- return (var,var)
- catstr = chunks[0]
- if catstr in families:
- (cat, cvar) = families[catstr]
- if var is None:
- var = cvar
- else:
- cat = cat.substitute([(cvar,var)])
- return (cat,var)
- if catstr in primitives:
- subscrs = parseSubscripts(chunks[1])
- return (PrimitiveCategory(catstr,subscrs),var)
- raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
- # parseCategory drops the 'var' from the tuple
- def parseCategory(line,primitives,families):
- return augParseCategory(line,primitives,families)[0]
- # Parses a string representing a category, and returns
- # a tuple with (possibly) the CCG variable for the category
- def augParseCategory(line,primitives,families,var = None):
- (str,rest) = nextCategory(line)
- if str.startswith('('):
- (res,var) = augParseCategory(str[1:-1],primitives,families,var)
- else:
- # print rePrim.match(str).groups()
- (res,var) = parsePrimitiveCategory(rePrim.match(str).groups(),primitives,families,var)
- while rest != "":
- app = reApp.match(rest).groups()
- dir = parseApplication(app[0:3])
- rest = app[3]
- (str,rest) = nextCategory(rest)
- if str.startswith('('):
- (arg,var) = augParseCategory(str[1:-1],primitives,families,var)
- else:
- (arg,var) = parsePrimitiveCategory(rePrim.match(str).groups(),primitives,families,var)
- res = FunctionalCategory(res,arg,dir)
- return (res,var)
- # Takes an input string, and converts it into a lexicon for CCGs.
- def parseLexicon(lex_str):
- primitives = []
- families = {}
- entries = defaultdict(list)
- for line in lex_str.splitlines():
- # Strip comments and leading/trailing whitespace.
- line = reComm.match(line).groups()[0].strip()
- if line == "":
- continue
- if line.startswith(':-'):
- # A line of primitive categories.
- # The first line is the target category
- # ie, :- S, N, NP, VP
- primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(',') ]
- else:
- # Either a family definition, or a word definition
- (ident, sep, catstr) = reLex.match(line).groups()
- (cat,var) = augParseCategory(catstr,primitives,families)
- if sep == '::':
- # Family definition
- # ie, Det :: NP/N
- families[ident] = (cat,var)
- else:
- # Word definition
- # ie, which => (N\N)/(S/NP)
- entries[ident].append(cat)
- return CCGLexicon(primitives[0],primitives,families,entries)
- openccg_tinytiny = parseLexicon('''
- # Rather minimal lexicon based on the openccg `tinytiny' grammar.
- # Only incorporates a subset of the morphological subcategories, however.
- :- S,NP,N # Primitive categories
- Det :: NP/N # Determiners
- Pro :: NP
- IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
- IntransVpl :: S\\NP[pl] # Plural
- TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
- TransVpl :: S\\NP[pl]/NP # Plural
- the => NP[sg]/N[sg]
- the => NP[pl]/N[pl]
- I => Pro
- me => Pro
- we => Pro
- us => Pro
- book => N[sg]
- books => N[pl]
- peach => N[sg]
- peaches => N[pl]
- policeman => N[sg]
- policemen => N[pl]
- boy => N[sg]
- boys => N[pl]
- sleep => IntransVsg
- sleep => IntransVpl
- eat => IntransVpl
- eat => TransVpl
- eats => IntransVsg
- eats => TransVsg
- see => TransVpl
- sees => TransVsg
- ''')