PageRenderTime 47ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/ccg/lexicon.py

https://github.com/BrucePHill/nltk
Python | 244 lines | 219 code | 13 blank | 12 comment | 0 complexity | c581f43b7bd16ec24977bb7f2a5c8621 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Combinatory Categorial Grammar
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from __future__ import unicode_literals
  8. import re
  9. from collections import defaultdict
  10. from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
  11. from nltk.compat import python_2_unicode_compatible
#------------
# Regular expressions used for parsing components of the lexicon
#------------

# Parses a primitive category and subscripts.
# Group 1: the category name (ASCII letters only).
# Group 2: the bracketed, comma-separated subscript list including its
#          brackets (e.g. "[sg]"), or None when no subscripts are present.
rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')

# Separates the next primitive category from the remainder of the
# string.
# Group 1: the primitive together with its optional subscripts.
# Group 2: everything that follows it (possibly empty).
reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')

# Separates the next application operator from the remainder.
# Group 1: the slash, either a backslash or '/'.
# Groups 2 and 3: up to two optional '.' or ',' marker characters
#                 (each group is the empty string when absent).
# Group 4: the rest of the string after the operator.
reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')

# Parses the definition of the category of either a word or a family.
# Group 1: the identifier being defined (letters and underscores only).
# Group 2: the separator, either '::' or an arrow made of one or more
#          '-' / '=' characters followed by '>'.
# Group 3: the category string being assigned to the identifier.
reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')

# Strips comments from a line.
# Group 1: everything before the first '#'; the comment itself is matched
#          by a non-capturing group and therefore discarded.
reComm = re.compile('''([^#]*)(?:#.*)?''')
  26. #----------
  27. # Lexicons
  28. #----------
  29. @python_2_unicode_compatible
  30. class CCGLexicon(object):
  31. '''
  32. Class representing a lexicon for CCG grammars.
  33. primitives - The list of primitive categories for the lexicon
  34. families - Families of categories
  35. entries - A mapping of words to possible categories
  36. '''
  37. def __init__(self,start,primitives,families,entries):
  38. self._start = PrimitiveCategory(start)
  39. self._primitives = primitives
  40. self._families = families
  41. self._entries = entries
  42. # Returns all the possible categories for a word
  43. def categories(self,word):
  44. return self._entries[word]
  45. # Returns the target category for the parser
  46. def start(self):
  47. return self._start
  48. # String representation of the lexicon
  49. # Used for debugging
  50. def __str__(self):
  51. st = ""
  52. first = True
  53. for ident in self._entries:
  54. if not first:
  55. st = st + "\n"
  56. st = st + ident + " => "
  57. first = True
  58. for cat in self._entries[ident]:
  59. if not first:
  60. st = st + " | "
  61. else:
  62. first = False
  63. st = st + "%s" % cat
  64. return st
  65. #-----------
  66. # Parsing lexicons
  67. #-----------
  68. # Separates the contents matching the first set of brackets
  69. # from the rest of the input.
  70. def matchBrackets(string):
  71. rest = string[1:]
  72. inside = "("
  73. while rest != "" and not rest.startswith(')'):
  74. if rest.startswith('('):
  75. (part,rest) = matchBrackets(rest)
  76. inside = inside + part
  77. else:
  78. inside = inside + rest[0]
  79. rest = rest[1:]
  80. if rest.startswith(')'):
  81. return (inside + ')',rest[1:])
  82. raise AssertionError('Unmatched bracket in string \'' + string + '\'')
  83. # Separates the string for the next portion of the category
  84. # from the rest of the string
  85. def nextCategory(string):
  86. if string.startswith('('):
  87. return matchBrackets(string)
  88. return reNextPrim.match(string).groups()
  89. # Parses an application operator
  90. def parseApplication(app):
  91. return Direction(app[0],app[1:])
  92. # Parses the subscripts for a primitive category
  93. def parseSubscripts(subscr):
  94. if subscr:
  95. return subscr[1:-1].split(',')
  96. return []
  97. # Parse a primitive category
  98. def parsePrimitiveCategory(chunks,primitives,families,var):
  99. # If the primitive is the special category 'var',
  100. # replace it with the correct CCGVar
  101. if chunks[0] == "var":
  102. if chunks[1] is None:
  103. if var is None:
  104. var = CCGVar()
  105. return (var,var)
  106. catstr = chunks[0]
  107. if catstr in families:
  108. (cat, cvar) = families[catstr]
  109. if var is None:
  110. var = cvar
  111. else:
  112. cat = cat.substitute([(cvar,var)])
  113. return (cat,var)
  114. if catstr in primitives:
  115. subscrs = parseSubscripts(chunks[1])
  116. return (PrimitiveCategory(catstr,subscrs),var)
  117. raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
  118. # parseCategory drops the 'var' from the tuple
  119. def parseCategory(line,primitives,families):
  120. return augParseCategory(line,primitives,families)[0]
  121. # Parses a string representing a category, and returns
  122. # a tuple with (possibly) the CCG variable for the category
  123. def augParseCategory(line,primitives,families,var = None):
  124. (str,rest) = nextCategory(line)
  125. if str.startswith('('):
  126. (res,var) = augParseCategory(str[1:-1],primitives,families,var)
  127. else:
  128. # print rePrim.match(str).groups()
  129. (res,var) = parsePrimitiveCategory(rePrim.match(str).groups(),primitives,families,var)
  130. while rest != "":
  131. app = reApp.match(rest).groups()
  132. dir = parseApplication(app[0:3])
  133. rest = app[3]
  134. (str,rest) = nextCategory(rest)
  135. if str.startswith('('):
  136. (arg,var) = augParseCategory(str[1:-1],primitives,families,var)
  137. else:
  138. (arg,var) = parsePrimitiveCategory(rePrim.match(str).groups(),primitives,families,var)
  139. res = FunctionalCategory(res,arg,dir)
  140. return (res,var)
  141. # Takes an input string, and converts it into a lexicon for CCGs.
  142. def parseLexicon(lex_str):
  143. primitives = []
  144. families = {}
  145. entries = defaultdict(list)
  146. for line in lex_str.splitlines():
  147. # Strip comments and leading/trailing whitespace.
  148. line = reComm.match(line).groups()[0].strip()
  149. if line == "":
  150. continue
  151. if line.startswith(':-'):
  152. # A line of primitive categories.
  153. # The first line is the target category
  154. # ie, :- S, N, NP, VP
  155. primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(',') ]
  156. else:
  157. # Either a family definition, or a word definition
  158. (ident, sep, catstr) = reLex.match(line).groups()
  159. (cat,var) = augParseCategory(catstr,primitives,families)
  160. if sep == '::':
  161. # Family definition
  162. # ie, Det :: NP/N
  163. families[ident] = (cat,var)
  164. else:
  165. # Word definition
  166. # ie, which => (N\N)/(S/NP)
  167. entries[ident].append(cat)
  168. return CCGLexicon(primitives[0],primitives,families,entries)
# Sample lexicon, built when this module is imported by running
# parseLexicon over the inline definition below. The definition declares
# the primitives S, NP and N, six families, and a handful of words.
openccg_tinytiny = parseLexicon('''
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
''')