PageRenderTime 57ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/ccg/lexicon.py

https://github.com/haewoon/nltk
Python | 241 lines | 216 code | 13 blank | 12 comment | 0 complexity | dad73082824d656370a653b383047112 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Combinatory Categorial Grammar
  2. #
  3. # Copyright (C) 2001-2012 NLTK Project
  4. # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
  5. # URL: <http://www.nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import re
  8. from collections import defaultdict
  9. from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
  10. #------------
  11. # Regular expressions used for parsing components of the lexicon
  12. #------------
  13. # Parses a primitive category and subscripts
  14. rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
  15. # Separates the next primitive category from the remainder of the
  16. # string
  17. reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
  18. # Separates the next application operator from the remainder
  19. reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
  20. # Parses the definition of the category of either a word or a family
  21. reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')
  22. # Strips comments from a line
  23. reComm = re.compile('''([^#]*)(?:#.*)?''')
  24. #----------
  25. # Lexicons
  26. #----------
  27. class CCGLexicon(object):
  28. '''
  29. Class representing a lexicon for CCG grammars.
  30. primitives - The list of primitive categories for the lexicon
  31. families - Families of categories
  32. entries - A mapping of words to possible categories
  33. '''
  34. def __init__(self,start,primitives,families,entries):
  35. self._start = PrimitiveCategory(start)
  36. self._primitives = primitives
  37. self._families = families
  38. self._entries = entries
  39. # Returns all the possible categories for a word
  40. def categories(self,word):
  41. return self._entries[word]
  42. # Returns the target category for the parser
  43. def start(self):
  44. return self._start
  45. # String representation of the lexicon
  46. # Used for debugging
  47. def __str__(self):
  48. st = ""
  49. first = True
  50. for ident in self._entries:
  51. if not first:
  52. st = st + "\n"
  53. st = st + ident + " => "
  54. first = True
  55. for cat in self._entries[ident]:
  56. if not first:
  57. st = st + " | "
  58. else:
  59. first = False
  60. st = st + str(cat)
  61. return st
  62. #-----------
  63. # Parsing lexicons
  64. #-----------
  65. # Separates the contents matching the first set of brackets
  66. # from the rest of the input.
  67. def matchBrackets(string):
  68. rest = string[1:]
  69. inside = "("
  70. while rest != "" and not rest.startswith(')'):
  71. if rest.startswith('('):
  72. (part,rest) = matchBrackets(rest)
  73. inside = inside + part
  74. else:
  75. inside = inside + rest[0]
  76. rest = rest[1:]
  77. if rest.startswith(')'):
  78. return (inside + ')',rest[1:])
  79. raise AssertionError, 'Unmatched bracket in string \'' + string + '\''
  80. # Separates the string for the next portion of the category
  81. # from the rest of the string
  82. def nextCategory(string):
  83. if string.startswith('('):
  84. return matchBrackets(string)
  85. return reNextPrim.match(string).groups()
  86. # Parses an application operator
  87. def parseApplication(app):
  88. return Direction(app[0],app[1:])
  89. # Parses the subscripts for a primitive category
  90. def parseSubscripts(subscr):
  91. if subscr:
  92. return subscr[1:-1].split(',')
  93. return []
  94. # Parse a primitive category
  95. def parsePrimitiveCategory(chunks,primitives,families,var):
  96. # If the primitive is the special category 'var',
  97. # replace it with the correct CCGVar
  98. if chunks[0] == "var":
  99. if chunks[1] is None:
  100. if var is None:
  101. var = CCGVar()
  102. return (var,var)
  103. catstr = chunks[0]
  104. if catstr in families:
  105. (cat, cvar) = families[catstr]
  106. if var is None:
  107. var = cvar
  108. else:
  109. cat = cat.substitute([(cvar,var)])
  110. return (cat,var)
  111. if catstr in primitives:
  112. subscrs = parseSubscripts(chunks[1])
  113. return (PrimitiveCategory(catstr,subscrs),var)
  114. raise AssertionError, 'String \'' + catstr + '\' is neither a family nor primitive category.'
  115. # parseCategory drops the 'var' from the tuple
  116. def parseCategory(line,primitives,families):
  117. return augParseCategory(line,primitives,families)[0]
  118. # Parses a string representing a category, and returns
  119. # a tuple with (possibly) the CCG variable for the category
  120. def augParseCategory(line,primitives,families,var = None):
  121. (str,rest) = nextCategory(line)
  122. if str.startswith('('):
  123. (res,var) = augParseCategory(str[1:-1],primitives,families,var)
  124. else:
  125. # print rePrim.match(str).groups()
  126. (res,var) = parsePrimitiveCategory(rePrim.match(str).groups(),primitives,families,var)
  127. while rest != "":
  128. app = reApp.match(rest).groups()
  129. dir = parseApplication(app[0:3])
  130. rest = app[3]
  131. (str,rest) = nextCategory(rest)
  132. if str.startswith('('):
  133. (arg,var) = augParseCategory(str[1:-1],primitives,families,var)
  134. else:
  135. (arg,var) = parsePrimitiveCategory(rePrim.match(str).groups(),primitives,families,var)
  136. res = FunctionalCategory(res,arg,dir)
  137. return (res,var)
  138. # Takes an input string, and converts it into a lexicon for CCGs.
  139. def parseLexicon(lex_str):
  140. primitives = []
  141. families = {}
  142. entries = defaultdict(list)
  143. for line in lex_str.splitlines():
  144. # Strip comments and leading/trailing whitespace.
  145. line = reComm.match(line).groups()[0].strip()
  146. if line == "":
  147. continue
  148. if line.startswith(':-'):
  149. # A line of primitive categories.
  150. # The first line is the target category
  151. # ie, :- S, N, NP, VP
  152. primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(',') ]
  153. else:
  154. # Either a family definition, or a word definition
  155. (ident, sep, catstr) = reLex.match(line).groups()
  156. (cat,var) = augParseCategory(catstr,primitives,families)
  157. if sep == '::':
  158. # Family definition
  159. # ie, Det :: NP/N
  160. families[ident] = (cat,var)
  161. else:
  162. # Word definition
  163. # ie, which => (N\N)/(S/NP)
  164. entries[ident].append(cat)
  165. return CCGLexicon(primitives[0],primitives,families,entries)
  166. openccg_tinytiny = parseLexicon('''
  167. # Rather minimal lexicon based on the openccg `tinytiny' grammar.
  168. # Only incorporates a subset of the morphological subcategories, however.
  169. :- S,NP,N # Primitive categories
  170. Det :: NP/N # Determiners
  171. Pro :: NP
  172. IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
  173. IntransVpl :: S\\NP[pl] # Plural
  174. TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
  175. TransVpl :: S\\NP[pl]/NP # Plural
  176. the => NP[sg]/N[sg]
  177. the => NP[pl]/N[pl]
  178. I => Pro
  179. me => Pro
  180. we => Pro
  181. us => Pro
  182. book => N[sg]
  183. books => N[pl]
  184. peach => N[sg]
  185. peaches => N[pl]
  186. policeman => N[sg]
  187. policemen => N[pl]
  188. boy => N[sg]
  189. boys => N[pl]
  190. sleep => IntransVsg
  191. sleep => IntransVpl
  192. eat => IntransVpl
  193. eat => TransVpl
  194. eats => IntransVsg
  195. eats => TransVsg
  196. see => TransVpl
  197. sees => TransVsg
  198. ''')