PageRenderTime 77ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/tree.py

https://github.com/BrucePHill/nltk
Python | 1526 lines | 1472 code | 20 blank | 34 comment | 28 complexity | e9ec6a3bcfa7a76dc005b632e60cf528 MD5 | raw file
Possible License(s): Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Text Trees
  3. #
  4. # Copyright (C) 2001-2013 NLTK Project
  5. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # Peter Ljunglöf <peter.ljunglof@gu.se>
  8. # Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms)
  9. # URL: <http://www.nltk.org/>
  10. # For license information, see LICENSE.TXT
  11. """
  12. Class for representing hierarchical language structures, such as
  13. syntax trees and morphological trees.
  14. """
  15. from __future__ import print_function, unicode_literals
  16. # TODO: add LabelledTree (can be used for dependency trees)
  17. import re
  18. from nltk.grammar import Production, Nonterminal
  19. from nltk.probability import ProbabilisticMixIn
  20. from nltk.util import slice_bounds
  21. from nltk.compat import string_types, python_2_unicode_compatible, unicode_repr
  22. from nltk.internals import raise_unorderable_types
  23. ######################################################################
  24. ## Trees
  25. ######################################################################
  26. @python_2_unicode_compatible
  27. class Tree(list):
  28. """
  29. A Tree represents a hierarchical grouping of leaves and subtrees.
  30. For example, each constituent in a syntax tree is represented by a single Tree.
  31. A tree's children are encoded as a list of leaves and subtrees,
  32. where a leaf is a basic (non-tree) value; and a subtree is a
  33. nested Tree.
  34. >>> from nltk.tree import Tree
  35. >>> print(Tree(1, [2, Tree(3, [4]), 5]))
  36. (1 2 (3 4) 5)
  37. >>> vp = Tree('VP', [Tree('V', ['saw']),
  38. ... Tree('NP', ['him'])])
  39. >>> s = Tree('S', [Tree('NP', ['I']), vp])
  40. >>> print(s)
  41. (S (NP I) (VP (V saw) (NP him)))
  42. >>> print(s[1])
  43. (VP (V saw) (NP him))
  44. >>> print(s[1,1])
  45. (NP him)
  46. >>> t = Tree("(S (NP I) (VP (V saw) (NP him)))")
  47. >>> s == t
  48. True
  49. >>> t[1][1].node = "X"
  50. >>> print(t)
  51. (S (NP I) (VP (V saw) (X him)))
  52. >>> t[0], t[1,1] = t[1,1], t[0]
  53. >>> print(t)
  54. (S (X him) (VP (V saw) (NP I)))
  55. The length of a tree is the number of children it has.
  56. >>> len(t)
  57. 2
  58. Any other properties that a Tree defines are known as node
  59. properties, and are used to add information about individual
  60. hierarchical groupings. For example, syntax trees use a NODE
  61. property to label syntactic constituents with phrase tags, such as
  62. "NP" and "VP".
  63. Several Tree methods use "tree positions" to specify
  64. children or descendants of a tree. Tree positions are defined as
  65. follows:
  66. - The tree position *i* specifies a Tree's *i*\ th child.
  67. - The tree position ``()`` specifies the Tree itself.
  68. - If *p* is the tree position of descendant *d*, then
  69. *p+i* specifies the *i*\ th child of *d*.
  70. I.e., every tree position is either a single index *i*,
  71. specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*,
  72. specifying ``tree[i1][i2]...[iN]``.
  73. Construct a new tree. This constructor can be called in one
  74. of two ways:
  75. - ``Tree(node, children)`` constructs a new tree with the
  76. specified node value and list of children.
  77. - ``Tree(s)`` constructs a new tree by parsing the string ``s``.
  78. It is equivalent to calling the class method ``Tree.parse(s)``.
  79. """
  80. def __init__(self, node_or_str, children=None):
  81. if children is None:
  82. if not isinstance(node_or_str, string_types):
  83. raise TypeError("%s: Expected a node value and child list "
  84. "or a single string" % type(self).__name__)
  85. tree = type(self).parse(node_or_str)
  86. list.__init__(self, tree)
  87. self.node = tree.node
  88. elif isinstance(children, string_types):
  89. raise TypeError("%s() argument 2 should be a list, not a "
  90. "string" % type(self).__name__)
  91. else:
  92. list.__init__(self, children)
  93. self.node = node_or_str
  94. #////////////////////////////////////////////////////////////
  95. # Comparison operators
  96. #////////////////////////////////////////////////////////////
  97. def __eq__(self, other):
  98. return (self.__class__ is other.__class__ and
  99. (self.node, list(self)) == (other.node, list(other)))
  100. def __lt__(self, other):
  101. if not isinstance(other, Tree):
  102. # raise_unorderable_types("<", self, other)
  103. # Sometimes children can be pure strings,
  104. # so we need to be able to compare with non-trees:
  105. return self.__class__.__name__ < other.__class__.__name__
  106. elif self.__class__ is other.__class__:
  107. return (self.node, list(self)) < (other.node, list(other))
  108. else:
  109. return self.__class__.__name__ < other.__class__.__name__
  110. # @total_ordering doesn't work here, since the class inherits from a builtin class
  111. __ne__ = lambda self, other: not self == other
  112. __gt__ = lambda self, other: not (self < other or self == other)
  113. __le__ = lambda self, other: self < other or self == other
  114. __ge__ = lambda self, other: not self < other
  115. #////////////////////////////////////////////////////////////
  116. # Disabled list operations
  117. #////////////////////////////////////////////////////////////
  118. def __mul__(self, v):
  119. raise TypeError('Tree does not support multiplication')
  120. def __rmul__(self, v):
  121. raise TypeError('Tree does not support multiplication')
  122. def __add__(self, v):
  123. raise TypeError('Tree does not support addition')
  124. def __radd__(self, v):
  125. raise TypeError('Tree does not support addition')
  126. #////////////////////////////////////////////////////////////
  127. # Indexing (with support for tree positions)
  128. #////////////////////////////////////////////////////////////
  129. def __getitem__(self, index):
  130. if isinstance(index, (int, slice)):
  131. return list.__getitem__(self, index)
  132. elif isinstance(index, (list, tuple)):
  133. if len(index) == 0:
  134. return self
  135. elif len(index) == 1:
  136. return self[index[0]]
  137. else:
  138. return self[index[0]][index[1:]]
  139. else:
  140. raise TypeError("%s indices must be integers, not %s" %
  141. (type(self).__name__, type(index).__name__))
  142. def __setitem__(self, index, value):
  143. if isinstance(index, (int, slice)):
  144. return list.__setitem__(self, index, value)
  145. elif isinstance(index, (list, tuple)):
  146. if len(index) == 0:
  147. raise IndexError('The tree position () may not be '
  148. 'assigned to.')
  149. elif len(index) == 1:
  150. self[index[0]] = value
  151. else:
  152. self[index[0]][index[1:]] = value
  153. else:
  154. raise TypeError("%s indices must be integers, not %s" %
  155. (type(self).__name__, type(index).__name__))
  156. def __delitem__(self, index):
  157. if isinstance(index, (int, slice)):
  158. return list.__delitem__(self, index)
  159. elif isinstance(index, (list, tuple)):
  160. if len(index) == 0:
  161. raise IndexError('The tree position () may not be deleted.')
  162. elif len(index) == 1:
  163. del self[index[0]]
  164. else:
  165. del self[index[0]][index[1:]]
  166. else:
  167. raise TypeError("%s indices must be integers, not %s" %
  168. (type(self).__name__, type(index).__name__))
  169. #////////////////////////////////////////////////////////////
  170. # Basic tree operations
  171. #////////////////////////////////////////////////////////////
  172. def leaves(self):
  173. """
  174. Return the leaves of the tree.
  175. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  176. >>> t.leaves()
  177. ['the', 'dog', 'chased', 'the', 'cat']
  178. :return: a list containing this tree's leaves.
  179. The order reflects the order of the
  180. leaves in the tree's hierarchical structure.
  181. :rtype: list
  182. """
  183. leaves = []
  184. for child in self:
  185. if isinstance(child, Tree):
  186. leaves.extend(child.leaves())
  187. else:
  188. leaves.append(child)
  189. return leaves
  190. def flatten(self):
  191. """
  192. Return a flat version of the tree, with all non-root non-terminals removed.
  193. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  194. >>> print(t.flatten())
  195. (S the dog chased the cat)
  196. :return: a tree consisting of this tree's root connected directly to
  197. its leaves, omitting all intervening non-terminal nodes.
  198. :rtype: Tree
  199. """
  200. return Tree(self.node, self.leaves())
  201. def height(self):
  202. """
  203. Return the height of the tree.
  204. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  205. >>> t.height()
  206. 5
  207. >>> print(t[0,0])
  208. (D the)
  209. >>> t[0,0].height()
  210. 2
  211. :return: The height of this tree. The height of a tree
  212. containing no children is 1; the height of a tree
  213. containing only leaves is 2; and the height of any other
  214. tree is one plus the maximum of its children's
  215. heights.
  216. :rtype: int
  217. """
  218. max_child_height = 0
  219. for child in self:
  220. if isinstance(child, Tree):
  221. max_child_height = max(max_child_height, child.height())
  222. else:
  223. max_child_height = max(max_child_height, 1)
  224. return 1 + max_child_height
  225. def treepositions(self, order='preorder'):
  226. """
  227. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  228. >>> t.treepositions() # doctest: +ELLIPSIS
  229. [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...]
  230. >>> for pos in t.treepositions('leaves'):
  231. ... t[pos] = t[pos][::-1].upper()
  232. >>> print(t)
  233. (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC))))
  234. :param order: One of: ``preorder``, ``postorder``, ``bothorder``,
  235. ``leaves``.
  236. """
  237. positions = []
  238. if order in ('preorder', 'bothorder'): positions.append( () )
  239. for i, child in enumerate(self):
  240. if isinstance(child, Tree):
  241. childpos = child.treepositions(order)
  242. positions.extend((i,)+p for p in childpos)
  243. else:
  244. positions.append( (i,) )
  245. if order in ('postorder', 'bothorder'): positions.append( () )
  246. return positions
  247. def subtrees(self, filter=None):
  248. """
  249. Generate all the subtrees of this tree, optionally restricted
  250. to trees matching the filter function.
  251. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  252. >>> for s in t.subtrees(lambda t: t.height() == 2):
  253. ... print(s)
  254. (D the)
  255. (N dog)
  256. (V chased)
  257. (D the)
  258. (N cat)
  259. :type filter: function
  260. :param filter: the function to filter all local trees
  261. """
  262. if not filter or filter(self):
  263. yield self
  264. for child in self:
  265. if isinstance(child, Tree):
  266. for subtree in child.subtrees(filter):
  267. yield subtree
  268. def productions(self):
  269. """
  270. Generate the productions that correspond to the non-terminal nodes of the tree.
  271. For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
  272. form P -> C1 C2 ... Cn.
  273. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  274. >>> t.productions()
  275. [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased',
  276. NP -> D N, D -> 'the', N -> 'cat']
  277. :rtype: list(Production)
  278. """
  279. if not isinstance(self.node, string_types):
  280. raise TypeError('Productions can only be generated from trees having node labels that are strings')
  281. prods = [Production(Nonterminal(self.node), _child_names(self))]
  282. for child in self:
  283. if isinstance(child, Tree):
  284. prods += child.productions()
  285. return prods
  286. def pos(self):
  287. """
  288. Return a sequence of pos-tagged words extracted from the tree.
  289. >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
  290. >>> t.pos()
  291. [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]
  292. :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
  293. The order reflects the order of the leaves in the tree's hierarchical structure.
  294. :rtype: list(tuple)
  295. """
  296. pos = []
  297. for child in self:
  298. if isinstance(child, Tree):
  299. pos.extend(child.pos())
  300. else:
  301. pos.append((child, self.node))
  302. return pos
  303. def leaf_treeposition(self, index):
  304. """
  305. :return: The tree position of the ``index``-th leaf in this
  306. tree. I.e., if ``tp=self.leaf_treeposition(i)``, then
  307. ``self[tp]==self.leaves()[i]``.
  308. :raise IndexError: If this tree contains fewer than ``index+1``
  309. leaves, or if ``index<0``.
  310. """
  311. if index < 0: raise IndexError('index must be non-negative')
  312. stack = [(self, ())]
  313. while stack:
  314. value, treepos = stack.pop()
  315. if not isinstance(value, Tree):
  316. if index == 0: return treepos
  317. else: index -= 1
  318. else:
  319. for i in range(len(value)-1, -1, -1):
  320. stack.append( (value[i], treepos+(i,)) )
  321. raise IndexError('index must be less than or equal to len(self)')
  322. def treeposition_spanning_leaves(self, start, end):
  323. """
  324. :return: The tree position of the lowest descendant of this
  325. tree that dominates ``self.leaves()[start:end]``.
  326. :raise ValueError: if ``end <= start``
  327. """
  328. if end <= start:
  329. raise ValueError('end must be greater than start')
  330. # Find the tree positions of the start & end leaves, and
  331. # take the longest common subsequence.
  332. start_treepos = self.leaf_treeposition(start)
  333. end_treepos = self.leaf_treeposition(end-1)
  334. # Find the first index where they mismatch:
  335. for i in range(len(start_treepos)):
  336. if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
  337. return start_treepos[:i]
  338. return start_treepos
  339. #////////////////////////////////////////////////////////////
  340. # Transforms
  341. #////////////////////////////////////////////////////////////
  342. def chomsky_normal_form(self, factor = "right", horzMarkov = None, vertMarkov = 0, childChar = "|", parentChar = "^"):
  343. """
  344. This method can modify a tree in three ways:
  345. 1. Convert a tree into its Chomsky Normal Form (CNF)
  346. equivalent -- Every subtree has either two non-terminals
  347. or one terminal as its children. This process requires
  348. the creation of more"artificial" non-terminal nodes.
  349. 2. Markov (vertical) smoothing of children in new artificial
  350. nodes
  351. 3. Horizontal (parent) annotation of nodes
  352. :param factor: Right or left factoring method (default = "right")
  353. :type factor: str = [left|right]
  354. :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings)
  355. :type horzMarkov: int | None
  356. :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation)
  357. :type vertMarkov: int | None
  358. :param childChar: A string used in construction of the artificial nodes, separating the head of the
  359. original subtree from the child nodes that have yet to be expanded (default = "|")
  360. :type childChar: str
  361. :param parentChar: A string used to separate the node representation from its vertical annotation
  362. :type parentChar: str
  363. """
  364. from .treetransforms import chomsky_normal_form
  365. chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
  366. def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
  367. """
  368. This method modifies the tree in three ways:
  369. 1. Transforms a tree in Chomsky Normal Form back to its
  370. original structure (branching greater than two)
  371. 2. Removes any parent annotation (if it exists)
  372. 3. (optional) expands unary subtrees (if previously
  373. collapsed with collapseUnary(...) )
  374. :param expandUnary: Flag to expand unary or not (default = True)
  375. :type expandUnary: bool
  376. :param childChar: A string separating the head node from its children in an artificial node (default = "|")
  377. :type childChar: str
  378. :param parentChar: A sting separating the node label from its parent annotation (default = "^")
  379. :type parentChar: str
  380. :param unaryChar: A string joining two non-terminals in a unary production (default = "+")
  381. :type unaryChar: str
  382. """
  383. from .treetransforms import un_chomsky_normal_form
  384. un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
  385. def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
  386. """
  387. Collapse subtrees with a single child (ie. unary productions)
  388. into a new non-terminal (Tree node) joined by 'joinChar'.
  389. This is useful when working with algorithms that do not allow
  390. unary productions, and completely removing the unary productions
  391. would require loss of useful information. The Tree is modified
  392. directly (since it is passed by reference) and no value is returned.
  393. :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
  394. Part-of-Speech tags) since they are always unary productions
  395. :type collapsePOS: bool
  396. :param collapseRoot: 'False' (default) will not modify the root production
  397. if it is unary. For the Penn WSJ treebank corpus, this corresponds
  398. to the TOP -> productions.
  399. :type collapseRoot: bool
  400. :param joinChar: A string used to connect collapsed node values (default = "+")
  401. :type joinChar: str
  402. """
  403. from .treetransforms import collapse_unary
  404. collapse_unary(self, collapsePOS, collapseRoot, joinChar)
  405. #////////////////////////////////////////////////////////////
  406. # Convert, copy
  407. #////////////////////////////////////////////////////////////
  408. @classmethod
  409. def convert(cls, tree):
  410. """
  411. Convert a tree between different subtypes of Tree. ``cls`` determines
  412. which class will be used to encode the new tree.
  413. :type tree: Tree
  414. :param tree: The tree that should be converted.
  415. :return: The new Tree.
  416. """
  417. if isinstance(tree, Tree):
  418. children = [cls.convert(child) for child in tree]
  419. return cls(tree.node, children)
  420. else:
  421. return tree
  422. def copy(self, deep=False):
  423. if not deep: return type(self)(self.node, self)
  424. else: return type(self).convert(self)
  425. def _frozen_class(self): return ImmutableTree
  426. def freeze(self, leaf_freezer=None):
  427. frozen_class = self._frozen_class()
  428. if leaf_freezer is None:
  429. newcopy = frozen_class.convert(self)
  430. else:
  431. newcopy = self.copy(deep=True)
  432. for pos in newcopy.treepositions('leaves'):
  433. newcopy[pos] = leaf_freezer(newcopy[pos])
  434. newcopy = frozen_class.convert(newcopy)
  435. hash(newcopy) # Make sure the leaves are hashable.
  436. return newcopy
  437. #////////////////////////////////////////////////////////////
  438. # Parsing
  439. #////////////////////////////////////////////////////////////
  440. @classmethod
  441. def parse(cls, s, brackets='()', parse_node=None, parse_leaf=None,
  442. node_pattern=None, leaf_pattern=None,
  443. remove_empty_top_bracketing=False):
  444. """
  445. Parse a bracketed tree string and return the resulting tree.
  446. Trees are represented as nested brackettings, such as::
  447. (S (NP (NNP John)) (VP (V runs)))
  448. :type s: str
  449. :param s: The string to parse
  450. :type brackets: str (length=2)
  451. :param brackets: The bracket characters used to mark the
  452. beginning and end of trees and subtrees.
  453. :type parse_node: function
  454. :type parse_leaf: function
  455. :param parse_node, parse_leaf: If specified, these functions
  456. are applied to the substrings of ``s`` corresponding to
  457. nodes and leaves (respectively) to obtain the values for
  458. those nodes and leaves. They should have the following
  459. signature:
  460. parse_node(str) -> value
  461. For example, these functions could be used to parse nodes
  462. and leaves whose values should be some type other than
  463. string (such as ``FeatStruct``).
  464. Note that by default, node strings and leaf strings are
  465. delimited by whitespace and brackets; to override this
  466. default, use the ``node_pattern`` and ``leaf_pattern``
  467. arguments.
  468. :type node_pattern: str
  469. :type leaf_pattern: str
  470. :param node_pattern, leaf_pattern: Regular expression patterns
  471. used to find node and leaf substrings in ``s``. By
  472. default, both nodes patterns are defined to match any
  473. sequence of non-whitespace non-bracket characters.
  474. :type remove_empty_top_bracketing: bool
  475. :param remove_empty_top_bracketing: If the resulting tree has
  476. an empty node label, and is length one, then return its
  477. single child instead. This is useful for treebank trees,
  478. which sometimes contain an extra level of bracketing.
  479. :return: A tree corresponding to the string representation ``s``.
  480. If this class method is called using a subclass of Tree,
  481. then it will return a tree of that type.
  482. :rtype: Tree
  483. """
  484. if not isinstance(brackets, string_types) or len(brackets) != 2:
  485. raise TypeError('brackets must be a length-2 string')
  486. if re.search('\s', brackets):
  487. raise TypeError('whitespace brackets not allowed')
  488. # Construct a regexp that will tokenize the string.
  489. open_b, close_b = brackets
  490. open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
  491. if node_pattern is None:
  492. node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
  493. if leaf_pattern is None:
  494. leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
  495. token_re = re.compile('%s\s*(%s)?|%s|(%s)' % (
  496. open_pattern, node_pattern, close_pattern, leaf_pattern))
  497. # Walk through each token, updating a stack of trees.
  498. stack = [(None, [])] # list of (node, children) tuples
  499. for match in token_re.finditer(s):
  500. token = match.group()
  501. # Beginning of a tree/subtree
  502. if token[0] == open_b:
  503. if len(stack) == 1 and len(stack[0][1]) > 0:
  504. cls._parse_error(s, match, 'end-of-string')
  505. node = token[1:].lstrip()
  506. if parse_node is not None: node = parse_node(node)
  507. stack.append((node, []))
  508. # End of a tree/subtree
  509. elif token == close_b:
  510. if len(stack) == 1:
  511. if len(stack[0][1]) == 0:
  512. cls._parse_error(s, match, open_b)
  513. else:
  514. cls._parse_error(s, match, 'end-of-string')
  515. node, children = stack.pop()
  516. stack[-1][1].append(cls(node, children))
  517. # Leaf node
  518. else:
  519. if len(stack) == 1:
  520. cls._parse_error(s, match, open_b)
  521. if parse_leaf is not None: token = parse_leaf(token)
  522. stack[-1][1].append(token)
  523. # check that we got exactly one complete tree.
  524. if len(stack) > 1:
  525. cls._parse_error(s, 'end-of-string', close_b)
  526. elif len(stack[0][1]) == 0:
  527. cls._parse_error(s, 'end-of-string', open_b)
  528. else:
  529. assert stack[0][0] is None
  530. assert len(stack[0][1]) == 1
  531. tree = stack[0][1][0]
  532. # If the tree has an extra level with node='', then get rid of
  533. # it. E.g.: "((S (NP ...) (VP ...)))"
  534. if remove_empty_top_bracketing and tree.node == '' and len(tree) == 1:
  535. tree = tree[0]
  536. # return the tree.
  537. return tree
  538. @classmethod
  539. def _parse_error(cls, s, match, expecting):
  540. """
  541. Display a friendly error message when parsing a tree string fails.
  542. :param s: The string we're parsing.
  543. :param match: regexp match of the problem token.
  544. :param expecting: what we expected to see instead.
  545. """
  546. # Construct a basic error message
  547. if match == 'end-of-string':
  548. pos, token = len(s), 'end-of-string'
  549. else:
  550. pos, token = match.start(), match.group()
  551. msg = '%s.parse(): expected %r but got %r\n%sat index %d.' % (
  552. cls.__name__, expecting, token, ' '*12, pos)
  553. # Add a display showing the error token itsels:
  554. s = s.replace('\n', ' ').replace('\t', ' ')
  555. offset = pos
  556. if len(s) > pos+10:
  557. s = s[:pos+10]+'...'
  558. if pos > 10:
  559. s = '...'+s[pos-10:]
  560. offset = 13
  561. msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset))
  562. raise ValueError(msg)
  563. #////////////////////////////////////////////////////////////
  564. # Visualization & String Representation
  565. #////////////////////////////////////////////////////////////
  566. def draw(self):
  567. """
  568. Open a new window containing a graphical diagram of this tree.
  569. """
  570. from nltk.draw.tree import draw_trees
  571. draw_trees(self)
  572. def __repr__(self):
  573. childstr = ", ".join(unicode_repr(c) for c in self)
  574. return '%s(%s, [%s])' % (type(self).__name__, unicode_repr(self.node), childstr)
  575. def __str__(self):
  576. return self.pprint()
  577. def pprint(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
  578. """
  579. :return: A pretty-printed string representation of this tree.
  580. :rtype: str
  581. :param margin: The right margin at which to do line-wrapping.
  582. :type margin: int
  583. :param indent: The indentation level at which printing
  584. begins. This number is used to decide how far to indent
  585. subsequent lines.
  586. :type indent: int
  587. :param nodesep: A string that is used to separate the node
  588. from the children. E.g., the default value ``':'`` gives
  589. trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``.
  590. """
  591. # Try writing it on one line.
  592. s = self._pprint_flat(nodesep, parens, quotes)
  593. if len(s)+indent < margin:
  594. return s
  595. # If it doesn't fit on one line, then write it on multi-lines.
  596. if isinstance(self.node, string_types):
  597. s = '%s%s%s' % (parens[0], self.node, nodesep)
  598. else:
  599. s = '%s%s%s' % (parens[0], unicode_repr(self.node), nodesep)
  600. for child in self:
  601. if isinstance(child, Tree):
  602. s += '\n'+' '*(indent+2)+child.pprint(margin, indent+2,
  603. nodesep, parens, quotes)
  604. elif isinstance(child, tuple):
  605. s += '\n'+' '*(indent+2)+ "/".join(child)
  606. elif isinstance(child, string_types) and not quotes:
  607. s += '\n'+' '*(indent+2)+ '%s' % child
  608. else:
  609. s += '\n'+' '*(indent+2)+ unicode_repr(child)
  610. return s+parens[1]
  611. def pprint_latex_qtree(self):
  612. r"""
  613. Returns a representation of the tree compatible with the
  614. LaTeX qtree package. This consists of the string ``\Tree``
  615. followed by the parse tree represented in bracketed notation.
  616. For example, the following result was generated from a parse tree of
  617. the sentence ``The announcement astounded us``::
  618. \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
  619. [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
  620. See http://www.ling.upenn.edu/advice/latex.html for the LaTeX
  621. style file for the qtree package.
  622. :return: A latex qtree representation of this tree.
  623. :rtype: str
  624. """
  625. reserved_chars = re.compile('([#\$%&~_\{\}])')
  626. pprint = self.pprint(indent=6, nodesep='', parens=('[.', ' ]'))
  627. return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pprint)
  628. def _pprint_flat(self, nodesep, parens, quotes):
  629. childstrs = []
  630. for child in self:
  631. if isinstance(child, Tree):
  632. childstrs.append(child._pprint_flat(nodesep, parens, quotes))
  633. elif isinstance(child, tuple):
  634. childstrs.append("/".join(child))
  635. elif isinstance(child, string_types) and not quotes:
  636. childstrs.append('%s' % child)
  637. else:
  638. childstrs.append(unicode_repr(child))
  639. if isinstance(self.node, string_types):
  640. return '%s%s%s %s%s' % (parens[0], self.node, nodesep,
  641. " ".join(childstrs), parens[1])
  642. else:
  643. return '%s%s%s %s%s' % (parens[0], unicode_repr(self.node), nodesep,
  644. " ".join(childstrs), parens[1])
  645. class ImmutableTree(Tree):
  646. def __init__(self, node_or_str, children=None):
  647. super(ImmutableTree, self).__init__(node_or_str, children)
  648. # Precompute our hash value. This ensures that we're really
  649. # immutable. It also means we only have to calculate it once.
  650. try:
  651. self._hash = hash((self.node, tuple(self)))
  652. except (TypeError, ValueError):
  653. raise ValueError("%s: node value and children "
  654. "must be immutable" % type(self).__name__)
  655. def __setitem__(self, index, value):
  656. raise ValueError('%s may not be modified' % type(self).__name__)
  657. def __setslice__(self, i, j, value):
  658. raise ValueError('%s may not be modified' % type(self).__name__)
  659. def __delitem__(self, index):
  660. raise ValueError('%s may not be modified' % type(self).__name__)
  661. def __delslice__(self, i, j):
  662. raise ValueError('%s may not be modified' % type(self).__name__)
  663. def __iadd__(self, other):
  664. raise ValueError('%s may not be modified' % type(self).__name__)
  665. def __imul__(self, other):
  666. raise ValueError('%s may not be modified' % type(self).__name__)
  667. def append(self, v):
  668. raise ValueError('%s may not be modified' % type(self).__name__)
  669. def extend(self, v):
  670. raise ValueError('%s may not be modified' % type(self).__name__)
  671. def pop(self, v=None):
  672. raise ValueError('%s may not be modified' % type(self).__name__)
  673. def remove(self, v):
  674. raise ValueError('%s may not be modified' % type(self).__name__)
  675. def reverse(self):
  676. raise ValueError('%s may not be modified' % type(self).__name__)
  677. def sort(self):
  678. raise ValueError('%s may not be modified' % type(self).__name__)
  679. def __hash__(self):
  680. return self._hash
  681. def _get_node(self):
  682. """Get the node value"""
  683. return self._node
  684. def _set_node(self, value):
  685. """
  686. Set the node value. This will only succeed the first time the
  687. node value is set, which should occur in ImmutableTree.__init__().
  688. """
  689. if hasattr(self, 'node'):
  690. raise ValueError('%s may not be modified' % type(self).__name__)
  691. self._node = value
  692. node = property(_get_node, _set_node)
  693. ######################################################################
  694. ## Parented trees
  695. ######################################################################
  class AbstractParentedTree(Tree):
      """
      An abstract base class for a ``Tree`` that automatically maintains
      pointers to parent nodes.  These parent pointers are updated
      whenever any change is made to a tree's structure.  Two subclasses
      are currently defined:

        - ``ParentedTree`` is used for tree structures where each subtree
          has at most one parent.  This class should be used in cases
          where there is no "sharing" of subtrees.

        - ``MultiParentedTree`` is used for tree structures where a
          subtree may have zero or more parents.  This class should be
          used in cases where subtrees may be shared.

      Subclassing
      ===========
      The ``AbstractParentedTree`` class redefines all operations that
      modify a tree's structure to call two methods, which are used by
      subclasses to update parent information:

        - ``_setparent()`` is called whenever a new child is added.
        - ``_delparent()`` is called whenever a child is removed.
      """

      def __init__(self, node_or_str, children=None):
          """
          Build the tree and register ``self`` as parent of its children.

          :param node_or_str: Passed through to ``Tree.__init__``.
          :param children: The children of the new tree, or None if the
              tree is to be parsed from ``node_or_str``.
          :raise TypeError: Propagated from ``_setparent()`` if a child
              tree has an inappropriate type.
          """
          super(AbstractParentedTree, self).__init__(node_or_str, children)
          # If children is None, the tree is parsed from node_or_str, and
          # all parents will be set during parsing.
          if children is not None:
              # Otherwise we have to set the parent of the children.
              # Iterate over self, and *not* children, because children
              # might be an iterator.
              # First pass: only check for errors (dry run).
              for i, child in enumerate(self):
                  if isinstance(child, Tree):
                      self._setparent(child, i, dry_run=True)
              # Second pass: actually set the parent pointers.
              for i, child in enumerate(self):
                  if isinstance(child, Tree):
                      self._setparent(child, i)

      #////////////////////////////////////////////////////////////
      # Parent management
      #////////////////////////////////////////////////////////////

      def _setparent(self, child, index, dry_run=False):
          """
          Update the parent pointer of ``child`` to point to ``self``.  This
          method is only called if the type of ``child`` is ``Tree``;
          i.e., it is not called when adding a leaf to a tree.  This method
          is always called before the child is actually added to the
          child list of ``self``.

          :type child: Tree
          :type index: int
          :param index: The index of ``child`` in ``self``.
          :raise TypeError: If ``child`` is a tree with an inappropriate
              type.  Typically, if ``child`` is a tree, then its type needs
              to match the type of ``self``.  This prevents mixing of
              different tree types (single-parented, multi-parented, and
              non-parented).
          :param dry_run: If true, then don't actually set the child's
              parent pointer; just check for any error conditions, and
              raise an exception if one is found.
          """
          raise NotImplementedError()

      def _delparent(self, child, index):
          """
          Update the parent pointer of ``child`` to not point to self.  This
          method is only called if the type of ``child`` is ``Tree``; i.e., it
          is not called when removing a leaf from a tree.  This method
          is always called before the child is actually removed from the
          child list of ``self``.

          :type child: Tree
          :type index: int
          :param index: The index of ``child`` in ``self``.
          """
          raise NotImplementedError()

      #////////////////////////////////////////////////////////////
      # Methods that add/remove children
      #////////////////////////////////////////////////////////////
      # Every method that adds or removes a child must make
      # appropriate calls to _setparent() and _delparent().

      def __delitem__(self, index):
          """
          Delete the child(ren) selected by ``index``, clearing the parent
          pointer of every removed subtree first.

          :param index: A slice, an integer, or a tree position given as
              a list or tuple of integers.
          :raise IndexError: If an integer index is out of range, or if
              ``index`` is the empty tree position ``()``.
          :raise TypeError: If ``index`` is of any other type.
          """
          # del ptree[start:stop]
          if isinstance(index, slice):
              start, stop, step = slice_bounds(self, index, allow_step=True)
              # Clear all the children pointers.
              for i in range(start, stop, step):
                  if isinstance(self[i], Tree):
                      self._delparent(self[i], i)
              # Delete the children from our child list.
              super(AbstractParentedTree, self).__delitem__(index)

          # del ptree[i]
          elif isinstance(index, int):
              if index < 0: index += len(self)
              if index < 0: raise IndexError('index out of range')
              # Clear the child's parent pointer.
              if isinstance(self[index], Tree):
                  self._delparent(self[index], index)
              # Remove the child from our child list.
              super(AbstractParentedTree, self).__delitem__(index)

          elif isinstance(index, (list, tuple)):
              # del ptree[()]
              if len(index) == 0:
                  raise IndexError('The tree position () may not be deleted.')
              # del ptree[(i,)]
              elif len(index) == 1:
                  del self[index[0]]
              # del ptree[i1, i2, i3]
              else:
                  del self[index[0]][index[1:]]

          else:
              raise TypeError("%s indices must be integers, not %s" %
                              (type(self).__name__, type(index).__name__))

      def __setitem__(self, index, value):
          """
          Replace the child(ren) selected by ``index`` with ``value``,
          updating the parent pointers of both the removed and the newly
          added subtrees.

          :param index: A slice, an integer, or a tree position given as
              a list or tuple of integers.
          :param value: The new child, or (for a slice) an iterable of
              new children.
          :raise IndexError: If an integer index is out of range, or if
              ``index`` is the empty tree position ``()``.
          :raise TypeError: If ``index`` is of any other type.
          """
          # ptree[start:stop] = value
          if isinstance(index, slice):
              start, stop, step = slice_bounds(self, index, allow_step=True)
              # make a copy of value, in case it's an iterator
              if not isinstance(value, (list, tuple)):
                  value = list(value)
              # Check for any error conditions, so we can avoid ending
              # up in an inconsistent state if an error does occur.
              for i, child in enumerate(value):
                  if isinstance(child, Tree):
                      self._setparent(child, start + i*step, dry_run=True)
              # clear the child pointers of all parents we're removing
              for i in range(start, stop, step):
                  if isinstance(self[i], Tree):
                      self._delparent(self[i], i)
              # set the child pointers of the new children.  We do this
              # after clearing *all* child pointers, in case we're e.g.
              # reversing the elements in a tree.
              for i, child in enumerate(value):
                  if isinstance(child, Tree):
                      self._setparent(child, start + i*step)
              # finally, update the content of the child list itself.
              super(AbstractParentedTree, self).__setitem__(index, value)

          # ptree[i] = value
          elif isinstance(index, int):
              if index < 0: index += len(self)
              if index < 0: raise IndexError('index out of range')
              # if the value is not changing, do nothing.
              if value is self[index]:
                  return
              # Set the new child's parent pointer.
              if isinstance(value, Tree):
                  self._setparent(value, index)
              # Remove the old child's parent pointer
              if isinstance(self[index], Tree):
                  self._delparent(self[index], index)
              # Update our child list.
              super(AbstractParentedTree, self).__setitem__(index, value)

          elif isinstance(index, (list, tuple)):
              # ptree[()] = value
              if len(index) == 0:
                  raise IndexError('The tree position () may not be assigned to.')
              # ptree[(i,)] = value
              elif len(index) == 1:
                  self[index[0]] = value
              # ptree[i1, i2, i3] = value
              else:
                  self[index[0]][index[1:]] = value

          else:
              raise TypeError("%s indices must be integers, not %s" %
                              (type(self).__name__, type(index).__name__))

      def append(self, child):
          """Add ``child`` at the end, setting its parent pointer first
          if it is a ``Tree``."""
          if isinstance(child, Tree):
              self._setparent(child, len(self))
          super(AbstractParentedTree, self).append(child)

      def extend(self, children):
          """
          Add every item of ``children`` at the end of the child list.

          All children are checked with a dry run before any of them is
          added, so a child that ``_setparent()`` rejects in the dry run
          leaves this tree unmodified.

          :param children: An iterable of leaves and/or subtrees.
          """
          # Take a snapshot: ``children`` may be a one-shot iterator, or
          # even ``self`` (which would otherwise grow while iterated).
          children = list(children)
          # Check for any error conditions first, so we can avoid ending
          # up in a half-extended state if an error does occur.
          first_index = len(self)
          for offset, child in enumerate(children):
              if isinstance(child, Tree):
                  self._setparent(child, first_index + offset, dry_run=True)
          # Now set the parent pointers and grow the child list.
          for child in children:
              if isinstance(child, Tree):
                  self._setparent(child, len(self))
              super(AbstractParentedTree, self).append(child)

      def insert(self, index, child):
          """Insert ``child`` before position ``index``, setting its
          parent pointer first if it is a ``Tree``."""
          # Handle negative indexes.  Note that if index < -len(self),
          # we do *not* raise an IndexError, unlike __getitem__.  This
          # is done for consistency with list.insert.
          if index < 0: index += len(self)
          if index < 0: index = 0
          # Set the child's parent, and update our child list.
          if isinstance(child, Tree):
              self._setparent(child, index)
          super(AbstractParentedTree, self).insert(index, child)

      def pop(self, index=-1):
          """
          Remove and return the child at ``index`` (default: the last
          one), clearing its parent pointer first if it is a ``Tree``.

          :raise IndexError: If ``index`` is out of range.
          """
          if index < 0: index += len(self)
          if index < 0: raise IndexError('index out of range')
          if isinstance(self[index], Tree):
              self._delparent(self[index], index)
          return super(AbstractParentedTree, self).pop(index)

      # n.b.: like `list`, this is done by equality, not identity!
      # To remove a specific child, use del ptree[i].
      def remove(self, child):
          """Remove the first child equal to ``child``, clearing its
          parent pointer first if it is a ``Tree``."""
          index = self.index(child)
          if isinstance(self[index], Tree):
              self._delparent(self[index], index)
          super(AbstractParentedTree, self).remove(child)

      # We need to implement __getslice__ and friends, even though
      # they're deprecated, because otherwise list.__getslice__ will get
      # called (since we're subclassing from list).  Just delegate to
      # __getitem__ etc., but use max(0, start) and max(0, stop)
      # because negative indices are already handled *before*
      # __getslice__ is called; and we don't want to double-count them.
      if hasattr(list, '__getslice__'):
          def __getslice__(self, start, stop):
              return self.__getitem__(slice(max(0, start), max(0, stop)))
          def __delslice__(self, start, stop):
              return self.__delitem__(slice(max(0, start), max(0, stop)))
          def __setslice__(self, start, stop, value):
              return self.__setitem__(slice(max(0, start), max(0, stop)), value)
  class ParentedTree(AbstractParentedTree):
      """
      A ``Tree`` that keeps a pointer to its parent, for trees in which
      every subtree has a single parent.  The structure of a parented
      tree can be queried with: ``parent``, ``parent_index``,
      ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``.

      A ``ParentedTree`` has at most one parent, so subtrees can not be
      shared.  Trying to use one ``ParentedTree`` as a child of more
      than one parent (or as more than one child of the same parent)
      raises a ``ValueError``.

      ``ParentedTrees`` should never be used in the same tree as ``Trees``
      or ``MultiParentedTrees``.  Mixing tree implementations may result
      in incorrect parent pointers and in ``TypeError`` exceptions.
      """

      def __init__(self, node_or_str, children=None):
          # The parent of this Tree, or None if it has no parent.
          self._parent = None
          super(ParentedTree, self).__init__(node_or_str, children)
          if children is not None:
              return
          # children is None, so the tree was parsed from node_or_str.
          # After parsing, the immediate children point to an
          # intermediate tree rather than to self; reset each of them
          # by brute force.
          for position, subtree in enumerate(self):
              if not isinstance(subtree, Tree):
                  continue
              subtree._parent = None
              self._setparent(subtree, position)

      def _frozen_class(self):
          return ImmutableParentedTree

      #/////////////////////////////////////////////////////////////////
      # Methods
      #/////////////////////////////////////////////////////////////////

      def parent(self):
          """Return this tree's parent, or None if it has no parent."""
          return self._parent

      def parent_index(self):
          """
          Return the position of this tree in its parent's child list,
          or None if it has no parent.  I.e.,
          ``ptree.parent()[ptree.parent_index()] is ptree``.  This is
          found by identity, so it is not necessarily equal to
          ``ptree.parent().index(ptree)``: ``index()`` returns the
          first child that is *equal* to its argument.
          """
          owner = self._parent
          if owner is None:
              return None
          for position, sibling in enumerate(owner):
              if sibling is self:
                  return position
          assert False, 'expected to find self in self._parent!'

      def left_sibling(self):
          """Return the child just before this tree in its parent, or
          None if there is none."""
          position = self.parent_index()
          has_left = self._parent and position > 0
          if not has_left:
              return None  # no left sibling
          return self._parent[position - 1]

      def right_sibling(self):
          """Return the child just after this tree in its parent, or
          None if there is none."""
          position = self.parent_index()
          has_right = self._parent and position < (len(self._parent) - 1)
          if not has_right:
              return None  # no right sibling
          return self._parent[position + 1]

      def root(self):
          """
          Return the root of this tree, i.e. the unique ancestor whose
          parent is None.  A tree without a parent is its own root.
          """
          current = self
          above = current.parent()
          while above is not None:
              current = above
              above = current.parent()
          return current

      def treeposition(self):
          """
          Return the tree position of this tree relative to the root of
          the tree.  I.e., ``ptree.root()[ptree.treeposition()] is ptree``.
          """
          owner = self.parent()
          if owner is None:
              return ()
          return owner.treeposition() + (self.parent_index(),)

      #/////////////////////////////////////////////////////////////////
      # Parent Management
      #/////////////////////////////////////////////////////////////////

      def _delparent(self, child, index):
          """Clear the parent pointer of ``child``, which must currently
          be the child of ``self`` at position ``index``."""
          # Sanity checks
          assert isinstance(child, ParentedTree)
          assert self[index] is child
          assert child._parent is self

          # Delete child's parent pointer.
          child._parent = None

      def _setparent(self, child, index, dry_run=False):
          """
          Make ``self`` the parent of ``child``; with ``dry_run`` set,
          only perform the checks.

          :raise TypeError: If ``child`` is not a ``ParentedTree``.
          :raise ValueError: If ``child`` already has a parent.
          """
          # If the child's type is incorrect, then complain.
          if not isinstance(child, ParentedTree):
              raise TypeError('Can not insert a non-ParentedTree '
                              'into a ParentedTree')

          # If child already has a parent, then complain.
          if child._parent is not None:
              raise ValueError('Can not insert a subtree that already '
                               'has a parent.')

          # A dry run stops after the checks.
          if dry_run:
              return
          child._parent = self
  999. class MultiParentedTree(AbstractParentedTree):
  1000. """
  1001. A ``Tree`` that automatically maintains parent pointers for
  1002. multi-parented trees. The following are methods for querying the
  1003. structure of a multi-parented tree: ``parents()``, ``parent_indices()``,
  1004. ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``.
  1005. Each ``MultiParentedTree`` may have zero or more parents. In
  1006. particular, subtrees may be shared. If a single
  1007. ``MultiParentedTree`` is used as multiple children of the same
  1008. parent, then that parent will appear multiple times in its
  1009. ``parents()`` method.
  1010. ``MultiParentedTrees`` should never be used in the same tree as
  1011. ``Trees`` or ``ParentedTrees``. Mixing tree implementations may
  1012. result in incorrect parent pointers and in ``TypeError`` exceptions.
  1013. """
  def __init__(self, node_or_str, children=None):
      # A list of this tree's parents.  This list should not contain
      # duplicates, even if a parent contains this tree multiple times.
      self._parents = []
      super(MultiParentedTree, self).__init__(node_or_str, children)
      if children is not None:
          return
      # children is None, so the tree was parsed from node_or_str.
      # After parsing, the immediate children list an intermediate
      # tree as their parent rather than self; reset each of them by
      # brute force.
      for position, subtree in enumerate(self):
          if not isinstance(subtree, Tree):
              continue
          subtree._parents = []
          self._setparent(subtree, position)
  def _frozen_class(self):
      """Return the ``ImmutableMultiParentedTree`` class (presumably the
      type used for a frozen copy of this tree -- confirm with caller)."""
      return ImmutableMultiParentedTree
  1030. #/////////////////////////////////////////////////////////////////
  1031. # Methods
  1032. #/////////////////////////////////////////////////////////////////
  def parents(self):
      """
      Return the parents of this tree as a new list, so that changes
      to the returned list do not affect the tree.  If this tree has
      no parents, the list is empty.  To check if a tree is used as
      multiple children of the same parent, use the
      ``parent_indices()`` method.

      :rtype: list(MultiParentedTree)
      """
      snapshot = list(self._parents)
      return snapshot
  1042. def left_siblings(self):
  1043. """
  1044. A list of all left siblings of this tree, in any of its parent
  1045. trees. A tree ma

Large files files are truncated, but you can click here to view the full file