PageRenderTime 43ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/corpus/reader/bracket_parse.py

https://github.com/BrucePHill/nltk
Python | 182 lines | 125 code | 14 blank | 43 comment | 15 complexity | a7b9dffb05c15cf3e749c00a7ed43ab3 MD5 | raw file
Possible License(s): Apache-2.0
  1. # Natural Language Toolkit: Penn Treebank Reader
  2. #
  3. # Copyright (C) 2001-2013 NLTK Project
  4. # Author: Steven Bird <sb@ldc.upenn.edu>
  5. # Edward Loper <edloper@gradient.cis.upenn.edu>
  6. # URL: <http://www.nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Corpus reader for corpora that consist of parenthesis-delineated parse trees.
  10. """
  11. import sys
  12. from nltk.tree import Tree
  13. from .util import *
  14. from .api import *
  15. # we use [^\s()]+ instead of \S+? to avoid matching ()
  16. TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')  # leaf "(tag word)": group 1 is the tag, group 2 the word
  17. WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')  # leaf "(tag word)": the single group is the word
  18. EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')  # optional whitespace, "(", optional whitespace, "("
  19. class BracketParseCorpusReader(SyntaxCorpusReader):
  20. """
  21. Reader for corpora that consist of parenthesis-delineated parse
  22. trees.
  23. """
  24. def __init__(self, root, fileids, comment_char=None,
  25. detect_blocks='unindented_paren', encoding='utf8',
  26. tag_mapping_function=None):
  27. """
  28. :param root: The root directory for this corpus.
  29. :param fileids: A list or regexp specifying the fileids in this corpus.
  30. :param comment_char: The character which can appear at the start of
  31. a line to indicate that the rest of the line is a comment.
  32. :param detect_blocks: The method that is used to find blocks
  33. in the corpus; can be 'unindented_paren' (every unindented
  34. parenthesis starts a new parse) or 'sexpr' (brackets are
  35. matched).
  36. """
  37. CorpusReader.__init__(self, root, fileids, encoding)
  38. self._comment_char = comment_char
  39. self._detect_blocks = detect_blocks
  40. self._tag_mapping_function = tag_mapping_function
  41. def _read_block(self, stream):
  42. if self._detect_blocks == 'sexpr':
  43. return read_sexpr_block(stream, comment_char=self._comment_char)
  44. elif self._detect_blocks == 'blankline':
  45. return read_blankline_block(stream)
  46. elif self._detect_blocks == 'unindented_paren':
  47. # Tokens start with unindented left parens.
  48. toks = read_regexp_block(stream, start_re=r'^\(')
  49. # Strip any comments out of the tokens.
  50. if self._comment_char:
  51. toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char),
  52. '', tok)
  53. for tok in toks]
  54. return toks
  55. else:
  56. assert 0, 'bad block type'
  57. def _normalize(self, t):
  58. # If there's an empty set of brackets surrounding the actual
  59. # parse, then strip them off.
  60. if EMPTY_BRACKETS.match(t):
  61. t = t.strip()[1:-1]
  62. # Replace leaves of the form (!), (,), with (! !), (, ,)
  63. t = re.sub(r"\((.)\)", r"(\1 \1)", t)
  64. # Replace leaves of the form (tag word root) with (tag word)
  65. t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
  66. return t
  67. def _parse(self, t):
  68. try:
  69. return Tree.parse(self._normalize(t))
  70. except ValueError as e:
  71. sys.stderr.write("Bad tree detected; trying to recover...\n")
  72. # Try to recover, if we can:
  73. if e.args == ('mismatched parens',):
  74. for n in range(1, 5):
  75. try:
  76. v = Tree.parse(self._normalize(t+')'*n))
  77. sys.stderr.write(" Recovered by adding %d close "
  78. "paren(s)\n" % n)
  79. return v
  80. except ValueError: pass
  81. # Try something else:
  82. sys.stderr.write(" Recovered by returning a flat parse.\n")
  83. #sys.stderr.write(' '.join(t.split())+'\n')
  84. return Tree('S', self._tag(t))
  85. def _tag(self, t, simplify_tags=False):
  86. tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(self._normalize(t))]
  87. if simplify_tags:
  88. tagged_sent = [(w, self._tag_mapping_function(t))
  89. for (w,t) in tagged_sent]
  90. return tagged_sent
  91. def _word(self, t):
  92. return WORD.findall(self._normalize(t))
  93. class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
  94. BracketParseCorpusReader):
  95. """
  96. A reader for parsed corpora whose documents are
  97. divided into categories based on their file identifiers.
  98. @author: Nathan Schneider <nschneid@cs.cmu.edu>
  99. """
  100. def __init__(self, *args, **kwargs):
  101. """
  102. Initialize the corpus reader. Categorization arguments
  103. (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
  104. the L{CategorizedCorpusReader constructor
  105. <CategorizedCorpusReader.__init__>}. The remaining arguments
  106. are passed to the L{BracketParseCorpusReader constructor
  107. <BracketParseCorpusReader.__init__>}.
  108. """
  109. CategorizedCorpusReader.__init__(self, kwargs)
  110. BracketParseCorpusReader.__init__(self, *args, **kwargs)
  111. def _resolve(self, fileids, categories):
  112. if fileids is not None and categories is not None:
  113. raise ValueError('Specify fileids or categories, not both')
  114. if categories is not None:
  115. return self.fileids(categories)
  116. else:
  117. return fileids
  118. def raw(self, fileids=None, categories=None):
  119. return BracketParseCorpusReader.raw(
  120. self, self._resolve(fileids, categories))
  121. def words(self, fileids=None, categories=None):
  122. return BracketParseCorpusReader.words(
  123. self, self._resolve(fileids, categories))
  124. def sents(self, fileids=None, categories=None):
  125. return BracketParseCorpusReader.sents(
  126. self, self._resolve(fileids, categories))
  127. def paras(self, fileids=None, categories=None):
  128. return BracketParseCorpusReader.paras(
  129. self, self._resolve(fileids, categories))
  130. def tagged_words(self, fileids=None, categories=None, simplify_tags=False):
  131. return BracketParseCorpusReader.tagged_words(
  132. self, self._resolve(fileids, categories), simplify_tags)
  133. def tagged_sents(self, fileids=None, categories=None, simplify_tags=False):
  134. return BracketParseCorpusReader.tagged_sents(
  135. self, self._resolve(fileids, categories), simplify_tags)
  136. def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
  137. return BracketParseCorpusReader.tagged_paras(
  138. self, self._resolve(fileids, categories), simplify_tags)
  139. def parsed_words(self, fileids=None, categories=None):
  140. return BracketParseCorpusReader.parsed_words(
  141. self, self._resolve(fileids, categories))
  142. def parsed_sents(self, fileids=None, categories=None):
  143. return BracketParseCorpusReader.parsed_sents(
  144. self, self._resolve(fileids, categories))
  145. def parsed_paras(self, fileids=None, categories=None):
  146. return BracketParseCorpusReader.parsed_paras(
  147. self, self._resolve(fileids, categories))
  148. class AlpinoCorpusReader(BracketParseCorpusReader):
  149. """
  150. Reader for the Alpino Dutch Treebank.
  151. """
  152. def __init__(self, root, encoding='ISO-8859-1', tag_mapping_function=None):
  153. BracketParseCorpusReader.__init__(self, root, 'alpino\.xml',
  154. detect_blocks='blankline',
  155. encoding=encoding,
  156. tag_mapping_function=tag_mapping_function)
  157. def _normalize(self, t):
  158. if t[:10] != "<alpino_ds":
  159. return ""
  160. # convert XML to sexpr notation
  161. t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
  162. t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
  163. t = re.sub(r" </node>", r")", t)
  164. t = re.sub(r"<sentence>.*</sentence>", r"", t)
  165. t = re.sub(r"</?alpino_ds.*>", r"", t)
  166. return t