nltk /nltk/corpus/reader/bracket_parse.py

Language Python Lines 183
MD5 Hash a7b9dffb05c15cf3e749c00a7ed43ab3
Repository https://github.com/BrucePHill/nltk.git View Raw File
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Steven Bird <sb@ldc.upenn.edu>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""

import sys

from nltk.tree import Tree

from .util import *
from .api import *


# we use [^\s()]+ instead of \S+? to avoid matching ()
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')

class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse
    trees.
    """
    def __init__(self, root, fileids, comment_char=None,
                 detect_blocks='unindented_paren', encoding='utf8',
                 tag_mapping_function=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
          in the corpus; can be 'unindented_paren' (every unindented
          parenthesis starts a new parse) or 'sexpr' (brackets are
          matched).
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tag_mapping_function = tag_mapping_function

    def _read_block(self, stream):
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char),
                               '', tok)
                        for tok in toks]
            return toks
        else:
            assert 0, 'bad block type'

    def _normalize(self, t):
        # If there's an empty set of brackets surrounding the actual
        # parse, then strip them off.
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        try:
            return Tree.parse(self._normalize(t))

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = Tree.parse(self._normalize(t+')'*n))
                        sys.stderr.write("  Recovered by adding %d close "
                                         "paren(s)\n" % n)
                        return v
                    except ValueError: pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            #sys.stderr.write(' '.join(t.split())+'\n')
            return Tree('S', self._tag(t))

    def _tag(self, t, simplify_tags=False):
        tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(self._normalize(t))]
        if simplify_tags:
            tagged_sent = [(w, self._tag_mapping_function(t))
                           for (w,t) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        return WORD.findall(self._normalize(t))

class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
                                          BracketParseCorpusReader):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}.  The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids
    def raw(self, fileids=None, categories=None):
        return BracketParseCorpusReader.raw(
            self, self._resolve(fileids, categories))
    def words(self, fileids=None, categories=None):
        return BracketParseCorpusReader.words(
            self, self._resolve(fileids, categories))
    def sents(self, fileids=None, categories=None):
        return BracketParseCorpusReader.sents(
            self, self._resolve(fileids, categories))
    def paras(self, fileids=None, categories=None):
        return BracketParseCorpusReader.paras(
            self, self._resolve(fileids, categories))
    def tagged_words(self, fileids=None, categories=None, simplify_tags=False):
        return BracketParseCorpusReader.tagged_words(
            self, self._resolve(fileids, categories), simplify_tags)
    def tagged_sents(self, fileids=None, categories=None, simplify_tags=False):
        return BracketParseCorpusReader.tagged_sents(
            self, self._resolve(fileids, categories), simplify_tags)
    def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
        return BracketParseCorpusReader.tagged_paras(
            self, self._resolve(fileids, categories), simplify_tags)
    def parsed_words(self, fileids=None, categories=None):
        return BracketParseCorpusReader.parsed_words(
            self, self._resolve(fileids, categories))
    def parsed_sents(self, fileids=None, categories=None):
        return BracketParseCorpusReader.parsed_sents(
            self, self._resolve(fileids, categories))
    def parsed_paras(self, fileids=None, categories=None):
        return BracketParseCorpusReader.parsed_paras(
            self, self._resolve(fileids, categories))

class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    """
    def __init__(self, root, encoding='ISO-8859-1', tag_mapping_function=None):
        BracketParseCorpusReader.__init__(self, root, 'alpino\.xml',
                                 detect_blocks='blankline',
                                 encoding=encoding,
                                 tag_mapping_function=tag_mapping_function)

    def _normalize(self, t):
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t)
        t = re.sub(r'  <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
        t = re.sub(r"  </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t
Back to Top