/nltk-2.0.1rc1/nltk/tokenize/treebank.py
# · Python · 71 lines · 39 code · 11 blank · 21 comment · 2 complexity · e5f56b005f9eae5de243342c374fc633 MD5 · raw file
- # Natural Language Toolkit: Tokenizers
- #
- # Copyright (C) 2001-2011 NLTK Project
- # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
- # URL: <http://nltk.sourceforge.net>
- # For license information, see LICENSE.TXT
- """
- A regular-expression based word tokenizer that tokenizes sentences
- using the conventions used by the Penn Treebank.
- """
- import re
- from api import *
- ######################################################################
- #{ Regexp-based treebank tokenizer
- ######################################################################
- # (n.b., this isn't derived from RegexpTokenizer)
class TreebankWordTokenizer(TokenizerI):
    """
    A word tokenizer that tokenizes sentences using the conventions
    used by the Penn Treebank.  Contractions, such as "can't", are
    split into two tokens.  E.g.:

      - can't S{->} ca n't
      - he'll S{->} he 'll
      - weren't S{->} were n't

    This tokenizer assumes that the text has already been segmented into
    sentences.  Any periods -- apart from those at the end of a string
    or directly before a newline -- are assumed to be part of the word
    they are attached to (e.g. for abbreviations, etc), and are not
    separately tokenized.
    """

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    # Every pattern has exactly two groups; a space is inserted between them.
    CONTRACTIONS2 = [
        re.compile(r"(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\b"),
        re.compile(r"(?i)\b(can)(not)\b"),
        re.compile(r"(?i)\b(D)('ye)\b"),
        re.compile(r"(?i)\b(Gim)(me)\b"),
        re.compile(r"(?i)\b(Gon)(na)\b"),
        re.compile(r"(?i)\b(Got)(ta)\b"),
        re.compile(r"(?i)\b(Lem)(me)\b"),
        # "more'n" -> "more 'n"; the first group must include the "e",
        # otherwise the pattern can never match the contraction.
        re.compile(r"(?i)\b(More)('n)\b"),
        re.compile(r"(?i)\b(T)(is)\b"),
        re.compile(r"(?i)\b(T)(was)\b"),
        re.compile(r"(?i)\b(Wan)(na)\b"),
    ]

    # Three-part contractions; spaces are inserted between all three groups.
    CONTRACTIONS3 = [
        # "whaddya" -> "wha dd ya"; the groups must concatenate to the
        # actual spelling for the pattern to match.
        re.compile(r"(?i)\b(Wha)(dd)(ya)\b"),
        re.compile(r"(?i)\b(Wha)(t)(cha)\b"),
    ]

    # Any single character that is neither a word character nor one of
    # . ' - / , &  (those are handled separately or kept inside words).
    _PUNCTUATION = re.compile(r"([^\w\.\'\-\/,&])")
    # A comma followed by whitespace (so "2,500" is left intact).
    _COMMA_BEFORE_SPACE = re.compile(r"(,\s)")
    # A single quote followed by whitespace.
    _QUOTE_BEFORE_SPACE = re.compile(r"('\s)")
    # A period, optionally followed by spaces, right before a newline or
    # the end of the string.
    _FINAL_PERIOD = re.compile(r"\. *(\n|$)")

    def tokenize(self, text):
        """
        Split C{text} into a list of Penn Treebank style word tokens.

        The text is rewritten by inserting spaces around the pieces that
        should become separate tokens, and is then split on whitespace.
        The order of the steps matters: contractions are split first,
        while the apostrophes are still attached to their words.

        @param text: a single sentence to tokenize.
        @return: the list of token strings (empty for empty or
            whitespace-only input).
        """
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r'\1 \2', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r'\1 \2 \3', text)

        # Separate most punctuation.
        text = self._PUNCTUATION.sub(r' \1 ', text)

        # Separate commas if they're followed by space.
        # (E.g., don't separate 2,500)
        text = self._COMMA_BEFORE_SPACE.sub(r' \1', text)

        # Separate single quotes if they're followed by a space.
        text = self._QUOTE_BEFORE_SPACE.sub(r' \1', text)

        # Separate periods that come before newline or end of string.
        text = self._FINAL_PERIOD.sub(' . ', text)

        return text.split()