# nltk-2.0.1rc1/nltk/tokenize/treebank.py

  1. # Natural Language Toolkit: Tokenizers
  2. #
  3. # Copyright (C) 2001-2011 NLTK Project
  4. # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
  5. # URL: <http://nltk.sourceforge.net>
  6. # For license information, see LICENSE.TXT
  7. """
  8. A regular-expression based word tokenizer that tokenizes sentences
  9. using the conventions used by the Penn Treebank.
  10. """
  11. import re
  12. from api import *
  13. ######################################################################
  14. #{ Regexp-based treebank tokenizer
  15. ######################################################################
  16. # (n.b., this isn't derived from RegexpTokenizer)
  17. class TreebankWordTokenizer(TokenizerI):
  18. """
  19. A word tokenizer that tokenizes sentences using the conventions
  20. used by the Penn Treebank. Contractions, such as "can't", are
  21. split in to two tokens. E.g.:
  22. - can't S{->} ca n't
  23. - he'll S{->} he 'll
  24. - weren't S{-} were n't
  25. This tokenizer assumes that the text has already been segmented into
  26. sentences. Any periods -- apart from those at the end of a string --
  27. are assumed to be part of the word they are attached to (e.g. for
  28. abbreviations, etc), and are not separately tokenized.
  29. """
  30. # List of contractions adapted from Robert MacIntyre's tokenizer.
  31. CONTRACTIONS2 = [re.compile(r"(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\b"),
  32. re.compile(r"(?i)\b(can)(not)\b"),
  33. re.compile(r"(?i)\b(D)('ye)\b"),
  34. re.compile(r"(?i)\b(Gim)(me)\b"),
  35. re.compile(r"(?i)\b(Gon)(na)\b"),
  36. re.compile(r"(?i)\b(Got)(ta)\b"),
  37. re.compile(r"(?i)\b(Lem)(me)\b"),
  38. re.compile(r"(?i)\b(Mor)('n)\b"),
  39. re.compile(r"(?i)\b(T)(is)\b"),
  40. re.compile(r"(?i)\b(T)(was)\b"),
  41. re.compile(r"(?i)\b(Wan)(na)\b")]
  42. CONTRACTIONS3 = [re.compile(r"(?i)\b(Whad)(dd)(ya)\b"),
  43. re.compile(r"(?i)\b(Wha)(t)(cha)\b")]
  44. def tokenize(self, text):
  45. for regexp in self.CONTRACTIONS2:
  46. text = regexp.sub(r'\1 \2', text)
  47. for regexp in self.CONTRACTIONS3:
  48. text = regexp.sub(r'\1 \2 \3', text)
  49. # Separate most punctuation
  50. text = re.sub(r"([^\w\.\'\-\/,&])", r' \1 ', text)
  51. # Separate commas if they're followed by space.
  52. # (E.g., don't separate 2,500)
  53. text = re.sub(r"(,\s)", r' \1', text)
  54. # Separate single quotes if they're followed by a space.
  55. text = re.sub(r"('\s)", r' \1', text)
  56. # Separate periods that come before newline or end of string.
  57. text = re.sub('\. *(\n|$)', ' . ', text)
  58. return text.split()