
/nltk/tokenize/regexp.py

https://github.com/haewoon/nltk
Python | 207 lines | 190 code | 1 blank | 16 comment | 0 complexity | 683de9df41d18eb388a9fcafbbeb8a98 MD5
Possible License(s): Apache-2.0

# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
#         Trevor Cohn <tacohn@csse.unimelb.edu.au>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

r"""
Regular-Expression Tokenizers

A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    >>> tokenizer.tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

A ``RegexpTokenizer`` can use its regexp to match delimiters instead:

    >>> tokenizer = RegexpTokenizer('\s+', gaps=True)
    >>> tokenizer.tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']

Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.
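
For instance, leading and trailing whitespace would otherwise produce empty
strings at each end of the result:

    >>> RegexpTokenizer('\s+', gaps=True).tokenize('  leading and trailing whitespace  ')
    ['leading', 'and', 'trailing', 'whitespace']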

The material between the tokens is discarded. For example,
the following tokenizer selects just the capitalized words:

    >>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+')
    >>> capword_tokenizer.tokenize(s)
    ['Good', 'New', 'York', 'Please', 'Thanks']

This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.

    >>> from nltk.tokenize import BlanklineTokenizer
    >>> # Uses '\s*\n\s*\n\s*':
    >>> BlanklineTokenizer().tokenize(s)
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
    'Thanks.']

All of the regular expression tokenizers are also available as functions:

    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
    >>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+')
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> wordpunct_tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
    '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> blankline_tokenize(s)
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']

Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument. This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
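
For example, where you would write ``re.findall(pattern, s)`` with the ``re``
module, you write ``regexp_tokenize(s, pattern)`` here:

    >>> import re
    >>> regexp_tokenize(s, pattern=r'\w+') == re.findall(r'\w+', s)
    True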
  54. """
  55. import re
  56. import sre_constants
  57. from nltk.internals import convert_regexp_to_nongrouping
  58. from nltk.tokenize.api import TokenizerI
  59. from nltk.tokenize.util import regexp_span_tokenize


class RegexpTokenizer(TokenizerI):
    """
    A tokenizer that splits a string using a regular expression, which
    matches either the tokens or the separators between tokens.

        >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
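
    By default (``gaps=False``), the pattern is used to find the tokens
    themselves:

        >>> tokenizer.tokenize("Please buy me two of them.")
        ['Please', 'buy', 'me', 'two', 'of', 'them', '.']

    With ``gaps=True``, the pattern is matched against the separators between
    tokens instead:

        >>> RegexpTokenizer('\s+', gaps=True).tokenize("Please buy me two of them.")
        ['Please', 'buy', 'me', 'two', 'of', 'them.']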

    :type pattern: str
    :param pattern: The pattern used to build this tokenizer.
        (This pattern may safely contain grouping parentheses.)
    :type gaps: bool
    :param gaps: True if this tokenizer's pattern should be used
        to find separators between tokens; False if this
        tokenizer's pattern should be used to find the tokens
        themselves.
    :type discard_empty: bool
    :param discard_empty: True if any empty tokens `''`
        generated by the tokenizer should be discarded. Empty
        tokens can only be generated if `_gaps == True`.
    :type flags: int
    :param flags: The regexp flags used to compile this
        tokenizer's pattern. By default, the following flags are
        used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
    """

    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        self._gaps = gaps
        self._discard_empty = discard_empty
        self._flags = flags
        self._regexp = None

        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))

    def tokenize(self, text):
        # If our regexp matches gaps, use re.split:
        if self._gaps:
            if self._discard_empty:
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)

        # If our regexp matches tokens, use re.findall:
        else:
            return self._regexp.findall(text)

    def span_tokenize(self, text):
        # Like tokenize(), but yield (start, end) offsets into the text
        # instead of the token substrings themselves.
        if self._gaps:
            for left, right in regexp_span_tokenize(text, self._regexp):
                if not (self._discard_empty and left == right):
                    yield left, right
        else:
            for m in re.finditer(self._regexp, text):
                yield m.span()

    def __repr__(self):
        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
                (self.__class__.__name__, self._pattern, self._gaps,
                 self._discard_empty, self._flags))


class WhitespaceTokenizer(RegexpTokenizer):
    r"""
    Tokenize a string on whitespace (space, tab, newline).
    In general, users should use the string ``split()`` method instead.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> WhitespaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
        'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
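
    For the example above, this gives the same result as the string's own
    ``split()`` method:

        >>> WhitespaceTokenizer().tokenize(s) == s.split()
        True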
  131. """
  132. def __init__(self):
  133. RegexpTokenizer.__init__(self, r'\s+', gaps=True)


class BlanklineTokenizer(RegexpTokenizer):
    """
    Tokenize a string, treating any sequence of blank lines as a delimiter.
    Blank lines are defined as lines containing no characters, except for
    space or tab characters.
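
    For example, paragraphs separated by a blank line come back as separate
    tokens:

        >>> from nltk.tokenize import BlanklineTokenizer
        >>> BlanklineTokenizer().tokenize("First paragraph.\\n\\nSecond paragraph.")
        ['First paragraph.', 'Second paragraph.']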
  139. """
  140. def __init__(self):
  141. RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)


class WordPunctTokenizer(RegexpTokenizer):
    """
    Tokenize a text into a sequence of alphabetic and
    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

        >>> from nltk.tokenize import WordPunctTokenizer
        >>> s = "Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks."
        >>> WordPunctTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')


######################################################################
#{ Tokenization Functions
######################################################################

def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Return a tokenized copy of *text*. See :class:`.RegexpTokenizer`
    for descriptions of the arguments.
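
    For example, using the same pattern as in the module docstring:

        >>> from nltk.tokenize import regexp_tokenize
        >>> regexp_tokenize("Please buy me two of them.", pattern='\w+|\$[\d\.]+|\S+')
        ['Please', 'buy', 'me', 'two', 'of', 'them', '.']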
  162. """
  163. tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
  164. return tokenizer.tokenize(text)
  165. blankline_tokenize = BlanklineTokenizer().tokenize
  166. wordpunct_tokenize = WordPunctTokenizer().tokenize
  167. if __name__ == "__main__":
  168. import doctest
  169. doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)