PageRenderTime 45ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/nltk/stem/regexp.py

https://github.com/BrucePHill/nltk
Python | 62 lines | 46 code | 2 blank | 14 comment | 0 complexity | a9f414e4d2a965d214f848caa8ea9dd1 MD5 | raw file
Possible License(s): Apache-2.0
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
  9. from __future__ import unicode_literals
  10. import re
  11. from .api import StemmerI
  12. from nltk.compat import python_2_unicode_compatible
  13. @python_2_unicode_compatible
  14. class RegexpStemmer(StemmerI):
  15. """
  16. A stemmer that uses regular expressions to identify morphological
  17. affixes. Any substrings that match the regular expressions will
  18. be removed.
  19. >>> from nltk.stem import RegexpStemmer
  20. >>> st = RegexpStemmer('ing$|s$|e$', min=4)
  21. >>> st.stem('cars')
  22. 'car'
  23. >>> st.stem('mass')
  24. 'mas'
  25. >>> st.stem('was')
  26. 'was'
  27. >>> st.stem('bee')
  28. 'bee'
  29. >>> st.stem('compute')
  30. 'comput'
  31. :type regexp: str or regexp
  32. :param regexp: The regular expression that should be used to
  33. identify morphological affixes.
  34. :type min: int
  35. :param min: The minimum length of string to stem
  36. """
  37. def __init__(self, regexp, min=0):
  38. if not hasattr(regexp, 'pattern'):
  39. regexp = re.compile(regexp)
  40. self._regexp = regexp
  41. self._min = min
  42. def stem(self, word):
  43. if len(word) < self._min:
  44. return word
  45. else:
  46. return self._regexp.sub('', word)
  47. def __repr__(self):
  48. return '<RegexpStemmer: %r>' % self._regexp.pattern
  49. if __name__ == "__main__":
  50. import doctest
  51. doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)