PageRenderTime 423ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/aiml/WordSub.py

https://bitbucket.org/mmellott/ai
Python | 98 lines | 87 code | 4 blank | 7 comment | 9 complexity | 041af9e2f8d5fccc3c19ab3a8779195a MD5 | raw file
  1. """This module implements the WordSub class, modelled after a recipe
  2. in "Python Cookbook" (Recipe 3.14, "Replacing Multiple Patterns in a
  3. Single Pass" by Xavier Defrang).
  4. Usage:
  5. Use this class like a dictionary to add before/after pairs:
  6. > subber = WordSub()
  7. > subber["before"] = "after"
  8. > subber["begin"] = "end"
  9. Use the sub() method to perform the substitution:
  10. > print subber.sub("before we begin")
  11. after we end
  12. All matching is intelligently case-insensitive:
  13. > print subber.sub("Before we BEGIN")
  14. After we END
  15. The 'before' words must be complete words -- no prefixes.
  16. The following example illustrates this point:
  17. > subber["he"] = "she"
  18. > print subber.sub("he says he'd like to help her")
  19. she says she'd like to help her
  20. Note that "he" and "he'd" were replaced, but "help" and "her" were
  21. not.
  22. """
  23. # 'dict' objects weren't available to subclass from until version 2.2.
  24. # Get around this by importing UserDict.UserDict if the built-in dict
  25. # object isn't available.
  26. try: dict
  27. except: from UserDict import UserDict as dict
  28. import ConfigParser
  29. import re
  30. import string
  31. class WordSub(dict):
  32. """All-in-one multiple-string-substitution class."""
  33. def _wordToRegex(self, word):
  34. """Convert a word to a regex object which matches the word."""
  35. if word != "" and word[0].isalpha() and word[-1].isalpha():
  36. return "\\b%s\\b" % re.escape(word)
  37. else:
  38. return r"\b%s\b" % re.escape(word)
  39. def _update_regex(self):
  40. """Build re object based on the keys of the current
  41. dictionary.
  42. """
  43. self._regex = re.compile("|".join(map(self._wordToRegex, self.keys())))
  44. self._regexIsDirty = False
  45. def __init__(self, defaults = {}):
  46. """Initialize the object, and populate it with the entries in
  47. the defaults dictionary.
  48. """
  49. self._regex = None
  50. self._regexIsDirty = True
  51. for k,v in defaults.items():
  52. self[k] = v
  53. def __call__(self, match):
  54. """Handler invoked for each regex match."""
  55. return self[match.group(0)]
  56. def __setitem__(self, i, y):
  57. self._regexIsDirty = True
  58. # for each entry the user adds, we actually add three entrys:
  59. super(type(self),self).__setitem__(string.lower(i),string.lower(y)) # key = value
  60. super(type(self),self).__setitem__(string.capwords(i), string.capwords(y)) # Key = Value
  61. super(type(self),self).__setitem__(string.upper(i), string.upper(y)) # KEY = VALUE
  62. def sub(self, text):
  63. """Translate text, returns the modified text."""
  64. if self._regexIsDirty:
  65. self._update_regex()
  66. return self._regex.sub(self, text)
  67. # self-test
  68. if __name__ == "__main__":
  69. subber = WordSub()
  70. subber["apple"] = "banana"
  71. subber["orange"] = "pear"
  72. subber["banana" ] = "apple"
  73. subber["he"] = "she"
  74. subber["I'd"] = "I would"
  75. # test case insensitivity
  76. inStr = "I'd like one apple, one Orange and one BANANA."
  77. outStr = "I Would like one banana, one Pear and one APPLE."
  78. if subber.sub(inStr) == outStr: print "Test #1 PASSED"
  79. else: print "Test #1 FAILED: '%s'" % subber.sub(inStr)
  80. inStr = "He said he'd like to go with me"
  81. outStr = "She said she'd like to go with me"
  82. if subber.sub(inStr) == outStr: print "Test #2 PASSED"
  83. else: print "Test #2 FAILED: '%s'" % subber.sub(inStr)