PageRenderTime 42ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/src/whoosh/lang/porter.py

https://bitbucket.org/mkr/whoosh
Python | 178 lines | 165 code | 4 blank | 9 comment | 0 complexity | f8be88a6fba7dc6d9378d898b3fb65cb MD5 | raw file
Possible License(s): Apache-2.0
  1. """
  2. Reimplementation of the
  3. `Porter stemming algorithm <http://tartarus.org/~martin/PorterStemmer/>`_
  4. in Python.
  5. In my quick tests, this implementation about 3.5 times faster than the
  6. seriously weird Python linked from the official page.
  7. """
  8. import re
  9. # Suffix replacement lists
  10. _step2list = {
  11. "ational": "ate",
  12. "tional": "tion",
  13. "enci": "ence",
  14. "anci": "ance",
  15. "izer": "ize",
  16. "bli": "ble",
  17. "alli": "al",
  18. "entli": "ent",
  19. "eli": "e",
  20. "ousli": "ous",
  21. "ization": "ize",
  22. "ation": "ate",
  23. "ator": "ate",
  24. "alism": "al",
  25. "iveness": "ive",
  26. "fulness": "ful",
  27. "ousness": "ous",
  28. "aliti": "al",
  29. "iviti": "ive",
  30. "biliti": "ble",
  31. "logi": "log",
  32. }
  33. _step3list = {
  34. "icate": "ic",
  35. "ative": "",
  36. "alize": "al",
  37. "iciti": "ic",
  38. "ical": "ic",
  39. "ful": "",
  40. "ness": "",
  41. }
  42. _cons = "[^aeiou]"
  43. _vowel = "[aeiouy]"
  44. _cons_seq = "[^aeiouy]+"
  45. _vowel_seq = "[aeiou]+"
  46. # m > 0
  47. _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq)
  48. # m == 0
  49. _meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$")
  50. # m > 1
  51. _mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq)
  52. # vowel in stem
  53. _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel)
  54. # ???
  55. _c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$")
  56. # Patterns used in the rules
  57. _ed_ing = re.compile("^(.*)(ed|ing)$")
  58. _at_bl_iz = re.compile("(at|bl|iz)$")
  59. _step1b = re.compile("([^aeiouylsz])\\1$")
  60. _step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$")
  61. _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$")
  62. _step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$")
  63. _step4_2 = re.compile("^(.+?)(s|t)(ion)$")
  64. _step5 = re.compile("^(.+?)e$")
  65. # Stemming function
  66. def stem(w):
  67. """Uses the Porter stemming algorithm to remove suffixes from English
  68. words.
  69. >>> stem("fundamentally")
  70. "fundament"
  71. """
  72. if len(w) < 3:
  73. return w
  74. first_is_y = w[0] == "y"
  75. if first_is_y:
  76. w = "Y" + w[1:]
  77. # Step 1a
  78. if w.endswith("s"):
  79. if w.endswith("sses"):
  80. w = w[:-2]
  81. elif w.endswith("ies"):
  82. w = w[:-2]
  83. elif w[-2] != "s":
  84. w = w[:-1]
  85. # Step 1b
  86. if w.endswith("eed"):
  87. s = w[:-3]
  88. if _mgr0.match(s):
  89. w = w[:-1]
  90. else:
  91. m = _ed_ing.match(w)
  92. if m:
  93. stem = m.group(1)
  94. if _s_v.match(stem):
  95. w = stem
  96. if _at_bl_iz.match(w):
  97. w += "e"
  98. elif _step1b.match(w):
  99. w = w[:-1]
  100. elif _c_v.match(w):
  101. w += "e"
  102. # Step 1c
  103. if w.endswith("y"):
  104. stem = w[:-1]
  105. if _s_v.match(stem):
  106. w = stem + "i"
  107. # Step 2
  108. m = _step2.match(w)
  109. if m:
  110. stem = m.group(1)
  111. suffix = m.group(2)
  112. if _mgr0.match(stem):
  113. w = stem + _step2list[suffix]
  114. # Step 3
  115. m = _step3.match(w)
  116. if m:
  117. stem = m.group(1)
  118. suffix = m.group(2)
  119. if _mgr0.match(stem):
  120. w = stem + _step3list[suffix]
  121. # Step 4
  122. m = _step4_1.match(w)
  123. if m:
  124. stem = m.group(1)
  125. if _mgr1.match(stem):
  126. w = stem
  127. else:
  128. m = _step4_2.match(w)
  129. if m:
  130. stem = m.group(1) + m.group(2)
  131. if _mgr1.match(stem):
  132. w = stem
  133. # Step 5
  134. m = _step5.match(w)
  135. if m:
  136. stem = m.group(1)
  137. if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)):
  138. w = stem
  139. if w.endswith("ll") and _mgr1.match(w):
  140. w = w[:-1]
  141. if first_is_y:
  142. w = "y" + w[1:]
  143. return w
  144. if __name__ == '__main__':
  145. print(stem("fundamentally"))