PageRenderTime 54ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/src/whoosh/lang/porter2.py

https://bitbucket.org/rayleyva/whoosh
Python | 313 lines | 265 code | 37 blank | 11 comment | 50 complexity | 4293a0b17faf8a499d56de667cf34bb4 MD5 | raw file
Possible License(s): Apache-2.0
"""An implementation of the Porter2 stemming algorithm.
See http://snowball.tartarus.org/algorithms/english/stemmer.html

Adapted from pyporter2 by Michael Dirolf.

This algorithm is more correct but (at least in this implementation)
several times slower than the original porter algorithm as implemented
in stemming.porter.
"""
  8. import re
  9. r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
  10. ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$")
  11. ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$")
  12. ccy_exp = re.compile(r"([aeiouy])y")
  13. s1a_exp = re.compile(r"[aeiouy].")
  14. s1b_exp = re.compile(r"[aeiouy]")
  15. def get_r1(word):
  16. # exceptional forms
  17. if word.startswith('gener') or word.startswith('arsen'):
  18. return 5
  19. if word.startswith('commun'):
  20. return 6
  21. # normal form
  22. match = r_exp.match(word)
  23. if match:
  24. return match.start(1)
  25. return len(word)
  26. def get_r2(word):
  27. match = r_exp.match(word, get_r1(word))
  28. if match:
  29. return match.start(1)
  30. return len(word)
  31. def ends_with_short_syllable(word):
  32. if len(word) == 2:
  33. if ewss_exp1.match(word):
  34. return True
  35. if ewss_exp2.match(word):
  36. return True
  37. return False
  38. def is_short_word(word):
  39. if ends_with_short_syllable(word):
  40. if get_r1(word) == len(word):
  41. return True
  42. return False
  43. def remove_initial_apostrophe(word):
  44. if word.startswith("'"):
  45. return word[1:]
  46. return word
  47. def capitalize_consonant_ys(word):
  48. if word.startswith('y'):
  49. word = 'Y' + word[1:]
  50. return ccy_exp.sub('\g<1>Y', word)
  51. def step_0(word):
  52. if word.endswith("'s'"):
  53. return word[:-3]
  54. if word.endswith("'s"):
  55. return word[:-2]
  56. if word.endswith("'"):
  57. return word[:-1]
  58. return word
  59. def step_1a(word):
  60. if word.endswith('sses'):
  61. return word[:-4] + 'ss'
  62. if word.endswith('ied') or word.endswith('ies'):
  63. if len(word) > 4:
  64. return word[:-3] + 'i'
  65. else:
  66. return word[:-3] + 'ie'
  67. if word.endswith('us') or word.endswith('ss'):
  68. return word
  69. if word.endswith('s'):
  70. preceding = word[:-1]
  71. if s1a_exp.search(preceding):
  72. return preceding
  73. return word
  74. return word
  75. doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt')
  76. def ends_with_double(word):
  77. for double in doubles:
  78. if word.endswith(double):
  79. return True
  80. return False
  81. def step_1b_helper(word):
  82. if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
  83. return word + 'e'
  84. if ends_with_double(word):
  85. return word[:-1]
  86. if is_short_word(word):
  87. return word + 'e'
  88. return word
  89. s1b_suffixes = ('ed', 'edly', 'ing', 'ingly')
  90. def step_1b(word, r1):
  91. if word.endswith('eedly'):
  92. if len(word) - 5 >= r1:
  93. return word[:-3]
  94. return word
  95. if word.endswith('eed'):
  96. if len(word) - 3 >= r1:
  97. return word[:-1]
  98. return word
  99. for suffix in s1b_suffixes:
  100. if word.endswith(suffix):
  101. preceding = word[:-len(suffix)]
  102. if s1b_exp.search(preceding):
  103. return step_1b_helper(preceding)
  104. return word
  105. return word
  106. def step_1c(word):
  107. if word.endswith('y') or word.endswith('Y') and len(word) > 1:
  108. if word[-2] not in 'aeiouy':
  109. if len(word) > 2:
  110. return word[:-1] + 'i'
  111. return word
  112. def step_2_helper(word, r1, end, repl, prev):
  113. if word.endswith(end):
  114. if len(word) - len(end) >= r1:
  115. if prev == []:
  116. return word[:-len(end)] + repl
  117. for p in prev:
  118. if word[:-len(end)].endswith(p):
  119. return word[:-len(end)] + repl
  120. return word
  121. return None
  122. s2_triples = (('ization', 'ize', []),
  123. ('ational', 'ate', []),
  124. ('fulness', 'ful', []),
  125. ('ousness', 'ous', []),
  126. ('iveness', 'ive', []),
  127. ('tional', 'tion', []),
  128. ('biliti', 'ble', []),
  129. ('lessli', 'less', []),
  130. ('entli', 'ent', []),
  131. ('ation', 'ate', []),
  132. ('alism', 'al', []),
  133. ('aliti', 'al', []),
  134. ('ousli', 'ous', []),
  135. ('iviti', 'ive', []),
  136. ('fulli', 'ful', []),
  137. ('enci', 'ence', []),
  138. ('anci', 'ance', []),
  139. ('abli', 'able', []),
  140. ('izer', 'ize', []),
  141. ('ator', 'ate', []),
  142. ('alli', 'al', []),
  143. ('bli', 'ble', []),
  144. ('ogi', 'og', ['l']),
  145. ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't']))
  146. def step_2(word, r1):
  147. for trip in s2_triples:
  148. attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2])
  149. if attempt:
  150. return attempt
  151. return word
  152. def step_3_helper(word, r1, r2, end, repl, r2_necessary):
  153. if word.endswith(end):
  154. if len(word) - len(end) >= r1:
  155. if not r2_necessary:
  156. return word[:-len(end)] + repl
  157. else:
  158. if len(word) - len(end) >= r2:
  159. return word[:-len(end)] + repl
  160. return word
  161. return None
  162. s3_triples = (('ational', 'ate', False),
  163. ('tional', 'tion', False),
  164. ('alize', 'al', False),
  165. ('icate', 'ic', False),
  166. ('iciti', 'ic', False),
  167. ('ative', '', True),
  168. ('ical', 'ic', False),
  169. ('ness', '', False),
  170. ('ful', '', False))
  171. def step_3(word, r1, r2):
  172. for trip in s3_triples:
  173. attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2])
  174. if attempt:
  175. return attempt
  176. return word
  177. s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
  178. 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize')
  179. def step_4(word, r2):
  180. for end in s4_delete_list:
  181. if word.endswith(end):
  182. if len(word) - len(end) >= r2:
  183. return word[:-len(end)]
  184. return word
  185. if word.endswith('sion') or word.endswith('tion'):
  186. if len(word) - 3 >= r2:
  187. return word[:-3]
  188. return word
  189. def step_5(word, r1, r2):
  190. if word.endswith('l'):
  191. if len(word) - 1 >= r2 and word[-2] == 'l':
  192. return word[:-1]
  193. return word
  194. if word.endswith('e'):
  195. if len(word) - 1 >= r2:
  196. return word[:-1]
  197. if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
  198. return word[:-1]
  199. return word
  200. def normalize_ys(word):
  201. return word.replace('Y', 'y')
  202. exceptional_forms = {'skis': 'ski',
  203. 'skies': 'sky',
  204. 'dying': 'die',
  205. 'lying': 'lie',
  206. 'tying': 'tie',
  207. 'idly': 'idl',
  208. 'gently': 'gentl',
  209. 'ugly': 'ugli',
  210. 'early': 'earli',
  211. 'only': 'onli',
  212. 'singly': 'singl',
  213. 'sky': 'sky',
  214. 'news': 'news',
  215. 'howe': 'howe',
  216. 'atlas': 'atlas',
  217. 'cosmos': 'cosmos',
  218. 'bias': 'bias',
  219. 'andes': 'andes'}
  220. exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring',
  221. 'earring', 'proceed', 'exceed', 'succeed'])
  222. def stem(word):
  223. if len(word) <= 2:
  224. return word
  225. word = remove_initial_apostrophe(word)
  226. # handle some exceptional forms
  227. if word in exceptional_forms:
  228. return exceptional_forms[word]
  229. word = capitalize_consonant_ys(word)
  230. r1 = get_r1(word)
  231. r2 = get_r2(word)
  232. word = step_0(word)
  233. word = step_1a(word)
  234. # handle some more exceptional forms
  235. if word in exceptional_early_exit_post_1a:
  236. return word
  237. word = step_1b(word, r1)
  238. word = step_1c(word)
  239. word = step_2(word, r1)
  240. word = step_3(word, r1, r2)
  241. word = step_4(word, r2)
  242. word = step_5(word, r1, r2)
  243. word = normalize_ys(word)
  244. return word