/src/whoosh/lang/porter2.py
Python | 313 lines | 265 code | 37 blank | 11 comment | 50 complexity | 4293a0b17faf8a499d56de667cf34bb4 MD5 | raw file
Possible License(s): Apache-2.0
- """An implementation of the Porter2 stemming algorithm.
- See http://snowball.tartarus.org/algorithms/english/stemmer.html
- Adapted from pyporter2 by Michael Dirolf.
- This algorithm is more correct but (at least in this implementation)
- several times slower than the original porter algorithm as implemented
- in stemming.porter.
- """
- import re
- r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
- ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$")
- ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$")
- ccy_exp = re.compile(r"([aeiouy])y")
- s1a_exp = re.compile(r"[aeiouy].")
- s1b_exp = re.compile(r"[aeiouy]")
- def get_r1(word):
- # exceptional forms
- if word.startswith('gener') or word.startswith('arsen'):
- return 5
- if word.startswith('commun'):
- return 6
- # normal form
- match = r_exp.match(word)
- if match:
- return match.start(1)
- return len(word)
- def get_r2(word):
- match = r_exp.match(word, get_r1(word))
- if match:
- return match.start(1)
- return len(word)
- def ends_with_short_syllable(word):
- if len(word) == 2:
- if ewss_exp1.match(word):
- return True
- if ewss_exp2.match(word):
- return True
- return False
- def is_short_word(word):
- if ends_with_short_syllable(word):
- if get_r1(word) == len(word):
- return True
- return False
- def remove_initial_apostrophe(word):
- if word.startswith("'"):
- return word[1:]
- return word
- def capitalize_consonant_ys(word):
- if word.startswith('y'):
- word = 'Y' + word[1:]
- return ccy_exp.sub('\g<1>Y', word)
- def step_0(word):
- if word.endswith("'s'"):
- return word[:-3]
- if word.endswith("'s"):
- return word[:-2]
- if word.endswith("'"):
- return word[:-1]
- return word
- def step_1a(word):
- if word.endswith('sses'):
- return word[:-4] + 'ss'
- if word.endswith('ied') or word.endswith('ies'):
- if len(word) > 4:
- return word[:-3] + 'i'
- else:
- return word[:-3] + 'ie'
- if word.endswith('us') or word.endswith('ss'):
- return word
- if word.endswith('s'):
- preceding = word[:-1]
- if s1a_exp.search(preceding):
- return preceding
- return word
- return word
- doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt')
- def ends_with_double(word):
- for double in doubles:
- if word.endswith(double):
- return True
- return False
- def step_1b_helper(word):
- if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
- return word + 'e'
- if ends_with_double(word):
- return word[:-1]
- if is_short_word(word):
- return word + 'e'
- return word
- s1b_suffixes = ('ed', 'edly', 'ing', 'ingly')
- def step_1b(word, r1):
- if word.endswith('eedly'):
- if len(word) - 5 >= r1:
- return word[:-3]
- return word
- if word.endswith('eed'):
- if len(word) - 3 >= r1:
- return word[:-1]
- return word
- for suffix in s1b_suffixes:
- if word.endswith(suffix):
- preceding = word[:-len(suffix)]
- if s1b_exp.search(preceding):
- return step_1b_helper(preceding)
- return word
- return word
- def step_1c(word):
- if word.endswith('y') or word.endswith('Y') and len(word) > 1:
- if word[-2] not in 'aeiouy':
- if len(word) > 2:
- return word[:-1] + 'i'
- return word
- def step_2_helper(word, r1, end, repl, prev):
- if word.endswith(end):
- if len(word) - len(end) >= r1:
- if prev == []:
- return word[:-len(end)] + repl
- for p in prev:
- if word[:-len(end)].endswith(p):
- return word[:-len(end)] + repl
- return word
- return None
- s2_triples = (('ization', 'ize', []),
- ('ational', 'ate', []),
- ('fulness', 'ful', []),
- ('ousness', 'ous', []),
- ('iveness', 'ive', []),
- ('tional', 'tion', []),
- ('biliti', 'ble', []),
- ('lessli', 'less', []),
- ('entli', 'ent', []),
- ('ation', 'ate', []),
- ('alism', 'al', []),
- ('aliti', 'al', []),
- ('ousli', 'ous', []),
- ('iviti', 'ive', []),
- ('fulli', 'ful', []),
- ('enci', 'ence', []),
- ('anci', 'ance', []),
- ('abli', 'able', []),
- ('izer', 'ize', []),
- ('ator', 'ate', []),
- ('alli', 'al', []),
- ('bli', 'ble', []),
- ('ogi', 'og', ['l']),
- ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't']))
- def step_2(word, r1):
- for trip in s2_triples:
- attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2])
- if attempt:
- return attempt
- return word
- def step_3_helper(word, r1, r2, end, repl, r2_necessary):
- if word.endswith(end):
- if len(word) - len(end) >= r1:
- if not r2_necessary:
- return word[:-len(end)] + repl
- else:
- if len(word) - len(end) >= r2:
- return word[:-len(end)] + repl
- return word
- return None
- s3_triples = (('ational', 'ate', False),
- ('tional', 'tion', False),
- ('alize', 'al', False),
- ('icate', 'ic', False),
- ('iciti', 'ic', False),
- ('ative', '', True),
- ('ical', 'ic', False),
- ('ness', '', False),
- ('ful', '', False))
- def step_3(word, r1, r2):
- for trip in s3_triples:
- attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2])
- if attempt:
- return attempt
- return word
- s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
- 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize')
- def step_4(word, r2):
- for end in s4_delete_list:
- if word.endswith(end):
- if len(word) - len(end) >= r2:
- return word[:-len(end)]
- return word
- if word.endswith('sion') or word.endswith('tion'):
- if len(word) - 3 >= r2:
- return word[:-3]
- return word
- def step_5(word, r1, r2):
- if word.endswith('l'):
- if len(word) - 1 >= r2 and word[-2] == 'l':
- return word[:-1]
- return word
- if word.endswith('e'):
- if len(word) - 1 >= r2:
- return word[:-1]
- if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
- return word[:-1]
- return word
- def normalize_ys(word):
- return word.replace('Y', 'y')
- exceptional_forms = {'skis': 'ski',
- 'skies': 'sky',
- 'dying': 'die',
- 'lying': 'lie',
- 'tying': 'tie',
- 'idly': 'idl',
- 'gently': 'gentl',
- 'ugly': 'ugli',
- 'early': 'earli',
- 'only': 'onli',
- 'singly': 'singl',
- 'sky': 'sky',
- 'news': 'news',
- 'howe': 'howe',
- 'atlas': 'atlas',
- 'cosmos': 'cosmos',
- 'bias': 'bias',
- 'andes': 'andes'}
- exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring',
- 'earring', 'proceed', 'exceed', 'succeed'])
- def stem(word):
- if len(word) <= 2:
- return word
- word = remove_initial_apostrophe(word)
- # handle some exceptional forms
- if word in exceptional_forms:
- return exceptional_forms[word]
- word = capitalize_consonant_ys(word)
- r1 = get_r1(word)
- r2 = get_r2(word)
- word = step_0(word)
- word = step_1a(word)
- # handle some more exceptional forms
- if word in exceptional_early_exit_post_1a:
- return word
- word = step_1b(word, r1)
- word = step_1c(word)
- word = step_2(word, r1)
- word = step_3(word, r1, r2)
- word = step_4(word, r2)
- word = step_5(word, r1, r2)
- word = normalize_ys(word)
- return word