/src/whoosh/lang/paicehusk.py
Python | 242 lines | 236 code | 2 blank | 4 comment | 2 complexity | 0aced1dabd91a36353c2996bb5a344d9 MD5 | raw file
Possible License(s): Apache-2.0
- """This module contains an object that implements the Paice-Husk stemming
- algorithm.
- If you just want to use the standard Paice-Husk stemming rules, use the
- module's ``stem()`` function::
- stemmed_word = stem(word)
- If you want to use a custom rule set, read the rules into a string where the
- rules are separated by newlines, and instantiate the object with the string,
- then use the object's stem method to stem words::
- stemmer = PaiceHuskStemmer(my_rules_string)
- stemmed_word = stemmer.stem(word)
- """
- import re
- from collections import defaultdict
class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    The stemmer is driven by a rule table. Each rule is written as the
    *reversed* ending to match, an optional ``*`` (rule only applies while the
    word is still intact, i.e. no rule has been applied yet), the number of
    characters to remove, an optional string to append, and a terminator:
    ``>`` to keep stemming after the rule fires or ``.`` to stop.
    """

    # Parses a single rule line. Only the start of the line has to match, so
    # any trailing text after the terminator is ignored.
    rule_expr = re.compile(r"""
    ^(?P<ending>\w+)
    (?P<intact>[*]?)
    (?P<num>\d+)
    (?P<append>\w*)
    (?P<cont>[.>])
    """, re.UNICODE | re.VERBOSE)

    # Extracts the leading run of word characters from the input.
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    stem_expr = re.compile(r"^\w+", re.UNICODE)

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        # Maps the last character of an ending to the rules for that ending,
        # in the order they appear in the rule table.
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parses the rule table and adds its rules to ``self.rules``.

        :param ruletable: a string containing one rule per line. Blank lines
            are skipped.
        :raises ValueError: if a non-blank line is not a valid rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                # ValueError is a subclass of Exception, so callers that
                # catch Exception keep working.
                raise ValueError("Bad rule: %r" % line)

            # Endings are stored reversed in the table; flip them back so
            # they can be tested with str.endswith().
            ending = match.group("ending")[::-1]
            lastchar = ending[-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            rules[lastchar].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Returns the index of the first vowel in the word.

        A "y" counts as a vowel only when it is not the first letter.

        :param word: the string to examine.
        :returns: the index of the first vowel, or -1 if the word contains
            no vowel at all.
        """
        positions = [p for p in (word.find(v) for v in "aeiou") if p > -1]
        yp = word.find("y")

        if not positions:
            # No a/e/i/o/u: calling min() on the empty list would raise, so
            # fall back to a non-initial "y" or report "no vowel".
            return yp if yp > 0 else -1

        vp = min(positions)
        if 0 < yp < vp:
            return yp
        return vp

    def strip_prefix(self, word):
        """Returns the word with the first matching prefix from a fixed list
        removed, or the word unchanged if none of the prefixes match.
        """
        for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
                       "nano", "pico", "pseudo"):
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is stemmed. If the string
        does not start with a word character it is returned unchanged.
        """
        rules = self.rules
        match = self.stem_expr.match(word)
        if not match:
            return word

        token = match.group(0)
        # If the word *is* one of the prefixes, stripping would leave nothing
        # to stem; keep the whole token in that case.
        stem = self.strip_prefix(token) or token

        is_intact = True
        continuing = True
        while continuing:
            pfv = self.first_vowel(stem)
            if pfv < 0:
                # A stem without any vowel is left as it is.
                break

            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                newlen = len(stem) - num + len(append)
                # If word starts with vowel, minimum stem length is 2.
                # If word starts with consonant, minimum stem length is 3.
                if (pfv == 0 and newlen < 2) or (pfv > 0 and newlen < 3):
                    continue

                is_intact = False
                # Slice with an explicit end index: ``stem[:0 - num]`` turns
                # into ``stem[:0]`` (empty string) for the num == 0 "protect"
                # rules, which would wipe out the word.
                stem = stem[:max(len(stem) - num, 0)] + append
                continuing = cont
                break

        return stem
# The default rules for the Paice-Husk stemming algorithm.
#
# One rule per line, parsed by PaiceHuskStemmer.rule_expr. Reading a rule from
# left to right:
#   - the ending to match, spelled backwards (e.g. "gni" matches "-ing")
#   - an optional "*": the rule only applies if no rule has fired yet
#   - the number of characters to remove from the end of the word
#   - an optional string to append after removal
#   - ">" to continue stemming with the new stem, or "." to stop
# The parser only matches the start of each line, so the "{ ... }" text that
# follows each rule is a human-readable note and is not interpreted.
defaultrules = """
ai*2. { -ia > - if intact }
a*1. { -a > - if intact }
bb1. { -bb > -b }
city3s. { -ytic > -ys }
ci2> { -ic > - }
cn1t> { -nc > -nt }
dd1. { -dd > -d }
dei3y> { -ied > -y }
deec2ss. { -ceed > -cess }
dee1. { -eed > -ee }
de2> { -ed > - }
dooh4> { -hood > - }
e1> { -e > - }
feil1v. { -lief > -liev }
fi2> { -if > - }
gni3> { -ing > - }
gai3y. { -iag > -y }
ga2> { -ag > - }
gg1. { -gg > -g }
ht*2. { -th > - if intact }
hsiug5ct. { -guish > -ct }
hsi3> { -ish > - }
i*1. { -i > - if intact }
i1y> { -i > -y }
ji1d. { -ij > -id -- see nois4j> & vis3j> }
juf1s. { -fuj > -fus }
ju1d. { -uj > -ud }
jo1d. { -oj > -od }
jeh1r. { -hej > -her }
jrev1t. { -verj > -vert }
jsim2t. { -misj > -mit }
jn1d. { -nj > -nd }
j1s. { -j > -s }
lbaifi6. { -ifiabl > - }
lbai4y. { -iabl > -y }
lba3> { -abl > - }
lbi3. { -ibl > - }
lib2l> { -bil > -bl }
lc1. { -cl > c }
lufi4y. { -iful > -y }
luf3> { -ful > - }
lu2. { -ul > - }
lai3> { -ial > - }
lau3> { -ual > - }
la2> { -al > - }
ll1. { -ll > -l }
mui3. { -ium > - }
mu*2. { -um > - if intact }
msi3> { -ism > - }
mm1. { -mm > -m }
nois4j> { -sion > -j }
noix4ct. { -xion > -ct }
noi3> { -ion > - }
nai3> { -ian > - }
na2> { -an > - }
nee0. { protect -een }
ne2> { -en > - }
nn1. { -nn > -n }
pihs4> { -ship > - }
pp1. { -pp > -p }
re2> { -er > - }
rae0. { protect -ear }
ra2. { -ar > - }
ro2> { -or > - }
ru2> { -ur > - }
rr1. { -rr > -r }
rt1> { -tr > -t }
rei3y> { -ier > -y }
sei3y> { -ies > -y }
sis2. { -sis > -s }
si2> { -is > - }
ssen4> { -ness > - }
ss0. { protect -ss }
suo3> { -ous > - }
su*2. { -us > - if intact }
s*1> { -s > - if intact }
s0. { -s > -s }
tacilp4y. { -plicat > -ply }
ta2> { -at > - }
tnem4> { -ment > - }
tne3> { -ent > - }
tna3> { -ant > - }
tpir2b. { -ript > -rib }
tpro2b. { -orpt > -orb }
tcud1. { -duct > -duc }
tpmus2. { -sumpt > -sum }
tpec2iv. { -cept > -ceiv }
tulo2v. { -olut > -olv }
tsis0. { protect -sist }
tsi3> { -ist > - }
tt1. { -tt > -t }
uqi3. { -iqu > - }
ugo1. { -ogu > -og }
vis3j> { -siv > -j }
vie0. { protect -eiv }
vi2> { -iv > - }
ylb1> { -bly > -bl }
yli3y> { -ily > -y }
ylp0. { protect -ply }
yl2> { -ly > - }
ygo1. { -ogy > -og }
yhp1. { -phy > -ph }
ymo1. { -omy > -om }
ypo1. { -opy > -op }
yti3> { -ity > - }
yte3> { -ety > - }
ytl2. { -lty > -l }
yrtsi5. { -istry > - }
yra3> { -ary > - }
yro3> { -ory > - }
yfi3. { -ify > - }
ycn2t> { -ncy > -nt }
yca3> { -acy > - }
zi2> { -iz > - }
zy1s. { -yz > -ys }
"""
# Make the standard rules available as a module-level function: ``stem`` is
# the bound ``stem`` method of a stemmer built once, at import time, from the
# default rule table above.
stem = PaiceHuskStemmer(defaultrules).stem