PageRenderTime 39ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/src/whoosh/lang/paicehusk.py

https://bitbucket.org/rayleyva/whoosh
Python | 242 lines | 236 code | 2 blank | 4 comment | 2 complexity | 0aced1dabd91a36353c2996bb5a344d9 MD5 | raw file
Possible License(s): Apache-2.0
  1. """This module contains an object that implements the Paice-Husk stemming
  2. algorithm.
  3. If you just want to use the standard Paice-Husk stemming rules, use the
  4. module's ``stem()`` function::
  5. stemmed_word = stem(word)
  6. If you want to use a custom rule set, read the rules into a string where the
  7. rules are separated by newlines, and instantiate the object with the string,
  8. then use the object's stem method to stem words::
  9. stemmer = PaiceHuskStemmer(my_rules_string)
  10. stemmed_word = stemmer.stem(word)
  11. """
  12. import re
  13. from collections import defaultdict
  14. class PaiceHuskStemmer(object):
  15. """Implements the Paice-Husk stemming algorithm.
  16. """
  17. rule_expr = re.compile(r"""
  18. ^(?P<ending>\w+)
  19. (?P<intact>[*]?)
  20. (?P<num>\d+)
  21. (?P<append>\w*)
  22. (?P<cont>[.>])
  23. """, re.UNICODE | re.VERBOSE)
  24. stem_expr = re.compile("^\w+", re.UNICODE)
  25. def __init__(self, ruletable):
  26. """
  27. :param ruletable: a string containing the rule data, separated
  28. by newlines.
  29. """
  30. self.rules = defaultdict(list)
  31. self.read_rules(ruletable)
  32. def read_rules(self, ruletable):
  33. rule_expr = self.rule_expr
  34. rules = self.rules
  35. for line in ruletable.split("\n"):
  36. line = line.strip()
  37. if not line:
  38. continue
  39. match = rule_expr.match(line)
  40. if match:
  41. ending = match.group("ending")[::-1]
  42. lastchar = ending[-1]
  43. intact = match.group("intact") == "*"
  44. num = int(match.group("num"))
  45. append = match.group("append")
  46. cont = match.group("cont") == ">"
  47. rules[lastchar].append((ending, intact, num, append, cont))
  48. else:
  49. raise Exception("Bad rule: %r" % line)
  50. def first_vowel(self, word):
  51. vp = min([p for p in [word.find(v) for v in "aeiou"]
  52. if p > -1])
  53. yp = word.find("y")
  54. if yp > 0 and yp < vp:
  55. return yp
  56. return vp
  57. def strip_prefix(self, word):
  58. for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
  59. "nano", "pico", "pseudo"):
  60. if word.startswith(prefix):
  61. return word[len(prefix):]
  62. return word
  63. def stem(self, word):
  64. """Returns a stemmed version of the argument string.
  65. """
  66. rules = self.rules
  67. match = self.stem_expr.match(word)
  68. if not match:
  69. return word
  70. stem = self.strip_prefix(match.group(0))
  71. is_intact = True
  72. continuing = True
  73. while continuing:
  74. pfv = self.first_vowel(stem)
  75. rulelist = rules.get(stem[-1])
  76. if not rulelist:
  77. break
  78. continuing = False
  79. for ending, intact, num, append, cont in rulelist:
  80. if stem.endswith(ending):
  81. if intact and not is_intact:
  82. continue
  83. newlen = len(stem) - num + len(append)
  84. if ((pfv == 0 and newlen < 2)
  85. or (pfv > 0 and newlen < 3)):
  86. # If word starts with vowel, minimum stem length is 2.
  87. # If word starts with consonant, minimum stem length is
  88. # 3.
  89. continue
  90. is_intact = False
  91. stem = stem[:0 - num] + append
  92. continuing = cont
  93. break
  94. return stem
  95. # The default rules for the Paice-Husk stemming algorithm
  96. defaultrules = """
  97. ai*2. { -ia > - if intact }
  98. a*1. { -a > - if intact }
  99. bb1. { -bb > -b }
  100. city3s. { -ytic > -ys }
  101. ci2> { -ic > - }
  102. cn1t> { -nc > -nt }
  103. dd1. { -dd > -d }
  104. dei3y> { -ied > -y }
  105. deec2ss. { -ceed > -cess }
  106. dee1. { -eed > -ee }
  107. de2> { -ed > - }
  108. dooh4> { -hood > - }
  109. e1> { -e > - }
  110. feil1v. { -lief > -liev }
  111. fi2> { -if > - }
  112. gni3> { -ing > - }
  113. gai3y. { -iag > -y }
  114. ga2> { -ag > - }
  115. gg1. { -gg > -g }
  116. ht*2. { -th > - if intact }
  117. hsiug5ct. { -guish > -ct }
  118. hsi3> { -ish > - }
  119. i*1. { -i > - if intact }
  120. i1y> { -i > -y }
  121. ji1d. { -ij > -id -- see nois4j> & vis3j> }
  122. juf1s. { -fuj > -fus }
  123. ju1d. { -uj > -ud }
  124. jo1d. { -oj > -od }
  125. jeh1r. { -hej > -her }
  126. jrev1t. { -verj > -vert }
  127. jsim2t. { -misj > -mit }
  128. jn1d. { -nj > -nd }
  129. j1s. { -j > -s }
  130. lbaifi6. { -ifiabl > - }
  131. lbai4y. { -iabl > -y }
  132. lba3> { -abl > - }
  133. lbi3. { -ibl > - }
  134. lib2l> { -bil > -bl }
  135. lc1. { -cl > c }
  136. lufi4y. { -iful > -y }
  137. luf3> { -ful > - }
  138. lu2. { -ul > - }
  139. lai3> { -ial > - }
  140. lau3> { -ual > - }
  141. la2> { -al > - }
  142. ll1. { -ll > -l }
  143. mui3. { -ium > - }
  144. mu*2. { -um > - if intact }
  145. msi3> { -ism > - }
  146. mm1. { -mm > -m }
  147. nois4j> { -sion > -j }
  148. noix4ct. { -xion > -ct }
  149. noi3> { -ion > - }
  150. nai3> { -ian > - }
  151. na2> { -an > - }
  152. nee0. { protect -een }
  153. ne2> { -en > - }
  154. nn1. { -nn > -n }
  155. pihs4> { -ship > - }
  156. pp1. { -pp > -p }
  157. re2> { -er > - }
  158. rae0. { protect -ear }
  159. ra2. { -ar > - }
  160. ro2> { -or > - }
  161. ru2> { -ur > - }
  162. rr1. { -rr > -r }
  163. rt1> { -tr > -t }
  164. rei3y> { -ier > -y }
  165. sei3y> { -ies > -y }
  166. sis2. { -sis > -s }
  167. si2> { -is > - }
  168. ssen4> { -ness > - }
  169. ss0. { protect -ss }
  170. suo3> { -ous > - }
  171. su*2. { -us > - if intact }
  172. s*1> { -s > - if intact }
  173. s0. { -s > -s }
  174. tacilp4y. { -plicat > -ply }
  175. ta2> { -at > - }
  176. tnem4> { -ment > - }
  177. tne3> { -ent > - }
  178. tna3> { -ant > - }
  179. tpir2b. { -ript > -rib }
  180. tpro2b. { -orpt > -orb }
  181. tcud1. { -duct > -duc }
  182. tpmus2. { -sumpt > -sum }
  183. tpec2iv. { -cept > -ceiv }
  184. tulo2v. { -olut > -olv }
  185. tsis0. { protect -sist }
  186. tsi3> { -ist > - }
  187. tt1. { -tt > -t }
  188. uqi3. { -iqu > - }
  189. ugo1. { -ogu > -og }
  190. vis3j> { -siv > -j }
  191. vie0. { protect -eiv }
  192. vi2> { -iv > - }
  193. ylb1> { -bly > -bl }
  194. yli3y> { -ily > -y }
  195. ylp0. { protect -ply }
  196. yl2> { -ly > - }
  197. ygo1. { -ogy > -og }
  198. yhp1. { -phy > -ph }
  199. ymo1. { -omy > -om }
  200. ypo1. { -opy > -op }
  201. yti3> { -ity > - }
  202. yte3> { -ety > - }
  203. ytl2. { -lty > -l }
  204. yrtsi5. { -istry > - }
  205. yra3> { -ary > - }
  206. yro3> { -ory > - }
  207. yfi3. { -ify > - }
  208. ycn2t> { -ncy > -nt }
  209. yca3> { -acy > - }
  210. zi2> { -iz > - }
  211. zy1s. { -yz > -ys }
  212. """
  213. # Make the standard rules available as a module-level function
  214. stem = PaiceHuskStemmer(defaultrules).stem