
/nltk/stem/lancaster.py

https://github.com/haewoon/nltk
Possible License(s): Apache-2.0
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A word stemmer based on the Lancaster stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
import re

from api import StemmerI

class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      # letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
    """
    # The rule list is static since it doesn't change between instances
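    # Each rule string is read as follows: the word ending spelled in
    # reverse, an optional "*" meaning the rule applies only to an intact
    # (not yet stemmed) word, a digit giving how many characters to remove,
    # optional replacement letters to append, and a final ">" (keep
    # stemming) or "." (stop).  For example, "nois4j>" strips "-sion",
    # appends "j" and continues; "mu*2." strips "-um" from intact words
    # and stops.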
    rule_tuple = (
        "ai*2.",     # -ia > -   if intact
        "a*1.",      # -a > -    if intact
        "bb1.",      # -bb > -b
        "city3s.",   # -ytic > -ys
        "ci2>",      # -ic > -
        "cn1t>",     # -nc > -nt
        "dd1.",      # -dd > -d
        "dei3y>",    # -ied > -y
        "deec2ss.",  # -ceed > -cess
        "dee1.",     # -eed > -ee
        "de2>",      # -ed > -
        "dooh4>",    # -hood > -
        "e1>",       # -e > -
        "feil1v.",   # -lief > -liev
        "fi2>",      # -if > -
        "gni3>",     # -ing > -
        "gai3y.",    # -iag > -y
        "ga2>",      # -ag > -
        "gg1.",      # -gg > -g
        "ht*2.",     # -th > -   if intact
        "hsiug5ct.", # -guish > -ct
        "hsi3>",     # -ish > -
        "i*1.",      # -i > -    if intact
        "i1y>",      # -i > -y
        "ji1d.",     # -ij > -id -- see nois4j> & vis3j>
        "juf1s.",    # -fuj > -fus
        "ju1d.",     # -uj > -ud
        "jo1d.",     # -oj > -od
        "jeh1r.",    # -hej > -her
        "jrev1t.",   # -verj > -vert
        "jsim2t.",   # -misj > -mit
        "jn1d.",     # -nj > -nd
        "j1s.",      # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",   # -iabl > -y
        "lba3>",     # -abl > -
        "lbi3.",     # -ibl > -
        "lib2l>",    # -bil > -bl
        "lc1.",      # -cl > -c
        "lufi4y.",   # -iful > -y
        "luf3>",     # -ful > -
        "lu2.",      # -ul > -
        "lai3>",     # -ial > -
        "lau3>",     # -ual > -
        "la2>",      # -al > -
        "ll1.",      # -ll > -l
        "mui3.",     # -ium > -
        "mu*2.",     # -um > -   if intact
        "msi3>",     # -ism > -
        "mm1.",      # -mm > -m
        "nois4j>",   # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",     # -ion > -
        "nai3>",     # -ian > -
        "na2>",      # -an > -
        "nee0.",     # protect -een
        "ne2>",      # -en > -
        "nn1.",      # -nn > -n
        "pihs4>",    # -ship > -
        "pp1.",      # -pp > -p
        "re2>",      # -er > -
        "rae0.",     # protect -ear
        "ra2.",      # -ar > -
        "ro2>",      # -or > -
        "ru2>",      # -ur > -
        "rr1.",      # -rr > -r
        "rt1>",      # -tr > -t
        "rei3y>",    # -ier > -y
        "sei3y>",    # -ies > -y
        "sis2.",     # -sis > -s
        "si2>",      # -is > -
        "ssen4>",    # -ness > -
        "ss0.",      # protect -ss
        "suo3>",     # -ous > -
        "su*2.",     # -us > -   if intact
        "s*1>",      # -s > -    if intact
        "s0.",       # -s > -s
        "tacilp4y.", # -plicat > -ply
        "ta2>",      # -at > -
        "tnem4>",    # -ment > -
        "tne3>",     # -ent > -
        "tna3>",     # -ant > -
        "tpir2b.",   # -ript > -rib
        "tpro2b.",   # -orpt > -orb
        "tcud1.",    # -duct > -duc
        "tpmus2.",   # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",   # -olut > -olv
        "tsis0.",    # protect -sist
        "tsi3>",     # -ist > -
        "tt1.",      # -tt > -t
        "uqi3.",     # -iqu > -
        "ugo1.",     # -ogu > -og
        "vis3j>",    # -siv > -j
        "vie0.",     # protect -eiv
        "vi2>",      # -iv > -
        "ylb1>",     # -bly > -bl
        "yli3y>",    # -ily > -y
        "ylp0.",     # protect -ply
        "yl2>",      # -ly > -
        "ygo1.",     # -ogy > -og
        "yhp1.",     # -phy > -ph
        "ymo1.",     # -omy > -om
        "ypo1.",     # -opy > -op
        "yti3>",     # -ity > -
        "yte3>",     # -ety > -
        "ytl2.",     # -lty > -l
        "yrtsi5.",   # -istry > -
        "yra3>",     # -ary > -
        "yro3>",     # -ory > -
        "yfi3.",     # -ify > -
        "ycn2t>",    # -ncy > -nt
        "yca3>",     # -acy > -
        "zi2>",      # -iz > -
        "zy1s."      # -yz > -ys
    )

    def __init__(self):
        """Create an instance of the Lancaster stemmer.
        """
        # Set up an empty rule dictionary - this will be filled in later
        self.rule_dictionary = {}
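        # The dictionary maps a word's final letter to the list of rules
        # whose reversed ending starts with that letter; it is filled in
        # lazily the first time stem() is called.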

    def parseRules(self, rule_tuple):
        """Validate the set of rules used in this stemmer.
        """
        valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$")
        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not valid_rule.match(rule):
                raise ValueError("The rule %s is invalid" % rule)
            first_letter = rule[0:1]
            if first_letter in self.rule_dictionary:
                self.rule_dictionary[first_letter].append(rule)
            else:
                self.rule_dictionary[first_letter] = [rule]

    def stem(self, word):
        """Stem a word using the Lancaster stemmer.
        """
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()

        # Save a copy of the original word
        intact_word = word

        # If the user hasn't supplied any rules, set up the default rules
        if len(self.rule_dictionary) == 0:
            self.parseRules(LancasterStemmer.rule_tuple)

        return self.__doStemming(word, intact_word)
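
    # The private stemming loop below repeatedly looks up rules keyed by the
    # word's final letter and applies the first acceptable match; it stops
    # when an applied rule ends in ".", when no rule applies, or when the
    # word's final letter has no rules at all.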
    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming
        """
        valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
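        # Regex groups: reversed ending, intact flag, number of characters
        # to remove, replacement string, continuation flag ("." or ">")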

        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule matching that last letter
            if last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary:
                proceed = False
            else:
                rule_was_applied = False

                # Go through each rule that matches the word's final letter
                for rule in self.rule_dictionary[word[last_letter_position]]:
                    rule_match = valid_rule.match(rule)
                    if rule_match:
                        (ending_string,
                         intact_flag,
                         remove_total,
                         append_string,
                         cont_flag) = rule_match.groups()

                        # Convert the number of chars to remove when stemming
                        # from a string to an integer
                        remove_total = int(remove_total)

                        # Proceed if word's ending matches rule's word ending
                        if word.endswith(ending_string[::-1]):
                            if intact_flag:
                                if (word == intact_word and
                                    self.__isAcceptable(word, remove_total)):
                                    word = self.__applyRule(word,
                                                            remove_total,
                                                            append_string)
                                    rule_was_applied = True
                                    if cont_flag == '.':
                                        proceed = False
                                    break
                            elif self.__isAcceptable(word, remove_total):
                                word = self.__applyRule(word,
                                                        remove_total,
                                                        append_string)
                                rule_was_applied = True
                                if cont_flag == '.':
                                    proceed = False
                                break

                # If no rules apply, the word doesn't need any more stemming
                if not rule_was_applied:
                    proceed = False

        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last alphabetic character in this string
        """
        last_letter = -1
        for position in range(len(word)):
            if word[position].isalpha():
                last_letter = position
            else:
                break

        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming.
        """
        word_is_acceptable = False

        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in "aeiouy":
            if (len(word) - remove_total >= 2):
                word_is_acceptable = True
        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed
        elif (len(word) - remove_total >= 3):
            if word[1] in "aeiouy":
                word_is_acceptable = True
            elif word[2] in "aeiouy":
                word_is_acceptable = True

        return word_is_acceptable

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word
        """
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string

        return word

    def __repr__(self):
        return '<LancasterStemmer>'

if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)