PageRenderTime 47ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk/nltk/stem/isri.py

http://nltk.googlecode.com/
Python | 310 lines | 268 code | 2 blank | 40 comment | 0 complexity | 2d1715bb06fa1353071397eed7474467 MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Natural Language Toolkit: The ISRI Arabic Stemmer
  4. #
# Copyright (C) 2001-2011 NLTK Project
  6. # Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
  7. # Author: Hosam Algasaier <hosam_hme@yahoo.com>
  8. # URL: <http://www.nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. """ISRI Arabic Stemmer
  11. The algorithm for this stemmer is described in:
Taghva, K., Elkhoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary.
  13. Information Science Research Institute. University of Nevada, Las Vegas, USA.
  14. The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features
  15. with the Khoja stemmer. However, the main difference is that ISRI stemmer does not use root
  16. dictionary. Also, if a root is not found, ISRI stemmer returned normalized form, rather than
  17. returning the original unmodified word.
  18. Additional adjustments were made to improve the algorithm:
  19. 1- Adding 60 stop words.
2- Adding the pattern (تفاعيل) to ISRI pattern set.
  21. 3- The step 2 in the original algorithm was normalizing all hamza. This step is discarded because it
  22. increases the word ambiguities and changes the original root.
  23. """
  24. import re
  25. from api import *
  26. class ISRIStemmer(StemmerI):
  27. '''
  28. ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary.
  29. Information Science Research Institute. University of Nevada, Las Vegas, USA.
  30. A few minor modifications have been made to ISRI basic algorithm.
  31. See the source code of this module for more information.
  32. isri.stem(token) returns Arabic root for the given token.
  33. The ISRI Stemmer requires that all tokens have Unicode string types.
  34. If you use Python IDLE on Arabic Windows you have to decode text first
  35. using Arabic '1256' coding.
  36. '''
  37. def __init__(self):
  38. self.stm = 'defult none'
  39. self.p3 = [u'\u0643\u0627\u0644', u'\u0628\u0627\u0644', u'\u0648\u0644\u0644', u'\u0648\u0627\u0644'] # length three prefixes
  40. self.p2 = [u'\u0627\u0644', u'\u0644\u0644'] # length two prefixes
  41. self.p1 = [u'\u0644', u'\u0628', u'\u0641', u'\u0633', u'\u0648', u'\u064a', u'\u062a', u'\u0646', u'\u0627'] # length one prefixes
  42. self.s3 = [u'\u062a\u0645\u0644', u'\u0647\u0645\u0644', u'\u062a\u0627\u0646', u'\u062a\u064a\u0646', u'\u0643\u0645\u0644'] # length three suffixes
  43. self.s2 = [u'\u0648\u0646', u'\u0627\u062a', u'\u0627\u0646', u'\u064a\u0646', u'\u062a\u0646', u'\u0643\u0645', u'\u0647\u0646', u'\u0646\u0627', u'\u064a\u0627', u'\u0647\u0627', u'\u062a\u0645', u'\u0643\u0646', u'\u0646\u064a', u'\u0648\u0627', u'\u0645\u0627', u'\u0647\u0645'] # length two suffixes
  44. self.s1 = [u'\u0629', u'\u0647', u'\u064a', u'\u0643', u'\u062a', u'\u0627', u'\u0646'] # length one suffixes
  45. self.pr4 = {0:[u'\u0645'], 1:[u'\u0627'], 2:[u'\u0627', u'\u0648', u'\u064A'], 3:[u'\u0629']} # groups of length four patterns
  46. self.pr53 = {0:[u'\u0627', u'\u062a'], 1:[u'\u0627', u'\u064a', u'\u0648'], 2:[u'\u0627', u'\u062a', u'\u0645'], 3:[u'\u0645', u'\u064a', u'\u062a'], 4:[u'\u0645', u'\u062a'], 5:[u'\u0627', u'\u0648'], 6:[u'\u0627', u'\u0645']} # Groups of length five patterns and length three roots
  47. self.re_short_vowels = re.compile(ur'[\u064B-\u0652]')
  48. self.re_hamza = re.compile(ur'[\u0621\u0624\u0626]')
  49. self.re_intial_hamza = re.compile(ur'^[\u0622\u0623\u0625]')
  50. self.stop_words = [u'\u064a\u0643\u0648\u0646', u'\u0648\u0644\u064a\u0633', u'\u0648\u0643\u0627\u0646', u'\u0643\u0630\u0644\u0643', u'\u0627\u0644\u062a\u064a', u'\u0648\u0628\u064a\u0646', u'\u0639\u0644\u064a\u0647\u0627', u'\u0645\u0633\u0627\u0621', u'\u0627\u0644\u0630\u064a', u'\u0648\u0643\u0627\u0646\u062a', u'\u0648\u0644\u0643\u0646', u'\u0648\u0627\u0644\u062a\u064a', u'\u062a\u0643\u0648\u0646', u'\u0627\u0644\u064a\u0648\u0645', u'\u0627\u0644\u0644\u0630\u064a\u0646', u'\u0639\u0644\u064a\u0647', u'\u0643\u0627\u0646\u062a', u'\u0644\u0630\u0644\u0643', u'\u0623\u0645\u0627\u0645', u'\u0647\u0646\u0627\u0643', u'\u0645\u0646\u0647\u0627', u'\u0645\u0627\u0632\u0627\u0644', u'\u0644\u0627\u0632\u0627\u0644', u'\u0644\u0627\u064a\u0632\u0627\u0644', u'\u0645\u0627\u064a\u0632\u0627\u0644', u'\u0627\u0635\u0628\u062d', u'\u0623\u0635\u0628\u062d', u'\u0623\u0645\u0633\u0649', u'\u0627\u0645\u0633\u0649', u'\u0623\u0636\u062d\u0649', u'\u0627\u0636\u062d\u0649', u'\u0645\u0627\u0628\u0631\u062d', u'\u0645\u0627\u0641\u062a\u0626', u'\u0645\u0627\u0627\u0646\u0641\u0643', u'\u0644\u0627\u0633\u064a\u0645\u0627', u'\u0648\u0644\u0627\u064a\u0632\u0627\u0644', u'\u0627\u0644\u062d\u0627\u0644\u064a', u'\u0627\u0644\u064a\u0647\u0627', u'\u0627\u0644\u0630\u064a\u0646', u'\u0641\u0627\u0646\u0647', u'\u0648\u0627\u0644\u0630\u064a', u'\u0648\u0647\u0630\u0627', u'\u0644\u0647\u0630\u0627', u'\u0641\u0643\u0627\u0646', u'\u0633\u062a\u0643\u0648\u0646', u'\u0627\u0644\u064a\u0647', u'\u064a\u0645\u0643\u0646', u'\u0628\u0647\u0630\u0627', u'\u0627\u0644\u0630\u0649']
  51. def stem(self, token):
  52. """
  53. Stemming a word token using the ISRI stemmer.
  54. """
  55. self.stm = token
  56. self.norm(1) # remove diacritics which representing Arabic short vowels
  57. if self.stm in self.stop_words: return self.stm # exclude stop words from being processed
  58. self.pre32() # remove length three and length two prefixes in this order
  59. self.suf32() # remove length three and length two suffixes in this order
  60. self.waw() # remove connective ‘?’ if it precedes a word beginning with ‘?’
  61. self.norm(2) # normalize initial hamza to bare alif
  62. if len(self.stm)<=3: return self.stm # return stem if less than or equal to three
  63. if len(self.stm)==4: # length 4 word
  64. self.pro_w4()
  65. return self.stm
  66. elif len(self.stm)==5: # length 5 word
  67. self.pro_w53()
  68. self.end_w5()
  69. return self.stm
  70. elif len(self.stm)==6: # length 6 word
  71. self.pro_w6()
  72. self.end_w6()
  73. return self.stm
  74. elif len(self.stm)==7: # length 7 word
  75. self.suf1()
  76. if len(self.stm)==7:
  77. self.pre1()
  78. if len(self.stm)==6:
  79. self.pro_w6()
  80. self.end_w6()
  81. return self.stm
  82. return self.stm # if word length >7 , then no stemming
  83. def norm(self, num):
  84. """
  85. normalization:
  86. num=1 normalize diacritics
  87. num=2 normalize initial hamza
  88. num=3 both 1&2
  89. """
  90. self.k = num
  91. if self.k == 1:
  92. self.stm = self.re_short_vowels.sub('', self.stm)
  93. return self.stm
  94. elif self.k == 2:
  95. self.stm = self.re_intial_hamza.sub(ur'\u0627',self.stm)
  96. return self.stm
  97. elif self.k == 3:
  98. self.stm = self.re_short_vowels.sub('', self.stm)
  99. self.stm = self.re_intial_hamza.sub(ur'\u0627',self.stm)
  100. return self.stm
  101. def pre32(self):
  102. """remove length three and length two prefixes in this order"""
  103. if len(self.stm)>=6:
  104. for pre3 in self.p3:
  105. if self.stm.startswith(pre3):
  106. self.stm = self.stm[3:]
  107. return self.stm
  108. elif len(self.stm)>=5:
  109. for pre2 in self.p2:
  110. if self.stm.startswith(pre2):
  111. self.stm = self.stm[2:]
  112. return self.stm
  113. def suf32(self):
  114. """remove length three and length two suffixes in this order"""
  115. if len(self.stm)>=6:
  116. for suf3 in self.s3:
  117. if self.stm.endswith(suf3):
  118. self.stm = self.stm[:-3]
  119. return self.stm
  120. elif len(self.stm)>=5:
  121. for suf2 in self.s2:
  122. if self.stm.endswith(suf2):
  123. self.stm = self.stm[:-2]
  124. return self.stm
  125. def waw(self):
  126. """remove connective ‘?’ if it precedes a word beginning with ‘?’ """
  127. if (len(self.stm)>=4)&(self.stm[:2] == u'\u0648\u0648'):
  128. self.stm = self.stm[1:]
  129. return self.stm
  130. def pro_w4(self):
  131. """process length four patterns and extract length three roots"""
  132. if self.stm[0] in self.pr4[0]: # ????
  133. self.stm = self.stm[1:]
  134. return self.stm
  135. elif self.stm[1] in self.pr4[1]: # ????
  136. self.stm = self.stm[0]+self.stm[2:]
  137. return self.stm
  138. elif self.stm[2] in self.pr4[2]: # ???? - ???? - ????
  139. self.stm = self.stm[:2]+self.stm[3]
  140. return self.stm
  141. elif self.stm[3] in self.pr4[3]: # ????
  142. self.stm = self.stm[:-1]
  143. return self.stm
  144. else:
  145. self.suf1() # do - normalize short sufix
  146. if len(self.stm)==4:
  147. self.pre1() # do - normalize short prefix
  148. return self.stm
    def pro_w53(self):
        """process length five patterns and extract length three roots"""
        # Each branch recognizes one five-letter pattern by augment letters
        # at fixed positions and keeps the three presumed root letters.
        # Branch ORDER matters: the first matching pattern wins.
        # NOTE: `&` (bitwise and) is used on two side-effect-free boolean
        # comparisons, which gives the same result as `and` here.
        if ((self.stm[2] in self.pr53[0]) & (self.stm[0] == u'\u0627')):  # alif at 0, group-0 letter at 2: keep 1,3,4
            self.stm = self.stm[1]+self.stm[3:]
            return self.stm
        elif ((self.stm[3] in self.pr53[1]) & (self.stm[0] == u'\u0645')):  # mim at 0, group-1 letter at 3: keep 1,2,4
            self.stm = self.stm[1:3]+self.stm[4]
            return self.stm
        elif ((self.stm[0] in self.pr53[2]) & (self.stm[4] == u'\u0629')):  # group-2 letter at 0, ta marbuta at 4: keep 1,2,3
            self.stm = self.stm[1:4]
            return self.stm
        elif ((self.stm[0] in self.pr53[3]) & (self.stm[2] == u'\u062a')):  # group-3 letter at 0, ta at 2: keep 1,3,4
            self.stm = self.stm[1]+self.stm[3:]
            return self.stm
        elif ((self.stm[0] in self.pr53[4]) & (self.stm[2] == u'\u0627')):  # group-4 letter at 0, alif at 2: keep 1,3,4
            self.stm = self.stm[1]+self.stm[3:]
            return self.stm
        elif ((self.stm[2] in self.pr53[5]) & (self.stm[4] == u'\u0629')):  # group-5 letter at 2, ta marbuta at 4: keep 0,1,3
            self.stm = self.stm[:2]+self.stm[3]
            return self.stm
        elif ((self.stm[0] in self.pr53[6]) & (self.stm[1] == u'\u0646')):  # group-6 letter at 0, nun at 1: keep 2,3,4
            self.stm = self.stm[2:]
            return self.stm
        elif ((self.stm[3] == u'\u0627') & (self.stm[0] == u'\u0627')):  # alif at 0 and 3: keep 1,2,4
            self.stm = self.stm[1:3]+self.stm[4]
            return self.stm
        elif ((self.stm[4] == u'\u0646') & (self.stm[3] == u'\u0627')):  # alif at 3, nun at 4: keep 0,1,2
            self.stm = self.stm[:3]
            return self.stm
        elif ((self.stm[3] == u'\u064a') & (self.stm[0] == u'\u062a')):  # ta at 0, ya at 3: keep 1,2,4
            self.stm = self.stm[1:3]+self.stm[4]
            return self.stm
        elif ((self.stm[3] == u'\u0648') & (self.stm[1] == u'\u0627')):  # alif at 1, waw at 3: keep 0,2,4
            self.stm = self.stm[0]+self.stm[2]+self.stm[4]
            return self.stm
        elif ((self.stm[2] == u'\u0627') & (self.stm[1] == u'\u0648')):  # waw at 1, alif at 2: keep 0,3,4
            self.stm = self.stm[0]+self.stm[3:]
            return self.stm
        elif ((self.stm[3] == u'\u0626') & (self.stm[2] == u'\u0627')):  # alif at 2, hamza-on-ya at 3: keep 0,1,4
            self.stm = self.stm[:2]+self.stm[4]
            return self.stm
        elif ((self.stm[4] == u'\u0629') & (self.stm[1] == u'\u0627')):  # alif at 1, ta marbuta at 4: keep 0,2,3
            self.stm = self.stm[0]+self.stm[2:4]
            return self.stm
        elif ((self.stm[4] == u'\u064a') & (self.stm[2] == u'\u0627')):  # alif at 2, ya at 4: keep 0,1,3
            self.stm = self.stm[:2]+self.stm[3]
            return self.stm
        else:
            # no pattern matched: normalize a short suffix, then (if still
            # length five) a short prefix
            self.suf1()
            if len(self.stm)==5:
                self.pre1()
            return self.stm
  201. def pro_w54(self):
  202. """process length five patterns and extract length four roots"""
  203. if (self.stm[0] in self.pr53[2]): #????? - ????? - ?????
  204. self.stm = self.stm[1:]
  205. return self.stm
  206. elif (self.stm[4] == u'\u0629'): # ?????
  207. self.stm = self.stm[:4]
  208. return self.stm
  209. elif (self.stm[2] == u'\u0627'): # ?????
  210. self.stm = self.stm[:2]+self.stm[3:]
  211. return self.stm
  212. def end_w5(self):
  213. """ending step (word of length five)"""
  214. if len(self.stm)==3:
  215. return self.stm
  216. elif len(self.stm)==4:
  217. self.pro_w4()
  218. return self.stm
  219. elif len(self.stm)==5:
  220. self.pro_w54()
  221. return self.stm
  222. def pro_w6(self):
  223. """process length six patterns and extract length three roots"""
  224. if ((self.stm.startswith(u'\u0627\u0633\u062a')) or (self.stm.startswith(u'\u0645\u0633\u062a'))): # ?????? - ??????
  225. self.stm= self.stm[3:]
  226. return self.stm
  227. elif (self.stm[0]== u'\u0645' and self.stm[3]== u'\u0627' and self.stm[5]== u'\u0629'): # ??????
  228. self.stm = self.stm[1:3]+self.stm[4]
  229. return self.stm
  230. elif (self.stm[0]== u'\u0627' and self.stm[2]== u'\u062a' and self.stm[4]== u'\u0627'): # ??????
  231. self.stm = self.stm[1]+self.stm[3]+self.stm[5]
  232. return self.stm
  233. elif (self.stm[0]== u'\u0627' and self.stm[3]== u'\u0648' and self.stm[2]==self.stm[4]): # ??????
  234. self.stm = self.stm[1]+self.stm[4:]
  235. return self.stm
  236. elif (self.stm[0]== u'\u062a' and self.stm[2]== u'\u0627' and self.stm[4]== u'\u064a'): # ?????? new pattern
  237. self.stm = self.stm[1]+self.stm[3]+self.stm[5]
  238. return self.stm
  239. else:
  240. self.suf1() # do - normalize short sufix
  241. if len(self.stm)==6:
  242. self.pre1() # do - normalize short prefix
  243. return self.stm
  244. def pro_w64(self):
  245. """process length six patterns and extract length four roots"""
  246. if (self.stm[0] and self.stm[4])==u'\u0627': # ??????
  247. self.stm=self.stm[1:4]+self.stm[5]
  248. return self.stm
  249. elif (self.stm.startswith(u'\u0645\u062a')): # ??????
  250. self.stm = self.stm[2:]
  251. return self.stm
  252. def end_w6(self):
  253. """ending step (word of length six)"""
  254. if len(self.stm)==3:
  255. return self.stm
  256. elif len(self.stm)==5:
  257. self.pro_w53()
  258. self.end_w5()
  259. return self.stm
  260. elif len (self.stm)==6:
  261. self.pro_w64()
  262. return self.stm
  263. def suf1(self):
  264. """normalize short sufix"""
  265. for sf1 in self.s1:
  266. if self.stm.endswith(sf1):
  267. self.stm = self.stm[:-1]
  268. return self.stm
  269. def pre1(self):
  270. """normalize short prefix"""
  271. for sp1 in self.p1:
  272. if self.stm.startswith(sp1):
  273. self.stm = self.stm[1:]
  274. return self.stm