PageRenderTime 46ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/src/whoosh/lang/isri.py

https://bitbucket.org/rayleyva/whoosh
Python | 382 lines | 333 code | 4 blank | 45 comment | 0 complexity | 8b01b5f37ce6deea63db7415cbd45ab0 MD5 | raw file
Possible License(s): Apache-2.0
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Natural Language Toolkit: The ISRI Arabic Stemmer
  4. #
  5. # Copyright (C) 2001-2012 NLTK Project
  6. # Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
  7. # Author: Hosam Algasaier <hosam_hme@yahoo.com>
  8. # URL: <http://www.nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. """
  11. ISRI Arabic Stemmer
  12. The algorithm for this stemmer is described in:
  13. Taghva, K., Elkhoury, R., and Coombs, J. 2005. Arabic Stemming without a root
  14. dictionary. Information Science Research Institute. University of Nevada, Las
  15. Vegas, USA.
  16. The Information Science Research Institute's (ISRI) Arabic stemmer shares many
  17. features with the Khoja stemmer. However, the main difference is that the ISRI
  18. stemmer does not use a root dictionary. Also, if a root is not found, the ISRI
  19. stemmer returns the normalized form, rather than returning the original
  20. unmodified word.
  21. Additional adjustments were made to improve the algorithm:
  22. 1- Adding 60 stop words.
  23. 2- Adding the pattern (تفاعيل) to ISRI pattern set.
  24. 3- The step 2 in the original algorithm was normalizing all hamza. This step is
  25. discarded because it increases the word ambiguities and changes the original
  26. root.
  27. """
  28. from __future__ import unicode_literals
  29. import re
  30. class ISRIStemmer(object):
  31. '''
  32. ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary.
  33. Information Science Research Institute. University of Nevada, Las Vegas, USA.
  34. A few minor modifications have been made to ISRI basic algorithm.
  35. See the source code of this module for more information.
  36. isri.stem(token) returns Arabic root for the given token.
  37. The ISRI Stemmer requires that all tokens have Unicode string types.
  38. If you use Python IDLE on Arabic Windows you have to decode text first
  39. using Arabic '1256' coding.
  40. '''
  41. def __init__(self):
  42. self.stm = 'defult none'
  43. self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644',
  44. '\u0648\u0644\u0644', '\u0648\u0627\u0644'] # length three prefixes
  45. self.p2 = ['\u0627\u0644', '\u0644\u0644'] # length two prefixes
  46. self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648',
  47. '\u064a', '\u062a', '\u0646', '\u0627'] # length one prefixes
  48. self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644',
  49. '\u062a\u0627\u0646', '\u062a\u064a\u0646',
  50. '\u0643\u0645\u0644'] # length three suffixes
  51. self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646',
  52. '\u064a\u0646', '\u062a\u0646', '\u0643\u0645',
  53. '\u0647\u0646', '\u0646\u0627', '\u064a\u0627',
  54. '\u0647\u0627', '\u062a\u0645', '\u0643\u0646',
  55. '\u0646\u064a', '\u0648\u0627', '\u0645\u0627',
  56. '\u0647\u0645'] # length two suffixes
  57. self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a',
  58. '\u0627', '\u0646'] # length one suffixes
  59. self.pr4 = {0: ['\u0645'], 1:['\u0627'],
  60. 2: ['\u0627', '\u0648', '\u064A'], 3:['\u0629']} # groups of length four patterns
  61. self.pr53 = {0: ['\u0627', '\u062a'],
  62. 1: ['\u0627', '\u064a', '\u0648'],
  63. 2: ['\u0627', '\u062a', '\u0645'],
  64. 3: ['\u0645', '\u064a', '\u062a'],
  65. 4: ['\u0645', '\u062a'],
  66. 5: ['\u0627', '\u0648'],
  67. 6: ['\u0627', '\u0645']} # Groups of length five patterns and length three roots
  68. self.re_short_vowels = re.compile(r'[\u064B-\u0652]')
  69. self.re_hamza = re.compile(r'[\u0621\u0624\u0626]')
  70. self.re_intial_hamza = re.compile(r'^[\u0622\u0623\u0625]')
  71. self.stop_words = ['\u064a\u0643\u0648\u0646',
  72. '\u0648\u0644\u064a\u0633',
  73. '\u0648\u0643\u0627\u0646',
  74. '\u0643\u0630\u0644\u0643',
  75. '\u0627\u0644\u062a\u064a',
  76. '\u0648\u0628\u064a\u0646',
  77. '\u0639\u0644\u064a\u0647\u0627',
  78. '\u0645\u0633\u0627\u0621',
  79. '\u0627\u0644\u0630\u064a',
  80. '\u0648\u0643\u0627\u0646\u062a',
  81. '\u0648\u0644\u0643\u0646',
  82. '\u0648\u0627\u0644\u062a\u064a',
  83. '\u062a\u0643\u0648\u0646',
  84. '\u0627\u0644\u064a\u0648\u0645',
  85. '\u0627\u0644\u0644\u0630\u064a\u0646',
  86. '\u0639\u0644\u064a\u0647',
  87. '\u0643\u0627\u0646\u062a',
  88. '\u0644\u0630\u0644\u0643',
  89. '\u0623\u0645\u0627\u0645',
  90. '\u0647\u0646\u0627\u0643',
  91. '\u0645\u0646\u0647\u0627',
  92. '\u0645\u0627\u0632\u0627\u0644',
  93. '\u0644\u0627\u0632\u0627\u0644',
  94. '\u0644\u0627\u064a\u0632\u0627\u0644',
  95. '\u0645\u0627\u064a\u0632\u0627\u0644',
  96. '\u0627\u0635\u0628\u062d',
  97. '\u0623\u0635\u0628\u062d',
  98. '\u0623\u0645\u0633\u0649',
  99. '\u0627\u0645\u0633\u0649',
  100. '\u0623\u0636\u062d\u0649',
  101. '\u0627\u0636\u062d\u0649',
  102. '\u0645\u0627\u0628\u0631\u062d',
  103. '\u0645\u0627\u0641\u062a\u0626',
  104. '\u0645\u0627\u0627\u0646\u0641\u0643',
  105. '\u0644\u0627\u0633\u064a\u0645\u0627',
  106. '\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
  107. '\u0627\u0644\u062d\u0627\u0644\u064a',
  108. '\u0627\u0644\u064a\u0647\u0627',
  109. '\u0627\u0644\u0630\u064a\u0646',
  110. '\u0641\u0627\u0646\u0647',
  111. '\u0648\u0627\u0644\u0630\u064a',
  112. '\u0648\u0647\u0630\u0627',
  113. '\u0644\u0647\u0630\u0627',
  114. '\u0641\u0643\u0627\u0646',
  115. '\u0633\u062a\u0643\u0648\u0646',
  116. '\u0627\u0644\u064a\u0647',
  117. '\u064a\u0645\u0643\u0646',
  118. '\u0628\u0647\u0630\u0627',
  119. '\u0627\u0644\u0630\u0649']
  120. def stem(self, token):
  121. """
  122. Stemming a word token using the ISRI stemmer.
  123. """
  124. self.stm = token
  125. self.norm(1) # remove diacritics which representing Arabic short vowels
  126. if self.stm in self.stop_words: return self.stm # exclude stop words from being processed
  127. self.pre32() # remove length three and length two prefixes in this order
  128. self.suf32() # remove length three and length two suffixes in this order
  129. self.waw() # remove connective و if it precedes a word beginning with و
  130. self.norm(2) # normalize initial hamza to bare alif
  131. if len(self.stm) <= 3: return self.stm # return stem if less than or equal to three
  132. if len(self.stm) == 4: # length 4 word
  133. self.pro_w4()
  134. return self.stm
  135. elif len(self.stm) == 5: # length 5 word
  136. self.pro_w53()
  137. self.end_w5()
  138. return self.stm
  139. elif len(self.stm) == 6: # length 6 word
  140. self.pro_w6()
  141. self.end_w6()
  142. return self.stm
  143. elif len(self.stm) == 7: # length 7 word
  144. self.suf1()
  145. if len(self.stm) == 7:
  146. self.pre1()
  147. if len(self.stm) == 6:
  148. self.pro_w6()
  149. self.end_w6()
  150. return self.stm
  151. return self.stm # if word length >7 , then no stemming
  152. def norm(self, num):
  153. """
  154. normalization:
  155. num=1 normalize diacritics
  156. num=2 normalize initial hamza
  157. num=3 both 1&2
  158. """
  159. self.k = num
  160. if self.k == 1:
  161. self.stm = self.re_short_vowels.sub('', self.stm)
  162. return self.stm
  163. elif self.k == 2:
  164. self.stm = self.re_intial_hamza.sub(r'\u0627', self.stm)
  165. return self.stm
  166. elif self.k == 3:
  167. self.stm = self.re_short_vowels.sub('', self.stm)
  168. self.stm = self.re_intial_hamza.sub(r'\u0627', self.stm)
  169. return self.stm
  170. def pre32(self):
  171. """remove length three and length two prefixes in this order"""
  172. if len(self.stm) >= 6:
  173. for pre3 in self.p3:
  174. if self.stm.startswith(pre3):
  175. self.stm = self.stm[3:]
  176. return self.stm
  177. elif len(self.stm) >= 5:
  178. for pre2 in self.p2:
  179. if self.stm.startswith(pre2):
  180. self.stm = self.stm[2:]
  181. return self.stm
  182. def suf32(self):
  183. """remove length three and length two suffixes in this order"""
  184. if len(self.stm) >= 6:
  185. for suf3 in self.s3:
  186. if self.stm.endswith(suf3):
  187. self.stm = self.stm[:-3]
  188. return self.stm
  189. elif len(self.stm) >= 5:
  190. for suf2 in self.s2:
  191. if self.stm.endswith(suf2):
  192. self.stm = self.stm[:-2]
  193. return self.stm
  194. def waw(self):
  195. """remove connective ‘و’ if it precedes a word beginning with ‘و’ """
  196. if (len(self.stm) >= 4) & (self.stm[:2] == '\u0648\u0648'):
  197. self.stm = self.stm[1:]
  198. return self.stm
  199. def pro_w4(self):
  200. """process length four patterns and extract length three roots"""
  201. if self.stm[0] in self.pr4[0]: # مفعل
  202. self.stm = self.stm[1:]
  203. return self.stm
  204. elif self.stm[1] in self.pr4[1]: # فاعل
  205. self.stm = self.stm[0] + self.stm[2:]
  206. return self.stm
  207. elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل
  208. self.stm = self.stm[:2] + self.stm[3]
  209. return self.stm
  210. elif self.stm[3] in self.pr4[3]: # فعلة
  211. self.stm = self.stm[:-1]
  212. return self.stm
  213. else:
  214. self.suf1() # do - normalize short sufix
  215. if len(self.stm) == 4:
  216. self.pre1() # do - normalize short prefix
  217. return self.stm
  218. def pro_w53(self):
  219. """process length five patterns and extract length three roots"""
  220. if ((self.stm[2] in self.pr53[0]) & (self.stm[0] == '\u0627')): # افتعل - افاعل
  221. self.stm = self.stm[1] + self.stm[3:]
  222. return self.stm
  223. elif ((self.stm[3] in self.pr53[1]) & (self.stm[0] == '\u0645')): # مفعول - مفعال - مفعيل
  224. self.stm = self.stm[1:3] + self.stm[4]
  225. return self.stm
  226. elif ((self.stm[0] in self.pr53[2]) & (self.stm[4] == '\u0629')): # مفعلة - تفعلة - افعلة
  227. self.stm = self.stm[1:4]
  228. return self.stm
  229. elif ((self.stm[0] in self.pr53[3]) & (self.stm[2] == '\u062a')): # مفتعل - يفتعل - تفتعل
  230. self.stm = self.stm[1] + self.stm[3:]
  231. return self.stm
  232. elif ((self.stm[0] in self.pr53[4]) & (self.stm[2] == '\u0627')): #مفاعل - تفاعل
  233. self.stm = self.stm[1] + self.stm[3:]
  234. return self.stm
  235. elif ((self.stm[2] in self.pr53[5]) & (self.stm[4] == '\u0629')): # فعولة - فعالة
  236. self.stm = self.stm[:2] + self.stm[3]
  237. return self.stm
  238. elif ((self.stm[0] in self.pr53[6]) & (self.stm[1] == '\u0646')): # انفعل - منفعل
  239. self.stm = self.stm[2:]
  240. return self.stm
  241. elif ((self.stm[3] == '\u0627') & (self.stm[0] == '\u0627')): # افعال
  242. self.stm = self.stm[1:3] + self.stm[4]
  243. return self.stm
  244. elif ((self.stm[4] == '\u0646') & (self.stm[3] == '\u0627')): # فعلان
  245. self.stm = self.stm[:3]
  246. return self.stm
  247. elif ((self.stm[3] == '\u064a') & (self.stm[0] == '\u062a')): # تفعيل
  248. self.stm = self.stm[1:3] + self.stm[4]
  249. return self.stm
  250. elif ((self.stm[3] == '\u0648') & (self.stm[1] == '\u0627')): # فاعول
  251. self.stm = self.stm[0] + self.stm[2] + self.stm[4]
  252. return self.stm
  253. elif ((self.stm[2] == '\u0627') & (self.stm[1] == '\u0648')): # فواعل
  254. self.stm = self.stm[0] + self.stm[3:]
  255. return self.stm
  256. elif ((self.stm[3] == '\u0626') & (self.stm[2] == '\u0627')): # فعائل
  257. self.stm = self.stm[:2] + self.stm[4]
  258. return self.stm
  259. elif ((self.stm[4] == '\u0629') & (self.stm[1] == '\u0627')): # فاعلة
  260. self.stm = self.stm[0] + self.stm[2:4]
  261. return self.stm
  262. elif ((self.stm[4] == '\u064a') & (self.stm[2] == '\u0627')): # فعالي
  263. self.stm = self.stm[:2] + self.stm[3]
  264. return self.stm
  265. else:
  266. self.suf1() # do - normalize short sufix
  267. if len(self.stm) == 5:
  268. self.pre1() # do - normalize short prefix
  269. return self.stm
  270. def pro_w54(self):
  271. """process length five patterns and extract length four roots"""
  272. if (self.stm[0] in self.pr53[2]): #تفعلل - افعلل - مفعلل
  273. self.stm = self.stm[1:]
  274. return self.stm
  275. elif (self.stm[4] == '\u0629'): # فعللة
  276. self.stm = self.stm[:4]
  277. return self.stm
  278. elif (self.stm[2] == '\u0627'): # فعالل
  279. self.stm = self.stm[:2] + self.stm[3:]
  280. return self.stm
  281. def end_w5(self):
  282. """ending step (word of length five)"""
  283. if len(self.stm) == 3:
  284. return self.stm
  285. elif len(self.stm) == 4:
  286. self.pro_w4()
  287. return self.stm
  288. elif len(self.stm) == 5:
  289. self.pro_w54()
  290. return self.stm
  291. def pro_w6(self):
  292. """process length six patterns and extract length three roots"""
  293. if ((self.stm.startswith('\u0627\u0633\u062a')) or (self.stm.startswith('\u0645\u0633\u062a'))): # مستفعل - استفعل
  294. self.stm = self.stm[3:]
  295. return self.stm
  296. elif (self.stm[0] == '\u0645' and self.stm[3] == '\u0627' and self.stm[5] == '\u0629'): # مفعالة
  297. self.stm = self.stm[1:3] + self.stm[4]
  298. return self.stm
  299. elif (self.stm[0] == '\u0627' and self.stm[2] == '\u062a' and self.stm[4] == '\u0627'): # افتعال
  300. self.stm = self.stm[1] + self.stm[3] + self.stm[5]
  301. return self.stm
  302. elif (self.stm[0] == '\u0627' and self.stm[3] == '\u0648' and self.stm[2] == self.stm[4]): # افعوعل
  303. self.stm = self.stm[1] + self.stm[4:]
  304. return self.stm
  305. elif (self.stm[0] == '\u062a' and self.stm[2] == '\u0627' and self.stm[4] == '\u064a'): # تفاعيل new pattern
  306. self.stm = self.stm[1] + self.stm[3] + self.stm[5]
  307. return self.stm
  308. else:
  309. self.suf1() # do - normalize short sufix
  310. if len(self.stm) == 6:
  311. self.pre1() # do - normalize short prefix
  312. return self.stm
  313. def pro_w64(self):
  314. """process length six patterns and extract length four roots"""
  315. if (self.stm[0] and self.stm[4]) == '\u0627': # افعلال
  316. self.stm = self.stm[1:4] + self.stm[5]
  317. return self.stm
  318. elif (self.stm.startswith('\u0645\u062a')): # متفعلل
  319. self.stm = self.stm[2:]
  320. return self.stm
  321. def end_w6(self):
  322. """ending step (word of length six)"""
  323. if len(self.stm) == 3:
  324. return self.stm
  325. elif len(self.stm) == 5:
  326. self.pro_w53()
  327. self.end_w5()
  328. return self.stm
  329. elif len (self.stm) == 6:
  330. self.pro_w64()
  331. return self.stm
  332. def suf1(self):
  333. """normalize short sufix"""
  334. for sf1 in self.s1:
  335. if self.stm.endswith(sf1):
  336. self.stm = self.stm[:-1]
  337. return self.stm
  338. def pre1(self):
  339. """normalize short prefix"""
  340. for sp1 in self.p1:
  341. if self.stm.startswith(sp1):
  342. self.stm = self.stm[1:]
  343. return self.stm