PageRenderTime 47ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/joomla/administrator/components/com_finder/helpers/indexer/stemmer/porter_en.php

https://gitlab.com/ricardosanchez/prueba
PHP | 446 lines | 253 code | 38 blank | 155 comment | 36 complexity | 984a211bd76ce9b5ec555f7895698931 MD5 | raw file
  1. <?php
  2. /**
  3. * @package Joomla.Administrator
  4. * @subpackage com_finder
  5. *
  6. * @copyright Copyright (C) 2005 - 2015 Open Source Matters, Inc. All rights reserved.
  7. * @license GNU General Public License version 2 or later; see LICENSE
  8. */
  9. defined('_JEXEC') or die;
  10. JLoader::register('FinderIndexerStemmer', dirname(__DIR__) . '/stemmer.php');
  11. /**
  12. * Porter English stemmer class for the Finder indexer package.
  13. *
  14. * This class was adapted from one written by Richard Heyes.
  15. * See copyright and link information above.
  16. *
  17. * @since 2.5
  18. */
  19. class FinderIndexerStemmerPorter_En extends FinderIndexerStemmer
  20. {
  21. /**
  22. * Regex fragment matching one consonant: a lowercase letter other than a, e, i, o, u and y, or a "y" that directly follows one of a, e, i, o, u or starts the string.
  23. *
  24. * @var string
  25. * @since 2.5
  26. */
  27. private static $_regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
  28. /**
  29. * Regex fragment matching one vowel: a, e, i, o, u, or a "y" that is not directly preceded by one of those five letters.
  30. *
  31. * @var string
  32. * @since 2.5
  33. */
  34. private static $_regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
  /**
   * Reduces a token to its Porter stem, remembering the answer in $this->cache.
   *
   * Tokens of two bytes or fewer, and tokens whose language is neither 'en'
   * nor '*', are handed back unchanged and are not cached.
   *
   * @param   string  $token  The token to stem.
   * @param   string  $lang   The language of the token.
   *
   * @return  string  The root token.
   *
   * @since   2.5
   */
  public function stem($token, $lang)
  {
      $tooShort      = strlen($token) <= 2;
      $wrongLanguage = $lang !== 'en' && $lang != '*';

      if ($tooShort || $wrongLanguage)
      {
          return $token;
      }

      // A previously stemmed token is served straight from the cache.
      if (isset($this->cache[$lang][$token]))
      {
          return $this->cache[$lang][$token];
      }

      // Run the token through each stage of the algorithm, in order.
      $stem = $token;

      foreach (array('_step1ab', '_step1c', '_step2', '_step3', '_step4', '_step5') as $stage)
      {
          $stem = self::$stage($stem);
      }

      $this->cache[$lang][$token] = $stem;

      return $this->cache[$lang][$token];
  }
  /**
   * Steps 1a and 1b: removes plural endings, then the "eed", "ing" and "ed"
   * endings, and repairs the stem left behind by "ing" / "ed".
   *
   * @param   string  $word  The token to stem.
   *
   * @return  string
   *
   * @since   2.5
   */
  private static function _step1ab($word)
  {
      // Part a: the first plural ending found wins. "ss" maps to itself so
      // that the bare "s" rule is never applied to a word ending in "ss".
      if (substr($word, -1) == 's')
      {
          $plurals = array('sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '');

          foreach ($plurals as $ending => $replacement)
          {
              if (self::_replace($word, $ending, $replacement))
              {
                  break;
              }
          }
      }

      // Part b: a word ending in "eed" is dealt with by the eed -> ee rule
      // alone (applied only when the stem in front of it has m > 0).
      if (substr($word, -2, 1) == 'e' && self::_replace($word, 'eed', 'ee', 0))
      {
          return $word;
      }

      $vowelPattern = '#' . self::$_regex_vowel . '+#';

      // "ing" and "ed" are only removed when the text before them holds a vowel.
      $stripped = preg_match($vowelPattern, substr($word, 0, -3)) && self::_replace($word, 'ing', '');

      if (!$stripped)
      {
          $stripped = preg_match($vowelPattern, substr($word, 0, -2)) && self::_replace($word, 'ed', '');
      }

      if (!$stripped)
      {
          return $word;
      }

      // Restore a final "e" on stems now ending in "at", "bl" or "iz".
      if (self::_replace($word, 'at', 'ate') || self::_replace($word, 'bl', 'ble') || self::_replace($word, 'iz', 'ize'))
      {
          return $word;
      }

      // A doubled final consonant other than ll, ss or zz loses one letter.
      $lastTwo = substr($word, -2);

      if (self::_doubleConsonant($word) && $lastTwo != 'll' && $lastTwo != 'ss' && $lastTwo != 'zz')
      {
          return substr($word, 0, -1);
      }

      // A short stem (m == 1) ending consonant-vowel-consonant gains an "e".
      if (self::_m($word) == 1 && self::_cvc($word))
      {
          $word .= 'e';
      }

      return $word;
  }
  /**
   * Step 1c: turns a final "y" into "i" when the rest of the word contains a vowel.
   *
   * @param   string  $word  The token to stem.
   *
   * @return  string
   *
   * @since   2.5
   */
  private static function _step1c($word)
  {
      if (substr($word, -1) != 'y')
      {
          return $word;
      }

      $body = substr($word, 0, -1);

      if (!preg_match('#' . self::$_regex_vowel . '+#', $body))
      {
          return $word;
      }

      // The word is known to end in "y", so swapping it for "i" is a plain concatenation.
      return $body . 'i';
  }
  /**
   * Step 2: rewrites compound suffixes to simpler ones.
   *
   * Rules are grouped by the penultimate letter of the word. Within a group
   * the first ending that is present stops the search; it is rewritten only
   * when the stem in front of it has a measure greater than 0.
   *
   * @param   string  $word  The token to stem.
   *
   * @return  string
   *
   * @since   2.5
   */
  private static function _step2($word)
  {
      $rules = array(
          'a' => array('ational' => 'ate', 'tional' => 'tion'),
          'c' => array('enci' => 'ence', 'anci' => 'ance'),
          'e' => array('izer' => 'ize'),
          'g' => array('logi' => 'log'),
          'l' => array('entli' => 'ent', 'ousli' => 'ous', 'alli' => 'al', 'bli' => 'ble', 'eli' => 'e'),
          'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
          's' => array('iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'alism' => 'al'),
          't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
      );

      $penultimate = (string) substr($word, -2, 1);

      if (isset($rules[$penultimate]))
      {
          foreach ($rules[$penultimate] as $ending => $replacement)
          {
              if (self::_replace($word, $ending, $replacement, 0))
              {
                  break;
              }
          }
      }

      return $word;
  }
  /**
   * Step 3: shortens or removes a further set of suffixes.
   *
   * Rules are grouped by the penultimate letter of the word. Within a group
   * the first ending that is present stops the search; it is rewritten only
   * when the stem in front of it has a measure greater than 0.
   *
   * @param   string  $word  The token to stem.
   *
   * @return  string
   *
   * @since   2.5
   */
  private static function _step3($word)
  {
      $rules = array(
          'a' => array('ical' => 'ic'),
          's' => array('ness' => ''),
          't' => array('icate' => 'ic', 'iciti' => 'ic'),
          'u' => array('ful' => ''),
          'v' => array('ative' => ''),
          'z' => array('alize' => 'al'),
      );

      $penultimate = (string) substr($word, -2, 1);

      if (isset($rules[$penultimate]))
      {
          foreach ($rules[$penultimate] as $ending => $replacement)
          {
              if (self::_replace($word, $ending, $replacement, 0))
              {
                  break;
              }
          }
      }

      return $word;
  }
  /**
   * Step 4: deletes a final suffix when the stem in front of it has a
   * measure greater than 1.
   *
   * Candidate endings are chosen by the penultimate letter of the word, and
   * the first one present stops the search even if it is not removed.
   *
   * @param   string  $word  The token to stem.
   *
   * @return  string
   *
   * @since   2.5
   */
  private static function _step4($word)
  {
      $penultimate = (string) substr($word, -2, 1);

      // "o" is special: "ion" is tried only after "t" or "s", otherwise "ou" is tried.
      if ($penultimate === 'o')
      {
          $lastFour = substr($word, -4);
          $ending   = ($lastFour === 'tion' || $lastFour === 'sion') ? 'ion' : 'ou';

          self::_replace($word, $ending, '', 1);

          return $word;
      }

      $endings = array(
          'a' => array('al'),
          'c' => array('ance', 'ence'),
          'e' => array('er'),
          'i' => array('ic'),
          'l' => array('able', 'ible'),
          'n' => array('ant', 'ement', 'ment', 'ent'),
          's' => array('ism'),
          't' => array('ate', 'iti'),
          'u' => array('ous'),
          'v' => array('ive'),
          'z' => array('ize'),
      );

      if (isset($endings[$penultimate]))
      {
          foreach ($endings[$penultimate] as $ending)
          {
              if (self::_replace($word, $ending, '', 1))
              {
                  break;
              }
          }
      }

      return $word;
  }
  /**
   * Step 5: tidies the end of the stem by dropping a final "e" and
   * collapsing a final "ll".
   *
   * @param   string  $word  The token to stem.
   *
   * @return  string
   *
   * @since   2.5
   */
  private static function _step5($word)
  {
      // Part a: drop a final "e" when the rest has m > 1, or m == 1 without
      // ending in a consonant-vowel-consonant sequence.
      if (substr($word, -1) == 'e')
      {
          $body    = substr($word, 0, -1);
          $measure = self::_m($body);

          if ($measure > 1 || ($measure == 1 && !self::_cvc($body)))
          {
              $word = $body;
          }
      }

      // Part b: with m > 1, a doubled final "l" is reduced to a single "l".
      if (substr($word, -1) == 'l' && self::_m($word) > 1 && self::_doubleConsonant($word))
      {
          $word = substr($word, 0, -1);
      }

      return $word;
  }
  /**
   * Swaps one ending of a string for another, in place.
   *
   * When $m is given, the swap only happens if the measure of the text in
   * front of the ending is strictly greater than $m.
   *
   * @param   string   &$str   String to check; modified when the swap happens
   * @param   string   $check  Ending to check for
   * @param   string   $repl   Replacement string
   * @param   integer  $m      Optional measure that the remaining stem must exceed
   *
   * @return  boolean  True when $str ended in $check, whether or not the
   *                   replacement was actually made; false otherwise.
   *
   * @since   2.5
   */
  private static function _replace(&$str, $check, $repl, $m = null)
  {
      $endingLength = strlen($check);

      if (substr($str, -$endingLength) != $check)
      {
          return false;
      }

      $stem = substr($str, 0, -$endingLength);

      if ($m === null || self::_m($stem) > $m)
      {
          $str = $stem . $repl;
      }

      return true;
  }
  /**
   * Computes the measure m of a string: the number of vowel-run followed by
   * consonant-run pairs it contains. With c a consonant run, v a vowel run
   * and <..> marking an optional part:
   *
   * <c><v>       gives 0
   * <c>vc<v>     gives 1
   * <c>vcvc<v>   gives 2
   * <c>vcvcvc<v> gives 3
   *
   * @param   string  $str  The string to return the m count for
   *
   * @return  integer  The m count
   *
   * @since   2.5
   */
  private static function _m($str)
  {
      $consonant = self::$_regex_consonant;
      $vowel     = self::$_regex_vowel;

      // Strip the optional leading consonant run, then the optional trailing vowel run.
      $core = preg_replace('#' . $vowel . '+$#', '', preg_replace('#^' . $consonant . '+#', '', $str));

      // Every vowel-run + consonant-run pair that remains counts once.
      preg_match_all('#(' . $vowel . '+' . $consonant . '+)#', $core, $pairs);

      return count($pairs[1]);
  }
  /**
   * Tells whether the string ends in two identical consonants.
   *
   * @param   string  $str  String to check
   *
   * @return  boolean  True when the last two characters are consonants and equal
   *
   * @since   2.5
   */
  private static function _doubleConsonant($str)
  {
      if (!preg_match('#' . self::$_regex_consonant . '{2}$#', $str, $matches))
      {
          return false;
      }

      // Square-bracket offsets: the curly-brace form ($s{0}) is a parse error on PHP 8.
      return $matches[0][0] === $matches[0][1];
  }
  /**
   * Tells whether the string ends in a consonant-vowel-consonant sequence
   * whose final consonant is not w, x or y.
   *
   * @param   string  $str  String to check
   *
   * @return  boolean  Result
   *
   * @since   2.5
   */
  private static function _cvc($str)
  {
      $consonant = self::$_regex_consonant;
      $vowel     = self::$_regex_vowel;

      if (!preg_match('#(' . $consonant . $vowel . $consonant . ')$#', $str, $matches))
      {
          return false;
      }

      $sequence = $matches[1];

      // Square-bracket offset: the curly-brace form ($s{2}) is a parse error on PHP 8.
      return strlen($sequence) == 3 && !in_array($sequence[2], array('w', 'x', 'y'), true);
  }
  408. }