PageRenderTime 52ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/libraries/joomla/language/stemmer/porteren.php

https://gitlab.com/lankerd/paGO---Testing-Site
PHP | 449 lines | 256 code | 38 blank | 155 comment | 37 complexity | 4cdaafc13c64a1580195b210021a0323 MD5 | raw file
  1. <?php
  2. /**
  3. * @package Joomla.Platform
  4. * @subpackage Language
  5. *
  6. * @copyright Copyright (C) 2005 - 2016 Open Source Matters, Inc. All rights reserved.
  7. * @copyright Copyright (C) 2005 Richard Heyes (http://www.phpguru.org/). All rights reserved.
  8. * @license GNU General Public License version 2 or later; see LICENSE
  9. */
  10. defined('JPATH_PLATFORM') or die;
  11. /**
  12. * Porter English stemmer class.
  13. *
  14. * This class was adapted from one written by Richard Heyes.
  15. * See copyright and link information above.
  16. *
  17. * @since 12.1
  18. */
  class JLanguageStemmerPorteren extends JLanguageStemmer
  {
      /**
       * Regex fragment matching exactly one consonant.
       *
       * A "y" counts as a consonant when it follows a vowel or starts the string.
       *
       * @var    string
       * @since  12.1
       */
      private static $_regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

      /**
       * Regex fragment matching exactly one vowel.
       *
       * A "y" counts as a vowel when it is not preceded by a vowel.
       *
       * @var    string
       * @since  12.1
       */
      private static $_regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

      /**
       * Method to stem a token and return the root.
       *
       * Tokens of one or two bytes, and tokens whose language is not exactly 'en',
       * are returned unchanged. Results are memoised in $this->cache, keyed by
       * language and token (the cache property is not declared in this class; it is
       * presumably provided by the parent JLanguageStemmer).
       *
       * @param   string  $token  The token to stem.
       * @param   string  $lang   The language of the token.
       *
       * @return  string  The root token.
       *
       * @since   12.1
       */
      public function stem($token, $lang)
      {
          // Check if the token is long enough to merit stemming.
          if (strlen($token) <= 2)
          {
              return $token;
          }

          // Only English tokens are stemmed; every other language code passes through untouched.
          if ($lang !== 'en')
          {
              return $token;
          }

          // Stem the token if it is not in the cache.
          if (!isset($this->cache[$lang][$token]))
          {
              // The steps must run in this order: each one consumes the previous step's output.
              $result = self::_step1ab($token);
              $result = self::_step1c($result);
              $result = self::_step2($result);
              $result = self::_step3($result);
              $result = self::_step4($result);
              $result = self::_step5($result);

              // Add the token to the cache.
              $this->cache[$lang][$token] = $result;
          }

          return $this->cache[$lang][$token];
      }

      /**
       * Step 1a and 1b: strips plural endings, then "eed"/"ed"/"ing" endings,
       * and tidies up the stem left behind by an "ed"/"ing" removal.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string  The token after step 1a/1b.
       *
       * @since   12.1
       */
      private static function _step1ab($word)
      {
          // Part a: the first ending that matches wins, so longer endings are tried first.
          if (substr($word, -1) === 's')
          {
              self::_replace($word, 'sses', 'ss')
                  || self::_replace($word, 'ies', 'i')
                  || self::_replace($word, 'ss', 'ss')
                  || self::_replace($word, 's', '');
          }

          // Part b: a word ending in "eed" is handled by that rule alone (whether or not it was rewritten).
          if (substr($word, -2, 1) !== 'e' || !self::_replace($word, 'eed', 'ee', 0))
          {
              $hasVowel = '#' . self::$_regex_vowel . '+#';

              // "ing"/"ed" are only removed when the remaining stem contains a vowel.
              if ((preg_match($hasVowel, substr($word, 0, -3)) && self::_replace($word, 'ing', ''))
                  || (preg_match($hasVowel, substr($word, 0, -2)) && self::_replace($word, 'ed', '')))
              {
                  // Restore a trailing "e" for stems ending in at/bl/iz.
                  $restored = self::_replace($word, 'at', 'ate')
                      || self::_replace($word, 'bl', 'ble')
                      || self::_replace($word, 'iz', 'ize');

                  if (!$restored)
                  {
                      $tail = substr($word, -2);

                      if (self::_doubleConsonant($word) && $tail !== 'll' && $tail !== 'ss' && $tail !== 'zz')
                      {
                          // Collapse a doubled final consonant (except ll, ss, zz) to a single letter.
                          $word = substr($word, 0, -1);
                      }
                      elseif (self::_m($word) === 1 && self::_cvc($word))
                      {
                          // Short stem ending consonant-vowel-consonant: append an "e".
                          $word .= 'e';
                      }
                  }
              }
          }

          return $word;
      }

      /**
       * Step 1c: turns a final "y" into "i" when the rest of the word contains a vowel.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string  The token after step 1c.
       *
       * @since   12.1
       */
      private static function _step1c($word)
      {
          $hasVowel = '#' . self::$_regex_vowel . '+#';

          if (substr($word, -1) === 'y' && preg_match($hasVowel, substr($word, 0, -1)))
          {
              self::_replace($word, 'y', 'i');
          }

          return $word;
      }

      /**
       * Step 2: maps double suffixes to single ones (e.g. "ational" to "ate").
       *
       * Rules are grouped by the penultimate letter of the word. Within a group the
       * first ending that matches wins, and a rewrite only happens when the measure
       * of the remaining stem is greater than 0.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string  The token after step 2.
       *
       * @since   12.1
       */
      private static function _step2($word)
      {
          switch (substr($word, -2, 1))
          {
              case 'a':
                  self::_replace($word, 'ational', 'ate', 0)
                      || self::_replace($word, 'tional', 'tion', 0);
                  break;

              case 'c':
                  self::_replace($word, 'enci', 'ence', 0)
                      || self::_replace($word, 'anci', 'ance', 0);
                  break;

              case 'e':
                  self::_replace($word, 'izer', 'ize', 0);
                  break;

              case 'g':
                  self::_replace($word, 'logi', 'log', 0);
                  break;

              case 'l':
                  self::_replace($word, 'entli', 'ent', 0)
                      || self::_replace($word, 'ousli', 'ous', 0)
                      || self::_replace($word, 'alli', 'al', 0)
                      || self::_replace($word, 'bli', 'ble', 0)
                      || self::_replace($word, 'eli', 'e', 0);
                  break;

              case 'o':
                  self::_replace($word, 'ization', 'ize', 0)
                      || self::_replace($word, 'ation', 'ate', 0)
                      || self::_replace($word, 'ator', 'ate', 0);
                  break;

              case 's':
                  self::_replace($word, 'iveness', 'ive', 0)
                      || self::_replace($word, 'fulness', 'ful', 0)
                      || self::_replace($word, 'ousness', 'ous', 0)
                      || self::_replace($word, 'alism', 'al', 0);
                  break;

              case 't':
                  self::_replace($word, 'biliti', 'ble', 0)
                      || self::_replace($word, 'aliti', 'al', 0)
                      || self::_replace($word, 'iviti', 'ive', 0);
                  break;
          }

          return $word;
      }

      /**
       * Step 3: shortens or removes endings such as "ical", "ness", "ful" and "alize".
       *
       * Rules are grouped by the penultimate letter of the word; a rewrite only
       * happens when the measure of the remaining stem is greater than 0.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string  The token after step 3.
       *
       * @since   12.1
       */
      private static function _step3($word)
      {
          switch (substr($word, -2, 1))
          {
              case 'a':
                  self::_replace($word, 'ical', 'ic', 0);
                  break;

              case 's':
                  self::_replace($word, 'ness', '', 0);
                  break;

              case 't':
                  self::_replace($word, 'icate', 'ic', 0)
                      || self::_replace($word, 'iciti', 'ic', 0);
                  break;

              case 'u':
                  self::_replace($word, 'ful', '', 0);
                  break;

              case 'v':
                  self::_replace($word, 'ative', '', 0);
                  break;

              case 'z':
                  self::_replace($word, 'alize', 'al', 0);
                  break;
          }

          return $word;
      }

      /**
       * Step 4: removes residual suffixes ("al", "ance", "ment", "ion", ...).
       *
       * Rules are grouped by the penultimate letter of the word; a suffix is only
       * removed when the measure of the remaining stem is greater than 1.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string  The token after step 4.
       *
       * @since   12.1
       */
      private static function _step4($word)
      {
          switch (substr($word, -2, 1))
          {
              case 'a':
                  self::_replace($word, 'al', '', 1);
                  break;

              case 'c':
                  self::_replace($word, 'ance', '', 1)
                      || self::_replace($word, 'ence', '', 1);
                  break;

              case 'e':
                  self::_replace($word, 'er', '', 1);
                  break;

              case 'i':
                  self::_replace($word, 'ic', '', 1);
                  break;

              case 'l':
                  self::_replace($word, 'able', '', 1)
                      || self::_replace($word, 'ible', '', 1);
                  break;

              case 'n':
                  self::_replace($word, 'ant', '', 1)
                      || self::_replace($word, 'ement', '', 1)
                      || self::_replace($word, 'ment', '', 1)
                      || self::_replace($word, 'ent', '', 1);
                  break;

              case 'o':
                  $ending = substr($word, -4);

                  // "ion" is only removed when it follows an "s" or a "t".
                  if ($ending === 'tion' || $ending === 'sion')
                  {
                      self::_replace($word, 'ion', '', 1);
                  }
                  else
                  {
                      self::_replace($word, 'ou', '', 1);
                  }
                  break;

              case 's':
                  self::_replace($word, 'ism', '', 1);
                  break;

              case 't':
                  self::_replace($word, 'ate', '', 1)
                      || self::_replace($word, 'iti', '', 1);
                  break;

              case 'u':
                  self::_replace($word, 'ous', '', 1);
                  break;

              case 'v':
                  self::_replace($word, 'ive', '', 1);
                  break;

              case 'z':
                  self::_replace($word, 'ize', '', 1);
                  break;
          }

          return $word;
      }

      /**
       * Step 5: drops a final "e" and un-doubles a final "ll" on long stems.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string  The token after step 5.
       *
       * @since   12.1
       */
      private static function _step5($word)
      {
          // Part a: remove a trailing "e" when the stem is long enough, or when it has
          // measure 1 and does not end in a consonant-vowel-consonant sequence.
          if (substr($word, -1) === 'e')
          {
              $body    = substr($word, 0, -1);
              $measure = self::_m($body);

              if ($measure > 1 || ($measure === 1 && !self::_cvc($body)))
              {
                  self::_replace($word, 'e', '');
              }
          }

          // Part b: a long word ending in a doubled "l" loses one of them.
          if (self::_m($word) > 1 && self::_doubleConsonant($word) && substr($word, -1) === 'l')
          {
              $word = substr($word, 0, -1);
          }

          return $word;
      }

      /**
       * Replaces the first string with the second, at the end of the string. If the
       * fourth argument is given, the replacement only happens when the measure of
       * the part preceding the ending is strictly greater than that value.
       *
       * @param   string   &$str   String to check; modified in place when a replacement is made.
       * @param   string   $check  Ending to check for.
       * @param   string   $repl   Replacement string.
       * @param   integer  $m      Optional value the measure of the preceding part must exceed.
       *
       * @return  boolean  Whether the $check string was at the end
       *                   of the $str string. True does not necessarily mean
       *                   that it was replaced.
       *
       * @since   12.1
       */
      private static function _replace(&$str, $check, $repl, $m = null)
      {
          $offset = -strlen($check);

          if (substr($str, $offset) != $check)
          {
              return false;
          }

          $stem = substr($str, 0, $offset);

          if ($m === null || self::_m($stem) > $m)
          {
              $str = $stem . $repl;
          }

          // The ending was present, so callers chaining rules must stop here even if nothing changed.
          return true;
      }

      /**
       * m() measures the number of consonant sequences in $str. If c is
       * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
       * presence,
       *
       * <c><v>       gives 0
       * <c>vc<v>     gives 1
       * <c>vcvc<v>   gives 2
       * <c>vcvcvc<v> gives 3
       *
       * @param   string  $str  The string to return the m count for.
       *
       * @return  integer  The m count.
       *
       * @since   12.1
       */
      private static function _m($str)
      {
          $c = self::$_regex_consonant;
          $v = self::$_regex_vowel;

          // Drop the optional leading consonant run and trailing vowel run, then count the vowel-consonant pairs left.
          $str = preg_replace('#^' . $c . '+#', '', $str);
          $str = preg_replace('#' . $v . '+$#', '', $str);

          preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

          return count($matches[1]);
      }

      /**
       * Returns true/false as to whether the given string contains two
       * of the same consonant next to each other at the end of the string.
       *
       * @param   string  $str  String to check.
       *
       * @return  boolean  Result.
       *
       * @since   12.1
       */
      private static function _doubleConsonant($str)
      {
          $pattern = '#' . self::$_regex_consonant . '{2}$#';

          // Square-bracket offsets are used because the curly-brace form is rejected by PHP 8.
          return preg_match($pattern, $str, $matches) === 1
              && $matches[0][0] === $matches[0][1];
      }

      /**
       * Checks for an ending consonant-vowel-consonant sequence where the
       * second consonant is not w, x or y.
       *
       * @param   string  $str  String to check.
       *
       * @return  boolean  Result.
       *
       * @since   12.1
       */
      private static function _cvc($str)
      {
          $pattern = '#(' . self::$_regex_consonant . self::$_regex_vowel . self::$_regex_consonant . ')$#';

          if (preg_match($pattern, $str, $matches) !== 1)
          {
              return false;
          }

          $sequence = $matches[1];

          // The whole conjunction is returned directly: assigning it with "=" and joining the
          // terms with "and" would store only the first operand, because "=" binds tighter than "and".
          return strlen($sequence) === 3
              && !in_array($sequence[2], array('w', 'x', 'y'), true);
      }
  }