/libraries/joomla/language/stemmer/porteren.php

https://github.com/elinw/joomla-cms · PHP · 451 lines · 256 code · 38 blank · 157 comment · 37 complexity · 3807c5528bc09599501ee315530d16a7 MD5 · raw file

  1. <?php
  2. /**
  3. * @package Joomla.Platform
  4. * @subpackage Language
  5. *
  6. * @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
  7. * @copyright Copyright (C) 2005 Richard Heyes (http://www.phpguru.org/). All rights reserved.
  8. * @license GNU General Public License version 2 or later; see LICENSE
  9. */
  10. defined('JPATH_PLATFORM') or die;
  11. /**
  12. * Porter English stemmer class.
  13. *
  14. * This class was adapted from one written by Richard Heyes.
  15. * See copyright and link information above.
  16. *
  17. * @package Joomla.Platform
  18. * @subpackage Language
  19. * @since 12.1
  20. */
  class JLanguageStemmerPorteren extends JLanguageStemmer
  {
      /**
       * Regex for matching a single consonant.
       *
       * A "y" is treated as a consonant when it follows a vowel or sits at the
       * very start of the string.
       *
       * @var    string
       * @since  12.1
       */
      private static $_regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

      /**
       * Regex for matching a single vowel.
       *
       * A "y" is treated as a vowel when it is not preceded by a vowel. Note that
       * a "y" at the very start of a string satisfies both this pattern and the
       * consonant pattern.
       *
       * @var    string
       * @since  12.1
       */
      private static $_regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

      /**
       * Method to stem a token and return the root.
       *
       * Tokens of one or two bytes, and tokens whose language is anything other
       * than 'en', are returned unchanged. Results are memoised per language and
       * token in $this->cache (not declared in this class; presumably provided by
       * the parent class).
       *
       * @param   string  $token  The token to stem.
       * @param   string  $lang   The language of the token.
       *
       * @return  string  The root token.
       *
       * @since   12.1
       */
      public function stem($token, $lang)
      {
          // Too short to merit stemming, or not English: hand the token back untouched.
          if (strlen($token) <= 2 || $lang !== 'en')
          {
              return $token;
          }

          // Run the six Porter steps only the first time a token is seen.
          if (!isset($this->cache[$lang][$token]))
          {
              $result = self::_step1ab($token);
              $result = self::_step1c($result);
              $result = self::_step2($result);
              $result = self::_step3($result);
              $result = self::_step4($result);
              $result = self::_step5($result);

              $this->cache[$lang][$token] = $result;
          }

          return $this->cache[$lang][$token];
      }

      /**
       * Steps 1a and 1b: plural endings and the "eed" / "ed" / "ing" endings.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string
       *
       * @since   12.1
       */
      private static function _step1ab($word)
      {
          // Part a: the first ending that matches wins, even when it maps to itself ("ss").
          if (substr($word, -1) === 's')
          {
              self::_replace($word, 'sses', 'ss')
                  || self::_replace($word, 'ies', 'i')
                  || self::_replace($word, 'ss', 'ss')
                  || self::_replace($word, 's', '');
          }

          // Part b: a word ending in "eed" is handled by that rule alone (whether or not it
          // was actually rewritten); only other words are tested for "ing" and "ed".
          if (substr($word, -2, 1) !== 'e' || !self::_replace($word, 'eed', 'ee', 0))
          {
              $v = self::$_regex_vowel;

              // The part before the ending must contain a vowel for the ending to be removed.
              if (preg_match('#' . $v . '+#', substr($word, 0, -3)) && self::_replace($word, 'ing', '')
                  || preg_match('#' . $v . '+#', substr($word, 0, -2)) && self::_replace($word, 'ed', ''))
              {
                  // Tidy up what is left once "ing" or "ed" has been removed.
                  if (!self::_replace($word, 'at', 'ate') && !self::_replace($word, 'bl', 'ble') && !self::_replace($word, 'iz', 'ize'))
                  {
                      $tail = substr($word, -2);

                      if (self::_doubleConsonant($word) && $tail !== 'll' && $tail !== 'ss' && $tail !== 'zz')
                      {
                          // Collapse a doubled final consonant to a single letter.
                          $word = substr($word, 0, -1);
                      }
                      elseif (self::_m($word) === 1 && self::_cvc($word))
                      {
                          // Short stem ending consonant-vowel-consonant: restore the final "e".
                          $word .= 'e';
                      }
                  }
              }
          }

          return $word;
      }

      /**
       * Step 1c: turn a final "y" into "i" when the rest of the word contains a vowel.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string
       *
       * @since   12.1
       */
      private static function _step1c($word)
      {
          $v = self::$_regex_vowel;

          if (substr($word, -1) === 'y' && preg_match('#' . $v . '+#', substr($word, 0, -1)))
          {
              self::_replace($word, 'y', 'i');
          }

          return $word;
      }

      /**
       * Step 2: map double suffixes to single ones.
       *
       * The rules are grouped by the penultimate letter of the word. Within a
       * group the first suffix that matches ends the search; the rewrite itself
       * only happens when the remaining stem has a measure greater than 0.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string
       *
       * @since   12.1
       */
      private static function _step2($word)
      {
          switch (substr($word, -2, 1))
          {
              case 'a':
                  self::_replace($word, 'ational', 'ate', 0)
                      || self::_replace($word, 'tional', 'tion', 0);
                  break;

              case 'c':
                  self::_replace($word, 'enci', 'ence', 0)
                      || self::_replace($word, 'anci', 'ance', 0);
                  break;

              case 'e':
                  self::_replace($word, 'izer', 'ize', 0);
                  break;

              case 'g':
                  self::_replace($word, 'logi', 'log', 0);
                  break;

              case 'l':
                  self::_replace($word, 'entli', 'ent', 0)
                      || self::_replace($word, 'ousli', 'ous', 0)
                      || self::_replace($word, 'alli', 'al', 0)
                      || self::_replace($word, 'bli', 'ble', 0)
                      || self::_replace($word, 'eli', 'e', 0);
                  break;

              case 'o':
                  self::_replace($word, 'ization', 'ize', 0)
                      || self::_replace($word, 'ation', 'ate', 0)
                      || self::_replace($word, 'ator', 'ate', 0);
                  break;

              case 's':
                  self::_replace($word, 'iveness', 'ive', 0)
                      || self::_replace($word, 'fulness', 'ful', 0)
                      || self::_replace($word, 'ousness', 'ous', 0)
                      || self::_replace($word, 'alism', 'al', 0);
                  break;

              case 't':
                  self::_replace($word, 'biliti', 'ble', 0)
                      || self::_replace($word, 'aliti', 'al', 0)
                      || self::_replace($word, 'iviti', 'ive', 0);
                  break;
          }

          return $word;
      }

      /**
       * Step 3: shorten or remove a further set of suffixes.
       *
       * Rules are grouped by the penultimate letter of the word and only rewrite
       * the word when the remaining stem has a measure greater than 0.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string
       *
       * @since   12.1
       */
      private static function _step3($word)
      {
          switch (substr($word, -2, 1))
          {
              case 'a':
                  self::_replace($word, 'ical', 'ic', 0);
                  break;

              case 's':
                  self::_replace($word, 'ness', '', 0);
                  break;

              case 't':
                  self::_replace($word, 'icate', 'ic', 0)
                      || self::_replace($word, 'iciti', 'ic', 0);
                  break;

              case 'u':
                  self::_replace($word, 'ful', '', 0);
                  break;

              case 'v':
                  self::_replace($word, 'ative', '', 0);
                  break;

              case 'z':
                  self::_replace($word, 'alize', 'al', 0);
                  break;
          }

          return $word;
      }

      /**
       * Step 4: strip residual suffixes.
       *
       * Rules are grouped by the penultimate letter of the word and only rewrite
       * the word when the remaining stem has a measure greater than 1.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string
       *
       * @since   12.1
       */
      private static function _step4($word)
      {
          switch (substr($word, -2, 1))
          {
              case 'a':
                  self::_replace($word, 'al', '', 1);
                  break;

              case 'c':
                  self::_replace($word, 'ance', '', 1)
                      || self::_replace($word, 'ence', '', 1);
                  break;

              case 'e':
                  self::_replace($word, 'er', '', 1);
                  break;

              case 'i':
                  self::_replace($word, 'ic', '', 1);
                  break;

              case 'l':
                  self::_replace($word, 'able', '', 1)
                      || self::_replace($word, 'ible', '', 1);
                  break;

              case 'n':
                  self::_replace($word, 'ant', '', 1)
                      || self::_replace($word, 'ement', '', 1)
                      || self::_replace($word, 'ment', '', 1)
                      || self::_replace($word, 'ent', '', 1);
                  break;

              case 'o':
                  $ending = substr($word, -4);

                  // "ion" is only removed when it follows an "s" or a "t".
                  if ($ending === 'tion' || $ending === 'sion')
                  {
                      self::_replace($word, 'ion', '', 1);
                  }
                  else
                  {
                      self::_replace($word, 'ou', '', 1);
                  }
                  break;

              case 's':
                  self::_replace($word, 'ism', '', 1);
                  break;

              case 't':
                  self::_replace($word, 'ate', '', 1)
                      || self::_replace($word, 'iti', '', 1);
                  break;

              case 'u':
                  self::_replace($word, 'ous', '', 1);
                  break;

              case 'v':
                  self::_replace($word, 'ive', '', 1);
                  break;

              case 'z':
                  self::_replace($word, 'ize', '', 1);
                  break;
          }

          return $word;
      }

      /**
       * Step 5: remove a final "e" and undouble a final "ll" where the measure allows it.
       *
       * @param   string  $word  The token to stem.
       *
       * @return  string
       *
       * @since   12.1
       */
      private static function _step5($word)
      {
          // Part a: drop a trailing "e" when the stem has measure > 1, or measure 1
          // and the stem does not end consonant-vowel-consonant.
          if (substr($word, -1) === 'e')
          {
              $stem    = substr($word, 0, -1);
              $measure = self::_m($stem);

              if ($measure > 1 || ($measure === 1 && !self::_cvc($stem)))
              {
                  self::_replace($word, 'e', '');
              }
          }

          // Part b: a long word ending in "ll" loses one "l".
          if (self::_m($word) > 1 && self::_doubleConsonant($word) && substr($word, -1) === 'l')
          {
              $word = substr($word, 0, -1);
          }

          return $word;
      }

      /**
       * Replaces the first string with the second, at the end of the string. If the
       * fourth argument is given, the part of the string before the ending must have
       * a measure (see _m()) strictly greater than it for the replacement to happen.
       *
       * @param   string   &$str   String to check; modified in place when the replacement applies.
       * @param   string   $check  Ending to check for
       * @param   string   $repl   Replacement string
       * @param   integer  $m      Optional measure that the remaining stem must exceed
       *
       * @return  boolean  Whether the $check string was at the end
       *                   of the $str string. True does not necessarily mean
       *                   that it was replaced.
       *
       * @since   12.1
       */
      private static function _replace(&$str, $check, $repl, $m = null)
      {
          $len = 0 - strlen($check);

          if (substr($str, $len) !== $check)
          {
              return false;
          }

          $substr = substr($str, 0, $len);

          if ($m === null || self::_m($substr) > $m)
          {
              $str = $substr . $repl;
          }

          // The ending matched, so callers stop trying alternative endings.
          return true;
      }

      /**
       * m() measures the number of consonant sequences in $str. If c is
       * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
       * presence,
       *
       * <c><v>       gives 0
       * <c>vc<v>     gives 1
       * <c>vcvc<v>   gives 2
       * <c>vcvcvc<v> gives 3
       *
       * @param   string  $str  The string to return the m count for
       *
       * @return  integer  The m count
       *
       * @since   12.1
       */
      private static function _m($str)
      {
          $c = self::$_regex_consonant;
          $v = self::$_regex_vowel;

          // Discard the optional leading consonant run and trailing vowel run...
          $str = preg_replace('#^' . $c . '+#', '', $str);
          $str = preg_replace('#' . $v . '+$#', '', $str);

          // ...then count the vowel-run/consonant-run pairs that remain.
          preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

          return count($matches[1]);
      }

      /**
       * Returns true/false as to whether the given string contains two
       * of the same consonant next to each other at the end of the string.
       *
       * @param   string  $str  String to check
       *
       * @return  boolean  Result
       *
       * @since   12.1
       */
      private static function _doubleConsonant($str)
      {
          $c = self::$_regex_consonant;

          // Square-bracket offsets: the curly-brace form is a parse error as of PHP 8.0.
          return preg_match('#' . $c . '{2}$#', $str, $matches) && $matches[0][0] === $matches[0][1];
      }

      /**
       * Checks for ending CVC sequence where second C is not W, X or Y
       *
       * @param   string  $str  String to check
       *
       * @return  boolean  Result
       *
       * @since   12.1
       */
      private static function _cvc($str)
      {
          $c = self::$_regex_consonant;
          $v = self::$_regex_vowel;

          if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches))
          {
              return false;
          }

          // Evaluate the whole condition before returning it: assigning it with the
          // low-precedence "and" operator would keep only the preg_match() result
          // and silently drop the w/x/y exclusion.
          return strlen($matches[1]) === 3
              && $matches[1][2] !== 'w'
              && $matches[1][2] !== 'x'
              && $matches[1][2] !== 'y';
      }
  }