PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/public_html/models/behaviors/porter_stemmer.php

https://bitbucket.org/southsidehealth/southsidehealth
PHP | 430 lines | 228 code | 80 blank | 122 comment | 40 complexity | 63bfc0d59d6d51447a066c433a25a541 MD5 | raw file
Possible License(s): Apache-2.0, GPL-3.0, LGPL-2.1
  1. <?php
  2. // Copyright 2008-2011 South Side Health.org
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. // http://www.apache.org/licenses/LICENSE-2.0
  7. // Unless required by applicable law or agreed to in writing, software
  8. // distributed under the License is distributed on an "AS IS" BASIS,
  9. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  10. // express or implied.
  11. // See the License for the specific language governing permissions and
  12. // limitations under the License.
  13. /**
  14. * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
  15. *
  16. * All rights reserved.
  17. *
  18. * This script is free software.
  19. */
  20. /**
  21. * PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
  22. * were borrowed from the (broken) implementation by Jon Abernathy.
  23. *
  24. * Usage:
  25. *
  26. * $stem = PorterStemmer::Stem($word);
  27. *
  28. * How easy is that?
  29. */
  30. /**
  31. * Changed the class name to PorterStemmerBehavior to integrate
  32. * it into Cake as a behavior.
  33. * - Declan Frye, 12/22/09
  34. */
  /**
   * Porter stemmer exposed as a model behavior.
   *
   * The only public entry point is Stem(); every other member is a private
   * helper implementing one step of the algorithm. All suffix tables and
   * regular expressions are lowercase and case-sensitive, so callers are
   * expected to pass lowercase words (mixed-case input is not normalised here).
   */
  class PorterStemmerBehavior extends ModelBehavior
  {
      /**
       * Regex for matching a consonant
       * @var string
       */
      private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

      /**
       * Regex for matching a vowel
       * @var string
       */
      private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

      /**
       * Step 1a plural rules, tried in order (suffix => replacement).
       * @var array
       */
      private static $step1aRules = array(
          'sses' => 'ss',
          'ies'  => 'i',
          'ss'   => 'ss',
          's'    => '',
      );

      /**
       * Step 1b repair rules applied after "ing"/"ed" has been stripped.
       * @var array
       */
      private static $step1bRepairRules = array(
          'at' => 'ate',
          'bl' => 'ble',
          'iz' => 'ize',
      );

      /**
       * Step 2 rules, keyed by the penultimate letter of the word. Each entry
       * is an ordered suffix => replacement map; only the first matching
       * suffix is considered.
       * @var array
       */
      private static $step2Rules = array(
          'a' => array('ational' => 'ate', 'tional' => 'tion'),
          'c' => array('enci' => 'ence', 'anci' => 'ance'),
          'e' => array('izer' => 'ize'),
          'g' => array('logi' => 'log'),
          'l' => array(
              'entli' => 'ent',
              'ousli' => 'ous',
              'alli'  => 'al',
              'bli'   => 'ble',
              'eli'   => 'e',
          ),
          'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
          's' => array(
              'iveness' => 'ive',
              'fulness' => 'ful',
              'ousness' => 'ous',
              'alism'   => 'al',
          ),
          't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
      );

      /**
       * Step 3 rules, keyed by the penultimate letter of the word.
       * @var array
       */
      private static $step3Rules = array(
          'a' => array('ical' => 'ic'),
          's' => array('ness' => ''),
          't' => array('icate' => 'ic', 'iciti' => 'ic'),
          'u' => array('ful' => ''),
          'v' => array('ative' => ''),
          'z' => array('alize' => 'al'),
      );

      /**
       * Step 4 rules, keyed by the penultimate letter of the word. The 'o'
       * key is absent on purpose: it needs a conditional and is handled in
       * step4() itself.
       * @var array
       */
      private static $step4Rules = array(
          'a' => array('al' => ''),
          'c' => array('ance' => '', 'ence' => ''),
          'e' => array('er' => ''),
          'i' => array('ic' => ''),
          'l' => array('able' => '', 'ible' => ''),
          'n' => array('ant' => '', 'ement' => '', 'ment' => '', 'ent' => ''),
          's' => array('ism' => ''),
          't' => array('ate' => '', 'iti' => ''),
          'u' => array('ous' => ''),
          'v' => array('ive' => ''),
          'z' => array('ize' => ''),
      );

      /**
       * Stems a word.
       *
       * Words of two characters or fewer are returned unchanged.
       *
       * @param object $model Model the behavior is attached to (not used here)
       * @param string $word  Word to stem
       * @return string Stemmed word
       */
      public static function Stem(&$model, $word)
      {
          if (strlen($word) <= 2) {
              return $word;
          }

          $word = self::step1ab($word);
          $word = self::step1c($word);
          $word = self::step2($word);
          $word = self::step3($word);
          $word = self::step4($word);
          $word = self::step5($word);

          return $word;
      }

      /**
       * Step 1a and 1b: plurals, then "eed" / "ing" / "ed" endings.
       *
       * @param string $word Word to stem
       * @return string Word after step 1a/1b
       */
      private static function step1ab($word)
      {
          // Part a
          if (substr($word, -1) === 's') {
              self::applyRules($word, self::$step1aRules, null);
          }

          // Part b. replace() returns true whenever the word ends in "eed",
          // even if the measure condition prevented the rewrite, so an
          // "eed" word never falls through to the "ed" rule below.
          if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
              $hasVowel = '#' . self::$regex_vowel . '+#';

              // "ing" / "ed" are only stripped when the remaining stem
              // contains a vowel.
              $stripped =
                  (preg_match($hasVowel, (string) substr($word, 0, -3)) && self::replace($word, 'ing', ''))
                  || (preg_match($hasVowel, (string) substr($word, 0, -2)) && self::replace($word, 'ed', ''));

              if ($stripped && !self::applyRules($word, self::$step1bRepairRules, null)) {
                  $tail = substr($word, -2);
                  if (self::doubleConsonant($word)
                      && $tail !== 'll'
                      && $tail !== 'ss'
                      && $tail !== 'zz'
                  ) {
                      // Undouble the final consonant.
                      $word = substr($word, 0, -1);
                  } elseif (self::m($word) === 1 && self::cvc($word)) {
                      $word .= 'e';
                  }
              }
          }

          return $word;
      }

      /**
       * Step 1c: turn a final "y" into "i" when the rest of the word
       * contains a vowel.
       *
       * @param string $word Word to stem
       * @return string Word after step 1c
       */
      private static function step1c($word)
      {
          $hasVowel = '#' . self::$regex_vowel . '+#';

          if (substr($word, -1) === 'y' && preg_match($hasVowel, (string) substr($word, 0, -1))) {
              self::replace($word, 'y', 'i');
          }

          return $word;
      }

      /**
       * Step 2: rewrite the suffixes in $step2Rules when the preceding stem
       * has a measure greater than 0.
       *
       * @param string $word Word to stem
       * @return string Word after step 2
       */
      private static function step2($word)
      {
          self::applyKeyedRules($word, self::$step2Rules, 0);

          return $word;
      }

      /**
       * Step 3: rewrite the suffixes in $step3Rules when the preceding stem
       * has a measure greater than 0.
       *
       * @param string $word String to stem
       * @return string Word after step 3
       */
      private static function step3($word)
      {
          self::applyKeyedRules($word, self::$step3Rules, 0);

          return $word;
      }

      /**
       * Step 4: drop the suffixes in $step4Rules (plus "ion" / "ou") when
       * the preceding stem has a measure greater than 1.
       *
       * @param string $word Word to stem
       * @return string Word after step 4
       */
      private static function step4($word)
      {
          if (substr($word, -2, 1) === 'o') {
              // "ion" is only removed when preceded by "t" or "s".
              $ending = substr($word, -4);
              if ($ending === 'tion' || $ending === 'sion') {
                  self::replace($word, 'ion', '', 1);
              } else {
                  self::replace($word, 'ou', '', 1);
              }

              return $word;
          }

          self::applyKeyedRules($word, self::$step4Rules, 1);

          return $word;
      }

      /**
       * Step 5: tidy up a trailing "e" and a trailing double "l".
       *
       * @param string $word Word to stem
       * @return string Word after step 5
       */
      private static function step5($word)
      {
          // Part a: drop the final "e" when the stem measure is > 1, or
          // exactly 1 and the stem does not end consonant-vowel-consonant.
          if (substr($word, -1) === 'e') {
              $stem    = (string) substr($word, 0, -1);
              $measure = self::m($stem);

              if ($measure > 1 || ($measure === 1 && !self::cvc($stem))) {
                  $word = $stem;
              }
          }

          // Part b
          if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
              $word = substr($word, 0, -1);
          }

          return $word;
      }

      /**
       * Looks up the rule list for the word's penultimate letter in $table
       * and applies it via applyRules().
       *
       * @param string   $word  Word to modify in place
       * @param array    $table Map of penultimate letter => ordered rule list
       * @param int|null $m     Measure threshold forwarded to replace()
       * @return void
       */
      private static function applyKeyedRules(&$word, $table, $m)
      {
          $key = substr($word, -2, 1);

          if (is_string($key) && $key !== '' && isset($table[$key])) {
              self::applyRules($word, $table[$key], $m);
          }
      }

      /**
       * Tries each suffix => replacement rule in order and stops at the
       * first suffix the word ends with, whether or not the measure
       * condition allowed the rewrite (see replace()).
       *
       * @param string   $word  Word to modify in place
       * @param array    $rules Ordered suffix => replacement map
       * @param int|null $m     Measure threshold forwarded to replace()
       * @return bool True if some suffix in $rules matched the end of $word
       */
      private static function applyRules(&$word, $rules, $m)
      {
          foreach ($rules as $suffix => $replacement) {
              if (self::replace($word, $suffix, $replacement, $m)) {
                  return true;
              }
          }

          return false;
      }

      /**
       * Replaces the first string with the second, at the end of the string. If third
       * arg is given, then the preceding string must have an m() count strictly
       * greater than it for the replacement to happen.
       *
       * @param string   $str   String to check (modified in place)
       * @param string   $check Ending to check for
       * @param string   $repl  Replacement string
       * @param int|null $m     Optional measure the stem's m() must exceed
       * @return bool Whether the $check string was at the end
       *              of the $str string. True does not necessarily mean
       *              that it was replaced.
       */
      private static function replace(&$str, $check, $repl, $m = null)
      {
          $offset = 0 - strlen($check);

          if (substr($str, $offset) !== $check) {
              return false;
          }

          // Cast: before PHP 8 substr() yields false rather than '' when
          // the whole string is the suffix.
          $stem = (string) substr($str, 0, $offset);
          if ($m === null || self::m($stem) > $m) {
              $str = $stem . $repl;
          }

          return true;
      }

      /**
       * m() measures the number of consonant sequences in $str. if c is
       * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
       * presence,
       *
       * <c><v> gives 0
       * <c>vc<v> gives 1
       * <c>vcvc<v> gives 2
       * <c>vcvcvc<v> gives 3
       *
       * @param string $str The string to return the m count for
       * @return int The m count
       */
      private static function m($str)
      {
          $c = self::$regex_consonant;
          $v = self::$regex_vowel;

          // Strip the optional leading consonant run and trailing vowel run,
          // then count the remaining vowel-run + consonant-run pairs.
          $str = preg_replace('#^' . $c . '+#', '', (string) $str);
          $str = preg_replace('#' . $v . '+$#', '', $str);
          preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

          return count($matches[1]);
      }

      /**
       * Returns true/false as to whether the given string contains two
       * of the same consonant next to each other at the end of the string.
       *
       * @param string $str String to check
       * @return bool Result
       */
      private static function doubleConsonant($str)
      {
          $pattern = '#' . self::$regex_consonant . '{2}$#';

          if (!preg_match($pattern, $str, $matches)) {
              return false;
          }

          // Square-bracket offsets: the curly-brace form ($s{0}) was removed
          // in PHP 8.0.
          return $matches[0][0] === $matches[0][1];
      }

      /**
       * Checks for ending CVC sequence where second C is not W, X or Y
       *
       * @param string $str String to check
       * @return bool Result
       */
      private static function cvc($str)
      {
          $c = self::$regex_consonant;
          $v = self::$regex_vowel;

          if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)) {
              return false;
          }

          if (strlen($matches[1]) !== 3) {
              return false;
          }

          $last = $matches[1][2];

          return $last !== 'w' && $last !== 'x' && $last !== 'y';
      }
  }
  350. ?>