/application/models/spell.php

https://github.com/meloncholy/video-gallery · PHP · 283 lines · 190 code · 42 blank · 51 comment · 29 complexity · d112e432c02148e2244e9b6083dbdecb MD5 · raw file

  1. <?php
  2. /**
  3. * Check spelling of searches
  4. *
  5. * Dictionary adapted from code by Vincenzo Russo, Ian Barber
  6. * http://neminis.org/blog/research/text-mining/spelling-correction-with-soundex/
  7. *
  8. * @package VideoGallery
  9. * @subpackage Spell
  10. * @copyright Copyright (c) 2011 Andrew Weeks http://meloncholy.com
  11. * @license MIT licence. See licence.txt for details.
  12. * @version 0.1
  13. */
  class Spell extends CI_Model
  {
      /**
       * Dictionary: lower-cased word => number of times it was seen in training.
       *
       * @var array
       */
      public $dic = array();

      /**
       * Dictionary minus single letter words (same word => frequency shape).
       *
       * @var array
       */
      public $dic_edit = array();

      /**
       * Connect to the database and load both dictionaries into memory.
       */
      public function __construct()
      {
          parent::__construct();

          $this->load->database();
          $this->dic = $this->load_dic('dic');
          $this->dic_edit = $this->load_dic('dicedit');
      }

      /**
       * Is this word in the dictionary? The word is lower-cased before the lookup.
       *
       * @param string $word Word to look up
       * @return bool True if the word is a dictionary entry
       */
      public function check($word)
      {
          return isset($this->dic[strtolower($word)]);
      }

      /**
       * Suggest a corrected spelling for a word.
       *
       * A dictionary word is returned as is. Otherwise the most frequent
       * dictionary word at Levenshtein distance 1 is preferred, then the most
       * frequent at distance 2. If nothing is close enough the lower-cased
       * input is returned unchanged.
       *
       * @param string $word Word to correct
       * @param float $match Set (by ref) to how close to a correct spelling the word is, roughly:
       *                     1.0 exact, 0.6 one edit away, 0.3 two edits away, 0.0 no suggestion
       * @return string Best guess at the intended word (lower case)
       */
      public function correct($word, &$match = 0.0)
      {
          $word = strtolower($word);

          if (isset($this->dic[$word]))
          {
              $match = 1.0;
              return $word;
          }

          // Nothing sensible can be suggested for an empty or single letter word
          // (an empty string is 2 edits from every 2 letter word), and before
          // PHP 8.0 levenshtein() rejects strings longer than 255 bytes.
          $length = strlen($word);

          if ($length <= 1 || $length > 255)
          {
              $match = 0.0;
              return $word;
          }

          // Most frequent candidate found so far at edit distance 1 and 2
          $best_word = array(1 => null, 2 => null);
          $best_freq = array(1 => null, 2 => null);

          foreach ($this->dic_edit as $dic_word => $freq)
          {
              // All-digit words come back from the array as integer keys
              $dic_word = (string) $dic_word;
              $dist = levenshtein($word, $dic_word);

              if ($dist !== 1 && $dist !== 2) continue;

              // Strict > so the earliest entry wins a tie on frequency
              if ($best_word[$dist] === null || $freq > $best_freq[$dist])
              {
                  $best_word[$dist] = $dic_word;
                  $best_freq[$dist] = $freq;
              }
          }

          if ($best_word[1] !== null)
          {
              $match = 0.6;
              return $best_word[1];
          }

          if ($best_word[2] !== null)
          {
              $match = 0.3;
              return $best_word[2];
          }

          // Nothing better
          $match = 0.0;
          return $word;
      }

      /**
       * Store a new dictionary of words in the database. Only needed to update the list.
       *
       * Prints a short status message. If the file cannot be read or holds no
       * words, the stored dictionaries are left untouched.
       *
       * @param string $file List of words or phrases to learn (one per line)
       */
      public function train($file)
      {
          $contents = file_get_contents($file);

          if ($contents === false)
          {
              // Carrying on would replace the stored dictionaries with empty ones
              echo "Could not read $file.";
              return;
          }

          // Get all strings of word letters
          preg_match_all('/\w+/', $contents, $matches);
          unset($contents);

          // Lower-cased word => number of occurrences, in order of first appearance
          $dic = array_count_values(array_map('strtolower', $matches[0]));
          unset($matches);

          if (count($dic) == 0)
          {
              echo "No words found in $file.";
              return;
          }

          $dic_edit = array();

          foreach ($dic as $word => $freq)
          {
              if (strlen((string) $word) > 1) $dic_edit[$word] = $freq;
          }

          $this->dic = $dic;
          $this->dic_edit = $dic_edit;
          $this->save_dic($dic, 'dic');
          $this->save_dic($dic_edit, 'dicedit');

          echo "Updated dictionaries from $file.";
      }

      /**
       * Store the new dictionary in the database (called by train)
       *
       * Replaces the whole contents of the table. The delete and the inserts
       * run inside one transaction so a failed insert does not leave the table
       * empty (where the table engine supports transactions).
       *
       * @param array $dic New dictionary (word => frequency)
       * @param string $table Target table
       */
      private function save_dic($dic, $table)
      {
          // Rows per INSERT statement; keeps each query to a bounded size
          $batch_size = 500;

          $this->db->trans_start();
          $this->db->query("DELETE FROM $table");

          // An empty dictionary gives no batches, so no INSERT is attempted
          foreach (array_chunk($dic, $batch_size, true) as $batch)
          {
              $rows = array();

              foreach ($batch as $word => $freq)
              {
                  // escape() quotes and escapes strings; the cast keeps all-digit words as strings
                  $rows[] = '(' . $this->db->escape((string) $word) . ', ' . (int) $freq . ')';
              }

              $this->db->query("INSERT INTO $table (word, freq) VALUES \n" . implode(", \n", $rows));
          }

          $this->db->trans_complete();
      }

      /**
       * Load dictionary from database
       *
       * @param string $table Table to load
       * @return array Dictionary (word => frequency)
       */
      private function load_dic($table)
      {
          $dic = array();
          $query = $this->db->query("SELECT word, freq FROM $table");

          foreach ($query->result() as $row)
          {
              $dic[$row->word] = $row->freq;
          }

          return $dic;
      }

      /**
       * Join words together to try in dictionary
       *
       * Builds every way of gluing runs of adjacent words together where each
       * glued run (2 or more words) is a dictionary word. Single words are
       * always kept, so the unjoined search string is always one of the results.
       *
       * @param array $words Search string to check as array of words
       * @param array $joined_words Array of joined words (internal; passed by ref)
       * @param int $pos_start Current position in array (internal)
       * @param array $comb_start Array of joined words current building (internal)
       * @return array Array of combinations, each an array of (possibly joined) words
       */
      public function join($words, &$joined_words = array(), $pos_start = 0, $comb_start = array())
      {
          $count = count($words);

          // Try the longest run starting here first, then shorter ones
          for ($len = $count - $pos_start; $len > 0; $len--)
          {
              $word = implode('', array_slice($words, $pos_start, $len));

              // A run of several words is only usable if it makes a known word
              if ($len != 1 && !$this->check($word)) continue;

              $comb = $comb_start;
              $comb[] = $word;

              if ($pos_start + $len < $count)
              {
                  // Words left over: carry on joining from the end of this run
                  $this->join($words, $joined_words, $pos_start + $len, $comb);
              }
              else
              {
                  // Used every word: this combination is complete
                  $joined_words[] = $comb;
              }
          }

          return $joined_words;
      }

      /**
       * Split compound words for spell check
       *
       * Returns every way of cutting the word into pieces that does not
       * contain two unknown pieces in a row. A piece is known if it is longer
       * than one letter and is in the dictionary.
       *
       * @param string $word Word to split
       * @return array Array of split words (each an array of pieces)
       */
      public function split($word)
      {
          $new_words = array();
          return $this->split_r(str_split($word), $new_words);
      }

      /**
       * Recursive worker for split
       *
       * A branch is abandoned as soon as it has two unknown pieces in a row,
       * as no continuation of it could be accepted. This gives the same
       * results, in the same order, as building every split and filtering
       * afterwards, without visiting all of them.
       *
       * @param array $letters Word to split as array of single characters
       * @param array $new_words Accepted splits so far (passed by ref)
       * @param int $pos_start Position of the first letter not yet used
       * @param mixed $len Ignored; recalculated from $pos_start (kept so the parameter order is unchanged)
       * @param array $comb_start Pieces already chosen for the letters before $pos_start
       * @param bool $prev_unknown True if the last piece in $comb_start is not a known word
       * @return array Accepted splits
       */
      private function split_r($letters, &$new_words, $pos_start = 0, $len = false, $comb_start = array(), $prev_unknown = false)
      {
          $count = count($letters);

          // Try the longest piece starting here first, then shorter ones
          for ($len = $count - $pos_start; $len > 0; $len--)
          {
              $piece = implode('', array_slice($letters, $pos_start, $len));
              $unknown = !(strlen($piece) > 1 && $this->check($piece));

              // Two unknown pieces in a row rule this split out, however it continues
              if ($prev_unknown && $unknown) continue;

              $comb = $comb_start;
              $comb[] = $piece;

              if ($pos_start + $len < $count)
              {
                  $this->split_r($letters, $new_words, $pos_start + $len, $len, $comb, $unknown);
              }
              else
              {
                  // Used every letter: this split is complete
                  $new_words[] = $comb;
              }
          }

          return $new_words;
      }

      /**
       * Combine alternative splits of several words into complete word lists
       *
       * Takes one set of alternatives per original word, where each alternative
       * is an array of words, and returns every combination of one alternative
       * per set, each merged into a single flat array of words.
       *
       * @param array $split_words_sets Array of sets of alternatives
       * @return array Array of merged word lists (empty if there are no sets)
       */
      public function join_alts($split_words_sets)
      {
          $split_words_list = array();

          // The recursion addresses sets by position, so make the keys 0..n-1
          $split_words_sets = array_values($split_words_sets);

          if (count($split_words_sets) > 0)
          {
              $this->join_alts_r($split_words_sets, $split_words_list);
          }

          return $split_words_list;
      }

      /**
       * Recursive worker for join_alts
       *
       * @param array $split_words_sets Array of sets of alternatives (keys 0..n-1)
       * @param array $split_words_list Completed word lists so far (passed by ref)
       * @param int $set_idx Index of the set to choose from at this level
       * @param array $cur_split_words Words merged from the sets before $set_idx
       */
      private function join_alts_r($split_words_sets, &$split_words_list, $set_idx = 0, $cur_split_words = array())
      {
          $is_last_set = ($set_idx == count($split_words_sets) - 1);

          foreach ($split_words_sets[$set_idx] as $split_word)
          {
              $new_split_words = array_merge($cur_split_words, $split_word);

              if ($is_last_set)
              {
                  $split_words_list[] = $new_split_words;
              }
              else
              {
                  $this->join_alts_r($split_words_sets, $split_words_list, $set_idx + 1, $new_split_words);
              }
          }
      }
  }