PageRenderTime 59ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/vendor/Zend/Search/Lucene/Search/Query/Fuzzy.php

https://bitbucket.org/anycode/sfluceneplugin
PHP | 493 lines | 227 code | 66 blank | 200 comment | 57 complexity | 2cf0af41571f5881b3eacc123c8da58a MD5 | raw file
Possible License(s): BSD-3-Clause
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Fuzzy.php 20096 2010-01-06 02:05:09Z bkarwin $
  21. */
  22. /** Zend_Search_Lucene_Search_Query */
  23. require_once 'Zend/Search/Lucene/Search/Query.php';
  24. /**
  25. * @category Zend
  26. * @package Zend_Search_Lucene
  27. * @subpackage Search
  28. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  29. * @license http://framework.zend.com/license/new-bsd New BSD License
  30. */
  31. class Zend_Search_Lucene_Search_Query_Fuzzy extends Zend_Search_Lucene_Search_Query
  32. {
  33. /** Default minimum similarity */
  34. const DEFAULT_MIN_SIMILARITY = 0.5;
  35. /**
  36. * Maximum number of matched terms.
  37. * Apache Lucene defines this limitation as boolean query maximum number of clauses:
  38. * org.apache.lucene.search.BooleanQuery.getMaxClauseCount()
  39. */
  40. const MAX_CLAUSE_COUNT = 1024;
  41. /**
  42. * Array of precalculated max distances
  43. *
  44. * keys are integers representing a word size
  45. */
  46. private $_maxDistances = array();
  47. /**
  48. * Base searching term.
  49. *
  50. * @var Zend_Search_Lucene_Index_Term
  51. */
  52. private $_term;
  53. /**
  54. * A value between 0 and 1 to set the required similarity
  55. * between the query term and the matching terms. For example, for a
  56. * _minimumSimilarity of 0.5 a term of the same length
  57. * as the query term is considered similar to the query term if the edit distance
  58. * between both terms is less than length(term)*0.5
  59. *
  60. * @var float
  61. */
  62. private $_minimumSimilarity;
  63. /**
  64. * The length of common (non-fuzzy) prefix
  65. *
  66. * @var integer
  67. */
  68. private $_prefixLength;
  69. /**
  70. * Matched terms.
  71. *
  72. * Matched terms list.
  73. * It's filled during the search (rewrite operation) and may be used for search result
  74. * post-processing
  75. *
  76. * Array of Zend_Search_Lucene_Index_Term objects
  77. *
  78. * @var array
  79. */
  80. private $_matches = null;
  81. /**
  82. * Matched terms scores
  83. *
  84. * @var array
  85. */
  86. private $_scores = null;
  87. /**
  88. * Array of the term keys.
  89. * Used to sort terms in alphabetical order if terms have the same socres
  90. *
  91. * @var array
  92. */
  93. private $_termKeys = null;
  94. /**
  95. * Default non-fuzzy prefix length
  96. *
  97. * @var integer
  98. */
  99. private static $_defaultPrefixLength = 3;
  100. /**
  101. * Zend_Search_Lucene_Search_Query_Wildcard constructor.
  102. *
  103. * @param Zend_Search_Lucene_Index_Term $term
  104. * @param float $minimumSimilarity
  105. * @param integer $prefixLength
  106. * @throws Zend_Search_Lucene_Exception
  107. */
  108. public function __construct(Zend_Search_Lucene_Index_Term $term, $minimumSimilarity = self::DEFAULT_MIN_SIMILARITY, $prefixLength = null)
  109. {
  110. if ($minimumSimilarity < 0) {
  111. require_once 'Zend/Search/Lucene/Exception.php';
  112. throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be less than 0');
  113. }
  114. if ($minimumSimilarity >= 1) {
  115. require_once 'Zend/Search/Lucene/Exception.php';
  116. throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be greater than or equal to 1');
  117. }
  118. if ($prefixLength < 0) {
  119. require_once 'Zend/Search/Lucene/Exception.php';
  120. throw new Zend_Search_Lucene_Exception('prefixLength cannot be less than 0');
  121. }
  122. $this->_term = $term;
  123. $this->_minimumSimilarity = $minimumSimilarity;
  124. $this->_prefixLength = ($prefixLength !== null)? $prefixLength : self::$_defaultPrefixLength;
  125. }
  126. /**
  127. * Get default non-fuzzy prefix length
  128. *
  129. * @return integer
  130. */
  131. public static function getDefaultPrefixLength()
  132. {
  133. return self::$_defaultPrefixLength;
  134. }
  135. /**
  136. * Set default non-fuzzy prefix length
  137. *
  138. * @param integer $defaultPrefixLength
  139. */
  140. public static function setDefaultPrefixLength($defaultPrefixLength)
  141. {
  142. self::$_defaultPrefixLength = $defaultPrefixLength;
  143. }
  144. /**
  145. * Calculate maximum distance for specified word length
  146. *
  147. * @param integer $prefixLength
  148. * @param integer $termLength
  149. * @param integer $length
  150. * @return integer
  151. */
  152. private function _calculateMaxDistance($prefixLength, $termLength, $length)
  153. {
  154. $this->_maxDistances[$length] = (int) ((1 - $this->_minimumSimilarity)*(min($termLength, $length) + $prefixLength));
  155. return $this->_maxDistances[$length];
  156. }
  157. /**
  158. * Re-write query into primitive queries in the context of specified index
  159. *
  160. * @param Zend_Search_Lucene_Interface $index
  161. * @return Zend_Search_Lucene_Search_Query
  162. * @throws Zend_Search_Lucene_Exception
  163. */
  164. public function rewrite(Zend_Search_Lucene_Interface $index)
  165. {
  166. $this->_matches = array();
  167. $this->_scores = array();
  168. $this->_termKeys = array();
  169. if ($this->_term->field === null) {
  170. // Search through all fields
  171. $fields = $index->getFieldNames(true /* indexed fields list */);
  172. } else {
  173. $fields = array($this->_term->field);
  174. }
  175. require_once 'Zend/Search/Lucene/Index/Term.php';
  176. $prefix = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
  177. $prefixByteLength = strlen($prefix);
  178. $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);
  179. $termLength = Zend_Search_Lucene_Index_Term::getLength($this->_term->text);
  180. $termRest = substr($this->_term->text, $prefixByteLength);
  181. // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
  182. $termRestLength = strlen($termRest);
  183. $scaleFactor = 1/(1 - $this->_minimumSimilarity);
  184. require_once 'Zend/Search/Lucene.php';
  185. $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();
  186. foreach ($fields as $field) {
  187. $index->resetTermsStream();
  188. require_once 'Zend/Search/Lucene/Index/Term.php';
  189. if ($prefix != '') {
  190. $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));
  191. while ($index->currentTerm() !== null &&
  192. $index->currentTerm()->field == $field &&
  193. substr($index->currentTerm()->text, 0, $prefixByteLength) == $prefix) {
  194. // Calculate similarity
  195. $target = substr($index->currentTerm()->text, $prefixByteLength);
  196. $maxDistance = isset($this->_maxDistances[strlen($target)])?
  197. $this->_maxDistances[strlen($target)] :
  198. $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target));
  199. if ($termRestLength == 0) {
  200. // we don't have anything to compare. That means if we just add
  201. // the letters for current term we get the new word
  202. $similarity = (($prefixUtf8Length == 0)? 0 : 1 - strlen($target)/$prefixUtf8Length);
  203. } else if (strlen($target) == 0) {
  204. $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length);
  205. } else if ($maxDistance < abs($termRestLength - strlen($target))){
  206. //just adding the characters of term to target or vice-versa results in too many edits
  207. //for example "pre" length is 3 and "prefixes" length is 8. We can see that
  208. //given this optimal circumstance, the edit distance cannot be less than 5.
  209. //which is 8-3 or more precisesly abs(3-8).
  210. //if our maximum edit distance is 4, then we can discard this word
  211. //without looking at it.
  212. $similarity = 0;
  213. } else {
  214. $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, strlen($target)));
  215. }
  216. if ($similarity > $this->_minimumSimilarity) {
  217. $this->_matches[] = $index->currentTerm();
  218. $this->_termKeys[] = $index->currentTerm()->key();
  219. $this->_scores[] = ($similarity - $this->_minimumSimilarity)*$scaleFactor;
  220. if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
  221. require_once 'Zend/Search/Lucene/Exception.php';
  222. throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
  223. }
  224. }
  225. $index->nextTerm();
  226. }
  227. } else {
  228. $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
  229. while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
  230. // Calculate similarity
  231. $target = $index->currentTerm()->text;
  232. $maxDistance = isset($this->_maxDistances[strlen($target)])?
  233. $this->_maxDistances[strlen($target)] :
  234. $this->_calculateMaxDistance(0, $termRestLength, strlen($target));
  235. if ($maxDistance < abs($termRestLength - strlen($target))){
  236. //just adding the characters of term to target or vice-versa results in too many edits
  237. //for example "pre" length is 3 and "prefixes" length is 8. We can see that
  238. //given this optimal circumstance, the edit distance cannot be less than 5.
  239. //which is 8-3 or more precisesly abs(3-8).
  240. //if our maximum edit distance is 4, then we can discard this word
  241. //without looking at it.
  242. $similarity = 0;
  243. } else {
  244. $similarity = 1 - levenshtein($termRest, $target)/min($termRestLength, strlen($target));
  245. }
  246. if ($similarity > $this->_minimumSimilarity) {
  247. $this->_matches[] = $index->currentTerm();
  248. $this->_termKeys[] = $index->currentTerm()->key();
  249. $this->_scores[] = ($similarity - $this->_minimumSimilarity)*$scaleFactor;
  250. if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
  251. require_once 'Zend/Search/Lucene/Exception.php';
  252. throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
  253. }
  254. }
  255. $index->nextTerm();
  256. }
  257. }
  258. $index->closeTermsStream();
  259. }
  260. if (count($this->_matches) == 0) {
  261. require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  262. return new Zend_Search_Lucene_Search_Query_Empty();
  263. } else if (count($this->_matches) == 1) {
  264. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  265. return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
  266. } else {
  267. require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
  268. $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Boolean();
  269. array_multisort($this->_scores, SORT_DESC, SORT_NUMERIC,
  270. $this->_termKeys, SORT_ASC, SORT_STRING,
  271. $this->_matches);
  272. $termCount = 0;
  273. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  274. foreach ($this->_matches as $id => $matchedTerm) {
  275. $subquery = new Zend_Search_Lucene_Search_Query_Term($matchedTerm);
  276. $subquery->setBoost($this->_scores[$id]);
  277. $rewrittenQuery->addSubquery($subquery);
  278. $termCount++;
  279. if ($termCount >= self::MAX_CLAUSE_COUNT) {
  280. break;
  281. }
  282. }
  283. return $rewrittenQuery;
  284. }
  285. }
  286. /**
  287. * Optimize query in the context of specified index
  288. *
  289. * @param Zend_Search_Lucene_Interface $index
  290. * @return Zend_Search_Lucene_Search_Query
  291. */
  292. public function optimize(Zend_Search_Lucene_Interface $index)
  293. {
  294. require_once 'Zend/Search/Lucene/Exception.php';
  295. throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
  296. }
  297. /**
  298. * Return query terms
  299. *
  300. * @return array
  301. * @throws Zend_Search_Lucene_Exception
  302. */
  303. public function getQueryTerms()
  304. {
  305. if ($this->_matches === null) {
  306. require_once 'Zend/Search/Lucene/Exception.php';
  307. throw new Zend_Search_Lucene_Exception('Search or rewrite operations have to be performed before.');
  308. }
  309. return $this->_matches;
  310. }
  311. /**
  312. * Constructs an appropriate Weight implementation for this query.
  313. *
  314. * @param Zend_Search_Lucene_Interface $reader
  315. * @return Zend_Search_Lucene_Search_Weight
  316. * @throws Zend_Search_Lucene_Exception
  317. */
  318. public function createWeight(Zend_Search_Lucene_Interface $reader)
  319. {
  320. require_once 'Zend/Search/Lucene/Exception.php';
  321. throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
  322. }
  323. /**
  324. * Execute query in context of index reader
  325. * It also initializes necessary internal structures
  326. *
  327. * @param Zend_Search_Lucene_Interface $reader
  328. * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
  329. * @throws Zend_Search_Lucene_Exception
  330. */
  331. public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
  332. {
  333. require_once 'Zend/Search/Lucene/Exception.php';
  334. throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
  335. }
  336. /**
  337. * Get document ids likely matching the query
  338. *
  339. * It's an array with document ids as keys (performance considerations)
  340. *
  341. * @return array
  342. * @throws Zend_Search_Lucene_Exception
  343. */
  344. public function matchedDocs()
  345. {
  346. require_once 'Zend/Search/Lucene/Exception.php';
  347. throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
  348. }
  349. /**
  350. * Score specified document
  351. *
  352. * @param integer $docId
  353. * @param Zend_Search_Lucene_Interface $reader
  354. * @return float
  355. * @throws Zend_Search_Lucene_Exception
  356. */
  357. public function score($docId, Zend_Search_Lucene_Interface $reader)
  358. {
  359. require_once 'Zend/Search/Lucene/Exception.php';
  360. throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
  361. }
  362. /**
  363. * Query specific matches highlighting
  364. *
  365. * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
  366. */
  367. protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
  368. {
  369. $words = array();
  370. require_once 'Zend/Search/Lucene/Index/Term.php';
  371. $prefix = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
  372. $prefixByteLength = strlen($prefix);
  373. $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);
  374. $termLength = Zend_Search_Lucene_Index_Term::getLength($this->_term->text);
  375. $termRest = substr($this->_term->text, $prefixByteLength);
  376. // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
  377. $termRestLength = strlen($termRest);
  378. $scaleFactor = 1/(1 - $this->_minimumSimilarity);
  379. $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
  380. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  381. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
  382. foreach ($tokens as $token) {
  383. $termText = $token->getTermText();
  384. if (substr($termText, 0, $prefixByteLength) == $prefix) {
  385. // Calculate similarity
  386. $target = substr($termText, $prefixByteLength);
  387. $maxDistance = isset($this->_maxDistances[strlen($target)])?
  388. $this->_maxDistances[strlen($target)] :
  389. $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target));
  390. if ($termRestLength == 0) {
  391. // we don't have anything to compare. That means if we just add
  392. // the letters for current term we get the new word
  393. $similarity = (($prefixUtf8Length == 0)? 0 : 1 - strlen($target)/$prefixUtf8Length);
  394. } else if (strlen($target) == 0) {
  395. $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length);
  396. } else if ($maxDistance < abs($termRestLength - strlen($target))){
  397. //just adding the characters of term to target or vice-versa results in too many edits
  398. //for example "pre" length is 3 and "prefixes" length is 8. We can see that
  399. //given this optimal circumstance, the edit distance cannot be less than 5.
  400. //which is 8-3 or more precisesly abs(3-8).
  401. //if our maximum edit distance is 4, then we can discard this word
  402. //without looking at it.
  403. $similarity = 0;
  404. } else {
  405. $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, strlen($target)));
  406. }
  407. if ($similarity > $this->_minimumSimilarity) {
  408. $words[] = $termText;
  409. }
  410. }
  411. }
  412. $highlighter->highlight($words);
  413. }
  414. /**
  415. * Print a query
  416. *
  417. * @return string
  418. */
  419. public function __toString()
  420. {
  421. // It's used only for query visualisation, so we don't care about characters escaping
  422. return (($this->_term->field === null)? '' : $this->_term->field . ':')
  423. . $this->_term->text . '~'
  424. . (($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY)? round($this->_minimumSimilarity, 4) : '')
  425. . (($this->getBoost() != 1)? '^' . round($this->getBoost(), 4) : '');
  426. }
  427. }