PageRenderTime 40ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/inc/lib/Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php

https://bitbucket.org/yoander/mtrack
PHP | 287 lines | 137 code | 46 blank | 104 comment | 32 complexity | d47dfc156babaabf4cdc391750931b60 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Fuzzy.php 23775 2011-03-01 17:25:24Z ralph $
  21. */
  22. /** Zend_Search_Lucene_Search_Query_Preprocessing */
  23. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  24. /**
  25. * It's an internal class intended to finalize query processing after query parsing.
  26. * This type of query is not actually involved into query execution.
  27. *
  28. * @category Zend
  29. * @package Zend_Search_Lucene
  30. * @subpackage Search
  31. * @internal
  32. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  class Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy extends Zend_Search_Lucene_Search_Query_Preprocessing
  {
      /**
       * Word (query parser lexeme) to find.
       *
       * @var string
       */
      private $_word;

      /**
       * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_encoding;

      /**
       * Field name. NULL means "no explicit field": rewrite() then fans the
       * query out over the default search field or all index field names.
       *
       * @var string|null
       */
      private $_field;

      /**
       * A value between 0 and 1 to set the required similarity
       * between the query term and the matching terms. For example, for a
       * _minimumSimilarity of 0.5 a term of the same length
       * as the query term is considered similar to the query term if the edit distance
       * between both terms is less than length(term)*0.5
       *
       * @var float
       */
      private $_minimumSimilarity;

      /**
       * Class constructor. Create a new preprocessing object for a fuzzy query.
       *
       * @param string $word              Non-tokenized word (query parser lexeme) to search.
       * @param string $encoding          Word encoding.
       * @param string $fieldName         Field name.
       * @param float  $minimumSimilarity Minimum similarity.
       */
      public function __construct($word, $encoding, $fieldName, $minimumSimilarity)
      {
          $this->_word              = $word;
          $this->_encoding          = $encoding;
          $this->_field             = $fieldName;
          $this->_minimumSimilarity = $minimumSimilarity;
      }

      /**
       * Re-write query into primitive queries in the context of specified index.
       *
       * Resolution order:
       *  1. no field set      -> rewrite once per search field and combine the results;
       *  2. exact index term  -> fuzzy query on the raw word (keyword fields);
       *  3. wildcards present -> rejected with a parser exception;
       *  4. analyzer tokens   -> 0 tokens: insignificant query, 1 token: fuzzy query,
       *                          several tokens: rejected with a parser exception.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       * @throws Zend_Search_Lucene_Search_QueryParserException if the word contains wildcards
       *         (outside of an exact term match) or is tokenized into several tokens
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          if ($this->_field === null) {
              return $this->_rewriteForSearchFields($index);
          }

          // -------------------------------------
          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          require_once 'Zend/Search/Lucene/Index/Term.php';
          $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
          if ($index->hasTerm($term)) {
              return $this->_rewriteAsFuzzyTerm($term, $index);
          }

          // -------------------------------------
          // Recognize wildcard queries
          if ($this->_wordContainsWildcards()) {
              require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
              throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens     = $this->_tokenizeWord();
          $tokenCount = count($tokens);

          if ($tokenCount == 0) {
              $this->_matches = array();
              require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          if ($tokenCount == 1) {
              // Term.php has already been loaded above for the exact-match probe.
              $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
              return $this->_rewriteAsFuzzyTerm($term, $index);
          }

          // Word is tokenized into several tokens
          require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
          throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
      }

      /**
       * Rewrite a query that has no explicit field.
       *
       * One preprocessing sub-query is created and rewritten per search field
       * (the default search field if one is configured, otherwise the names
       * returned by $index->getFieldNames(true)). Insignificant and empty
       * results are left out of the combined boolean query.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      private function _rewriteForSearchFields(Zend_Search_Lucene_Interface $index)
      {
          require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
          $query = new Zend_Search_Lucene_Search_Query_Boolean();
          $hasInsignificantSubqueries = false;

          require_once 'Zend/Search/Lucene.php';
          $defaultField = Zend_Search_Lucene::getDefaultSearchField();
          if ($defaultField === null) {
              $searchFields = $index->getFieldNames(true);
          } else {
              $searchFields = array($defaultField);
          }

          foreach ($searchFields as $fieldName) {
              // This class is necessarily loaded already, so no require_once is needed here.
              $subquery = new self($this->_word,
                                   $this->_encoding,
                                   $fieldName,
                                   $this->_minimumSimilarity);
              $rewrittenSubquery = $subquery->rewrite($index);

              if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                  // Remembered so an all-insignificant result is reported as such below.
                  $hasInsignificantSubqueries = true;
              } elseif (!($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Empty)) {
                  $query->addSubquery($rewrittenSubquery);
              }
          }

          $subqueries    = $query->getSubqueries();
          $subqueryCount = count($subqueries);

          if ($subqueryCount == 0) {
              $this->_matches = array();
              if ($hasInsignificantSubqueries) {
                  require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
                  return new Zend_Search_Lucene_Search_Query_Insignificant();
              }

              require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
              return new Zend_Search_Lucene_Search_Query_Empty();
          }

          if ($subqueryCount == 1) {
              // A single sub-query is returned directly instead of a one-element boolean query.
              $query = reset($subqueries);
          }

          $query->setBoost($this->getBoost());
          $this->_matches = $query->getQueryTerms();

          return $query;
      }

      /**
       * Build a fuzzy query for the given term, rewrite it against the index
       * and store the fuzzy query's terms in $this->_matches.
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param Zend_Search_Lucene_Interface  $index
       * @return Zend_Search_Lucene_Search_Query
       */
      private function _rewriteAsFuzzyTerm(Zend_Search_Lucene_Index_Term $term, Zend_Search_Lucene_Interface $index)
      {
          require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
          $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
          $query->setBoost($this->getBoost());

          // Get rewritten query. Important! It also fills terms matching container,
          // so getQueryTerms() must be called only after rewrite().
          $rewrittenQuery = $query->rewrite($index);
          $this->_matches = $query->getQueryTerms();

          return $rewrittenQuery;
      }

      /**
       * Tell whether the word contains wildcard characters ('*' or '?').
       *
       * @return boolean
       */
      private function _wordContainsWildcards()
      {
          /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
          // Probe for PCRE unicode support; '@' keeps the probe silent when it fails.
          if (@preg_match('/\pL/u', 'a') == 1) {
              $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
          } else {
              $subPatterns = preg_split('/[*?]/', $this->_word);
          }

          // More than one piece means at least one wildcard split the word.
          return count($subPatterns) > 1;
      }

      /**
       * Tokenize the word with the default analyzer, using the word encoding.
       *
       * @return array Tokens produced by the default analyzer
       */
      private function _tokenizeWord()
      {
          require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

          return Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
      }

      /**
       * Query specific matches highlighting.
       *
       * Nothing is highlighted when the word contains wildcards or when it is
       * not tokenized into exactly one token.
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
          /** Skip exact term matching recognition, keyword fields highlighting is not supported */

          // -------------------------------------
          // Recognize wildcard queries
          if ($this->_wordContainsWildcards()) {
              // Do nothing
              return;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = $this->_tokenizeWord();
          if (count($tokens) != 1) {
              // No tokens: nothing to highlight.
              // Several tokens: fuzzy search is supported only for non-multiple word terms.
              return;
          }

          require_once 'Zend/Search/Lucene/Index/Term.php';
          $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);

          require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
          $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);

          // Delegate highlighting to the primitive fuzzy query.
          $query->_highlightMatches($highlighter);
      }

      /**
       * Print a query.
       *
       * Format: "[field:]word[^boost]"; the boost suffix is printed only when
       * the boost differs from 1 and is rounded to 4 decimal places.
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          $query  = ($this->_field !== null) ? $this->_field . ':' : '';
          $query .= $this->_word;

          // Loose comparison on purpose: the boost may be stored as int or float.
          if ($this->getBoost() != 1) {
              $query .= '^' . round($this->getBoost(), 4);
          }

          return $query;
      }
  }