PageRenderTime 27ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/library/Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php

https://github.com/Exercise/zf2
PHP | 286 lines | 127 code | 44 blank | 115 comment | 32 complexity | cfdf9ae9bd5fe252d19b06a36592ff8c MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /**
  23. * @namespace
  24. */
  25. namespace Zend\Search\Lucene\Search\Query\Preprocessing;
  26. use Zend\Search\Lucene;
  27. use Zend\Search\Lucene\Search\Query;
  28. use Zend\Search\Lucene\Index;
  29. use Zend\Search\Lucene\Search;
  30. use Zend\Search\Lucene\Analysis\Analyzer;
  31. use Zend\Search\Lucene\Search\Highlighter;
  32. /**
  33. * Internal preprocessing class intended to finalize fuzzy query processing after query parsing.
  34. * This type of query is not actually involved in query execution.
  35. *
  36. * @uses \Zend\Search\Lucene\Index
  37. * @uses \Zend\Search\Lucene\Analysis\Analyzer
  38. * @uses \Zend\Search\Lucene\Index\Term
  39. * @uses \Zend\Search\Lucene\Search\QueryParserException
  40. * @uses \Zend\Search\Lucene\Search\Query\Boolean
  41. * @uses \Zend\Search\Lucene\Search\Query\EmptyResult
  42. * @uses \Zend\Search\Lucene\Search\Query\Fuzzy
  43. * @uses \Zend\Search\Lucene\Search\Query\Insignificant
  44. * @uses \Zend\Search\Lucene\Search\Query\Preprocessing\AbstractPreprocessing
  45. * @category Zend
  46. * @package Zend_Search_Lucene
  47. * @subpackage Search
  48. * @internal
  49. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  50. * @license http://framework.zend.com/license/new-bsd New BSD License
  51. */
  class Fuzzy extends AbstractPreprocessing
  {
      /**
       * Word (query parser lexeme) to find.
       *
       * @var string
       */
      private $_word;

      /**
       * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_encoding;

      /**
       * Field name. Null means "no field specified": rewrite() then expands the
       * query over the default search field or over the index field names.
       *
       * @var string|null
       */
      private $_field;

      /**
       * A value between 0 and 1 to set the required similarity
       * between the query term and the matching terms. For example, for a
       * _minimumSimilarity of 0.5 a term of the same length
       * as the query term is considered similar to the query term if the edit distance
       * between both terms is less than length(term)*0.5
       *
       * @var float
       */
      private $_minimumSimilarity;

      /**
       * Class constructor. Create a new preprocessing object for a fuzzy query.
       *
       * @param string $word Non-tokenized word (query parser lexeme) to search.
       * @param string $encoding Word encoding.
       * @param string $fieldName Field name (null if the field is not specified).
       * @param float $minimumSimilarity Minimum similarity.
       */
      public function __construct($word, $encoding, $fieldName, $minimumSimilarity)
      {
          $this->_word              = $word;
          $this->_encoding          = $encoding;
          $this->_field             = $fieldName;
          $this->_minimumSimilarity = $minimumSimilarity;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * Resolution order:
       *  1. no field given - expand over the search fields and combine the results;
       *  2. the raw word exists in the index as a term of the field - fuzzy query on it;
       *  3. the word contains wildcards - rejected with an exception;
       *  4. otherwise the word is tokenized by the default analyzer and must
       *     produce at most one token.
       *
       * @param \Zend\Search\Lucene\SearchIndex $index
       * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
       * @throws \Zend\Search\Lucene\Search\QueryParserException if the word contains
       *         wildcards or is tokenized into more than one token
       */
      public function rewrite(Lucene\SearchIndex $index)
      {
          if ($this->_field === null) {
              return $this->_rewriteOverSearchFields($index);
          }

          // -------------------------------------
          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          $term = new Index\Term($this->_word, $this->_field);
          if ($index->hasTerm($term)) {
              return $this->_rewriteAsFuzzyTerm($term, $index);
          }

          // -------------------------------------
          // Recognize wildcard queries
          if ($this->_wordHasWildcards()) {
              throw new Search\QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) == 0) {
              $this->_matches = array();
              return new Query\Insignificant();
          }

          if (count($tokens) == 1) {
              $term = new Index\Term($tokens[0]->getTermText(), $this->_field);
              return $this->_rewriteAsFuzzyTerm($term, $index);
          }

          // Word is tokenized into several tokens
          throw new Search\QueryParserException('Fuzzy search is supported only for non-multiple word terms');
      }

      /**
       * Rewrite a query that has no field specified.
       *
       * A copy of this query is rewritten for every search field (the default
       * search field if one is configured, otherwise the names returned by
       * $index->getFieldNames(true)) and the significant, non-empty results are
       * collected into a boolean query.
       *
       * @param \Zend\Search\Lucene\SearchIndex $index
       * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
       */
      private function _rewriteOverSearchFields(Lucene\SearchIndex $index)
      {
          $query = new Search\Query\Boolean();

          $hasInsignificantSubqueries = false;

          if (Lucene\Lucene::getDefaultSearchField() === null) {
              $searchFields = $index->getFieldNames(true);
          } else {
              $searchFields = array(Lucene\Lucene::getDefaultSearchField());
          }

          foreach ($searchFields as $fieldName) {
              $subquery = new self($this->_word,
                                   $this->_encoding,
                                   $fieldName,
                                   $this->_minimumSimilarity);

              $rewrittenSubquery = $subquery->rewrite($index);

              if ($rewrittenSubquery instanceof Query\Insignificant) {
                  // Remembered so that "nothing matched" can be told apart from
                  // "nothing significant was asked for" below.
                  $hasInsignificantSubqueries = true;
              } elseif (!($rewrittenSubquery instanceof Query\EmptyResult)) {
                  $query->addSubquery($rewrittenSubquery);
              }
          }

          $subqueries = $query->getSubqueries();

          if (count($subqueries) == 0) {
              $this->_matches = array();

              if ($hasInsignificantSubqueries) {
                  return new Query\Insignificant();
              }

              return new Query\EmptyResult();
          }

          if (count($subqueries) == 1) {
              // A boolean wrapper around a single subquery is not needed
              $query = reset($subqueries);
          }

          $query->setBoost($this->getBoost());

          $this->_matches = $query->getQueryTerms();
          return $query;
      }

      /**
       * Build a fuzzy query for the given term, rewrite it against the index
       * and store the terms reported by the fuzzy query in $this->_matches.
       *
       * @param \Zend\Search\Lucene\Index\Term $term
       * @param \Zend\Search\Lucene\SearchIndex $index
       * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
       */
      private function _rewriteAsFuzzyTerm(Index\Term $term, Lucene\SearchIndex $index)
      {
          $query = new Query\Fuzzy($term, $this->_minimumSimilarity);
          $query->setBoost($this->getBoost());

          // Get rewritten query. Important! It also fills terms matching container.
          // (getQueryTerms() must therefore be called after rewrite().)
          $rewrittenQuery = $query->rewrite($index);
          $this->_matches = $query->getQueryTerms();

          return $rewrittenQuery;
      }

      /**
       * Check whether the word contains wildcard characters ('*' or '?').
       *
       * When PCRE is built with unicode support the word is converted from its
       * declared encoding to UTF-8 first; otherwise the raw bytes are inspected.
       *
       * @return boolean
       */
      private function _wordHasWildcards()
      {
          /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
          if (@preg_match('/\pL/u', 'a') == 1) {
              $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
          } else {
              $subPatterns = preg_split('/[*?]/', $this->_word);
          }

          // More than one piece means at least one wildcard was found
          return count($subPatterns) > 1;
      }

      /**
       * Query specific matches highlighting
       *
       * Nothing is highlighted when the word contains wildcards or when it is
       * not tokenized into exactly one token.
       *
       * @param \Zend\Search\Lucene\Search\Highlighter $highlighter Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Highlighter $highlighter)
      {
          /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

          /** Skip exact term matching recognition, keyword fields highlighting is not supported */

          // -------------------------------------
          // Recognize wildcard queries
          if ($this->_wordHasWildcards()) {
              // Do nothing
              return;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) != 1) {
              // No tokens: nothing to highlight.
              // Several tokens: fuzzy search is supported only for non-multiple word terms.
              return;
          }

          $term  = new Index\Term($tokens[0]->getTermText(), $this->_field);
          $query = new Query\Fuzzy($term, $this->_minimumSimilarity);

          $query->_highlightMatches($highlighter);
      }

      /**
       * Print a query
       *
       * Format: "[field:]word[^boost]"; the boost is printed only when it differs from 1.
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          $query = ($this->_field !== null) ? $this->_field . ':' : '';

          $query .= $this->_word;

          if ($this->getBoost() != 1) {
              $query .= '^' . round($this->getBoost(), 4);
          }

          return $query;
      }
  }