PageRenderTime 58ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/ManoWars/libs/Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php

https://github.com/misterXavier/ManoWars
PHP | 287 lines | 128 code | 50 blank | 109 comment | 32 complexity | d3f3705bd16a00af617380a7afa28936 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Fuzzy.php 16971 2009-07-22 18:05:45Z mikaelkael $
  21. */
  22. /** Zend_Search_Lucene_Search_Query_Preprocessing */
  23. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  24. /** Zend_Search_Lucene_Search_Query_Phrase */
  25. require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
  26. /** Zend_Search_Lucene_Search_Query_Insignificant */
  27. require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  28. /** Zend_Search_Lucene_Search_Query_Empty */
  29. require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  30. /** Zend_Search_Lucene_Search_Query_Term */
  31. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  32. /** Zend_Search_Lucene_Index_Term */
  33. require_once 'Zend/Search/Lucene/Index/Term.php';
  /**
   * Query-parser preprocessing node for a fuzzy search lexeme.
   *
   * It's an internal class intended to finalize query processing after query parsing:
   * rewrite() replaces this object with primitive queries.
   * This type of query is not actually involved into query execution.
   *
   * @category Zend
   * @package Zend_Search_Lucene
   * @subpackage Search
   * @internal
   * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
   * @license http://framework.zend.com/license/new-bsd New BSD License
   */
  class Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy extends Zend_Search_Lucene_Search_Query_Preprocessing
  {
      /**
       * Word (query parser lexeme) to find.
       *
       * @var string
       */
      private $_word;

      /**
       * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_encoding;

      /**
       * Field name. Null means "no particular field": rewrite() then expands the
       * query over the default search field or over the index field names.
       *
       * @var string|null
       */
      private $_field;

      /**
       * A value between 0 and 1 to set the required similarity
       * between the query term and the matching terms. For example, for a
       * _minimumSimilarity of 0.5 a term of the same length
       * as the query term is considered similar to the query term if the edit distance
       * between both terms is less than length(term)*0.5
       *
       * @var float
       */
      private $_minimumSimilarity;

      /**
       * Class constructor. Create a new preprocessing object for a fuzzy query.
       *
       * @param string $word Non-tokenized word (query parser lexeme) to search.
       * @param string $encoding Word encoding.
       * @param string $fieldName Field name.
       * @param float $minimumSimilarity minimum similarity
       */
      public function __construct($word, $encoding, $fieldName, $minimumSimilarity)
      {
          $this->_word              = $word;
          $this->_encoding          = $encoding;
          $this->_field             = $fieldName;
          $this->_minimumSimilarity = $minimumSimilarity;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * When no field is set, one fuzzy preprocessing query is rewritten per search
       * field and the significant results are combined. Otherwise the word is first
       * tried as an exact index term, then checked for wildcards, then tokenized with
       * the default analyzer.
       *
       * As a side effect $this->_matches is assigned the terms of the resulting query
       * (the property is not declared in this class; presumably inherited - confirm).
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       * @throws Zend_Search_Lucene_Search_QueryParserException if the word contains
       *         wildcards (and is not an exact index term) or is tokenized into
       *         several tokens
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          if ($this->_field === null) {
              // Classes used below are not loaded by this file's top-level requires,
              // so load them lazily (same pattern as the exception class further down).
              require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
              $query = new Zend_Search_Lucene_Search_Query_Boolean();
              $hasInsignificantSubqueries = false;

              require_once 'Zend/Search/Lucene.php';
              $defaultField = Zend_Search_Lucene::getDefaultSearchField();
              if ($defaultField === null) {
                  $searchFields = $index->getFieldNames(true);
              } else {
                  $searchFields = array($defaultField);
              }

              foreach ($searchFields as $fieldName) {
                  $subquery = new self($this->_word,
                                       $this->_encoding,
                                       $fieldName,
                                       $this->_minimumSimilarity);
                  $rewrittenSubquery = $subquery->rewrite($index);

                  if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                      // Remember it: decides between Insignificant and Empty below.
                      $hasInsignificantSubqueries = true;
                  } elseif (!($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Empty)) {
                      $query->addSubquery($rewrittenSubquery);
                  }
              }

              $subqueries = $query->getSubqueries();

              if (count($subqueries) == 0) {
                  $this->_matches = array();
                  if ($hasInsignificantSubqueries) {
                      return new Zend_Search_Lucene_Search_Query_Insignificant();
                  }
                  return new Zend_Search_Lucene_Search_Query_Empty();
              }

              if (count($subqueries) == 1) {
                  // A single subquery does not need the boolean wrapper.
                  $query = reset($subqueries);
              }

              $query->setBoost($this->getBoost());
              $this->_matches = $query->getQueryTerms();

              return $query;
          }

          // -------------------------------------
          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
          if ($index->hasTerm($term)) {
              return $this->_rewriteFuzzyTerm($term, $index);
          }

          // -------------------------------------
          // Recognize wildcard queries
          if ($this->_wordHasWildcards()) {
              require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
              throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
          $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) == 0) {
              $this->_matches = array();
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          if (count($tokens) == 1) {
              $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
              return $this->_rewriteFuzzyTerm($term, $index);
          }

          // Word is tokenized into several tokens
          require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
          throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
      }

      /**
       * Query specific matches highlighting
       *
       * Does nothing when the word contains wildcards or is not tokenized into
       * exactly one token; otherwise delegates to a fuzzy query built for that token.
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

          /** Skip exact term matching recognition, keyword fields highlighting is not supported */

          // -------------------------------------
          // Recognize wildcard queries
          if ($this->_wordHasWildcards()) {
              // Do nothing
              return;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
          $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) != 1) {
              // No tokens: nothing to highlight.
              // Several tokens: fuzzy search is supported only for non-multiple word terms.
              return;
          }

          $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);

          require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
          $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
          $query->_highlightMatches($highlighter);
      }

      /**
       * Print a query
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          $query = ($this->_field !== null) ? $this->_field . ':' : '';
          $query .= $this->_word;

          $boost = $this->getBoost();
          if ($boost != 1) {
              $query .= '^' . round($boost, 4);
          }

          return $query;
      }

      /**
       * Build a fuzzy query for a single term, rewrite it and record its matches.
       *
       * @param Zend_Search_Lucene_Index_Term $term
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      private function _rewriteFuzzyTerm(Zend_Search_Lucene_Index_Term $term, Zend_Search_Lucene_Interface $index)
      {
          require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
          $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
          $query->setBoost($this->getBoost());

          // Get rewritten query. Important! It also fills terms matching container.
          $rewrittenQuery = $query->rewrite($index);
          $this->_matches = $query->getQueryTerms();

          return $rewrittenQuery;
      }

      /**
       * Tell whether the word contains a wildcard character ('*' or '?').
       *
       * @return boolean
       */
      private function _wordHasWildcards()
      {
          /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
          // '@' is deliberate: the probe pattern is rejected when PCRE has no unicode support.
          if (@preg_match('/\pL/u', 'a') == 1) {
              $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
          } else {
              $subPatterns = preg_split('/[*?]/', $this->_word);
          }

          // preg_split() returns false on failure; count(false) raises a TypeError on PHP 8.
          return is_array($subPatterns) && count($subPatterns) > 1;
      }
  }