PageRenderTime 44ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/thirdparty/Zend/Search/Lucene/Search/Query/Preprocessing/Phrase.php

https://github.com/silverstripe/silverstripe-docsviewer
PHP | 280 lines | 118 code | 38 blank | 124 comment | 18 complexity | 0512315589ce11caef2ae00eddad8d43 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Phrase.php 20096 2010-01-06 02:05:09Z bkarwin $
  21. */
  22. /**
  23. * Zend_Search_Lucene_Search_Query_Preprocessing
  24. */
  25. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  26. /**
  27. * It's an internal class intended to finalize query processing after query parsing.
  28. * This type of query is not actually involved into query execution.
  29. *
  30. * @category Zend
  31. * @package Zend_Search_Lucene
  32. * @subpackage Search
  33. * @internal
  34. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  35. * @license http://framework.zend.com/license/new-bsd New BSD License
  36. */
  class Zend_Search_Lucene_Search_Query_Preprocessing_Phrase extends Zend_Search_Lucene_Search_Query_Preprocessing
  {
      /**
       * Phrase to find.
       *
       * @var string
       */
      private $_phrase;

      /**
       * Phrase encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_phraseEncoding;

      /**
       * Field name. NULL means "no field specified"; rewrite() then fans the
       * phrase out over several fields.
       *
       * @var string|null
       */
      private $_field;

      /**
       * Sets the number of other words permitted between words in query phrase.
       * If zero, then this is an exact phrase search. For larger values this works
       * like a WITHIN or NEAR operator.
       *
       * The slop is in fact an edit-distance, where the units correspond to
       * moves of terms in the query phrase out of position. For example, to switch
       * the order of two words requires two moves (the first move places the words
       * atop one another), so to permit re-orderings of phrases, the slop must be
       * at least two.
       * More exact matches are scored higher than sloppier matches, thus search
       * results are sorted by exactness.
       *
       * The slop is zero by default, requiring exact matches.
       *
       * @var integer
       */
      private $_slop = 0;

      /**
       * Class constructor. Create a new preprocessing object for phrase query.
       *
       * @param string      $phrase         Phrase to search.
       * @param string      $phraseEncoding Phrase encoding.
       * @param string|null $fieldName      Field name, or NULL to search without a specific field.
       */
      public function __construct($phrase, $phraseEncoding, $fieldName)
      {
          $this->_phrase         = $phrase;
          $this->_phraseEncoding = $phraseEncoding;
          $this->_field          = $fieldName;
      }

      /**
       * Set slop
       *
       * @param integer $slop
       */
      public function setSlop($slop)
      {
          $this->_slop = $slop;
      }

      /**
       * Get slop
       *
       * @return integer
       */
      public function getSlop()
      {
          return $this->_slop;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * Resolution order:
       *  1. No field name: build a boolean query with one rewritten phrase
       *     subquery per search field.
       *  2. The raw phrase exists in the index as a single term of the field:
       *     return a term query for it.
       *  3. Otherwise tokenize the phrase with the default analyzer and return
       *     an insignificant query (no tokens), a term query (one token) or a
       *     phrase query (several tokens).
       *
       * Every returned term, phrase and boolean query carries this object's
       * boost; the terms of the result are stored in $this->_matches.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          // Wildcards are intentionally allowed within phrases: they are either
          // removed by the text analyzer or used as a part of the keyword for
          // keyword fields, so no QueryParserException is raised for '?' or '*'.

          // Split query into subqueries if field name is not specified
          if ($this->_field === null) {
              include_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
              $query = new Zend_Search_Lucene_Search_Query_Boolean();
              $query->setBoost($this->getBoost());

              include_once 'Zend/Search/Lucene.php';
              $defaultField = Zend_Search_Lucene::getDefaultSearchField();
              if ($defaultField === null) {
                  $searchFields = $index->getFieldNames(true);
              } else {
                  $searchFields = array($defaultField);
              }

              foreach ($searchFields as $fieldName) {
                  // The boost stays on the enclosing boolean query; subqueries
                  // only inherit the phrase, its encoding and the slop.
                  $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Phrase(
                      $this->_phrase,
                      $this->_phraseEncoding,
                      $fieldName
                  );
                  $subquery->setSlop($this->getSlop());
                  $query->addSubquery($subquery->rewrite($index));
              }

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          include_once 'Zend/Search/Lucene/Index/Term.php';
          $term = new Zend_Search_Lucene_Index_Term($this->_phrase, $this->_field);
          if ($index->hasTerm($term)) {
              include_once 'Zend/Search/Lucene/Search/Query/Term.php';
              $query = new Zend_Search_Lucene_Search_Query_Term($term);
              $query->setBoost($this->getBoost());
              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // tokenize phrase using current analyzer and process it as a phrase query
          $tokens     = $this->_tokenizePhrase();
          $tokenCount = count($tokens);

          if ($tokenCount === 0) {
              $this->_matches = array();
              include_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          if ($tokenCount === 1) {
              $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
              include_once 'Zend/Search/Lucene/Search/Query/Term.php';
              $query = new Zend_Search_Lucene_Search_Query_Term($term);
              $query->setBoost($this->getBoost());
              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // It's non-trivial phrase query
          include_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
          $query = new Zend_Search_Lucene_Search_Query_Phrase();
          // Slop and boost do not depend on the individual tokens, so they are
          // applied once. The boost was previously not propagated on this path,
          // unlike the term and boolean branches above.
          $query->setSlop($this->getSlop());
          $query->setBoost($this->getBoost());

          // Positions start at -1 so that the first token's increment yields
          // its position; later increments accumulate on top of it.
          $position = -1;
          foreach ($tokens as $token) {
              $position += $token->getPositionIncrement();
              $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
              $query->addTerm($term, $position);
          }

          $this->_matches = $query->getQueryTerms();
          return $query;
      }

      /**
       * Query specific matches highlighting
       *
       * Field detection, exact (keyword) term matching and wildcard recognition
       * are skipped on purpose: all fields are expected to be presented in the
       * HTML body without differentiation, keyword fields highlighting is not
       * supported, and supported wildcards are removed by the text analyzer.
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          // tokenize phrase using current analyzer and process it as a phrase query
          $tokens     = $this->_tokenizePhrase();
          $tokenCount = count($tokens);

          if ($tokenCount === 0) {
              // Do nothing
              return;
          }

          if ($tokenCount === 1) {
              // A single token is passed to the highlighter as a plain string
              $highlighter->highlight($tokens[0]->getTermText());
              return;
          }

          // It's non-trivial phrase query: hand over the list of token texts
          $words = array();
          foreach ($tokens as $token) {
              $words[] = $token->getTermText();
          }
          $highlighter->highlight($words);
      }

      /**
       * Print a query
       *
       * Produces [field:]"phrase"[~slop][^boost]; the slop suffix is omitted
       * when the slop is zero and the boost suffix when the boost is 1.
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          $query = ($this->_field !== null) ? $this->_field . ':' : '';

          $query .= '"' . $this->_phrase . '"';

          if ($this->_slop != 0) {
              $query .= '~' . $this->_slop;
          }

          if ($this->getBoost() != 1) {
              $query .= '^' . round($this->getBoost(), 4);
          }

          return $query;
      }

      /**
       * Tokenize the phrase with the default analyzer, using the phrase encoding.
       *
       * @return array Tokens produced by the analyzer (may be empty)
       */
      private function _tokenizePhrase()
      {
          include_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
          return Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);
      }
  }