PageRenderTime 55ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/vendor/Zend/Search/Lucene/Search/Query/Preprocessing/Term.php

https://bitbucket.org/anycode/sfluceneplugin
PHP | 341 lines | 186 code | 54 blank | 101 comment | 35 complexity | a32184003d550758990dbf17ee9d06ef MD5 | raw file
Possible License(s): BSD-3-Clause
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Term.php 20096 2010-01-06 02:05:09Z bkarwin $
  21. */
  22. /** Zend_Search_Lucene_Search_Query_Preprocessing */
  23. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  24. /**
  25. * It's an internal abstract class intended to finalize ase a query processing after query parsing.
  26. * This type of query is not actually involved into query execution.
  27. *
  28. * @category Zend
  29. * @package Zend_Search_Lucene
  30. * @subpackage Search
  31. * @internal
  32. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  35. class Zend_Search_Lucene_Search_Query_Preprocessing_Term extends Zend_Search_Lucene_Search_Query_Preprocessing
  36. {
  37. /**
  38. * word (query parser lexeme) to find.
  39. *
  40. * @var string
  41. */
  42. private $_word;
  43. /**
  44. * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
  45. *
  46. * @var string
  47. */
  48. private $_encoding;
  49. /**
  50. * Field name.
  51. *
  52. * @var string
  53. */
  54. private $_field;
  55. /**
  56. * Class constructor. Create a new preprocessing object for prase query.
  57. *
  58. * @param string $word Non-tokenized word (query parser lexeme) to search.
  59. * @param string $encoding Word encoding.
  60. * @param string $fieldName Field name.
  61. */
  62. public function __construct($word, $encoding, $fieldName)
  63. {
  64. $this->_word = $word;
  65. $this->_encoding = $encoding;
  66. $this->_field = $fieldName;
  67. }
  68. /**
  69. * Re-write query into primitive queries in the context of specified index
  70. *
  71. * @param Zend_Search_Lucene_Interface $index
  72. * @return Zend_Search_Lucene_Search_Query
  73. */
  74. public function rewrite(Zend_Search_Lucene_Interface $index)
  75. {
  76. if ($this->_field === null) {
  77. require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
  78. $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
  79. $query->setBoost($this->getBoost());
  80. $hasInsignificantSubqueries = false;
  81. require_once 'Zend/Search/Lucene.php';
  82. if (Zend_Search_Lucene::getDefaultSearchField() === null) {
  83. $searchFields = $index->getFieldNames(true);
  84. } else {
  85. $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
  86. }
  87. require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Term.php';
  88. foreach ($searchFields as $fieldName) {
  89. $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word,
  90. $this->_encoding,
  91. $fieldName);
  92. $rewrittenSubquery = $subquery->rewrite($index);
  93. foreach ($rewrittenSubquery->getQueryTerms() as $term) {
  94. $query->addTerm($term);
  95. }
  96. if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
  97. $hasInsignificantSubqueries = true;
  98. }
  99. }
  100. if (count($query->getTerms()) == 0) {
  101. $this->_matches = array();
  102. if ($hasInsignificantSubqueries) {
  103. require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  104. return new Zend_Search_Lucene_Search_Query_Insignificant();
  105. } else {
  106. require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  107. return new Zend_Search_Lucene_Search_Query_Empty();
  108. }
  109. }
  110. $this->_matches = $query->getQueryTerms();
  111. return $query;
  112. }
  113. // -------------------------------------
  114. // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
  115. // encoding is not used since we expect binary matching
  116. require_once 'Zend/Search/Lucene/Index/Term.php';
  117. $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
  118. if ($index->hasTerm($term)) {
  119. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  120. $query = new Zend_Search_Lucene_Search_Query_Term($term);
  121. $query->setBoost($this->getBoost());
  122. $this->_matches = $query->getQueryTerms();
  123. return $query;
  124. }
  125. // -------------------------------------
  126. // Recognize wildcard queries
  127. /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
  128. if (@preg_match('/\pL/u', 'a') == 1) {
  129. $word = iconv($this->_encoding, 'UTF-8', $this->_word);
  130. $wildcardsPattern = '/[*?]/u';
  131. $subPatternsEncoding = 'UTF-8';
  132. } else {
  133. $word = $this->_word;
  134. $wildcardsPattern = '/[*?]/';
  135. $subPatternsEncoding = $this->_encoding;
  136. }
  137. $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
  138. if (count($subPatterns) > 1) {
  139. // Wildcard query is recognized
  140. $pattern = '';
  141. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  142. foreach ($subPatterns as $id => $subPattern) {
  143. // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
  144. if ($id != 0) {
  145. $pattern .= $word[ $subPattern[1] - 1 ];
  146. }
  147. // Check if each subputtern is a single word in terms of current analyzer
  148. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
  149. if (count($tokens) > 1) {
  150. require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
  151. throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
  152. }
  153. foreach ($tokens as $token) {
  154. $pattern .= $token->getTermText();
  155. }
  156. }
  157. require_once 'Zend/Search/Lucene/Index/Term.php';
  158. $term = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
  159. require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
  160. $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
  161. $query->setBoost($this->getBoost());
  162. // Get rewritten query. Important! It also fills terms matching container.
  163. $rewrittenQuery = $query->rewrite($index);
  164. $this->_matches = $query->getQueryTerms();
  165. return $rewrittenQuery;
  166. }
  167. // -------------------------------------
  168. // Recognize one-term multi-term and "insignificant" queries
  169. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  170. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
  171. if (count($tokens) == 0) {
  172. $this->_matches = array();
  173. require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  174. return new Zend_Search_Lucene_Search_Query_Insignificant();
  175. }
  176. if (count($tokens) == 1) {
  177. require_once 'Zend/Search/Lucene/Index/Term.php';
  178. $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
  179. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  180. $query = new Zend_Search_Lucene_Search_Query_Term($term);
  181. $query->setBoost($this->getBoost());
  182. $this->_matches = $query->getQueryTerms();
  183. return $query;
  184. }
  185. //It's not insignificant or one term query
  186. require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
  187. $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
  188. /**
  189. * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
  190. * analizer design features
  191. */
  192. require_once 'Zend/Search/Lucene/Index/Term.php';
  193. foreach ($tokens as $token) {
  194. $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
  195. $query->addTerm($term, true); // all subterms are required
  196. }
  197. $query->setBoost($this->getBoost());
  198. $this->_matches = $query->getQueryTerms();
  199. return $query;
  200. }
  201. /**
  202. * Query specific matches highlighting
  203. *
  204. * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
  205. */
  206. protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
  207. {
  208. /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
  209. /** Skip exact term matching recognition, keyword fields highlighting is not supported */
  210. // -------------------------------------
  211. // Recognize wildcard queries
  212. /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
  213. if (@preg_match('/\pL/u', 'a') == 1) {
  214. $word = iconv($this->_encoding, 'UTF-8', $this->_word);
  215. $wildcardsPattern = '/[*?]/u';
  216. $subPatternsEncoding = 'UTF-8';
  217. } else {
  218. $word = $this->_word;
  219. $wildcardsPattern = '/[*?]/';
  220. $subPatternsEncoding = $this->_encoding;
  221. }
  222. $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
  223. if (count($subPatterns) > 1) {
  224. // Wildcard query is recognized
  225. $pattern = '';
  226. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  227. foreach ($subPatterns as $id => $subPattern) {
  228. // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
  229. if ($id != 0) {
  230. $pattern .= $word[ $subPattern[1] - 1 ];
  231. }
  232. // Check if each subputtern is a single word in terms of current analyzer
  233. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
  234. if (count($tokens) > 1) {
  235. // Do nothing (nothing is highlighted)
  236. return;
  237. }
  238. foreach ($tokens as $token) {
  239. $pattern .= $token->getTermText();
  240. }
  241. }
  242. require_once 'Zend/Search/Lucene/Index/Term.php';
  243. $term = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
  244. require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
  245. $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
  246. $query->_highlightMatches($highlighter);
  247. return;
  248. }
  249. // -------------------------------------
  250. // Recognize one-term multi-term and "insignificant" queries
  251. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  252. $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
  253. if (count($tokens) == 0) {
  254. // Do nothing
  255. return;
  256. }
  257. if (count($tokens) == 1) {
  258. $highlighter->highlight($tokens[0]->getTermText());
  259. return;
  260. }
  261. //It's not insignificant or one term query
  262. $words = array();
  263. foreach ($tokens as $token) {
  264. $words[] = $token->getTermText();
  265. }
  266. $highlighter->highlight($words);
  267. }
  268. /**
  269. * Print a query
  270. *
  271. * @return string
  272. */
  273. public function __toString()
  274. {
  275. // It's used only for query visualisation, so we don't care about characters escaping
  276. if ($this->_field !== null) {
  277. $query = $this->_field . ':';
  278. } else {
  279. $query = '';
  280. }
  281. $query .= $this->_word;
  282. if ($this->getBoost() != 1) {
  283. $query .= '^' . round($this->getBoost(), 4);
  284. }
  285. return $query;
  286. }
  287. }