PageRenderTime 43ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/library/Zend/Search/Lucene/Search/Query/Preprocessing/Term.php

https://github.com/necrogami/zf2
PHP | 324 lines | 171 code | 53 blank | 100 comment | 35 complexity | 595c92e2ab53dba3e35e59d86df227fc MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. namespace Zend\Search\Lucene\Search\Query\Preprocessing;
  22. use Zend\Search\Lucene,
  23. Zend\Search\Lucene\Index,
  24. Zend\Search\Lucene\Search\Query,
  25. Zend\Search\Lucene\Analysis\Analyzer,
  26. Zend\Search\Lucene\Search\Highlighter\HighlighterInterface as Highlighter,
  27. Zend\Search\Lucene\Search\Exception\QueryParserException;
  /**
   * Preprocessing query for a single term lexeme produced by the query parser.
   *
   * It's an internal class intended to finalize query processing after query parsing:
   * {@link rewrite()} converts the raw word into a primitive query (term, multi-term,
   * wildcard, insignificant or empty-result query) in the context of an index.
   * This type of query is not actually involved into query execution.
   *
   * @category Zend
   * @package Zend_Search_Lucene
   * @subpackage Search
   * @internal
   * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
   * @license http://framework.zend.com/license/new-bsd New BSD License
   */
  class Term extends AbstractPreprocessing
  {
      /**
       * Word (query parser lexeme) to find.
       *
       * @var string
       */
      private $_word;

      /**
       * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_encoding;

      /**
       * Field name. Null means "search through the default field or all index fields".
       *
       * @var string|null
       */
      private $_field;

      /**
       * Class constructor. Create a new preprocessing object for a term query.
       *
       * @param string      $word      Non-tokenized word (query parser lexeme) to search.
       * @param string      $encoding  Word encoding.
       * @param string|null $fieldName Field name (null to search the default field / all fields).
       */
      public function __construct($word, $encoding, $fieldName)
      {
          $this->_word     = $word;
          $this->_encoding = $encoding;
          $this->_field    = $fieldName;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * Recognition order:
       *  1. no field given   - the word is rewritten for every search field and the results are merged;
       *  2. exact term match - the untouched word exists in the index for the field;
       *  3. wildcard query   - the word contains '*' or '?';
       *  4. analyzed word    - zero tokens (insignificant), one token (term) or several tokens (multi-term).
       *
       * @param \Zend\Search\Lucene\SearchIndexInterface $index
       * @throws \Zend\Search\Lucene\Search\Exception\QueryParserException
       * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
       */
      public function rewrite(Lucene\SearchIndexInterface $index)
      {
          if ($this->_field === null) {
              return $this->_rewriteAcrossFields($index);
          }

          // -------------------------------------
          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          $term = new Index\Term($this->_word, $this->_field);
          if ($index->hasTerm($term)) {
              $query = new Query\Term($term);
              $query->setBoost($this->getBoost());

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // -------------------------------------
          // Recognize wildcard queries
          $pattern = $this->_buildWildcardPattern();
          if ($pattern === false) {
              throw new QueryParserException('Wildcard search is supported only for non-multiple word terms');
          }
          if ($pattern !== null) {
              $term  = new Index\Term($pattern, $this->_field);
              $query = new Query\Wildcard($term);
              $query->setBoost($this->getBoost());

              // Get rewritten query. Important! It also fills terms matching container.
              $rewrittenQuery = $query->rewrite($index);
              $this->_matches = $query->getQueryTerms();

              return $rewrittenQuery;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) == 0) {
              $this->_matches = array();
              return new Query\Insignificant();
          }

          if (count($tokens) == 1) {
              $term  = new Index\Term($tokens[0]->getTermText(), $this->_field);
              $query = new Query\Term($term);
              $query->setBoost($this->getBoost());

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // It's not insignificant or one term query
          $query = new Query\MultiTerm();

          /**
           * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
           * analyzer design features
           */
          foreach ($tokens as $token) {
              $term = new Index\Term($token->getTermText(), $this->_field);
              $query->addTerm($term, true); // all subterms are required
          }

          $query->setBoost($this->getBoost());

          $this->_matches = $query->getQueryTerms();
          return $query;
      }

      /**
       * Query specific matches highlighting
       *
       * Fields detection is skipped: all fields are expected to be presented in the HTML body
       * and are not differentiated. Exact term matching recognition is skipped as well, keyword
       * fields highlighting is not supported.
       *
       * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Highlighter $highlighter)
      {
          // -------------------------------------
          // Recognize wildcard queries
          $pattern = $this->_buildWildcardPattern();
          if ($pattern === false) {
              // Multi-word wildcard fragment: do nothing (nothing is highlighted)
              return;
          }
          if ($pattern !== null) {
              $term  = new Index\Term($pattern, $this->_field);
              $query = new Query\Wildcard($term);

              $query->_highlightMatches($highlighter);
              return;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) == 0) {
              // Do nothing
              return;
          }

          if (count($tokens) == 1) {
              $highlighter->highlight($tokens[0]->getTermText());
              return;
          }

          // It's not insignificant or one term query
          $words = array();
          foreach ($tokens as $token) {
              $words[] = $token->getTermText();
          }
          $highlighter->highlight($words);
      }

      /**
       * Print a query
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          if ($this->_field !== null) {
              $query = $this->_field . ':';
          } else {
              $query = '';
          }

          $query .= $this->_word;

          if ($this->getBoost() != 1) {
              $query .= '^' . round($this->getBoost(), 4);
          }

          return $query;
      }

      /**
       * Rewrite the word for every search field and merge the resulting terms into one multi-term query.
       *
       * The search fields are the default search field when one is configured, otherwise the
       * field names reported by the index.
       *
       * @param \Zend\Search\Lucene\SearchIndexInterface $index
       * @throws \Zend\Search\Lucene\Search\Exception\QueryParserException
       * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
       */
      private function _rewriteAcrossFields(Lucene\SearchIndexInterface $index)
      {
          $query = new Query\MultiTerm();
          $query->setBoost($this->getBoost());

          $defaultField = Lucene\Lucene::getDefaultSearchField();
          if ($defaultField === null) {
              $searchFields = $index->getFieldNames(true);
          } else {
              $searchFields = array($defaultField);
          }

          $hasInsignificantSubqueries = false;
          foreach ($searchFields as $fieldName) {
              $subquery          = new self($this->_word, $this->_encoding, $fieldName);
              $rewrittenSubquery = $subquery->rewrite($index);

              foreach ($rewrittenSubquery->getQueryTerms() as $term) {
                  $query->addTerm($term);
              }

              if ($rewrittenSubquery instanceof Query\Insignificant) {
                  $hasInsignificantSubqueries = true;
              }
          }

          if (count($query->getTerms()) == 0) {
              $this->_matches = array();

              // No terms at all: report "insignificant" if any field produced it, "empty" otherwise
              if ($hasInsignificantSubqueries) {
                  return new Query\Insignificant();
              }
              return new Query\EmptyResult();
          }

          $this->_matches = $query->getQueryTerms();
          return $query;
      }

      /**
       * Build a wildcard pattern from the word if it contains '*' or '?' wildcards.
       *
       * The word is split at wildcard characters; every fragment is passed through the default
       * analyzer and the analyzed fragments are glued back together with the original wildcards.
       *
       * @return string|null|false Wildcard pattern; null if the word is not a wildcard expression;
       *                           false if some fragment is tokenized into more than one token.
       */
      private function _buildWildcardPattern()
      {
          /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
          // '@' is kept on purpose: the probe pattern fails to compile when PCRE lacks Unicode support
          if (@preg_match('/\pL/u', 'a') == 1) {
              $word                = iconv($this->_encoding, 'UTF-8', $this->_word);
              $wildcardsPattern    = '/[*?]/u';
              $subPatternsEncoding = 'UTF-8';

              if ($word === false) {
                  // Conversion failed: nothing to split, so the word is not treated as a wildcard expression
                  return null;
              }
          } else {
              $word                = $this->_word;
              $wildcardsPattern    = '/[*?]/';
              $subPatternsEncoding = $this->_encoding;
          }

          $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);

          // preg_split() returns false on failure; count(false) is a TypeError on PHP 8
          if (!is_array($subPatterns) || count($subPatterns) <= 1) {
              return null;
          }

          // Wildcard query is recognized
          $analyzer = Analyzer\Analyzer::getDefault();
          $pattern  = '';

          foreach ($subPatterns as $id => $subPattern) {
              // Append corresponding wildcard character to the pattern before each sub-pattern (except first).
              // $subPattern[1] is the byte offset of the fragment, so the byte right before it is the wildcard.
              if ($id != 0) {
                  $pattern .= $word[ $subPattern[1] - 1 ];
              }

              // Check if each sub-pattern is a single word in terms of current analyzer
              $tokens = $analyzer->tokenize($subPattern[0], $subPatternsEncoding);
              if (count($tokens) > 1) {
                  return false;
              }

              foreach ($tokens as $token) {
                  $pattern .= $token->getTermText();
              }
          }

          return $pattern;
      }
  }