PageRenderTime 41ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/application/libraries/Zend/Search/Lucene/Search/Query/Preprocessing/Term.php

https://github.com/grandison/budo16
PHP | 335 lines | 164 code | 58 blank | 113 comment | 35 complexity | 1c288b28d278c386ff43a4bbee425bea MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Term.php 16971 2009-07-22 18:05:45Z mikaelkael $
  21. */
  22. /** Zend_Search_Lucene_Search_Query_Preprocessing */
  23. // require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
  24. /** Zend_Search_Lucene_Search_Query_Phrase */
  25. // require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
  26. /** Zend_Search_Lucene_Search_Query_Insignificant */
  27. // require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  28. /** Zend_Search_Lucene_Search_Query_Empty */
  29. // require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  30. /** Zend_Search_Lucene_Search_Query_Term */
  31. // require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  32. /** Zend_Search_Lucene_Index_Term */
  33. // require_once 'Zend/Search/Lucene/Index/Term.php';
  /**
   * Internal preprocessing query for a single query parser lexeme (a word).
   *
   * It is used to finalize query processing after query parsing.
   * This type of query is not actually involved into query execution:
   * rewrite() replaces it with primitive queries.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Search
   * @internal
   * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  class Zend_Search_Lucene_Search_Query_Preprocessing_Term extends Zend_Search_Lucene_Search_Query_Preprocessing
  {
      /**
       * Word (query parser lexeme) to find.
       *
       * @var string
       */
      private $_word;

      /**
       * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
       *
       * @var string
       */
      private $_encoding;

      /**
       * Field name. Null means "no field specified"; see rewrite() for how that case is expanded.
       *
       * @var string|null
       */
      private $_field;

      /**
       * Class constructor. Create a new preprocessing object for a term query.
       *
       * @param string      $word      Non-tokenized word (query parser lexeme) to search.
       * @param string      $encoding  Word encoding.
       * @param string|null $fieldName Field name (null if the lexeme is not bound to a field).
       */
      public function __construct($word, $encoding, $fieldName)
      {
          $this->_word     = $word;
          $this->_encoding = $encoding;
          $this->_field    = $fieldName;
      }

      /**
       * Build a wildcard pattern from the word, if the word contains wildcard characters ('*' or '?').
       *
       * The word is split at the wildcard characters; every piece between them is passed through the
       * default analyzer and the pattern is re-assembled from the resulting token texts and the
       * original wildcard characters.
       *
       * The result is tri-state so that callers can decide how to react to an unsupported pattern:
       *  - string: the wildcard pattern (it always contains at least one wildcard character);
       *  - null:   the word contains no wildcard characters, so it is not a wildcard query;
       *  - false:  the word contains wildcards, but at least one piece is tokenized into more than
       *            one token (multi-word wildcard terms are not supported).
       *
       * Callers must use strict comparison (=== null / === false) on the result.
       *
       * @return string|null|false
       */
      private function _buildWildcardPattern()
      {
          /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
          if (@preg_match('/\pL/u', 'a') == 1) {
              // PCRE is compiled with UTF-8 support: operate on a UTF-8 copy of the word
              $word                = iconv($this->_encoding, 'UTF-8', $this->_word);
              $wildcardsPattern    = '/[*?]/u';
              $subPatternsEncoding = 'UTF-8';
          } else {
              // No UTF-8 support: operate on the word as is, in its own encoding
              $word                = $this->_word;
              $wildcardsPattern    = '/[*?]/';
              $subPatternsEncoding = $this->_encoding;
          }

          $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);

          // preg_split() returns false on failure; treat that the same way as "no wildcards found"
          // instead of handing a non-countable value to count().
          if (!is_array($subPatterns) || count($subPatterns) <= 1) {
              return null;
          }

          $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
          $pattern  = '';

          foreach ($subPatterns as $id => $subPattern) {
              // $subPattern is array(text, offset) because of PREG_SPLIT_OFFSET_CAPTURE.
              // Append corresponding wildcard character to the pattern before each sub-pattern (except first):
              // it is the character located right before the sub-pattern offset.
              if ($id != 0) {
                  $pattern .= $word[ $subPattern[1] - 1 ];
              }

              // Check if each sub-pattern is a single word in terms of current analyzer
              $tokens = $analyzer->tokenize($subPattern[0], $subPatternsEncoding);
              if (count($tokens) > 1) {
                  return false;
              }

              // Zero or one token here: an empty piece simply contributes nothing to the pattern
              foreach ($tokens as $token) {
                  $pattern .= $token->getTermText();
              }
          }

          return $pattern;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * Recognition order:
       *  1. no field specified - the word is rewritten for each search field and the resulting
       *     terms are collected into one multi-term query;
       *  2. exact term match in the index;
       *  3. wildcard query;
       *  4. tokenization by the default analyzer: insignificant, single-term or multi-term query.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       * @throws Zend_Search_Lucene_Search_QueryParserException if a wildcard word contains a multi-word part
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          if ($this->_field === null) {
              $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
              $query->setBoost($this->getBoost());

              $hasInsignificantSubqueries = false;

              // Search through all indexed fields unless a default search field is configured
              if (Zend_Search_Lucene::getDefaultSearchField() === null) {
                  $searchFields = $index->getFieldNames(true);
              } else {
                  $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
              }

              foreach ($searchFields as $fieldName) {
                  $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word,
                                                                                     $this->_encoding,
                                                                                     $fieldName);
                  $rewrittenSubquery = $subquery->rewrite($index);
                  foreach ($rewrittenSubquery->getQueryTerms() as $term) {
                      $query->addTerm($term);
                  }

                  if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                      $hasInsignificantSubqueries = true;
                  }
              }

              if (count($query->getTerms()) == 0) {
                  $this->_matches = array();
                  if ($hasInsignificantSubqueries) {
                      return new Zend_Search_Lucene_Search_Query_Insignificant();
                  } else {
                      return new Zend_Search_Lucene_Search_Query_Empty();
                  }
              }

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // -------------------------------------
          // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
          // encoding is not used since we expect binary matching
          $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
          if ($index->hasTerm($term)) {
              $query = new Zend_Search_Lucene_Search_Query_Term($term);
              $query->setBoost($this->getBoost());

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // -------------------------------------
          // Recognize wildcard queries
          $pattern = $this->_buildWildcardPattern();
          if ($pattern === false) {
              throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
          }
          if ($pattern !== null) {
              // Wildcard query is recognized
              $term  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
              $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
              $query->setBoost($this->getBoost());

              // Get rewritten query. Important! It also fills terms matching container.
              $rewrittenQuery = $query->rewrite($index);
              $this->_matches = $query->getQueryTerms();

              return $rewrittenQuery;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) == 0) {
              $this->_matches = array();
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          if (count($tokens) == 1) {
              $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
              $query = new Zend_Search_Lucene_Search_Query_Term($term);
              $query->setBoost($this->getBoost());

              $this->_matches = $query->getQueryTerms();
              return $query;
          }

          // It's not insignificant or one term query
          $query = new Zend_Search_Lucene_Search_Query_MultiTerm();

          /**
           * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
           * analyzer design features
           */
          foreach ($tokens as $token) {
              $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
              $query->addTerm($term, true); // all subterms are required
          }

          $query->setBoost($this->getBoost());

          $this->_matches = $query->getQueryTerms();
          return $query;
      }

      /**
       * Query specific matches highlighting
       *
       * Follows the same recognition steps as rewrite(), except that field detection and exact
       * term matching are skipped, and an unsupported (multi-word) wildcard pattern results in
       * nothing being highlighted instead of an exception.
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

          /** Skip exact term matching recognition, keyword fields highlighting is not supported */

          // -------------------------------------
          // Recognize wildcard queries
          $pattern = $this->_buildWildcardPattern();
          if ($pattern === false) {
              // Multi-word wildcard term: do nothing (nothing is highlighted)
              return;
          }
          if ($pattern !== null) {
              // Wildcard query is recognized: delegate highlighting to the wildcard query
              $term  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
              $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);

              $query->_highlightMatches($highlighter);
              return;
          }

          // -------------------------------------
          // Recognize one-term multi-term and "insignificant" queries
          $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

          if (count($tokens) == 0) {
              // Do nothing
              return;
          }

          if (count($tokens) == 1) {
              $highlighter->highlight($tokens[0]->getTermText());
              return;
          }

          // It's not insignificant or one term query
          $words = array();
          foreach ($tokens as $token) {
              $words[] = $token->getTermText();
          }
          $highlighter->highlight($words);
      }

      /**
       * Print a query
       *
       * The result has the form "[field:]word[^boost]"; the boost is appended only when it differs from 1.
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          if ($this->_field !== null) {
              $query = $this->_field . ':';
          } else {
              $query = '';
          }

          $query .= $this->_word;

          if ($this->getBoost() != 1) {
              $query .= '^' . round($this->getBoost(), 4);
          }

          return $query;
      }
  }