PageRenderTime 23ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/library/Zend/Search/Lucene/Search/Query/Wildcard.php

https://github.com/Shreef/zf2
PHP | 355 lines | 157 code | 47 blank | 151 comment | 40 complexity | 4702ad5c7b87870d5de288aee70b2e86 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /**
  22. * @namespace
  23. */
  24. namespace Zend\Search\Lucene\Search\Query;
  25. use Zend\Search\Lucene\Index;
  26. use Zend\Search\Lucene;
  27. use Zend\Search\Lucene\Analysis\Analyzer;
  28. use Zend\Search\Lucene\Search\Highlighter;
  29. /**
  30. * @uses \Zend\Search\Lucene\Index
  31. * @uses \Zend\Search\Lucene\Analysis\Analyzer\Analyzer
  32. * @uses \Zend\Search\Lucene\Exception
  33. * @uses \Zend\Search\Lucene\Index\Term
  34. * @uses \Zend\Search\Lucene\Search\Query
  35. * @category Zend
  36. * @package Zend_Search_Lucene
  37. * @subpackage Search
  38. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  39. * @license http://framework.zend.com/license/new-bsd New BSD License
  40. */
  41. class Wildcard extends AbstractQuery
  42. {
  43. /**
  44. * Search pattern.
  45. *
  46. * Field has to be fully specified or has to be null
  47. * Text may contain '*' or '?' symbols
  48. *
  49. * @var \Zend\Search\Lucene\Index\Term
  50. */
  51. private $_pattern;
  52. /**
  53. * Matched terms.
  54. *
  55. * Matched terms list.
  56. * It's filled during the search (rewrite operation) and may be used for search result
  57. * post-processing
  58. *
  59. * Array of Zend_Search_Lucene_Index_Term objects
  60. *
  61. * @var array
  62. */
  63. private $_matches = null;
  64. /**
  65. * Minimum term prefix length (number of minimum non-wildcard characters)
  66. *
  67. * @var integer
  68. */
  69. private static $_minPrefixLength = 3;
  70. /**
  71. * Zend_Search_Lucene_Search_Query_Wildcard constructor.
  72. *
  73. * @param \Zend\Search\Lucene\Index\Term $pattern
  74. */
  75. public function __construct(Index\Term $pattern)
  76. {
  77. $this->_pattern = $pattern;
  78. }
  79. /**
  80. * Get minimum prefix length
  81. *
  82. * @return integer
  83. */
  84. public static function getMinPrefixLength()
  85. {
  86. return self::$_minPrefixLength;
  87. }
  88. /**
  89. * Set minimum prefix length
  90. *
  91. * @param integer $minPrefixLength
  92. */
  93. public static function setMinPrefixLength($minPrefixLength)
  94. {
  95. self::$_minPrefixLength = $minPrefixLength;
  96. }
  97. /**
  98. * Get terms prefix
  99. *
  100. * @param string $word
  101. * @return string
  102. */
  103. private static function _getPrefix($word)
  104. {
  105. $questionMarkPosition = strpos($word, '?');
  106. $astrericPosition = strpos($word, '*');
  107. if ($questionMarkPosition !== false) {
  108. if ($astrericPosition !== false) {
  109. return substr($word, 0, min($questionMarkPosition, $astrericPosition));
  110. }
  111. return substr($word, 0, $questionMarkPosition);
  112. } else if ($astrericPosition !== false) {
  113. return substr($word, 0, $astrericPosition);
  114. }
  115. return $word;
  116. }
  117. /**
  118. * Re-write query into primitive queries in the context of specified index
  119. *
  120. * @param \Zend\Search\Lucene\SearchIndex $index
  121. * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  122. * @throws \Zend\Search\Lucene\Exception
  123. */
  124. public function rewrite(Lucene\SearchIndex $index)
  125. {
  126. $this->_matches = array();
  127. if ($this->_pattern->field === null) {
  128. // Search through all fields
  129. $fields = $index->getFieldNames(true /* indexed fields list */);
  130. } else {
  131. $fields = array($this->_pattern->field);
  132. }
  133. $prefix = self::_getPrefix($this->_pattern->text);
  134. $prefixLength = strlen($prefix);
  135. $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*') , preg_quote($this->_pattern->text, '/')) . '$/';
  136. if ($prefixLength < self::$_minPrefixLength) {
  137. throw new Lucene\Exception('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
  138. }
  139. /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
  140. if (@preg_match('/\pL/u', 'a') == 1) {
  141. // PCRE unicode support is turned on
  142. // add Unicode modifier to the match expression
  143. $matchExpression .= 'u';
  144. }
  145. $maxTerms = Lucene\Lucene::getTermsPerQueryLimit();
  146. foreach ($fields as $field) {
  147. $index->resetTermsStream();
  148. if ($prefix != '') {
  149. $index->skipTo(new Index\Term($prefix, $field));
  150. while ($index->currentTerm() !== null &&
  151. $index->currentTerm()->field == $field &&
  152. substr($index->currentTerm()->text, 0, $prefixLength) == $prefix) {
  153. if (preg_match($matchExpression, $index->currentTerm()->text) === 1) {
  154. $this->_matches[] = $index->currentTerm();
  155. if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
  156. throw new Lucene\Exception('Terms per query limit is reached.');
  157. }
  158. }
  159. $index->nextTerm();
  160. }
  161. } else {
  162. $index->skipTo(new Index\Term('', $field));
  163. while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
  164. if (preg_match($matchExpression, $index->currentTerm()->text) === 1) {
  165. $this->_matches[] = $index->currentTerm();
  166. if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
  167. throw new Lucene\Exception('Terms per query limit is reached.');
  168. }
  169. }
  170. $index->nextTerm();
  171. }
  172. }
  173. $index->closeTermsStream();
  174. }
  175. if (count($this->_matches) == 0) {
  176. return new EmptyResult();
  177. } else if (count($this->_matches) == 1) {
  178. return new Term(reset($this->_matches));
  179. } else {
  180. $rewrittenQuery = new MultiTerm();
  181. foreach ($this->_matches as $matchedTerm) {
  182. $rewrittenQuery->addTerm($matchedTerm);
  183. }
  184. return $rewrittenQuery;
  185. }
  186. }
  187. /**
  188. * Optimize query in the context of specified index
  189. *
  190. * @param \Zend\Search\Lucene\SearchIndex $index
  191. * @return \Zend\Search\Lucene\Search\Query\AbstractQuery
  192. */
  193. public function optimize(Lucene\SearchIndex $index)
  194. {
  195. throw new Lucene\Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
  196. }
  197. /**
  198. * Returns query pattern
  199. *
  200. * @return \Zend\Search\Lucene\Index\Term
  201. */
  202. public function getPattern()
  203. {
  204. return $this->_pattern;
  205. }
  206. /**
  207. * Return query terms
  208. *
  209. * @return array
  210. * @throws \Zend\Search\Lucene\Exception
  211. */
  212. public function getQueryTerms()
  213. {
  214. if ($this->_matches === null) {
  215. throw new Lucene\Exception('Search has to be performed first to get matched terms');
  216. }
  217. return $this->_matches;
  218. }
  219. /**
  220. * Constructs an appropriate Weight implementation for this query.
  221. *
  222. * @param \Zend\Search\Lucene\SearchIndex $reader
  223. * @return \Zend\Search\Lucene\Search\Weight\Weight
  224. * @throws \Zend\Search\Lucene\Exception
  225. */
  226. public function createWeight(Lucene\SearchIndex $reader)
  227. {
  228. throw new Lucene\Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
  229. }
  230. /**
  231. * Execute query in context of index reader
  232. * It also initializes necessary internal structures
  233. *
  234. * @param \Zend\Search\Lucene\SearchIndex $reader
  235. * @param \Zend\Search\Lucene\Index\DocsFilter|null $docsFilter
  236. * @throws \Zend\Search\Lucene\Exception
  237. */
  238. public function execute(Lucene\SearchIndex $reader, $docsFilter = null)
  239. {
  240. throw new Lucene\Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
  241. }
  242. /**
  243. * Get document ids likely matching the query
  244. *
  245. * It's an array with document ids as keys (performance considerations)
  246. *
  247. * @return array
  248. * @throws \Zend\Search\Lucene\Exception
  249. */
  250. public function matchedDocs()
  251. {
  252. throw new Lucene\Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
  253. }
  254. /**
  255. * Score specified document
  256. *
  257. * @param integer $docId
  258. * @param \Zend\Search\Lucene\SearchIndex $reader
  259. * @return float
  260. * @throws \Zend\Search\Lucene\Exception
  261. */
  262. public function score($docId, Lucene\SearchIndex $reader)
  263. {
  264. throw new Lucene\Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
  265. }
  266. /**
  267. * Query specific matches highlighting
  268. *
  269. * @param \Zend\Search\Lucene\Search\Highlighter $highlighter Highlighter object (also contains doc for highlighting)
  270. */
  271. protected function _highlightMatches(Lucene\Search\Highlighter $highlighter)
  272. {
  273. $words = array();
  274. $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*') , preg_quote($this->_pattern->text, '/')) . '$/';
  275. if (@preg_match('/\pL/u', 'a') == 1) {
  276. // PCRE unicode support is turned on
  277. // add Unicode modifier to the match expression
  278. $matchExpression .= 'u';
  279. }
  280. $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
  281. $tokens = Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
  282. foreach ($tokens as $token) {
  283. if (preg_match($matchExpression, $token->getTermText()) === 1) {
  284. $words[] = $token->getTermText();
  285. }
  286. }
  287. $highlighter->highlight($words);
  288. }
  289. /**
  290. * Print a query
  291. *
  292. * @return string
  293. */
  294. public function __toString()
  295. {
  296. // It's used only for query visualisation, so we don't care about characters escaping
  297. if ($this->_pattern->field !== null) {
  298. $query = $this->_pattern->field . ':';
  299. } else {
  300. $query = '';
  301. }
  302. $query .= $this->_pattern->text;
  303. if ($this->getBoost() != 1) {
  304. $query = $query . '^' . round($this->getBoost(), 4);
  305. }
  306. return $query;
  307. }
  308. }