PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/inc/app/sitesearch/lib/Zend/Search/Lucene/Search/Query/Boolean.php

https://github.com/lux/sitellite
PHP | 739 lines | 387 code | 118 blank | 234 comment | 103 complexity | 38da5498f41dde1b62b04e8e7441e096 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, Apache-2.0, GPL-3.0
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Search_Query */
  22. require_once 'Zend/Search/Lucene/Search/Query.php';
  23. /** Zend_Search_Lucene_Search_Weight_Boolean */
  24. require_once 'Zend/Search/Lucene/Search/Weight/Boolean.php';
  25. /**
  26. * @category Zend
  27. * @package Zend_Search_Lucene
  28. * @subpackage Search
  29. * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  30. * @license http://framework.zend.com/license/new-bsd New BSD License
  31. */
  class Zend_Search_Lucene_Search_Query_Boolean extends Zend_Search_Lucene_Search_Query
  {
      /**
       * Subqueries.
       * Array of Zend_Search_Lucene_Search_Query objects.
       *
       * @var array
       */
      private $_subqueries = array();

      /**
       * Subquery signs, keyed exactly like $_subqueries.
       *
       * If true then subquery is required.
       * If false then subquery is prohibited.
       * If null then subquery is neither prohibited, nor required.
       *
       * If the property itself is null then all subqueries are required
       * (conjunction query).
       *
       * @var array|null
       */
      private $_signs = array();

      /**
       * Result vector: array with document ids as keys.
       * Null until execute() has been called.
       *
       * @var array|null
       */
      private $_resVector = null;

      /**
       * A score factor based on the fraction of all query subqueries
       * that a document contains.
       * float for conjunction queries
       * array of float (indexed by number of matched subqueries) for
       * non conjunction queries
       *
       * @var mixed
       */
      private $_coord = null;

      /**
       * Class constructor. Create a new Boolean query object.
       *
       * If $signs is omitted, or every entry of it is exactly true, the
       * query is stored as a conjunction ($_signs === null). This differs
       * from addSubquery(), whose default sign is null (optional).
       *
       * Nothing is initialised when $subqueries is not an array.
       *
       * @param array $subqueries Array of Zend_Search_Lucene_Search_Query objects
       * @param array $signs      Array of signs, keyed like $subqueries. Sign is boolean|null.
       * @return void
       */
      public function __construct($subqueries = null, $signs = null)
      {
          if (!is_array($subqueries)) {
              return;
          }

          $this->_subqueries = $subqueries;
          $this->_signs      = null;

          if (!is_array($signs)) {
              return;
          }

          // Keep the signs array only if at least one subquery is not required
          foreach ($signs as $sign) {
              if ($sign !== true) {
                  $this->_signs = $signs;
                  break;
              }
          }
      }

      /**
       * Add a $subquery (Zend_Search_Lucene_Search_Query) to this query.
       *
       * The sign is specified as:
       *     TRUE  - subquery is required
       *     FALSE - subquery is prohibited
       *     NULL  - subquery is neither prohibited, nor required
       *
       * Signs are always stored under the same key as the subquery they
       * belong to, so the two arrays stay aligned even when the subqueries
       * array has gaps in its keys (optimize() hands such arrays to the
       * constructor after unset()-ing entries).
       *
       * @param Zend_Search_Lucene_Search_Query $subquery
       * @param boolean|null $sign
       * @return void
       */
      public function addSubquery(Zend_Search_Lucene_Search_Query $subquery, $sign = null)
      {
          $this->_subqueries[] = $subquery;

          if ($sign === true && $this->_signs === null) {
              // All subqueries (including the new one) are required:
              // keep the compact "conjunction" representation.
              return;
          }

          // Key which has just been assigned to the appended subquery
          end($this->_subqueries);
          $newKey = key($this->_subqueries);

          if ($this->_signs === null) {
              // All previous subqueries are required: materialise their signs
              // using the subqueries' own keys.
              $this->_signs = array();
              foreach (array_keys($this->_subqueries) as $key) {
                  $this->_signs[$key] = true;
              }
          }

          $this->_signs[$newKey] = $sign;
      }

      /**
       * Re-write queries into primitive queries.
       *
       * Builds a new Boolean query with the same boost, containing the
       * rewritten form of every subquery with its original sign.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          $rewritten = new Zend_Search_Lucene_Search_Query_Boolean();
          $rewritten->setBoost($this->getBoost());

          foreach ($this->_subqueries as $subqueryId => $subquery) {
              $sign = ($this->_signs === null) ? true : $this->_signs[$subqueryId];
              $rewritten->addSubquery($subquery->rewrite($index), $sign);
          }

          return $rewritten;
      }

      /**
       * Check whether every sign in the list is 'prohibited' (false).
       *
       * @param array $signs
       * @return boolean True if no sign differs from false (also for an empty list)
       */
      private function _allSignsProhibited($signs)
      {
          foreach ($signs as $sign) {
              if ($sign !== false) {
                  return false;
              }
          }

          return true;
      }

      /**
       * Check whether a required multi-term subquery can be decomposed.
       *
       * It can be decomposed only if it contains required terms and doesn't
       * contain prohibited terms:
       *     ... +(+term1 term2 ...) ... => ... +term1 term2 ...
       *
       * @param array|null $subSigns Signs of the multi-term query (null means all required)
       * @return boolean
       */
      private function _isDecomposableRequired($subSigns)
      {
          if ($subSigns === null) {
              // All subterms are required
              return true;
          }

          $hasRequired = false;
          foreach ($subSigns as $sign) {
              if ($sign === true) {
                  $hasRequired = true;
              } else if ($sign === false) {
                  // Prohibited term found
                  return false;
              }
          }

          return $hasRequired;
      }

      /**
       * Check whether all terms of a multi-term subquery are optional.
       *
       * @param array|null $subSigns Signs of the multi-term query (null means all required)
       * @return boolean
       */
      private function _hasOnlyOptionalSigns($subSigns)
      {
          if ($subSigns === null) {
              // All subterms are required
              return false;
          }

          foreach ($subSigns as $sign) {
              if ($sign !== null) {
                  return false;
              }
          }

          return true;
      }

      /**
       * Optimize query in the context of specified index.
       *
       * Steps:
       *  1. optimize every subquery;
       *  2. drop insignificant subqueries (result is Insignificant if nothing
       *     is left or only prohibited clauses are left);
       *  3. handle empty subqueries (result is Empty if a required clause is
       *     empty, nothing is left or only prohibited clauses are left);
       *  4. collapse a single remaining clause into that clause;
       *  5. try to merge Term/MultiTerm clauses into fewer Term/MultiTerm clauses.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      public function optimize(Zend_Search_Lucene_Interface $index)
      {
          $subqueries = array();
          $signs      = array();

          // Optimize all subqueries
          foreach ($this->_subqueries as $id => $subquery) {
              $subqueries[] = $subquery->optimize($index);
              $signs[]      = ($this->_signs === null) ? true : $this->_signs[$id];
          }

          // Remove insignificant subqueries
          foreach ($subqueries as $id => $subquery) {
              if ($subquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                  // Insignificant subquery has to be removed anyway
                  unset($subqueries[$id]);
                  unset($signs[$id]);
              }
          }

          if (count($subqueries) == 0) {
              // Boolean query doesn't have non-insignificant subqueries
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          // Check if all non-insignificant subqueries are prohibited
          if ($this->_allSignsProhibited($signs)) {
              return new Zend_Search_Lucene_Search_Query_Insignificant();
          }

          // Check for empty subqueries
          foreach ($subqueries as $id => $subquery) {
              if ($subquery instanceof Zend_Search_Lucene_Search_Query_Empty) {
                  if ($signs[$id] === true) {
                      // Matching is required, but is actually empty
                      return new Zend_Search_Lucene_Search_Query_Empty();
                  }

                  // Matching is optional or prohibited, but is empty.
                  // Remove it from subqueries and signs list
                  unset($subqueries[$id]);
                  unset($signs[$id]);
              }
          }

          // Check, if reduced subqueries list is empty
          if (count($subqueries) == 0) {
              return new Zend_Search_Lucene_Search_Query_Empty();
          }

          // Check if all non-empty subqueries are prohibited
          if ($this->_allSignsProhibited($signs)) {
              return new Zend_Search_Lucene_Search_Query_Empty();
          }

          // Check, if reduced subqueries list has only one entry
          if (count($subqueries) == 1) {
              // It's a query with only one required or optional clause
              // (it's already checked, that it's not a prohibited clause)
              if ($this->getBoost() == 1) {
                  return reset($subqueries);
              }

              // Clone, so the boost of the original subquery object is left untouched
              $optimizedQuery = clone reset($subqueries);
              $optimizedQuery->setBoost($optimizedQuery->getBoost() * $this->getBoost());

              return $optimizedQuery;
          }

          // Prepare first candidate for optimized query
          $optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
          $optimizedQuery->setBoost($this->getBoost());

          $terms        = array();
          $tsigns       = array();
          $boostFactors = array();

          // Try to decompose term and multi-term subqueries
          foreach ($subqueries as $id => $subquery) {
              if ($subquery instanceof Zend_Search_Lucene_Search_Query_Term) {
                  $terms[]        = $subquery->getTerm();
                  $tsigns[]       = $signs[$id];
                  $boostFactors[] = $subquery->getBoost();
              } else if ($subquery instanceof Zend_Search_Lucene_Search_Query_MultiTerm) {
                  $subTerms = $subquery->getTerms();
                  $subSigns = $subquery->getSigns();

                  if ($signs[$id] === true) {
                      // It's a required multi-term subquery.
                      // Something like '... +(+term1 -term2 term3 ...) ...'
                      //
                      // Skip it if it has prohibited terms or doesn't have required terms
                      if (!$this->_isDecomposableRequired($subSigns)) {
                          continue;
                      }

                      foreach ($subTerms as $termId => $term) {
                          $terms[]        = $term;
                          $tsigns[]       = ($subSigns === null) ? true : $subSigns[$termId];
                          $boostFactors[] = $subquery->getBoost();
                      }
                  } else { // $signs[$id] === null  ||  $signs[$id] === false
                      // It's an optional or prohibited multi-term subquery.
                      // Something like '... (+term1 -term2 term3 ...) ...'
                      // or
                      // something like '... -(+term1 -term2 term3 ...) ...'
                      //
                      // Multi-term optional and prohibited subqueries can be
                      // decomposed only if all terms are optional.
                      if (!$this->_hasOnlyOptionalSigns($subSigns)) {
                          continue;
                      }

                      // Every term inherits the sign of the clause itself
                      $termSign = ($signs[$id] === null) ? null /* optional */
                                                         : false /* prohibited */;
                      foreach ($subTerms as $term) {
                          $terms[]        = $term;
                          $tsigns[]       = $termSign;
                          $boostFactors[] = $subquery->getBoost();
                      }
                  }
              } else {
                  // Neither a term nor a multi-term query: can't be decomposed
                  continue;
              }

              // Subquery has been decomposed: remove it from the subqueries list
              unset($subqueries[$id]);
              unset($signs[$id]);
          }

          // Check, if there are no decomposed subqueries
          if (count($terms) == 0) {
              // return prepared candidate
              return $optimizedQuery;
          }

          // Check, if all subqueries have been decomposed and all terms have the same boost factor
          if (count($subqueries) == 0 && count(array_unique($boostFactors)) == 1) {
              $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
              $optimizedQuery->setBoost(reset($boostFactors) * $this->getBoost());

              return $optimizedQuery;
          }

          // This boolean query can't be transformed to Term/MultiTerm query and still contains
          // several subqueries

          // Separate prohibited terms
          $prohibitedTerms = array();
          foreach ($terms as $id => $term) {
              if ($tsigns[$id] === false) {
                  $prohibitedTerms[] = $term;

                  unset($terms[$id]);
                  unset($tsigns[$id]);
                  unset($boostFactors[$id]);
              }
          }

          if (count($terms) == 1) {
              $clause = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
              $clause->setBoost(reset($boostFactors));

              $subqueries[] = $clause;
              $signs[]      = reset($tsigns);

              // Clear terms list
              $terms = array();
          } else if (count($terms) > 1 && count(array_unique($boostFactors)) == 1) {
              $clause = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
              $clause->setBoost(reset($boostFactors));

              $subqueries[] = $clause;
              // Clause sign is 'required' if clause contains required terms. 'Optional' otherwise.
              // Strict search, consistent with the '=== true' tests used for signs elsewhere.
              $signs[]      = in_array(true, $tsigns, true) ? true : null;

              // Clear terms list
              $terms = array();
          }

          if (count($prohibitedTerms) == 1) {
              // (boost factors are not significant for prohibited clauses)
              $subqueries[] = new Zend_Search_Lucene_Search_Query_Term(reset($prohibitedTerms));
              $signs[]      = false;

              // Clear prohibited terms list
              $prohibitedTerms = array();
          } else if (count($prohibitedTerms) > 1) {
              // prepare signs array
              $prohibitedSigns = array();
              foreach ($prohibitedTerms as $id => $term) {
                  // all prohibited terms are grouped as optional into multi-term query
                  $prohibitedSigns[$id] = null;
              }

              // (boost factors are not significant for prohibited clauses)
              $subqueries[] = new Zend_Search_Lucene_Search_Query_MultiTerm($prohibitedTerms, $prohibitedSigns);
              // Clause sign is 'prohibited'
              $signs[]      = false;

              // Clear terms list
              $prohibitedTerms = array();
          }

          /** @todo Group terms with the same boost factors together */

          // Replace the candidate only if all decomposed terms have been
          // regrouped; otherwise the first candidate is returned.
          if (count($terms) == 0 && count($prohibitedTerms) == 0) {
              $optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
              $optimizedQuery->setBoost($this->getBoost());
          }

          return $optimizedQuery;
      }

      /**
       * Returns subqueries
       *
       * @return array
       */
      public function getSubqueries()
      {
          return $this->_subqueries;
      }

      /**
       * Return subqueries signs
       *
       * @return array|null Null means that all subqueries are required
       */
      public function getSigns()
      {
          return $this->_signs;
      }

      /**
       * Constructs an appropriate Weight implementation for this query.
       *
       * The created weight object is also stored in $this->_weight.
       *
       * @param Zend_Search_Lucene_Interface $reader
       * @return Zend_Search_Lucene_Search_Weight
       */
      public function createWeight(Zend_Search_Lucene_Interface $reader)
      {
          $this->_weight = new Zend_Search_Lucene_Search_Weight_Boolean($this, $reader);

          return $this->_weight;
      }

      /**
       * Calculate result vector for Conjunction query
       * (like '<subquery1> AND <subquery2> AND <subquery3>')
       *
       * The result is the key intersection of all subqueries' matched
       * documents, sorted by document id.
       */
      private function _calculateConjunctionResult()
      {
          $resVector = null;

          foreach ($this->_subqueries as $subquery) {
              $docs = $subquery->matchedDocs();

              $resVector = ($resVector === null)
                         ? $docs
                         : array_intersect_key($resVector, $docs);

              if (count($resVector) == 0) {
                  // Empty result set, we don't need to check other subqueries
                  break;
              }
          }

          if ($resVector === null) {
              // No subqueries at all
              $resVector = array();
          }

          ksort($resVector, SORT_NUMERIC);

          $this->_resVector = $resVector;
      }

      /**
       * Calculate result vector for non Conjunction query
       * (like '<subquery1> AND <subquery2> AND NOT <subquery3> OR <subquery4>')
       *
       * If there is at least one required subquery, the result is the key
       * intersection of all required subqueries; otherwise it is the union
       * of all optional subqueries. Prohibited subqueries are not applied
       * here; they are handled while scoring.
       */
      private function _calculateNonConjunctionResult()
      {
          $required = null;
          $optional = array();

          foreach ($this->_subqueries as $subqueryId => $subquery) {
              $docs = $subquery->matchedDocs();
              $sign = $this->_signs[$subqueryId];

              if ($sign === true) {
                  // required: array intersection
                  $required = ($required === null)
                            ? $docs
                            : array_intersect_key($required, $docs);
              } else if ($sign === false) {
                  // prohibited
                  // Do nothing. matchedDocs() may include non-matching id's
              } else {
                  // neither required, nor prohibited: array union
                  $optional += $docs;
              }
          }

          // Plain assignment: the property must not stay reference-bound to a local
          $resVector = ($required !== null) ? $required : $optional;

          ksort($resVector, SORT_NUMERIC);

          $this->_resVector = $resVector;
      }

      /**
       * Score calculator for conjunction queries (all subqueries are required)
       *
       * Returns 0 as soon as one subquery scores 0 for the document.
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       */
      public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
      {
          if ($this->_coord === null) {
              $subqueryCount = count($this->_subqueries);
              $this->_coord  = $reader->getSimilarity()->coord($subqueryCount, $subqueryCount);
          }

          $score = 0;

          foreach ($this->_subqueries as $subquery) {
              $subscore = $subquery->score($docId, $reader);

              if ($subscore == 0) {
                  return 0;
              }

              // Reuse the already computed subscore instead of scoring twice
              $score += $subscore * $this->_coord;
          }

          // NOTE(review): coord is applied to every subscore above and once
          // more here. Kept as in the original implementation so resulting
          // scores stay unchanged - confirm whether this is intended.
          return $score * $this->_coord * $this->getBoost();
      }

      /**
       * Score calculator for non conjunction queries (not all subqueries are required)
       *
       * Returns 0 if a prohibited subquery matches the document or a required
       * subquery doesn't. Otherwise the sum of matching subscores is scaled
       * by the coord factor for the number of matched subqueries and by the
       * query boost.
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       */
      public function _nonConjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
      {
          if ($this->_coord === null) {
              $this->_coord = array();

              // Number of subqueries which may contribute to the score
              $maxCoord = 0;
              foreach ($this->_signs as $sign) {
                  if ($sign !== false /* not prohibited */) {
                      $maxCoord++;
                  }
              }

              for ($count = 0; $count <= $maxCoord; $count++) {
                  $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord);
              }
          }

          $score            = 0;
          $matchedSubqueries = 0;

          foreach ($this->_subqueries as $subqueryId => $subquery) {
              $subscore = $subquery->score($docId, $reader);
              $sign     = $this->_signs[$subqueryId];

              // Prohibited, but matches
              if ($sign === false && $subscore != 0) {
                  return 0;
              }

              // Required, but doesn't match
              if ($sign === true && $subscore == 0) {
                  return 0;
              }

              if ($subscore != 0) {
                  $matchedSubqueries++;
                  $score += $subscore;
              }
          }

          return $score * $this->_coord[$matchedSubqueries] * $this->getBoost();
      }

      /**
       * Execute query in context of index reader
       * It also initializes necessary internal structures
       *
       * Every subquery is executed first, then the result vector of this
       * query is calculated.
       *
       * @param Zend_Search_Lucene_Interface $reader
       */
      public function execute(Zend_Search_Lucene_Interface $reader)
      {
          // Initialize weight if it's not done yet
          $this->_initWeight($reader);

          foreach ($this->_subqueries as $subquery) {
              $subquery->execute($reader);
          }

          if ($this->_signs === null) {
              $this->_calculateConjunctionResult();
          } else {
              $this->_calculateNonConjunctionResult();
          }
      }

      /**
       * Get document ids likely matching the query
       *
       * It's an array with document ids as keys (performance considerations)
       *
       * @return array|null Null if the query has not been executed yet
       */
      public function matchedDocs()
      {
          return $this->_resVector;
      }

      /**
       * Score specified document
       *
       * Documents which are not in the result vector score 0.
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       */
      public function score($docId, Zend_Search_Lucene_Interface $reader)
      {
          if (!isset($this->_resVector[$docId])) {
              return 0;
          }

          if ($this->_signs === null) {
              return $this->_conjunctionScore($docId, $reader);
          }

          return $this->_nonConjunctionScore($docId, $reader);
      }

      /**
       * Return query terms
       *
       * Terms of prohibited subqueries are not included.
       *
       * @return array
       */
      public function getQueryTerms()
      {
          $terms = array();

          foreach ($this->_subqueries as $id => $subquery) {
              if ($this->_signs === null || $this->_signs[$id] !== false) {
                  $terms = array_merge($terms, $subquery->getQueryTerms());
              }
          }

          return $terms;
      }

      /**
       * Highlight query terms
       *
       * Delegates to every subquery which is not prohibited.
       *
       * @param Zend_Search_Lucene_Document_Html $doc
       * @param integer &$colorIndex
       */
      public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
      {
          foreach ($this->_subqueries as $id => $subquery) {
              if ($this->_signs === null || $this->_signs[$id] !== false) {
                  $subquery->highlightMatchesDOM($doc, $colorIndex);
              }
          }
      }

      /**
       * Print a query
       *
       * Each subquery is rendered as '[+|-](<subquery>)[^boost]' and the
       * clauses are separated by a single space.
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping

          $clauses = array();

          foreach ($this->_subqueries as $id => $subquery) {
              $clause = '';

              if ($this->_signs === null || $this->_signs[$id] === true) {
                  $clause .= '+';
              } else if ($this->_signs[$id] === false) {
                  $clause .= '-';
              }

              $clause .= '(' . $subquery->__toString() . ')';

              if ($subquery->getBoost() != 1) {
                  $clause .= '^' . $subquery->getBoost();
              }

              $clauses[] = $clause;
          }

          // Joining avoids a leading space when the first array key is not 0
          return implode(' ', $clauses);
      }
  }