PageRenderTime 41ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/src/application/libraries/Zend/Search/Lucene/Search/Query/MultiTerm.php

https://bitbucket.org/masnug/grc276-blog-laravel
PHP | 668 lines | 340 code | 95 blank | 233 comment | 66 complexity | 9a0b42cf109c52c40cb4c25467570bde MD5 | raw file
  1. <?php
  /**
   * Zend Framework
   *
   * LICENSE
   *
   * This source file is subject to the new BSD license that is bundled
   * with this package in the file LICENSE.txt.
   * It is also available through the world-wide-web at this URL:
   * http://framework.zend.com/license/new-bsd
   * If you did not receive a copy of the license and are unable to
   * obtain it through the world-wide-web, please send an email
   * to license@zend.com so we can send you a copy immediately.
   *
   * @category Zend
   * @package Zend_Search_Lucene
   * @subpackage Search
   * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
   * @license http://framework.zend.com/license/new-bsd New BSD License
   * @version $Id: MultiTerm.php 23775 2011-03-01 17:25:24Z ralph $
   */
  /** Zend_Search_Lucene_Search_Query */
  23. require_once 'Zend/Search/Lucene/Search/Query.php';
  /**
   * @category Zend
   * @package Zend_Search_Lucene
   * @subpackage Search
   * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
   * @license http://framework.zend.com/license/new-bsd New BSD License
   */
  class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
  {
      /**
       * Terms to find.
       * Array of Zend_Search_Lucene_Index_Term, keyed by term id.
       *
       * @var array
       */
      private $_terms = array();

      /**
       * Term signs, keyed by the same ids as $_terms.
       *   true  - term is required
       *   false - term is prohibited
       *   null  - term is neither prohibited, nor required
       *
       * If the whole property is null then all terms are required
       * (the query is a pure conjunction).
       *
       * @var array|null
       */
      private $_signs;

      /**
       * Result vector (docId => value), filled by execute().
       *
       * @var array|null
       */
      private $_resVector = null;

      /**
       * Term frequency vectors.
       * Array of arrays:
       *   term1Id => (docId => freq, ...)
       *   term2Id => (docId => freq, ...)
       *
       * @var array
       */
      private $_termsFreqs = array();

      /**
       * A score factor based on the fraction of all query terms
       * that a document contains.
       *   float          for conjunction queries
       *   array of float for non conjunction queries (indexed by matched terms count)
       *
       * @var mixed
       */
      private $_coord = null;

      /**
       * Terms weights, keyed by term id.
       * Array of Zend_Search_Lucene_Search_Weight
       *
       * @var array
       */
      private $_weights = array();

      /**
       * Class constructor. Create a new multi-term query object.
       *
       * If $signs is omitted, or every sign in it is true, the query is stored
       * as a pure conjunction (internal signs list is null).
       * Nothing is initialized when $terms is not an array.
       *
       * @param array $terms Array of Zend_Search_Lucene_Index_Term objects
       * @param array $signs Array of signs. Sign is boolean|null.
       * @throws Zend_Search_Lucene_Exception if the number of terms exceeds the terms per query limit
       */
      public function __construct($terms = null, $signs = null)
      {
          if (!is_array($terms)) {
              return;
          }

          require_once 'Zend/Search/Lucene.php';
          $termsLimit = Zend_Search_Lucene::getTermsPerQueryLimit();

          // Zend_Search_Lucene documents a limit of 0 as "no limit", so a zero
          // limit must not reject every non-empty list of terms.
          if ($termsLimit != 0 && count($terms) > $termsLimit) {
              throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
          }

          $this->_terms = $terms;
          $this->_signs = null;

          // Keep the signs only if at least one term is not required
          if (is_array($signs)) {
              foreach ($signs as $sign) {
                  if ($sign !== true) {
                      $this->_signs = $signs;
                      break;
                  }
              }
          }
      }

      /**
       * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
       *
       * The sign is specified as:
       *     TRUE  - term is required
       *     FALSE - term is prohibited
       *     NULL  - term is neither prohibited, nor required
       *
       * @param  Zend_Search_Lucene_Index_Term $term
       * @param  boolean|null $sign
       * @return void
       */
      public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null)
      {
          $this->_terms[] = $term;

          if ($sign === true && $this->_signs === null) {
              // All terms (including the new one) are required: stay a pure conjunction
              return;
          }

          // Id that was actually assigned to the new term
          end($this->_terms);
          $termId = key($this->_terms);

          if ($this->_signs === null) {
              // All previous terms were required. Materialize their signs under the
              // very same keys as the terms, so both lists stay aligned even when
              // the term ids are not a dense 0..n-1 range.
              $this->_signs = array_fill_keys(array_keys($this->_terms), true);
          }

          $this->_signs[$termId] = $sign;
      }

      /**
       * Re-write query into primitive queries in the context of specified index
       *
       * Returns an Empty query when there are no terms, this object when every
       * term has a field, and otherwise a Boolean query built from rewritten
       * single-term subqueries.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      public function rewrite(Zend_Search_Lucene_Interface $index)
      {
          if (count($this->_terms) == 0) {
              require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
              return new Zend_Search_Lucene_Search_Query_Empty();
          }

          // Check, that all fields are qualified
          $allQualified = true;
          foreach ($this->_terms as $term) {
              if ($term->field === null) {
                  $allQualified = false;
                  break;
              }
          }

          if ($allQualified) {
              return $this;
          }

          // Transform multiterm query to boolean and apply rewrite() method to subqueries
          require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
          $query = new Zend_Search_Lucene_Search_Query_Boolean();
          $query->setBoost($this->getBoost());

          require_once 'Zend/Search/Lucene/Search/Query/Term.php';
          foreach ($this->_terms as $termId => $term) {
              $subquery = new Zend_Search_Lucene_Search_Query_Term($term);
              $sign     = ($this->_signs === null) ? true : $this->_signs[$termId];

              $query->addSubquery($subquery->rewrite($index), $sign);
          }

          return $query;
      }

      /**
       * Optimize query in the context of specified index
       *
       * Terms missing from the index are dropped when optional or prohibited;
       * a missing required term makes the whole query Empty. A query left with
       * only prohibited terms, or with no terms at all, is Empty as well; a single
       * remaining term is turned into a Term query.
       *
       * @param Zend_Search_Lucene_Interface $index
       * @return Zend_Search_Lucene_Search_Query
       */
      public function optimize(Zend_Search_Lucene_Interface $index)
      {
          $terms = $this->_terms;
          $signs = $this->_signs;

          foreach ($terms as $id => $term) {
              if ($index->hasTerm($term)) {
                  continue;
              }

              if ($signs === null || $signs[$id] === true) {
                  // Term is required, but absent: nothing can match
                  require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
                  return new Zend_Search_Lucene_Search_Query_Empty();
              }

              // Term is optional or prohibited
              // Remove it from terms and signs list
              unset($terms[$id]);
              unset($signs[$id]);
          }

          // Check if all presented terms are prohibited
          // (an emptied signs list is treated the same way)
          if ($signs !== null) {
              $allProhibited = true;
              foreach ($signs as $sign) {
                  if ($sign !== false) {
                      $allProhibited = false;
                      break;
                  }
              }

              if ($allProhibited) {
                  require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
                  return new Zend_Search_Lucene_Search_Query_Empty();
              }
          }

          /**
           * @todo make an optimization for repeated terms
           *       (they may have different signs)
           */

          if (count($terms) == 1) {
              // It's already checked, that it's not a prohibited term
              // It's one term query with one required or optional element
              require_once 'Zend/Search/Lucene/Search/Query/Term.php';
              $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
              $optimizedQuery->setBoost($this->getBoost());

              return $optimizedQuery;
          }

          if (count($terms) == 0) {
              require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
              return new Zend_Search_Lucene_Search_Query_Empty();
          }

          $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $signs);
          $optimizedQuery->setBoost($this->getBoost());

          return $optimizedQuery;
      }

      /**
       * Returns query terms
       *
       * @return array Array of Zend_Search_Lucene_Index_Term, keyed by term id
       */
      public function getTerms()
      {
          return $this->_terms;
      }

      /**
       * Return terms signs
       *
       * @return array|null Signs keyed by term id, or null if all terms are required
       */
      public function getSigns()
      {
          return $this->_signs;
      }

      /**
       * Set weight for specified term
       *
       * @param integer $num Term id
       * @param Zend_Search_Lucene_Search_Weight_Term $weight
       */
      public function setWeight($num, $weight)
      {
          $this->_weights[$num] = $weight;
      }

      /**
       * Constructs an appropriate Weight implementation for this query.
       *
       * The created weight is also stored in $this->_weight, which is not
       * declared in this class (presumably inherited from the parent query class).
       *
       * @param Zend_Search_Lucene_Interface $reader
       * @return Zend_Search_Lucene_Search_Weight
       */
      public function createWeight(Zend_Search_Lucene_Interface $reader)
      {
          require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';
          $this->_weight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);

          return $this->_weight;
      }

      /**
       * Calculate result vector for Conjunction query
       * (like '+something +another')
       *
       * Note: terms are re-ordered (and re-indexed) by ascending document frequency.
       *
       * @param Zend_Search_Lucene_Interface $reader
       */
      private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader)
      {
          $this->_resVector = null;

          if (count($this->_terms) == 0) {
              // Nothing to intersect. Return right away: the code below relies on
              // at least one term having been processed.
              $this->_resVector = array();
              return;
          }

          // Order terms by selectivity
          $docFreqs = array();
          $ids      = array();
          foreach ($this->_terms as $id => $term) {
              $docFreqs[] = $reader->docFreq($term);
              $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
          }
          array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                          $ids,      SORT_ASC, SORT_NUMERIC,
                          $this->_terms);

          require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
          $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();

          // The same filter object is handed to every termDocs() call
          foreach ($this->_terms as $termId => $term) {
              $termDocs = $reader->termDocs($term, $docsFilter);
          }

          // Treat last retrieved docs vector as a result set
          // (filter collects data for other terms)
          $this->_resVector = array_flip($termDocs);

          foreach ($this->_terms as $termId => $term) {
              $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
          }

          // ksort($this->_resVector, SORT_NUMERIC);
          // Docs are returned ordered. Used algorithms doesn't change elements order.
      }

      /**
       * Calculate result vector for non Conjunction query
       * (like '+something -another')
       *
       * Result is the intersection of all required terms' documents (or the union
       * of optional terms' documents when nothing is required) minus the documents
       * of prohibited terms, sorted by document id.
       *
       * @param Zend_Search_Lucene_Interface $reader
       */
      private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader)
      {
          $requiredVectors      = array();
          $requiredVectorsSizes = array();
          $requiredVectorsIds   = array(); // is used to prevent arrays comparison

          $optional   = array();
          $prohibited = array();

          foreach ($this->_terms as $termId => $term) {
              // docId => position, so that membership tests are isset() lookups
              $termDocs = array_flip($reader->termDocs($term));

              if ($this->_signs[$termId] === true) {
                  // required
                  $requiredVectors[]      = $termDocs;
                  $requiredVectorsSizes[] = count($termDocs);
                  $requiredVectorsIds[]   = $termId;
              } elseif ($this->_signs[$termId] === false) {
                  // prohibited
                  // array union
                  $prohibited += $termDocs;
              } else {
                  // neither required, nor prohibited
                  // array union
                  $optional += $termDocs;
              }

              $this->_termsFreqs[$termId] = $reader->termFreqs($term);
          }

          // sort resvectors in order of subquery cardinality increasing
          array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC,
                          $requiredVectorsIds,   SORT_ASC, SORT_NUMERIC,
                          $requiredVectors);

          $required = null;
          foreach ($requiredVectors as $nextResVector) {
              if ($required === null) {
                  $required = $nextResVector;
              } else {
                  //$required = array_intersect_key($required, $nextResVector);

                  /**
                   * This code is used as workaround for array_intersect_key() slowness problem.
                   */
                  $updatedVector = array();
                  foreach ($required as $id => $value) {
                      if (isset($nextResVector[$id])) {
                          $updatedVector[$id] = $value;
                      }
                  }
                  $required = $updatedVector;
              }

              if (count($required) == 0) {
                  // Empty result set, we don't need to check other terms
                  break;
              }
          }

          // Optional terms define the result set only if nothing is required
          $this->_resVector = ($required !== null) ? $required : $optional;

          if (count($prohibited) != 0) {
              // $this->_resVector = array_diff_key($this->_resVector, $prohibited);

              /**
               * This code is used as workaround for array_diff_key() slowness problem.
               * Iterate over whichever of the two vectors is smaller.
               */
              $updatedVector = $this->_resVector;
              if (count($this->_resVector) < count($prohibited)) {
                  foreach ($this->_resVector as $id => $value) {
                      if (isset($prohibited[$id])) {
                          unset($updatedVector[$id]);
                      }
                  }
              } else {
                  foreach ($prohibited as $id => $value) {
                      unset($updatedVector[$id]);
                  }
              }
              $this->_resVector = $updatedVector;
          }

          ksort($this->_resVector, SORT_NUMERIC);
      }

      /**
       * Score calculator for conjunction queries (all terms are required)
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       */
      public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
      {
          if ($this->_coord === null) {
              // Every term matches in a conjunction, so overlap == max overlap
              $termsCount   = count($this->_terms);
              $this->_coord = $reader->getSimilarity()->coord($termsCount, $termsCount);
          }

          $score = 0.0;

          foreach ($this->_terms as $termId => $term) {
              /**
               * We don't need to check that term freq is not 0
               * Score calculation is performed only for matched docs
               */
              $score += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
                        $this->_weights[$termId]->getValue() *
                        $reader->norm($docId, $term->field);
          }

          return $score * $this->_coord * $this->getBoost();
      }

      /**
       * Score calculator for non conjunction queries (not all terms are required)
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       */
      public function _nonConjunctionScore($docId, $reader)
      {
          if ($this->_coord === null) {
              // Pre-compute the coord factor for every possible number of matched,
              // non-prohibited terms
              $this->_coord = array();

              $maxCoord = 0;
              foreach ($this->_signs as $sign) {
                  if ($sign !== false /* not prohibited */) {
                      $maxCoord++;
                  }
              }

              for ($count = 0; $count <= $maxCoord; $count++) {
                  $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord);
              }
          }

          $score        = 0.0;
          $matchedTerms = 0;

          foreach ($this->_terms as $termId => $term) {
              // Only terms that are not prohibited and matched in the document count
              if ($this->_signs[$termId] === false ||
                  !isset($this->_termsFreqs[$termId][$docId])
              ) {
                  continue;
              }

              $matchedTerms++;

              /**
               * We don't need to check that term freq is not 0
               * Score calculation is performed only for matched docs
               */
              $score +=
                    $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
                    $this->_weights[$termId]->getValue() *
                    $reader->norm($docId, $term->field);
          }

          return $score * $this->_coord[$matchedTerms] * $this->getBoost();
      }

      /**
       * Execute query in context of index reader
       * It also initializes necessary internal structures
       *
       * Note: $docsFilter is accepted, but it is not used by this implementation.
       *
       * @param Zend_Search_Lucene_Interface $reader
       * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
       */
      public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
      {
          if ($this->_signs === null) {
              $this->_calculateConjunctionResult($reader);
          } else {
              $this->_calculateNonConjunctionResult($reader);
          }

          // Initialize weight if it's not done yet
          $this->_initWeight($reader);
      }

      /**
       * Get document ids likely matching the query
       *
       * It's an array with document ids as keys (performance considerations)
       *
       * @return array
       */
      public function matchedDocs()
      {
          return $this->_resVector;
      }

      /**
       * Score specified document
       *
       * Documents which are not in the result vector are scored as 0.
       *
       * @param integer $docId
       * @param Zend_Search_Lucene_Interface $reader
       * @return float
       */
      public function score($docId, Zend_Search_Lucene_Interface $reader)
      {
          if (!isset($this->_resVector[$docId])) {
              return 0;
          }

          if ($this->_signs === null) {
              return $this->_conjunctionScore($docId, $reader);
          }

          return $this->_nonConjunctionScore($docId, $reader);
      }

      /**
       * Return query terms
       *
       * Prohibited terms are not included.
       *
       * @return array
       */
      public function getQueryTerms()
      {
          if ($this->_signs === null) {
              return $this->_terms;
          }

          $terms = array();
          foreach ($this->_signs as $id => $sign) {
              if ($sign !== false) {
                  $terms[] = $this->_terms[$id];
              }
          }

          return $terms;
      }

      /**
       * Query specific matches highlighting
       *
       * Passes the text of every non-prohibited term to the highlighter.
       *
       * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
       */
      protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
      {
          $words = array();

          if ($this->_signs === null) {
              foreach ($this->_terms as $term) {
                  $words[] = $term->text;
              }
          } else {
              foreach ($this->_signs as $id => $sign) {
                  if ($sign !== false) {
                      $words[] = $this->_terms[$id]->text;
                  }
              }
          }

          $highlighter->highlight($words);
      }

      /**
       * Print a query
       *
       * @return string
       */
      public function __toString()
      {
          // It's used only for query visualisation, so we don't care about characters escaping
          $parts = array();

          foreach ($this->_terms as $id => $term) {
              $part = '';

              if ($this->_signs === null || $this->_signs[$id] === true) {
                  $part .= '+';
              } elseif ($this->_signs[$id] === false) {
                  $part .= '-';
              }

              if ($term->field !== null) {
                  $part .= $term->field . ':';
              }

              $parts[] = $part . $term->text;
          }

          // Join by position rather than by term id: ids are not guaranteed
          // to start at 0 (e.g. after optimize() has dropped some terms).
          $query = implode(' ', $parts);

          if ($this->getBoost() != 1) {
              $query = '(' . $query . ')^' . round($this->getBoost(), 4);
          }

          return $query;
      }
  }