PageRenderTime 59ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/src/application/libraries/Zend/Search/Lucene/Search/Query/Phrase.php

https://bitbucket.org/masnug/grc276-blog-laravel
PHP | 576 lines | 296 code | 83 blank | 197 comment | 65 complexity | e70da27501fffc35cd0cbb5364d615ff MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Search
  18. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Phrase.php 23775 2011-03-01 17:25:24Z ralph $
  21. */
  22. /** Zend_Search_Lucene_Search_Query */
  23. require_once 'Zend/Search/Lucene/Search/Query.php';
  24. /**
  25. * A Query that matches documents containing a particular sequence of terms.
  26. *
  27. * @category Zend
  28. * @package Zend_Search_Lucene
  29. * @subpackage Search
  30. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  31. * @license http://framework.zend.com/license/new-bsd New BSD License
  32. */
  33. class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
  34. {
  /**
   * Terms to find.
   * Array of Zend_Search_Lucene_Index_Term objects.
   *
   * @var array
   */
  private $_terms;

  /**
   * Term positions (relative positions of terms within the phrase).
   * Array of integers, indexed by the same keys as $_terms.
   *
   * @var array
   */
  private $_offsets;

  /**
   * Number of other words permitted between words in the query phrase.
   * If zero, then this is an exact phrase search. For larger values this works
   * like a WITHIN or NEAR operator.
   *
   * The slop is in fact an edit-distance, where the units correspond to
   * moves of terms in the query phrase out of position. For example, to switch
   * the order of two words requires two moves (the first move places the words
   * atop one another), so to permit re-orderings of phrases, the slop must be
   * at least two.
   * More exact matches are scored higher than sloppier matches, thus search
   * results are sorted by exactness.
   *
   * The slop is zero by default, requiring exact matches.
   *
   * @var integer
   */
  private $_slop;

  /**
   * Result vector (document ids as keys); null until the query is executed.
   *
   * @var array
   */
  private $_resVector = null;

  /**
   * Terms positions vectors.
   * Array of Arrays:
   * term1Id => (docId => array( pos1, pos2, ... ), ...)
   * term2Id => (docId => array( pos1, pos2, ... ), ...)
   *
   * @var array
   */
  private $_termsPositions = array();

  /**
   * Per-term weights stored by setWeight(), keyed by the number passed to it.
   *
   * Declared explicitly so that setWeight() does not create a dynamic
   * property at runtime (deprecated as of PHP 8.2).
   *
   * @var array
   */
  private $_weights = array();
  /**
   * Class constructor. Creates a new phrase query.
   *
   * Keys of $terms are preserved as term ids. When $offsets is omitted the
   * terms are numbered sequentially from 0 in iteration order.
   *
   * @param array|null  $terms   Term texts to search for (array of strings), or null for an empty phrase.
   * @param array|null  $offsets Relative term positions (array of integers, same size as $terms), or null.
   * @param string|null $field   Field to search; when null the terms are created without a field.
   * @throws Zend_Search_Lucene_Exception If $terms or $offsets is neither an array nor null,
   *                                      or if their sizes differ.
   */
  public function __construct($terms = null, $offsets = null, $field = null)
  {
    $this->_slop = 0;

    if (is_array($terms)) {
      $this->_terms = array();
      require_once 'Zend/Search/Lucene/Index/Term.php';
      foreach ($terms as $termId => $termText) {
        $this->_terms[$termId] = ($field !== null)
          ? new Zend_Search_Lucene_Index_Term($termText, $field)
          : new Zend_Search_Lucene_Index_Term($termText);
      }
    } else if ($terms === null) {
      $this->_terms = array();
    } else {
      require_once 'Zend/Search/Lucene/Exception.php';
      throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
    }

    if (is_array($offsets)) {
      if (count($this->_terms) != count($offsets)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
      }
      $this->_offsets = $offsets;
    } else if ($offsets === null) {
      // No explicit offsets: assign 0, 1, 2, ... following the terms' iteration order
      $this->_offsets = array();
      foreach ($this->_terms as $termId => $term) {
        $position = count($this->_offsets);
        $this->_offsets[$termId] = $position;
      }
    } else {
      require_once 'Zend/Search/Lucene/Exception.php';
      // Offsets are integer positions, so the message must not ask for strings
      throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null');
    }
  }
  /**
   * Sets the slop (number of other words permitted between phrase words).
   *
   * The value is stored as given; no validation is performed here.
   *
   * @param integer $slop
   */
  public function setSlop($slop)
  {
    $this->_slop = $slop;
  }
  /**
   * Returns the slop currently configured for this phrase query.
   *
   * @return integer
   */
  public function getSlop()
  {
    return $this->_slop;
  }
  /**
   * Appends a term to the end of the query phrase.
   *
   * The relative position is either the one given explicitly or, when omitted,
   * the last recorded offset plus one (0 for the very first term).
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param integer|null $position
   * @throws Zend_Search_Lucene_Exception If the term's field differs from the last term's field.
   */
  public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null)
  {
    if (count($this->_terms) != 0) {
      $previousTerm = end($this->_terms);
      if ($previousTerm->field != $term->field) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' .
          $term->field . ':' . $term->text);
      }
    }

    $this->_terms[] = $term;

    if ($position === null) {
      // Default to the slot immediately after the last recorded offset
      $position = (count($this->_offsets) != 0) ? end($this->_offsets) + 1 : 0;
    }
    $this->_offsets[] = $position;
  }
  /**
   * Re-writes the query into primitive queries in the context of the specified index.
   *
   * - An empty phrase becomes an Empty query.
   * - A phrase whose terms already carry a field is returned unchanged.
   * - Otherwise a Boolean query is built with one phrase subquery per field
   *   name reported by the index, keeping slop, offsets and boost.
   *
   * @param Zend_Search_Lucene_Interface $index
   * @return Zend_Search_Lucene_Search_Query
   */
  public function rewrite(Zend_Search_Lucene_Interface $index)
  {
    if (count($this->_terms) == 0) {
      require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
      return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // Inspect the first term by position rather than by key 0: the constructor
    // preserves caller-supplied keys, so key 0 is not guaranteed to exist.
    if (reset($this->_terms)->field !== null) {
      return $this;
    }

    require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
    require_once 'Zend/Search/Lucene/Index/Term.php';

    $query = new Zend_Search_Lucene_Search_Query_Boolean();
    $query->setBoost($this->getBoost());

    foreach ($index->getFieldNames(true) as $fieldName) {
      $subquery = new Zend_Search_Lucene_Search_Query_Phrase();
      $subquery->setSlop($this->getSlop());

      foreach ($this->_terms as $termId => $term) {
        // Same text, now qualified with the concrete field name
        $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName);
        $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
      }

      $query->addSubquery($subquery);
    }

    return $query;
  }
  /**
   * Optimizes the query in the context of the specified index.
   *
   * Returns an Empty query when any phrase term is missing from the index or
   * when the phrase has no terms, a single Term query (with this query's
   * boost) when the phrase has exactly one term, and $this otherwise.
   *
   * @param Zend_Search_Lucene_Interface $index
   * @return Zend_Search_Lucene_Search_Query
   */
  public function optimize(Zend_Search_Lucene_Interface $index)
  {
    // A phrase cannot match unless every one of its terms is present in the index
    foreach ($this->_terms as $phraseTerm) {
      if (!$index->hasTerm($phraseTerm)) {
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
      }
    }

    switch (count($this->_terms)) {
      case 0:
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();

      case 1:
        // A one-term phrase is just a term query
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $termQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms));
        $termQuery->setBoost($this->getBoost());
        return $termQuery;

      default:
        return $this;
    }
  }
  /**
   * Returns the phrase terms.
   *
   * @return array Array of Zend_Search_Lucene_Index_Term objects
   */
  public function getTerms()
  {
    return $this->_terms;
  }
  /**
   * Stores the weight for the term with the given number.
   *
   * The weight is kept in $this->_weights under the key $num.
   *
   * @param integer $num
   * @param Zend_Search_Lucene_Search_Weight_Term $weight
   */
  public function setWeight($num, $weight)
  {
    $this->_weights[$num] = $weight;
  }
  /**
   * Builds the phrase Weight object for this query, remembers it in
   * $this->_weight and returns it.
   *
   * @param Zend_Search_Lucene_Interface $reader
   * @return Zend_Search_Lucene_Search_Weight
   */
  public function createWeight(Zend_Search_Lucene_Interface $reader)
  {
    require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php';

    $phraseWeight = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);
    $this->_weight = $phraseWeight;

    return $phraseWeight;
  }
  /**
   * Counts exact occurrences of the phrase in a document (term sequence is fixed).
   *
   * Relies on $this->_termsPositions having been filled for every term
   * (done by execute()).
   *
   * @param integer $docId
   * @return integer Number of positions at which the whole phrase was found
   */
  public function _exactPhraseFreq($docId)
  {
    // Anchor on the term with the fewest positions in this document, so the
    // outer loop below runs as few times as possible.
    $anchorId = null;
    foreach ($this->_terms as $termId => $term) {
      if ($anchorId === null) {
        $anchorId = $termId;
        continue;
      }
      if (count($this->_termsPositions[$termId][$docId]) <
          count($this->_termsPositions[$anchorId][$docId])) {
        $anchorId = $termId;
      }
    }

    $matches = 0;

    foreach ($this->_termsPositions[$anchorId][$docId] as $anchorPos) {
      $phraseFound = true;

      // Every other term must sit at its expected distance from the anchor
      foreach ($this->_terms as $termId => $term) {
        if ($termId == $anchorId) {
          continue;
        }

        $expectedPosition = $anchorPos +
          ($this->_offsets[$termId] - $this->_offsets[$anchorId]);

        if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
          $phraseFound = false;
          break;
        }
      }

      if ($phraseFound) {
        $matches++;
      }
    }

    return $matches;
  }
  /**
   * Computes the frequency of sloppy phrase matches in a document.
   *
   * First enumerates candidate phrases (one position per term), then, for each
   * candidate, finds the smallest total displacement over all shifts in
   * [-slop, slop] and, if it is within the slop, adds the similarity's
   * sloppyFreq() for that distance.
   *
   * @param integer $docId
   * @param Zend_Search_Lucene_Interface $reader
   * @return float
   */
  public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
  {
    $freq = 0;

    // Each candidate maps termId => position; start with one empty candidate
    $candidates = array(array());
    $previousTermId = null;

    foreach ($this->_terms as $termId => $term) {
      // Only candidates that existed before this term are extended
      $baseCount = count($candidates);
      $isFirstPosition = true;

      foreach ($this->_termsPositions[$termId][$docId] as $position) {
        for ($i = 0; $i < $baseCount; $i++) {
          if ($isFirstPosition) {
            // The first position is written into the existing candidates in place
            $candidates[$i][$termId] = $position;
            continue;
          }

          if ($previousTermId !== null) {
            $drift = $position - $candidates[$i][$previousTermId] -
              ($this->_offsets[$termId] - $this->_offsets[$previousTermId]);
            if (abs($drift) > $this->_slop) {
              // Too far from the previous term to fit within the slop
              continue;
            }
          }

          // Further positions fork a copy of the candidate
          $forked = $candidates[$i];
          $forked[$termId] = $position;
          $candidates[] = $forked;
        }

        $isFirstPosition = false;
      }

      $previousTermId = $termId;
    }

    foreach ($candidates as $candidate) {
      $bestDistance = null;

      for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
        $distance = 0;
        $start = reset($candidate) - reset($this->_offsets) + $shift;

        foreach ($this->_terms as $termId => $term) {
          $distance += abs($candidate[$termId] - $this->_offsets[$termId] - $start);

          if ($distance > $this->_slop) {
            // Already over budget for this shift
            break;
          }
        }

        if ($bestDistance === null || $distance < $bestDistance) {
          $bestDistance = $distance;
        }
      }

      if ($bestDistance <= $this->_slop) {
        $freq += $reader->getSimilarity()->sloppyFreq($bestDistance);
      }
    }

    return $freq;
  }
  /**
   * Executes the query in the context of an index reader.
   *
   * Collects the documents and positions of every term, intersects the
   * per-term document sets into $this->_resVector and initializes the weight.
   *
   * @param Zend_Search_Lucene_Interface $reader
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter Not used by this implementation.
   */
  public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
  {
    // With no terms the result is an empty set; otherwise it is built below
    $this->_resVector = (count($this->_terms) == 0) ? array() : null;

    $docSets = array();
    $docSetSizes = array();
    $docSetTermIds = array(); // tie-breaker, so array_multisort never has to compare the arrays themselves

    foreach ($this->_terms as $termId => $term) {
      $docSet = array_flip($reader->termDocs($term));

      $docSets[] = $docSet;
      $docSetSizes[] = count($docSet);
      $docSetTermIds[] = $termId;

      $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    // Smallest document sets first
    array_multisort($docSetSizes, SORT_ASC, SORT_NUMERIC,
                    $docSetTermIds, SORT_ASC, SORT_NUMERIC,
                    $docSets);

    foreach ($docSets as $docSet) {
      if ($this->_resVector === null) {
        $this->_resVector = $docSet;
      } else {
        // Manual key intersection; the original author notes that
        // array_intersect_key() was too slow here.
        $common = array();
        foreach ($this->_resVector as $docId => $value) {
          if (isset($docSet[$docId])) {
            $common[$docId] = $value;
          }
        }
        $this->_resVector = $common;
      }

      if (count($this->_resVector) == 0) {
        // Nothing left to intersect with the remaining terms
        break;
      }
    }

    // No ksort() here: the intersection keeps the element order of the first set.

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
  }
  /**
   * Returns the ids of documents likely matching the query.
   *
   * Document ids are the array keys (performance considerations). The value
   * is whatever execute() last stored in the result vector.
   *
   * @return array
   */
  public function matchedDocs()
  {
    return $this->_resVector;
  }
  /**
   * Scores the specified document.
   *
   * Returns 0 for documents outside the result vector and for documents in
   * which the calculated phrase frequency is zero.
   *
   * @param integer $docId
   * @param Zend_Search_Lucene_Interface $reader
   * @return float
   */
  public function score($docId, Zend_Search_Lucene_Interface $reader)
  {
    if (!isset($this->_resVector[$docId])) {
      return 0;
    }

    $freq = ($this->_slop == 0)
      ? $this->_exactPhraseFreq($docId)
      : $this->_sloppyPhraseFreq($docId, $reader);

    if ($freq == 0) {
      // Included in result, but calculated freq is zero
      return 0;
    }

    $tf = $reader->getSimilarity()->tf($freq);
    $weight = $this->_weight->getValue();
    $norm = $reader->norm($docId, reset($this->_terms)->field);

    return $tf * $weight * $norm * $this->getBoost();
  }
  /**
   * Returns the terms this query searches for.
   *
   * @return array Array of Zend_Search_Lucene_Index_Term objects
   */
  public function getQueryTerms()
  {
    return $this->_terms;
  }
  /**
   * Query specific matches highlighting.
   *
   * Passes the text of every phrase term to the highlighter.
   *
   * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
   */
  protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
  {
    $termTexts = array();

    foreach ($this->_terms as $phraseTerm) {
      array_push($termTexts, $phraseTerm->text);
    }

    $highlighter->highlight($termTexts);
  }
  /**
   * Renders the query as a string: [field:]"term1 term2 ..."[~slop][^boost].
   *
   * Used only for query visualisation, so characters are not escaped.
   *
   * @return string
   */
  public function __toString()
  {
    // Take the first term by position, not by key 0: the constructor preserves
    // caller-supplied keys, so key 0 may be absent or may not come first.
    $firstTerm = reset($this->_terms);

    if ($firstTerm !== false && $firstTerm->field !== null) {
      $query = $firstTerm->field . ':';
    } else {
      $query = '';
    }

    // implode() puts the separator between words whatever the array keys are
    $words = array();
    foreach ($this->_terms as $term) {
      $words[] = $term->text;
    }
    $query .= '"' . implode(' ', $words) . '"';

    if ($this->_slop != 0) {
      $query .= '~' . $this->_slop;
    }

    if ($this->getBoost() != 1) {
      $query .= '^' . round($this->getBoost(), 4);
    }

    return $query;
  }
  493. }