PageRenderTime 84ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/protected/modules/search/vendors/Zend/Search/Lucene.php

https://bitbucket.org/rohitrox/hotc
PHP | 1579 lines | 744 code | 259 blank | 576 comment | 135 complexity | 67f19b24145532735299ca70fc319864 MD5 | raw file
Possible License(s): MIT
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  18. * @license http://framework.zend.com/license/new-bsd New BSD License
  19. * @version $Id: Lucene.php 23775 2011-03-01 17:25:24Z ralph $
  20. */
  21. /** User land classes and interfaces turned on by Zend/Search/Lucene.php file inclusion. */
  22. /** @todo Section should be removed with ZF 2.0 release as obsolete */
  23. /** Zend_Search_Lucene_Document_Html */
  24. require_once 'Zend/Search/Lucene/Document/Html.php';
  25. /** Zend_Search_Lucene_Document_Docx */
  26. require_once 'Zend/Search/Lucene/Document/Docx.php';
  27. /** Zend_Search_Lucene_Document_Pptx */
  28. require_once 'Zend/Search/Lucene/Document/Pptx.php';
  29. /** Zend_Search_Lucene_Document_Xlsx */
  30. require_once 'Zend/Search/Lucene/Document/Xlsx.php';
  31. /** Zend_Search_Lucene_Search_QueryParser */
  32. require_once 'Zend/Search/Lucene/Search/QueryParser.php';
  33. /** Zend_Search_Lucene_Search_QueryHit */
  34. require_once 'Zend/Search/Lucene/Search/QueryHit.php';
  35. /** Zend_Search_Lucene_Analysis_Analyzer */
  36. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  37. /** Zend_Search_Lucene_Search_Query_Term */
  38. require_once 'Zend/Search/Lucene/Search/Query/Term.php';
  39. /** Zend_Search_Lucene_Search_Query_Phrase */
  40. require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
  41. /** Zend_Search_Lucene_Search_Query_MultiTerm */
  42. require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
  43. /** Zend_Search_Lucene_Search_Query_Wildcard */
  44. require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
  45. /** Zend_Search_Lucene_Search_Query_Range */
  46. require_once 'Zend/Search/Lucene/Search/Query/Range.php';
  47. /** Zend_Search_Lucene_Search_Query_Fuzzy */
  48. require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
  49. /** Zend_Search_Lucene_Search_Query_Boolean */
  50. require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
  51. /** Zend_Search_Lucene_Search_Query_Empty */
  52. require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
  53. /** Zend_Search_Lucene_Search_Query_Insignificant */
  54. require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
  55. /** Internally used classes */
  56. /** Zend_Search_Lucene_Interface */
  57. require_once 'Zend/Search/Lucene/Interface.php';
  58. /** Zend_Search_Lucene_Index_SegmentInfo */
  59. require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
  60. /** Zend_Search_Lucene_LockManager */
  61. require_once 'Zend/Search/Lucene/LockManager.php';
  62. /**
  63. * @category Zend
  64. * @package Zend_Search_Lucene
  65. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  66. * @license http://framework.zend.com/license/new-bsd New BSD License
  67. */
  68. class Zend_Search_Lucene implements Zend_Search_Lucene_Interface
  69. {
  70. /**
  71. * Default field name for search
  72. *
  73. * Null means search through all fields
  74. *
  75. * @var string
  76. */
  77. private static $_defaultSearchField = null;
  78. /**
  79. * Result set limit
  80. *
  81. * 0 means no limit
  82. *
  83. * @var integer
  84. */
  85. private static $_resultSetLimit = 0;
  86. /**
  87. * Terms per query limit
  88. *
  89. * 0 means no limit
  90. *
  91. * @var integer
  92. */
  93. private static $_termsPerQueryLimit = 1024;
  94. /**
  95. * File system adapter.
  96. *
  97. * @var Zend_Search_Lucene_Storage_Directory
  98. */
  99. private $_directory = null;
  100. /**
  101. * File system adapter closing option
  102. *
  103. * @var boolean
  104. */
  105. private $_closeDirOnExit = true;
  106. /**
  107. * Writer for this index, not instantiated unless required.
  108. *
  109. * @var Zend_Search_Lucene_Index_Writer
  110. */
  111. private $_writer = null;
  112. /**
  113. * Array of Zend_Search_Lucene_Index_SegmentInfo objects for current version of index.
  114. *
  115. * @var array Zend_Search_Lucene_Index_SegmentInfo
  116. */
  117. private $_segmentInfos = array();
  118. /**
  119. * Number of documents in this index.
  120. *
  121. * @var integer
  122. */
  123. private $_docCount = 0;
  124. /**
  125. * Flag for index changes
  126. *
  127. * @var boolean
  128. */
  129. private $_hasChanges = false;
  130. /**
  131. * Signal, that index is already closed, changes are fixed and resources are cleaned up
  132. *
  133. * @var boolean
  134. */
  135. private $_closed = false;
  136. /**
  137. * Number of references to the index object
  138. *
  139. * @var integer
  140. */
  141. private $_refCount = 0;
  142. /**
  143. * Current segment generation
  144. *
  145. * @var integer
  146. */
  147. private $_generation;
  148. const FORMAT_PRE_2_1 = 0;
  149. const FORMAT_2_1 = 1;
  150. const FORMAT_2_3 = 2;
  151. /**
  152. * Index format version
  153. *
  154. * @var integer
  155. */
  156. private $_formatVersion;
  /**
   * Create a new index and open it.
   *
   * Builds a Zend_Search_Lucene object with the $create flag switched on
   * and hands it back wrapped into a Zend_Search_Lucene_Proxy.
   *
   * @param mixed $directory Index location, passed to the constructor unchanged
   * @return Zend_Search_Lucene_Interface
   */
  public static function create($directory)
  {
      /** Zend_Search_Lucene_Proxy */
      require_once 'Zend/Search/Lucene/Proxy.php';

      $index = new Zend_Search_Lucene($directory, true);

      return new Zend_Search_Lucene_Proxy($index);
  }
  /**
   * Open an existing index.
   *
   * Builds a Zend_Search_Lucene object with the $create flag switched off
   * and hands it back wrapped into a Zend_Search_Lucene_Proxy.
   *
   * @param mixed $directory Index location, passed to the constructor unchanged
   * @return Zend_Search_Lucene_Interface
   */
  public static function open($directory)
  {
      /** Zend_Search_Lucene_Proxy */
      require_once 'Zend/Search/Lucene/Proxy.php';

      $index = new Zend_Search_Lucene($directory, false);

      return new Zend_Search_Lucene_Proxy($index);
  }
  181. /** Generation retrieving counter */
  182. const GENERATION_RETRIEVE_COUNT = 10;
  183. /** Pause between generation retrieving attempts in milliseconds */
  184. const GENERATION_RETRIEVE_PAUSE = 50;
  /**
   * Get current generation number
   *
   * Returns generation number
   * 0 means pre-2.1 index format
   * -1 means there are no segments files.
   *
   * The generation is read from the segments.gen file. The file stores the
   * value twice; a read is accepted only when both copies agree, otherwise
   * the read is retried up to GENERATION_RETRIEVE_COUNT times with a pause
   * of GENERATION_RETRIEVE_PAUSE milliseconds between attempts.
   *
   * Apache Lucene index format documentation mentions segments.gen only as
   * a fallback method; it is used here for performance reasons.
   *
   * @todo check if we can use some modification of Apache Lucene generation determination algorithm
   *       without performance problems
   *
   * @param Zend_Search_Lucene_Storage_Directory $directory
   * @return integer
   * @throws Zend_Search_Lucene_Exception
   */
  public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory)
  {
      require_once 'Zend/Search/Lucene/Exception.php';

      try {
          for ($count = 0; $count < self::GENERATION_RETRIEVE_COUNT; $count++) {
              // Try to get generation file
              $genFile = $directory->getFileObject('segments.gen', false);

              $format = $genFile->readInt();
              if ($format != (int)0xFFFFFFFE) {
                  throw new Zend_Search_Lucene_Exception('Wrong segments.gen file format');
              }

              $gen1 = $genFile->readLong();
              $gen2 = $genFile->readLong();

              if ($gen1 == $gen2) {
                  return $gen1;
              }

              usleep(self::GENERATION_RETRIEVE_PAUSE * 1000);
          }

          // All passes are failed
          throw new Zend_Search_Lucene_Exception('Index is under processing now');
      } catch (Zend_Search_Lucene_Exception $genException) {
          // Anything other than "segments.gen is missing" is a real failure
          if (strpos($genException->getMessage(), 'is not readable') === false) {
              throw new Zend_Search_Lucene_Exception(
                  $genException->getMessage(),
                  $genException->getCode(),
                  $genException
              );
          }
      }

      // segments.gen is not readable: probe for an old style segments file
      try {
          // Only the ability to open the file matters, its handle is not used
          $directory->getFileObject('segments', false);

          // It's pre-2.1 index
          return 0;
      } catch (Zend_Search_Lucene_Exception $segmentsException) {
          if (strpos($segmentsException->getMessage(), 'is not readable') !== false) {
              // Neither segments.gen nor segments exists
              return -1;
          }

          throw new Zend_Search_Lucene_Exception(
              $segmentsException->getMessage(),
              $segmentsException->getCode(),
              $segmentsException
          );
      }
  }
  /**
   * Get generation number associated with this index instance
   *
   * A generation number combined with a document number or a query string
   * always produces the same result when reading the index, so the pair
   * may be used as a key for search result caching.
   *
   * @return integer
   */
  public function getGeneration()
  {
      $generation = $this->_generation;

      return $generation;
  }
  /**
   * Get segments file name
   *
   * Generation 0 maps to the plain 'segments' file; any other generation
   * maps to 'segments_' followed by the generation written in base 36.
   *
   * @param integer $generation
   * @return string
   */
  public static function getSegmentFileName($generation)
  {
      return ($generation == 0)
          ? 'segments'
          : 'segments_' . base_convert($generation, 10, 36);
  }
  /**
   * Get index format version
   *
   * @return integer One of the FORMAT_* class constants
   */
  public function getFormatVersion()
  {
      $formatVersion = $this->_formatVersion;

      return $formatVersion;
  }
  /**
   * Set index format version.
   * Index is converted to this format at the nearest update time
   *
   * @param int $formatVersion One of the FORMAT_* class constants
   * @throws Zend_Search_Lucene_Exception If the value is not a supported format
   */
  public function setFormatVersion($formatVersion)
  {
      $supportedFormats = array(
          self::FORMAT_PRE_2_1,
          self::FORMAT_2_1,
          self::FORMAT_2_3,
      );

      // Loose comparison on purpose: mirrors the != checks it replaces
      if (!in_array($formatVersion, $supportedFormats)) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Unsupported index format');
      }

      $this->_formatVersion = $formatVersion;
  }
  /**
   * Read segments file for pre-2.1 Lucene index format
   *
   * Fills $_segmentInfos and $_docCount from the 'segments' file and selects
   * FORMAT_2_1 as the target format version.
   *
   * @throws Zend_Search_Lucene_Exception If the file format marker is wrong
   */
  private function _readPre21SegmentsFile()
  {
      $file = $this->_directory->getFileObject('segments');

      $marker = $file->readInt();
      if ($marker != (int)0xFFFFFFFF) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Wrong segments file format');
      }

      // Skip index version (long) and segment name counter (int)
      $file->readLong();
      $file->readInt();

      $segmentCount = $file->readInt();

      $this->_docCount = 0;

      // Read one SegmentInfo entry per segment
      for ($index = 0; $index < $segmentCount; $index++) {
          $name = $file->readString();
          $size = $file->readInt();

          $this->_docCount += $size;
          $this->_segmentInfos[$name] = new Zend_Search_Lucene_Index_SegmentInfo(
              $this->_directory,
              $name,
              $size
          );
      }

      // Use 2.1 as a target version. Index will be reorganized at update time.
      $this->_formatVersion = self::FORMAT_2_1;
  }
  /**
   * Read segments file
   *
   * Reads the segments file of the current generation, detects the index
   * format version from the leading marker and fills $_segmentInfos and
   * $_docCount.
   *
   * @throws Zend_Search_Lucene_Exception If the format marker is unsupported
   *                                      or a segment uses separate norm files
   */
  private function _readSegmentsFile()
  {
      $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation));

      $format = $segmentsFile->readInt();

      if ($format == (int)0xFFFFFFFC) {
          $this->_formatVersion = self::FORMAT_2_3;
      } else if ($format == (int)0xFFFFFFFD) {
          $this->_formatVersion = self::FORMAT_2_1;
      } else {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Unsupported segments file format');
      }

      // read version
      $segmentsFile->readLong();

      // read segment name counter
      $segmentsFile->readInt();

      $segments = $segmentsFile->readInt();

      $this->_docCount = 0;

      // read segmentInfos
      for ($count = 0; $count < $segments; $count++) {
          $segName = $segmentsFile->readString();
          $segSize = $segmentsFile->readInt();

          // 2.1+ specific properties
          $delGen = $segmentsFile->readLong();

          // Shared doc store description exists only in the 2.3 format
          $docStoreOptions = null;
          if ($this->_formatVersion == self::FORMAT_2_3) {
              $docStoreOffset = $segmentsFile->readInt();

              if ($docStoreOffset != (int)0xFFFFFFFF) {
                  $docStoreSegment        = $segmentsFile->readString();
                  $docStoreIsCompoundFile = $segmentsFile->readByte();

                  $docStoreOptions = array('offset'     => $docStoreOffset,
                                           'segment'    => $docStoreSegment,
                                           'isCompound' => ($docStoreIsCompoundFile == 1));
              }
          }

          $hasSingleNormFile = $segmentsFile->readByte();
          $numField          = $segmentsFile->readInt();

          $normGens = array();
          if ($numField != (int)0xFFFFFFFF) {
              for ($count1 = 0; $count1 < $numField; $count1++) {
                  $normGens[] = $segmentsFile->readLong();
              }

              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Separate norm files are not supported. Optimize index to use it with Zend_Search_Lucene.');
          }

          $isCompoundByte = $segmentsFile->readByte();

          // Start from "unknown" for every segment so that an unexpected marker
          // byte can neither leave the variable undefined nor reuse the value
          // computed for the previous segment.
          $isCompound = null;
          if ($isCompoundByte == 0xFF) {
              // The segment is not a compound file
              $isCompound = false;
          } else if ($isCompoundByte == 0x01) {
              // The segment is a compound file
              $isCompound = true;
          }
          // 0x00 (status is unknown) and any other value keep $isCompound === null

          $this->_docCount += $segSize;

          $this->_segmentInfos[$segName] =
                          new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                                   $segName,
                                                                   $segSize,
                                                                   $delGen,
                                                                   $docStoreOptions,
                                                                   $hasSingleNormFile,
                                                                   $isCompound);
      }
  }
  /**
   * Opens the index.
   *
   * IndexReader constructor needs Directory as a parameter. It should be
   * a string with a path to the index folder or a Directory object.
   *
   * When $directory is a string, a filesystem directory adapter is created
   * and is closed together with the index; a Directory object passed in by
   * the caller is left open on close.
   *
   * A read lock is obtained for the lifetime of the index object. If
   * construction fails after that point the lock is released before the
   * exception is propagated, because the half-built object can never be
   * closed by the caller.
   *
   * @param Zend_Search_Lucene_Storage_Directory_Filesystem|string $directory
   * @param boolean $create Create a new index (a new generation) before opening
   * @throws Zend_Search_Lucene_Exception
   */
  public function __construct($directory = null, $create = false)
  {
      if ($directory === null) {
          require_once 'Zend/Search/Lucene/Exception.php';
          // NOTE(review): was Zend_Search_Exception; switched to the documented
          // exception class. Assumes Zend_Search_Lucene_Exception extends
          // Zend_Search_Exception so existing catch blocks keep working - confirm.
          throw new Zend_Search_Lucene_Exception('No index directory specified');
      }

      if (is_string($directory)) {
          require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';

          $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
          $this->_closeDirOnExit = true;
      } else {
          $this->_directory      = $directory;
          $this->_closeDirOnExit = false;
      }

      $this->_segmentInfos = array();

      // Mark index as "under processing" to prevent other processes from premature index cleaning
      Zend_Search_Lucene_LockManager::obtainReadLock($this->_directory);

      try {
          $this->_generation = self::getActualGeneration($this->_directory);

          if ($create) {
              require_once 'Zend/Search/Lucene/Exception.php';
              try {
                  Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);
              } catch (Zend_Search_Lucene_Exception $e) {
                  // The read lock is released by the outer catch block
                  if (strpos($e->getMessage(), 'Can\'t obtain exclusive index lock') === false) {
                      throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
                  } else {
                      throw new Zend_Search_Lucene_Exception('Can\'t create index. It\'s under processing now', 0, $e);
                  }
              }

              try {
                  if ($this->_generation == -1) {
                      // Directory doesn't contain existing index, start from 1
                      $this->_generation = 1;
                      $nameCounter = 0;
                  } else {
                      // Directory contains existing index
                      $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation));
                      $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)

                      $nameCounter = $segmentsFile->readInt();
                      $this->_generation++;
                  }

                  require_once 'Zend/Search/Lucene/Index/Writer.php';
                  Zend_Search_Lucene_Index_Writer::createIndex($this->_directory, $this->_generation, $nameCounter);
              } catch (Exception $e) {
                  // Do not keep the index write-locked when creation fails
                  Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
                  throw $e;
              }

              Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
          }

          if ($this->_generation == -1) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Index doesn\'t exists in the specified directory.');
          } else if ($this->_generation == 0) {
              $this->_readPre21SegmentsFile();
          } else {
              $this->_readSegmentsFile();
          }
      } catch (Exception $e) {
          // Construction failed: nobody will be able to close this object,
          // so drop the "under processing" mark here and re-throw unchanged.
          Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory);
          throw $e;
      }
  }
  /**
   * Close current index and free resources
   *
   * Commits pending changes, releases the read lock, closes the directory
   * when this object owns it and drops the references it holds. Calling it
   * again on an already closed index does nothing.
   */
  private function _close()
  {
      if (!$this->_closed) {
          $this->commit();

          // Release "under processing" flag
          Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory);

          if ($this->_closeDirOnExit) {
              $this->_directory->close();
          }

          $this->_directory    = null;
          $this->_writer       = null;
          $this->_segmentInfos = null;

          $this->_closed = true;
      }
      // otherwise: index is already closed and resources are cleaned up
  }
  /**
   * Add reference to the index object
   *
   * Increments the internal reference counter by one.
   *
   * @internal
   */
  public function addReference()
  {
      ++$this->_refCount;
  }
  /**
   * Remove reference from the index object
   *
   * When reference count becomes zero, index is closed and resources are cleaned up
   *
   * @internal
   */
  public function removeReference()
  {
      if (--$this->_refCount == 0) {
          $this->_close();
      }
  }
  /**
   * Object destructor
   *
   * Closes the index; _close() itself ignores repeated calls.
   */
  public function __destruct()
  {
      $this->_close();
  }
  /**
   * Returns an instance of Zend_Search_Lucene_Index_Writer for the index
   *
   * The writer is created on first use from the index directory, the
   * current segment infos and the format version, then reused.
   *
   * @return Zend_Search_Lucene_Index_Writer
   */
  private function _getIndexWriter()
  {
      if ($this->_writer !== null) {
          return $this->_writer;
      }

      require_once 'Zend/Search/Lucene/Index/Writer.php';
      $this->_writer = new Zend_Search_Lucene_Index_Writer(
          $this->_directory,
          $this->_segmentInfos,
          $this->_formatVersion
      );

      return $this->_writer;
  }
  /**
   * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
   *
   * @return Zend_Search_Lucene_Storage_Directory
   */
  public function getDirectory()
  {
      $directory = $this->_directory;

      return $directory;
  }
  /**
   * Returns the total number of documents in this index (including deleted documents).
   *
   * @return integer
   */
  public function count()
  {
      $total = $this->_docCount;

      return $total;
  }
  /**
   * Returns one greater than the largest possible document number.
   * This may be used to, e.g., determine how big to allocate a structure which will have
   * an element for every document number in an index.
   *
   * Delegates to count().
   *
   * @return integer
   */
  public function maxDoc()
  {
      $upperBound = $this->count();

      return $upperBound;
  }
  /**
   * Returns the total number of non-deleted documents in this index.
   *
   * The value is the sum of numDocs() over all segments of the index.
   *
   * @return integer
   */
  public function numDocs()
  {
      $total = 0;

      foreach ($this->_segmentInfos as $segment) {
          $total = $total + $segment->numDocs();
      }

      return $total;
  }
  /**
   * Checks, that document is deleted
   *
   * Pending changes are committed first. The global document id is then
   * mapped to the segment that contains it and to an id local to that segment.
   *
   * @param integer $id Global document id, 0 <= $id < count()
   * @return boolean
   * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
   */
  public function isDeleted($id)
  {
      $this->commit();

      // Negative ids are rejected as well: they would otherwise be resolved
      // against the first segment with a negative local id.
      if ($id < 0 || $id >= $this->_docCount) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
      }

      // Walk the segments until the one whose id range covers $id is found
      $segmentStartId = 0;
      foreach ($this->_segmentInfos as $segmentInfo) {
          if ($segmentStartId + $segmentInfo->count() > $id) {
              break;
          }

          $segmentStartId += $segmentInfo->count();
      }

      return $segmentInfo->isDeleted($id - $segmentStartId);
  }
  /**
   * Set default search field.
   *
   * Null means, that search is performed through all fields by default
   *
   * Default value is null
   *
   * The setting is static, i.e. shared by all index objects.
   *
   * @param string|null $fieldName
   */
  public static function setDefaultSearchField($fieldName)
  {
      self::$_defaultSearchField = $fieldName;
  }
  /**
   * Get default search field.
   *
   * Null means, that search is performed through all fields by default
   *
   * @return string|null
   */
  public static function getDefaultSearchField()
  {
      $fieldName = self::$_defaultSearchField;

      return $fieldName;
  }
  /**
   * Set result set limit.
   *
   * 0 (default) means no limit
   *
   * The setting is static, i.e. shared by all index objects.
   *
   * @param integer $limit
   */
  public static function setResultSetLimit($limit)
  {
      self::$_resultSetLimit = $limit;
  }
  /**
   * Get result set limit.
   *
   * 0 means no limit
   *
   * @return integer
   */
  public static function getResultSetLimit()
  {
      $limit = self::$_resultSetLimit;

      return $limit;
  }
  /**
   * Set terms per query limit.
   *
   * 0 means no limit
   *
   * The setting is static, i.e. shared by all index objects.
   *
   * @param integer $limit
   */
  public static function setTermsPerQueryLimit($limit)
  {
      self::$_termsPerQueryLimit = $limit;
  }
  /**
   * Get terms per query limit.
   *
   * 0 means no limit. The initial value of the setting is 1024.
   *
   * @return integer
   */
  public static function getTermsPerQueryLimit()
  {
      return self::$_termsPerQueryLimit;
  }
  /**
   * Retrieve index maxBufferedDocs option
   *
   * maxBufferedDocs is a minimal number of documents required before
   * the buffered in-memory documents are written into a new Segment
   *
   * Default value is 10
   *
   * The value is read from the index writer, which is created on demand.
   *
   * @return integer
   */
  public function getMaxBufferedDocs()
  {
      $writer = $this->_getIndexWriter();

      return $writer->maxBufferedDocs;
  }
  /**
   * Set index maxBufferedDocs option
   *
   * maxBufferedDocs is a minimal number of documents required before
   * the buffered in-memory documents are written into a new Segment
   *
   * Default value is 10
   *
   * The value is stored on the index writer, which is created on demand.
   *
   * @param integer $maxBufferedDocs
   */
  public function setMaxBufferedDocs($maxBufferedDocs)
  {
      $writer = $this->_getIndexWriter();
      $writer->maxBufferedDocs = $maxBufferedDocs;
  }
  /**
   * Retrieve index maxMergeDocs option
   *
   * maxMergeDocs is a largest number of documents ever merged by addDocument().
   * Small values (e.g., less than 10,000) are best for interactive indexing,
   * as this limits the length of pauses while indexing to a few seconds.
   * Larger values are best for batched indexing and speedier searches.
   *
   * Default value is PHP_INT_MAX
   *
   * The value is read from the index writer, which is created on demand.
   *
   * @return integer
   */
  public function getMaxMergeDocs()
  {
      $writer = $this->_getIndexWriter();

      return $writer->maxMergeDocs;
  }
  /**
   * Set index maxMergeDocs option
   *
   * maxMergeDocs is a largest number of documents ever merged by addDocument().
   * Small values (e.g., less than 10,000) are best for interactive indexing,
   * as this limits the length of pauses while indexing to a few seconds.
   * Larger values are best for batched indexing and speedier searches.
   *
   * Default value is PHP_INT_MAX
   *
   * The value is stored on the index writer, which is created on demand.
   *
   * @param integer $maxMergeDocs
   */
  public function setMaxMergeDocs($maxMergeDocs)
  {
      $writer = $this->_getIndexWriter();
      $writer->maxMergeDocs = $maxMergeDocs;
  }
  /**
   * Retrieve index mergeFactor option
   *
   * mergeFactor determines how often segment indices are merged by addDocument().
   * With smaller values, less RAM is used while indexing,
   * and searches on unoptimized indices are faster,
   * but indexing speed is slower.
   * With larger values, more RAM is used during indexing,
   * and while searches on unoptimized indices are slower,
   * indexing is faster.
   * Thus larger values (> 10) are best for batch index creation,
   * and smaller values (< 10) for indices that are interactively maintained.
   *
   * Default value is 10
   *
   * The value is read from the index writer, which is created on demand.
   *
   * @return integer
   */
  public function getMergeFactor()
  {
      $writer = $this->_getIndexWriter();

      return $writer->mergeFactor;
  }
  /**
   * Set index mergeFactor option
   *
   * mergeFactor determines how often segment indices are merged by addDocument().
   * With smaller values, less RAM is used while indexing,
   * and searches on unoptimized indices are faster,
   * but indexing speed is slower.
   * With larger values, more RAM is used during indexing,
   * and while searches on unoptimized indices are slower,
   * indexing is faster.
   * Thus larger values (> 10) are best for batch index creation,
   * and smaller values (< 10) for indices that are interactively maintained.
   *
   * Default value is 10
   *
   * The value is stored on the index writer, which is created on demand.
   *
   * @param integer $mergeFactor
   */
  public function setMergeFactor($mergeFactor)
  {
      $this->_getIndexWriter()->mergeFactor = $mergeFactor;
  }
  /**
   * Performs a query against the index and returns an array
   * of Zend_Search_Lucene_Search_QueryHit objects.
   * Input is a string or Zend_Search_Lucene_Search_Query.
   *
   * Pending changes are committed before the query is executed. Documents
   * with a zero score are skipped, and collecting stops as soon as the
   * result set limit (if any) is reached. Scores are divided by the top
   * score when the top score is greater than 1.
   *
   * With a single argument hits are sorted by score (descending), then by
   * document id (ascending).
   *
   * Additional arguments define a custom sort order. Each sort key is a
   * field name (or 'score', case-insensitive) optionally followed by one or
   * two integer array_multisort() flags (sort order and/or sort type), e.g.
   * find($query, 'title', SORT_STRING, SORT_DESC). Missing flags default to
   * SORT_ASC / SORT_REGULAR. Document id (ascending) is always used as the
   * last sort key.
   *
   * @param Zend_Search_Lucene_Search_Query|string $query
   * @return array Zend_Search_Lucene_Search_QueryHit
   * @throws Zend_Search_Lucene_Exception
   */
  public function find($query)
  {
      if (is_string($query)) {
          require_once 'Zend/Search/Lucene/Search/QueryParser.php';

          $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
      }

      if (!$query instanceof Zend_Search_Lucene_Search_Query) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
      }

      $this->commit();

      $hits   = array();
      $scores = array();
      $ids    = array();

      $query = $query->rewrite($this)->optimize($this);

      $query->execute($this);

      $topScore = 0;

      /** Zend_Search_Lucene_Search_QueryHit */
      require_once 'Zend/Search/Lucene/Search/QueryHit.php';

      foreach ($query->matchedDocs() as $id => $num) {
          $docScore = $query->score($id, $this);
          if( $docScore != 0 ) {
              $hit = new Zend_Search_Lucene_Search_QueryHit($this);
              $hit->id = $id;
              $hit->score = $docScore;

              $hits[]   = $hit;
              $ids[]    = $id;
              $scores[] = $docScore;

              if ($docScore > $topScore) {
                  $topScore = $docScore;
              }
          }

          if (self::$_resultSetLimit != 0  &&  count($hits) >= self::$_resultSetLimit) {
              break;
          }
      }

      if (count($hits) == 0) {
          // skip sorting, which may cause a error on empty index
          return array();
      }

      if ($topScore > 1) {
          foreach ($hits as $hit) {
              $hit->score /= $topScore;
          }
      }

      if (func_num_args() == 1) {
          // sort by scores
          array_multisort($scores, SORT_DESC, SORT_NUMERIC,
                          $ids,    SORT_ASC,  SORT_NUMERIC,
                          $hits);
      } else {
          // sort by given field names

          $argList    = func_get_args();
          $argCount   = count($argList);
          $fieldNames = $this->getFieldNames();
          $sortArgs   = array();

          // PHP 5.3 now expects all arguments to array_multisort be passed by
          // reference (if it's invoked through call_user_func_array());
          // since constants can't be passed by reference, create some placeholder variables.
          $sortReg    = SORT_REGULAR;
          $sortAsc    = SORT_ASC;
          $sortNum    = SORT_NUMERIC;

          $sortFieldValues = array();

          require_once 'Zend/Search/Lucene/Exception.php';
          for ($count = 1; $count < $argCount; $count++) {
              $fieldName = $argList[$count];

              if (!is_string($fieldName)) {
                  throw new Zend_Search_Lucene_Exception('Field name must be a string.');
              }

              if (strtolower($fieldName) == 'score') {
                  $sortArgs[] = &$scores;
              } else {
                  // Strict comparison: numeric-looking names such as '1e1' and '10'
                  // must not be treated as the same field.
                  if (!in_array($fieldName, $fieldNames, true)) {
                      throw new Zend_Search_Lucene_Exception('Wrong field name.');
                  }

                  if (!isset($sortFieldValues[$fieldName])) {
                      $valuesArray = array();
                      foreach ($hits as $hit) {
                          try {
                              $value = $hit->getDocument()->getFieldValue($fieldName);
                          } catch (Zend_Search_Lucene_Exception $e) {
                              if (strpos($e->getMessage(), 'not found') === false) {
                                  throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
                              } else {
                                  // Document has no such field: sort it as null
                                  $value = null;
                              }
                          }

                          $valuesArray[] = $value;
                      }

                      // Collect loaded values in $sortFieldValues
                      // Required for PHP 5.3 which translates references into values when source
                      // variable is destroyed
                      $sortFieldValues[$fieldName] = $valuesArray;
                  }

                  $sortArgs[] = &$sortFieldValues[$fieldName];
              }

              // Consume up to two integer flags following the field name
              if ($count + 1 < $argCount  &&  is_integer($argList[$count+1])) {
                  $count++;
                  $sortArgs[] = &$argList[$count];

                  if ($count + 1 < $argCount  &&  is_integer($argList[$count+1])) {
                      $count++;
                      $sortArgs[] = &$argList[$count];
                  } else {
                      // Only one flag was given: supply the default for the other one
                      if ($argList[$count] == SORT_ASC  || $argList[$count] == SORT_DESC) {
                          $sortArgs[] = &$sortReg;
                      } else {
                          $sortArgs[] = &$sortAsc;
                      }
                  }
              } else {
                  $sortArgs[] = &$sortAsc;
                  $sortArgs[] = &$sortReg;
              }
          }

          // Sort by id's if values are equal
          $sortArgs[] = &$ids;
          $sortArgs[] = &$sortAsc;
          $sortArgs[] = &$sortNum;

          // Array to be sorted
          $sortArgs[] = &$hits;

          // Do sort
          call_user_func_array('array_multisort', $sortArgs);
      }

      return $hits;
  }
  /**
   * Returns a list of all unique field names that exist in this index.
   *
   * The result is built by merging getFields($indexed) of every segment.
   *
   * @param boolean $indexed Passed through to each segment's getFields()
   * @return array
   */
  public function getFieldNames($indexed = false)
  {
      $names = array();

      foreach ($this->_segmentInfos as $segment) {
          $segmentFields = $segment->getFields($indexed);
          $names         = array_merge($names, $segmentFields);
      }

      return $names;
  }
  /**
   * Returns a Zend_Search_Lucene_Document object for the document
   * number $id in this index.
   *
   * The global id is mapped to its segment; the position of the stored
   * fields is looked up in the segment's '.fdx' file and the fields are then
   * read from the '.fdt' file.
   *
   * @param integer|Zend_Search_Lucene_Search_QueryHit $id
   * @return Zend_Search_Lucene_Document
   * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
   */
  public function getDocument($id)
  {
      if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
          /* @var $id Zend_Search_Lucene_Search_QueryHit */
          $id = $id->id;
      }

      // Negative ids are rejected as well: they would otherwise produce a
      // negative seek offset in the first segment's '.fdx' file.
      if ($id < 0 || $id >= $this->_docCount) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
      }

      // Walk the segments until the one whose id range covers $id is found
      $segmentStartId = 0;
      foreach ($this->_segmentInfos as $segmentInfo) {
          if ($segmentStartId + $segmentInfo->count() > $id) {
              break;
          }

          $segmentStartId += $segmentInfo->count();
      }

      // Each '.fdx' entry is an 8 byte pointer into the '.fdt' file
      $fdxFile = $segmentInfo->openCompoundFile('.fdx');
      $fdxFile->seek(($id-$segmentStartId)*8, SEEK_CUR);
      $fieldValuesPosition = $fdxFile->readLong();

      $fdtFile = $segmentInfo->openCompoundFile('.fdt');
      $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
      $fieldCount = $fdtFile->readVInt();

      $doc = new Zend_Search_Lucene_Document();
      for ($count = 0; $count < $fieldCount; $count++) {
          $fieldNum = $fdtFile->readVInt();
          $bits = $fdtFile->readByte();

          $fieldInfo = $segmentInfo->getField($fieldNum);

          // Bit 2 of the flags byte marks binary content; bit 1 is passed
          // on to the field constructor as is.
          if (!($bits & 2)) { // Text data
              $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                    $fdtFile->readString(),
                                                    'UTF-8',
                                                    true,
                                                    $fieldInfo->isIndexed,
                                                    $bits & 1 );
          } else {            // Binary data
              $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                    $fdtFile->readBinary(),
                                                    '',
                                                    true,
                                                    $fieldInfo->isIndexed,
                                                    $bits & 1,
                                                    true );
          }

          $doc->addField($field);
      }

      return $doc;
  }
  /**
   * Tells whether at least one segment of the index knows the given term.
   *
   * Segments are probed in order via getTermInfo(); probing stops at the
   * first segment that returns a non-null term info.
   *
   * Is used for query optimization.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @return boolean
   */
  public function hasTerm(Zend_Search_Lucene_Index_Term $term)
  {
      $found = false;

      foreach ($this->_segmentInfos as $segment) {
          if ($segment->getTermInfo($term) !== null) {
              $found = true;
              break;
          }
      }

      return $found;
  }
  /**
   * Returns IDs of all documents containing term.
   *
   * Every segment is queried with the global id of its first document as
   * the start offset, and the per-segment lists are concatenated.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
   * @return array
   */
  public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
  {
      $perSegment = array();
      $baseDocId  = 0;

      foreach ($this->_segmentInfos as $segment) {
          $perSegment[] = $segment->termDocs($term, $baseDocId, $docsFilter);
          $baseDocId   += $segment->count();
      }

      $segmentCount = count($perSegment);

      if ($segmentCount == 0) {
          return array();
      }

      if ($segmentCount == 1) {
          // Single segment (optimized index): hand its list back untouched,
          // without the reindexing array_merge() would perform
          return reset($perSegment);
      }

      return call_user_func_array('array_merge', $perSegment);
  }
  /**
   * Returns the ids of all documents containing term.
   *
   * NOTE(review): despite its name, this method has always performed exactly
   * the same work as termDocs() - it calls each segment's termDocs() and
   * returns a plain array of document ids, not a
   * Zend_Search_Lucene_Index_DocsFilter object. That return shape is kept so
   * existing callers are not broken; building a real filter object needs to
   * be confirmed against the DocsFilter / segment info implementation.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
   * @return array
   */
  public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
  {
      // Must be initialized: with no segments the loop below never assigns it
      // and count() would be applied to an undefined variable.
      $subResults        = array();
      $segmentStartDocId = 0;

      foreach ($this->_segmentInfos as $segmentInfo) {
          $subResults[] = $segmentInfo->termDocs($term, $segmentStartDocId, $docsFilter);

          $segmentStartDocId += $segmentInfo->count();
      }

      if (count($subResults) == 0) {
          return array();
      }

      if (count($subResults) == 1) {
          // Index is optimized (only one segment)
          // Do not perform array reindexing
          return reset($subResults);
      }

      return call_user_func_array('array_merge', $subResults);
  }
  /**
   * Returns an array of all term freqs.
   * Result array structure: array(docId => freq, ...)
   *
   * Per-segment results are combined with the array union operator, so an
   * entry already present for a key is never overwritten by a later segment.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
   * @return array
   */
  public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
  {
      $freqs     = array();
      $baseDocId = 0;

      foreach ($this->_segmentInfos as $segment) {
          $freqs     = $freqs + $segment->termFreqs($term, $baseDocId, $docsFilter);
          $baseDocId = $baseDocId + $segment->count();
      }

      return $freqs;
  }
  /**
   * Returns an array of all term positions in the documents.
   * Result array structure: array(docId => array(pos1, pos2, ...), ...)
   *
   * Per-segment results are combined with the array union operator, so an
   * entry already present for a key is never overwritten by a later segment.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
   * @return array
   */
  public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
  {
      $positions = array();
      $baseDocId = 0;

      foreach ($this->_segmentInfos as $segment) {
          $positions = $positions + $segment->termPositions($term, $baseDocId, $docsFilter);
          $baseDocId = $baseDocId + $segment->count();
      }

      return $positions;
  }
  /**
   * Returns the number of documents in this index containing the $term.
   *
   * The value is the sum of the docFreq property of the term info found in
   * each segment; segments that return no term info contribute nothing.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @return integer
   */
  public function docFreq(Zend_Search_Lucene_Index_Term $term)
  {
      $total = 0;

      foreach ($this->_segmentInfos as $segment) {
          $info = $segment->getTermInfo($term);

          if ($info === null) {
              continue;
          }

          $total += $info->docFreq;
      }

      return $total;
  }
  /**
   * Retrieve similarity used by index reader
   *
   * Always hands back the default similarity object; the class file is
   * loaded on demand right before it is used.
   *
   * @return Zend_Search_Lucene_Search_Similarity
   */
  public function getSimilarity()
  {
      /** Zend_Search_Lucene_Search_Similarity */
      require_once 'Zend/Search/Lucene/Search/Similarity.php';

      $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

      return $similarity;
  }
  /**
   * Returns a normalization factor for "field, document" pair.
   *
   * @param integer $id
   * @param string $fieldName
   * @return float|integer|null null if $id is outside the index (negative, or
   *                            not less than the document count), 0 if the
   *                            document is marked as deleted, otherwise the
   *                            value reported by the owning segment
   */
  public function norm($id, $fieldName)
  {
      // Negative ids are out of range as well; without this check they would
      // be forwarded to the first segment as a negative local id.
      if ($id < 0 || $id >= $this->_docCount) {
          return null;
      }

      // Locate the segment holding $id; $segmentStartId ends up as the global
      // id of that segment's first document.
      $segmentStartId = 0;
      foreach ($this->_segmentInfos as $segInfo) {
          if ($segmentStartId + $segInfo->count() > $id) {
              break;
          }

          $segmentStartId += $segInfo->count();
      }

      $localId = $id - $segmentStartId;

      if ($segInfo->isDeleted($localId)) {
          return 0;
      }

      return $segInfo->norm($localId, $fieldName);
  }
  /**
   * Tells whether any segment of this index reports deleted documents.
   *
   * Segments are asked in order; the scan stops at the first one whose
   * hasDeletions() is true.
   *
   * @return boolean
   */
  public function hasDeletions()
  {
      $anyDeleted = false;

      foreach ($this->_segmentInfos as $segment) {
          if ($segment->hasDeletions()) {
              $anyDeleted = true;
              break;
          }
      }

      return $anyDeleted;
  }
  /**
   * Deletes a document from the index.
   * $id is an internal document id
   *
   * The deletion is delegated to the segment that owns the document, and the
   * index is flagged as changed so that commit() has work to do.
   *
   * @param integer|Zend_Search_Lucene_Search_QueryHit $id
   * @throws Zend_Search_Lucene_Exception if $id is out of the range
   *                                      (negative, or not less than the document count)
   */
  public function delete($id)
  {
      if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
          /* @var $id Zend_Search_Lucene_Search_QueryHit */
          $id = $id->id;
      }

      // Negative ids are rejected too: they would otherwise be handed to the
      // first segment as a negative local id.
      if ($id < 0 || $id >= $this->_docCount) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
      }

      // Locate the segment holding $id; $segmentStartId ends up as the global
      // id of that segment's first document.
      $segmentStartId = 0;
      foreach ($this->_segmentInfos as $segmentInfo) {
          if ($segmentStartId + $segmentInfo->count() > $id) {
              break;
          }

          $segmentStartId += $segmentInfo->count();
      }

      $segmentInfo->delete($id - $segmentStartId);

      $this->_hasChanges = true;
  }
  /**
   * Adds a document to this index.
   *
   * The document is passed to the index writer first; only after that call
   * returns are the document counter bumped and the index flagged as changed.
   *
   * @param Zend_Search_Lucene_Document $document
   */
  public function addDocument(Zend_Search_Lucene_Document $document)
  {
      $writer = $this->_getIndexWriter();
      $writer->addDocument($document);

      $this->_docCount  += 1;
      $this->_hasChanges = true;
  }
  /**
   * Recomputes the document counter as the sum of the counts reported by
   * all segments.
   */
  private function _updateDocCount()
  {
      $this->_docCount = 0;

      foreach ($this->_segmentInfos as $segment) {
          $this->_docCount = $this->_docCount + $segment->count();
      }
  }
  /**
   * Commit changes resulting from delete() or undeleteAll() operations.
   *
   * Does nothing unless the index is flagged as changed. Otherwise the index
   * writer commits, the document counter is recomputed and the flag is cleared.
   *
   * @todo undeleteAll processing.
   */
  public function commit()
  {
      if (!$this->_hasChanges) {
          return;
      }

      $writer = $this->_getIndexWriter();
      $writer->commit();

      $this->_updateDocCount();

      $this->_hasChanges = false;
  }
  /**
   * Optimize index.
   *
   * Merges all segments into one. Pending changes are committed first; the
   * writer is only asked to optimize when there is more than one segment or
   * some segment reports deletions.
   */
  public function optimize()
  {
      // Flush pending changes (no-op when nothing has changed)
      $this->commit();

      $needsMerge = count($this->_segmentInfos) > 1 || $this->hasDeletions();
      if (!$needsMerge) {
          return;
      }

      $writer = $this->_getIndexWriter();
      $writer->optimize();

      $this->_updateDocCount();
  }
  /**
   * Returns an array of all terms in this index.
   *
   * The term streams of all non-empty segments are merged through a
   * Zend_Search_Lucene_Index_TermsPriorityQueue. A term is emitted only when
   * the next queue head carries a different key (or the queue is empty), so
   * a term present in several segments is reported once.
   * NOTE(review): this relies on the queue popping segments in term order -
   * confirm against the TermsPriorityQueue implementation.
   *
   * @return array
   */
  public function terms()
  {
      /** Zend_Search_Lucene_Index_TermsPriorityQueue */
      require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';

      $queue = new Zend_Search_Lucene_Index_TermsPriorityQueue();

      foreach ($this->_segmentInfos as $segment) {
          $segment->resetTermsStream();

          // Segments without any term never enter the queue
          if ($segment->currentTerm() === null) {
              continue;
          }

          $queue->put($segment);
      }

      $terms = array();

      while (($segment = $queue->pop()) !== null) {
          $isNewTerm = $queue->top() === null
                    || $queue->top()->currentTerm()->key() != $segment->currentTerm()->key();

          if ($isNewTerm) {
              $terms[] = $segment->currentTerm();
          }

          // Re-queue the segment while it still has terms to deliver
          if ($segment->nextTerm() !== null) {
              $queue->put($segment);
          }
      }

      return $terms;
  }
  /**
   * Terms stream priority queue object.
   *
   * Stays null until resetTermsStream() creates it and is set back to null
   * by closeTermsStream().
   *
   * @var Zend_Search_Lucene_TermStreamsPriorityQueue|null
   */
  private $_termsStream;
  /**
   * Reset terms stream.
   *
   * An existing stream object is reset in place; otherwise a new
   * Zend_Search_Lucene_TermStreamsPriorityQueue is created over the
   * current segments.
   */
  public function resetTermsStream()
  {
      if ($this->_termsStream !== null) {
          $this->_termsStream->resetTermsStream();
          return;
      }

      /** Zend_Search_Lucene_TermStreamsPriorityQueue */
      require_once 'Zend/Search/Lucene/TermStreamsPriorityQueue.php';

      $this->_termsStream = new Zend_Search_Lucene_TermStreamsPriorityQueue($this->_segmentInfos);
  }
  /**
   * Skip terms stream up to the specified term prefix.
   *
   * Prefix contains fully specified field info and portion of searched term.
   *
   * The terms stream is used without a null check, so it has to exist
   * (see resetTermsStream()) before this method is called.
   *
   * @param Zend_Search_Lucene_Index_Term $prefix
   */
  public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
  {
      $stream = $this->_termsStream;

      $stream->skipTo($prefix);
  }
  /**
   * Scans terms dictionary and returns next term.
   *
   * The terms stream is used without a null check, so it has to exist
   * (see resetTermsStream()) before this method is called.
   *
   * @return Zend_Search_Lucene_Index_Term|null
   */
  public function nextTerm()
  {
      $stream = $this->_termsStream;

      return $stream->nextTerm();
  }
  /**
   * Returns term in current position.
   *
   * The terms stream is used without a null check, so it has to exist
   * (see resetTermsStream()) before this method is called.
   *
   * @return Zend_Search_Lucene_Index_Term|null
   */
  public function currentTerm()
  {
      $stream = $this->_termsStream;

      return $stream->currentTerm();
  }
  /**
   * Close terms stream.
   *
   * Should be used for resources clean up if stream is not read up to the end.
   * The stream object is closed and then dropped, so the next
   * resetTermsStream() call creates a new one.
   */
  public function closeTermsStream()
  {
      $stream = $this->_termsStream;
      $stream->closeTermsStream();

      $this->_termsStream = null;
  }
  1310. /*************************************************************************
  1311. @todo UNIMPLEMENTED
  1312. *************************************************************************/
  /**
   * Undeletes all documents currently marked as deleted in this index.
   *
   * Currently a stub: calling it has no effect.
   *
   * @todo Implementation
   */
  public function undeleteAll()
  {
      // Not implemented yet - intentionally empty
  }
  1320. }