PageRenderTime 28ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/Zend/Search/Lucene/Index/SegmentInfo.php

https://bitbucket.org/mercysam/zfs
PHP | 2093 lines | 1186 code | 302 blank | 605 comment | 329 complexity | 55edc0e0cb34764e5082982f899fb874 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Index_DictionaryLoader */
  22. require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';
  23. /** Zend_Search_Lucene_Index_DocsFilter */
  24. require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
  25. /**
  26. * @category Zend
  27. * @package Zend_Search_Lucene
  28. * @subpackage Index
  29. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  30. * @license http://framework.zend.com/license/new-bsd New BSD License
  31. */
  32. class Zend_Search_Lucene_Index_SegmentInfo
  33. {
  34. /**
  35. * "Full scan vs fetch" boundary.
  36. *
  37. * If filter selectivity is less than this value, then full scan is performed
  38. * (since term entries fetching has some additional overhead).
  39. */
  40. const FULL_SCAN_VS_FETCH_BOUNDARY = 5;
  41. /**
  42. * Number of docs in a segment
  43. *
  44. * @var integer
  45. */
  46. private $_docCount;
  47. /**
  48. * Segment name
  49. *
  50. * @var string
  51. */
  52. private $_name;
  53. /**
  54. * Term Dictionary Index
  55. *
  56. * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
  57. * of performance considerations)
  58. * [0] -> $termFieldNum
  59. * [1] -> $termValue
  60. *
  61. * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
  62. *
  63. * @var array
  64. */
  65. private $_termDictionary;
  66. /**
  67. * Term Dictionary Index TermInfos
  68. *
  69. * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
  70. * of performance considerations)
  71. * [0] -> $docFreq
  72. * [1] -> $freqPointer
  73. * [2] -> $proxPointer
  74. * [3] -> $skipOffset
  75. * [4] -> $indexPointer
  76. *
  77. * @var array
  78. */
  79. private $_termDictionaryInfos;
  80. /**
  81. * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
  82. *
  83. * @var array
  84. */
  85. private $_fields;
  86. /**
  87. * Field positions in a dictionary.
  88. * (Term dictionary contains fields ordered by names)
  89. *
  90. * @var array
  91. */
  92. private $_fieldsDicPositions;
  93. /**
  94. * Associative array where the key is the file name and the value is data offset
  95. * in a compound segment file (.cfs).
  96. *
  97. * @var array
  98. */
  99. private $_segFiles;
  100. /**
  101. * Associative array where the key is the file name and the value is file size (.cfs).
  102. *
  103. * @var array
  104. */
  105. private $_segFileSizes;
  106. /**
  107. * Delete file generation number
  108. *
  109. * -2 means autodetect latest delete generation
  110. * -1 means 'there is no delete file'
  111. * 0 means pre-2.1 format delete file
  112. * X specifies used delete file
  113. *
  114. * @var integer
  115. */
  116. private $_delGen;
  117. /**
  118. * Segment has single norms file
  119. *
  120. * If true then one .nrm file is used for all fields
  121. * Otherwise .fN files are used
  122. *
  123. * @var boolean
  124. */
  125. private $_hasSingleNormFile;
  126. /**
  127. * Use compound segment file (*.cfs) to collect all other segment files
  128. * (excluding .del files)
  129. *
  130. * @var boolean
  131. */
  132. private $_isCompound;
  133. /**
  134. * File system adapter.
  135. *
  136. * @var Zend_Search_Lucene_Storage_Directory
  137. */
  138. private $_directory;
  139. /**
  140. * Normalization factors.
  141. * An array fieldName => normVector
  142. * normVector is a binary string.
  143. * Each byte corresponds to an indexed document in a segment and
  144. * encodes normalization factor (float value, encoded by
  145. * Zend_Search_Lucene_Search_Similarity::encodeNorm())
  146. *
  147. * @var array
  148. */
  149. private $_norms = array();
  150. /**
  151. * List of deleted documents.
  152. * bitset if bitset extension is loaded or array otherwise.
  153. *
  154. * @var mixed
  155. */
  156. private $_deleted = null;
  157. /**
  158. * $this->_deleted update flag
  159. *
  160. * @var boolean
  161. */
  162. private $_deletedDirty = false;
  163. /**
  164. * True if segment uses shared doc store
  165. *
  166. * @var boolean
  167. */
  168. private $_usesSharedDocStore;
  169. /*
  170. * Shared doc store options.
  171. * It's an associative array with the following items:
  172. * - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin
  173. * - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files.
  174. * - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file).
  175. */
  176. private $_sharedDocStoreOptions;
  177. /**
  178. * Zend_Search_Lucene_Index_SegmentInfo constructor
  179. *
  180. * @param Zend_Search_Lucene_Storage_Directory $directory
  181. * @param string $name
  182. * @param integer $docCount
  183. * @param integer $delGen
  184. * @param array|null $docStoreOptions
  185. * @param boolean $hasSingleNormFile
  186. * @param boolean $isCompound
  187. */
  public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
  {
      // Plain state taken straight from the arguments.
      $this->_directory         = $directory;
      $this->_name              = $name;
      $this->_docCount          = $docCount;
      $this->_hasSingleNormFile = $hasSingleNormFile;
      $this->_delGen            = $delGen;
      $this->_termDictionary    = null;

      // Shared doc store: when it is kept in a compound '.cfx' file, read that
      // file's directory (offset + name per entry) and derive each entry's size
      // from the distance to the next entry (or to the end of file for the last).
      if ($docStoreOptions !== null) {
          $this->_usesSharedDocStore    = true;
          $this->_sharedDocStoreOptions = $docStoreOptions;

          if ($docStoreOptions['isCompound']) {
              $cfxName    = $docStoreOptions['segment'] . '.cfx';
              $cfxFile    = $this->_directory->getFileObject($cfxName);
              $entryCount = $cfxFile->readVInt();

              $cfxOffsets = array();
              $cfxSizes   = array();
              for ($entry = 0; $entry < $entryCount; $entry++) {
                  $entryOffset = $cfxFile->readLong();
                  if ($entry != 0) {
                      // Size of the previous entry is known only now
                      $cfxSizes[$entryName] = $entryOffset - end($cfxOffsets);
                  }
                  $entryName              = $cfxFile->readString();
                  $cfxOffsets[$entryName] = $entryOffset;
              }
              if ($entry != 0) {
                  // Last entry extends to the end of the '.cfx' file
                  $cfxSizes[$entryName] = $this->_directory->fileLength($cfxName) - $entryOffset;
              }

              $this->_sharedDocStoreOptions['files']     = $cfxOffsets;
              $this->_sharedDocStoreOptions['fileSizes'] = $cfxSizes;
          }
      }

      // Compound flag: use the caller's value, or probe for '<name>.cfs'.
      if ($isCompound === null) {
          // Pre-2.1 segment or 'unknown' compound state
          require_once 'Zend/Search/Lucene/Exception.php';
          try {
              $this->_directory->getFileObject($name . '.cfs');
              $this->_isCompound = true;
          } catch (Zend_Search_Lucene_Exception $e) {
              if (strpos($e->getMessage(), 'is not readable') === false) {
                  // Some other failure - let the caller see it
                  throw $e;
              }
              // Compound file is not found or is not readable
              $this->_isCompound = false;
          }
      } else {
          $this->_isCompound = $isCompound;
      }

      // Directory of the segment's own compound file (same layout as above).
      $this->_segFiles = array();
      if ($this->_isCompound) {
          $cfsName    = $name . '.cfs';
          $cfsFile    = $this->_directory->getFileObject($cfsName);
          $entryCount = $cfsFile->readVInt();

          for ($entry = 0; $entry < $entryCount; $entry++) {
              $entryOffset = $cfsFile->readLong();
              if ($entry != 0) {
                  $this->_segFileSizes[$entryName] = $entryOffset - end($this->_segFiles);
              }
              $entryName                   = $cfsFile->readString();
              $this->_segFiles[$entryName] = $entryOffset;
          }
          if ($entry != 0) {
              $this->_segFileSizes[$entryName] = $this->_directory->fileLength($cfsName) - $entryOffset;
          }
      }

      // Field infos ('.fnm'): one name + flag byte per field.
      $fnmFile       = $this->openCompoundFile('.fnm');
      $fieldsCount   = $fnmFile->readVInt();
      $this->_fields = array();
      $sortedNames   = array();
      $sortedNums    = array();

      for ($fieldNum = 0; $fieldNum < $fieldsCount; $fieldNum++) {
          $fieldName = $fnmFile->readString();
          $flags     = $fnmFile->readByte();

          $this->_fields[$fieldNum] = new Zend_Search_Lucene_Index_FieldInfo(
              $fieldName,
              $flags & 0x01 /* field is indexed */,
              $fieldNum,
              $flags & 0x02 /* termvectors are stored */,
              $flags & 0x10 /* norms are omitted */,
              $flags & 0x20 /* payloads are stored */
          );

          if ($flags & 0x10) {
              // Norms are omitted for the field: use a constant norm (1.0) for every doc
              $this->_norms[$fieldNum] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
          }

          $sortedNums[]  = $fieldNum;
          $sortedNames[] = $fieldName;
      }

      // Sort field numbers by field name; flipping gives fieldNum => position
      array_multisort($sortedNames, SORT_ASC, SORT_REGULAR, $sortedNums);
      $this->_fieldsDicPositions = array_flip($sortedNums);

      if ($this->_delGen == -2) {
          // Invoked from index writer: autodetect current delete file generation
          $this->_delGen = $this->_detectLatestDelGen();
      }

      // Load deletions
      $this->_deleted = $this->_loadDelFile();
  }
  286. /**
  287. * Load deletions file
  288. *
  289. * Returns bitset or an array depending on bitset extension availability
  290. *
  291. * @return mixed
  292. * @throws Zend_Search_Lucene_Exception
  293. */
  private function _loadDelFile()
  {
      // Dispatch on the delete-file generation number.
      if ($this->_delGen == -1) {
          // No delete file for this segment
          return null;
      }

      if ($this->_delGen == 0) {
          // Pre-2.1 format delete file
          return $this->_loadPre21DelFile();
      }

      // 2.1+ format deletions file
      return $this->_load21DelFile();
  }
  308. /**
  309. * Load pre-2.1 deletions file
  310. *
  311. * Returns bitset or an array depending on bitset extension availability
  312. *
  313. * @return mixed
  314. * @throws Zend_Search_Lucene_Exception
  315. */
  private function _loadPre21DelFile()
  {
      // Reads '<segment>.del'. Returns the raw byte string when the bitset
      // extension is loaded, otherwise an array keyed by deleted document id.
      // If the file is not readable, marks the segment as having no delete
      // file (_delGen = -1) and returns null.
      require_once 'Zend/Search/Lucene/Exception.php';

      try {
          // '.del' files are always stored as separate files,
          // the segment compound file is not used for them
          $delFile = $this->_directory->getFileObject($this->_name . '.del');

          // First header int is divided by 8 (rounded up) to get the payload length
          $byteCount = (int) ceil($delFile->readInt() / 8);
          $bitCount  = $delFile->readInt();

          if ($bitCount == 0) {
              // Nothing is read from the file in this case
              $delBytes = '';
          } else {
              $delBytes = $delFile->readBytes($byteCount);
          }

          if (extension_loaded('bitset')) {
              return $delBytes;
          }

          // Decode only the bytes that were actually read. Looping up to
          // $byteCount over an empty (or short) string would index offsets
          // that do not exist and raise "uninitialized string offset" errors.
          $deletions = array();
          $available = strlen($delBytes);
          for ($count = 0; $count < $available; $count++) {
              $byte = ord($delBytes[$count]);
              for ($bit = 0; $bit < 8; $bit++) {
                  if ($byte & (1 << $bit)) {
                      $deletions[$count*8 + $bit] = 1;
                  }
              }
          }

          return $deletions;
      } catch (Zend_Search_Lucene_Exception $e) {
          if (strpos($e->getMessage(), 'is not readable') === false) {
              throw $e;
          }

          // There is no deletion file
          $this->_delGen = -1;

          return null;
      }
  }
  354. /**
  355. * Load 2.1+ format deletions file
  356. *
  357. * Returns bitset or an array depending on bitset extension availability
  358. *
  359. * @return mixed
  360. */
  private function _load21DelFile()
  {
      // Reads '<segment>_<gen in base 36>.del'.
      //
      // Two layouts are handled, selected by the first int of the file:
      //  - 0xFFFFFFFF: a sequence of (gap, non-zero byte) pairs;
      //  - anything else: the same layout as the pre-2.1 file, with the first
      //    int being the size used to compute the payload length.
      //
      // Returns a bitset when the bitset extension is loaded; otherwise an
      // array keyed by deleted document id, or null when that array is empty.
      $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
      $delFile     = $this->_directory->getFileObject($delFileName);
      $useBitset   = extension_loaded('bitset');

      $format = $delFile->readInt();

      if ($format == (int)0xFFFFFFFF) {
          $deletions = $useBitset ? bitset_empty() : array();

          // Two header ints follow the format marker. They are not needed for
          // decoding but must be consumed to reach the first pair.
          $delFile->readInt();
          $delFile->readInt();

          $delFileSize = $this->_directory->fileLength($delFileName);
          $byteNum     = 0;

          // Decode EVERY (gap, byte) pair up to the end of the file. The
          // previous implementation returned from inside the loop body and
          // therefore only ever applied the first pair.
          while ($delFile->tell() < $delFileSize) {
              $dgap        = $delFile->readVInt();
              $nonZeroByte = $delFile->readByte();

              $byteNum += $dgap;

              for ($bit = 0; $bit < 8; $bit++) {
                  if ($nonZeroByte & (1 << $bit)) {
                      if ($useBitset) {
                          bitset_incl($deletions, $byteNum*8 + $bit);
                      } else {
                          $deletions[$byteNum*8 + $bit] = 1;
                      }
                  }
              }
          }

          if ($useBitset) {
              return $deletions;
          }

          return (count($deletions) > 0) ? $deletions : null;
      }

      // $format is actually the size value of the plain layout
      $byteCount = (int) ceil($format / 8);
      $bitCount  = $delFile->readInt();

      if ($bitCount == 0) {
          $delBytes = '';
      } else {
          $delBytes = $delFile->readBytes($byteCount);
      }

      if ($useBitset) {
          return $delBytes;
      }

      // Bound the loop by the bytes actually read so that an empty payload
      // (bitCount == 0) does not index non-existent string offsets.
      $deletions = array();
      $available = strlen($delBytes);
      for ($count = 0; $count < $available; $count++) {
          $byte = ord($delBytes[$count]);
          for ($bit = 0; $bit < 8; $bit++) {
              if ($byte & (1 << $bit)) {
                  $deletions[$count*8 + $bit] = 1;
              }
          }
      }

      return (count($deletions) > 0) ? $deletions : null;
  }
  420. /**
  421. * Opens index file stored within compound index file
  422. *
  423. * @param string $extension
  424. * @param boolean $shareHandler
  425. * @throws Zend_Search_Lucene_Exception
  426. * @return Zend_Search_Lucene_Storage_File
  427. */
  public function openCompoundFile($extension, $shareHandler = true)
  {
      // Returns a file object positioned at the start of the requested
      // segment file. '.fdx'/'.fdt' requests are redirected to the shared doc
      // store when the segment uses one; everything else comes either from a
      // standalone file or from the segment's compound '.cfs' file.
      if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
          $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
          $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';

          if (!$this->_sharedDocStoreOptions['isCompound']) {
              // Shared doc store is kept in standalone files
              $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
              // Skip the '.fdx' entries (8 bytes each) that precede this segment's documents
              $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

              if ($extension == '.fdx') {
                  // '.fdx' file is requested
                  return $fdxFile;
              }

              // '.fdt' file is requested: the '.fdx' entry holds the '.fdt' start offset
              $fdtStartOffset = $fdxFile->readLong();

              $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
              $fdtFile->seek($fdtStartOffset, SEEK_CUR);

              return $fdtFile;
          }

          if (!isset($this->_sharedDocStoreOptions['files'][$fdxFName])) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                                   . $fdxFName . ' file.' );
          }
          if (!isset($this->_sharedDocStoreOptions['files'][$fdtFName])) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                                   . $fdtFName . ' file.' );
          }

          // Open shared docstore segment file
          $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
          // Seek to the start of '.fdx' file within compound file
          $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
          // Seek to the start of current segment documents section
          $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

          if ($extension == '.fdx') {
              // '.fdx' file is requested
              return $cfxFile;
          }

          // '.fdt' file is requested
          $fdtStartOffset = $cfxFile->readLong();

          // Seek to the start of '.fdt' file within compound file
          $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
          // Seek to the start of current segment documents section
          $cfxFile->seek($fdtStartOffset, SEEK_CUR);

          // The '.fdt' data lives inside the '.cfx' file, so the positioned
          // compound file object is the result. (This branch used to return
          // $fdtFile, which is never assigned on this path.)
          return $cfxFile;
      }

      $filename = $this->_name . $extension;

      if (!$this->_isCompound) {
          return $this->_directory->getFileObject($filename, $shareHandler);
      }

      if (!isset($this->_segFiles[$filename])) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                                               . $filename . ' file.' );
      }

      $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
      $file->seek($this->_segFiles[$filename]);

      return $file;
  }
  489. /**
  490. * Get compound file length
  491. *
  492. * @param string $extension
  493. * @return integer
  494. */
  public function compoundFileLength($extension)
  {
      // Length of a segment file, whether it is a standalone file, an entry of
      // the shared doc store, or an entry of the segment's compound file.
      $isDocStoreFile = ($extension == '.fdx' || $extension == '.fdt');

      if ($isDocStoreFile && $this->_usesSharedDocStore) {
          $docStoreName = $this->_sharedDocStoreOptions['segment'] . $extension;

          if (!$this->_sharedDocStoreOptions['isCompound']) {
              // Standalone shared doc store file
              return $this->_directory->fileLength($docStoreName);
          }

          if (isset($this->_sharedDocStoreOptions['fileSizes'][$docStoreName])) {
              return $this->_sharedDocStoreOptions['fileSizes'][$docStoreName];
          }

          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
                                               . $docStoreName . ' file.' );
      }

      $segFileName = $this->_name . $extension;

      // A standalone file takes precedence over the compound file entry
      if ($this->_directory->fileExists($segFileName)) {
          return $this->_directory->fileLength($segFileName);
      }

      if (isset($this->_segFileSizes[$segFileName])) {
          return $this->_segFileSizes[$segFileName];
      }

      require_once 'Zend/Search/Lucene/Exception.php';
      throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                           . $segFileName . ' file.' );
  }
  521. /**
  522. * Returns field index or -1 if field is not found
  523. *
  524. * @param string $fieldName
  525. * @return integer
  526. */
  public function getFieldNum($fieldName)
  {
      // Linear search over the segment's field infos by name.
      foreach ($this->_fields as $fieldInfo) {
          if ($fieldInfo->name != $fieldName) {
              continue;
          }

          return $fieldInfo->number;
      }

      // No field with that name in this segment
      return -1;
  }
  536. /**
  537. * Returns field info for specified field
  538. *
  539. * @param integer $fieldNum
  540. * @return Zend_Search_Lucene_Index_FieldInfo
  541. */
  public function getField($fieldNum)
  {
      // Direct lookup in the field info list built by the constructor.
      $fieldInfo = $this->_fields[$fieldNum];

      return $fieldInfo;
  }
  546. /**
  547. * Returns array of fields.
  548. * if $indexed parameter is true, then returns only indexed fields.
  549. *
  550. * @param boolean $indexed
  551. * @return array
  552. */
  public function getFields($indexed = false)
  {
      // Builds a name => name map of the segment's fields, optionally
      // restricted to indexed fields.
      $names = array();

      foreach ($this->_fields as $fieldInfo) {
          if ($indexed && !$fieldInfo->isIndexed) {
              // Only indexed fields were requested
              continue;
          }

          $names[$fieldInfo->name] = $fieldInfo->name;
      }

      return $names;
  }
  563. /**
  564. * Returns array of FieldInfo objects.
  565. *
  566. * @return array
  567. */
  public function getFieldInfos()
  {
      // The list is indexed by field number (see the constructor).
      $fieldInfos = $this->_fields;

      return $fieldInfos;
  }
  572. /**
  573. * Returns actual deletions file generation number.
  574. *
  575. * @return integer
  576. */
  public function getDelGen()
  {
      // Reflects any autodetection or reset performed while loading deletions.
      $generation = $this->_delGen;

      return $generation;
  }
  581. /**
  582. * Returns the total number of documents in this segment (including deleted documents).
  583. *
  584. * @return integer
  585. */
  public function count()
  {
      // Document count as passed to the constructor; deletions are not subtracted.
      $total = $this->_docCount;

      return $total;
  }
  590. /**
  591. * Returns number of deleted documents.
  592. *
  593. * @return integer
  594. */
  private function _deletedCount()
  {
      // No deletions loaded at all
      if ($this->_deleted === null) {
          return 0;
      }

      // With the bitset extension the deletions are a bitset that has to be
      // expanded before counting; otherwise they already are an array.
      $deletedDocs = extension_loaded('bitset')
                   ? bitset_to_array($this->_deleted)
                   : $this->_deleted;

      return count($deletedDocs);
  }
  606. /**
  607. * Returns the total number of non-deleted documents in this segment.
  608. *
  609. * @return integer
  610. */
  public function numDocs()
  {
      // Subtract deleted documents only when the segment reports deletions.
      if (!$this->hasDeletions()) {
          return $this->_docCount;
      }

      return $this->_docCount - $this->_deletedCount();
  }
  619. /**
  620. * Get field position in a fields dictionary
  621. *
  622. * @param integer $fieldNum
  623. * @return integer
  624. */
  private function _getFieldPosition($fieldNum) {
      // Values missing from the translation table are treated as a 'direct value'
      if (!isset($this->_fieldsDicPositions[$fieldNum])) {
          return $fieldNum;
      }

      return $this->_fieldsDicPositions[$fieldNum];
  }
  630. /**
  631. * Return segment name
  632. *
  633. * @return string
  634. */
  public function getName()
  {
      // Name given to the constructor; used as the base of all segment file names.
      $segmentName = $this->_name;

      return $segmentName;
  }
  639. /**
  640. * TermInfo cache
  641. *
  642. * Size is 1024.
  643. * Numbers are used instead of class constants because of performance considerations
  644. *
  645. * @var array
  646. */
  647. private $_termInfoCache = array();
  private function _cleanUpTermInfoCache()
  {
      // Drop the oldest entries (front of the array) and keep the 768 most
      // recently used ones. Keys and order of the survivors are preserved.
      $cacheSize = count($this->_termInfoCache);

      if ($cacheSize > 768) {
          $this->_termInfoCache = array_slice($this->_termInfoCache, $cacheSize - 768, 768, true);
      } else {
          // With 768 entries or fewer the removal never stops at the
          // 768 mark, so the whole cache ends up cleared.
          $this->_termInfoCache = array();
      }
  }
  659. /**
  660. * Load terms dictionary index
  661. *
  662. * @throws Zend_Search_Lucene_Exception
  663. */
  private function _loadDictionaryIndex()
  {
      // Fills $_termDictionary / $_termDictionaryInfos, preferring the
      // serialized '<segment>.sti' cache and rebuilding it from '.tii' when
      // the cache is absent or unusable.
      $stiFileName = $this->_name . '.sti';

      // Check, if index is already serialized
      if ($this->_directory->fileExists($stiFileName)) {
          // Load serialized dictionary index data
          $stiFile     = $this->_directory->getFileObject($stiFileName);
          $stiFileData = $stiFile->readBytes($this->_directory->fileLength($stiFileName));

          // NOTE(review): unserialize() is applied to raw file contents; this is
          // only safe as long as the index directory is not writable by
          // untrusted parties.
          $unserializedData = @unserialize($stiFileData);

          // Accept the cache only if it really is a (dictionary, infos) pair of
          // arrays. Any other payload (corrupted or foreign file) is ignored
          // and the index is rebuilt from '.tii' below instead of leaving the
          // dictionary unset.
          if (is_array($unserializedData)
              && isset($unserializedData[0], $unserializedData[1])
              && is_array($unserializedData[0])
              && is_array($unserializedData[1])
          ) {
              list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
              return;
          }
      }

      // Load data from .tii file and generate .sti file

      // Prefetch dictionary index data
      $tiiFile     = $this->openCompoundFile('.tii');
      $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

      // Load dictionary index data
      list($this->_termDictionary, $this->_termDictionaryInfos) =
                  Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

      // Store the parsed form so that the next load can skip parsing
      $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
      $stiFile     = $this->_directory->createFile($stiFileName);
      $stiFile->writeBytes($stiFileData);
  }
  688. /**
  689. * Scans terms dictionary and returns term info
  690. *
  691. * @param Zend_Search_Lucene_Index_Term $term
  692. * @return Zend_Search_Lucene_Index_TermInfo
  693. */
  public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
  {
      // Lookup order:
      //  1. the in-memory term info cache;
      //  2. binary search in the dictionary index (exact hit);
      //  3. sequential scan of the '.tis' file starting from the closest
      //     preceding dictionary index entry.
      // Returns null when the field or the term is not present in the segment.
      $termKey = $term->key();
      if (isset($this->_termInfoCache[$termKey])) {
          $termInfo = $this->_termInfoCache[$termKey];

          // Move termInfo to the end of cache
          unset($this->_termInfoCache[$termKey]);
          $this->_termInfoCache[$termKey] = $termInfo;

          return $termInfo;
      }

      if ($this->_termDictionary === null) {
          $this->_loadDictionaryIndex();
      }

      $searchField = $this->getFieldNum($term->field);

      if ($searchField == -1) {
          return null;
      }
      $searchDicField = $this->_getFieldPosition($searchField);

      // search for appropriate value in dictionary
      $lowIndex  = 0;
      $highIndex = count($this->_termDictionary)-1;
      while ($highIndex >= $lowIndex) {
          $mid     = ($highIndex + $lowIndex) >> 1;
          $midTerm = $this->_termDictionary[$mid];

          // Order by dictionary position of the field first, then by term text
          $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
          $delta    = $searchDicField - $fieldNum;
          if ($delta == 0) {
              $delta = strcmp($term->text, $midTerm[1] /* text */);
          }

          if ($delta < 0) {
              $highIndex = $mid-1;
          } elseif ($delta > 0) {
              $lowIndex  = $mid+1;
          } else {
              // We got it!
              $a        = $this->_termDictionaryInfos[$mid];
              $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

              // Put loaded termInfo into cache. The size limit has to be
              // enforced on this path too, otherwise dictionary hits can push
              // the cache past the limit unnoticed.
              $this->_termInfoCache[$termKey] = $termInfo;
              if (count($this->_termInfoCache) >= 1024) {
                  $this->_cleanUpTermInfoCache();
              }

              return $termInfo;
          }
      }

      if ($highIndex == -1) {
          // Term is out of the dictionary range
          return null;
      }

      // Closest dictionary index entry that sorts before the requested term
      $prevPosition = $highIndex;
      $prevTerm     = $this->_termDictionary[$prevPosition];
      $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

      $tisFile   = $this->openCompoundFile('.tis');
      $tiVersion = $tisFile->readInt();
      if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
          $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
      }

      $termCount     = $tisFile->readLong();
      $indexInterval = $tisFile->readInt();
      $skipInterval  = $tisFile->readInt();
      if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
          // Value is not used here, but it has to be consumed from the header
          $maxSkipLevels = $tisFile->readInt();
      }

      // indexPointer is counted from the file start; the header was already read
      $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

      $termValue    = $prevTerm[1] /* text */;
      $termFieldNum = $prevTerm[0] /* field */;
      $freqPointer  = $prevTermInfo[1] /* freqPointer */;
      $proxPointer  = $prevTermInfo[2] /* proxPointer */;

      // Scan forward until a term that is not less than the requested one
      for ($count = $prevPosition*$indexInterval + 1;
           $count <= $termCount &&
           ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
            ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
             strcmp($termValue, $term->text) < 0) );
           $count++) {
          // Term text is stored as (shared prefix length, suffix)
          $termPrefixLength = $tisFile->readVInt();
          $termSuffix       = $tisFile->readString();
          $termFieldNum     = $tisFile->readVInt();
          $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

          // Pointers are stored as deltas from the previous term
          $docFreq      = $tisFile->readVInt();
          $freqPointer += $tisFile->readVInt();
          $proxPointer += $tisFile->readVInt();
          if ($docFreq >= $skipInterval) {
              $skipOffset = $tisFile->readVInt();
          } else {
              $skipOffset = 0;
          }
      }

      // Compare term text byte-wise (as the scan above does). A loose '=='
      // treats numeric-looking strings such as "1000" and "1e3" as equal and
      // could return the info of a different term.
      if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
          $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
      } else {
          $termInfo = null;
      }

      // Put loaded termInfo into cache; '>=' so that the limit is still
      // enforced if the size ever gets past 1024.
      $this->_termInfoCache[$termKey] = $termInfo;
      if (count($this->_termInfoCache) >= 1024) {
          $this->_cleanUpTermInfoCache();
      }

      return $termInfo;
  }
  793. /**
  794. * Returns IDs of all the documents containing term.
  795. *
  796. * @param Zend_Search_Lucene_Index_Term $term
  797. * @param integer $shift
  798. * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
  799. * @return array
  800. */
  public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
  {
      // Decodes the term's doc id list from the '.frq' file. When a docs
      // filter is supplied, the result is restricted to the ids the filter
      // already holds for this segment (if any), and the filter entry for this
      // segment is replaced with the ids returned by this call.
      $termInfo = $this->getTermInfo($term);

      if (!($termInfo instanceof Zend_Search_Lucene_Index_TermInfo)) {
          // Unknown term: nothing matches, so an attached filter becomes empty
          if ($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
              $docsFilter->segmentFilters[$this->_name] = array();
          }

          return array();
      }

      $frqFile = $this->openCompoundFile('.frq');
      $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

      $useFilter  = ($docsFilter !== null);
      $restrictTo = null;   // ids allowed by the filter, null means 'no restriction'

      if ($useFilter) {
          if (!($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter)) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
          }

          if (isset($docsFilter->segmentFilters[$this->_name])) {
              // Filter already has some data for the current segment
              $restrictTo = $docsFilter->segmentFilters[$this->_name];

              if (count($restrictTo) == 0) {
                  // Empty filter: no document can match
                  return array();
              }
          }
      }

      $matchedDocs  = array();
      $newFilterSet = array();
      $docId        = 0;

      for ($entry = 0; $entry < $termInfo->docFreq; $entry++) {
          $docDelta = $frqFile->readVInt();

          if ($docDelta % 2 == 1) {
              // Odd value: no separate freq value follows
              $docId += ($docDelta-1)/2;
          } else {
              $docId += $docDelta/2;
              // Even value: a freq value follows; read it to skip over it
              $frqFile->readVInt();
          }

          if ($restrictTo !== null && !isset($restrictTo[$docId])) {
              // Document is not allowed by the filter
              continue;
          }

          $matchedDocs[] = $shift + $docId;
          if ($useFilter) {
              // 1 is just some constant value, only the key matters
              $newFilterSet[$docId] = 1;
          }
      }

      if ($useFilter) {
          $docsFilter->segmentFilters[$this->_name] = $newFilterSet;
      }

      return $matchedDocs;
  }
  /**
   * Returns term freqs array.
   * Result array structure: array(shifted docId => freq, ...)
   *
   * Each entry of the frequencies stream is a VInt "docDelta". Its upper bits
   * hold the doc id increment; when the lowest bit is set the frequency is 1
   * and is not stored, otherwise the frequency follows as a separate VInt.
   *
   * If a documents filter is given, only documents already present in the
   * filter data for this segment are returned (all documents if the filter has
   * no data for this segment yet), and the filter data for this segment is
   * replaced with the set of returned doc ids.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param integer $shift  value added to each doc id used as a result key
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
   * @return array
   * @throws Zend_Search_Lucene_Exception
   */
  public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
  {
      $termInfo = $this->getTermInfo($term);

      if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
          // Term is not present in this segment: nothing matches, so the filter becomes empty
          if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
              $docsFilter->segmentFilters[$this->_name] = array();
          }
          return array();
      }

      $frqFile = $this->openCompoundFile('.frq');
      $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

      $result = array();
      $docId  = 0;

      if ($docsFilter === null) {
          // No filter: every posting goes into the result
          for ($count = 0; $count < $termInfo->docFreq; $count++) {
              $docDelta = $frqFile->readVInt();
              if ($docDelta % 2 == 1) {
                  $docId += ($docDelta-1)/2;
                  $result[$shift + $docId] = 1;
              } else {
                  $docId += $docDelta/2;
                  $result[$shift + $docId] = $frqFile->readVInt();
              }
          }

          return $result;
      }

      if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
      }

      // null means "filter has no data for this segment yet", i.e. accept every document
      $filter = null;
      if (isset($docsFilter->segmentFilters[$this->_name])) {
          $filter = $docsFilter->segmentFilters[$this->_name];

          // An empty filter can't match anything
          if (count($filter) == 0) {
              return array();
          }
      }

      // The former "fetch" and "full scan" variants were byte-identical, so a single
      // scan is used regardless of filter selectivity.
      $updatedFilterData = array();
      for ($count = 0; $count < $termInfo->docFreq; $count++) {
          $docDelta = $frqFile->readVInt();
          if ($docDelta % 2 == 1) {
              $docId += ($docDelta-1)/2;
              $freq   = 1;
          } else {
              $docId += $docDelta/2;
              // The frequency must be consumed even if the document is filtered out,
              // otherwise it would be decoded as the next docDelta.
              $freq   = $frqFile->readVInt();
          }

          if ($filter === null || isset($filter[$docId])) {
              $result[$shift + $docId]   = $freq;
              $updatedFilterData[$docId] = 1; // value is irrelevant, only the key is used
          }
      }
      $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;

      return $result;
  }
  /**
   * Returns term positions array.
   * Result array structure: array(shifted docId => array(pos1, pos2, ...), ...)
   *
   * Frequencies are read from the '.frq' stream first, then that many position
   * deltas per document are read from the '.prx' stream and accumulated into
   * positions.
   *
   * If a documents filter is given, only documents present in the filter data
   * for this segment are returned (all documents if the filter has no data for
   * this segment yet), and the filter data for this segment is replaced with
   * the set of returned doc ids.
   *
   * @param Zend_Search_Lucene_Index_Term $term
   * @param integer $shift  value added to each doc id used as a result key
   * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
   * @return array
   * @throws Zend_Search_Lucene_Exception
   */
  public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
  {
      $termInfo = $this->getTermInfo($term);

      if (!($termInfo instanceof Zend_Search_Lucene_Index_TermInfo)) {
          // Unknown term: an attached filter is narrowed down to "nothing"
          if ($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
              $docsFilter->segmentFilters[$this->_name] = array();
          }
          return array();
      }

      $frqFile = $this->openCompoundFile('.frq');
      $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

      $filterGiven = ($docsFilter !== null);
      $acceptedIds = null; // null => no restriction on doc ids

      if ($filterGiven) {
          if (!($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter)) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
          }

          if (isset($docsFilter->segmentFilters[$this->_name])) {
              $acceptedIds = $docsFilter->segmentFilters[$this->_name];

              // Nothing can pass an empty filter
              if (count($acceptedIds) == 0) {
                  return array();
              }
          }
      }

      // Pass 1: doc id => frequency
      $docFreqs = array();
      $docId    = 0;
      for ($entry = 0; $entry < $termInfo->docFreq; ++$entry) {
          $delta = $frqFile->readVInt();
          if ($delta % 2 == 1) {
              // Odd delta: frequency is 1 and is not stored
              $docId += ($delta - 1) / 2;
              $docFreqs[$docId] = 1;
          } else {
              $docId += $delta / 2;
              $docFreqs[$docId] = $frqFile->readVInt();
          }
      }

      // Pass 2: positions
      $matchedIds = array();
      $result     = array();

      $prxFile = $this->openCompoundFile('.prx');
      $prxFile->seek($termInfo->proxPointer, SEEK_CUR);

      foreach ($docFreqs as $currentDoc => $occurrences) {
          // Positions are always read, even for filtered-out documents,
          // to keep the .prx stream aligned for the next document
          $position  = 0;
          $positions = array();
          for ($read = 0; $read < $occurrences; ++$read) {
              $position   += $prxFile->readVInt();
              $positions[] = $position;
          }

          if ($acceptedIds === null || isset($acceptedIds[$currentDoc])) {
              $matchedIds[$currentDoc]      = 1; // value is irrelevant, only the key is used
              $result[$shift + $currentDoc] = $positions;
          }
      }

      if ($filterGiven) {
          $docsFilter->segmentFilters[$this->_name] = $matchedIds;
      }

      return $result;
  }
  /**
   * Load normalization factors from an index file
   *
   * With a single norms file ('.nrm') the norms of every indexed field are
   * loaded at once; otherwise only the '.f<fieldNum>' file of the requested
   * field is read.
   *
   * @param integer $fieldNum
   * @throws Zend_Search_Lucene_Exception
   */
  private function _loadNorm($fieldNum)
  {
      if (!$this->_hasSingleNormFile) {
          // One norms file per field
          $fieldNormsFile = $this->openCompoundFile('.f' . $fieldNum);
          $this->_norms[$fieldNum] = $fieldNormsFile->readBytes($this->_docCount);
          return;
      }

      $normsFile = $this->openCompoundFile('.nrm');

      // File starts with a 3-byte signature followed by a format version byte
      $signature     = $normsFile->readBytes(3);
      $formatVersion = $normsFile->readByte();

      if (!($signature == 'NRM' && $formatVersion == (int)0xFF)) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Wrong norms file format.');
      }

      // Norms follow in field order, _docCount bytes per indexed field
      foreach ($this->_fields as $number => $info) {
          if (!$info->isIndexed) {
              continue;
          }
          $this->_norms[$number] = $normsFile->readBytes($this->_docCount);
      }
  }
  /**
   * Returns normalization factor for specified documents
   *
   * Norms for the field are loaded lazily on first access.
   *
   * @param integer $id         internal document id within this segment
   * @param string  $fieldName
   * @return float|null  null if the field is unknown to this segment or is not indexed
   */
  public function norm($id, $fieldName)
  {
      $fieldNum = $this->getFieldNum($fieldName);

      // -1 is the "no such field" value (normVector() guards against it the same way);
      // without this check $this->_fields[-1] would be dereferenced.
      if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
          return null;
      }

      if (!isset($this->_norms[$fieldNum])) {
          $this->_loadNorm($fieldNum);
      }

      // Norms are kept as a byte string, one encoded byte per document
      return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
  }
  /**
   * Returns norm vector, encoded in a byte string
   *
   * For a field that is unknown to this segment or is not indexed, a vector
   * of _docCount copies of the default similarity's encoded zero-length norm
   * is returned.
   *
   * @param string $fieldName
   * @return string
   */
  public function normVector($fieldName)
  {
      $fieldNum = $this->getFieldNum($fieldName);

      $hasStoredNorms = ($fieldNum != -1) && $this->_fields[$fieldNum]->isIndexed;

      if ($hasStoredNorms) {
          // Lazy load on first access
          if (!isset($this->_norms[$fieldNum])) {
              $this->_loadNorm($fieldNum);
          }

          return $this->_norms[$fieldNum];
      }

      $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
      $encodedNorm = $similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) );

      return str_repeat(chr($encodedNorm), $this->_docCount);
  }
  /**
   * Tells whether this segment has deletions data, i.e. whether the internal
   * deletions container is set (it stays null until deletions are loaded or
   * a document is deleted).
   *
   * @return boolean
   */
  public function hasDeletions()
  {
      return !($this->_deleted === null);
  }
  /**
   * Tells whether norms of this segment are stored in one single norms file.
   *
   * The internal flag is normalized to a strict boolean.
   *
   * @return boolean
   */
  public function hasSingleNormFile()
  {
      return (bool) $this->_hasSingleNormFile;
  }
  /**
   * Returns true if segment is stored using compound segment file.
   *
   * The internal flag is returned as is, without conversion.
   *
   * @return boolean
   */
  public function isCompound()
  {
      return $this->_isCompound;
  }
  /**
   * Marks a document of this segment as deleted.
   *
   * The change is kept in memory and flagged as dirty; it is written out by
   * writeChanges(). Deletions are held in a bitset if the 'bitset' extension
   * is loaded, otherwise in an array keyed by document id.
   *
   * @param integer $id  internal document id
   */
  public function delete($id)
  {
      $this->_deletedDirty = true;

      if (!extension_loaded('bitset')) {
          // Plain PHP fallback: array(docId => 1, ...)
          if ($this->_deleted === null) {
              $this->_deleted = array();
          }
          $this->_deleted[$id] = 1;

          return;
      }

      if ($this->_deleted === null) {
          $this->_deleted = bitset_empty($id);
      }
      bitset_incl($this->_deleted, $id);
  }
  /**
   * Checks whether a document is marked as deleted.
   *
   * @param integer $id  internal document id
   * @return boolean
   */
  public function isDeleted($id)
  {
      // No deletions data at all
      if ($this->_deleted === null) {
          return false;
      }

      // Storage format depends on 'bitset' extension availability (see delete())
      return extension_loaded('bitset')
           ? bitset_in($this->_deleted, $id)
           : isset($this->_deleted[$id]);
  }
  /**
   * Detect latest delete generation
   *
   * Scans the directory file list for '<segment_name>.del' (generation 0) and
   * '<segment_name>_<gen>.del' files, where <gen> is a base-36 number, and
   * returns the highest generation found.
   *
   * Is actually used from writeChanges() method or from the constructor if it's invoked from
   * Index writer. In both cases index write lock is already obtained, so we shouldn't care
   * about it
   *
   * @return integer  latest generation number, or -1 if the segment has no deletions file
   */
  private function _detectLatestDelGen()
  {
      $plainDelFile = $this->_name . '.del';

      // Built once, with the segment name quoted so it is always matched literally
      $generationPattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';

      $delFileList = array();
      foreach ($this->_directory->fileList() as $file) {
          if ($file == $plainDelFile) {
              // Matches <segment_name>.del file name
              $delFileList[] = 0;
          } else if (preg_match($generationPattern, $file, $matches)) {
              // Matches <segment_name>_NNN.del file names
              $delFileList[] = (int)base_convert($matches[1], 36, 10);
          }
      }

      if (count($delFileList) == 0) {
          // There is no deletions file for current segment in the directory
          return -1;
      }

      // There are some deletions files for current segment in the directory,
      // the highest number is the latest generation
      return max($delFileList);
  }
  /**
   * Write changes if it's necessary.
   *
   * Without local deletions the in-memory state is only refreshed from a newer
   * deletions file, if there is one. With local deletions they are merged with
   * the latest deletions file (if it is newer than the loaded one) and written
   * into a new '<segment_name>_<generation>.del' file.
   *
   * This method must be invoked only from the Writer _updateSegments() method,
   * so index Write lock has to be already obtained.
   *
   * @internal
   * @throws Zend_Search_Lucene_Exception
   */
  public function writeChanges()
  {
      $latestDelGen = $this->_detectLatestDelGen();
      $useBitset    = extension_loaded('bitset');

      if (!$this->_deletedDirty) {
          // Current process made no deletions
          if ($latestDelGen == $this->_delGen) {
              // ...and nobody else did either
              return;
          }

          if ($latestDelGen > $this->_delGen) {
              // Some concurrent process has updated the delete file: reload it
              $this->_delGen  = $latestDelGen;
              $this->_deleted = $this->_loadDelFile();
              return;
          }

          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
      }

      if ($latestDelGen > $this->_delGen) {
          // Combine own deletions with the ones from the latest deletions file
          $this->_delGen  = $latestDelGen;
          $latestDeletions = $this->_loadDelFile();

          if ($useBitset) {
              $this->_deleted = bitset_union($this->_deleted, $latestDeletions);
          } else {
              $this->_deleted += $latestDeletions;
          }
      }

      if ($useBitset) {
          $delBytes = $this->_deleted;
          $bitCount = count(bitset_to_array($delBytes));
      } else {
          // Pack the deleted doc ids into a bit string, 8 documents per byte, lowest bit first
          $byteCount = floor($this->_docCount/8)+1;
          $delBytes  = str_repeat(chr(0), $byteCount);

          for ($byteIndex = 0; $byteIndex < $byteCount; $byteIndex++) {
              $firstDoc = $byteIndex*8;
              $packed   = 0;
              for ($bit = 0; $bit < 8; $bit++) {
                  if (isset($this->_deleted[$firstDoc + $bit])) {
                      $packed |= (1<<$bit);
                  }
              }
              $delBytes[$byteIndex] = chr($packed);
          }

          $bitCount = count($this->_deleted);
      }

      if ($this->_delGen != -1) {
          // Next generation
          $this->_delGen++;
      } else {
          // First deletions file of this segment
          $this->_delGen = 1;
      }

      $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
      $delFile     = $this->_directory->createFile($delFileName);

      $delFile->writeInt($this->_docCount);
      $delFile->writeInt($bitCount);
      $delFile->writeBytes($delBytes);

      $this->_deletedDirty = false;
  }
  /**
   * Term Dictionary File object for stream like terms reading
   *
   * Is null while the terms stream is closed.
   *
   * @var Zend_Search_Lucene_Storage_File
   */
  private $_tisFile = null;

  /**
   * Actual offset of the .tis file data
   * (file position right after the file is opened by reset())
   *
   * @var integer
   */
  private $_tisFileOffset;

  /**
   * Frequencies File object for stream like terms reading
   *
   * Is opened only in SM_FULL_INFO and SM_MERGE_INFO scan modes.
   *
   * @var Zend_Search_Lucene_Storage_File
   */
  private $_frqFile = null;

  /**
   * Actual offset of the .frq file data
   * (file position right after the file is opened by reset())
   *
   * @var integer
   */
  private $_frqFileOffset;

  /**
   * Positions File object for stream like terms reading
   *
   * Is opened only in SM_FULL_INFO and SM_MERGE_INFO scan modes.
   *
   * @var Zend_Search_Lucene_Storage_File
   */
  private $_prxFile = null;

  /**
   * Actual offset of the .prx file data
   * (file position right after the file is opened by reset())
   *
   * @var integer
   */
  private $_prxFileOffset;

  /**
   * Number of terms which are still not read from the terms stream
   * (is decreased by nextTerm())
   *
   * @var integer
   */
  private $_termCount = 0;

  /**
   * Overall number of terms in term stream
   *
   * @var integer
   */
  private $_termNum = 0;

  /**
   * Segment index interval
   *
   * @var integer
   */
  private $_indexInterval;

  /**
   * Segment skip interval
   *
   * @var integer
   */
  private $_skipInterval;

  /**
   * Last TermInfo in a terms stream
   *
   * @var Zend_Search_Lucene_Index_TermInfo
   */
  private $_lastTermInfo = null;

  /**
   * Last Term in a terms stream
   *
   * @var Zend_Search_Lucene_Index_Term
   */
  private $_lastTerm = null;

  /**
   * Map of the document IDs
   * Used to get new docID after removing deleted documents.
   * It's not very effective from memory usage point of view,
   * but it's much faster than other methods
   *
   * @var array|null
   */
  private $_docMap = null;

  /**
   * An array of all term positions in the documents.
   * Array structure: array( docId => array( pos1, pos2, ...), ...)
   *
   * Is set to null if term positions loading has to be skipped
   *
   * @var array|null
   */
  private $_lastTermPositions;

  /**
   * Terms scan mode
   *
   * Values:
   *
   * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
   * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
   * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
   *                       document numbers are compacted (shifted if segment has deleted documents)
   *
   * @var integer
   */
  private $_termsScanMode;

  /** Scan modes */
  const SM_TERMS_ONLY = 0; // terms are scanned, no additional info is retrieved
  const SM_FULL_INFO = 1; // terms are scanned, frequency and position info is retrieved
  const SM_MERGE_INFO = 2; // terms are scanned, frequency and position info is retrieved
  // document numbers are compacted (shifted if segment contains deleted documents)
  /**
   * Reset terms stream
   *
   * Reopens the terms dictionary, reads its header and positions the stream
   * on the first term.
   *
   * $startId - id for the first document
   * $mode    - one of the self::SM_* scan modes. In SM_FULL_INFO and SM_MERGE_INFO
   *            modes frequency and positions files are opened too and the document
   *            map is built; in SM_MERGE_INFO mode deleted documents are removed
   *            from the numbering.
   *
   * Returns start document id for the next segment
   *
   * @param integer $startId
   * @param integer $mode
   * @throws Zend_Search_Lucene_Exception
   * @return integer
   */
  public function reset($startId = 0, $mode = self::SM_TERMS_ONLY)
  {
      // Drop previously opened dictionary file object before reopening
      $this->_tisFile = null;

      $this->_tisFile       = $this->openCompoundFile('.tis', false);
      $this->_tisFileOffset = $this->_tisFile->tell();

      $tiVersion = $this->_tisFile->readInt();
      if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
          $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
          require_once 'Zend/Search/Lucene/Exception.php';
          throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
      }

      $this->_termCount     =
      $this->_termNum       = $this->_tisFile->readLong(); // Read terms count
      $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
      $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
      if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
          // maxSkipLevels value is not used, but has to be read to advance the stream
          $this->_tisFile->readInt();
      }

      $this->_frqFile = null;
      $this->_prxFile = null;

      $this->_docMap = array();

      $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
      $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
      $this->_lastTermPositions = null;

      $this->_termsScanMode = $mode;

      switch ($mode) {
          case self::SM_TERMS_ONLY:
              // Do nothing
              break;

          case self::SM_FULL_INFO:
              // break intentionally omitted
          case self::SM_MERGE_INFO:
              $this->_frqFile       = $this->openCompoundFile('.frq', false);
              $this->_frqFileOffset = $this->_frqFile->tell();

              $this->_prxFile       = $this->openCompoundFile('.prx', false);
              $this->_prxFileOffset = $this->_prxFile->tell();

              // Map ids of non-deleted documents; in merge mode the new ids are consecutive
              for ($count = 0; $count < $this->_docCount; $count++) {
                  if (!$this->isDeleted($count)) {
                      $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count);
                  }
              }
              break;

          default:
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
      }

      // Has to be taken before nextTerm() is invoked: nextTerm() sets the document map
      // to null if the terms stream is empty
      $mappedDocsCount = count($this->_docMap);

      $this->nextTerm();

      return $startId + (($mode == self::SM_MERGE_INFO) ? $mappedDocsCount : $this->_docCount);
  }
  /**
   * Skip terms stream up to specified term prefix.
   *
   * Prefix contains fully specified field info and portion of searched term
   *
   * The dictionary index is binary searched for the closest index entry, the
   * stream is repositioned onto it and then scanned forward term by term
   * until a term which is equal to or greater than the prefix is reached.
   * The stream is closed (current term becomes null) if the field is not
   * present in this segment or the prefix is out of the dictionary range.
   *
   * @param Zend_Search_Lucene_Index_Term $prefix
   * @throws Zend_Search_Lucene_Exception
   */
  public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
  {
      if ($this->_termDictionary === null) {
          $this->_loadDictionaryIndex();
      }

      $searchField = $this->getFieldNum($prefix->field);

      if ($searchField == -1) {
          /**
           * Field is not presented in this segment
           * Go to the end of dictionary
           */
          $this->_tisFile = null;
          $this->_frqFile = null;
          $this->_prxFile = null;

          $this->_lastTerm          = null;
          $this->_lastTermInfo      = null;
          $this->_lastTermPositions = null;

          return;
      }
      $searchDicField = $this->_getFieldPosition($searchField);

      // search for appropriate value in dictionary
      $lowIndex  = 0;
      $highIndex = count($this->_termDictionary)-1;
      while ($highIndex >= $lowIndex) {
          $mid     = ($highIndex + $lowIndex) >> 1;
          $midTerm = $this->_termDictionary[$mid];

          $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
          $delta    = $searchDicField - $fieldNum;
          if ($delta == 0) {
              $delta = strcmp($prefix->text, $midTerm[1] /* text */);
          }

          if ($delta < 0) {
              $highIndex = $mid-1;
          } elseif ($delta > 0) {
              $lowIndex  = $mid+1;
          } else {
              // We have reached term we are looking for.
              // $highIndex is used as the start position below, so it has to point to
              // the matched entry; otherwise the stream would start behind the term.
              $highIndex = $mid;
              break;
          }
      }

      if ($highIndex == -1) {
          // Term is out of the dictionary range
          $this->_tisFile = null;
          $this->_frqFile = null;
          $this->_prxFile = null;

          $this->_lastTerm          = null;
          $this->_lastTermInfo      = null;
          $this->_lastTermPositions = null;

          return;
      }

      // Closest dictionary index entry which is not greater than the prefix
      $prevPosition = $highIndex;
      $prevTerm     = $this->_termDictionary[$prevPosition];
      $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

      if ($this->_tisFile === null) {
          // The end of terms stream is reached and terms dictionary file is closed
          // Perform mini-reset operation
          $this->_tisFile = $this->openCompoundFile('.tis', false);

          if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
              $this->_frqFile = $this->openCompoundFile('.frq', false);
              $this->_prxFile = $this->openCompoundFile('.prx', false);
          }
      }
      $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

      $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                               ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
      $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                   $prevTermInfo[1] /* freqPointer */,
                                                                   $prevTermInfo[2] /* proxPointer */,
                                                                   $prevTermInfo[3] /* skipOffset */);
      // Number of terms left in the stream after the index entry
      $this->_termCount = $this->_termNum - $prevPosition*$this->_indexInterval;

      if ($highIndex == 0) {
          // skip start entry
          $this->nextTerm();
      } else if ($prefix->field == $this->_lastTerm->field  &&  $prefix->text == $this->_lastTerm->text) {
          // We got exact match in the dictionary index

          if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
              // Load positions of the matched term the same way nextTerm() does
              $this->_lastTermPositions = array();

              $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
              $freqs = array();   $docId = 0;
              for ($count = 0; $count < $this->_lastTermInfo->docFreq; $count++) {
                  $docDelta = $this->_frqFile->readVInt();
                  if ($docDelta % 2 == 1) {
                      // Odd delta: frequency is 1 and is not stored
                      $docId += ($docDelta-1)/2;
                      $freqs[ $docId ] = 1;
                  } else {
                      $docId += $docDelta/2;
                      $freqs[ $docId ] = $this->_frqFile->readVInt();
                  }
              }

              $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
              foreach ($freqs as $docId => $freq) {
                  $termPosition = 0;  $positions = array();

                  for ($count = 0; $count < $freq; $count++) {
                      $termPosition += $this->_prxFile->readVInt();
                      $positions[] = $termPosition;
                  }

                  // Documents missing in the document map are skipped
                  if (isset($this->_docMap[$docId])) {
                      $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
                  }
              }
          }

          return;
      }

      // Search term matching specified prefix
      while ($this->_lastTerm !== null) {
          if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
               ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
              // Current term matches or is greater than the pattern
              return;
          }

          $this->nextTerm();
      }
  }
  /**
   * Scans terms dictionary and returns next term
   *
   * The term text is stored as the length of the prefix shared with the
   * previous term plus a suffix; frequency and positions pointers are stored
   * as increments to the previous term's pointers. In SM_FULL_INFO and
   * SM_MERGE_INFO modes positions of the term are loaded as well, keyed by
   * mapped document ids.
   *
   * Once the stream is exhausted, stream files and state are released and
   * null is returned.
   *
   * @return Zend_Search_Lucene_Index_Term|null
   */
  public function nextTerm()
  {
      $streamFinished = ($this->_tisFile === null  ||  $this->_termCount == 0);

      if ($streamFinished) {
          $this->_lastTerm          = null;
          $this->_lastTermInfo      = null;
          $this->_lastTermPositions = null;
          $this->_docMap            = null;

          // may be necessary for "empty" segment
          $this->_tisFile = null;
          $this->_frqFile = null;
          $this->_prxFile = null;

          return null;
      }

      // Term itself
      $sharedPrefixLength = $this->_tisFile->readVInt();
      $suffix             = $this->_tisFile->readString();
      $fieldNumber        = $this->_tisFile->readVInt();

      $text = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;
      $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNumber]->name);

      // Term info: pointers are increments to the previous term's pointers
      $docFreq     = $this->_tisFile->readVInt();
      $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
      $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();

      // Skip offset is stored only for terms with docFreq not less than the skip interval
      $skipOffset = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

      $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

      $positionsNeeded = ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO);

      if ($positionsNeeded) {
          $this->_lastTermPositions = array();

          // doc id => frequency
          $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
          $docFreqs = array();
          $docId    = 0;
          for ($entry = 0; $entry < $this->_lastTermInfo->docFreq; ++$entry) {
              $delta = $this->_frqFile->readVInt();
              if ($delta % 2 == 1) {
                  // Odd delta: frequency is 1 and is not stored
                  $docId += ($delta - 1) / 2;
                  $docFreqs[$docId] = 1;
              } else {
                  $docId += $delta / 2;
                  $docFreqs[$docId] = $this->_frqFile->readVInt();
              }
          }

          // doc id => positions
          $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
          foreach ($docFreqs as $currentDoc => $occurrences) {
              $position  = 0;
              $positions = array();

              for ($read = 0; $read < $occurrences; ++$read) {
                  $position   += $this->_prxFile->readVInt();
                  $positions[] = $position;
              }

              // Documents missing in the document map are skipped
              if (isset($this->_docMap[$currentDoc])) {
                  $this->_lastTermPositions[$this->_docMap[$currentDoc]] = $positions;
              }
          }
      }

      $this->_termCount--;
      if ($this->_termCount == 0) {
          // That was the last term: release stream files
          $this->_tisFile = null;
          $this->_frqFile = null;
          $this->_prxFile = null;
      }

      return $this->_lastTerm;
  }
  /**
   * Close terms stream
   *
   * Drops the references to the dictionary, frequency and position files and
   * clears the current term state. Intended for resource clean up when the
   * stream has not been read up to the end.
   */
  public function closeTermsStream()
  {
      // File handles
      $this->_tisFile = $this->_frqFile = $this->_prxFile = null;

      // Current position within the stream
      $this->_lastTerm = $this->_lastTermInfo = $this->_lastTermPositions = null;

      // Document map used while reading term positions
      $this->_docMap = null;
  }
  /**
   * Returns the term the stream is currently positioned on.
   *
   * The value is null once the terms stream has been closed.
   *
   * @return Zend_Search_Lucene_Index_Term|null
   */
  public function currentTerm()
  {
      $term = $this->_lastTerm;

      return $term;
  }
  /**
   * Returns the positions of the current term in the documents.
   *
   * Result structure: array( docId => array( pos1, pos2, ...), ...)
   * The value is null once the terms stream has been closed.
   *
   * @return array
   */
  public function currentTermPositions()
  {
      $positions = $this->_lastTermPositions;

      return $positions;
  }
  1791. }