PageRenderTime 50ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/website/library/Zend/Search/Lucene/Index/SegmentWriter.php

https://bitbucket.org/efdac/e-forest_platform
PHP | 634 lines | 278 code | 88 blank | 268 comment | 31 complexity | 32643bc91dfa4345b6188faefab731f3 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: SegmentWriter.php 20096 2010-01-06 02:05:09Z bkarwin $
  21. */
  22. /** Zend_Search_Lucene_Index_FieldInfo */
  23. require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
  24. /** Zend_Search_Lucene_Index_Term */
  25. require_once 'Zend/Search/Lucene/Index/Term.php';
  26. /** Zend_Search_Lucene_Index_TermInfo */
  27. require_once 'Zend/Search/Lucene/Index/TermInfo.php';
  28. /**
  29. * @category Zend
  30. * @package Zend_Search_Lucene
  31. * @subpackage Index
  32. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  33. * @license http://framework.zend.com/license/new-bsd New BSD License
  34. */
  35. abstract class Zend_Search_Lucene_Index_SegmentWriter
  36. {
  37. /**
  38. * Expert: The fraction of terms in the "dictionary" which should be stored
  39. * in RAM. Smaller values use more memory, but make searching slightly
  40. * faster, while larger values use less memory and make searching slightly
  41. * slower. Searching is typically not dominated by dictionary lookup, so
  42. * tweaking this is rarely useful.
  43. *
  44. * @var integer
  45. */
  46. public static $indexInterval = 128;
  47. /**
  48. * Expert: The fraction of TermDocs entries stored in skip tables.
  49. * Larger values result in smaller indexes, greater acceleration, but fewer
  50. * accelerable cases, while smaller values result in bigger indexes,
  51. * less acceleration and more
  52. * accelerable cases. More detailed experiments would be useful here.
  53. *
  54. * 0x7FFFFFFF indicates that we don't use skip data
  55. *
  56. * Note: not used in current implementation
  57. *
  58. * @var integer
  59. */
  60. public static $skipInterval = 0x7FFFFFFF;
  61. /**
  62. * Expert: The maximum number of skip levels. Smaller values result in
  63. * slightly smaller indexes, but slower skipping in big posting lists.
  64. *
  65. * 0 indicates that we don't use skip data
  66. *
  67. * Note: not used in current implementation
  68. *
  69. * @var integer
  70. */
  71. public static $maxSkipLevels = 0;
  72. /**
  73. * Number of docs in a segment
  74. *
  75. * @var integer
  76. */
  77. protected $_docCount = 0;
  78. /**
  79. * Segment name
  80. *
  81. * @var string
  82. */
  83. protected $_name;
  84. /**
  85. * File system adapter.
  86. *
  87. * @var Zend_Search_Lucene_Storage_Directory
  88. */
  89. protected $_directory;
  90. /**
  91. * List of the index files.
  92. * Used for automatic compound file generation
  93. *
  94. * @var unknown_type
  95. */
  96. protected $_files = array();
  97. /**
  98. * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
  99. *
  100. * @var array
  101. */
  102. protected $_fields = array();
  103. /**
  104. * Normalization factors.
  105. * An array fieldName => normVector
  106. * normVector is a binary string.
  107. * Each byte corresponds to an indexed document in a segment and
  108. * encodes normalization factor (float value, encoded by
  109. * Zend_Search_Lucene_Search_Similarity::encodeNorm())
  110. *
  111. * @var array
  112. */
  113. protected $_norms = array();
  114. /**
  115. * '.fdx' file - Stored Fields, the field index.
  116. *
  117. * @var Zend_Search_Lucene_Storage_File
  118. */
  119. protected $_fdxFile = null;
  120. /**
  121. * '.fdt' file - Stored Fields, the field data.
  122. *
  123. * @var Zend_Search_Lucene_Storage_File
  124. */
  125. protected $_fdtFile = null;
  126. /**
  127. * Object constructor.
  128. *
  129. * @param Zend_Search_Lucene_Storage_Directory $directory
  130. * @param string $name
  131. */
  132. public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
  133. {
  134. $this->_directory = $directory;
  135. $this->_name = $name;
  136. }
  137. /**
  138. * Add field to the segment
  139. *
  140. * Returns actual field number
  141. *
  142. * @param Zend_Search_Lucene_Field $field
  143. * @return integer
  144. */
  145. public function addField(Zend_Search_Lucene_Field $field)
  146. {
  147. if (!isset($this->_fields[$field->name])) {
  148. $fieldNumber = count($this->_fields);
  149. $this->_fields[$field->name] =
  150. new Zend_Search_Lucene_Index_FieldInfo($field->name,
  151. $field->isIndexed,
  152. $fieldNumber,
  153. $field->storeTermVector);
  154. return $fieldNumber;
  155. } else {
  156. $this->_fields[$field->name]->isIndexed |= $field->isIndexed;
  157. $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
  158. return $this->_fields[$field->name]->number;
  159. }
  160. }
  161. /**
  162. * Add fieldInfo to the segment
  163. *
  164. * Returns actual field number
  165. *
  166. * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
  167. * @return integer
  168. */
  169. public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
  170. {
  171. if (!isset($this->_fields[$fieldInfo->name])) {
  172. $fieldNumber = count($this->_fields);
  173. $this->_fields[$fieldInfo->name] =
  174. new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
  175. $fieldInfo->isIndexed,
  176. $fieldNumber,
  177. $fieldInfo->storeTermVector);
  178. return $fieldNumber;
  179. } else {
  180. $this->_fields[$fieldInfo->name]->isIndexed |= $fieldInfo->isIndexed;
  181. $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
  182. return $this->_fields[$fieldInfo->name]->number;
  183. }
  184. }
  185. /**
  186. * Returns array of FieldInfo objects.
  187. *
  188. * @return array
  189. */
  190. public function getFieldInfos()
  191. {
  192. return $this->_fields;
  193. }
  194. /**
  195. * Add stored fields information
  196. *
  197. * @param array $storedFields array of Zend_Search_Lucene_Field objects
  198. */
  199. public function addStoredFields($storedFields)
  200. {
  201. if (!isset($this->_fdxFile)) {
  202. $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
  203. $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
  204. $this->_files[] = $this->_name . '.fdx';
  205. $this->_files[] = $this->_name . '.fdt';
  206. }
  207. $this->_fdxFile->writeLong($this->_fdtFile->tell());
  208. $this->_fdtFile->writeVInt(count($storedFields));
  209. foreach ($storedFields as $field) {
  210. $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
  211. $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
  212. ($field->isBinary ? 0x02 : 0x00) |
  213. 0x00; /* 0x04 - third bit, compressed (ZLIB) */
  214. $this->_fdtFile->writeByte($fieldBits);
  215. if ($field->isBinary) {
  216. $this->_fdtFile->writeVInt(strlen($field->value));
  217. $this->_fdtFile->writeBytes($field->value);
  218. } else {
  219. $this->_fdtFile->writeString($field->getUtf8Value());
  220. }
  221. }
  222. $this->_docCount++;
  223. }
  224. /**
  225. * Returns the total number of documents in this segment.
  226. *
  227. * @return integer
  228. */
  229. public function count()
  230. {
  231. return $this->_docCount;
  232. }
  233. /**
  234. * Return segment name
  235. *
  236. * @return string
  237. */
  238. public function getName()
  239. {
  240. return $this->_name;
  241. }
  242. /**
  243. * Dump Field Info (.fnm) segment file
  244. */
  245. protected function _dumpFNM()
  246. {
  247. $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
  248. $fnmFile->writeVInt(count($this->_fields));
  249. $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
  250. // Write header
  251. $nrmFile->writeBytes('NRM');
  252. // Write format specifier
  253. $nrmFile->writeByte((int)0xFF);
  254. foreach ($this->_fields as $field) {
  255. $fnmFile->writeString($field->name);
  256. $fnmFile->writeByte(($field->isIndexed ? 0x01 : 0x00) |
  257. ($field->storeTermVector ? 0x02 : 0x00)
  258. // not supported yet 0x04 /* term positions are stored with the term vectors */ |
  259. // not supported yet 0x08 /* term offsets are stored with the term vectors */ |
  260. );
  261. if ($field->isIndexed) {
  262. // pre-2.1 index mode (not used now)
  263. // $normFileName = $this->_name . '.f' . $field->number;
  264. // $fFile = $this->_directory->createFile($normFileName);
  265. // $fFile->writeBytes($this->_norms[$field->name]);
  266. // $this->_files[] = $normFileName;
  267. $nrmFile->writeBytes($this->_norms[$field->name]);
  268. }
  269. }
  270. $this->_files[] = $this->_name . '.fnm';
  271. $this->_files[] = $this->_name . '.nrm';
  272. }
  273. /**
  274. * Term Dictionary file
  275. *
  276. * @var Zend_Search_Lucene_Storage_File
  277. */
  278. private $_tisFile = null;
  279. /**
  280. * Term Dictionary index file
  281. *
  282. * @var Zend_Search_Lucene_Storage_File
  283. */
  284. private $_tiiFile = null;
  285. /**
  286. * Frequencies file
  287. *
  288. * @var Zend_Search_Lucene_Storage_File
  289. */
  290. private $_frqFile = null;
  291. /**
  292. * Positions file
  293. *
  294. * @var Zend_Search_Lucene_Storage_File
  295. */
  296. private $_prxFile = null;
  297. /**
  298. * Number of written terms
  299. *
  300. * @var integer
  301. */
  302. private $_termCount;
  303. /**
  304. * Last saved term
  305. *
  306. * @var Zend_Search_Lucene_Index_Term
  307. */
  308. private $_prevTerm;
  309. /**
  310. * Last saved term info
  311. *
  312. * @var Zend_Search_Lucene_Index_TermInfo
  313. */
  314. private $_prevTermInfo;
  315. /**
  316. * Last saved index term
  317. *
  318. * @var Zend_Search_Lucene_Index_Term
  319. */
  320. private $_prevIndexTerm;
  321. /**
  322. * Last saved index term info
  323. *
  324. * @var Zend_Search_Lucene_Index_TermInfo
  325. */
  326. private $_prevIndexTermInfo;
  327. /**
  328. * Last term dictionary file position
  329. *
  330. * @var integer
  331. */
  332. private $_lastIndexPosition;
  333. /**
  334. * Create dicrionary, frequency and positions files and write necessary headers
  335. */
  336. public function initializeDictionaryFiles()
  337. {
  338. $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
  339. $this->_tisFile->writeInt((int)0xFFFFFFFD);
  340. $this->_tisFile->writeLong(0 /* dummy data for terms count */);
  341. $this->_tisFile->writeInt(self::$indexInterval);
  342. $this->_tisFile->writeInt(self::$skipInterval);
  343. $this->_tisFile->writeInt(self::$maxSkipLevels);
  344. $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
  345. $this->_tiiFile->writeInt((int)0xFFFFFFFD);
  346. $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
  347. $this->_tiiFile->writeInt(self::$indexInterval);
  348. $this->_tiiFile->writeInt(self::$skipInterval);
  349. $this->_tiiFile->writeInt(self::$maxSkipLevels);
  350. /** Dump dictionary header */
  351. $this->_tiiFile->writeVInt(0); // preffix length
  352. $this->_tiiFile->writeString(''); // suffix
  353. $this->_tiiFile->writeInt((int)0xFFFFFFFF); // field number
  354. $this->_tiiFile->writeByte((int)0x0F);
  355. $this->_tiiFile->writeVInt(0); // DocFreq
  356. $this->_tiiFile->writeVInt(0); // FreqDelta
  357. $this->_tiiFile->writeVInt(0); // ProxDelta
  358. $this->_tiiFile->writeVInt(24); // IndexDelta
  359. $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
  360. $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
  361. $this->_files[] = $this->_name . '.tis';
  362. $this->_files[] = $this->_name . '.tii';
  363. $this->_files[] = $this->_name . '.frq';
  364. $this->_files[] = $this->_name . '.prx';
  365. $this->_prevTerm = null;
  366. $this->_prevTermInfo = null;
  367. $this->_prevIndexTerm = null;
  368. $this->_prevIndexTermInfo = null;
  369. $this->_lastIndexPosition = 24;
  370. $this->_termCount = 0;
  371. }
  372. /**
  373. * Add term
  374. *
  375. * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
  376. *
  377. * @param Zend_Search_Lucene_Index_Term $termEntry
  378. * @param array $termDocs
  379. */
  380. public function addTerm($termEntry, $termDocs)
  381. {
  382. $freqPointer = $this->_frqFile->tell();
  383. $proxPointer = $this->_prxFile->tell();
  384. $prevDoc = 0;
  385. foreach ($termDocs as $docId => $termPositions) {
  386. $docDelta = ($docId - $prevDoc)*2;
  387. $prevDoc = $docId;
  388. if (count($termPositions) > 1) {
  389. $this->_frqFile->writeVInt($docDelta);
  390. $this->_frqFile->writeVInt(count($termPositions));
  391. } else {
  392. $this->_frqFile->writeVInt($docDelta + 1);
  393. }
  394. $prevPosition = 0;
  395. foreach ($termPositions as $position) {
  396. $this->_prxFile->writeVInt($position - $prevPosition);
  397. $prevPosition = $position;
  398. }
  399. }
  400. if (count($termDocs) >= self::$skipInterval) {
  401. /**
  402. * @todo Write Skip Data to a freq file.
  403. * It's not used now, but make index more optimal
  404. */
  405. $skipOffset = $this->_frqFile->tell() - $freqPointer;
  406. } else {
  407. $skipOffset = 0;
  408. }
  409. $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
  410. $this->_fields[$termEntry->field]->number);
  411. $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
  412. $freqPointer, $proxPointer, $skipOffset);
  413. $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
  414. if (($this->_termCount + 1) % self::$indexInterval == 0) {
  415. $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
  416. $indexPosition = $this->_tisFile->tell();
  417. $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
  418. $this->_lastIndexPosition = $indexPosition;
  419. }
  420. $this->_termCount++;
  421. }
  422. /**
  423. * Close dictionary
  424. */
  425. public function closeDictionaryFiles()
  426. {
  427. $this->_tisFile->seek(4);
  428. $this->_tisFile->writeLong($this->_termCount);
  429. $this->_tiiFile->seek(4);
  430. // + 1 is used to count an additional special index entry (empty term at the start of the list)
  431. $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
  432. }
  433. /**
  434. * Dump Term Dictionary segment file entry.
  435. * Used to write entry to .tis or .tii files
  436. *
  437. * @param Zend_Search_Lucene_Storage_File $dicFile
  438. * @param Zend_Search_Lucene_Index_Term $prevTerm
  439. * @param Zend_Search_Lucene_Index_Term $term
  440. * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
  441. * @param Zend_Search_Lucene_Index_TermInfo $termInfo
  442. */
  443. protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
  444. &$prevTerm, Zend_Search_Lucene_Index_Term $term,
  445. &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
  446. {
  447. if (isset($prevTerm) && $prevTerm->field == $term->field) {
  448. $matchedBytes = 0;
  449. $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
  450. while ($matchedBytes < $maxBytes &&
  451. $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
  452. $matchedBytes++;
  453. }
  454. // Calculate actual matched UTF-8 pattern
  455. $prefixBytes = 0;
  456. $prefixChars = 0;
  457. while ($prefixBytes < $matchedBytes) {
  458. $charBytes = 1;
  459. if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
  460. $charBytes++;
  461. if (ord($term->text[$prefixBytes]) & 0x20 ) {
  462. $charBytes++;
  463. if (ord($term->text[$prefixBytes]) & 0x10 ) {
  464. $charBytes++;
  465. }
  466. }
  467. }
  468. if ($prefixBytes + $charBytes > $matchedBytes) {
  469. // char crosses matched bytes boundary
  470. // skip char
  471. break;
  472. }
  473. $prefixChars++;
  474. $prefixBytes += $charBytes;
  475. }
  476. // Write preffix length
  477. $dicFile->writeVInt($prefixChars);
  478. // Write suffix
  479. $dicFile->writeString(substr($term->text, $prefixBytes));
  480. } else {
  481. // Write preffix length
  482. $dicFile->writeVInt(0);
  483. // Write suffix
  484. $dicFile->writeString($term->text);
  485. }
  486. // Write field number
  487. $dicFile->writeVInt($term->field);
  488. // DocFreq (the count of documents which contain the term)
  489. $dicFile->writeVInt($termInfo->docFreq);
  490. $prevTerm = $term;
  491. if (!isset($prevTermInfo)) {
  492. // Write FreqDelta
  493. $dicFile->writeVInt($termInfo->freqPointer);
  494. // Write ProxDelta
  495. $dicFile->writeVInt($termInfo->proxPointer);
  496. } else {
  497. // Write FreqDelta
  498. $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
  499. // Write ProxDelta
  500. $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
  501. }
  502. // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
  503. if ($termInfo->skipOffset != 0) {
  504. $dicFile->writeVInt($termInfo->skipOffset);
  505. }
  506. $prevTermInfo = $termInfo;
  507. }
  508. /**
  509. * Generate compound index file
  510. */
  511. protected function _generateCFS()
  512. {
  513. $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
  514. $cfsFile->writeVInt(count($this->_files));
  515. $dataOffsetPointers = array();
  516. foreach ($this->_files as $fileName) {
  517. $dataOffsetPointers[$fileName] = $cfsFile->tell();
  518. $cfsFile->writeLong(0); // write dummy data
  519. $cfsFile->writeString($fileName);
  520. }
  521. foreach ($this->_files as $fileName) {
  522. // Get actual data offset
  523. $dataOffset = $cfsFile->tell();
  524. // Seek to the data offset pointer
  525. $cfsFile->seek($dataOffsetPointers[$fileName]);
  526. // Write actual data offset value
  527. $cfsFile->writeLong($dataOffset);
  528. // Seek back to the end of file
  529. $cfsFile->seek($dataOffset);
  530. $dataFile = $this->_directory->getFileObject($fileName);
  531. $byteCount = $this->_directory->fileLength($fileName);
  532. while ($byteCount > 0) {
  533. $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
  534. $byteCount -= strlen($data);
  535. $cfsFile->writeBytes($data);
  536. }
  537. $this->_directory->deleteFile($fileName);
  538. }
  539. }
  540. /**
  541. * Close segment, write it to disk and return segment info
  542. *
  543. * @return Zend_Search_Lucene_Index_SegmentInfo
  544. */
  545. abstract public function close();
  546. }