PageRenderTime 68ms CodeModel.GetById 24ms RepoModel.GetById 4ms app.codeStats 0ms

/library/Zend/Search/Lucene/Index/SegmentWriter.php

https://bitbucket.org/baruffaldi/website-2008-computer-shopping-3
PHP | 631 lines | 277 code | 88 blank | 266 comment | 31 complexity | 9672b23c95e9822fb53736bd59385f2f MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Exception */
  22. require_once 'Zend/Search/Lucene/Exception.php';
  23. /** Zend_Search_Lucene_Index_SegmentInfo */
  24. require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
  25. /**
  26. * @category Zend
  27. * @package Zend_Search_Lucene
  28. * @subpackage Index
  29. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  30. * @license http://framework.zend.com/license/new-bsd New BSD License
  31. */
  32. abstract class Zend_Search_Lucene_Index_SegmentWriter
  33. {
  34. /**
  35. * Expert: The fraction of terms in the "dictionary" (one out of every $indexInterval) which should be stored
  36. * in RAM. Smaller values use more memory, but make searching slightly
  37. * faster, while larger values use less memory and make searching slightly
  38. * slower. Searching is typically not dominated by dictionary lookup, so
  39. * tweaking this is rarely useful.
  40. *
  41. * @var integer
  42. */
  43. public static $indexInterval = 128;
  44. /**
  45. * Expert: The fraction of TermDocs entries stored in skip tables.
  46. * Larger values result in smaller indexes, greater acceleration, but fewer
  47. * accelerable cases, while smaller values result in bigger indexes,
  48. * less acceleration and more
  49. * accelerable cases. More detailed experiments would be useful here.
  50. *
  51. * 0x7FFFFFFF indicates that we don't use skip data
  52. *
  53. * Note: skip data is not written in the current implementation; the value is only stored in the .tis/.tii headers and compared against the document count in addTerm()
  54. *
  55. * @var integer
  56. */
  57. public static $skipInterval = 0x7FFFFFFF;
  58. /**
  59. * Expert: The maximum number of skip levels. Smaller values result in
  60. * slightly smaller indexes, but slower skipping in big posting lists.
  61. *
  62. * 0 indicates that we don't use skip data
  63. *
  64. * Note: skip data is not written in the current implementation; the value is only stored in the .tis/.tii headers
  65. *
  66. * @var integer
  67. */
  68. public static $maxSkipLevels = 0;
  69. /**
  70. * Number of docs in a segment (incremented by every addStoredFields() call)
  71. *
  72. * @var integer
  73. */
  74. protected $_docCount = 0;
  75. /**
  76. * Segment name. Used as the base name of every file this writer creates.
  77. *
  78. * @var string
  79. */
  80. protected $_name;
  81. /**
  82. * File system adapter.
  83. *
  84. * @var Zend_Search_Lucene_Storage_Directory
  85. */
  86. protected $_directory;
  87. /**
  88. * List of the names of the index files created so far.
  89. * Used for automatic compound file generation: _generateCFS() packs and then deletes every file listed here
  90. *
  91. * @var array
  92. */
  93. protected $_files = array();
  94. /**
  95. * Segment fields. Array (keyed by field name) of Zend_Search_Lucene_Index_FieldInfo objects for this segment
  96. *
  97. * @var array
  98. */
  99. protected $_fields = array();
  100. /**
  101. * Normalization factors.
  102. * An array fieldName => normVector
  103. * normVector is a binary string.
  104. * Each byte corresponds to an indexed document in a segment and
  105. * encodes normalization factor (float value, encoded by
  106. * Zend_Search_Lucene_Search_Similarity::encodeNorm())
  107. *
  108. * @var array
  109. */
  110. protected $_norms = array();
  111. /**
  112. * '.fdx' file - Stored Fields, the field index (created by the first addStoredFields() call).
  113. *
  114. * @var Zend_Search_Lucene_Storage_File
  115. */
  116. protected $_fdxFile = null;
  117. /**
  118. * '.fdt' file - Stored Fields, the field data (created by the first addStoredFields() call).
  119. *
  120. * @var Zend_Search_Lucene_Storage_File
  121. */
  122. protected $_fdtFile = null;
  /**
   * Creates a writer for a single segment.
   *
   * The constructor only remembers where the segment lives and what it is
   * called; it does not create any file itself.
   *
   * @param Zend_Search_Lucene_Storage_Directory $directory storage the segment files are created in
   * @param string $name segment name, used as the base name of every segment file
   */
  public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
  {
      $this->_name      = $name;
      $this->_directory = $directory;
  }
  /**
   * Add field to the segment
   *
   * Registers the field under its name when the segment does not know it
   * yet; otherwise merges the "indexed" and "store term vector" flags into
   * the already registered entry.
   *
   * @param Zend_Search_Lucene_Field $field
   * @return integer actual field number within this segment
   */
  public function addField(Zend_Search_Lucene_Field $field)
  {
      return $this->_registerField($field->name,
                                   $field->isIndexed,
                                   $field->storeTermVector);
  }

  /**
   * Add fieldInfo to the segment
   *
   * Same contract as addField(), but takes its data from an existing
   * FieldInfo object. The passed object is not stored: a new FieldInfo
   * carrying this segment's own field number is created instead.
   *
   * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
   * @return integer actual field number within this segment
   */
  public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
  {
      return $this->_registerField($fieldInfo->name,
                                   $fieldInfo->isIndexed,
                                   $fieldInfo->storeTermVector);
  }

  /**
   * Shared implementation of addField() and addFieldInfo().
   *
   * New fields are numbered in registration order (the number is the count
   * of fields registered before it). For a known field the flags are
   * combined with a logical OR, so they stay booleans; the bitwise "|="
   * used previously silently converted them to integers.
   *
   * @param string  $name
   * @param boolean $isIndexed
   * @param boolean $storeTermVector
   * @return integer field number within this segment
   */
  private function _registerField($name, $isIndexed, $storeTermVector)
  {
      if (!isset($this->_fields[$name])) {
          $fieldNumber = count($this->_fields);
          $this->_fields[$name] =
              new Zend_Search_Lucene_Index_FieldInfo($name,
                                                     $isIndexed,
                                                     $fieldNumber,
                                                     $storeTermVector);

          return $fieldNumber;
      }

      // Objects are handles in PHP 5, so this updates the registered entry in place
      $known = $this->_fields[$name];
      $known->isIndexed       = ($known->isIndexed       || $isIndexed);
      $known->storeTermVector = ($known->storeTermVector || $storeTermVector);

      return $known->number;
  }
  /**
   * Gives access to the fields registered in this segment so far.
   *
   * @return array field name => Zend_Search_Lucene_Index_FieldInfo
   */
  public function getFieldInfos()
  {
      return $this->_fields;
  }
  /**
   * Writes the stored fields of one document and counts the document.
   *
   * The '.fdx' and '.fdt' files are created on the first call. For every
   * document one entry is appended to '.fdx' (the current '.fdt' position)
   * and the field count plus the field records are appended to '.fdt'.
   *
   * Each field is looked up by name in the registered fields, so it has to
   * be registered (addField()/addFieldInfo()) before it is stored.
   *
   * @param array $storedFields array of Zend_Search_Lucene_Field objects
   */
  public function addStoredFields($storedFields)
  {
      if (!isset($this->_fdxFile)) {
          $fdxName = $this->_name . '.fdx';
          $fdtName = $this->_name . '.fdt';

          $this->_fdxFile = $this->_directory->createFile($fdxName);
          $this->_fdtFile = $this->_directory->createFile($fdtName);

          $this->_files[] = $fdxName;
          $this->_files[] = $fdtName;
      }

      $fdt = $this->_fdtFile;

      // Index entry: where this document's data starts in the .fdt file
      $this->_fdxFile->writeLong($fdt->tell());
      $fdt->writeVInt(count($storedFields));

      foreach ($storedFields as $field) {
          $fdt->writeVInt($this->_fields[$field->name]->number);

          // 0x01 - tokenized, 0x02 - binary; 0x04 (compressed, ZLIB) is never set
          $flags = 0x00;
          if ($field->isTokenized) {
              $flags |= 0x01;
          }
          if ($field->isBinary) {
              $flags |= 0x02;
          }
          $fdt->writeByte($flags);

          if (!$field->isBinary) {
              $fdt->writeString($field->getUtf8Value());
              continue;
          }

          // Binary values are written as a byte length followed by the raw bytes
          $fdt->writeVInt(strlen($field->value));
          $fdt->writeBytes($field->value);
      }

      $this->_docCount++;
  }
  /**
   * Number of documents written to this segment so far
   * (one per addStoredFields() call).
   *
   * @return integer
   */
  public function count()
  {
      return $this->_docCount;
  }
  /**
   * Name of the segment, as passed to the constructor.
   *
   * @return string
   */
  public function getName()
  {
      return $this->_name;
  }
  /**
   * Dump Field Info (.fnm) segment file together with the norms (.nrm) file.
   *
   * '.fnm' receives the field count followed by name and flag byte of every
   * registered field. '.nrm' receives the 'NRM' header, a format byte and
   * then the norm vector of every indexed field, in field order.
   * Both file names are appended to the segment file list afterwards.
   */
  protected function _dumpFNM()
  {
      $fnmName = $this->_name . '.fnm';
      $nrmName = $this->_name . '.nrm';

      $fnmFile = $this->_directory->createFile($fnmName);
      $fnmFile->writeVInt(count($this->_fields));

      $nrmFile = $this->_directory->createFile($nrmName);
      $nrmFile->writeBytes('NRM');      // header
      $nrmFile->writeByte((int)0xFF);   // format specifier

      foreach ($this->_fields as $fieldInfo) {
          // 0x01 - indexed, 0x02 - term vector stored.
          // 0x04 (positions in term vectors) and 0x08 (offsets in term vectors)
          // are not supported yet.
          $flags = 0x00;
          if ($fieldInfo->isIndexed) {
              $flags |= 0x01;
          }
          if ($fieldInfo->storeTermVector) {
              $flags |= 0x02;
          }

          $fnmFile->writeString($fieldInfo->name);
          $fnmFile->writeByte($flags);

          if (!$fieldInfo->isIndexed) {
              continue;
          }

          // All norms go into the single .nrm file; the pre-2.1 layout with
          // one '.f<number>' file per field is not written any more.
          $nrmFile->writeBytes($this->_norms[$fieldInfo->name]);
      }

      $this->_files[] = $fnmName;
      $this->_files[] = $nrmName;
  }
  270. /**
  271. * Term Dictionary file ('.tis'), created by initializeDictionaryFiles()
  272. *
  273. * @var Zend_Search_Lucene_Storage_File
  274. */
  275. private $_tisFile = null;
  276. /**
  277. * Term Dictionary index file ('.tii'), created by initializeDictionaryFiles()
  278. *
  279. * @var Zend_Search_Lucene_Storage_File
  280. */
  281. private $_tiiFile = null;
  282. /**
  283. * Frequencies file ('.frq'), created by initializeDictionaryFiles()
  284. *
  285. * @var Zend_Search_Lucene_Storage_File
  286. */
  287. private $_frqFile = null;
  288. /**
  289. * Positions file ('.prx'), created by initializeDictionaryFiles()
  290. *
  291. * @var Zend_Search_Lucene_Storage_File
  292. */
  293. private $_prxFile = null;
  294. /**
  295. * Number of written terms (one per addTerm() call since initializeDictionaryFiles())
  296. *
  297. * @var integer
  298. */
  299. private $_termCount;
  300. /**
  301. * Last term saved to the '.tis' file; null until the first term is written
  302. *
  303. * @var Zend_Search_Lucene_Index_Term
  304. */
  305. private $_prevTerm;
  306. /**
  307. * Term info of the last term saved to the '.tis' file
  308. *
  309. * @var Zend_Search_Lucene_Index_TermInfo
  310. */
  311. private $_prevTermInfo;
  312. /**
  313. * Last term saved to the '.tii' index file; null until the first index term is written
  314. *
  315. * @var Zend_Search_Lucene_Index_Term
  316. */
  317. private $_prevIndexTerm;
  318. /**
  319. * Term info of the last term saved to the '.tii' index file
  320. *
  321. * @var Zend_Search_Lucene_Index_TermInfo
  322. */
  323. private $_prevIndexTermInfo;
  324. /**
  325. * '.tis' file position recorded with the last '.tii' entry (initialised to 24 by initializeDictionaryFiles())
  326. *
  327. * @var integer
  328. */
  329. private $_lastIndexPosition;
  /**
   * Create dictionary, frequency and positions files and write necessary headers
   *
   * Creates '.tis', '.tii', '.frq' and '.prx', registers them in the
   * segment file list and resets the state used by addTerm().
   * The term counts in both headers are placeholders which are patched by
   * closeDictionaryFiles().
   */
  public function initializeDictionaryFiles()
  {
      $baseName   = $this->_name;
      $headerInts = array(self::$indexInterval, self::$skipInterval, self::$maxSkipLevels);

      // Term dictionary (.tis) header
      $tis = $this->_tisFile = $this->_directory->createFile($baseName . '.tis');
      $tis->writeInt((int)0xFFFFFFFD);   // format marker
      $tis->writeLong(0);                // dummy data for terms count
      foreach ($headerInts as $headerValue) {
          $tis->writeInt($headerValue);
      }

      // Term dictionary index (.tii) header - same layout as the .tis header
      $tii = $this->_tiiFile = $this->_directory->createFile($baseName . '.tii');
      $tii->writeInt((int)0xFFFFFFFD);   // format marker
      $tii->writeLong(0);                // dummy data for terms count
      foreach ($headerInts as $headerValue) {
          $tii->writeInt($headerValue);
      }

      // The index always starts with a special entry for the empty term
      $tii->writeVInt(0);                // prefix length
      $tii->writeString('');             // suffix
      $tii->writeInt((int)0xFFFFFFFF);   // field number
      $tii->writeByte((int)0x0F);        // NOTE(review): presumably int + byte together form the VInt encoding of -1 - confirm
      $tii->writeVInt(0);                // DocFreq
      $tii->writeVInt(0);                // FreqDelta
      $tii->writeVInt(0);                // ProxDelta
      $tii->writeVInt(24);               // IndexDelta

      $this->_frqFile = $this->_directory->createFile($baseName . '.frq');
      $this->_prxFile = $this->_directory->createFile($baseName . '.prx');

      foreach (array('.tis', '.tii', '.frq', '.prx') as $extension) {
          $this->_files[] = $baseName . $extension;
      }

      // Nothing has been written yet: no previous terms, no terms counted
      $this->_prevTerm          = null;
      $this->_prevTermInfo      = null;
      $this->_prevIndexTerm     = null;
      $this->_prevIndexTermInfo = null;
      $this->_termCount         = 0;

      // Matches the IndexDelta of the special entry above
      // (presumably the size of the .tis header: int + long + 3 ints)
      $this->_lastIndexPosition = 24;
  }
  /**
   * Add term
   *
   * Writes the postings of one term to the '.frq' and '.prx' files and the
   * term itself to the term dictionary. Every $indexInterval-th term is
   * also written to the term dictionary index.
   *
   * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
   *
   * initializeDictionaryFiles() has to be called first, since it creates
   * the files used here.
   *
   * @param Zend_Search_Lucene_Index_Term $termEntry term; its field is given as a field name
   * @param array $termDocs
   */
  public function addTerm($termEntry, $termDocs)
  {
      $frq = $this->_frqFile;
      $prx = $this->_prxFile;

      $freqPointer = $frq->tell();
      $proxPointer = $prx->tell();

      $lastDocId = 0;
      foreach ($termDocs as $docId => $termPositions) {
          // Doc ids are delta-encoded and shifted left by one bit; the low
          // bit set means "frequency is one, no frequency value follows".
          $docCode   = ($docId - $lastDocId) * 2;
          $lastDocId = $docId;

          $occurrences = count($termPositions);
          if ($occurrences > 1) {
              $frq->writeVInt($docCode);
              $frq->writeVInt($occurrences);
          } else {
              $frq->writeVInt($docCode + 1);
          }

          // Positions are delta-encoded within each document
          $lastPosition = 0;
          foreach ($termPositions as $position) {
              $prx->writeVInt($position - $lastPosition);
              $lastPosition = $position;
          }
      }

      $docFreq = count($termDocs);

      /**
       * @todo Write Skip Data to a freq file.
       * It's not used now, but make index more optimal
       */
      $skipOffset = ($docFreq >= self::$skipInterval) ? ($frq->tell() - $freqPointer) : 0;

      // The dictionary stores the field number instead of the field name
      $term     = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                    $this->_fields[$termEntry->field]->number);
      $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq,
                                                        $freqPointer,
                                                        $proxPointer,
                                                        $skipOffset);

      $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

      $writtenTerms = $this->_termCount + 1;
      if ($writtenTerms % self::$indexInterval == 0) {
          $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

          // Index entries additionally carry the .tis position as a delta
          $tisPosition = $this->_tisFile->tell();
          $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
          $this->_lastIndexPosition = $tisPosition;
      }
      $this->_termCount = $writtenTerms;
  }
  /**
   * Close dictionary
   *
   * Replaces the placeholder term counts which initializeDictionaryFiles()
   * left at offset 4 of the '.tis' and '.tii' headers with the real values.
   * The file objects themselves are not closed here.
   */
  public function closeDictionaryFiles()
  {
      $termCount = $this->_termCount;
      $interval  = self::$indexInterval;

      $this->_tisFile->seek(4);
      $this->_tisFile->writeLong($termCount);

      $this->_tiiFile->seek(4);
      // Number of complete intervals, + 1 to count an additional special
      // index entry (empty term at the start of the list)
      $indexTermCount = ($termCount - $termCount % $interval) / $interval + 1;
      $this->_tiiFile->writeLong($indexTermCount);
  }
  /**
   * Dump Term Dictionary segment file entry.
   * Used to write entry to .tis or .tii files
   *
   * The term text is written relative to the previous term of the same
   * field (shared prefix length in UTF-8 characters + remaining suffix);
   * the freq/prox pointers are written relative to the previous term info.
   * Both "previous" arguments are passed by reference and are replaced by
   * $term and $termInfo before the method returns.
   *
   * @param Zend_Search_Lucene_Storage_File $dicFile
   * @param Zend_Search_Lucene_Index_Term $prevTerm
   * @param Zend_Search_Lucene_Index_Term $term
   * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
   * @param Zend_Search_Lucene_Index_TermInfo $termInfo
   */
  protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm, Zend_Search_Lucene_Index_Term $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
  {
      // Without a previous term of the same field the whole text is the suffix
      $sharedChars = 0;
      $suffix      = $term->text;

      if (isset($prevTerm) && $prevTerm->field == $term->field) {
          // Length of the common byte prefix of both texts
          $limit       = min(strlen($prevTerm->text), strlen($term->text));
          $commonBytes = 0;
          while ($commonBytes < $limit &&
                 $prevTerm->text[$commonBytes] == $term->text[$commonBytes]) {
              $commonBytes++;
          }

          // Shrink it to whole UTF-8 characters and count them
          $offset = 0;
          while ($offset < $commonBytes) {
              $leadByte = ord($term->text[$offset]);

              if (($leadByte & 0xC0) != 0xC0) {
                  $charWidth = 1;
              } else if (!($leadByte & 0x20)) {
                  $charWidth = 2;
              } else if (!($leadByte & 0x10)) {
                  $charWidth = 3;
              } else {
                  $charWidth = 4;
              }

              if ($offset + $charWidth > $commonBytes) {
                  // char crosses the common bytes boundary, so it is not shared
                  break;
              }

              $sharedChars++;
              $offset += $charWidth;
          }

          $suffix = substr($term->text, $offset);
      }

      $dicFile->writeVInt($sharedChars);        // prefix length
      $dicFile->writeString($suffix);           // suffix
      $dicFile->writeVInt($term->field);        // field number
      $dicFile->writeVInt($termInfo->docFreq);  // DocFreq (the count of documents which contain the term)

      $prevTerm = $term;

      if (isset($prevTermInfo)) {
          // FreqDelta and ProxDelta relative to the previous entry
          $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
          $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
      } else {
          // First entry: pointers are written as they are
          $dicFile->writeVInt($termInfo->freqPointer);
          $dicFile->writeVInt($termInfo->proxPointer);
      }

      // SkipOffset is only written when it is not 0; addTerm() sets it for
      // terms with docFreq >= self::$skipInterval
      if ($termInfo->skipOffset != 0) {
          $dicFile->writeVInt($termInfo->skipOffset);
      }

      $prevTermInfo = $termInfo;
  }
  /**
   * Generate compound index file
   *
   * Packs every file registered in the segment file list into
   * '<segment name>.cfs' and deletes the packed file afterwards.
   *
   * Layout: file count, then one (data offset, file name) entry per file,
   * then the contents of the files in the same order. The data offsets are
   * written as placeholders first and patched once the real position of
   * each file's data is known.
   *
   * @throws Zend_Search_Lucene_Exception if a file yields no more data
   *         although its reported length has not been copied completely
   */
  protected function _generateCFS()
  {
      $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
      $cfsFile->writeVInt(count($this->_files));

      // Directory part: remember where each data offset has to be patched in
      $dataOffsetPointers = array();
      foreach ($this->_files as $fileName) {
          $dataOffsetPointers[$fileName] = $cfsFile->tell();
          $cfsFile->writeLong(0); // write dummy data
          $cfsFile->writeString($fileName);
      }

      foreach ($this->_files as $fileName) {
          // Get actual data offset
          $dataOffset = $cfsFile->tell();
          // Seek to the data offset pointer
          $cfsFile->seek($dataOffsetPointers[$fileName]);
          // Write actual data offset value
          $cfsFile->writeLong($dataOffset);
          // Seek back to the end of file
          $cfsFile->seek($dataOffset);

          $dataFile  = $this->_directory->getFileObject($fileName);
          $byteCount = $this->_directory->fileLength($fileName);

          // Copy in chunks of at most 128Kb
          while ($byteCount > 0) {
              $data       = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
              $readLength = strlen($data);

              if ($readLength == 0) {
                  // Without this check a short read would never decrease
                  // $byteCount and the loop would spin forever.
                  throw new Zend_Search_Lucene_Exception(
                      'Unexpected end of file \'' . $fileName . '\' while generating compound file \''
                      . $this->_name . '.cfs\': ' . $byteCount . ' byte(s) could not be read.'
                  );
              }

              $byteCount -= $readLength;
              $cfsFile->writeBytes($data);
          }

          $this->_directory->deleteFile($fileName);
      }
  }
  537. /**
  538. * Close segment, write it to disk and return segment info. Abstract: every concrete segment writer has to implement it.
  539. *
  540. * @return Zend_Search_Lucene_Index_SegmentInfo
  541. */
  542. abstract public function close();
  543. }