PageRenderTime 79ms CodeModel.GetById 19ms RepoModel.GetById 3ms app.codeStats 0ms

/lib/Zend/Search/Lucene/Index/SegmentWriter.php

https://bitbucket.org/mercysam/zfs
PHP | 626 lines | 276 code | 85 blank | 265 comment | 31 complexity | c98cc2f2f85d4fa169b28ee95aaa66ae MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  21. /** Zend_Search_Lucene_Index_SegmentInfo */
  22. require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
  23. /**
  24. * @category Zend
  25. * @package Zend_Search_Lucene
  26. * @subpackage Index
  27. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  28. * @license http://framework.zend.com/license/new-bsd New BSD License
  29. */
  30. abstract class Zend_Search_Lucene_Index_SegmentWriter
  31. {
  32. /**
  33. * Expert: The fraction of terms in the "dictionary" which should be stored
  34. * in RAM. Smaller values use more memory, but make searching slightly
  35. * faster, while larger values use less memory and make searching slightly
  36. * slower. Searching is typically not dominated by dictionary lookup, so
  37. * tweaking this is rarely useful.
  38. *
  39. * @var integer
  40. */
  41. public static $indexInterval = 128;
  42. /**
  43. * Expert: The fraction of TermDocs entries stored in skip tables.
  44. * Larger values result in smaller indexes, greater acceleration, but fewer
  45. * accelerable cases, while smaller values result in bigger indexes,
  46. * less acceleration and more
  47. * accelerable cases. More detailed experiments would be useful here.
  48. *
  49. * 0x7FFFFFFF indicates that we don't use skip data
  50. *
  51. * Note: not used in current implementation
  52. *
  53. * @var integer
  54. */
  55. public static $skipInterval = 0x7FFFFFFF;
  56. /**
  57. * Expert: The maximum number of skip levels. Smaller values result in
  58. * slightly smaller indexes, but slower skipping in big posting lists.
  59. *
  60. * 0 indicates that we don't use skip data
  61. *
  62. * Note: not used in current implementation
  63. *
  64. * @var integer
  65. */
  66. public static $maxSkipLevels = 0;
  67. /**
  68. * Number of docs in a segment
  69. *
  70. * @var integer
  71. */
  72. protected $_docCount = 0;
  73. /**
  74. * Segment name
  75. *
  76. * @var string
  77. */
  78. protected $_name;
  79. /**
  80. * File system adapter.
  81. *
  82. * @var Zend_Search_Lucene_Storage_Directory
  83. */
  84. protected $_directory;
  85. /**
  86. * List of the index files.
  87. * Used for automatic compound file generation
  88. *
  89. * @var array
  90. */
  91. protected $_files = array();
  92. /**
  93. * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
  94. *
  95. * @var array
  96. */
  97. protected $_fields = array();
  98. /**
  99. * Normalization factors.
  100. * An array fieldName => normVector
  101. * normVector is a binary string.
  102. * Each byte corresponds to an indexed document in a segment and
  103. * encodes normalization factor (float value, encoded by
  104. * Zend_Search_Lucene_Search_Similarity::encodeNorm())
  105. *
  106. * @var array
  107. */
  108. protected $_norms = array();
  109. /**
  110. * '.fdx' file - Stored Fields, the field index.
  111. *
  112. * @var Zend_Search_Lucene_Storage_File
  113. */
  114. protected $_fdxFile = null;
  115. /**
  116. * '.fdt' file - Stored Fields, the field data.
  117. *
  118. * @var Zend_Search_Lucene_Storage_File
  119. */
  120. protected $_fdtFile = null;
  /**
   * Bind the writer to a storage directory and a segment name.
   *
   * Nothing is written at construction time; both arguments are only
   * remembered and used later when segment files are created.
   *
   * @param Zend_Search_Lucene_Storage_Directory $directory directory adapter used to create segment files
   * @param string $name segment name, used as the base name of every segment file
   */
  public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
  {
      $this->_name      = $name;
      $this->_directory = $directory;
  }
  /**
   * Register a document field with the segment.
   *
   * A field name seen for the first time receives the next free field
   * number (the current size of the field list). For a name that is already
   * registered, the "indexed" and "store term vector" flags of the given
   * field are OR-ed into the existing entry and the previously assigned
   * number is returned.
   *
   * @param Zend_Search_Lucene_Field $field
   * @return integer field number within this segment
   */
  public function addField(Zend_Search_Lucene_Field $field)
  {
      $fieldName = $field->name;

      if (isset($this->_fields[$fieldName])) {
          // Known field: merge the flags, keep the number assigned earlier.
          $knownInfo = $this->_fields[$fieldName];
          $knownInfo->isIndexed       = $knownInfo->isIndexed | $field->isIndexed;
          $knownInfo->storeTermVector = $knownInfo->storeTermVector | $field->storeTermVector;

          return $knownInfo->number;
      }

      // New field: its number is its position in the field list.
      $assignedNumber = count($this->_fields);
      $this->_fields[$fieldName] = new Zend_Search_Lucene_Index_FieldInfo(
          $fieldName,
          $field->isIndexed,
          $assignedNumber,
          $field->storeTermVector
      );

      return $assignedNumber;
  }
  /**
   * Register a field described by an existing FieldInfo object.
   *
   * The given object is never stored itself. For an unknown field name a
   * fresh FieldInfo is created with this segment's next free field number;
   * for a known name only the "indexed" and "store term vector" flags are
   * OR-ed into the entry already held by the segment.
   *
   * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
   * @return integer field number within this segment
   */
  public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
  {
      $fieldName = $fieldInfo->name;

      if (isset($this->_fields[$fieldName])) {
          $registered = $this->_fields[$fieldName];
          $registered->isIndexed       = $registered->isIndexed | $fieldInfo->isIndexed;
          $registered->storeTermVector = $registered->storeTermVector | $fieldInfo->storeTermVector;

          return $registered->number;
      }

      // The number of the source FieldInfo is ignored; numbering is local to this segment.
      $assignedNumber = count($this->_fields);
      $this->_fields[$fieldName] = new Zend_Search_Lucene_Index_FieldInfo(
          $fieldName,
          $fieldInfo->isIndexed,
          $assignedNumber,
          $fieldInfo->storeTermVector
      );

      return $assignedNumber;
  }
  /**
   * Return the fields registered with this segment so far.
   *
   * @return array Zend_Search_Lucene_Index_FieldInfo objects keyed by field name
   */
  public function getFieldInfos()
  {
      $registeredFields = $this->_fields;

      return $registeredFields;
  }
  /**
   * Append the stored fields of one document to the segment.
   *
   * The '.fdx' and '.fdt' files are created lazily on the first call and
   * registered in the segment file list. For each document the current
   * '.fdt' position is written to '.fdx', followed in '.fdt' by the number of
   * stored fields and, per field, its number, a flag byte and its value.
   * Every call increments the segment document counter by one.
   *
   * Each field must already be registered (see addField()), because its
   * number is looked up in the segment field list by name.
   *
   * @param array $storedFields array of Zend_Search_Lucene_Field objects
   */
  public function addStoredFields($storedFields)
  {
      if (!isset($this->_fdxFile)) {
          $fdxName = $this->_name . '.fdx';
          $fdtName = $this->_name . '.fdt';

          $this->_fdxFile = $this->_directory->createFile($fdxName);
          $this->_fdtFile = $this->_directory->createFile($fdtName);

          $this->_files[] = $fdxName;
          $this->_files[] = $fdtName;
      }

      $dataFile = $this->_fdtFile;

      // Index entry: where this document's data starts in the '.fdt' file.
      $this->_fdxFile->writeLong($dataFile->tell());
      $dataFile->writeVInt(count($storedFields));

      foreach ($storedFields as $storedField) {
          $dataFile->writeVInt($this->_fields[$storedField->name]->number);

          // Flag byte: 0x01 tokenized, 0x02 binary.
          // 0x04 (ZLIB-compressed) is never set by this writer.
          $flags = 0x00;
          if ($storedField->isTokenized) {
              $flags |= 0x01;
          }
          if ($storedField->isBinary) {
              $flags |= 0x02;
          }
          $dataFile->writeByte($flags);

          if (!$storedField->isBinary) {
              $dataFile->writeString($storedField->getUtf8Value());
              continue;
          }

          // Binary values are written as a byte length followed by the raw bytes.
          $dataFile->writeVInt(strlen($storedField->value));
          $dataFile->writeBytes($storedField->value);
      }

      $this->_docCount++;
  }
  /**
   * Number of documents added to this segment so far.
   *
   * The counter is advanced by addStoredFields(), once per call.
   *
   * @return integer
   */
  public function count()
  {
      $documentCount = $this->_docCount;

      return $documentCount;
  }
  /**
   * Segment name, as passed to the constructor.
   *
   * @return string
   */
  public function getName()
  {
      $segmentName = $this->_name;

      return $segmentName;
  }
  /**
   * Write the field infos ('.fnm') and norms ('.nrm') segment files.
   *
   * '.fnm' receives the field count followed by, for each field, its name and
   * a flag byte. '.nrm' receives the 'NRM' header, a 0xFF format byte and the
   * norm vector of every indexed field, in field order. Both file names are
   * appended to the segment file list.
   */
  protected function _dumpFNM()
  {
      $fnmName = $this->_name . '.fnm';
      $nrmName = $this->_name . '.nrm';

      $fnmFile = $this->_directory->createFile($fnmName);
      $fnmFile->writeVInt(count($this->_fields));

      $nrmFile = $this->_directory->createFile($nrmName);
      // Header
      $nrmFile->writeBytes('NRM');
      // Format specifier
      $nrmFile->writeByte((int)0xFF);

      foreach ($this->_fields as $fieldInfo) {
          $fnmFile->writeString($fieldInfo->name);

          // Flag byte: 0x01 indexed, 0x02 term vector stored.
          // 0x04 (positions with term vectors) and 0x08 (offsets with term
          // vectors) are not supported yet and therefore never set.
          $fieldFlags = 0x00;
          if ($fieldInfo->isIndexed) {
              $fieldFlags |= 0x01;
          }
          if ($fieldInfo->storeTermVector) {
              $fieldFlags |= 0x02;
          }
          $fnmFile->writeByte($fieldFlags);

          if (!$fieldInfo->isIndexed) {
              continue;
          }

          // Norms of all indexed fields share the single '.nrm' file; the
          // pre-2.1 layout with one '.f<number>' norm file per field is not
          // written any more.
          $nrmFile->writeBytes($this->_norms[$fieldInfo->name]);
      }

      $this->_files[] = $fnmName;
      $this->_files[] = $nrmName;
  }
  268. /**
  269. * Term Dictionary file
  270. *
  271. * @var Zend_Search_Lucene_Storage_File
  272. */
  273. private $_tisFile = null;
  274. /**
  275. * Term Dictionary index file
  276. *
  277. * @var Zend_Search_Lucene_Storage_File
  278. */
  279. private $_tiiFile = null;
  280. /**
  281. * Frequencies file
  282. *
  283. * @var Zend_Search_Lucene_Storage_File
  284. */
  285. private $_frqFile = null;
  286. /**
  287. * Positions file
  288. *
  289. * @var Zend_Search_Lucene_Storage_File
  290. */
  291. private $_prxFile = null;
  292. /**
  293. * Number of written terms
  294. *
  295. * @var integer
  296. */
  297. private $_termCount;
  298. /**
  299. * Last saved term
  300. *
  301. * @var Zend_Search_Lucene_Index_Term
  302. */
  303. private $_prevTerm;
  304. /**
  305. * Last saved term info
  306. *
  307. * @var Zend_Search_Lucene_Index_TermInfo
  308. */
  309. private $_prevTermInfo;
  310. /**
  311. * Last saved index term
  312. *
  313. * @var Zend_Search_Lucene_Index_Term
  314. */
  315. private $_prevIndexTerm;
  316. /**
  317. * Last saved index term info
  318. *
  319. * @var Zend_Search_Lucene_Index_TermInfo
  320. */
  321. private $_prevIndexTermInfo;
  322. /**
  323. * Last term dictionary file position
  324. *
  325. * @var integer
  326. */
  327. private $_lastIndexPosition;
  /**
   * Create dictionary, frequency and positions files and write necessary headers.
   *
   * Creates the '.tis', '.tii', '.frq' and '.prx' files, registers them in the
   * segment file list and resets the term-writing state (previous terms,
   * term counter, last index position). The term counts in the '.tis' and
   * '.tii' headers are written as 0 here and patched by closeDictionaryFiles().
   */
  public function initializeDictionaryFiles()
  {
      $tisName = $this->_name . '.tis';
      $tiiName = $this->_name . '.tii';
      $frqName = $this->_name . '.frq';
      $prxName = $this->_name . '.prx';

      // Term dictionary header.
      $this->_tisFile = $this->_directory->createFile($tisName);
      $dictionary = $this->_tisFile;
      $dictionary->writeInt((int)0xFFFFFFFD);
      $dictionary->writeLong(0 /* placeholder for the term count */);
      $dictionary->writeInt(self::$indexInterval);
      $dictionary->writeInt(self::$skipInterval);
      $dictionary->writeInt(self::$maxSkipLevels);

      // Term dictionary index header (same layout as the '.tis' header).
      $this->_tiiFile = $this->_directory->createFile($tiiName);
      $dictionaryIndex = $this->_tiiFile;
      $dictionaryIndex->writeInt((int)0xFFFFFFFD);
      $dictionaryIndex->writeLong(0 /* placeholder for the term count */);
      $dictionaryIndex->writeInt(self::$indexInterval);
      $dictionaryIndex->writeInt(self::$skipInterval);
      $dictionaryIndex->writeInt(self::$maxSkipLevels);

      // First index entry: the special empty term at the start of the list.
      $dictionaryIndex->writeVInt(0);              // prefix length
      $dictionaryIndex->writeString('');           // suffix
      // Field number. NOTE(review): together with the following 0x0F byte this
      // looks like 0xFFFFFFFF spelled out as a 5-byte VInt - confirm.
      $dictionaryIndex->writeInt((int)0xFFFFFFFF);
      $dictionaryIndex->writeByte((int)0x0F);
      $dictionaryIndex->writeVInt(0);              // DocFreq
      $dictionaryIndex->writeVInt(0);              // FreqDelta
      $dictionaryIndex->writeVInt(0);              // ProxDelta
      $dictionaryIndex->writeVInt(24);             // IndexDelta

      $this->_frqFile = $this->_directory->createFile($frqName);
      $this->_prxFile = $this->_directory->createFile($prxName);

      foreach (array($tisName, $tiiName, $frqName, $prxName) as $createdFile) {
          $this->_files[] = $createdFile;
      }

      // Reset the term-writing state.
      $this->_termCount         = 0;
      // 24 matches the IndexDelta written above; presumably the byte size of
      // the '.tis' header (4 + 8 + 4 + 4 + 4) - TODO confirm writeInt/writeLong widths.
      $this->_lastIndexPosition = 24;
      $this->_prevTerm          = null;
      $this->_prevTermInfo      = null;
      $this->_prevIndexTerm     = null;
      $this->_prevIndexTermInfo = null;
  }
  /**
   * Add a term and its postings to the segment.
   *
   * $termDocs is an array( docId => array(pos1, pos2, pos3, ...), ... ).
   * Document ids and positions are written as deltas against the previous
   * value, so both are expected in ascending order.
   *
   * For every document the '.frq' file receives the doc id delta shifted left
   * by one bit; the lowest bit is set when the document does not have more
   * than one position, otherwise the position count follows as a separate
   * value. Position deltas go to the '.prx' file. The term is then appended
   * to the term dictionary and, for every self::$indexInterval-th term, to
   * the dictionary index as well.
   *
   * initializeDictionaryFiles() must have been called before.
   *
   * @param Zend_Search_Lucene_Index_Term $termEntry
   * @param array $termDocs
   */
  public function addTerm($termEntry, $termDocs)
  {
      $frqStart = $this->_frqFile->tell();
      $prxStart = $this->_prxFile->tell();

      $lastDocId = 0;
      foreach ($termDocs as $docId => $positions) {
          $shiftedDelta = ($docId - $lastDocId) * 2;
          $lastDocId    = $docId;

          $occurrences = count($positions);
          if ($occurrences > 1) {
              $this->_frqFile->writeVInt($shiftedDelta);
              $this->_frqFile->writeVInt($occurrences);
          } else {
              // Low bit set: the frequency is implied and not written.
              $this->_frqFile->writeVInt($shiftedDelta + 1);
          }

          $lastPosition = 0;
          foreach ($positions as $position) {
              $this->_prxFile->writeVInt($position - $lastPosition);
              $lastPosition = $position;
          }
      }

      $docFreq = count($termDocs);

      /**
       * @todo Write Skip Data to a freq file.
       * It's not used now, but would make the index more optimal.
       * Until then only the offset is computed; no skip data is written.
       */
      $skipOffset = 0;
      if ($docFreq >= self::$skipInterval) {
          $skipOffset = $this->_frqFile->tell() - $frqStart;
      }

      // The dictionary stores the field by its number, not by its name.
      $fieldNumber = $this->_fields[$termEntry->field]->number;
      $term        = new Zend_Search_Lucene_Index_Term($termEntry->text, $fieldNumber);
      $termInfo    = new Zend_Search_Lucene_Index_TermInfo($docFreq, $frqStart, $prxStart, $skipOffset);

      $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

      $isIndexedTerm = (($this->_termCount + 1) % self::$indexInterval == 0);
      if ($isIndexedTerm) {
          $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

          // IndexDelta: distance in '.tis' from the previously indexed position.
          $tisPosition = $this->_tisFile->tell();
          $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
          $this->_lastIndexPosition = $tisPosition;
      }

      $this->_termCount++;
  }
  /**
   * Finalize the dictionary headers.
   *
   * Overwrites the placeholder term counts at byte offset 4 of the '.tis' and
   * '.tii' files. The file objects are not closed or released here.
   */
  public function closeDictionaryFiles()
  {
      $writtenTerms = $this->_termCount;

      $this->_tisFile->seek(4);
      $this->_tisFile->writeLong($writtenTerms);

      $this->_tiiFile->seek(4);

      // One index entry per complete interval of terms, + 1 for the additional
      // special index entry (empty term at the start of the list).
      $interval     = self::$indexInterval;
      $indexEntries = ($writtenTerms - $writtenTerms % $interval) / $interval + 1;
      $this->_tiiFile->writeLong($indexEntries);
  }
  /**
   * Dump Term Dictionary segment file entry.
   * Used to write an entry to the '.tis' or '.tii' file.
   *
   * The term text is prefix-compressed against the previous term of the same
   * field: the number of shared leading characters is written, followed by
   * the remaining suffix. Frequency and position pointers are written as
   * deltas against the previous term info when there is one, and as absolute
   * values otherwise. A skip offset is written only when it is non-zero.
   *
   * $prevTerm and $prevTermInfo are passed by reference and are updated to
   * $term and $termInfo, so the caller's "previous entry" state advances.
   *
   * @param Zend_Search_Lucene_Storage_File $dicFile
   * @param Zend_Search_Lucene_Index_Term $prevTerm
   * @param Zend_Search_Lucene_Index_Term $term
   * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
   * @param Zend_Search_Lucene_Index_TermInfo $termInfo
   */
  protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm, Zend_Search_Lucene_Index_Term $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
  {
      // Defaults for "no usable previous term": nothing shared, whole text is the suffix.
      $sharedChars = 0;
      $suffix      = $term->text;

      if (isset($prevTerm) && $prevTerm->field == $term->field) {
          $previousText = $prevTerm->text;
          $currentText  = $term->text;

          // Length of the common prefix, measured in bytes.
          $byteLimit   = min(strlen($previousText), strlen($currentText));
          $commonBytes = 0;
          while ($commonBytes < $byteLimit &&
                 $previousText[$commonBytes] == $currentText[$commonBytes]) {
              $commonBytes++;
          }

          // Convert the byte prefix into whole UTF-8 characters. The width of
          // each character is derived from its lead byte.
          $consumedBytes = 0;
          while ($consumedBytes < $commonBytes) {
              $leadByte = ord($currentText[$consumedBytes]);

              if (($leadByte & 0xC0) != 0xC0) {
                  $charWidth = 1;
              } elseif (($leadByte & 0x20) == 0) {
                  $charWidth = 2;
              } elseif (($leadByte & 0x10) == 0) {
                  $charWidth = 3;
              } else {
                  $charWidth = 4;
              }

              if ($consumedBytes + $charWidth > $commonBytes) {
                  // The character crosses the matched-bytes boundary and is
                  // therefore only partially shared: it belongs to the suffix.
                  break;
              }

              $sharedChars++;
              $consumedBytes += $charWidth;
          }

          $suffix = substr($currentText, $consumedBytes);
      }

      // Prefix length (in characters) and suffix
      $dicFile->writeVInt($sharedChars);
      $dicFile->writeString($suffix);

      // Field number
      $dicFile->writeVInt($term->field);

      // DocFreq (the count of documents which contain the term)
      $dicFile->writeVInt($termInfo->docFreq);

      $prevTerm = $term;

      if (isset($prevTermInfo)) {
          // FreqDelta and ProxDelta relative to the previous entry
          $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
          $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
      } else {
          // First entry: the pointers themselves serve as deltas
          $dicFile->writeVInt($termInfo->freqPointer);
          $dicFile->writeVInt($termInfo->proxPointer);
      }

      // SkipOffset is present in the entry only when it is non-zero
      if ($termInfo->skipOffset != 0) {
          $dicFile->writeVInt($termInfo->skipOffset);
      }

      $prevTermInfo = $termInfo;
  }
  /**
   * Generate compound index file.
   *
   * Writes '<segment name>.cfs': first the number of files, then a directory
   * of (data offset, file name) entries, then the contents of every file in
   * the segment file list. The offsets are written as 0 placeholders first
   * and patched once the position of each file's data is known. Every source
   * file is deleted from the directory after it has been copied.
   *
   * @throws RuntimeException if a source file yields no data although, by
   *                          its reported length, bytes are still expected
   */
  protected function _generateCFS()
  {
      $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
      $cfsFile->writeVInt(count($this->_files));

      // Pass 1: directory entries with placeholder offsets.
      $dataOffsetPointers = array();
      foreach ($this->_files as $fileName) {
          $dataOffsetPointers[$fileName] = $cfsFile->tell();
          $cfsFile->writeLong(0); // placeholder, patched in pass 2
          $cfsFile->writeString($fileName);
      }

      // Pass 2: patch each offset, then append the file contents.
      foreach ($this->_files as $fileName) {
          // The data of this file starts at the current end of the compound file.
          $dataOffset = $cfsFile->tell();

          $cfsFile->seek($dataOffsetPointers[$fileName]);
          $cfsFile->writeLong($dataOffset);
          $cfsFile->seek($dataOffset);

          $dataFile  = $this->_directory->getFileObject($fileName);
          $byteCount = $this->_directory->fileLength($fileName);

          while ($byteCount > 0) {
              $data        = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
              $chunkLength = strlen($data);

              if ($chunkLength == 0) {
                  // Without this guard an empty read would leave $byteCount
                  // unchanged and the loop would never terminate.
                  throw new RuntimeException(
                      "Unexpected end of data while copying '" . $fileName . "' into the compound file."
                  );
              }

              $byteCount -= $chunkLength;
              $cfsFile->writeBytes($data);
          }

          $this->_directory->deleteFile($fileName);
      }
  }
  535. /**
  536. * Close segment, write it to disk and return segment info.
  537. * Declared abstract here: this class provides no implementation of it.
  538. * @return Zend_Search_Lucene_Index_SegmentInfo
  539. */
  540. abstract public function close();
  541. }