PageRenderTime 247ms CodeModel.GetById 111ms app.highlight 19ms RepoModel.GetById 111ms app.codeStats 1ms

/library/Zend/Search/Lucene/Index/SegmentWriter.php

https://bitbucket.org/baruffaldi/website-2008-computer-shopping-3
PHP | 631 lines | 277 code | 88 blank | 266 comment | 31 complexity | 9672b23c95e9822fb53736bd59385f2f MD5 | raw file
  1<?php
  2/**
  3 * Zend Framework
  4 *
  5 * LICENSE
  6 *
  7 * This source file is subject to the new BSD license that is bundled
  8 * with this package in the file LICENSE.txt.
  9 * It is also available through the world-wide-web at this URL:
 10 * http://framework.zend.com/license/new-bsd
 11 * If you did not receive a copy of the license and are unable to
 12 * obtain it through the world-wide-web, please send an email
 13 * to license@zend.com so we can send you a copy immediately.
 14 *
 15 * @category   Zend
 16 * @package    Zend_Search_Lucene
 17 * @subpackage Index
 18 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 19 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 20 */
 21
 22
 23/** Zend_Search_Lucene_Exception */
 24require_once 'Zend/Search/Lucene/Exception.php';
 25
 26/** Zend_Search_Lucene_Index_SegmentInfo */
 27require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
 28
 29
 30/**
 31 * @category   Zend
 32 * @package    Zend_Search_Lucene
 33 * @subpackage Index
 34 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 35 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 36 */
 37abstract class Zend_Search_Lucene_Index_SegmentWriter
 38{
 39    /**
 40     * Expert: The fraction of terms in the "dictionary" which should be stored
 41     * in RAM.  Smaller values use more memory, but make searching slightly
 42     * faster, while larger values use less memory and make searching slightly
 43     * slower.  Searching is typically not dominated by dictionary lookup, so
 44     * tweaking this is rarely useful.
 45     *
 46     * @var integer
 47     */
 48    public static $indexInterval = 128;
 49
 50    /**
 51     * Expert: The fraction of TermDocs entries stored in skip tables.
 52     * Larger values result in smaller indexes, greater acceleration, but fewer
 53     * accelerable cases, while smaller values result in bigger indexes,
 54     * less acceleration and more
 55     * accelerable cases. More detailed experiments would be useful here.
 56     *
 57     * 0x7FFFFFFF indicates that we don't use skip data
 58     *
 59     * Note: not used in current implementation
 60     *
 61     * @var integer
 62     */
 63    public static $skipInterval = 0x7FFFFFFF;
 64
 65    /**
 66     * Expert: The maximum number of skip levels. Smaller values result in
 67     * slightly smaller indexes, but slower skipping in big posting lists.
 68     *
 69     * 0 indicates that we don't use skip data
 70     *
 71     * Note: not used in current implementation
 72     *
 73     * @var integer
 74     */
 75    public static $maxSkipLevels = 0;
 76
 77    /**
 78     * Number of docs in a segment
 79     *
 80     * @var integer
 81     */
 82    protected $_docCount = 0;
 83
 84    /**
 85     * Segment name
 86     *
 87     * @var string
 88     */
 89    protected $_name;
 90
 91    /**
 92     * File system adapter.
 93     *
 94     * @var Zend_Search_Lucene_Storage_Directory
 95     */
 96    protected $_directory;
 97
 98    /**
 99     * List of the index files.
100     * Used for automatic compound file generation
101     *
102     * @var unknown_type
103     */
104    protected $_files = array();
105
106    /**
107     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
108     *
109     * @var array
110     */
111    protected $_fields = array();
112
113    /**
114     * Normalization factors.
115     * An array fieldName => normVector
116     * normVector is a binary string.
117     * Each byte corresponds to an indexed document in a segment and
118     * encodes normalization factor (float value, encoded by
119     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
120     *
121     * @var array
122     */
123    protected $_norms = array();
124
125
126    /**
127     * '.fdx'  file - Stored Fields, the field index.
128     *
129     * @var Zend_Search_Lucene_Storage_File
130     */
131    protected $_fdxFile = null;
132
133    /**
134     * '.fdt'  file - Stored Fields, the field data.
135     *
136     * @var Zend_Search_Lucene_Storage_File
137     */
138    protected $_fdtFile = null;
139
140
141    /**
142     * Object constructor.
143     *
144     * @param Zend_Search_Lucene_Storage_Directory $directory
145     * @param string $name
146     */
147    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
148    {
149        $this->_directory = $directory;
150        $this->_name      = $name;
151    }
152
153
154    /**
155     * Add field to the segment
156     *
157     * Returns actual field number
158     *
159     * @param Zend_Search_Lucene_Field $field
160     * @return integer
161     */
162    public function addField(Zend_Search_Lucene_Field $field)
163    {
164        if (!isset($this->_fields[$field->name])) {
165            $fieldNumber = count($this->_fields);
166            $this->_fields[$field->name] =
167                                new Zend_Search_Lucene_Index_FieldInfo($field->name,
168                                                                       $field->isIndexed,
169                                                                       $fieldNumber,
170                                                                       $field->storeTermVector);
171
172            return $fieldNumber;
173        } else {
174            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
175            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
176
177            return $this->_fields[$field->name]->number;
178        }
179    }
180
181    /**
182     * Add fieldInfo to the segment
183     *
184     * Returns actual field number
185     *
186     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
187     * @return integer
188     */
189    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
190    {
191        if (!isset($this->_fields[$fieldInfo->name])) {
192            $fieldNumber = count($this->_fields);
193            $this->_fields[$fieldInfo->name] =
194                                new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
195                                                                       $fieldInfo->isIndexed,
196                                                                       $fieldNumber,
197                                                                       $fieldInfo->storeTermVector);
198
199            return $fieldNumber;
200        } else {
201            $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
202            $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
203
204            return $this->_fields[$fieldInfo->name]->number;
205        }
206    }
207
208    /**
209     * Returns array of FieldInfo objects.
210     *
211     * @return array
212     */
213    public function getFieldInfos()
214    {
215        return $this->_fields;
216    }
217
218    /**
219     * Add stored fields information
220     *
221     * @param array $storedFields array of Zend_Search_Lucene_Field objects
222     */
223    public function addStoredFields($storedFields)
224    {
225        if (!isset($this->_fdxFile)) {
226            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
227            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
228
229            $this->_files[] = $this->_name . '.fdx';
230            $this->_files[] = $this->_name . '.fdt';
231        }
232
233        $this->_fdxFile->writeLong($this->_fdtFile->tell());
234        $this->_fdtFile->writeVInt(count($storedFields));
235        foreach ($storedFields as $field) {
236            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
237            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
238                         ($field->isBinary ?    0x02 : 0x00) |
239                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
240            $this->_fdtFile->writeByte($fieldBits);
241            if ($field->isBinary) {
242                $this->_fdtFile->writeVInt(strlen($field->value));
243                $this->_fdtFile->writeBytes($field->value);
244            } else {
245                $this->_fdtFile->writeString($field->getUtf8Value());
246            }
247        }
248
249        $this->_docCount++;
250    }
251
252    /**
253     * Returns the total number of documents in this segment.
254     *
255     * @return integer
256     */
257    public function count()
258    {
259        return $this->_docCount;
260    }
261
262    /**
263     * Return segment name
264     *
265     * @return string
266     */
267    public function getName()
268    {
269        return $this->_name;
270    }
271
272    /**
273     * Dump Field Info (.fnm) segment file
274     */
275    protected function _dumpFNM()
276    {
277        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
278        $fnmFile->writeVInt(count($this->_fields));
279
280        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
281        // Write header
282        $nrmFile->writeBytes('NRM');
283        // Write format specifier
284        $nrmFile->writeByte((int)0xFF);
285
286        foreach ($this->_fields as $field) {
287            $fnmFile->writeString($field->name);
288            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
289                                ($field->storeTermVector ? 0x02 : 0x00)
290// not supported yet            0x04 /* term positions are stored with the term vectors */ |
291// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
292                               );
293
294            if ($field->isIndexed) {
295                // pre-2.1 index mode (not used now)
296                // $normFileName = $this->_name . '.f' . $field->number;
297                // $fFile = $this->_directory->createFile($normFileName);
298                // $fFile->writeBytes($this->_norms[$field->name]);
299                // $this->_files[] = $normFileName;
300
301                $nrmFile->writeBytes($this->_norms[$field->name]);
302            }
303        }
304
305        $this->_files[] = $this->_name . '.fnm';
306        $this->_files[] = $this->_name . '.nrm';
307    }
308
309
310
311    /**
312     * Term Dictionary file
313     *
314     * @var Zend_Search_Lucene_Storage_File
315     */
316    private $_tisFile = null;
317
318    /**
319     * Term Dictionary index file
320     *
321     * @var Zend_Search_Lucene_Storage_File
322     */
323    private $_tiiFile = null;
324
325    /**
326     * Frequencies file
327     *
328     * @var Zend_Search_Lucene_Storage_File
329     */
330    private $_frqFile = null;
331
332    /**
333     * Positions file
334     *
335     * @var Zend_Search_Lucene_Storage_File
336     */
337    private $_prxFile = null;
338
339    /**
340     * Number of written terms
341     *
342     * @var integer
343     */
344    private $_termCount;
345
346
347    /**
348     * Last saved term
349     *
350     * @var Zend_Search_Lucene_Index_Term
351     */
352    private $_prevTerm;
353
354    /**
355     * Last saved term info
356     *
357     * @var Zend_Search_Lucene_Index_TermInfo
358     */
359    private $_prevTermInfo;
360
361    /**
362     * Last saved index term
363     *
364     * @var Zend_Search_Lucene_Index_Term
365     */
366    private $_prevIndexTerm;
367
368    /**
369     * Last saved index term info
370     *
371     * @var Zend_Search_Lucene_Index_TermInfo
372     */
373    private $_prevIndexTermInfo;
374
375    /**
376     * Last term dictionary file position
377     *
378     * @var integer
379     */
380    private $_lastIndexPosition;
381
382    /**
383     * Create dicrionary, frequency and positions files and write necessary headers
384     */
385    public function initializeDictionaryFiles()
386    {
387        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
388        $this->_tisFile->writeInt((int)0xFFFFFFFD);
389        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
390        $this->_tisFile->writeInt(self::$indexInterval);
391        $this->_tisFile->writeInt(self::$skipInterval);
392        $this->_tisFile->writeInt(self::$maxSkipLevels);
393
394        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
395        $this->_tiiFile->writeInt((int)0xFFFFFFFD);
396        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
397        $this->_tiiFile->writeInt(self::$indexInterval);
398        $this->_tiiFile->writeInt(self::$skipInterval);
399        $this->_tiiFile->writeInt(self::$maxSkipLevels);
400
401        /** Dump dictionary header */
402        $this->_tiiFile->writeVInt(0);                    // preffix length
403        $this->_tiiFile->writeString('');                 // suffix
404        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
405        $this->_tiiFile->writeByte((int)0x0F);
406        $this->_tiiFile->writeVInt(0);                    // DocFreq
407        $this->_tiiFile->writeVInt(0);                    // FreqDelta
408        $this->_tiiFile->writeVInt(0);                    // ProxDelta
409        $this->_tiiFile->writeVInt(24);                   // IndexDelta
410
411        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
412        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
413
414        $this->_files[] = $this->_name . '.tis';
415        $this->_files[] = $this->_name . '.tii';
416        $this->_files[] = $this->_name . '.frq';
417        $this->_files[] = $this->_name . '.prx';
418
419        $this->_prevTerm          = null;
420        $this->_prevTermInfo      = null;
421        $this->_prevIndexTerm     = null;
422        $this->_prevIndexTermInfo = null;
423        $this->_lastIndexPosition = 24;
424        $this->_termCount         = 0;
425
426    }
427
428    /**
429     * Add term
430     *
431     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
432     *
433     * @param Zend_Search_Lucene_Index_Term $termEntry
434     * @param array $termDocs
435     */
436    public function addTerm($termEntry, $termDocs)
437    {
438        $freqPointer = $this->_frqFile->tell();
439        $proxPointer = $this->_prxFile->tell();
440
441        $prevDoc = 0;
442        foreach ($termDocs as $docId => $termPositions) {
443            $docDelta = ($docId - $prevDoc)*2;
444            $prevDoc = $docId;
445            if (count($termPositions) > 1) {
446                $this->_frqFile->writeVInt($docDelta);
447                $this->_frqFile->writeVInt(count($termPositions));
448            } else {
449                $this->_frqFile->writeVInt($docDelta + 1);
450            }
451
452            $prevPosition = 0;
453            foreach ($termPositions as $position) {
454                $this->_prxFile->writeVInt($position - $prevPosition);
455                $prevPosition = $position;
456            }
457        }
458
459        if (count($termDocs) >= self::$skipInterval) {
460            /**
461             * @todo Write Skip Data to a freq file.
462             * It's not used now, but make index more optimal
463             */
464            $skipOffset = $this->_frqFile->tell() - $freqPointer;
465        } else {
466            $skipOffset = 0;
467        }
468
469        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
470                                                  $this->_fields[$termEntry->field]->number);
471        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
472                                                          $freqPointer, $proxPointer, $skipOffset);
473
474        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
475
476        if (($this->_termCount + 1) % self::$indexInterval == 0) {
477            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
478
479            $indexPosition = $this->_tisFile->tell();
480            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
481            $this->_lastIndexPosition = $indexPosition;
482
483        }
484        $this->_termCount++;
485    }
486
487    /**
488     * Close dictionary
489     */
490    public function closeDictionaryFiles()
491    {
492        $this->_tisFile->seek(4);
493        $this->_tisFile->writeLong($this->_termCount);
494
495        $this->_tiiFile->seek(4);
496        // + 1 is used to count an additional special index entry (empty term at the start of the list)
497        $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
498    }
499
500
501    /**
502     * Dump Term Dictionary segment file entry.
503     * Used to write entry to .tis or .tii files
504     *
505     * @param Zend_Search_Lucene_Storage_File $dicFile
506     * @param Zend_Search_Lucene_Index_Term $prevTerm
507     * @param Zend_Search_Lucene_Index_Term $term
508     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
509     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
510     */
511    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
512                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
513                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
514    {
515        if (isset($prevTerm) && $prevTerm->field == $term->field) {
516            $matchedBytes = 0;
517            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
518            while ($matchedBytes < $maxBytes  &&
519                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
520                $matchedBytes++;
521            }
522
523            // Calculate actual matched UTF-8 pattern
524            $prefixBytes = 0;
525            $prefixChars = 0;
526            while ($prefixBytes < $matchedBytes) {
527                $charBytes = 1;
528                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
529                    $charBytes++;
530                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
531                        $charBytes++;
532                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
533                            $charBytes++;
534                        }
535                    }
536                }
537
538                if ($prefixBytes + $charBytes > $matchedBytes) {
539                    // char crosses matched bytes boundary
540                    // skip char
541                    break;
542                }
543
544                $prefixChars++;
545                $prefixBytes += $charBytes;
546            }
547
548            // Write preffix length
549            $dicFile->writeVInt($prefixChars);
550            // Write suffix
551            $dicFile->writeString(substr($term->text, $prefixBytes));
552        } else {
553            // Write preffix length
554            $dicFile->writeVInt(0);
555            // Write suffix
556            $dicFile->writeString($term->text);
557        }
558        // Write field number
559        $dicFile->writeVInt($term->field);
560        // DocFreq (the count of documents which contain the term)
561        $dicFile->writeVInt($termInfo->docFreq);
562
563        $prevTerm = $term;
564
565        if (!isset($prevTermInfo)) {
566            // Write FreqDelta
567            $dicFile->writeVInt($termInfo->freqPointer);
568            // Write ProxDelta
569            $dicFile->writeVInt($termInfo->proxPointer);
570        } else {
571            // Write FreqDelta
572            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
573            // Write ProxDelta
574            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
575        }
576        // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
577        if ($termInfo->skipOffset != 0) {
578            $dicFile->writeVInt($termInfo->skipOffset);
579        }
580
581        $prevTermInfo = $termInfo;
582    }
583
584
585    /**
586     * Generate compound index file
587     */
588    protected function _generateCFS()
589    {
590        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
591        $cfsFile->writeVInt(count($this->_files));
592
593        $dataOffsetPointers = array();
594        foreach ($this->_files as $fileName) {
595            $dataOffsetPointers[$fileName] = $cfsFile->tell();
596            $cfsFile->writeLong(0); // write dummy data
597            $cfsFile->writeString($fileName);
598        }
599
600        foreach ($this->_files as $fileName) {
601            // Get actual data offset
602            $dataOffset = $cfsFile->tell();
603            // Seek to the data offset pointer
604            $cfsFile->seek($dataOffsetPointers[$fileName]);
605            // Write actual data offset value
606            $cfsFile->writeLong($dataOffset);
607            // Seek back to the end of file
608            $cfsFile->seek($dataOffset);
609
610            $dataFile = $this->_directory->getFileObject($fileName);
611
612            $byteCount = $this->_directory->fileLength($fileName);
613            while ($byteCount > 0) {
614                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
615                $byteCount -= strlen($data);
616                $cfsFile->writeBytes($data);
617            }
618
619            $this->_directory->deleteFile($fileName);
620        }
621    }
622
623
624    /**
625     * Close segment, write it to disk and return segment info
626     *
627     * @return Zend_Search_Lucene_Index_SegmentInfo
628     */
629    abstract public function close();
630}
631