PageRenderTime 191ms CodeModel.GetById 101ms app.highlight 51ms RepoModel.GetById 31ms app.codeStats 1ms

/library/Zend/Search/Lucene/Index/SegmentWriter.php

https://bitbucket.org/fabiancarlos/feature_seguimentos
PHP | 634 lines | 278 code | 88 blank | 268 comment | 31 complexity | 38d31267b44040be2ecf5c95e9bdf5a4 MD5 | raw file
  1<?php
  2/**
  3 * Zend Framework
  4 *
  5 * LICENSE
  6 *
  7 * This source file is subject to the new BSD license that is bundled
  8 * with this package in the file LICENSE.txt.
  9 * It is also available through the world-wide-web at this URL:
 10 * http://framework.zend.com/license/new-bsd
 11 * If you did not receive a copy of the license and are unable to
 12 * obtain it through the world-wide-web, please send an email
 13 * to license@zend.com so we can send you a copy immediately.
 14 *
 15 * @category   Zend
 16 * @package    Zend_Search_Lucene
 17 * @subpackage Index
 18 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 19 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 20 * @version    $Id: SegmentWriter.php 23775 2011-03-01 17:25:24Z ralph $
 21 */
 22
 23
 24/** Zend_Search_Lucene_Index_FieldInfo */
 25require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
 26
 27/** Zend_Search_Lucene_Index_Term */
 28require_once 'Zend/Search/Lucene/Index/Term.php';
 29
 30/** Zend_Search_Lucene_Index_TermInfo */
 31require_once 'Zend/Search/Lucene/Index/TermInfo.php';
 32
 33/**
 34 * @category   Zend
 35 * @package    Zend_Search_Lucene
 36 * @subpackage Index
 37 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 38 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 39 */
 40abstract class Zend_Search_Lucene_Index_SegmentWriter
 41{
 42    /**
 43     * Expert: The fraction of terms in the "dictionary" which should be stored
 44     * in RAM.  Smaller values use more memory, but make searching slightly
 45     * faster, while larger values use less memory and make searching slightly
 46     * slower.  Searching is typically not dominated by dictionary lookup, so
 47     * tweaking this is rarely useful.
 48     *
 49     * @var integer
 50     */
 51    public static $indexInterval = 128;
 52
 53    /**
 54     * Expert: The fraction of TermDocs entries stored in skip tables.
 55     * Larger values result in smaller indexes, greater acceleration, but fewer
 56     * accelerable cases, while smaller values result in bigger indexes,
 57     * less acceleration and more
 58     * accelerable cases. More detailed experiments would be useful here.
 59     *
 60     * 0x7FFFFFFF indicates that we don't use skip data
 61     *
 62     * Note: not used in current implementation
 63     *
 64     * @var integer
 65     */
 66    public static $skipInterval = 0x7FFFFFFF;
 67
 68    /**
 69     * Expert: The maximum number of skip levels. Smaller values result in
 70     * slightly smaller indexes, but slower skipping in big posting lists.
 71     *
 72     * 0 indicates that we don't use skip data
 73     *
 74     * Note: not used in current implementation
 75     *
 76     * @var integer
 77     */
 78    public static $maxSkipLevels = 0;
 79
 80    /**
 81     * Number of docs in a segment
 82     *
 83     * @var integer
 84     */
 85    protected $_docCount = 0;
 86
 87    /**
 88     * Segment name
 89     *
 90     * @var string
 91     */
 92    protected $_name;
 93
 94    /**
 95     * File system adapter.
 96     *
 97     * @var Zend_Search_Lucene_Storage_Directory
 98     */
 99    protected $_directory;
100
101    /**
102     * List of the index files.
103     * Used for automatic compound file generation
104     *
 105     * @var array
106     */
107    protected $_files = array();
108
109    /**
110     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
111     *
112     * @var array
113     */
114    protected $_fields = array();
115
116    /**
117     * Normalization factors.
118     * An array fieldName => normVector
119     * normVector is a binary string.
120     * Each byte corresponds to an indexed document in a segment and
121     * encodes normalization factor (float value, encoded by
122     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
123     *
124     * @var array
125     */
126    protected $_norms = array();
127
128
129    /**
130     * '.fdx'  file - Stored Fields, the field index.
131     *
132     * @var Zend_Search_Lucene_Storage_File
133     */
134    protected $_fdxFile = null;
135
136    /**
137     * '.fdt'  file - Stored Fields, the field data.
138     *
139     * @var Zend_Search_Lucene_Storage_File
140     */
141    protected $_fdtFile = null;
142
143
144    /**
145     * Object constructor.
146     *
147     * @param Zend_Search_Lucene_Storage_Directory $directory
148     * @param string $name
149     */
150    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
151    {
152        $this->_directory = $directory;
153        $this->_name      = $name;
154    }
155
156
157    /**
158     * Add field to the segment
159     *
160     * Returns actual field number
161     *
162     * @param Zend_Search_Lucene_Field $field
163     * @return integer
164     */
165    public function addField(Zend_Search_Lucene_Field $field)
166    {
167        if (!isset($this->_fields[$field->name])) {
168            $fieldNumber = count($this->_fields);
169            $this->_fields[$field->name] =
170                                new Zend_Search_Lucene_Index_FieldInfo($field->name,
171                                                                       $field->isIndexed,
172                                                                       $fieldNumber,
173                                                                       $field->storeTermVector);
174
175            return $fieldNumber;
176        } else {
177            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
178            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
179
180            return $this->_fields[$field->name]->number;
181        }
182    }
183
184    /**
185     * Add fieldInfo to the segment
186     *
187     * Returns actual field number
188     *
189     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
190     * @return integer
191     */
192    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
193    {
194        if (!isset($this->_fields[$fieldInfo->name])) {
195            $fieldNumber = count($this->_fields);
196            $this->_fields[$fieldInfo->name] =
197                                new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
198                                                                       $fieldInfo->isIndexed,
199                                                                       $fieldNumber,
200                                                                       $fieldInfo->storeTermVector);
201
202            return $fieldNumber;
203        } else {
204            $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
205            $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
206
207            return $this->_fields[$fieldInfo->name]->number;
208        }
209    }
210
211    /**
212     * Returns array of FieldInfo objects.
213     *
214     * @return array
215     */
216    public function getFieldInfos()
217    {
218        return $this->_fields;
219    }
220
221    /**
222     * Add stored fields information
223     *
224     * @param array $storedFields array of Zend_Search_Lucene_Field objects
225     */
226    public function addStoredFields($storedFields)
227    {
228        if (!isset($this->_fdxFile)) {
229            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
230            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
231
232            $this->_files[] = $this->_name . '.fdx';
233            $this->_files[] = $this->_name . '.fdt';
234        }
235
236        $this->_fdxFile->writeLong($this->_fdtFile->tell());
237        $this->_fdtFile->writeVInt(count($storedFields));
238        foreach ($storedFields as $field) {
239            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
240            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
241                         ($field->isBinary ?    0x02 : 0x00) |
242                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
243            $this->_fdtFile->writeByte($fieldBits);
244            if ($field->isBinary) {
245                $this->_fdtFile->writeVInt(strlen($field->value));
246                $this->_fdtFile->writeBytes($field->value);
247            } else {
248                $this->_fdtFile->writeString($field->getUtf8Value());
249            }
250        }
251
252        $this->_docCount++;
253    }
254
255    /**
256     * Returns the total number of documents in this segment.
257     *
258     * @return integer
259     */
260    public function count()
261    {
262        return $this->_docCount;
263    }
264
265    /**
266     * Return segment name
267     *
268     * @return string
269     */
270    public function getName()
271    {
272        return $this->_name;
273    }
274
275    /**
276     * Dump Field Info (.fnm) segment file
277     */
278    protected function _dumpFNM()
279    {
280        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
281        $fnmFile->writeVInt(count($this->_fields));
282
283        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
284        // Write header
285        $nrmFile->writeBytes('NRM');
286        // Write format specifier
287        $nrmFile->writeByte((int)0xFF);
288
289        foreach ($this->_fields as $field) {
290            $fnmFile->writeString($field->name);
291            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
292                                ($field->storeTermVector ? 0x02 : 0x00)
293// not supported yet            0x04 /* term positions are stored with the term vectors */ |
294// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
295                               );
296
297            if ($field->isIndexed) {
298                // pre-2.1 index mode (not used now)
299                // $normFileName = $this->_name . '.f' . $field->number;
300                // $fFile = $this->_directory->createFile($normFileName);
301                // $fFile->writeBytes($this->_norms[$field->name]);
302                // $this->_files[] = $normFileName;
303
304                $nrmFile->writeBytes($this->_norms[$field->name]);
305            }
306        }
307
308        $this->_files[] = $this->_name . '.fnm';
309        $this->_files[] = $this->_name . '.nrm';
310    }
311
312
313
314    /**
315     * Term Dictionary file
316     *
317     * @var Zend_Search_Lucene_Storage_File
318     */
319    private $_tisFile = null;
320
321    /**
322     * Term Dictionary index file
323     *
324     * @var Zend_Search_Lucene_Storage_File
325     */
326    private $_tiiFile = null;
327
328    /**
329     * Frequencies file
330     *
331     * @var Zend_Search_Lucene_Storage_File
332     */
333    private $_frqFile = null;
334
335    /**
336     * Positions file
337     *
338     * @var Zend_Search_Lucene_Storage_File
339     */
340    private $_prxFile = null;
341
342    /**
343     * Number of written terms
344     *
345     * @var integer
346     */
347    private $_termCount;
348
349
350    /**
351     * Last saved term
352     *
353     * @var Zend_Search_Lucene_Index_Term
354     */
355    private $_prevTerm;
356
357    /**
358     * Last saved term info
359     *
360     * @var Zend_Search_Lucene_Index_TermInfo
361     */
362    private $_prevTermInfo;
363
364    /**
365     * Last saved index term
366     *
367     * @var Zend_Search_Lucene_Index_Term
368     */
369    private $_prevIndexTerm;
370
371    /**
372     * Last saved index term info
373     *
374     * @var Zend_Search_Lucene_Index_TermInfo
375     */
376    private $_prevIndexTermInfo;
377
378    /**
379     * Last term dictionary file position
380     *
381     * @var integer
382     */
383    private $_lastIndexPosition;
384
385    /**
386     * Create dicrionary, frequency and positions files and write necessary headers
387     */
388    public function initializeDictionaryFiles()
389    {
390        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
391        $this->_tisFile->writeInt((int)0xFFFFFFFD);
392        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
393        $this->_tisFile->writeInt(self::$indexInterval);
394        $this->_tisFile->writeInt(self::$skipInterval);
395        $this->_tisFile->writeInt(self::$maxSkipLevels);
396
397        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
398        $this->_tiiFile->writeInt((int)0xFFFFFFFD);
399        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
400        $this->_tiiFile->writeInt(self::$indexInterval);
401        $this->_tiiFile->writeInt(self::$skipInterval);
402        $this->_tiiFile->writeInt(self::$maxSkipLevels);
403
404        /** Dump dictionary header */
405        $this->_tiiFile->writeVInt(0);                    // preffix length
406        $this->_tiiFile->writeString('');                 // suffix
407        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
408        $this->_tiiFile->writeByte((int)0x0F);
409        $this->_tiiFile->writeVInt(0);                    // DocFreq
410        $this->_tiiFile->writeVInt(0);                    // FreqDelta
411        $this->_tiiFile->writeVInt(0);                    // ProxDelta
412        $this->_tiiFile->writeVInt(24);                   // IndexDelta
413
414        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
415        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
416
417        $this->_files[] = $this->_name . '.tis';
418        $this->_files[] = $this->_name . '.tii';
419        $this->_files[] = $this->_name . '.frq';
420        $this->_files[] = $this->_name . '.prx';
421
422        $this->_prevTerm          = null;
423        $this->_prevTermInfo      = null;
424        $this->_prevIndexTerm     = null;
425        $this->_prevIndexTermInfo = null;
426        $this->_lastIndexPosition = 24;
427        $this->_termCount         = 0;
428
429    }
430
431    /**
432     * Add term
433     *
434     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
435     *
436     * @param Zend_Search_Lucene_Index_Term $termEntry
437     * @param array $termDocs
438     */
439    public function addTerm($termEntry, $termDocs)
440    {
441        $freqPointer = $this->_frqFile->tell();
442        $proxPointer = $this->_prxFile->tell();
443
444        $prevDoc = 0;
445        foreach ($termDocs as $docId => $termPositions) {
446            $docDelta = ($docId - $prevDoc)*2;
447            $prevDoc = $docId;
448            if (count($termPositions) > 1) {
449                $this->_frqFile->writeVInt($docDelta);
450                $this->_frqFile->writeVInt(count($termPositions));
451            } else {
452                $this->_frqFile->writeVInt($docDelta + 1);
453            }
454
455            $prevPosition = 0;
456            foreach ($termPositions as $position) {
457                $this->_prxFile->writeVInt($position - $prevPosition);
458                $prevPosition = $position;
459            }
460        }
461
462        if (count($termDocs) >= self::$skipInterval) {
463            /**
464             * @todo Write Skip Data to a freq file.
465             * It's not used now, but make index more optimal
466             */
467            $skipOffset = $this->_frqFile->tell() - $freqPointer;
468        } else {
469            $skipOffset = 0;
470        }
471
472        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
473                                                  $this->_fields[$termEntry->field]->number);
474        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
475                                                          $freqPointer, $proxPointer, $skipOffset);
476
477        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
478
479        if (($this->_termCount + 1) % self::$indexInterval == 0) {
480            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
481
482            $indexPosition = $this->_tisFile->tell();
483            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
484            $this->_lastIndexPosition = $indexPosition;
485
486        }
487        $this->_termCount++;
488    }
489
490    /**
491     * Close dictionary
492     */
493    public function closeDictionaryFiles()
494    {
495        $this->_tisFile->seek(4);
496        $this->_tisFile->writeLong($this->_termCount);
497
498        $this->_tiiFile->seek(4);
499        // + 1 is used to count an additional special index entry (empty term at the start of the list)
500        $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
501    }
502
503
504    /**
505     * Dump Term Dictionary segment file entry.
506     * Used to write entry to .tis or .tii files
507     *
508     * @param Zend_Search_Lucene_Storage_File $dicFile
509     * @param Zend_Search_Lucene_Index_Term $prevTerm
510     * @param Zend_Search_Lucene_Index_Term $term
511     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
512     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
513     */
514    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
515                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
516                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
517    {
518        if (isset($prevTerm) && $prevTerm->field == $term->field) {
519            $matchedBytes = 0;
520            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
521            while ($matchedBytes < $maxBytes  &&
522                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
523                $matchedBytes++;
524            }
525
526            // Calculate actual matched UTF-8 pattern
527            $prefixBytes = 0;
528            $prefixChars = 0;
529            while ($prefixBytes < $matchedBytes) {
530                $charBytes = 1;
531                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
532                    $charBytes++;
533                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
534                        $charBytes++;
535                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
536                            $charBytes++;
537                        }
538                    }
539                }
540
541                if ($prefixBytes + $charBytes > $matchedBytes) {
542                    // char crosses matched bytes boundary
543                    // skip char
544                    break;
545                }
546
547                $prefixChars++;
548                $prefixBytes += $charBytes;
549            }
550
551            // Write preffix length
552            $dicFile->writeVInt($prefixChars);
553            // Write suffix
554            $dicFile->writeString(substr($term->text, $prefixBytes));
555        } else {
556            // Write preffix length
557            $dicFile->writeVInt(0);
558            // Write suffix
559            $dicFile->writeString($term->text);
560        }
561        // Write field number
562        $dicFile->writeVInt($term->field);
563        // DocFreq (the count of documents which contain the term)
564        $dicFile->writeVInt($termInfo->docFreq);
565
566        $prevTerm = $term;
567
568        if (!isset($prevTermInfo)) {
569            // Write FreqDelta
570            $dicFile->writeVInt($termInfo->freqPointer);
571            // Write ProxDelta
572            $dicFile->writeVInt($termInfo->proxPointer);
573        } else {
574            // Write FreqDelta
575            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
576            // Write ProxDelta
577            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
578        }
579        // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
580        if ($termInfo->skipOffset != 0) {
581            $dicFile->writeVInt($termInfo->skipOffset);
582        }
583
584        $prevTermInfo = $termInfo;
585    }
586
587
588    /**
589     * Generate compound index file
590     */
591    protected function _generateCFS()
592    {
593        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
594        $cfsFile->writeVInt(count($this->_files));
595
596        $dataOffsetPointers = array();
597        foreach ($this->_files as $fileName) {
598            $dataOffsetPointers[$fileName] = $cfsFile->tell();
599            $cfsFile->writeLong(0); // write dummy data
600            $cfsFile->writeString($fileName);
601        }
602
603        foreach ($this->_files as $fileName) {
604            // Get actual data offset
605            $dataOffset = $cfsFile->tell();
606            // Seek to the data offset pointer
607            $cfsFile->seek($dataOffsetPointers[$fileName]);
608            // Write actual data offset value
609            $cfsFile->writeLong($dataOffset);
610            // Seek back to the end of file
611            $cfsFile->seek($dataOffset);
612
613            $dataFile = $this->_directory->getFileObject($fileName);
614
615            $byteCount = $this->_directory->fileLength($fileName);
616            while ($byteCount > 0) {
617                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
618                $byteCount -= strlen($data);
619                $cfsFile->writeBytes($data);
620            }
621
622            $this->_directory->deleteFile($fileName);
623        }
624    }
625
626
627    /**
628     * Close segment, write it to disk and return segment info
629     *
630     * @return Zend_Search_Lucene_Index_SegmentInfo
631     */
632    abstract public function close();
633}
634