PageRenderTime 80ms CodeModel.GetById 40ms app.highlight 15ms RepoModel.GetById 18ms app.codeStats 1ms

/library/Zend/Search/Lucene/Index/DictionaryLoader.php

https://bitbucket.org/baruffaldi/website-2008-computer-shopping-3
PHP | 266 lines | 167 code | 30 blank | 69 comment | 55 complexity | ff41c7736c98eee2730cba233e457dde MD5 | raw file
  1<?php
  2/**
  3 * Zend Framework
  4 *
  5 * LICENSE
  6 *
  7 * This source file is subject to the new BSD license that is bundled
  8 * with this package in the file LICENSE.txt.
  9 * It is also available through the world-wide-web at this URL:
 10 * http://framework.zend.com/license/new-bsd
 11 * If you did not receive a copy of the license and are unable to
 12 * obtain it through the world-wide-web, please send an email
 13 * to license@zend.com so we can send you a copy immediately.
 14 *
 15 * @category   Zend
 16 * @package    Zend_Search_Lucene
 17 * @subpackage Index
 18 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 19 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 20 */
 21
 22
 23/** Zend_Search_Lucene_Exception */
 24require_once 'Zend/Search/Lucene/Exception.php';
 25
 26
 /**
  * Dictionary loader
  *
  * A helper class whose only purpose is to hold the hand-optimized parser of
  * the term dictionary index. The "read VInt" / "read string" steps are inlined
  * manually instead of being method calls, because loading the dictionary
  * index is a major bottleneck for search performance.
  *
  *
  * @category   Zend
  * @package    Zend_Search_Lucene
  * @subpackage Index
  * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  */
 class Zend_Search_Lucene_Index_DictionaryLoader
 {
     /**
      * Dictionary index loader.
      *
      * It takes a string which is actually <segment_name>.tii index file data and
      * returns two parallel lists:
      *   - [0] term list, each entry is array($termFieldNum, $termValue);
      *   - [1] termInfo list, each entry is
      *         array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer).
      *
      * The first term entry has to be the special index mark (field number
      * 0xFFFFFFFF); its field number is reported as -1.
      *
      * See Zend_Search_Lucene_Index_SegmentInfo class for details
      *
      * @param string $data
      * @return array
      * @throws Zend_Search_Lucene_Exception if the header is malformed, the term count is
      *         less than one, the data ends prematurely or the special index mark is missing
      */
     public static function load($data)
     {
         $termDictionary = array();
         $termInfos      = array();
         $dataLength     = strlen($data);
         $pos = 0;

         // Fixed-size header: version (4) + term count (8) + index interval (4)
         // + skip interval (4) = 20 bytes. Reject shorter input up front instead
         // of reading past the end of the string.
         if ($dataLength < 20) {
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         }

         // $tiVersion = $tiiFile->readInt();
         $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
         $pos += 4;
         if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
             $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         }

         // $indexTermCount = $tiiFile->readLong();
         if (PHP_INT_SIZE > 4) {
             $indexTermCount = ord($data[$pos]) << 56  |
                               ord($data[$pos+1]) << 48  |
                               ord($data[$pos+2]) << 40  |
                               ord($data[$pos+3]) << 32  |
                               ord($data[$pos+4]) << 24  |
                               ord($data[$pos+5]) << 16  |
                               ord($data[$pos+6]) << 8   |
                               ord($data[$pos+7]);
         } else {
             // A 32-bit integer can't hold the upper half of the long value
             // (nor the sign bit of the lower half), so they have to be zero.
             if ((ord($data[$pos])            != 0) ||
                 (ord($data[$pos+1])          != 0) ||
                 (ord($data[$pos+2])          != 0) ||
                 (ord($data[$pos+3])          != 0) ||
                 ((ord($data[$pos+4]) & 0x80) != 0)) {
                 throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
             }

             $indexTermCount = ord($data[$pos+4]) << 24  |
                               ord($data[$pos+5]) << 16  |
                               ord($data[$pos+6]) << 8   |
                               ord($data[$pos+7]);
         }
         $pos += 8;

         //                  $tiiFile->readInt();  // IndexInterval
         $pos += 4;

         // $skipInterval   = $tiiFile->readInt();
         $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
         $pos += 4;
         if ($indexTermCount < 1) {
             throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
         }

         if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
             /* Skip MaxSkipLevels value */
             $pos += 4;
         }

         // Terms are prefix-compressed against the previous term and the
         // pointers are stored as deltas, so these values are carried between
         // iterations.
         $prevTerm     = '';
         $freqPointer  =  0;
         $proxPointer  =  0;
         $indexPointer =  0;
         for ($count = 0; $count < $indexTermCount; $count++) {
             // Each entry occupies at least one byte, so reaching (or having run
             // past) the end of the data here means the file is truncated.
             // Without this guard out-of-range reads silently evaluate to 0 and
             // the loop keeps producing bogus entries.
             if ($pos >= $dataLength) {
                 throw new Zend_Search_Lucene_Exception('Unexpected end of TermInfoIndexFile data');
             }

             //$termPrefixLength = $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $termPrefixLength = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $termPrefixLength |= ($nbyte & 0x7F) << $shift;
             }

             // $termSuffix       = $tiiFile->readString();
             $nbyte = ord($data[$pos++]);
             $len = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $len |= ($nbyte & 0x7F) << $shift;
             }
             if ($len == 0) {
                 $termSuffix = '';
             } else {
                 // $len is a number of characters. Start with one byte per
                 // character and pull in the extra bytes each time a multi-byte
                 // lead byte is met.
                 $termSuffix = substr($data, $pos, $len);
                 $pos += $len;
                 for ($count1 = 0; $count1 < $len; $count1++ ) {
                     if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                         $addBytes = 1;
                         if (ord($termSuffix[$count1]) & 0x20 ) {
                             $addBytes++;

                             // Never used for Java Lucene created index.
                             // Java2 doesn't encode strings in four bytes
                             if (ord($termSuffix[$count1]) & 0x10 ) {
                                 $addBytes++;
                             }
                         }
                         $termSuffix .= substr($data, $pos, $addBytes);
                         $pos += $addBytes;
                         $len += $addBytes;

                         // Check for null character. Java2 encodes null character
                         // in two bytes.
                         // NOTE(review): assigning integer 0 to a string offset stores
                         // the character '0' (0x30), not a NUL byte. Left unchanged
                         // because other readers of the same index may perform the
                         // identical conversion - confirm before changing.
                         if (ord($termSuffix[$count1]) == 0xC0 &&
                             ord($termSuffix[$count1+1]) == 0x80   ) {
                             $termSuffix[$count1] = 0;
                             $termSuffix = substr($termSuffix,0,$count1+1)
                                         . substr($termSuffix,$count1+2);
                         }
                         $count1 += $addBytes;
                     }
                 }
             }

             // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
             // $pb counts bytes, $pc counts characters of the shared prefix.
             $prevTermLength = strlen($prevTerm);
             $pb = 0; $pc = 0;
             while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                 $charBytes = 1;
                 if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                     $charBytes++;
                     if (ord($prevTerm[$pb]) & 0x20 ) {
                         $charBytes++;
                         if (ord($prevTerm[$pb]) & 0x10 ) {
                             $charBytes++;
                         }
                     }
                 }

                 // The character has to fit into the previous term (not into
                 // the whole data buffer), otherwise it is cut off.
                 if ($pb + $charBytes > $prevTermLength) {
                     // wrong character
                     break;
                 }

                 $pc++;
                 $pb += $charBytes;
             }
             $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

             // $termFieldNum     = $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $termFieldNum = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $termFieldNum |= ($nbyte & 0x7F) << $shift;
             }

             // $docFreq          = $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $docFreq = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $docFreq |= ($nbyte & 0x7F) << $shift;
             }

             // $freqPointer     += $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $vint = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $vint |= ($nbyte & 0x7F) << $shift;
             }
             $freqPointer += $vint;

             // $proxPointer     += $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $vint = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $vint |= ($nbyte & 0x7F) << $shift;
             }
             $proxPointer += $vint;

             // The skip delta is stored only for terms which are frequent enough.
             if( $docFreq >= $skipInterval ) {
                 // $skipDelta = $tiiFile->readVInt();
                 $nbyte = ord($data[$pos++]);
                 $vint = $nbyte & 0x7F;
                 for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                     $nbyte = ord($data[$pos++]);
                     $vint |= ($nbyte & 0x7F) << $shift;
                 }
                 $skipDelta = $vint;
             } else {
                 $skipDelta = 0;
             }

             // $indexPointer += $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $vint = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $vint |= ($nbyte & 0x7F) << $shift;
             }
             $indexPointer += $vint;


             // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
             $termDictionary[] = array($termFieldNum, $termValue);

             $termInfos[] =
                  // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                  array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

             $prevTerm = $termValue;
         }

         // The last entry is not followed by another loop pass, so check
         // separately that it didn't run past the end of the data.
         if ($pos > $dataLength) {
             throw new Zend_Search_Lucene_Exception('Unexpected end of TermInfoIndexFile data');
         }

         // Check special index entry mark
         if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         } else if (PHP_INT_SIZE > 4){
             // Treat 64-bit 0xFFFFFFFF as -1
             $termDictionary[0][0] = -1;
         }

         return array(&$termDictionary, &$termInfos);
     }
 }
266