PageRenderTime 104ms CodeModel.GetById 50ms app.highlight 13ms RepoModel.GetById 37ms app.codeStats 0ms

/Search/Lucene/Index/DictionaryLoader.php

https://bitbucket.org/gkawka/zend-framework
PHP | 268 lines | 171 code | 28 blank | 69 comment | 54 complexity | 31696d8504fb6a577538ef327009cc8b MD5 | raw file
  1<?php
  2/**
  3 * Zend Framework
  4 *
  5 * LICENSE
  6 *
  7 * This source file is subject to the new BSD license that is bundled
  8 * with this package in the file LICENSE.txt.
  9 * It is also available through the world-wide-web at this URL:
 10 * http://framework.zend.com/license/new-bsd
 11 * If you did not receive a copy of the license and are unable to
 12 * obtain it through the world-wide-web, please send an email
 13 * to license@zend.com so we can send you a copy immediately.
 14 *
 15 * @category   Zend
 16 * @package    Zend_Search_Lucene
 17 * @subpackage Index
 18 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 19 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 20 * @version    $Id: DictionaryLoader.php 24593 2012-01-05 20:35:02Z matthew $
 21 */
 22
 23/**
 24 * Dictionary loader
 25 *
 26 * It's a dummy class created to encapsulate poorly structured code.
 27 * Manual "method inlining" is performed to speed up the dictionary index loading operation,
 28 * which is a major bottleneck for search performance.
 29 *
 30 *
 31 * @category   Zend
 32 * @package    Zend_Search_Lucene
 33 * @subpackage Index
 34 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 35 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 36 */
 class Zend_Search_Lucene_Index_DictionaryLoader
 {
     /**
      * Dictionary index loader.
      *
      * Takes a string holding the raw contents of a <segment_name>.tii index file
      * and returns two parallel lists - terms and term infos.
      *
      * Every read (Int, Long, VInt, String) is inlined by hand instead of going
      * through a file object; the commented-out $tiiFile calls show which read
      * each inlined fragment stands for.
      *
      * See Zend_Search_Lucene_Index_SegmentInfo class for details
      *
      * @param string $data  raw .tii file contents
      * @return array  array($termDictionary, $termInfos) where each $termDictionary
      *                item is array($termFieldNum, $termValue) and each $termInfos item
      *                is array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
      * @throws Zend_Search_Lucene_Exception  if the header is invalid, the data is
      *                truncated, or the first entry is not the special index mark
      */
     public static function load($data)
     {
         $termDictionary = array();
         $termInfos      = array();
         $dataLength     = strlen($data);
         $pos = 0;

         // Fixed-size header: version (4) + term count (8) + index interval (4) + skip interval (4).
         // Checked up front so that the raw offset reads below never run past the string.
         if ($dataLength < 20) {
             require_once 'Zend/Search/Lucene/Exception.php';
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         }

         // $tiVersion = $tiiFile->readInt();
         $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
         $pos += 4;
         if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
             $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
             require_once 'Zend/Search/Lucene/Exception.php';
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         }

         // $indexTermCount = $tiiFile->readLong();
         if (PHP_INT_SIZE > 4) {
             $indexTermCount = ord($data[$pos]) << 56  |
                               ord($data[$pos+1]) << 48  |
                               ord($data[$pos+2]) << 40  |
                               ord($data[$pos+3]) << 32  |
                               ord($data[$pos+4]) << 24  |
                               ord($data[$pos+5]) << 16  |
                               ord($data[$pos+6]) << 8   |
                               ord($data[$pos+7]);
         } else {
             // 32-bit integers: the upper four bytes and the sign bit of the lower
             // four must all be zero, otherwise the value cannot be represented.
             if ((ord($data[$pos])            != 0) ||
                 (ord($data[$pos+1])          != 0) ||
                 (ord($data[$pos+2])          != 0) ||
                 (ord($data[$pos+3])          != 0) ||
                 ((ord($data[$pos+4]) & 0x80) != 0)) {
                 require_once 'Zend/Search/Lucene/Exception.php';
                 throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
             }

             $indexTermCount = ord($data[$pos+4]) << 24  |
                               ord($data[$pos+5]) << 16  |
                               ord($data[$pos+6]) << 8   |
                               ord($data[$pos+7]);
         }
         $pos += 8;

         //                  $tiiFile->readInt();  // IndexInterval
         $pos += 4;

         // $skipInterval   = $tiiFile->readInt();
         $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
         $pos += 4;
         if ($indexTermCount < 1) {
             require_once 'Zend/Search/Lucene/Exception.php';
             throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
         }

         if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
             /* Skip MaxSkipLevels value */
             $pos += 4;
         }

         // Pointers are stored as deltas against the previous entry, so they accumulate.
         $prevTerm     = '';
         $freqPointer  =  0;
         $proxPointer  =  0;
         $indexPointer =  0;
         for ($count = 0; $count < $indexTermCount; $count++) {
             // Every entry needs at least one byte. This also catches an overrun of
             // the previous entry, so a corrupt term count cannot spin through
             // millions of empty iterations.
             if ($pos >= $dataLength) {
                 require_once 'Zend/Search/Lucene/Exception.php';
                 throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
             }

             //$termPrefixLength = $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $termPrefixLength = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $termPrefixLength |= ($nbyte & 0x7F) << $shift;
             }

             // $termSuffix       = $tiiFile->readString();
             $nbyte = ord($data[$pos++]);
             $len = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $len |= ($nbyte & 0x7F) << $shift;
             }

             // $len counts characters and each character takes at least one byte,
             // so it can never legitimately exceed the bytes that are left.
             if ($len > $dataLength - $pos) {
                 require_once 'Zend/Search/Lucene/Exception.php';
                 throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
             }

             if ($len == 0) {
                 $termSuffix = '';
             } else {
                 // Read one byte per character first, then pull in the continuation
                 // bytes of every multi-byte character met while scanning.
                 $termSuffix = substr($data, $pos, $len);
                 $pos += $len;
                 for ($count1 = 0; $count1 < $len; $count1++ ) {
                     if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                         $addBytes = 1;
                         if (ord($termSuffix[$count1]) & 0x20 ) {
                             $addBytes++;

                             // Never used for Java Lucene created index.
                             // Java2 doesn't encode strings in four bytes
                             if (ord($termSuffix[$count1]) & 0x10 ) {
                                 $addBytes++;
                             }
                         }
                         $termSuffix .= substr($data, $pos, $addBytes);
                         $pos += $addBytes;
                         $len += $addBytes;

                         // Check for null character. Java2 encodes null character
                         // in two bytes.
                         if (ord($termSuffix[$count1]) == 0xC0 &&
                             ord($termSuffix[$count1+1]) == 0x80   ) {
                             // Collapse the pair into a real NUL byte. Assigning the
                             // integer 0 to a string offset would store the character '0'.
                             $termSuffix = substr($termSuffix, 0, $count1)
                                         . "\0"
                                         . substr($termSuffix, $count1 + 2);
                             // The string is one byte shorter now and the NUL is a
                             // single-byte character, so neither skip ahead nor keep
                             // the extra byte in the scan length.
                             $len--;
                             continue;
                         }
                         $count1 += $addBytes;
                     }
                 }
             }

             // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
             // $pb counts prefix bytes, $pc counts prefix characters.
             $prevTermLength = strlen($prevTerm);
             $pb = 0; $pc = 0;
             while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                 $charBytes = 1;
                 if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                     $charBytes++;
                     if (ord($prevTerm[$pb]) & 0x20 ) {
                         $charBytes++;
                         if (ord($prevTerm[$pb]) & 0x10 ) {
                             $charBytes++;
                         }
                     }
                 }

                 // The character must fit inside the previous term (not the whole
                 // data buffer); otherwise it is a truncated sequence.
                 if ($pb + $charBytes > $prevTermLength) {
                     // wrong character
                     break;
                 }

                 $pc++;
                 $pb += $charBytes;
             }
             $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

             // $termFieldNum     = $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $termFieldNum = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $termFieldNum |= ($nbyte & 0x7F) << $shift;
             }

             // $docFreq          = $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $docFreq = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $docFreq |= ($nbyte & 0x7F) << $shift;
             }

             // $freqPointer     += $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $vint = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $vint |= ($nbyte & 0x7F) << $shift;
             }
             $freqPointer += $vint;

             // $proxPointer     += $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $vint = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $vint |= ($nbyte & 0x7F) << $shift;
             }
             $proxPointer += $vint;

             // The skip delta is only present when docFreq reaches the skip interval.
             if( $docFreq >= $skipInterval ) {
                 // $skipDelta = $tiiFile->readVInt();
                 $nbyte = ord($data[$pos++]);
                 $vint = $nbyte & 0x7F;
                 for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                     $nbyte = ord($data[$pos++]);
                     $vint |= ($nbyte & 0x7F) << $shift;
                 }
                 $skipDelta = $vint;
             } else {
                 $skipDelta = 0;
             }

             // $indexPointer += $tiiFile->readVInt();
             $nbyte = ord($data[$pos++]);
             $vint = $nbyte & 0x7F;
             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                 $nbyte = ord($data[$pos++]);
                 $vint |= ($nbyte & 0x7F) << $shift;
             }
             $indexPointer += $vint;


             // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
             $termDictionary[] = array($termFieldNum, $termValue);

             $termInfos[] =
                  // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                  array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

             $prevTerm = $termValue;
         }

         // The last entry must not have been read past the end of the data.
         if ($pos > $dataLength) {
             require_once 'Zend/Search/Lucene/Exception.php';
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         }

         // Check special index entry mark
         if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
             require_once 'Zend/Search/Lucene/Exception.php';
             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
         }

         if (PHP_INT_SIZE > 4) {
             // Treat 64-bit 0xFFFFFFFF as -1
             $termDictionary[0][0] = -1;
         }

         return array($termDictionary, $termInfos);
     }
 }
268