PageRenderTime 49ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/library/Zend/Search/Lucene/Index/DictionaryLoader.php

https://bitbucket.org/hamidrezas/melobit
PHP | 268 lines | 171 code | 28 blank | 69 comment | 54 complexity | 374e2379ebe180c4484ef27f09fac010 MD5 | raw file
Possible License(s): AGPL-1.0
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: DictionaryLoader.php 24594 2012-01-05 21:27:01Z matthew $
  21. */
  22. /**
  23. * Dictionary loader
  24. *
  25. * It's a helper class created to encapsulate code that is deliberately not well structured.
  26. * Manual "method inlining" is performed to speed up the dictionary index loading operation,
  27. * which is a major bottleneck for search performance.
  28. *
  29. *
  30. * @category Zend
  31. * @package Zend_Search_Lucene
  32. * @subpackage Index
  33. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  34. * @license http://framework.zend.com/license/new-bsd New BSD License
  35. */
  36. class Zend_Search_Lucene_Index_DictionaryLoader
  37. {
  38. /**
  39. * Dictionary index loader.
  40. *
  41. * It takes a string which is actually <segment_name>.tii index file data and
  42. * returns two arrays - term and tremInfo lists.
  43. *
  44. * See Zend_Search_Lucene_Index_SegmintInfo class for details
  45. *
  46. * @param string $data
  47. * @return array
  48. * @throws Zend_Search_Lucene_Exception
  49. */
  50. public static function load($data)
  51. {
  52. $termDictionary = array();
  53. $termInfos = array();
  54. $pos = 0;
  55. // $tiVersion = $tiiFile->readInt();
  56. $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
  57. $pos += 4;
  58. if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
  59. $tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) {
  60. require_once 'Zend/Search/Lucene/Exception.php';
  61. throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
  62. }
  63. // $indexTermCount = $tiiFile->readLong();
  64. if (PHP_INT_SIZE > 4) {
  65. $indexTermCount = ord($data[$pos]) << 56 |
  66. ord($data[$pos+1]) << 48 |
  67. ord($data[$pos+2]) << 40 |
  68. ord($data[$pos+3]) << 32 |
  69. ord($data[$pos+4]) << 24 |
  70. ord($data[$pos+5]) << 16 |
  71. ord($data[$pos+6]) << 8 |
  72. ord($data[$pos+7]);
  73. } else {
  74. if ((ord($data[$pos]) != 0) ||
  75. (ord($data[$pos+1]) != 0) ||
  76. (ord($data[$pos+2]) != 0) ||
  77. (ord($data[$pos+3]) != 0) ||
  78. ((ord($data[$pos+4]) & 0x80) != 0)) {
  79. require_once 'Zend/Search/Lucene/Exception.php';
  80. throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
  81. }
  82. $indexTermCount = ord($data[$pos+4]) << 24 |
  83. ord($data[$pos+5]) << 16 |
  84. ord($data[$pos+6]) << 8 |
  85. ord($data[$pos+7]);
  86. }
  87. $pos += 8;
  88. // $tiiFile->readInt(); // IndexInterval
  89. $pos += 4;
  90. // $skipInterval = $tiiFile->readInt();
  91. $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
  92. $pos += 4;
  93. if ($indexTermCount < 1) {
  94. require_once 'Zend/Search/Lucene/Exception.php';
  95. throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
  96. }
  97. if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
  98. /* Skip MaxSkipLevels value */
  99. $pos += 4;
  100. }
  101. $prevTerm = '';
  102. $freqPointer = 0;
  103. $proxPointer = 0;
  104. $indexPointer = 0;
  105. for ($count = 0; $count < $indexTermCount; $count++) {
  106. //$termPrefixLength = $tiiFile->readVInt();
  107. $nbyte = ord($data[$pos++]);
  108. $termPrefixLength = $nbyte & 0x7F;
  109. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  110. $nbyte = ord($data[$pos++]);
  111. $termPrefixLength |= ($nbyte & 0x7F) << $shift;
  112. }
  113. // $termSuffix = $tiiFile->readString();
  114. $nbyte = ord($data[$pos++]);
  115. $len = $nbyte & 0x7F;
  116. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  117. $nbyte = ord($data[$pos++]);
  118. $len |= ($nbyte & 0x7F) << $shift;
  119. }
  120. if ($len == 0) {
  121. $termSuffix = '';
  122. } else {
  123. $termSuffix = substr($data, $pos, $len);
  124. $pos += $len;
  125. for ($count1 = 0; $count1 < $len; $count1++ ) {
  126. if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
  127. $addBytes = 1;
  128. if (ord($termSuffix[$count1]) & 0x20 ) {
  129. $addBytes++;
  130. // Never used for Java Lucene created index.
  131. // Java2 doesn't encode strings in four bytes
  132. if (ord($termSuffix[$count1]) & 0x10 ) {
  133. $addBytes++;
  134. }
  135. }
  136. $termSuffix .= substr($data, $pos, $addBytes);
  137. $pos += $addBytes;
  138. $len += $addBytes;
  139. // Check for null character. Java2 encodes null character
  140. // in two bytes.
  141. if (ord($termSuffix[$count1]) == 0xC0 &&
  142. ord($termSuffix[$count1+1]) == 0x80 ) {
  143. $termSuffix[$count1] = 0;
  144. $termSuffix = substr($termSuffix,0,$count1+1)
  145. . substr($termSuffix,$count1+2);
  146. }
  147. $count1 += $addBytes;
  148. }
  149. }
  150. }
  151. // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
  152. $pb = 0; $pc = 0;
  153. while ($pb < strlen($prevTerm) && $pc < $termPrefixLength) {
  154. $charBytes = 1;
  155. if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
  156. $charBytes++;
  157. if (ord($prevTerm[$pb]) & 0x20 ) {
  158. $charBytes++;
  159. if (ord($prevTerm[$pb]) & 0x10 ) {
  160. $charBytes++;
  161. }
  162. }
  163. }
  164. if ($pb + $charBytes > strlen($data)) {
  165. // wrong character
  166. break;
  167. }
  168. $pc++;
  169. $pb += $charBytes;
  170. }
  171. $termValue = substr($prevTerm, 0, $pb) . $termSuffix;
  172. // $termFieldNum = $tiiFile->readVInt();
  173. $nbyte = ord($data[$pos++]);
  174. $termFieldNum = $nbyte & 0x7F;
  175. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  176. $nbyte = ord($data[$pos++]);
  177. $termFieldNum |= ($nbyte & 0x7F) << $shift;
  178. }
  179. // $docFreq = $tiiFile->readVInt();
  180. $nbyte = ord($data[$pos++]);
  181. $docFreq = $nbyte & 0x7F;
  182. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  183. $nbyte = ord($data[$pos++]);
  184. $docFreq |= ($nbyte & 0x7F) << $shift;
  185. }
  186. // $freqPointer += $tiiFile->readVInt();
  187. $nbyte = ord($data[$pos++]);
  188. $vint = $nbyte & 0x7F;
  189. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  190. $nbyte = ord($data[$pos++]);
  191. $vint |= ($nbyte & 0x7F) << $shift;
  192. }
  193. $freqPointer += $vint;
  194. // $proxPointer += $tiiFile->readVInt();
  195. $nbyte = ord($data[$pos++]);
  196. $vint = $nbyte & 0x7F;
  197. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  198. $nbyte = ord($data[$pos++]);
  199. $vint |= ($nbyte & 0x7F) << $shift;
  200. }
  201. $proxPointer += $vint;
  202. if( $docFreq >= $skipInterval ) {
  203. // $skipDelta = $tiiFile->readVInt();
  204. $nbyte = ord($data[$pos++]);
  205. $vint = $nbyte & 0x7F;
  206. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  207. $nbyte = ord($data[$pos++]);
  208. $vint |= ($nbyte & 0x7F) << $shift;
  209. }
  210. $skipDelta = $vint;
  211. } else {
  212. $skipDelta = 0;
  213. }
  214. // $indexPointer += $tiiFile->readVInt();
  215. $nbyte = ord($data[$pos++]);
  216. $vint = $nbyte & 0x7F;
  217. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  218. $nbyte = ord($data[$pos++]);
  219. $vint |= ($nbyte & 0x7F) << $shift;
  220. }
  221. $indexPointer += $vint;
  222. // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
  223. $termDictionary[] = array($termFieldNum, $termValue);
  224. $termInfos[] =
  225. // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
  226. array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
  227. $prevTerm = $termValue;
  228. }
  229. // Check special index entry mark
  230. if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
  231. require_once 'Zend/Search/Lucene/Exception.php';
  232. throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
  233. }
  234. if (PHP_INT_SIZE > 4) {
  235. // Treat 64-bit 0xFFFFFFFF as -1
  236. $termDictionary[0][0] = -1;
  237. }
  238. return array($termDictionary, $termInfos);
  239. }
  240. }