PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/Zend/Search/Lucene/Index/DictionaryLoader.php

https://bitbucket.org/mercysam/zfs
PHP | 265 lines | 170 code | 27 blank | 68 comment | 55 complexity | 3bb9bb2f2cb371dffc65c0ef65ad84a1 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. */
  /**
   * Dictionary loader
   *
   * A helper class that encapsulates deliberately unstructured code.
   * The VInt/string readers are manually inlined to speed up dictionary index
   * loading, which is a major bottleneck for search performance.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Index
   * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  class Zend_Search_Lucene_Index_DictionaryLoader
  {
      /**
       * Dictionary index loader.
       *
       * Takes the raw contents of a <segment_name>.tii index file and returns
       * a two-element array:
       *   [0] term dictionary - list of array(fieldNum, termValue)
       *   [1] term infos      - list of array(docFreq, freqPointer, proxPointer,
       *                                       skipDelta, indexPointer)
       * The first dictionary entry is the special index mark; its field number
       * is normalised to -1.
       *
       * Term text is kept in the byte encoding used by the index file, except
       * that the two-byte Java encoding of the null character (0xC0 0x80) is
       * decoded to a single NUL byte.
       *
       * See Zend_Search_Lucene_Index_SegmentInfo class for details
       *
       * @param string $data
       * @return array
       * @throws Zend_Search_Lucene_Exception if the data is truncated or is not
       *                                      a supported term info index
       */
      public static function load($data)
      {
          $termDictionary = array();
          $termInfos      = array();

          $dataLen = strlen($data);

          // Fixed part of the header:
          // version (4) + term count (8) + index interval (4) + skip interval (4)
          if ($dataLen < 20) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
          }

          $pos = 0;

          // $tiVersion = $tiiFile->readInt();
          $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
          $pos += 4;
          if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
              $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
          }

          // $indexTermCount = $tiiFile->readLong();
          if (PHP_INT_SIZE > 4) {
              $indexTermCount = ord($data[$pos])   << 56 |
                                ord($data[$pos+1]) << 48 |
                                ord($data[$pos+2]) << 40 |
                                ord($data[$pos+3]) << 32 |
                                ord($data[$pos+4]) << 24 |
                                ord($data[$pos+5]) << 16 |
                                ord($data[$pos+6]) << 8  |
                                ord($data[$pos+7]);
          } else {
              // A 32-bit integer cannot hold the upper half of the long value
              if ((ord($data[$pos])   != 0) ||
                  (ord($data[$pos+1]) != 0) ||
                  (ord($data[$pos+2]) != 0) ||
                  (ord($data[$pos+3]) != 0) ||
                  ((ord($data[$pos+4]) & 0x80) != 0)) {
                  require_once 'Zend/Search/Lucene/Exception.php';
                  throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
              }

              $indexTermCount = ord($data[$pos+4]) << 24 |
                                ord($data[$pos+5]) << 16 |
                                ord($data[$pos+6]) << 8  |
                                ord($data[$pos+7]);
          }
          $pos += 8;

          // $tiiFile->readInt(); // IndexInterval
          $pos += 4;

          // $skipInterval = $tiiFile->readInt();
          $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
          $pos += 4;

          if ($indexTermCount < 1) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
          }

          if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
              /* Skip MaxSkipLevels value */
              $pos += 4;
          }

          if ($pos > $dataLen) {
              // The 2.1+ header is four bytes longer than the minimum checked above
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
          }

          // Zero padding lets the inlined VInt readers below run a few bytes past
          // the real end of the data without touching an undefined string offset:
          // a zero byte terminates a VInt immediately, and the "$pos > $dataLen"
          // check at the end of every entry turns the overrun into an exception.
          // This keeps the hot loop free of per-byte bounds checks.
          $data .= str_repeat("\0", 16);

          $prevTerm     = '';
          $freqPointer  = 0;
          $proxPointer  = 0;
          $indexPointer = 0;
          for ($count = 0; $count < $indexTermCount; $count++) {
              //$termPrefixLength = $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $termPrefixLength = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $termPrefixLength |= ($nbyte & 0x7F) << $shift;
              }

              // $termSuffix = $tiiFile->readString();
              // The stored length is a number of characters, not bytes.
              $nbyte = ord($data[$pos++]);
              $len = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $len |= ($nbyte & 0x7F) << $shift;
              }

              // Walk $len characters directly in $data, sizing each one by its
              // lead byte, and copy them out in chunks split at null characters.
              $termSuffix = '';
              $chunkStart = $pos;
              for ($chars = 0; $chars < $len; $chars++) {
                  if ($pos >= $dataLen) {
                      require_once 'Zend/Search/Lucene/Exception.php';
                      throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
                  }

                  $lead = ord($data[$pos]);
                  if (($lead & 0xC0) != 0xC0) {
                      // Single-byte character
                      $pos++;
                      continue;
                  }

                  if ($lead == 0xC0 && ord($data[$pos+1]) == 0x80) {
                      // Java2 encodes the null character in two bytes.
                      // Decode it to a real NUL byte.
                      $termSuffix .= substr($data, $chunkStart, $pos - $chunkStart) . "\0";
                      $pos += 2;
                      $chunkStart = $pos;
                      continue;
                  }

                  $pos += 2;
                  if ($lead & 0x20) {
                      $pos++;
                      // Never used for Java Lucene created index.
                      // Java2 doesn't encode strings in four bytes
                      if ($lead & 0x10) {
                          $pos++;
                      }
                  }
              }
              $termSuffix .= substr($data, $chunkStart, $pos - $chunkStart);

              // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
              $prevTermLen = strlen($prevTerm);
              $pb = 0; $pc = 0;
              while ($pb < $prevTermLen  &&  $pc < $termPrefixLength) {
                  $charBytes = 1;
                  $lead = ord($prevTerm[$pb]);
                  if (($lead & 0xC0) == 0xC0) {
                      $charBytes++;
                      if ($lead & 0x20) {
                          $charBytes++;
                          if ($lead & 0x10) {
                              $charBytes++;
                          }
                      }
                  }

                  if ($pb + $charBytes > $prevTermLen) {
                      // wrong character (it would run past the end of the previous term)
                      break;
                  }

                  $pc++;
                  $pb += $charBytes;
              }
              $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

              // $termFieldNum = $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $termFieldNum = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $termFieldNum |= ($nbyte & 0x7F) << $shift;
              }

              // $docFreq = $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $docFreq = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $docFreq |= ($nbyte & 0x7F) << $shift;
              }

              // $freqPointer += $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $vint = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $vint |= ($nbyte & 0x7F) << $shift;
              }
              $freqPointer += $vint;

              // $proxPointer += $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $vint = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $vint |= ($nbyte & 0x7F) << $shift;
              }
              $proxPointer += $vint;

              // The skip delta is stored only for terms frequent enough to have skip data
              if ($docFreq >= $skipInterval) {
                  // $skipDelta = $tiiFile->readVInt();
                  $nbyte = ord($data[$pos++]);
                  $vint = $nbyte & 0x7F;
                  for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                      $nbyte = ord($data[$pos++]);
                      $vint |= ($nbyte & 0x7F) << $shift;
                  }
                  $skipDelta = $vint;
              } else {
                  $skipDelta = 0;
              }

              // $indexPointer += $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $vint = $nbyte & 0x7F;
              for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $vint |= ($nbyte & 0x7F) << $shift;
              }
              $indexPointer += $vint;

              if ($pos > $dataLen) {
                  // Part of this entry was read from the padding: the data is truncated
                  require_once 'Zend/Search/Lucene/Exception.php';
                  throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
              }

              // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
              $termDictionary[] = array($termFieldNum, $termValue);

              // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
              $termInfos[] = array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

              $prevTerm = $termValue;
          }

          // Check special index entry mark
          if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
              require_once 'Zend/Search/Lucene/Exception.php';
              throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
          } else if (PHP_INT_SIZE > 4) {
              // Treat 64-bit 0xFFFFFFFF as -1
              $termDictionary[0][0] = -1;
          }

          return array($termDictionary, $termInfos);
      }
  }