/library/Zend/Search/Lucene/Index/DictionaryLoader.php

https://github.com/Exercise/zf2 · PHP · 267 lines · 169 code · 29 blank · 69 comment · 54 complexity · 2cbcd9621b81935beb06672986e4541e MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Index
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /**
  23. * @namespace
  24. */
  25. namespace Zend\Search\Lucene\Index;
  26. use Zend\Search\Lucene;
  27. /**
  28. * Dictionary loader
  29. *
  30. * It's a dummy class created to encapsulate poorly structured code.
  31. * Manual "method inlining" is performed to speed up the dictionary index loading
  32. * operation, which is a major bottleneck for search performance.
  33. *
  34. * @uses \Zend\Search\Lucene\Exception
  35. * @category Zend
  36. * @package Zend_Search_Lucene
  37. * @subpackage Index
  38. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  39. * @license http://framework.zend.com/license/new-bsd New BSD License
  40. */
  class DictionaryLoader
  {
      /**
       * Dictionary index loader.
       *
       * Takes a string holding the raw contents of a <segment_name>.tii index
       * file and returns two parallel lists: the terms and their term infos.
       *
       * Layout consumed by this method (all integers are big-endian):
       *   - Int32  format version (0xFFFFFFFE pre-2.1, 0xFFFFFFFD 2.1+)
       *   - Int64  number of index terms
       *   - Int32  index interval (skipped)
       *   - Int32  skip interval
       *   - Int32  max skip levels (2.1+ format only, skipped)
       *   - one record per index term: VInt prefix length, string suffix,
       *     VInt field number, VInt docFreq, VInt freqPointer delta,
       *     VInt proxPointer delta, optional VInt skipDelta (present only when
       *     docFreq >= skip interval), VInt indexPointer delta.
       *
       * The VInt/string readers are inlined on purpose: this method is a hot
       * path and function-call overhead dominates otherwise.
       *
       * @param string $data Raw .tii file contents
       * @return array array($termDictionary, $termInfos) where each
       *               $termDictionary entry is array(fieldNum, termText) and each
       *               $termInfos entry is
       *               array(docFreq, freqPointer, proxPointer, skipDelta, indexPointer)
       * @throws \Zend\Search\Lucene\Exception If the data is too short, truncated,
       *               has an unknown format version, declares no terms, or does
       *               not start with the special index entry mark
       */
      public static function load($data)
      {
          $termDictionary = array();
          $termInfos      = array();
          $dataLength     = strlen($data);
          $pos            = 0;

          // The fixed part of the header is 4 + 8 + 4 + 4 bytes. Reject shorter
          // input up front instead of reading undefined string offsets.
          if ($dataLength < 20) {
              throw new Lucene\Exception('Wrong TermInfoIndexFile file format');
          }

          // $tiVersion = $tiiFile->readInt();
          $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
          $pos += 4;
          if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
              $tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) {
              throw new Lucene\Exception('Wrong TermInfoIndexFile file format');
          }

          // $indexTermCount = $tiiFile->readLong();
          if (PHP_INT_SIZE > 4) {
              $indexTermCount = ord($data[$pos])   << 56 |
                                ord($data[$pos+1]) << 48 |
                                ord($data[$pos+2]) << 40 |
                                ord($data[$pos+3]) << 32 |
                                ord($data[$pos+4]) << 24 |
                                ord($data[$pos+5]) << 16 |
                                ord($data[$pos+6]) << 8  |
                                ord($data[$pos+7]);
          } else {
              // On 32-bit builds only values that fit a signed 32-bit int are usable.
              if ((ord($data[$pos])   != 0) ||
                  (ord($data[$pos+1]) != 0) ||
                  (ord($data[$pos+2]) != 0) ||
                  (ord($data[$pos+3]) != 0) ||
                  ((ord($data[$pos+4]) & 0x80) != 0)) {
                  throw new Lucene\Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
              }

              $indexTermCount = ord($data[$pos+4]) << 24 |
                                ord($data[$pos+5]) << 16 |
                                ord($data[$pos+6]) << 8  |
                                ord($data[$pos+7]);
          }
          $pos += 8;

          // $tiiFile->readInt(); // IndexInterval
          $pos += 4;

          // $skipInterval = $tiiFile->readInt();
          $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
          $pos += 4;

          if ($indexTermCount < 1) {
              throw new Lucene\Exception('Wrong number of terms in a term dictionary index');
          }

          if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
              /* Skip MaxSkipLevels value */
              $pos += 4;
          }

          $prevTerm     = '';
          $freqPointer  = 0;
          $proxPointer  = 0;
          $indexPointer = 0;

          for ($count = 0; $count < $indexTermCount; $count++) {
              // $termPrefixLength = $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $termPrefixLength = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $termPrefixLength |= ($nbyte & 0x7F) << $shift;
              }

              // $termSuffix = $tiiFile->readString();
              $nbyte = ord($data[$pos++]);
              $len = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $len |= ($nbyte & 0x7F) << $shift;
              }
              if ($len == 0) {
                  $termSuffix = '';
              } else {
                  // The stored length counts characters, not bytes. Read one byte
                  // per character first, then pull in the continuation bytes of
                  // every multi-byte character found while scanning.
                  $termSuffix = substr($data, $pos, $len);
                  $pos += $len;
                  for ($count1 = 0; $count1 < $len; $count1++) {
                      $leadByte = ord($termSuffix[$count1]);
                      if (($leadByte & 0xC0) == 0xC0) {
                          $addBytes = 1;
                          if ($leadByte & 0x20) {
                              $addBytes++;

                              // Never used for Java Lucene created index.
                              // Java2 doesn't encode strings in four bytes
                              if ($leadByte & 0x10) {
                                  $addBytes++;
                              }
                          }
                          $termSuffix .= substr($data, $pos, $addBytes);
                          $pos += $addBytes;

                          // Check for null character. Java2 encodes null character
                          // in two bytes (0xC0 0x80).
                          if ($leadByte == 0xC0 &&
                              ord($termSuffix[$count1 + 1]) == 0x80) {
                              // chr(0) is required here: assigning the integer 0 to
                              // a string offset would store the character "0".
                              $termSuffix[$count1] = chr(0);
                              $termSuffix = substr($termSuffix, 0, $count1 + 1)
                                          . substr($termSuffix, $count1 + 2);

                              // The two bytes collapsed into one, so the byte
                              // length is unchanged and the next character now
                              // sits directly after this one.
                              continue;
                          }

                          $len    += $addBytes;
                          $count1 += $addBytes;
                      }
                  }
              }

              // Take the first $termPrefixLength characters of the previous term.
              // $pb tracks bytes, $pc tracks characters.
              $prevTermLength = strlen($prevTerm);
              $pb = 0;
              $pc = 0;
              while ($pb < $prevTermLength && $pc < $termPrefixLength) {
                  $charBytes = 1;
                  $leadByte  = ord($prevTerm[$pb]);
                  if (($leadByte & 0xC0) == 0xC0) {
                      $charBytes++;
                      if ($leadByte & 0x20) {
                          $charBytes++;
                          if ($leadByte & 0x10) {
                              $charBytes++;
                          }
                      }
                  }

                  if ($pb + $charBytes > $prevTermLength) {
                      // wrong character: it would run past the end of the previous term
                      break;
                  }

                  $pc++;
                  $pb += $charBytes;
              }
              $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

              // $termFieldNum = $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $termFieldNum = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $termFieldNum |= ($nbyte & 0x7F) << $shift;
              }

              // $docFreq = $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $docFreq = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $docFreq |= ($nbyte & 0x7F) << $shift;
              }

              // $freqPointer += $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $vint = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $vint |= ($nbyte & 0x7F) << $shift;
              }
              $freqPointer += $vint;

              // $proxPointer += $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $vint = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $vint |= ($nbyte & 0x7F) << $shift;
              }
              $proxPointer += $vint;

              if ($docFreq >= $skipInterval) {
                  // $skipDelta = $tiiFile->readVInt();
                  $nbyte = ord($data[$pos++]);
                  $vint = $nbyte & 0x7F;
                  for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                      $nbyte = ord($data[$pos++]);
                      $vint |= ($nbyte & 0x7F) << $shift;
                  }
                  $skipDelta = $vint;
              } else {
                  $skipDelta = 0;
              }

              // $indexPointer += $tiiFile->readVInt();
              $nbyte = ord($data[$pos++]);
              $vint = $nbyte & 0x7F;
              for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                  $nbyte = ord($data[$pos++]);
                  $vint |= ($nbyte & 0x7F) << $shift;
              }
              $indexPointer += $vint;

              // A record that ended beyond the buffer means the file is truncated
              // or the declared term count is bogus; stop instead of collecting
              // garbage entries.
              if ($pos > $dataLength) {
                  throw new Lucene\Exception('Wrong TermInfoIndexFile file format');
              }

              $termDictionary[] = array($termFieldNum, $termValue);
              $termInfos[] =
                  array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

              $prevTerm = $termValue;
          }

          // Check special index entry mark
          if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
              throw new Lucene\Exception('Wrong TermInfoIndexFile file format');
          }

          if (PHP_INT_SIZE > 4) {
              // Treat 64-bit 0xFFFFFFFF as -1
              $termDictionary[0][0] = -1;
          }

          return array($termDictionary, $termInfos);
      }
  }