PageRenderTime 35ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/library/Zend/Search/Lucene/Index/DictionaryLoader.php

https://bitbucket.org/aboozar/zf2
PHP | 251 lines | 170 code | 30 blank | 51 comment | 54 complexity | 0c71ee6a305844e5f8ed9bf74c2f2598 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. <?php
  2. /**
  3. * Zend Framework (http://framework.zend.com/)
  4. *
  5. * @link http://github.com/zendframework/zf2 for the canonical source repository
  6. * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
  7. * @license http://framework.zend.com/license/new-bsd New BSD License
  8. * @package Zend_Search
  9. */
  10. namespace Zend\Search\Lucene\Index;
  11. use Zend\Search\Lucene;
  12. use Zend\Search\Lucene\Exception\InvalidFileFormatException;
  13. /**
  14. * Dictionary loader
  15. *
  16. * It's a dummy class which is created to encapsulate non-good structured code.
  17. * Manual "method inlining" is performed to increase dictionary index loading operation
  18. * which is major bottelneck for search performance.
  19. *
  20. * @category Zend
  21. * @package Zend_Search_Lucene
  22. * @subpackage Index
  23. */
  24. class DictionaryLoader
  25. {
  26. /**
  27. * Dictionary index loader.
  28. *
  29. * It takes a string which is actually <segment_name>.tii index file data and
  30. * returns two arrays - term and tremInfo lists.
  31. *
  32. * See Zend_Search_Lucene_Index_SegmintInfo class for details
  33. *
  34. * @param string $data
  35. * @return array
  36. * @throws \Zend\Search\Lucene\Exception\InvalidFileFormatException
  37. */
  38. public static function load($data)
  39. {
  40. $termDictionary = array();
  41. $termInfos = array();
  42. $pos = 0;
  43. // $tiVersion = $tiiFile->readInt();
  44. $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
  45. $pos += 4;
  46. if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
  47. $tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) {
  48. throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
  49. }
  50. // $indexTermCount = $tiiFile->readLong();
  51. if (PHP_INT_SIZE > 4) {
  52. $indexTermCount = ord($data[$pos]) << 56 |
  53. ord($data[$pos+1]) << 48 |
  54. ord($data[$pos+2]) << 40 |
  55. ord($data[$pos+3]) << 32 |
  56. ord($data[$pos+4]) << 24 |
  57. ord($data[$pos+5]) << 16 |
  58. ord($data[$pos+6]) << 8 |
  59. ord($data[$pos+7]);
  60. } else {
  61. if ((ord($data[$pos]) != 0) ||
  62. (ord($data[$pos+1]) != 0) ||
  63. (ord($data[$pos+2]) != 0) ||
  64. (ord($data[$pos+3]) != 0) ||
  65. ((ord($data[$pos+4]) & 0x80) != 0)) {
  66. throw new InvalidFileFormatException('Largest supported segment size (for 32-bit mode) is 2Gb');
  67. }
  68. $indexTermCount = ord($data[$pos+4]) << 24 |
  69. ord($data[$pos+5]) << 16 |
  70. ord($data[$pos+6]) << 8 |
  71. ord($data[$pos+7]);
  72. }
  73. $pos += 8;
  74. // $tiiFile->readInt(); // IndexInterval
  75. $pos += 4;
  76. // $skipInterval = $tiiFile->readInt();
  77. $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
  78. $pos += 4;
  79. if ($indexTermCount < 1) {
  80. throw new InvalidFileFormatException('Wrong number of terms in a term dictionary index');
  81. }
  82. if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
  83. /* Skip MaxSkipLevels value */
  84. $pos += 4;
  85. }
  86. $prevTerm = '';
  87. $freqPointer = 0;
  88. $proxPointer = 0;
  89. $indexPointer = 0;
  90. for ($count = 0; $count < $indexTermCount; $count++) {
  91. //$termPrefixLength = $tiiFile->readVInt();
  92. $nbyte = ord($data[$pos++]);
  93. $termPrefixLength = $nbyte & 0x7F;
  94. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  95. $nbyte = ord($data[$pos++]);
  96. $termPrefixLength |= ($nbyte & 0x7F) << $shift;
  97. }
  98. // $termSuffix = $tiiFile->readString();
  99. $nbyte = ord($data[$pos++]);
  100. $len = $nbyte & 0x7F;
  101. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  102. $nbyte = ord($data[$pos++]);
  103. $len |= ($nbyte & 0x7F) << $shift;
  104. }
  105. if ($len == 0) {
  106. $termSuffix = '';
  107. } else {
  108. $termSuffix = substr($data, $pos, $len);
  109. $pos += $len;
  110. for ($count1 = 0; $count1 < $len; $count1++ ) {
  111. if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
  112. $addBytes = 1;
  113. if (ord($termSuffix[$count1]) & 0x20 ) {
  114. $addBytes++;
  115. // Never used for Java Lucene created index.
  116. // Java2 doesn't encode strings in four bytes
  117. if (ord($termSuffix[$count1]) & 0x10 ) {
  118. $addBytes++;
  119. }
  120. }
  121. $termSuffix .= substr($data, $pos, $addBytes);
  122. $pos += $addBytes;
  123. $len += $addBytes;
  124. // Check for null character. Java2 encodes null character
  125. // in two bytes.
  126. if (ord($termSuffix[$count1]) == 0xC0 &&
  127. ord($termSuffix[$count1+1]) == 0x80 ) {
  128. $termSuffix[$count1] = 0;
  129. $termSuffix = substr($termSuffix,0,$count1+1)
  130. . substr($termSuffix,$count1+2);
  131. }
  132. $count1 += $addBytes;
  133. }
  134. }
  135. }
  136. $pb = 0; $pc = 0;
  137. while ($pb < strlen($prevTerm) && $pc < $termPrefixLength) {
  138. $charBytes = 1;
  139. if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
  140. $charBytes++;
  141. if (ord($prevTerm[$pb]) & 0x20 ) {
  142. $charBytes++;
  143. if (ord($prevTerm[$pb]) & 0x10 ) {
  144. $charBytes++;
  145. }
  146. }
  147. }
  148. if ($pb + $charBytes > strlen($data)) {
  149. // wrong character
  150. break;
  151. }
  152. $pc++;
  153. $pb += $charBytes;
  154. }
  155. $termValue = substr($prevTerm, 0, $pb) . $termSuffix;
  156. // $termFieldNum = $tiiFile->readVInt();
  157. $nbyte = ord($data[$pos++]);
  158. $termFieldNum = $nbyte & 0x7F;
  159. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  160. $nbyte = ord($data[$pos++]);
  161. $termFieldNum |= ($nbyte & 0x7F) << $shift;
  162. }
  163. // $docFreq = $tiiFile->readVInt();
  164. $nbyte = ord($data[$pos++]);
  165. $docFreq = $nbyte & 0x7F;
  166. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  167. $nbyte = ord($data[$pos++]);
  168. $docFreq |= ($nbyte & 0x7F) << $shift;
  169. }
  170. // $freqPointer += $tiiFile->readVInt();
  171. $nbyte = ord($data[$pos++]);
  172. $vint = $nbyte & 0x7F;
  173. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  174. $nbyte = ord($data[$pos++]);
  175. $vint |= ($nbyte & 0x7F) << $shift;
  176. }
  177. $freqPointer += $vint;
  178. // $proxPointer += $tiiFile->readVInt();
  179. $nbyte = ord($data[$pos++]);
  180. $vint = $nbyte & 0x7F;
  181. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  182. $nbyte = ord($data[$pos++]);
  183. $vint |= ($nbyte & 0x7F) << $shift;
  184. }
  185. $proxPointer += $vint;
  186. if( $docFreq >= $skipInterval ) {
  187. // $skipDelta = $tiiFile->readVInt();
  188. $nbyte = ord($data[$pos++]);
  189. $vint = $nbyte & 0x7F;
  190. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  191. $nbyte = ord($data[$pos++]);
  192. $vint |= ($nbyte & 0x7F) << $shift;
  193. }
  194. $skipDelta = $vint;
  195. } else {
  196. $skipDelta = 0;
  197. }
  198. // $indexPointer += $tiiFile->readVInt();
  199. $nbyte = ord($data[$pos++]);
  200. $vint = $nbyte & 0x7F;
  201. for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
  202. $nbyte = ord($data[$pos++]);
  203. $vint |= ($nbyte & 0x7F) << $shift;
  204. }
  205. $indexPointer += $vint;
  206. $termDictionary[] = array($termFieldNum, $termValue);
  207. $termInfos[] =
  208. array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
  209. $prevTerm = $termValue;
  210. }
  211. // Check special index entry mark
  212. if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
  213. throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
  214. }
  215. if (PHP_INT_SIZE > 4) {
  216. // Treat 64-bit 0xFFFFFFFF as -1
  217. $termDictionary[0][0] = -1;
  218. }
  219. return array($termDictionary, $termInfos);
  220. }
  221. }