PageRenderTime 46ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/src/java/main/ivory/core/data/document/LazyIntDocVector.java

https://github.com/earljwagner/Ivory
Java | 343 lines | 262 code | 41 blank | 40 comment | 38 complexity | e6395ffe7cf2dd2de83a51d4ad5e020b MD5 | raw file
  1. /*
  2. * Ivory: A Hadoop toolkit for web-scale information retrieval
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License"); you
  5. * may not use this file except in compliance with the License. You may
  6. * obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  13. * implied. See the License for the specific language governing
  14. * permissions and limitations under the License.
  15. */
  16. package ivory.core.data.document;
  17. import ivory.core.compression.BitInputStream;
  18. import ivory.core.compression.BitOutputStream;
  19. import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
  20. import ivory.core.data.index.TermPositions;
  21. import java.io.ByteArrayInputStream;
  22. import java.io.ByteArrayOutputStream;
  23. import java.io.DataInput;
  24. import java.io.DataOutput;
  25. import java.io.IOException;
  26. import java.util.Iterator;
  27. import java.util.Map;
  28. import java.util.SortedMap;
  29. import org.apache.hadoop.io.WritableUtils;
  30. /**
  31. * Implementation of {@link IntDocVector} that lazily decodes term and
  32. * positional information on demand.
  33. *
  34. * @author Tamer Elsayed
  35. * @author Jimmy Lin
  36. */
  37. public class LazyIntDocVector implements IntDocVector {
  38. private SortedMap<Integer, int[]> termPositionsMap = null;
  39. private byte[] bytes = null;
  40. private int numTerms;
  41. private transient ByteArrayOutputStream bytesOut = null;
  42. private transient BitOutputStream bitsOut = null;
  43. public LazyIntDocVector() {}
  /**
   * Creates a vector backed by the given term positions map.
   *
   * @param termPositionsMap map from term id to that term's positions array;
   *        the reference is stored as-is (no copy is made)
   */
  public LazyIntDocVector(SortedMap<Integer, int[]> termPositionsMap) {
    super();
    this.termPositionsMap = termPositionsMap;
  }
  47. public void setTermPositionsMap(SortedMap<Integer, int[]> termPositionsMap) {
  48. this.termPositionsMap = termPositionsMap;
  49. }
  50. @Override
  51. public void write(DataOutput out) throws IOException {
  52. if (bytes != null) {
  53. // This would happen if we're reading in an already-encoded
  54. // doc vector; if that's the case, simply write out the byte array
  55. writeRawBytes(out);
  56. } else if (termPositionsMap != null) {
  57. writeTermPositionsMap(out);
  58. } else {
  59. throw new RuntimeException("Unable to write LazyIntDocVector!");
  60. }
  61. }
  62. private void writeRawBytes(DataOutput out) {
  63. try {
  64. WritableUtils.writeVInt(out, bytes.length);
  65. out.write(bytes);
  66. } catch (IOException e) {
  67. throw new RuntimeException("Error writing LazyIntDocVector raw bytes");
  68. }
  69. }
  70. private void writeTermPositionsMap(DataOutput out) {
  71. try {
  72. numTerms = termPositionsMap.size();
  73. // Write # of terms.
  74. WritableUtils.writeVInt(out, numTerms);
  75. if (numTerms == 0)
  76. return;
  77. bytesOut = new ByteArrayOutputStream();
  78. bitsOut = new BitOutputStream(bytesOut);
  79. Iterator<Map.Entry<Integer, int[]>> it = termPositionsMap.entrySet().iterator();
  80. Map.Entry<Integer, int[]> posting = it.next();
  81. int[] positions = posting.getValue();
  82. TermPositions tp = new TermPositions();
  83. // Write out the first termid.
  84. int lastTerm = posting.getKey().intValue();
  85. bitsOut.writeBinary(32, lastTerm);
  86. // Write out the tf value.
  87. bitsOut.writeGamma((short) positions.length);
  88. tp.set(positions, (short) positions.length);
  89. // Write out the positions.
  90. writePositions(bitsOut, tp);
  91. int curTerm;
  92. while (it.hasNext()) {
  93. posting = it.next();
  94. curTerm = posting.getKey().intValue();
  95. positions = posting.getValue();
  96. int tgap = curTerm - lastTerm;
  97. if (tgap <= 0) {
  98. throw new RuntimeException("Error: encountered invalid t-gap. termid=" + curTerm);
  99. }
  100. // Write out the gap.
  101. bitsOut.writeGamma(tgap);
  102. tp.set(positions, (short) positions.length);
  103. // Write out the tf value.
  104. bitsOut.writeGamma((short) positions.length);
  105. // Write out the positions.
  106. writePositions(bitsOut, tp);
  107. lastTerm = curTerm;
  108. }
  109. bitsOut.padAndFlush();
  110. bitsOut.close();
  111. byte[] bytes = bytesOut.toByteArray();
  112. WritableUtils.writeVInt(out, bytes.length);
  113. out.write(bytes);
  114. } catch (IOException e) {
  115. throw new RuntimeException("Error writing LazyIntDocVector term positions map", e);
  116. } catch (ArithmeticException e) {
  117. throw new RuntimeException(e);
  118. }
  119. }
  120. @Override
  121. public void readFields(DataInput in) throws IOException {
  122. numTerms = WritableUtils.readVInt(in);
  123. if (numTerms == 0) {
  124. bytes = null;
  125. return;
  126. }
  127. bytes = new byte[WritableUtils.readVInt(in)];
  128. in.readFully(bytes);
  129. }
  130. // Passing in docno and tf basically for error checking purposes.
  131. protected static void writePositions(BitOutputStream t, TermPositions p) throws IOException {
  132. int[] pos = p.getPositions();
  133. if (p.getTf() == 1) {
  134. // If tf=1, just write out the single term position.
  135. t.writeGamma(pos[0]);
  136. } else {
  137. // If tf > 1, write out skip information if we want to bypass the
  138. // positional information during decoding.
  139. t.writeGamma(p.getEncodedSize());
  140. // Keep track of where we are in the stream.
  141. int skip1 = (int) t.getByteOffset() * 8 + t.getBitOffset();
  142. // Write out first position.
  143. t.writeGamma(pos[0]);
  144. // Write out rest of positions using p-gaps (first order positional differences).
  145. for (int c = 1; c < p.getTf(); c++) {
  146. int pgap = pos[c] - pos[c - 1];
  147. if (pos[c] <= 0 || pgap == 0) {
  148. throw new RuntimeException("Error: invalid term positions. positions=" + p.toString());
  149. }
  150. t.writeGamma(pgap);
  151. }
  152. // Find out where we are in the stream now.
  153. int skip2 = (int) t.getByteOffset() * 8 + t.getBitOffset();
  154. // Verify that the skip information is indeed valid.
  155. if (skip1 + p.getEncodedSize() != skip2) {
  156. throw new RuntimeException("Ivalid skip information: skip_pos1=" + skip1
  157. + ", skip_pos2=" + skip2 + ", size=" + p.getEncodedSize());
  158. }
  159. }
  160. }
  161. @Override
  162. public String toString() {
  163. StringBuffer s = new StringBuffer("[");
  164. try {
  165. Reader r = this.getReader();
  166. while (r.hasMoreTerms()) {
  167. int id = r.nextTerm();
  168. TermPositions pos = new TermPositions();
  169. r.getPositions(pos);
  170. s.append("(" + id + ", " + pos.getTf() + ", " + pos + ")");
  171. }
  172. s.append("]");
  173. } catch (Exception e) {
  174. e.printStackTrace();
  175. }
  176. return s.toString();
  177. }
  178. public String toStringWithTerms(DefaultFrequencySortedDictionary map) {
  179. StringBuffer s = new StringBuffer("");
  180. try {
  181. Reader r = this.getReader();
  182. while (r.hasMoreTerms()) {
  183. int id = r.nextTerm();
  184. TermPositions pos = new TermPositions();
  185. r.getPositions(pos);
  186. s.append(String.format("(%d, %d, %s)", map.getTerm(id), pos.getTf(), pos));
  187. }
  188. s.append("]");
  189. } catch (Exception e) {
  190. e.printStackTrace();
  191. }
  192. return s.toString();
  193. }
  194. @Override
  195. public Reader getReader() throws IOException {
  196. return new Reader(bytes, numTerms);
  197. }
  198. public static class Reader implements IntDocVector.Reader {
  199. private ByteArrayInputStream bytesIn;
  200. private BitInputStream bitsIn;
  201. private int p = -1;
  202. private int prevTermID = -1;
  203. private short prevTf = -1;
  204. private int termCnt;
  205. private boolean needToReadPositions = false;
  206. public Reader(byte[] bytes, int n) throws IOException {
  207. this.termCnt = n;
  208. if (termCnt > 0) {
  209. bytesIn = new ByteArrayInputStream(bytes);
  210. bitsIn = new BitInputStream(bytesIn);
  211. }
  212. }
  213. @Override
  214. public int getNumberOfTerms() {
  215. return termCnt;
  216. }
  217. @Override
  218. public short getTf() {
  219. return prevTf;
  220. }
  221. @Override
  222. public void reset() {
  223. try {
  224. bytesIn.reset();
  225. bitsIn = new BitInputStream(bytesIn);
  226. p = -1;
  227. prevTf = -1;
  228. needToReadPositions = false;
  229. } catch (IOException e) {
  230. throw new RuntimeException(e);
  231. }
  232. }
  233. @Override
  234. public int nextTerm() {
  235. int id = -1;
  236. try {
  237. p++;
  238. if (needToReadPositions) {
  239. skipPositions(prevTf);
  240. }
  241. needToReadPositions = true;
  242. if (p == 0) {
  243. prevTermID = bitsIn.readBinary(32);
  244. prevTf = (short) bitsIn.readGamma();
  245. return prevTermID;
  246. } else {
  247. if (p > termCnt - 1) {
  248. return -1;
  249. }
  250. id = bitsIn.readGamma() + prevTermID;
  251. prevTermID = id;
  252. prevTf = (short) bitsIn.readGamma();
  253. return id;
  254. }
  255. } catch (IOException e) {
  256. e.printStackTrace();
  257. throw new RuntimeException();
  258. }
  259. }
  260. @Override
  261. public int[] getPositions() {
  262. int[] pos = null;
  263. try {
  264. if (prevTf == 1) {
  265. pos = new int[1];
  266. pos[0] = bitsIn.readGamma();
  267. } else {
  268. bitsIn.readGamma();
  269. pos = new int[prevTf];
  270. pos[0] = bitsIn.readGamma();
  271. for (int i = 1; i < prevTf; i++) {
  272. pos[i] = (pos[i - 1] + bitsIn.readGamma());
  273. }
  274. }
  275. } catch (IOException e) {
  276. throw new RuntimeException("Error reading bits:", e);
  277. }
  278. needToReadPositions = false;
  279. return pos;
  280. }
  281. @Override
  282. public boolean getPositions(TermPositions tp) {
  283. int[] pos = getPositions();
  284. if (pos == null) {
  285. return false;
  286. }
  287. tp.set(pos, (short) pos.length);
  288. return true;
  289. }
  290. @Override
  291. public boolean hasMoreTerms() {
  292. return !(p >= termCnt - 1);
  293. }
  294. private void skipPositions(int tf) throws IOException {
  295. if (tf == 1) {
  296. bitsIn.readGamma();
  297. } else {
  298. bitsIn.skipBits(bitsIn.readGamma());
  299. }
  300. }
  301. }
  302. }