/src/java/main/ivory/core/data/document/LazyIntDocVector.java
Java | 343 lines | 262 code | 41 blank | 40 comment | 38 complexity | e6395ffe7cf2dd2de83a51d4ad5e020b MD5 | raw file
- /*
- * Ivory: A Hadoop toolkit for web-scale information retrieval
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you
- * may not use this file except in compliance with the License. You may
- * obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * permissions and limitations under the License.
- */
-
- package ivory.core.data.document;
-
- import ivory.core.compression.BitInputStream;
- import ivory.core.compression.BitOutputStream;
- import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
- import ivory.core.data.index.TermPositions;
-
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- import java.util.Iterator;
- import java.util.Map;
- import java.util.SortedMap;
-
- import org.apache.hadoop.io.WritableUtils;
-
- /**
- * Implementation of {@link IntDocVector} that lazily decodes term and
- * positional information on demand.
- *
- * @author Tamer Elsayed
- * @author Jimmy Lin
- */
- public class LazyIntDocVector implements IntDocVector {
- private SortedMap<Integer, int[]> termPositionsMap = null;
- private byte[] bytes = null;
- private int numTerms;
-
- private transient ByteArrayOutputStream bytesOut = null;
- private transient BitOutputStream bitsOut = null;
-
- public LazyIntDocVector() {}
-
- public LazyIntDocVector(SortedMap<Integer, int[]> termPositionsMap) {
- this.termPositionsMap = termPositionsMap;
- }
-
- public void setTermPositionsMap(SortedMap<Integer, int[]> termPositionsMap) {
- this.termPositionsMap = termPositionsMap;
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- if (bytes != null) {
- // This would happen if we're reading in an already-encoded
- // doc vector; if that's the case, simply write out the byte array
- writeRawBytes(out);
- } else if (termPositionsMap != null) {
- writeTermPositionsMap(out);
- } else {
- throw new RuntimeException("Unable to write LazyIntDocVector!");
- }
- }
-
- private void writeRawBytes(DataOutput out) {
- try {
- WritableUtils.writeVInt(out, bytes.length);
- out.write(bytes);
- } catch (IOException e) {
- throw new RuntimeException("Error writing LazyIntDocVector raw bytes");
- }
- }
-
- private void writeTermPositionsMap(DataOutput out) {
- try {
- numTerms = termPositionsMap.size();
-
- // Write # of terms.
- WritableUtils.writeVInt(out, numTerms);
- if (numTerms == 0)
- return;
-
- bytesOut = new ByteArrayOutputStream();
- bitsOut = new BitOutputStream(bytesOut);
-
- Iterator<Map.Entry<Integer, int[]>> it = termPositionsMap.entrySet().iterator();
- Map.Entry<Integer, int[]> posting = it.next();
- int[] positions = posting.getValue();
- TermPositions tp = new TermPositions();
- // Write out the first termid.
- int lastTerm = posting.getKey().intValue();
- bitsOut.writeBinary(32, lastTerm);
- // Write out the tf value.
- bitsOut.writeGamma((short) positions.length);
- tp.set(positions, (short) positions.length);
- // Write out the positions.
- writePositions(bitsOut, tp);
-
- int curTerm;
- while (it.hasNext()) {
- posting = it.next();
- curTerm = posting.getKey().intValue();
- positions = posting.getValue();
- int tgap = curTerm - lastTerm;
- if (tgap <= 0) {
- throw new RuntimeException("Error: encountered invalid t-gap. termid=" + curTerm);
- }
- // Write out the gap.
- bitsOut.writeGamma(tgap);
- tp.set(positions, (short) positions.length);
- // Write out the tf value.
- bitsOut.writeGamma((short) positions.length);
- // Write out the positions.
- writePositions(bitsOut, tp);
- lastTerm = curTerm;
- }
-
- bitsOut.padAndFlush();
- bitsOut.close();
- byte[] bytes = bytesOut.toByteArray();
- WritableUtils.writeVInt(out, bytes.length);
- out.write(bytes);
- } catch (IOException e) {
- throw new RuntimeException("Error writing LazyIntDocVector term positions map", e);
- } catch (ArithmeticException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- numTerms = WritableUtils.readVInt(in);
- if (numTerms == 0) {
- bytes = null;
- return;
- }
- bytes = new byte[WritableUtils.readVInt(in)];
- in.readFully(bytes);
- }
-
- // Passing in docno and tf basically for error checking purposes.
- protected static void writePositions(BitOutputStream t, TermPositions p) throws IOException {
- int[] pos = p.getPositions();
-
- if (p.getTf() == 1) {
- // If tf=1, just write out the single term position.
- t.writeGamma(pos[0]);
- } else {
- // If tf > 1, write out skip information if we want to bypass the
- // positional information during decoding.
- t.writeGamma(p.getEncodedSize());
-
- // Keep track of where we are in the stream.
- int skip1 = (int) t.getByteOffset() * 8 + t.getBitOffset();
-
- // Write out first position.
- t.writeGamma(pos[0]);
- // Write out rest of positions using p-gaps (first order positional differences).
- for (int c = 1; c < p.getTf(); c++) {
- int pgap = pos[c] - pos[c - 1];
- if (pos[c] <= 0 || pgap == 0) {
- throw new RuntimeException("Error: invalid term positions. positions=" + p.toString());
- }
- t.writeGamma(pgap);
- }
-
- // Find out where we are in the stream now.
- int skip2 = (int) t.getByteOffset() * 8 + t.getBitOffset();
-
- // Verify that the skip information is indeed valid.
- if (skip1 + p.getEncodedSize() != skip2) {
- throw new RuntimeException("Ivalid skip information: skip_pos1=" + skip1
- + ", skip_pos2=" + skip2 + ", size=" + p.getEncodedSize());
- }
- }
- }
-
- @Override
- public String toString() {
- StringBuffer s = new StringBuffer("[");
- try {
- Reader r = this.getReader();
- while (r.hasMoreTerms()) {
- int id = r.nextTerm();
- TermPositions pos = new TermPositions();
- r.getPositions(pos);
- s.append("(" + id + ", " + pos.getTf() + ", " + pos + ")");
- }
- s.append("]");
- } catch (Exception e) {
- e.printStackTrace();
- }
- return s.toString();
- }
-
- public String toStringWithTerms(DefaultFrequencySortedDictionary map) {
- StringBuffer s = new StringBuffer("");
- try {
- Reader r = this.getReader();
- while (r.hasMoreTerms()) {
- int id = r.nextTerm();
- TermPositions pos = new TermPositions();
- r.getPositions(pos);
- s.append(String.format("(%d, %d, %s)", map.getTerm(id), pos.getTf(), pos));
- }
- s.append("]");
- } catch (Exception e) {
- e.printStackTrace();
- }
- return s.toString();
- }
-
- @Override
- public Reader getReader() throws IOException {
- return new Reader(bytes, numTerms);
- }
-
- public static class Reader implements IntDocVector.Reader {
- private ByteArrayInputStream bytesIn;
- private BitInputStream bitsIn;
- private int p = -1;
- private int prevTermID = -1;
- private short prevTf = -1;
- private int termCnt;
- private boolean needToReadPositions = false;
-
- public Reader(byte[] bytes, int n) throws IOException {
- this.termCnt = n;
- if (termCnt > 0) {
- bytesIn = new ByteArrayInputStream(bytes);
- bitsIn = new BitInputStream(bytesIn);
- }
- }
-
- @Override
- public int getNumberOfTerms() {
- return termCnt;
- }
-
- @Override
- public short getTf() {
- return prevTf;
- }
-
- @Override
- public void reset() {
- try {
- bytesIn.reset();
- bitsIn = new BitInputStream(bytesIn);
- p = -1;
- prevTf = -1;
- needToReadPositions = false;
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public int nextTerm() {
- int id = -1;
- try {
- p++;
- if (needToReadPositions) {
- skipPositions(prevTf);
- }
- needToReadPositions = true;
- if (p == 0) {
- prevTermID = bitsIn.readBinary(32);
- prevTf = (short) bitsIn.readGamma();
- return prevTermID;
- } else {
- if (p > termCnt - 1) {
- return -1;
- }
- id = bitsIn.readGamma() + prevTermID;
- prevTermID = id;
- prevTf = (short) bitsIn.readGamma();
- return id;
- }
- } catch (IOException e) {
- e.printStackTrace();
- throw new RuntimeException();
- }
- }
-
- @Override
- public int[] getPositions() {
- int[] pos = null;
- try {
- if (prevTf == 1) {
- pos = new int[1];
- pos[0] = bitsIn.readGamma();
- } else {
- bitsIn.readGamma();
- pos = new int[prevTf];
- pos[0] = bitsIn.readGamma();
- for (int i = 1; i < prevTf; i++) {
- pos[i] = (pos[i - 1] + bitsIn.readGamma());
- }
- }
- } catch (IOException e) {
- throw new RuntimeException("Error reading bits:", e);
- }
- needToReadPositions = false;
-
- return pos;
- }
-
- @Override
- public boolean getPositions(TermPositions tp) {
- int[] pos = getPositions();
-
- if (pos == null) {
- return false;
- }
-
- tp.set(pos, (short) pos.length);
-
- return true;
- }
-
- @Override
- public boolean hasMoreTerms() {
- return !(p >= termCnt - 1);
- }
-
- private void skipPositions(int tf) throws IOException {
- if (tf == 1) {
- bitsIn.readGamma();
- } else {
- bitsIn.skipBits(bitsIn.readGamma());
- }
- }
- }
- }