PageRenderTime 96ms CodeModel.GetById 21ms app.highlight 68ms RepoModel.GetById 1ms app.codeStats 0ms

/tags/release-0.1-rc2/hive/external/serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableSerDe.java

#
Java | 601 lines | 483 code | 36 blank | 82 comment | 91 complexity | 90302c7a8ed143beeb1781bdc7a71cd1 MD5 | raw file
  1/**
  2 * Licensed to the Apache Software Foundation (ASF) under one
  3 * or more contributor license agreements.  See the NOTICE file
  4 * distributed with this work for additional information
  5 * regarding copyright ownership.  The ASF licenses this file
  6 * to you under the Apache License, Version 2.0 (the
  7 * "License"); you may not use this file except in compliance
  8 * with the License.  You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */
 18
 19package org.apache.hadoop.hive.serde2.binarysortable;
 20
 21import java.io.IOException;
 22import java.util.ArrayList;
 23import java.util.Arrays;
 24import java.util.HashMap;
 25import java.util.List;
 26import java.util.Map;
 27import java.util.Properties;
 28
 29import org.apache.commons.logging.Log;
 30import org.apache.commons.logging.LogFactory;
 31import org.apache.hadoop.conf.Configuration;
 32import org.apache.hadoop.hive.serde.Constants;
 33import org.apache.hadoop.hive.serde2.SerDe;
 34import org.apache.hadoop.hive.serde2.SerDeException;
 35import org.apache.hadoop.hive.serde2.io.ByteWritable;
 36import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 37import org.apache.hadoop.hive.serde2.io.ShortWritable;
 38import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 39import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 40import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 41import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 42import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 43import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 44import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
 45import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
 46import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
 47import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
 48import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
 49import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
 50import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
 51import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
 52import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
 53import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
 54import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
 55import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
 56import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 57import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 58import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 59import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 60import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 61import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
 62import org.apache.hadoop.io.BooleanWritable;
 63import org.apache.hadoop.io.BytesWritable;
 64import org.apache.hadoop.io.FloatWritable;
 65import org.apache.hadoop.io.IntWritable;
 66import org.apache.hadoop.io.LongWritable;
 67import org.apache.hadoop.io.Text;
 68import org.apache.hadoop.io.Writable;
 69
 70/**
 71 * BinarySortableSerDe can be used to write data in a way that the data can be
 72 * compared byte-by-byte with the same order.
 73 * 
 74 * The data format: NULL: a single byte \0 NON-NULL Primitives: ALWAYS prepend a
 75 * single byte \1, and then: Boolean: FALSE = \1, TRUE = \2 Byte: flip the
 76 * sign-bit to make sure negative comes before positive Short: flip the sign-bit
 77 * to make sure negative comes before positive Int: flip the sign-bit to make
 78 * sure negative comes before positive Long: flip the sign-bit to make sure
 79 * negative comes before positive Double: flip the sign-bit for positive double,
 80 * and all bits for negative double values String: NULL-terminated UTF-8 string,
 81 * with NULL escaped to \1 \1, and \1 escaped to \1 \2 NON-NULL Complex Types:
 82 * ALWAYS prepend a single byte \1, and then: Struct: one field by one field.
 83 * List: \1 followed by each element, and \0 to terminate Map: \1 followed by
 84 * each key and then each value, and \0 to terminate
 85 * 
 86 * This SerDe takes an additional parameter SERIALIZATION_SORT_ORDER which is a
 87 * string containing only "+" and "-". The length of the string should equal to
 88 * the number of fields in the top-level struct for serialization. "+" means the
 89 * field should be sorted ascendingly, and "-" means descendingly. The sub
 90 * fields in the same top-level field will have the same sort order.
 91 * 
 92 */
 93public class BinarySortableSerDe implements SerDe {
 94
 95  public static final Log LOG = LogFactory.getLog(BinarySortableSerDe.class
 96      .getName());
 97
 98  List<String> columnNames;
 99  List<TypeInfo> columnTypes;
100
101  TypeInfo rowTypeInfo;
102  StructObjectInspector rowObjectInspector;
103
104  boolean[] columnSortOrderIsDesc;
105
106  @Override
107  public void initialize(Configuration conf, Properties tbl)
108      throws SerDeException {
109
110    // Get column names and sort order
111    String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS);
112    String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES);
113    if (columnNameProperty.length() == 0) {
114      columnNames = new ArrayList<String>();
115    } else {
116      columnNames = Arrays.asList(columnNameProperty.split(","));
117    }
118    if (columnTypeProperty.length() == 0) {
119      columnTypes = new ArrayList<TypeInfo>();
120    } else {
121      columnTypes = TypeInfoUtils
122          .getTypeInfosFromTypeString(columnTypeProperty);
123    }
124    assert (columnNames.size() == columnTypes.size());
125
126    // Create row related objects
127    rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
128    rowObjectInspector = (StructObjectInspector) TypeInfoUtils
129        .getStandardWritableObjectInspectorFromTypeInfo(rowTypeInfo);
130    row = new ArrayList<Object>(columnNames.size());
131    for (int i = 0; i < columnNames.size(); i++) {
132      row.add(null);
133    }
134
135    // Get the sort order
136    String columnSortOrder = tbl
137        .getProperty(Constants.SERIALIZATION_SORT_ORDER);
138    columnSortOrderIsDesc = new boolean[columnNames.size()];
139    for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
140      columnSortOrderIsDesc[i] = (columnSortOrder != null && columnSortOrder
141          .charAt(i) == '-');
142    }
143  }
144
145  @Override
146  public Class<? extends Writable> getSerializedClass() {
147    return BytesWritable.class;
148  }
149
150  @Override
151  public ObjectInspector getObjectInspector() throws SerDeException {
152    return rowObjectInspector;
153  }
154
155  ArrayList<Object> row;
156  InputByteBuffer inputByteBuffer = new InputByteBuffer();
157
158  @Override
159  public Object deserialize(Writable blob) throws SerDeException {
160    BytesWritable data = (BytesWritable) blob;
161    inputByteBuffer.reset(data.get(), 0, data.getSize());
162
163    try {
164      for (int i = 0; i < columnNames.size(); i++) {
165        row.set(i, deserialize(inputByteBuffer, columnTypes.get(i),
166            columnSortOrderIsDesc[i], row.get(i)));
167      }
168    } catch (IOException e) {
169      throw new SerDeException(e);
170    }
171
172    return row;
173  }
174
175  static Object deserialize(InputByteBuffer buffer, TypeInfo type,
176      boolean invert, Object reuse) throws IOException {
177
178    // Is this field a null?
179    byte isNull = buffer.read(invert);
180    if (isNull == 0) {
181      return null;
182    }
183    assert (isNull == 1);
184
185    switch (type.getCategory()) {
186    case PRIMITIVE: {
187      PrimitiveTypeInfo ptype = (PrimitiveTypeInfo) type;
188      switch (ptype.getPrimitiveCategory()) {
189      case VOID: {
190        return null;
191      }
192      case BOOLEAN: {
193        BooleanWritable r = reuse == null ? new BooleanWritable()
194            : (BooleanWritable) reuse;
195        byte b = buffer.read(invert);
196        assert (b == 1 || b == 2);
197        r.set(b == 2);
198        return r;
199      }
200      case BYTE: {
201        ByteWritable r = reuse == null ? new ByteWritable()
202            : (ByteWritable) reuse;
203        r.set((byte) (buffer.read(invert) ^ 0x80));
204        return r;
205      }
206      case SHORT: {
207        ShortWritable r = reuse == null ? new ShortWritable()
208            : (ShortWritable) reuse;
209        int v = buffer.read(invert) ^ 0x80;
210        v = (v << 8) + (buffer.read(invert) & 0xff);
211        r.set((short) v);
212        return r;
213      }
214      case INT: {
215        IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
216        int v = buffer.read(invert) ^ 0x80;
217        for (int i = 0; i < 3; i++) {
218          v = (v << 8) + (buffer.read(invert) & 0xff);
219        }
220        r.set(v);
221        return r;
222      }
223      case LONG: {
224        LongWritable r = reuse == null ? new LongWritable()
225            : (LongWritable) reuse;
226        long v = buffer.read(invert) ^ 0x80;
227        for (int i = 0; i < 7; i++) {
228          v = (v << 8) + (buffer.read(invert) & 0xff);
229        }
230        r.set(v);
231        return r;
232      }
233      case FLOAT: {
234        FloatWritable r = reuse == null ? new FloatWritable()
235            : (FloatWritable) reuse;
236        int v = 0;
237        for (int i = 0; i < 4; i++) {
238          v = (v << 8) + (buffer.read(invert) & 0xff);
239        }
240        if ((v & (1 << 31)) == 0) {
241          // negative number, flip all bits
242          v = ~v;
243        } else {
244          // positive number, flip the first bit
245          v = v ^ (1 << 31);
246        }
247        r.set(Float.intBitsToFloat(v));
248        return r;
249      }
250      case DOUBLE: {
251        DoubleWritable r = reuse == null ? new DoubleWritable()
252            : (DoubleWritable) reuse;
253        long v = 0;
254        for (int i = 0; i < 8; i++) {
255          v = (v << 8) + (buffer.read(invert) & 0xff);
256        }
257        if ((v & (1L << 63)) == 0) {
258          // negative number, flip all bits
259          v = ~v;
260        } else {
261          // positive number, flip the first bit
262          v = v ^ (1L << 63);
263        }
264        r.set(Double.longBitsToDouble(v));
265        return r;
266      }
267      case STRING: {
268        Text r = reuse == null ? new Text() : (Text) reuse;
269        // Get the actual length first
270        int start = buffer.tell();
271        int length = 0;
272        do {
273          byte b = buffer.read(invert);
274          if (b == 0) {
275            // end of string
276            break;
277          }
278          if (b == 1) {
279            // the last char is an escape char. read the actual char
280            buffer.read(invert);
281          }
282          length++;
283        } while (true);
284
285        if (length == buffer.tell() - start) {
286          // No escaping happened, so we are already done.
287          r.set(buffer.getData(), start, length);
288        } else {
289          // Escaping happened, we need to copy byte-by-byte.
290          // 1. Set the length first.
291          r.set(buffer.getData(), start, length);
292          // 2. Reset the pointer.
293          buffer.seek(start);
294          // 3. Copy the data.
295          byte[] rdata = r.getBytes();
296          for (int i = 0; i < length; i++) {
297            byte b = buffer.read(invert);
298            if (b == 1) {
299              // The last char is an escape char, read the actual char.
300              // The serialization format escape \0 to \1, and \1 to \2,
301              // to make sure the string is null-terminated.
302              b = (byte) (buffer.read(invert) - 1);
303            }
304            rdata[i] = b;
305          }
306          // 4. Read the null terminator.
307          byte b = buffer.read(invert);
308          assert (b == 0);
309        }
310        return r;
311      }
312      default: {
313        throw new RuntimeException("Unrecognized type: "
314            + ptype.getPrimitiveCategory());
315      }
316      }
317    }
318    case LIST: {
319      ListTypeInfo ltype = (ListTypeInfo) type;
320      TypeInfo etype = ltype.getListElementTypeInfo();
321
322      // Create the list if needed
323      ArrayList<Object> r = reuse == null ? new ArrayList<Object>()
324          : (ArrayList<Object>) reuse;
325
326      // Read the list
327      int size = 0;
328      while (true) {
329        int more = buffer.read(invert);
330        if (more == 0) {
331          // \0 to terminate
332          break;
333        }
334        // \1 followed by each element
335        assert (more == 1);
336        if (size == r.size()) {
337          r.add(null);
338        }
339        r.set(size, deserialize(buffer, etype, invert, r.get(size)));
340        size++;
341      }
342      // Remove additional elements if the list is reused
343      while (r.size() > size) {
344        r.remove(r.size() - 1);
345      }
346      return r;
347    }
348    case MAP: {
349      MapTypeInfo mtype = (MapTypeInfo) type;
350      TypeInfo ktype = mtype.getMapKeyTypeInfo();
351      TypeInfo vtype = mtype.getMapValueTypeInfo();
352
353      // Create the map if needed
354      Map<Object, Object> r;
355      if (reuse == null) {
356        r = new HashMap<Object, Object>();
357      } else {
358        r = (HashMap<Object, Object>) reuse;
359        r.clear();
360      }
361
362      while (true) {
363        int more = buffer.read(invert);
364        if (more == 0) {
365          // \0 to terminate
366          break;
367        }
368        // \1 followed by each key and then each value
369        assert (more == 1);
370        Object k = deserialize(buffer, ktype, invert, null);
371        Object v = deserialize(buffer, vtype, invert, null);
372        r.put(k, v);
373      }
374      return r;
375    }
376    case STRUCT: {
377      StructTypeInfo stype = (StructTypeInfo) type;
378      List<TypeInfo> fieldTypes = stype.getAllStructFieldTypeInfos();
379      int size = fieldTypes.size();
380      // Create the struct if needed
381      ArrayList<Object> r = reuse == null ? new ArrayList<Object>(size)
382          : (ArrayList<Object>) reuse;
383      assert (r.size() <= size);
384      // Set the size of the struct
385      while (r.size() < size) {
386        r.add(null);
387      }
388      // Read one field by one field
389      for (int eid = 0; eid < size; eid++) {
390        r
391            .set(eid, deserialize(buffer, fieldTypes.get(eid), invert, r
392            .get(eid)));
393      }
394      return r;
395    }
396    case UNION: {
397      UnionTypeInfo utype = (UnionTypeInfo) type;
398      StandardUnion r = reuse == null ? new StandardUnion()
399          : (StandardUnion) reuse;
400      // Read the tag
401      byte tag = buffer.read(invert);
402      r.setTag(tag);
403      r.setObject(deserialize(buffer, utype.getAllUnionObjectTypeInfos().get(tag),
404          invert, null));
405      return r;
406    }
407    default: {
408      throw new RuntimeException("Unrecognized type: " + type.getCategory());
409    }
410    }
411  }
412
413  BytesWritable serializeBytesWritable = new BytesWritable();
414  OutputByteBuffer outputByteBuffer = new OutputByteBuffer();
415
416  @Override
417  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
418    outputByteBuffer.reset();
419    StructObjectInspector soi = (StructObjectInspector) objInspector;
420    List<? extends StructField> fields = soi.getAllStructFieldRefs();
421
422    for (int i = 0; i < columnNames.size(); i++) {
423      serialize(outputByteBuffer, soi.getStructFieldData(obj, fields.get(i)),
424          fields.get(i).getFieldObjectInspector(), columnSortOrderIsDesc[i]);
425    }
426
427    serializeBytesWritable.set(outputByteBuffer.getData(), 0, outputByteBuffer
428        .getLength());
429    return serializeBytesWritable;
430  }
431
432  static void serialize(OutputByteBuffer buffer, Object o, ObjectInspector oi,
433      boolean invert) {
434    // Is this field a null?
435    if (o == null) {
436      buffer.write((byte) 0, invert);
437      return;
438    }
439    // This field is not a null.
440    buffer.write((byte) 1, invert);
441
442    switch (oi.getCategory()) {
443    case PRIMITIVE: {
444      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
445      switch (poi.getPrimitiveCategory()) {
446      case VOID: {
447        return;
448      }
449      case BOOLEAN: {
450        boolean v = ((BooleanObjectInspector) poi).get(o);
451        buffer.write((byte) (v ? 2 : 1), invert);
452        return;
453      }
454      case BYTE: {
455        ByteObjectInspector boi = (ByteObjectInspector) poi;
456        byte v = boi.get(o);
457        buffer.write((byte) (v ^ 0x80), invert);
458        return;
459      }
460      case SHORT: {
461        ShortObjectInspector spoi = (ShortObjectInspector) poi;
462        short v = spoi.get(o);
463        buffer.write((byte) ((v >> 8) ^ 0x80), invert);
464        buffer.write((byte) v, invert);
465        return;
466      }
467      case INT: {
468        IntObjectInspector ioi = (IntObjectInspector) poi;
469        int v = ioi.get(o);
470        buffer.write((byte) ((v >> 24) ^ 0x80), invert);
471        buffer.write((byte) (v >> 16), invert);
472        buffer.write((byte) (v >> 8), invert);
473        buffer.write((byte) v, invert);
474        return;
475      }
476      case LONG: {
477        LongObjectInspector loi = (LongObjectInspector) poi;
478        long v = loi.get(o);
479        buffer.write((byte) ((v >> 56) ^ 0x80), invert);
480        buffer.write((byte) (v >> 48), invert);
481        buffer.write((byte) (v >> 40), invert);
482        buffer.write((byte) (v >> 32), invert);
483        buffer.write((byte) (v >> 24), invert);
484        buffer.write((byte) (v >> 16), invert);
485        buffer.write((byte) (v >> 8), invert);
486        buffer.write((byte) v, invert);
487        return;
488      }
489      case FLOAT: {
490        FloatObjectInspector foi = (FloatObjectInspector) poi;
491        int v = Float.floatToIntBits(foi.get(o));
492        if ((v & (1 << 31)) != 0) {
493          // negative number, flip all bits
494          v = ~v;
495        } else {
496          // positive number, flip the first bit
497          v = v ^ (1 << 31);
498        }
499        buffer.write((byte) (v >> 24), invert);
500        buffer.write((byte) (v >> 16), invert);
501        buffer.write((byte) (v >> 8), invert);
502        buffer.write((byte) v, invert);
503        return;
504      }
505      case DOUBLE: {
506        DoubleObjectInspector doi = (DoubleObjectInspector) poi;
507        long v = Double.doubleToLongBits(doi.get(o));
508        if ((v & (1L << 63)) != 0) {
509          // negative number, flip all bits
510          v = ~v;
511        } else {
512          // positive number, flip the first bit
513          v = v ^ (1L << 63);
514        }
515        buffer.write((byte) (v >> 56), invert);
516        buffer.write((byte) (v >> 48), invert);
517        buffer.write((byte) (v >> 40), invert);
518        buffer.write((byte) (v >> 32), invert);
519        buffer.write((byte) (v >> 24), invert);
520        buffer.write((byte) (v >> 16), invert);
521        buffer.write((byte) (v >> 8), invert);
522        buffer.write((byte) v, invert);
523        return;
524      }
525      case STRING: {
526        StringObjectInspector soi = (StringObjectInspector) poi;
527        Text t = soi.getPrimitiveWritableObject(o);
528        byte[] data = t.getBytes();
529        int length = t.getLength();
530        for (int i = 0; i < length; i++) {
531          if (data[i] == 0 || data[i] == 1) {
532            buffer.write((byte) 1, invert);
533            buffer.write((byte) (data[i] + 1), invert);
534          } else {
535            buffer.write(data[i], invert);
536          }
537        }
538        buffer.write((byte) 0, invert);
539        return;
540      }
541      default: {
542        throw new RuntimeException("Unrecognized type: "
543            + poi.getPrimitiveCategory());
544      }
545      }
546    }
547    case LIST: {
548      ListObjectInspector loi = (ListObjectInspector) oi;
549      ObjectInspector eoi = loi.getListElementObjectInspector();
550
551      // \1 followed by each element
552      int size = loi.getListLength(o);
553      for (int eid = 0; eid < size; eid++) {
554        buffer.write((byte) 1, invert);
555        serialize(buffer, loi.getListElement(o, eid), eoi, invert);
556      }
557      // and \0 to terminate
558      buffer.write((byte) 0, invert);
559      return;
560    }
561    case MAP: {
562      MapObjectInspector moi = (MapObjectInspector) oi;
563      ObjectInspector koi = moi.getMapKeyObjectInspector();
564      ObjectInspector voi = moi.getMapValueObjectInspector();
565
566      // \1 followed by each key and then each value
567      Map<?, ?> map = moi.getMap(o);
568      for (Map.Entry<?, ?> entry : map.entrySet()) {
569        buffer.write((byte) 1, invert);
570        serialize(buffer, entry.getKey(), koi, invert);
571        serialize(buffer, entry.getValue(), voi, invert);
572      }
573      // and \0 to terminate
574      buffer.write((byte) 0, invert);
575      return;
576    }
577    case STRUCT: {
578      StructObjectInspector soi = (StructObjectInspector) oi;
579      List<? extends StructField> fields = soi.getAllStructFieldRefs();
580
581      for (int i = 0; i < fields.size(); i++) {
582        serialize(buffer, soi.getStructFieldData(o, fields.get(i)), fields.get(
583            i).getFieldObjectInspector(), invert);
584      }
585      return;
586    }
587    case UNION: {
588      UnionObjectInspector uoi = (UnionObjectInspector) oi;
589      byte tag = uoi.getTag(o);
590      buffer.write(tag, invert);
591      serialize(buffer, uoi.getField(o), uoi.getObjectInspectors().get(tag),
592          invert);
593      return;
594    }
595    default: {
596      throw new RuntimeException("Unrecognized type: " + oi.getCategory());
597    }
598    }
599
600  }
601}