
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * Reduce Sink Operator sends output to the reduce stage.
 */
public class ReduceSinkOperator extends TerminalOperator<ReduceSinkDesc>
    implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * The evaluators for the key columns. Key columns decide the sort order on
   * the reducer side. Key columns are passed to the reducer in the "key".
   */
  protected transient ExprNodeEvaluator[] keyEval;
  /**
   * The evaluators for the value columns. Value columns are passed to the
   * reducer in the "value".
   */
  protected transient ExprNodeEvaluator[] valueEval;
  /**
   * The evaluators for the partition columns (CLUSTER BY or DISTRIBUTE BY in
   * the Hive language). Partition columns decide which reducer the current
   * row goes to. Partition columns are not passed to the reducer.
   */
  protected transient ExprNodeEvaluator[] partitionEval;
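
  // Illustrative example (not part of the original source): for a query such
  // as
  //   SELECT src.key, src.value FROM src CLUSTER BY src.key
  // keyEval and partitionEval would both evaluate src.key, while valueEval
  // would evaluate src.value, so rows reach the reducers partitioned and
  // sorted by key.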

  // TODO: we use MetadataTypedColumnsetSerDe for now, till DynamicSerDe is
  // ready
  transient Serializer keySerializer;
  transient boolean keyIsText;
  transient Serializer valueSerializer;
  transient int tag;
  transient byte[] tagByte = new byte[1];
  protected transient int numDistributionKeys;
  protected transient int numDistinctExprs;

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {

    try {
      keyEval = new ExprNodeEvaluator[conf.getKeyCols().size()];
      int i = 0;
      for (ExprNodeDesc e : conf.getKeyCols()) {
        keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }

      numDistributionKeys = conf.getNumDistributionKeys();
      distinctColIndices = conf.getDistinctColumnIndices();
      numDistinctExprs = distinctColIndices.size();

      valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
      i = 0;
      for (ExprNodeDesc e : conf.getValueCols()) {
        valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }

      partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
      i = 0;
      for (ExprNodeDesc e : conf.getPartitionCols()) {
        partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }

      tag = conf.getTag();
      tagByte[0] = (byte) tag;
      LOG.info("Using tag = " + tag);

      TableDesc keyTableDesc = conf.getKeySerializeInfo();
      keySerializer = (Serializer) keyTableDesc.getDeserializerClass()
          .newInstance();
      keySerializer.initialize(null, keyTableDesc.getProperties());
      keyIsText = keySerializer.getSerializedClass().equals(Text.class);
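      // keyIsText selects which branch processOp takes when copying the
      // serialized key into keyWritable (Text vs. BytesWritable).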

      TableDesc valueTableDesc = conf.getValueSerializeInfo();
      valueSerializer = (Serializer) valueTableDesc.getDeserializerClass()
          .newInstance();
      valueSerializer.initialize(null, valueTableDesc.getProperties());

      firstRow = true;
      initializeChildren(hconf);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  transient InspectableObject tempInspectableObject = new InspectableObject();
  transient HiveKey keyWritable = new HiveKey();
  transient Writable value;

  transient StructObjectInspector keyObjectInspector;
  transient StructObjectInspector valueObjectInspector;
  transient ObjectInspector[] partitionObjectInspectors;

  transient Object[][] cachedKeys;
  transient Object[] cachedValues;
  transient List<List<Integer>> distinctColIndices;

  boolean firstRow;

  transient Random random;

  /**
   * Initializes the array of ExprNodeEvaluator and returns a
   * StructObjectInspector built from the output column names. When distinct
   * column indices are present (for group by), a trailing union field is
   * added to carry the distinct columns.
   *
   * If distinctColIndices is empty, the resulting object inspector is the
   * same as the one returned by
   * {@link Operator#initEvaluatorsAndReturnStruct(ExprNodeEvaluator[], List, ObjectInspector)}.
   */
  protected static StructObjectInspector initEvaluatorsAndReturnStruct(
      ExprNodeEvaluator[] evals, List<List<Integer>> distinctColIndices,
      List<String> outputColNames,
      int length, ObjectInspector rowInspector)
      throws HiveException {
    int inspectorLen = evals.length > length ? length + 1 : evals.length;
    List<ObjectInspector> sois = new ArrayList<ObjectInspector>(inspectorLen);

    // keys
    ObjectInspector[] fieldObjectInspectors = initEvaluators(evals, 0, length, rowInspector);
    sois.addAll(Arrays.asList(fieldObjectInspectors));

    if (evals.length > length) {
      // union keys
      List<ObjectInspector> uois = new ArrayList<ObjectInspector>();
      for (List<Integer> distinctCols : distinctColIndices) {
        List<String> names = new ArrayList<String>();
        List<ObjectInspector> eois = new ArrayList<ObjectInspector>();
        int numExprs = 0;
        for (int i : distinctCols) {
          names.add(HiveConf.getColumnInternalName(numExprs));
          eois.add(evals[i].initialize(rowInspector));
          numExprs++;
        }
        uois.add(ObjectInspectorFactory.getStandardStructObjectInspector(names, eois));
      }
      UnionObjectInspector uoi =
        ObjectInspectorFactory.getStandardUnionObjectInspector(uois);
      sois.add(uoi);
    }
    return ObjectInspectorFactory.getStandardStructObjectInspector(outputColNames, sois);
  }
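
  // Illustrative sketch (an assumption, not part of the original source):
  // with evals covering (k1, k2, d1, d2), length (numDistributionKeys) = 2,
  // and distinctColIndices = [[2], [3]], the returned inspector describes
  //   struct<..., uniontype<struct<d1>, struct<d2>>>
  // where the top-level field names come from outputColNames; each distinct
  // expression set becomes one alternative of the trailing union.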

  @Override
  public void processOp(Object row, int tag) throws HiveException {
    try {
      ObjectInspector rowInspector = inputObjInspectors[tag];
      if (firstRow) {
        firstRow = false;
        keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval,
            distinctColIndices,
            conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector);
        valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval, conf
            .getOutputValueColumnNames(), rowInspector);
        partitionObjectInspectors = initEvaluators(partitionEval, rowInspector);
        int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1;
        int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1 :
          numDistributionKeys;
        cachedKeys = new Object[numKeys][keyLen];
        cachedValues = new Object[valueEval.length];
      }
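
      // cachedKeys holds one row per distinct expression (a single row when
      // there are none); each row carries the distribution keys plus, when
      // distinct expressions exist, one trailing slot for the union field.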

      // Evaluate the hash code
      int keyHashCode = 0;
      if (partitionEval.length == 0) {
        // If there are no partition columns, just distribute the data
        // uniformly to provide better load balance. If the requirement is to
        // have a single reducer, we should set the number of reducers to 1.
        // Use a constant seed to make the code deterministic.
        if (random == null) {
          random = new Random(12345);
        }
        keyHashCode = random.nextInt();
      } else {
        for (int i = 0; i < partitionEval.length; i++) {
          Object o = partitionEval[i].evaluate(row);
          keyHashCode = keyHashCode * 31
              + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
        }
      }
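
      // Worked example (illustrative): with two partition columns evaluating
      // to objects a and b, the combined code is hash(a) * 31 + hash(b), the
      // usual polynomial scheme, so every partition column influences which
      // reducer receives the row.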

      // Evaluate the value
      for (int i = 0; i < valueEval.length; i++) {
        cachedValues[i] = valueEval[i].evaluate(row);
      }
      // Serialize the value
      value = valueSerializer.serialize(cachedValues, valueObjectInspector);

      // Evaluate the keys
      Object[] distributionKeys = new Object[numDistributionKeys];
      for (int i = 0; i < numDistributionKeys; i++) {
        distributionKeys[i] = keyEval[i].evaluate(row);
      }

      if (numDistinctExprs > 0) {
        // with distinct key(s)
        for (int i = 0; i < numDistinctExprs; i++) {
          System.arraycopy(distributionKeys, 0, cachedKeys[i], 0, numDistributionKeys);
          Object[] distinctParameters =
            new Object[distinctColIndices.get(i).size()];
          for (int j = 0; j < distinctParameters.length; j++) {
            distinctParameters[j] =
              keyEval[distinctColIndices.get(i).get(j)].evaluate(row);
          }
          cachedKeys[i][numDistributionKeys] =
              new StandardUnion((byte) i, distinctParameters);
        }
      } else {
        // no distinct key
        System.arraycopy(distributionKeys, 0, cachedKeys[0], 0, numDistributionKeys);
      }
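
      // Illustrative sketch: for a plan computing COUNT(DISTINCT c1) and
      // COUNT(DISTINCT c2) grouped by k, each input row emits two keys:
      //   cachedKeys[0] = [k, union(tag 0, [c1])]
      //   cachedKeys[1] = [k, union(tag 1, [c2])]
      // The union tag tells the reduce side which distinct expression the
      // trailing values belong to.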
      // Serialize the keys and append the tag
      for (int i = 0; i < cachedKeys.length; i++) {
        if (keyIsText) {
          Text key = (Text) keySerializer.serialize(cachedKeys[i],
              keyObjectInspector);
          if (tag == -1) {
            keyWritable.set(key.getBytes(), 0, key.getLength());
          } else {
            int keyLength = key.getLength();
            keyWritable.setSize(keyLength + 1);
            System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength);
            keyWritable.get()[keyLength] = tagByte[0];
          }
        } else {
          // Must be BytesWritable
          BytesWritable key = (BytesWritable) keySerializer.serialize(
              cachedKeys[i], keyObjectInspector);
          if (tag == -1) {
            keyWritable.set(key.getBytes(), 0, key.getLength());
          } else {
            int keyLength = key.getLength();
            keyWritable.setSize(keyLength + 1);
            System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength);
            keyWritable.get()[keyLength] = tagByte[0];
          }
        }
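        // Key layout (sketch): with tag >= 0 the emitted bytes are
        //   [serialized key bytes][1-byte tag]
        // so rows arriving from different operators (e.g. the two sides of a
        // join) stay distinguishable on the reduce side while still sorting
        // together on the key prefix.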
        keyWritable.setHashCode(keyHashCode);
        if (out != null) {
          out.collect(keyWritable, value);
          // Since this is a terminal operator, update counters explicitly;
          // forward is not called
          if (counterNameToEnum != null) {
            ++outputRows;
            if (outputRows % 1000 == 0) {
              incrCounter(numOutputRowsCntr, outputRows);
              outputRows = 0;
            }
          }
        }
      }
    } catch (SerDeException e) {
      throw new HiveException(e);
    } catch (IOException e) {
      throw new HiveException(e);
    }
  }

  /**
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return "RS";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.REDUCESINK;
  }
}