/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * Reduce Sink Operator sends output to the reduce stage.
 */
public class ReduceSinkOperator extends TerminalOperator<ReduceSinkDesc>
    implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * The evaluators for the key columns. Key columns decide the sort order on
   * the reducer side. Key columns are passed to the reducer in the "key".
   */
  protected transient ExprNodeEvaluator[] keyEval;

  /**
   * The evaluators for the value columns. Value columns are passed to the
   * reducer in the "value".
   */
  protected transient ExprNodeEvaluator[] valueEval;

  /**
   * The evaluators for the partition columns (CLUSTER BY or DISTRIBUTE BY in
   * Hive language). Partition columns decide which reducer the current row
   * goes to. Partition columns are not passed to the reducer.
   */
  protected transient ExprNodeEvaluator[] partitionEval;

  // TODO: we use MetadataTypedColumnsetSerDe for now, until DynamicSerDe is
  // ready.
  transient Serializer keySerializer;
  transient boolean keyIsText;
  transient Serializer valueSerializer;
  transient int tag;
  transient byte[] tagByte = new byte[1];
  protected transient int numDistributionKeys;
  protected transient int numDistinctExprs;
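
  // The "tag" identifies which parent pipeline a row came from (typically
  // one side of a join). When tag != -1, processOp() appends it as a single
  // trailing byte to every serialized key so that the reduce side can
  // demultiplex incoming rows by origin.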

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    try {
      keyEval = new ExprNodeEvaluator[conf.getKeyCols().size()];
      int i = 0;
      for (ExprNodeDesc e : conf.getKeyCols()) {
        keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }

      numDistributionKeys = conf.getNumDistributionKeys();
      distinctColIndices = conf.getDistinctColumnIndices();
      numDistinctExprs = distinctColIndices.size();

      valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
      i = 0;
      for (ExprNodeDesc e : conf.getValueCols()) {
        valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }

      partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
      i = 0;
      for (ExprNodeDesc e : conf.getPartitionCols()) {
        partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }

      tag = conf.getTag();
      tagByte[0] = (byte) tag;
      LOG.info("Using tag = " + tag);

      TableDesc keyTableDesc = conf.getKeySerializeInfo();
      keySerializer = (Serializer) keyTableDesc.getDeserializerClass()
          .newInstance();
      keySerializer.initialize(null, keyTableDesc.getProperties());
      keyIsText = keySerializer.getSerializedClass().equals(Text.class);

      TableDesc valueTableDesc = conf.getValueSerializeInfo();
      valueSerializer = (Serializer) valueTableDesc.getDeserializerClass()
          .newInstance();
      valueSerializer.initialize(null, valueTableDesc.getProperties());

      firstRow = true;
      initializeChildren(hconf);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }
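
  // Scratch state reused across processOp() calls: these objects are
  // allocated once (or on the first row) so that no per-row allocation is
  // needed for keys and values.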
  transient InspectableObject tempInspectableObject = new InspectableObject();
  transient HiveKey keyWritable = new HiveKey();
  transient Writable value;

  transient StructObjectInspector keyObjectInspector;
  transient StructObjectInspector valueObjectInspector;
  transient ObjectInspector[] partitionObjectInspectors;

  transient Object[][] cachedKeys;
  transient Object[] cachedValues;
  transient List<List<Integer>> distinctColIndices;

  boolean firstRow;

  transient Random random;

  /**
   * Initializes the array of ExprNodeEvaluators. Adds a union field for the
   * distinct column indices used by group-by, and puts the return values into
   * a StructObjectInspector with the given output column names.
   *
   * If distinctColIndices is empty, the resulting object inspector is the
   * same as that of
   * {@link Operator#initEvaluatorsAndReturnStruct(ExprNodeEvaluator[], List, ObjectInspector)}.
   */
  protected static StructObjectInspector initEvaluatorsAndReturnStruct(
      ExprNodeEvaluator[] evals, List<List<Integer>> distinctColIndices,
      List<String> outputColNames,
      int length, ObjectInspector rowInspector)
      throws HiveException {
    int inspectorLen = evals.length > length ? length + 1 : evals.length;
    List<ObjectInspector> sois = new ArrayList<ObjectInspector>(inspectorLen);

    // Keys: the first 'length' evaluators are the distribution keys.
    ObjectInspector[] fieldObjectInspectors =
        initEvaluators(evals, 0, length, rowInspector);
    sois.addAll(Arrays.asList(fieldObjectInspectors));

    if (evals.length > length) {
      // Union keys: one struct alternative per DISTINCT expression.
      List<ObjectInspector> uois = new ArrayList<ObjectInspector>();
      for (List<Integer> distinctCols : distinctColIndices) {
        List<String> names = new ArrayList<String>();
        List<ObjectInspector> eois = new ArrayList<ObjectInspector>();
        int numExprs = 0;
        for (int i : distinctCols) {
          names.add(HiveConf.getColumnInternalName(numExprs));
          eois.add(evals[i].initialize(rowInspector));
          numExprs++;
        }
        uois.add(ObjectInspectorFactory.getStandardStructObjectInspector(names, eois));
      }
      UnionObjectInspector uoi =
          ObjectInspectorFactory.getStandardUnionObjectInspector(uois);
      sois.add(uoi);
    }
    return ObjectInspectorFactory.getStandardStructObjectInspector(outputColNames, sois);
  }
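
  // Illustrative sketch (hypothetical query, not from this file): for
  //   SELECT key, count(DISTINCT v1), count(DISTINCT v2) FROM t GROUP BY key
  // numDistributionKeys is 1 and there are two distinct expressions, so the
  // struct built above is roughly { key, union { struct {v1}, struct {v2} } }:
  // one field per distribution key plus a trailing union with one struct
  // alternative per DISTINCT expression.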

  @Override
  public void processOp(Object row, int tag) throws HiveException {
    try {
      ObjectInspector rowInspector = inputObjInspectors[tag];
      if (firstRow) {
        firstRow = false;
        keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval,
            distinctColIndices,
            conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector);
        valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval, conf
            .getOutputValueColumnNames(), rowInspector);
        partitionObjectInspectors = initEvaluators(partitionEval, rowInspector);
        int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1;
        int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1
            : numDistributionKeys;
        cachedKeys = new Object[numKeys][keyLen];
        cachedValues = new Object[valueEval.length];
      }
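
      // cachedKeys has one row per DISTINCT expression (or a single row when
      // there are none); each row holds the distribution keys plus, when
      // needed, one trailing slot for the distinct-value union.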

      // Evaluate the hash code.
      int keyHashCode = 0;
      if (partitionEval.length == 0) {
        // If there are no partition columns, just distribute the data
        // uniformly to provide better load balance. If the requirement is to
        // have a single reducer, we should set the number of reducers to 1.
        // Use a constant seed to make the code deterministic.
        if (random == null) {
          random = new Random(12345);
        }
        keyHashCode = random.nextInt();
      } else {
        for (int i = 0; i < partitionEval.length; i++) {
          Object o = partitionEval[i].evaluate(row);
          keyHashCode = keyHashCode * 31
              + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
        }
      }
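
      // The hash code is attached to the HiveKey below; the MapReduce
      // partitioner uses it to pick the target reducer, so rows with equal
      // partition columns land on the same reducer.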

      // Evaluate the value columns.
      for (int i = 0; i < valueEval.length; i++) {
        cachedValues[i] = valueEval[i].evaluate(row);
      }
      // Serialize the value.
      value = valueSerializer.serialize(cachedValues, valueObjectInspector);

      // Evaluate the keys.
      Object[] distributionKeys = new Object[numDistributionKeys];
      for (int i = 0; i < numDistributionKeys; i++) {
        distributionKeys[i] = keyEval[i].evaluate(row);
      }
      if (numDistinctExprs > 0) {
        // With distinct key(s).
        for (int i = 0; i < numDistinctExprs; i++) {
          System.arraycopy(distributionKeys, 0, cachedKeys[i], 0, numDistributionKeys);
          Object[] distinctParameters =
              new Object[distinctColIndices.get(i).size()];
          for (int j = 0; j < distinctParameters.length; j++) {
            distinctParameters[j] =
                keyEval[distinctColIndices.get(i).get(j)].evaluate(row);
          }
          cachedKeys[i][numDistributionKeys] =
              new StandardUnion((byte) i, distinctParameters);
        }
      } else {
        // No distinct keys.
        System.arraycopy(distributionKeys, 0, cachedKeys[0], 0, numDistributionKeys);
      }
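
      // Every cachedKeys[i] carries identical distribution keys, so all
      // copies hash to the same reducer; the StandardUnion's tag byte (i)
      // tells the reduce-side group-by which DISTINCT expression the
      // attached values belong to.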

      // Serialize the keys and append the tag.
      for (int i = 0; i < cachedKeys.length; i++) {
        if (keyIsText) {
          Text key = (Text) keySerializer.serialize(cachedKeys[i],
              keyObjectInspector);
          if (tag == -1) {
            keyWritable.set(key.getBytes(), 0, key.getLength());
          } else {
            int keyLength = key.getLength();
            keyWritable.setSize(keyLength + 1);
            System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength);
            keyWritable.get()[keyLength] = tagByte[0];
          }
        } else {
          // Must be BytesWritable.
          BytesWritable key = (BytesWritable) keySerializer.serialize(
              cachedKeys[i], keyObjectInspector);
          if (tag == -1) {
            keyWritable.set(key.getBytes(), 0, key.getLength());
          } else {
            int keyLength = key.getLength();
            keyWritable.setSize(keyLength + 1);
            System.arraycopy(key.getBytes(), 0, keyWritable.get(), 0, keyLength);
            keyWritable.get()[keyLength] = tagByte[0];
          }
        }
        keyWritable.setHashCode(keyHashCode);
        if (out != null) {
          out.collect(keyWritable, value);
          // Since this is a terminal operator, update counters explicitly;
          // forward is not called.
          if (counterNameToEnum != null) {
            ++outputRows;
            if (outputRows % 1000 == 0) {
              incrCounter(numOutputRowsCntr, outputRows);
              outputRows = 0;
            }
          }
        }
      }
    } catch (SerDeException e) {
      throw new HiveException(e);
    } catch (IOException e) {
      throw new HiveException(e);
    }
  }

  /**
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return "RS";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.REDUCESINK;
  }
}
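
// Context (an illustrative note, not part of the original source):
// ReduceSinkOperator is typically the terminal operator of the map-side
// operator tree in a Hive MapReduce task. Each processOp() call emits one
// (HiveKey, value) pair per cached key through out.collect(); the shuffle
// then sorts rows by the serialized key bytes and routes them to reducers
// using the hash code set on the HiveKey.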