
/tags/release-0.0.0-rc0/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.HashTableSinkOperator.HashTableSinkObjectCtx;
import org.apache.hadoop.hive.ql.exec.persistence.AbstractMapJoinKey;
import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectValue;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinRowContainer;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Map side Join operator implementation.
 */
public class MapJoinOperator extends AbstractMapJoinOperator<MapJoinDesc> implements Serializable {
  private static final long serialVersionUID = 1L;

  private static final Log LOG = LogFactory.getLog(MapJoinOperator.class.getName());
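
  /**
   * One in-memory hash table per small-table alias, keyed by the alias's
   * position tag; populated from the persisted hash table files in loadHashTable().
   */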
  protected transient Map<Byte, HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>> mapJoinTables;

  private static final transient String[] FATAL_ERR_MSG = {
      null, // counter value 0 means no error
      "Mapside join size exceeds hive.mapjoin.maxsize. "
          + "Please increase that or remove the mapjoin hint."};

  protected transient Map<Byte, MapJoinRowContainer<ArrayList<Object>>> rowContainerMap;

  transient int metadataKeyTag;
  transient int[] metadataValueTag;
  transient int maxMapJoinSize;
  private int bigTableAlias;

  public MapJoinOperator() {
  }

  public MapJoinOperator(AbstractMapJoinOperator<? extends MapJoinDesc> mjop) {
    super(mjop);
  }
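
  /**
   * In addition to the base initialization, reads the configured map-join size
   * limit and creates an empty hash table wrapper and row container for every
   * alias except the big (streamed) table.
   */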
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);

    maxMapJoinSize = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEMAXMAPJOINSIZE);

    metadataValueTag = new int[numAliases];
    for (int pos = 0; pos < numAliases; pos++) {
      metadataValueTag[pos] = -1;
    }

    metadataKeyTag = -1;
    bigTableAlias = order[posBigTable];

    mapJoinTables = new HashMap<Byte, HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>>();
    rowContainerMap = new HashMap<Byte, MapJoinRowContainer<ArrayList<Object>>>();

    // initialize the hash tables for other tables
    for (int pos = 0; pos < numAliases; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashTable =
          new HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>();
      mapJoinTables.put(Byte.valueOf((byte) pos), hashTable);
      MapJoinRowContainer<ArrayList<Object>> rowContainer =
          new MapJoinRowContainer<ArrayList<Object>>();
      rowContainerMap.put(Byte.valueOf((byte) pos), rowContainer);
    }
  }
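
  /**
   * Maps a fatal-error counter code to a readable message; currently the only
   * non-null entry reports that the map-join size limit was exceeded.
   */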
  @Override
  protected void fatalErrorMessage(StringBuilder errMsg, long counterCode) {
    errMsg.append("Operator " + getOperatorId() + " (id=" + id + "): "
        + FATAL_ERR_MSG[(int) counterCode]);
  }
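
  /**
   * Registers the SerDe and standard (writable-copied) ObjectInspector for the
   * join key under metadataKeyTag, and for the values of each small-table alias,
   * so that entries read back from the persisted hash tables can be deserialized.
   */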
  public void generateMapMetaData() throws HiveException, SerDeException {
    // generate the meta data for key
    // index for key is -1
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    SerDe keySerializer = (SerDe) ReflectionUtils.newInstance(
        keyTableDesc.getDeserializerClass(), null);
    keySerializer.initialize(null, keyTableDesc.getProperties());
    MapJoinMetaData.put(Integer.valueOf(metadataKeyTag), new HashTableSinkObjectCtx(
        ObjectInspectorUtils.getStandardObjectInspector(keySerializer.getObjectInspector(),
            ObjectInspectorCopyOption.WRITABLE), keySerializer, keyTableDesc, hconf));

    // index for values is just alias
    for (int tag = 0; tag < order.length; tag++) {
      int alias = (int) order[tag];
      if (alias == this.bigTableAlias) {
        continue;
      }
      TableDesc valueTableDesc = conf.getValueTblDescs().get(tag);
      SerDe valueSerDe = (SerDe) ReflectionUtils.newInstance(
          valueTableDesc.getDeserializerClass(), null);
      valueSerDe.initialize(null, valueTableDesc.getProperties());
      MapJoinMetaData.put(Integer.valueOf(alias), new HashTableSinkObjectCtx(ObjectInspectorUtils
          .getStandardObjectInspector(valueSerDe.getObjectInspector(),
              ObjectInspectorCopyOption.WRITABLE), valueSerDe, valueTableDesc, hconf));
    }
  }
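
  /**
   * Locates the directory holding the persisted hash table files (the job's local
   * tmp directory when running with a local JobTracker, otherwise the unpacked
   * DistributedCache archive for this stage) and loads one hash table file per
   * small-table alias.
   */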
  private void loadHashTable() throws HiveException {
    boolean localMode = HiveConf.getVar(hconf, HiveConf.ConfVars.HADOOPJT).equals("local");
    String baseDir = null;

    String currentInputFile = HiveConf.getVar(hconf, HiveConf.ConfVars.HADOOPMAPFILENAME);
    LOG.info("******* Load from HashTable File: input : " + currentInputFile);

    String currentFileName;
    if (this.getExecContext().getLocalWork().getInputFileChangeSensitive()) {
      currentFileName = this.getFileName(currentInputFile);
    } else {
      currentFileName = "-";
    }

    try {
      if (localMode) {
        baseDir = this.getExecContext().getLocalWork().getTmpFileURI();
      } else {
        String stageID = this.getExecContext().getLocalWork().getStageID();
        String suffix = Utilities.generateTarFileName(stageID);
        FileSystem localFs = FileSystem.getLocal(hconf);
        Path[] localArchives = DistributedCache.getLocalCacheArchives(this.hconf);
        for (Path archive : localArchives) {
          if (!archive.getName().endsWith(suffix)) {
            continue;
          }
          Path archiveLocalLink = archive.makeQualified(localFs);
          baseDir = archiveLocalLink.toUri().getPath();
        }
      }
      for (Map.Entry<Byte, HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>> entry
          : mapJoinTables.entrySet()) {
        Byte pos = entry.getKey();
        HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashtable = entry.getValue();
        String filePath = Utilities.generatePath(baseDir, pos, currentFileName);
        Path path = new Path(filePath);
        LOG.info("\tLoad back 1 hashtable file from tmp file uri:" + path.toString());
        hashtable.initilizePersistentHash(path.toUri().getPath());
      }
    } catch (Exception e) {
      LOG.error("Load Distributed Cache Error", e);
      throw new HiveException(e);
    }
  }

  /**
   * Reloads the hash tables when the mapper's input file changes (e.g. for
   * bucketed map joins), generating the key/value metadata first if this is
   * the first row seen.
   */
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    try {
      if (firstRow) {
        // generate the map metadata
        generateMapMetaData();
        firstRow = false;
      }
      loadHashTable();
    } catch (SerDeException e) {
      throw new HiveException(e);
    }
  }
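
  /**
   * Processes one row from the big (streamed) table: generates the serialization
   * metadata on the first row, computes the row's join key and values, probes the
   * hash table of every other alias, stages the matches (or null padding for outer
   * joins), and emits the joined records via checkAndGenObject().
   */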
  @Override
  public void processOp(Object row, int tag) throws HiveException {
    try {
      if (firstRow) {
        // generate the map metadata
        generateMapMetaData();
        firstRow = false;
      }

      // get alias
      alias = order[tag];

      if ((lastAlias == null) || (!lastAlias.equals(alias))) {
        nextSz = joinEmitInterval;
      }

      // compute keys and values as StandardObjects
      AbstractMapJoinKey key = JoinUtil.computeMapJoinKeys(row, joinKeys.get(alias),
          joinKeysObjectInspectors.get(alias));
      ArrayList<Object> value = JoinUtil.computeValues(row, joinValues.get(alias),
          joinValuesObjectInspectors.get(alias), joinFilters.get(alias),
          joinFilterObjectInspectors.get(alias), noOuterJoin);

      // add the value to the ArrayList
      storage.get((byte) tag).add(value);

      for (Byte pos : order) {
        if (pos.intValue() != tag) {
          MapJoinObjectValue o = mapJoinTables.get(pos).get(key);
          MapJoinRowContainer<ArrayList<Object>> rowContainer = rowContainerMap.get(pos);

          // no matching value, or the join key contains null elements
          if (o == null || key.hasAnyNulls()) {
            if (noOuterJoin) {
              storage.put(pos, emptyList);
            } else {
              storage.put(pos, dummyObjVectors[pos.intValue()]);
            }
          } else {
            rowContainer.reset(o.getObj());
            storage.put(pos, rowContainer);
          }
        }
      }

      // generate the output records
      checkAndGenObject();

      // done with the row
      storage.get((byte) tag).clear();
      for (Byte pos : order) {
        if (pos.intValue() != tag) {
          storage.put(pos, null);
        }
      }
    } catch (SerDeException e) {
      throw new HiveException(e);
    }
  }
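
  /** Returns the last component of the given path, or null if the path is null or empty. */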
  private String getFileName(String path) {
    if (path == null || path.length() == 0) {
      return null;
    }
    int lastSeparator = path.lastIndexOf(Path.SEPARATOR) + 1;
    return path.substring(lastSeparator);
  }
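
  /** Closes each small-table hash table, releasing its resources, before the normal operator shutdown. */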
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (mapJoinTables != null) {
      for (HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashTable : mapJoinTables.values()) {
        hashTable.close();
      }
    }
    super.closeOp(abort);
  }

  /**
   * Implements the getName function for the Node Interface.
   *
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return "MAPJOIN";
  }
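
  /** @return the type of this operator, as declared in the query plan API */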
  @Override
  public OperatorType getType() {
    return OperatorType.MAPJOIN;
  }
}