
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
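
// A rough sketch of how the surrounding map-side runner typically drives the
// operator below (the driver variables and record-reader loop here are
// assumptions for illustration, not part of this file):
//
//   MapOperator mapOp = new MapOperator();
//   mapOp.initializeAsRoot(jobConf, mapredWork);   // wires children from the plan
//   while (reader.next(key, value)) {
//     mapOp.process(value);                        // deserialize + forward each record
//   }
//   mapOp.close(abort);                            // also closes extraChildrenToClose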

/**
 * Map operator. This triggers overall map side processing. This is a little
 * different from regular operators in that it starts off by processing a
 * Writable data structure from a Table (instead of a Hive Object).
 **/
public class MapOperator extends Operator<MapredWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * Counter.
   *
   */
  public static enum Counter {
    DESERIALIZE_ERRORS
  }

  private final transient LongWritable deserialize_error_count = new LongWritable();
  private transient Deserializer deserializer;

  private transient Object[] rowWithPart;
  private transient Writable[] vcValues;
  private transient List<VirtualColumn> vcs;
  private transient Object[] rowWithPartAndVC;
  private transient StructObjectInspector rowObjectInspector;
  private transient boolean isPartitioned;
  private transient boolean hasVC;

  private Map<MapInputPath, MapOpCtx> opCtxMap;
  private Set<MapInputPath> listInputPaths = new HashSet<MapInputPath>();
  private Map<Operator<? extends Serializable>, java.util.ArrayList<String>> operatorToPaths;

  private final java.util.ArrayList<String> childrenPaths = new ArrayList<String>();

  private ArrayList<Operator<? extends Serializable>> extraChildrenToClose = null;
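
  // MapInputPath keys opCtxMap: one entry per (input path, alias, operator)
  // combination taken from the plan's pathToAliases / aliasToWork mappings.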
  private static class MapInputPath {
    String path;
    String alias;
    Operator<? extends Serializable> op;

    /**
     * @param path
     * @param alias
     * @param op
     */
    public MapInputPath(String path, String alias,
        Operator<? extends Serializable> op) {
      this.path = path;
      this.alias = alias;
      this.op = op;
    }

    @Override
    public boolean equals(Object o) {
      if (o instanceof MapInputPath) {
        MapInputPath mObj = (MapInputPath) o;
        if (mObj == null) {
          return false;
        }
        return path.equals(mObj.path) && alias.equals(mObj.alias)
            && op.equals(mObj.op);
      }
      return false;
    }

    @Override
    public int hashCode() {
      return (op == null) ? 0 : op.hashCode();
    }

    public Operator<? extends Serializable> getOp() {
      return op;
    }

    public void setOp(Operator<? extends Serializable> op) {
      this.op = op;
    }
  }
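
  // MapOpCtx caches everything needed to decode rows for one input path: the
  // deserializer, the object inspectors (raw, partition, combined) and the
  // reusable rowWithPart buffer used for partitioned inputs.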
  private static class MapOpCtx {
    boolean isPartitioned;
    StructObjectInspector rawRowObjectInspector; // without partition
    StructObjectInspector partObjectInspector; // partition
    StructObjectInspector rowObjectInspector;
    Object[] rowWithPart;
    Deserializer deserializer;
    public String tableName;
    public String partName;

    /**
     * @param isPartitioned
     * @param rowObjectInspector
     * @param rowWithPart
     */
    public MapOpCtx(boolean isPartitioned,
        StructObjectInspector rowObjectInspector,
        StructObjectInspector rawRowObjectInspector,
        StructObjectInspector partObjectInspector,
        Object[] rowWithPart,
        Deserializer deserializer) {
      this.isPartitioned = isPartitioned;
      this.rowObjectInspector = rowObjectInspector;
      this.rawRowObjectInspector = rawRowObjectInspector;
      this.partObjectInspector = partObjectInspector;
      this.rowWithPart = rowWithPart;
      this.deserializer = deserializer;
    }

    /**
     * @return the isPartitioned
     */
    public boolean isPartitioned() {
      return isPartitioned;
    }

    /**
     * @return the rowObjectInspector
     */
    public StructObjectInspector getRowObjectInspector() {
      return rowObjectInspector;
    }

    /**
     * @return the rowWithPart
     */
    public Object[] getRowWithPart() {
      return rowWithPart;
    }

    /**
     * @return the deserializer
     */
    public Deserializer getDeserializer() {
      return deserializer;
    }
  }

  /**
   * Initializes this map op as the root of the tree. It sets JobConf &
   * MapRedWork and starts initialization of the operator tree rooted at this
   * op.
   *
   * @param hconf
   * @param mrwork
   * @throws HiveException
   */
  public void initializeAsRoot(Configuration hconf, MapredWork mrwork)
      throws HiveException {
    setConf(mrwork);
    setChildren(hconf);
    initialize(hconf, null);
  }
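
  // Builds the MapOpCtx for a single input path: instantiates the SerDe named
  // in the partition descriptor and, if the table is partitioned, wraps the
  // raw row inspector and the partition-column inspector into one union
  // struct inspector.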
  private static MapOpCtx initObjectInspector(MapredWork conf,
      Configuration hconf, String onefile) throws HiveException,
      ClassNotFoundException, InstantiationException, IllegalAccessException,
      SerDeException {
    PartitionDesc td = conf.getPathToPartitionInfo().get(onefile);
    LinkedHashMap<String, String> partSpec = td.getPartSpec();
    Properties tblProps = td.getProperties();

    Class sdclass = td.getDeserializerClass();
    if (sdclass == null) {
      String className = td.getSerdeClassName();
      if ((className == null) || className.equals("")) {
        throw new HiveException(
            "SerDe class or the SerDe class name is not set for table: "
                + td.getProperties().getProperty("name"));
      }
      sdclass = hconf.getClassByName(className);
    }

    String tableName = String.valueOf(tblProps.getProperty("name"));
    String partName = String.valueOf(partSpec);
    // HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME, tableName);
    // HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, partName);
    Deserializer deserializer = (Deserializer) sdclass.newInstance();
    deserializer.initialize(hconf, tblProps);
    StructObjectInspector rawRowObjectInspector = (StructObjectInspector) deserializer
        .getObjectInspector();

    MapOpCtx opCtx = null;
    // Next check if this table has partitions and if so
    // get the list of partition names as well as allocate
    // the serdes for the partition columns
    String pcols = tblProps
        .getProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    // Log LOG = LogFactory.getLog(MapOperator.class.getName());
    if (pcols != null && pcols.length() > 0) {
      String[] partKeys = pcols.trim().split("/");
      List<String> partNames = new ArrayList<String>(partKeys.length);
      Object[] partValues = new Object[partKeys.length];
      List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(
          partKeys.length);
      for (int i = 0; i < partKeys.length; i++) {
        String key = partKeys[i];
        partNames.add(key);
        // Partitions do not exist for this table
        if (partSpec == null) {
          partValues[i] = new Text();
        } else {
          partValues[i] = new Text(partSpec.get(key));
        }
        partObjectInspectors
            .add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
      }
      StructObjectInspector partObjectInspector = ObjectInspectorFactory
          .getStandardStructObjectInspector(partNames, partObjectInspectors);

      Object[] rowWithPart = new Object[2];
      rowWithPart[1] = partValues;
      StructObjectInspector rowObjectInspector = ObjectInspectorFactory
          .getUnionStructObjectInspector(Arrays
              .asList(new StructObjectInspector[] {rawRowObjectInspector, partObjectInspector}));
      // LOG.info("dump " + tableName + " " + partName + " " +
      // rowObjectInspector.getTypeName());
      opCtx = new MapOpCtx(true, rowObjectInspector, rawRowObjectInspector,
          partObjectInspector, rowWithPart, deserializer);
    } else {
      // LOG.info("dump2 " + tableName + " " + partName + " " +
      // rowObjectInspector.getTypeName());
      opCtx = new MapOpCtx(false, rawRowObjectInspector, rawRowObjectInspector,
          null, null, deserializer);
    }
    opCtx.tableName = tableName;
    opCtx.partName = partName;
    return opCtx;
  }

  /**
   * Sets the inspectors for a given input. Since a mapper can span multiple
   * partitions, the inspectors need to be changed if the input changes.
   **/
  private void setInspectorInput(MapInputPath inp) {
    Operator<? extends Serializable> op = inp.getOp();

    deserializer = opCtxMap.get(inp).getDeserializer();
    isPartitioned = opCtxMap.get(inp).isPartitioned();
    rowWithPart = opCtxMap.get(inp).getRowWithPart();
    rowObjectInspector = opCtxMap.get(inp).getRowObjectInspector();
    if (listInputPaths.contains(inp)) {
      return;
    }

    listInputPaths.add(inp);
    StructObjectInspector rawRowObjectInspector = opCtxMap.get(inp).rawRowObjectInspector;
    StructObjectInspector partObjectInspector = opCtxMap.get(inp).partObjectInspector;
    if (op instanceof TableScanOperator) {
      TableScanOperator tsOp = (TableScanOperator) op;
      TableScanDesc tsDesc = tsOp.getConf();
      if (tsDesc != null) {
        this.vcs = tsDesc.getVirtualCols();
        if (vcs != null && vcs.size() > 0) {
          this.hasVC = true;
          List<String> vcNames = new ArrayList<String>(vcs.size());
          this.vcValues = new Writable[vcs.size()];
          List<ObjectInspector> vcsObjectInspectors = new ArrayList<ObjectInspector>(vcs.size());
          for (int i = 0; i < vcs.size(); i++) {
            VirtualColumn vc = vcs.get(i);
            vcsObjectInspectors.add(
                PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                    ((PrimitiveTypeInfo) vc.getTypeInfo()).getPrimitiveCategory()));
            vcNames.add(vc.getName());
          }
          StructObjectInspector vcStructObjectInspector = ObjectInspectorFactory
              .getStandardStructObjectInspector(vcNames,
                  vcsObjectInspectors);
          if (isPartitioned) {
            this.rowWithPartAndVC = new Object[3];
            this.rowWithPartAndVC[1] = this.rowWithPart[1];
          } else {
            this.rowWithPartAndVC = new Object[2];
          }
          if (partObjectInspector == null) {
            this.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
                .asList(new StructObjectInspector[] {
                    rowObjectInspector, vcStructObjectInspector}));
          } else {
            this.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
                .asList(new StructObjectInspector[] {
                    rawRowObjectInspector, partObjectInspector, vcStructObjectInspector}));
          }
          opCtxMap.get(inp).rowObjectInspector = this.rowObjectInspector;
        }
      }
    }
  }
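
  // Determines which operator trees should receive rows from the file this
  // mapper is reading (by matching the current input path against the plan's
  // pathToAliases) and registers them as children of this MapOperator.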
  public void setChildren(Configuration hconf) throws HiveException {

    Path fpath = new Path((new Path(HiveConf.getVar(hconf,
        HiveConf.ConfVars.HADOOPMAPFILENAME))).toUri().getPath());

    ArrayList<Operator<? extends Serializable>> children = new ArrayList<Operator<? extends Serializable>>();
    opCtxMap = new HashMap<MapInputPath, MapOpCtx>();
    operatorToPaths = new HashMap<Operator<? extends Serializable>, java.util.ArrayList<String>>();

    statsMap.put(Counter.DESERIALIZE_ERRORS, deserialize_error_count);

    try {
      boolean done = false;
      for (String onefile : conf.getPathToAliases().keySet()) {
        MapOpCtx opCtx = initObjectInspector(conf, hconf, onefile);
        Path onepath = new Path(new Path(onefile).toUri().getPath());
        List<String> aliases = conf.getPathToAliases().get(onefile);

        for (String onealias : aliases) {
          Operator<? extends Serializable> op = conf.getAliasToWork().get(
              onealias);
          LOG.info("Adding alias " + onealias + " to work list for file "
              + onefile);
          MapInputPath inp = new MapInputPath(onefile, onealias, op);
          opCtxMap.put(inp, opCtx);
          if (operatorToPaths.get(op) == null) {
            operatorToPaths.put(op, new java.util.ArrayList<String>());
          }
          operatorToPaths.get(op).add(onefile);
          op.setParentOperators(new ArrayList<Operator<? extends Serializable>>());
          op.getParentOperators().add(this);
          // check for the operators who will process rows coming to this Map
          // Operator
          if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
            children.add(op);
            childrenPaths.add(onefile);
            LOG.info("dump " + op.getName() + " "
                + opCtxMap.get(inp).getRowObjectInspector().getTypeName());
            if (!done) {
              setInspectorInput(inp);
              done = true;
            }
          }
        }
      }
      if (children.size() == 0) {
        // didn't find match for input file path in configuration!
        // serious problem ..
        LOG.error("Configuration does not have any alias for path: "
            + fpath.toUri().getPath());
        throw new HiveException("Configuration and input path are inconsistent");
      }

      // we found all the operators that we are supposed to process.
      setChildOperators(children);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  @Override
  public void initializeOp(Configuration hconf) throws HiveException {
    // set that parent initialization is done and call initialize on children
    state = State.INIT;
    List<Operator<? extends Serializable>> children = getChildOperators();

    for (Entry<MapInputPath, MapOpCtx> entry : opCtxMap.entrySet()) {
      // Add alias, table name, and partitions to hadoop conf so that their
      // children will inherit these
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME,
          entry.getValue().tableName);
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, entry
          .getValue().partName);
      MapInputPath input = entry.getKey();
      Operator<? extends Serializable> op = input.op;
      // op is not in the children list, so need to remember it and close it
      // afterwards
      if (children.indexOf(op) == -1) {
        if (extraChildrenToClose == null) {
          extraChildrenToClose = new ArrayList<Operator<? extends Serializable>>();
        }
        extraChildrenToClose.add(op);
      }

      // Multiple input paths may correspond to the same operator (tree). The
      // logic below avoids initializing one operator multiple times when more
      // than one of its input paths is among this mapper's input paths.
      boolean shouldInit = true;
      List<String> paths = operatorToPaths.get(op);
      for (String path : paths) {
        if (childrenPaths.contains(path) && !path.equals(input.path)) {
          shouldInit = false;
          break;
        }
      }
      if (shouldInit) {
        op.initialize(hconf, new ObjectInspector[] {entry.getValue().getRowObjectInspector()});
      }
    }
  }

  /**
   * close extra child operators that are initialized but are not executed.
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (extraChildrenToClose != null) {
      for (Operator<? extends Serializable> op : extraChildrenToClose) {
        op.close(abort);
      }
    }
  }

  // Change the serializer etc. since it is a new file, and split can span
  // multiple files/partitions.
  public void cleanUpInputFileChangedOp() throws HiveException {
    Path fpath = new Path((new Path(this.getExecContext().getCurrentInputFile()))
        .toUri().getPath());

    for (String onefile : conf.getPathToAliases().keySet()) {
      Path onepath = new Path(new Path(onefile).toUri().getPath());
      // check for the operators who will process rows coming to this Map
      // Operator
      if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
        String onealias = conf.getPathToAliases().get(onefile).get(0);
        Operator<? extends Serializable> op =
            conf.getAliasToWork().get(onealias);

        LOG.info("Processing alias " + onealias + " for file " + onefile);

        MapInputPath inp = new MapInputPath(onefile, onealias, op);
        setInspectorInput(inp);
        break;
      }
    }
  }
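
  // Row layouts forwarded by process():
  //   plain table:             the deserialized row object
  //   partitioned table:       rowWithPart      = { row, partition values }
  //   with virtual columns:    rowWithPartAndVC = { row, [partition values,] vcValues }
  // which matches vcPos below (2 when partitioned, 1 otherwise).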
  public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The serializers need to be reset if the input file changed
    if ((this.getExecContext() != null) &&
        this.getExecContext().inputFileChanged()) {
      LOG.info("Processing path " + this.getExecContext().getCurrentInputFile());
      // The child operators cleanup if input file has changed
      cleanUpInputFileChanged();
    }

    Object row = null;
    try {
      if (this.hasVC) {
        this.rowWithPartAndVC[0] = deserializer.deserialize(value);
        int vcPos = isPartitioned ? 2 : 1;
        populateVirtualColumnValues();
        this.rowWithPartAndVC[vcPos] = this.vcValues;
      } else if (!isPartitioned) {
        row = deserializer.deserialize((Writable) value);
      } else {
        rowWithPart[0] = deserializer.deserialize((Writable) value);
      }
    } catch (Exception e) {
      // Serialize the row and output.
      String rawRowString;
      try {
        rawRowString = value.toString();
      } catch (Exception e2) {
        rawRowString = "[Error getting row data with exception " +
            StringUtils.stringifyException(e2) + " ]";
      }

      // TODO: policy on deserialization errors
      deserialize_error_count.set(deserialize_error_count.get() + 1);
      throw new HiveException("Hive Runtime Error while processing writable " + rawRowString, e);
    }

    try {
      if (this.hasVC) {
        forward(this.rowWithPartAndVC, this.rowObjectInspector);
      } else if (!isPartitioned) {
        forward(row, rowObjectInspector);
      } else {
        forward(rowWithPart, rowObjectInspector);
      }
    } catch (Exception e) {
      // Serialize the row and output the error message.
      String rowString;
      try {
        if (this.hasVC) {
          rowString = SerDeUtils.getJSONString(rowWithPartAndVC, rowObjectInspector);
        } else if (!isPartitioned) {
          rowString = SerDeUtils.getJSONString(row, rowObjectInspector);
        } else {
          rowString = SerDeUtils.getJSONString(rowWithPart, rowObjectInspector);
        }
      } catch (Exception e2) {
        rowString = "[Error getting row data with exception " +
            StringUtils.stringifyException(e2) + " ]";
      }
      throw new HiveException("Hive Runtime Error while processing row " + rowString, e);
    }
  }
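
  // Fills vcValues for the virtual columns exposed by the table scan:
  // FILENAME is refreshed only when the input file changes, and BLOCKOFFSET
  // reuses the same LongWritable to avoid allocating a new object per row.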
  private void populateVirtualColumnValues() {
    if (this.vcs != null) {
      ExecMapperContext mapExecCxt = this.getExecContext();
      IOContext ioCxt = mapExecCxt.getIoCxt();
      for (int i = 0; i < vcs.size(); i++) {
        VirtualColumn vc = vcs.get(i);
        if (vc.equals(VirtualColumn.FILENAME) && mapExecCxt.inputFileChanged()) {
          this.vcValues[i] = new Text(mapExecCxt.getCurrentInputFile());
        } else if (vc.equals(VirtualColumn.BLOCKOFFSET)) {
          long current = ioCxt.getCurrentBlockStart();
          LongWritable old = (LongWritable) this.vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            this.vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        }
      }
    }
  }

  @Override
  public void processOp(Object row, int tag) throws HiveException {
    throw new HiveException("Hive 2 Internal error: should not be called!");
  }

  @Override
  public String getName() {
    return "MAP";
  }

  @Override
  public OperatorType getType() {
    return null;
  }
}