
/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;

/**
 * Map operator. This triggers overall map side processing. This is a little
 * different from regular operators in that it starts off by processing a
 * Writable data structure from a Table (instead of a Hive Object).
 **/
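// Illustrative driver sketch (not part of this file): an ExecMapper-style
// caller typically exercises this operator roughly as follows; 'job', 'work',
// 'reader', 'key' and 'value' are hypothetical locals.
//
//   MapOperator mo = new MapOperator();
//   mo.initializeAsRoot(job, work);    // setConf + setChildren + initialize
//   while (reader.next(key, value)) {
//     mo.process(value);               // deserialize and forward one row
//   }
//   mo.close(false);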
public class MapOperator extends Operator<MapredWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * Counter.
   */
  public static enum Counter {
    DESERIALIZE_ERRORS
  }

  private final transient LongWritable deserialize_error_count = new LongWritable();
  private transient Deserializer deserializer;

  private transient Object[] rowWithPart;
  private transient Writable[] vcValues;
  private transient List<VirtualColumn> vcs;
  private transient Object[] rowWithPartAndVC;
  private transient StructObjectInspector rowObjectInspector;
  private transient boolean isPartitioned;
  private transient boolean hasVC;
  private Map<MapInputPath, MapOpCtx> opCtxMap;
  private Set<MapInputPath> listInputPaths = new HashSet<MapInputPath>();

  private Map<Operator<? extends Serializable>, ArrayList<String>> operatorToPaths;

  private final ArrayList<String> childrenPaths = new ArrayList<String>();

  private ArrayList<Operator<? extends Serializable>> extraChildrenToClose = null;

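  // Key used to look up per-input state: one entry exists for each
  // (input path, alias, operator) combination taken from the MapredWork plan.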
  private static class MapInputPath {
    String path;
    String alias;
    Operator<? extends Serializable> op;

    /**
     * @param path
     * @param alias
     * @param op
     */
    public MapInputPath(String path, String alias,
        Operator<? extends Serializable> op) {
      this.path = path;
      this.alias = alias;
      this.op = op;
    }

    @Override
    public boolean equals(Object o) {
      if (o instanceof MapInputPath) {
        MapInputPath mObj = (MapInputPath) o;
        return path.equals(mObj.path) && alias.equals(mObj.alias)
            && op.equals(mObj.op);
      }

      return false;
    }

    @Override
    public int hashCode() {
      return (op == null) ? 0 : op.hashCode();
    }

    public Operator<? extends Serializable> getOp() {
      return op;
    }

    public void setOp(Operator<? extends Serializable> op) {
      this.op = op;
    }

  }

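  // Per-input context: the deserializer plus the object inspectors (raw row,
  // partition columns, and their union) needed to turn a Writable read from
  // this input into a row for the operator tree.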
  private static class MapOpCtx {
    boolean isPartitioned;
    StructObjectInspector rawRowObjectInspector; // without partition
    StructObjectInspector partObjectInspector; // partition
    StructObjectInspector rowObjectInspector;
    Object[] rowWithPart;
    Deserializer deserializer;
    public String tableName;
    public String partName;

    /**
     * @param isPartitioned
     * @param rowObjectInspector
     * @param rowWithPart
     */
    public MapOpCtx(boolean isPartitioned,
        StructObjectInspector rowObjectInspector,
        StructObjectInspector rawRowObjectInspector,
        StructObjectInspector partObjectInspector,
        Object[] rowWithPart,
        Deserializer deserializer) {
      this.isPartitioned = isPartitioned;
      this.rowObjectInspector = rowObjectInspector;
      this.rawRowObjectInspector = rawRowObjectInspector;
      this.partObjectInspector = partObjectInspector;
      this.rowWithPart = rowWithPart;
      this.deserializer = deserializer;
    }

    /**
     * @return the isPartitioned
     */
    public boolean isPartitioned() {
      return isPartitioned;
    }

    /**
     * @return the rowObjectInspector
     */
    public StructObjectInspector getRowObjectInspector() {
      return rowObjectInspector;
    }

    /**
     * @return the rowWithPart
     */
    public Object[] getRowWithPart() {
      return rowWithPart;
    }

    /**
     * @return the deserializer
     */
    public Deserializer getDeserializer() {
      return deserializer;
    }
  }

  /**
   * Initializes this map op as the root of the tree. It sets JobConf &
   * MapRedWork and starts initialization of the operator tree rooted at this
   * op.
   *
   * @param hconf
   * @param mrwork
   * @throws HiveException
   */
  public void initializeAsRoot(Configuration hconf, MapredWork mrwork)
      throws HiveException {
    setConf(mrwork);
    setChildren(hconf);
    initialize(hconf, null);
  }

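  // Builds the MapOpCtx for one input path. For a partitioned table the row
  // forwarded downstream is a two-element struct; roughly (hypothetical
  // example, assuming a table partitioned by a single string column 'ds'):
  //
  //   rowWithPart[0] = deserializer.deserialize(value);          // table columns
  //   rowWithPart[1] = new Object[] { new Text("2011-01-01") };  // partition values
  //
  // with rowObjectInspector being the union of the raw row inspector and the
  // partition-column inspector built below.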
  private static MapOpCtx initObjectInspector(MapredWork conf,
      Configuration hconf, String onefile) throws HiveException,
      ClassNotFoundException, InstantiationException, IllegalAccessException,
      SerDeException {
    PartitionDesc td = conf.getPathToPartitionInfo().get(onefile);
    LinkedHashMap<String, String> partSpec = td.getPartSpec();
    Properties tblProps = td.getProperties();

    Class sdclass = td.getDeserializerClass();
    if (sdclass == null) {
      String className = td.getSerdeClassName();
      if (className == null || className.isEmpty()) {
        throw new HiveException(
            "SerDe class or the SerDe class name is not set for table: "
            + td.getProperties().getProperty("name"));
      }
      sdclass = hconf.getClassByName(className);
    }

    String tableName = String.valueOf(tblProps.getProperty("name"));
    String partName = String.valueOf(partSpec);
    // HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME, tableName);
    // HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, partName);
    Deserializer deserializer = (Deserializer) sdclass.newInstance();
    deserializer.initialize(hconf, tblProps);
    StructObjectInspector rawRowObjectInspector = (StructObjectInspector) deserializer
        .getObjectInspector();

    MapOpCtx opCtx = null;
    // Next check if this table has partitions and if so
    // get the list of partition names as well as allocate
    // the serdes for the partition columns
    String pcols = tblProps
        .getProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    // Log LOG = LogFactory.getLog(MapOperator.class.getName());
    if (pcols != null && pcols.length() > 0) {
      String[] partKeys = pcols.trim().split("/");
      List<String> partNames = new ArrayList<String>(partKeys.length);
      Object[] partValues = new Object[partKeys.length];
      List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(
          partKeys.length);
      for (int i = 0; i < partKeys.length; i++) {
        String key = partKeys[i];
        partNames.add(key);
        // Partitions do not exist for this table
        if (partSpec == null) {
          partValues[i] = new Text();
        } else {
          partValues[i] = new Text(partSpec.get(key));
        }
        partObjectInspectors
            .add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
      }
      StructObjectInspector partObjectInspector = ObjectInspectorFactory
          .getStandardStructObjectInspector(partNames, partObjectInspectors);

      Object[] rowWithPart = new Object[2];
      rowWithPart[1] = partValues;
      StructObjectInspector rowObjectInspector = ObjectInspectorFactory
          .getUnionStructObjectInspector(Arrays
          .asList(new StructObjectInspector[] {rawRowObjectInspector, partObjectInspector}));
      // LOG.info("dump " + tableName + " " + partName + " " +
      // rowObjectInspector.getTypeName());
      opCtx = new MapOpCtx(true, rowObjectInspector, rawRowObjectInspector,
          partObjectInspector, rowWithPart, deserializer);
    } else {
      // LOG.info("dump2 " + tableName + " " + partName + " " +
      // rowObjectInspector.getTypeName());
      opCtx = new MapOpCtx(false, rawRowObjectInspector, rawRowObjectInspector,
          null, null, deserializer);
    }
    opCtx.tableName = tableName;
    opCtx.partName = partName;
    return opCtx;
  }

  /**
   * Set the inspectors given an input. Since a mapper can span multiple partitions,
   * the inspectors need to be changed if the input changes.
   **/
  private void setInspectorInput(MapInputPath inp) {
    Operator<? extends Serializable> op = inp.getOp();

    deserializer  = opCtxMap.get(inp).getDeserializer();
    isPartitioned = opCtxMap.get(inp).isPartitioned();
    rowWithPart   = opCtxMap.get(inp).getRowWithPart();
    rowObjectInspector = opCtxMap.get(inp).getRowObjectInspector();
    if (listInputPaths.contains(inp)) {
      return;
    }

    listInputPaths.add(inp);
    StructObjectInspector rawRowObjectInspector = opCtxMap.get(inp).rawRowObjectInspector;
    StructObjectInspector partObjectInspector = opCtxMap.get(inp).partObjectInspector;
    if (op instanceof TableScanOperator) {
      TableScanOperator tsOp = (TableScanOperator) op;
      TableScanDesc tsDesc = tsOp.getConf();
      if (tsDesc != null) {
        this.vcs = tsDesc.getVirtualCols();
        if (vcs != null && vcs.size() > 0) {
          this.hasVC = true;
          List<String> vcNames = new ArrayList<String>(vcs.size());
          this.vcValues = new Writable[vcs.size()];
          List<ObjectInspector> vcsObjectInspectors = new ArrayList<ObjectInspector>(vcs.size());
          for (int i = 0; i < vcs.size(); i++) {
            VirtualColumn vc = vcs.get(i);
            vcsObjectInspectors.add(
              PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                ((PrimitiveTypeInfo) vc.getTypeInfo()).getPrimitiveCategory()));
            vcNames.add(vc.getName());
          }
          StructObjectInspector vcStructObjectInspector = ObjectInspectorFactory
            .getStandardStructObjectInspector(vcNames,
                                              vcsObjectInspectors);
          if (isPartitioned) {
            this.rowWithPartAndVC = new Object[3];
            this.rowWithPartAndVC[1] = this.rowWithPart[1];
          } else {
            this.rowWithPartAndVC = new Object[2];
          }
          if (partObjectInspector == null) {
            this.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
                                        .asList(new StructObjectInspector[] {
                                            rowObjectInspector, vcStructObjectInspector }));
          } else {
            this.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
                                        .asList(new StructObjectInspector[] {
                                            rawRowObjectInspector, partObjectInspector, vcStructObjectInspector }));
          }
          opCtxMap.get(inp).rowObjectInspector = this.rowObjectInspector;
        }
      }
    }
  }

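  // Matching an input file to the aliases that should consume it relies on
  // URI relativization: URI.relativize() returns its argument unchanged when
  // the receiver is not a prefix of it. A sketch of the check used below
  // (the paths are made up for illustration):
  //
  //   URI dir  = new Path("/warehouse/t/ds=1").toUri();
  //   URI file = new Path("/warehouse/t/ds=1/000000_0").toUri();
  //   boolean matches = !dir.relativize(file).equals(file);   // true: dir is a prefix of file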
  public void setChildren(Configuration hconf) throws HiveException {

    Path fpath = new Path((new Path(HiveConf.getVar(hconf,
        HiveConf.ConfVars.HADOOPMAPFILENAME))).toUri().getPath());

    ArrayList<Operator<? extends Serializable>> children = new ArrayList<Operator<? extends Serializable>>();
    opCtxMap = new HashMap<MapInputPath, MapOpCtx>();
    operatorToPaths = new HashMap<Operator<? extends Serializable>, ArrayList<String>>();

    statsMap.put(Counter.DESERIALIZE_ERRORS, deserialize_error_count);

    try {
      boolean done = false;
      for (String onefile : conf.getPathToAliases().keySet()) {
        MapOpCtx opCtx = initObjectInspector(conf, hconf, onefile);
        Path onepath = new Path(new Path(onefile).toUri().getPath());
        List<String> aliases = conf.getPathToAliases().get(onefile);

        for (String onealias : aliases) {
          Operator<? extends Serializable> op = conf.getAliasToWork().get(
              onealias);
          LOG.info("Adding alias " + onealias + " to work list for file "
              + onefile);
          MapInputPath inp = new MapInputPath(onefile, onealias, op);
          opCtxMap.put(inp, opCtx);
          if (operatorToPaths.get(op) == null) {
            operatorToPaths.put(op, new ArrayList<String>());
          }
          operatorToPaths.get(op).add(onefile);

          op.setParentOperators(new ArrayList<Operator<? extends Serializable>>());
          op.getParentOperators().add(this);
          // check for the operators that will process rows coming to this Map
          // Operator
          if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
            children.add(op);
            childrenPaths.add(onefile);
            LOG.info("dump " + op.getName() + " "
                + opCtxMap.get(inp).getRowObjectInspector().getTypeName());
            if (!done) {
              setInspectorInput(inp);
              done = true;
            }
          }
        }
      }
      if (children.size() == 0) {
        // No match was found for the input file path in the configuration.
        // This is a serious problem.
        LOG.error("Configuration does not have any alias for path: "
            + fpath.toUri().getPath());
        throw new HiveException("Configuration and input path are inconsistent");
      }

      // we found all the operators that we are supposed to process.
      setChildOperators(children);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  @Override
  public void initializeOp(Configuration hconf) throws HiveException {
    // set that parent initialization is done and call initialize on children
    state = State.INIT;
    List<Operator<? extends Serializable>> children = getChildOperators();

    for (Entry<MapInputPath, MapOpCtx> entry : opCtxMap.entrySet()) {
      // Add the table name and partition name to the hadoop conf so that the
      // children will inherit them.
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME,
          entry.getValue().tableName);
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, entry
          .getValue().partName);
      MapInputPath input = entry.getKey();
      Operator<? extends Serializable> op = input.op;
      // op is not in the children list, so need to remember it and close it
      // afterwards
      if (children.indexOf(op) == -1) {
        if (extraChildrenToClose == null) {
          extraChildrenToClose = new ArrayList<Operator<? extends Serializable>>();
        }
        extraChildrenToClose.add(op);
      }

      // Multiple input paths may correspond to the same operator (tree). The
      // logic below avoids initializing an operator more than once: if another
      // of the operator's paths is already among this mapper's input paths,
      // initialization is skipped for this entry.
      boolean shouldInit = true;
      List<String> paths = operatorToPaths.get(op);
      for (String path : paths) {
        if (childrenPaths.contains(path) && !path.equals(input.path)) {
          shouldInit = false;
          break;
        }
      }
      if (shouldInit) {
        op.initialize(hconf, new ObjectInspector[] {entry.getValue().getRowObjectInspector()});
      }
    }
  }

  /**
   * Close extra child operators that are initialized but are not executed.
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (extraChildrenToClose != null) {
      for (Operator<? extends Serializable> op : extraChildrenToClose) {
        op.close(abort);
      }
    }
  }

  // Change the serializer etc. since it is a new file, and a split can span
  // multiple files/partitions.
  public void cleanUpInputFileChangedOp() throws HiveException {
    Path fpath = new Path((new Path(this.getExecContext().getCurrentInputFile()))
                          .toUri().getPath());

    for (String onefile : conf.getPathToAliases().keySet()) {
      Path onepath = new Path(new Path(onefile).toUri().getPath());
      // check for the operators that will process rows coming to this Map
      // Operator
      if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
        String onealias = conf.getPathToAliases().get(onefile).get(0);
        Operator<? extends Serializable> op =
          conf.getAliasToWork().get(onealias);

        LOG.info("Processing alias " + onealias + " for file " + onefile);

        MapInputPath inp = new MapInputPath(onefile, onealias, op);
        setInspectorInput(inp);
        break;
      }
    }
  }

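  // Shape of the row handed to forward(), summarizing the branches below:
  //
  //   unpartitioned, no virtual columns:    the deserialized row itself
  //   partitioned,   no virtual columns:    { row, partitionValues }
  //   unpartitioned, with virtual columns:  { row, vcValues }
  //   partitioned,   with virtual columns:  { row, partitionValues, vcValues }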
  public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The deserializers need to be reset if the input file changed.
    if ((this.getExecContext() != null) &&
        this.getExecContext().inputFileChanged()) {
      LOG.info("Processing path " + this.getExecContext().getCurrentInputFile());

      // Let the child operators clean up since the input file has changed.
      cleanUpInputFileChanged();
    }

    Object row = null;
    try {
      if (this.hasVC) {
        this.rowWithPartAndVC[0] = deserializer.deserialize(value);
        int vcPos = isPartitioned ? 2 : 1;
        populateVirtualColumnValues();
        this.rowWithPartAndVC[vcPos] = this.vcValues;
      } else if (!isPartitioned) {
        row = deserializer.deserialize(value);
      } else {
        rowWithPart[0] = deserializer.deserialize(value);
      }
    } catch (Exception e) {
      // Render the raw writable for the error message.
      String rawRowString;
      try {
        rawRowString = value.toString();
      } catch (Exception e2) {
        rawRowString = "[Error getting row data with exception " +
            StringUtils.stringifyException(e2) + " ]";
      }

      // TODO: policy on deserialization errors
      deserialize_error_count.set(deserialize_error_count.get() + 1);
      throw new HiveException("Hive Runtime Error while processing writable " + rawRowString, e);
    }

    try {
      if (this.hasVC) {
        forward(this.rowWithPartAndVC, this.rowObjectInspector);
      } else if (!isPartitioned) {
        forward(row, rowObjectInspector);
      } else {
        forward(rowWithPart, rowObjectInspector);
      }
    } catch (Exception e) {
      // Serialize the row to JSON for the error message.
      String rowString;
      try {
        if (this.hasVC) {
          rowString = SerDeUtils.getJSONString(rowWithPartAndVC, rowObjectInspector);
        } else if (!isPartitioned) {
          rowString = SerDeUtils.getJSONString(row, rowObjectInspector);
        } else {
          rowString = SerDeUtils.getJSONString(rowWithPart, rowObjectInspector);
        }
      } catch (Exception e2) {
        rowString = "[Error getting row data with exception " +
            StringUtils.stringifyException(e2) + " ]";
      }
      throw new HiveException("Hive Runtime Error while processing row " + rowString, e);
    }
  }

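  // Fills vcValues for the virtual columns requested by the TableScanOperator.
  // Only FILENAME and BLOCKOFFSET are handled here: the file name is refreshed
  // only when the input file changes, and the block-offset LongWritable is
  // reused across rows to avoid allocating a new object per row.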
  private void populateVirtualColumnValues() {
    if (this.vcs != null) {
      ExecMapperContext mapExecCxt = this.getExecContext();
      IOContext ioCxt = mapExecCxt.getIoCxt();
      for (int i = 0; i < vcs.size(); i++) {
        VirtualColumn vc = vcs.get(i);
        if (vc.equals(VirtualColumn.FILENAME) && mapExecCxt.inputFileChanged()) {
          this.vcValues[i] = new Text(mapExecCxt.getCurrentInputFile());
        } else if (vc.equals(VirtualColumn.BLOCKOFFSET)) {
          long current = ioCxt.getCurrentBlockStart();
          LongWritable old = (LongWritable) this.vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            this.vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        }
      }
    }
  }

  @Override
  public void processOp(Object row, int tag) throws HiveException {
    throw new HiveException("Hive 2 Internal error: should not be called!");
  }

  @Override
  public String getName() {
    return "MAP";
  }

  @Override
  public OperatorType getType() {
    return null;
  }

}