/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
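
// A minimal usage sketch (an assumption-laden illustration, not Hive's actual
// driver code; in practice ExecMapper wires this operator up). "job", "work",
// "reader", "key", and "value" below are hypothetical stand-ins:
//
//   MapOperator mo = new MapOperator();
//   mo.initializeAsRoot(job, work);    // set MapredWork, wire children, init tree
//   while (reader.next(key, value)) {  // hypothetical record-reader loop
//     mo.process(value);               // deserialize and forward one row
//   }
//   mo.close(false);                   // close the operator tree; false = no abort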
/**
 * Map operator. This triggers overall map-side processing. This is a little
 * different from regular operators in that it starts off by processing a
 * Writable data structure from a Table (instead of a Hive Object).
 */
public class MapOperator extends Operator<MapredWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * Counters published by this operator; currently only tracks rows that
   * failed to deserialize.
   */
  public static enum Counter {
    DESERIALIZE_ERRORS
  }
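
  // Per-input state: these fields mirror the MapOpCtx of whichever input
  // path/partition the mapper is currently reading, and are swapped by
  // setInspectorInput() when the input file changes.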
  private final transient LongWritable deserialize_error_count = new LongWritable();
  private transient Deserializer deserializer;

  private transient Object[] rowWithPart;
  private transient Writable[] vcValues;
  private transient List<VirtualColumn> vcs;
  private transient Object[] rowWithPartAndVC;
  private transient StructObjectInspector rowObjectInspector;
  private transient boolean isPartitioned;
  private transient boolean hasVC;
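
  // Bookkeeping: the context for each (path, alias, operator) input, the
  // inputs whose inspectors have already been set up, the paths feeding each
  // operator tree, and the paths that actually feed this mapper's children.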
  private Map<MapInputPath, MapOpCtx> opCtxMap;
  private Set<MapInputPath> listInputPaths = new HashSet<MapInputPath>();
  private Map<Operator<? extends Serializable>, List<String>> operatorToPaths;
  private final List<String> childrenPaths = new ArrayList<String>();

  private List<Operator<? extends Serializable>> extraChildrenToClose = null;
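
  /**
   * Identifies a single input to this map task: an input path, the table
   * alias it is read under, and the root operator of the alias's plan tree.
   * Used as the key for per-input contexts in opCtxMap.
   */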
  private static class MapInputPath {
    String path;
    String alias;
    Operator<? extends Serializable> op;

    /**
     * @param path  the input path
     * @param alias the table alias reading this path
     * @param op    the root operator of the plan tree for this alias
     */
    public MapInputPath(String path, String alias,
        Operator<? extends Serializable> op) {
      this.path = path;
      this.alias = alias;
      this.op = op;
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof MapInputPath)) {
        return false;
      }
      MapInputPath mObj = (MapInputPath) o;
      return path.equals(mObj.path) && alias.equals(mObj.alias)
          && op.equals(mObj.op);
    }

    @Override
    public int hashCode() {
      // Hashing on op alone is consistent with equals(): equal instances
      // share the same op, and hence the same hash code.
      return (op == null) ? 0 : op.hashCode();
    }

    public Operator<? extends Serializable> getOp() {
      return op;
    }

    public void setOp(Operator<? extends Serializable> op) {
      this.op = op;
    }
  }
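
  /**
   * Per-input context: the deserializer and object inspectors for one
   * (path, alias) input, plus the reusable rowWithPart buffer for
   * partitioned tables.
   */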
  private static class MapOpCtx {
    boolean isPartitioned;
    StructObjectInspector rawRowObjectInspector; // without partition columns
    StructObjectInspector partObjectInspector;   // partition columns only
    StructObjectInspector rowObjectInspector;
    Object[] rowWithPart;
    Deserializer deserializer;
    public String tableName;
    public String partName;

    /**
     * @param isPartitioned         whether the input table is partitioned
     * @param rowObjectInspector    inspector for the full row (raw plus partition, if any)
     * @param rawRowObjectInspector inspector for the row without partition columns
     * @param partObjectInspector   inspector for the partition columns, or null
     * @param rowWithPart           reusable {row, partitionValues} buffer, or null
     * @param deserializer          deserializer for this input's rows
     */
    public MapOpCtx(boolean isPartitioned,
        StructObjectInspector rowObjectInspector,
        StructObjectInspector rawRowObjectInspector,
        StructObjectInspector partObjectInspector,
        Object[] rowWithPart,
        Deserializer deserializer) {
      this.isPartitioned = isPartitioned;
      this.rowObjectInspector = rowObjectInspector;
      this.rawRowObjectInspector = rawRowObjectInspector;
      this.partObjectInspector = partObjectInspector;
      this.rowWithPart = rowWithPart;
      this.deserializer = deserializer;
    }

    /**
     * @return whether the input table is partitioned
     */
    public boolean isPartitioned() {
      return isPartitioned;
    }

    /**
     * @return the rowObjectInspector
     */
    public StructObjectInspector getRowObjectInspector() {
      return rowObjectInspector;
    }

    /**
     * @return the rowWithPart
     */
    public Object[] getRowWithPart() {
      return rowWithPart;
    }

    /**
     * @return the deserializer
     */
    public Deserializer getDeserializer() {
      return deserializer;
    }
  }
  /**
   * Initializes this map op as the root of the tree. It sets the JobConf
   * and MapredWork and starts initialization of the operator tree rooted
   * at this op.
   *
   * @param hconf  the job configuration
   * @param mrwork the map-side plan for this task
   * @throws HiveException
   */
  public void initializeAsRoot(Configuration hconf, MapredWork mrwork)
      throws HiveException {
    setConf(mrwork);
    setChildren(hconf);
    initialize(hconf, null);
  }
  private static MapOpCtx initObjectInspector(MapredWork conf,
      Configuration hconf, String onefile) throws HiveException,
      ClassNotFoundException, InstantiationException, IllegalAccessException,
      SerDeException {
    PartitionDesc td = conf.getPathToPartitionInfo().get(onefile);
    LinkedHashMap<String, String> partSpec = td.getPartSpec();
    Properties tblProps = td.getProperties();

    Class<?> sdclass = td.getDeserializerClass();
    if (sdclass == null) {
      String className = td.getSerdeClassName();
      // Note: compare with equals(), not ==; reference comparison of strings
      // only works for interned literals.
      if (className == null || className.equals("")) {
        throw new HiveException(
            "SerDe class or the SerDe class name is not set for table: "
                + td.getProperties().getProperty("name"));
      }
      sdclass = hconf.getClassByName(className);
    }

    String tableName = String.valueOf(tblProps.getProperty("name"));
    String partName = String.valueOf(partSpec);
    Deserializer deserializer = (Deserializer) sdclass.newInstance();
    deserializer.initialize(hconf, tblProps);
    StructObjectInspector rawRowObjectInspector =
        (StructObjectInspector) deserializer.getObjectInspector();

    MapOpCtx opCtx = null;
    // Next check if this table has partitions and if so
    // get the list of partition names as well as allocate
    // the serdes for the partition columns
    String pcols = tblProps.getProperty(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    if (pcols != null && pcols.length() > 0) {
      String[] partKeys = pcols.trim().split("/");
      List<String> partNames = new ArrayList<String>(partKeys.length);
      Object[] partValues = new Object[partKeys.length];
      List<ObjectInspector> partObjectInspectors =
          new ArrayList<ObjectInspector>(partKeys.length);
      for (int i = 0; i < partKeys.length; i++) {
        String key = partKeys[i];
        partNames.add(key);
        // A null partSpec means no partition values exist for this input;
        // emit an empty Text for each partition column.
        if (partSpec == null) {
          partValues[i] = new Text();
        } else {
          partValues[i] = new Text(partSpec.get(key));
        }
        partObjectInspectors
            .add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
      }
      StructObjectInspector partObjectInspector = ObjectInspectorFactory
          .getStandardStructObjectInspector(partNames, partObjectInspectors);

      // rowWithPart[0] is filled with the deserialized row for each record;
      // rowWithPart[1] holds the constant partition values.
      Object[] rowWithPart = new Object[2];
      rowWithPart[1] = partValues;
      StructObjectInspector rowObjectInspector = ObjectInspectorFactory
          .getUnionStructObjectInspector(Arrays.asList(
              new StructObjectInspector[] {rawRowObjectInspector, partObjectInspector}));
      opCtx = new MapOpCtx(true, rowObjectInspector, rawRowObjectInspector,
          partObjectInspector, rowWithPart, deserializer);
    } else {
      opCtx = new MapOpCtx(false, rawRowObjectInspector, rawRowObjectInspector,
          null, null, deserializer);
    }
    opCtx.tableName = tableName;
    opCtx.partName = partName;
    return opCtx;
  }
  /**
   * Set the inspectors for a given input. Since a mapper can span multiple
   * partitions, the inspectors need to be changed whenever the input changes.
   */
  private void setInspectorInput(MapInputPath inp) {
    Operator<? extends Serializable> op = inp.getOp();
    MapOpCtx ctx = opCtxMap.get(inp);

    deserializer = ctx.getDeserializer();
    isPartitioned = ctx.isPartitioned();
    rowWithPart = ctx.getRowWithPart();
    rowObjectInspector = ctx.getRowObjectInspector();
    if (listInputPaths.contains(inp)) {
      return;
    }

    listInputPaths.add(inp);
    StructObjectInspector rawRowObjectInspector = ctx.rawRowObjectInspector;
    StructObjectInspector partObjectInspector = ctx.partObjectInspector;
    if (op instanceof TableScanOperator) {
      TableScanOperator tsOp = (TableScanOperator) op;
      TableScanDesc tsDesc = tsOp.getConf();
      if (tsDesc != null) {
        this.vcs = tsDesc.getVirtualCols();
        if (vcs != null && vcs.size() > 0) {
          this.hasVC = true;
          List<String> vcNames = new ArrayList<String>(vcs.size());
          this.vcValues = new Writable[vcs.size()];
          List<ObjectInspector> vcsObjectInspectors =
              new ArrayList<ObjectInspector>(vcs.size());
          for (int i = 0; i < vcs.size(); i++) {
            VirtualColumn vc = vcs.get(i);
            vcsObjectInspectors.add(
                PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                    ((PrimitiveTypeInfo) vc.getTypeInfo()).getPrimitiveCategory()));
            vcNames.add(vc.getName());
          }
          StructObjectInspector vcStructObjectInspector = ObjectInspectorFactory
              .getStandardStructObjectInspector(vcNames, vcsObjectInspectors);

          // Row layout with virtual columns:
          //   partitioned:   {row, partitionValues, vcValues}
          //   unpartitioned: {row, vcValues}
          if (isPartitioned) {
            this.rowWithPartAndVC = new Object[3];
            this.rowWithPartAndVC[1] = this.rowWithPart[1];
          } else {
            this.rowWithPartAndVC = new Object[2];
          }
          if (partObjectInspector == null) {
            this.rowObjectInspector = ObjectInspectorFactory
                .getUnionStructObjectInspector(Arrays.asList(
                    new StructObjectInspector[] {
                        rowObjectInspector, vcStructObjectInspector}));
          } else {
            this.rowObjectInspector = ObjectInspectorFactory
                .getUnionStructObjectInspector(Arrays.asList(
                    new StructObjectInspector[] {
                        rawRowObjectInspector, partObjectInspector,
                        vcStructObjectInspector}));
          }
          ctx.rowObjectInspector = this.rowObjectInspector;
        }
      }
    }
  }
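
  /**
   * Builds the child operator trees for this mapper. For every input path in
   * the plan, a per-input context is created; the aliases whose paths match
   * the current input file become this operator's children.
   */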
  public void setChildren(Configuration hconf) throws HiveException {
    Path fpath = new Path((new Path(HiveConf.getVar(hconf,
        HiveConf.ConfVars.HADOOPMAPFILENAME))).toUri().getPath());

    ArrayList<Operator<? extends Serializable>> children =
        new ArrayList<Operator<? extends Serializable>>();
    opCtxMap = new HashMap<MapInputPath, MapOpCtx>();
    operatorToPaths = new HashMap<Operator<? extends Serializable>, List<String>>();

    statsMap.put(Counter.DESERIALIZE_ERRORS, deserialize_error_count);

    try {
      boolean done = false;
      for (String onefile : conf.getPathToAliases().keySet()) {
        MapOpCtx opCtx = initObjectInspector(conf, hconf, onefile);
        Path onepath = new Path(new Path(onefile).toUri().getPath());
        List<String> aliases = conf.getPathToAliases().get(onefile);
        for (String onealias : aliases) {
          Operator<? extends Serializable> op =
              conf.getAliasToWork().get(onealias);
          LOG.info("Adding alias " + onealias + " to work list for file "
              + onefile);
          MapInputPath inp = new MapInputPath(onefile, onealias, op);
          opCtxMap.put(inp, opCtx);
          if (operatorToPaths.get(op) == null) {
            operatorToPaths.put(op, new ArrayList<String>());
          }
          operatorToPaths.get(op).add(onefile);
          op.setParentOperators(new ArrayList<Operator<? extends Serializable>>());
          op.getParentOperators().add(this);
          // Check whether this alias's path is a prefix of the file this
          // mapper is reading (relativize() returns fpath unchanged when it
          // is not); if so, the operator processes rows coming to this
          // MapOperator.
          if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
            children.add(op);
            childrenPaths.add(onefile);
            LOG.info("dump " + op.getName() + " "
                + opCtxMap.get(inp).getRowObjectInspector().getTypeName());
            if (!done) {
              setInspectorInput(inp);
              done = true;
            }
          }
        }
      }
      if (children.size() == 0) {
        // No alias in the configuration matches the input file path; the
        // plan and the input are inconsistent.
        LOG.error("Configuration does not have any alias for path: "
            + fpath.toUri().getPath());
        throw new HiveException("Configuration and input path are inconsistent");
      }
      // we found all the operators that we are supposed to process
      setChildOperators(children);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }
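
  /**
   * Initializes the child operator trees. Publishes table/partition names
   * into the configuration, remembers trees that will not receive rows so
   * they can still be closed, and initializes each distinct tree only once.
   */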
  @Override
  public void initializeOp(Configuration hconf) throws HiveException {
    // set that parent initialization is done and call initialize on children
    state = State.INIT;
    List<Operator<? extends Serializable>> children = getChildOperators();

    for (Entry<MapInputPath, MapOpCtx> entry : opCtxMap.entrySet()) {
      // Add alias, table name, and partition name to the Hadoop conf so that
      // the child operators inherit them.
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME,
          entry.getValue().tableName);
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME,
          entry.getValue().partName);
      MapInputPath input = entry.getKey();
      Operator<? extends Serializable> op = input.op;
      // op is not in the children list, so remember it and close it
      // afterwards
      if (children.indexOf(op) == -1) {
        if (extraChildrenToClose == null) {
          extraChildrenToClose = new ArrayList<Operator<? extends Serializable>>();
        }
        extraChildrenToClose.add(op);
      }

      // Multiple input paths may correspond to the same operator tree. The
      // logic below initializes each tree only once: skip this entry if the
      // operator has some other path among this mapper's input paths.
      boolean shouldInit = true;
      List<String> paths = operatorToPaths.get(op);
      for (String path : paths) {
        if (childrenPaths.contains(path) && !path.equals(input.path)) {
          shouldInit = false;
          break;
        }
      }
      if (shouldInit) {
        op.initialize(hconf,
            new ObjectInspector[] {entry.getValue().getRowObjectInspector()});
      }
    }
  }
  /**
   * Closes extra child operators that were initialized but never executed
   * (their input paths did not match this mapper's input).
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (extraChildrenToClose != null) {
      for (Operator<? extends Serializable> op : extraChildrenToClose) {
        op.close(abort);
      }
    }
  }
  /**
   * Resets the deserializer and inspectors when a new input file starts,
   * since a split can span multiple files/partitions.
   */
  public void cleanUpInputFileChangedOp() throws HiveException {
    Path fpath = new Path((new Path(this.getExecContext().getCurrentInputFile()))
        .toUri().getPath());

    for (String onefile : conf.getPathToAliases().keySet()) {
      Path onepath = new Path(new Path(onefile).toUri().getPath());
      // Find the first alias whose path is a prefix of the new input file
      // and switch the inspectors to its context.
      if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
        String onealias = conf.getPathToAliases().get(onefile).get(0);
        Operator<? extends Serializable> op =
            conf.getAliasToWork().get(onealias);
        LOG.info("Processing alias " + onealias + " for file " + onefile);
        MapInputPath inp = new MapInputPath(onefile, onealias, op);
        setInspectorInput(inp);
        break;
      }
    }
  }
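
  /**
   * Deserializes one input record and forwards it to the children. Depending
   * on the input, the forwarded row is the bare deserialized object, a
   * {row, partitionValues} pair, or a layout that also appends virtual
   * column values.
   */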
  public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The deserializers need to be reset if the input file changed.
    if ((this.getExecContext() != null) &&
        this.getExecContext().inputFileChanged()) {
      LOG.info("Processing path " + this.getExecContext().getCurrentInputFile());
      // Let the child operators clean up, since the input file has changed.
      cleanUpInputFileChanged();
    }

    Object row = null;
    try {
      if (this.hasVC) {
        this.rowWithPartAndVC[0] = deserializer.deserialize(value);
        int vcPos = isPartitioned ? 2 : 1;
        populateVirtualColumnValues();
        this.rowWithPartAndVC[vcPos] = this.vcValues;
      } else if (!isPartitioned) {
        row = deserializer.deserialize(value);
      } else {
        rowWithPart[0] = deserializer.deserialize(value);
      }
    } catch (Exception e) {
      // Include the raw input in the error message.
      String rawRowString;
      try {
        rawRowString = value.toString();
      } catch (Exception e2) {
        rawRowString = "[Error getting row data with exception " +
            StringUtils.stringifyException(e2) + " ]";
      }

      // TODO: policy on deserialization errors
      deserialize_error_count.set(deserialize_error_count.get() + 1);
      throw new HiveException("Hive Runtime Error while processing writable " +
          rawRowString, e);
    }

    try {
      if (this.hasVC) {
        forward(this.rowWithPartAndVC, this.rowObjectInspector);
      } else if (!isPartitioned) {
        forward(row, rowObjectInspector);
      } else {
        forward(rowWithPart, rowObjectInspector);
      }
    } catch (Exception e) {
      // Serialize the row to JSON and include it in the error message.
      String rowString;
      try {
        if (this.hasVC) {
          rowString = SerDeUtils.getJSONString(rowWithPartAndVC, rowObjectInspector);
        } else if (!isPartitioned) {
          rowString = SerDeUtils.getJSONString(row, rowObjectInspector);
        } else {
          rowString = SerDeUtils.getJSONString(rowWithPart, rowObjectInspector);
        }
      } catch (Exception e2) {
        rowString = "[Error getting row data with exception " +
            StringUtils.stringifyException(e2) + " ]";
      }
      throw new HiveException("Hive Runtime Error while processing row " +
          rowString, e);
    }
  }
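
  /**
   * Fills vcValues for the current record. FILENAME is refreshed only when
   * the input file changes; BLOCKOFFSET reuses a single LongWritable to
   * avoid a per-row allocation.
   */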
  private void populateVirtualColumnValues() {
    if (this.vcs != null) {
      ExecMapperContext mapExecCxt = this.getExecContext();
      IOContext ioCxt = mapExecCxt.getIoCxt();
      for (int i = 0; i < vcs.size(); i++) {
        VirtualColumn vc = vcs.get(i);
        if (vc.equals(VirtualColumn.FILENAME) && mapExecCxt.inputFileChanged()) {
          this.vcValues[i] = new Text(mapExecCxt.getCurrentInputFile());
        } else if (vc.equals(VirtualColumn.BLOCKOFFSET)) {
          long current = ioCxt.getCurrentBlockStart();
          LongWritable old = (LongWritable) this.vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            this.vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        }
      }
    }
  }
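
  /**
   * Never called in normal operation: MapOperator receives input through
   * {@link #process(Writable)}, not through the generic row interface.
   */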
  @Override
  public void processOp(Object row, int tag) throws HiveException {
    throw new HiveException("Hive 2 Internal error: should not be called!");
  }

  @Override
  public String getName() {
    return "MAP";
  }

  @Override
  public OperatorType getType() {
    return null;
  }
}