/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import com.fasterxml.jackson.databind.ObjectMapper;
import java.beans.DefaultPersistenceDelegate;
import java.beans.Encoder;
import java.beans.Expression;
import java.beans.Statement;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.net.URLDecoder;
import java.security.AccessController;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.SQLFeatureNotSupportedException;
import java.sql.SQLTransientException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.InflaterInputStream;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.BlobStorageUtils;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.HiveInterruptCallback;
import org.apache.hadoop.hive.common.HiveInterruptUtils;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.common.TableName;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.DriverState;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
import org.apache.hadoop.hive.ql.exec.mr.ExecReducer;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.exec.tez.DagUtils;
import org.apache.hadoop.hive.ql.exec.tez.TezTask;
import org.apache.hadoop.hive.ql.exec.util.DAGTraversal;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
import org.apache.hadoop.hive.ql.io.SelfDescribingInputFormatInterface;
import org.apache.hadoop.hive.ql.io.merge.MergeFileMapper;
import org.apache.hadoop.hive.ql.io.merge.MergeFileWork;
import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateMapper;
import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateWork;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.InputEstimator;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.IStatsGatherDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.MergeJoinWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.secrets.URISecretSource;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.alias.CredentialProviderFactory;
import org.apache.hadoop.util.Progressable;
import org.apache.hive.common.util.ACLConfigurationParser;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.esotericsoftware.kryo.Kryo;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * Utilities.
 *
 */
@SuppressWarnings({ "nls", "deprecation" })
public final class Utilities {
  /**
   * Mapper used to serialize/deserialize JSON objects.
   */
  public static final ObjectMapper JSON_MAPPER = new ObjectMapper();

  /**
   * A logger mostly used to trace-log the details of Hive table file operations. Filtering the
   * logs for FileOperations (with trace logs present) allows one to debug what Hive has done with
   * various files and directories while committing writes, as well as reading.
   */
  public static final Logger FILE_OP_LOGGER = LoggerFactory.getLogger("FileOperations");
  public static final Logger LOGGER = LoggerFactory.getLogger(Utilities.class);

  /**
   * The objects in the reducer are composed of these top-level fields.
   */
  public static final String HADOOP_LOCAL_FS = "file:///";
  public static final String HADOOP_LOCAL_FS_SCHEME = "file";
  public static final String MAP_PLAN_NAME = "map.xml";
  public static final String REDUCE_PLAN_NAME = "reduce.xml";
  public static final String MERGE_PLAN_NAME = "merge.xml";
  public static final String INPUT_NAME = "iocontext.input.name";
  public static final String HAS_MAP_WORK = "has.map.work";
  public static final String HAS_REDUCE_WORK = "has.reduce.work";
  public static final String MAPRED_MAPPER_CLASS = "mapred.mapper.class";
  public static final String MAPRED_REDUCER_CLASS = "mapred.reducer.class";
  public static final String HIVE_ADDED_JARS = "hive.added.jars";
  public static final String VECTOR_MODE = "VECTOR_MODE";
  public static final String USE_VECTORIZED_INPUT_FILE_FORMAT = "USE_VECTORIZED_INPUT_FILE_FORMAT";
  public static final String MAPNAME = "Map ";
  public static final String REDUCENAME = "Reducer ";
  public static final String ENSURE_OPERATORS_EXECUTED = "ENSURE_OPERATORS_EXECUTED";

  @Deprecated
  protected static final String DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX = "mapred.dfsclient.parallelism.max";

  // all common whitespaces as defined in Character.isWhitespace(char)
  // Used primarily as a workaround until TEXT-175 is released
  public static final char[] COMMON_WHITESPACE_CHARS =
      { '\t', '\n', '\u000B', '\f', '\r', '\u001C', '\u001D', '\u001E', '\u001F', ' ' };

  private static final Object INPUT_SUMMARY_LOCK = new Object();
  private static final Object ROOT_HDFS_DIR_LOCK = new Object();

  @FunctionalInterface
  public interface SupplierWithCheckedException<T, X extends Exception> {
    T get() throws X;
  }

  /**
   * ReduceField:
   * KEY: record key
   * VALUE: record value
   */
  public static enum ReduceField {
    KEY(0), VALUE(1);

    int position;

    ReduceField(int position) {
      this.position = position;
    }
  }

  public static List<String> reduceFieldNameList;
  static {
    reduceFieldNameList = new ArrayList<String>();
    for (ReduceField r : ReduceField.values()) {
      reduceFieldNameList.add(r.toString());
    }
  }
  public static String removeValueTag(String column) {
    if (column.startsWith(ReduceField.VALUE + ".")) {
      return column.substring(6);
    }
    return column;
  }
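  // Editor's note: illustrative examples, not part of the original source.
  //   removeValueTag("VALUE._col0") -> "_col0"     ("VALUE." is 6 characters, hence substring(6))
  //   removeValueTag("KEY._col0")   -> "KEY._col0" (returned unchanged; only the VALUE prefix is stripped)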
  private Utilities() {
    // prevent instantiation
  }

  private static GlobalWorkMapFactory gWorkMap = new GlobalWorkMapFactory();

  private static final String CLASS_NAME = Utilities.class.getName();
  private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);
  public static void clearWork(Configuration conf) {
    Path mapPath = getPlanPath(conf, MAP_PLAN_NAME);
    Path reducePath = getPlanPath(conf, REDUCE_PLAN_NAME);
    // if the plan path hasn't been initialized just return, nothing to clean.
    if (mapPath == null && reducePath == null) {
      return;
    }
    try {
      if (!HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
        // at least one of the paths is non-null here; use it to obtain the FileSystem
        // (guarding against an NPE when only one of the two plans was generated)
        FileSystem fs = (mapPath != null ? mapPath : reducePath).getFileSystem(conf);
        if (mapPath != null) {
          try {
            fs.delete(mapPath, true);
          } catch (FileNotFoundException e) {
            // delete if exists, don't panic if it doesn't
          }
        }
        if (reducePath != null) {
          try {
            fs.delete(reducePath, true);
          } catch (FileNotFoundException e) {
            // delete if exists, don't panic if it doesn't
          }
        }
      }
    } catch (Exception e) {
      LOG.warn("Failed to clean-up tmp directories.", e);
    } finally {
      // where a single process works with multiple plans - we must clear
      // the cache before working with the next plan.
      clearWorkMapForConf(conf);
    }
  }
  public static MapredWork getMapRedWork(Configuration conf) {
    MapredWork w = new MapredWork();
    w.setMapWork(getMapWork(conf));
    w.setReduceWork(getReduceWork(conf));
    return w;
  }

  public static void cacheMapWork(Configuration conf, MapWork work, Path hiveScratchDir) {
    cacheBaseWork(conf, MAP_PLAN_NAME, work, hiveScratchDir);
  }

  public static void setMapWork(Configuration conf, MapWork work) {
    setBaseWork(conf, MAP_PLAN_NAME, work);
  }

  public static MapWork getMapWork(Configuration conf) {
    if (!conf.getBoolean(HAS_MAP_WORK, false)) {
      return null;
    }
    return (MapWork) getBaseWork(conf, MAP_PLAN_NAME);
  }

  public static void setReduceWork(Configuration conf, ReduceWork work) {
    setBaseWork(conf, REDUCE_PLAN_NAME, work);
  }

  public static ReduceWork getReduceWork(Configuration conf) {
    if (!conf.getBoolean(HAS_REDUCE_WORK, false)) {
      return null;
    }
    return (ReduceWork) getBaseWork(conf, REDUCE_PLAN_NAME);
  }

  public static Path setMergeWork(JobConf conf, MergeJoinWork mergeJoinWork, Path mrScratchDir,
      boolean useCache) {
    for (BaseWork baseWork : mergeJoinWork.getBaseWorkList()) {
      setBaseWork(conf, baseWork, mrScratchDir, baseWork.getName() + MERGE_PLAN_NAME, useCache);
      String prefixes = conf.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
      if (prefixes == null) {
        prefixes = baseWork.getName();
      } else {
        prefixes = prefixes + "," + baseWork.getName();
      }
      conf.set(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES, prefixes);
    }
    // nothing to return
    return null;
  }

  public static BaseWork getMergeWork(Configuration jconf) {
    String currentMergePrefix = jconf.get(DagUtils.TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX);
    if (StringUtils.isEmpty(currentMergePrefix)) {
      return null;
    }
    return getMergeWork(jconf, jconf.get(DagUtils.TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX));
  }

  public static BaseWork getMergeWork(Configuration jconf, String prefix) {
    if (StringUtils.isEmpty(prefix)) {
      return null;
    }
    return getBaseWork(jconf, prefix + MERGE_PLAN_NAME);
  }

  public static void cacheBaseWork(Configuration conf, String name, BaseWork work,
      Path hiveScratchDir) {
    try {
      setPlanPath(conf, hiveScratchDir);
      setBaseWork(conf, name, work);
    } catch (IOException e) {
      LOG.error("Failed to cache plan", e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Pushes work into the global work map
   */
  private static void setBaseWork(Configuration conf, String name, BaseWork work) {
    Path path = getPlanPath(conf, name);
    setHasWork(conf, name);
    gWorkMap.get(conf).put(path, work);
  }
  /**
   * Returns the Map or Reduce plan.
   * Side effect: the BaseWork returned is also placed in the gWorkMap.
   * @param conf configuration to read the plan path from
   * @param name plan file name, e.g. map.xml or reduce.xml
   * @return the BaseWork for the supplied name; null if no plan path has been set
   * @throws RuntimeException if the configuration files are not proper or if the plan can not be loaded
   */
  private static BaseWork getBaseWork(Configuration conf, String name) {
    Path path = getPlanPath(conf, name);
    LOG.debug("PLAN PATH = {}", path);
    if (path == null) { // Map/reduce plan may not be generated
      return null;
    }
    BaseWork gWork = gWorkMap.get(conf).get(path);
    if (gWork != null) {
      LOG.debug("Found plan in cache for name: {}", name);
      return gWork;
    }
    InputStream in = null;
    Kryo kryo = SerializationUtilities.borrowKryo();
    try {
      String engine = HiveConf.getVar(conf, ConfVars.HIVE_EXECUTION_ENGINE);
      if (engine.equals("spark")) {
        // TODO Add jar into current thread context classloader as it may be invoked by Spark driver inside
        // threads, should be unnecessary while SPARK-5377 is resolved.
        String addedJars = conf.get(HIVE_ADDED_JARS);
        if (StringUtils.isNotEmpty(addedJars)) {
          AddToClassPathAction addAction = new AddToClassPathAction(
              Thread.currentThread().getContextClassLoader(), Arrays.asList(addedJars.split(";"))
          );
          ClassLoader newLoader = AccessController.doPrivileged(addAction);
          Thread.currentThread().setContextClassLoader(newLoader);
          kryo.setClassLoader(newLoader);
        }
      }
      Path localPath = path;
      LOG.debug("local path = {}", localPath);
      final long serializedSize;
      final String planMode;
      if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
        String planStringPath = path.toUri().getPath();
        LOG.debug("Loading plan from string: {}", planStringPath);
        String planString = conf.getRaw(planStringPath);
        if (planString == null) {
          LOG.info("Could not find plan string in conf");
          return null;
        }
        serializedSize = planString.length();
        planMode = "RPC";
        byte[] planBytes = Base64.getDecoder().decode(planString);
        in = new ByteArrayInputStream(planBytes);
        in = new InflaterInputStream(in);
      } else {
        LOG.debug("Open file to read in plan: {}", localPath);
        FileSystem fs = localPath.getFileSystem(conf);
        in = fs.open(localPath);
        serializedSize = fs.getFileStatus(localPath).getLen();
        planMode = "FILE";
      }
      if (MAP_PLAN_NAME.equals(name)) {
        if (ExecMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
          gWork = SerializationUtilities.deserializePlan(kryo, in, MapWork.class);
        } else if (MergeFileMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
          gWork = SerializationUtilities.deserializePlan(kryo, in, MergeFileWork.class);
        } else if (ColumnTruncateMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) {
          gWork = SerializationUtilities.deserializePlan(kryo, in, ColumnTruncateWork.class);
        } else {
          throw new RuntimeException("unable to determine work from configuration. "
              + MAPRED_MAPPER_CLASS + " was " + conf.get(MAPRED_MAPPER_CLASS));
        }
      } else if (REDUCE_PLAN_NAME.equals(name)) {
        if (ExecReducer.class.getName().equals(conf.get(MAPRED_REDUCER_CLASS))) {
          gWork = SerializationUtilities.deserializePlan(kryo, in, ReduceWork.class);
        } else {
          throw new RuntimeException("unable to determine work from configuration. "
              + MAPRED_REDUCER_CLASS + " was " + conf.get(MAPRED_REDUCER_CLASS));
        }
      } else if (name.contains(MERGE_PLAN_NAME)) {
        if (name.startsWith(MAPNAME)) {
          gWork = SerializationUtilities.deserializePlan(kryo, in, MapWork.class);
        } else if (name.startsWith(REDUCENAME)) {
          gWork = SerializationUtilities.deserializePlan(kryo, in, ReduceWork.class);
        } else {
          throw new RuntimeException("Unknown work type: " + name);
        }
      } else {
        // guard against a null gWork below when the plan name is unrecognized
        throw new RuntimeException("Unknown plan name: " + name);
      }
- LOG.info("Deserialized plan (via {}) - name: {} size: {}", planMode,
- gWork.getName(), humanReadableByteCount(serializedSize));
- gWorkMap.get(conf).put(path, gWork);
- return gWork;
- } catch (FileNotFoundException fnf) {
- // happens. e.g.: no reduce work.
- LOG.debug("No plan file found: {}", path, fnf);
- return null;
- } catch (Exception e) {
- String msg = "Failed to load plan: " + path;
- LOG.error(msg, e);
- throw new RuntimeException(msg, e);
- } finally {
- SerializationUtilities.releaseKryo(kryo);
- IOUtils.closeStream(in);
- }
- }
- private static void setHasWork(Configuration conf, String name) {
- if (MAP_PLAN_NAME.equals(name)) {
- conf.setBoolean(HAS_MAP_WORK, true);
- } else if (REDUCE_PLAN_NAME.equals(name)) {
- conf.setBoolean(HAS_REDUCE_WORK, true);
- }
- }
- public static List<String> getFieldSchemaString(List<FieldSchema> fl) {
- if (fl == null) {
- return null;
- }
- ArrayList<String> ret = new ArrayList<String>();
- for (FieldSchema f : fl) {
- ret.add(f.getName() + " " + f.getType()
- + (f.getComment() != null ? (" " + f.getComment()) : ""));
- }
- return ret;
- }
- public static void setMapRedWork(Configuration conf, MapredWork w, Path hiveScratchDir) {
- String useName = conf.get(INPUT_NAME);
- if (useName == null) {
- useName = "mapreduce:" + hiveScratchDir;
- }
- conf.set(INPUT_NAME, useName);
- setMapWork(conf, w.getMapWork(), hiveScratchDir, true);
- if (w.getReduceWork() != null) {
- conf.set(INPUT_NAME, useName);
- setReduceWork(conf, w.getReduceWork(), hiveScratchDir, true);
- }
- }
- public static Path setMapWork(Configuration conf, MapWork w, Path hiveScratchDir, boolean useCache) {
- return setBaseWork(conf, w, hiveScratchDir, MAP_PLAN_NAME, useCache);
- }
- public static Path setReduceWork(Configuration conf, ReduceWork w, Path hiveScratchDir, boolean useCache) {
- return setBaseWork(conf, w, hiveScratchDir, REDUCE_PLAN_NAME, useCache);
- }
- private static Path setBaseWork(Configuration conf, BaseWork w, Path hiveScratchDir, String name, boolean useCache) {
- Kryo kryo = SerializationUtilities.borrowKryo(conf);
- try {
- setPlanPath(conf, hiveScratchDir);
- Path planPath = getPlanPath(conf, name);
- setHasWork(conf, name);
- OutputStream out = null;
- final long serializedSize;
- final String planMode;
- if (HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
- // add it to the conf
- ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
- try {
- out = new DeflaterOutputStream(byteOut, new Deflater(Deflater.BEST_SPEED));
- SerializationUtilities.serializePlan(kryo, w, out);
- out.close();
- out = null;
- } finally {
- IOUtils.closeStream(out);
- }
- final String serializedPlan = Base64.getEncoder().encodeToString(byteOut.toByteArray());
- serializedSize = serializedPlan.length();
- planMode = "RPC";
- conf.set(planPath.toUri().getPath(), serializedPlan);
- } else {
- // use the default file system of the conf
- FileSystem fs = planPath.getFileSystem(conf);
- try {
- out = fs.create(planPath);
- SerializationUtilities.serializePlan(kryo, w, out);
- out.close();
- out = null;
- long fileLen = fs.getFileStatus(planPath).getLen();
- serializedSize = fileLen;
- planMode = "FILE";
- } finally {
- IOUtils.closeStream(out);
- }
- // Serialize the plan to the default hdfs instance
- // Except for hadoop local mode execution where we should be
- // able to get the plan directly from the cache
- if (useCache && !ShimLoader.getHadoopShims().isLocalMode(conf)) {
- // Set up distributed cache
- if (!DistributedCache.getSymlink(conf)) {
- DistributedCache.createSymlink(conf);
- }
- String uriWithLink = planPath.toUri().toString() + "#" + name;
- DistributedCache.addCacheFile(new URI(uriWithLink), conf);
- // set replication of the plan file to a high number. we use the same
- // replication factor as used by the hadoop jobclient for job.xml etc.
- short replication = (short) conf.getInt("mapred.submit.replication", 10);
- fs.setReplication(planPath, replication);
- }
- }
- LOG.info("Serialized plan (via {}) - name: {} size: {}", planMode, w.getName(),
- humanReadableByteCount(serializedSize));
- // Cache the plan in this process
- gWorkMap.get(conf).put(planPath, w);
- return planPath;
- } catch (Exception e) {
- String msg = "Error caching " + name;
- LOG.error(msg, e);
- throw new RuntimeException(msg, e);
- } finally {
- SerializationUtilities.releaseKryo(kryo);
- }
- }
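  // Editor's note: hedged usage sketch, not part of the original source. Assumes a live JobConf,
  // an already-built MapWork, and a writable scratch directory; "/tmp/hive-scratch" is hypothetical.
  //   JobConf conf = new JobConf();
  //   Path scratchDir = new Path("/tmp/hive-scratch");
  //   Path planPath = Utilities.setMapWork(conf, mapWork, scratchDir, true); // serializes map.xml
  //   MapWork cached = Utilities.getMapWork(conf); // later reads are served from the gWorkMap cache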
  private static Path getPlanPath(Configuration conf, String name) {
    Path planPath = getPlanPath(conf);
    if (planPath == null) {
      return null;
    }
    return new Path(planPath, name);
  }

  private static void setPlanPath(Configuration conf, Path hiveScratchDir) throws IOException {
    if (getPlanPath(conf) == null) {
      // this is the unique conf ID, which is kept in JobConf as part of the plan file name
      String jobID = UUID.randomUUID().toString();
      Path planPath = new Path(hiveScratchDir, jobID);
      if (!HiveConf.getBoolVar(conf, ConfVars.HIVE_RPC_QUERY_PLAN)) {
        // when the plan is shipped via RPC no directory is needed, so only
        // create one for the file-based plan path
        FileSystem fs = planPath.getFileSystem(conf);
        fs.mkdirs(planPath);
      }
      HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, planPath.toUri().toString());
    }
  }
  public static Path getPlanPath(Configuration conf) {
    String plan = HiveConf.getVar(conf, HiveConf.ConfVars.PLAN);
    if (plan != null && !plan.isEmpty()) {
      return new Path(plan);
    }
    return null;
  }

  public static class CollectionPersistenceDelegate extends DefaultPersistenceDelegate {
    @Override
    protected Expression instantiate(Object oldInstance, Encoder out) {
      return new Expression(oldInstance, oldInstance.getClass(), "new", null);
    }

    @Override
    protected void initialize(Class<?> type, Object oldInstance, Object newInstance, Encoder out) {
      Iterator<?> ite = ((Collection<?>) oldInstance).iterator();
      while (ite.hasNext()) {
        out.writeStatement(new Statement(oldInstance, "add", new Object[] {ite.next()}));
      }
    }
  }
  @VisibleForTesting
  public static TableDesc defaultTd;
  static {
    // by default we expect ^A separated strings
    // This tableDesc does not provide column names. We should always use
    // PlanUtils.getDefaultTableDesc(String separatorCode, String columns)
    // or getBinarySortableTableDesc(List<FieldSchema> fieldSchemas) when
    // we know the column names.
    /*
     * Generate the table descriptor of MetadataTypedColumnsetSerDe with the
     * separatorCode. MetadataTypedColumnsetSerDe is used because LazySimpleSerDe
     * does not support a table with a single column "col" with type
     * "array<string>".
     */
    defaultTd = new TableDesc(TextInputFormat.class, IgnoreKeyTextOutputFormat.class,
        Utilities.makeProperties(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT,
            "" + Utilities.ctrlaCode, serdeConstants.SERIALIZATION_LIB,
            MetadataTypedColumnsetSerDe.class.getName()));
  }
  public static final int carriageReturnCode = 13;
  public static final int newLineCode = 10;
  public static final int tabCode = 9;
  public static final int ctrlaCode = 1;

  public static final String INDENT = " ";

  // Note: When DDL supports specifying what string to represent null,
  // we should specify "NULL" to represent null in the temp table, and then
  // we can make the following translation deprecated.
  public static final String nullStringStorage = "\\N";
  public static final String nullStringOutput = "NULL";

  /**
   * Gets the task id if we are running as a Hadoop job. Gets a random number otherwise.
   */
  public static String getTaskId(Configuration hconf) {
    String taskid = (hconf == null) ? null : hconf.get("mapred.task.id");
    if (StringUtils.isEmpty(taskid)) {
      return (Integer
          .toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE)));
    } else {
      /*
       * Extract the task and attempt id from the Hadoop task id. In Hadoop version 17 the leading
       * component was 'task_'; thereafter the leading component is 'attempt_'. Version 17 also
       * seems to have used _map_ and _reduce_ to denote map/reduce task types.
       */
      String ret = taskid.replaceAll(".*_[mr]_", "").replaceAll(".*_(map|reduce)_", "");
      return (ret);
    }
  }
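  // Editor's note: illustrative example derived from the comment above; not in the original source.
  // For a Hadoop attempt id such as "attempt_200707121733_0003_m_000005_0", stripping the leading
  // ".*_[mr]_" component yields the task id "000005_0".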
  public static Properties makeProperties(String... olist) {
    Properties ret = new Properties();
    for (int i = 0; i < olist.length; i += 2) {
      ret.setProperty(olist[i], olist[i + 1]);
    }
    return (ret);
  }
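  // Editor's note: hedged usage sketch, not in the original source. Arguments are consumed as
  // alternating key/value pairs; an odd-length argument list fails with
  // ArrayIndexOutOfBoundsException on the trailing key.
  //   Properties p = Utilities.makeProperties(
  //       serdeConstants.SERIALIZATION_FORMAT, "1",
  //       serdeConstants.SERIALIZATION_LIB, LazySimpleSerDe.class.getName());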
  public static ArrayList makeList(Object... olist) {
    ArrayList ret = new ArrayList();
    for (Object element : olist) {
      ret.add(element);
    }
    return (ret);
  }

  public static TableDesc getTableDesc(Table tbl) {
    Properties props = tbl.getMetadata();
    props.put(serdeConstants.SERIALIZATION_LIB, tbl.getDeserializer().getClass().getName());
    if (tbl.getMetaTable() != null) {
      props.put("metaTable", tbl.getMetaTable());
    }
    return (new TableDesc(tbl.getInputFormatClass(), tbl
        .getOutputFormatClass(), props));
  }

  // column names and column types are all delimited by comma
  public static TableDesc getTableDesc(String cols, String colTypes) {
    Properties properties = new Properties();
    properties.put(serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode);
    properties.put(serdeConstants.LIST_COLUMNS, cols);
    properties.put(serdeConstants.LIST_COLUMN_TYPES, colTypes);
    properties.put(serdeConstants.SERIALIZATION_LIB, LazySimpleSerDe.class.getName());
    properties.put(hive_metastoreConstants.TABLE_BUCKETING_VERSION, "-1");
    return (new TableDesc(SequenceFileInputFormat.class,
        HiveSequenceFileOutputFormat.class, properties));
  }

  public static PartitionDesc getPartitionDesc(Partition part, TableDesc tableDesc) throws
      HiveException {
    return new PartitionDesc(part, tableDesc);
  }

  public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part,
      boolean usePartSchemaProperties) throws HiveException {
    return new PartitionDesc(part, tblDesc, usePartSchemaProperties);
  }

  private static boolean isWhitespace(int c) {
    if (c == -1) {
      return false;
    }
    return Character.isWhitespace((char) c);
  }

  public static boolean contentsEqual(InputStream is1, InputStream is2, boolean ignoreWhitespace)
      throws IOException {
    try {
      if ((is1 == is2) || (is1 == null && is2 == null)) {
        return true;
      }
      if (is1 == null || is2 == null) {
        return false;
      }
      while (true) {
        int c1 = is1.read();
        while (ignoreWhitespace && isWhitespace(c1)) {
          c1 = is1.read();
        }
        int c2 = is2.read();
        while (ignoreWhitespace && isWhitespace(c2)) {
          c2 = is2.read();
        }
        if (c1 == -1 && c2 == -1) {
          return true;
        }
        if (c1 != c2) {
          break;
        }
      }
    } catch (FileNotFoundException e) {
      LOG.warn("Could not compare files. One or both cannot be found", e);
    }
    return false;
  }

  /**
   * convert "From src insert blah blah" to "From src insert ... blah"
   */
  public static String abbreviate(String str, int max) {
    str = str.trim();
    int len = str.length();
    int suffixlength = 20;
    if (len <= max) {
      return str;
    }
    suffixlength = Math.min(suffixlength, (max - 3) / 2);
    String rev = StringUtils.reverse(str);
    // get the last few words
    String suffix = StringUtils.abbreviate(rev, suffixlength);
    suffix = StringUtils.reverse(suffix);
    // first few ..
    String prefix = StringUtils.abbreviate(str, max - suffix.length());
    return prefix + suffix;
  }
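  // Editor's note: hedged example, not in the original source. With max = 20,
  //   abbreviate("From src insert overwrite table dest select key", 20)
  // keeps the first few and the last few characters of the statement, with ellipses
  // in between; the exact "..." placement comes from StringUtils.abbreviate().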
  public static final String NSTR = "";

  /**
   * StreamStatus.
   *
   */
  public static enum StreamStatus {
    EOF, TERMINATED
  }

  public static StreamStatus readColumn(DataInput in, OutputStream out) throws IOException {
    while (true) {
      int b;
      try {
        b = in.readByte();
      } catch (EOFException e) {
        return StreamStatus.EOF;
      }
      if (b == Utilities.newLineCode) {
        return StreamStatus.TERMINATED;
      }
      out.write(b);
    }
    // Unreachable
  }
  /**
   * Convert an output stream to a compressed output stream based on the codec configured in the
   * Job Configuration. The caller specifies directly whether the file is compressed or not.
   *
   * @param jc
   *          Job Configuration
   * @param out
   *          Output Stream to be converted into compressed output stream
   * @param isCompressed
   *          whether the output stream needs to be compressed or not
   * @return compressed output stream
   */
  public static OutputStream createCompressedStream(JobConf jc, OutputStream out,
      boolean isCompressed) throws IOException {
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
          DefaultCodec.class);
      CompressionCodec codec = ReflectionUtil.newInstance(codecClass, jc);
      return codec.createOutputStream(out);
    } else {
      return (out);
    }
  }

  /**
   * Based on compression option, output format, and configured output codec -
   * get extension for output file. Text files require an extension, whereas
   * others, like sequence files, do not.
   * <p>
   * The property <code>hive.output.file.extension</code> is used to determine
   * the extension - if set, it will override other logic for choosing an
   * extension.
   *
   * @param jc
   *          Job Configuration
   * @param isCompressed
   *          Whether the output file is compressed or not
   * @param hiveOutputFormat
   *          The output format, used to detect if the format is text
   * @return the required file extension (example: .gz)
   */
  public static String getFileExtension(JobConf jc, boolean isCompressed,
      HiveOutputFormat<?, ?> hiveOutputFormat) {
    String extension = HiveConf.getVar(jc, HiveConf.ConfVars.OUTPUT_FILE_EXTENSION);
    if (!StringUtils.isEmpty(extension)) {
      return extension;
    }
    if ((hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) && isCompressed) {
      Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc,
          DefaultCodec.class);
      CompressionCodec codec = ReflectionUtil.newInstance(codecClass, jc);
      return codec.getDefaultExtension();
    }
    return StringUtils.EMPTY;
  }
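  // Editor's note: illustrative example, not in the original source. When the configured output
  // compressor class is org.apache.hadoop.io.compress.GzipCodec, codec.getDefaultExtension()
  // returns ".gz"; the DefaultCodec fallback returns ".deflate". Non-text output formats get the
  // empty string, since they need no extension.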
  /**
   * Create a sequence file output stream based on the job configuration. Uses a user-supplied
   * compression flag (rather than obtaining it from the Job Configuration).
   *
   * @param jc
   *          Job configuration
   * @param fs
   *          File System to create file in
   * @param file
   *          Path to be created
   * @param keyClass
   *          Java Class for key
   * @param valClass
   *          Java Class for value
   * @return output stream over the created sequence file
   */
  public static SequenceFile.Writer createSequenceWriter(JobConf jc, FileSystem fs, Path file,
      Class<?> keyClass, Class<?> valClass, boolean isCompressed, Progressable progressable)
      throws IOException {
    CompressionCodec codec = null;
    CompressionType compressionType = CompressionType.NONE;
    Class<? extends CompressionCodec> codecClass = null;
    if (isCompressed) {
      compressionType = SequenceFileOutputFormat.getOutputCompressionType(jc);
      codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
      codec = ReflectionUtil.newInstance(codecClass, jc);
    }
    return SequenceFile.createWriter(fs, jc, file, keyClass, valClass, compressionType, codec,
        progressable);
  }
  /**
   * Create an RCFile output stream based on the job configuration. Uses a user-supplied
   * compression flag (rather than obtaining it from the Job Configuration).
   *
   * @param jc
   *          Job configuration
   * @param fs
   *          File System to create file in
   * @param file
   *          Path to be created
   * @return output stream over the created rcfile
   */
  public static RCFile.Writer createRCFileWriter(JobConf jc, FileSystem fs, Path file,
      boolean isCompressed, Progressable progressable) throws IOException {
    CompressionCodec codec = null;
    if (isCompressed) {
      Class<?> codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
      codec = (CompressionCodec) ReflectionUtil.newInstance(codecClass, jc);
    }
    return new RCFile.Writer(fs, jc, file, progressable, codec);
  }
  /**
   * Shamelessly cloned from GenericOptionsParser.
   */
  public static String realFile(String newFile, Configuration conf) throws IOException {
    Path path = new Path(newFile);
    URI pathURI = path.toUri();
    FileSystem fs;
    if (pathURI.getScheme() == null) {
      fs = FileSystem.getLocal(conf);
    } else {
      fs = path.getFileSystem(conf);
    }
    if (!fs.exists(path)) {
      return null;
    }
    String file = path.makeQualified(fs).toString();
    return file;
  }

  public static List<String> mergeUniqElems(List<String> src, List<String> dest) {
    if (dest == null) {
      return src;
    }
    if (src == null) {
      return dest;
    }
    int pos = 0;
    while (pos < dest.size()) {
      if (!src.contains(dest.get(pos))) {
        src.add(dest.get(pos));
      }
      pos++;
    }
    return src;
  }

  private static final String tmpPrefix = "_tmp.";
  private static final String taskTmpPrefix = "_task_tmp.";

  public static Path toTaskTempPath(Path orig) {
    if (orig.getName().indexOf(taskTmpPrefix) == 0) {
      return orig;
    }
    return new Path(orig.getParent(), taskTmpPrefix + orig.getName());
  }

  public static Path toTempPath(Path orig) {
    if (orig.getName().indexOf(tmpPrefix) == 0) {
      return orig;
    }
    return new Path(orig.getParent(), tmpPrefix + orig.getName());
  }
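  // Editor's note: illustrative examples, not in the original source.
  //   toTempPath(new Path("/warehouse/t/000000_0"))     -> /warehouse/t/_tmp.000000_0
  //   toTaskTempPath(new Path("/warehouse/t/000000_0")) -> /warehouse/t/_task_tmp.000000_0
  // Each method is idempotent with respect to its own prefix: a path that already starts with
  // the prefix is returned unchanged.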
  /**
   * Given a path, convert to a temporary path.
   */
  public static Path toTempPath(String orig) {
    return toTempPath(new Path(orig));
  }

  /**
   * Detect if the supplied file is a temporary path.
   */
  private static boolean isTempPath(FileStatus file) {
    String name = file.getPath().getName();
    // in addition to detecting hive temporary files, we also check hadoop
    // temporary folders that used to show up in older releases
    return (name.startsWith("_task") || name.startsWith(tmpPrefix));
  }

  /**
   * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
   * existing file with the same name, the new file's name will be appended with "_1", "_2", etc.
   *
   * @param fs
   *          the FileSystem where src and dst are on.
   * @param src
   *          the src directory
   * @param dst
   *          the target directory
   * @throws IOException
   */
  public static void rename(FileSystem fs, Path src, Path dst) throws IOException, HiveException {
    if (!fs.rename(src, dst)) {
      throw new HiveException("Unable to move: " + src + " to: " + dst);
    }
  }

  private static void moveFileOrDir(FileSystem fs, FileStatus file, Path dst) throws IOException,
      HiveException {
    Path srcFilePath = file.getPath();
    String fileName = srcFilePath.getName();
    Path dstFilePath = new Path(dst, fileName);
    if (file.isDir()) {
      renameOrMoveFiles(fs, srcFilePath, dstFilePath);
    } else {
      moveFile(fs, srcFilePath, dst, fileName);
    }
  }
  /**
   * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
   * existing file with the same name, the new file's name will be generated based on the file name.
   * If the file name conforms to the hive managed file pattern NNNNNN_Y(_copy_YY), then it will
   * create NNNNNN_Y_copy_XX; else it will append _1, _2, ....
   * @param fs
   *          the FileSystem where src and dst are on.
   * @param srcFile
   *          the src file
   * @param destDir
   *          the target directory
   * @param destFileName
   *          the target filename
   * @return The final path the file was moved to.
   * @throws IOException
   * @throws HiveException
   */
  public static Path moveFile(FileSystem fs, Path srcFile, Path destDir, String destFileName)
      throws IOException, HiveException {
    Path dstFilePath = new Path(destDir, destFileName);
    if (fs.exists(dstFilePath)) {
      ParsedOutputFileName parsedFileName = ParsedOutputFileName.parse(destFileName);
      int suffix = 0;
      do {
        suffix++;
        if (parsedFileName.matches()) {
          dstFilePath = new Path(destDir, parsedFileName.makeFilenameWithCopyIndex(suffix));
        } else {
          dstFilePath = new Path(destDir, destFileName + "_" + suffix);
        }
      } while (fs.exists(dstFilePath));
    }
    if (!fs.rename(srcFile, dstFilePath)) {
      throw new HiveException("Unable to move: " + srcFile + " to: " + dstFilePath);
    }
    return dstFilePath;
  }
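  // Editor's note: hedged example of the collision handling above (assumes ParsedOutputFileName
  // recognizes the standard NNNNNN_Y layout; not in the original source). If "000000_0" already
  // exists in destDir, the candidates tried next are "000000_0_copy_1", "000000_0_copy_2", ...;
  // a name that does not parse as a Hive-managed file falls back to "name_1", "name_2", ....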
  /**
   * Rename src to dst, or in the case dst already exists, move files in src to dst. If there is an
   * existing file with the same name, the new file's name will be generated based on the file name.
   * If the file name conforms to the hive managed file pattern NNNNNN_Y(_copy_YY), then it will
   * create NNNNNN_Y_copy_XX; else it will append _1, _2, ....
   *
   * @param fs
   *          the FileSystem where src and dst are on.
   * @param src
   *          the src directory
   * @param dst
   *          the target directory
   * @throws IOException
   */
  public static void renameOrMoveFiles(FileSystem fs, Path src, Path dst) throws IOException,
      HiveException {
    if (!fs.exists(dst)) {
      if (!fs.rename(src, dst)) {
        throw new HiveException("Unable to move: " + src + " to: " + dst);
      }
    } else {
      // move file by file
      FileStatus[] files = fs.listStatus(src);
      for (FileStatus file : files) {
        Utilities.moveFileOrDir(fs, file, dst);
      }
    }
  }

  /**
   * Rename src to dst, or in the case dst already exists, move files in src
   * to dst. If there is an existing file with the same name, the new file's
   * name will be appended with "_1", "_2", etc. Happens in parallel mode.
   *
   * @param conf
   *
   * @param fs
   *          the FileSystem where src and dst are on.
   * @param src
   *          the src directory
   * @param dst
   *          the target directory
   * @throws IOException
   */
  public static void renameOrMoveFilesInParallel(Configuration conf,
      FileSystem fs, Path src, Path dst) throws IOException, HiveException {
    if (!fs.exists(dst)) {
      if (!fs.rename(src, dst)) {
        throw new HiveException("Unable to move: " + src + " to: " + dst);
      }
    } else {
      // move files in parallel
      LOG.info("Moving files from {} to {}", src, dst);
      final ExecutorService pool = createMoveThreadPool(conf);
      List<Future<Void>> futures = new LinkedList<>();
      final FileStatus[] files = fs.listStatus(src);
      for (FileStatus file : files) {
        futures.add(pool.submit(new Callable<Void>() {
          @Override
          public Void call() throws HiveException {
            try {
              Utilities.moveFileOrDir(fs, file, dst);
            } catch (Exception e) {
              throw new HiveException(e);
            }
            return null;
          }
        }));
      }
      shutdownAndCleanup(pool, futures);
      LOG.info("Rename files from {} to {} is complete", src, dst);
    }
  }

  public static final String COPY_KEYWORD = "_copy_"; // copy keyword

  /**
   * This breaks a prefixed bucket number into the prefix and the taskID
   */
  private static final Pattern PREFIXED_TASK_ID_REGEX =
      Pattern.compile("^(.*?\\(.*\\))?([0-9]+)$");

  /**
   * This breaks a prefixed bucket number out into a single integer
   */
  private static final Pattern PREFIXED_BUCKET_ID_REGEX =
      Pattern.compile("^(0*([0-9]+))_([0-9]+).*");
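  // Editor's note: illustrative matches, derived from the replaceTaskId() javadoc below; not in
  // the original source.
  //   PREFIXED_TASK_ID_REGEX on "(ds%3D1)000001": group(1) = "(ds%3D1)", group(2) = "000001"
  //   PREFIXED_TASK_ID_REGEX on "000001":         group(1) = null,       group(2) = "000001"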
  /**
   * Get the task id from the filename. It is assumed that the filename is derived from the output
   * of getTaskId
   *
   * @param filename
   *          filename to extract taskid from
   */
  public static String getTaskIdFromFilename(String filename) {
    return getIdFromFilename(filename, false, false);
  }

  /**
   * Get the part-spec + task id from the filename. It is assumed that the filename is derived
   * from the output of getTaskId
   *
   * @param filename
   *          filename to extract taskid from
   */
  private static String getPrefixedTaskIdFromFilename(String filename) {
    return getIdFromFilename(filename, true, false);
  }

  private static int getAttemptIdFromFilename(String filename) {
    return Integer.parseInt(getIdFromFilename(filename, true, true));
  }

  private static String getIdFromFilename(String filepath, boolean isPrefixed, boolean isTaskAttempt) {
    String filename = filepath;
    int dirEnd = filepath.lastIndexOf(Path.SEPARATOR);
    if (dirEnd != -1) {
      filename = filepath.substring(dirEnd + 1);
    }
    ParsedOutputFileName parsedOutputFileName = ParsedOutputFileName.parse(filename);
    String taskId;
    if (parsedOutputFileName.matches()) {
      if (isTaskAttempt) {
        taskId = parsedOutputFileName.getAttemptId();
      } else {
        taskId = isPrefixed ? parsedOutputFileName.getPrefixedTaskId() : parsedOutputFileName.getTaskId();
      }
    } else {
      taskId = filename;
      LOG.warn("Unable to get task id from file name: {}. Using last component {}"
          + " as task id.", filepath, taskId);
    }
    if (isTaskAttempt) {
      LOG.debug("TaskAttemptId for {} = {}", filepath, taskId);
    } else {
      LOG.debug("TaskId for {} = {}", filepath, taskId);
    }
    return taskId;
  }

  /**
   * Replace the task id from the filename. It is assumed that the filename is derived from the
   * output of getTaskId
   *
   * @param filename
   *          filename to replace taskid "0_0" or "0_0.gz" by 33 to "33_0" or "33_0.gz"
   */
  public static String replaceTaskIdFromFilename(String filename, int bucketNum) {
    return replaceTaskIdFromFilename(filename, String.valueOf(bucketNum));
  }

  public static String replaceTaskIdFromFilename(String filename, String fileId) {
    String taskId = getTaskIdFromFilename(filename);
    String newTaskId = replaceTaskId(taskId, fileId);
    String ret = replaceTaskIdFromFilename(filename, taskId, newTaskId);
    return (ret);
  }

  /**
   * Replace taskId with input bucketNum. For example, if taskId is 000000 and bucketNum is 1,
   * return should be 000001; if taskId is (ds%3D1)000000 and bucketNum is 1, return should be
   * (ds%3D1)000001. This method is different from the replaceTaskId(String, String) method.
   * In this method, the pattern is in taskId.
   * @param taskId
   * @param bucketNum
   * @return
   */
  public static String replaceTaskId(String taskId, int bucketNum) {
    String bucketNumStr = String.valueOf(bucketNum);
    Matcher m = PREFIXED_TASK_ID_REGEX.matcher(taskId);
    if (!m.matches()) {
      LOG.warn("Unable to determine bucket number from task id: {}. Using " +
          "task ID as bucket number.", taskId);
      return adjustBucketNumLen(bucketNumStr, taskId);
    } else {
      String adjustedBucketNum = adjustBucketNumLen(bucketNumStr, m.group(2));
      return (m.group(1) == null ? StringUtils.EMPTY : m.group(1)) + adjustedBucketNum;
    }
  }
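  // Editor's note: illustrative examples taken from the javadoc above; not in the original source.
  //   replaceTaskId("000000", 1)         -> "000001"
  //   replaceTaskId("(ds%3D1)000000", 1) -> "(ds%3D1)000001"
  // The bucket number is left-padded by adjustBucketNumLen() to match the original task-id width.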
  /**
   * Returns strBucketNum with enough 0's prefixing the task ID portion of the String to make it
   * equal in length to taskId
   *
   * @param taskId - the taskId used as a template for length
   * @param strBucketNum - the bucket number of the output, may or may not be prefixed
   * @return
   */
  private static String replaceTaskId(String taskId, String strBucketNum) {
    Matcher m = PREFIXED_TASK_ID_REGEX.matcher(strBucketNum);
    if (!m.matches()) {
      LOG.warn("Unable to determine bucket number from file ID: {}. Using " +
          "file ID as bucket number.", strBucketNum);
      return adjustBucketNumLen(strBucketN…