
/ql/src/java/org/apache/hadoop/hive/ql/exec/ScriptOperator.java

http://github.com/apache/hive
Possible License(s): Apache-2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ScriptDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Shell;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkEnv;
import org.apache.spark.SparkFiles;
/**
 * ScriptOperator. Pipes each input row to an external script (Hive's
 * TRANSFORM clause): serialized rows are written to the child process's
 * stdin, and its stdout and stderr are consumed on separate threads.
 */
public class ScriptOperator extends Operator<ScriptDesc> implements
    Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * Counter.
   */
  public static enum Counter {
    DESERIALIZE_ERRORS, SERIALIZE_ERRORS
  }

  private final transient LongWritable deserialize_error_count = new LongWritable();
  private final transient LongWritable serialize_error_count = new LongWritable();

  transient Thread outThread = null;
  transient Thread errThread = null;
  transient Process scriptPid = null;
  transient Configuration hconf;
  // Input to the script
  transient Serializer scriptInputSerializer;
  // Output from the script
  transient Deserializer scriptOutputDeserializer;
  transient volatile Throwable scriptError = null;
  transient RecordWriter scriptOutWriter = null;

  // List of conf entries not to turn into env vars
  transient Set<String> blackListedConfEntries = null;

  static final String IO_EXCEPTION_BROKEN_PIPE_STRING = "Broken pipe";
  static final String IO_EXCEPTION_STREAM_CLOSED = "Stream closed";

  /**
   * Sends periodic reports back to the tracker.
   */
  transient AutoProgressor autoProgressor;

  // Set until the first row arrives: the child process should only be started
  // if necessary, as starting it eagerly may conflict with some user
  // assumptions.
  transient boolean firstRow;
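
  /**
   * Maps a configuration name to a legal environment variable name by
   * replacing every character outside [A-Za-z0-9] with '_', e.g.
   * "hive.script.operator.id.env.var" becomes
   * "hive_script_operator_id_env_var".
   */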
  String safeEnvVarName(String name) {
    StringBuilder safe = new StringBuilder();
    int len = name.length();
    for (int i = 0; i < len; i++) {
      char c = name.charAt(i);
      char s;
      if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z')
          || (c >= 'a' && c <= 'z')) {
        s = c;
      } else {
        s = '_';
      }
      safe.append(s);
    }
    return safe.toString();
  }
  /**
   * Most UNIX implementations impose some limit on the total size of environment
   * variables and on the size of individual strings. To fit within this limit we
   * sometimes need to truncate strings. Also, some values tend to be long and are
   * meaningless to scripts, so strain them out.
   *
   * @param value environment variable value to check
   * @param name name of variable (used only for logging purposes)
   * @param truncate whether to truncate the value
   * @return the original value, or a truncated one if its length is more than
   *         20KB and the truncate flag is set
   * @see <a href="http://www.kernel.org/doc/man-pages/online/pages/man2/execve.2.html">Linux
   *      man page</a> for more details
   */
  String safeEnvVarValue(String value, String name, boolean truncate) {
    final int lenLimit = 20 * 1024;
    if (truncate && value.length() > lenLimit) {
      value = value.substring(0, lenLimit);
      LOG.warn("Length of environment variable " + name + " was truncated to " + lenLimit
          + " bytes to fit system limits.");
    }
    return value;
  }
  /**
   * Checks whether a given configuration name is blacklisted and should not be
   * converted to an environment variable.
   */
  boolean blackListed(Configuration conf, String name) {
    if (blackListedConfEntries == null) {
      blackListedConfEntries = new HashSet<String>();
      if (conf != null) {
        String bl = conf.get(HiveConf.ConfVars.HIVESCRIPT_ENV_BLACKLIST.toString(),
            HiveConf.ConfVars.HIVESCRIPT_ENV_BLACKLIST.getDefaultValue());
        if (bl != null && !bl.isEmpty()) {
          String[] bls = bl.split(",");
          Collections.addAll(blackListedConfEntries, bls);
        }
      }
    }
    return blackListedConfEntries.contains(name);
  }
  /**
   * addJobConfToEnvironment is mostly shamelessly copied from Hadoop Streaming,
   * with an additional check on environment variable length.
   */
  void addJobConfToEnvironment(Configuration conf, Map<String, String> env) {
    Iterator<Map.Entry<String, String>> it = conf.iterator();
    while (it.hasNext()) {
      Map.Entry<String, String> en = it.next();
      String name = en.getKey();
      if (!blackListed(conf, name)) {
        // String value = (String) en.getValue(); // does not apply variable
        // expansion
        String value = conf.get(name); // does variable expansion
        name = safeEnvVarName(name);
        boolean truncate = conf
            .getBoolean(HiveConf.ConfVars.HIVESCRIPTTRUNCATEENV.toString(), false);
        value = safeEnvVarValue(value, name, truncate);
        env.put(name, value);
      }
    }
  }
  /**
   * Maps a relative pathname to an absolute pathname using the PATH environment.
   */
  public class PathFinder {
    String pathenv; // a string of pathnames
    String pathSep; // the path separator
    String fileSep; // the file separator in a directory

    /**
     * Construct a PathFinder object using the path from the specified system
     * environment variable.
     */
    public PathFinder(String envpath) {
      pathenv = System.getenv(envpath);
      pathSep = System.getProperty("path.separator");
      fileSep = System.getProperty("file.separator");
    }

    /**
     * Prepends the specified component to the path list.
     */
    public void prependPathComponent(String str) {
      pathenv = str + pathSep + pathenv;
    }

    /**
     * Returns the full path name of this file if it is listed in the path.
     */
    public File getAbsolutePath(String filename) {
      if (pathenv == null || pathSep == null || fileSep == null) {
        return null;
      }
      int val = -1;
      String classvalue = pathenv + pathSep;
      while (((val = classvalue.indexOf(pathSep)) >= 0)
          && classvalue.length() > 0) {
        // Extract each entry from the pathenv
        String entry = classvalue.substring(0, val).trim();
        File f = new File(entry);
        try {
          if (f.isDirectory()) {
            // this entry in the pathenv is a directory;
            // see if the required file is in this directory
            f = new File(entry + fileSep + filename);
          }
          // see if the filename matches and we can read it
          if (f.isFile() && f.canRead()) {
            return f;
          }
        } catch (Exception exp) {
          // ignore unreadable entries and keep scanning the path
        }
        classvalue = classvalue.substring(val + 1).trim();
      }
      return null;
    }
  }
  /** Kryo ctor. */
  protected ScriptOperator() {
    super();
  }

  public ScriptOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    firstRow = true;
    statsMap.put(Counter.DESERIALIZE_ERRORS.toString(), deserialize_error_count);
    statsMap.put(Counter.SERIALIZE_ERRORS.toString(), serialize_error_count);
    try {
      this.hconf = hconf;
      AbstractSerDe outputSerDe = conf.getScriptOutputInfo().getSerDeClass().newInstance();
      outputSerDe.initialize(hconf, conf.getScriptOutputInfo().getProperties(), null);
      AbstractSerDe inputSerde = conf.getScriptInputInfo().getSerDeClass().newInstance();
      inputSerde.initialize(hconf, conf.getScriptInputInfo().getProperties(), null);
      scriptOutputDeserializer = outputSerDe;
      scriptInputSerializer = inputSerde;
      outputObjInspector = scriptOutputDeserializer.getObjectInspector();
    } catch (Exception e) {
      throw new HiveException(ErrorMsg.SCRIPT_INIT_ERROR.getErrorCodedMsg(), e);
    }
  }
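
  /**
   * Returns true if the IOException's message matches one of the messages the
   * JDK produces when the child process has closed its end of the pipe
   * ("Broken pipe" or "Stream closed"), i.e. the script exited before
   * consuming all of its input.
   */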
  boolean isBrokenPipeException(IOException e) {
    return (e.getMessage().equalsIgnoreCase(IO_EXCEPTION_BROKEN_PIPE_STRING) ||
        e.getMessage().equalsIgnoreCase(IO_EXCEPTION_STREAM_CLOSED));
  }

  boolean allowPartialConsumption() {
    return HiveConf.getBoolVar(hconf, HiveConf.ConfVars.ALLOWPARTIALCONSUMP);
  }

  void displayBrokenPipeInfo() {
    LOG.info("The script did not consume all input data. This is considered an error.");
    LOG.info("set " + HiveConf.ConfVars.ALLOWPARTIALCONSUMP.toString() + "=true; to ignore it.");
  }
  private transient String tableName;
  private transient String partitionName;

  @Override
  public void setInputContext(String tableName, String partitionName) {
    this.tableName = tableName;
    this.partitionName = partitionName;
    super.setInputContext(tableName, partitionName);
  }
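
  /**
   * Processes one input row. On the first row this lazily launches the child
   * process (resolving the script on PATH, applying the configured script
   * wrapper, and exporting the job conf as environment variables), then starts
   * one daemon thread to consume the script's stdout and another for its
   * stderr. Every row is serialized and written to the script's stdin.
   */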
  @Override
  public void process(Object row, int tag) throws HiveException {
    // initialize the user's process only when we receive the first row
    if (firstRow) {
      firstRow = false;
      SparkConf sparkConf = null;
      try {
        String[] cmdArgs = splitArgs(conf.getScriptCmd());
        String prog = cmdArgs[0];
        File currentDir = new File(".").getAbsoluteFile();

        if (!new File(prog).isAbsolute()) {
          PathFinder finder = new PathFinder("PATH");
          finder.prependPathComponent(currentDir.toString());
          // In Spark local mode, we need to search added files in the root directory.
          if (HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            sparkConf = SparkEnv.get().conf();
            finder.prependPathComponent(SparkFiles.getRootDirectory());
          }
          File f = finder.getAbsolutePath(prog);
          if (f != null) {
            cmdArgs[0] = f.getAbsolutePath();
          }
          f = null;
        }

        String[] wrappedCmdArgs = addWrapper(cmdArgs);
        LOG.info("Executing " + Arrays.asList(wrappedCmdArgs));
        LOG.info("tablename=" + tableName);
        LOG.info("partname=" + partitionName);
        LOG.info("alias=" + alias);

        ProcessBuilder pb = new ProcessBuilder(wrappedCmdArgs);
        Map<String, String> env = pb.environment();
        addJobConfToEnvironment(hconf, env);
        env.put(safeEnvVarName(HiveConf.ConfVars.HIVEALIAS.varname), String
            .valueOf(alias));

        // Create an environment variable that uniquely identifies this script
        // operator
        String idEnvVarName = HiveConf.getVar(hconf,
            HiveConf.ConfVars.HIVESCRIPTIDENVVAR);
        String idEnvVarVal = getOperatorId();
        env.put(safeEnvVarName(idEnvVarName), idEnvVarVal);

        // For Spark, in non-local mode, any added dependencies are stored at
        // SparkFiles::getRootDirectory, which is the executor's working directory.
        // In local mode, we need to manually point the process's working directory
        // to it, in order to make the dependencies accessible.
        if (sparkConf != null) {
          String master = sparkConf.get("spark.master");
          if (master.equals("local") || master.startsWith("local[")) {
            pb.directory(new File(SparkFiles.getRootDirectory()));
          }
        }

        scriptPid = pb.start(); // Runtime.getRuntime().exec(wrappedCmdArgs);

        DataOutputStream scriptOut = new DataOutputStream(
            new BufferedOutputStream(scriptPid.getOutputStream()));
        DataInputStream scriptIn = new DataInputStream(new BufferedInputStream(
            scriptPid.getInputStream()));
        DataInputStream scriptErr = new DataInputStream(
            new BufferedInputStream(scriptPid.getErrorStream()));

        scriptOutWriter = conf.getInRecordWriterClass().newInstance();
        scriptOutWriter.initialize(scriptOut, hconf);

        RecordReader scriptOutputReader = conf.getOutRecordReaderClass()
            .newInstance();
        scriptOutputReader.initialize(scriptIn, hconf, conf
            .getScriptOutputInfo().getProperties());

        outThread = new StreamThread(scriptOutputReader,
            new OutputStreamProcessor(scriptOutputDeserializer
                .getObjectInspector()), "OutputProcessor");

        RecordReader scriptErrReader = conf.getErrRecordReaderClass()
            .newInstance();
        scriptErrReader.initialize(scriptErr, hconf, conf.getScriptErrInfo()
            .getProperties());

        errThread = new StreamThread(scriptErrReader, new ErrorStreamProcessor(
            HiveConf.getIntVar(hconf, HiveConf.ConfVars.SCRIPTERRORLIMIT)),
            "ErrorProcessor");

        if (HiveConf
            .getBoolVar(hconf, HiveConf.ConfVars.HIVESCRIPTAUTOPROGRESS)) {
          autoProgressor = new AutoProgressor(this.getClass().getName(),
              reporter, Utilities.getDefaultNotificationInterval(hconf),
              HiveConf.getTimeVar(
                  hconf, HiveConf.ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT, TimeUnit.MILLISECONDS));
          autoProgressor.go();
        }

        outThread.start();
        errThread.start();
      } catch (Exception e) {
        throw new HiveException(ErrorMsg.SCRIPT_INIT_ERROR.getErrorCodedMsg(), e);
      }
    }

    if (scriptError != null) {
      throw new HiveException(ErrorMsg.SCRIPT_GENERIC_ERROR.getErrorCodedMsg(), scriptError);
    }

    try {
      Writable res = scriptInputSerializer.serialize(row,
          inputObjInspectors[tag]);
      scriptOutWriter.write(res);
    } catch (SerDeException e) {
      LOG.error("Error in serializing the row: " + e.getMessage());
      scriptError = e;
      serialize_error_count.set(serialize_error_count.get() + 1);
      throw new HiveException(e);
    } catch (IOException e) {
      if (isBrokenPipeException(e) && allowPartialConsumption()) {
        // Give the outThread a chance to finish before marking the operator as done
        try {
          scriptPid.waitFor();
        } catch (InterruptedException interruptedException) {
          // ignore and continue with best-effort shutdown
        }
        // best-effort attempt to write all output from the script before marking
        // the operator as done
        try {
          if (outThread != null) {
            outThread.join(0);
          }
        } catch (Exception e2) {
          LOG.warn("Exception in closing outThread", e2);
        }
        setDone(true);
        LOG.warn("Got broken pipe during write: ignoring exception and setting operator to done");
      } else {
        LOG.error("Error in writing to script: " + e.getMessage());
        if (isBrokenPipeException(e)) {
          displayBrokenPipeInfo();
        }
        scriptError = e;
        throw new HiveException(ErrorMsg.SCRIPT_IO_ERROR.getErrorCodedMsg(), e);
      }
    }
  }
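
  /**
   * Shuts down the child process. On a clean close it closes the script's
   * stdin, waits for the process to exit, and treats a non-zero exit code as
   * an abort. On an abortive close it still tries to collect the exit code,
   * but gives up after one second. In both cases the stdout/stderr threads are
   * joined and the process is destroyed on a best-effort basis.
   */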
  @Override
  public void close(boolean abort) throws HiveException {
    boolean new_abort = abort;
    if (!abort) {
      if (scriptError != null) {
        throw new HiveException(ErrorMsg.SCRIPT_GENERIC_ERROR.getErrorCodedMsg(), scriptError);
      }
      // everything ok. try normal shutdown
      try {
        try {
          if (scriptOutWriter != null) {
            scriptOutWriter.close();
          }
        } catch (IOException e) {
          if (isBrokenPipeException(e) && allowPartialConsumption()) {
            LOG.warn("Got broken pipe: ignoring exception");
          } else {
            if (isBrokenPipeException(e)) {
              displayBrokenPipeInfo();
            }
            throw e;
          }
        }
        int exitVal = 0;
        if (scriptPid != null) {
          exitVal = scriptPid.waitFor();
        }
        if (exitVal != 0) {
          LOG.error("Script failed with code " + exitVal);
          new_abort = true;
        }
      } catch (IOException e) {
        LOG.error("Got exception", e);
        new_abort = true;
      } catch (InterruptedException e) {
        // ignore; fall through to the best-effort cleanup below
      }
    } else {
      // Error already occurred, but we still want to get the
      // error code of the child process if possible.
      try {
        // Interrupt the current thread after 1 second
        final Thread mythread = Thread.currentThread();
        Timer timer = new Timer(true);
        timer.schedule(new TimerTask() {
          @Override
          public void run() {
            mythread.interrupt();
          }
        }, 1000);
        // Wait for the child process to finish
        int exitVal = 0;
        if (scriptPid != null) {
          exitVal = scriptPid.waitFor();
        }
        // Cancel the timer
        timer.cancel();
        // Output the exit code
        LOG.error("Script exited with code " + exitVal);
      } catch (InterruptedException e) {
        // Ignore
        LOG.error("Script has not exited yet. It will be killed.");
      }
    }

    // try these best effort
    try {
      if (outThread != null) {
        outThread.join(0);
      }
    } catch (Exception e) {
      LOG.warn("Exception in closing outThread", e);
    }

    try {
      if (errThread != null) {
        errThread.join(0);
      }
    } catch (Exception e) {
      LOG.warn("Exception in closing errThread", e);
    }

    try {
      if (scriptPid != null) {
        scriptPid.destroy();
      }
    } catch (Exception e) {
      LOG.warn("Exception in destroying scriptPid", e);
    }

    super.close(new_abort);

    if (new_abort && !abort) {
      throw new HiveException(ErrorMsg.SCRIPT_CLOSING_ERROR.getErrorCodedMsg());
    }
  }
  interface StreamProcessor {
    void processLine(Writable line) throws HiveException;

    void close() throws HiveException;
  }
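
  /**
   * The processor for the script's stdout: deserializes each line with the
   * script-output SerDe and forwards the resulting row to the child operators.
   * Rows that fail to deserialize are counted and skipped.
   */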
  class OutputStreamProcessor implements StreamProcessor {
    Object row;
    ObjectInspector rowInspector;

    public OutputStreamProcessor(ObjectInspector rowInspector) {
      this.rowInspector = rowInspector;
    }

    @Override
    public void processLine(Writable line) throws HiveException {
      try {
        row = scriptOutputDeserializer.deserialize(line);
      } catch (SerDeException e) {
        deserialize_error_count.set(deserialize_error_count.get() + 1);
        return;
      }
      forward(row, rowInspector);
    }

    @Override
    public void close() {
    }
  }
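
  /**
   * Recognizes counter and status updates that the script emits on stderr.
   * A line of the form reporterPrefix + "counter:" + group + "," + name + ","
   * + amount increments a counter, and reporterPrefix + "status:" + message
   * sets the task status; the prefix comes from
   * HiveConf.ConfVars.STREAMREPORTERPERFIX.
   */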
  class CounterStatusProcessor {

    private final String reporterPrefix;
    private final String counterPrefix;
    private final String statusPrefix;
    private final Reporter reporter;

    CounterStatusProcessor(Configuration hconf, Reporter reporter) {
      this.reporterPrefix = HiveConf.getVar(hconf, HiveConf.ConfVars.STREAMREPORTERPERFIX);
      this.counterPrefix = reporterPrefix + "counter:";
      this.statusPrefix = reporterPrefix + "status:";
      this.reporter = reporter;
    }

    private boolean process(String line) {
      if (line.startsWith(reporterPrefix)) {
        if (line.startsWith(counterPrefix)) {
          incrCounter(line);
        }
        if (line.startsWith(statusPrefix)) {
          setStatus(line);
        }
        return true;
      } else {
        return false;
      }
    }

    private void incrCounter(String line) {
      String trimmedLine = line.substring(counterPrefix.length()).trim();
      String[] columns = trimmedLine.split(",");
      if (columns.length == 3) {
        try {
          reporter.incrCounter(columns[0], columns[1], Long.parseLong(columns[2]));
        } catch (NumberFormatException e) {
          LOG.warn("Cannot parse counter increment '" + columns[2] +
              "' from line " + line);
        }
      } else {
        LOG.warn("Cannot parse counter line: " + line);
      }
    }

    private void setStatus(String line) {
      reporter.setStatus(line.substring(statusPrefix.length()).trim());
    }
  }
  /**
   * The processor for the stderr stream.
   */
  class ErrorStreamProcessor implements StreamProcessor {
    private long bytesCopied = 0;
    private final long maxBytes;
    private long lastReportTime;
    private CounterStatusProcessor counterStatus;

    public ErrorStreamProcessor(int maxBytes) {
      this.maxBytes = maxBytes;
      lastReportTime = 0;
      if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.STREAMREPORTERENABLED)) {
        counterStatus = new CounterStatusProcessor(hconf, reporter);
      }
    }

    @Override
    public void processLine(Writable line) throws HiveException {
      String stringLine = line.toString();
      int len = 0;

      if (line instanceof Text) {
        len = ((Text) line).getLength();
      } else if (line instanceof BytesWritable) {
        len = ((BytesWritable) line).getSize();
      }

      // Report progress for each stderr line, but no more frequently than once
      // per minute.
      long now = System.currentTimeMillis();
      // reporter is a member variable of the Operator class.
      if (now - lastReportTime > 60 * 1000 && reporter != null) {
        LOG.info("ErrorStreamProcessor calling reporter.progress()");
        lastReportTime = now;
        reporter.progress();
      }

      if (reporter != null) {
        if (counterStatus != null) {
          if (counterStatus.process(stringLine)) {
            return;
          }
        }
      }

      if ((maxBytes < 0) || (bytesCopied < maxBytes)) {
        System.err.println(stringLine);
      }
      if (bytesCopied < maxBytes && bytesCopied + len >= maxBytes) {
        System.err.println("Operator " + id + " " + getName()
            + ": exceeding stderr limit of " + maxBytes
            + " bytes, will truncate stderr messages.");
      }
      bytesCopied += len;
    }

    @Override
    public void close() {
    }
  }
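
  /**
   * A daemon thread that drains one of the script's output streams: it reads
   * records with the configured RecordReader and hands each one to a
   * StreamProcessor until EOF. Any throwable is recorded in scriptError so the
   * main processing thread can surface it.
   */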
  class StreamThread extends Thread {
    RecordReader in;
    StreamProcessor proc;
    String name;

    StreamThread(RecordReader in, StreamProcessor proc, String name) {
      this.in = in;
      this.proc = proc;
      this.name = name;
      setDaemon(true);
    }

    @Override
    public void run() {
      try {
        Writable row = in.createRow();

        while (true) {
          long bytes = in.next(row);
          if (bytes <= 0) {
            break;
          }
          proc.processLine(row);
        }
        LOG.info("StreamThread " + name + " done");
      } catch (Throwable th) {
        scriptError = th;
        LOG.warn("Exception in StreamThread.run()", th);
      } finally {
        try {
          if (in != null) {
            in.close();
          }
        } catch (Exception e) {
          LOG.warn(name + ": error in closing ..", e);
        }
        try {
          if (null != proc) {
            proc.close();
          }
        } catch (Exception e) {
          LOG.warn(name + ": error in closing ..", e);
        }
      }
    }
  }
  /**
   * Wrap the script in a wrapper that allows admins to control how scripts are
   * executed: if a wrapper command is configured, its arguments are prepended
   * to the original command line.
   */
  protected String[] addWrapper(String[] inArgs) {
    String wrapper = HiveConf.getVar(hconf, HiveConf.ConfVars.SCRIPTWRAPPER);
    if (wrapper == null) {
      return inArgs;
    }

    String[] wrapComponents = splitArgs(wrapper);
    int totallength = wrapComponents.length + inArgs.length;
    String[] finalArgv = new String[totallength];
    for (int i = 0; i < wrapComponents.length; i++) {
      finalArgv[i] = wrapComponents[i];
    }
    for (int i = 0; i < inArgs.length; i++) {
      finalArgv[wrapComponents.length + i] = inArgs[i];
    }
    return finalArgv;
  }

  // Code below shamelessly borrowed from Hadoop Streaming
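
  /**
   * Splits a command line into arguments on unquoted spaces, honoring single
   * and double quotes and stripping the quote characters themselves, e.g.
   * splitArgs("grep 'a b'") returns {"grep", "a b"}.
   */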
  public static String[] splitArgs(String args) {
    // Tokenizer states: outside any quotes, inside single quotes, inside
    // double quotes.
    final int OUTSIDE = 1;
    final int SINGLEQ = 2;
    final int DOUBLEQ = 3;

    ArrayList<String> argList = new ArrayList<String>();
    char[] ch = args.toCharArray();
    int clen = ch.length;
    int state = OUTSIDE;
    int argstart = 0;
    for (int c = 0; c <= clen; c++) {
      boolean last = (c == clen);
      int lastState = state;
      boolean endToken = false;
      if (!last) {
        if (ch[c] == '\'') {
          if (state == OUTSIDE) {
            state = SINGLEQ;
          } else if (state == SINGLEQ) {
            state = OUTSIDE;
          }
          endToken = (state != lastState);
        } else if (ch[c] == '"') {
          if (state == OUTSIDE) {
            state = DOUBLEQ;
          } else if (state == DOUBLEQ) {
            state = OUTSIDE;
          }
          endToken = (state != lastState);
        } else if (ch[c] == ' ') {
          if (state == OUTSIDE) {
            endToken = true;
          }
        }
      }
      if (last || endToken) {
        if (c == argstart) {
          // unquoted space
        } else {
          String a;
          a = args.substring(argstart, c);
          argList.add(a);
        }
        argstart = c + 1;
        lastState = state;
      }
    }
    return argList.toArray(new String[0]);
  }
  @Override
  public String getName() {
    return ScriptOperator.getOperatorName();
  }

  public static String getOperatorName() {
    return "SCR";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.SCRIPT;
  }
}