/mrjob/options.py
- # -*- coding: utf-8 -*-
- # Copyright 2009-2016 Yelp and Contributors
- # Copyright 2017 Yelp
- # Copyright 2018 Yelp, Google, Inc., and Contributors
- # Copyright 2019 Yelp
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Functions to populate py:class:`~argparse.ArgumentParser``
- objects with categorized command line parameters.
- """
- from __future__ import print_function
- import json
- import re
- from argparse import Action
- from argparse import ArgumentParser
- from argparse import SUPPRESS
- from logging import getLogger
- from mrjob.conf import combine_cmds
- from mrjob.conf import combine_dicts
- from mrjob.conf import combine_envs
- from mrjob.conf import combine_jobconfs
- from mrjob.conf import combine_lists
- from mrjob.conf import combine_paths
- from mrjob.conf import combine_path_lists
- from mrjob.parse import _parse_port_range_list
- from mrjob.util import shlex_split
- log = getLogger(__name__)
- #: cleanup options:
- #:
- #: * ``'ALL'``: delete logs and local and remote temp files; stop cluster
- #: if on EMR and the job is not done when cleanup is run.
- #: * ``'CLOUD_TMP'``: delete temp files on cloud storage (e.g. S3) only
- #: * ``'CLUSTER'``: terminate the cluster if on EMR and the job is not done
- #: on cleanup
- #: * ``'HADOOP_TMP'``: delete temp files on HDFS only
- #: * ``'JOB'``: stop job if on EMR and the job is not done when cleanup runs
- #: * ``'LOCAL_TMP'``: delete local temp files only
- #: * ``'LOGS'``: delete logs only
- #: * ``'NONE'``: delete nothing
- #: * ``'TMP'``: delete local, HDFS, and cloud storage temp files, but not logs
- CLEANUP_CHOICES = [
- 'ALL',
- 'CLOUD_TMP',
- 'CLUSTER',
- 'HADOOP_TMP',
- 'JOB',
- 'LOCAL_TMP',
- 'LOGS',
- 'NONE',
- 'TMP',
- ]
- # used to identify values that were probably meant to be JSON
- _PROBABLY_JSON_RE = re.compile(r'^\s*[\{\[\"].*$')
- # names of runners
- _RUNNER_ALIASES = {
- 'dataproc',
- 'emr',
- 'hadoop',
- 'inline',
- 'local',
- 'spark',
- }
- ### custom actions ###
- def _default_to(namespace, dest, value):
- """Helper function; set the given attribute to *value* if it's None."""
- if getattr(namespace, dest) is None:
- setattr(namespace, dest, value)
- # these actions are only used by _add_runner_args(), so we can assume *value*
- # is a string
- class _KeyValueAction(Action):
- """action for KEY=VALUE pairs"""
- # used for --cmdenv, --jobconf, and more
- def __call__(self, parser, namespace, value, option_string=None):
- try:
- k, v = value.split('=', 1)
- except ValueError:
- parser.error('%s argument %r is not of the form KEY=VALUE' % (
- option_string, value))
- _default_to(namespace, self.dest, {})
- getattr(namespace, self.dest)[k] = v
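- # A minimal sketch of how this action behaves; the parser, switch name, and
- # values below are illustrative only, not part of mrjob's API:
- #
- #     >>> from argparse import ArgumentParser
- #     >>> parser = ArgumentParser()
- #     >>> _ = parser.add_argument('--cmdenv', dest='cmdenv', default=None,
- #     ...                         action=_KeyValueAction)
- #     >>> parser.parse_args(
- #     ...     ['--cmdenv', 'TZ=UTC', '--cmdenv', 'USER=dave']).cmdenv
- #     {'TZ': 'UTC', 'USER': 'dave'}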
- class _KeyNoneValueAction(Action):
- """action to set KEY to None"""
- def __call__(self, parser, namespace, value, option_string=None):
- _default_to(namespace, self.dest, {})
- getattr(namespace, self.dest)[value] = None
- class _CleanupAction(Action):
- """action to parse a comma-separated list of cleanup constants."""
- def __call__(self, parser, namespace, value, option_string=None):
- result = []
- for choice in value.split(','):
- if choice in CLEANUP_CHOICES:
- result.append(choice)
- else:
- parser.error(
- '%s got %s, which is not one of: %s' %
- (option_string, choice, ', '.join(CLEANUP_CHOICES)))
- if 'NONE' in result and len(set(result)) > 1:
- parser.error(
- '%s: Cannot clean up both nothing and something!' %
- option_string)
- setattr(namespace, self.dest, result)
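- # Illustrative sketch of this action (the values are made up). Note that
- # combining 'NONE' with any other choice is rejected via parser.error():
- #
- #     >>> parser = ArgumentParser()  # ArgumentParser is imported above
- #     >>> _ = parser.add_argument('--cleanup', dest='cleanup', default=None,
- #     ...                         action=_CleanupAction)
- #     >>> parser.parse_args(['--cleanup', 'TMP,LOGS']).cleanup
- #     ['TMP', 'LOGS']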
- class _CommaSeparatedListAction(Action):
- """action to parse a comma-separated list of subnets.
- This eliminates whitespace
- """
- def __call__(self, parser, namespace, value, option_string=None):
- items = [s.strip() for s in value.split(',') if s]
- setattr(namespace, self.dest, items)
- class _AppendCommaSeparatedItemsAction(Action):
- """action to parse a comma-separated list and append
- each of them to an existing list.
- This eliminates whitespace
- """
- def __call__(self, parser, namespace, value, option_string=None):
- _default_to(namespace, self.dest, [])
- items = [s.strip() for s in value.split(',') if s]
- getattr(namespace, self.dest).extend(items)
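- # Illustrative sketch (the switch and file names are made up): repeated
- # switches accumulate into one list, and whitespace around items is stripped:
- #
- #     >>> parser = ArgumentParser()
- #     >>> _ = parser.add_argument('--py-files', dest='py_files', default=None,
- #     ...                         action=_AppendCommaSeparatedItemsAction)
- #     >>> parser.parse_args(['--py-files', 'lib.zip, extras.egg',
- #     ...                    '--py-files', 'more.zip']).py_files
- #     ['lib.zip', 'extras.egg', 'more.zip']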
- class _AppendArgsAction(Action):
- """action to parse one or more arguments and append them to a list."""
- def __call__(self, parser, namespace, value, option_string=None):
- _default_to(namespace, self.dest, [])
- args = shlex_split(value)
- getattr(namespace, self.dest).extend(args)
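- # Illustrative sketch (made-up values): the quoted string is shlex-split into
- # separate arguments and appended. Use the --switch=value form so argparse
- # doesn't mistake a leading '-' for a new option:
- #
- #     >>> parser = ArgumentParser()
- #     >>> _ = parser.add_argument('--hadoop-args', dest='hadoop_extra_args',
- #     ...                         default=None, action=_AppendArgsAction)
- #     >>> parser.parse_args(['--hadoop-args=-fs file:///']).hadoop_extra_args
- #     ['-fs', 'file:///']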
- class _AppendJSONAction(Action):
- """action to parse JSON and append it to a list."""
- def __call__(self, parser, namespace, value, option_string=None):
- _default_to(namespace, self.dest, [])
- try:
- j = json.loads(value)
- except ValueError as e:
- parser.error('Malformed JSON passed to %s: %s' % (
- option_string, str(e)))
- getattr(namespace, self.dest).append(j)
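- # Illustrative sketch (the configuration dict is made up): each switch value
- # is decoded as JSON and appended; malformed JSON triggers parser.error():
- #
- #     >>> parser = ArgumentParser()
- #     >>> _ = parser.add_argument('--emr-configuration',
- #     ...                         dest='emr_configurations', default=None,
- #     ...                         action=_AppendJSONAction)
- #     >>> parser.parse_args(
- #     ...     ['--emr-configuration',
- #     ...      '{"Classification": "core-site"}']).emr_configurations
- #     [{'Classification': 'core-site'}]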
- class _KeyJSONValueAction(Action):
- """action for KEY=<json> pairs. Allows value to be a string, as long
- as it doesn't start with ``[``, ``{``, or ``"``."""
- # used for --extra-cluster-param
- def __call__(self, parser, namespace, value, option_string=None):
- try:
- k, v = value.split('=', 1)
- except ValueError:
- parser.error('%s argument %r is not of the form KEY=VALUE' % (
- option_string, value))
- try:
- v = json.loads(v)
- except ValueError:
- if _PROBABLY_JSON_RE.match(v):
- parser.error('%s argument %r is not valid JSON' % (
- option_string, value))
- _default_to(namespace, self.dest, {})
- getattr(namespace, self.dest)[k] = v
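- # Illustrative sketch (parameter names are made up): values that parse as
- # JSON are decoded, anything else is kept as a plain string:
- #
- #     >>> parser = ArgumentParser()
- #     >>> _ = parser.add_argument('--extra-cluster-param',
- #     ...                         dest='extra_cluster_params', default=None,
- #     ...                         action=_KeyJSONValueAction)
- #     >>> parser.parse_args(
- #     ...     ['--extra-cluster-param', 'Role=my-role',
- #     ...      '--extra-cluster-param', 'VisibleToAllUsers=false',
- #     ...      ]).extra_cluster_params
- #     {'Role': 'my-role', 'VisibleToAllUsers': False}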
- class _JSONAction(Action):
- """action to parse a JSON"""
- def __call__(self, parser, namespace, value, option_string=None):
- try:
- j = json.loads(value)
- except ValueError as e:
- parser.error('Malformed JSON passed to %s: %s' % (
- option_string, str(e)))
- setattr(namespace, self.dest, j)
- class _PortRangeAction(Action):
- """action to parse --ssh-bind-ports"""
- def __call__(self, parser, namespace, value, option_string=None):
- try:
- ports = _parse_port_range_list(value)
- except ValueError as e:
- parser.error('%s: invalid port range list %r: \n%s' %
- (option_string, value, e.args[0]))
- setattr(namespace, self.dest, ports)
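- # Rough sketch of expected behavior (not run at import time); this assumes
- # _parse_port_range_list() expands the string into a flat list of ports:
- #
- #     parser = ArgumentParser()
- #     parser.add_argument('--ssh-bind-ports', dest='ssh_bind_ports',
- #                         default=None, action=_PortRangeAction)
- #     args = parser.parse_args(['--ssh-bind-ports', '2000:2002,2005'])
- #     # args.ssh_bind_ports should now cover ports 2000-2002 and 2005;
- #     # a malformed list (e.g. '2000:abc') triggers parser.error() instead.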
- ### mux opts ###
- # these are used by MRJob to determine what part of a job to run
- #
- # this just maps dest to the args and kwargs to ArgumentParser.add_argument
- # (minus the dest keyword arg)
- _STEP_OPTS = dict(
- run_combiner=(
- ['--combiner'],
- dict(
- action='store_true',
- help='run a combiner',
- ),
- ),
- run_mapper=(
- ['--mapper'],
- dict(
- action='store_true',
- help='run a mapper'
- ),
- ),
- run_reducer=(
- ['--reducer'],
- dict(
- action='store_true',
- help='run a reducer',
- ),
- ),
- run_spark=(
- ['--spark'],
- dict(
- action='store_true',
- help='run Spark code',
- ),
- ),
- step_num=(
- ['--step-num'],
- dict(
- type=int,
- default=0,
- help='which step to execute (default is 0)',
- ),
- ),
- )
- # don't show these unless someone types --help -v --deprecated
- _DEPRECATED_STEP_OPTS = set() # none at the moment
- # don't show these unless someone types --help --deprecated
- _DEPRECATED_NON_RUNNER_OPTS = {'deprecated'}
- ### runner opts ###
- # map from runner option name to dict with the following keys (all optional):
- # cloud_role:
- # 'connect' if needed when interacting with cloud services at all
- # 'launch' if needed when creating a new cluster
- # (cloud runner options with no cloud role are only needed when running jobs)
- # combiner: combiner func from mrjob.conf used to combine option values.
- # (if left blank, we use combine_values())
- # deprecated: if true, this option is deprecated and slated for removal
- # deprecated_aliases: list of old names for this option slated for removal
- # switches: list of switches to add to ArgumentParser for this option. Items
- # have the format (['--switch-names', ...], dict(**kwargs)), where kwargs
- # can be:
- # action -- action to pass to add_argument() (e.g. 'store_true')
- # deprecated -- if True, this switch is deprecated and slated for removal
- # deprecated_aliases -- list of old '--switch-names' slated for removal
- # help -- help string to pass to add_argument()
- # type -- option type for add_argument() to enforce (e.g. float).
- # You can't set the ArgumentParser's default; we use [] if *action* is
- # 'append' and None otherwise.
- #
- # the list of which options apply to which runner is in the runner class
- # itself (e.g. EMRJobRunner.OPT_NAMES)
- _RUNNER_OPTS = dict(
- additional_emr_info=dict(
- cloud_role='launch',
- switches=[
- (['--additional-emr-info'], dict(
- help='A JSON string for selecting additional features on EMR',
- )),
- ],
- ),
- applications=dict(
- cloud_role='launch',
- combiner=combine_lists,
- switches=[
- (['--applications', '--application'], dict(
- action=_AppendCommaSeparatedItemsAction,
- help=('Additional applications to run on 4.x and 5.x'
- ' AMIs, separated by commas (e.g.'
- ' "Ganglia,Spark")'),
- )),
- ],
- ),
- aws_access_key_id=dict(
- cloud_role='connect',
- ),
- aws_secret_access_key=dict(
- cloud_role='connect',
- ),
- aws_session_token=dict(
- cloud_role='connect',
- ),
- bootstrap=dict(
- cloud_role='launch',
- combiner=combine_lists,
- switches=[
- (['--bootstrap'], dict(
- action='append',
- help=('A shell command to set up libraries etc. before any'
- ' steps (e.g. "sudo apt-get -qy install python3"). You'
- ' may interpolate files available via URL or locally'
- ' with Hadoop Distributed Cache syntax'
- ' ("sudo yum install -y foo.rpm#")'),
- )),
- ],
- ),
- bootstrap_actions=dict(
- cloud_role='launch',
- combiner=combine_lists,
- switches=[
- (['--bootstrap-action'], dict(
- action='append',
- help=('Raw bootstrap action scripts to run before any of the'
- ' other bootstrap steps. You can use --bootstrap-action'
- ' more than once. Local scripts will be automatically'
- ' uploaded to S3. To add arguments, just use quotes:'
- ' "foo.sh arg1 arg2"'),
- )),
- ],
- ),
- bootstrap_mrjob=dict(
- cloud_role='launch',
- switches=[
- (['--bootstrap-mrjob'], dict(
- action='store_true',
- help=("Automatically zip up the mrjob library and install it"
- " when we run the mrjob. This is the default. Use"
- " --no-bootstrap-mrjob if you've already installed"
- " mrjob on your Hadoop cluster."),
- )),
- (['--no-bootstrap-mrjob'], dict(
- action='store_false',
- help=("Don't automatically zip up the mrjob library and"
- " install it when we run this job. Use this if you've"
- " already installed mrjob on your Hadoop cluster."),
- )),
- ],
- ),
- bootstrap_python=dict(
- cloud_role='launch',
- switches=[
- (['--bootstrap-python'], dict(
- action='store_true',
- help=('Attempt to install a compatible version of Python'
- ' at bootstrap time. Currently this only does anything'
- ' for Python 3, for which it is enabled by default.'),
- )),
- (['--no-bootstrap-python'], dict(
- action='store_false',
- help=("Don't automatically try to install a compatible version"
- " of Python at bootstrap time."),
- )),
- ],
- ),
- bootstrap_spark=dict(
- cloud_role='launch',
- switches=[
- (['--bootstrap-spark'], dict(
- action='store_true',
- help="Auto-install Spark on the cluster (even if not needed)."
- )),
- (['--no-bootstrap-spark'], dict(
- action='store_false',
- help="Don't auto-install Spark on the cluster."
- )),
- ],
- ),
- check_input_paths=dict(
- switches=[
- (['--check-input-paths'], dict(
- action='store_true',
- help='Check input paths exist before running (the default)',
- )),
- (['--no-check-input-paths'], dict(
- action='store_false',
- help='Skip the checks to ensure all input paths exist',
- )),
- ],
- ),
- check_cluster_every=dict(
- switches=[
- (['--check-cluster-every'], dict(
- help=('How often (in seconds) to check status of your'
- ' job/cluster'),
- type=float,
- )),
- ],
- ),
- cleanup=dict(
- switches=[
- (['--cleanup'], dict(
- action=_CleanupAction,
- help=('Comma-separated list of which directories to delete'
- ' when a job succeeds, e.g. TMP,LOGS. Choices:'
- ' %s (default: ALL)' % ', '.join(CLEANUP_CHOICES)),
- )),
- ],
- ),
- cleanup_on_failure=dict(
- switches=[
- (['--cleanup-on-failure'], dict(
- action=_CleanupAction,
- help=('Comma-separated list of which directories to delete'
- ' when a job fails, e.g. TMP,LOGS. Choices:'
- ' %s (default: NONE)' % ', '.join(CLEANUP_CHOICES)),
- )),
- ],
- ),
- cloud_fs_sync_secs=dict(
- cloud_role='launch',
- switches=[
- (['--cloud-fs-sync-secs'], dict(
- help=('How long to wait for remote FS to reach eventual'
- ' consistency. This'
- ' is typically less than a second but the'
- ' default is 5.0 to be safe.'),
- type=float,
- )),
- ],
- ),
- cloud_log_dir=dict(
- cloud_role='launch',
- combiner=combine_paths,
- switches=[
- (['--cloud-log-dir'], dict(
- help='URI on remote FS to write logs into',
- )),
- ],
- ),
- cloud_tmp_dir=dict(
- cloud_role='launch',
- combiner=combine_paths,
- switches=[
- (['--cloud-tmp-dir'], dict(
- help='URI on remote FS to use as our temp directory.',
- )),
- ],
- ),
- cloud_part_size_mb=dict(
- cloud_role='launch',
- deprecated_aliases=['cloud_upload_part_size'],
- switches=[
- (['--cloud-part-size-mb'], dict(
- deprecated_aliases=['--cloud-upload-part-size'],
- help=('Upload files to cloud FS in parts no bigger than this'
- ' many megabytes. Default is 100 MiB. Set to 0 to'
- ' disable multipart uploading entirely.'),
- type=float,
- )),
- ],
- ),
- cluster_id=dict(
- switches=[
- (['--cluster-id'], dict(
- help='ID of an existing cluster to run our job on',
- )),
- ],
- ),
- cluster_properties=dict(
- cloud_role='launch',
- combiner=combine_dicts,
- switches=[
- (['--cluster-property'], dict(
- action=_KeyValueAction,
- help=('Properties to set in Hadoop config files on Dataproc.'
- ' Args take the form file_prefix:property=value.'
- ' You can use --cluster-property multiple times.'
- ' For more info, see'
- ' https://cloud.google.com/dataproc/docs/concepts'
- '/configuring-clusters/cluster-properties'),
- )),
- ],
- ),
- cmdenv=dict(
- combiner=combine_envs,
- switches=[
- (['--cmdenv'], dict(
- action=_KeyValueAction,
- help=('Set an environment variable for your job inside Hadoop '
- 'streaming/Spark. Must take the form KEY=VALUE.'
- ' You can use --cmdenv multiple times.'),
- )),
- ],
- ),
- core_instance_config=dict(
- cloud_role='launch',
- switches=[
- (['--core-instance-config'], dict(
- action=_JSONAction,
- help=('detailed JSON dict of configs for the core'
- ' (worker) instances'
- ' on Dataproc, including disk config. For format, see'
- ' https://cloud.google.com/dataproc/docs/reference/rest'
- '/v1/projects.regions.clusters#InstanceGroupConfig'
- ' (except that fields in your JSON should use'
- ' snake_case, not camelCase).')
- )),
- ],
- ),
- core_instance_bid_price=dict(
- cloud_role='launch',
- switches=[
- (['--core-instance-bid-price'], dict(
- help=('Bid price to specify for core nodes when'
- ' setting them up as EC2 spot instances (you probably'
- ' only want to do this for task instances).'),
- )),
- ],
- ),
- core_instance_type=dict(
- cloud_role='launch',
- switches=[
- (['--core-instance-type'], dict(
- help='Type of GCE/EC2 core instance(s) to launch',
- )),
- ],
- ),
- ebs_root_volume_gb=dict(
- cloud_role='launch',
- switches=[
- (['--ebs-root-volume-gb'], dict(
- help=('Size of root EBS volume, in GiB. Must be an integer.'
- ' Set to 0 to use the default'),
- type=int,
- )),
- ],
- ),
- ec2_endpoint=dict(
- cloud_role='connect',
- switches=[
- (['--ec2-endpoint'], dict(
- help=('Force mrjob to connect to EC2 on this endpoint'
- ' (e.g. ec2.us-west-1.amazonaws.com).'
- ' Default is to infer this from region.'),
- )),
- ],
- ),
- ec2_key_pair=dict(
- cloud_role='launch',
- switches=[
- (['--ec2-key-pair'], dict(
- help='Name of the SSH key pair you set up for EMR',
- )),
- ],
- ),
- ec2_key_pair_file=dict(
- combiner=combine_paths,
- switches=[
- (['--ec2-key-pair-file'], dict(
- help='Path to file containing SSH key for EMR',
- )),
- ],
- ),
- emr_action_on_failure=dict(
- cloud_role='launch',
- switches=[
- (['--emr-action-on-failure'], dict(
- help=('Action to take when a step fails'
- ' (e.g. TERMINATE_CLUSTER, CANCEL_AND_WAIT, CONTINUE)'),
- )),
- ],
- ),
- emr_configurations=dict(
- cloud_role='launch',
- combiner=combine_lists,
- switches=[
- (['--emr-configuration'], dict(
- action=_AppendJSONAction,
- help=('Configuration to use on 4.x AMIs as a JSON-encoded'
- ' dict; see'
- ' http://docs.aws.amazon.com/ElasticMapReduce/latest/'
- 'ReleaseGuide/emr-configure-apps.html for examples'),
- )),
- ],
- ),
- emr_endpoint=dict(
- cloud_role='connect',
- switches=[
- (['--emr-endpoint'], dict(
- help=('Force mrjob to connect to EMR on this endpoint'
- ' (e.g. us-west-1.elasticmapreduce.amazonaws.com).'
- ' Default is to infer this from region.'),
- )),
- ],
- ),
- emulate_map_input_file=dict(
- switches=[
- (['--emulate-map-input-file'], dict(
- action='store_true',
- help=("In the first mapper, set $mapreduce_map_input_file to"
- " the input file path, like Hadoop would, to support"
- " jobs that use"
- " jobconf_from_env('mapreduce.map.input.file')."
- " Ignored if job sets HADOOP_INPUT_FORMAT."),
- )),
- (['--no-emulate-map-input-file'], dict(
- action='store_false',
- help=("Disables setting $mapreduce_map_input_file"),
- )),
- ],
- ),
- enable_emr_debugging=dict(
- cloud_role='launch',
- switches=[
- (['--enable-emr-debugging'], dict(
- action='store_true',
- help='Enable storage of Hadoop logs in SimpleDB',
- )),
- (['--disable-emr-debugging'], dict(
- action='store_false',
- help=('Disable storage of Hadoop logs in SimpleDB (the'
- ' default)'),
- )),
- ],
- ),
- extra_cluster_params=dict(
- cloud_role='launch',
- combiner=combine_dicts,
- switches=[
- (['--extra-cluster-param'], dict(
- action=_KeyJSONValueAction,
- help=('extra parameter to pass to cloud API when creating'
- ' a cluster, to access features not currently supported'
- ' by mrjob. Takes the form <param>=<value>, where value'
- ' is JSON or a string. Use <param>=null to unset a'
- ' parameter'),
- )),
- ],
- ),
- gcloud_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--gcloud-bin'], dict(help='path to gcloud binary')),
- ],
- ),
- gcs_region=dict(
- cloud_role='connect',
- switches=[
- (['--gcs-region'], dict(
- help='region to create Google Cloud Storage buckets in',
- )),
- ],
- ),
- hadoop_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--hadoop-bin'], dict(help='path to hadoop binary')),
- ],
- ),
- hadoop_extra_args=dict(
- combiner=combine_lists,
- switches=[
- (['--hadoop-args'], dict(
- action=_AppendArgsAction,
- help=('One or more arguments to pass to the hadoop binary.'
- ' (e.g. --hadoop-args="-fs file:///").'),
- )),
- ],
- ),
- hadoop_log_dirs=dict(
- combiner=combine_path_lists,
- switches=[
- (['--hadoop-log-dirs'], dict(
- action='append',
- help=('Directory to search for hadoop logs in. You can use'
- ' --hadoop-log-dirs multiple times.'),
- )),
- ],
- ),
- hadoop_streaming_jar=dict(
- combiner=combine_paths,
- switches=[
- (['--hadoop-streaming-jar'], dict(
- help=('Path of your hadoop streaming jar (locally, or on'
- ' S3/HDFS). In EMR, use a file:// URI to refer to a jar'
- ' on the master node of your cluster.'),
- )),
- ],
- ),
- hadoop_tmp_dir=dict(
- combiner=combine_paths,
- switches=[
- (['--hadoop-tmp-dir'], dict(
- help='Temp space on HDFS (default is tmp/mrjob)',
- )),
- ],
- ),
- hadoop_version=dict(
- switches=[
- (['--hadoop-version'], dict(
- help='Specific version of Hadoop to simulate',
- )),
- ],
- ),
- iam_endpoint=dict(
- cloud_role='launch', # not 'connect'; only used to create clusters
- switches=[
- (['--iam-endpoint'], dict(
- help=('Force mrjob to connect to IAM on this endpoint'
- ' (e.g. iam.us-gov.amazonaws.com)'),
- )),
- ],
- ),
- iam_instance_profile=dict(
- cloud_role='launch',
- switches=[
- (['--iam-instance-profile'], dict(
- help=('EC2 instance profile to use for the EMR cluster -- see'
- ' "Configure IAM Roles for Amazon EMR" in AWS docs'),
- )),
- ],
- ),
- iam_service_role=dict(
- cloud_role='launch',
- switches=[
- (['--iam-service-role'], dict(
- help=('IAM service role to use for the EMR cluster -- see'
- ' "Configure IAM Roles for Amazon EMR" in AWS docs')
- )),
- ],
- ),
- image_id=dict(
- cloud_role='launch',
- switches=[
- (['--image-id'], dict(
- help='ID of custom AWS machine image (AMI) to use',
- )),
- ],
- ),
- image_version=dict(
- cloud_role='launch',
- switches=[
- (['--image-version'], dict(
- help='version of EMR/Dataproc machine image to run',
- )),
- ],
- ),
- instance_groups=dict(
- cloud_role='launch',
- switches=[
- (['--instance-groups'], dict(
- action=_JSONAction,
- help=('detailed JSON list of EMR instance configs, including'
- ' EBS configuration. See docs for --instance-groups'
- ' at http://docs.aws.amazon.com/cli/latest/reference'
- '/emr/create-cluster.html'),
- )),
- ],
- ),
- instance_fleets=dict(
- cloud_role='launch',
- switches=[
- (['--instance-fleets'], dict(
- action=_JSONAction,
- help=('detailed JSON list of instance fleets, including'
- ' EBS configuration. See docs for --instance-fleets'
- ' at http://docs.aws.amazon.com/cli/latest/reference'
- '/emr/create-cluster.html'),
- )),
- ],
- ),
- instance_type=dict(
- cloud_role='launch',
- switches=[
- (['--instance-type'], dict(
- help=('Type of GCE/EC2 instance(s) to launch \n'
- ' GCE - e.g. n1-standard-1, n1-highcpu-4, n1-highmem-4'
- ' -- See'
- ' https://cloud.google.com/compute/docs/machine-types\n'
- ' EC2 - e.g. m1.medium, c3.xlarge, r3.xlarge '
- ' -- See http://aws.amazon.com/ec2/instance-types/'),
- )),
- ],
- ),
- jobconf=dict(
- combiner=combine_jobconfs,
- switches=[
- (['-D', '--jobconf'], dict(
- action=_KeyValueAction,
- help=('passed through to hadoop streaming as -D and to Spark'
- ' as --conf. Should take the form KEY=VALUE'),
- )),
- ],
- ),
- label=dict(
- cloud_role='launch',
- switches=[
- (['--label'], dict(
- help='Alternate label for the job, to help us identify it.',
- )),
- ],
- ),
- libjars=dict(
- combiner=combine_path_lists,
- switches=[
- (['--libjars'], dict(
- action=_AppendCommaSeparatedItemsAction,
- help=('Paths of JARs to pass to Hadoop with -libjars,'
- ' separated by commas. On EMR,'
- ' these can also be URIs; use file:/// to'
- ' reference JARs already on the EMR cluster.')
- )),
- ],
- ),
- local_tmp_dir=dict(
- combiner=combine_paths,
- switches=[
- (['--local-tmp-dir'], dict(
- help='temp directory on local filesystem',
- )),
- ],
- ),
- master_instance_bid_price=dict(
- cloud_role='launch',
- switches=[
- (['--master-instance-bid-price'], dict(
- help=('Bid price to specify for the master node when'
- ' setting it up as an EC2 spot instance (you probably'
- ' only want to do this for task instances).'),
- )),
- ],
- ),
- master_instance_config=dict(
- cloud_role='launch',
- switches=[
- (['--master-instance-config'], dict(
- action=_JSONAction,
- help=('detailed JSON dict of configs for the master instance'
- ' on Dataproc including disk config. For format, see'
- ' https://cloud.google.com/dataproc/docs/reference/rest'
- '/v1/projects.regions.clusters#InstanceGroupConfig'
- ' (except that fields in your JSON should use'
- ' snake_case, not camelCase).')
- )),
- ],
- ),
- master_instance_type=dict(
- cloud_role='launch',
- switches=[
- (['--master-instance-type'], dict(
- help='Type of GCE/EC2 master instance to launch',
- )),
- ],
- ),
- max_mins_idle=dict(
- cloud_role='launch',
- switches=[
- (['--max-mins-idle'], dict(
- help=("If we create a cluster, have it automatically"
- " terminate itself after it's been idle this many"
- " minutes"),
- type=float,
- )),
- ],
- ),
- # Spark runner only, only passed in on the command line (see #2040)
- max_output_files=dict(
- switches=[
- (['--max-output-files'], dict(
- help=('Maximum number of output files when running a'
- ' streaming job on Spark; just runs rdd.coalesce()'
- ' before outputting files'),
- type=int,
- )),
- ],
- ),
- network=dict(
- cloud_role='launch',
- switches=[
- (['--network'], dict(
- help=('URI of Google Compute Engine network to launch cluster'
- " in. Can't be used with --subnet."),
- )),
- ],
- ),
- num_core_instances=dict(
- cloud_role='launch',
- switches=[
- (['--num-core-instances'], dict(
- help='Total number of core instances to launch',
- type=int,
- )),
- ],
- ),
- num_task_instances=dict(
- cloud_role='launch',
- switches=[
- (['--num-task-instances'], dict(
- help='Total number of task instances to launch',
- type=int,
- )),
- ],
- ),
- num_cores=dict(
- cloud_role='launch',
- switches=[
- (['--num-cores'], dict(
- help='Total number of cores to use while running in local mode',
- type=int,
- )),
- ],
- ),
- owner=dict(
- cloud_role='launch',
- switches=[
- (['--owner'], dict(
- help='User who ran the job (default is the current user)',
- )),
- ],
- ),
- pool_clusters=dict(
- cloud_role='launch',
- switches=[
- (['--pool-clusters'], dict(
- action='store_true',
- help=('Add to an existing cluster or create a new one that'
- ' does not terminate when the job completes.'),
- )),
- (['--no-pool-clusters'], dict(
- action='store_false',
- help="Don't run job on a pooled cluster (the default)",
- )),
- ],
- ),
- pool_name=dict(
- cloud_role='launch',
- switches=[
- (['--pool-name'], dict(
- help='Specify a pool name to join. Default is "default"',
- )),
- ],
- ),
- pool_wait_minutes=dict(
- switches=[
- (['--pool-wait-minutes'], dict(
- help=('Wait this many minutes for a pooled cluster to become'
- ' available; if one frees up, run our job on it. Otherwise'
- " create a new cluster. (0, the default, means don't wait)"),
- type=int,
- )),
- ],
- ),
- project_id=dict(
- cloud_role='connect',
- deprecated_aliases=['gcp_project'],
- switches=[
- (['--project-id'], dict(
- deprecated_aliases=['--gcp-project'],
- help=('Project to use when connecting to Google Cloud Services'
- ' and to run Cloud Dataproc jobs in')
- )),
- ],
- ),
- py_files=dict(
- combiner=combine_path_lists,
- switches=[
- (['--py-files'], dict(
- action=_AppendCommaSeparatedItemsAction,
- help=('.zip or .egg files to add to PYTHONPATH,'
- ' separated by commas'),
- )),
- ],
- ),
- python_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--python-bin'], dict(
- help=('Alternate python command. You can include arguments,'
- ' e.g. --python-bin "python -v"'),
- )),
- ],
- ),
- read_logs=dict(
- switches=[
- (['--read-logs'], dict(
- action='store_true',
- help=('Parse logs generated by the job to get counters and'
- ' cause of error (the default).')
- )),
- (['--no-read-logs'], dict(
- action='store_false',
- help="Don't list or read logs generated by the job."
- )),
- ],
- ),
- region=dict(
- cloud_role='connect',
- switches=[
- (['--region'], dict(
- help='GCE/AWS region to run Dataproc/EMR jobs in.',
- )),
- ],
- ),
- release_label=dict(
- cloud_role='launch',
- switches=[
- (['--release-label'], dict(
- help=('Release Label (e.g. "emr-4.0.0"). Overrides'
- ' --image-version'),
- )),
- ],
- ),
- s3_endpoint=dict(
- cloud_role='connect',
- switches=[
- (['--s3-endpoint'], dict(
- help=("Force mrjob to connect to S3 on this endpoint (e.g."
- " s3-us-west-1.amazonaws.com). You usually shouldn't"
- " set this; by default mrjob will choose the correct"
- " endpoint for each S3 bucket based on its location."),
- )),
- ],
- ),
- s3_region=dict(
- cloud_role='connect',
- switches=[
- (['--s3-region'], dict(
- help='AWS region to create s3 buckets in',
- )),
- ],
- ),
- service_account=dict(
- cloud_role='launch',
- switches=[
- (['--service-account'], dict(
- help=('Service account to use when creating a Dataproc'
- ' cluster. Usually takes the form'
- ' [account_id]@[project_id].iam.gserviceaccount.com.'
- ' Set to "" to use the default.'),
- )),
- ],
- ),
- service_account_scopes=dict(
- cloud_role='launch',
- switches=[
- (['--service-account-scopes'], dict(
- action=_CommaSeparatedListAction,
- help=("A comma-separated list of service account scopes"
- " on Dataproc, used to limit your cluster's access."
- " For each scope, you can specify the"
- " full URI or just the name (e.g. 'logging.write')"),
- )),
- ],
- ),
- setup=dict(
- combiner=combine_lists,
- switches=[
- (['--setup'], dict(
- action='append',
- help=('A command to run before each mapper/reducer step in the'
- ' shell ("touch foo"). You may interpolate files'
- ' available via URL or on your local filesystem using'
- ' Hadoop Distributed Cache syntax (". setup.sh#"). To'
- ' interpolate archives, use #/: "cd foo.tar.gz#/; make"'),
- )),
- ],
- ),
- sh_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--sh-bin'], dict(
- help=('Alternate shell command for setup scripts. You may'
- ' include arguments, e.g. --sh-bin "bash -ex"'),
- )),
- ],
- ),
- skip_internal_protocol=dict(
- switches=[
- (['--skip-internal-protocol'], dict(
- action='store_true',
- help=("Don't use the job's internal protocol to communicate"
- " between tasks internal to the job, instead relying"
- " on Spark to encode and decode raw data structures.")
- )),
- (['--no-skip-internal-protocol'], dict(
- action='store_false',
- help='Use internal protocols as usual',
- )),
- ],
- ),
- sort_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--sort-bin'], dict(
- help=('Alternate shell command for the external sort binary.'
- ' You may include arguments, e.g. --sort-bin "sort -r"')
- )),
- ],
- ),
- spark_args=dict(
- combiner=combine_lists,
- switches=[
- (['--spark-args'], dict(
- action=_AppendArgsAction,
- help=('One or more arguments to pass to spark-submit'
- ' (e.g. --spark-args="--properties-file my.conf").'),
- )),
- ],
- ),
- spark_deploy_mode=dict(
- switches=[
- (['--spark-deploy-mode'], dict(
- help=('--deploy-mode argument to spark-submit (e.g.'
- ' "cluster". Default is "client"'),
- )),
- ]
- ),
- spark_master=dict(
- switches=[
- (['--spark-master'], dict(
- help=('--master argument to spark-submit (e.g. '
- 'spark://host:port, local). Default is "yarn"'),
- )),
- ],
- ),
- spark_submit_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--spark-submit-bin'], dict(
- help='spark-submit binary. You may include arguments.'
- )),
- ],
- ),
- spark_tmp_dir=dict(
- cloud_role='launch',
- combiner=combine_paths,
- switches=[
- (['--spark-tmp-dir'], dict(
- help=('optional URI visible to Spark executors to use as our'
- ' temp directory.'),
- )),
- ],
- ),
- ssh_add_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--ssh-add-bin'], dict(
- help=("Name/path of ssh-add binary. Arguments are allowed"
- " (e.g. --ssh-bin 'ssh-add -v')"),
- )),
- ],
- ),
- ssh_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--ssh-bin'], dict(
- help=("Name/path of ssh binary. Arguments are allowed (e.g."
- " --ssh-bin 'ssh -v')"),
- )),
- ],
- ),
- ssh_bind_ports=dict(
- switches=[
- (['--ssh-bind-ports'], dict(
- action=_PortRangeAction,
- help=('A list of port ranges that are safe to listen on,'
- ' delimited by colons and commas, with syntax like'
- ' 2000[:2001][,2003,2005:2008,etc].'
- ' Defaults to 40001:40840.'),
- )),
- ],
- ),
- ssh_tunnel=dict(
- switches=[
- (['--ssh-tunnel'], dict(
- action='store_true',
- help=('Open an SSH tunnel to the Hadoop job tracker/resource'
- ' manager'),
- )),
- (['--no-ssh-tunnel'], dict(
- action='store_false',
- help=("Don't open an SSH tunnel to the Hadoop job"
- " tracker/resource manager (the default)"),
- )),
- ],
- ),
- ssh_tunnel_is_open=dict(
- switches=[
- (['--ssh-tunnel-is-open'], dict(
- action='store_true',
- help=('Make ssh tunnel accessible from remote hosts (not just'
- ' localhost)'),
- )),
- (['--ssh-tunnel-is-closed'], dict(
- action='store_false',
- help=('Make ssh tunnel accessible from localhost only (the'
- ' default)'),
- )),
- ],
- ),
- subnet=dict(
- cloud_role='launch',
- switches=[
- (['--subnet'], dict(
- help=('ID of Amazon VPC subnet/URI of Google Compute Engine'
- ' subnetwork to launch cluster in.'),
- )),
- (['--subnets'], dict(
- action=_CommaSeparatedListAction,
- help=('Like --subnet, but with a comma-separated list, to'
- ' specify multiple subnets in conjunction with'
- ' --instance-fleets (EMR only)'),
- )),
- ],
- ),
- tags=dict(
- cloud_role='launch',
- combiner=combine_dicts,
- switches=[
- (['--tag'], dict(
- action=_KeyValueAction,
- help=('Metadata tags to apply to the EMR cluster; '
- 'should take the form KEY=VALUE. You can use --tag '
- 'multiple times'),
- )),
- ],
- ),
- task_instance_bid_price=dict(
- cloud_role='launch',
- switches=[
- (['--task-instance-bid-price'], dict(
- help=('Bid price to specify for task nodes when'
- ' setting them up as EC2 spot instances'),
- )),
- ],
- ),
- task_instance_config=dict(
- cloud_role='launch',
- switches=[
- (['--task-instance-config'], dict(
- action=_JSONAction,
- help=('detailed JSON dict of configs for the task'
- ' (secondary worker) instances'
- ' on Dataproc including disk config. For format, see'
- ' https://cloud.google.com/dataproc/docs/reference/rest'
- '/v1/projects.regions.clusters#InstanceGroupConfig'
- ' (except that fields in your JSON should use'
- ' snake_case, not camelCase).')
- )),
- ],
- ),
- task_instance_type=dict(
- cloud_role='launch',
- switches=[
- (['--task-instance-type'], dict(
- help='Type of GCE/EC2 task instance(s) to launch',
- )),
- ],
- ),
- task_python_bin=dict(
- combiner=combine_cmds,
- switches=[
- (['--task-python-bin'], dict(
- help=('Name/path of alternate python command to use to'
- " run tasks (e.g. mappers); doesn't affect setup"
- ' wrapper scripts. Defaults to'
- ' current Python interpreter.'),
- )),
- ],
- ),
- upload_archives=dict(
- combiner=combine_path_lists,
- switches=[
- (['--archives'], dict(
- action=_AppendCommaSeparatedItemsAction,
- help=('Archives to unpack in the working directory of the'
- ' script, separated by commas. Use "#" to assign a'
- ' different name to each directory (e.g. '
- '"foo-libs.zip#lib,bar.tar.gz#bar")'),
- )),
- ],
- ),
- upload_dirs=dict(
- combiner=combine_path_lists,
- switches=[
- (['--dirs'], dict(
- action=_AppendCommaSeparatedItemsAction,
- help=('Directories to tarball and unpack in the working'
- ' directory of the script, separated by commas. Append'
- ' #<name> to each directory to assign a different name'
- ' (e.g. "foo#lib,bar#local-bar")'),
- )),
- ],
- ),
- upload_files=dict(
- combiner=combine_path_lists,
- switches=[
- (['--files'], dict(
- action=_AppendCommaSeparatedItemsAction,
- help=('Files to copy to the working directory of the script,'
- ' separated by commas. Use "#"'
- ' to assign a different name to each file (e.g. '
- '"foo.db#bar.db")'),
- )),
- ],
- ),
- zone=dict(
- cloud_role='launch',
- switches=[
- (['--zone'], dict(
- help=('GCE zone/AWS availability zone to run Dataproc/EMR jobs'
- ' in.'),
- )),
- ],
- ),
- )
- def _combiners(opt_names, runner_alias=None):
- return {
- name: config['combiner']
- for name, config in _RUNNER_OPTS.items()
- if name in opt_names and 'combiner' in config
- }
- def _deprecated_aliases(opt_names):
- results = {}
- for name, config in _RUNNER_OPTS.items():
- if name not in opt_names:
- continue
- if config.get('deprecated_aliases'):
- for alias in config['deprecated_aliases']:
- results[alias] = name
- return results
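- # For example, given the deprecated_aliases entries in _RUNNER_OPTS above:
- #
- #     >>> sorted(_deprecated_aliases(
- #     ...     {'project_id', 'cloud_part_size_mb'}).items())
- #     [('cloud_upload_part_size', 'cloud_part_size_mb'), ('gcp_project', 'project_id')]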
- def _filter_by_role(opt_names, *cloud_roles):
- return {
- opt_name
- for opt_name, conf in _RUNNER_OPTS.items()
- if opt_name in opt_names and conf.get('cloud_role') in cloud_roles
- }
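- # For example, given the cloud_role values in _RUNNER_OPTS above ('region' is
- # 'connect', 'image_version' is 'launch', and 'cmdenv' has no cloud_role):
- #
- #     >>> _filter_by_role({'region', 'image_version', 'cmdenv'}, 'connect')
- #     {'region'}
- #     >>> sorted(_filter_by_role(
- #     ...     {'region', 'image_version', 'cmdenv'}, 'connect', 'launch'))
- #     ['image_version', 'region']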
- def _add_runner_args(parser, opt_names=None, include_deprecated=True,
- customize_switches=None, suppress_switches=None):
- """add switches for the given runner opts to the given
- ArgumentParser, alphabetically by destination. If *opt_names* is
- None, include all runner opts."""
- if opt_names is None:
- opt_names = set(_RUNNER_OPTS)
- for opt_name in sorted(opt_names):
- _add_runner_args_for_opt(
- parser, opt_name,
- include_deprecated=include_deprecated,
- customize_switches=customize_switches,
- suppress_switches=suppress_switches
- )
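- # Sketch of typical usage (the option names and values are chosen arbitrarily
- # for illustration); dests are the runner opt names, defaults are None or []:
- #
- #     >>> parser = ArgumentParser()
- #     >>> _add_runner_args(parser, {'cmdenv', 'instance_type'})
- #     >>> args = parser.parse_args(
- #     ...     ['--cmdenv', 'TZ=UTC', '--instance-type', 'm5.xlarge'])
- #     >>> args.instance_type
- #     'm5.xlarge'
- #     >>> args.cmdenv
- #     {'TZ': 'UTC'}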
- def _add_runner_args_for_opt(parser, opt_name, include_deprecated=True,
- customize_switches=None, suppress_switches=None):
- """Add switches for a single option (*opt_name*) to the given parser."""
- if customize_switches is None:
- customize_switches = {}
- if suppress_switches is None:
- suppress_switches = set()
- conf = _RUNNER_OPTS[opt_name]
- if conf.get('deprecated') and not include_deprecated:
- return
- switches = conf.get('switches') or []
- def suppressed(switches):
- return any(sw in suppress_switches for sw in switches)
- for args, kwargs in switches:
- kwargs = dict(kwargs)
- # allow customization
- for switch in args:
- if switch in customize_switches:
- kwargs.update(customize_switches[switch])
- deprecated_aliases = kwargs.pop('deprecated_aliases', None)
- deprecated = kwargs.pop('deprecated', False)
- # add this switch
- if (include_deprecated or not deprecated) and not suppressed(args):
- kwargs['dest'] = opt_name
- if kwargs.get('action') == 'append':
- kwargs['default'] = []
- else:
- kwargs['default'] = None
- parser.add_argument(*args, **kwargs)
- # add a switch for deprecated aliases
- if (deprecated_aliases and include_deprecated and
- not suppressed(deprecated_aliases)):
- help = 'Deprecated alias%s for %s' % (
- ('es' if len(deprecated_aliases) > 1 else ''),
- args[-1])
- parser.add_argument(
- *deprecated_aliases,
- **combine_dicts(kwargs, dict(help=help)))
- ### non-runner switches ###
- def _add_basic_args(parser):
- """Switches for all command line tools"""
- parser.add_argument(
- '-c', '--conf-path', dest='conf_paths',
- action='append',
- help='Path to alternate mrjob.conf file to read from')
- parser.add_argument(
- '--no-conf', dest='conf_paths', action='store_const', const=[],
- help="Don't load mrjob.conf even if it's available")
- parser.add_argument(
- '-q', '--quiet', dest='quiet', default=None,
- action='store_true',
- help="Don't print anything to stderr")
- parser.add_argument(
- '-v', '--verbose', dest='verbose', default=None,
- action='store_true', help='print more messages to stderr')
- def _add_job_args(parser, include_deprecated=True, include_steps=True):
- parser.add_argument(
- '--cat-output', dest='cat_output',
- default=None, action='store_true',
- help="Stream job output to stdout")
- parser.add_argument(
- '--no-cat-output', dest='cat_output',
- default=None, action='store_false',
- help="Don't stream job output to stdout")
- if include_deprecated:
- …