
/mrjob/options.py

http://github.com/Yelp/mrjob
Python | 1638 lines | 1529 code | 35 blank | 74 comment | 41 complexity


  1. # -*- coding: utf-8 -*-
  2. # Copyright 2009-2016 Yelp and Contributors
  3. # Copyright 2017 Yelp
  4. # Copyright 2018 Yelp, Google, Inc., and Contributors
  5. # Copyright 2019 Yelp
  6. #
  7. # Licensed under the Apache License, Version 2.0 (the "License");
  8. # you may not use this file except in compliance with the License.
  9. # You may obtain a copy of the License at
  10. #
  11. # http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18. """Functions to populate py:class:`~argparse.ArgumentParser``
  19. objects with categorized command line parameters.
  20. """
  21. from __future__ import print_function
  22. import json
  23. import re
  24. from argparse import Action
  25. from argparse import ArgumentParser
  26. from argparse import SUPPRESS
  27. from logging import getLogger
  28. from mrjob.conf import combine_cmds
  29. from mrjob.conf import combine_dicts
  30. from mrjob.conf import combine_envs
  31. from mrjob.conf import combine_jobconfs
  32. from mrjob.conf import combine_lists
  33. from mrjob.conf import combine_paths
  34. from mrjob.conf import combine_path_lists
  35. from mrjob.parse import _parse_port_range_list
  36. from mrjob.util import shlex_split
  37. log = getLogger(__name__)
  38. #: cleanup options:
  39. #:
  40. #: * ``'ALL'``: delete logs and local and remote temp files; stop cluster
  41. #: if on EMR and the job is not done when cleanup is run.
  42. #: * ``'CLOUD_TMP'``: delete temp files on cloud storage (e.g. S3) only
  43. #: * ``'CLUSTER'``: terminate the cluster if on EMR and the job is not done
  44. #: on cleanup
  45. #: * ``'HADOOP_TMP'``: delete temp files on HDFS only
  46. #: * ``'JOB'``: stop job if on EMR and the job is not done when cleanup runs
  47. #: * ``'LOCAL_TMP'``: delete local temp files only
  48. #: * ``'LOGS'``: delete logs only
  49. #: * ``'NONE'``: delete nothing
  50. #: * ``'TMP'``: delete local, HDFS, and cloud storage temp files, but not logs
  51. CLEANUP_CHOICES = [
  52. 'ALL',
  53. 'CLOUD_TMP',
  54. 'CLUSTER',
  55. 'HADOOP_TMP',
  56. 'JOB',
  57. 'LOCAL_TMP',
  58. 'LOGS',
  59. 'NONE',
  60. 'TMP',
  61. ]
  62. # used to identify malformed JSON
  63. _PROBABLY_JSON_RE = re.compile(r'^\s*[\{\[\"].*$')
  64. # names of runners
  65. _RUNNER_ALIASES = {
  66. 'dataproc',
  67. 'emr',
  68. 'hadoop',
  69. 'inline',
  70. 'local',
  71. 'spark',
  72. }
  73. ### custom actions ###
  74. def _default_to(namespace, dest, value):
  75. """Helper function; set the given attribute to *value* if it's None."""
  76. if getattr(namespace, dest) is None:
  77. setattr(namespace, dest, value)
  78. # these actions are only used by _add_runner_args(), so we can assume *value*
  79. # is a string
  80. class _KeyValueAction(Action):
  81. """action for KEY=VALUE pairs"""
  82. # used for --cmdenv, --jobconf, and more
  83. def __call__(self, parser, namespace, value, option_string=None):
  84. try:
  85. k, v = value.split('=', 1)
  86. except ValueError:
  87. parser.error('%s argument %r is not of the form KEY=VALUE' % (
  88. option_string, value))
  89. _default_to(namespace, self.dest, {})
  90. getattr(namespace, self.dest)[k] = v
  91. class _KeyNoneValueAction(Action):
  92. """action to set KEY to None"""
  93. def __call__(self, parser, namespace, value, option_string=None):
  94. _default_to(namespace, self.dest, {})
  95. getattr(namespace, self.dest)[value] = None
  96. class _CleanupAction(Action):
  97. """action to parse a comma-separated list of cleanup constants."""
  98. def __call__(self, parser, namespace, value, option_string=None):
  99. result = []
  100. for choice in value.split(','):
  101. if choice in CLEANUP_CHOICES:
  102. result.append(choice)
  103. else:
  104. parser.error(
  105. '%s got %s, which is not one of: %s' %
  106. (option_string, choice, ', '.join(CLEANUP_CHOICES)))
  107. if 'NONE' in result and len(set(result)) > 1:
  108. parser.error(
  109. '%s: Cannot clean up both nothing and something!' %
  110. option_string)
  111. setattr(namespace, self.dest, result)
  112. class _CommaSeparatedListAction(Action):
  113. """action to parse a comma-separated list of subnets.
  114. This eliminates whitespace
  115. """
  116. def __call__(self, parser, namespace, value, option_string=None):
  117. items = [s.strip() for s in value.split(',') if s]
  118. setattr(namespace, self.dest, items)
  119. class _AppendCommaSeparatedItemsAction(Action):
  120. """action to parse a comma-separated list and append
  121. each item to an existing list,
  122. stripping surrounding whitespace.
  123. """
  124. def __call__(self, parser, namespace, value, option_string=None):
  125. _default_to(namespace, self.dest, [])
  126. items = [s.strip() for s in value.split(',') if s]
  127. getattr(namespace, self.dest).extend(items)
  128. class _AppendArgsAction(Action):
  129. """action to parse one or more arguments and append them to a list."""
  130. def __call__(self, parser, namespace, value, option_string=None):
  131. _default_to(namespace, self.dest, [])
  132. args = shlex_split(value)
  133. getattr(namespace, self.dest).extend(args)
  134. class _AppendJSONAction(Action):
  135. """action to parse JSON and append it to a list."""
  136. def __call__(self, parser, namespace, value, option_string=None):
  137. _default_to(namespace, self.dest, [])
  138. try:
  139. j = json.loads(value)
  140. except ValueError as e:
  141. parser.error('Malformed JSON passed to %s: %s' % (
  142. option_string, str(e)))
  143. getattr(namespace, self.dest).append(j)
  144. class _KeyJSONValueAction(Action):
  145. """action for KEY=<json> pairs. Allows value to be a string, as long
  146. as it doesn't start with ``[``, ``{``, or ``"``."""
  147. # used for --extra-cluster-param
  148. def __call__(self, parser, namespace, value, option_string=None):
  149. try:
  150. k, v = value.split('=', 1)
  151. except ValueError:
  152. parser.error('%s argument %r is not of the form KEY=VALUE' % (
  153. option_string, value))
  154. try:
  155. v = json.loads(v)
  156. except ValueError:
  157. if _PROBABLY_JSON_RE.match(v):
  158. parser.error('%s argument %r is not valid JSON' % (
  159. option_string, value))
  160. _default_to(namespace, self.dest, {})
  161. getattr(namespace, self.dest)[k] = v
  162. class _JSONAction(Action):
  163. """action to parse a JSON"""
  164. def __call__(self, parser, namespace, value, option_string=None):
  165. try:
  166. j = json.loads(value)
  167. except ValueError as e:
  168. parser.error('Malformed JSON passed to %s: %s' % (
  169. option_string, str(e)))
  170. setattr(namespace, self.dest, j)
  171. class _PortRangeAction(Action):
  172. """action to parse --ssh-bind-ports"""
  173. def __call__(self, parser, namespace, value, option_string=None):
  174. try:
  175. ports = _parse_port_range_list(value)
  176. except ValueError as e:
  177. parser.error('%s: invalid port range list %r: \n%s' %
  178. (option_string, value, e.args[0]))
  179. setattr(namespace, self.dest, ports)
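
A quick illustration of how these custom actions plug into argparse. This is a minimal sketch, not part of options.py; it assumes mrjob is installed so that the private helpers above are importable.

    from argparse import ArgumentParser
    from mrjob.options import _CleanupAction, _KeyValueAction

    parser = ArgumentParser()
    # repeated KEY=VALUE switches accumulate into a single dict
    parser.add_argument('--cmdenv', dest='cmdenv', action=_KeyValueAction)
    # comma-separated cleanup choices are validated against CLEANUP_CHOICES
    parser.add_argument('--cleanup', dest='cleanup', action=_CleanupAction)

    args = parser.parse_args(
        ['--cmdenv', 'TZ=UTC', '--cmdenv', 'LANG=C', '--cleanup', 'TMP,LOGS'])
    print(args.cmdenv)   # {'TZ': 'UTC', 'LANG': 'C'}
    print(args.cleanup)  # ['TMP', 'LOGS']
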
  180. ### mux opts ###
  181. # these are used by MRJob to determine what part of a job to run
  182. #
  183. # this just maps dest to the args and kwargs to ArgumentParser.add_argument
  184. # (minus the dest keyword arg)
  185. _STEP_OPTS = dict(
  186. run_combiner=(
  187. ['--combiner'],
  188. dict(
  189. action='store_true',
  190. help='run a combiner',
  191. ),
  192. ),
  193. run_mapper=(
  194. ['--mapper'],
  195. dict(
  196. action='store_true',
  197. help='run a mapper'
  198. ),
  199. ),
  200. run_reducer=(
  201. ['--reducer'],
  202. dict(
  203. action='store_true',
  204. help='run a reducer',
  205. ),
  206. ),
  207. run_spark=(
  208. ['--spark'],
  209. dict(
  210. action='store_true',
  211. help='run Spark code',
  212. ),
  213. ),
  214. step_num=(
  215. ['--step-num'],
  216. dict(
  217. type=int,
  218. default=0,
  219. help='which step to execute (default is 0)',
  220. ),
  221. ),
  222. )
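
mrjob consumes _STEP_OPTS elsewhere; the sketch below is only a hypothetical illustration of how a table like this expands into add_argument() calls, with the dest taken from each key and the args and kwargs from each value.

    from argparse import ArgumentParser
    from mrjob.options import _STEP_OPTS  # private table defined above

    parser = ArgumentParser()
    for dest, (args, kwargs) in sorted(_STEP_OPTS.items()):
        parser.add_argument(*args, dest=dest, **kwargs)

    opts = parser.parse_args(['--mapper', '--step-num', '2'])
    print(opts.run_mapper, opts.step_num)  # True 2
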
  223. # don't show these unless someone types --help -v --deprecated
  224. _DEPRECATED_STEP_OPTS = set() # none at the moment
  225. # don't show these unless someone types --help --deprecated
  226. _DEPRECATED_NON_RUNNER_OPTS = {'deprecated'}
  227. ### runner opts ###
  228. # map from runner option name to dict with the following keys (all optional):
  229. # cloud_role:
  230. # 'connect' if needed when interacting with cloud services at all
  231. # 'launch' if needed when creating a new cluster
  232. # (cloud runner options with no cloud role are only needed when running jobs)
  233. # combiner: combiner func from mrjob.conf used to combine option values.
  234. # (if left blank, we use combine_values())
  235. # deprecated: if true, this option is deprecated and slated for removal
  236. # deprecated_aliases: list of old names for this option slated for removal
  237. # switches: list of switches to add to ArgumentParser for this option. Items
  238. # have the format (['--switch-names', ...], dict(**kwargs)), where kwargs
  239. # can be:
  240. # action -- action to pass to add_argument() (e.g. 'store_true')
  241. # deprecated -- if True, this switch is deprecated and slated for removal
  242. # deprecated_aliases -- list of old '--switch-names' slated for removal
  243. # help -- help string to pass to add_argument()
  244. # type -- option type for add_argument() to enforce (e.g. float).
  245. # You can't set the ArgumentParser's default; we use [] if *action* is
  246. # 'append' and None otherwise.
  247. #
  248. # the list of which options apply to which runner is in the runner class
  249. # itself (e.g. EMRJobRunner.OPT_NAMES)
  250. _RUNNER_OPTS = dict(
  251. additional_emr_info=dict(
  252. cloud_role='launch',
  253. switches=[
  254. (['--additional-emr-info'], dict(
  255. help='A JSON string for selecting additional features on EMR',
  256. )),
  257. ],
  258. ),
  259. applications=dict(
  260. cloud_role='launch',
  261. combiner=combine_lists,
  262. switches=[
  263. (['--applications', '--application'], dict(
  264. action=_AppendCommaSeparatedItemsAction,
  265. help=('Additional applications to run on 4.x and 5.x'
  266. ' AMIs, separated by commas (e.g.'
  267. ' "Ganglia,Spark")'),
  268. )),
  269. ],
  270. ),
  271. aws_access_key_id=dict(
  272. cloud_role='connect',
  273. ),
  274. aws_secret_access_key=dict(
  275. cloud_role='connect',
  276. ),
  277. aws_session_token=dict(
  278. cloud_role='connect',
  279. ),
  280. bootstrap=dict(
  281. cloud_role='launch',
  282. combiner=combine_lists,
  283. switches=[
  284. (['--bootstrap'], dict(
  285. action='append',
  286. help=('A shell command to set up libraries etc. before any'
  287. ' steps (e.g. "sudo apt-get -qy install python3"). You'
  288. ' may interpolate files available via URL or locally'
  289. ' with Hadoop Distributed Cache syntax'
  290. ' ("sudo yum install -y foo.rpm#")'),
  291. )),
  292. ],
  293. ),
  294. bootstrap_actions=dict(
  295. cloud_role='launch',
  296. combiner=combine_lists,
  297. switches=[
  298. (['--bootstrap-action'], dict(
  299. action='append',
  300. help=('Raw bootstrap action scripts to run before any of the'
  301. ' other bootstrap steps. You can use --bootstrap-action'
  302. ' more than once. Local scripts will be automatically'
  303. ' uploaded to S3. To add arguments, just use quotes:'
  304. ' "foo.sh arg1 arg2"'),
  305. )),
  306. ],
  307. ),
  308. bootstrap_mrjob=dict(
  309. cloud_role='launch',
  310. switches=[
  311. (['--bootstrap-mrjob'], dict(
  312. action='store_true',
  313. help=("Automatically zip up the mrjob library and install it"
  314. " when we run the mrjob. This is the default. Use"
  315. " --no-bootstrap-mrjob if you've already installed"
  316. " mrjob on your Hadoop cluster."),
  317. )),
  318. (['--no-bootstrap-mrjob'], dict(
  319. action='store_false',
  320. help=("Don't automatically zip up the mrjob library and"
  321. " install it when we run this job. Use this if you've"
  322. " already installed mrjob on your Hadoop cluster."),
  323. )),
  324. ],
  325. ),
  326. bootstrap_python=dict(
  327. cloud_role='launch',
  328. switches=[
  329. (['--bootstrap-python'], dict(
  330. action='store_true',
  331. help=('Attempt to install a compatible version of Python'
  332. ' at bootstrap time. Currently this only does anything'
  333. ' for Python 3, for which it is enabled by default.'),
  334. )),
  335. (['--no-bootstrap-python'], dict(
  336. action='store_false',
  337. help=("Don't automatically try to install a compatible version"
  338. " of Python at bootstrap time."),
  339. )),
  340. ],
  341. ),
  342. bootstrap_spark=dict(
  343. cloud_role='launch',
  344. switches=[
  345. (['--bootstrap-spark'], dict(
  346. action='store_true',
  347. help="Auto-install Spark on the cluster (even if not needed)."
  348. )),
  349. (['--no-bootstrap-spark'], dict(
  350. action='store_false',
  351. help="Don't auto-install Spark on the cluster."
  352. )),
  353. ],
  354. ),
  355. check_input_paths=dict(
  356. switches=[
  357. (['--check-input-paths'], dict(
  358. action='store_true',
  359. help='Check input paths exist before running (the default)',
  360. )),
  361. (['--no-check-input-paths'], dict(
  362. action='store_false',
  363. help='Skip the checks to ensure all input paths exist',
  364. )),
  365. ],
  366. ),
  367. check_cluster_every=dict(
  368. switches=[
  369. (['--check-cluster-every'], dict(
  370. help=('How often (in seconds) to check status of your'
  371. ' job/cluster'),
  372. type=float,
  373. )),
  374. ],
  375. ),
  376. cleanup=dict(
  377. switches=[
  378. (['--cleanup'], dict(
  379. action=_CleanupAction,
  380. help=('Comma-separated list of which directories to delete'
  381. ' when a job succeeds, e.g. TMP,LOGS. Choices:'
  382. ' %s (default: ALL)' % ', '.join(CLEANUP_CHOICES)),
  383. )),
  384. ],
  385. ),
  386. cleanup_on_failure=dict(
  387. switches=[
  388. (['--cleanup-on-failure'], dict(
  389. action=_CleanupAction,
  390. help=('Comma-separated list of which directories to delete'
  391. ' when a job fails, e.g. TMP,LOGS. Choices:'
  392. ' %s (default: NONE)' % ', '.join(CLEANUP_CHOICES)),
  393. )),
  394. ],
  395. ),
  396. cloud_fs_sync_secs=dict(
  397. cloud_role='launch',
  398. switches=[
  399. (['--cloud-fs-sync-secs'], dict(
  400. help=('How long to wait for remote FS to reach eventual'
  401. ' consistency. This'
  402. ' is typically less than a second but the'
  403. ' default is 5.0 to be safe.'),
  404. type=float,
  405. )),
  406. ],
  407. ),
  408. cloud_log_dir=dict(
  409. cloud_role='launch',
  410. combiner=combine_paths,
  411. switches=[
  412. (['--cloud-log-dir'], dict(
  413. help='URI on remote FS to write logs into',
  414. )),
  415. ],
  416. ),
  417. cloud_tmp_dir=dict(
  418. cloud_role='launch',
  419. combiner=combine_paths,
  420. switches=[
  421. (['--cloud-tmp-dir'], dict(
  422. help='URI on remote FS to use as our temp directory.',
  423. )),
  424. ],
  425. ),
  426. cloud_part_size_mb=dict(
  427. cloud_role='launch',
  428. deprecated_aliases=['cloud_upload_part_size'],
  429. switches=[
  430. (['--cloud-part-size-mb'], dict(
  431. deprecated_aliases=['--cloud-upload-part-size'],
  432. help=('Upload files to cloud FS in parts no bigger than this'
  433. ' many megabytes. Default is 100 MiB. Set to 0 to'
  434. ' disable multipart uploading entirely.'),
  435. type=float,
  436. )),
  437. ],
  438. ),
  439. cluster_id=dict(
  440. switches=[
  441. (['--cluster-id'], dict(
  442. help='ID of an existing cluster to run our job on',
  443. )),
  444. ],
  445. ),
  446. cluster_properties=dict(
  447. cloud_role='launch',
  448. combiner=combine_dicts,
  449. switches=[
  450. (['--cluster-property'], dict(
  451. action=_KeyValueAction,
  452. help=('Properties to set in Hadoop config files on Dataproc.'
  453. ' Args take the form file_prefix:property=value.'
  454. ' You can use --cluster-property multiple times.'
  455. ' For more info, see'
  456. ' https://cloud.google.com/dataproc/docs/concepts'
  457. '/configuring-clusters/cluster-properties'),
  458. )),
  459. ],
  460. ),
  461. cmdenv=dict(
  462. combiner=combine_envs,
  463. switches=[
  464. (['--cmdenv'], dict(
  465. action=_KeyValueAction,
  466. help=('Set an environment variable for your job inside Hadoop '
  467. 'streaming/Spark. Must take the form KEY=VALUE.'
  468. ' You can use --cmdenv multiple times.'),
  469. )),
  470. ],
  471. ),
  472. core_instance_config=dict(
  473. cloud_role='launch',
  474. switches=[
  475. (['--core-instance-config'], dict(
  476. action=_JSONAction,
  477. help=('detailed JSON dict of configs for the core'
  478. ' (worker) instances'
  479. ' on Dataproc, including disk config. For format, see'
  480. ' https://cloud.google.com/dataproc/docs/reference/rest'
  481. '/v1/projects.regions.clusters#InstanceGroupConfig'
  482. ' (except that fields in your JSON should use'
  483. ' snake_case, not camelCase).')
  484. )),
  485. ],
  486. ),
  487. core_instance_bid_price=dict(
  488. cloud_role='launch',
  489. switches=[
  490. (['--core-instance-bid-price'], dict(
  491. help=('Bid price to specify for core nodes when'
  492. ' setting them up as EC2 spot instances (you probably'
  493. ' only want to do this for task instances).'),
  494. )),
  495. ],
  496. ),
  497. core_instance_type=dict(
  498. cloud_role='launch',
  499. switches=[
  500. (['--core-instance-type'], dict(
  501. help='Type of GCE/EC2 core instance(s) to launch',
  502. )),
  503. ],
  504. ),
  505. ebs_root_volume_gb=dict(
  506. cloud_role='launch',
  507. switches=[
  508. (['--ebs-root-volume-gb'], dict(
  509. help=('Size of root EBS volume, in GiB. Must be an integer.'
  510. ' Set to 0 to use the default'
  511. type=int,
  512. )),
  513. ],
  514. ),
  515. ec2_endpoint=dict(
  516. cloud_role='connect',
  517. switches=[
  518. (['--ec2-endpoint'], dict(
  519. help=('Force mrjob to connect to EC2 on this endpoint'
  520. ' (e.g. ec2.us-west-1.amazonaws.com).'
  521. ' Default is to infer this from region.'),
  522. )),
  523. ],
  524. ),
  525. ec2_key_pair=dict(
  526. cloud_role='launch',
  527. switches=[
  528. (['--ec2-key-pair'], dict(
  529. help='Name of the SSH key pair you set up for EMR',
  530. )),
  531. ],
  532. ),
  533. ec2_key_pair_file=dict(
  534. combiner=combine_paths,
  535. switches=[
  536. (['--ec2-key-pair-file'], dict(
  537. help='Path to file containing SSH key for EMR',
  538. )),
  539. ],
  540. ),
  541. emr_action_on_failure=dict(
  542. cloud_role='launch',
  543. switches=[
  544. (['--emr-action-on-failure'], dict(
  545. help=('Action to take when a step fails'
  546. ' (e.g. TERMINATE_CLUSTER, CANCEL_AND_WAIT, CONTINUE)'),
  547. )),
  548. ],
  549. ),
  550. emr_configurations=dict(
  551. cloud_role='launch',
  552. combiner=combine_lists,
  553. switches=[
  554. (['--emr-configuration'], dict(
  555. action=_AppendJSONAction,
  556. help=('Configuration to use on 4.x AMIs as a JSON-encoded'
  557. ' dict; see'
  558. ' http://docs.aws.amazon.com/ElasticMapReduce/latest/'
  559. 'ReleaseGuide/emr-configure-apps.html for examples'),
  560. )),
  561. ],
  562. ),
  563. emr_endpoint=dict(
  564. cloud_role='connect',
  565. switches=[
  566. (['--emr-endpoint'], dict(
  567. help=('Force mrjob to connect to EMR on this endpoint'
  568. ' (e.g. us-west-1.elasticmapreduce.amazonaws.com).'
  569. ' Default is to infer this from region.'),
  570. )),
  571. ],
  572. ),
  573. emulate_map_input_file=dict(
  574. switches=[
  575. (['--emulate-map-input-file'], dict(
  576. action='store_true',
  577. help=("In the first mapper, set $mapreduce_map_input_file to"
  578. " the input file path, like Hadoop would, to support"
  579. " jobs that use"
  580. " jobconf_from_env('mapreduce.map.input.file')."
  581. " Ignored if job sets HADOOP_INPUT_FORMAT."),
  582. )),
  583. (['--no-emulate-map-input-file'], dict(
  584. action='store_false',
  585. help=("Disables setting $mapreduce_map_input_file"),
  586. )),
  587. ],
  588. ),
  589. enable_emr_debugging=dict(
  590. cloud_role='launch',
  591. switches=[
  592. (['--enable-emr-debugging'], dict(
  593. action='store_true',
  594. help='Enable storage of Hadoop logs in SimpleDB',
  595. )),
  596. (['--disable-emr-debugging'], dict(
  597. action='store_false',
  598. help=('Disable storage of Hadoop logs in SimpleDB (the'
  599. ' default)'),
  600. )),
  601. ],
  602. ),
  603. extra_cluster_params=dict(
  604. cloud_role='launch',
  605. combiner=combine_dicts,
  606. switches=[
  607. (['--extra-cluster-param'], dict(
  608. action=_KeyJSONValueAction,
  609. help=('extra parameter to pass to cloud API when creating'
  610. ' a cluster, to access features not currently supported'
  611. ' by mrjob. Takes the form <param>=<value>, where value'
  612. ' is JSON or a string. Use <param>=null to unset a'
  613. ' parameter'),
  614. )),
  615. ],
  616. ),
  617. gcloud_bin=dict(
  618. combiner=combine_cmds,
  619. switches=[
  620. (['--gcloud-bin'], dict(help='path to gcloud binary')),
  621. ],
  622. ),
  623. gcs_region=dict(
  624. cloud_role='connect',
  625. switches=[
  626. (['--gcs-region'], dict(
  627. help='region to create Google Cloud Storage buckets in',
  628. )),
  629. ],
  630. ),
  631. hadoop_bin=dict(
  632. combiner=combine_cmds,
  633. switches=[
  634. (['--hadoop-bin'], dict(help='path to hadoop binary')),
  635. ],
  636. ),
  637. hadoop_extra_args=dict(
  638. combiner=combine_lists,
  639. switches=[
  640. (['--hadoop-args'], dict(
  641. action=_AppendArgsAction,
  642. help=('One or more arguments to pass to the hadoop binary.'
  643. ' (e.g. --hadoop-args="-fs file:///").'),
  644. )),
  645. ],
  646. ),
  647. hadoop_log_dirs=dict(
  648. combiner=combine_path_lists,
  649. switches=[
  650. (['--hadoop-log-dirs'], dict(
  651. action='append',
  652. help=('Directory to search for hadoop logs in. You can use'
  653. ' --hadoop-log-dirs multiple times.'),
  654. )),
  655. ],
  656. ),
  657. hadoop_streaming_jar=dict(
  658. combiner=combine_paths,
  659. switches=[
  660. (['--hadoop-streaming-jar'], dict(
  661. help=('Path of your hadoop streaming jar (locally, or on'
  662. ' S3/HDFS). In EMR, use a file:// URI to refer to a jar'
  663. ' on the master node of your cluster.'),
  664. )),
  665. ],
  666. ),
  667. hadoop_tmp_dir=dict(
  668. combiner=combine_paths,
  669. switches=[
  670. (['--hadoop-tmp-dir'], dict(
  671. help='Temp space on HDFS (default is tmp/mrjob)',
  672. )),
  673. ],
  674. ),
  675. hadoop_version=dict(
  676. switches=[
  677. (['--hadoop-version'], dict(
  678. help='Specific version of Hadoop to simulate',
  679. )),
  680. ],
  681. ),
  682. iam_endpoint=dict(
  683. cloud_role='launch', # not 'connect'; only used to create clusters
  684. switches=[
  685. (['--iam-endpoint'], dict(
  686. help=('Force mrjob to connect to IAM on this endpoint'
  687. ' (e.g. iam.us-gov.amazonaws.com)'),
  688. )),
  689. ],
  690. ),
  691. iam_instance_profile=dict(
  692. cloud_role='launch',
  693. switches=[
  694. (['--iam-instance-profile'], dict(
  695. help=('EC2 instance profile to use for the EMR cluster -- see'
  696. ' "Configure IAM Roles for Amazon EMR" in AWS docs'),
  697. )),
  698. ],
  699. ),
  700. iam_service_role=dict(
  701. cloud_role='launch',
  702. switches=[
  703. (['--iam-service-role'], dict(
  704. help=('IAM service role to use for the EMR cluster -- see'
  705. ' "Configure IAM Roles for Amazon EMR" in AWS docs')
  706. )),
  707. ],
  708. ),
  709. image_id=dict(
  710. cloud_role='launch',
  711. switches=[
  712. (['--image-id'], dict(
  713. help='ID of custom AWS machine image (AMI) to use',
  714. )),
  715. ],
  716. ),
  717. image_version=dict(
  718. cloud_role='launch',
  719. switches=[
  720. (['--image-version'], dict(
  721. help='version of EMR/Dataproc machine image to run',
  722. )),
  723. ],
  724. ),
  725. instance_groups=dict(
  726. cloud_role='launch',
  727. switches=[
  728. (['--instance-groups'], dict(
  729. action=_JSONAction,
  730. help=('detailed JSON list of EMR instance configs, including'
  731. ' EBS configuration. See docs for --instance-groups'
  732. ' at http://docs.aws.amazon.com/cli/latest/reference'
  733. '/emr/create-cluster.html'),
  734. )),
  735. ],
  736. ),
  737. instance_fleets=dict(
  738. cloud_role='launch',
  739. switches=[
  740. (['--instance-fleets'], dict(
  741. action=_JSONAction,
  742. help=('detailed JSON list of instance fleets, including'
  743. ' EBS configuration. See docs for --instance-fleets'
  744. ' at http://docs.aws.amazon.com/cli/latest/reference'
  745. '/emr/create-cluster.html'),
  746. )),
  747. ],
  748. ),
  749. instance_type=dict(
  750. cloud_role='launch',
  751. switches=[
  752. (['--instance-type'], dict(
  753. help=('Type of GCE/EC2 instance(s) to launch \n'
  754. ' GCE - e.g. n1-standard-1, n1-highcpu-4, n1-highmem-4'
  755. ' -- See'
  756. ' https://cloud.google.com/compute/docs/machine-types\n'
  757. ' EC2 - e.g. m1.medium, c3.xlarge, r3.xlarge '
  758. ' -- See http://aws.amazon.com/ec2/instance-types/'),
  759. )),
  760. ],
  761. ),
  762. jobconf=dict(
  763. combiner=combine_jobconfs,
  764. switches=[
  765. (['-D', '--jobconf'], dict(
  766. action=_KeyValueAction,
  767. help=('passed through to hadoop streaming as -D and to Spark'
  768. ' as --conf. Should take the form KEY=VALUE'),
  769. )),
  770. ],
  771. ),
  772. label=dict(
  773. cloud_role='launch',
  774. switches=[
  775. (['--label'], dict(
  776. help='Alternate label for the job, to help us identify it.',
  777. )),
  778. ],
  779. ),
  780. libjars=dict(
  781. combiner=combine_path_lists,
  782. switches=[
  783. (['--libjars'], dict(
  784. action=_AppendCommaSeparatedItemsAction,
  785. help=('Paths of JARs to pass to Hadoop with -libjars,'
  786. ' separated by commas. On EMR,'
  787. ' these can also be URIs; use file:/// to'
  788. ' reference JARs already on the EMR cluster.')
  789. )),
  790. ],
  791. ),
  792. local_tmp_dir=dict(
  793. combiner=combine_paths,
  794. switches=[
  795. (['--local-tmp-dir'], dict(
  796. help='temp directory on local filesystem',
  797. )),
  798. ],
  799. ),
  800. master_instance_bid_price=dict(
  801. cloud_role='launch',
  802. switches=[
  803. (['--master-instance-bid-price'], dict(
  804. help=('Bid price to specify for the master node when'
  805. ' setting it up as an EC2 spot instance (you probably'
  806. ' only want to do this for task instances).'),
  807. )),
  808. ],
  809. ),
  810. master_instance_config=dict(
  811. cloud_role='launch',
  812. switches=[
  813. (['--master-instance-config'], dict(
  814. action=_JSONAction,
  815. help=('detailed JSON dict of configs for the master instance'
  816. ' on Dataproc including disk config. For format, see'
  817. ' https://cloud.google.com/dataproc/docs/reference/rest'
  818. '/v1/projects.regions.clusters#InstanceGroupConfig'
  819. ' (except that fields in your JSON should use'
  820. ' snake_case, not camelCase).')
  821. )),
  822. ],
  823. ),
  824. master_instance_type=dict(
  825. cloud_role='launch',
  826. switches=[
  827. (['--master-instance-type'], dict(
  828. help='Type of GCE/EC2 master instance to launch',
  829. )),
  830. ],
  831. ),
  832. max_mins_idle=dict(
  833. cloud_role='launch',
  834. switches=[
  835. (['--max-mins-idle'], dict(
  836. help=("If we create a cluster, have it automatically"
  837. " terminate itself after it's been idle this many"
  838. " minutes"),
  839. type=float,
  840. )),
  841. ],
  842. ),
  843. # Spark runner only, only passed in on the command line (see #2040)
  844. max_output_files=dict(
  845. switches=[
  846. (['--max-output-files'], dict(
  847. help=('Maximum number of output files when running a'
  848. ' streaming job on Spark; just runs rdd.coalesce()'
  849. ' before outputting files'),
  850. type=int,
  851. )),
  852. ],
  853. ),
  854. network=dict(
  855. cloud_role='launch',
  856. switches=[
  857. (['--network'], dict(
  858. help=('URI of Google Compute Engine network to launch cluster'
  859. " in. Can't be used with --subnet."),
  860. )),
  861. ],
  862. ),
  863. num_core_instances=dict(
  864. cloud_role='launch',
  865. switches=[
  866. (['--num-core-instances'], dict(
  867. help='Total number of core instances to launch',
  868. type=int,
  869. )),
  870. ],
  871. ),
  872. num_task_instances=dict(
  873. cloud_role='launch',
  874. switches=[
  875. (['--num-task-instances'], dict(
  876. help='Total number of task instances to launch',
  877. type=int,
  878. )),
  879. ],
  880. ),
  881. num_cores=dict(
  882. cloud_role='launch',
  883. switches=[
  884. (['--num-cores'], dict(
  885. help='Total number of cores to use while running in local mode',
  886. type=int,
  887. )),
  888. ],
  889. ),
  890. owner=dict(
  891. cloud_role='launch',
  892. switches=[
  893. (['--owner'], dict(
  894. help='User who ran the job (default is the current user)',
  895. )),
  896. ],
  897. ),
  898. pool_clusters=dict(
  899. cloud_role='launch',
  900. switches=[
  901. (['--pool-clusters'], dict(
  902. action='store_true',
  903. help=('Add to an existing cluster or create a new one that'
  904. ' does not terminate when the job completes.'),
  905. )),
  906. (['--no-pool-clusters'], dict(
  907. action='store_false',
  908. help="Don't run job on a pooled cluster (the default)",
  909. )),
  910. ],
  911. ),
  912. pool_name=dict(
  913. cloud_role='launch',
  914. switches=[
  915. (['--pool-name'], dict(
  916. help='Specify a pool name to join. Default is "default"',
  917. )),
  918. ],
  919. ),
  920. pool_wait_minutes=dict(
  921. switches=[
  922. (['--pool-wait-minutes'], dict(
  923. help=('Wait this many minutes for another cluster to become'
  924. ' available; if one does, run the job on it. Otherwise'
  925. " create a new one. (0, the default, means don't wait)"),
  926. type=int,
  927. )),
  928. ],
  929. ),
  930. project_id=dict(
  931. cloud_role='connect',
  932. deprecated_aliases=['gcp_project'],
  933. switches=[
  934. (['--project-id'], dict(
  935. deprecated_aliases=['--gcp-project'],
  936. help=('Project to use when connecting to Google Cloud Services'
  937. ' and to run Cloud Dataproc jobs in')
  938. )),
  939. ],
  940. ),
  941. py_files=dict(
  942. combiner=combine_path_lists,
  943. switches=[
  944. (['--py-files'], dict(
  945. action=_AppendCommaSeparatedItemsAction,
  946. help=('.zip or .egg files to add to PYTHONPATH,'
  947. ' separated by commas'),
  948. )),
  949. ],
  950. ),
  951. python_bin=dict(
  952. combiner=combine_cmds,
  953. switches=[
  954. (['--python-bin'], dict(
  955. help=('Alternate python command. You can include arguments,'
  956. ' e.g. --python-bin "python -v"'),
  957. )),
  958. ],
  959. ),
  960. read_logs=dict(
  961. switches=[
  962. (['--read-logs'], dict(
  963. action='store_true',
  964. help=('Parse logs generated by the job to get counters and'
  965. ' cause of error (the default).')
  966. )),
  967. (['--no-read-logs'], dict(
  968. action='store_false',
  969. help="Don't list or read logs generated by the job."
  970. )),
  971. ],
  972. ),
  973. region=dict(
  974. cloud_role='connect',
  975. switches=[
  976. (['--region'], dict(
  977. help='GCE/AWS region to run Dataproc/EMR jobs in.',
  978. )),
  979. ],
  980. ),
  981. release_label=dict(
  982. cloud_role='launch',
  983. switches=[
  984. (['--release-label'], dict(
  985. help=('Release Label (e.g. "emr-4.0.0"). Overrides'
  986. ' --image-version'),
  987. )),
  988. ],
  989. ),
  990. s3_endpoint=dict(
  991. cloud_role='connect',
  992. switches=[
  993. (['--s3-endpoint'], dict(
  994. help=("Force mrjob to connect to S3 on this endpoint (e.g."
  995. " s3-us-west-1.amazonaws.com). You usually shouldn't"
  996. " set this; by default mrjob will choose the correct"
  997. " endpoint for each S3 bucket based on its location."),
  998. )),
  999. ],
  1000. ),
  1001. s3_region=dict(
  1002. cloud_role='connect',
  1003. switches=[
  1004. (['--s3-region'], dict(
  1005. help='AWS region to create S3 buckets in',
  1006. )),
  1007. ],
  1008. ),
  1009. service_account=dict(
  1010. cloud_role='launch',
  1011. switches=[
  1012. (['--service-account'], dict(
  1013. help=('Service account to use when creating a Dataproc'
  1014. ' cluster. Usually takes the form'
  1015. ' [account_id]@[project_id].iam.gserviceaccount.com.'
  1016. ' Set to "" to use the default.'),
  1017. )),
  1018. ],
  1019. ),
  1020. service_account_scopes=dict(
  1021. cloud_role='launch',
  1022. switches=[
  1023. (['--service-account-scopes'], dict(
  1024. action=_CommaSeparatedListAction,
  1025. help=("A comma-separated list of service account scopes"
  1026. " on Dataproc, used to limit your cluster's access."
  1027. " For each scope, you can specify the"
  1028. " full URI or just the name (e.g. 'logging.write')"),
  1029. )),
  1030. ],
  1031. ),
  1032. setup=dict(
  1033. combiner=combine_lists,
  1034. switches=[
  1035. (['--setup'], dict(
  1036. action='append',
  1037. help=('A command to run before each mapper/reducer step in the'
  1038. ' shell ("touch foo"). You may interpolate files'
  1039. ' available via URL or on your local filesystem using'
  1040. ' Hadoop Distributed Cache syntax (". setup.sh#"). To'
  1041. ' interpolate archives, use #/: "cd foo.tar.gz#/; make"'),
  1042. )),
  1043. ],
  1044. ),
  1045. sh_bin=dict(
  1046. combiner=combine_cmds,
  1047. switches=[
  1048. (['--sh-bin'], dict(
  1049. help=('Alternate shell command for setup scripts. You may'
  1050. ' include arguments, e.g. --sh-bin "bash -ex"'),
  1051. )),
  1052. ],
  1053. ),
  1054. skip_internal_protocol=dict(
  1055. switches=[
  1056. (['--skip-internal-protocol'], dict(
  1057. action='store_true',
  1058. help=("Don't use the job's internal protocol to communicate"
  1059. " between tasks internal to the job, instead relying"
  1060. " on Spark to encode and decode raw data structures.")
  1061. )),
  1062. (['--no-skip-internal-protocol'], dict(
  1063. action='store_false',
  1064. help='Use internal protocols as usual',
  1065. )),
  1066. ],
  1067. ),
  1068. sort_bin=dict(
  1069. combiner=combine_cmds,
  1070. switches=[
  1071. (['--sort-bin'], dict(
  1072. help=('Alternate shell command for the external sort binary.'
  1073. ' You may include arguments, e.g. --sort-bin "sort -r"')
  1074. )),
  1075. ],
  1076. ),
  1077. spark_args=dict(
  1078. combiner=combine_lists,
  1079. switches=[
  1080. (['--spark-args'], dict(
  1081. action=_AppendArgsAction,
  1082. help=('One or more arguments to pass to spark-submit'
  1083. ' (e.g. --spark-args="--properties-file my.conf").'),
  1084. )),
  1085. ],
  1086. ),
  1087. spark_deploy_mode=dict(
  1088. switches=[
  1089. (['--spark-deploy-mode'], dict(
  1090. help=('--deploy-mode argument to spark-submit (e.g.'
  1091. ' "cluster". Default is "client"'),
  1092. )),
  1093. ]
  1094. ),
  1095. spark_master=dict(
  1096. switches=[
  1097. (['--spark-master'], dict(
  1098. help=('--master argument to spark-submit (e.g. '
  1099. 'spark://host:port, local). Default is "yarn"'),
  1100. )),
  1101. ],
  1102. ),
  1103. spark_submit_bin=dict(
  1104. combiner=combine_cmds,
  1105. switches=[
  1106. (['--spark-submit-bin'], dict(
  1107. help='spark-submit binary. You may include arguments.'
  1108. )),
  1109. ],
  1110. ),
  1111. spark_tmp_dir=dict(
  1112. cloud_role='launch',
  1113. combiner=combine_paths,
  1114. switches=[
  1115. (['--spark-tmp-dir'], dict(
  1116. help=('optional URI visible to Spark executors to use as our'
  1117. ' temp directory.'),
  1118. )),
  1119. ],
  1120. ),
  1121. ssh_add_bin=dict(
  1122. combiner=combine_cmds,
  1123. switches=[
  1124. (['--ssh-add-bin'], dict(
  1125. help=("Name/path of ssh-add binary. Arguments are allowed"
  1126. " (e.g. --ssh-bin 'ssh-add -v')"),
  1127. )),
  1128. ],
  1129. ),
  1130. ssh_bin=dict(
  1131. combiner=combine_cmds,
  1132. switches=[
  1133. (['--ssh-bin'], dict(
  1134. help=("Name/path of ssh binary. Arguments are allowed (e.g."
  1135. " --ssh-bin 'ssh -v')"),
  1136. )),
  1137. ],
  1138. ),
  1139. ssh_bind_ports=dict(
  1140. switches=[
  1141. (['--ssh-bind-ports'], dict(
  1142. action=_PortRangeAction,
  1143. help=('A list of port ranges that are safe to listen on,'
  1144. ' delimited by colons and commas, with syntax like'
  1145. ' 2000[:2001][,2003,2005:2008,etc].'
  1146. ' Defaults to 40001:40840.'),
  1147. )),
  1148. ],
  1149. ),
  1150. ssh_tunnel=dict(
  1151. switches=[
  1152. (['--ssh-tunnel'], dict(
  1153. action='store_true',
  1154. help=('Open an SSH tunnel to the Hadoop job tracker/resource'
  1155. ' manager'),
  1156. )),
  1157. (['--no-ssh-tunnel'], dict(
  1158. action='store_false',
  1159. help=("Don't open an SSH tunnel to the Hadoop job"
  1160. " tracker/resource manager (the default)"),
  1161. )),
  1162. ],
  1163. ),
  1164. ssh_tunnel_is_open=dict(
  1165. switches=[
  1166. (['--ssh-tunnel-is-open'], dict(
  1167. action='store_true',
  1168. help=('Make ssh tunnel accessible from remote hosts (not just'
  1169. ' localhost)'),
  1170. )),
  1171. (['--ssh-tunnel-is-closed'], dict(
  1172. action='store_false',
  1173. help=('Make ssh tunnel accessible from localhost only (the'
  1174. ' default)'),
  1175. )),
  1176. ],
  1177. ),
  1178. subnet=dict(
  1179. cloud_role='launch',
  1180. switches=[
  1181. (['--subnet'], dict(
  1182. help=('ID of Amazon VPC subnet/URI of Google Compute Engine'
  1183. ' subnetwork to launch cluster in.'),
  1184. )),
  1185. (['--subnets'], dict(
  1186. action=_CommaSeparatedListAction,
  1187. help=('Like --subnet, but with a comma-separated list, to'
  1188. ' specify multiple subnets in conjunction with'
  1189. ' --instance-fleets (EMR only)'),
  1190. )),
  1191. ],
  1192. ),
  1193. tags=dict(
  1194. cloud_role='launch',
  1195. combiner=combine_dicts,
  1196. switches=[
  1197. (['--tag'], dict(
  1198. action=_KeyValueAction,
  1199. help=('Metadata tags to apply to the EMR cluster; '
  1200. 'should take the form KEY=VALUE. You can use --tag '
  1201. 'multiple times'),
  1202. )),
  1203. ],
  1204. ),
  1205. task_instance_bid_price=dict(
  1206. cloud_role='launch',
  1207. switches=[
  1208. (['--task-instance-bid-price'], dict(
  1209. help=('Bid price to specify for task nodes when'
  1210. ' setting them up as EC2 spot instances'),
  1211. )),
  1212. ],
  1213. ),
  1214. task_instance_config=dict(
  1215. cloud_role='launch',
  1216. switches=[
  1217. (['--task-instance-config'], dict(
  1218. action=_JSONAction,
  1219. help=('detailed JSON dict of configs for the task'
  1220. ' (secondary worker) instances'
  1221. ' on Dataproc including disk config. For format, see'
  1222. ' https://cloud.google.com/dataproc/docs/reference/rest'
  1223. '/v1/projects.regions.clusters#InstanceGroupConfig'
  1224. ' (except that fields in your JSON should use'
  1225. ' snake_case, not camelCase).')
  1226. )),
  1227. ],
  1228. ),
  1229. task_instance_type=dict(
  1230. cloud_role='launch',
  1231. switches=[
  1232. (['--task-instance-type'], dict(
  1233. help='Type of GCE/EC2 task instance(s) to launch',
  1234. )),
  1235. ],
  1236. ),
  1237. task_python_bin=dict(
  1238. combiner=combine_cmds,
  1239. switches=[
  1240. (['--task-python-bin'], dict(
  1241. help=('Name/path of alternate python command to use to'
  1242. " run tasks (e.g. mappers); doesn't affect setup"
  1243. ' wrapper scripts. Defaults to'
  1244. ' current Python interpreter.'),
  1245. )),
  1246. ],
  1247. ),
  1248. upload_archives=dict(
  1249. combiner=combine_path_lists,
  1250. switches=[
  1251. (['--archives'], dict(
  1252. action=_AppendCommaSeparatedItemsAction,
  1253. help=('Archives to unpack in the working directory of the'
  1254. ' script, separated by commas. Use "#" to assign a'
  1255. ' different name to each directory (e.g. '
  1256. '"foo-libs.zip#lib,bar.tar.gz#bar")'),
  1257. )),
  1258. ],
  1259. ),
  1260. upload_dirs=dict(
  1261. combiner=combine_path_lists,
  1262. switches=[
  1263. (['--dirs'], dict(
  1264. action=_AppendCommaSeparatedItemsAction,
  1265. help=('Directories to tarball and unpack in the working'
  1266. ' directory of the script, separated by commas. Append'
  1268. ' #<name> to each directory to assign a different name'
  1268. ' (e.g. "foo#lib,bar#local-bar")'),
  1269. )),
  1270. ],
  1271. ),
  1272. upload_files=dict(
  1273. combiner=combine_path_lists,
  1274. switches=[
  1275. (['--files'], dict(
  1276. action=_AppendCommaSeparatedItemsAction,
  1277. help=('Files to copy to the working directory of the script,'
  1278. ' separated by commas. Use "#"'
  1279. ' to assign a different name to each file (e.g. '
  1280. '"foo.db#bar.db")'),
  1281. )),
  1282. ],
  1283. ),
  1284. zone=dict(
  1285. cloud_role='launch',
  1286. switches=[
  1287. (['--zone'], dict(
  1288. help=('GCE zone/AWS availability zone to run Dataproc/EMR jobs'
  1289. ' in.'),
  1290. )),
  1291. ],
  1292. ),
  1293. )
  1294. def _combiners(opt_names, runner_alias=None):
  1295. return {
  1296. name: config['combiner']
  1297. for name, config in _RUNNER_OPTS.items()
  1298. if name in opt_names and 'combiner' in config
  1299. }
  1300. def _deprecated_aliases(opt_names):
  1301. results = {}
  1302. for name, config in _RUNNER_OPTS.items():
  1303. if name not in opt_names:
  1304. continue
  1305. if config.get('deprecated_aliases'):
  1306. for alias in config['deprecated_aliases']:
  1307. results[alias] = name
  1308. return results
  1309. def _filter_by_role(opt_names, *cloud_roles):
  1310. return {
  1311. opt_name
  1312. for opt_name, conf in _RUNNER_OPTS.items()
  1313. if opt_name in opt_names and conf.get('cloud_role') in cloud_roles
  1314. }
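
Based on the runner-option table above, these three helpers behave roughly as follows. This is a sketch, assuming mrjob is importable; all of these names are private.

    from mrjob.options import (
        _combiners, _deprecated_aliases, _filter_by_role)

    # cmdenv declares a combiner (combine_envs); cluster_id does not
    print(_combiners({'cmdenv', 'cluster_id'}))
    # {'cmdenv': <function combine_envs at 0x...>}

    # project_id lists 'gcp_project' as a deprecated alias
    print(_deprecated_aliases({'project_id'}))
    # {'gcp_project': 'project_id'}

    # of these three opts, only 'region' has cloud_role='connect'
    print(_filter_by_role({'region', 'cmdenv', 'image_id'}, 'connect'))
    # {'region'}
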
  1315. def _add_runner_args(parser, opt_names=None, include_deprecated=True,
  1316. customize_switches=None, suppress_switches=None):
  1317. """add switches for the given runner opts to the given
  1318. ArgumentParser, alphabetically by destination. If *opt_names* is
  1319. None, include all runner opts."""
  1320. if opt_names is None:
  1321. opt_names = set(_RUNNER_OPTS)
  1322. for opt_name in sorted(opt_names):
  1323. _add_runner_args_for_opt(
  1324. parser, opt_name,
  1325. include_deprecated=include_deprecated,
  1326. customize_switches=customize_switches,
  1327. suppress_switches=suppress_switches
  1328. )
  1329. def _add_runner_args_for_opt(parser, opt_name, include_deprecated=True,
  1330. customize_switches=None, suppress_switches=None):
  1331. """Add switches for a single option (*opt_name*) to the given parser."""
  1332. if customize_switches is None:
  1333. customize_switches = {}
  1334. if suppress_switches is None:
  1335. suppress_switches = set()
  1336. conf = _RUNNER_OPTS[opt_name]
  1337. if conf.get('deprecated') and not include_deprecated:
  1338. return
  1339. switches = conf.get('switches') or []
  1340. def suppressed(switches):
  1341. return any(sw in suppress_switches for sw in switches)
  1342. for args, kwargs in switches:
  1343. kwargs = dict(kwargs)
  1344. # allow customization
  1345. for switch in args:
  1346. if switch in customize_switches:
  1347. kwargs.update(customize_switches[switch])
  1348. deprecated_aliases = kwargs.pop('deprecated_aliases', None)
  1349. deprecated = kwargs.pop('deprecated', False)
  1350. # add this switch
  1351. if (include_deprecated or not deprecated) and not suppressed(args):
  1352. kwargs['dest'] = opt_name
  1353. if kwargs.get('action') == 'append':
  1354. kwargs['default'] = []
  1355. else:
  1356. kwargs['default'] = None
  1357. parser.add_argument(*args, **kwargs)
  1358. # add a switch for deprecated aliases
  1359. if (deprecated_aliases and include_deprecated and
  1360. not suppressed(deprecated_aliases)):
  1361. help = 'Deprecated alias%s for %s' % (
  1362. ('es' if len(deprecated_aliases) > 1 else ''),
  1363. args[-1])
  1364. parser.add_argument(
  1365. *deprecated_aliases,
  1366. **combine_dicts(kwargs, dict(help=help)))
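
Tying it together, a minimal usage sketch, again assuming mrjob is installed; _add_runner_args is an internal helper, so its signature may change between versions.

    from argparse import ArgumentParser
    from mrjob.options import _add_runner_args

    parser = ArgumentParser()
    _add_runner_args(parser, opt_names={'cmdenv', 'hadoop_bin'})

    args = parser.parse_args(
        ['--cmdenv', 'TZ=UTC', '--hadoop-bin', '/usr/local/bin/hadoop'])
    print(args.cmdenv)      # {'TZ': 'UTC'}
    print(args.hadoop_bin)  # /usr/local/bin/hadoop
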
  1367. ### non-runner switches ###
  1368. def _add_basic_args(parser):
  1369. """Switches for all command line tools"""
  1370. parser.add_argument(
  1371. '-c', '--conf-path', dest='conf_paths',
  1372. action='append',
  1373. help='Path to alternate mrjob.conf file to read from')
  1374. parser.add_argument(
  1375. '--no-conf', dest='conf_paths', action='store_const', const=[],
  1376. help="Don't load mrjob.conf even if it's available")
  1377. parser.add_argument(
  1378. '-q', '--quiet', dest='quiet', default=None,
  1379. action='store_true',
  1380. help="Don't print anything to stderr")
  1381. parser.add_argument(
  1382. '-v', '--verbose', dest='verbose', default=None,
  1383. action='store_true', help='print more messages to stderr')
  1384. def _add_job_args(parser, include_deprecated=True, include_steps=True):
  1385. parser.add_argument(
  1386. '--cat-output', dest='cat_output',
  1387. default=None, action='store_true',
  1388. help="Stream job output to stdout")
  1389. parser.add_argument(
  1390. '--no-cat-output', dest='cat_output',
  1391. default=None, action='store_false',
  1392. help="Don't stream job output to stdout")
  1393. if include_deprecated:

Large files are truncated; the remaining lines of this 1,638-line file are not shown here.