
/tools/spark-ec2/spark_ec2.py

https://github.com/chaordic/ignition-core
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import with_statement

import hashlib
import logging
import os
import os.path
import pipes
import random
import shutil
import string
from stat import S_IRUSR
import subprocess
import sys
import tarfile
import tempfile
import textwrap
import time
import urllib2
import warnings
from datetime import datetime
from optparse import OptionParser
from sys import stderr

SPARK_EC2_VERSION = "1.3.0"
SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))

VALID_SPARK_VERSIONS = set([
    "0.7.3",
    "0.8.0",
    "0.8.1",
    "0.9.0",
    "0.9.1",
    "0.9.2",
    "1.0.0",
    "1.0.1",
    "1.0.2",
    "1.1.0",
    "1.1.1",
    "1.2.0",
    "1.2.1",
    "1.3.0",
])

DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark"

# Default location to get the spark-ec2 scripts (and ami-list) from
DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2"
DEFAULT_SPARK_EC2_BRANCH = "branch-1.3"

import boto
from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType
from boto import ec2


class UsageError(Exception):
    pass


# Configure and parse our command-line arguments
def parse_args():
    parser = OptionParser(
        prog="spark-ec2",
        version="%prog {v}".format(v=SPARK_EC2_VERSION),
        usage="%prog [options] <action> <cluster_name>\n\n"
              + "<action> can be: launch, destroy, login, stop, start, get-master, reboot-slaves")

    parser.add_option(
        "-s", "--slaves", type="int", default=1,
        help="Number of slaves to launch (default: %default)")
    parser.add_option(
        "-w", "--wait", type="int",
        help="DEPRECATED (no longer necessary) - Seconds to wait for nodes to start")
    parser.add_option(
        "-k", "--key-pair",
        help="Key pair to use on instances")
    parser.add_option(
        "-i", "--identity-file",
        help="SSH private key file to use for logging into instances")
    parser.add_option(
        "-t", "--instance-type", default="m1.large",
        help="Type of instance to launch (default: %default). " +
             "WARNING: must be 64-bit; small instances won't work")
    parser.add_option(
        "-m", "--master-instance-type", default="",
        help="Master instance type (leave empty for same as instance-type)")
    parser.add_option(
        "-r", "--region", default="us-east-1",
        help="EC2 region zone to launch instances in")
    parser.add_option(
        "-z", "--zone", default="",
        help="Availability zone to launch instances in, or 'all' to spread " +
             "slaves across multiple (an additional $0.01/Gb for bandwidth " +
             "between zones applies) (default: a single zone chosen at random)")
    parser.add_option(
        "-a", "--ami",
        help="Amazon Machine Image ID to use")
    parser.add_option(
        "--master-ami",
        help="Amazon Machine Image ID to use for the Master")
    parser.add_option(
        "-v", "--spark-version", default=DEFAULT_SPARK_VERSION,
        help="Version of Spark to use: 'X.Y.Z' or a specific git hash (default: %default)")
    parser.add_option(
        "--spark-git-repo",
        default=DEFAULT_SPARK_GITHUB_REPO,
        help="Github repo from which to checkout supplied commit hash (default: %default)")
    parser.add_option(
        "--spark-ec2-git-repo",
        default=DEFAULT_SPARK_EC2_GITHUB_REPO,
        help="Github repo from which to checkout spark-ec2 (default: %default)")
    parser.add_option(
        "--spark-ec2-git-branch",
        default=DEFAULT_SPARK_EC2_BRANCH,
        help="Github repo branch of spark-ec2 to use (default: %default)")
    parser.add_option(
        "--hadoop-major-version", default="1",
        help="Major version of Hadoop (default: %default)")
    parser.add_option(
        "-D", metavar="[ADDRESS:]PORT", dest="proxy_port",
        help="Use SSH dynamic port forwarding to create a SOCKS proxy at " +
             "the given local address (for use with login)")
    parser.add_option(
        "--resume", action="store_true", default=False,
        help="Resume installation on a previously launched cluster " +
             "(for debugging)")
    parser.add_option(
        "--ebs-vol-size", metavar="SIZE", type="int", default=0,
        help="Size (in GB) of each EBS volume.")
    parser.add_option(
        "--ebs-vol-type", default="standard",
        help="EBS volume type (e.g. 'gp2', 'standard').")
    parser.add_option(
        "--ebs-vol-num", type="int", default=1,
        help="Number of EBS volumes to attach to each node as /vol[x]. " +
             "The volumes will be deleted when the instances terminate. " +
             "Only possible on EBS-backed AMIs. " +
             "EBS volumes are only attached if --ebs-vol-size > 0. " +
             "Only support up to 8 EBS volumes.")
    parser.add_option(
        "--placement-group", type="string", default=None,
        help="Which placement group to try and launch " +
             "instances into. Assumes placement group is already " +
             "created.")
    parser.add_option(
        "--swap", metavar="SWAP", type="int", default=1024,
        help="Swap space to set up per node, in MB (default: %default)")
    parser.add_option(
        "--spot-price", metavar="PRICE", type="float",
        help="If specified, launch slaves as spot instances with the given " +
             "maximum price (in dollars)")
    parser.add_option(
        "--ganglia", action="store_true", default=True,
        help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " +
             "the Ganglia page will be publicly accessible")
    parser.add_option(
        "--no-ganglia", action="store_false", dest="ganglia",
        help="Disable Ganglia monitoring for the cluster")
    parser.add_option(
        "-u", "--user", default="root",
        help="The SSH user you want to connect as (default: %default)")
    parser.add_option(
        "--delete-groups", action="store_true", default=False,
        help="When destroying a cluster, delete the security groups that were created")
    parser.add_option(
        "--use-existing-master", action="store_true", default=False,
        help="Launch fresh slaves, but use an existing stopped master if possible")
    parser.add_option(
        "--worker-instances", type="int", default=1,
        help="Number of instances per worker: variable SPARK_WORKER_INSTANCES (default: %default)")
    parser.add_option(
        "--master-opts", type="string", default="",
        help="Extra options to give to master through SPARK_MASTER_OPTS variable " +
             "(e.g -Dspark.worker.timeout=180)")
    parser.add_option(
        "--user-data", type="string", default="",
        help="Path to a user-data file (most AMI's interpret this as an initialization script)")
    parser.add_option(
        "--security-group-prefix", type="string", default=None,
        help="Use this prefix for the security group rather than the cluster name.")
    parser.add_option(
        "--authorized-address", type="string", default="0.0.0.0/0",
        help="Address to authorize on created security groups (default: %default)")
    parser.add_option(
        "--additional-security-group", type="string", default="",
        help="Additional security group to place the machines in")
    parser.add_option(
        "--copy-aws-credentials", action="store_true", default=False,
        help="Add AWS credentials to hadoop configuration to allow Spark to access S3")
    parser.add_option(
        "--subnet-id", default=None,
        help="VPC subnet to launch instances in")
    parser.add_option(
        "--vpc-id", default=None,
        help="VPC to launch instances in")

    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)
    (action, cluster_name) = args

    # Boto config check
    # http://boto.cloudhackers.com/en/latest/boto_config_tut.html
    home_dir = os.getenv('HOME')
    if home_dir is None or not os.path.isfile(home_dir + '/.boto'):
        if not os.path.isfile('/etc/boto.cfg'):
            if os.getenv('AWS_ACCESS_KEY_ID') is None:
                print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " +
                                  "must be set")
                sys.exit(1)
            if os.getenv('AWS_SECRET_ACCESS_KEY') is None:
                print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " +
                                  "must be set")
                sys.exit(1)
    return (opts, action, cluster_name)


# Get the EC2 security group of the given name, creating it if it doesn't exist
def get_or_make_group(conn, name, vpc_id):
    groups = conn.get_all_security_groups()
    group = [g for g in groups if g.name == name]
    if len(group) > 0:
        return group[0]
    else:
        print "Creating security group " + name
        return conn.create_security_group(name, "Spark EC2 group", vpc_id)


def check_if_http_resource_exists(resource):
    request = urllib2.Request(resource)
    request.get_method = lambda: 'HEAD'
    try:
        response = urllib2.urlopen(request)
        if response.getcode() == 200:
            return True
        else:
            raise RuntimeError("Resource {resource} not found. Error: {code}".format(
                resource=resource, code=response.getcode()))
    except urllib2.HTTPError, e:
        print >> stderr, "Unable to check if HTTP resource {url} exists. Error: {code}".format(
            url=resource,
            code=e.code)
        return False


def get_validate_spark_version(version, repo):
    if version.startswith("http"):
        # check if custom package URL exists
        if check_if_http_resource_exists(version):
            return version
        else:
            print >> stderr, "Unable to validate pre-built spark version {version}".format(version=version)
            sys.exit(1)
    elif "." in version:
        version = version.replace("v", "")
        if version not in VALID_SPARK_VERSIONS:
            print >> stderr, "Don't know about Spark version: {v}".format(v=version)
            sys.exit(1)
        return version
    else:
        github_commit_url = "{repo}/commit/{commit_hash}".format(repo=repo, commit_hash=version)
        if not check_if_http_resource_exists(github_commit_url):
            print >> stderr, "Couldn't validate Spark commit: {repo} / {commit}".format(
                repo=repo, commit=version)
            sys.exit(1)
        else:
            return version


# Check whether a given EC2 instance object is in a state we consider active,
# i.e. not terminating or terminated. We count both stopping and stopped as
# active since we can restart stopped clusters.
def is_active(instance):
    return (instance.state in ['pending', 'running', 'stopping', 'stopped'])


# Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/
# Last Updated: 2014-06-20
# For easy maintainability, please keep this manually-inputted dictionary sorted by key.
EC2_INSTANCE_TYPES = {
    "c1.medium": "pvm",
    "c1.xlarge": "pvm",
    "c3.2xlarge": "pvm",
    "c3.4xlarge": "pvm",
    "c3.8xlarge": "pvm",
    "c3.large": "pvm",
    "c3.xlarge": "pvm",
    "cc1.4xlarge": "hvm",
    "cc2.8xlarge": "hvm",
    "cg1.4xlarge": "hvm",
    "cr1.8xlarge": "hvm",
    "hi1.4xlarge": "pvm",
    "hs1.8xlarge": "pvm",
    "i2.2xlarge": "hvm",
    "i2.4xlarge": "hvm",
    "i2.8xlarge": "hvm",
    "i2.xlarge": "hvm",
    "m1.large": "pvm",
    "m1.medium": "pvm",
    "m1.small": "pvm",
    "m1.xlarge": "pvm",
    "m2.2xlarge": "pvm",
    "m2.4xlarge": "pvm",
    "m2.xlarge": "pvm",
    "m3.2xlarge": "hvm",
    "m3.large": "hvm",
    "m3.medium": "hvm",
    "m3.xlarge": "hvm",
    "r3.2xlarge": "hvm",
    "r3.4xlarge": "hvm",
    "r3.8xlarge": "hvm",
    "r3.large": "hvm",
    "r3.xlarge": "hvm",
    "t1.micro": "pvm",
    "t2.medium": "hvm",
    "t2.micro": "hvm",
    "t2.small": "hvm",
}


# Attempt to resolve an appropriate AMI given the architecture and region of the request.
def get_spark_ami(instance_type, region, spark_ec2_git_repo, spark_ec2_git_branch):
    if instance_type in EC2_INSTANCE_TYPES:
        instance_type = EC2_INSTANCE_TYPES[instance_type]
    else:
        instance_type = "pvm"
        print >> stderr,\
            "Don't recognize %s, assuming type is pvm" % instance_type

    # URL prefix from which to fetch AMI information
    ami_prefix = "{r}/{b}/ami-list".format(
        r=spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1),
        b=spark_ec2_git_branch)

    ami_path = "%s/%s/%s" % (ami_prefix, region, instance_type)
    try:
        ami = urllib2.urlopen(ami_path).read().strip()
        print "Spark AMI for %s: %s" % (instance_type, ami)
    except:
        print >> stderr, "Could not resolve AMI at: " + ami_path
        sys.exit(1)

    return ami


# Launch a cluster of the given name, by setting up its security groups,
# and then starting new instances in them.
# Returns a tuple of EC2 reservation objects for the master and slaves
# Fails if there are already instances running in the cluster's groups.
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)

    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    if opts.security_group_prefix is None:
        master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
        slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    else:
        master_group = get_or_make_group(conn, opts.security_group_prefix + "-master", opts.vpc_id)
        slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
        else:
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" % (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts.instance_type, opts.region,
                                 opts.spark_ec2_git_repo, opts.spark_ec2_git_branch)
    if opts.master_ami is None:
        opts.master_ami = get_spark_ami(opts.master_instance_type, opts.region,
                                        opts.spark_ec2_git_repo, opts.spark_ec2_git_branch)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    try:
        master_image = conn.get_all_images(image_ids=[opts.master_ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.master_ami
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))

        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_group_ids=[slave_group.id] + additional_group_ids,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      subnet_id=opts.subnet_id,
                                      placement_group=opts.placement_group,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = master_image.run(key_name=opts.key_pair,
                                      security_group_ids=[master_group.id] + additional_group_ids,
                                      instance_type=master_type,
                                      placement=opts.zone,
                                      min_count=1,
                                      max_count=1,
                                      block_device_map=block_map,
                                      subnet_id=opts.subnet_id,
                                      placement_group=opts.placement_group,
                                      user_data=user_data_content)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # This wait time corresponds to SPARK-4983
    print "Waiting for AWS to propagate instance metadata..."
    time.sleep(5)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(
            key='Name',
            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(
            key='Name',
            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)


# Get the EC2 instances in an existing cluster if available.
# Returns a tuple of lists of EC2 instance objects for the masters and slaves
def get_existing_cluster(conn, opts, cluster_name, die_on_error=True):
    print "Searching for existing cluster " + cluster_name + "..."
    reservations = conn.get_all_reservations()
    master_nodes = []
    slave_nodes = []
    for res in reservations:
        active = [i for i in res.instances if is_active(i)]
        for inst in active:
            group_names = [g.name for g in inst.groups]
            if (cluster_name + "-master") in group_names:
                master_nodes.append(inst)
            elif (cluster_name + "-slaves") in group_names:
                slave_nodes.append(inst)
    if any((master_nodes, slave_nodes)):
        print "Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes))
    if master_nodes != [] or not die_on_error:
        return (master_nodes, slave_nodes)
    else:
        if master_nodes == [] and slave_nodes != []:
            print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master"
        else:
            print >> sys.stderr, "ERROR: Could not find any existing cluster"
        sys.exit(1)


# Deploy configuration files and run setup scripts on a newly launched
# or started EC2 cluster.
def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
    master = master_nodes[0].public_dns_name
    if deploy_ssh_key:
        print "Generating cluster's SSH key on master..."
        key_setup = """
          [ -f ~/.ssh/id_rsa ] ||
            (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa &&
             cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys)
        """
        ssh(master, opts, key_setup)
        dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh'])
        print "Transferring cluster's SSH key to slaves..."
        for slave in slave_nodes:
            print slave.public_dns_name
            ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar)

    modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs',
               'mapreduce', 'spark-standalone', 'tachyon']

    if opts.hadoop_major_version == "1":
        modules = filter(lambda x: x != "mapreduce", modules)

    if opts.ganglia:
        modules.append('ganglia')

    # NOTE: We should clone the repository before running deploy_files to
    # prevent ec2-variables.sh from being overwritten
    print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format(
        r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)
    ssh(
        host=master,
        opts=opts,
        command="rm -rf spark-ec2"
        + " && "
        + "git clone {r} -b {b} spark-ec2".format(r=opts.spark_ec2_git_repo,
                                                  b=opts.spark_ec2_git_branch)
    )

    print "Deploying files to master..."
    deploy_files(
        conn=conn,
        root_dir=SPARK_EC2_DIR + "/" + "deploy.generic",
        opts=opts,
        master_nodes=master_nodes,
        slave_nodes=slave_nodes,
        modules=modules
    )

    print "Running setup on master..."
    setup_spark_cluster(master, opts)
    print "Done!"


def setup_spark_cluster(master, opts):
    ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
    ssh(master, opts, "spark-ec2/setup.sh")
    print "Spark standalone cluster started at http://%s:8080" % master

    if opts.ganglia:
        print "Ganglia started at http://%s:5080/ganglia" % master


def is_ssh_available(host, opts, print_ssh_output=True):
    """
    Check if SSH is available on a host.
    """
    s = subprocess.Popen(
        ssh_command(opts) + ['-t', '-t', '-o', 'ConnectTimeout=3',
                             '%s@%s' % (opts.user, host), stringify_command('true')],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT  # we pipe stderr through stdout to preserve output order
    )
    cmd_output = s.communicate()[0]  # [1] is stderr, which we redirected to stdout

    if s.returncode != 0 and print_ssh_output:
        # extra leading newline is for spacing in wait_for_cluster_state()
        print textwrap.dedent("""\n
            Warning: SSH connection error. (This could be temporary.)
            Host: {h}
            SSH return code: {r}
            SSH output: {o}
        """).format(
            h=host,
            r=s.returncode,
            o=cmd_output.strip()
        )

    return s.returncode == 0


def is_cluster_ssh_available(cluster_instances, opts):
    """
    Check if SSH is available on all the instances in a cluster.
    """
    for i in cluster_instances:
        if not is_ssh_available(host=i.ip_address, opts=opts):
            return False
    else:
        return True


def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state):
    """
    Wait for all the instances in the cluster to reach a designated state.

    cluster_instances: a list of boto.ec2.instance.Instance
    cluster_state: a string representing the desired state of all the instances in the cluster
           value can be 'ssh-ready' or a valid value from boto.ec2.instance.InstanceState such as
           'running', 'terminated', etc.
           (would be nice to replace this with a proper enum: http://stackoverflow.com/a/1695250)
    """
    sys.stdout.write(
        "Waiting for cluster to enter '{s}' state.".format(s=cluster_state)
    )
    sys.stdout.flush()

    start_time = datetime.now()
    num_attempts = 0

    while True:
        time.sleep(5 * num_attempts)  # seconds

        for i in cluster_instances:
            i.update()

        statuses = conn.get_all_instance_status(instance_ids=[i.id for i in cluster_instances])

        if cluster_state == 'ssh-ready':
            if all(i.state == 'running' for i in cluster_instances) and \
               all(s.system_status.status == 'ok' for s in statuses) and \
               all(s.instance_status.status == 'ok' for s in statuses) and \
               is_cluster_ssh_available(cluster_instances, opts):
                break
        else:
            if all(i.state == cluster_state for i in cluster_instances):
                break

        num_attempts += 1

        sys.stdout.write(".")
        sys.stdout.flush()

    sys.stdout.write("\n")

    end_time = datetime.now()
    print "Cluster is now in '{s}' state. Waited {t} seconds.".format(
        s=cluster_state,
        t=(end_time - start_time).seconds
    )


# Get number of local disks available for a given EC2 instance type.
def get_num_disks(instance_type):
    # Source: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html
    # Last Updated: 2014-06-20
    # For easy maintainability, please keep this manually-inputted dictionary sorted by key.
    disks_by_instance = {
        "c1.medium": 1,
        "c1.xlarge": 4,
        "c3.2xlarge": 2,
        "c3.4xlarge": 2,
        "c3.8xlarge": 2,
        "c3.large": 2,
        "c3.xlarge": 2,
        "cc1.4xlarge": 2,
        "cc2.8xlarge": 4,
        "cg1.4xlarge": 2,
        "cr1.8xlarge": 2,
        "g2.2xlarge": 1,
        "hi1.4xlarge": 2,
        "hs1.8xlarge": 24,
        "i2.2xlarge": 2,
        "i2.4xlarge": 4,
        "i2.8xlarge": 8,
        "i2.xlarge": 1,
        "m1.large": 2,
        "m1.medium": 1,
        "m1.small": 1,
        "m1.xlarge": 4,
        "m2.2xlarge": 1,
        "m2.4xlarge": 2,
        "m2.xlarge": 1,
        "m3.2xlarge": 2,
        "m3.large": 1,
        "m3.medium": 1,
        "m3.xlarge": 2,
        "r3.2xlarge": 1,
        "r3.4xlarge": 1,
        "r3.8xlarge": 2,
        "r3.large": 1,
        "r3.xlarge": 1,
        "t1.micro": 0,
    }
    if instance_type in disks_by_instance:
        return disks_by_instance[instance_type]
    else:
        print >> stderr, ("WARNING: Don't know number of disks on instance type %s; assuming 1"
                          % instance_type)
        return 1


# Deploy the configuration file templates in a given local directory to
# a cluster, filling in any template parameters with information about the
# cluster (e.g. lists of masters and slaves). Files are only deployed to
# the first master instance in the cluster, and we expect the setup
# script to be run on that instance to copy them to other nodes.
#
# root_dir should be an absolute path to the directory with the files we want to deploy.
def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
    active_master = master_nodes[0].public_dns_name

    num_disks = get_num_disks(opts.instance_type)
    hdfs_data_dirs = "/mnt/ephemeral-hdfs/data"
    mapred_local_dirs = "/mnt/hadoop/mrlocal"
    spark_local_dirs = "/mnt/spark"
    if num_disks > 1:
        for i in range(2, num_disks + 1):
            hdfs_data_dirs += ",/mnt%d/ephemeral-hdfs/data" % i
            mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i
            spark_local_dirs += ",/mnt%d/spark" % i

    cluster_url = "%s:7077" % active_master

    if opts.spark_version.startswith("http"):
        # Custom pre-built spark package
        spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
    elif "." in opts.spark_version:
        # Pre-built Spark deploy
        spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
    else:
        # Spark-only custom deploy
        spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version)

    template_vars = {
        "master_list": '\n'.join([i.public_dns_name for i in master_nodes]),
        "active_master": active_master,
        "slave_list": '\n'.join([i.public_dns_name for i in slave_nodes]),
        "cluster_url": cluster_url,
        "hdfs_data_dirs": hdfs_data_dirs,
        "mapred_local_dirs": mapred_local_dirs,
        "spark_local_dirs": spark_local_dirs,
        "swap": str(opts.swap),
        "modules": '\n'.join(modules),
        "spark_version": spark_v,
        "hadoop_major_version": opts.hadoop_major_version,
        "spark_worker_instances": "%d" % opts.worker_instances,
        "spark_master_opts": opts.master_opts
    }

    if opts.copy_aws_credentials:
        template_vars["aws_access_key_id"] = conn.aws_access_key_id
        template_vars["aws_secret_access_key"] = conn.aws_secret_access_key
    else:
        template_vars["aws_access_key_id"] = ""
        template_vars["aws_secret_access_key"] = ""

    # Create a temp directory in which we will place all the files to be
    # deployed after we substitute template parameters in them
    tmp_dir = tempfile.mkdtemp()
    for path, dirs, files in os.walk(root_dir):
        if path.find(".svn") == -1:
            dest_dir = os.path.join('/', path[len(root_dir):])
            local_dir = tmp_dir + dest_dir
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            for filename in files:
                if filename[0] not in '#.~' and filename[-1] != '~':
                    dest_file = os.path.join(dest_dir, filename)
                    local_file = tmp_dir + dest_file
                    with open(os.path.join(path, filename)) as src:
                        with open(local_file, "w") as dest:
                            text = src.read()
                            for key in template_vars:
                                text = text.replace("{{" + key + "}}", template_vars[key])
                            dest.write(text)
                            dest.close()

    # rsync the whole directory over to the master machine
    command = [
        'rsync', '-rv',
        '-e', stringify_command(ssh_command(opts)),
        "%s/" % tmp_dir,
        "%s@%s:/" % (opts.user, active_master)
    ]
    subprocess.check_call(command)
    # Remove the temp directory we created above
    shutil.rmtree(tmp_dir)


def stringify_command(parts):
    if isinstance(parts, str):
        return parts
    else:
        return ' '.join(map(pipes.quote, parts))


def ssh_args(opts):
    parts = ['-o', 'StrictHostKeyChecking=no']
    parts += ['-o', 'UserKnownHostsFile=/dev/null']
    if opts.identity_file is not None:
        parts += ['-i', opts.identity_file]
    return parts


def ssh_command(opts):
    return ['ssh'] + ssh_args(opts)


# Run a command on a host through ssh, retrying up to five times
# and then throwing an exception if ssh continues to fail.
def ssh(host, opts, command):
    tries = 0
    while True:
        try:
            return subprocess.check_call(
                ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host),
                                     stringify_command(command)])
        except subprocess.CalledProcessError as e:
            if tries > 5:
                # If this was an ssh failure, provide the user with hints.
                if e.returncode == 255:
                    raise UsageError(
                        "Failed to SSH to remote host {0}.\n"
                        "Please check that you have provided the correct --identity-file and "
                        "--key-pair parameters and try again.".format(host))
                else:
                    raise e
            print >> stderr, \
                "Error executing remote command, retrying after 30 seconds: {0}".format(e)
            time.sleep(30)
            tries = tries + 1


# Backported from Python 2.7 for compatibility with 2.6 (See SPARK-1990)
def _check_output(*popenargs, **kwargs):
    if 'stdout' in kwargs:
        raise ValueError('stdout argument not allowed, it will be overridden.')
    process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
    output, unused_err = process.communicate()
    retcode = process.poll()
    if retcode:
        cmd = kwargs.get("args")
        if cmd is None:
            cmd = popenargs[0]
        raise subprocess.CalledProcessError(retcode, cmd, output=output)
    return output


def ssh_read(host, opts, command):
    return _check_output(
        ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)])


def ssh_write(host, opts, command, arguments):
    tries = 0
    while True:
        proc = subprocess.Popen(
            ssh_command(opts) + ['%s@%s' % (opts.user, host), stringify_command(command)],
            stdin=subprocess.PIPE)
        proc.stdin.write(arguments)
        proc.stdin.close()
        status = proc.wait()
        if status == 0:
            break
        elif tries > 5:
            raise RuntimeError("ssh_write failed with error %s" % proc.returncode)
        else:
            print >> stderr, \
                "Error {0} while executing remote command, retrying after 30 seconds".format(status)
            time.sleep(30)
            tries = tries + 1


# Gets a list of zones to launch instances in
def get_zones(conn, opts):
    if opts.zone == 'all':
        zones = [z.name for z in conn.get_all_zones()]
    else:
        zones = [opts.zone]
    return zones


# Gets the number of items in a partition
def get_partition(total, num_partitions, current_partitions):
    num_slaves_this_zone = total / num_partitions
    if (total % num_partitions) - current_partitions > 0:
        num_slaves_this_zone += 1
    return num_slaves_this_zone


def real_main():
    (opts, action, cluster_name) = parse_args()

    # Input parameter validation
    get_validate_spark_version(opts.spark_version, opts.spark_git_repo)

    if opts.wait is not None:
        # NOTE: DeprecationWarnings are silent in 2.7+ by default.
        #       To show them, run Python with the -Wdefault switch.
        #       See: https://docs.python.org/3.5/whatsnew/2.7.html
        warnings.warn(
            "This option is deprecated and has no effect. "
            "spark-ec2 automatically waits as long as necessary for clusters to start up.",
            DeprecationWarning
        )

    if opts.identity_file is not None:
        if not os.path.exists(opts.identity_file):
            print >> stderr,\
                "ERROR: The identity file '{f}' doesn't exist.".format(f=opts.identity_file)
            sys.exit(1)

        file_mode = os.stat(opts.identity_file).st_mode
        if not (file_mode & S_IRUSR) or not oct(file_mode)[-2:] == '00':
            print >> stderr, "ERROR: The identity file must be accessible only by you."
            print >> stderr, 'You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file)
            sys.exit(1)

    if opts.instance_type not in EC2_INSTANCE_TYPES:
        print >> stderr, "Warning: Unrecognized EC2 instance type for instance-type: {t}".format(
            t=opts.instance_type)

    if opts.master_instance_type != "":
        if opts.master_instance_type not in EC2_INSTANCE_TYPES:
            print >> stderr, \
                "Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format(
                    t=opts.master_instance_type)

    if opts.ebs_vol_num > 8:
        print >> stderr, "ebs-vol-num cannot be greater than 8"
        sys.exit(1)

    # Prevent breaking ami_prefix (/, .git and startswith checks)
    # Prevent forks with non spark-ec2 names for now.
    if opts.spark_ec2_git_repo.endswith("/") or \
            opts.spark_ec2_git_repo.endswith(".git") or \
            not opts.spark_ec2_git_repo.startswith("https://github.com") or \
            not opts.spark_ec2_git_repo.endswith("spark-ec2"):
        print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \
                         "trailing / or .git. " \
                         "Furthermore, we currently only support forks named spark-ec2."
        sys.exit(1)

    try:
        conn = ec2.connect_to_region(opts.region)
    except Exception as e:
        print >> stderr, (e)
        sys.exit(1)

    # Select an AZ at random if it was not specified.
    if opts.zone == "":
        opts.zone = random.choice(conn.get_all_zones()).name

    if action == "launch":
        if opts.slaves <= 0:
            print >> sys.stderr, "ERROR: You have to start at least 1 slave"
            sys.exit(1)
        if opts.resume:
            (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)
        else:
            (master_nodes, slave_nodes) = launch_cluster(conn, opts, cluster_name)
        wait_for_cluster_state(
            conn=conn,
            opts=opts,
            cluster_instances=(master_nodes + slave_nodes),
            cluster_state='ssh-ready'
        )
        setup_cluster(conn, master_nodes, slave_nodes, opts, True)

    elif action == "destroy":
        print "Are you sure you want to destroy the cluster %s?" % cluster_name
        print "The following instances will be terminated:"
        (master_nodes, slave_nodes) = get_existing_cluster(
            conn, opts, cluster_name, die_on_error=False)
        for inst in master_nodes + slave_nodes:
            print "> %s" % inst.public_dns_name

        msg = "ALL DATA ON ALL NODES WILL BE LOST!!\nDestroy cluster %s (y/N): " % cluster_name
        response = raw_input(msg)
        if response == "y":
            print "Terminating master..."
            for inst in master_nodes:
                inst.terminate()
            print "Terminating slaves..."
            for inst in slave_nodes:
                inst.terminate()

            # Delete security groups as well
            if opts.delete_groups:
                print "Deleting security groups (this will take some time)..."
                group_names = [cluster_name + "-master", cluster_name + "-slaves"]
                wait_for_cluster_state(
                    conn=conn,
                    opts=opts,
                    cluster_instances=(master_nodes + slave_nodes),
                    cluster_state='terminated'
                )
                attempt = 1
                while attempt <= 3:
                    print "Attempt %d" % attempt
                    groups = [g for g in conn.get_all_security_groups() if g.name in group_names]
                    success = True
                    # Delete individual rules in all groups before deleting groups to
                    # remove dependencies between them
                    for group in groups:
                        print "Deleting rules in security group " + group.name
                        for rule in group.rules:
                            for grant in rule.grants:
                                success &= group.revoke(ip_protocol=rule.ip_protocol,
                                                        from_port=rule.from_port,
                                                        to_port=rule.to_port,
                                                        src_group=grant)

                    # Sleep for AWS eventual-consistency to catch up, and for instances
                    # to terminate
                    time.sleep(30)  # Yes, it does have to be this long :-(
                    for group in groups:
                        try:
                            conn.delete_security_group(group.name)
                            print "Deleted security group " + group.name
                        except boto.exception.EC2ResponseError:
                            success = False
                            print "Failed to delete security group " + group.name

                    # Unfortunately, group.revoke() returns True even if a rule was not
                    # deleted, so this needs to be rerun if something fails
                    if success:
                        break

                    attempt += 1

                if not success:
                    print "Failed to delete all security groups after 3 tries."
                    print "Try re-running in a few minutes."

    elif action == "login":
        (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)
        master = master_nodes[0].public_dns_name
        print "Logging into master " + master + "..."
        proxy_opt = []
        if opts.proxy_port is not None:
            proxy_opt = ['-D', opts.proxy_port]
        subprocess.check_call(
            ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)])

    elif action == "reboot-slaves":
        response = raw_input(
            "Are you sure you want to reboot the cluster " +
            cluster_name + " slaves?\n" +
            "Reboot cluster slaves " + cluster_name + " (y/N): ")
        if response == "y":
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            print "Rebooting slaves..."
            for inst in slave_nodes:
                if inst.state not in ["shutting-down", "terminated"]:
                    print "Rebooting " + inst.id
                    inst.reboot()

    elif action == "get-master":
        (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)
        print master_nodes[0].public_dns_name

    elif action == "stop":
        response = raw_input(
            "Are you sure you want to stop the cluster " +
            cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " +
            "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" +
            "AMAZON EBS IF IT IS EBS-BACKED!!\n" +
            "All data on spot-instance slaves will be lost.\n" +
            "Stop cluster " + cluster_name + " (y/N): ")
        if response == "y":
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            print "Stopping master..."
            for inst in master_nodes:
                if inst.state not in ["shutting-down", "terminated"]:
                    inst.stop()
            print "Stopping slaves..."
            for inst in slave_nodes:
                if inst.state not in ["shutting-down", "terminated"]:
                    if inst.spot_instance_request_id:
                        inst.terminate()
                    else:
                        inst.stop()

    elif action == "start":
        (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)
        print "Starting slaves..."
        for inst in slave_nodes:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        print "Starting master..."
        for inst in master_nodes:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        wait_for_cluster_state(
            conn=conn,
            opts=opts,
            cluster_instances=(master_nodes + slave_nodes),
            cluster_state='ssh-ready'
        )
        setup_cluster(conn, master_nodes, slave_nodes, opts, False)

    else:
        print >> stderr, "Invalid action: %s" % action
        sys.exit(1)


def main():
    try:
        real_main()
    except UsageError, e:
        print >> stderr, "\nError:\n", e
        sys.exit(1)


if __name__ == "__main__":
    logging.basicConfig()
    main()