/lib/galaxy/cloud/providers/ec2.py
Python | 1033 lines | 1006 code | 14 blank | 13 comment | 59 complexity | d6f582a6b4ede3dc825c5f2ce22096b4 MD5 | raw file
1import subprocess, threading, os, errno, time, datetime 2from Queue import Queue, Empty 3from datetime import datetime 4 5from galaxy import model # Database interaction class 6from galaxy.model import mapping 7from galaxy.datatypes.data import nice_size 8from galaxy.util.bunch import Bunch 9from galaxy.cloud import UCIwrapper 10from Queue import Queue 11from sqlalchemy import or_, and_ 12 13import galaxy.eggs 14galaxy.eggs.require("boto") 15from boto.ec2.connection import EC2Connection 16from boto.ec2.regioninfo import RegionInfo 17import boto.exception 18import boto 19 20import logging 21log = logging.getLogger( __name__ ) 22 23uci_states = Bunch( 24 NEW_UCI = "newUCI", 25 NEW = "new", 26 CREATING = "creating", 27 DELETING_UCI = "deletingUCI", 28 DELETING = "deleting", 29 SUBMITTED_UCI = "submittedUCI", 30 SUBMITTED = "submitted", 31 SHUTTING_DOWN_UCI = "shutting-downUCI", 32 SHUTTING_DOWN = "shutting-down", 33 AVAILABLE = "available", 34 RUNNING = "running", 35 PENDING = "pending", 36 ERROR = "error", 37 DELETED = "deleted", 38 SNAPSHOT_UCI = "snapshotUCI", 39 SNAPSHOT = "snapshot" 40) 41 42instance_states = Bunch( 43 TERMINATED = "terminated", 44 SUBMITTED = "submitted", 45 RUNNING = "running", 46 PENDING = "pending", 47 SHUTTING_DOWN = "shutting-down", 48 ERROR = "error" 49) 50 51store_status = Bunch( 52 WAITING = "waiting", 53 IN_USE = "in-use", 54 CREATING = "creating", 55 DELETED = 'deleted', 56 ERROR = "error" 57) 58 59snapshot_status = Bunch( 60 SUBMITTED = 'submitted', 61 PENDING = 'pending', 62 COMPLETED = 'completed', 63 DELETE = 'delete', 64 DELETED= 'deleted', 65 ERROR = "error" 66) 67 68class EC2CloudProvider( object ): 69 """ 70 Amazon EC2-based cloud provider implementation for managing instances. 71 """ 72 STOP_SIGNAL = object() 73 def __init__( self, app ): 74 self.type = "ec2" # cloud provider type (e.g., ec2, eucalyptus, opennebula) 75 self.zone = "us-east-1a" 76 self.security_group = "galaxyWeb" 77 self.queue = Queue() 78 self.sa_session = app.model.context 79 80 self.threads = [] 81 nworkers = 5 82 log.info( "Starting EC2 cloud controller workers..." ) 83 for i in range( nworkers ): 84 worker = threading.Thread( target=self.run_next ) 85 worker.start() 86 self.threads.append( worker ) 87 log.debug( "%d EC2 cloud workers ready", nworkers ) 88 89 def shutdown( self ): 90 """Attempts to gracefully shut down the monitor thread""" 91 log.info( "sending stop signal to worker threads in EC2 cloud manager" ) 92 for i in range( len( self.threads ) ): 93 self.queue.put( self.STOP_SIGNAL ) 94 log.info( "EC2 cloud manager stopped" ) 95 96 def put( self, uci_wrapper ): 97 """ 98 Add uci_wrapper object to the end of the request queue to be handled by 99 this cloud provider. 100 """ 101 state = uci_wrapper.get_uci_state() 102 uci_wrapper.change_state( state.split('U')[0] ) # remove 'UCI' from end of state description (i.e., mark as accepted and ready for processing) 103 self.queue.put( uci_wrapper ) 104 105 def run_next( self ): 106 """Process next request, waiting until one is available if necessary.""" 107 cnt = 0 108 while 1: 109 110 uci_wrapper = self.queue.get() 111 uci_state = uci_wrapper.get_uci_state() 112 if uci_state is self.STOP_SIGNAL: 113 return 114 try: 115 if uci_state==uci_states.NEW: 116 self.create_uci( uci_wrapper ) 117 elif uci_state==uci_states.DELETING: 118 self.delete_uci( uci_wrapper ) 119 elif uci_state==uci_states.SUBMITTED: 120 self.start_uci( uci_wrapper ) 121 elif uci_state==uci_states.SHUTTING_DOWN: 122 self.stop_uci( uci_wrapper ) 123 elif uci_state==uci_states.SNAPSHOT: 124 self.snapshot_uci( uci_wrapper ) 125 except: 126 log.exception( "Uncaught exception executing cloud request." ) 127 cnt += 1 128 129 def get_connection( self, uci_wrapper ): 130 """ 131 Establishes cloud connection using user's credentials associated with given UCI 132 """ 133 log.debug( 'Establishing %s cloud connection.' % self.type ) 134 provider = uci_wrapper.get_provider() 135 try: 136 region = RegionInfo( None, provider.region_name, provider.region_endpoint ) 137 except Exception, ex: 138 err = "Selecting region with cloud provider failed: " + str( ex ) 139 log.error( err ) 140 uci_wrapper.set_error( err, True ) 141 return None 142 try: 143 conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(), 144 aws_secret_access_key=uci_wrapper.get_secret_key(), 145 is_secure=provider.is_secure, 146 region=region, 147 path=provider.path ) 148 except boto.exception.EC2ResponseError, e: 149 err = "Establishing connection with cloud failed: " + str( e ) 150 log.error( err ) 151 uci_wrapper.set_error( err, True ) 152 return None 153 154 return conn 155 156 def check_key_pair( self, uci_wrapper, conn ): 157 """ 158 Check if a key pair associated with this UCI exists on cloud provider. 159 If yes, return key pair name; otherwise, generate a key pair with the cloud 160 provider and, again, return key pair name. 161 Key pair name for given UCI is generated from UCI's name and suffix '_kp' 162 """ 163 kp = None 164 kp_name = uci_wrapper.get_name().replace(' ','_') + "_kp" 165 log.debug( "Checking user's key pair: '%s'" % kp_name ) 166 try: 167 kp = conn.get_key_pair( kp_name ) 168 uci_kp_name = uci_wrapper.get_key_pair_name() 169 uci_material = uci_wrapper.get_key_pair_material() 170 if kp != None: 171 if kp.name != uci_kp_name or uci_material == None: 172 # key pair exists on the cloud but not in local database, so re-generate it (i.e., delete and then create) 173 try: 174 conn.delete_key_pair( kp_name ) 175 kp = self.create_key_pair( conn, kp_name ) 176 uci_wrapper.set_key_pair( kp.name, kp.material ) 177 except boto.exception.EC2ResponseError, e: 178 err = "EC2 response error while deleting key pair: " + str( e ) 179 log.error( err ) 180 uci_wrapper.set_error( err, True ) 181 else: 182 try: 183 kp = self.create_key_pair( conn, kp_name ) 184 uci_wrapper.set_key_pair( kp.name, kp.material ) 185 except boto.exception.EC2ResponseError, e: 186 err = "EC2 response error while creating key pair: " + str( e ) 187 log.error( err ) 188 uci_wrapper.set_error( err, True ) 189 except Exception, ex: 190 err = "Exception while creating key pair: " + str( ex ) 191 log.error( err ) 192 uci_wrapper.set_error( err, True ) 193 except boto.exception.EC2ResponseError, e: # No keypair under this name exists so create it 194 if e.code == 'InvalidKeyPair.NotFound': 195 log.info( "No keypair found, creating keypair '%s'" % kp_name ) 196 kp = self.create_key_pair( conn, kp_name ) 197 uci_wrapper.set_key_pair( kp.name, kp.material ) 198 else: 199 err = "EC2 response error while retrieving key pair: " + str( e ) 200 log.error( err ) 201 uci_wrapper.set_error( err, True ) 202 203 if kp != None: 204 return kp.name 205 else: 206 return None 207 208 def create_key_pair( self, conn, kp_name ): 209 """ Initiate creation of key pair under kp_name by current cloud provider. """ 210 try: 211 return conn.create_key_pair( kp_name ) 212 except boto.exception.EC2ResponseError, e: 213 return None 214 215 def get_mi_id( self, uci_wrapper, i_index ): 216 """ 217 Get appropriate machine image (mi) based on instance size. 218 """ 219 i_type = uci_wrapper.get_instance_type( i_index ) 220 if i_type=='m1.small' or i_type=='c1.medium': 221 arch = 'i386' 222 else: 223 arch = 'x86_64' 224 225 mi = self.sa_session.query( model.CloudImage ).filter_by( deleted=False, provider_type=self.type, architecture=arch ).first() 226 if mi: 227 return mi.image_id 228 else: 229 err = "Machine image could not be retrieved" 230 log.error( "%s for UCI '%s'." % (err, uci_wrapper.get_name() ) ) 231 uci_wrapper.set_error( err+". Contact site administrator to ensure needed machine image is registered.", True ) 232 return None 233 234 def create_uci( self, uci_wrapper ): 235 """ 236 Create User Configured Instance (UCI) - i.e., create storage volume on cloud provider 237 and register relevant information in local Galaxy database. 238 """ 239 conn = self.get_connection( uci_wrapper ) 240 if uci_wrapper.get_uci_availability_zone()=='': 241 log.info( "Availability zone for UCI (i.e., storage volume) was not selected, using default zone: %s" % self.zone ) 242 uci_wrapper.set_store_availability_zone( self.zone ) 243 244 log.info( "Creating volume in zone '%s'..." % uci_wrapper.get_uci_availability_zone() ) 245 # Because only 1 storage volume may be created at UCI config time, index of this storage volume in local Galaxy DB w.r.t 246 # current UCI is 0, so reference it in following methods 247 vol = conn.create_volume( uci_wrapper.get_store_size( 0 ), uci_wrapper.get_uci_availability_zone(), snapshot=None ) 248 uci_wrapper.set_store_volume_id( 0, vol.id ) 249 250 # Wait for a while to ensure volume was created 251# vol_status = vol.status 252# for i in range( 30 ): 253# if vol_status is not "available": 254# log.debug( 'Updating volume status; current status: %s' % vol_status ) 255# vol_status = vol.status 256# time.sleep(3) 257# if i is 29: 258# log.debug( "Error while creating volume '%s'; stuck in state '%s'; deleting volume." % ( vol.id, vol_status ) ) 259# conn.delete_volume( vol.id ) 260# uci_wrapper.change_state( uci_state='error' ) 261# return 262 263 # Retrieve created volume again to get updated status 264 try: 265 vl = conn.get_all_volumes( [vol.id] ) 266 except boto.exception.EC2ResponseError, e: 267 err = "EC2 response error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( e ) 268 log.error( err ) 269 uci_wrapper.set_store_status( vol.id, uci_states.ERROR ) 270 uci_wrapper.set_error( err, True ) 271 return 272 except Exception, ex: 273 err = "Error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( ex ) 274 log.error( err ) 275 uci_wrapper.set_error( err, True ) 276 return 277 278 if len( vl ) > 0: 279 uci_wrapper.change_state( uci_state=vl[0].status ) 280 uci_wrapper.set_store_status( vol.id, vl[0].status ) 281 else: 282 err = "Volume '" + vol.id +"' not found by EC2 after being created." 283 log.error( err ) 284 uci_wrapper.set_store_status( vol.id, uci_states.ERROR ) 285 uci_wrapper.set_error( err, True ) 286 287 def delete_uci( self, uci_wrapper ): 288 """ 289 Delete UCI - i.e., delete all storage volumes associated with this UCI. 290 NOTE that this implies deletion of any and all data associated 291 with this UCI from the cloud. All data will be deleted. 292 Information in local Galaxy database is marked as deleted but not actually removed 293 from the database. 294 """ 295 conn = self.get_connection( uci_wrapper ) 296 vl = [] # volume list 297 count = 0 # counter for checking if all volumes assoc. w/ UCI were deleted 298 299 # Get all volumes assoc. w/ UCI, delete them from cloud as well as in local DB 300 vl = uci_wrapper.get_all_stores() 301 deletedList = [] 302 failedList = [] 303 for v in vl: 304 log.debug( "Deleting volume with id='%s'" % v.volume_id ) 305 try: 306 if conn.delete_volume( v.volume_id ): 307 deletedList.append( v.volume_id ) 308 v.deleted = True 309 self.sa_session.add( v ) 310 self.sa_session.flush() 311 count += 1 312 else: 313 failedList.append( v.volume_id ) 314 except boto.exception.EC2ResponseError, e: 315 err = "EC2 response error while deleting storage volume '" + v.volume_id + "': " + str( e ) 316 log.error( err ) 317 uci_wrapper.set_store_error( err, store_id = v.volume_id ) 318 uci_wrapper.set_error( err, True ) 319 320 # Delete UCI if all of associated 321 if count == len( vl ): 322 uci_wrapper.set_deleted() 323 else: 324 err = "Deleting following volume(s) failed: " + str( failedList ) + ". However, these volumes were successfully deleted: " \ 325 + str( deletedList ) + ". MANUAL intervention and processing needed." 326 log.error( err ) 327 uci_wrapper.set_error( err, True ) 328 329 def snapshot_uci( self, uci_wrapper ): 330 """ 331 Initiate creation of a snapshot by cloud provider for all storage volumes 332 associated with this UCI. 333 """ 334 if uci_wrapper.get_uci_state() != uci_states.ERROR: 335 conn = self.get_connection( uci_wrapper ) 336 337 snapshots = uci_wrapper.get_snapshots( status = snapshot_status.SUBMITTED ) 338 for snapshot in snapshots: 339 log.debug( "Snapshot DB id: '%s', volume id: '%s'" % ( snapshot.id, snapshot.store.volume_id ) ) 340 try: 341 snap = conn.create_snapshot( volume_id=snapshot.store.volume_id ) 342 snap_id = str( snap ).split(':')[1] 343 uci_wrapper.set_snapshot_id( snapshot.id, snap_id ) 344 sh = conn.get_all_snapshots( snap_id ) # get updated status 345 uci_wrapper.set_snapshot_status( status=sh[0].status, snap_id=snap_id ) 346 except boto.exception.EC2ResponseError, e: 347 err = "EC2 response error while creating snapshot: " + str( e ) 348 log.error( err ) 349 uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True ) 350 uci_wrapper.set_error( err, True ) 351 return 352 except Exception, ex: 353 err = "Error while creating snapshot: " + str( ex ) 354 log.error( err ) 355 uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True ) 356 uci_wrapper.set_error( err, True ) 357 return 358 359 uci_wrapper.change_state( uci_state=uci_states.AVAILABLE ) 360 361 def add_storage_to_uci( self, name ): 362 """ Adds more storage to specified UCI 363 TODO""" 364 365 def dummy_start_uci( self, uci_wrapper ): 366 367 uci = uci_wrapper.get_uci() 368 log.debug( "Would be starting instance '%s'" % uci.name ) 369 uci_wrapper.change_state( uci_state.PENDING ) 370# log.debug( "Sleeping a bit... (%s)" % uci.name ) 371# time.sleep(20) 372# log.debug( "Woke up! (%s)" % uci.name ) 373 374 def start_uci( self, uci_wrapper ): 375 """ 376 Start instance(s) of given UCI on the cloud. 377 """ 378 if uci_wrapper.get_uci_state() != uci_states.ERROR: 379 conn = self.get_connection( uci_wrapper ) 380 self.check_key_pair( uci_wrapper, conn ) 381 if uci_wrapper.get_key_pair_name() == None: 382 err = "Key pair not found" 383 log.error( "%s for UCI '%s'." % ( err, uci_wrapper.get_name() ) ) 384 uci_wrapper.set_error( err + ". Try resetting the state and starting the instance again.", True ) 385 return 386 387 i_indexes = uci_wrapper.get_instances_indexes( state=instance_states.SUBMITTED ) # Get indexes of i_indexes associated with this UCI that are in 'submitted' state 388 log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( i_indexes, uci_wrapper.get_name(), ) ) 389 if len( i_indexes ) > 0: 390 for i_index in i_indexes: 391 # Get machine image for current instance 392 mi_id = self.get_mi_id( uci_wrapper, i_index ) 393 log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name(): %s" % ( mi_id, uci_wrapper.get_key_pair_name() ) ) 394 uci_wrapper.set_mi( i_index, mi_id ) 395 396 if mi_id != None: 397 # Check if galaxy security group exists (and create it if it does not) 398 log.debug( "Setting up '%s' security group." % self.security_group ) 399 try: 400 conn.get_all_security_groups( [self.security_group] ) # security groups 401 except boto.exception.EC2ResponseError, e: 402 if e.code == 'InvalidGroup.NotFound': 403 log.info( "No security group found, creating security group '%s'" % self.security_group ) 404 try: 405 gSecurityGroup = conn.create_security_group(self.security_group, 'Security group for Galaxy.') 406 gSecurityGroup.authorize( 'tcp', 80, 80, '0.0.0.0/0' ) # Open HTTP port 407 gSecurityGroup.authorize( 'tcp', 22, 22, '0.0.0.0/0' ) # Open SSH port 408 except boto.exception.EC2ResponseError, ee: 409 err = "EC2 response error while creating security group: " + str( ee ) 410 log.error( err ) 411 uci_wrapper.set_error( err, True ) 412 else: 413 err = "EC2 response error while retrieving security group: " + str( e ) 414 log.error( err ) 415 uci_wrapper.set_error( err, True ) 416 417 418 if uci_wrapper.get_uci_state() != uci_states.ERROR: 419 # Start an instance 420 log.debug( "Starting instance for UCI '%s'" % uci_wrapper.get_name() ) 421 #TODO: Once multiple volumes can be attached to a single instance, update 'userdata' composition 422 userdata = uci_wrapper.get_store_volume_id()+"|"+uci_wrapper.get_access_key()+"|"+uci_wrapper.get_secret_key() 423 log.debug( "Using following command: conn.run_instances( image_id='%s', key_name='%s', security_groups=['%s'], user_data=[OMITTED], instance_type='%s', placement='%s' )" 424 % ( mi_id, uci_wrapper.get_key_pair_name(), self.security_group, uci_wrapper.get_instance_type( i_index ), uci_wrapper.get_uci_availability_zone() ) ) 425 reservation = None 426 try: 427 reservation = conn.run_instances( image_id=mi_id, 428 key_name=uci_wrapper.get_key_pair_name(), 429 security_groups=[self.security_group], 430 user_data=userdata, 431 instance_type=uci_wrapper.get_instance_type( i_index ), 432 placement=uci_wrapper.get_uci_availability_zone() ) 433 except boto.exception.EC2ResponseError, e: 434 err = "EC2 response error when starting UCI '"+ uci_wrapper.get_name() +"': " + str( e ) 435 log.error( err ) 436 uci_wrapper.set_error( err, True ) 437 except Exception, ex: 438 err = "Error when starting UCI '" + uci_wrapper.get_name() + "': " + str( ex ) 439 log.error( err ) 440 uci_wrapper.set_error( err, True ) 441 # Record newly available instance data into local Galaxy database 442 if reservation: 443 l_time = datetime.utcnow() 444 # uci_wrapper.set_instance_launch_time( self.format_time( reservation.instances[0].launch_time ), i_index=i_index ) 445 uci_wrapper.set_instance_launch_time( l_time, i_index=i_index ) 446 if not uci_wrapper.uci_launch_time_set(): 447 uci_wrapper.set_uci_launch_time( l_time ) 448 try: 449 uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] ) 450 # TODO: if more than a single instance will be started through single reservation, change this reference to element [0] 451 i_id = str( reservation.instances[0]).split(":")[1] 452 uci_wrapper.set_instance_id( i_index, i_id ) 453 s = reservation.instances[0].state 454 uci_wrapper.change_state( s, i_id, s ) 455 uci_wrapper.set_security_group_name( self.security_group, i_id=i_id ) 456 vol_id = uci_wrapper.get_store_volume_id( store_id=0 ) # TODO: Once more that one vol/UCI is allowed, update this! 457 uci_wrapper.set_store_status( vol_id, store_status.WAITING ) 458 log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_uci_state() ) ) 459 except boto.exception.EC2ResponseError, e: 460 err = "EC2 response error when retrieving instance information for UCI '" + uci_wrapper.get_name() + "': " + str( e ) 461 log.error( err ) 462 uci_wrapper.set_error( err, True ) 463 else: 464 log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() ) 465 else: 466 err = "No instances in state '"+ instance_states.SUBMITTED +"' found for UCI '" + uci_wrapper.get_name() + \ 467 "'. Nothing to start." 468 log.error( err ) 469 uci_wrapper.set_error( err, True ) 470 else: 471 log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() ) 472 473 def stop_uci( self, uci_wrapper): 474 """ 475 Stop all of cloud instances associated with given UCI. 476 """ 477 conn = self.get_connection( uci_wrapper ) 478 479 # Get all instances associated with given UCI 480 il = uci_wrapper.get_instances_ids() # instance list 481 # Process list of instances and remove any references to empty instance id's 482 for i in il: 483 if i is None: 484 il.remove( i ) 485 log.debug( 'List of instances being terminated: %s' % il ) 486 rl = conn.get_all_instances( il ) # Reservation list associated with given instances 487 488 # Initiate shutdown of all instances under given UCI 489 cnt = 0 490 stopped = [] 491 not_stopped = [] 492 for r in rl: 493 for inst in r.instances: 494 log.debug( "Sending stop signal to instance '%s' associated with reservation '%s'." % ( inst, r ) ) 495 try: 496 inst.stop() 497 uci_wrapper.set_stop_time( datetime.utcnow(), i_id=inst.id ) 498 uci_wrapper.change_state( instance_id=inst.id, i_state=inst.update() ) 499 stopped.append( inst ) 500 except boto.exception.EC2ResponseError, e: 501 not_stopped.append( inst ) 502 err = "EC2 response error when stopping instance '" + inst.instance_id + "': " + str(e) 503 log.error( err ) 504 uci_wrapper.set_error( err, True ) 505 506 uci_wrapper.reset_uci_launch_time() 507 log.debug( "Termination was initiated for all instances of UCI '%s'." % uci_wrapper.get_name() ) 508 509 510# dbInstances = get_instances( trans, uci ) #TODO: handle list! 511# 512# # Get actual cloud instance object 513# cloudInstance = get_cloud_instance( conn, dbInstances.instance_id ) 514# 515# # TODO: Detach persistent storage volume(s) from instance and update volume data in local database 516# stores = get_stores( trans, uci ) 517# for i, store in enumerate( stores ): 518# log.debug( "Detaching volume '%s' to instance '%s'." % ( store.volume_id, dbInstances.instance_id ) ) 519# mntDevice = store.device 520# volStat = None 521## Detaching volume does not work with Eucalyptus Public Cloud, so comment it out 522## try: 523## volStat = conn.detach_volume( store.volume_id, dbInstances.instance_id, mntDevice ) 524## except: 525## log.debug ( 'Error detaching volume; still going to try and stop instance %s.' % dbInstances.instance_id ) 526# store.attach_time = None 527# store.device = None 528# store.i_id = None 529# store.status = volStat 530# log.debug ( '***** volume status: %s' % volStat ) 531# 532# 533# # Stop the instance and update status in local database 534# cloudInstance.stop() 535# dbInstances.stop_time = datetime.utcnow() 536# while cloudInstance.state != 'terminated': 537# log.debug( "Stopping instance %s state; current state: %s" % ( str( cloudInstance ).split(":")[1], cloudInstance.state ) ) 538# time.sleep(3) 539# cloudInstance.update() 540# dbInstances.state = cloudInstance.state 541# 542# # Reset relevant UCI fields 543# uci.state = 'available' 544# uci.launch_time = None 545# 546# # Persist 547# session = trans.sa_session 548## session.save_or_update( stores ) 549# session.save_or_update( dbInstances ) # TODO: Is this going to work w/ multiple instances stored in dbInstances variable? 550# session.save_or_update( uci ) 551# session.flush() 552# trans.log_event( "User stopped cloud instance '%s'" % uci.name ) 553# trans.set_message( "Galaxy instance '%s' stopped." % uci.name ) 554 555 def update( self ): 556 """ 557 Run status update on all instances that are in 'running', 'pending', or 'shutting-down' state. 558 Run status update on all storage volumes whose status is 'in-use', 'creating', or 'None'. 559 Run status update on all snapshots whose status is 'pending' or 'delete' 560 Run status update on any zombie UCIs, i.e., UCI's that is in 'submitted' state for an 561 extended period of time. 562 563 Reason behind this method is to sync state of local DB and real-world resources 564 """ 565 log.debug( "Running general status update for %s UCIs..." % self.type ) 566 # Update instances 567 instances = self.sa_session.query( model.CloudInstance ) \ 568 .filter( or_( model.CloudInstance.table.c.state==instance_states.RUNNING, 569 model.CloudInstance.table.c.state==instance_states.PENDING, 570 model.CloudInstance.table.c.state==instance_states.SHUTTING_DOWN ) ) \ 571 .all() 572 for inst in instances: 573 if self.type == inst.uci.credentials.provider.type: 574 log.debug( "[%s] Running general status update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.instance_id ) ) 575 self.update_instance( inst ) 576 577 # Update storage volume(s) 578 stores = self.sa_session.query( model.CloudStore ) \ 579 .filter( or_( model.CloudStore.table.c.status==store_status.IN_USE, 580 model.CloudStore.table.c.status==store_status.CREATING, 581 model.CloudStore.table.c.status==store_status.WAITING, 582 model.CloudStore.table.c.status==None ) ) \ 583 .all() 584 for store in stores: 585 if self.type == store.uci.credentials.provider.type: # and store.volume_id != None: 586 log.debug( "[%s] Running general status update on store with local database ID: '%s'" % ( store.uci.credentials.provider.type, store.id ) ) 587 self.update_store( store ) 588# else: 589# log.error( "[%s] There exists an entry for UCI (%s) storage volume without an ID. Storage volume might have been created with " 590# "cloud provider though. Manual check is recommended." % ( store.uci.credentials.provider.type, store.uci.name ) ) 591# store.uci.error = "There exists an entry in local database for a storage volume without an ID. Storage volume might have been created " \ 592# "with cloud provider though. Manual check is recommended. After understanding what happened, local database entry for given " \ 593# "storage volume should be updated." 594# store.status = store_status.ERROR 595# store.uci.state = uci_states.ERROR 596# store.uci.flush() 597# store.flush() 598 599 # Update pending snapshots or delete ones marked for deletion 600 snapshots = self.sa_session.query( model.CloudSnapshot ) \ 601 .filter( or_( model.CloudSnapshot.table.c.status == snapshot_status.PENDING, model.CloudSnapshot.table.c.status == snapshot_status.DELETE ) ) \ 602 .all() 603 for snapshot in snapshots: 604 if self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.PENDING: 605 log.debug( "[%s] Running general status update on snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) ) 606 self.update_snapshot( snapshot ) 607 elif self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.DELETE: 608 log.debug( "[%s] Initiating deletion of snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) ) 609 self.delete_snapshot( snapshot ) 610 611 # Attempt at updating any zombie UCIs (i.e., instances that have been in SUBMITTED state for longer than expected - see below for exact time) 612 zombies = self.sa_session.query( model.UCI ).filter_by( state=uci_states.SUBMITTED ).all() 613 for zombie in zombies: 614 z_instances = self.sa_session.query( model.CloudInstance ) \ 615 .filter_by( uci_id=zombie.id ) \ 616 .filter( or_( model.CloudInstance.table.c.state != instance_states.TERMINATED, 617 model.CloudInstance.table.c.state == None ) ) \ 618 .all() 619 for z_inst in z_instances: 620 if self.type == z_inst.uci.credentials.provider.type: 621# log.debug( "z_inst.id: '%s', state: '%s'" % ( z_inst.id, z_inst.state ) ) 622 td = datetime.utcnow() - z_inst.update_time 623 if td.seconds > 180: # if instance has been in SUBMITTED state for more than 3 minutes 624 log.debug( "[%s] Running zombie repair update on instance with DB id '%s'" % ( z_inst.uci.credentials.provider.type, z_inst.id ) ) 625 self.process_zombie( z_inst ) 626 627 def update_instance( self, inst ): 628 """ 629 Update information in local database for given instance as it is obtained from cloud provider. 630 Along with updating information about given instance, information about the UCI controlling 631 this instance is also updated. 632 """ 633 # Get credentials associated wit this instance 634 uci_id = inst.uci_id 635 uci = self.sa_session.query( model.UCI ).get( uci_id ) 636 self.sa_session.refresh( uci ) 637 conn = self.get_connection_from_uci( uci ) 638 639 # Get reservations handle for given instance 640 try: 641 rl= conn.get_all_instances( [inst.instance_id] ) 642 except boto.exception.EC2ResponseError, e: 643 err = "Retrieving instance(s) from cloud failed for UCI '"+ uci.name +"' during general status update: " + str( e ) 644 log.error( err ) 645 uci.error = err 646 uci.state = uci_states.ERROR 647 self.sa_session.add( uci ) 648 self.sa_session.flush() 649 return None 650 651 # Because references to reservations are deleted shortly after instances have been terminated, getting an empty list as a response to a query 652 # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until an alternative solution 653 # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened. 654 if len( rl ) == 0: 655 err = "Instance ID '"+inst.instance_id+"' was not found by the cloud provider. Instance might have crashed or otherwise been terminated."+ \ 656 "Manual check is recommended." 657 log.error( err ) 658 inst.error = err 659 uci.error = err 660 inst.state = instance_states.TERMINATED 661 uci.state = uci_states.ERROR 662 uci.launch_time = None 663 self.sa_session.add( inst ) 664 self.sa_session.add( uci ) 665 self.sa_session.flush() 666 # Update instance status in local DB with info from cloud provider 667 for r in rl: 668 for i, cInst in enumerate( r.instances ): 669 try: 670 s = cInst.update() 671 log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) ) 672 if s != inst.state: 673 inst.state = s 674 self.sa_session.add( inst ) 675 self.sa_session.flush() 676 # After instance has shut down, ensure UCI is marked as 'available' 677 if s == instance_states.TERMINATED and uci.state != uci_states.ERROR: 678 uci.state = uci_states.AVAILABLE 679 uci.launch_time = None 680 self.sa_session.add( uci ) 681 self.sa_session.flush() 682 # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed. 683 if s != uci.state and s != instance_states.TERMINATED: 684 uci.state = s 685 self.sa_session.add( uci ) 686 self.sa_session.flush() 687 if cInst.public_dns_name != inst.public_dns: 688 inst.public_dns = cInst.public_dns_name 689 self.sa_session.add( inst ) 690 self.sa_session.flush() 691 if cInst.private_dns_name != inst.private_dns: 692 inst.private_dns = cInst.private_dns_name 693 self.sa_session.add( inst ) 694 self.sa_session.flush() 695 except boto.exception.EC2ResponseError, e: 696 err = "Updating instance status from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e ) 697 log.error( err ) 698 uci.error = err 699 uci.state = uci_states.ERROR 700 self.sa_session.add( uci ) 701 self.sa_session.flush() 702 return None 703 704 def update_store( self, store ): 705 """ 706 Update information in local database for given storage volume as it is obtained from cloud provider. 707 Along with updating information about given storage volume, information about the UCI controlling 708 this storage volume is also updated. 709 """ 710 # Get credentials associated wit this store 711 uci_id = store.uci_id 712 uci = self.sa_session.query( model.UCI ).get( uci_id ) 713 self.sa_session.refresh( uci ) 714 conn = self.get_connection_from_uci( uci ) 715 716 # Get reservations handle for given store 717 try: 718 log.debug( "Updating storage volume command: vl = conn.get_all_volumes( [%s] )" % store.volume_id ) 719 vl = conn.get_all_volumes( [store.volume_id] ) 720 except boto.exception.EC2ResponseError, e: 721 err = "Retrieving volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e ) 722 log.error( err ) 723 uci.error = err 724 uci.state = uci_states.ERROR 725 self.sa_session.add( uci ) 726 self.sa_session.flush() 727 return None 728 729 # Update store status in local DB with info from cloud provider 730 if len(vl) > 0: 731 try: 732 log.debug( "Storage volume '%s' current status: '%s'" % (store.volume_id, vl[0].status ) ) 733 if store.status != vl[0].status: 734 # In case something failed during creation of UCI but actual storage volume was created and yet 735 # UCI state remained as 'new', try to remedy this by updating UCI state here 736 if ( store.status == None ) and ( store.volume_id != None ): 737 uci.state = vl[0].status 738 self.sa_session.add( uci ) 739 self.sa_session.flush() 740 # If UCI was marked in state 'CREATING', update its status to reflect new status 741 elif ( uci.state == uci_states.CREATING ): 742 uci.state = vl[0].status 743 self.sa_session.add( uci ) 744 self.sa_session.flush() 745 746 store.status = vl[0].status 747 self.sa_session.add( store ) 748 self.sa_session.flush() 749 if store.inst != None: 750 if store.inst.instance_id != vl[0].instance_id: 751 store.inst.instance_id = vl[0].instance_id 752 self.sa_session.add( store ) 753 self.sa_session.flush() 754 if store.attach_time != vl[0].attach_time: 755 store.attach_time = vl[0].attach_time 756 self.sa_session.add( store ) 757 self.sa_session.flush() 758 if store.device != vl[0].device: 759 store.device = vl[0].device 760 self.sa_session.add( store ) 761 self.sa_session.flush() 762 except boto.exception.EC2ResponseError, e: 763 err = "Updating status of volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e ) 764 log.error( err ) 765 uci.error = err 766 uci.state = uci_states.ERROR 767 self.sa_session.add( uci ) 768 self.sa_session.flush() 769 return None 770 else: 771 err = "No storage volumes returned by cloud provider on general update" 772 log.error( "%s for UCI '%s'" % ( err, uci.name ) ) 773 store.status = store_status.ERROR 774 store.error = err 775 uci.error = err 776 uci.state = uci_states.ERROR 777 self.sa_session.add( uci ) 778 self.sa_session.add( store ) 779 self.sa_session.flush() 780 781 def update_snapshot( self, snapshot ): 782 """ 783 Update information in local database for given snapshot as it is obtained from cloud provider. 784 Along with updating information about given snapshot, information about the UCI controlling 785 this snapshot is also updated. 786 """ 787 # Get credentials associated wit this store 788 uci_id = snapshot.uci_id 789 uci = self.sa_session.query( model.UCI ).get( uci_id ) 790 self.sa_session.refresh( uci ) 791 conn = self.get_connection_from_uci( uci ) 792 793 try: 794 log.debug( "Updating status of snapshot '%s'" % snapshot.snapshot_id ) 795 snap = conn.get_all_snapshots( [snapshot.snapshot_id] ) 796 if len( snap ) > 0: 797 log.debug( "Snapshot '%s' status: %s" % ( snapshot.snapshot_id, snap[0].status ) ) 798 snapshot.status = snap[0].status 799 self.sa_session.add( snapshot ) 800 self.sa_session.flush() 801 else: 802 err = "No snapshots returned by EC2 on general update" 803 log.error( "%s for UCI '%s'" % ( err, uci.name ) ) 804 snapshot.status = snapshot_status.ERROR 805 snapshot.error = err 806 uci.error = err 807 uci.state = uci_states.ERROR 808 self.sa_session.add( uci ) 809 self.sa_session.add( snapshot ) 810 self.sa_session.flush() 811 except boto.exception.EC2ResponseError, e: 812 err = "EC2 response error while updating snapshot status: " + str( e ) 813 log.error( err ) 814 snapshot.status = snapshot_status.ERROR 815 snapshot.error = err 816 uci.error = err 817 uci.state = uci_states.ERROR 818 self.sa_session.add( uci ) 819 self.sa_session.add( snapshot ) 820 self.sa_session.flush() 821 except Exception, ex: 822 err = "Error while updating snapshot status: " + str( ex ) 823 log.error( err ) 824 snapshot.status = snapshot_status.ERROR 825 snapshot.error = err 826 uci.error = err 827 uci.state = uci_states.ERROR 828 self.sa_session.add( uci ) 829 self.sa_session.add( snapshot ) 830 self.sa_session.flush() 831 832 def delete_snapshot( self, snapshot ): 833 """ 834 Initiate deletion of given snapshot from cloud provider. 835 """ 836 if snapshot.status == snapshot_status.DELETE: 837 # Get credentials associated wit this store 838 uci_id = snapshot.uci_id 839 uci = self.sa_session.query( model.UCI ).get( uci_id ) 840 self.sa_session.refresh( uci ) 841 conn = self.get_connection_from_uci( uci ) 842 843 try: 844 log.debug( "Deleting snapshot '%s'" % snapshot.snapshot_id ) 845 snap = conn.delete_snapshot( snapshot.snapshot_id ) 846 if snap == True: 847 snapshot.deleted = True 848 snapshot.status = snapshot_status.DELETED 849 self.sa_session.add( snapshot ) 850 self.sa_session.flush() 851 return snap 852 except boto.exception.EC2ResponseError, e: 853 err = "EC2 response error while deleting snapshot: " + str( e ) 854 log.error( err ) 855 snapshot.status = snapshot_status.ERROR 856 snapshot.error = err 857 uci.error = err 858 uci.state = uci_states.ERROR 859 self.sa_session.add( uci ) 860 self.sa_session.add( snapshot ) 861 self.sa_session.flush() 862 except Exception, ex: 863 err = "Error while deleting snapshot: " + str( ex ) 864 log.error( err ) 865 snapshot.status = snapshot_status.ERROR 866 snapshot.error = err 867 uci.error = err 868 uci.state = uci_states.ERROR 869 self.sa_session.add( uci ) 870 self.sa_session.add( snapshot ) 871 self.sa_session.flush() 872 else: 873 err = "Cannot delete snapshot '"+snapshot.snapshot_id+"' because its status is '"+snapshot.status+"'. Only snapshots with '" + \ 874 snapshot_status.COMPLETED+"' status can be deleted." 875 log.error( err ) 876 snapshot.error = err 877 self.sa_session.add( snapshot ) 878 self.sa_session.flush() 879 880 def process_zombie( self, inst ): 881 """ 882 Attempt at discovering if starting a cloud instance was successful but local database was not updated 883 accordingly or if something else failed and instance was never started. Currently, no automatic 884 repairs are being attempted; instead, appropriate error messages are set. 885 """ 886 uci_id = inst.uci_id 887 uci = self.sa_session.query( model.UCI ).get( uci_id ) 888 self.sa_session.refresh( uci ) 889 890 # Check if any instance-specific information was written to local DB; if 'yes', set instance and UCI's error message 891 # suggesting manual check. 892 if inst.launch_time != None or inst.reservation_id != None or inst.instance_id != None: 893 # Try to recover state - this is best-case effort, so if something does not work immediately, not 894 # recovery steps are attempted. Recovery is based on hope that instance_id is available in local DB; if not, 895 # report as error. 896 # Fields attempting to be recovered are: reservation_id, instance status, and launch_time 897 if inst.instance_id != None: 898 conn = self.get_connection_from_uci( uci ) 899 rl = conn.get_all_instances( [inst.instance_id] ) # reservation list 900 # Update local DB with relevant data from instance 901 if inst.reservation_id == None: 902 try: 903 inst.reservation_id = str(rl[0]).split(":")[1] 904 except: # something failed, so skip 905 pass 906 907 try: 908 state = rl[0].instances[0].update() 909 inst.state = state 910 uci.state = state 911 self.sa_session.add( inst ) 912 self.sa_session.add( uci ) 913 self.sa_session.flush() 914 except: # something failed, so skip 915 pass 916 917 if inst.launch_time == None: 918 try: 919 launch_time = self.format_time( rl[0].instances[0].launch_time ) 920 inst.launch_time = launch_time 921 self.sa_session.add( inst ) 922 self.sa_session.flush() 923 if inst.uci.launch_time == None: 924 uci.launch_time = launch_time 925 self.sa_session.add( uci ) 926 self.sa_session.flush() 927 except: # something failed, so skip 928 pass 929 else: 930 err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \ 931 "' seems to have failed. Because it appears that cloud instance might have gotten started, manual check is recommended." 932 inst.error = err 933 inst.state = instance_states.ERROR 934 inst.uci.error = err 935 inst.uci.state = uci_states.ERROR 936 log.error( err ) 937 self.sa_session.add( inst ) 938 self.sa_session.add( uci ) 939 self.sa_session.flush() 940 941 else: #Instance most likely never got processed, so set error message suggesting user to try starting instance again. 942 err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \ 943 "' seems to have failed. Because it appears that cloud instance never got started, it should be safe to reset state and try " \ 944 "starting the instance again." 945 inst.error = err 946 inst.state = instance_states.ERROR 947 uci.error = err 948 uci.state = uci_states.ERROR 949 log.error( err ) 950 self.sa_session.add( inst ) 951 self.sa_session.add( uci ) 952 self.sa_session.flush() 953# uw = UCIwrapper( inst.uci ) 954# log.debug( "Try automatically re-submitting UCI '%s'." % uw.get_name() ) 955 956 def get_connection_from_uci( self, uci ): 957 """ 958 Establish and return connection to cloud provider. Information needed to do so is obtained 959 directly from uci database object. 960 """ 961 log.debug( 'Establishing %s cloud connection' % self.type ) 962 a_key = uci.credentials.access_key 963 s_key = uci.credentials.secret_key 964 # Get connection 965 try: 966 region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) 967 conn = EC2Connection( aws_access_key_id=a_key, 968 aws_secret_access_key=s_key, 969 is_secure=uci.credentials.provider.is_secure, 970 region=region, 971 path=uci.credentials.provider.path ) 972 except boto.exception.EC2ResponseError, e: 973 err = "Establishing connection with cloud failed: " + str( e ) 974 log.error( err ) 975 uci.error = err 976 uci.state = uci_states.ERROR 977 self.sa_session.add( uci ) 978 self.sa_session.flush() 979 return None 980 981 return conn 982 983# def updateUCI( self, uci ): 984# """ 985# Runs a global status update on all storage volumes and all instances that are 986# associated with specified UCI 987# """ 988# conn = self.get_connection( uci ) 989# 990# # Update status of storage volumes 991# vl = model.CloudStore.filter( model.CloudInstance.table.c.uci_id == uci.id ).all() 992# vols = [] 993# for v in vl: 994# vols.append( v.volume_id ) 995# try: 996# volumes = conn.get_all_volumes( vols ) 997# for i, v in enumerate( volumes ): 998# uci.store[i].i_id = v.instance_id 999# uci.store[i].status = v.status 1000# uci.store[i].device = v.device 1001# uci.store[i].flush() 1002# except: 1003# log.debug( "Error updating status of volume(s) associated with UCI '%s'. Status was not updated." % uci.name ) 1004# pass 1005# 1006# # Update status of instances 1007# il = model.CloudInstance.filter_by( uci_id=uci.id ).filter( model.CloudInstance.table.c.state != 'terminated' ).all() 1008# instanceList = [] 1009# for i in il: 1010# instanceList.append( i.instance_id ) 1011# log.debug( 'instanceList: %s' % instanceList ) 1012# try: 1013# reservations = conn.get_all_instances( instanceList ) 1014# for i, r in enumerate( reservations ): 1015# uci.instance[i].state = r.instances[0].update() 1016# log.debug('updating instance %s; status: %s' % ( uci.instance[i].instance_id, uci.instance[i].state ) ) 1017# uci.state = uci.instance[i].state 1018# uci.instance[i].public_dns = r.instances[0].dns_name 1019# uci.instance[i].private_dns = r.instances[0].private_dns_name 1020# uci.instance[i].flush() 1021# uci.flush() 1022# except: 1023# log.debug( "Error updating status of instances associated with UCI '%s'. Instance status was not updated." % uci.name ) 1024# pass 1025 1026 # --------- Helper methods ------------ 1027 1028 def format_time( self, time ): 1029 dict = {'T':' ', 'Z':''} 1030 for i, j in dict.iteritems(): 1031 time = time.replace(i, j) 1032 return time 1033