# Source: /lib/galaxy/cloud/providers/eucalyptus.py
# Python | 1039 lines | 935 code | 27 blank | 77 comment | 65 complexity | 7dc733a90ef920d6a637fd832094d258 MD5 | raw file
import subprocess, threading, os, errno, time, datetime
from Queue import Queue, Empty
from datetime import datetime

from galaxy import model # Database interaction class
from galaxy.model import mapping
from galaxy.datatypes.data import nice_size
from galaxy.util.bunch import Bunch
from galaxy.cloud import UCIwrapper
from Queue import Queue
from sqlalchemy import or_, and_

import galaxy.eggs
galaxy.eggs.require("boto")
from boto.ec2.connection import EC2Connection
from boto.ec2.regioninfo import RegionInfo
import boto.exception
import boto

import logging
log = logging.getLogger( __name__ )

# States of a User Configured Instance (UCI). States carrying the 'UCI' suffix are
# requests that have not yet been accepted by a provider; put() strips the suffix.
uci_states = Bunch(
    NEW_UCI = "newUCI",
    NEW = "new",
    CREATING = "creating",
    DELETING_UCI = "deletingUCI",
    DELETING = "deleting",
    SUBMITTED_UCI = "submittedUCI",
    SUBMITTED = "submitted",
    SHUTTING_DOWN_UCI = "shutting-downUCI",
    SHUTTING_DOWN = "shutting-down",
    ADD_STORAGE_UCI = "add-storageUCI",
    ADD_STORAGE = "add-storage",
    AVAILABLE = "available",
    RUNNING = "running",
    PENDING = "pending",
    ERROR = "error",
    DELETED = "deleted",
    SNAPSHOT_UCI = "snapshotUCI",
    SNAPSHOT = "snapshot"
)

# States of an individual cloud instance
instance_states = Bunch(
    TERMINATED = "terminated",
    SUBMITTED = "submitted",
    RUNNING = "running",
    ADDING = "adding-storage",
    PENDING = "pending",
    SHUTTING_DOWN = "shutting-down",
    ERROR = "error"
)

# Status values of a storage volume
store_status = Bunch(
    WAITING = "waiting",
    IN_USE = "in-use",
    ADDING = "adding",
    CREATING = "creating",
    DELETED = 'deleted',
    ERROR = "error"
)

# Status values of a snapshot
snapshot_status = Bunch(
    SUBMITTED = 'submitted',
    PENDING = 'pending',
    COMPLETED = 'completed',
    DELETE = 'delete',
    DELETED= 'deleted',
    ERROR = "error"
)

class EucalyptusCloudProvider( object ):
    """
    Eucalyptus-based cloud provider implementation for managing instances.

    Requests arrive as UCI wrapper objects through put(), are placed on an
    internal queue and are handled by worker threads running run_next().
    update() synchronizes the local database with the state reported by the cloud.
    """
    STOP_SIGNAL = object()

    def __init__( self, app ):
        """
        Set up the request queue and start the worker threads.

        app -- application object; only app.model.context is read here and kept
               as the database session used by all queries in this class.
        """
        self.type = "eucalyptus" # cloud provider type (e.g., ec2, eucalyptus, opennebula)
        self.zone = "epc" # default availability zone, used when a UCI does not name one
        self.queue = Queue()
        self.sa_session = app.model.context

        self.threads = []
        nworkers = 5
        log.info( "Starting eucalyptus cloud controller workers..." )
        for _ in range( nworkers ):
            worker = threading.Thread( target=self.run_next )
            worker.start()
            self.threads.append( worker )
        log.debug( "%d eucalyptus cloud workers ready", nworkers )

    def shutdown( self ):
        """Ask the worker threads to stop by queueing one stop signal per thread."""
        log.info( "sending stop signal to worker threads in eucalyptus cloud manager" )
        for _ in self.threads:
            self.queue.put( self.STOP_SIGNAL )
        log.info( "eucalyptus cloud manager stopped" )

    def put( self, uci_wrapper ):
        """
        Add uci_wrapper object to the end of the request queue to be handled by
        this cloud provider.
        """
        state = uci_wrapper.get_uci_state()
        # Remove 'UCI' from end of state description (i.e., mark as accepted and ready for processing).
        # Only a trailing 'UCI' is stripped so that a state containing 'U' elsewhere is left intact.
        if state.endswith( 'UCI' ):
            state = state[:-3]
        uci_wrapper.change_state( state )
        self.queue.put( uci_wrapper )

    def run_next( self ):
        """
        Worker loop: process next request, waiting until one is available if necessary.

        The request is dispatched on the UCI state of the queued wrapper. The loop
        ends when STOP_SIGNAL is taken off the queue. Exceptions raised while
        handling a request are logged and do not end the loop.
        """
        handlers = {
            uci_states.NEW: self.create_uci,
            uci_states.DELETING: self.delete_uci,
            uci_states.SUBMITTED: self.start_uci,
            uci_states.SHUTTING_DOWN: self.stop_uci,
            uci_states.SNAPSHOT: self.snapshot_uci,
            uci_states.ADD_STORAGE: self.add_storage_to_uci,
        }
        while True:
            uci_wrapper = self.queue.get()
            # STOP_SIGNAL is a plain object(), not a UCI wrapper, so it has to be
            # recognized before any wrapper method is called on the queued item.
            if uci_wrapper is self.STOP_SIGNAL:
                return
            try:
                handler = handlers.get( uci_wrapper.get_uci_state() )
                if handler is not None:
                    handler( uci_wrapper )
            except Exception:
                log.exception( "Uncaught exception executing cloud request." )

    def get_connection( self, uci_wrapper ):
        """
        Establishes cloud connection using user's credentials associated with given UCI.

        Returns the connection object, or None if the region or the connection
        could not be set up; in that case the error is logged and recorded on the UCI.
        """
        log.debug( 'Establishing %s cloud connection.' % self.type )
        provider = uci_wrapper.get_provider()
        try:
            region = RegionInfo( None, provider.region_name, provider.region_endpoint )
        except Exception as ex:
            err = "Selecting region with cloud provider failed: " + str( ex )
            log.error( err )
            uci_wrapper.set_error( err, True )
            return None
        try:
            conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(),
                                  aws_secret_access_key=uci_wrapper.get_secret_key(),
                                  is_secure=provider.is_secure,
                                  port=provider.port,
                                  region=region,
                                  path=provider.path )
        except boto.exception.EC2ResponseError as e:
            err = "Establishing connection with cloud failed: " + str( e )
            log.error( err )
            uci_wrapper.set_error( err, True )
            return None

        return conn

    def check_key_pair( self, uci_wrapper, conn ):
        """
        Check if a key pair associated with this UCI exists on cloud provider.
        If yes, return key pair name; otherwise, generate a key pair with the cloud
        provider and, again, return key pair name.
        Key pair name for given UCI is generated from UCI's name and suffix '_kp'.

        Returns None when no key pair could be obtained.
        """
        kp = None
        kp_name = uci_wrapper.get_name().replace( ' ', '_' ) + "_kp"
        log.debug( "Checking user's key pair: '%s'" % kp_name )
        try:
            kp = conn.get_key_pair( kp_name )
            uci_kp_name = uci_wrapper.get_key_pair_name()
            uci_material = uci_wrapper.get_key_pair_material()
            if kp is not None:
                if kp.name != uci_kp_name or uci_material is None:
                    # key pair exists on the cloud but not in local database, so re-generate it (i.e., delete and then create)
                    try:
                        conn.delete_key_pair( kp_name )
                        kp = self._generate_key_pair( uci_wrapper, conn, kp_name )
                    except boto.exception.EC2ResponseError as e:
                        err = "EC2 response error while deleting key pair: " + str( e )
                        log.error( err )
                        uci_wrapper.set_error( err, True )
            else:
                try:
                    kp = self._generate_key_pair( uci_wrapper, conn, kp_name )
                except boto.exception.EC2ResponseError as e:
                    err = "EC2 response error while creating key pair: " + str( e )
                    log.error( err )
                    uci_wrapper.set_error( err, True )
                except Exception as ex:
                    err = "Exception while creating key pair: " + str( ex )
                    log.error( err )
                    uci_wrapper.set_error( err, True )
        except boto.exception.EC2ResponseError as e: # No keypair under this name exists so create it
            if e.code == 'InvalidKeyPair.NotFound':
                log.info( "No keypair found, creating keypair '%s'" % kp_name )
                kp = self._generate_key_pair( uci_wrapper, conn, kp_name )
            else:
                err = "EC2 response error while retrieving key pair: " + str( e )
                log.error( err )
                uci_wrapper.set_error( err, True )

        if kp is not None:
            return kp.name
        return None

    def _generate_key_pair( self, uci_wrapper, conn, kp_name ):
        """
        Create key pair kp_name on the cloud and store its name and material with the UCI.

        Returns the new key pair, or None if create_key_pair() failed; the failure
        is then recorded on the UCI instead of dereferencing a missing key pair.
        """
        kp = self.create_key_pair( conn, kp_name )
        if kp is None:
            err = "Creating key pair '%s' with cloud provider failed." % kp_name
            log.error( err )
            uci_wrapper.set_error( err, True )
            return None
        uci_wrapper.set_key_pair( kp.name, kp.material )
        return kp

    def create_key_pair( self, conn, kp_name ):
        """
        Initiate creation of key pair under kp_name by current cloud provider.

        Returns the created key pair, or None if the cloud provider rejected the request.
        """
        try:
            return conn.create_key_pair( kp_name )
        except boto.exception.EC2ResponseError as e:
            log.error( "EC2 response error while creating key pair '%s': %s" % ( kp_name, str( e ) ) )
            return None

    def get_mi_id( self, uci_wrapper, i_index ):
        """
        Get appropriate machine image (mi) ID based on instance type.

        Instance types 'm1.small' and 'c1.medium' map to the 'i386' architecture,
        all others to 'x86_64'. Returns None (and records an error on the UCI) if
        no matching, non-deleted image of this provider type is registered locally.
        """
        i_type = uci_wrapper.get_instance_type( i_index )
        if i_type in ( 'm1.small', 'c1.medium' ):
            arch = 'i386'
        else:
            arch = 'x86_64'

        mi = self.sa_session.query( model.CloudImage ).filter_by( deleted=False, provider_type=self.type, architecture=arch ).first()
        if mi:
            return mi.image_id

        err = "Machine image could not be retrieved"
        log.error( "%s for UCI '%s'." % (err, uci_wrapper.get_name() ) )
        uci_wrapper.set_error( err+". Contact site administrator to ensure needed machine image is registered.", True )
        return None

    def create_uci( self, uci_wrapper ):
        """
        Create User Configured Instance (UCI) - i.e., create storage volume on cloud provider
        and register relevant information in local Galaxy database.
        """
        conn = self.get_connection( uci_wrapper )
        if conn is None:
            # get_connection() has already logged the failure and recorded it on the UCI
            return

        log.info( "Creating volume in zone '%s'..." % uci_wrapper.get_uci_availability_zone() )
        if uci_wrapper.get_uci_availability_zone()=='':
            log.info( "Availability zone for UCI (i.e., storage volume) was not selected, using default zone: %s" % self.zone )
            uci_wrapper.set_store_availability_zone( self.zone )

        # Because at UCI creation time only 1 storage volume can be created, reference it directly
        store = uci_wrapper.get_all_stores_in_status( store_status.ADDING )[0]

        log.info( "Creating storage volume in zone '%s' of size '%s'..." % ( uci_wrapper.get_uci_availability_zone(), store.size ) )
        vol = conn.create_volume( store.size, uci_wrapper.get_uci_availability_zone(), snapshot=None )
        uci_wrapper.set_store_volume_id( store.id, vol.id )

        # Retrieve created volume again to get updated status
        try:
            vl = conn.get_all_volumes( [vol.id] )
        except boto.exception.EC2ResponseError as e:
            err = "EC2 response error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( e )
            log.error( err )
            uci_wrapper.set_store_status( vol.id, store_status.ERROR )
            uci_wrapper.set_error( err, True )
            return
        except Exception as ex:
            err = "Error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( ex )
            log.error( err )
            uci_wrapper.set_error( err, True )
            return

        if len( vl ) > 0:
            # EPC does not allow creation of storage volumes (it deletes one as soon as it is created, so manually set uci_state here)
            if vl[0].status == store_status.DELETED:
                uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )
            else:
                uci_wrapper.change_state( uci_state=vl[0].status )
            uci_wrapper.set_store_status( vol.id, vl[0].status )
        else:
            err = "Volume '" + vol.id +"' not found by EC2 after being created."
            log.error( err )
            uci_wrapper.set_store_status( vol.id, store_status.ERROR )
            uci_wrapper.set_error( err, True )

    def delete_uci( self, uci_wrapper ):
        """
        Delete UCI - i.e., delete all storage volumes associated with this UCI.
        NOTE that this implies deletion of any and all data associated
        with this UCI from the cloud. All data will be deleted.
        Information in local Galaxy database is marked as deleted but not actually removed
        from the database.
        """
        conn = self.get_connection( uci_wrapper )
        if conn is None:
            # get_connection() has already logged the failure and recorded it on the UCI
            return

        # Get all volumes assoc. w/ UCI, delete them from cloud as well as in local DB
        stores = uci_wrapper.get_all_stores()
        deleted_ids = []
        failed_ids = []
        for v in stores:
            log.debug( "Deleting volume with id='%s'" % v.volume_id )
            try:
                if conn.delete_volume( v.volume_id ):
                    deleted_ids.append( v.volume_id )
                    v.deleted = True
                    self.sa_session.add( v )
                    self.sa_session.flush()
                else:
                    failed_ids.append( v.volume_id )
            except boto.exception.EC2ResponseError as e:
                # Count the volume as failed so that it shows up in the summary below
                failed_ids.append( v.volume_id )
                err = "EC2 response error while deleting storage volume '" + v.volume_id + "': " + str( e )
                log.error( err )
                uci_wrapper.set_store_error( err, store_id = v.volume_id )
                uci_wrapper.set_error( err, True )

        # Delete UCI only if all of associated volumes were deleted
        if len( deleted_ids ) == len( stores ):
            uci_wrapper.set_deleted()
        else:
            err = "Deleting following volume(s) failed: "+ str( failed_ids )+". However, these volumes were successfully deleted: " \
                  + str( deleted_ids ) +". MANUAL intervention and processing needed."
            log.error( err )
            uci_wrapper.set_error( err, True )

    def snapshot_uci( self, uci_wrapper ):
        """
        Initiate creation of a snapshot by cloud provider for all storage volumes
        associated with this UCI.

        Nothing is done if the UCI is in 'error' state. Processing stops at the
        first snapshot that fails; otherwise the UCI is returned to 'available' state.
        """
        if uci_wrapper.get_uci_state() == uci_states.ERROR:
            return
        conn = self.get_connection( uci_wrapper )
        if conn is None:
            # get_connection() has already logged the failure and recorded it on the UCI
            return

        snapshots = uci_wrapper.get_snapshots( status = snapshot_status.SUBMITTED )
        for snapshot in snapshots:
            log.debug( "Snapshot DB id: '%s', volume id: '%s'" % ( snapshot.id, snapshot.store.volume_id ) )
            try:
                snap = conn.create_snapshot( volume_id=snapshot.store.volume_id )
                snap_id = str( snap ).split(':')[1]
                uci_wrapper.set_snapshot_id( snapshot.id, snap_id )
                # Get updated status; IDs are passed as a list, as in update_snapshot()
                sh = conn.get_all_snapshots( [snap_id] )
                uci_wrapper.set_snapshot_status( status=sh[0].status, snap_id=snap_id )
            except boto.exception.EC2ResponseError as e:
                err = "Cloud provider response error while creating snapshot: " + str( e )
                log.error( err )
                uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
                uci_wrapper.set_error( err, True )
                return
            except Exception as ex:
                err = "Error while creating snapshot: " + str( ex )
                log.error( err )
                uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
                uci_wrapper.set_error( err, True )
                return

        uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )

    def add_storage_to_uci( self, uci_wrapper ):
        """ Adds more storage to specified UCI; not supported, so only an error is recorded on the UCI. """
        uci_wrapper.set_error( "Adding storage to eucalyptus-based clouds is not yet supported.", True )

    def dummy_start_uci( self, uci_wrapper ):
        """ Debugging stand-in for start_uci(): logs, sleeps for 10 seconds and logs again. """
        uci = uci_wrapper.get_uci()
        log.debug( "Would be starting instance '%s'" % uci.name )
        log.debug( "Sleeping a bit... (%s)" % uci.name )
        time.sleep(10)
        log.debug( "Woke up! (%s)" % uci.name )

    def start_uci( self, uci_wrapper ):
        """
        Start instance(s) of given UCI on the cloud.

        For every instance of the UCI that is in 'submitted' state, a machine image
        is selected, the instance is started and the resulting reservation and
        instance data are recorded through the UCI wrapper.
        """
        if uci_wrapper.get_uci_state() == uci_states.ERROR:
            log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
            return

        conn = self.get_connection( uci_wrapper )
        if conn is None:
            # get_connection() has already logged the failure and recorded it on the UCI
            return
        self.check_key_pair( uci_wrapper, conn )
        if uci_wrapper.get_key_pair_name() is None:
            err = "Key pair not found"
            log.error( "%s for UCI '%s'." % ( err, uci_wrapper.get_name() ) )
            uci_wrapper.set_error( err + ". Try resetting the state and starting the instance again.", True )
            return

        # Get indexes of instances associated with this UCI that are in 'submitted' state
        i_indexes = uci_wrapper.get_instances_indexes( state=instance_states.SUBMITTED )
        log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( i_indexes, uci_wrapper.get_name(), ) )
        if len( i_indexes ) == 0:
            err = "No instances in state '"+ instance_states.SUBMITTED +"' found for UCI '" + uci_wrapper.get_name() + \
                  "'. Nothing to start."
            log.error( err )
            uci_wrapper.set_error( err, True )
            return

        for i_index in i_indexes:
            # Get machine image for current instance
            mi_id = self.get_mi_id( uci_wrapper, i_index )
            log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name(): %s" % ( mi_id, uci_wrapper.get_key_pair_name() ) )
            uci_wrapper.set_mi( i_index, mi_id )

            # get_mi_id() records an error on the UCI when no image is found
            if uci_wrapper.get_uci_state() == uci_states.ERROR:
                log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
                continue

            # Start an instance
            log.debug( "Starting UCI instance '%s'" % uci_wrapper.get_name() )
            log.debug( "Using following command: conn.run_instances( image_id='%s', key_name='%s', instance_type='%s' )"
                       % ( mi_id, uci_wrapper.get_key_pair_name(), uci_wrapper.get_instance_type( i_index ) ) )
            reservation = None
            try:
                reservation = conn.run_instances( image_id=mi_id,
                                                  key_name=uci_wrapper.get_key_pair_name(),
                                                  instance_type=uci_wrapper.get_instance_type( i_index ) )
            except boto.exception.EC2ResponseError as e:
                err = "EC2 response error when starting UCI '"+ uci_wrapper.get_name() +"': " + str( e )
                log.error( err )
                uci_wrapper.set_error( err, True )
            except Exception as ex:
                err = "Error when starting UCI '" + uci_wrapper.get_name() + "': " + str( ex )
                log.error( err )
                uci_wrapper.set_error( err, True )
            if not reservation:
                continue

            # Record newly available instance data into local Galaxy database
            l_time = datetime.utcnow()
            uci_wrapper.set_instance_launch_time( l_time, i_index=i_index )
            if not uci_wrapper.uci_launch_time_set():
                uci_wrapper.set_uci_launch_time( l_time )
            try:
                uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] )
                # TODO: if more than a single instance will be started through single reservation, change this reference from element [0]
                i_id = str( reservation.instances[0]).split(":")[1]
                uci_wrapper.set_instance_id( i_index, i_id )
                s = reservation.instances[0].state
                uci_wrapper.change_state( s, i_id, s )
                vol_id = uci_wrapper.get_store_volume_id( store_id=0 ) # TODO: Once more that one vol/UCI is allowed, update this!
                uci_wrapper.set_store_status( vol_id, store_status.WAITING )
                log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_uci_state() ) )
            except boto.exception.EC2ResponseError as e:
                err = "EC2 response error when retrieving instance information for UCI '" + uci_wrapper.get_name() + "': " + str( e )
                log.error( err )
                uci_wrapper.set_error( err, True )

    def stop_uci( self, uci_wrapper):
        """
        Stop all cloud instances associated with given UCI.

        TODO: persistent storage volume(s) are not detached before stopping; per the
        original author's note, detaching does not work with Eucalyptus Public Cloud.
        """
        conn = self.get_connection( uci_wrapper )
        if conn is None:
            # get_connection() has already logged the failure and recorded it on the UCI
            return

        # Get all instances associated with given UCI, dropping any references to empty instance id's
        il = [ i for i in uci_wrapper.get_instances_ids() if i is not None ]
        log.debug( 'List of instances being terminated: %s' % il )
        # An empty ID list must not reach get_all_instances(): without IDs the request
        # is not restricted and would return (and then stop) every instance of the account.
        if il:
            rl = conn.get_all_instances( il ) # Reservation list associated with given instances
        else:
            rl = []

        # Initiate shutdown of all instances under given UCI
        for r in rl:
            for inst in r.instances:
                log.debug( "Sending stop signal to instance '%s' associated with reservation '%s' (UCI: %s)." % ( inst, r, uci_wrapper.get_name() ) )
                try:
                    inst.stop()
                    uci_wrapper.set_stop_time( datetime.utcnow(), i_id=inst.id )
                    uci_wrapper.change_state( instance_id=inst.id, i_state=inst.update() )
                except boto.exception.EC2ResponseError as e:
                    err = "EC2 response error when stopping instance '" + inst.id + "': " + str( e )
                    log.error( err )
                    uci_wrapper.set_error( err, True )

        uci_wrapper.reset_uci_launch_time()
        log.debug( "Termination was initiated for all instances of UCI '%s'." % uci_wrapper.get_name() )

    def update( self ):
        """
        Run status update on all instances that are in 'running', 'pending', or 'shutting-down' state.
        Run status update on all storage volumes whose status is 'in-use', 'creating', 'waiting', or 'None'.
        Run status update on all snapshots whose status is 'pending' or 'delete'
        Run status update on any zombie UCIs, i.e., UCI's that is in 'submitted' state for an
        extended period of time.

        Reason behind this method is to sync state of local DB and real-world resources
        """
        log.debug( "Running general status update for %s UCIs..." % self.type )
        # Update instances
        instances = self.sa_session.query( model.CloudInstance ) \
            .filter( or_( model.CloudInstance.table.c.state==instance_states.RUNNING,
                          model.CloudInstance.table.c.state==instance_states.PENDING,
                          model.CloudInstance.table.c.state==instance_states.SHUTTING_DOWN ) ) \
            .all()
        for inst in instances:
            if self.type == inst.uci.credentials.provider.type:
                log.debug( "[%s] Running general status update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.instance_id ) )
                self.update_instance( inst )

        # Update storage volume(s)
        # NOTE: '== None' on a table column builds a query expression; it must not be replaced by 'is None'
        stores = self.sa_session.query( model.CloudStore ) \
            .filter( or_( model.CloudStore.table.c.status==store_status.IN_USE,
                          model.CloudStore.table.c.status==store_status.CREATING,
                          model.CloudStore.table.c.status==store_status.WAITING,
                          model.CloudStore.table.c.status==None ) ) \
            .all()
        for store in stores:
            if self.type == store.uci.credentials.provider.type:
                log.debug( "[%s] Running general status update on store with local database ID: '%s'" % ( store.uci.credentials.provider.type, store.id ) )
                self.update_store( store )

        # Update pending snapshots or delete ones marked for deletion
        snapshots = self.sa_session.query( model.CloudSnapshot ) \
            .filter( or_( model.CloudSnapshot.table.c.status == snapshot_status.PENDING, model.CloudSnapshot.table.c.status == snapshot_status.DELETE ) ) \
            .all()
        for snapshot in snapshots:
            if self.type != snapshot.uci.credentials.provider.type:
                continue
            if snapshot.status == snapshot_status.PENDING:
                log.debug( "[%s] Running general status update on snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
                self.update_snapshot( snapshot )
            elif snapshot.status == snapshot_status.DELETE:
                log.debug( "[%s] Initiating deletion of snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
                self.delete_snapshot( snapshot )

        # Attempt at updating any zombie UCIs (i.e., instances that have been in SUBMITTED state for longer than expected - see below for exact time)
        zombies = self.sa_session.query( model.UCI ).filter_by( state=uci_states.SUBMITTED ).all()
        for zombie in zombies:
            log.debug( "zombie UCI: %s" % zombie.name )
            # NOTE(review): this query is not restricted to instances of the current zombie UCI - confirm whether that is intended
            z_instances = self.sa_session.query( model.CloudInstance ) \
                .filter( or_( model.CloudInstance.table.c.state != instance_states.TERMINATED,
                              model.CloudInstance.table.c.state == None ) ) \
                .all()
            for z_inst in z_instances:
                if self.type == z_inst.uci.credentials.provider.type:
                    td = datetime.utcnow() - z_inst.update_time
                    # timedelta.seconds alone wraps around every 24 hours, so whole days are added in
                    elapsed = td.days * 86400 + td.seconds
                    if elapsed > 180: # if instance has been in SUBMITTED state for more than 3 minutes
                        log.debug( "[%s](td=%s) Running zombie repair update on instance with DB id '%s'" % ( z_inst.uci.credentials.provider.type, elapsed, z_inst.id ) )
                        self.process_zombie( z_inst )

    def update_instance( self, inst ):
        """
        Update information in local database for given instance as it is obtained from cloud provider.
        Along with updating information about given instance, information about the UCI controlling
        this instance is also updated.
        """
        # Get credentials associated with this instance
        uci_id = inst.uci_id
        uci = self.sa_session.query( model.UCI ).get( uci_id )
        self.sa_session.refresh( uci )
        conn = self.get_connection_from_uci( uci )

        # Get reservations handle for given instance
        try:
            rl = conn.get_all_instances( [inst.instance_id] )
        except boto.exception.EC2ResponseError as e:
            err = "Retrieving instance(s) from cloud failed for UCI '"+ uci.name +"' during general status update: " + str( e )
            log.error( err )
            uci.error = err
            uci.state = uci_states.ERROR
            self.sa_session.add( uci )
            self.sa_session.flush()
            return None

        # Because references to reservations are deleted shortly after instances have been terminated, getting an empty list as a response to a query
        # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until an alternative solution
        # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened.
        if len( rl ) == 0:
            err = "Instance ID '"+inst.instance_id+"' was not found by the cloud provider. Instance might have crashed or otherwise been terminated. "+ \
                  "Manual check is recommended."
            log.error( err )
            inst.error = err
            uci.error = err
            inst.state = instance_states.TERMINATED
            uci.state = uci_states.ERROR
            uci.launch_time = None
            self.sa_session.add( inst )
            self.sa_session.add( uci )
            self.sa_session.flush()
        # Update instance status in local DB with info from cloud provider
        for r in rl:
            for cInst in r.instances:
                try:
                    s = cInst.update()
                    log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) )
                    if s != inst.state:
                        inst.state = s
                        self.sa_session.add( inst )
                        self.sa_session.flush()
                        # After instance has shut down, ensure UCI is marked as 'available'
                        if s == instance_states.TERMINATED and uci.state != uci_states.ERROR:
                            uci.state = uci_states.AVAILABLE
                            uci.launch_time = None
                            self.sa_session.add( uci )
                            self.sa_session.flush()
                    # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed.
                    if s != uci.state and s != instance_states.TERMINATED:
                        uci.state = s
                        self.sa_session.add( uci )
                        self.sa_session.flush()
                    if cInst.public_dns_name != inst.public_dns:
                        inst.public_dns = cInst.public_dns_name
                        self.sa_session.add( inst )
                        self.sa_session.flush()
                    if cInst.private_dns_name != inst.private_dns:
                        inst.private_dns = cInst.private_dns_name
                        self.sa_session.add( inst )
                        self.sa_session.flush()
                except boto.exception.EC2ResponseError as e:
                    err = "Updating instance status from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
                    log.error( err )
                    uci.error = err
                    uci.state = uci_states.ERROR
                    self.sa_session.add( uci )
                    self.sa_session.flush()
                    return None

    def update_store( self, store ):
        """
        Update information in local database for given storage volume as it is obtained from cloud provider.
        Along with updating information about given storage volume, information about the UCI controlling
        this storage volume is also updated.
        """
        # Get credentials associated with this store
        uci_id = store.uci_id
        uci = self.sa_session.query( model.UCI ).get( uci_id )
        self.sa_session.refresh( uci )
        conn = self.get_connection_from_uci( uci )

        if store.volume_id is not None:
            # Get reservations handle for given store
            try:
                log.debug( "Updating storage volume command: vl = conn.get_all_volumes( [%s] )" % store.volume_id )
                vl = conn.get_all_volumes( [store.volume_id] )
            except boto.exception.EC2ResponseError as e:
                err = "Retrieving volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
                log.error( err )
                uci.error = err
                uci.state = uci_states.ERROR
                self.sa_session.add( uci )
                self.sa_session.flush()
                return None

            # Update store status in local DB with info from cloud provider
            if len(vl) > 0:
                try:
                    log.debug( "Storage volume '%s' current status: '%s'" % (store.volume_id, vl[0].status ) )
                    if store.status != vl[0].status:
                        # In case something failed during creation of UCI but actual storage volume was created and yet
                        # UCI state remained as 'new', try to remedy this by updating UCI state here
                        if ( store.status is None ) and ( store.volume_id is not None ):
                            uci.state = vl[0].status
                            self.sa_session.add( uci )
                            self.sa_session.flush()
                        # If UCI was marked in state 'CREATING', update its status to reflect new status
                        elif ( uci.state == uci_states.CREATING ):
                            # Because Eucalyptus Public Cloud (EPC) deletes volumes immediately after they are created, artificially
                            # set status of given UCI to 'available' based on storage volume's availability zone (i.e., it's residing
                            # in EPC as opposed to some other Eucalyptus based cloud that allows creation of storage volumes.
                            if store.availability_zone == 'epc':
                                uci.state = uci_states.AVAILABLE
                            else:
                                uci.state = vl[0].status

                            self.sa_session.add( uci )
                            self.sa_session.flush()

                        store.status = vl[0].status
                        self.sa_session.add( store )
                        self.sa_session.flush()
                    if store.inst is not None:
                        if store.inst.instance_id != vl[0].instance_id:
                            store.inst.instance_id = vl[0].instance_id
                            self.sa_session.add( store )
                            self.sa_session.flush()
                    if store.attach_time != vl[0].attach_time:
                        store.attach_time = vl[0].attach_time
                        self.sa_session.add( store )
                        self.sa_session.flush()
                    if store.device != vl[0].device:
                        store.device = vl[0].device
                        self.sa_session.add( store )
                        self.sa_session.flush()
                except boto.exception.EC2ResponseError as e:
                    err = "Updating status of volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
                    log.error( err )
                    uci.error = err
                    uci.state = uci_states.ERROR
                    self.sa_session.add( uci )
                    self.sa_session.flush()
                    return None
            else:
                err = "No storage volumes returned by cloud provider on general update"
                log.error( "%s for UCI '%s'" % ( err, uci.name ) )
                store.status = store_status.ERROR
                store.error = err
                uci.error = err
                uci.state = uci_states.ERROR
                self.sa_session.add( uci )
                self.sa_session.add( store )
                self.sa_session.flush()
        else:
            err = "Missing storage volume ID in local database on general update. Manual check is needed to check " \
                  "if storage volume was actually created by cloud provider."
            log.error( "%s (for UCI '%s')" % ( err, uci.name ) )
            store.status = store_status.ERROR
            store.error = err
            uci.error = err
            uci.state = uci_states.ERROR
            self.sa_session.add( uci )
            self.sa_session.add( store )
            self.sa_session.flush()

    def _mark_snapshot_error( self, snapshot, uci, err ):
        """ Record err on both the snapshot and its UCI, set both to 'error' and persist the change. """
        snapshot.status = snapshot_status.ERROR
        snapshot.error = err
        uci.error = err
        uci.state = uci_states.ERROR
        self.sa_session.add( uci )
        self.sa_session.add( snapshot )
        self.sa_session.flush()

    def update_snapshot( self, snapshot ):
        """
        Update information in local database for given snapshot as it is obtained from cloud provider.
        Along with updating information about given snapshot, information about the UCI controlling
        this snapshot is also updated.
        """
        # Get credentials associated with this snapshot
        uci_id = snapshot.uci_id
        uci = self.sa_session.query( model.UCI ).get( uci_id )
        self.sa_session.refresh( uci )
        conn = self.get_connection_from_uci( uci )

        try:
            log.debug( "Updating status of snapshot '%s'" % snapshot.snapshot_id )
            snap = conn.get_all_snapshots( [snapshot.snapshot_id] )
            if len( snap ) > 0:
                log.debug( "Snapshot '%s' status: %s" % ( snapshot.snapshot_id, snap[0].status ) )
                snapshot.status = snap[0].status
                self.sa_session.add( snapshot )
                self.sa_session.flush()
            else:
                err = "No snapshots returned by EC2 on general update"
                log.error( "%s for UCI '%s'" % ( err, uci.name ) )
                self._mark_snapshot_error( snapshot, uci, err )
        except boto.exception.EC2ResponseError as e:
            err = "EC2 response error while updating snapshot status: " + str( e )
            log.error( err )
            self._mark_snapshot_error( snapshot, uci, err )
        except Exception as ex:
            err = "Error while updating snapshot status: " + str( ex )
            log.error( err )
            self._mark_snapshot_error( snapshot, uci, err )

    def delete_snapshot( self, snapshot ):
        """
        Initiate deletion of given snapshot from cloud provider.
        """
        if snapshot.status == snapshot_status.DELETE:
            # Get credentials associated wit this store
            uci_id = snapshot.uci_id
            uci = self.sa_session.query( model.UCI ).get( uci_id )
            self.sa_session.refresh( uci )
            conn = self.get_connection_from_uci( uci )

            try:
                log.debug( "Deleting snapshot '%s'" % snapshot.snapshot_id )
                snap = conn.delete_snapshot( snapshot.snapshot_id )
                if snap == True:
                    snapshot.deleted = True
                    snapshot.status = snapshot_status.DELETED
                    self.sa_session.add( snapshot )
                    self.sa_session.flush()
                return snap
            except boto.exception.EC2ResponseError as e:
                err = "EC2 response error while deleting snapshot: " + str( e )
                log.error( err )
                snapshot.status = snapshot_status.ERROR
                snapshot.error = err
                uci.error = err
                uci.state = uci_states.ERROR
                self.sa_session.add( uci )
                self.sa_session.add( snapshot )
                self.sa_session.flush()
            except Exception as ex:
                err = "Error while deleting snapshot: " + str( ex )
                log.error( err )
                snapshot.status = snapshot_status.ERROR
                snapshot.error = err
                uci.error = err
                uci.state = uci_states.ERROR
                self.sa_session.add( uci )
                self.sa_session.add( snapshot )
                self.sa_session.flush()
        else:
            err = "Cannot delete snapshot '"+snapshot.snapshot_id+"' because its status is '"+snapshot.status+"'. Only snapshots with '" + \
                  snapshot_status.COMPLETED+"' status can be deleted."
873 log.error( err ) 874 snapshot.error = err 875 self.sa_session.add( snapshot ) 876 self.sa_session.flush() 877 878 def process_zombie( self, inst ): 879 """ 880 Attempt at discovering if starting a cloud instance was successful but local database was not updated 881 accordingly or if something else failed and instance was never started. Currently, no automatic 882 repairs are being attempted; instead, appropriate error messages are set. 883 """ 884 uci_id = inst.uci_id 885 uci = self.sa_session.query( model.UCI ).get( uci_id ) 886 self.sa_session.refresh( uci ) 887 888 # Check if any instance-specific information was written to local DB; if 'yes', set instance and UCI's error message 889 # suggesting manual check. 890 if inst.launch_time != None or inst.reservation_id != None or inst.instance_id != None: 891 # Try to recover state - this is best-case effort, so if something does not work immediately, not 892 # recovery steps are attempted. Recovery is based on hope that instance_id is available in local DB; if not, 893 # report as error. 
894 # Fields attempting to be recovered are: reservation_id, instance status, and launch_time 895 if inst.instance_id != None: 896 conn = self.get_connection_from_uci( uci ) 897 rl = conn.get_all_instances( [inst.instance_id] ) # reservation list 898 # Update local DB with relevant data from instance 899 if inst.reservation_id == None: 900 try: 901 inst.reservation_id = str(rl[0]).split(":")[1] 902 except: # something failed, so skip 903 pass 904 905 try: 906 state = rl[0].instances[0].update() 907 inst.state = state 908 uci.state = state 909 self.sa_session.add( inst ) 910 self.sa_session.add( uci ) 911 self.sa_session.flush() 912 except: # something failed, so skip 913 pass 914 915 if inst.launch_time == None: 916 try: 917 launch_time = self.format_time( rl[0].instances[0].launch_time ) 918 inst.launch_time = launch_time 919 self.sa_session.add( inst ) 920 self.sa_session.flush() 921 if inst.uci.launch_time == None: 922 uci.launch_time = launch_time 923 self.sa_session.add( uci ) 924 self.sa_session.flush() 925 except: # something failed, so skip 926 pass 927 else: 928 err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \ 929 "' seems to have failed. Because it appears that cloud instance might have gotten started, manual check is recommended." 930 inst.error = err 931 inst.state = instance_states.ERROR 932 inst.uci.error = err 933 inst.uci.state = uci_states.ERROR 934 log.error( err ) 935 self.sa_session.add( inst ) 936 self.sa_session.add( uci ) 937 self.sa_session.flush() 938 939 else: #Instance most likely never got processed, so set error message suggesting user to try starting instance again. 940 err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \ 941 "' seems to have failed. Because it appears that cloud instance never got started, it should be safe to reset state and try " \ 942 "starting the instance again." 
943 inst.error = err 944 inst.state = instance_states.ERROR 945 uci.error = err 946 uci.state = uci_states.ERROR 947 log.error( err ) 948 self.sa_session.add( inst ) 949 self.sa_session.add( uci ) 950 self.sa_session.flush() 951# uw = UCIwrapper( inst.uci ) 952# log.debug( "Try automatically re-submitting UCI '%s'." % uw.get_name() ) 953 954 def get_connection_from_uci( self, uci ): 955 """ 956 Establish and return connection to cloud provider. Information needed to do so is obtained 957 directly from uci database object. 958 """ 959 log.debug( 'Establishing %s cloud connection' % self.type ) 960 a_key = uci.credentials.access_key 961 s_key = uci.credentials.secret_key 962 # Get connection 963 try: 964 region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint ) 965# log.debug( "[%s] Using following command to connect to cloud provider: " 966# "conn = EC2Connection( aws_access_key_id=%s, " 967# "aws_secret_access_key=%s, " 968# "port=%s, " 969# "is_secure=%s, " 970# "region=region, " 971# "path=%s )" % ( self.type, a_key, s_key, uci.credentials.provider.is_secure, uci.credentials.provider.port, uci.credentials.provider.path ) ) 972 conn = EC2Connection( aws_access_key_id=a_key, 973 aws_secret_access_key=s_key, 974 is_secure=uci.credentials.provider.is_secure, 975 port=uci.credentials.provider.port, 976 region=region, 977 path=uci.credentials.provider.path ) 978 except boto.exception.EC2ResponseError, e: 979 err = "Establishing connection with cloud failed: " + str( e ) 980 log.error( err ) 981 uci.error = err 982 uci.state = uci_states.ERROR 983 self.sa_session.add( uci ) 984 self.sa_session.flush() 985 return None 986 987 return conn 988 989# def updateUCI( self, uci ): 990# """ 991# Runs a global status update on all storage volumes and all instances that are 992# associated with specified UCI 993# """ 994# conn = self.get_connection( uci ) 995# 996# # Update status of storage volumes 997# vl = 
model.CloudStore.filter( model.CloudInstance.table.c.uci_id == uci.id ).all() 998# vols = [] 999# for v in vl: 1000# vols.append( v.volume_id ) 1001# try: 1002# volumes = conn.get_all_volumes( vols ) 1003# for i, v in enumerate( volumes ): 1004# uci.store[i].inst.instance_id = v.instance_id 1005# uci.store[i].status = v.status 1006# uci.store[i].device = v.device 1007# uci.store[i].flush() 1008# except: 1009# log.debug( "Error updating status of volume(s) associated with UCI '%s'. Status was not updated." % uci.name ) 1010# pass 1011# 1012# # Update status of instances 1013# il = model.CloudInstance.filter_by( uci_id=uci.id ).filter( model.CloudInstance.table.c.state != 'terminated' ).all() 1014# instanceList = [] 1015# for i in il: 1016# instanceList.append( i.instance_id ) 1017# log.debug( 'instanceList: %s' % instanceList ) 1018# try: 1019# reservations = conn.get_all_instances( instanceList ) 1020# for i, r in enumerate( reservations ): 1021# uci.instance[i].state = r.instances[0].update() 1022# log.debug('updating instance %s; status: %s' % ( uci.instance[i].instance_id, uci.instance[i].state ) ) 1023# uci.state = uci.instance[i].state 1024# uci.instance[i].public_dns = r.instances[0].dns_name 1025# uci.instance[i].private_dns = r.instances[0].private_dns_name 1026# uci.instance[i].flush() 1027# uci.flush() 1028# except: 1029# log.debug( "Error updating status of instances associated with UCI '%s'. Instance status was not updated." % uci.name ) 1030# pass 1031 1032 # --------- Helper methods ------------ 1033 1034 def format_time( self, time ): 1035 dict = {'T':' ', 'Z':''} 1036 for i, j in dict.iteritems(): 1037 time = time.replace(i, j) 1038 return time 1039