PageRenderTime 63ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/cloud/providers/ec2.py

https://bitbucket.org/ajish/galaxy-omelogic
Python | 1033 lines | 1006 code | 14 blank | 13 comment | 65 complexity | d6f582a6b4ede3dc825c5f2ce22096b4 MD5 | raw file

Large files are truncated, but you can click here to view the full file

  1. import subprocess, threading, os, errno, time, datetime
  2. from Queue import Queue, Empty
  3. from datetime import datetime
  4. from galaxy import model # Database interaction class
  5. from galaxy.model import mapping
  6. from galaxy.datatypes.data import nice_size
  7. from galaxy.util.bunch import Bunch
  8. from galaxy.cloud import UCIwrapper
  9. from Queue import Queue
  10. from sqlalchemy import or_, and_
  11. import galaxy.eggs
  12. galaxy.eggs.require("boto")
  13. from boto.ec2.connection import EC2Connection
  14. from boto.ec2.regioninfo import RegionInfo
  15. import boto.exception
  16. import boto
  17. import logging
  18. log = logging.getLogger( __name__ )
# States of a User Configured Instance (UCI) as stored in the local database.
# The '...UCI' variants mark requests that have not been accepted by a cloud
# provider yet; EC2CloudProvider.put() strips that suffix when it accepts a
# request, and run_next() dispatches on the suffix-free values.
uci_states = Bunch(
    NEW_UCI = "newUCI",
    NEW = "new",
    CREATING = "creating",
    DELETING_UCI = "deletingUCI",
    DELETING = "deleting",
    SUBMITTED_UCI = "submittedUCI",
    SUBMITTED = "submitted",
    SHUTTING_DOWN_UCI = "shutting-downUCI",
    SHUTTING_DOWN = "shutting-down",
    AVAILABLE = "available",
    RUNNING = "running",
    PENDING = "pending",
    ERROR = "error",
    DELETED = "deleted",
    SNAPSHOT_UCI = "snapshotUCI",
    SNAPSHOT = "snapshot"
)

# States of a single cloud instance; update() polls instances that are
# 'running', 'pending' or 'shutting-down'.
instance_states = Bunch(
    TERMINATED = "terminated",
    SUBMITTED = "submitted",
    RUNNING = "running",
    PENDING = "pending",
    SHUTTING_DOWN = "shutting-down",
    ERROR = "error"
)

# Status values of a storage volume (store) belonging to a UCI.
store_status = Bunch(
    WAITING = "waiting",
    IN_USE = "in-use",
    CREATING = "creating",
    DELETED = 'deleted',
    ERROR = "error"
)

# Status values of a volume snapshot; update() refreshes 'pending' snapshots
# and deletes the ones marked 'delete'.
snapshot_status = Bunch(
    SUBMITTED = 'submitted',
    PENDING = 'pending',
    COMPLETED = 'completed',
    DELETE = 'delete',
    DELETED= 'deleted',
    ERROR = "error"
)
  60. class EC2CloudProvider( object ):
  61. """
  62. Amazon EC2-based cloud provider implementation for managing instances.
  63. """
  64. STOP_SIGNAL = object()
  65. def __init__( self, app ):
  66. self.type = "ec2" # cloud provider type (e.g., ec2, eucalyptus, opennebula)
  67. self.zone = "us-east-1a"
  68. self.security_group = "galaxyWeb"
  69. self.queue = Queue()
  70. self.sa_session = app.model.context
  71. self.threads = []
  72. nworkers = 5
  73. log.info( "Starting EC2 cloud controller workers..." )
  74. for i in range( nworkers ):
  75. worker = threading.Thread( target=self.run_next )
  76. worker.start()
  77. self.threads.append( worker )
  78. log.debug( "%d EC2 cloud workers ready", nworkers )
  79. def shutdown( self ):
  80. """Attempts to gracefully shut down the monitor thread"""
  81. log.info( "sending stop signal to worker threads in EC2 cloud manager" )
  82. for i in range( len( self.threads ) ):
  83. self.queue.put( self.STOP_SIGNAL )
  84. log.info( "EC2 cloud manager stopped" )
  85. def put( self, uci_wrapper ):
  86. """
  87. Add uci_wrapper object to the end of the request queue to be handled by
  88. this cloud provider.
  89. """
  90. state = uci_wrapper.get_uci_state()
  91. uci_wrapper.change_state( state.split('U')[0] ) # remove 'UCI' from end of state description (i.e., mark as accepted and ready for processing)
  92. self.queue.put( uci_wrapper )
  93. def run_next( self ):
  94. """Process next request, waiting until one is available if necessary."""
  95. cnt = 0
  96. while 1:
  97. uci_wrapper = self.queue.get()
  98. uci_state = uci_wrapper.get_uci_state()
  99. if uci_state is self.STOP_SIGNAL:
  100. return
  101. try:
  102. if uci_state==uci_states.NEW:
  103. self.create_uci( uci_wrapper )
  104. elif uci_state==uci_states.DELETING:
  105. self.delete_uci( uci_wrapper )
  106. elif uci_state==uci_states.SUBMITTED:
  107. self.start_uci( uci_wrapper )
  108. elif uci_state==uci_states.SHUTTING_DOWN:
  109. self.stop_uci( uci_wrapper )
  110. elif uci_state==uci_states.SNAPSHOT:
  111. self.snapshot_uci( uci_wrapper )
  112. except:
  113. log.exception( "Uncaught exception executing cloud request." )
  114. cnt += 1
  115. def get_connection( self, uci_wrapper ):
  116. """
  117. Establishes cloud connection using user's credentials associated with given UCI
  118. """
  119. log.debug( 'Establishing %s cloud connection.' % self.type )
  120. provider = uci_wrapper.get_provider()
  121. try:
  122. region = RegionInfo( None, provider.region_name, provider.region_endpoint )
  123. except Exception, ex:
  124. err = "Selecting region with cloud provider failed: " + str( ex )
  125. log.error( err )
  126. uci_wrapper.set_error( err, True )
  127. return None
  128. try:
  129. conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(),
  130. aws_secret_access_key=uci_wrapper.get_secret_key(),
  131. is_secure=provider.is_secure,
  132. region=region,
  133. path=provider.path )
  134. except boto.exception.EC2ResponseError, e:
  135. err = "Establishing connection with cloud failed: " + str( e )
  136. log.error( err )
  137. uci_wrapper.set_error( err, True )
  138. return None
  139. return conn
  140. def check_key_pair( self, uci_wrapper, conn ):
  141. """
  142. Check if a key pair associated with this UCI exists on cloud provider.
  143. If yes, return key pair name; otherwise, generate a key pair with the cloud
  144. provider and, again, return key pair name.
  145. Key pair name for given UCI is generated from UCI's name and suffix '_kp'
  146. """
  147. kp = None
  148. kp_name = uci_wrapper.get_name().replace(' ','_') + "_kp"
  149. log.debug( "Checking user's key pair: '%s'" % kp_name )
  150. try:
  151. kp = conn.get_key_pair( kp_name )
  152. uci_kp_name = uci_wrapper.get_key_pair_name()
  153. uci_material = uci_wrapper.get_key_pair_material()
  154. if kp != None:
  155. if kp.name != uci_kp_name or uci_material == None:
  156. # key pair exists on the cloud but not in local database, so re-generate it (i.e., delete and then create)
  157. try:
  158. conn.delete_key_pair( kp_name )
  159. kp = self.create_key_pair( conn, kp_name )
  160. uci_wrapper.set_key_pair( kp.name, kp.material )
  161. except boto.exception.EC2ResponseError, e:
  162. err = "EC2 response error while deleting key pair: " + str( e )
  163. log.error( err )
  164. uci_wrapper.set_error( err, True )
  165. else:
  166. try:
  167. kp = self.create_key_pair( conn, kp_name )
  168. uci_wrapper.set_key_pair( kp.name, kp.material )
  169. except boto.exception.EC2ResponseError, e:
  170. err = "EC2 response error while creating key pair: " + str( e )
  171. log.error( err )
  172. uci_wrapper.set_error( err, True )
  173. except Exception, ex:
  174. err = "Exception while creating key pair: " + str( ex )
  175. log.error( err )
  176. uci_wrapper.set_error( err, True )
  177. except boto.exception.EC2ResponseError, e: # No keypair under this name exists so create it
  178. if e.code == 'InvalidKeyPair.NotFound':
  179. log.info( "No keypair found, creating keypair '%s'" % kp_name )
  180. kp = self.create_key_pair( conn, kp_name )
  181. uci_wrapper.set_key_pair( kp.name, kp.material )
  182. else:
  183. err = "EC2 response error while retrieving key pair: " + str( e )
  184. log.error( err )
  185. uci_wrapper.set_error( err, True )
  186. if kp != None:
  187. return kp.name
  188. else:
  189. return None
  190. def create_key_pair( self, conn, kp_name ):
  191. """ Initiate creation of key pair under kp_name by current cloud provider. """
  192. try:
  193. return conn.create_key_pair( kp_name )
  194. except boto.exception.EC2ResponseError, e:
  195. return None
  196. def get_mi_id( self, uci_wrapper, i_index ):
  197. """
  198. Get appropriate machine image (mi) based on instance size.
  199. """
  200. i_type = uci_wrapper.get_instance_type( i_index )
  201. if i_type=='m1.small' or i_type=='c1.medium':
  202. arch = 'i386'
  203. else:
  204. arch = 'x86_64'
  205. mi = self.sa_session.query( model.CloudImage ).filter_by( deleted=False, provider_type=self.type, architecture=arch ).first()
  206. if mi:
  207. return mi.image_id
  208. else:
  209. err = "Machine image could not be retrieved"
  210. log.error( "%s for UCI '%s'." % (err, uci_wrapper.get_name() ) )
  211. uci_wrapper.set_error( err+". Contact site administrator to ensure needed machine image is registered.", True )
  212. return None
  213. def create_uci( self, uci_wrapper ):
  214. """
  215. Create User Configured Instance (UCI) - i.e., create storage volume on cloud provider
  216. and register relevant information in local Galaxy database.
  217. """
  218. conn = self.get_connection( uci_wrapper )
  219. if uci_wrapper.get_uci_availability_zone()=='':
  220. log.info( "Availability zone for UCI (i.e., storage volume) was not selected, using default zone: %s" % self.zone )
  221. uci_wrapper.set_store_availability_zone( self.zone )
  222. log.info( "Creating volume in zone '%s'..." % uci_wrapper.get_uci_availability_zone() )
  223. # Because only 1 storage volume may be created at UCI config time, index of this storage volume in local Galaxy DB w.r.t
  224. # current UCI is 0, so reference it in following methods
  225. vol = conn.create_volume( uci_wrapper.get_store_size( 0 ), uci_wrapper.get_uci_availability_zone(), snapshot=None )
  226. uci_wrapper.set_store_volume_id( 0, vol.id )
  227. # Wait for a while to ensure volume was created
  228. # vol_status = vol.status
  229. # for i in range( 30 ):
  230. # if vol_status is not "available":
  231. # log.debug( 'Updating volume status; current status: %s' % vol_status )
  232. # vol_status = vol.status
  233. # time.sleep(3)
  234. # if i is 29:
  235. # log.debug( "Error while creating volume '%s'; stuck in state '%s'; deleting volume." % ( vol.id, vol_status ) )
  236. # conn.delete_volume( vol.id )
  237. # uci_wrapper.change_state( uci_state='error' )
  238. # return
  239. # Retrieve created volume again to get updated status
  240. try:
  241. vl = conn.get_all_volumes( [vol.id] )
  242. except boto.exception.EC2ResponseError, e:
  243. err = "EC2 response error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( e )
  244. log.error( err )
  245. uci_wrapper.set_store_status( vol.id, uci_states.ERROR )
  246. uci_wrapper.set_error( err, True )
  247. return
  248. except Exception, ex:
  249. err = "Error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( ex )
  250. log.error( err )
  251. uci_wrapper.set_error( err, True )
  252. return
  253. if len( vl ) > 0:
  254. uci_wrapper.change_state( uci_state=vl[0].status )
  255. uci_wrapper.set_store_status( vol.id, vl[0].status )
  256. else:
  257. err = "Volume '" + vol.id +"' not found by EC2 after being created."
  258. log.error( err )
  259. uci_wrapper.set_store_status( vol.id, uci_states.ERROR )
  260. uci_wrapper.set_error( err, True )
  261. def delete_uci( self, uci_wrapper ):
  262. """
  263. Delete UCI - i.e., delete all storage volumes associated with this UCI.
  264. NOTE that this implies deletion of any and all data associated
  265. with this UCI from the cloud. All data will be deleted.
  266. Information in local Galaxy database is marked as deleted but not actually removed
  267. from the database.
  268. """
  269. conn = self.get_connection( uci_wrapper )
  270. vl = [] # volume list
  271. count = 0 # counter for checking if all volumes assoc. w/ UCI were deleted
  272. # Get all volumes assoc. w/ UCI, delete them from cloud as well as in local DB
  273. vl = uci_wrapper.get_all_stores()
  274. deletedList = []
  275. failedList = []
  276. for v in vl:
  277. log.debug( "Deleting volume with id='%s'" % v.volume_id )
  278. try:
  279. if conn.delete_volume( v.volume_id ):
  280. deletedList.append( v.volume_id )
  281. v.deleted = True
  282. self.sa_session.add( v )
  283. self.sa_session.flush()
  284. count += 1
  285. else:
  286. failedList.append( v.volume_id )
  287. except boto.exception.EC2ResponseError, e:
  288. err = "EC2 response error while deleting storage volume '" + v.volume_id + "': " + str( e )
  289. log.error( err )
  290. uci_wrapper.set_store_error( err, store_id = v.volume_id )
  291. uci_wrapper.set_error( err, True )
  292. # Delete UCI if all of associated
  293. if count == len( vl ):
  294. uci_wrapper.set_deleted()
  295. else:
  296. err = "Deleting following volume(s) failed: " + str( failedList ) + ". However, these volumes were successfully deleted: " \
  297. + str( deletedList ) + ". MANUAL intervention and processing needed."
  298. log.error( err )
  299. uci_wrapper.set_error( err, True )
  300. def snapshot_uci( self, uci_wrapper ):
  301. """
  302. Initiate creation of a snapshot by cloud provider for all storage volumes
  303. associated with this UCI.
  304. """
  305. if uci_wrapper.get_uci_state() != uci_states.ERROR:
  306. conn = self.get_connection( uci_wrapper )
  307. snapshots = uci_wrapper.get_snapshots( status = snapshot_status.SUBMITTED )
  308. for snapshot in snapshots:
  309. log.debug( "Snapshot DB id: '%s', volume id: '%s'" % ( snapshot.id, snapshot.store.volume_id ) )
  310. try:
  311. snap = conn.create_snapshot( volume_id=snapshot.store.volume_id )
  312. snap_id = str( snap ).split(':')[1]
  313. uci_wrapper.set_snapshot_id( snapshot.id, snap_id )
  314. sh = conn.get_all_snapshots( snap_id ) # get updated status
  315. uci_wrapper.set_snapshot_status( status=sh[0].status, snap_id=snap_id )
  316. except boto.exception.EC2ResponseError, e:
  317. err = "EC2 response error while creating snapshot: " + str( e )
  318. log.error( err )
  319. uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
  320. uci_wrapper.set_error( err, True )
  321. return
  322. except Exception, ex:
  323. err = "Error while creating snapshot: " + str( ex )
  324. log.error( err )
  325. uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
  326. uci_wrapper.set_error( err, True )
  327. return
  328. uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )
    def add_storage_to_uci( self, name ):
        """
        Add more storage to the specified UCI.

        TODO: not implemented yet; the method currently does nothing.
        """
  332. def dummy_start_uci( self, uci_wrapper ):
  333. uci = uci_wrapper.get_uci()
  334. log.debug( "Would be starting instance '%s'" % uci.name )
  335. uci_wrapper.change_state( uci_state.PENDING )
  336. # log.debug( "Sleeping a bit... (%s)" % uci.name )
  337. # time.sleep(20)
  338. # log.debug( "Woke up! (%s)" % uci.name )
    def start_uci( self, uci_wrapper ):
        """
        Start instance(s) of given UCI on the cloud.

        For every instance of the UCI that is in 'submitted' state: look up the
        machine image, make sure the provider's security group exists (creating
        it with HTTP and SSH open if it does not), run the instance and record
        launch time, reservation id, instance id, state and security group in
        the local database. Errors are logged and recorded on the UCI; nothing
        is returned.
        """
        if uci_wrapper.get_uci_state() != uci_states.ERROR:
            # NOTE(review): get_connection() returns None on failure and conn is
            # used below without a check - confirm whether a guard is needed here.
            conn = self.get_connection( uci_wrapper )
            # Return value is not used; the key pair name is re-read from the UCI below
            self.check_key_pair( uci_wrapper, conn )
            if uci_wrapper.get_key_pair_name() == None:
                err = "Key pair not found"
                log.error( "%s for UCI '%s'." % ( err, uci_wrapper.get_name() ) )
                uci_wrapper.set_error( err + ". Try resetting the state and starting the instance again.", True )
                return
            i_indexes = uci_wrapper.get_instances_indexes( state=instance_states.SUBMITTED ) # Get indexes of instances associated with this UCI that are in 'submitted' state
            log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( i_indexes, uci_wrapper.get_name(), ) )
            if len( i_indexes ) > 0:
                for i_index in i_indexes:
                    # Get machine image for current instance
                    mi_id = self.get_mi_id( uci_wrapper, i_index )
                    log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name(): %s" % ( mi_id, uci_wrapper.get_key_pair_name() ) )
                    uci_wrapper.set_mi( i_index, mi_id )
                    if mi_id != None:
                        # Check if galaxy security group exists (and create it if it does not)
                        log.debug( "Setting up '%s' security group." % self.security_group )
                        try:
                            conn.get_all_security_groups( [self.security_group] ) # security groups
                        except boto.exception.EC2ResponseError, e:
                            if e.code == 'InvalidGroup.NotFound':
                                log.info( "No security group found, creating security group '%s'" % self.security_group )
                                try:
                                    gSecurityGroup = conn.create_security_group(self.security_group, 'Security group for Galaxy.')
                                    gSecurityGroup.authorize( 'tcp', 80, 80, '0.0.0.0/0' ) # Open HTTP port
                                    gSecurityGroup.authorize( 'tcp', 22, 22, '0.0.0.0/0' ) # Open SSH port
                                except boto.exception.EC2ResponseError, ee:
                                    err = "EC2 response error while creating security group: " + str( ee )
                                    log.error( err )
                                    uci_wrapper.set_error( err, True )
                            else:
                                err = "EC2 response error while retrieving security group: " + str( e )
                                log.error( err )
                                uci_wrapper.set_error( err, True )
                        # The security group handling above may have flagged the UCI, so re-check
                        if uci_wrapper.get_uci_state() != uci_states.ERROR:
                            # Start an instance
                            log.debug( "Starting instance for UCI '%s'" % uci_wrapper.get_name() )
                            #TODO: Once multiple volumes can be attached to a single instance, update 'userdata' composition
                            # User data handed to the instance: "<volume id>|<access key>|<secret key>"
                            userdata = uci_wrapper.get_store_volume_id()+"|"+uci_wrapper.get_access_key()+"|"+uci_wrapper.get_secret_key()
                            # user_data is left out of the log message because it contains the credentials
                            log.debug( "Using following command: conn.run_instances( image_id='%s', key_name='%s', security_groups=['%s'], user_data=[OMITTED], instance_type='%s', placement='%s' )"
                                       % ( mi_id, uci_wrapper.get_key_pair_name(), self.security_group, uci_wrapper.get_instance_type( i_index ), uci_wrapper.get_uci_availability_zone() ) )
                            reservation = None
                            try:
                                reservation = conn.run_instances( image_id=mi_id,
                                                                  key_name=uci_wrapper.get_key_pair_name(),
                                                                  security_groups=[self.security_group],
                                                                  user_data=userdata,
                                                                  instance_type=uci_wrapper.get_instance_type( i_index ),
                                                                  placement=uci_wrapper.get_uci_availability_zone() )
                            except boto.exception.EC2ResponseError, e:
                                err = "EC2 response error when starting UCI '"+ uci_wrapper.get_name() +"': " + str( e )
                                log.error( err )
                                uci_wrapper.set_error( err, True )
                            except Exception, ex:
                                err = "Error when starting UCI '" + uci_wrapper.get_name() + "': " + str( ex )
                                log.error( err )
                                uci_wrapper.set_error( err, True )
                            # Record newly available instance data into local Galaxy database
                            if reservation:
                                # Launch time is the local UTC time of the request, not the time reported by the cloud
                                l_time = datetime.utcnow()
                                # uci_wrapper.set_instance_launch_time( self.format_time( reservation.instances[0].launch_time ), i_index=i_index )
                                uci_wrapper.set_instance_launch_time( l_time, i_index=i_index )
                                if not uci_wrapper.uci_launch_time_set():
                                    uci_wrapper.set_uci_launch_time( l_time )
                                try:
                                    # Ids are taken from the part after the colon in the objects' string form
                                    uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] )
                                    # TODO: if more than a single instance will be started through single reservation, change this reference to element [0]
                                    i_id = str( reservation.instances[0]).split(":")[1]
                                    uci_wrapper.set_instance_id( i_index, i_id )
                                    s = reservation.instances[0].state
                                    uci_wrapper.change_state( s, i_id, s )
                                    uci_wrapper.set_security_group_name( self.security_group, i_id=i_id )
                                    vol_id = uci_wrapper.get_store_volume_id( store_id=0 ) # TODO: Once more that one vol/UCI is allowed, update this!
                                    uci_wrapper.set_store_status( vol_id, store_status.WAITING )
                                    log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_uci_state() ) )
                                except boto.exception.EC2ResponseError, e:
                                    err = "EC2 response error when retrieving instance information for UCI '" + uci_wrapper.get_name() + "': " + str( e )
                                    log.error( err )
                                    uci_wrapper.set_error( err, True )
                        else:
                            # NOTE(review): pairing of this 'else' with the state check above is
                            # assumed from its message - confirm against the original indentation.
                            log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
            else:
                err = "No instances in state '"+ instance_states.SUBMITTED +"' found for UCI '" + uci_wrapper.get_name() + \
                      "'. Nothing to start."
                log.error( err )
                uci_wrapper.set_error( err, True )
        else:
            log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
  433. def stop_uci( self, uci_wrapper):
  434. """
  435. Stop all of cloud instances associated with given UCI.
  436. """
  437. conn = self.get_connection( uci_wrapper )
  438. # Get all instances associated with given UCI
  439. il = uci_wrapper.get_instances_ids() # instance list
  440. # Process list of instances and remove any references to empty instance id's
  441. for i in il:
  442. if i is None:
  443. il.remove( i )
  444. log.debug( 'List of instances being terminated: %s' % il )
  445. rl = conn.get_all_instances( il ) # Reservation list associated with given instances
  446. # Initiate shutdown of all instances under given UCI
  447. cnt = 0
  448. stopped = []
  449. not_stopped = []
  450. for r in rl:
  451. for inst in r.instances:
  452. log.debug( "Sending stop signal to instance '%s' associated with reservation '%s'." % ( inst, r ) )
  453. try:
  454. inst.stop()
  455. uci_wrapper.set_stop_time( datetime.utcnow(), i_id=inst.id )
  456. uci_wrapper.change_state( instance_id=inst.id, i_state=inst.update() )
  457. stopped.append( inst )
  458. except boto.exception.EC2ResponseError, e:
  459. not_stopped.append( inst )
  460. err = "EC2 response error when stopping instance '" + inst.instance_id + "': " + str(e)
  461. log.error( err )
  462. uci_wrapper.set_error( err, True )
  463. uci_wrapper.reset_uci_launch_time()
  464. log.debug( "Termination was initiated for all instances of UCI '%s'." % uci_wrapper.get_name() )
  465. # dbInstances = get_instances( trans, uci ) #TODO: handle list!
  466. #
  467. # # Get actual cloud instance object
  468. # cloudInstance = get_cloud_instance( conn, dbInstances.instance_id )
  469. #
  470. # # TODO: Detach persistent storage volume(s) from instance and update volume data in local database
  471. # stores = get_stores( trans, uci )
  472. # for i, store in enumerate( stores ):
  473. # log.debug( "Detaching volume '%s' to instance '%s'." % ( store.volume_id, dbInstances.instance_id ) )
  474. # mntDevice = store.device
  475. # volStat = None
  476. ## Detaching volume does not work with Eucalyptus Public Cloud, so comment it out
  477. ## try:
  478. ## volStat = conn.detach_volume( store.volume_id, dbInstances.instance_id, mntDevice )
  479. ## except:
  480. ## log.debug ( 'Error detaching volume; still going to try and stop instance %s.' % dbInstances.instance_id )
  481. # store.attach_time = None
  482. # store.device = None
  483. # store.i_id = None
  484. # store.status = volStat
  485. # log.debug ( '***** volume status: %s' % volStat )
  486. #
  487. #
  488. # # Stop the instance and update status in local database
  489. # cloudInstance.stop()
  490. # dbInstances.stop_time = datetime.utcnow()
  491. # while cloudInstance.state != 'terminated':
  492. # log.debug( "Stopping instance %s state; current state: %s" % ( str( cloudInstance ).split(":")[1], cloudInstance.state ) )
  493. # time.sleep(3)
  494. # cloudInstance.update()
  495. # dbInstances.state = cloudInstance.state
  496. #
  497. # # Reset relevant UCI fields
  498. # uci.state = 'available'
  499. # uci.launch_time = None
  500. #
  501. # # Persist
  502. # session = trans.sa_session
  503. ## session.save_or_update( stores )
  504. # session.save_or_update( dbInstances ) # TODO: Is this going to work w/ multiple instances stored in dbInstances variable?
  505. # session.save_or_update( uci )
  506. # session.flush()
  507. # trans.log_event( "User stopped cloud instance '%s'" % uci.name )
  508. # trans.set_message( "Galaxy instance '%s' stopped." % uci.name )
  509. def update( self ):
  510. """
  511. Run status update on all instances that are in 'running', 'pending', or 'shutting-down' state.
  512. Run status update on all storage volumes whose status is 'in-use', 'creating', or 'None'.
  513. Run status update on all snapshots whose status is 'pending' or 'delete'
  514. Run status update on any zombie UCIs, i.e., UCI's that is in 'submitted' state for an
  515. extended period of time.
  516. Reason behind this method is to sync state of local DB and real-world resources
  517. """
  518. log.debug( "Running general status update for %s UCIs..." % self.type )
  519. # Update instances
  520. instances = self.sa_session.query( model.CloudInstance ) \
  521. .filter( or_( model.CloudInstance.table.c.state==instance_states.RUNNING,
  522. model.CloudInstance.table.c.state==instance_states.PENDING,
  523. model.CloudInstance.table.c.state==instance_states.SHUTTING_DOWN ) ) \
  524. .all()
  525. for inst in instances:
  526. if self.type == inst.uci.credentials.provider.type:
  527. log.debug( "[%s] Running general status update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.instance_id ) )
  528. self.update_instance( inst )
  529. # Update storage volume(s)
  530. stores = self.sa_session.query( model.CloudStore ) \
  531. .filter( or_( model.CloudStore.table.c.status==store_status.IN_USE,
  532. model.CloudStore.table.c.status==store_status.CREATING,
  533. model.CloudStore.table.c.status==store_status.WAITING,
  534. model.CloudStore.table.c.status==None ) ) \
  535. .all()
  536. for store in stores:
  537. if self.type == store.uci.credentials.provider.type: # and store.volume_id != None:
  538. log.debug( "[%s] Running general status update on store with local database ID: '%s'" % ( store.uci.credentials.provider.type, store.id ) )
  539. self.update_store( store )
  540. # else:
  541. # log.error( "[%s] There exists an entry for UCI (%s) storage volume without an ID. Storage volume might have been created with "
  542. # "cloud provider though. Manual check is recommended." % ( store.uci.credentials.provider.type, store.uci.name ) )
  543. # store.uci.error = "There exists an entry in local database for a storage volume without an ID. Storage volume might have been created " \
  544. # "with cloud provider though. Manual check is recommended. After understanding what happened, local database entry for given " \
  545. # "storage volume should be updated."
  546. # store.status = store_status.ERROR
  547. # store.uci.state = uci_states.ERROR
  548. # store.uci.flush()
  549. # store.flush()
  550. # Update pending snapshots or delete ones marked for deletion
  551. snapshots = self.sa_session.query( model.CloudSnapshot ) \
  552. .filter( or_( model.CloudSnapshot.table.c.status == snapshot_status.PENDING, model.CloudSnapshot.table.c.status == snapshot_status.DELETE ) ) \
  553. .all()
  554. for snapshot in snapshots:
  555. if self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.PENDING:
  556. log.debug( "[%s] Running general status update on snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
  557. self.update_snapshot( snapshot )
  558. elif self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.DELETE:
  559. log.debug( "[%s] Initiating deletion of snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
  560. self.delete_snapshot( snapshot )
  561. # Attempt at updating any zombie UCIs (i.e., instances that have been in SUBMITTED state for longer than expected - see below for exact time)
  562. zombies = self.sa_session.query( model.UCI ).filter_by( state=uci_states.SUBMITTED ).all()
  563. for zombie in zombies:
  564. z_instances = self.sa_session.query( model.CloudInstance ) \
  565. .filter_by( uci_id=zombie.id ) \
  566. .filter( or_( model.CloudInstance.table.c.state != instance_states.TERMINATED,
  567. model.CloudInstance.table.c.state == None ) ) \
  568. .all()
  569. for z_inst in z_instances:
  570. if self.type == z_inst.uci.credentials.provider.type:
  571. # log.debug( "z_inst.id: '%s', state: '%s'" % ( z_inst.id, z_inst.state ) )
  572. td = datetime.utcnow() - z_inst.update_time
  573. if td.seconds > 180: # if instance has been in SUBMITTED state for more than 3 minutes
  574. log.debug( "[%s] Running zombie repair update on instance with DB id '%s'" % ( z_inst.uci.credentials.provider.type, z_inst.id ) )
  575. self.process_zombie( z_inst )
    def update_instance( self, inst ):
        """
        Update information in local database for given instance as it is obtained from cloud provider.
        Along with updating information about given instance, information about the UCI controlling
        this instance is also updated.

        inst -- local database record of the instance; its uci_id, instance_id,
                state, public_dns and private_dns are read and/or written here.

        Returns None in all cases; failures are recorded on the UCI (and on the
        instance where applicable) and flushed to the database.
        """
        # Get credentials associated with this instance
        uci_id = inst.uci_id
        uci = self.sa_session.query( model.UCI ).get( uci_id )
        self.sa_session.refresh( uci )
        # NOTE(review): get_connection_from_uci() is defined outside this section -
        # confirm whether it can return None before conn is used below.
        conn = self.get_connection_from_uci( uci )
        # Get reservations handle for given instance
        try:
            rl= conn.get_all_instances( [inst.instance_id] )
        except boto.exception.EC2ResponseError, e:
            err = "Retrieving instance(s) from cloud failed for UCI '"+ uci.name +"' during general status update: " + str( e )
            log.error( err )
            uci.error = err
            uci.state = uci_states.ERROR
            self.sa_session.add( uci )
            self.sa_session.flush()
            return None
        # Because references to reservations are deleted shortly after instances have been terminated, getting an empty list as a response to a query
        # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until an alternative solution
        # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened.
        if len( rl ) == 0:
            err = "Instance ID '"+inst.instance_id+"' was not found by the cloud provider. Instance might have crashed or otherwise been terminated."+ \
                  "Manual check is recommended."
            log.error( err )
            inst.error = err
            uci.error = err
            inst.state = instance_states.TERMINATED
            uci.state = uci_states.ERROR
            uci.launch_time = None
            self.sa_session.add( inst )
            self.sa_session.add( uci )
            self.sa_session.flush()
        # Update instance status in local DB with info from cloud provider
        # (the loop body does not run when rl is empty)
        for r in rl:
            for i, cInst in enumerate( r.instances ):
                try:
                    # The value returned by update() is used as the instance's current state
                    s = cInst.update()
                    log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) )
                    if s != inst.state:
                        inst.state = s
                        self.sa_session.add( inst )
                        self.sa_session.flush()
                        # After instance has shut down, ensure UCI is marked as 'available'
                        # NOTE(review): nesting of this check under the state-change test is
                        # assumed - confirm against the original indentation.
                        if s == instance_states.TERMINATED and uci.state != uci_states.ERROR:
                            uci.state = uci_states.AVAILABLE
                            uci.launch_time = None
                            self.sa_session.add( uci )
                            self.sa_session.flush()
                    # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed.
                    if s != uci.state and s != instance_states.TERMINATED:
                        uci.state = s
                        self.sa_session.add( uci )
                        self.sa_session.flush()
                    # Keep the locally stored DNS names in sync with the cloud's
                    if cInst.public_dns_name != inst.public_dns:
                        inst.public_dns = cInst.public_dns_name
                        self.sa_session.add( inst )
                        self.sa_session.flush()
                    if cInst.private_dns_name != inst.private_dns:
                        inst.private_dns = cInst.private_dns_name
                        self.sa_session.add( inst )
                        self.sa_session.flush()
                except boto.exception.EC2ResponseError, e:
                    err = "Updating instance status from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
                    log.error( err )
                    uci.error = err
                    uci.state = uci_states.ERROR
                    self.sa_session.add( uci )
                    self.sa_session.flush()
                    return None
  650. def update_store( self, store ):
  651. """
  652. Update information in local database for given storage volume as it is obtained from cloud provider.
  653. Along with updating information about given storage volume, information about the UCI controlling
  654. this storage volume is also updated.
  655. """
  656. # Get credentials associated wit this store
  657. uci_id = store.uci_id
  658. uci = self.sa_session.query( model.UCI ).get( uci_id )
  659. self.sa_session.refresh( uci )
  660. conn = self.get_connection_from_uci( uci )
  661. # Get reservations handle for given store
  662. try:
  663. log.debug( "Updating storage volume command: vl = conn.get_all_volumes( [%s] )" % store.volume_id )
  664. vl = conn.get_all_volumes( [store.volume_id] )
  665. except boto.exception.EC2ResponseError, e:
  666. err = "Retrieving volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
  667. log.error( err )
  668. uci.error = err
  669. uci.state = uci_states.ERROR
  670. self.sa_session.add( uci )
  671. self.sa_session.flush()
  672. return None
  673. # Update store status in local DB with info from cloud provider
  674. if len(vl) > 0:
  675. try:
  676. log.debug( "Storage volume '%s' current status: '%s'" % (store.volume_id, vl[0].status ) )
  677. if store.status != vl[0].status:
  678. # In case something failed during creation of UCI but actual storage volume was created and yet
  679. # UCI state remained as 'new', try to remedy this by updating UCI state here
  680. if ( store.status == None ) and ( store.volume_id != None ):
  681. uci.state = vl[0].status
  682. self.sa_session.add( uci )
  683. self.sa_session.flush()
  684. # If UCI was marked in state 'CREATING', update its status to reflect new status
  685. elif ( uci.state == uci_states.CREATING ):
  686. uci.state = vl[0].status
  687. self.sa_session.add( uci )
  688. self.sa_session.flush()
  689. store.status = vl[0].status
  690. self.sa_session.add( store )
  691. self.sa_session.flush()
  692. if store.inst != None:
  693. if store.inst.instance_id != vl[0].instance_id:
  694. store.inst.instance_id = vl[0].instance_id
  695. self.sa_session.add( store )
  696. self.sa_session.flush()
  697. if store.attach_time != vl[0].attach_time:
  698. store.attach_time = vl[0].attach_time
  699. self.sa_session.add( store )
  700. self.sa_session.flush()
  701. if store.device != vl[0].device:
  702. store.device = vl[0].device
  703. self.sa_session.add( store )
  704. self.sa_session.flush()
  705. except boto.exception.EC2ResponseError, e:
  706. err = "Updating status of volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
  707. log.error( err )
  708. uci.error = err
  709. uci.state = uci_states.ERROR
  710. self.sa_session.add( uci )
  711. self.sa_session.flush()
  712. return None
  713. else:
  714. err = "No storage volumes returned by cloud provider on general update"
  715. log.error( "%s for UCI '%s'" % ( err, uci.name ) )
  716. store.status = store_status.ERROR
  717. store.error = err
  718. uci.error = err
  719. uci.state = uci_states.ERROR
  720. self.sa_session.add( uci )
  721. self.sa_session.add( store )
  722. self.sa_session.flush()
  723. def update_snapshot( self, snapshot ):
  724. """
  725. Update information in local database for given snapshot as it is obtained from cloud provider.
  726. Along with updating information about given snapshot, information about the UCI controlling
  727. this snapshot is also updated.
  728. """
  729. # Get credentials associated wit this store
  730. uci_id = snapshot.uci_id
  731. uci = self.sa_session.query( model.UCI ).get( uci_id )
  732. self.sa_session.refresh( uci )
  733. conn = self.get_connection_from_uci( uci )
  734. try:
  735. log.debug( "Updating status of snapshot '%s'" % snapshot.snapshot_id )
  736. snap = conn.get_all_snapshots( [snapshot.snapshot_id] )
  737. if len( snap ) > 0:
  738. log.debug( "Snapshot '%s' status: %s" % ( snapshot.snapshot_id, snap[0].status ) )
  739. snapshot.status = snap[0].status
  740. self.sa_session.add( snapshot )
  741. self.sa_session.flush()
  742. else:
  743. err = "No snapshots returned by EC2 on general update"
  744. log.error( "%s for UCI '%s'" % ( err, uci.name ) )
  745. snapshot.status = snapshot_status.ERROR
  746. snapshot.error = err
  747. uci.error = err
  748. uci.state = uci_states.ERROR
  749. self.sa_session.add( uci )
  750. self.sa_session.add( snapshot )
  751. self.sa_session.flush()
  752. except boto.exception.EC2ResponseError, e:
  753. err = "EC2 response error while updating snapshot status: " + str( e )
  754. log.error( err )
  755. snapshot.status = snapshot_status.ERROR
  756. snapshot.error = err
  757. uci.error = err
  758. uci.state = uci_states.ERROR
  759. self.sa_session.add( uci )
  760. self.sa_session.add( snapshot )
  761. self.sa_session.flush()
  762. except Exception, ex:
  763. err = "Error while updating snapshot status: " + str( ex )
  764. log.error( err )
  765. snapshot.status = snapshot_status.ERROR
  766. snapshot.error = err
  767. uci.error = err
  768. uci.state = uci_states.ERROR
  769. self.sa_session.add( uci )
  770. self.sa_session.add( snapshot )
  771. self.sa_session.flush()
  772. def delete_snapshot( self, snapshot ):
  773. """
  774. Initiate deletion of given snapshot from cloud provider.
  775. """
  776. if snapshot.status == snapshot_status.DELETE:
  777. # Get credentials associated wit this store
  778. uci_id = snapshot.uci_id
  779. uci = self.sa_session.query( model.UCI ).get( uci_id )
  780. self.sa_session.refresh( uci )
  781. conn = self.get_connection_from_uci( uci )
  782. try:
  783. log.debug( "Deleting snapshot '%s'" % snapshot.snapshot_id )
  784. snap = conn.delete_snapshot( snapshot.snapshot_id )
  785. if snap == True:
  786. snapshot.deleted = True
  787. snapshot.status = snapshot_status.DELETED
  788. self.sa_session.add( snapshot )
  789. self.sa_session.flush()
  790. return snap
  791. except boto.exception.EC2ResponseError, e:
  792. err = "EC2 response error while deleting snapshot: " + str( e )
  793. log.error( err )
  794. snapshot.status = snapshot_status.ERROR
  795. snapshot.error = err
  796. uci.error = err
  797. uci.state = uci_states.ERROR
  798. self.sa_session.add( uci )
  799. self.sa_session.add( snapshot )
  800. self.sa_session.flush()
  801. except Exception, ex:
  802. err = "Error while deleting snapshot: " + str( ex )
  803. log.error( err )
  804. snapshot.status = snapshot_status.ERROR
  805. snapshot.error = err
  806. uci.error = err
  807. uci.state = uci_states.ERROR
  808. self.sa_session.add( uci )
  809. self.sa_session.add( snapshot )
  810. self.sa_session.flush()
  811. else:
  812. err = "Cannot delete snapshot '"+snapshot.snapshot_id+"' because its status is '"+snapshot.status+"'. Only snapshots with '" + \
  813. snapshot_status.COMPLETED+"' status can be deleted."
  814. log.error( err )
  815. snapshot.error = err
  816. self.sa_session.add( snapshot )
  817. self.sa_session.flush()
  818. def process_zombie( self, inst ):
  819. """
  820. Attempt at discovering if starting a cloud instance was successful but local database was not updated
  821. accordingly or if something else failed and instance was never started. Currently, no automatic
  822. repairs are being attempted; instead, appropriate error messages are set.
  823. """
  824. uci_id = inst.uci_id
  825. uci = self.sa_session.query( model.UCI ).get( uci_id )
  826. self.sa_session.refresh( uci )
  827. # Check if any instance-specific information was written to local DB; if 'yes', set instance and UCI's error message
  828. # suggesting manual check.
  829. if inst.launch_time != None or inst.reservation_id != None or inst.instance_id != None:
  830. # Try to recover state - this is best-case effort, so if something does not work immediately, not
  831. # recovery steps are attempted. Recovery is based on hope that instance_id is available in local DB; if not,
  832. # report as error.
  833. # Fields attempting to be recovered are: reservation_id, instance status, and launch_time
  834. if inst.instance_id != None:
  835. conn = self.get_connection_from_uci( uci )
  836. rl = conn.get_all_instances( [inst.instance_id] ) # reservation list
  837. # Update local DB with relevant data from instance
  838. if inst.reservation_id == None:
  839. try:
  840. inst.reservation_id = str(rl[0]).split(":")[1]
  841. except: # something failed, so skip
  842. pass
  843. try:
  844. state = rl[0].instances[0].update()
  845. inst.state = state
  846. uci.state = state
  847. self.sa_session.add( inst )
  848. self.sa_session.add( uci )
  849. self.sa_session.flush()
  850. except: # something failed, so skip
  851. pass
  852. if inst.launch_time == None:
  853. try:
  854. launch_time = self.format_time( rl[0].instances[0].launch_time )
  855. inst.launch_time = launch_time
  856. self.sa_session.add( inst )
  857. self.sa_session.flush()
  858. if inst.uci.launch_time == None:
  859. uci.launch_time = launch_time
  860. self.sa_session.add( uci )
  861. self.sa_session.flush()
  862. except: # something failed, so skip
  863. pass
  864. else:
  865. err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \
  866. "' seems to have failed. Because it appears that cloud instance might have gotten started, manual check is recommended."
  867. inst.error = err
  868. inst.state = instance_states.ERROR
  869. inst.uci.error = err
  870. inst.uci.state = uci_states.ERROR
  871. log.error( err )
  872. self.sa_session.add( inst )
  873. self.sa_session.add( uci )
  874. self.sa_session.flush()
  875. else: #Instance most likely never got processed, so set error message suggesting user to try starting instance again.
  876. err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \
  877. "' seems to have failed. Because it appears that cloud instance never got started, it should be safe to reset state and try " \
  878. "starting the instance again."
  879. inst.error = err
  880. inst.state = instance_states.ERROR
  881. uci.error = err
  882. uci.state = uci_states.ERROR
  883. log.error( err )
  884. self.sa_session.add( inst )
  885. self.sa_session.add( uci )
  886. self.sa_session.flush()
  887. # uw = UCIwrapper( inst.uci )
  888. # log.debug( "Try automatically re-submitting UCI '%s'." % uw.get_name() )
  889. def get_connection_from_uci( self, uci ):
  890. """
  891. Establish and return connection to cloud provider. Information needed to do so is obtained
  892. directly from uci database object.
  893. """
  894. log.debug( 'Establishing %s cloud connection' % self.type )
  895. a_key = uci.credentials.access_key
  896. s_key = uci.credentials

Large files are truncated, but you can click here to view the full file