PageRenderTime 66ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/cloud/providers/eucalyptus.py

https://bitbucket.org/afgane/galaxy-central-cloud
Python | 1039 lines | 935 code | 27 blank | 77 comment | 75 complexity | 7dc733a90ef920d6a637fd832094d258 MD5 | raw file
  1. import subprocess, threading, os, errno, time, datetime
  2. from Queue import Queue, Empty
  3. from datetime import datetime
  4. from galaxy import model # Database interaction class
  5. from galaxy.model import mapping
  6. from galaxy.datatypes.data import nice_size
  7. from galaxy.util.bunch import Bunch
  8. from galaxy.cloud import UCIwrapper
  9. from Queue import Queue
  10. from sqlalchemy import or_, and_
  11. import galaxy.eggs
  12. galaxy.eggs.require("boto")
  13. from boto.ec2.connection import EC2Connection
  14. from boto.ec2.regioninfo import RegionInfo
  15. import boto.exception
  16. import boto
  17. import logging
  18. log = logging.getLogger( __name__ )
  # State strings a UCI (User Configured Instance) can carry. Entries whose value
  # ends in 'UCI' are the same state with a 'UCI' suffix appended; the provider's
  # put() removes that suffix when it accepts a request for processing.
  uci_states = Bunch(
      NEW_UCI = "newUCI",
      NEW = "new",
      CREATING = "creating",
      DELETING_UCI = "deletingUCI",
      DELETING = "deleting",
      SUBMITTED_UCI = "submittedUCI",
      SUBMITTED = "submitted",
      SHUTTING_DOWN_UCI = "shutting-downUCI",
      SHUTTING_DOWN = "shutting-down",
      ADD_STORAGE_UCI = "add-storageUCI",
      ADD_STORAGE = "add-storage",
      AVAILABLE = "available",
      RUNNING = "running",
      PENDING = "pending",
      ERROR = "error",
      DELETED = "deleted",
      SNAPSHOT_UCI = "snapshotUCI",
      SNAPSHOT = "snapshot"
  )
  # State strings recorded for an individual cloud instance.
  instance_states = Bunch(
      TERMINATED = "terminated",
      SUBMITTED = "submitted",
      RUNNING = "running",
      ADDING = "adding-storage",
      PENDING = "pending",
      SHUTTING_DOWN = "shutting-down",
      ERROR = "error"
  )
  # Status strings recorded for a storage volume.
  store_status = Bunch(
      WAITING = "waiting",
      IN_USE = "in-use",
      ADDING = "adding",
      CREATING = "creating",
      DELETED = 'deleted',
      ERROR = "error"
  )
  # Status strings recorded for a snapshot. DELETE marks a snapshot whose deletion
  # has been requested (update() hands such snapshots to delete_snapshot()).
  snapshot_status = Bunch(
      SUBMITTED = 'submitted',
      PENDING = 'pending',
      COMPLETED = 'completed',
      DELETE = 'delete',
      DELETED= 'deleted',
      ERROR = "error"
  )
  64. class EucalyptusCloudProvider( object ):
  65. """
  66. Eucalyptus-based cloud provider implementation for managing instances.
  67. """
  68. STOP_SIGNAL = object()
  def __init__( self, app ):
      """
      Set up provider defaults, the request queue and the database session,
      then start the worker threads that consume the queue via run_next().
      """
      self.type = "eucalyptus" # cloud provider type (e.g., ec2, eucalyptus, opennebula)
      self.zone = "epc" # availability zone used when a UCI does not specify one
      self.queue = Queue()
      self.sa_session = app.model.context
      self.threads = []
      worker_count = 5
      log.info( "Starting eucalyptus cloud controller workers..." )
      started = 0
      while started < worker_count:
          thread = threading.Thread( target=self.run_next )
          thread.start()
          self.threads.append( thread )
          started += 1
      log.debug( "%d eucalyptus cloud workers ready", worker_count )
  def shutdown( self ):
      """
      Ask every worker thread to stop by queueing one STOP_SIGNAL per thread.
      The call does not wait for the workers to finish.
      """
      log.info( "sending stop signal to worker threads in eucalyptus cloud manager" )
      for _worker in self.threads:
          # One sentinel per worker; each worker consumes exactly one and returns
          self.queue.put( self.STOP_SIGNAL )
      log.info( "eucalyptus cloud manager stopped" )
  def put( self, uci_wrapper ):
      """
      Add uci_wrapper object to the end of the request queue to be handled by
      this cloud provider.

      Before queueing, the trailing 'UCI' marker is removed from the UCI's state
      (e.g., 'submittedUCI' becomes 'submitted'), which marks the request as
      accepted and ready for processing. A state without the marker is left as is.
      """
      marker = 'UCI'
      state = uci_wrapper.get_uci_state()
      # Strip only a trailing marker; splitting on 'U' would truncate any state
      # that happens to contain a capital 'U' elsewhere.
      if state.endswith( marker ):
          state = state[ :-len( marker ) ]
      uci_wrapper.change_state( state )
      self.queue.put( uci_wrapper )
  def run_next( self ):
      """
      Worker loop: process the next request, waiting until one is available if
      necessary. Returns when STOP_SIGNAL is taken off the queue.

      Each request is dispatched on the UCI's current state. Any exception raised
      while handling a request is logged and the worker continues with the next one.
      """
      while True:
          uci_wrapper = self.queue.get()
          # The sentinel is the queued item itself (see shutdown()), so it must be
          # recognized before any wrapper method is called on it.
          if uci_wrapper is self.STOP_SIGNAL:
              return
          try:
              uci_state = uci_wrapper.get_uci_state()
              if uci_state==uci_states.NEW:
                  self.create_uci( uci_wrapper )
              elif uci_state==uci_states.DELETING:
                  self.delete_uci( uci_wrapper )
              elif uci_state==uci_states.SUBMITTED:
                  self.start_uci( uci_wrapper )
                  #self.dummy_start_uci( uci_wrapper )
              elif uci_state==uci_states.SHUTTING_DOWN:
                  self.stop_uci( uci_wrapper )
              elif uci_state==uci_states.SNAPSHOT:
                  self.snapshot_uci( uci_wrapper )
              elif uci_state==uci_states.ADD_STORAGE:
                  self.add_storage_to_uci( uci_wrapper )
          except Exception:
              # Keep the worker alive; one failed request must not stop the queue
              log.exception( "Uncaught exception executing cloud request." )
  121. def get_connection( self, uci_wrapper ):
  122. """
  123. Establishes cloud connection using user's credentials associated with given UCI
  124. """
  125. log.debug( 'Establishing %s cloud connection.' % self.type )
  126. provider = uci_wrapper.get_provider()
  127. try:
  128. region = RegionInfo( None, provider.region_name, provider.region_endpoint )
  129. except Exception, ex:
  130. err = "Selecting region with cloud provider failed: " + str( ex )
  131. log.error( err )
  132. uci_wrapper.set_error( err, True )
  133. return None
  134. try:
  135. conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(),
  136. aws_secret_access_key=uci_wrapper.get_secret_key(),
  137. is_secure=provider.is_secure,
  138. port=provider.port,
  139. region=region,
  140. path=provider.path )
  141. except boto.exception.EC2ResponseError, e:
  142. err = "Establishing connection with cloud failed: " + str( e )
  143. log.error( err )
  144. uci_wrapper.set_error( err, True )
  145. return None
  146. return conn
  147. def check_key_pair( self, uci_wrapper, conn ):
  148. """
  149. Check if a key pair associated with this UCI exists on cloud provider.
  150. If yes, return key pair name; otherwise, generate a key pair with the cloud
  151. provider and, again, return key pair name.
  152. Key pair name for given UCI is generated from UCI's name and suffix '_kp'
  153. """
  154. kp = None
  155. kp_name = uci_wrapper.get_name().replace(' ','_') + "_kp"
  156. log.debug( "Checking user's key pair: '%s'" % kp_name )
  157. try:
  158. kp = conn.get_key_pair( kp_name )
  159. uci_kp_name = uci_wrapper.get_key_pair_name()
  160. uci_material = uci_wrapper.get_key_pair_material()
  161. if kp != None:
  162. if kp.name != uci_kp_name or uci_material == None:
  163. # key pair exists on the cloud but not in local database, so re-generate it (i.e., delete and then create)
  164. try:
  165. conn.delete_key_pair( kp_name )
  166. kp = self.create_key_pair( conn, kp_name )
  167. uci_wrapper.set_key_pair( kp.name, kp.material )
  168. except boto.exception.EC2ResponseError, e:
  169. err = "EC2 response error while deleting key pair: " + str( e )
  170. log.error( err )
  171. uci_wrapper.set_error( err, True )
  172. else:
  173. try:
  174. kp = self.create_key_pair( conn, kp_name )
  175. uci_wrapper.set_key_pair( kp.name, kp.material )
  176. except boto.exception.EC2ResponseError, e:
  177. err = "EC2 response error while creating key pair: " + str( e )
  178. log.error( err )
  179. uci_wrapper.set_error( err, True )
  180. except Exception, ex:
  181. err = "Exception while creating key pair: " + str( ex )
  182. log.error( err )
  183. uci_wrapper.set_error( err, True )
  184. except boto.exception.EC2ResponseError, e: # No keypair under this name exists so create it
  185. if e.code == 'InvalidKeyPair.NotFound':
  186. log.info( "No keypair found, creating keypair '%s'" % kp_name )
  187. kp = self.create_key_pair( conn, kp_name )
  188. uci_wrapper.set_key_pair( kp.name, kp.material )
  189. else:
  190. err = "EC2 response error while retrieving key pair: " + str( e )
  191. log.error( err )
  192. uci_wrapper.set_error( err, True )
  193. if kp != None:
  194. return kp.name
  195. else:
  196. return None
  197. def create_key_pair( self, conn, kp_name ):
  198. """ Initiate creation of key pair under kp_name by current cloud provider. """
  199. try:
  200. return conn.create_key_pair( kp_name )
  201. except boto.exception.EC2ResponseError, e:
  202. return None
  def get_mi_id( self, uci_wrapper, i_index ):
      """
      Look up the machine image (mi) ID to use for the instance at i_index.

      The architecture is chosen from the instance type ('m1.small' and
      'c1.medium' map to 'i386', everything else to 'x86_64') and the first
      non-deleted image registered for this provider type and architecture is
      used. Returns the image ID, or None (with the error recorded on the UCI)
      if no such image is registered.
      """
      instance_type = uci_wrapper.get_instance_type( i_index )
      if instance_type in ( 'm1.small', 'c1.medium' ):
          architecture = 'i386'
      else:
          architecture = 'x86_64'
      image_query = self.sa_session.query( model.CloudImage )
      image = image_query.filter_by( deleted=False, provider_type=self.type, architecture=architecture ).first()
      if not image:
          err = "Machine image could not be retrieved"
          log.error( "%s for UCI '%s'." % (err, uci_wrapper.get_name() ) )
          uci_wrapper.set_error( err+". Contact site administrator to ensure needed machine image is registered.", True )
          return None
      return image.image_id
  220. def create_uci( self, uci_wrapper ):
  221. """
  222. Create User Configured Instance (UCI) - i.e., create storage volume on cloud provider
  223. and register relevant information in local Galaxy database.
  224. """
  225. conn = self.get_connection( uci_wrapper )
  226. # Because only 1 storage volume may be created at UCI config time, index of this storage volume in local Galaxy DB w.r.t
  227. # current UCI is 0; therefore, it can be referenced in following code
  228. log.info( "Creating volume in zone '%s'..." % uci_wrapper.get_uci_availability_zone() )
  229. if uci_wrapper.get_uci_availability_zone()=='':
  230. log.info( "Availability zone for UCI (i.e., storage volume) was not selected, using default zone: %s" % self.zone )
  231. uci_wrapper.set_store_availability_zone( self.zone )
  232. # log.debug( "Creating volume; using command: conn.create_volume( %s, '%s', snapshot=None )" % ( uci_wrapper.get_store_size( 0 ), uci_wrapper.get_uci_availability_zone() ))
  233. # vol = conn.create_volume( uci_wrapper.get_store_size( 0 ), uci_wrapper.get_uci_availability_zone(), snapshot=None )
  234. # uci_wrapper.set_store_volume_id( 0, vol.id )
  235. store = uci_wrapper.get_all_stores_in_status( store_status.ADDING )[0] # Because at UCI creation time only 1 storage volume can be created, reference it directly
  236. log.info( "Creating storage volume in zone '%s' of size '%s'..." % ( uci_wrapper.get_uci_availability_zone(), store.size ) )
  237. # Because only 1 storage volume may be created at UCI config time, index of this storage volume in local Galaxy DB w.r.t
  238. # current UCI is 0, so reference it in following methods
  239. vol = conn.create_volume( store.size, uci_wrapper.get_uci_availability_zone(), snapshot=None )
  240. uci_wrapper.set_store_volume_id( store.id, vol.id )
  241. # Retrieve created volume again to get updated status
  242. try:
  243. vl = conn.get_all_volumes( [vol.id] )
  244. except boto.exception.EC2ResponseError, e:
  245. err = "EC2 response error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( e )
  246. log.error( err )
  247. uci_wrapper.set_store_status( vol.id, uci_states.ERROR )
  248. uci_wrapper.set_error( err, True )
  249. return
  250. except Exception, ex:
  251. err = "Error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( ex )
  252. log.error( err )
  253. uci_wrapper.set_error( err, True )
  254. return
  255. if len( vl ) > 0:
  256. # EPC does not allow creation of storage volumes (it deletes one as soon as it is created, so manually set uci_state here)
  257. if vl[0].status == store_status.DELETED:
  258. uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )
  259. else:
  260. uci_wrapper.change_state( uci_state=vl[0].status )
  261. uci_wrapper.set_store_status( vol.id, vl[0].status )
  262. else:
  263. err = "Volume '" + vol.id +"' not found by EC2 after being created."
  264. log.error( err )
  265. uci_wrapper.set_store_status( vol.id, uci_states.ERROR )
  266. uci_wrapper.set_error( err, True )
  267. def delete_uci( self, uci_wrapper ):
  268. """
  269. Delete UCI - i.e., delete all storage volumes associated with this UCI.
  270. NOTE that this implies deletion of any and all data associated
  271. with this UCI from the cloud. All data will be deleted.
  272. Information in local Galaxy database is marked as deleted but not actually removed
  273. from the database.
  274. """
  275. conn = self.get_connection( uci_wrapper )
  276. vl = [] # volume list
  277. count = 0 # counter for checking if all volumes assoc. w/ UCI were deleted
  278. # Get all volumes assoc. w/ UCI, delete them from cloud as well as in local DB
  279. vl = uci_wrapper.get_all_stores()
  280. deletedList = []
  281. failedList = []
  282. for v in vl:
  283. log.debug( "Deleting volume with id='%s'" % v.volume_id )
  284. try:
  285. if conn.delete_volume( v.volume_id ):
  286. deletedList.append( v.volume_id )
  287. v.deleted = True
  288. self.sa_session.add( v )
  289. self.sa_session.flush()
  290. count += 1
  291. else:
  292. failedList.append( v.volume_id )
  293. except boto.exception.EC2ResponseError, e:
  294. err = "EC2 response error while deleting storage volume '" + v.volume_id + "': " + str( e )
  295. log.error( err )
  296. uci_wrapper.set_store_error( err, store_id = v.volume_id )
  297. uci_wrapper.set_error( err, True )
  298. # Delete UCI if all of associated
  299. if count == len( vl ):
  300. uci_wrapper.set_deleted()
  301. else:
  302. err = "Deleting following volume(s) failed: "+ str( failedList )+". However, these volumes were successfully deleted: " \
  303. + str( deletedList ) +". MANUAL intervention and processing needed."
  304. log.error( err )
  305. uci_wrapper.set_error( err, True )
  306. def snapshot_uci( self, uci_wrapper ):
  307. """
  308. Initiate creation of a snapshot by cloud provider for all storage volumes
  309. associated with this UCI.
  310. """
  311. if uci_wrapper.get_uci_state() != uci_states.ERROR:
  312. conn = self.get_connection( uci_wrapper )
  313. snapshots = uci_wrapper.get_snapshots( status = snapshot_status.SUBMITTED )
  314. for snapshot in snapshots:
  315. log.debug( "Snapshot DB id: '%s', volume id: '%s'" % ( snapshot.id, snapshot.store.volume_id ) )
  316. try:
  317. snap = conn.create_snapshot( volume_id=snapshot.store.volume_id )
  318. snap_id = str( snap ).split(':')[1]
  319. uci_wrapper.set_snapshot_id( snapshot.id, snap_id )
  320. sh = conn.get_all_snapshots( snap_id ) # get updated status
  321. uci_wrapper.set_snapshot_status( status=sh[0].status, snap_id=snap_id )
  322. except boto.exception.EC2ResponseError, e:
  323. err = "Cloud provider response error while creating snapshot: " + str( e )
  324. log.error( err )
  325. uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
  326. uci_wrapper.set_error( err, True )
  327. return
  328. except Exception, ex:
  329. err = "Error while creating snapshot: " + str( ex )
  330. log.error( err )
  331. uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
  332. uci_wrapper.set_error( err, True )
  333. return
  334. uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )
  335. # if uci_wrapper.get_uci_state() != uci_states.ERROR:
  336. #
  337. # snapshots = uci_wrapper.get_snapshots( status = 'submitted' )
  338. # for snapshot in snapshots:
  339. # uci_wrapper.set_snapshot_id( snapshot.id, None, 'euca_error' )
  340. #
  341. # log.debug( "Eucalyptus snapshot attempted by user for UCI '%s'" % uci_wrapper.get_name() )
  342. # uci_wrapper.set_error( "Eucalyptus does not support creation of snapshots at this moment. No snapshot or other changes were performed. \
  343. # Feel free to resent state of this instance and use it normally.", True )
  def add_storage_to_uci( self, uci_wrapper ):
      """
      Reject a request to add storage to the given UCI: the operation is not
      supported by this provider, so an error is recorded on the UCI.
      """
      unsupported = "Adding storage to eucalyptus-based clouds is not yet supported."
      uci_wrapper.set_error( unsupported, True )
  def dummy_start_uci( self, uci_wrapper ):
      """
      Stand-in for start_uci(): log what would be started, sleep for ten
      seconds and log again. No cloud calls are made.
      """
      uci = uci_wrapper.get_uci()
      for template in ( "Would be starting instance '%s'", "Sleeping a bit... (%s)" ):
          log.debug( template % uci.name )
      time.sleep( 10 )
      log.debug( "Woke up! (%s)" % uci.name )
  def start_uci( self, uci_wrapper ):
      """
      Start instance(s) of given UCI on the cloud.

      For every instance of the UCI that is in 'submitted' state, a machine image
      is looked up, an instance is launched with the UCI's key pair, and the
      resulting reservation/instance data is recorded through uci_wrapper.
      Failures are recorded on the UCI via set_error(); nothing is started if the
      UCI is (or becomes) in 'error' state.
      """
      if uci_wrapper.get_uci_state() != uci_states.ERROR:
          conn = self.get_connection( uci_wrapper )
          # Ensures a key pair exists; the name is read back from the wrapper below
          self.check_key_pair( uci_wrapper, conn )
          if uci_wrapper.get_key_pair_name() == None:
              err = "Key pair not found"
              log.error( "%s for UCI '%s'." % ( err, uci_wrapper.get_name() ) )
              uci_wrapper.set_error( err + ". Try resetting the state and starting the instance again.", True )
              return
          i_indexes = uci_wrapper.get_instances_indexes( state=instance_states.SUBMITTED ) # Get indexes of i_indexes associated with this UCI that are in 'submitted' state
          log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( i_indexes, uci_wrapper.get_name(), ) )
          if len( i_indexes ) > 0:
              for i_index in i_indexes:
                  # Get machine image for current instance
                  mi_id = self.get_mi_id( uci_wrapper, i_index )
                  log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name(): %s" % ( mi_id, uci_wrapper.get_key_pair_name() ) )
                  uci_wrapper.set_mi( i_index, mi_id )
                  # Re-check the state: get_mi_id() records an error on the UCI if no image was found
                  if uci_wrapper.get_uci_state() != uci_states.ERROR:
                      # Start an instance
                      log.debug( "Starting UCI instance '%s'" % uci_wrapper.get_name() )
                      log.debug( "Using following command: conn.run_instances( image_id='%s', key_name='%s', instance_type='%s' )"
                                 % ( mi_id, uci_wrapper.get_key_pair_name(), uci_wrapper.get_instance_type( i_index ) ) )
                      reservation = None
                      try:
                          reservation = conn.run_instances( image_id=mi_id,
                                                            key_name=uci_wrapper.get_key_pair_name(),
                                                            instance_type=uci_wrapper.get_instance_type( i_index ) )
                      except boto.exception.EC2ResponseError, e:
                          err = "EC2 response error when starting UCI '"+ uci_wrapper.get_name() +"': " + str( e )
                          log.error( err )
                          uci_wrapper.set_error( err, True )
                      except Exception, ex:
                          err = "Error when starting UCI '" + uci_wrapper.get_name() + "': " + str( ex )
                          log.error( err )
                          uci_wrapper.set_error( err, True )
                      # Record newly available instance data into local Galaxy database
                      if reservation:
                          # Launch time is taken from the local clock (UTC), not from the reservation
                          l_time = datetime.utcnow()
                          # uci_wrapper.set_instance_launch_time( self.format_time( reservation.instances[0].launch_time ), i_index=i_index )
                          uci_wrapper.set_instance_launch_time( l_time, i_index=i_index )
                          if not uci_wrapper.uci_launch_time_set():
                              uci_wrapper.set_uci_launch_time( l_time )
                          try:
                              # The ID is taken as the text after ':' in the object's string form
                              uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] )
                              # TODO: if more than a single instance will be started through single reservation, change this reference from element [0]
                              i_id = str( reservation.instances[0]).split(":")[1]
                              uci_wrapper.set_instance_id( i_index, i_id )
                              s = reservation.instances[0].state
                              uci_wrapper.change_state( s, i_id, s )
                              vol_id = uci_wrapper.get_store_volume_id( store_id=0 ) # TODO: Once more that one vol/UCI is allowed, update this!
                              uci_wrapper.set_store_status( vol_id, store_status.WAITING )
                              log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_uci_state() ) )
                          except boto.exception.EC2ResponseError, e:
                              err = "EC2 response error when retrieving instance information for UCI '" + uci_wrapper.get_name() + "': " + str( e )
                              log.error( err )
                              uci_wrapper.set_error( err, True )
                  else:
                      log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
          else:
              err = "No instances in state '"+ instance_states.SUBMITTED +"' found for UCI '" + uci_wrapper.get_name() + \
                    "'. Nothing to start."
              log.error( err )
              uci_wrapper.set_error( err, True )
      else:
          log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
  423. def stop_uci( self, uci_wrapper):
  424. """
  425. Stop all cloud instances associated with given UCI.
  426. """
  427. conn = self.get_connection( uci_wrapper )
  428. # Get all instances associated with given UCI
  429. il = uci_wrapper.get_instances_ids() # instance list
  430. # Process list of instances and remove any references to empty instance id's
  431. for i in il:
  432. if i is None:
  433. il.remove( i )
  434. log.debug( 'List of instances being terminated: %s' % il )
  435. rl = conn.get_all_instances( il ) # Reservation list associated with given instances
  436. # Initiate shutdown of all instances under given UCI
  437. cnt = 0
  438. stopped = []
  439. not_stopped = []
  440. for r in rl:
  441. for inst in r.instances:
  442. log.debug( "Sending stop signal to instance '%s' associated with reservation '%s' (UCI: %s)." % ( inst, r, uci_wrapper.get_name() ) )
  443. try:
  444. inst.stop()
  445. uci_wrapper.set_stop_time( datetime.utcnow(), i_id=inst.id )
  446. uci_wrapper.change_state( instance_id=inst.id, i_state=inst.update() )
  447. stopped.append( inst )
  448. except boto.exception.EC2ResponseError, e:
  449. not_stopped.append( inst )
  450. err = "EC2 response error when stopping instance '" + inst.instance_id + "': " + str( e )
  451. log.error( err )
  452. uci_wrapper.set_error( err, True )
  453. uci_wrapper.reset_uci_launch_time()
  454. log.debug( "Termination was initiated for all instances of UCI '%s'." % uci_wrapper.get_name() )
  455. # dbInstances = get_instances( trans, uci ) #TODO: handle list!
  456. #
  457. # # Get actual cloud instance object
  458. # cloudInstance = get_cloud_instance( conn, dbInstances.instance_id )
  459. #
  460. # # TODO: Detach persistent storage volume(s) from instance and update volume data in local database
  461. # stores = get_stores( trans, uci )
  462. # for i, store in enumerate( stores ):
  463. # log.debug( "Detaching volume '%s' to instance '%s'." % ( store.volume_id, dbInstances.instance_id ) )
  464. # mntDevice = store.device
  465. # volStat = None
  466. ## Detaching volume does not work with Eucalyptus Public Cloud, so comment it out
  467. ## try:
  468. ## volStat = conn.detach_volume( store.volume_id, dbInstances.instance_id, mntDevice )
  469. ## except:
  470. ## log.debug ( 'Error detaching volume; still going to try and stop instance %s.' % dbInstances.instance_id )
  471. # store.attach_time = None
  472. # store.device = None
  473. # store.inst.instance_id = None
  474. # store.status = volStat
  475. # log.debug ( '***** volume status: %s' % volStat )
  476. #
  477. # # Stop the instance and update status in local database
  478. # cloudInstance.stop()
  479. # dbInstances.stop_time = datetime.utcnow()
  480. # while cloudInstance.state != 'terminated':
  481. # log.debug( "Stopping instance %s state; current state: %s" % ( str( cloudInstance ).split(":")[1], cloudInstance.state ) )
  482. # time.sleep(3)
  483. # cloudInstance.update()
  484. # dbInstances.state = cloudInstance.state
  485. #
  486. # # Reset relevant UCI fields
  487. # uci.state = 'available'
  488. # uci.launch_time = None
  489. #
  490. # # Persist
  491. # session = trans.sa_session
  492. ## session.save_or_update( stores )
  493. # session.save_or_update( dbInstances ) # TODO: Is this going to work w/ multiple instances stored in dbInstances variable?
  494. # session.save_or_update( uci )
  495. # session.flush()
  496. # trans.log_event( "User stopped cloud instance '%s'" % uci.name )
  497. # trans.set_message( "Galaxy instance '%s' stopped." % uci.name )
  def update( self ):
      """
      Run status update on all instances that are in 'running', 'pending', or 'shutting-down' state.
      Run status update on all storage volumes whose status is 'in-use', 'creating', 'waiting', or 'None'.
      Run status update on all snapshots whose status is 'pending' or 'delete'
      Run status update on any zombie UCIs, i.e., UCI's that is in 'submitted' state for an
      extended period of time.

      Reason behind this method is to sync state of local DB and real-world resources.
      Only resources whose credentials belong to this provider type are processed.
      """
      log.debug( "Running general status update for %s UCIs..." % self.type )
      # Update instances
      instances = self.sa_session.query( model.CloudInstance ) \
          .filter( or_( model.CloudInstance.table.c.state==instance_states.RUNNING,
                        model.CloudInstance.table.c.state==instance_states.PENDING,
                        model.CloudInstance.table.c.state==instance_states.SHUTTING_DOWN ) ) \
          .all()
      for inst in instances:
          if self.type == inst.uci.credentials.provider.type:
              log.debug( "[%s] Running general status update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.instance_id ) )
              self.update_instance( inst )
      # Update storage volume(s)
      stores = self.sa_session.query( model.CloudStore ) \
          .filter( or_( model.CloudStore.table.c.status==store_status.IN_USE,
                        model.CloudStore.table.c.status==store_status.CREATING,
                        model.CloudStore.table.c.status==store_status.WAITING,
                        model.CloudStore.table.c.status==None ) ) \
          .all()
      for store in stores:
          if self.type == store.uci.credentials.provider.type: # and store.volume_id != None:
              log.debug( "[%s] Running general status update on store with local database ID: '%s'" % ( store.uci.credentials.provider.type, store.id ) )
              self.update_store( store )
      # Update pending snapshots or delete ones marked for deletion
      snapshots = self.sa_session.query( model.CloudSnapshot ) \
          .filter( or_( model.CloudSnapshot.table.c.status == snapshot_status.PENDING, model.CloudSnapshot.table.c.status == snapshot_status.DELETE ) ) \
          .all()
      for snapshot in snapshots:
          if self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.PENDING:
              log.debug( "[%s] Running general status update on snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
              self.update_snapshot( snapshot )
          elif self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.DELETE:
              log.debug( "[%s] Initiating deletion of snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
              self.delete_snapshot( snapshot )
      # Attempt at updating any zombie UCIs (i.e., instances that have been in SUBMITTED state for longer than expected - see below for exact time)
      zombies = self.sa_session.query( model.UCI ).filter_by( state=uci_states.SUBMITTED ).all()
      for zombie in zombies:
          log.debug( "zombie UCI: %s" % zombie.name )
          # NOTE(review): this query is not restricted to the instances of the
          # current zombie UCI - confirm whether a uci_id filter is intended.
          z_instances = self.sa_session.query( model.CloudInstance ) \
              .filter( or_( model.CloudInstance.table.c.state != instance_states.TERMINATED,
                            model.CloudInstance.table.c.state == None ) ) \
              .all()
          for z_inst in z_instances:
              if self.type == z_inst.uci.credentials.provider.type:
                  td = datetime.utcnow() - z_inst.update_time
                  # timedelta.seconds holds only the sub-day remainder, so whole
                  # days must be added or the age wraps around every 24 hours.
                  elapsed = td.days * 86400 + td.seconds
                  if elapsed > 180: # if instance has not been updated for more than 3 minutes
                      log.debug( "[%s](td=%s) Running zombie repair update on instance with DB id '%s'" % ( z_inst.uci.credentials.provider.type, elapsed, z_inst.id ) )
                      self.process_zombie( z_inst )
  def update_instance( self, inst ):
      """
      Update information in local database for given instance as it is obtained from cloud provider.
      Along with updating information about given instance, information about the UCI controlling
      this instance is also updated.

      Returns None in all cases. If the cloud provider responds with an error, the
      error is stored on the UCI, the UCI is put in 'error' state and the method
      returns early.
      """
      # Get credentials associated wit this instance
      uci_id = inst.uci_id
      uci = self.sa_session.query( model.UCI ).get( uci_id )
      self.sa_session.refresh( uci )
      conn = self.get_connection_from_uci( uci )
      # Get reservations handle for given instance
      try:
          rl= conn.get_all_instances( [inst.instance_id] )
      except boto.exception.EC2ResponseError, e:
          err = "Retrieving instance(s) from cloud failed for UCI '"+ uci.name +"' during general status update: " + str( e )
          log.error( err )
          uci.error = err
          uci.state = uci_states.ERROR
          self.sa_session.add( uci )
          self.sa_session.flush()
          return None
      # Because references to reservations are deleted shortly after instances have been terminated, getting an empty list as a response to a query
      # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until an alternative solution
      # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened.
      if len( rl ) == 0:
          err = "Instance ID '"+inst.instance_id+"' was not found by the cloud provider. Instance might have crashed or otherwise been terminated."+ \
              "Manual check is recommended."
          log.error( err )
          inst.error = err
          uci.error = err
          inst.state = instance_states.TERMINATED
          uci.state = uci_states.ERROR
          uci.launch_time = None
          self.sa_session.add( inst )
          self.sa_session.add( uci )
          self.sa_session.flush()
      # Update instance status in local DB with info from cloud provider
      # (with an empty reservation list the loop below does nothing)
      for r in rl:
          for i, cInst in enumerate( r.instances ):
              try:
                  # update() returns the state currently reported by the cloud
                  s = cInst.update()
                  log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) )
                  if s != inst.state:
                      inst.state = s
                      self.sa_session.add( inst )
                      self.sa_session.flush()
                      # After instance has shut down, ensure UCI is marked as 'available'
                      if s == instance_states.TERMINATED and uci.state != uci_states.ERROR:
                          uci.state = uci_states.AVAILABLE
                          uci.launch_time = None
                          self.sa_session.add( uci )
                          self.sa_session.flush()
                  # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed.
                  if s != uci.state and s != instance_states.TERMINATED:
                      uci.state = s
                      self.sa_session.add( uci )
                      self.sa_session.flush()
                  # Keep locally stored DNS names in sync with those reported by the cloud
                  if cInst.public_dns_name != inst.public_dns:
                      inst.public_dns = cInst.public_dns_name
                      self.sa_session.add( inst )
                      self.sa_session.flush()
                  if cInst.private_dns_name != inst.private_dns:
                      inst.private_dns = cInst.private_dns_name
                      self.sa_session.add( inst )
                      self.sa_session.flush()
              except boto.exception.EC2ResponseError, e:
                  err = "Updating instance status from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
                  log.error( err )
                  uci.error = err
                  uci.state = uci_states.ERROR
                  self.sa_session.add( uci )
                  self.sa_session.flush()
                  return None
  def update_store( self, store ):
      """
      Update information in local database for given storage volume as it is obtained from cloud provider.
      Along with updating information about given storage volume, information about the UCI controlling
      this storage volume is also updated.

      The UCI is looked up through ``store.uci_id``. Failures are not raised to the caller for the cases
      handled here; instead they are logged and recorded on the UCI (and, where the volume itself is the
      problem, on the store) by setting ``error`` and switching the state/status to the error value.

      :param store: storage volume record whose ``uci_id``, ``volume_id``, ``status``, ``availability_zone``,
                    ``inst``, ``attach_time`` and ``device`` attributes are read and/or updated.
      :returns: None in every case.
      """
      # Get the UCI (and through it the credentials) associated with this store
      uci = self.sa_session.query( model.UCI ).get( store.uci_id )
      self.sa_session.refresh( uci )
      conn = self.get_connection_from_uci( uci )

      def record_error( err, flag_store=False ):
          # Persist `err` on the UCI - and on the store too when `flag_store` is set - and move them to the error state
          if flag_store:
              store.status = store_status.ERROR
              store.error = err
          uci.error = err
          uci.state = uci_states.ERROR
          self.sa_session.add( uci )
          if flag_store:
              self.sa_session.add( store )
          self.sa_session.flush()

      if store.volume_id is None:
          err = "Missing storage volume ID in local database on general update. Manual check is needed to check " \
                "if storage volume was actually created by cloud provider."
          log.error( "%s (for UCI '%s')" % ( err, uci.name ) )
          record_error( err, flag_store=True )
          return None

      if conn is None:
          # get_connection_from_uci returns None only after it has logged the failure and put the UCI
          # into the error state, so there is nothing left to do without a connection.
          return None

      # Get volume handle for given store
      try:
          log.debug( "Updating storage volume command: vl = conn.get_all_volumes( [%s] )" % store.volume_id )
          vl = conn.get_all_volumes( [store.volume_id] )
      except boto.exception.EC2ResponseError as e:
          err = "Retrieving volume(s) from cloud failed for UCI '%s' during general status update: %s" % ( uci.name, e )
          log.error( err )
          record_error( err )
          return None

      if len( vl ) == 0:
          err = "No storage volumes returned by cloud provider on general update"
          log.error( "%s for UCI '%s'" % ( err, uci.name ) )
          record_error( err, flag_store=True )
          return None

      # Update store status in local DB with info from cloud provider
      volume = vl[0]
      try:
          log.debug( "Storage volume '%s' current status: '%s'" % ( store.volume_id, volume.status ) )
          if store.status != volume.status:
              if store.status is None:
                  # In case something failed during creation of UCI but actual storage volume was created and yet
                  # UCI state remained as 'new', try to remedy this by updating UCI state here
                  uci.state = volume.status
                  self.sa_session.add( uci )
                  self.sa_session.flush()
              elif uci.state == uci_states.CREATING:
                  # Because Eucalyptus Public Cloud (EPC) deletes volumes immediately after they are created, artificially
                  # set status of given UCI to 'available' based on storage volume's availability zone (i.e., it's residing
                  # in EPC as opposed to some other Eucalyptus based cloud that allows creation of storage volumes).
                  if store.availability_zone == 'epc':
                      uci.state = uci_states.AVAILABLE
                  else:
                      uci.state = volume.status
                  self.sa_session.add( uci )
                  self.sa_session.flush()
              store.status = volume.status
              self.sa_session.add( store )
              self.sa_session.flush()
          # Mirror the attachment details reported by the cloud provider
          if store.inst is not None and store.inst.instance_id != volume.instance_id:
              store.inst.instance_id = volume.instance_id
              self.sa_session.add( store )
              self.sa_session.flush()
          if store.attach_time != volume.attach_time:
              store.attach_time = volume.attach_time
              self.sa_session.add( store )
              self.sa_session.flush()
          if store.device != volume.device:
              store.device = volume.device
              self.sa_session.add( store )
              self.sa_session.flush()
      except boto.exception.EC2ResponseError as e:
          err = "Updating status of volume(s) from cloud failed for UCI '%s' during general status update: %s" % ( uci.name, e )
          log.error( err )
          record_error( err )
          return None
  def update_snapshot( self, snapshot ):
      """
      Update information in local database for given snapshot as it is obtained from cloud provider.
      Along with updating information about given snapshot, information about the UCI controlling
      this snapshot is also updated.

      On success only ``snapshot.status`` is refreshed. If the provider returns no snapshot, or any
      exception is raised while querying/updating, the error text is stored on both the snapshot and
      its UCI and both are switched to their error status/state.

      :param snapshot: snapshot record; ``uci_id`` and ``snapshot_id`` are read, ``status``/``error`` written.
      :returns: None
      """
      # Get the UCI (and through it the credentials) associated with this snapshot
      uci = self.sa_session.query( model.UCI ).get( snapshot.uci_id )
      self.sa_session.refresh( uci )
      conn = self.get_connection_from_uci( uci )

      def mark_failed( message ):
          # Store `message` on snapshot and UCI and flag both as being in error
          snapshot.status = snapshot_status.ERROR
          snapshot.error = message
          uci.error = message
          uci.state = uci_states.ERROR
          self.sa_session.add( uci )
          self.sa_session.add( snapshot )
          self.sa_session.flush()

      try:
          log.debug( "Updating status of snapshot '%s'" % snapshot.snapshot_id )
          found = conn.get_all_snapshots( [snapshot.snapshot_id] )
          if len( found ) == 0:
              err = "No snapshots returned by EC2 on general update"
              log.error( "%s for UCI '%s'" % ( err, uci.name ) )
              mark_failed( err )
          else:
              current = found[0].status
              log.debug( "Snapshot '%s' status: %s" % ( snapshot.snapshot_id, current ) )
              snapshot.status = current
              self.sa_session.add( snapshot )
              self.sa_session.flush()
      except boto.exception.EC2ResponseError as e:
          err = "EC2 response error while updating snapshot status: " + str( e )
          log.error( err )
          mark_failed( err )
      except Exception as ex:
          err = "Error while updating snapshot status: " + str( ex )
          log.error( err )
          mark_failed( err )
  def delete_snapshot( self, snapshot ):
      """
      Initiate deletion of given snapshot from cloud provider.

      Deletion is only attempted when ``snapshot.status`` equals ``snapshot_status.DELETE``; otherwise an
      explanatory message is logged and stored in ``snapshot.error``. When the provider reports success the
      snapshot is flagged as deleted locally. Any exception raised during the attempt is logged and recorded
      on both the snapshot and its UCI, which are switched to their error status/state.

      :param snapshot: snapshot record; ``status``, ``snapshot_id`` and ``uci_id`` are read.
      :returns: the value returned by ``conn.delete_snapshot`` when the call completed, None otherwise.
      """
      if snapshot.status != snapshot_status.DELETE:
          # NOTE(review): the gate above checks for the DELETE status while the message names the COMPLETED
          # status - presumably only completed snapshots can be marked for deletion; confirm with callers.
          # '%' formatting is used so a missing (None) snapshot ID or status cannot break error reporting.
          err = "Cannot delete snapshot '%s' because its status is '%s'. Only snapshots with '%s' status can be deleted." \
                % ( snapshot.snapshot_id, snapshot.status, snapshot_status.COMPLETED )
          log.error( err )
          snapshot.error = err
          self.sa_session.add( snapshot )
          self.sa_session.flush()
          return None

      # Get the UCI (and through it the credentials) associated with this snapshot
      uci = self.sa_session.query( model.UCI ).get( snapshot.uci_id )
      self.sa_session.refresh( uci )
      conn = self.get_connection_from_uci( uci )
      try:
          log.debug( "Deleting snapshot '%s'" % snapshot.snapshot_id )
          snap = conn.delete_snapshot( snapshot.snapshot_id )
          if snap == True:
              snapshot.deleted = True
              snapshot.status = snapshot_status.DELETED
              self.sa_session.add( snapshot )
              self.sa_session.flush()
          return snap
      except boto.exception.EC2ResponseError as e:
          err = "EC2 response error while deleting snapshot: " + str( e )
      except Exception as ex:
          err = "Error while deleting snapshot: " + str( ex )

      # Only reached when the deletion attempt raised: record the failure on snapshot and UCI
      log.error( err )
      snapshot.status = snapshot_status.ERROR
      snapshot.error = err
      uci.error = err
      uci.state = uci_states.ERROR
      self.sa_session.add( uci )
      self.sa_session.add( snapshot )
      self.sa_session.flush()
  def process_zombie( self, inst ):
      """
      Attempt at discovering if starting a cloud instance was successful but local database was not updated
      accordingly or if something else failed and instance was never started. Currently, no automatic
      repairs are being attempted beyond a best-effort refresh of the local record; otherwise appropriate
      error messages are set.

      Three cases are distinguished:
        - no launch time, reservation ID or instance ID was ever stored: instance and UCI are marked as
          being in error with a hint that starting again should be safe;
        - some of that data exists but the instance ID is missing: instance and UCI are marked as being
          in error with a hint that a manual check is needed;
        - the instance ID is known: reservation ID, state and launch time are re-read from the cloud
          provider. Each recovery step is independent and a failing step is logged and skipped.

      :param inst: cloud instance record to examine.
      :returns: None
      """
      uci = self.sa_session.query( model.UCI ).get( inst.uci_id )
      self.sa_session.refresh( uci )

      # Check if any instance-specific information was written to local DB
      if inst.launch_time is None and inst.reservation_id is None and inst.instance_id is None:
          # Instance most likely never got processed, so set error message suggesting user to try starting instance again.
          err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \
                "' seems to have failed. Because it appears that cloud instance never got started, it should be safe to reset state and try " \
                "starting the instance again."
          inst.error = err
          inst.state = instance_states.ERROR
          uci.error = err
          uci.state = uci_states.ERROR
          log.error( err )
          self.sa_session.add( inst )
          self.sa_session.add( uci )
          self.sa_session.flush()
          # uw = UCIwrapper( inst.uci )
          # log.debug( "Try automatically re-submitting UCI '%s'." % uw.get_name() )
          return None

      # Recovery is based on hope that instance_id is available in local DB; if not, report as error.
      if inst.instance_id is None:
          err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \
                "' seems to have failed. Because it appears that cloud instance might have gotten started, manual check is recommended."
          inst.error = err
          inst.state = instance_states.ERROR
          inst.uci.error = err
          inst.uci.state = uci_states.ERROR
          log.error( err )
          self.sa_session.add( inst )
          self.sa_session.add( uci )
          self.sa_session.flush()
          return None

      # Try to recover state - this is best-case effort, so if something does not work immediately, no
      # further recovery steps are attempted for that field.
      # Fields attempting to be recovered are: reservation_id, instance status, and launch_time
      conn = self.get_connection_from_uci( uci )
      if conn is None:
          # get_connection_from_uci returns None only after it has logged the failure and put the UCI
          # into the error state, so nothing can be recovered without a connection.
          return None
      rl = conn.get_all_instances( [inst.instance_id] ) # reservation list

      # Update local DB with relevant data from instance
      if inst.reservation_id is None:
          try:
              # Takes the part after the first ':' of the reservation's string form
              inst.reservation_id = str(rl[0]).split(":")[1]
          except Exception as e: # something failed, so skip - but leave a trace
              log.debug( "Could not recover reservation ID for instance '%s': %s" % ( inst.instance_id, e ) )

      try:
          state = rl[0].instances[0].update()
          inst.state = state
          uci.state = state
          self.sa_session.add( inst )
          self.sa_session.add( uci )
          self.sa_session.flush()
      except Exception as e: # something failed, so skip - but leave a trace
          log.debug( "Could not recover state for instance '%s': %s" % ( inst.instance_id, e ) )

      if inst.launch_time is None:
          try:
              launch_time = self.format_time( rl[0].instances[0].launch_time )
              inst.launch_time = launch_time
              self.sa_session.add( inst )
              self.sa_session.flush()
              if inst.uci.launch_time is None:
                  uci.launch_time = launch_time
                  self.sa_session.add( uci )
                  self.sa_session.flush()
          except Exception as e: # something failed, so skip - but leave a trace
              log.debug( "Could not recover launch time for instance '%s': %s" % ( inst.instance_id, e ) )
  def get_connection_from_uci( self, uci ):
      """
      Establish and return connection to cloud provider. Information needed to do so (access/secret key
      plus the provider's region name, region endpoint, port, path and is_secure flag) is obtained
      directly from uci database object.

      :param uci: UCI database object whose ``credentials`` describe the account and provider.
      :returns: an ``EC2Connection``, or None if boto raised ``EC2ResponseError``; in the latter case the
                error is logged, stored in ``uci.error`` and the UCI is switched to the error state.
      """
      log.debug( 'Establishing %s cloud connection' % self.type )
      credentials = uci.credentials
      a_key = credentials.access_key
      s_key = credentials.secret_key
      # NOTE: the connection parameters (which include the secret key) are not written to the log
      try:
          provider = credentials.provider
          region = RegionInfo( None, provider.region_name, provider.region_endpoint )
          return EC2Connection( aws_access_key_id=a_key,
                                aws_secret_access_key=s_key,
                                is_secure=provider.is_secure,
                                port=provider.port,
                                region=region,
                                path=provider.path )
      except boto.exception.EC2ResponseError as e:
          err = "Establishing connection with cloud failed: " + str( e )
          log.error( err )
          uci.error = err
          uci.state = uci_states.ERROR
          self.sa_session.add( uci )
          self.sa_session.flush()
          return None
  920. # def updateUCI( self, uci ):
  921. # """
  922. # Runs a global status update on all storage volumes and all instances that are
  923. # associated with specified UCI
  924. # """
  925. # conn = self.get_connection( uci )
  926. #
  927. # # Update status of storage volumes
  928. # vl = model.CloudStore.filter( model.CloudInstance.table.c.uci_id == uci.id ).all()
  929. # vols = []
  930. # for v in vl:
  931. # vols.append( v.volume_id )
  932. # try:
  933. # volumes = conn.get_all_volumes( vols )
  934. # for i, v in enumerate( volumes ):
  935. # uci.store[i].inst.instance_id = v.instance_id
  936. # uci.store[i].status = v.status
  937. # uci.store[i].device = v.device
  938. # uci.store[i].flush()
  939. # except:
  940. # log.debug( "Error updating status of volume(s) associated with UCI '%s'. Status was not updated." % uci.name )
  941. # pass
  942. #
  943. # # Update status of instances
  944. # il = model.CloudInstance.filter_by( uci_id=uci.id ).filter( model.CloudInstance.table.c.state != 'terminated' ).all()
  945. # instanceList = []
  946. # for i in il:
  947. # instanceList.append( i.instance_id )
  948. # log.debug( 'instanceList: %s' % instanceList )
  949. # try:
  950. # reservations = conn.get_all_instances( instanceList )
  951. # for i, r in enumerate( reservations ):
  952. # uci.instance[i].state = r.instances[0].update()
  953. # log.debug('updating instance %s; status: %s' % ( uci.instance[i].instance_id, uci.instance[i].state ) )
  954. # uci.state = uci.instance[i].state
  955. # uci.instance[i].public_dns = r.instances[0].dns_name
  956. # uci.instance[i].private_dns = r.instances[0].private_dns_name
  957. # uci.instance[i].flush()
  958. # uci.flush()
  959. # except:
  960. # log.debug( "Error updating status of instances associated with UCI '%s'. Instance status was not updated." % uci.name )
  961. # pass
  962. # --------- Helper methods ------------
  def format_time( self, time ):
      """
      Return `time` with every 'T' replaced by a space and every 'Z' removed,
      e.g. '2009-10-01T12:34:56.000Z' becomes '2009-10-01 12:34:56.000'.

      :param time: timestamp text (anything offering ``str.replace`` semantics).
      :returns: the reformatted text.
      """
      # The two substitutions are independent, so chaining them gives the same result in any order
      return time.replace( 'T', ' ' ).replace( 'Z', '' )