PageRenderTime 66ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/galaxy/cloud/providers/eucalyptus.py

https://bitbucket.org/ajish/galaxy-omelogic
Python | 1025 lines | 921 code | 27 blank | 77 comment | 74 complexity | a8937e5a91375e2fb3bedaabda6d817b MD5 | raw file
  1. import subprocess, threading, os, errno, time, datetime
  2. from Queue import Queue, Empty
  3. from datetime import datetime
  4. from galaxy import model # Database interaction class
  5. from galaxy.model import mapping
  6. from galaxy.datatypes.data import nice_size
  7. from galaxy.util.bunch import Bunch
  8. from galaxy.cloud import UCIwrapper
  9. from Queue import Queue
  10. from sqlalchemy import or_, and_
  11. import galaxy.eggs
  12. galaxy.eggs.require("boto")
  13. from boto.ec2.connection import EC2Connection
  14. from boto.ec2.regioninfo import RegionInfo
  15. import boto.exception
  16. import boto
  17. import logging
  18. log = logging.getLogger( __name__ )
  19. uci_states = Bunch(
  20. NEW_UCI = "newUCI",
  21. NEW = "new",
  22. CREATING = "creating",
  23. DELETING_UCI = "deletingUCI",
  24. DELETING = "deleting",
  25. SUBMITTED_UCI = "submittedUCI",
  26. SUBMITTED = "submitted",
  27. SHUTTING_DOWN_UCI = "shutting-downUCI",
  28. SHUTTING_DOWN = "shutting-down",
  29. AVAILABLE = "available",
  30. RUNNING = "running",
  31. PENDING = "pending",
  32. ERROR = "error",
  33. DELETED = "deleted",
  34. SNAPSHOT_UCI = "snapshotUCI",
  35. SNAPSHOT = "snapshot"
  36. )
  37. instance_states = Bunch(
  38. TERMINATED = "terminated",
  39. SUBMITTED = "submitted",
  40. RUNNING = "running",
  41. PENDING = "pending",
  42. SHUTTING_DOWN = "shutting-down",
  43. ERROR = "error"
  44. )
  45. store_status = Bunch(
  46. WAITING = "waiting",
  47. IN_USE = "in-use",
  48. CREATING = "creating",
  49. DELETED = 'deleted',
  50. ERROR = "error"
  51. )
  52. snapshot_status = Bunch(
  53. SUBMITTED = 'submitted',
  54. PENDING = 'pending',
  55. COMPLETED = 'completed',
  56. DELETE = 'delete',
  57. DELETED= 'deleted',
  58. ERROR = "error"
  59. )
  60. class EucalyptusCloudProvider( object ):
  61. """
  62. Eucalyptus-based cloud provider implementation for managing instances.
  63. """
  64. STOP_SIGNAL = object()
  65. def __init__( self, app ):
  66. self.type = "eucalyptus" # cloud provider type (e.g., ec2, eucalyptus, opennebula)
  67. self.zone = "epc"
  68. self.queue = Queue()
  69. self.sa_session = app.model.context
  70. self.threads = []
  71. nworkers = 5
  72. log.info( "Starting eucalyptus cloud controller workers..." )
  73. for i in range( nworkers ):
  74. worker = threading.Thread( target=self.run_next )
  75. worker.start()
  76. self.threads.append( worker )
  77. log.debug( "%d eucalyptus cloud workers ready", nworkers )
  78. def shutdown( self ):
  79. """Attempts to gracefully shut down the monitor thread"""
  80. log.info( "sending stop signal to worker threads in eucalyptus cloud manager" )
  81. for i in range( len( self.threads ) ):
  82. self.queue.put( self.STOP_SIGNAL )
  83. log.info( "eucalyptus cloud manager stopped" )
  84. def put( self, uci_wrapper ):
  85. """
  86. Add uci_wrapper object to the end of the request queue to be handled by
  87. this cloud provider.
  88. """
  89. state = uci_wrapper.get_uci_state()
  90. uci_wrapper.change_state( state.split('U')[0] ) # remove 'UCI' from end of state description (i.e., mark as accepted and ready for processing)
  91. self.queue.put( uci_wrapper )
  92. def run_next( self ):
  93. """Process next request, waiting until one is available if necessary."""
  94. cnt = 0
  95. while 1:
  96. uci_wrapper = self.queue.get()
  97. uci_state = uci_wrapper.get_uci_state()
  98. if uci_state is self.STOP_SIGNAL:
  99. return
  100. try:
  101. if uci_state==uci_states.NEW:
  102. self.create_uci( uci_wrapper )
  103. elif uci_state==uci_states.DELETING:
  104. self.delete_uci( uci_wrapper )
  105. elif uci_state==uci_states.SUBMITTED:
  106. self.start_uci( uci_wrapper )
  107. #self.dummy_start_uci( uci_wrapper )
  108. elif uci_state==uci_states.SHUTTING_DOWN:
  109. self.stop_uci( uci_wrapper )
  110. elif uci_state==uci_states.SNAPSHOT:
  111. self.snapshot_uci( uci_wrapper )
  112. except:
  113. log.exception( "Uncaught exception executing cloud request." )
  114. cnt += 1
  115. def get_connection( self, uci_wrapper ):
  116. """
  117. Establishes cloud connection using user's credentials associated with given UCI
  118. """
  119. log.debug( 'Establishing %s cloud connection.' % self.type )
  120. provider = uci_wrapper.get_provider()
  121. try:
  122. region = RegionInfo( None, provider.region_name, provider.region_endpoint )
  123. except Exception, ex:
  124. err = "Selecting region with cloud provider failed: " + str( ex )
  125. log.error( err )
  126. uci_wrapper.set_error( err, True )
  127. return None
  128. try:
  129. conn = EC2Connection( aws_access_key_id=uci_wrapper.get_access_key(),
  130. aws_secret_access_key=uci_wrapper.get_secret_key(),
  131. is_secure=provider.is_secure,
  132. port=provider.port,
  133. region=region,
  134. path=provider.path )
  135. except boto.exception.EC2ResponseError, e:
  136. err = "Establishing connection with cloud failed: " + str( e )
  137. log.error( err )
  138. uci_wrapper.set_error( err, True )
  139. return None
  140. return conn
  141. def check_key_pair( self, uci_wrapper, conn ):
  142. """
  143. Check if a key pair associated with this UCI exists on cloud provider.
  144. If yes, return key pair name; otherwise, generate a key pair with the cloud
  145. provider and, again, return key pair name.
  146. Key pair name for given UCI is generated from UCI's name and suffix '_kp'
  147. """
  148. kp = None
  149. kp_name = uci_wrapper.get_name().replace(' ','_') + "_kp"
  150. log.debug( "Checking user's key pair: '%s'" % kp_name )
  151. try:
  152. kp = conn.get_key_pair( kp_name )
  153. uci_kp_name = uci_wrapper.get_key_pair_name()
  154. uci_material = uci_wrapper.get_key_pair_material()
  155. if kp != None:
  156. if kp.name != uci_kp_name or uci_material == None:
  157. # key pair exists on the cloud but not in local database, so re-generate it (i.e., delete and then create)
  158. try:
  159. conn.delete_key_pair( kp_name )
  160. kp = self.create_key_pair( conn, kp_name )
  161. uci_wrapper.set_key_pair( kp.name, kp.material )
  162. except boto.exception.EC2ResponseError, e:
  163. err = "EC2 response error while deleting key pair: " + str( e )
  164. log.error( err )
  165. uci_wrapper.set_error( err, True )
  166. else:
  167. try:
  168. kp = self.create_key_pair( conn, kp_name )
  169. uci_wrapper.set_key_pair( kp.name, kp.material )
  170. except boto.exception.EC2ResponseError, e:
  171. err = "EC2 response error while creating key pair: " + str( e )
  172. log.error( err )
  173. uci_wrapper.set_error( err, True )
  174. except Exception, ex:
  175. err = "Exception while creating key pair: " + str( ex )
  176. log.error( err )
  177. uci_wrapper.set_error( err, True )
  178. except boto.exception.EC2ResponseError, e: # No keypair under this name exists so create it
  179. if e.code == 'InvalidKeyPair.NotFound':
  180. log.info( "No keypair found, creating keypair '%s'" % kp_name )
  181. kp = self.create_key_pair( conn, kp_name )
  182. uci_wrapper.set_key_pair( kp.name, kp.material )
  183. else:
  184. err = "EC2 response error while retrieving key pair: " + str( e )
  185. log.error( err )
  186. uci_wrapper.set_error( err, True )
  187. if kp != None:
  188. return kp.name
  189. else:
  190. return None
  191. def create_key_pair( self, conn, kp_name ):
  192. """ Initiate creation of key pair under kp_name by current cloud provider. """
  193. try:
  194. return conn.create_key_pair( kp_name )
  195. except boto.exception.EC2ResponseError, e:
  196. return None
  197. def get_mi_id( self, uci_wrapper, i_index ):
  198. """
  199. Get appropriate machine image (mi) ID based on instance type.
  200. """
  201. i_type = uci_wrapper.get_instance_type( i_index )
  202. if i_type=='m1.small' or i_type=='c1.medium':
  203. arch = 'i386'
  204. else:
  205. arch = 'x86_64'
  206. mi = self.sa_session.query( model.CloudImage ).filter_by( deleted=False, provider_type=self.type, architecture=arch ).first()
  207. if mi:
  208. return mi.image_id
  209. else:
  210. err = "Machine image could not be retrieved"
  211. log.error( "%s for UCI '%s'." % (err, uci_wrapper.get_name() ) )
  212. uci_wrapper.set_error( err+". Contact site administrator to ensure needed machine image is registered.", True )
  213. return None
  214. def create_uci( self, uci_wrapper ):
  215. """
  216. Create User Configured Instance (UCI) - i.e., create storage volume on cloud provider
  217. and register relevant information in local Galaxy database.
  218. """
  219. conn = self.get_connection( uci_wrapper )
  220. # Because only 1 storage volume may be created at UCI config time, index of this storage volume in local Galaxy DB w.r.t
  221. # current UCI is 0; therefore, it can be referenced in following code
  222. log.info( "Creating volume in zone '%s'..." % uci_wrapper.get_uci_availability_zone() )
  223. if uci_wrapper.get_uci_availability_zone()=='':
  224. log.info( "Availability zone for UCI (i.e., storage volume) was not selected, using default zone: %s" % self.zone )
  225. uci_wrapper.set_store_availability_zone( self.zone )
  226. log.debug( "Creating volume; using command: conn.create_volume( %s, '%s', snapshot=None )" % ( uci_wrapper.get_store_size( 0 ), uci_wrapper.get_uci_availability_zone() ))
  227. vol = conn.create_volume( uci_wrapper.get_store_size( 0 ), uci_wrapper.get_uci_availability_zone(), snapshot=None )
  228. uci_wrapper.set_store_volume_id( 0, vol.id )
  229. # Retrieve created volume again to get updated status
  230. try:
  231. vl = conn.get_all_volumes( [vol.id] )
  232. except boto.exception.EC2ResponseError, e:
  233. err = "EC2 response error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( e )
  234. log.error( err )
  235. uci_wrapper.set_store_status( vol.id, uci_states.ERROR )
  236. uci_wrapper.set_error( err, True )
  237. return
  238. except Exception, ex:
  239. err = "Error while retrieving (i.e., updating status) of just created storage volume '" + vol.id + "': " + str( ex )
  240. log.error( err )
  241. uci_wrapper.set_error( err, True )
  242. return
  243. if len( vl ) > 0:
  244. # EPC does not allow creation of storage volumes (it deletes one as soon as it is created, so manually set uci_state here)
  245. if vl[0].status == store_status.DELETED:
  246. uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )
  247. else:
  248. uci_wrapper.change_state( uci_state=vl[0].status )
  249. uci_wrapper.set_store_status( vol.id, vl[0].status )
  250. else:
  251. err = "Volume '" + vol.id +"' not found by EC2 after being created."
  252. log.error( err )
  253. uci_wrapper.set_store_status( vol.id, uci_states.ERROR )
  254. uci_wrapper.set_error( err, True )
  255. def delete_uci( self, uci_wrapper ):
  256. """
  257. Delete UCI - i.e., delete all storage volumes associated with this UCI.
  258. NOTE that this implies deletion of any and all data associated
  259. with this UCI from the cloud. All data will be deleted.
  260. Information in local Galaxy database is marked as deleted but not actually removed
  261. from the database.
  262. """
  263. conn = self.get_connection( uci_wrapper )
  264. vl = [] # volume list
  265. count = 0 # counter for checking if all volumes assoc. w/ UCI were deleted
  266. # Get all volumes assoc. w/ UCI, delete them from cloud as well as in local DB
  267. vl = uci_wrapper.get_all_stores()
  268. deletedList = []
  269. failedList = []
  270. for v in vl:
  271. log.debug( "Deleting volume with id='%s'" % v.volume_id )
  272. try:
  273. if conn.delete_volume( v.volume_id ):
  274. deletedList.append( v.volume_id )
  275. v.deleted = True
  276. self.sa_session.add( v )
  277. self.sa_session.flush()
  278. count += 1
  279. else:
  280. failedList.append( v.volume_id )
  281. except boto.exception.EC2ResponseError, e:
  282. err = "EC2 response error while deleting storage volume '" + v.volume_id + "': " + str( e )
  283. log.error( err )
  284. uci_wrapper.set_store_error( err, store_id = v.volume_id )
  285. uci_wrapper.set_error( err, True )
  286. # Delete UCI if all of associated
  287. if count == len( vl ):
  288. uci_wrapper.set_deleted()
  289. else:
  290. err = "Deleting following volume(s) failed: "+ str( failedList )+". However, these volumes were successfully deleted: " \
  291. + str( deletedList ) +". MANUAL intervention and processing needed."
  292. log.error( err )
  293. uci_wrapper.set_error( err, True )
  294. def snapshot_uci( self, uci_wrapper ):
  295. """
  296. Initiate creation of a snapshot by cloud provider for all storage volumes
  297. associated with this UCI.
  298. """
  299. if uci_wrapper.get_uci_state() != uci_states.ERROR:
  300. conn = self.get_connection( uci_wrapper )
  301. snapshots = uci_wrapper.get_snapshots( status = snapshot_status.SUBMITTED )
  302. for snapshot in snapshots:
  303. log.debug( "Snapshot DB id: '%s', volume id: '%s'" % ( snapshot.id, snapshot.store.volume_id ) )
  304. try:
  305. snap = conn.create_snapshot( volume_id=snapshot.store.volume_id )
  306. snap_id = str( snap ).split(':')[1]
  307. uci_wrapper.set_snapshot_id( snapshot.id, snap_id )
  308. sh = conn.get_all_snapshots( snap_id ) # get updated status
  309. uci_wrapper.set_snapshot_status( status=sh[0].status, snap_id=snap_id )
  310. except boto.exception.EC2ResponseError, e:
  311. err = "Cloud provider response error while creating snapshot: " + str( e )
  312. log.error( err )
  313. uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
  314. uci_wrapper.set_error( err, True )
  315. return
  316. except Exception, ex:
  317. err = "Error while creating snapshot: " + str( ex )
  318. log.error( err )
  319. uci_wrapper.set_snapshot_error( error=err, snap_index=snapshot.id, set_status=True )
  320. uci_wrapper.set_error( err, True )
  321. return
  322. uci_wrapper.change_state( uci_state=uci_states.AVAILABLE )
  323. # if uci_wrapper.get_uci_state() != uci_states.ERROR:
  324. #
  325. # snapshots = uci_wrapper.get_snapshots( status = 'submitted' )
  326. # for snapshot in snapshots:
  327. # uci_wrapper.set_snapshot_id( snapshot.id, None, 'euca_error' )
  328. #
  329. # log.debug( "Eucalyptus snapshot attempted by user for UCI '%s'" % uci_wrapper.get_name() )
  330. # uci_wrapper.set_error( "Eucalyptus does not support creation of snapshots at this moment. No snapshot or other changes were performed. \
  331. # Feel free to resent state of this instance and use it normally.", True )
  332. def add_storage_to_uci( self, uci_wrapper ):
  333. """ Adds more storage to specified UCI """
  334. def dummy_start_uci( self, uci_wrapper ):
  335. uci = uci_wrapper.get_uci()
  336. log.debug( "Would be starting instance '%s'" % uci.name )
  337. # uci_wrapper.change_state( uci_states.SUBMITTED_UCI )
  338. # log.debug( "Set UCI state to SUBMITTED_UCI" )
  339. log.debug( "Sleeping a bit... (%s)" % uci.name )
  340. time.sleep(10)
  341. log.debug( "Woke up! (%s)" % uci.name )
  342. def start_uci( self, uci_wrapper ):
  343. """
  344. Start instance(s) of given UCI on the cloud.
  345. """
  346. if uci_wrapper.get_uci_state() != uci_states.ERROR:
  347. conn = self.get_connection( uci_wrapper )
  348. self.check_key_pair( uci_wrapper, conn )
  349. if uci_wrapper.get_key_pair_name() == None:
  350. err = "Key pair not found"
  351. log.error( "%s for UCI '%s'." % ( err, uci_wrapper.get_name() ) )
  352. uci_wrapper.set_error( err + ". Try resetting the state and starting the instance again.", True )
  353. return
  354. i_indexes = uci_wrapper.get_instances_indexes( state=instance_states.SUBMITTED ) # Get indexes of i_indexes associated with this UCI that are in 'submitted' state
  355. log.debug( "Starting instances with IDs: '%s' associated with UCI '%s' " % ( i_indexes, uci_wrapper.get_name(), ) )
  356. if len( i_indexes ) > 0:
  357. for i_index in i_indexes:
  358. # Get machine image for current instance
  359. mi_id = self.get_mi_id( uci_wrapper, i_index )
  360. log.debug( "mi_id: %s, uci_wrapper.get_key_pair_name(): %s" % ( mi_id, uci_wrapper.get_key_pair_name() ) )
  361. uci_wrapper.set_mi( i_index, mi_id )
  362. if uci_wrapper.get_uci_state() != uci_states.ERROR:
  363. # Start an instance
  364. log.debug( "Starting UCI instance '%s'" % uci_wrapper.get_name() )
  365. log.debug( "Using following command: conn.run_instances( image_id='%s', key_name='%s', instance_type='%s' )"
  366. % ( mi_id, uci_wrapper.get_key_pair_name(), uci_wrapper.get_instance_type( i_index ) ) )
  367. reservation = None
  368. try:
  369. reservation = conn.run_instances( image_id=mi_id,
  370. key_name=uci_wrapper.get_key_pair_name(),
  371. instance_type=uci_wrapper.get_instance_type( i_index ) )
  372. except boto.exception.EC2ResponseError, e:
  373. err = "EC2 response error when starting UCI '"+ uci_wrapper.get_name() +"': " + str( e )
  374. log.error( err )
  375. uci_wrapper.set_error( err, True )
  376. except Exception, ex:
  377. err = "Error when starting UCI '" + uci_wrapper.get_name() + "': " + str( ex )
  378. log.error( err )
  379. uci_wrapper.set_error( err, True )
  380. # Record newly available instance data into local Galaxy database
  381. if reservation:
  382. l_time = datetime.utcnow()
  383. # uci_wrapper.set_instance_launch_time( self.format_time( reservation.instances[0].launch_time ), i_index=i_index )
  384. uci_wrapper.set_instance_launch_time( l_time, i_index=i_index )
  385. if not uci_wrapper.uci_launch_time_set():
  386. uci_wrapper.set_uci_launch_time( l_time )
  387. try:
  388. uci_wrapper.set_reservation_id( i_index, str( reservation ).split(":")[1] )
  389. # TODO: if more than a single instance will be started through single reservation, change this reference from element [0]
  390. i_id = str( reservation.instances[0]).split(":")[1]
  391. uci_wrapper.set_instance_id( i_index, i_id )
  392. s = reservation.instances[0].state
  393. uci_wrapper.change_state( s, i_id, s )
  394. vol_id = uci_wrapper.get_store_volume_id( store_id=0 ) # TODO: Once more that one vol/UCI is allowed, update this!
  395. uci_wrapper.set_store_status( vol_id, store_status.WAITING )
  396. log.debug( "Instance of UCI '%s' started, current state: '%s'" % ( uci_wrapper.get_name(), uci_wrapper.get_uci_state() ) )
  397. except boto.exception.EC2ResponseError, e:
  398. err = "EC2 response error when retrieving instance information for UCI '" + uci_wrapper.get_name() + "': " + str( e )
  399. log.error( err )
  400. uci_wrapper.set_error( err, True )
  401. else:
  402. log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
  403. else:
  404. err = "No instances in state '"+ instance_states.SUBMITTED +"' found for UCI '" + uci_wrapper.get_name() + \
  405. "'. Nothing to start."
  406. log.error( err )
  407. uci_wrapper.set_error( err, True )
  408. else:
  409. log.error( "UCI '%s' is in 'error' state, starting instance was aborted." % uci_wrapper.get_name() )
  410. def stop_uci( self, uci_wrapper):
  411. """
  412. Stop all cloud instances associated with given UCI.
  413. """
  414. conn = self.get_connection( uci_wrapper )
  415. # Get all instances associated with given UCI
  416. il = uci_wrapper.get_instances_ids() # instance list
  417. # Process list of instances and remove any references to empty instance id's
  418. for i in il:
  419. if i is None:
  420. il.remove( i )
  421. log.debug( 'List of instances being terminated: %s' % il )
  422. rl = conn.get_all_instances( il ) # Reservation list associated with given instances
  423. # Initiate shutdown of all instances under given UCI
  424. cnt = 0
  425. stopped = []
  426. not_stopped = []
  427. for r in rl:
  428. for inst in r.instances:
  429. log.debug( "Sending stop signal to instance '%s' associated with reservation '%s' (UCI: %s)." % ( inst, r, uci_wrapper.get_name() ) )
  430. try:
  431. inst.stop()
  432. uci_wrapper.set_stop_time( datetime.utcnow(), i_id=inst.id )
  433. uci_wrapper.change_state( instance_id=inst.id, i_state=inst.update() )
  434. stopped.append( inst )
  435. except boto.exception.EC2ResponseError, e:
  436. not_stopped.append( inst )
  437. err = "EC2 response error when stopping instance '" + inst.instance_id + "': " + str( e )
  438. log.error( err )
  439. uci_wrapper.set_error( err, True )
  440. uci_wrapper.reset_uci_launch_time()
  441. log.debug( "Termination was initiated for all instances of UCI '%s'." % uci_wrapper.get_name() )
  442. # dbInstances = get_instances( trans, uci ) #TODO: handle list!
  443. #
  444. # # Get actual cloud instance object
  445. # cloudInstance = get_cloud_instance( conn, dbInstances.instance_id )
  446. #
  447. # # TODO: Detach persistent storage volume(s) from instance and update volume data in local database
  448. # stores = get_stores( trans, uci )
  449. # for i, store in enumerate( stores ):
  450. # log.debug( "Detaching volume '%s' to instance '%s'." % ( store.volume_id, dbInstances.instance_id ) )
  451. # mntDevice = store.device
  452. # volStat = None
  453. ## Detaching volume does not work with Eucalyptus Public Cloud, so comment it out
  454. ## try:
  455. ## volStat = conn.detach_volume( store.volume_id, dbInstances.instance_id, mntDevice )
  456. ## except:
  457. ## log.debug ( 'Error detaching volume; still going to try and stop instance %s.' % dbInstances.instance_id )
  458. # store.attach_time = None
  459. # store.device = None
  460. # store.inst.instance_id = None
  461. # store.status = volStat
  462. # log.debug ( '***** volume status: %s' % volStat )
  463. #
  464. # # Stop the instance and update status in local database
  465. # cloudInstance.stop()
  466. # dbInstances.stop_time = datetime.utcnow()
  467. # while cloudInstance.state != 'terminated':
  468. # log.debug( "Stopping instance %s state; current state: %s" % ( str( cloudInstance ).split(":")[1], cloudInstance.state ) )
  469. # time.sleep(3)
  470. # cloudInstance.update()
  471. # dbInstances.state = cloudInstance.state
  472. #
  473. # # Reset relevant UCI fields
  474. # uci.state = 'available'
  475. # uci.launch_time = None
  476. #
  477. # # Persist
  478. # session = trans.sa_session
  479. ## session.save_or_update( stores )
  480. # session.save_or_update( dbInstances ) # TODO: Is this going to work w/ multiple instances stored in dbInstances variable?
  481. # session.save_or_update( uci )
  482. # session.flush()
  483. # trans.log_event( "User stopped cloud instance '%s'" % uci.name )
  484. # trans.set_message( "Galaxy instance '%s' stopped." % uci.name )
  485. def update( self ):
  486. """
  487. Run status update on all instances that are in 'running', 'pending', or 'shutting-down' state.
  488. Run status update on all storage volumes whose status is 'in-use', 'creating', or 'None'.
  489. Run status update on all snapshots whose status is 'pending' or 'delete'
  490. Run status update on any zombie UCIs, i.e., UCI's that is in 'submitted' state for an
  491. extended period of time.
  492. Reason behind this method is to sync state of local DB and real-world resources
  493. """
  494. log.debug( "Running general status update for %s UCIs..." % self.type )
  495. # Update instances
  496. instances = self.sa_session.query( model.CloudInstance ) \
  497. .filter( or_( model.CloudInstance.table.c.state==instance_states.RUNNING,
  498. model.CloudInstance.table.c.state==instance_states.PENDING,
  499. model.CloudInstance.table.c.state==instance_states.SHUTTING_DOWN ) ) \
  500. .all()
  501. for inst in instances:
  502. if self.type == inst.uci.credentials.provider.type:
  503. log.debug( "[%s] Running general status update on instance '%s'" % ( inst.uci.credentials.provider.type, inst.instance_id ) )
  504. self.update_instance( inst )
  505. # Update storage volume(s)
  506. stores = self.sa_session.query( model.CloudStore ) \
  507. .filter( or_( model.CloudStore.table.c.status==store_status.IN_USE,
  508. model.CloudStore.table.c.status==store_status.CREATING,
  509. model.CloudStore.table.c.status==store_status.WAITING,
  510. model.CloudStore.table.c.status==None ) ) \
  511. .all()
  512. for store in stores:
  513. if self.type == store.uci.credentials.provider.type: # and store.volume_id != None:
  514. log.debug( "[%s] Running general status update on store with local database ID: '%s'" % ( store.uci.credentials.provider.type, store.id ) )
  515. self.update_store( store )
  516. # Update pending snapshots or delete ones marked for deletion
  517. snapshots = self.sa_session.query( model.CloudSnapshot ) \
  518. .filter( or_( model.CloudSnapshot.table.c.status == snapshot_status.PENDING, model.CloudSnapshot.table.c.status == snapshot_status.DELETE ) ) \
  519. .all()
  520. for snapshot in snapshots:
  521. if self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.PENDING:
  522. log.debug( "[%s] Running general status update on snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
  523. self.update_snapshot( snapshot )
  524. elif self.type == snapshot.uci.credentials.provider.type and snapshot.status == snapshot_status.DELETE:
  525. log.debug( "[%s] Initiating deletion of snapshot '%s'" % ( snapshot.uci.credentials.provider.type, snapshot.snapshot_id ) )
  526. self.delete_snapshot( snapshot )
  527. # Attempt at updating any zombie UCIs (i.e., instances that have been in SUBMITTED state for longer than expected - see below for exact time)
  528. zombies = self.sa_session.query( model.UCI ).filter_by( state=uci_states.SUBMITTED ).all()
  529. for zombie in zombies:
  530. log.debug( "zombie UCI: %s" % zombie.name )
  531. z_instances = self.sa_session.query( model.CloudInstance ) \
  532. .filter( or_( model.CloudInstance.table.c.state != instance_states.TERMINATED,
  533. model.CloudInstance.table.c.state == None ) ) \
  534. .all()
  535. for z_inst in z_instances:
  536. if self.type == z_inst.uci.credentials.provider.type:
  537. # log.debug( "z_inst.id: '%s', state: '%s'" % ( z_inst.id, z_inst.state ) )
  538. td = datetime.utcnow() - z_inst.update_time
  539. # log.debug( "z_inst.id: %s, time delta is %s sec" % ( z_inst.id, td.seconds ) )
  540. if td.seconds > 180: # if instance has been in SUBMITTED state for more than 3 minutes
  541. log.debug( "[%s](td=%s) Running zombie repair update on instance with DB id '%s'" % ( z_inst.uci.credentials.provider.type, td.seconds, z_inst.id ) )
  542. self.process_zombie( z_inst )
  543. def update_instance( self, inst ):
  544. """
  545. Update information in local database for given instance as it is obtained from cloud provider.
  546. Along with updating information about given instance, information about the UCI controlling
  547. this instance is also updated.
  548. """
  549. # Get credentials associated wit this instance
  550. uci_id = inst.uci_id
  551. uci = self.sa_session.query( model.UCI ).get( uci_id )
  552. self.sa_session.refresh( uci )
  553. conn = self.get_connection_from_uci( uci )
  554. # Get reservations handle for given instance
  555. try:
  556. rl= conn.get_all_instances( [inst.instance_id] )
  557. except boto.exception.EC2ResponseError, e:
  558. err = "Retrieving instance(s) from cloud failed for UCI '"+ uci.name +"' during general status update: " + str( e )
  559. log.error( err )
  560. uci.error = err
  561. uci.state = uci_states.ERROR
  562. self.sa_session.add( uci )
  563. self.sa_session.flush()
  564. return None
  565. # Because references to reservations are deleted shortly after instances have been terminated, getting an empty list as a response to a query
  566. # typically means the instance has successfully shut down but the check was not performed in short enough amount of time. Until an alternative solution
  567. # is found, below code sets state of given UCI to 'error' to indicate to the user something out of ordinary happened.
  568. if len( rl ) == 0:
  569. err = "Instance ID '"+inst.instance_id+"' was not found by the cloud provider. Instance might have crashed or otherwise been terminated."+ \
  570. "Manual check is recommended."
  571. log.error( err )
  572. inst.error = err
  573. uci.error = err
  574. inst.state = instance_states.TERMINATED
  575. uci.state = uci_states.ERROR
  576. uci.launch_time = None
  577. self.sa_session.add( inst )
  578. self.sa_session.add( uci )
  579. self.sa_session.flush()
  580. # Update instance status in local DB with info from cloud provider
  581. for r in rl:
  582. for i, cInst in enumerate( r.instances ):
  583. try:
  584. s = cInst.update()
  585. log.debug( "Checking state of cloud instance '%s' associated with UCI '%s' and reservation '%s'. State='%s'" % ( cInst, uci.name, r, s ) )
  586. if s != inst.state:
  587. inst.state = s
  588. self.sa_session.add( inst )
  589. self.sa_session.flush()
  590. # After instance has shut down, ensure UCI is marked as 'available'
  591. if s == instance_states.TERMINATED and uci.state != uci_states.ERROR:
  592. uci.state = uci_states.AVAILABLE
  593. uci.launch_time = None
  594. self.sa_session.add( uci )
  595. self.sa_session.flush()
  596. # Making sure state of UCI is updated. Once multiple instances become associated with single UCI, this will need to be changed.
  597. if s != uci.state and s != instance_states.TERMINATED:
  598. uci.state = s
  599. self.sa_session.add( uci )
  600. self.sa_session.flush()
  601. if cInst.public_dns_name != inst.public_dns:
  602. inst.public_dns = cInst.public_dns_name
  603. self.sa_session.add( inst )
  604. self.sa_session.flush()
  605. if cInst.private_dns_name != inst.private_dns:
  606. inst.private_dns = cInst.private_dns_name
  607. self.sa_session.add( inst )
  608. self.sa_session.flush()
  609. except boto.exception.EC2ResponseError, e:
  610. err = "Updating instance status from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
  611. log.error( err )
  612. uci.error = err
  613. uci.state = uci_states.ERROR
  614. self.sa_session.add( uci )
  615. self.sa_session.flush()
  616. return None
  617. def update_store( self, store ):
  618. """
  619. Update information in local database for given storage volume as it is obtained from cloud provider.
  620. Along with updating information about given storage volume, information about the UCI controlling
  621. this storage volume is also updated.
  622. """
  623. # Get credentials associated wit this store
  624. uci_id = store.uci_id
  625. uci = self.sa_session.query( model.UCI ).get( uci_id )
  626. self.sa_session.refresh( uci )
  627. conn = self.get_connection_from_uci( uci )
  628. if store.volume_id != None:
  629. # Get reservations handle for given store
  630. try:
  631. log.debug( "Updating storage volume command: vl = conn.get_all_volumes( [%s] )" % store.volume_id )
  632. vl = conn.get_all_volumes( [store.volume_id] )
  633. except boto.exception.EC2ResponseError, e:
  634. err = "Retrieving volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
  635. log.error( err )
  636. uci.error = err
  637. uci.state = uci_states.ERROR
  638. self.sa_session.add( uci )
  639. self.sa_session.flush()
  640. return None
  641. # Update store status in local DB with info from cloud provider
  642. if len(vl) > 0:
  643. try:
  644. log.debug( "Storage volume '%s' current status: '%s'" % (store.volume_id, vl[0].status ) )
  645. if store.status != vl[0].status:
  646. # In case something failed during creation of UCI but actual storage volume was created and yet
  647. # UCI state remained as 'new', try to remedy this by updating UCI state here
  648. if ( store.status == None ) and ( store.volume_id != None ):
  649. uci.state = vl[0].status
  650. self.sa_session.add( uci )
  651. self.sa_session.flush()
  652. # If UCI was marked in state 'CREATING', update its status to reflect new status
  653. elif ( uci.state == uci_states.CREATING ):
  654. # Because Eucalyptus Public Cloud (EPC) deletes volumes immediately after they are created, artificially
  655. # set status of given UCI to 'available' based on storage volume's availability zone (i.e., it's residing
  656. # in EPC as opposed to some other Eucalyptus based cloud that allows creation of storage volumes.
  657. if store.availability_zone == 'epc':
  658. uci.state = uci_states.AVAILABLE
  659. else:
  660. uci.state = vl[0].status
  661. self.sa_session.add( uci )
  662. self.sa_session.flush()
  663. store.status = vl[0].status
  664. self.sa_session.add( store )
  665. self.sa_session.flush()
  666. if store.inst != None:
  667. if store.inst.instance_id != vl[0].instance_id:
  668. store.inst.instance_id = vl[0].instance_id
  669. self.sa_session.add( store )
  670. self.sa_session.flush()
  671. if store.attach_time != vl[0].attach_time:
  672. store.attach_time = vl[0].attach_time
  673. self.sa_session.add( store )
  674. self.sa_session.flush()
  675. if store.device != vl[0].device:
  676. store.device = vl[0].device
  677. self.sa_session.add( store )
  678. self.sa_session.flush()
  679. except boto.exception.EC2ResponseError, e:
  680. err = "Updating status of volume(s) from cloud failed for UCI '"+ uci.name + "' during general status update: " + str( e )
  681. log.error( err )
  682. uci.error = err
  683. uci.state = uci_states.ERROR
  684. self.sa_session.add( uci )
  685. self.sa_session.flush()
  686. return None
  687. else:
  688. err = "No storage volumes returned by cloud provider on general update"
  689. log.error( "%s for UCI '%s'" % ( err, uci.name ) )
  690. store.status = store_status.ERROR
  691. store.error = err
  692. uci.error = err
  693. uci.state = uci_states.ERROR
  694. self.sa_session.add( uci )
  695. self.sa_session.add( store )
  696. self.sa_session.flush()
  697. else:
  698. err = "Missing storage volume ID in local database on general update. Manual check is needed to check " \
  699. "if storage volume was actually created by cloud provider."
  700. log.error( "%s (for UCI '%s')" % ( err, uci.name ) )
  701. store.status = store_status.ERROR
  702. store.error = err
  703. uci.error = err
  704. uci.state = uci_states.ERROR
  705. self.sa_session.add( uci )
  706. self.sa_session.add( store )
  707. self.sa_session.flush()
  708. def update_snapshot( self, snapshot ):
  709. """
  710. Update information in local database for given snapshot as it is obtained from cloud provider.
  711. Along with updating information about given snapshot, information about the UCI controlling
  712. this snapshot is also updated.
  713. """
  714. # Get credentials associated wit this store
  715. uci_id = snapshot.uci_id
  716. uci = self.sa_session.query( model.UCI ).get( uci_id )
  717. self.sa_session.refresh( uci )
  718. conn = self.get_connection_from_uci( uci )
  719. try:
  720. log.debug( "Updating status of snapshot '%s'" % snapshot.snapshot_id )
  721. snap = conn.get_all_snapshots( [snapshot.snapshot_id] )
  722. if len( snap ) > 0:
  723. log.debug( "Snapshot '%s' status: %s" % ( snapshot.snapshot_id, snap[0].status ) )
  724. snapshot.status = snap[0].status
  725. self.sa_session.add( snapshot )
  726. self.sa_session.flush()
  727. else:
  728. err = "No snapshots returned by EC2 on general update"
  729. log.error( "%s for UCI '%s'" % ( err, uci.name ) )
  730. snapshot.status = snapshot_status.ERROR
  731. snapshot.error = err
  732. uci.error = err
  733. uci.state = uci_states.ERROR
  734. self.sa_session.add( uci )
  735. self.sa_session.add( snapshot )
  736. self.sa_session.flush()
  737. except boto.exception.EC2ResponseError, e:
  738. err = "EC2 response error while updating snapshot status: " + str( e )
  739. log.error( err )
  740. snapshot.status = snapshot_status.ERROR
  741. snapshot.error = err
  742. uci.error = err
  743. uci.state = uci_states.ERROR
  744. self.sa_session.add( uci )
  745. self.sa_session.add( snapshot )
  746. self.sa_session.flush()
  747. except Exception, ex:
  748. err = "Error while updating snapshot status: " + str( ex )
  749. log.error( err )
  750. snapshot.status = snapshot_status.ERROR
  751. snapshot.error = err
  752. uci.error = err
  753. uci.state = uci_states.ERROR
  754. self.sa_session.add( uci )
  755. self.sa_session.add( snapshot )
  756. self.sa_session.flush()
  757. def delete_snapshot( self, snapshot ):
  758. """
  759. Initiate deletion of given snapshot from cloud provider.
  760. """
  761. if snapshot.status == snapshot_status.DELETE:
  762. # Get credentials associated wit this store
  763. uci_id = snapshot.uci_id
  764. uci = self.sa_session.query( model.UCI ).get( uci_id )
  765. self.sa_session.refresh( uci )
  766. conn = self.get_connection_from_uci( uci )
  767. try:
  768. log.debug( "Deleting snapshot '%s'" % snapshot.snapshot_id )
  769. snap = conn.delete_snapshot( snapshot.snapshot_id )
  770. if snap == True:
  771. snapshot.deleted = True
  772. snapshot.status = snapshot_status.DELETED
  773. self.sa_session.add( snapshot )
  774. self.sa_session.flush()
  775. return snap
  776. except boto.exception.EC2ResponseError, e:
  777. err = "EC2 response error while deleting snapshot: " + str( e )
  778. log.error( err )
  779. snapshot.status = snapshot_status.ERROR
  780. snapshot.error = err
  781. uci.error = err
  782. uci.state = uci_states.ERROR
  783. self.sa_session.add( uci )
  784. self.sa_session.add( snapshot )
  785. self.sa_session.flush()
  786. except Exception, ex:
  787. err = "Error while deleting snapshot: " + str( ex )
  788. log.error( err )
  789. snapshot.status = snapshot_status.ERROR
  790. snapshot.error = err
  791. uci.error = err
  792. uci.state = uci_states.ERROR
  793. self.sa_session.add( uci )
  794. self.sa_session.add( snapshot )
  795. self.sa_session.flush()
  796. else:
  797. err = "Cannot delete snapshot '"+snapshot.snapshot_id+"' because its status is '"+snapshot.status+"'. Only snapshots with '" + \
  798. snapshot_status.COMPLETED+"' status can be deleted."
  799. log.error( err )
  800. snapshot.error = err
  801. self.sa_session.add( snapshot )
  802. self.sa_session.flush()
  803. def process_zombie( self, inst ):
  804. """
  805. Attempt at discovering if starting a cloud instance was successful but local database was not updated
  806. accordingly or if something else failed and instance was never started. Currently, no automatic
  807. repairs are being attempted; instead, appropriate error messages are set.
  808. """
  809. uci_id = inst.uci_id
  810. uci = self.sa_session.query( model.UCI ).get( uci_id )
  811. self.sa_session.refresh( uci )
  812. # Check if any instance-specific information was written to local DB; if 'yes', set instance and UCI's error message
  813. # suggesting manual check.
  814. if inst.launch_time != None or inst.reservation_id != None or inst.instance_id != None:
  815. # Try to recover state - this is best-case effort, so if something does not work immediately, not
  816. # recovery steps are attempted. Recovery is based on hope that instance_id is available in local DB; if not,
  817. # report as error.
  818. # Fields attempting to be recovered are: reservation_id, instance status, and launch_time
  819. if inst.instance_id != None:
  820. conn = self.get_connection_from_uci( uci )
  821. rl = conn.get_all_instances( [inst.instance_id] ) # reservation list
  822. # Update local DB with relevant data from instance
  823. if inst.reservation_id == None:
  824. try:
  825. inst.reservation_id = str(rl[0]).split(":")[1]
  826. except: # something failed, so skip
  827. pass
  828. try:
  829. state = rl[0].instances[0].update()
  830. inst.state = state
  831. uci.state = state
  832. self.sa_session.add( inst )
  833. self.sa_session.add( uci )
  834. self.sa_session.flush()
  835. except: # something failed, so skip
  836. pass
  837. if inst.launch_time == None:
  838. try:
  839. launch_time = self.format_time( rl[0].instances[0].launch_time )
  840. inst.launch_time = launch_time
  841. self.sa_session.add( inst )
  842. self.sa_session.flush()
  843. if inst.uci.launch_time == None:
  844. uci.launch_time = launch_time
  845. self.sa_session.add( uci )
  846. self.sa_session.flush()
  847. except: # something failed, so skip
  848. pass
  849. else:
  850. err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \
  851. "' seems to have failed. Because it appears that cloud instance might have gotten started, manual check is recommended."
  852. inst.error = err
  853. inst.state = instance_states.ERROR
  854. inst.uci.error = err
  855. inst.uci.state = uci_states.ERROR
  856. log.error( err )
  857. self.sa_session.add( inst )
  858. self.sa_session.add( uci )
  859. self.sa_session.flush()
  860. else: #Instance most likely never got processed, so set error message suggesting user to try starting instance again.
  861. err = "Starting a machine instance (DB id: '"+str(inst.id)+"') associated with this UCI '" + str(inst.uci.name) + \
  862. "' seems to have failed. Because it appears that cloud instance never got started, it should be safe to reset state and try " \
  863. "starting the instance again."
  864. inst.error = err
  865. inst.state = instance_states.ERROR
  866. uci.error = err
  867. uci.state = uci_states.ERROR
  868. log.error( err )
  869. self.sa_session.add( inst )
  870. self.sa_session.add( uci )
  871. self.sa_session.flush()
  872. # uw = UCIwrapper( inst.uci )
  873. # log.debug( "Try automatically re-submitting UCI '%s'." % uw.get_name() )
  874. def get_connection_from_uci( self, uci ):
  875. """
  876. Establish and return connection to cloud provider. Information needed to do so is obtained
  877. directly from uci database object.
  878. """
  879. log.debug( 'Establishing %s cloud connection' % self.type )
  880. a_key = uci.credentials.access_key
  881. s_key = uci.credentials.secret_key
  882. # Get connection
  883. try:
  884. region = RegionInfo( None, uci.credentials.provider.region_name, uci.credentials.provider.region_endpoint )
  885. # log.debug( "[%s] Using following command to connect to cloud provider: "
  886. # "conn = EC2Connection( aws_access_key_id=%s, "
  887. # "aws_secret_access_key=%s, "
  888. # "port=%s, "
  889. # "is_secure=%s, "
  890. # "region=region, "
  891. # "path=%s )" % ( self.type, a_key, s_key, uci.credentials.provider.is_secure, uci.credentials.provider.port, uci.credentials.provider.path ) )
  892. conn = EC2Connection( aws_access_key_id=a_key,
  893. aws_secret_access_key=s_key,
  894. is_secure=uci.credentials.provider.is_secure,
  895. port=uci.credentials.provider.port,
  896. region=region,
  897. path=uci.credentials.provider.path )
  898. except boto.exception.EC2ResponseError, e:
  899. err = "Establishing connection with cloud failed: " + str( e )
  900. log.error( err )
  901. uci.error = err
  902. uci.state = uci_states.ERROR
  903. self.sa_session.add( uci )
  904. self.sa_session.flush()
  905. return None
  906. return conn
  907. # def updateUCI( self, uci ):
  908. # """
  909. # Runs a global status update on all storage volumes and all instances that are
  910. # associated with specified UCI
  911. # """
  912. # conn = self.get_connection( uci )
  913. #
  914. # # Update status of storage volumes
  915. # vl = model.CloudStore.filter( model.CloudInstance.table.c.uci_id == uci.id ).all()
  916. # vols = []
  917. # for v in vl:
  918. # vols.append( v.volume_id )
  919. # try:
  920. # volumes = conn.get_all_volumes( vols )
  921. # for i, v in enumerate( volumes ):
  922. # uci.store[i].inst.instance_id = v.instance_id
  923. # uci.store[i].status = v.status
  924. # uci.store[i].device = v.device
  925. # uci.store[i].flush()
  926. # except:
  927. # log.debug( "Error updating status of volume(s) associated with UCI '%s'. Status was not updated." % uci.name )
  928. # pass
  929. #
  930. # # Update status of instances
  931. # il = model.CloudInstance.filter_by( uci_id=uci.id ).filter( model.CloudInstance.table.c.state != 'terminated' ).all()
  932. # instanceList = []
  933. # for i in il:
  934. # instanceList.append( i.instance_id )
  935. # log.debug( 'instanceList: %s' % instanceList )
  936. # try:
  937. # reservations = conn.get_all_instances( instanceList )
  938. # for i, r in enumerate( reservations ):
  939. # uci.instance[i].state = r.instances[0].update()
  940. # log.debug('updating instance %s; status: %s' % ( uci.instance[i].instance_id, uci.instance[i].state ) )
  941. # uci.state = uci.instance[i].state
  942. # uci.instance[i].public_dns = r.instances[0].dns_name
  943. # uci.instance[i].private_dns = r.instances[0].private_dns_name
  944. # uci.instance[i].flush()
  945. # uci.flush()
  946. # except:
  947. # log.debug( "Error updating status of instances associated with UCI '%s'. Instance status was not updated." % uci.name )
  948. # pass
  949. # --------- Helper methods ------------
  def format_time( self, time ):
      """
      Return given timestamp string with every 'T' replaced by a space and every 'Z' removed,
      e.g. '2009-10-15T12:34:56Z' becomes '2009-10-15 12:34:56'.
      """
      # Two chained replacements instead of looping over a local named 'dict', which shadowed the builtin
      return time.replace( 'T', ' ' ).replace( 'Z', '' )