PageRenderTime 97ms CodeModel.GetById 80ms RepoModel.GetById 2ms app.codeStats 0ms

/scripts/galaxy_messaging/server/data_transfer.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 239 lines | 197 code | 13 blank | 29 comment | 12 complexity | cfac29c7382a1fc211edf991e78b1610 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. Data Transfer Script: Sequencer to Galaxy
  4. This script is called from Galaxy RabbitMQ listener ( amqp_consumer.py ) once
  5. the lab admin starts the data transfer process using the user interface.
  6. Usage:
  7. python data_transfer.py <xml_message> <config_file>
  8. """
  9. import ConfigParser
  10. import cookielib
  11. import datetime
  12. import logging
  13. import optparse
  14. import os
  15. import shutil
  16. import sys
  17. import time
  18. import time
  19. import traceback
  20. import urllib
  21. import urllib2
  22. import xml.dom.minidom
  23. from xml_helper import get_value, get_value_index
  24. log = logging.getLogger( "datatx_" + str( os.getpid() ) )
  25. log.setLevel( logging.DEBUG )
  26. fh = logging.FileHandler( "data_transfer.log" )
  27. fh.setLevel( logging.DEBUG )
  28. formatter = logging.Formatter( "%(asctime)s - %(name)s - %(message)s" )
  29. fh.setFormatter( formatter )
  30. log.addHandler( fh )
  31. api_path = [ os.path.join( os.getcwd(), "scripts/api" ) ]
  32. sys.path.extend( api_path )
  33. import common as api
  34. assert sys.version_info[:2] >= ( 2, 4 )
  35. new_path = [ os.path.join( os.getcwd(), "lib" ) ]
  36. new_path.extend( sys.path[1:] ) # remove scripts/ from the path
  37. sys.path = new_path
  38. from galaxy import eggs
  39. from galaxy.model import SampleDataset
  40. from galaxy.web.api.samples import SamplesAPIController
  41. import pkg_resources
  42. pkg_resources.require( "pexpect" )
  43. import pexpect
  44. log.debug(str(dir(api)))
  45. class DataTransfer( object ):
  46. def __init__( self, msg, config_file ):
  47. log.info( msg )
  48. self.dom = xml.dom.minidom.parseString( msg )
  49. self.galaxy_host = get_value( self.dom, 'galaxy_host' )
  50. self.api_key = get_value( self.dom, 'api_key' )
  51. self.sequencer_host = get_value( self.dom, 'data_host' )
  52. self.sequencer_username = get_value( self.dom, 'data_user' )
  53. self.sequencer_password = get_value( self.dom, 'data_password' )
  54. self.request_id = get_value( self.dom, 'request_id' )
  55. self.sample_id = get_value( self.dom, 'sample_id' )
  56. self.library_id = get_value( self.dom, 'library_id' )
  57. self.folder_id = get_value( self.dom, 'folder_id' )
  58. self.dataset_files = []
  59. count=0
  60. while True:
  61. dataset_id = get_value_index( self.dom, 'dataset_id', count )
  62. file = get_value_index( self.dom, 'file', count )
  63. name = get_value_index( self.dom, 'name', count )
  64. if file:
  65. self.dataset_files.append( dict( name=name,
  66. dataset_id=int( dataset_id ),
  67. file=file ) )
  68. else:
  69. break
  70. count=count+1
  71. # read config variables
  72. config = ConfigParser.ConfigParser()
  73. retval = config.read( config_file )
  74. if not retval:
  75. error_msg = 'FATAL ERROR: Unable to open config file %s.' % config_file
  76. log.error( error_msg )
  77. sys.exit(1)
  78. try:
  79. self.config_id_secret = config.get( "app:main", "id_secret" )
  80. except ConfigParser.NoOptionError,e:
  81. self.config_id_secret = "USING THE DEFAULT IS NOT SECURE!"
  82. try:
  83. self.import_dir = config.get( "app:main", "library_import_dir" )
  84. except ConfigParser.NoOptionError,e:
  85. log.error( 'ERROR: library_import_dir config variable is not set in %s. ' % config_file )
  86. sys.exit( 1 )
  87. # create the destination directory within the import directory
  88. self.server_dir = os.path.join( self.import_dir,
  89. 'datatx_' + str( os.getpid() ) + '_' + datetime.date.today( ).strftime( "%d%b%Y" ) )
  90. try:
  91. os.mkdir( self.server_dir )
  92. except Exception, e:
  93. self.error_and_exit( str( e ) )
  94. def start( self ):
  95. '''
  96. This method executes the file transfer from the sequencer, adds the dataset
  97. to the data library & finally updates the data transfer status in the db
  98. '''
  99. # datatx
  100. self.transfer_files()
  101. # add the dataset to the given library
  102. self.add_to_library()
  103. # update the data transfer status in the db
  104. self.update_status( SampleDataset.transfer_status.COMPLETE )
  105. # cleanup
  106. #self.cleanup()
  107. sys.exit( 0 )
  108. def cleanup( self ):
  109. '''
  110. remove the directory created to store the dataset files temporarily
  111. before adding the same to the data library
  112. '''
  113. try:
  114. time.sleep( 60 )
  115. shutil.rmtree( self.server_dir )
  116. except:
  117. self.error_and_exit()
  118. def error_and_exit( self, msg='' ):
  119. '''
  120. This method is called any exception is raised. This prints the traceback
  121. and terminates this script
  122. '''
  123. log.error( traceback.format_exc() )
  124. log.error( 'FATAL ERROR.' + msg )
  125. self.update_status( 'Error', 'All', msg )
  126. sys.exit( 1 )
  127. def transfer_files( self ):
  128. '''
  129. This method executes a scp process using pexpect library to transfer
  130. the dataset file from the remote sequencer to the Galaxy server
  131. '''
  132. def print_ticks( d ):
  133. pass
  134. for i, dataset_file in enumerate( self.dataset_files ):
  135. self.update_status( SampleDataset.transfer_status.TRANSFERRING, dataset_file[ 'dataset_id' ] )
  136. try:
  137. cmd = "scp %s@%s:'%s' '%s/%s'" % ( self.sequencer_username,
  138. self.sequencer_host,
  139. dataset_file[ 'file' ].replace( ' ', '\ ' ),
  140. self.server_dir.replace( ' ', '\ ' ),
  141. dataset_file[ 'name' ].replace( ' ', '\ ' ) )
  142. log.debug( cmd )
  143. output = pexpect.run( cmd,
  144. events={ '.ssword:*': self.sequencer_password+'\r\n',
  145. pexpect.TIMEOUT: print_ticks },
  146. timeout=10 )
  147. log.debug( output )
  148. path = os.path.join( self.server_dir, os.path.basename( dataset_file[ 'name' ] ) )
  149. if not os.path.exists( path ):
  150. msg = 'Could not find the local file after transfer ( %s ).' % path
  151. log.error( msg )
  152. raise Exception( msg )
  153. except Exception, e:
  154. msg = traceback.format_exc()
  155. self.update_status( 'Error', dataset_file['dataset_id'], msg)
  156. def add_to_library( self ):
  157. '''
  158. This method adds the dataset file to the target data library & folder
  159. by opening the corresponding url in Galaxy server running.
  160. '''
  161. self.update_status( SampleDataset.transfer_status.ADD_TO_LIBRARY )
  162. try:
  163. data = {}
  164. data[ 'folder_id' ] = 'F%s' % api.encode_id( self.config_id_secret, self.folder_id )
  165. data[ 'file_type' ] = 'auto'
  166. data[ 'server_dir' ] = self.server_dir
  167. data[ 'dbkey' ] = ''
  168. data[ 'upload_option' ] = 'upload_directory'
  169. data[ 'create_type' ] = 'file'
  170. url = "http://%s/api/libraries/%s/contents" % ( self.galaxy_host,
  171. api.encode_id( self.config_id_secret, self.library_id ) )
  172. log.debug( str( ( self.api_key, url, data ) ) )
  173. retval = api.submit( self.api_key, url, data, return_formatted=False )
  174. log.debug( str( retval ) )
  175. except Exception, e:
  176. self.error_and_exit( str( e ) )
  177. def update_status( self, status, dataset_id='All', msg='' ):
  178. '''
  179. Update the data transfer status for this dataset in the database
  180. '''
  181. try:
  182. log.debug( 'Setting status "%s" for dataset "%s" of sample "%s"' % ( status, str( dataset_id ), str( self.sample_id) ) )
  183. sample_dataset_ids = []
  184. if dataset_id == 'All':
  185. for dataset in self.dataset_files:
  186. sample_dataset_ids.append( api.encode_id( self.config_id_secret, dataset[ 'dataset_id' ] ) )
  187. else:
  188. sample_dataset_ids.append( api.encode_id( self.config_id_secret, dataset_id ) )
  189. # update the transfer status
  190. data = {}
  191. data[ 'update_type' ] = SamplesAPIController.update_types.SAMPLE_DATASET[0]
  192. data[ 'sample_dataset_ids' ] = sample_dataset_ids
  193. data[ 'new_status' ] = status
  194. data[ 'error_msg' ] = msg
  195. url = "http://%s/api/samples/%s" % ( self.galaxy_host,
  196. api.encode_id( self.config_id_secret, self.sample_id ) )
  197. log.debug( str( ( self.api_key, url, data)))
  198. retval = api.update( self.api_key, url, data, return_formatted=False )
  199. log.debug( str( retval ) )
  200. except urllib2.URLError, e:
  201. log.debug( 'ERROR( sample_dataset_transfer_status ( %s ) ): %s' % ( url, str( e ) ) )
  202. log.error( traceback.format_exc() )
  203. except:
  204. log.error( traceback.format_exc() )
  205. log.error( 'FATAL ERROR' )
  206. sys.exit( 1 )
  207. if __name__ == '__main__':
  208. log.info( 'STARTING %i %s' % ( os.getpid(), str( sys.argv ) ) )
  209. #
  210. # Start the daemon
  211. #
  212. dt = DataTransfer( sys.argv[1], sys.argv[2])
  213. dt.start()
  214. sys.exit( 0 )