PageRenderTime 65ms CodeModel.GetById 46ms app.highlight 15ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/galaxy_messaging/server/data_transfer.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 239 lines | 197 code | 13 blank | 29 comment | 6 complexity | cfac29c7382a1fc211edf991e78b1610 MD5 | raw file
#!/usr/bin/env python
"""

Data Transfer Script: Sequencer to Galaxy

This script is called from the Galaxy RabbitMQ listener ( amqp_consumer.py ) once
the lab admin starts the data transfer process using the user interface.

Usage:

python data_transfer.py <xml_message> <config_file>


"""
 15import ConfigParser
 16import cookielib
 17import datetime
 18import logging
 19import optparse
 20import os
 21import shutil
 22import sys
 23import time
 24import time
 25import traceback
 26import urllib
 27import urllib2
 28import xml.dom.minidom
 29
 30
 31from xml_helper import get_value, get_value_index
 32
 33log = logging.getLogger( "datatx_" + str( os.getpid() ) )
 34log.setLevel( logging.DEBUG )
 35fh = logging.FileHandler( "data_transfer.log" )
 36fh.setLevel( logging.DEBUG )
 37formatter = logging.Formatter( "%(asctime)s - %(name)s - %(message)s" )
 38fh.setFormatter( formatter )
 39log.addHandler( fh )
 40
 41api_path = [ os.path.join( os.getcwd(), "scripts/api" ) ]
 42sys.path.extend( api_path )
 43import common as api
 44
 45assert sys.version_info[:2] >= ( 2, 4 )
 46new_path = [ os.path.join( os.getcwd(), "lib" ) ]
 47new_path.extend( sys.path[1:] ) # remove scripts/ from the path
 48sys.path = new_path
 49
 50from galaxy import eggs
 51from galaxy.model import SampleDataset
 52from galaxy.web.api.samples import SamplesAPIController
 53import pkg_resources
 54pkg_resources.require( "pexpect" )
 55import pexpect
 56
 57log.debug(str(dir(api)))
 58
 59class DataTransfer( object ):
 60
 61    def __init__( self, msg, config_file ):
 62        log.info( msg )
 63        self.dom = xml.dom.minidom.parseString( msg ) 
 64        self.galaxy_host = get_value( self.dom, 'galaxy_host' )
 65        self.api_key = get_value( self.dom, 'api_key' )
 66        self.sequencer_host = get_value( self.dom, 'data_host' )
 67        self.sequencer_username = get_value( self.dom, 'data_user' )
 68        self.sequencer_password = get_value( self.dom, 'data_password' )
 69        self.request_id = get_value( self.dom, 'request_id' )
 70        self.sample_id = get_value( self.dom, 'sample_id' )
 71        self.library_id = get_value( self.dom, 'library_id' )
 72        self.folder_id = get_value( self.dom, 'folder_id' )
 73        self.dataset_files = []
 74        count=0
 75        while True:
 76            dataset_id = get_value_index( self.dom, 'dataset_id', count )
 77            file = get_value_index( self.dom, 'file', count )
 78            name = get_value_index( self.dom, 'name', count )
 79            if file:
 80                self.dataset_files.append( dict( name=name,
 81                                                 dataset_id=int( dataset_id ),
 82                                                 file=file ) ) 
 83            else:
 84                break
 85            count=count+1
 86        # read config variables
 87        config = ConfigParser.ConfigParser()
 88        retval = config.read( config_file )
 89        if not retval:
 90            error_msg = 'FATAL ERROR: Unable to open config file %s.' % config_file
 91            log.error( error_msg )
 92            sys.exit(1)
 93        try:
 94            self.config_id_secret = config.get( "app:main", "id_secret" )
 95        except ConfigParser.NoOptionError,e:
 96            self.config_id_secret = "USING THE DEFAULT IS NOT SECURE!"
 97        try:
 98            self.import_dir = config.get( "app:main", "library_import_dir" )
 99        except ConfigParser.NoOptionError,e:
100            log.error( 'ERROR: library_import_dir config variable is not set in %s. ' % config_file )
101            sys.exit( 1 )
102        # create the destination directory within the import directory
103        self.server_dir = os.path.join( self.import_dir, 
104                                        'datatx_' + str( os.getpid() ) + '_' + datetime.date.today( ).strftime( "%d%b%Y" ) )
105        try:
106            os.mkdir( self.server_dir )
107        except Exception, e:
108            self.error_and_exit( str( e ) )
109     
110    def start( self ):
111        '''
112        This method executes the file transfer from the sequencer, adds the dataset
113        to the data library & finally updates the data transfer status in the db
114        '''
115        # datatx
116        self.transfer_files()
117        # add the dataset to the given library
118        self.add_to_library()
119        # update the data transfer status in the db
120        self.update_status( SampleDataset.transfer_status.COMPLETE )
121        # cleanup
122        #self.cleanup()    
123        sys.exit( 0 )
124        
125    def cleanup( self ):
126        '''
127        remove the directory created to store the dataset files temporarily
128        before adding the same to the data library
129        '''
130        try:
131            time.sleep( 60 )
132            shutil.rmtree( self.server_dir )
133        except:
134            self.error_and_exit()
135
136            
137    def error_and_exit( self, msg='' ):
138        '''
139        This method is called any exception is raised. This prints the traceback 
140        and terminates this script
141        '''
142        log.error( traceback.format_exc() )
143        log.error( 'FATAL ERROR.' + msg )
144        self.update_status( 'Error', 'All', msg )
145        sys.exit( 1 )
146        
147    def transfer_files( self ):
148        '''
149        This method executes a scp process using pexpect library to transfer
150        the dataset file from the remote sequencer to the Galaxy server
151        '''
152        def print_ticks( d ):
153            pass
154        for i, dataset_file in enumerate( self.dataset_files ):
155            self.update_status( SampleDataset.transfer_status.TRANSFERRING, dataset_file[ 'dataset_id' ] )
156            try:
157                cmd = "scp %s@%s:'%s' '%s/%s'" % (  self.sequencer_username,
158                                                    self.sequencer_host,
159                                                    dataset_file[ 'file' ].replace( ' ', '\ ' ),
160                                                    self.server_dir.replace( ' ', '\ ' ),
161                                                    dataset_file[ 'name' ].replace( ' ', '\ ' ) )
162                log.debug( cmd )
163                output = pexpect.run( cmd, 
164                                      events={ '.ssword:*': self.sequencer_password+'\r\n', 
165                                               pexpect.TIMEOUT: print_ticks }, 
166                                      timeout=10 )
167                log.debug( output )
168                path = os.path.join( self.server_dir, os.path.basename( dataset_file[ 'name' ] ) )
169                if not os.path.exists( path ):
170                    msg = 'Could not find the local file after transfer ( %s ).' % path
171                    log.error( msg )
172                    raise Exception( msg )
173            except Exception, e:
174                msg = traceback.format_exc()
175                self.update_status( 'Error', dataset_file['dataset_id'], msg)
176
177        
178    def add_to_library( self ):
179        '''
180        This method adds the dataset file to the target data library & folder
181        by opening the corresponding url in Galaxy server running.  
182        '''
183        self.update_status( SampleDataset.transfer_status.ADD_TO_LIBRARY )
184        try:
185            data = {}
186            data[ 'folder_id' ] = 'F%s' % api.encode_id( self.config_id_secret, self.folder_id )
187            data[ 'file_type' ] = 'auto'
188            data[ 'server_dir' ] = self.server_dir
189            data[ 'dbkey' ] = ''
190            data[ 'upload_option' ] = 'upload_directory'
191            data[ 'create_type' ] = 'file'
192            url = "http://%s/api/libraries/%s/contents" % ( self.galaxy_host, 
193                                                            api.encode_id(  self.config_id_secret, self.library_id ) )
194            log.debug(  str( ( self.api_key, url, data ) ) )
195            retval = api.submit( self.api_key, url, data, return_formatted=False )
196            log.debug(  str( retval ) )
197        except Exception, e:
198            self.error_and_exit( str(  e ) )
199            
200    def update_status( self, status, dataset_id='All', msg='' ):
201        '''
202        Update the data transfer status for this dataset in the database
203        '''
204        try:
205            log.debug( 'Setting status "%s" for dataset "%s" of sample "%s"' % (  status, str( dataset_id ), str( self.sample_id) ) )
206            sample_dataset_ids = []
207            if dataset_id == 'All':
208                for dataset in self.dataset_files:
209                    sample_dataset_ids.append( api.encode_id( self.config_id_secret, dataset[ 'dataset_id' ] ) )
210            else:
211                sample_dataset_ids.append( api.encode_id( self.config_id_secret, dataset_id ) )
212            # update the transfer status
213            data = {}
214            data[ 'update_type' ] = SamplesAPIController.update_types.SAMPLE_DATASET[0]
215            data[ 'sample_dataset_ids' ] = sample_dataset_ids
216            data[ 'new_status' ] = status
217            data[ 'error_msg' ] = msg
218            url = "http://%s/api/samples/%s" % ( self.galaxy_host,
219                                                 api.encode_id(  self.config_id_secret, self.sample_id ) )
220            log.debug( str( ( self.api_key, url, data)))
221            retval = api.update( self.api_key, url, data, return_formatted=False )
222            log.debug( str( retval ) )
223        except urllib2.URLError, e:
224            log.debug( 'ERROR( sample_dataset_transfer_status ( %s ) ): %s' % ( url, str( e ) ) )
225            log.error( traceback.format_exc() )
226        except:
227            log.error( traceback.format_exc() )
228            log.error( 'FATAL ERROR' )
229            sys.exit( 1 )
230            
if __name__ == '__main__':
    # NOTE(review): sys.argv[1] is the XML request, which DataTransfer reads
    # the api_key and data_password from; it ends up in the log here.
    log.info( 'STARTING %i %s' % ( os.getpid(), str( sys.argv ) ) )
    # Both arguments are required: the XML request and the config file path.
    if len( sys.argv ) < 3:
        log.error( 'FATAL ERROR: expected the arguments <xml_message> <config_file>.' )
        sys.stderr.write( 'Usage: python data_transfer.py <xml_message> <config_file>\n' )
        sys.exit( 1 )
    #
    # Start the transfer; start() terminates the process itself, the final
    # sys.exit() is only a fallback.
    #
    dt = DataTransfer( sys.argv[1], sys.argv[2] )
    dt.start()
    sys.exit( 0 )
239