
/lib/galaxy/jobs/deferred/data_transfer.py

https://bitbucket.org/cistrome/cistrome-harvard/
  1. """
  2. Module for managing data transfer jobs.
  3. """
  4. import logging, urllib2, re, shutil
  5. from galaxy import eggs
  6. from sqlalchemy import and_
  7. from galaxy.util.odict import odict
  8. from galaxy.workflow.modules import module_factory
  9. from galaxy.jobs.actions.post import ActionBox
  10. from galaxy.jobs.deferred import FakeTrans
  11. from galaxy.tools.parameters import visit_input_values
  12. from galaxy.tools.parameters.basic import DataToolParameter
  13. from galaxy.datatypes import sniff
  14. log = logging.getLogger( __name__ )
  15. __all__ = [ 'DataTransfer' ]
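
# DataTransfer is the base class for the deferred data transfer job plugins.  create_job() and
# check_job() are stubs that raise "Unimplemented Method" and are expected to be provided by
# subclasses; run_job() implements the shared transfer logic.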
class DataTransfer( object ):
    check_interval = 15
    dataset_name_re = re.compile( '(dataset\d+)_(name)' )
    dataset_datatype_re = re.compile( '(dataset\d+)_(datatype)' )

    def __init__( self, app ):
        self.app = app
        self.sa_session = app.model.context.current

    def create_job( self, trans, **kwd ):
        raise Exception( "Unimplemented Method" )

    def check_job( self, job ):
        raise Exception( "Unimplemented Method" )
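
    # run_job() handles two deferred job types:
    #   'init_transfer'   - create a TransferJob per dataset, hand them to the app's transfer
    #                       manager, and call self.create_job() for each transfer that was started.
    #   'finish_transfer' - import the transferred file into the sample's library folder as an
    #                       LDDA and, if the sample has an associated workflow, execute it.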
    def run_job( self, job ):
        if job.params[ 'type' ] == 'init_transfer':
            # TODO: don't create new downloads on restart.
            if job.params[ 'protocol' ] in [ 'http', 'https' ]:
                results = []
                for result in job.params[ 'results' ].values():
                    result[ 'transfer_job' ] = self.app.transfer_manager.new( protocol=job.params[ 'protocol' ],
                                                                              name=result[ 'name' ],
                                                                              datatype=result[ 'datatype' ],
                                                                              url=result[ 'url' ] )
                    results.append( result )
            elif job.params[ 'protocol' ] == 'scp':
                results = []
                result = {}
                sample_datasets_dict = job.params[ 'sample_datasets_dict' ]
                # sample_datasets_dict looks something like the following. The outer dictionary keys are SampleDataset ids.
                # {'7': {'status': 'Not started', 'name': '3.bed', 'file_path': '/tmp/library/3.bed', 'sample_id': 7,
                #        'external_service_id': 2, 'error_msg': '', 'size': '8.0K'}}
                for sample_dataset_id, sample_dataset_info_dict in sample_datasets_dict.items():
                    result = {}
                    result[ 'transfer_job' ] = self.app.transfer_manager.new( protocol=job.params[ 'protocol' ],
                                                                              host=job.params[ 'host' ],
                                                                              user_name=job.params[ 'user_name' ],
                                                                              password=job.params[ 'password' ],
                                                                              sample_dataset_id=sample_dataset_id,
                                                                              status=sample_dataset_info_dict[ 'status' ],
                                                                              name=sample_dataset_info_dict[ 'name' ],
                                                                              file_path=sample_dataset_info_dict[ 'file_path' ],
                                                                              sample_id=sample_dataset_info_dict[ 'sample_id' ],
                                                                              external_service_id=sample_dataset_info_dict[ 'external_service_id' ],
                                                                              error_msg=sample_dataset_info_dict[ 'error_msg' ],
                                                                              size=sample_dataset_info_dict[ 'size' ] )
                    results.append( result )
            self.app.transfer_manager.run( [ r[ 'transfer_job' ] for r in results ] )
            for result in results:
                transfer_job = result.pop( 'transfer_job' )
                self.create_job( None,
                                 transfer_job_id=transfer_job.id,
                                 result=transfer_job.params,
                                 sample_id=job.params[ 'sample_id' ] )
                # Update the state of the relevant SampleDataset
                new_status = self.app.model.SampleDataset.transfer_status.IN_QUEUE
                self._update_sample_dataset_status( protocol=job.params[ 'protocol' ],
                                                    sample_id=job.params[ 'sample_id' ],
                                                    result_dict=transfer_job.params,
                                                    new_status=new_status,
                                                    error_msg='' )
            job.state = self.app.model.DeferredJob.states.OK
            self.sa_session.add( job )
            self.sa_session.flush()
            # TODO: Error handling: failure executing, or errors returned from the manager
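
        # Second phase: import the completed transfer into the sample's library folder and
        # update the SampleDataset / Sample / Request states accordingly.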
        if job.params[ 'type' ] == 'finish_transfer':
            protocol = job.params[ 'protocol' ]
            # Update the state of the relevant SampleDataset
            new_status = self.app.model.SampleDataset.transfer_status.ADD_TO_LIBRARY
            if protocol in [ 'http', 'https' ]:
                result_dict = job.params[ 'result' ]
                library_dataset_name = result_dict[ 'name' ]
                extension = result_dict[ 'datatype' ]
            elif protocol in [ 'scp' ]:
                # In this case, job.params will be a dictionary that contains a key named 'result'. The value
                # of the result key is a dictionary that looks something like:
                # {'sample_dataset_id': '8', 'status': 'Not started', 'protocol': 'scp', 'name': '3.bed',
                #  'file_path': '/data/library/3.bed', 'host': '127.0.0.1', 'sample_id': 8, 'external_service_id': 2,
                #  'local_path': '/tmp/kjl2Ss4', 'password': 'galaxy', 'user_name': 'gvk', 'error_msg': '', 'size': '8.0K'}
                try:
                    tj = self.sa_session.query( self.app.model.TransferJob ).get( int( job.params['transfer_job_id'] ) )
                    result_dict = tj.params
                    result_dict['local_path'] = tj.path
                except Exception, e:
                    log.error( "Updated transfer result unavailable, using old result. Error was: %s" % str( e ) )
                    result_dict = job.params[ 'result' ]
                library_dataset_name = result_dict[ 'name' ]
                # Determine the data format (see the relevant TODO item in the manual_data_transfer plugin).
                extension = sniff.guess_ext( result_dict[ 'local_path' ], sniff_order=self.app.datatypes_registry.sniff_order )
            self._update_sample_dataset_status( protocol=job.params[ 'protocol' ],
                                                sample_id=int( job.params[ 'sample_id' ] ),
                                                result_dict=result_dict,
                                                new_status=new_status,
                                                error_msg='' )
            sample = self.sa_session.query( self.app.model.Sample ).get( int( job.params[ 'sample_id' ] ) )
            ld = self.app.model.LibraryDataset( folder=sample.folder, name=library_dataset_name )
            self.sa_session.add( ld )
            self.sa_session.flush()
            self.app.security_agent.copy_library_permissions( FakeTrans( self.app ), sample.folder, ld )
            ldda = self.app.model.LibraryDatasetDatasetAssociation( name = library_dataset_name,
                                                                    extension = extension,
                                                                    dbkey = '?',
                                                                    library_dataset = ld,
                                                                    create_dataset = True,
                                                                    sa_session = self.sa_session )
            ldda.message = 'Transferred by the Data Transfer Plugin'
            self.sa_session.add( ldda )
            self.sa_session.flush()
            ldda.state = ldda.states.QUEUED  # flushed in the set property
            ld.library_dataset_dataset_association_id = ldda.id
            self.sa_session.add( ld )
            self.sa_session.flush()
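
            # Move the transferred file into the new dataset's location, then set its metadata.
            # Any failure below leaves the LDDA in the ERROR state.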
            try:
                # Move the dataset from its temporary location
                shutil.move( job.transfer_job.path, ldda.file_name )
                ldda.init_meta()
                for name, spec in ldda.metadata.spec.items():
                    if name not in [ 'name', 'info', 'dbkey', 'base_name' ]:
                        if spec.get( 'default' ):
                            setattr( ldda.metadata, name, spec.unwrap( spec.get( 'default' ) ) )
                self.app.datatypes_registry.set_external_metadata_tool.tool_action.execute( self.app.datatypes_registry.set_external_metadata_tool,
                                                                                            FakeTrans( self.app,
                                                                                                       history=sample.history,
                                                                                                       user=sample.request.user ),
                                                                                            incoming = { 'input1':ldda } )
                ldda.state = ldda.states.OK
                # TODO: not sure if this flush is necessary
                self.sa_session.add( ldda )
                self.sa_session.flush()
            except Exception, e:
                log.exception( 'Failure preparing library dataset for finished transfer job (id: %s) via deferred job (id: %s):' % \
                               ( str( job.transfer_job.id ), str( job.id ) ) )
                ldda.state = ldda.states.ERROR
            if sample.workflow:
                log.debug( "\n\nLogging sample mappings as: %s" % sample.workflow[ 'mappings' ] )
                log.debug( "job.params: %s" % job.params )
                # We have a workflow. Update all mappings to ldda's, and when the final one is done
                # execute_workflow with either the provided history, or a new one.
                sub_done = True
                rep_done = False
                for k, v in sample.workflow[ 'mappings' ].iteritems():
                    if not 'hda' in v and v[ 'ds_tag' ].startswith( 'hi|' ):
                        sample.workflow[ 'mappings' ][ k ][ 'hda' ] = self.app.security.decode_id( v[ 'ds_tag' ][3:] )
                for key, value in sample.workflow[ 'mappings' ].iteritems():
                    if 'url' in value and value[ 'url' ] == job.params[ 'result' ][ 'url' ]:
                        # DBTODO Make sure all ds| mappings get the URL of the dataset, for linking to later.
                        # If this dataset maps to what we just finished, update the ldda id in the sample.
                        sample.workflow[ 'mappings' ][ key ][ 'ldda' ] = ldda.id
                        rep_done = True
                    # DBTODO replace the hi| mappings with the hda here. Just rip off the first three chars.
                    elif not 'ldda' in value and not 'hda' in value:
                        # We're not done if some mappings still don't have ldda or hda mappings.
                        sub_done = False
                if sub_done and rep_done:
                    if not sample.history:
                        new_history = self.app.model.History( name="New History From %s" % sample.name, user=sample.request.user )
                        self.sa_session.add( new_history )
                        sample.history = new_history
                        self.sa_session.flush()
                    self._execute_workflow( sample )
                # Check the workflow for substitution done-ness
                self.sa_session.add( sample )
                self.sa_session.flush()
            elif sample.history:
                # We don't have a workflow, but a history was provided.
                # No processing, go ahead and chunk everything in the history.
                if ldda.dataset.state in [ 'new', 'upload', 'queued', 'running', 'empty', 'discarded' ]:
                    log.error( "Cannot import dataset '%s' to user history since its state is '%s'. " % ( ldda.name, ldda.dataset.state ) )
                elif ldda.dataset.state in [ 'ok', 'error' ]:
                    ldda.to_history_dataset_association( target_history=sample.history, add_to_history=True )
            # Finished
            job.state = self.app.model.DeferredJob.states.OK
            self.sa_session.add( job )
            self.sa_session.flush()
            # Update the state of the relevant SampleDataset
            new_status = self.app.model.SampleDataset.transfer_status.COMPLETE
            self._update_sample_dataset_status( protocol=job.params[ 'protocol' ],
                                                sample_id=int( job.params[ 'sample_id' ] ),
                                                result_dict=job.params[ 'result' ],
                                                new_status=new_status,
                                                error_msg='' )
            if sample.datasets and not sample.untransferred_dataset_files:
                # Update the state of the sample to the sample's request type's final state.
                new_state = sample.request.type.final_sample_state
                self._update_sample_state( sample.id, new_state )
                # Update the state of the request, if possible
                self._update_request_state( sample.request.id )
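
    # Note: _missing_params() is not called in this module; it looks like a convenience helper
    # for subclasses to validate that job.params contains every required key before running.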
    def _missing_params( self, params, required_params ):
        missing_params = filter( lambda x: x not in params, required_params )
        if missing_params:
            log.error( 'Job parameters missing required keys: %s' % ', '.join( missing_params ) )
            return True
        return False

    def _update_sample_dataset_status( self, protocol, sample_id, result_dict, new_status, error_msg=None ):
        # result_dict looks something like:
        # {'url': '127.0.0.1/data/filtered_subreads.fa', 'name': 'Filtered reads'}
        # Check if the new status is a valid transfer status
        valid_statuses = [ v[1] for v in self.app.model.SampleDataset.transfer_status.items() ]
        # TODO: error checking on valid new_status value
        if protocol in [ 'http', 'https' ]:
            sample_dataset = self.sa_session.query( self.app.model.SampleDataset ) \
                                            .filter( and_( self.app.model.SampleDataset.table.c.sample_id == sample_id,
                                                           self.app.model.SampleDataset.table.c.name == result_dict[ 'name' ],
                                                           self.app.model.SampleDataset.table.c.file_path == result_dict[ 'url' ] ) ) \
                                            .first()
        elif protocol in [ 'scp' ]:
            sample_dataset = self.sa_session.query( self.app.model.SampleDataset ).get( int( result_dict[ 'sample_dataset_id' ] ) )
        sample_dataset.status = new_status
        sample_dataset.error_msg = error_msg
        self.sa_session.add( sample_dataset )
        self.sa_session.flush()

    def _update_sample_state( self, sample_id, new_state, comment=None ):
        sample = self.sa_session.query( self.app.model.Sample ).get( sample_id )
        if comment is None:
            comment = 'Sample state set to %s' % str( new_state )
        event = self.app.model.SampleEvent( sample, new_state, comment )
        self.sa_session.add( event )
        self.sa_session.flush()

    def _update_request_state( self, request_id ):
        request = self.sa_session.query( self.app.model.Request ).get( request_id )
        # Make sure all the samples of the current request have the same state
        common_state = request.samples_have_common_state
        if not common_state:
            # If the current request state is complete and one of its samples moved from
            # the final sample state, then move the request state to In-progress
            if request.is_complete:
                message = "At least 1 sample state moved from the final sample state, so now the request's state is (%s)" % request.states.SUBMITTED
                event = self.app.model.RequestEvent( request, request.states.SUBMITTED, message )
                self.sa_session.add( event )
                self.sa_session.flush()
        else:
            final_state = False
            request_type_state = request.type.final_sample_state
            if common_state.id == request_type_state.id:
                # Since all the samples are in the final state, change the request state to 'Complete'
                comment = "All samples of this sequencing request are in the final sample state (%s). " % request_type_state.name
                state = request.states.COMPLETE
                final_state = True
            else:
                comment = "All samples of this sequencing request are in the (%s) sample state. " % common_state.name
                state = request.states.SUBMITTED
            event = self.app.model.RequestEvent( request, state, comment )
            self.sa_session.add( event )
            self.sa_session.flush()
            # TODO: handle email notification if it is configured to be sent when the samples are in this state.
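
    # _execute_workflow() copies each mapped library dataset into the sample's history as an
    # HDA, rebuilds the state of every workflow step, and then executes the steps in order,
    # recording the runs in a WorkflowInvocation.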
    def _execute_workflow( self, sample ):
        for key, value in sample.workflow['mappings'].iteritems():
            if 'hda' not in value and 'ldda' in value:
                # If HDA is already here, it's an external input, we're not copying anything.
                ldda = self.sa_session.query( self.app.model.LibraryDatasetDatasetAssociation ).get( value['ldda'] )
                if ldda.dataset.state in [ 'new', 'upload', 'queued', 'running', 'empty', 'discarded' ]:
                    log.error( "Cannot import dataset '%s' to user history since its state is '%s'. " % ( ldda.name, ldda.dataset.state ) )
                elif ldda.dataset.state in [ 'ok', 'error' ]:
                    hda = ldda.to_history_dataset_association( target_history=sample.history, add_to_history=True )
                    sample.workflow['mappings'][key]['hda'] = hda.id
                    self.sa_session.add( sample )
                    self.sa_session.flush()
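
        # The mapping keys may have been serialized as strings; rebuild the workflow dictionary
        # with int keys so they can be compared against step ids below.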
        workflow_dict = sample.workflow
        import copy
        new_wf_dict = copy.deepcopy(workflow_dict)
        for key in workflow_dict['mappings']:
            if not isinstance(key, int):
                new_wf_dict['mappings'][int(key)] = workflow_dict['mappings'][key]
        workflow_dict = new_wf_dict
        fk_trans = FakeTrans(self.app, history = sample.history, user=sample.request.user)
        workflow = self.sa_session.query(self.app.model.Workflow).get(workflow_dict['id'])
        if not workflow:
            log.error("Workflow mapping failure.")
            return
        if len( workflow.steps ) == 0:
            log.error( "Workflow cannot be run because it does not have any steps" )
            return
        if workflow.has_cycles:
            log.error( "Workflow cannot be run because it contains cycles" )
            return
        if workflow.has_errors:
            log.error( "Workflow cannot be run because of validation errors in some steps" )
            return
        # Build the state for each step
        errors = {}
        has_upgrade_messages = False
        has_errors = False
        # Build a fake dictionary prior to execution.
        # Prepare each step
        for step in workflow.steps:
            step.upgrade_messages = {}
            # Construct modules
            if step.type == 'tool' or step.type is None:
                # Restore the tool state for the step
                step.module = module_factory.from_workflow_step( fk_trans, step )
                # Fix any missing parameters
                step.upgrade_messages = step.module.check_and_update_state()
                if step.upgrade_messages:
                    has_upgrade_messages = True
                # Any connected input needs to have value DummyDataset (these
                # are not persisted so we need to do it every time)
                step.module.add_dummy_datasets( connections=step.input_connections )
                # Store state with the step
                step.state = step.module.state
                # Error dict
                if step.tool_errors:
                    has_errors = True
                    errors[step.id] = step.tool_errors
            else:
                ## Non-tool specific stuff?
                step.module = module_factory.from_workflow_step( fk_trans, step )
                step.state = step.module.get_runtime_state()
            # Connections by input name
            step.input_connections_by_name = dict( ( conn.input_name, conn ) for conn in step.input_connections )
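
        # Second pass over the steps: rebuild each tool module's state immediately before
        # execution and collect any per-step errors.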
        for step in workflow.steps:
            step.upgrade_messages = {}
            # Connections by input name
            step.input_connections_by_name = \
                dict( ( conn.input_name, conn ) for conn in step.input_connections )
            # Extract just the arguments for this step by prefix
            step_errors = None
            if step.type == 'tool' or step.type is None:
                module = module_factory.from_workflow_step( fk_trans, step )
                # Fix any missing parameters
                step.upgrade_messages = module.check_and_update_state()
                if step.upgrade_messages:
                    has_upgrade_messages = True
                # Any connected input needs to have value DummyDataset (these
                # are not persisted so we need to do it every time)
                module.add_dummy_datasets( connections=step.input_connections )
                # Get the tool
                tool = module.tool
                # Get the state
                step.state = state = module.state
                # Get old errors
                old_errors = state.inputs.pop( "__errors__", {} )
                if step_errors:
                    errors[step.id] = state.inputs["__errors__"] = step_errors
        # Run each step, connecting outputs to inputs
        workflow_invocation = self.app.model.WorkflowInvocation()
        workflow_invocation.workflow = workflow
        outputs = odict()
        for i, step in enumerate( workflow.steps ):
            job = None
            if step.type == 'tool' or step.type is None:
                tool = self.app.toolbox.tools_by_id[ step.tool_id ]
                def callback( input, value, prefixed_name, prefixed_label ):
                    if isinstance( input, DataToolParameter ):
                        if prefixed_name in step.input_connections_by_name:
                            conn = step.input_connections_by_name[ prefixed_name ]
                            return outputs[ conn.output_step.id ][ conn.output_name ]
                visit_input_values( tool.inputs, step.state.inputs, callback )
                job, out_data = tool.execute( fk_trans, step.state.inputs, history=sample.history )
                outputs[ step.id ] = out_data
                for pja in step.post_job_actions:
                    if pja.action_type in ActionBox.immediate_actions:
                        ActionBox.execute( self.app, self.sa_session, pja, job, replacement_dict=None )
                    else:
                        job.add_post_job_action( pja )
            else:
                job, out_data = step.module.execute( fk_trans, step.state )
                outputs[ step.id ] = out_data
                if step.id in workflow_dict['mappings']:
                    data = self.sa_session.query( self.app.model.HistoryDatasetAssociation ).get( workflow_dict['mappings'][str(step.id)]['hda'] )
                    outputs[ step.id ]['output'] = data
            workflow_invocation_step = self.app.model.WorkflowInvocationStep()
            workflow_invocation_step.workflow_invocation = workflow_invocation
            workflow_invocation_step.workflow_step = step
            workflow_invocation_step.job = job
        self.sa_session.add( workflow_invocation )
        self.sa_session.flush()