/tools/data_source/data_source.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 115 lines

#!/usr/bin/env python
# Retrieves data from external data source applications and stores in a dataset file.
# Data source application parameters are temporarily stored in the dataset file.
import socket, urllib, sys, os
from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg...
from galaxy.util.json import from_json_string, to_json_string
from galaxy.util import get_charset_from_http_headers
import galaxy.model # need to import model before sniff to resolve a circular import dependency
from galaxy.datatypes import sniff
from galaxy.datatypes.registry import Registry
from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_FILE

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit()

GALAXY_PARAM_PREFIX = 'GALAXY'
GALAXY_ROOT_DIR = os.path.realpath( os.path.join( os.path.split( os.path.realpath( __file__ ) )[0], '..', '..' ) )
GALAXY_DATATYPES_CONF_FILE = os.path.join( GALAXY_ROOT_DIR, 'datatypes_conf.xml' )
def load_input_parameters( filename, erase_file = True ):
    datasource_params = {}
    try:
        json_params = from_json_string( open( filename, 'r' ).read() )
        datasource_params = json_params.get( 'param_dict' )
    except:
        json_params = None
        for line in open( filename, 'r' ):
            try:
                line = line.strip()
                fields = line.split( '\t' )
                datasource_params[ fields[0] ] = fields[1]
            except:
                continue
    if erase_file:
        open( filename, 'w' ).close() #open file for writing, then close, removes params from file
    return json_params, datasource_params
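# Illustrative sketch of the two parameter-file formats handled above (the URLs are
# hypothetical placeholders; 'param_dict', 'output_data', 'job_config', 'URL' and
# 'URL_method' are the keys the code actually reads):
#
#   JSON form (newer data source tools):
#     { "param_dict": { "URL": "http://example.org/fetch?id=1", "URL_method": "get" },
#       "output_data": [ ... ], "job_config": { ... } }
#
#   Legacy tabular form, one tab-separated name/value pair per line:
#     URL<TAB>http://example.org/fetch?id=1
#     URL_method<TAB>get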
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int( sys.argv[2] )
    except:
        max_file_size = 0
    job_params, params = load_input_parameters( filename )
    if job_params is None: #using an older tabular file
        enhanced_handling = False
        job_params = dict( param_dict = params )
        job_params[ 'output_data' ] = [ dict( out_data_name = 'output',
                                              ext = 'data',
                                              file_name = filename,
                                              extra_files_path = None ) ]
        job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE = TOOL_PROVIDED_JOB_METADATA_FILE )
    else:
        enhanced_handling = True
        json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' ) #specially named file for output junk to pass onto set metadata
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir = job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )

    URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library. As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout( 600 )
    for data_dict in job_params[ 'output_data' ]:
        cur_filename = data_dict.get( 'file_name', filename )
        cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL )
        if not cur_URL:
            open( cur_filename, 'w' ).write( "" )
            stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )

        # The following calls to urllib.urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urllib.urlopen( cur_URL )
            elif URL_method == 'post':
                page = urllib.urlopen( cur_URL, urllib.urlencode( params ) )
        except Exception, e:
            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )

        if max_file_size:
            file_size = int( page.info().get( 'Content-Length', 0 ) )
            if file_size > max_file_size:
                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )

        #do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
        except Exception, e:
            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )

        #here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext = data_dict[ 'ext' ], is_multi_byte = is_multi_byte )
            except Exception, e:
                stop_err( str( e ) )
            info = dict( type = 'dataset',
                         dataset_id = data_dict[ 'dataset_id' ],
                         ext = ext )
            json_file.write( "%s\n" % to_json_string( info ) )

if __name__ == "__main__": __main__()
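Usage sketch (the path and size below are illustrative, not taken from the source): the script expects the dataset/parameters file as its first argument and an optional maximum download size in bytes as its second, e.g.

    python data_source.py /galaxy/database/files/dataset_1.dat 50000000

The parameters file is the dataset file itself, written before the job runs; the script reads the parameters from it via load_input_parameters(), erases them, and then streams the downloaded data into that same path.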