/tools/expression/upload.py

https://bitbucket.org/cistrome/cistrome-harvard/

#!/usr/bin/env python
# Processes uploads from the user.
# WARNING: Changes in this tool (particularly as related to parsing) may need
# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools
import urllib, sys, os, gzip, tempfile, shutil, re, zipfile, codecs, binascii
from galaxy import eggs
# need to import model before sniff to resolve a circular import dependency
import galaxy.model
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import *
from galaxy.datatypes.registry import Registry
from galaxy import util
from galaxy.util.json import *

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg, ret=1 ):
    sys.stderr.write( msg )
    sys.exit( ret )

def file_err( msg, dataset, json_file ):
    json_file.write( to_json_string( dict( type = 'dataset',
                                           ext = 'data',
                                           dataset_id = dataset.dataset_id,
                                           stderr = msg ) ) + "\n" )
    try:
        os.remove( dataset.path )
    except:
        pass
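# For reference, a failure record appended to galaxy.json by file_err() looks roughly like
# this (values are illustrative):
#   {"type": "dataset", "ext": "data", "dataset_id": 42, "stderr": "The uploaded file is empty"}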

def safe_dict(d):
    """
    Recursively clone json structure with UTF-8 dictionary keys
    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/
    """
    if isinstance(d, dict):
        return dict([(k.encode('utf-8'), safe_dict(v)) for k, v in d.iteritems()])
    elif isinstance(d, list):
        return [safe_dict(x) for x in d]
    else:
        return d
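# Example (hypothetical input): safe_dict( {u'file_type': u'auto', u'paths': [u'/tmp/a']} )
# returns {'file_type': u'auto', 'paths': [u'/tmp/a']} -- only the keys are encoded, so the
# result can be splatted into util.bunch.Bunch( **... ) in __main__ below.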

def check_html( temp_name, chunk=None ):
    if chunk is None:
        temp = open( temp_name, "U" )
    else:
        # A pre-read chunk is a string; iterate over its lines rather than its characters
        temp = chunk.splitlines()
    regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
    regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
    regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
    regexp4 = re.compile( "<META[^>]*>", re.I )
    regexp5 = re.compile( "<SCRIPT[^>]*>", re.I )
    lineno = 0
    for line in temp:
        lineno += 1
        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
        if matches:
            if chunk is None:
                temp.close()
            return True
        if lineno > 100:
            break
    if chunk is None:
        temp.close()
    return False

def check_binary( temp_name ):
    is_binary = False
    temp = open( temp_name, "U" )
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if ord( char ) > 128:
                is_binary = True
                break
            if chars_read > 100:
                break
        if chars_read > 100:
            break
    temp.close()
    return is_binary

def check_bam( temp_name ):
    return Bam().sniff( temp_name )

def check_sff( temp_name ):
    return Sff().sniff( temp_name )

def check_gzip( temp_name ):
    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
    # Make sure we have a gzipped file
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 2 )
        temp.close()
        if magic_check != util.gzip_magic:
            return ( False, False )
    except:
        return ( False, False )
    # We support some binary data types, so check if the compressed binary file is valid
    # If the file is Bam, it should already have been detected as such, so we'll just check
    # for sff format.
    try:
        header = gzip.open( temp_name ).read( 4 )
        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
            return ( True, True )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32 KB
    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
    chunk = gzipped_file.read( CHUNK_SIZE )
    gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )
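# check_gzip() outcomes: ( False, False ) means the file is not gzip data; ( True, False )
# means gzip whose decompressed content looks like HTML and is rejected; ( True, True )
# means gzip that add_file() will go on to decompress in place.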

def check_zip( temp_name ):
    # Return: ( is_zip, known_ext, exactly_one_pheno, gt_one, homogeneous, ext )
    if not zipfile.is_zipfile( temp_name ):
        return ( False, False, False, False, False, None )
    zip_file = zipfile.ZipFile( temp_name, "r" )
    # Make sure the archive consists of valid files. The current rules are:
    # 1. The file types in the zip are homogeneous, except that there is exactly one .txt pheno file
    # 2. The rest of the files must be either .cel or .xys
    # 3. There must be at least two .cel or .xys files
    hasPheno = False
    count = 0
    test_ext = None
    for name in zip_file.namelist():
        # Take the last extension so that folder entries inside the zip are handled
        #ext = name.split(".")[1].strip().lower()
        ext = os.path.splitext( name )[1].strip().lower().replace( ".", "" )
        if ext == "":
            # ignore folder entries
            continue
        count += 1
        if ext not in ( "txt", "cel", "xys" ):
            return ( True, False, False, False, False, ext )
        if ext == "txt":
            if hasPheno:
                return ( True, True, False, False, False, None )
            else:
                hasPheno = True
        elif test_ext is None:
            test_ext = ext
        elif ext != test_ext:
            return ( True, True, True, True, False, None )
    zip_file.close()
    return ( True, True, hasPheno, ( count >= 3 ), True, test_ext )
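# Example (hypothetical archive): a zip containing pheno.txt, a.cel and b.cel yields
# ( True, True, True, True, True, 'cel' ); if one of the data files is a .xys instead,
# the homogeneity check fails and the result is ( True, True, True, True, False, None ).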

def parse_outputs( args ):
    rval = {}
    for arg in args:
        id, files_path, path = arg.split( ':', 2 )
        rval[int( id )] = ( path, files_path )
    return rval
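# Example (illustrative paths): parse_outputs( [ '1:/data/dataset_1_files:/data/dataset_1.dat' ] )
# returns { 1: ( '/data/dataset_1.dat', '/data/dataset_1_files' ) } -- note the
# ( path, files_path ) ordering relied on by __main__ below.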

def add_file( dataset, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get( 'link_data_only', 'copy_files' )
    if dataset.type == 'url':
        try:
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
        except UnicodeDecodeError, e:
            dataset.is_multi_byte = False
    # Is dataset content multi-byte?
    if dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
    # Is dataset content supported sniffable binary?
    elif check_bam( dataset.path ):
        ext = 'bam'
        data_type = 'bam'
    elif check_sff( dataset.path ):
        ext = 'sff'
        data_type = 'sff'
    else:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
            CHUNK_SIZE = 2**20 # 1 MB
            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
            gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
            while 1:
                try:
                    chunk = gzipped_file.read( CHUNK_SIZE )
                except IOError:
                    os.close( fd )
                    os.remove( uncompressed )
                    file_err( 'Problem decompressing gzipped data', dataset, json_file )
                    return
                if not chunk:
                    break
                os.write( fd, chunk )
            os.close( fd )
            gzipped_file.close()
            # Replace the gzipped file with the decompressed file
            shutil.move( uncompressed, dataset.path )
            # Strip the literal '.gz' suffix from the dataset name; str.rstrip( '.gz' )
            # would strip any trailing '.', 'g' or 'z' characters rather than the suffix
            if dataset.name.endswith( '.gz' ):
                dataset.name = dataset.name[:-len( '.gz' )]
            data_type = 'gzip'
        if not data_type:
            # See if we have a zip archive
            is_zipped, known_ext, one_pheno, gt_one, homogeneous, test_ext = check_zip( dataset.path )
            # Abort on the first failed validation check
            if not is_zipped:
                file_err( "CEL or NimbleGen files must be zipped.", dataset, json_file )
                return
            if not known_ext:
                file_err( "Unknown file type in zip: " + test_ext, dataset, json_file )
                return
            if not one_pheno:
                file_err( "There must be exactly one .txt pheno file in the zip.", dataset, json_file )
                return
            if not gt_one:
                file_err( "There must be more than one CEL or XYS file in the zip.", dataset, json_file )
                return
            if not homogeneous:
                file_err( "Apart from the .txt pheno file, the other files must be all CEL or all XYS.", dataset, json_file )
                return
            data_type = 'zip'
            if test_ext == 'cel':
                ext = 'cel.zip'
                file_type = 'cel.zip'
            else:
                ext = 'xys.zip'
                file_type = 'xys.zip'
        if not data_type:
            if check_binary( dataset.path ):
                # We have a binary dataset, but it is not Bam or Sff
                data_type = 'binary'
                #binary_ok = False
                parts = dataset.name.split( "." )
                if len( parts ) > 1:
                    ext = parts[1].strip().lower()
                    if ext not in unsniffable_binary_formats:
                        file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
                        return
                    elif ext in unsniffable_binary_formats and dataset.file_type != ext:
                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                        file_err( err_msg, dataset, json_file )
                        return
        if not data_type:
            # We must have a text file
            if check_html( dataset.path ):
                file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
                return
    if data_type != 'binary' and data_type != 'zip':
        # don't convert newlines on data we're only going to symlink
        if link_data_only == 'copy_files':
            in_place = True
            if dataset.type in ( 'server_dir', 'path_paste' ):
                in_place = False
            if dataset.space_to_tab:
                line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
            else:
                line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext( dataset.path )
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    # Move the dataset to its "real" path
    if link_data_only == 'link_to_files':
        pass # data will remain in place
    elif dataset.type in ( 'server_dir', 'path_paste' ):
        if converted_path is not None:
            shutil.copy( converted_path, output_path )
            try:
                os.remove( converted_path )
            except:
                pass
        else:
            # this should not happen, but it's here just in case
            shutil.copy( dataset.path, output_path )
    else:
        shutil.move( dataset.path, output_path )
    # Write the job info
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 ext = ext,
                 stdout = 'uploaded %s file' % data_type,
                 name = dataset.name,
                 line_count = line_count )
    json_file.write( to_json_string( info ) + "\n" )
    # Groom the dataset content if necessary
    datatype = Registry().get_datatype_by_extension( ext )
    datatype.groom_dataset_content( output_path )
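# On success, add_file() appends a record like the following to galaxy.json
# (values are illustrative):
#   {"type": "dataset", "dataset_id": 42, "ext": "cel.zip", "stdout": "uploaded zip file",
#    "name": "experiment.zip", "line_count": null}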

def add_composite_file( dataset, json_file, output_path, files_path ):
    if dataset.composite_files:
        os.mkdir( files_path )
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch( **value )
            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
                break
            elif dataset.composite_file_paths[ value.name ] is not None:
                if not value.is_binary:
                    # 'value' holds this composite file's definition, including its space_to_tab flag
                    if value.space_to_tab:
                        sniff.convert_newlines_sep2tabs( dataset.composite_file_paths[ value.name ][ 'path' ] )
                    else:
                        sniff.convert_newlines( dataset.composite_file_paths[ value.name ][ 'path' ] )
                shutil.move( dataset.composite_file_paths[ value.name ][ 'path' ], os.path.join( files_path, name ) )
    # Move the dataset to its "real" path
    shutil.move( dataset.primary_file, output_path )
    # Write the job info
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 stdout = 'uploaded %s file' % dataset.file_type )
    json_file.write( to_json_string( info ) + "\n" )

def __main__():
    if len( sys.argv ) < 2:
        print >>sys.stderr, 'usage: upload.py <json paramfile> <output spec> ...'
        sys.exit( 1 )
    output_paths = parse_outputs( sys.argv[2:] )
    json_file = open( 'galaxy.json', 'w' )
    for line in open( sys.argv[1], 'r' ):
        dataset = from_json_string( line )
        dataset = util.bunch.Bunch( **safe_dict( dataset ) )
        try:
            output_path = output_paths[ int( dataset.dataset_id ) ][0]
        except:
            print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
            sys.exit( 1 )
        if dataset.type == 'composite':
            files_path = output_paths[ int( dataset.dataset_id ) ][1]
            add_composite_file( dataset, json_file, output_path, files_path )
        else:
            add_file( dataset, json_file, output_path )
    # clean up paramfile
    try:
        os.remove( sys.argv[1] )
    except:
        pass

if __name__ == '__main__':
    __main__()
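
# Example invocation (ids, paths and field values below are illustrative, not taken from
# this repository):
#   python upload.py params.json 1:/galaxy/job/dataset_1_files:/galaxy/job/dataset_1.dat
# where each line of params.json is one JSON object describing an upload, carrying the
# fields this script reads, e.g.:
#   {"type": "file", "dataset_id": 1, "name": "samples.zip", "path": "/tmp/upload_1",
#    "file_type": "auto", "ext": "auto", "space_to_tab": false, "link_data_only": "copy_files"}
# One result record per dataset is written to galaxy.json in the working directory.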