/tools/data_source/upload.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 569 lines · 486 code · 24 blank · 59 comment · 9a079bdb5122a188944cfb359c048189 MD5

#!/usr/bin/env python
# Processes uploads from the user.

# WARNING: Changes in this tool (particularly as related to parsing) may need
# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools

import urllib, sys, os, gzip, tempfile, shutil, re, zipfile, codecs, binascii

from galaxy import eggs
# need to import model before sniff to resolve a circular import dependency
import galaxy.model
from galaxy.datatypes.checkers import *
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import *
from galaxy.datatypes.images import Pdf
from galaxy.datatypes.registry import Registry
from galaxy import util
from galaxy.datatypes.util.image_util import *
from galaxy.util.json import *

try:
    import Image as PIL
except ImportError:
    try:
        from PIL import Image as PIL
    except:
        PIL = None

try:
    import bz2
except:
    bz2 = None

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg, ret=1 ):
    sys.stderr.write( msg )
    sys.exit( ret )

def file_err( msg, dataset, json_file ):
    json_file.write( to_json_string( dict( type = 'dataset',
                                           ext = 'data',
                                           dataset_id = dataset.dataset_id,
                                           stderr = msg ) ) + "\n" )
    # never remove a server-side upload
    if dataset.type in ( 'server_dir', 'path_paste' ):
        return
    try:
        os.remove( dataset.path )
    except:
        pass

def safe_dict(d):
    """
    Recursively clone json structure with UTF-8 dictionary keys
    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/
    """
    if isinstance(d, dict):
        return dict([(k.encode('utf-8'), safe_dict(v)) for k,v in d.iteritems()])
    elif isinstance(d, list):
        return [safe_dict(x) for x in d]
    else:
        return d
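
# Illustration (not part of the original file): __main__() below passes the
# result through util.bunch.Bunch( **safe_dict( dataset ) ) because Python 2
# does not accept unicode keyword-argument names, e.g.
#   safe_dict( { u'type': u'file', u'ext': u'auto' } )
#   -> { 'type': u'file', 'ext': u'auto' }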

def check_html( temp_name, chunk=None ):
    if chunk is None:
        temp = open(temp_name, "U")
    else:
        temp = chunk
    regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
    regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
    regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
    regexp4 = re.compile( "<META[^>]*>", re.I )
    regexp5 = re.compile( "<SCRIPT[^>]*>", re.I )
    lineno = 0
    for line in temp:
        lineno += 1
        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
        if matches:
            if chunk is None:
                temp.close()
            return True
        if lineno > 100:
            break
    if chunk is None:
        temp.close()
    return False

def check_binary( temp_name ):
    is_binary = False
    temp = open( temp_name, "U" )
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if ord( char ) > 128:
                is_binary = True
                break
            if chars_read > 100:
                break
        if chars_read > 100:
            break
    temp.close()
    return is_binary

def check_bam( file_path ):
    return Bam().sniff( file_path )

def check_sff( file_path ):
    return Sff().sniff( file_path )

def check_pdf( file_path ):
    return Pdf().sniff( file_path )

def check_bigwig( file_path ):
    return BigWig().sniff( file_path )

def check_bigbed( file_path ):
    return BigBed().sniff( file_path )

def check_cel( filename ):
    return Cel().sniff( filename )

def check_gzip( temp_name ):
    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
    # Make sure we have a gzipped file
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 2 )
        temp.close()
        if magic_check != util.gzip_magic:
            return ( False, False )
    except:
        return ( False, False )
    # We support some binary data types, so check if the compressed binary file is valid
    # If the file is Bam, it should already have been detected as such, so we'll just check
    # for sff format.
    try:
        header = gzip.open( temp_name ).read(4)
        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
            return ( True, True )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
    chunk = gzipped_file.read( CHUNK_SIZE )
    gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )

def check_bz2( temp_name ):
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 3 )
        temp.close()
        if magic_check != util.bz2_magic:
            return ( False, False )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    bzipped_file = bz2.BZ2File( temp_name, mode='rb' )
    chunk = bzipped_file.read( CHUNK_SIZE )
    bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )

def check_zip( temp_name ):
    if zipfile.is_zipfile( temp_name ):
        return True
    return False

def check_zip_for_expression( temp_name ):
    # Return: (is_zip, known_ext, exactly_one_pheno, gt_one, homogeneous, ext)
    if not zipfile.is_zipfile( temp_name ):
        return (False, False, False, False, False, None)
    zip_file = zipfile.ZipFile( temp_name, "r" )
    # Make sure the archive consists of valid files. The current rules are:
    # 1. The file type in the zip is homogeneous, except that there is exactly one .txt pheno file
    # 2. The rest of the files must be either .cel or .xys
    # 3. There must be at least two .cel or .xys
    hasPheno = False
    count = 0
    test_ext = None
    for name in zip_file.namelist():
        fileBaseName = os.path.basename(name)
        if (fileBaseName == "" or fileBaseName.startswith(".") or name.startswith("__MACOSX")):
            # ignore folder names, hidden files in *nix, and extra resource forks added by Mac OS X ZIP software.
            continue
        # Reason: modification to support folders in zip files
        #ext = name.split(".")[1].strip().lower()
        ext = os.path.splitext( name )[1].strip().lower().replace(".","")
        count += 1
        if (not (ext == "txt" or ext == "cel" or ext == "xys")):
            #return (True, False, False, False, False, ext)
            continue
        if (ext == "txt"):
            if (hasPheno):
                return (True, True, False, False, False, None)
            else:
                hasPheno = True
        elif (test_ext is None):
            test_ext = ext
        elif (ext != test_ext):
            return (True, True, True, True, False, None)
    zip_file.close()
    return ( True, True, hasPheno, (count >= 3), True, test_ext )
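
# Illustration (hypothetical member names, not part of the original): an
# archive holding pheno.txt, sample1.cel and sample2.cel satisfies the rules
# above and returns (True, True, True, True, True, 'cel'); mixing .cel and
# .xys members trips the homogeneity check and returns
# (True, True, True, True, False, None).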

def parse_outputs( args ):
    rval = {}
    for arg in args:
        id, files_path, path = arg.split( ':', 2 )
        rval[int( id )] = ( path, files_path )
    return rval
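
# Illustration (hypothetical values, not part of the original): each trailing
# command-line argument has the form "<dataset_id>:<files_path>:<path>", so
#   parse_outputs( [ '42:/tmp/files_42:/tmp/out_42.dat' ] )
# returns { 42: ( '/tmp/out_42.dat', '/tmp/files_42' ) }.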

def add_file( dataset, registry, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get( 'link_data_only', 'copy_files' )
    in_place = dataset.get( 'in_place', True )
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file )
        return
    if dataset.type == 'url':
        try:
            page = urllib.urlopen( dataset.path ) #page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
        except UnicodeDecodeError, e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    image = check_image( dataset.path )
    if image:
        if not PIL:
            image = None
        # get_image_ext() returns None if not a supported Image type
        ext = get_image_ext( dataset.path, image )
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
    # Is dataset content supported sniffable binary?
    elif check_bam( dataset.path ):
        ext = 'bam'
        data_type = 'bam'
    elif check_sff( dataset.path ):
        ext = 'sff'
        data_type = 'sff'
    elif check_pdf( dataset.path ):
        ext = 'pdf'
        data_type = 'pdf'
    elif check_bigwig( dataset.path ):
        ext = 'bigwig'
        data_type = 'bigwig'
    elif check_bigbed( dataset.path ):
        ext = 'bigbed'
        data_type = 'bigbed'
    elif check_cel( dataset.name ):
        ext = 'cel'
        data_type = 'cel'
    else:
        type_info = Binary.is_sniffable_binary( dataset.path )
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            if link_data_only == 'copy_files':
                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                CHUNK_SIZE = 2**20 # 1Mb
                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
                while 1:
                    try:
                        chunk = gzipped_file.read( CHUNK_SIZE )
                    except IOError:
                        os.close( fd )
                        os.remove( uncompressed )
                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
                        return
                    if not chunk:
                        break
                    os.write( fd, chunk )
                os.close( fd )
                gzipped_file.close()
                # Replace the gzipped file with the decompressed file if it's safe to do so
                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                    dataset.path = uncompressed
                else:
                    shutil.move( uncompressed, dataset.path )
                os.chmod(dataset.path, 0644)
            dataset.name = dataset.name.rstrip( '.gz' )
            data_type = 'gzip'
        if not data_type and bz2 is not None:
            # See if we have a bz2 file, much like gzip
            is_bzipped, is_valid = check_bz2( dataset.path )
            if is_bzipped and not is_valid:
                file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
                return
            elif is_bzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file
                    CHUNK_SIZE = 2**20 # 1Mb
                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
                    while 1:
                        try:
                            chunk = bzipped_file.read( CHUNK_SIZE )
                        except IOError:
                            os.close( fd )
                            os.remove( uncompressed )
                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
                            return
                        if not chunk:
                            break
                        os.write( fd, chunk )
                    os.close( fd )
                    bzipped_file.close()
                    # Replace the bzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move( uncompressed, dataset.path )
                    os.chmod(dataset.path, 0644)
                dataset.name = dataset.name.rstrip( '.bz2' )
                data_type = 'bz2'
        if not data_type:
            # See if we have a zip archive for expression data
            is_zipped_for_expression, known_ext, one_pheno, gt_one, homogeneous, test_ext = check_zip_for_expression( dataset.path )
            if (not is_zipped_for_expression):
                pass
            else:
                if (not one_pheno):
                    file_err("There must be exactly one .txt pheno file in the zip.", dataset, json_file)
                if (not gt_one):
                    file_err("There must be more than one CEL or XYS file in the zip.", dataset, json_file)
                if (not homogeneous):
                    file_err("Except for the .txt pheno file, the other files must be all CEL or all XYS.", dataset, json_file)
                data_type = 'zip_for_expression'
                if (test_ext == 'cel'):
                    ext = 'cel.zip'
                    file_type = 'cel.zip'
                else:
                    ext = 'xys.zip'
                    file_type = 'xys.zip'
        if not data_type:
            # See if we have a zip archive
            is_zipped = check_zip( dataset.path )
            if is_zipped:
                if link_data_only == 'copy_files':
                    CHUNK_SIZE = 2**20 # 1Mb
                    uncompressed = None
                    uncompressed_name = None
                    unzipped = False
                    z = zipfile.ZipFile( dataset.path )
                    for name in z.namelist():
                        if name.endswith('/'):
                            continue
                        if unzipped:
                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                            break
                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
                        if sys.version_info[:2] >= ( 2, 6 ):
                            zipped_file = z.open( name )
                            while 1:
                                try:
                                    chunk = zipped_file.read( CHUNK_SIZE )
                                except IOError:
                                    os.close( fd )
                                    os.remove( uncompressed )
                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
                                    return
                                if not chunk:
                                    break
                                os.write( fd, chunk )
                            os.close( fd )
                            zipped_file.close()
                            uncompressed_name = name
                            unzipped = True
                        else:
                            # python < 2.5 doesn't have a way to read members in chunks(!)
                            try:
                                outfile = open( uncompressed, 'wb' )
                                outfile.write( z.read( name ) )
                                outfile.close()
                                uncompressed_name = name
                                unzipped = True
                            except IOError:
                                os.close( fd )
                                os.remove( uncompressed )
                                file_err( 'Problem decompressing zipped data', dataset, json_file )
                                return
                    z.close()
                    # Replace the zipped file with the decompressed file if it's safe to do so
                    if uncompressed is not None:
                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move( uncompressed, dataset.path )
                        os.chmod(dataset.path, 0644)
                        dataset.name = uncompressed_name
                data_type = 'zip'
        if not data_type:
            if check_binary( dataset.path ):
                # We have a binary dataset, but it is not Bam, Sff or Pdf
                data_type = 'binary'
                #binary_ok = False
                parts = dataset.name.split( "." )
                if len( parts ) > 1:
                    ext = parts[-1].strip().lower()
                    if not Binary.is_ext_unsniffable(ext):
                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
                        return
                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                        file_err( err_msg, dataset, json_file )
                        return
        if not data_type:
            # We must have a text file
            if check_html( dataset.path ):
                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
                return
        if data_type != 'binary' and data_type != 'zip_for_expression':
            if link_data_only == 'copy_files':
                if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
                    in_place = False
                # Convert universal line endings to Posix line endings, but allow the user to turn it off,
                # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
                # corrupting the content of those files.
                if dataset.to_posix_lines:
                    if dataset.space_to_tab:
                        line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
                    else:
                        line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
            if dataset.file_type == 'auto':
                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
            else:
                ext = dataset.file_type
            data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension( ext )
    if dataset.type in ( 'server_dir', 'path_paste' ) and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming( dataset.path ):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err( err_msg, dataset, json_file )
            return
    if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy( converted_path, output_path )
            try:
                os.remove( converted_path )
            except:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy( dataset.path, output_path )
    elif link_data_only == 'copy_files':
        shutil.move( dataset.path, output_path )
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 ext = ext,
                 stdout = stdout,
                 name = dataset.name,
                 line_count = line_count )
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write( to_json_string( info ) + "\n" )
    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content( output_path )
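
# Illustration (hypothetical values, not part of the original): on success,
# add_file() appends one JSON line per dataset to galaxy.json, e.g.
#   {"type": "dataset", "dataset_id": 42, "ext": "bed", "stdout": "uploaded bed file",
#    "name": "regions.bed", "line_count": 1000}
# which the framework reads back as the job info for the upload.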

def add_composite_file( dataset, registry, json_file, output_path, files_path ):
    if dataset.composite_files:
        os.mkdir( files_path )
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch( **value )
            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name][ 'path' ]
                isurl = dp.find('://') != -1 # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' )
                    except Exception, e:
                        file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
                        sniff.convert_newlines_sep2tabs( dp )
                    else:
                        sniff.convert_newlines( dp )
                shutil.move( dp, os.path.join( files_path, name ) )
    # Move the dataset to its "real" path
    shutil.move( dataset.primary_file, output_path )
    # Write the job info
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 stdout = 'uploaded %s file' % dataset.file_type )
    json_file.write( to_json_string( info ) + "\n" )

def __main__():
    if len( sys.argv ) < 4:
        print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
        sys.exit( 1 )
    output_paths = parse_outputs( sys.argv[4:] )
    json_file = open( 'galaxy.json', 'w' )
    registry = Registry()
    registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
    for line in open( sys.argv[3], 'r' ):
        dataset = from_json_string( line )
        dataset = util.bunch.Bunch( **safe_dict( dataset ) )
        try:
            output_path = output_paths[int( dataset.dataset_id )][0]
        except:
            print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
            sys.exit( 1 )
        if dataset.type == 'composite':
            files_path = output_paths[int( dataset.dataset_id )][1]
            add_composite_file( dataset, registry, json_file, output_path, files_path )
        else:
            add_file( dataset, registry, json_file, output_path )
    # clean up paramfile
    # TODO: this will not work when running as the actual user unless the
    # parent directory is writable by the user.
    try:
        os.remove( sys.argv[3] )
    except:
        pass

if __name__ == '__main__':
    __main__()
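
# Illustration (hypothetical paths, not part of the original): a typical
# invocation, matching the usage string above, is
#   python upload.py <galaxy_root> <datatypes_conf.xml> <json paramfile> \
#       42:/tmp/job/files_42:/tmp/job/out_42.dat
# where the paramfile holds one JSON-encoded dataset description per line and
# each trailing argument is an output spec consumed by parse_outputs().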