/tools/data_source/upload.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 569 lines · 486 code · 24 blank · 59 comment · 9a079bdb5122a188944cfb359c048189 MD5

#!/usr/bin/env python
# Processes uploads from the user.

# WARNING: Changes in this tool (particularly as related to parsing) may need
# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools

import urllib, sys, os, gzip, tempfile, shutil, re, zipfile, codecs, binascii

from galaxy import eggs
# need to import model before sniff to resolve a circular import dependency
import galaxy.model
from galaxy.datatypes.checkers import *
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import *
from galaxy.datatypes.images import Pdf
from galaxy.datatypes.registry import Registry
from galaxy import util
from galaxy.datatypes.util.image_util import *
from galaxy.util.json import *

try:
    import Image as PIL
except ImportError:
    try:
        from PIL import Image as PIL
    except:
        PIL = None

try:
    import bz2
except:
    bz2 = None

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg, ret=1 ):
    sys.stderr.write( msg )
    sys.exit( ret )

def file_err( msg, dataset, json_file ):
    json_file.write( to_json_string( dict( type = 'dataset',
                                           ext = 'data',
                                           dataset_id = dataset.dataset_id,
                                           stderr = msg ) ) + "\n" )
    # never remove a server-side upload
    if dataset.type in ( 'server_dir', 'path_paste' ):
        return
    try:
        os.remove( dataset.path )
    except:
        pass

def safe_dict(d):
    """
    Recursively clone json structure with UTF-8 dictionary keys
    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/
    """
    if isinstance(d, dict):
        return dict([(k.encode('utf-8'), safe_dict(v)) for k,v in d.iteritems()])
    elif isinstance(d, list):
        return [safe_dict(x) for x in d]
    else:
        return d
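
# Illustration (not part of the original file): __main__() below passes the
# result through util.bunch.Bunch( **safe_dict( dataset ) ) because Python 2
# does not accept unicode keyword-argument names, e.g.
#   safe_dict( { u'type': u'file', u'ext': u'auto' } )
#   -> { 'type': u'file', 'ext': u'auto' }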

def check_html( temp_name, chunk=None ):
    if chunk is None:
        temp = open(temp_name, "U")
    else:
        temp = chunk
    regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
    regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
    regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
    regexp4 = re.compile( "<META[^>]*>", re.I )
    regexp5 = re.compile( "<SCRIPT[^>]*>", re.I )
    lineno = 0
    for line in temp:
        lineno += 1
        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
        if matches:
            if chunk is None:
                temp.close()
            return True
        if lineno > 100:
            break
    if chunk is None:
        temp.close()
    return False

def check_binary( temp_name ):
    is_binary = False
    temp = open( temp_name, "U" )
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if ord( char ) > 128:
                is_binary = True
                break
            if chars_read > 100:
                break
        if chars_read > 100:
            break
    temp.close()
    return is_binary

def check_bam( file_path ):
    return Bam().sniff( file_path )

def check_sff( file_path ):
    return Sff().sniff( file_path )

def check_pdf( file_path ):
    return Pdf().sniff( file_path )

def check_bigwig( file_path ):
    return BigWig().sniff( file_path )

def check_bigbed( file_path ):
    return BigBed().sniff( file_path )

def check_cel( filename ):
    return Cel().sniff( filename )

def check_gzip( temp_name ):
    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
    # Make sure we have a gzipped file
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 2 )
        temp.close()
        if magic_check != util.gzip_magic:
            return ( False, False )
    except:
        return ( False, False )
    # We support some binary data types, so check if the compressed binary file is valid
    # If the file is Bam, it should already have been detected as such, so we'll just check
    # for sff format.
    try:
        header = gzip.open( temp_name ).read(4)
        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
            return ( True, True )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
    chunk = gzipped_file.read( CHUNK_SIZE )
    gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )

def check_bz2( temp_name ):
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 3 )
        temp.close()
        if magic_check != util.bz2_magic:
            return ( False, False )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    bzipped_file = bz2.BZ2File( temp_name, mode='rb' )
    chunk = bzipped_file.read( CHUNK_SIZE )
    bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )

def check_zip( temp_name ):
    if zipfile.is_zipfile( temp_name ):
        return True
    return False

def check_zip_for_expression( temp_name ):
    # Return: (is_zip, known_ext, exactly_one_pheno, gt_one, homogeneous, ext)
    if not zipfile.is_zipfile( temp_name ):
        return (False, False, False, False, False, None)
    zip_file = zipfile.ZipFile( temp_name, "r" )
    # Make sure the archive consists of valid files. The current rules are:
    # 1. The file type in the zip is homogeneous, except that there is exactly one .txt pheno file
    # 2. The rest of the files must be either .cel or .xys
    # 3. There must be at least two .cel or .xys
    hasPheno = False
    count = 0
    test_ext = None
    for name in zip_file.namelist():
        fileBaseName = os.path.basename(name)
        if (fileBaseName == "" or fileBaseName.startswith(".") or name.startswith("__MACOSX")):
            # ignore folder names, hidden files in *nix, and extra resource forks added by Mac OS X ZIP software.
            continue
        # Reason: modification to support folders in zip files
        #ext = name.split(".")[1].strip().lower()
        ext = os.path.splitext( name )[1].strip().lower().replace(".","")
        count += 1
        if (not (ext == "txt" or ext == "cel" or ext == "xys")):
            #return (True, False, False, False, False, ext)
            continue
        if (ext == "txt"):
            if (hasPheno):
                return (True, True, False, False, False, None)
            else:
                hasPheno = True
        elif (test_ext is None):
            test_ext = ext
        elif (ext != test_ext):
            return (True, True, True, True, False, None)
    zip_file.close()
    return ( True, True, hasPheno, (count >= 3), True, test_ext )
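
# Illustration (hypothetical member names, not part of the original): an
# archive holding pheno.txt, sample1.cel and sample2.cel satisfies the rules
# above and returns (True, True, True, True, True, 'cel'); mixing .cel and
# .xys members trips the homogeneity check and returns
# (True, True, True, True, False, None).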

def parse_outputs( args ):
    rval = {}
    for arg in args:
        id, files_path, path = arg.split( ':', 2 )
        rval[int( id )] = ( path, files_path )
    return rval
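
# Illustration (hypothetical values, not part of the original): each trailing
# command-line argument has the form "<dataset_id>:<files_path>:<path>", so
#   parse_outputs( [ '42:/tmp/files_42:/tmp/out_42.dat' ] )
# returns { 42: ( '/tmp/out_42.dat', '/tmp/files_42' ) }.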

def add_file( dataset, registry, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get( 'link_data_only', 'copy_files' )
    in_place = dataset.get( 'in_place', True )
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file )
        return
    if dataset.type == 'url':
        try:
            page = urllib.urlopen( dataset.path ) #page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
        except UnicodeDecodeError, e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    image = check_image( dataset.path )
    if image:
        if not PIL:
            image = None
        # get_image_ext() returns None if not a supported Image type
        ext = get_image_ext( dataset.path, image )
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
    # Is dataset content supported sniffable binary?
    elif check_bam( dataset.path ):
        ext = 'bam'
        data_type = 'bam'
    elif check_sff( dataset.path ):
        ext = 'sff'
        data_type = 'sff'
    elif check_pdf( dataset.path ):
        ext = 'pdf'
        data_type = 'pdf'
    elif check_bigwig( dataset.path ):
        ext = 'bigwig'
        data_type = 'bigwig'
    elif check_bigbed( dataset.path ):
        ext = 'bigbed'
        data_type = 'bigbed'
    elif check_cel( dataset.name ):
        ext = 'cel'
        data_type = 'cel'
    else:
        type_info = Binary.is_sniffable_binary( dataset.path )
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            if link_data_only == 'copy_files':
                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                CHUNK_SIZE = 2**20 # 1Mb
                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
                while 1:
                    try:
                        chunk = gzipped_file.read( CHUNK_SIZE )
                    except IOError:
                        os.close( fd )
                        os.remove( uncompressed )
                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
                        return
                    if not chunk:
                        break
                    os.write( fd, chunk )
                os.close( fd )
                gzipped_file.close()
                # Replace the gzipped file with the decompressed file if it's safe to do so
                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                    dataset.path = uncompressed
                else:
                    shutil.move( uncompressed, dataset.path )
                os.chmod(dataset.path, 0644)
            dataset.name = dataset.name.rstrip( '.gz' )
            data_type = 'gzip'
        if not data_type and bz2 is not None:
            # See if we have a bz2 file, much like gzip
            is_bzipped, is_valid = check_bz2( dataset.path )
            if is_bzipped and not is_valid:
                file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
                return
            elif is_bzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file
                    CHUNK_SIZE = 2**20 # 1Mb
                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
                    while 1:
                        try:
                            chunk = bzipped_file.read( CHUNK_SIZE )
                        except IOError:
                            os.close( fd )
                            os.remove( uncompressed )
                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
                            return
                        if not chunk:
                            break
                        os.write( fd, chunk )
                    os.close( fd )
                    bzipped_file.close()
                    # Replace the bzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move( uncompressed, dataset.path )
                    os.chmod(dataset.path, 0644)
                dataset.name = dataset.name.rstrip( '.bz2' )
                data_type = 'bz2'
        if not data_type:
            # See if we have a zip archive for expression data
            is_zipped_for_expression, known_ext, one_pheno, gt_one, homogeneous, test_ext = check_zip_for_expression( dataset.path )
            if (not is_zipped_for_expression):
                pass
            else:
                if (not one_pheno):
                    file_err("There must be exactly one .txt pheno file in the zip.", dataset, json_file)
                if (not gt_one):
                    file_err("There must be more than one CEL or XYS file in the zip.", dataset, json_file)
                if (not homogeneous):
                    file_err("Except for the .txt pheno file, the other files must be all CEL or all XYS.", dataset, json_file)
                data_type = 'zip_for_expression'
                if (test_ext == 'cel'):
                    ext = 'cel.zip'
                    file_type = 'cel.zip'
                else:
                    ext = 'xys.zip'
                    file_type = 'xys.zip'
        if not data_type:
            # See if we have a zip archive
            is_zipped = check_zip( dataset.path )
            if is_zipped:
                if link_data_only == 'copy_files':
                    CHUNK_SIZE = 2**20 # 1Mb
                    uncompressed = None
                    uncompressed_name = None
                    unzipped = False
                    z = zipfile.ZipFile( dataset.path )
                    for name in z.namelist():
                        if name.endswith('/'):
                            continue
                        if unzipped:
                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                            break
                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
                        if sys.version_info[:2] >= ( 2, 6 ):
                            zipped_file = z.open( name )
                            while 1:
                                try:
                                    chunk = zipped_file.read( CHUNK_SIZE )
                                except IOError:
                                    os.close( fd )
                                    os.remove( uncompressed )
                                    file_err( 'Problem decompressing zipped data', dataset, json_file )
                                    return
                                if not chunk:
                                    break
                                os.write( fd, chunk )
                            os.close( fd )
                            zipped_file.close()
                            uncompressed_name = name
                            unzipped = True
                        else:
                            # python < 2.5 doesn't have a way to read members in chunks(!)
                            try:
                                outfile = open( uncompressed, 'wb' )
                                outfile.write( z.read( name ) )
                                outfile.close()
                                uncompressed_name = name
                                unzipped = True
                            except IOError:
                                os.close( fd )
                                os.remove( uncompressed )
                                file_err( 'Problem decompressing zipped data', dataset, json_file )
                                return
                    z.close()
                    # Replace the zipped file with the decompressed file if it's safe to do so
                    if uncompressed is not None:
                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move( uncompressed, dataset.path )
                        os.chmod(dataset.path, 0644)
                        dataset.name = uncompressed_name
                data_type = 'zip'
        if not data_type:
            if check_binary( dataset.path ):
                # We have a binary dataset, but it is not Bam, Sff or Pdf
                data_type = 'binary'
                #binary_ok = False
                parts = dataset.name.split( "." )
                if len( parts ) > 1:
                    ext = parts[-1].strip().lower()
                    if not Binary.is_ext_unsniffable(ext):
                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
                        return
                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                        file_err( err_msg, dataset, json_file )
                        return
        if not data_type:
            # We must have a text file
            if check_html( dataset.path ):
                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
                return
        if data_type != 'binary' and data_type != 'zip_for_expression':
            if link_data_only == 'copy_files':
                if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
                    in_place = False
                # Convert universal line endings to Posix line endings, but allow the user to turn it off,
                # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
                # corrupting the content of those files.
                if dataset.to_posix_lines:
                    if dataset.space_to_tab:
                        line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
                    else:
                        line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
            if dataset.file_type == 'auto':
                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
            else:
                ext = dataset.file_type
            data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension( ext )
    if dataset.type in ( 'server_dir', 'path_paste' ) and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming( dataset.path ):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err( err_msg, dataset, json_file )
            return
    if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy( converted_path, output_path )
            try:
                os.remove( converted_path )
            except:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy( dataset.path, output_path )
    elif link_data_only == 'copy_files':
        shutil.move( dataset.path, output_path )
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 ext = ext,
                 stdout = stdout,
                 name = dataset.name,
                 line_count = line_count )
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write( to_json_string( info ) + "\n" )
    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content( output_path )
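
# Illustration (hypothetical values, not part of the original): on success,
# add_file() appends one JSON line per dataset to galaxy.json, e.g.
#   {"type": "dataset", "dataset_id": 42, "ext": "bed", "stdout": "uploaded bed file",
#    "name": "regions.bed", "line_count": 1000}
# which the framework reads back as the job info for the upload.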

def add_composite_file( dataset, registry, json_file, output_path, files_path ):
    if dataset.composite_files:
        os.mkdir( files_path )
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch( **value )
            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name][ 'path' ]
                isurl = dp.find('://') != -1 # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' )
                    except Exception, e:
                        file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
                        sniff.convert_newlines_sep2tabs( dp )
                    else:
                        sniff.convert_newlines( dp )
                shutil.move( dp, os.path.join( files_path, name ) )
    # Move the dataset to its "real" path
    shutil.move( dataset.primary_file, output_path )
    # Write the job info
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 stdout = 'uploaded %s file' % dataset.file_type )
    json_file.write( to_json_string( info ) + "\n" )

def __main__():
    if len( sys.argv ) < 4:
        print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
        sys.exit( 1 )
    output_paths = parse_outputs( sys.argv[4:] )
    json_file = open( 'galaxy.json', 'w' )
    registry = Registry()
    registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
    for line in open( sys.argv[3], 'r' ):
        dataset = from_json_string( line )
        dataset = util.bunch.Bunch( **safe_dict( dataset ) )
        try:
            output_path = output_paths[int( dataset.dataset_id )][0]
        except:
            print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
            sys.exit( 1 )
        if dataset.type == 'composite':
            files_path = output_paths[int( dataset.dataset_id )][1]
            add_composite_file( dataset, registry, json_file, output_path, files_path )
        else:
            add_file( dataset, registry, json_file, output_path )
    # clean up paramfile
    # TODO: this will not work when running as the actual user unless the
    # parent directory is writable by the user.
    try:
        os.remove( sys.argv[3] )
    except:
        pass

if __name__ == '__main__':
    __main__()
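
# Illustration (hypothetical paths, not part of the original): a typical
# invocation, matching the usage string above, is
#   python upload.py <galaxy_root> <datatypes_conf.xml> <json paramfile> \
#       42:/tmp/job/files_42:/tmp/job/out_42.dat
# where the paramfile holds one JSON-encoded dataset description per line and
# each trailing argument is an output spec consumed by parse_outputs().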