/lib/galaxy/datatypes/data.py
https://bitbucket.org/h_morita_dbcls/galaxy-central
import logging, os, sys, time, tempfile
from galaxy import util
from galaxy.util.odict import odict
from galaxy.util.bunch import Bunch
from cgi import escape
import metadata
import zipfile
from metadata import MetadataElement # import directly to maintain ease of use in Datatype class definitions

log = logging.getLogger(__name__)
# Valid first column and strand column values for bed, other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']
class DataMeta( type ):
    """
    Metaclass for Data class.  Sets up metadata spec.
    """
    def __init__( cls, name, bases, dict_ ):
        cls.metadata_spec = metadata.MetadataSpecCollection()
        for base in bases: # loop through bases (class/types) of cls
            if hasattr( base, "metadata_spec" ): # base of class Data (object) has no metadata
                cls.metadata_spec.update( base.metadata_spec ) # add contents of metadata spec of base class to cls
        metadata.Statement.process( cls )
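
# Illustrative sketch (not part of the original module): because DataMeta merges
# each base class's metadata_spec into the subclass, metadata elements accumulate
# down the inheritance chain.  The class names below are hypothetical:
#
#   class Parent( Data ):
#       MetadataElement( name="a" )
#   class Child( Parent ):
#       MetadataElement( name="b" )
#
#   # Child.metadata_spec now contains both "a" and "b".
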
class Data( object ):
    """
    Base class for all datatypes.  Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>

    """
    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others.  If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    # Composite datatypes
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    # A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None

    def __init__(self, **kwd):
        """Initialize the datatype"""
        object.__init__(self, **kwd)
        self.supported_display_apps = self.supported_display_apps.copy()
        self.composite_files = self.composite_files.copy()
        self.display_applications = odict()
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # fd here is a file object, so use its write/close methods
        # rather than the os-level calls, which require an integer descriptor
        fd = open(dataset.file_name, 'wb')
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            fd.write(chunk)
        fd.close()
    def set_raw_data(self, dataset, data):
        """Saves the data on the disk"""
        fd = open(dataset.file_name, 'wb')
        fd.write(data)
        fd.close()
    def get_raw_data( self, dataset ):
        """Returns the full data.  To stream it open the file_name and read/write as needed"""
        try:
            return file(dataset.file_name, 'rb').read(-1)
        except ( OSError, IOError ):
            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
            return ''
    def groom_dataset_content( self, file_name ):
        """This function is called on an output dataset file after the content is initially generated."""
        pass
    def init_meta( self, dataset, copy_from=None ):
        # Metadata should be left mostly uninitialized.  Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be
        # copied.  (Although this seems ambiguous, see
        # Dataset.set_metadata.  It always copies the rhs in order to
        # flag the object as modified for SQLAlchemy.)
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        return True
    def missing_meta( self, dataset, check = [], skip = [] ):
        """
        Checks for empty metadata values.  Returns True if non-optional metadata is missing.
        Specifying a list of 'check' names will check only those names provided; when used, optionality is ignored.
        Specifying a list of 'skip' names will exclude those names from the check, so missing values among them are ignored.
        """
        if check:
            to_check = [ ( name, dataset.metadata.get( name ) ) for name in check ]
        else:
            to_check = dataset.metadata.items()
        for key, value in to_check:
            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
                continue # skip the check for optional and non-requested values here
            if not value:
                return True
        return False
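    # Hedged usage sketch (the call below is illustrative, not from this module):
    # return True only when 'dbkey' is unset, ignoring every other element:
    #
    #   datatype.missing_meta( dataset, check=[ 'dbkey' ] )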
    def set_max_optional_metadata_filesize( self, max_value ):
        try:
            max_value = int( max_value )
        except:
            return
        self.__class__._max_optional_metadata_filesize = max_value
    def get_max_optional_metadata_filesize( self ):
        rval = self.__class__._max_optional_metadata_filesize
        if rval is None:
            return -1
        return rval
    max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = ''
            dataset.blurb = 'data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek(self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if type( line ) is unicode:
                    out.append( '<tr><td>%s</td></tr>' % escape( line ) )
                else:
                    out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out
    def display_name(self, dataset):
        """Returns formatted html of dataset name"""
        try:
            if type( dataset.name ) is unicode:
                return escape( dataset.name )
            else:
                return escape( unicode( dataset.name, 'utf-8' ) )
        except:
            return "name unavailable"
    def display_info(self, dataset):
        """Returns formatted html of dataset info"""
        try:
            # Change new line chars to html
            info = escape( dataset.info )
            if info.find( '\r\n' ) >= 0:
                info = info.replace( '\r\n', '<br/>' )
            if info.find( '\r' ) >= 0:
                info = info.replace( '\r', '<br/>' )
            if info.find( '\n' ) >= 0:
                info = info.replace( '\n', '<br/>' )

            # Convert to unicode to display non-ascii characters.
            if type( info ) is not unicode:
                info = unicode( info, 'utf-8' )

            return info
        except:
            return "info unavailable"
    def validate(self, dataset):
        """Unimplemented validate, return no exceptions"""
        return list()
    def repair_methods(self, dataset):
        """Unimplemented method, returns dict with method/option for repairing errors"""
        return None
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/octet-stream'
    def add_display_app( self, app_id, label, file_function, links_function ):
        """
        Adds a display app to the datatype.
        app_id is a unique id
        label is the primary display label, e.g., display at 'UCSC'
        file_function is a string containing the name of the function that returns a properly formatted display
        links_function is a string containing the name of the function that returns a list of (link_name,link)
        """
        self.supported_display_apps = self.supported_display_apps.copy()
        self.supported_display_apps[app_id] = {'label':label, 'file_function':file_function, 'links_function':links_function}
    def remove_display_app(self, app_id):
        """Removes a display app from the datatype"""
        self.supported_display_apps = self.supported_display_apps.copy()
        try:
            del self.supported_display_apps[app_id]
        except:
            log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( app_id, self.__class__.__name__ ) )
    def clear_display_apps( self ):
        self.supported_display_apps = {}
    def add_display_application( self, display_application ):
        """New style display applications"""
        assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
        self.display_applications[ display_application.id ] = display_application
    def get_display_application( self, key, default = None ):
        return self.display_applications.get( key, default )
    def get_display_applications_by_dataset( self, dataset, trans ):
        rval = odict()
        for key, value in self.display_applications.iteritems():
            value = value.filter_by_dataset( dataset, trans )
            if value.links:
                rval[key] = value
        return rval
    def get_display_types(self):
        """Returns display types available"""
        return self.supported_display_apps.keys()
    def get_display_label(self, type):
        """Returns primary label for display app"""
        try:
            return self.supported_display_apps[type]['label']
        except:
            return 'unknown'
    def as_display_type(self, dataset, type, **kwd):
        """Returns modified file contents for a particular display type"""
        try:
            if type in self.get_display_types():
                return getattr( self, self.supported_display_apps[type]['file_function'] )( dataset, **kwd )
        except:
            log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
        return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext )
    def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
        """
        Returns a list of tuples of (name, link) for a particular display type.  No check on
        'access' permissions is done here - if you can view the dataset, you can also save it
        or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
        apply anyway.
        """
        try:
            if type in self.get_display_types():
                return target_frame, getattr( self, self.supported_display_apps[type]['links_function'] )( dataset, type, app, base_url, **kwd )
        except:
            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
                % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
        return []
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True, deps=None):
        """This function adds a job to the queue to convert a dataset to another type.  Returns a message about success/failure."""
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )

        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        # Generate parameter dictionary
        params = {}
        # Determine the input parameter name and add it to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            if deps and value.name in deps:
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key

        params[input_name] = original_dataset
        # Run converter; the job is dispatched through the queue
        converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
    # We need to clear associated files before we set metadata so that, e.g.,
    # implicitly converted datasets are deleted as soon as metadata starts to be
    # set and are no longer available while metadata is being set, not just after.
    # We'll also clear after setting metadata, for backwards compatibility.
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, space_to_tab = False, **kwds ):
        kwds[ 'name' ] = name
        kwds[ 'optional' ] = optional
        kwds[ 'mimetype' ] = mimetype
        kwds[ 'description' ] = description
        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
        kwds[ 'is_binary' ] = is_binary
        kwds[ 'space_to_tab' ] = space_to_tab
        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        #self.composite_files = self.composite_files.copy()
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
    def __substitute_composite_key( self, key, composite_file, dataset = None ):
        if composite_file.substitute_name_with_metadata:
            if dataset:
                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
            else:
                # Fall back to the default declared in this datatype's metadata spec
                meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
            return key % meta_value
        return key
    @property
    def writable_files( self, dataset = None ):
        # NOTE: accessed as a property, this is always called with dataset=None
        files = odict()
        if self.composite_type != 'auto_primary_file':
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
            files[ key ] = value
        return files
    def get_composite_files( self, dataset = None ):
        def substitute_composite_key( key, composite_file ):
            if composite_file.substitute_name_with_metadata:
                if dataset:
                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
                else:
                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
                return key % meta_value
            return key
        files = odict()
        for key, value in self.composite_files.iteritems():
            files[ substitute_composite_key( key, value ) ] = value
        return files
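    # Hedged sketch of how a subclass might declare composite files (the class
    # and file names below are hypothetical, not from this module):
    #
    #   class ReportSet( Data ):
    #       composite_type = 'basic'
    #       def __init__( self, **kwd ):
    #           Data.__init__( self, **kwd )
    #           self.add_composite_file( 'report.html', mimetype='text/html' )
    #           # '%s' is filled in from the named metadata element by
    #           # get_composite_files() via substitute_name_with_metadata
    #           self.add_composite_file( 'data_%s.txt', substitute_name_with_metadata='base_name', optional=True )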
    def generate_auto_primary_file( self, dataset = None ):
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
    @property
    def has_resolution(self):
        return False

class Text( Data ):
    file_ext = 'txt'
    # Add metadata elements
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def set_raw_data(self, dataset, data):
        """Saves the data on the disk"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
    def set_meta( self, dataset, **kwd ):
        """
        Set the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        dataset.metadata.data_lines = data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? lines"
            else:
                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Newick( Text ):
    pass

# ------------- Utility methods --------------

def get_test_fname( fname ):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join( path, 'test', fname )
    return full_path
def nice_size(size):
    """
    Returns a readably formatted string with the size

    >>> nice_size(100)
    '100.0 bytes'
    >>> nice_size(10000)
    '9.8 Kb'
    >>> nice_size(1000000)
    '976.6 Kb'
    >>> nice_size(100000000)
    '95.4 Mb'
    """
    words = [ 'bytes', 'Kb', 'Mb', 'Gb' ]
    try:
        size = float( size )
    except:
        return '??? bytes'
    for ind, word in enumerate(words):
        step = 1024 ** (ind + 1)
        if step > size:
            size = size / float(1024 ** ind)
            out = "%.1f %s" % (size, word)
            return out
    return '??? bytes'
def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
    """
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    while count <= LINE_COUNT:
        line = temp.readline( WIDTH )
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in [ 'gzipped', 'binary' ]:
            break
        lines.append( line )
        count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]:
        text = "%s file" % file_type
    else:
        try:
            text = unicode( '\n'.join( lines ), 'utf-8' )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
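
# Hedged usage sketch (illustrative, not from this module): Text.set_peek()
# above is the typical caller, e.g.
#
#   dataset.peek = get_file_peek( dataset.file_name )
#
# which yields up to the first 5 lines, each truncated to 256 characters.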