PageRenderTime 42ms CodeModel.GetById 19ms app.highlight 19ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/galaxy/datatypes/data.py

https://bitbucket.org/h_morita_dbcls/galaxy-central
Python | 490 lines | 447 code | 10 blank | 33 comment | 37 complexity | 02b6a7faf1ce2cd624a39f387bcdc8bf MD5 | raw file
  1import logging, os, sys, time, tempfile
  2from galaxy import util
  3from galaxy.util.odict import odict
  4from galaxy.util.bunch import Bunch
  5from cgi import escape
  6import metadata
  7import zipfile
  8from metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions
  9
log = logging.getLogger(__name__)

# Valid first column and strand column values for bed, other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']
 15
 16class DataMeta( type ):
 17    """
 18    Metaclass for Data class.  Sets up metadata spec.
 19    """
 20    def __init__( cls, name, bases, dict_ ):
 21        cls.metadata_spec = metadata.MetadataSpecCollection()
 22        for base in bases: #loop through bases (class/types) of cls
 23            if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
 24                cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
 25        metadata.Statement.process( cls )
 26
class Data( object ):
    """
    Base class for all datatypes.  Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>
    
    """
    # DataMeta gathers the MetadataElement declarations below into metadata_spec.
    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others. If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    # Composite datatypes (class-level defaults; __init__ gives each instance
    # its own copy of composite_files).
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    #A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None
 62    
 63    def __init__(self, **kwd):
 64        """Initialize the datatype"""
 65        object.__init__(self, **kwd)
 66        self.supported_display_apps = self.supported_display_apps.copy()
 67        self.composite_files = self.composite_files.copy()
 68        self.display_applications = odict()
 69    def write_from_stream(self, dataset, stream):
 70        """Writes data from a stream"""
 71        fd = open(dataset.file_name, 'wb')
 72        while 1:
 73            chunk = stream.read(1048576)
 74            if not chunk:
 75                break
 76            os.write(fd, chunk)
 77        os.close(fd)
 78    def set_raw_data(self, dataset, data):
 79        """Saves the data on the disc"""
 80        fd = open(dataset.file_name, 'wb')
 81        os.write(fd, data)
 82        os.close(fd)
 83    def get_raw_data( self, dataset ):
 84        """Returns the full data. To stream it open the file_name and read/write as needed"""
 85        try:
 86            return file(datset.file_name, 'rb').read(-1)
 87        except OSError, e:
 88            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
 89            return ''
 90    def groom_dataset_content( self, file_name ):
 91        """This function is called on an output dataset file after the content is initially generated."""
 92        pass
    def init_meta( self, dataset, copy_from=None ):
        """Initialize dataset metadata, optionally copying it from another dataset."""
        # Metadata should be left mostly uninitialized.  Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be
        # copied. (although this seems ambiguous, see
        # Dataset.set_metadata.  It always copies the rhs in order to
        # flag the object as modified for SQLAlchemy.)
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        # Base datatype has nothing to detect; subclasses override.
        return True
105    def missing_meta( self, dataset, check = [], skip = [] ):
106        """
107        Checks for empty metadata values, Returns True if non-optional metadata is missing
108        Specifying a list of 'check' values will only check those names provided; when used, optionality is ignored
109        Specifying a list of 'skip' items will return True even when a named metadata value is missing
110        """
111        if check:
112            to_check = [ ( to_check, dataset.metadata.get( to_check ) ) for to_check in check ]
113        else:
114            to_check = dataset.metadata.items()
115        for key, value in to_check:
116            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
117                continue #we skip check for optional and nonrequested values here 
118            if not value:
119                return True
120        return False
121    def set_max_optional_metadata_filesize( self, max_value ):
122        try:
123            max_value = int( max_value )
124        except:
125            return
126        self.__class__._max_optional_metadata_filesize = max_value
127    def get_max_optional_metadata_filesize( self ):
128        rval = self.__class__._max_optional_metadata_filesize
129        if rval is None:
130            return -1
131        return rval
132    max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
133    def set_peek( self, dataset, is_multi_byte=False ):
134        """Set the peek and blurb text"""
135        if not dataset.dataset.purged:
136            dataset.peek = ''
137            dataset.blurb = 'data'
138        else:
139            dataset.peek = 'file does not exist'
140            dataset.blurb = 'file purged from disk'
    def display_peek(self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Lazily generate the peek if it has not been set yet.
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines =  data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Escape each line; byte strings are decoded as UTF-8 so
                # non-ascii peeks render instead of raising.
                if type( line ) is unicode:
                    out.append( '<tr><td>%s</td></tr>' % escape( line ) )
                else:
                    out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            # Never break page rendering over a bad peek; show a message.
            out = "Can't create peek %s" % str( exc )
        return out
162    def display_name(self, dataset):
163        """Returns formatted html of dataset name"""
164        try:
165            if type ( dataset.name ) is unicode:
166                return escape( dataset.name )
167            else:
168                return escape( unicode( dataset.name, 'utf-8 ') )
169        except:
170            return "name unavailable"
171    def display_info(self, dataset):
172        """Returns formatted html of dataset info"""
173        try:
174            # Change new line chars to html
175            info = escape( dataset.info )
176            if info.find( '\r\n' ) >= 0:
177                info = info.replace( '\r\n', '<br/>' )
178            if info.find( '\r' ) >= 0:
179                info = info.replace( '\r', '<br/>' )
180            if info.find( '\n' ) >= 0:
181                info = info.replace( '\n', '<br/>' )
182                
183            # Convert to unicode to display non-ascii characters.
184            if type( info ) is not unicode:
185                info = unicode( info, 'utf-8')
186                
187            return info
188        except:
189            return "info unavailable"
    def validate(self, dataset):
        """Unimplemented validate, return no exceptions"""
        # Empty list means "no validation errors" for the base datatype.
        return list()
    def repair_methods(self, dataset):
        """Unimplemented method, returns dict with method/option for repairing errors"""
        return None
    def get_mime(self):
        """Returns the mime type of the datatype"""
        # Generic binary default; subclasses (e.g. Text) override.
        return 'application/octet-stream'
    def add_display_app ( self, app_id, label, file_function, links_function ):
        """
        Adds a display app to the datatype.
        app_id is a unique id
        label is the primary display label, e.g., display at 'UCSC'
        file_function is a string containing the name of the function that returns a properly formatted display
        links_function is a string containing the name of the function that returns a list of (link_name,link)
        """
        # Copy-on-write so the class-level dict shared by other datatypes
        # is not mutated.
        self.supported_display_apps = self.supported_display_apps.copy()
        self.supported_display_apps[app_id] = {'label':label,'file_function':file_function,'links_function':links_function}
209    def remove_display_app (self, app_id):
210        """Removes a display app from the datatype"""
211        self.supported_display_apps = self.supported_display_apps.copy()
212        try:
213            del self.supported_display_apps[app_id]
214        except:
215            log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( type, self.__class__.__name__ ) )
    def clear_display_apps( self ):
        # Drop all old-style display apps for this instance.
        self.supported_display_apps = {}
    def add_display_application( self, display_application ):
        """New style display applications"""
        assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
        self.display_applications[ display_application.id ] = display_application
    def get_display_application( self, key, default = None ):
        # Lookup by display-application id; default when not registered.
        return self.display_applications.get( key, default )
224    def get_display_applications_by_dataset( self, dataset, trans ):
225        rval = odict()
226        for key, value in self.display_applications.iteritems():
227            value = value.filter_by_dataset( dataset, trans )
228            if value.links:
229                rval[key] = value
230        return rval
    def get_display_types(self):
        """Returns display types available"""
        # Old-style display app ids registered via add_display_app().
        return self.supported_display_apps.keys()
234    def get_display_label(self, type):
235        """Returns primary label for display app"""
236        try:
237            return self.supported_display_apps[type]['label']
238        except:
239            return 'unknown'
    def as_display_type(self, dataset, type, **kwd):
        """Returns modified file contents for a particular display type """
        try:
            if type in self.get_display_types():
                # Dispatch by name to the registered file_function.
                return getattr (self, self.supported_display_apps[type]['file_function']) (dataset, **kwd)
        except:
            log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
        return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext)
    def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
        """
        Returns a list of tuples of (name, link) for a particular display type.  No check on
        'access' permissions is done here - if you can view the dataset, you can also save it
        or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
        apply anyway.

        On success returns ( target_frame, links_list ); on any failure an
        empty list is returned and the error is logged.
        """
        try:
            if type in self.get_display_types():
                # Dispatch by name to the registered links_function.
                return target_frame, getattr ( self, self.supported_display_apps[type]['links_function'] ) ( dataset, type, app, base_url, **kwd )
        except:
            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
                           % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
        return []
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        # Delegates to the registry; extra kwd are passed through unchanged.
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True, deps=None):
        """
        This function adds a job to the queue to convert a dataset to another type.
        Returns a message about success/failure, or the converter's output when
        return_output is True.  Raises Exception when no converter exists for
        the requested target_type.

        deps: optional dict of pre-supplied input values keyed by input name.
        visible: when False, converted outputs are hidden in the history.
        """
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )
        
        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        #Generate parameter dictionary
        params = {}
        #determine input parameter name and add to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            # Dependencies win over the default wiring; otherwise remember
            # the name of the converter's 'data' input for the source dataset.
            if (deps) and (value.name in deps):
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key
            
        params[input_name] = original_dataset
        #Run converter, job is dispatched through Queue
        converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
    #We need to clear associated files before we set metadata
    #so that as soon as metadata starts to be set, e.g. implicitly converted datasets are deleted and no longer available 'while' metadata is being set, not just after
    #We'll also clear after setting metadata, for backwards compatibility
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
304    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, space_to_tab = False, **kwds ):
305        kwds[ 'name' ] = name
306        kwds[ 'optional' ] = optional
307        kwds[ 'mimetype' ] = mimetype
308        kwds[ 'description' ] = description
309        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
310        kwds[ 'is_binary' ] = is_binary
311        kwds[ 'space_to_tab' ] = space_to_tab
312        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        """Register a composite file on this instance's composite_files registry."""
        #self.composite_files = self.composite_files.copy()
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
316    def __substitute_composite_key( self, key, composite_file, dataset = None ):
317        if composite_file.substitute_name_with_metadata:
318            if dataset:
319                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
320            else:
321                meta_value = self.spec[composite_file.substitute_name_with_metadata].default
322            return key % meta_value
323        return key
    @property
    def writable_files( self, dataset = None ):
        # NOTE(review): as a property this can never receive 'dataset', so
        # get_composite_files() is always called with dataset=None here.
        # Left as-is to preserve the public attribute-style interface.
        files = odict()
        if self.composite_type != 'auto_primary_file':
            # Non-auto composites always expose the primary file first.
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
            files[ key ] = value
        return files
332    def get_composite_files( self, dataset = None ):
333        def substitute_composite_key( key, composite_file ):
334            if composite_file.substitute_name_with_metadata:
335                if dataset:
336                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
337                else:
338                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
339                return key % meta_value
340            return key
341        files = odict()
342        for key, value in self.composite_files.iteritems():
343            files[ substitute_composite_key( key, value ) ] = value
344        return files
    def generate_auto_primary_file( self, dataset = None ):
        # Required for datatypes with composite_type 'auto_primary_file';
        # such subclasses must override this.
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
    @property
    def has_resolution(self):
        # Generic data has no notion of resolution; subclasses may override.
        return False
350
class Text( Data ):
    # Default file extension for plain-text datasets.
    file_ext = 'txt'

    """Add metadata elements"""
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )
356
357    def write_from_stream(self, dataset, stream):
358        """Writes data from a stream"""
359        # write it twice for now 
360        fd, temp_name = tempfile.mkstemp()
361        while 1:
362            chunk = stream.read(1048576)
363            if not chunk:
364                break
365            os.write(fd, chunk)
366        os.close(fd)
367        # rewrite the file with unix newlines
368        fp = open(dataset.file_name, 'wt')
369        for line in file(temp_name, "U"):
370            line = line.strip() + '\n'
371            fp.write(line)
372        fp.close()
373    def set_raw_data(self, dataset, data):
374        """Saves the data on the disc"""
375        fd, temp_name = tempfile.mkstemp()
376        os.write(fd, data)
377        os.close(fd)
378        # rewrite the file with unix newlines
379        fp = open(dataset.file_name, 'wt')
380        for line in file(temp_name, "U"):
381            line = line.strip() + '\n'
382            fp.write(line)
383        fp.close()
384        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
388    def set_meta( self, dataset, **kwd ):
389        """
390        Set the number of lines of data in dataset,
391        skipping all blank lines and comments.
392        """
393        data_lines = 0
394        for line in file( dataset.file_name ):
395            line = line.strip()
396            if line and not line.startswith( '#' ):
397                data_lines += 1
398        dataset.metadata.data_lines = data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set peek text from the file head and a 'N lines' blurb."""
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? lines"
            else:
                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
416
class Newick( Text ):
    """Plain-text tree data (Newick, judging by the name); no behavior beyond Text."""
    pass
419
420# ------------- Utility methods --------------
421
def get_test_fname( fname ):
    """Returns test data filename"""
    # Test files live in a 'test' directory alongside this module.
    module_dir = os.path.split(__file__)[0]
    return os.path.join( module_dir, 'test', fname )
def nice_size(size):
    """
    Returns a readably formatted string with the size

    >>> nice_size(100)
    '100.0 bytes'
    >>> nice_size(10000)
    '9.8 Kb'
    >>> nice_size(1000000)
    '976.6 Kb'
    >>> nice_size(100000000)
    '95.4 Mb'
    """
    words = [ 'bytes', 'Kb', 'Mb', 'Gb' ]
    try:
        size = float( size )
    except ( TypeError, ValueError ):
        # Only conversion failures mean "unknown size"; the original bare
        # except would also have swallowed KeyboardInterrupt/SystemExit.
        return '??? bytes'
    for ind, word in enumerate(words):
        step  = 1024 ** (ind + 1)
        if step > size:
            size = size / float(1024 ** ind)
            return "%.1f %s" % (size, word)
    # Sizes of 1024**4 (beyond Gb) and up fall through.
    return '??? bytes'
def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH
    
    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22    30128507    31828507    uc003bnx.1_cds_2_0_chr22_29227_f    0    +\n'
    """
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    # NOTE(review): '<=' collects up to LINE_COUNT + 1 lines, and each line
    # keeps its trailing newline before another '\n' is joined in below.
    while count <= LINE_COUNT:
        # readline( WIDTH ) caps each peek line at WIDTH characters.
        line = temp.readline( WIDTH )
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                # Any character with ordinal above 128 marks the file binary;
                # only the first non-empty line is inspected (data_checked).
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in [ 'gzipped', 'binary' ]:
            break
        lines.append( line )
        count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]: 
        text = "%s file" % file_type 
    else:
        try:
            text = unicode( '\n'.join( lines ), 'utf-8' )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text