
/lib/galaxy/datatypes/data.py

https://bitbucket.org/cistrome/cistrome-harvard/
import logging
import metadata
import mimetypes
import os
import shutil
import sys
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
from galaxy import util
from galaxy.datatypes.metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

import dataproviders

from galaxy import eggs
eggs.require( "Paste" )
import paste

XSS_VULNERABLE_MIME_TYPES = [
    'image/svg+xml',  # Unfiltered by Galaxy and may contain JS that would be executed by some browsers.
    'application/xml',  # Some browsers will evaluate SVG embedded JS in such XML documents.
]
DEFAULT_MIME_TYPE = 'text/plain'  # Vulnerable mime types will be replaced with this.

log = logging.getLogger(__name__)

comptypes=[]  # Is this being used anywhere, why was this here? -JohnC
try:
    import zlib
    comptypes.append( 'zip' )
except ImportError:
    pass


# Valid first column and strand column values for bed, other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']

class DataMeta( type ):
    """
    Metaclass for Data class.  Sets up metadata spec.
    """
    def __init__( cls, name, bases, dict_ ):
        cls.metadata_spec = metadata.MetadataSpecCollection()
        for base in bases: #loop through bases (class/types) of cls
            if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
                cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
        metadata.Statement.process( cls )

@dataproviders.decorators.has_dataproviders
class Data( object ):
    """
    Base class for all datatypes.  Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>

    """
    # Data is not chunkable by default.
    CHUNKABLE = False

    #: dictionary of metadata fields for this datatype::
    metadata_spec = None

    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others. If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    #Composite datatypes
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    #A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None

    # Trackster track type.
    track_type = None

    # Data sources.
    data_sources = {}

    def __init__(self, **kwd):
        """Initialize the datatype"""
        object.__init__(self, **kwd)
        self.supported_display_apps = self.supported_display_apps.copy()
        self.composite_files = self.composite_files.copy()
        self.display_applications = odict()
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        fd = open(dataset.file_name, 'wb')
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            fd.write(chunk)
        fd.close()
    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd = open(dataset.file_name, 'wb')
        fd.write(data)
        fd.close()
    def get_raw_data( self, dataset ):
        """Returns the full data. To stream it open the file_name and read/write as needed"""
        try:
            return file(dataset.file_name, 'rb').read(-1)
        except (IOError, OSError), e:
            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
            return ''
    def dataset_content_needs_grooming( self, file_name ):
        """This function is called on an output dataset file after the content is initially generated."""
        return False
    def groom_dataset_content( self, file_name ):
        """This function is called on an output dataset file if dataset_content_needs_grooming returns True."""
        pass
    def init_meta( self, dataset, copy_from=None ):
        # Metadata should be left mostly uninitialized.  Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be
        # copied (although this seems ambiguous; see
        # Dataset.set_metadata.  It always copies the rhs in order to
        # flag the object as modified for SQLAlchemy).
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        return True
    def missing_meta( self, dataset, check = [], skip = [] ):
        """
        Checks for empty metadata values; returns True if a non-optional metadata value is missing.
        Specifying a list of 'check' values will only check those names provided; when used, optionality is ignored.
        Specifying a list of 'skip' items will exclude those names from the check, so missing values there do not count.
        """
        if check:
            to_check = [ ( to_check, dataset.metadata.get( to_check ) ) for to_check in check ]
        else:
            to_check = dataset.metadata.items()
        for key, value in to_check:
            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
                continue #we skip check for optional and nonrequested values here
            if not value:
                return True
        return False
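    # Editor's sketch (not part of the original file): assuming a hypothetical datatype
    # whose spec has a required 'columns' element and whose other values are set,
    # missing_meta behaves roughly like:
    #
    #   dataset.metadata.columns = None
    #   datatype.missing_meta( dataset )                      # True  - required value is empty
    #   datatype.missing_meta( dataset, skip=[ 'columns' ] )  # False - 'columns' is excluded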
    def set_max_optional_metadata_filesize( self, max_value ):
        try:
            max_value = int( max_value )
        except:
            return
        self.__class__._max_optional_metadata_filesize = max_value
    def get_max_optional_metadata_filesize( self ):
        rval = self.__class__._max_optional_metadata_filesize
        if rval is None:
            return -1
        return rval
    max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
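    # Editor's note (not part of the original file): the property above reports -1 when no
    # class-level limit has been set, and the setter coerces to int, silently ignoring bad
    # values; assigning through an instance changes the limit class-wide:
    #
    #   Data().max_optional_metadata_filesize                      # -1 (no limit configured)
    #   d = Data(); d.max_optional_metadata_filesize = 10485760    # 10 MB limit for the class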
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = ''
            dataset.blurb = 'data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines =  data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if type( line ) is unicode:
                    out.append( '<tr><td>%s</td></tr>' % escape( line ) )
                else:
                    out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def _archive_main_file(self, archive, display_name, data_filename):
        """Called from _archive_composite_dataset to add central file to archive.

        Unless subclassed, this will add the main dataset file (argument data_filename)
        to the archive, as an HTML file with its filename derived from the dataset name
        (argument display_name).

        Returns a tuple of boolean, string, string: (error, msg, messagetype)
        """
        error, msg, messagetype = False, "", ""
        archname = '%s.html' % display_name  # fake the real nature of the html file
        try:
            archive.add(data_filename, archname)
        except IOError:
            error = True
            log.exception("Unable to add composite parent %s to temporary library download archive" % data_filename)
            msg = "Unable to create archive for download, please report this error"
            messagetype = "error"
        return error, msg, messagetype

    def _archive_composite_dataset( self, trans, data=None, **kwd ):
        # save a composite object into a compressed archive for downloading
        params = util.Params( kwd )
        valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        outfname = data.name[0:150]
        outfname = ''.join(c in valid_chars and c or '_' for c in outfname)
        if (params.do_action == None):
            params.do_action = 'zip' # default
        msg = util.restore_text( params.get( 'msg', ''  ) )
        messagetype = params.get( 'messagetype', 'done' )
        if not data:
            msg = "You must select at least one dataset"
            messagetype = 'error'
        else:
            error = False
            try:
                if (params.do_action == 'zip'):
                    # Can't use mkstemp - the file must not exist first
                    tmpd = tempfile.mkdtemp()
                    util.umask_fix_perms( tmpd, trans.app.config.umask, 0777, trans.app.config.gid )
                    tmpf = os.path.join( tmpd, 'library_download.' + params.do_action )
                    archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED, True )
                    archive.add = lambda x, y: archive.write( x, y.encode('CP437') )
                elif params.do_action == 'tgz':
                    archive = util.streamball.StreamBall( 'w|gz' )
                elif params.do_action == 'tbz':
                    archive = util.streamball.StreamBall( 'w|bz2' )
            except (OSError, zipfile.BadZipfile):
                error = True
                log.exception( "Unable to create archive for download" )
                msg = "Unable to create archive for %s for download, please report this error" % outfname
                messagetype = 'error'
            if not error:
                current_user_roles = trans.get_current_user_roles()
                ext = data.extension
                path = data.file_name
                fname = os.path.split(path)[-1]
                efp = data.extra_files_path
                #Add any central file to the archive,

                display_name = os.path.splitext(outfname)[0]
                if not display_name.endswith(ext):
                    display_name = '%s_%s' % (display_name, ext)

                error, msg, messagetype = self._archive_main_file(archive, display_name, path)
                if not error:
                    #Add any child files to the archive,
                    for root, dirs, files in os.walk(efp):
                        for fname in files:
                            fpath = os.path.join(root,fname)
                            rpath = os.path.relpath(fpath,efp)
                            try:
                                archive.add( fpath,rpath )
                            except IOError:
                                error = True
                                log.exception( "Unable to add %s to temporary library download archive" % rpath)
                                msg = "Unable to create archive for download, please report this error"
                                messagetype = 'error'
                                continue
                if not error:
                    if params.do_action == 'zip':
                        archive.close()
                        tmpfh = open( tmpf )
                        # CANNOT clean up - unlink/rmdir was always failing because file handle retained to return - must rely on a cron job to clean up tmp
                        trans.response.set_content_type( "application/x-zip-compressed" )
                        trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="%s.zip"' % outfname
                        return tmpfh
                    else:
                        trans.response.set_content_type( "application/x-tar" )
                        outext = 'tgz'
                        if params.do_action == 'tbz':
                            outext = 'tbz'
                        trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="%s.%s"' % (outfname,outext)
                        archive.wsgi_status = trans.response.wsgi_status()
                        archive.wsgi_headeritems = trans.response.wsgi_headeritems()
                        return archive.stream
        return trans.show_error_message( msg )

    def _serve_raw(self, trans, dataset, to_ext):
        trans.response.headers['Content-Length'] = int( os.stat( dataset.file_name ).st_size )
        valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        fname = ''.join(c in valid_chars and c or '_' for c in dataset.name)[0:150]
        trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
        trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext)
        return open( dataset.file_name )

    def display_data(self, trans, data, preview=False, filename=None, to_ext=None, size=None, offset=None, **kwd):
        """ Old display method, for transition - though still used by API and
        test framework. Datatypes should be very careful if overriding this
        method, as this interface between datatypes and Galaxy will likely
        change.

        TODO: Document alternatives to overriding this method (data
        providers?).
        """
        #Relocate all composite datatype display to a common location.
        composite_extensions = trans.app.datatypes_registry.get_composite_extensions( )
        composite_extensions.append('html') # for archiving composite datatypes
        #Prevent IE8 from sniffing content type since we're explicit about it.  This prevents intentionally text/plain
        #content from being rendered in the browser
        trans.response.headers['X-Content-Type-Options'] = 'nosniff'
        if isinstance( data, basestring ):
            return data
        if filename and filename != "index":
            # For files in extra_files_path
            file_path = trans.app.object_store.get_filename(data.dataset, extra_dir='dataset_%s_files' % data.dataset.id, alt_name=filename)
            if os.path.exists( file_path ):
                if os.path.isdir( file_path ):
                    return trans.show_error_message( "Directory listing is not allowed." ) #TODO: Reconsider allowing listing of directories?
                mime, encoding = mimetypes.guess_type( file_path )
                if not mime:
                    try:
                        mime = trans.app.datatypes_registry.get_mimetype_by_extension( file_path.split( "." )[-1] )
                    except:
                        mime = "text/plain"
                self._clean_and_set_mime_type( trans, mime )
                return open( file_path )
            else:
                return trans.show_error_message( "Could not find '%s' on the extra files path %s." % ( filename, file_path ) )
        self._clean_and_set_mime_type( trans, data.get_mime() )

        trans.log_event( "Display dataset id: %s" % str( data.id ) )
        from galaxy import datatypes #DBTODO REMOVE THIS AT REFACTOR
        if to_ext or isinstance(data.datatype, datatypes.binary.Binary): # Saving the file, or binary file
            if data.extension in composite_extensions:
                return self._archive_composite_dataset( trans, data, **kwd )
            else:
                trans.response.headers['Content-Length'] = int( os.stat( data.file_name ).st_size )
                if not to_ext:
                    to_ext = data.extension
                valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
                fname = ''.join(c in valid_chars and c or '_' for c in data.name)[0:150]
                trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
                trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (data.hid, fname, to_ext)
                return open( data.file_name )
        if not os.path.exists( data.file_name ):
            raise paste.httpexceptions.HTTPNotFound( "File Not Found (%s)." % data.file_name )
        max_peek_size = 1000000 # 1 MB
        if isinstance(data.datatype, datatypes.images.Html):
            max_peek_size = 10000000 # 10 MB for html
        preview = util.string_as_bool( preview )
        if not preview or isinstance(data.datatype, datatypes.images.Image) or os.stat( data.file_name ).st_size < max_peek_size:
            if trans.app.config.sanitize_all_html and trans.response.get_content_type() == "text/html":
                # Sanitize anytime we respond with plain text/html content.
                return sanitize_html(open( data.file_name ).read())
            return open( data.file_name )
        else:
            trans.response.set_content_type( "text/html" )
            return trans.stream_template_mako( "/dataset/large_file.mako",
                                            truncated_data = open( data.file_name ).read(max_peek_size),
                                            data = data)

    def display_name(self, dataset):
        """Returns formatted html of dataset name"""
        try:
            if type ( dataset.name ) is unicode:
                return escape( dataset.name )
            else:
                return escape( unicode( dataset.name, 'utf-8' ) )
        except:
            return "name unavailable"
    def display_info(self, dataset):
        """Returns formatted html of dataset info"""
        try:
            # Change new line chars to html
            info = escape( dataset.info )
            if info.find( '\r\n' ) >= 0:
                info = info.replace( '\r\n', '<br/>' )
            if info.find( '\r' ) >= 0:
                info = info.replace( '\r', '<br/>' )
            if info.find( '\n' ) >= 0:
                info = info.replace( '\n', '<br/>' )

            # Convert to unicode to display non-ascii characters.
            if type( info ) is not unicode:
                info = unicode( info, 'utf-8')

            return info
        except:
            return "info unavailable"
    def validate(self, dataset):
        """Unimplemented validate, return no exceptions"""
        return list()
    def repair_methods(self, dataset):
        """Unimplemented method, returns dict with method/option for repairing errors"""
        return None
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/octet-stream'
    def add_display_app ( self, app_id, label, file_function, links_function ):
        """
        Adds a display app to the datatype.
        app_id is a unique id
        label is the primary display label, e.g., display at 'UCSC'
        file_function is a string containing the name of the function that returns a properly formatted display
        links_function is a string containing the name of the function that returns a list of (link_name,link)
        """
        self.supported_display_apps = self.supported_display_apps.copy()
        self.supported_display_apps[app_id] = {'label':label,'file_function':file_function,'links_function':links_function}
    def remove_display_app (self, app_id):
        """Removes a display app from the datatype"""
        self.supported_display_apps = self.supported_display_apps.copy()
        try:
            del self.supported_display_apps[app_id]
        except:
            log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( app_id, self.__class__.__name__ ) )
    def clear_display_apps( self ):
        self.supported_display_apps = {}
    def add_display_application( self, display_application ):
        """New style display applications"""
        assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
        self.display_applications[ display_application.id ] = display_application
    def get_display_application( self, key, default = None ):
        return self.display_applications.get( key, default )
    def get_display_applications_by_dataset( self, dataset, trans ):
        rval = odict()
        for key, value in self.display_applications.iteritems():
            value = value.filter_by_dataset( dataset, trans )
            if value.links:
                rval[key] = value
        return rval
    def get_display_types(self):
        """Returns display types available"""
        return self.supported_display_apps.keys()
    def get_display_label(self, type):
        """Returns primary label for display app"""
        try:
            return self.supported_display_apps[type]['label']
        except:
            return 'unknown'
    def as_display_type(self, dataset, type, **kwd):
        """Returns modified file contents for a particular display type """
        try:
            if type in self.get_display_types():
                return getattr (self, self.supported_display_apps[type]['file_function']) (dataset, **kwd)
        except:
            log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
        return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext)
    def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
        """
        Returns a list of tuples of (name, link) for a particular display type.  No check on
        'access' permissions is done here - if you can view the dataset, you can also save it
        or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
        apply anyway.
        """
        try:
            if app.config.enable_old_display_applications and type in self.get_display_types():
                return target_frame, getattr ( self, self.supported_display_apps[type]['links_function'] ) ( dataset, type, app, base_url, **kwd )
        except:
            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
                           % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
        return target_frame, []
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output=False, visible=True, deps=None, set_output_history=True):
        """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )

        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        #Generate parameter dictionary
        params = {}
        #determine input parameter name and add to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            if deps and value.name in deps:
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key

        params[input_name] = original_dataset
        #Run converter, job is dispatched through Queue
        converted_dataset = converter.execute( trans, incoming=params, set_output_hid=visible, set_output_history=set_output_history)[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
    #We need to clear associated files before we set metadata
    #so that as soon as metadata starts to be set, e.g. implicitly converted datasets are deleted and no longer available 'while' metadata is being set, not just after
    #We'll also clear after setting metadata, for backwards compatibility
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, to_posix_lines = True, space_to_tab = False, **kwds ):
        kwds[ 'name' ] = name
        kwds[ 'optional' ] = optional
        kwds[ 'mimetype' ] = mimetype
        kwds[ 'description' ] = description
        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
        kwds[ 'is_binary' ] = is_binary
        kwds[ 'to_posix_lines' ] = to_posix_lines
        kwds[ 'space_to_tab' ] = space_to_tab
        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        #self.composite_files = self.composite_files.copy()
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
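    # Editor's sketch (not part of the original file): a hypothetical composite datatype
    # could declare its member files through add_composite_file; the file names and the
    # 'basic' composite_type here are illustrative assumptions only:
    #
    #   class HypotheticalComposite( Data ):
    #       composite_type = 'basic'
    #       def __init__( self, **kwd ):
    #           Data.__init__( self, **kwd )
    #           self.add_composite_file( 'run.log', description='Run log', optional=True )
    #           self.add_composite_file( 'sequences.txt', description='Input sequences', mimetype='text/plain' )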
    def __substitute_composite_key( self, key, composite_file, dataset = None ):
        if composite_file.substitute_name_with_metadata:
            if dataset:
                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
            else:
                meta_value = self.spec[composite_file.substitute_name_with_metadata].default
            return key % meta_value
        return key
    @property
    def writable_files( self, dataset = None ):
        files = odict()
        if self.composite_type != 'auto_primary_file':
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
            files[ key ] = value
        return files
    def get_composite_files( self, dataset = None ):
        def substitute_composite_key( key, composite_file ):
            if composite_file.substitute_name_with_metadata:
                if dataset:
                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
                else:
                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
                return key % meta_value
            return key
        files = odict()
        for key, value in self.composite_files.iteritems():
            files[ substitute_composite_key( key, value ) ] = value
        return files
    def generate_auto_primary_file( self, dataset = None ):
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
    @property
    def has_resolution(self):
        return False

    def matches_any( self, target_datatypes ):
        """
        Check if this datatype is of any of the target_datatypes or is
        a subtype thereof.
        """
        datatype_classes = tuple( [ datatype if isclass( datatype ) else datatype.__class__ for datatype in target_datatypes ] )
        return isinstance( self, datatype_classes )
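    # Editor's sketch (not part of the original file): matches_any accepts datatype classes
    # or instances and honors subclassing, e.g. with the Text subclass defined below:
    #
    #   >>> Text().matches_any( [ Data ] )    # Text is a subclass of Data
    #   True
    #   >>> Data().matches_any( [ Text() ] )  # instances are reduced to their class
    #   False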
    def merge( split_files, output_file):
        """
        Merge files using shutil.copyfileobj(), which avoids the maximum
        argument limit of cat. gz and bz2 files also work.
        """
        if not split_files:
            raise ValueError('Asked to merge zero files as %s' % output_file)
        elif len(split_files) == 1:
            shutil.copyfileobj(open(split_files[0], 'rb'), open(output_file, 'wb'))
        else:
            fdst = open(output_file, 'wb')
            for fsrc in split_files:
                shutil.copyfileobj(open(fsrc, 'rb'), fdst)
            fdst.close()

    merge = staticmethod(merge)
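    # Editor's sketch (not part of the original file): merge is a staticmethod, so split
    # parts can be recombined directly from a datatype class; the paths are hypothetical:
    #
    #   Text.merge( [ '/tmp/part1.txt', '/tmp/part2.txt' ], '/tmp/whole.txt' )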

    def get_visualizations( self, dataset ):
        """
        Returns a list of visualizations for datatype.
        """

        if self.track_type:
            return [ 'trackster', 'circster' ]
        return []

    # ------------- Dataproviders
    def has_dataprovider( self, data_format ):
        """
        Returns True if `data_format` is available in `dataproviders`.
        """
        return ( data_format in self.dataproviders )

    def dataprovider( self, dataset, data_format, **settings ):
        """
        Base dataprovider factory for all datatypes that returns the proper provider
        for the given `data_format` or raises a `NoProviderAvailable`.
        """
        if self.has_dataprovider( data_format ):
            return self.dataproviders[ data_format ]( self, dataset, **settings )
        raise dataproviders.exceptions.NoProviderAvailable( self, data_format )
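    # Editor's sketch (not part of the original file): a caller could request one of the
    # providers registered below by name; **settings are provider-specific keyword
    # arguments (the names shown here are illustrative, not confirmed by this file):
    #
    #   provider = datatype.dataprovider( dataset, 'chunk', chunk_index=0, chunk_size=65536 )
    #   for chunk in provider:
    #       pass  # consume chunks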

    @dataproviders.decorators.dataprovider_factory( 'base' )
    def base_dataprovider( self, dataset, **settings ):
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.base.DataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'chunk', dataproviders.chunk.ChunkDataProvider.settings )
    def chunk_dataprovider( self, dataset, **settings ):
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.chunk.ChunkDataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'chunk64', dataproviders.chunk.Base64ChunkDataProvider.settings )
    def chunk64_dataprovider( self, dataset, **settings ):
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.chunk.Base64ChunkDataProvider( dataset_source, **settings )

    def _clean_and_set_mime_type(self, trans, mime):
        if mime.lower() in XSS_VULNERABLE_MIME_TYPES:
            if not getattr( trans.app.config, "serve_xss_vulnerable_mimetypes", True ):
                mime = DEFAULT_MIME_TYPE
        trans.response.set_content_type( mime )
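    # Editor's note (not part of the original file): a worked example of the check above.
    # With mime = 'image/svg+xml' (listed in XSS_VULNERABLE_MIME_TYPES):
    #   serve_xss_vulnerable_mimetypes = False -> the response content type becomes 'text/plain'
    #   serve_xss_vulnerable_mimetypes = True (the getattr default) -> 'image/svg+xml' is kept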


@dataproviders.decorators.has_dataproviders
class Text( Data ):
    file_ext = 'txt'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
    def set_meta( self, dataset, **kwd ):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)
    def estimate_file_lines( self, dataset ):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open( dataset.file_name )
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines
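    # Editor's note (not part of the original file): a worked example of the extrapolation
    # above with made-up numbers: if the first 1048576 bytes contain 5000 newlines and the
    # whole file is 10485760 bytes, the estimate is int(5000 * (10485760 / 1048576.0)) = 50000.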
    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        return data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[] ):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % ( util.commaify( str(dataset.metadata.data_lines) ), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
                else:
                    # Number of lines is not known and auto-detect is needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        #Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % ( util.commaify( str(lc) ), inflector.cond_plural(lc, self.line_class) )
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % ( util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) )
            else:
                dataset.blurb = "%s %s" % ( util.commaify( str(line_count) ), inflector.cond_plural(line_count, self.line_class) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1
            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()
    split = classmethod(split)
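    # Editor's sketch (not part of the original file): based on the keys read above,
    # split_params is a dict such as
    #
    #   { 'split_mode': 'number_of_parts', 'split_size': 4 }    # four roughly equal parts
    #   { 'split_mode': 'to_size', 'split_size': 1000 }         # parts of at most 1000 lines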

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings )
    def line_dataprovider( self, dataset, **settings ):
        """
        Returns an iterator over the dataset's lines (that have been `strip`ed)
        optionally excluding blank lines and lines that start with a comment character.
        """
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.line.FilteredLineDataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings )
    def regex_line_dataprovider( self, dataset, **settings ):
        """
        Returns an iterator over the dataset's lines
        optionally including/excluding lines that match one or more regex filters.
        """
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.line.RegexLineDataProvider( dataset_source, **settings )


class GenericAsn1( Text ):
    """Class for generic ASN.1 text format"""
    file_ext = 'asn1'


class LineCount( Text ):
    """
    Dataset contains a single line with a single integer that denotes the
    line count for a related dataset. Used for custom builds.
    """
    pass


class Newick( Text ):
    """New Hampshire/Newick Format"""
    file_ext = "nhx"

    def __init__(self, **kwd):
        """Initialize Newick datatype"""
        Text.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        Text.init_meta( self, dataset, copy_from=copy_from )

    def sniff( self, filename ):
        """Returning False as the Newick format is too general and cannot be sniffed."""
        return False

    def get_visualizations( self, dataset ):
        """
        Returns a list of visualizations for datatype.
        """

        return [ 'phyloviz' ]


class Nexus( Text ):
    """Nexus format as used by Paup, Mr Bayes, etc."""
    file_ext = "nex"

    def __init__(self, **kwd):
        """Initialize Nexus datatype"""
        Text.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        Text.init_meta( self, dataset, copy_from=copy_from )

    def sniff( self, filename ):
        """All Nexus files simply put '#NEXUS' in the first line."""
        f = open( filename, "r" )
        firstline = f.readline().upper()
        f.close()

        if "#NEXUS" in firstline:
            return True
        else:
            return False

    def get_visualizations( self, dataset ):
        """
        Returns a list of visualizations for datatype.
        """

        return [ 'phyloviz' ]


# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util.  It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size

def get_test_fname( fname ):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join( path, 'test', fname )
    return full_path

def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[] ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22    30128507    31828507    uc003bnx.1_cds_2_0_chr22_29227_f    0    +\n'

    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    while count <= LINE_COUNT:
        line = temp.readline( WIDTH )
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in [ 'gzipped', 'binary' ]:
            break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith( skipchar ):
                skip_line = True
                break
        if not skip_line:
            lines.append( line )
            count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]:
        text = "%s file" % file_type
    else:
        try:
            text = util.unicodify( '\n'.join( lines ) )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
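
# Editor's sketch (not part of the original file): typical use of get_file_peek with a
# hypothetical path; WIDTH limits how much of each line is read, LINE_COUNT how many
# lines are collected, and skipchars drops lines starting with the given prefixes:
#
#   peek = get_file_peek( '/tmp/example.bed', LINE_COUNT=5, WIDTH=256, skipchars=[ '#' ] )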