PageRenderTime 64ms CodeModel.GetById 14ms app.highlight 43ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/datatypes/binary.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 640 lines | 435 code | 93 blank | 112 comment | 53 complexity | 66f53aece04bd2581ee30f81bc888b6e MD5 | raw file
  1"""
  2Binary classes
  3"""
  4
  5import binascii
  6import data
  7import gzip
  8import logging
  9import os
 10import shutil
 11import struct
 12import subprocess
 13import tempfile
 14import zipfile
 15
 16from urllib import urlencode, quote_plus
 17from galaxy import eggs
 18eggs.require( "bx-python" )
 19
 20from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP, TWOBIT_MAGIC_SIZE
 21
 22from galaxy.datatypes.metadata import MetadataElement
 23from galaxy.datatypes import metadata
 24from galaxy.datatypes.sniff import *
 25import dataproviders
 26
 27log = logging.getLogger(__name__)
 28
 29# Currently these supported binary data types must be manually set on upload
 30
 31class Binary( data.Data ):
 32    """Binary data"""
 33    sniffable_binary_formats = []
 34    unsniffable_binary_formats = []
 35
 36    @staticmethod
 37    def register_sniffable_binary_format(data_type, ext, type_class):
 38        Binary.sniffable_binary_formats.append({"type": data_type, "ext": ext, "class": type_class})
 39
 40    @staticmethod
 41    def register_unsniffable_binary_ext(ext):
 42        Binary.unsniffable_binary_formats.append(ext)
 43
 44    @staticmethod
 45    def is_sniffable_binary( filename ):
 46        format_information = None
 47        for format in Binary.sniffable_binary_formats:
 48            format_instance = format[ "class" ]()
 49            try:
 50                if format_instance.sniff(filename):
 51                    format_information = ( format["type"], format[ "ext" ] )
 52                    break
 53            except Exception:
 54                # Sniffer raised exception, could be any number of
 55                # reasons for this so there is not much to do besides
 56                # trying next sniffer.
 57                pass
 58        return format_information
 59
 60    @staticmethod
 61    def is_ext_unsniffable(ext):
 62        return ext in Binary.unsniffable_binary_formats
 63
 64    def set_peek( self, dataset, is_multi_byte=False ):
 65        """Set the peek and blurb text"""
 66        if not dataset.dataset.purged:
 67            dataset.peek = 'binary data'
 68            dataset.blurb = data.nice_size( dataset.get_size() )
 69        else:
 70            dataset.peek = 'file does not exist'
 71            dataset.blurb = 'file purged from disk'
 72
 73    def get_mime( self ):
 74        """Returns the mime type of the datatype"""
 75        return 'application/octet-stream'
 76
 77    def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None, size=None, offset=None, **kwd):
 78        trans.response.set_content_type(dataset.get_mime())
 79        trans.log_event( "Display dataset id: %s" % str( dataset.id ) )
 80        trans.response.headers['Content-Length'] = int( os.stat( dataset.file_name ).st_size )
 81        to_ext = dataset.extension
 82        valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 83        fname = ''.join(c in valid_chars and c or '_' for c in dataset.name)[0:150]
 84        trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
 85        trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext)
 86        return open( dataset.file_name )
 87
 88
 89class Ab1( Binary ):
 90    """Class describing an ab1 binary sequence file"""
 91    file_ext = "ab1"
 92
 93    def set_peek( self, dataset, is_multi_byte=False ):
 94        if not dataset.dataset.purged:
 95            dataset.peek  = "Binary ab1 sequence file"
 96            dataset.blurb = data.nice_size( dataset.get_size() )
 97        else:
 98            dataset.peek = 'file does not exist'
 99            dataset.blurb = 'file purged from disk'
100
101    def display_peek( self, dataset ):
102        try:
103            return dataset.peek
104        except:
105            return "Binary ab1 sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )
106
107Binary.register_unsniffable_binary_ext("ab1")
108
109
class GenericAsn1Binary( Binary ):
    """Class for generic ASN.1 binary format"""
    # No sniffer exists for this format; it is assigned by extension only.
    file_ext = "asn1-binary"

Binary.register_unsniffable_binary_ext("asn1-binary")
115
116
@dataproviders.decorators.has_dataproviders
class Bam( Binary ):
    """Class describing a BAM binary file"""
    file_ext = "bam"
    track_type = "ReadTrack"
    data_sources = { "data": "bai", "index": "bigwig" }

    MetadataElement( name="bam_index", desc="BAM Index File", param=metadata.FileParameter, file_ext="bai", readonly=True, no_value=None, visible=False, optional=True )

    def _get_samtools_version( self ):
        """Return the samtools version string (e.g. '0.1.12a'), or '0.0.0'
        if it cannot be determined."""
        # Determine the version of samtools being used.  Wouldn't it be nice if
        # samtools provided a version flag to make this much simpler?
        version = '0.0.0'
        # Running samtools with no arguments prints its usage message,
        # including the version line, to stderr.
        output = subprocess.Popen( [ 'samtools' ], stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[1]
        for line in output.split( '\n' ):
            if line.lower().startswith( 'version' ):
                # Assuming line looks something like: version: 0.1.12a (r862)
                version = line.split()[1]
                break
        return version

    def _version_as_tuple( self, version ):
        """Convert a version string like '0.1.12a' into a tuple of ints
        ( 0, 1, 12 ) so versions compare numerically.  A plain string
        comparison is wrong here: lexicographically '0.1.9' > '0.1.13'.
        Trailing non-numeric suffixes (the 'a' in '0.1.12a') are ignored."""
        parts = []
        for piece in version.split( '.' ):
            digits = ''
            for char in piece:
                if not char.isdigit():
                    break
                digits += char
            parts.append( int( digits ) if digits else 0 )
        return tuple( parts )

    def _is_coordinate_sorted( self, file_name ):
        """See if the input BAM file is sorted from the header information."""
        params = [ "samtools", "view", "-H", file_name ]
        output = subprocess.Popen( params, stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[0]
        # find returns -1 if string is not found
        return output.find( "SO:coordinate" ) != -1 or output.find( "SO:sorted" ) != -1

    def dataset_content_needs_grooming( self, file_name ):
        """Return True if file_name is a BAM file that still needs to be
        coordinate-sorted, False otherwise."""
        version = self._get_samtools_version()
        # Fixed: compare versions numerically, not as strings -- the old
        # string comparison treated '0.1.9' as newer than '0.1.13'.
        if self._version_as_tuple( version ) < ( 0, 1, 13 ):
            return not self._is_coordinate_sorted( file_name )
        # Samtools version 0.1.13 or newer produces an error condition when attempting to index an
        # unsorted bam file - see http://biostar.stackexchange.com/questions/5273/is-my-bam-file-sorted.
        # So when using a newer version of samtools, we'll first check if the input BAM file is sorted
        # from the header information.  If the header is present and sorted, we do nothing by returning False.
        # If it's present and unsorted or if it's missing, we'll index the bam file to see if it produces the
        # error.  If it does, sorting is needed so we return True (otherwise False).
        #
        # TODO: we're creating an index file here and throwing it away.  We then create it again when
        # the set_meta() method below is called later in the job process.  We need to enhance this overall
        # process so we don't create an index twice.  In order to make it worth the time to implement the
        # upload tool / framework to allow setting metadata from directly within the tool itself, it should be
        # done generically so that all tools will have the ability.  In testing, a 6.6 gb BAM file took 128
        # seconds to index with samtools, and 45 minutes to sort, so indexing is relatively inexpensive.
        if self._is_coordinate_sorted( file_name ):
            return False
        index_name = tempfile.NamedTemporaryFile( prefix = "bam_index" ).name
        stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name
        # NOTE(review): shell=True with an interpolated file name -- unsafe if
        # file_name can contain shell metacharacters; consider a list argv.
        command = 'samtools index %s %s' % ( file_name, index_name )
        proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
        proc.wait()
        stderr = open( stderr_name ).read().strip()
        # Clean up both temp files unconditionally (this logic was previously
        # duplicated across two branches).
        for temp_name in ( index_name, stderr_name ):
            try:
                os.unlink( temp_name )
            except OSError:
                pass
        # Return True if the unsorted error condition is found (find returns
        # -1 if string is not found); empty stderr means indexing succeeded.
        return stderr.find( "[bam_index_core] the alignment is not sorted" ) != -1

    def groom_dataset_content( self, file_name ):
        """
        Ensures that the Bam file contents are sorted.  This function is called
        on an output dataset after the content is initially generated.
        """
        # Use samtools to sort the Bam file
        ##$ samtools sort
        ##Usage: samtools sort [-on] [-m <maxMem>] <in.bam> <out.prefix>
        ## Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
        ## This command may also create temporary files <out.prefix>.%d.bam when the
        ## whole alignment cannot be fitted into memory ( controlled by option -m ).
        #do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
        if not self.dataset_content_needs_grooming( file_name ):
            # Don't re-sort if already sorted
            return
        tmp_dir = tempfile.mkdtemp()
        tmp_sorted_dataset_file_name_prefix = os.path.join( tmp_dir, 'sorted' )
        stderr_name = tempfile.NamedTemporaryFile( dir = tmp_dir, prefix = "bam_sort_stderr" ).name
        samtools_created_sorted_file_name = "%s.bam" % tmp_sorted_dataset_file_name_prefix #samtools accepts a prefix, not a filename, it always adds .bam to the prefix
        # NOTE(review): shell=True with interpolated file names (see above).
        command = "samtools sort %s %s" % ( file_name, tmp_sorted_dataset_file_name_prefix )
        proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
        exit_code = proc.wait()
        #Did sort succeed?
        stderr = open( stderr_name ).read().strip()
        if stderr:
            if exit_code != 0:
                shutil.rmtree( tmp_dir ) #clean up
                # Call-style raise: valid on both Python 2 and 3 (the old
                # 'raise Exception, msg' form is Python-2-only).
                raise Exception( "Error Grooming BAM file contents: %s" % stderr )
            else:
                # Non-fatal samtools warnings; surface them in the job output.
                print( stderr )
        # Move samtools_created_sorted_file_name to our output dataset location
        shutil.move( samtools_created_sorted_file_name, file_name )
        # Remove temp file and empty temporary directory
        os.unlink( stderr_name )
        os.rmdir( tmp_dir )

    def init_meta( self, dataset, copy_from=None ):
        Binary.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """ Creates the index for the BAM file. """
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.bam_index
        if not index_file:
            index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset )
        # Create the Bam index
        ##$ samtools index
        ##Usage: samtools index <in.bam> [<out.index>]
        stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name
        command = 'samtools index %s %s' % ( dataset.file_name, index_file.file_name )
        proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
        exit_code = proc.wait()
        #Did index succeed?
        stderr = open( stderr_name ).read().strip()
        if stderr:
            if exit_code != 0:
                os.unlink( stderr_name ) #clean up
                raise Exception( "Error Setting BAM Metadata: %s" % stderr )
            else:
                print( stderr )
        dataset.metadata.bam_index = index_file
        # Remove temp file
        os.unlink( stderr_name )

    def sniff( self, filename ):
        """Return True if filename is BGZF-compressed data whose first four
        decompressed bytes are the BAM magic 'BAM\\1'."""
        # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
        try:
            header = gzip.open( filename ).read(4)
            if binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' ):
                return True
            return False
        except:
            return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for a BAM dataset."""
        if not dataset.dataset.purged:
            dataset.peek  = "Binary bam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Binary bam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )

    # ------------- Dataproviders
    # pipe through samtools view
    #ALSO: (as Sam)
    # bam does not use '#' to indicate comments/headers - we need to strip out those headers from the std. providers
    #TODO:?? seems like there should be an easier way to do/inherit this - metadata.comment_char?
    #TODO: incorporate samtools options to control output: regions first, then flags, etc.
    @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings )
    def line_dataprovider( self, dataset, **settings ):
        """Provide lines of samtools view output, skipping '@' header lines."""
        samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
        settings[ 'comment_char' ] = '@'
        return dataproviders.line.FilteredLineDataProvider( samtools_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings )
    def regex_line_dataprovider( self, dataset, **settings ):
        """Provide regex-filtered lines, skipping '@' header lines."""
        samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
        settings[ 'comment_char' ] = '@'
        return dataproviders.line.RegexLineDataProvider( samtools_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings )
    def column_dataprovider( self, dataset, **settings ):
        """Provide columnar data from samtools view output."""
        samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
        settings[ 'comment_char' ] = '@'
        return dataproviders.column.ColumnarDataProvider( samtools_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings )
    def dict_dataprovider( self, dataset, **settings ):
        """Provide dict-per-row data from samtools view output."""
        samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
        settings[ 'comment_char' ] = '@'
        return dataproviders.column.DictDataProvider( samtools_source, **settings )

    # these can't be used directly - may need BamColumn, BamDict (Bam metadata -> column/dict)
    # OR - see genomic_region_dataprovider
    #@dataproviders.decorators.dataprovider_factory( 'dataset-column', dataproviders.column.ColumnarDataProvider.settings )
    #def dataset_column_dataprovider( self, dataset, **settings ):
    #    settings[ 'comment_char' ] = '@'
    #    return super( Sam, self ).dataset_column_dataprovider( dataset, **settings )

    #@dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings )
    #def dataset_dict_dataprovider( self, dataset, **settings ):
    #    settings[ 'comment_char' ] = '@'
    #    return super( Sam, self ).dataset_dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'header', dataproviders.line.RegexLineDataProvider.settings )
    def header_dataprovider( self, dataset, **settings ):
        # in this case we can use an option of samtools view to provide just what we need (w/o regex)
        samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset, '-H' )
        return dataproviders.line.RegexLineDataProvider( samtools_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'id-seq-qual', dataproviders.column.DictDataProvider.settings )
    def id_seq_qual_dataprovider( self, dataset, **settings ):
        """Provide read id, sequence, and quality columns (SAM cols 0, 9, 10)."""
        settings[ 'indeces' ] = [ 0, 9, 10 ]
        settings[ 'column_types' ] = [ 'str', 'str', 'str' ]
        settings[ 'column_names' ] = [ 'id', 'seq', 'qual' ]
        return self.dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region', dataproviders.column.ColumnarDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        # GenomicRegionDataProvider currently requires a dataset as source - may not be necc.
        #TODO:?? consider (at least) the possible use of a kwarg: metadata_source (def. to source.dataset),
        #   or remove altogether...
        #samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
        #return dataproviders.dataset.GenomicRegionDataProvider( samtools_source, metadata_source=dataset,
        #                                                        2, 3, 3, **settings )

        # instead, set manually and use in-class column gen
        settings[ 'indeces' ] = [ 2, 3, 3 ]
        settings[ 'column_types' ] = [ 'str', 'int', 'int' ]
        return self.column_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict', dataproviders.column.DictDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        """Provide chrom/start/end dicts (SAM cols 2, 3, 3)."""
        settings[ 'indeces' ] = [ 2, 3, 3 ]
        settings[ 'column_types' ] = [ 'str', 'int', 'int' ]
        settings[ 'column_names' ] = [ 'chrom', 'start', 'end' ]
        return self.dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'samtools' )
    def samtools_dataprovider( self, dataset, **settings ):
        """Generic samtools interface - all options available through settings."""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.dataset.SamtoolsDataProvider( dataset_source, **settings )

Binary.register_sniffable_binary_format("bam", "bam", Bam)
366
367
class Bcf( Binary):
    """Class describing a BCF file"""
    file_ext = "bcf"

    def sniff( self, filename ):
        """Return True if filename is BGZF-compressed data whose first three
        decompressed bytes are the BCF magic 'BCF'."""
        # BCF is compressed in the BGZF format, and must not be uncompressed in Galaxy.
        try:
            magic = gzip.open( filename ).read(3)
            return binascii.b2a_hex( magic ) == binascii.hexlify( 'BCF' )
        except:
            return False

Binary.register_sniffable_binary_format("bcf", "bcf", Bcf)
384
385
class H5( Binary ):
    """Class describing an HDF5 file"""
    file_ext = "h5"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for an HDF5 dataset."""
        if not dataset.dataset.purged:
            dataset.peek  = "Binary h5 file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            # Fixed: previously said "Binary h5 sequence file" (copy-paste
            # from Ab1), inconsistent with set_peek() -- HDF5 is a generic
            # binary container, not a sequence format.
            return "Binary h5 file (%s)" % ( data.nice_size( dataset.get_size() ) )

Binary.register_unsniffable_binary_ext("h5")
405
406
class Scf( Binary ):
    """Class describing an scf binary sequence file"""
    file_ext = "scf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for an scf dataset."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
        else:
            dataset.peek  = "Binary scf sequence file"
            dataset.blurb = data.nice_size( dataset.get_size() )

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Binary scf sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )

Binary.register_unsniffable_binary_ext("scf")
426
427
class Sff( Binary ):
    """ Standard Flowgram Format (SFF) """
    file_ext = "sff"

    def __init__( self, **kwd ):
        Binary.__init__( self, **kwd )

    def sniff( self, filename ):
        """Return True if filename starts with the SFF magic number '.sff'.

        For details about the format, see
        http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format
        """
        try:
            # Fixed: open in binary mode ('rb') so platform newline
            # translation cannot corrupt the magic-number check; comparing
            # the bytes directly is equivalent to the old hexlify round-trip.
            header = open( filename, 'rb' ).read(4)
            return header == '.sff'
        except:
            return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for an sff dataset."""
        if not dataset.dataset.purged:
            dataset.peek  = "Binary sff file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Binary sff file (%s)" % ( data.nice_size( dataset.get_size() ) )

Binary.register_sniffable_binary_format("sff", "sff", Sff)
461
462
class BigWig(Binary):
    """
    Accessing binary BigWig files from UCSC.
    The supplemental info in the paper has the binary details:
    http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq351v1
    """
    track_type = "LineTrack"
    data_sources = { "data_standalone": "bigwig" }

    def __init__( self, **kwd ):
        Binary.__init__( self, **kwd )
        # BigWig magic number; BigBed overrides these two attributes.
        self._magic = 0x888FFC26
        self._name = "BigWig"

    def _unpack( self, pattern, handle ):
        """Read and unpack exactly the number of bytes pattern requires."""
        return struct.unpack( pattern, handle.read( struct.calcsize( pattern ) ) )

    def sniff( self, filename ):
        """Return True if the file starts with this type's magic number.

        NOTE(review): "I" unpacks in native byte order, so only the
        native-endian on-disk encoding is recognized (matches the original
        behavior).
        """
        try:
            # Fixed: open in binary mode and close the handle; the original
            # read binary data through a text-mode handle it never closed.
            handle = open( filename, 'rb' )
            try:
                magic = self._unpack( "I", handle )
            finally:
                handle.close()
            return magic[0] == self._magic
        except:
            return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for a UCSC big* dataset."""
        if not dataset.dataset.purged:
            dataset.peek  = "Binary UCSC %s file" % self._name
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Binary UCSC %s file (%s)" % ( self._name, data.nice_size( dataset.get_size() ) )

Binary.register_sniffable_binary_format("bigwig", "bigwig", BigWig)
502
503
class BigBed(BigWig):
    """BigBed support from UCSC."""

    data_sources = { "data_standalone": "bigbed" }

    def __init__( self, **kwd ):
        # Initialize via Binary (as the original did), then override the
        # magic number and display name that BigWig.sniff/set_peek read.
        Binary.__init__( self, **kwd )
        self._magic, self._name = 0x8789F2EB, "BigBed"

Binary.register_sniffable_binary_format("bigbed", "bigbed", BigBed)
515
516
517# Cistrome Customized Datatypes
class Eset( data.Data ):
    """Class describing an Expression Set"""
    file_ext = "eset"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for an expression-set dataset."""
        if not dataset.dataset.purged:
            dataset.peek  = "Expression set"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a fixed description."""
        try:
            return dataset.peek
        except:
            return "Expression set"

    def get_mime(self):
        """Returns the mime type of the datatype"""
        # Fixed: this method previously had a docstring but no return
        # statement, so it implicitly returned None.  An expression set is
        # opaque binary content; serve it as a generic byte stream.
        return 'application/octet-stream'
535
class XysZip( data.Data ):
    """Class describing a zip archive of NimbleGen XYS files"""
    file_ext = "xys.zip"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text; the peek reports the archive's member count."""
        if not dataset.dataset.purged:
            # Fixed: close the archive handle (it was previously leaked).
            zip_file = zipfile.ZipFile( dataset.file_name, "r" )
            try:
                num_files = len( zip_file.namelist() )
            finally:
                zip_file.close()
            dataset.peek  = "Archive of %s NimbleGen XYS files" % ( str( num_files - 1 ) )
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "NimbleGen XYS file archive (%s)" % ( data.nice_size( dataset.get_size() ) )

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/zip'
556
class CelZip( data.Data ):
    """Class describing a zip archive of Affymetrix CEL files"""
    file_ext = "cel.zip"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text; the peek reports the archive's member count."""
        if not dataset.dataset.purged:
            # Fixed: close the archive handle (it was previously leaked).
            zip_file = zipfile.ZipFile( dataset.file_name, "r" )
            try:
                num_files = len( zip_file.namelist() )
            finally:
                zip_file.close()
            dataset.peek  = "Archive of %s Affymetrix CEL files" % ( str( num_files - 1 ) )
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Affymetrix CEL file archive (%s)" % ( data.nice_size( dataset.get_size() ) )

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/zip'
577
class Cel( data.Data ):
    """Class describing an binary CEL file"""
    file_ext = "cel"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set peek/blurb text for a CEL dataset.

        A previously computed 'export_url' (built with urlencode) was never
        used anywhere and has been removed as dead code.
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary cel sequence file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Binary cel sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )

    def sniff( self, filename ):
        """Determines whether the file is in cel format.

        NOTE(review): this only inspects the file *name* for a '.cel'
        component -- it never reads the file content.
        """
        return 'cel' in filename.lower().split('.')

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/octet-stream'
608# End
609
class TwoBit (Binary):
    """Class describing a TwoBit format nucleotide file"""

    file_ext = "twobit"

    def sniff(self, filename):
        """Return True if the file starts with the twobit magic number
        (in either byte order), False otherwise."""
        try:
            # All twobit files start with a 16-byte header. If the file is smaller than 16 bytes, it's obviously not a valid twobit file.
            if os.path.getsize(filename) < 16:
                return False
            # Fixed: use open() in binary mode instead of the Python-2-only
            # file() builtin in text mode, and close the handle.
            handle = open(filename, 'rb')
            try:
                magic = struct.unpack(">L", handle.read(TWOBIT_MAGIC_SIZE))[0]
            finally:
                handle.close()
            # Fixed: previously fell through and implicitly returned None
            # when the magic number did not match.
            return magic == TWOBIT_MAGIC_NUMBER or magic == TWOBIT_MAGIC_NUMBER_SWAP
        except IOError:
            return False

    def set_peek(self, dataset, is_multi_byte=False):
        """Set peek/blurb text; delegate to Binary.set_peek for purged files."""
        if not dataset.dataset.purged:
            dataset.peek = "Binary TwoBit format nucleotide file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            return super(TwoBit, self).set_peek(dataset, is_multi_byte)

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a generated description."""
        try:
            return dataset.peek
        except:
            return "Binary TwoBit format nucleotide file (%s)" % (data.nice_size(dataset.get_size()))

Binary.register_sniffable_binary_format("twobit", "twobit", TwoBit)