/lib/galaxy/datatypes/binary.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 640 lines · 435 code · 93 blank · 112 comment · 79 complexity · 66f53aece04bd2581ee30f81bc888b6e MD5 · raw file

  1. """
  2. Binary classes
  3. """
  4. import binascii
  5. import data
  6. import gzip
  7. import logging
  8. import os
  9. import shutil
  10. import struct
  11. import subprocess
  12. import tempfile
  13. import zipfile
  14. from urllib import urlencode, quote_plus
  15. from galaxy import eggs
  16. eggs.require( "bx-python" )
  17. from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP, TWOBIT_MAGIC_SIZE
  18. from galaxy.datatypes.metadata import MetadataElement
  19. from galaxy.datatypes import metadata
  20. from galaxy.datatypes.sniff import *
  21. import dataproviders
  22. log = logging.getLogger(__name__)
  23. # Currently these supported binary data types must be manually set on upload
  class Binary( data.Data ):
      """
      Base class for binary datatypes.

      Holds two class-level registries: formats that can be detected by
      calling a datatype's ``sniff`` method, and extensions that are only
      recorded by name because no sniffer exists for them.
      """
      # Each entry is a dict with the keys "type", "ext" and "class".
      sniffable_binary_formats = []
      # Plain list of extension strings.
      unsniffable_binary_formats = []

      @staticmethod
      def register_sniffable_binary_format(data_type, ext, type_class):
          """Register a datatype class whose ``sniff`` method can detect its files."""
          Binary.sniffable_binary_formats.append({"type": data_type, "ext": ext, "class": type_class})

      @staticmethod
      def register_unsniffable_binary_ext(ext):
          """Register an extension for which no sniffer is available."""
          Binary.unsniffable_binary_formats.append(ext)

      @staticmethod
      def is_sniffable_binary( filename ):
          """
          Try every registered sniffer against ``filename``.

          Returns a ``( type, ext )`` tuple for the first format whose
          sniffer accepts the file, or None when no sniffer matches.
          """
          for format_entry in Binary.sniffable_binary_formats:
              sniffer = format_entry[ "class" ]()
              try:
                  if sniffer.sniff( filename ):
                      return ( format_entry[ "type" ], format_entry[ "ext" ] )
              except Exception:
                  # Sniffer raised exception, could be any number of
                  # reasons for this so there is not much to do besides
                  # trying next sniffer.
                  pass
          return None

      @staticmethod
      def is_ext_unsniffable(ext):
          """Return True if ``ext`` was registered as an unsniffable extension."""
          return ext in Binary.unsniffable_binary_formats

      def set_peek( self, dataset, is_multi_byte=False ):
          """Set the peek and blurb text"""
          if not dataset.dataset.purged:
              dataset.peek = 'binary data'
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def get_mime( self ):
          """Returns the mime type of the datatype"""
          return 'application/octet-stream'

      def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None, size=None, offset=None, **kwd):
          """
          Serve the dataset as a file download.

          Sets the response headers (content type, length and disposition)
          on ``trans.response`` and returns an open handle on the dataset
          file. The ``to_ext`` argument is ignored; the dataset's own
          extension is always used for the download file name.
          """
          trans.log_event( "Display dataset id: %s" % str( dataset.id ) )
          trans.response.headers['Content-Length'] = int( os.stat( dataset.file_name ).st_size )
          to_ext = dataset.extension
          valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
          # Replace anything outside the whitelist so the name is safe inside the header.
          fname = ''.join( [ ( c if c in valid_chars else '_' ) for c in dataset.name ] )[0:150]
          # Force octet-stream so Safari doesn't append mime extensions to filename.
          # (The datatype's own mime type used to be set first and was then
          # overwritten here, so only this call is kept.)
          trans.response.set_content_type( "application/octet-stream" )
          trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext)
          # Binary mode: the content must reach the client untranslated.
          return open( dataset.file_name, 'rb' )
  class Ab1( Binary ):
      """Datatype for ab1 binary sequence files."""
      file_ext = "ab1"

      def set_peek( self, dataset, is_multi_byte=False ):
          """Fill in the dataset's peek and blurb, depending on whether it was purged."""
          if dataset.dataset.purged:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'
              return
          dataset.peek = "Binary ab1 sequence file"
          dataset.blurb = data.nice_size( dataset.get_size() )

      def display_peek( self, dataset ):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              peek_text = dataset.peek
          except:
              peek_text = "Binary ab1 sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )
          return peek_text
  88. Binary.register_unsniffable_binary_ext("ab1")
  class GenericAsn1Binary( Binary ):
      """
      Datatype for generic ASN.1 data in binary form.

      Adds nothing beyond the extension; all behaviour is inherited
      from Binary.
      """
      file_ext = "asn1-binary"
  92. Binary.register_unsniffable_binary_ext("asn1-binary")
  @dataproviders.decorators.has_dataproviders
  class Bam( Binary ):
      """
      Class describing a BAM binary file.

      Besides sniffing and peek handling, this datatype shells out to the
      ``samtools`` executable (looked up on the PATH) to sort BAM files and
      to build the index stored in the ``bam_index`` metadata element.
      """
      file_ext = "bam"
      track_type = "ReadTrack"
      data_sources = { "data": "bai", "index": "bigwig" }

      MetadataElement( name="bam_index", desc="BAM Index File", param=metadata.FileParameter, file_ext="bai", readonly=True, no_value=None, visible=False, optional=True )

      @staticmethod
      def _version_tuple( version ):
          """
          Convert a version string such as '0.1.12a' into a tuple of ints, here ( 0, 1, 12 ).

          Only the leading digits of each dot-separated component are used;
          parsing stops at the first component that has none. Comparing the
          tuples orders versions numerically, which comparing the raw
          strings does not ('0.1.9' sorts after '0.1.13' as a string).
          """
          numbers = []
          for component in version.split( '.' ):
              digits = ''
              for char in component:
                  if not char.isdigit():
                      break
                  digits += char
              if not digits:
                  break
              numbers.append( int( digits ) )
          return tuple( numbers )

      def _run_samtools( self, args, cwd=None ):
          """
          Run ``samtools`` with the given argument list and wait for it to finish.

          The command is executed without a shell, so file names containing
          spaces or shell metacharacters are passed through untouched.
          Returns ``( exit_code, stderr )`` with stderr stripped of
          surrounding whitespace; stdout is not captured.
          """
          proc = subprocess.Popen( [ 'samtools' ] + list( args ), cwd=cwd, stderr=subprocess.PIPE )
          stderr = proc.communicate()[1]
          return proc.returncode, stderr.strip()

      def _get_samtools_version( self ):
          """Return the samtools version string, or '0.0.0' if none is reported."""
          # Determine the version of samtools being used.  Wouldn't it be nice if
          # samtools provided a version flag to make this much simpler?
          version = '0.0.0'
          output = subprocess.Popen( [ 'samtools' ], stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[1]
          for line in output.split( '\n' ):
              if line.lower().startswith( 'version' ):
                  # Assuming line looks something like: version: 0.1.12a (r862)
                  fields = line.split()
                  if len( fields ) > 1:
                      version = fields[1]
                  break
          return version

      def _is_coordinate_sorted( self, file_name ):
          """See if the input BAM file is sorted from the header information."""
          params = [ "samtools", "view", "-H", file_name ]
          output = subprocess.Popen( params, stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[0]
          return "SO:coordinate" in output or "SO:sorted" in output

      def dataset_content_needs_grooming( self, file_name ):
          """
          See if file_name is a sorted BAM file.

          Returns True when the file needs to be sorted, False otherwise.
          """
          if self._version_tuple( self._get_samtools_version() ) < ( 0, 1, 13 ):
              return not self._is_coordinate_sorted( file_name )
          # Samtools version 0.1.13 or newer produces an error condition when attempting to index an
          # unsorted bam file - see http://biostar.stackexchange.com/questions/5273/is-my-bam-file-sorted.
          # So when using a newer version of samtools, we'll first check if the input BAM file is sorted
          # from the header information.  If the header is present and sorted, we do nothing by returning False.
          # If it's present and unsorted or if it's missing, we'll index the bam file to see if it produces the
          # error.  If it does, sorting is needed so we return True (otherwise False).
          #
          # TODO: we're creating an index file here and throwing it away.  We then create it again when
          # the set_meta() method below is called later in the job process.  We need to enhance this overall
          # process so we don't create an index twice.  In order to make it worth the time to implement the
          # upload tool / framework to allow setting metadata from directly within the tool itself, it should be
          # done generically so that all tools will have the ability.  In testing, a 6.6 gb BAM file took 128
          # seconds to index with samtools, and 45 minutes to sort, so indexing is relatively inexpensive.
          if self._is_coordinate_sorted( file_name ):
              return False
          # mkstemp reserves the name for us; samtools overwrites the empty file.
          index_fd, index_name = tempfile.mkstemp( prefix="bam_index" )
          os.close( index_fd )
          try:
              stderr = self._run_samtools( [ 'index', file_name, index_name ] )[1]
          finally:
              # The index is only a probe, throw it away whatever happened.
              try:
                  os.unlink( index_name )
              except OSError:
                  pass
          # True if the unsorted error condition is reported (empty stderr never matches).
          return "[bam_index_core] the alignment is not sorted" in stderr

      def groom_dataset_content( self, file_name ):
          """
          Ensures that the Bam file contents are sorted.  This function is called
          on an output dataset after the content is initially generated.

          Raises Exception if samtools exits non-zero and wrote to stderr.
          """
          # Use samtools to sort the Bam file
          ##$ samtools sort
          ##Usage: samtools sort [-on] [-m <maxMem>] <in.bam> <out.prefix>
          ## Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
          ## This command may also create temporary files <out.prefix>.%d.bam when the
          ## whole alignment cannot be fitted into memory ( controlled by option -m ).
          if not self.dataset_content_needs_grooming( file_name ):
              # Don't re-sort if already sorted
              return
          # Do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
          tmp_dir = tempfile.mkdtemp()
          try:
              sorted_prefix = os.path.join( tmp_dir, 'sorted' )
              # samtools accepts a prefix, not a filename, it always adds .bam to the prefix
              sorted_file_name = "%s.bam" % sorted_prefix
              exit_code, stderr = self._run_samtools( [ 'sort', file_name, sorted_prefix ], cwd=tmp_dir )
              # Did sort succeed?
              if stderr:
                  if exit_code != 0:
                      raise Exception( "Error Grooming BAM file contents: %s" % stderr )
                  print( stderr )
              # Move the sorted file to our output dataset location
              shutil.move( sorted_file_name, file_name )
          finally:
              # Remove the temporary directory and anything samtools left in it
              shutil.rmtree( tmp_dir, ignore_errors=True )

      def init_meta( self, dataset, copy_from=None ):
          """Delegate metadata initialisation to Binary."""
          Binary.init_meta( self, dataset, copy_from=copy_from )

      def set_meta( self, dataset, overwrite = True, **kwd ):
          """
          Creates the index for the BAM file.

          Raises Exception if samtools exits non-zero and wrote to stderr.
          """
          # These metadata values are not accessible by users, always overwrite
          index_file = dataset.metadata.bam_index
          if not index_file:
              index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset )
          # Create the Bam index
          ##$ samtools index
          ##Usage: samtools index <in.bam> [<out.index>]
          exit_code, stderr = self._run_samtools( [ 'index', dataset.file_name, index_file.file_name ] )
          # Did index succeed?
          if stderr:
              if exit_code != 0:
                  raise Exception( "Error Setting BAM Metadata: %s" % stderr )
              print( stderr )
          dataset.metadata.bam_index = index_file

      def sniff( self, filename ):
          """Return True if the gzip-decompressed file starts with the BAM magic bytes."""
          # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
          # The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
          try:
              handle = gzip.open( filename )
              try:
                  return handle.read( 4 ) == b'BAM\1'
              finally:
                  handle.close()
          except Exception:
              # Not gzip data, unreadable, etc. - in any case not a BAM file.
              return False

      def set_peek( self, dataset, is_multi_byte=False ):
          """Set the peek and blurb text, depending on whether the dataset was purged."""
          if not dataset.dataset.purged:
              dataset.peek = "Binary bam alignments file"
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek( self, dataset ):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Binary bam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )

      # ------------- Dataproviders
      # pipe through samtools view
      #ALSO: (as Sam)
      # bam does not use '#' to indicate comments/headers - we need to strip out those headers from the std. providers
      #TODO:?? seems like there should be an easier way to do/inherit this - metadata.comment_char?
      #TODO: incorporate samtools options to control output: regions first, then flags, etc.
      @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings )
      def line_dataprovider( self, dataset, **settings ):
          samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
          settings[ 'comment_char' ] = '@'
          return dataproviders.line.FilteredLineDataProvider( samtools_source, **settings )

      @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings )
      def regex_line_dataprovider( self, dataset, **settings ):
          samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
          settings[ 'comment_char' ] = '@'
          return dataproviders.line.RegexLineDataProvider( samtools_source, **settings )

      @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings )
      def column_dataprovider( self, dataset, **settings ):
          samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
          settings[ 'comment_char' ] = '@'
          return dataproviders.column.ColumnarDataProvider( samtools_source, **settings )

      @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings )
      def dict_dataprovider( self, dataset, **settings ):
          samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
          settings[ 'comment_char' ] = '@'
          return dataproviders.column.DictDataProvider( samtools_source, **settings )

      # these can't be used directly - may need BamColumn, BamDict (Bam metadata -> column/dict)
      # OR - see genomic_region_dataprovider
      #@dataproviders.decorators.dataprovider_factory( 'dataset-column', dataproviders.column.ColumnarDataProvider.settings )
      #def dataset_column_dataprovider( self, dataset, **settings ):
      #    settings[ 'comment_char' ] = '@'
      #    return super( Sam, self ).dataset_column_dataprovider( dataset, **settings )
      #@dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings )
      #def dataset_dict_dataprovider( self, dataset, **settings ):
      #    settings[ 'comment_char' ] = '@'
      #    return super( Sam, self ).dataset_dict_dataprovider( dataset, **settings )

      @dataproviders.decorators.dataprovider_factory( 'header', dataproviders.line.RegexLineDataProvider.settings )
      def header_dataprovider( self, dataset, **settings ):
          # in this case we can use an option of samtools view to provide just what we need (w/o regex)
          samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset, '-H' )
          return dataproviders.line.RegexLineDataProvider( samtools_source, **settings )

      @dataproviders.decorators.dataprovider_factory( 'id-seq-qual', dataproviders.column.DictDataProvider.settings )
      def id_seq_qual_dataprovider( self, dataset, **settings ):
          settings[ 'indeces' ] = [ 0, 9, 10 ]
          settings[ 'column_types' ] = [ 'str', 'str', 'str' ]
          settings[ 'column_names' ] = [ 'id', 'seq', 'qual' ]
          return self.dict_dataprovider( dataset, **settings )

      @dataproviders.decorators.dataprovider_factory( 'genomic-region', dataproviders.column.ColumnarDataProvider.settings )
      def genomic_region_dataprovider( self, dataset, **settings ):
          # GenomicRegionDataProvider currently requires a dataset as source - may not be necc.
          #TODO:?? consider (at least) the possible use of a kwarg: metadata_source (def. to source.dataset),
          #   or remove altogether...
          #samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset )
          #return dataproviders.dataset.GenomicRegionDataProvider( samtools_source, metadata_source=dataset,
          #                                                        2, 3, 3, **settings )
          # instead, set manually and use in-class column gen
          settings[ 'indeces' ] = [ 2, 3, 3 ]
          settings[ 'column_types' ] = [ 'str', 'int', 'int' ]
          return self.column_dataprovider( dataset, **settings )

      @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict', dataproviders.column.DictDataProvider.settings )
      def genomic_region_dict_dataprovider( self, dataset, **settings ):
          settings[ 'indeces' ] = [ 2, 3, 3 ]
          settings[ 'column_types' ] = [ 'str', 'int', 'int' ]
          settings[ 'column_names' ] = [ 'chrom', 'start', 'end' ]
          return self.dict_dataprovider( dataset, **settings )

      @dataproviders.decorators.dataprovider_factory( 'samtools' )
      def samtools_dataprovider( self, dataset, **settings ):
          """Generic samtools interface - all options available through settings."""
          dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
          return dataproviders.dataset.SamtoolsDataProvider( dataset_source, **settings )
  318. Binary.register_sniffable_binary_format("bam", "bam", Bam)
  class Bcf( Binary):
      """Class describing a BCF file"""
      file_ext = "bcf"

      def sniff( self, filename ):
          """Return True if the gzip-decompressed file starts with the bytes 'BCF'."""
          # BCF is compressed in the BGZF format, and must not be uncompressed in Galaxy.
          # The first 3 bytes of any bcf file is 'BCF', and the file is binary.
          try:
              handle = gzip.open( filename )
              try:
                  return handle.read( 3 ) == b'BCF'
              finally:
                  # Release the descriptor even when the read fails.
                  handle.close()
          except Exception:
              # Not gzip data, unreadable, etc. - in any case not a BCF file.
              return False
  332. Binary.register_sniffable_binary_format("bcf", "bcf", Bcf)
  class H5( Binary ):
      """Datatype for HDF5 files."""
      file_ext = "h5"

      def set_peek( self, dataset, is_multi_byte=False ):
          """Fill in the dataset's peek and blurb, depending on whether it was purged."""
          if dataset.dataset.purged:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'
              return
          dataset.peek = "Binary h5 file"
          dataset.blurb = data.nice_size( dataset.get_size() )

      def display_peek( self, dataset ):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              peek_text = dataset.peek
          except:
              peek_text = "Binary h5 sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )
          return peek_text
  348. Binary.register_unsniffable_binary_ext("h5")
  class Scf( Binary ):
      """Datatype for scf binary sequence files."""
      file_ext = "scf"

      def set_peek( self, dataset, is_multi_byte=False ):
          """Fill in the dataset's peek and blurb, depending on whether it was purged."""
          if dataset.dataset.purged:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'
              return
          dataset.peek = "Binary scf sequence file"
          dataset.blurb = data.nice_size( dataset.get_size() )

      def display_peek( self, dataset ):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              peek_text = dataset.peek
          except:
              peek_text = "Binary scf sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )
          return peek_text
  364. Binary.register_unsniffable_binary_ext("scf")
  class Sff( Binary ):
      """ Standard Flowgram Format (SFF) """
      file_ext = "sff"

      def __init__( self, **kwd ):
          Binary.__init__( self, **kwd )

      def sniff( self, filename ):
          """Return True if the file starts with the four bytes '.sff'."""
          # The first 4 bytes of any sff file is '.sff', and the file is binary. For details
          # about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format
          try:
              # Binary mode so the header bytes are read untranslated; the
              # with-block closes the handle even if the read fails.
              with open( filename, 'rb' ) as handle:
                  return handle.read( 4 ) == b'.sff'
          except Exception:
              return False

      def set_peek( self, dataset, is_multi_byte=False ):
          """Set the peek and blurb text, depending on whether the dataset was purged."""
          if not dataset.dataset.purged:
              dataset.peek = "Binary sff file"
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek( self, dataset ):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Binary sff file (%s)" % ( data.nice_size( dataset.get_size() ) )
  392. Binary.register_sniffable_binary_format("sff", "sff", Sff)
  class BigWig(Binary):
      """
      Accessing binary BigWig files from UCSC.
      The supplemental info in the paper has the binary details:
      http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq351v1
      """
      track_type = "LineTrack"
      data_sources = { "data_standalone": "bigwig" }

      def __init__( self, **kwd ):
          Binary.__init__( self, **kwd )
          # Magic number compared against the first unsigned int of the file.
          self._magic = 0x888FFC26
          # Human-readable format name used in the peek text.
          self._name = "BigWig"

      def _unpack( self, pattern, handle ):
          """Read exactly as many bytes as ``pattern`` needs from ``handle`` and unpack them."""
          return struct.unpack( pattern, handle.read( struct.calcsize( pattern ) ) )

      def sniff( self, filename ):
          """
          Return True if the file starts with this format's magic number.

          The value is read as a native-byte-order unsigned int.
          """
          try:
              # Binary mode so the header bytes are read untranslated; the
              # with-block closes the handle even if unpacking fails.
              with open( filename, 'rb' ) as handle:
                  magic = self._unpack( "I", handle )
              return magic[0] == self._magic
          except Exception:
              # Unreadable or too short to hold the magic number.
              return False

      def set_peek( self, dataset, is_multi_byte=False ):
          """Set the peek and blurb text, depending on whether the dataset was purged."""
          if not dataset.dataset.purged:
              dataset.peek = "Binary UCSC %s file" % self._name
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek( self, dataset ):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Binary UCSC %s file (%s)" % ( self._name, data.nice_size( dataset.get_size() ) )
  425. Binary.register_sniffable_binary_format("bigwig", "bigwig", BigWig)
  class BigBed(BigWig):
      """
      UCSC BigBed datatype.

      Reuses the sniffing and peek logic of BigWig; only the magic
      number and the display name differ.
      """
      data_sources = { "data_standalone": "bigbed" }

      def __init__( self, **kwd ):
          # Deliberately initialises via Binary and then sets both
          # BigWig attributes itself.
          Binary.__init__( self, **kwd )
          self._name = "BigBed"
          self._magic = 0x8789F2EB
  433. Binary.register_sniffable_binary_format("bigbed", "bigbed", BigBed)
  434. # Cistrome Customized Datatypes
  class Eset( data.Data ):
      """Class describing an Expression Set"""
      file_ext = "eset"

      def set_peek( self, dataset, is_multi_byte=False ):
          """Set the peek and blurb text, depending on whether the dataset was purged."""
          if not dataset.dataset.purged:
              dataset.peek = "Expression set"
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek(self, dataset):
          """Return the stored peek, or a fixed description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Expression set"

      def get_mime(self):
          """Returns the mime type of the datatype"""
          # The method previously had no return statement and so yielded
          # None; use the same generic binary mime type as the other
          # non-archive datatypes in this module.
          return 'application/octet-stream'
  class XysZip( data.Data ):
      """Class describing a zip archive of NimbleGen XYS files"""
      file_ext = "xys.zip"

      def set_peek( self, dataset, is_multi_byte=False ):
          """
          Set the peek and blurb text.

          For a dataset that is still on disk the peek reports the number
          of archive members minus one.
          """
          if not dataset.dataset.purged:
              zip_file = zipfile.ZipFile( dataset.file_name, "r" )
              try:
                  num_files = len( zip_file.namelist() )
              finally:
                  # Always release the archive's file handle.
                  zip_file.close()
              # NOTE(review): presumably one member of the archive is not an
              # XYS file (hence the - 1) - confirm against the upload tool.
              dataset.peek = "Archive of %s NimbleGen XYS files" % ( str( num_files - 1 ) )
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek(self, dataset):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "NimbleGen XYS file archive (%s)" % ( data.nice_size( dataset.get_size() ) )

      def get_mime(self):
          """Returns the mime type of the datatype"""
          return 'application/zip'
  class CelZip( data.Data ):
      """Class describing a zip archive of Affymetrix CEL files"""
      file_ext = "cel.zip"

      def set_peek( self, dataset, is_multi_byte=False ):
          """
          Set the peek and blurb text.

          For a dataset that is still on disk the peek reports the number
          of archive members minus one.
          """
          if not dataset.dataset.purged:
              zip_file = zipfile.ZipFile( dataset.file_name, "r" )
              try:
                  num_files = len( zip_file.namelist() )
              finally:
                  # Always release the archive's file handle.
                  zip_file.close()
              # NOTE(review): presumably one member of the archive is not a
              # CEL file (hence the - 1) - confirm against the upload tool.
              dataset.peek = "Archive of %s Affymetrix CEL files" % ( str( num_files - 1 ) )
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek(self, dataset):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Affymetrix CEL file archive (%s)" % ( data.nice_size( dataset.get_size() ) )

      def get_mime(self):
          """Returns the mime type of the datatype"""
          return 'application/zip'
  class Cel( data.Data ):
      """Class describing a binary CEL file"""
      file_ext = "cel"

      def set_peek( self, dataset, is_multi_byte=False ):
          """Set the peek and blurb text, depending on whether the dataset was purged."""
          if not dataset.dataset.purged:
              dataset.peek = "Binary cel sequence file"
              dataset.blurb = data.nice_size( dataset.get_size() )
          else:
              dataset.peek = 'file does not exist'
              dataset.blurb = 'file purged from disk'

      def display_peek(self, dataset):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Binary cel sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )

      def sniff( self, filename ):
          """
          Determines whether the file is in cel format.

          Only the file name is inspected, not the content: the result is
          True when any dot-separated part of the lower-cased name is 'cel'.
          """
          return 'cel' in filename.lower().split( '.' )

      def get_mime(self):
          """Returns the mime type of the datatype"""
          return 'application/octet-stream'
  520. # End
  class TwoBit (Binary):
      """Class describing a TwoBit format nucleotide file"""
      file_ext = "twobit"

      def sniff(self, filename):
          """
          Return True if the file starts with the twobit magic number.

          The first TWOBIT_MAGIC_SIZE bytes are read as a big-endian
          unsigned long and compared with both the regular and the
          byte-swapped magic number. Files that cannot be read are
          reported as not matching.
          """
          try:
              # All twobit files start with a 16-byte header. If the file is smaller than 16 bytes, it's obviously not a valid twobit file.
              if os.path.getsize(filename) < 16:
                  return False
              # Binary mode so the header bytes are read untranslated; the
              # with-block closes the handle even if unpacking fails.
              with open(filename, 'rb') as handle:
                  magic = struct.unpack(">L", handle.read(TWOBIT_MAGIC_SIZE))[0]
              return magic == TWOBIT_MAGIC_NUMBER or magic == TWOBIT_MAGIC_NUMBER_SWAP
          except (IOError, OSError):
              return False

      def set_peek(self, dataset, is_multi_byte=False):
          """Set the peek and blurb text; purged datasets are handled by the parent class."""
          if not dataset.dataset.purged:
              dataset.peek = "Binary TwoBit format nucleotide file"
              dataset.blurb = data.nice_size(dataset.get_size())
          else:
              return super(TwoBit, self).set_peek(dataset, is_multi_byte)

      def display_peek(self, dataset):
          """Return the stored peek, or a generated description if reading it fails."""
          try:
              return dataset.peek
          except Exception:
              return "Binary TwoBit format nucleotide file (%s)" % (data.nice_size(dataset.get_size()))
  546. Binary.register_sniffable_binary_format("twobit", "twobit", TwoBit)