
/lib/galaxy/datatypes/sequence.py

https://bitbucket.org/cistrome/cistrome-harvard/
  1"""
  2Sequence classes
  3"""
  4
  5from . import data
  6import gzip
  7import json
  8import logging
  9import os
 10import re
 11import string
 12
 13from cgi import escape
 14
 15from galaxy import eggs, util
 16from galaxy.datatypes import metadata
 17from galaxy.datatypes.checkers import is_gzip
 18from galaxy.datatypes.sniff import get_test_fname, get_headers
 19from galaxy.datatypes.metadata import MetadataElement
 20
 21
 22try:
 23    eggs.require( "bx-python" )
 24    import bx.align.maf
 25except:
 26    pass
 27
 28
 29log = logging.getLogger(__name__)
 30
 31class SequenceSplitLocations( data.Text ):
 32    """
 33    Class storing information about a sequence file composed of multiple gzip files concatenated as
 34    one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end.
 35
 36    The format of the file is JSON::
 37
 38      { "sections" : [
 39              { "start" : "x", "end" : "y", "sequences" : "z" },
 40              ...
 41      ]}
 42
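    For instance, a table describing two concatenated gzip members might look
    like this (all values illustrative)::

      { "sections" : [
              { "start" : "0", "end" : "500000", "sequences" : "25000" },
              { "start" : "500000", "end" : "1000000", "sequences" : "25000" }
      ]}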
    """
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            try:
                parsed_data = json.load(open(dataset.file_name))
                # dataset.peek = json.dumps(data, sort_keys=True, indent=4)
                dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
                dataset.blurb = '%d sections' % len(parsed_data['sections'])
            except Exception, e:
                dataset.peek = 'Not FQTOC file'
                dataset.blurb = 'Not FQTOC file'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    file_ext = "fqtoc"

    def sniff( self, filename ):
        if os.path.getsize(filename) < 50000:
            try:
                data = json.load(open(filename))
                sections = data['sections']
                for section in sections:
                    if 'start' not in section or 'end' not in section or 'sequences' not in section:
                        return False
                return True
            except:
                pass
        return False


class Sequence( data.Text ):
    """Class describing a sequence"""

    """Add metadata elements"""
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '>' ):
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_sequences_per_file(total_sequences, split_params):
        if split_params['split_mode'] == 'number_of_parts':
            # legacy basic mode - split into a specified number of parts
            parts = int(split_params['split_size'])
            sequences_per_file = [total_sequences/parts for i in range(parts)]
            for i in range(total_sequences % parts):
                sequences_per_file[i] += 1
        elif split_params['split_mode'] == 'to_size':
            # loop through the sections and calculate the number of sequences
            chunk_size = long(split_params['split_size'])
            rem = total_sequences % chunk_size
            sequences_per_file = [chunk_size for i in range(total_sequences / chunk_size)]
            # TODO: Should we invest the time in a better way to handle small remainders?
            if rem > 0:
                sequences_per_file.append(rem)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        return sequences_per_file
    get_sequences_per_file = staticmethod(get_sequences_per_file)
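    # A minimal sketch of how the helper above divides work (values illustrative):
    #   - split_mode='number_of_parts', split_size=3, total_sequences=100
    #       -> [34, 33, 33]   (the remainder is spread over the first parts)
    #   - split_mode='to_size', split_size=30, total_sequences=100
    #       -> [30, 30, 30, 10]   (the remainder becomes a final smaller part)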

    def do_slow_split( cls, input_datasets, subdir_generator_function, split_params):
        # count the sequences so we can split
        # TODO: if metadata is present, take the number of lines / 4
        if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
            total_sequences = input_datasets[0].metadata.sequences
        else:
            input_file = input_datasets[0].file_name
            compress = is_gzip(input_file)
            if compress:
                # gzip is really slow before python 2.7!
                in_file = gzip.GzipFile(input_file, 'r')
            else:
                # TODO
                # if a file is not compressed, seek locations can be calculated and stored
                # ideally, this would be done in metadata
                # TODO
                # Add BufferedReader if python 2.7?
                in_file = open(input_file, 'rt')
            total_sequences = long(0)
            for i, line in enumerate(in_file):
                total_sequences += 1
            in_file.close()
            total_sequences /= 4

        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, None, subdir_generator_function, sequences_per_file)
    do_slow_split = classmethod(do_slow_split)

    def do_fast_split( cls, input_datasets, toc_file_datasets, subdir_generator_function, split_params):
        data = json.load(open(toc_file_datasets[0].file_name))
        sections = data['sections']
        total_sequences = long(0)
        for section in sections:
            total_sequences += long(section['sequences'])
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file)
    do_fast_split = classmethod(do_fast_split)

    def write_split_files(cls, input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file):
        directories = []
        def get_subdir(idx):
            if idx < len(directories):
                return directories[idx]
            dir = subdir_generator_function()
            directories.append(dir)
            return dir

        # we know how many splits and how many sequences in each. What remains is to write out instructions for the
        # splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
        # those instructions is delegated to scripts
        start_sequence = 0
        for part_no in range(len(sequences_per_file)):
            dir = get_subdir(part_no)
            for ds_no in range(len(input_datasets)):
                ds = input_datasets[ds_no]
                base_name = os.path.basename(ds.file_name)
                part_path = os.path.join(dir, base_name)
                split_data = dict(class_name='%s.%s' % (cls.__module__, cls.__name__),
                                  output_name=part_path,
                                  input_name=ds.file_name,
                                  args=dict(start_sequence=start_sequence, num_sequences=sequences_per_file[part_no]))
                if toc_file_datasets is not None:
                    toc = toc_file_datasets[ds_no]
                    split_data['args']['toc_file'] = toc.file_name
                f = open(os.path.join(dir, 'split_info_%s.json' % base_name), 'w')
                json.dump(split_data, f)
                f.close()
            start_sequence += sequences_per_file[part_no]
        return directories
    write_split_files = classmethod(write_split_files)
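    # For reference, each part directory receives a split_info_<basename>.json
    # file shaped roughly like this (values illustrative):
    #
    #   { "class_name": "galaxy.datatypes.sequence.Fastq",
    #     "output_name": "<part_dir>/reads.fastq",
    #     "input_name": "/data/reads.fastq",
    #     "args": { "start_sequence": 0, "num_sequences": 250,
    #               "toc_file": "/data/reads.fqtoc" } }
    #
    # The "toc_file" key is only written when a converted fqtoc dataset was
    # found; Fastq.process_split_file() (below) consumes this dict.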

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic sequence file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic sequence files")


class Alignment( data.Text ):
    """Class describing an alignment"""

    """Add metadata elements"""
    MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic alignment file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic alignment files")


class Fasta( Sequence ):
    """Class representing a FASTA sequence"""
    file_ext = "fasta"

    def sniff( self, filename ):
        """
        Determines whether the file is in fasta format

        A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
        The first character of the description line is a greater-than (">") symbol in the first column.
        All lines should be shorter than 80 characters.

        For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml

        Rules for sniffing as True:

            We don't care about line length (other than empty lines).

            The first non-empty line must start with '>', and the very next line (stripped) must contain sequence data rather than another header.

                'Sequence data' here is loosely defined as a non-empty line which does not start with '>'.

                This will cause Color Space FASTA (csfasta) files to be detected as True (they are, after all, still FASTA files - they have a header line followed by sequence data).

                    Previously this method did some checking to determine if the sequence data contained integers (presumably to differentiate between fasta and csfasta).

                    This should instead be handled through sniff order, where csfasta (which currently has a null sniff function) is detected first with its stricter definition, and fasta sometime after.

            We will only check that the first purported sequence is correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Fasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Fasta().sniff( fname )
        True
        """

        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break # EOF
                line = line.strip()
                if line: # first non-empty line
                    if line.startswith( '>' ):
                        # The next line.strip() must not be '', nor startswith '>'
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        return True
                    else:
                        break # we found a non-empty line, but it's not a fasta header
            fh.close()
        except:
            pass
        return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split a FASTA file sequence by sequence.

        Note that even if split_mode="number_of_parts", the actual number of
        sub-files produced may not match that requested by split_size.

        If split_mode="to_size" then split_size is treated as the number of
        FASTA records to put in each sub-file (not size in bytes).
        """
        if split_params is None:
            return
        if len(input_datasets) > 1:
            raise Exception("FASTA file splitting does not support multiple files")
        input_file = input_datasets[0].file_name

        # Counting chunk size as number of sequences.
        if 'split_mode' not in split_params:
            raise Exception('Tool does not define a split mode')
        elif split_params['split_mode'] == 'number_of_parts':
            split_size = int(split_params['split_size'])
            log.debug("Split %s into %i parts..." % (input_file, split_size))
            # If split_mode = number_of_parts, and split_size = 10, and
            # we know the number of sequences (say 1234), then divide by
            # ten, giving ten files of approx 123 sequences each.
            if input_datasets[0].metadata is not None \
            and input_datasets[0].metadata.sequences:
                # Galaxy has already counted/estimated the number
                batch_size = 1 + input_datasets[0].metadata.sequences // split_size
                cls._count_split(input_file, batch_size, subdir_generator_function)
            else:
                # OK, if Galaxy hasn't counted them, it may be a big file.
                # We're not going to count the records, which would be slow
                # and a waste of disk IO time - instead we'll split using
                # the file size.
                chunk_size = os.path.getsize(input_file) // split_size
                cls._size_split(input_file, chunk_size, subdir_generator_function)
        elif split_params['split_mode'] == 'to_size':
            # Split the input file into as many sub-files as required,
            # each containing to_size many sequences
            batch_size = int(split_params['split_size'])
            log.debug("Split %s into batches of %i records..." % (input_file, batch_size))
            cls._count_split(input_file, batch_size, subdir_generator_function)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
    split = classmethod(split)

    def _size_split(cls, input_file, chunk_size, subdir_generator_function):
        """Split a FASTA file into chunks based on size on disk.

        This does of course preserve complete records - it only splits at the
        start of a new FASTA sequence record.
        """
        log.debug("Attempting to split FASTA file %s into chunks of %i bytes" \
                  % (input_file, chunk_size))
        f = open(input_file, "rU")
        part_file = None
        try:
            # Note if the input FASTA file has no sequences, we will
            # produce just one sub-file which will be a copy of it.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_file))
            part_file = open(part_path, 'w')
            log.debug("Writing %s part to %s" % (input_file, part_path))
            start_offset = 0
            while True:
                offset = f.tell()
                line = f.readline()
                if not line:
                    break
                if line[0] == ">" and offset - start_offset >= chunk_size:
                    # Start a new sub-file
                    part_file.close()
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_file))
                    part_file = open(part_path, 'w')
                    log.debug("Writing %s part to %s" % (input_file, part_path))
                    start_offset = f.tell()
                part_file.write(line)
        except Exception, e:
            log.error('Unable to size split FASTA file: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()
    _size_split = classmethod(_size_split)

    def _count_split(cls, input_file, chunk_size, subdir_generator_function):
        """Split a FASTA file into chunks based on counting records."""
        log.debug("Attempting to split FASTA file %s into chunks of %i sequences" \
                  % (input_file, chunk_size))
        f = open(input_file, "rU")
        part_file = None
        try:
            # Note if the input FASTA file has no sequences, we will
            # produce just one sub-file which will be a copy of it.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_file))
            part_file = open(part_path, 'w')
            log.debug("Writing %s part to %s" % (input_file, part_path))
            rec_count = 0
            while True:
                line = f.readline()
                if not line:
                    break
                if line[0] == ">":
                    rec_count += 1
                    if rec_count > chunk_size:
                        # Start a new sub-file
                        part_file.close()
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_file))
                        part_file = open(part_path, 'w')
                        log.debug("Writing %s part to %s" % (input_file, part_path))
                        rec_count = 1
                part_file.write(line)
            part_file.close()
        except Exception, e:
            log.error('Unable to count split FASTA file: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()
    _count_split = classmethod(_count_split)


class csFasta( Sequence ):
    """ Class representing the SOLiD Color-Space sequence ( csfasta ) """
    file_ext = "csfasta"

    def sniff( self, filename ):
        """
        Color-space sequence:
            >2_15_85_F3
            T213021013012303002332212012112221222112212222

        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> csFasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.csfasta' )
        >>> csFasta().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break # EOF
                line = line.strip()
                if line and not line.startswith( '#' ): # first non-empty non-comment line
                    if line.startswith( '>' ):
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        elif line[0] not in string.ascii_uppercase:
                            return False
                        elif len( line ) > 1 and not re.search( '^[\d.]+$', line[1:] ):
                            return False
                        return True
                    else:
                        break # we found a non-empty line, but it's not a header
            fh.close()
        except:
            pass
        return False

    def set_meta( self, dataset, **kwd ):
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        return Sequence.set_meta( self, dataset, **kwd )


class Fastq( Sequence ):
    """Class representing a generic FASTQ sequence"""
    file_ext = "fastq"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0     # blocks should be 4 lines long
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ) and not sequences:
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '@' ):
                if seq_counter >= 4:
                    # count previous block
                    # blocks should be 4 lines long
                    sequences += 1
                    seq_counter = 1
                else:
                    # in case the quality line starts with @
                    seq_counter += 1
                data_lines += 1
            else:
                data_lines += 1
                seq_counter += 1
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def sniff( self, filename ):
        """
        Determines whether the file is in generic fastq format
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina.
              These differ in the representation of the quality scores.

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        # Anchored at both ends so that match() actually fails on lines
        # containing characters other than N/G/T/A/C.
        bases_regexp = re.compile( "^[NGTAC]*$" )
        # check that the first block looks like a fastq block
        try:
            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]:
                # Check the sequence line, make sure it contains only G/C/A/T/N
                if not bases_regexp.match( headers[1][0] ):
                    return False
                return True
            return False
        except:
            return False
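    # For reference, the sniffer above accepts a file whose first block is a
    # standard four-line FASTQ record, e.g. (illustrative):
    #
    #   @read1
    #   GATTACA
    #   +
    #   IIIIIII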

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        FASTQ files are split on cluster boundaries, in increments of 4 lines
        """
        if split_params is None:
            return None

        # first, see if there are any associated FQTOC files that will give us the split locations
        # if so, we don't need to read the files to do the splitting
        toc_file_datasets = []
        for ds in input_datasets:
            tmp_ds = ds
            fqtoc_file = None
            while fqtoc_file is None and tmp_ds is not None:
                fqtoc_file = tmp_ds.get_converted_files_by_type('fqtoc')
                tmp_ds = tmp_ds.copied_from_library_dataset_dataset_association

            if fqtoc_file is not None:
                toc_file_datasets.append(fqtoc_file)

        if len(toc_file_datasets) == len(input_datasets):
            return cls.do_fast_split(input_datasets, toc_file_datasets, subdir_generator_function, split_params)
        return cls.do_slow_split(input_datasets, subdir_generator_function, split_params)
    split = classmethod(split)

    def process_split_file(data):
        """
        This is called in the context of an external process launched by a Task (possibly not on the Galaxy machine)
        to create the input files for the Task. The parameters:
        data - a dict containing the contents of the split file
        """
        args = data['args']
        input_name = data['input_name']
        output_name = data['output_name']
        start_sequence = long(args['start_sequence'])
        sequence_count = long(args['num_sequences'])

        if 'toc_file' in args:
            toc_file = json.load(open(args['toc_file'], 'r'))
            commands = Sequence.get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count)
        else:
            commands = Sequence.get_split_commands_sequential(is_gzip(input_name), input_name, output_name, start_sequence, sequence_count)
        for cmd in commands:
            if 0 != os.system(cmd):
                raise Exception("Executing '%s' failed" % cmd)
        return True
    process_split_file = staticmethod(process_split_file)
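    # A rough sketch of the consumer side (the file name is illustrative; the
    # JSON is produced by Sequence.write_split_files() above):
    #
    #   data = json.load(open('split_info_reads.fastq.json'))
    #   Fastq.process_split_file(data)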


class FastqSanger( Fastq ):
    """Class representing a FASTQ sequence ( the Sanger variant )"""
    file_ext = "fastqsanger"

class FastqSolexa( Fastq ):
    """Class representing a FASTQ sequence ( the Solexa variant )"""
    file_ext = "fastqsolexa"

class FastqIllumina( Fastq ):
    """Class representing a FASTQ sequence ( the Illumina 1.3+ variant )"""
    file_ext = "fastqillumina"

class FastqCSSanger( Fastq ):
    """Class representing a Color Space FASTQ sequence ( e.g. a SOLiD variant )"""
    file_ext = "fastqcssanger"

class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    # Readonly and optional: users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Alignment.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, and index from a MAF file.
        """
        # these metadata values are not accessible by users, always overwrite
        # Imported here to avoid a circular dependency
        from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
        indexes, species, species_chromosomes, blocks = build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        # write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        for spec, chroms in species_chromosomes.items():
            chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        indexes.write( open( index_file.file_name, 'wb' ) )
        dataset.metadata.maf_index = index_file

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def make_html_table( self, dataset, skipchars=[] ):
        """Create an HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species:&nbsp;')
            for species in dataset.metadata.species:
                out.append( '%s&nbsp;' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in maf format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except:
            return False


class MafCustomTrack( data.Text ):
    file_ext = "mafcustomtrack"

    MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets viewport metadata from a MAF file.
        """
        max_block_check = 10
        chrom = None
        forward_strand_start = float( 'inf' )
        forward_strand_end = 0
        try:
            maf_file = open( dataset.file_name )
            maf_file.readline() # move past track line
            for i, block in enumerate( bx.align.maf.Reader( maf_file ) ):
                ref_comp = block.get_component_by_src_start( dataset.metadata.dbkey )
                if ref_comp:
                    ref_chrom = bx.align.maf.src_split( ref_comp.src )[-1]
                    if chrom is None:
                        chrom = ref_chrom
                    if chrom == ref_chrom:
                        forward_strand_start = min( forward_strand_start, ref_comp.forward_strand_start )
                        forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
                if i > max_block_check:
                    break

            if forward_strand_end > forward_strand_start:
                dataset.metadata.vp_chromosome = chrom
                dataset.metadata.vp_start = forward_strand_start
                dataset.metadata.vp_end = forward_strand_end
        except:
            pass


class Axt( data.Text ):
    """Class describing an axt alignment"""

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    file_ext = "axt"

    def sniff( self, filename ):
        """
        Determines whether the file is in axt format

        axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts.  Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        if len(headers) < 4:
            return False
        for hdr in headers:
            if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
                return True
            if len(hdr) > 0 and not hdr[0].startswith("#"):
                if len(hdr) != 9:
                    return False
                try:
                    map( int, [hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]] )
                except:
                    return False
                if hdr[7] not in data.valid_strand:
                    return False
                else:
                    return True
        # no data line was seen in the header sample; treat as not axt
        return False


class Lav( data.Text ):
    """Class describing a LAV alignment"""

    file_ext = "lav"

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    def sniff( self, filename ):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0].startswith('#:lav'):
                return True
            else:
                return False
        except:
            return False