
/lib/galaxy/visualization/tracks/data_providers.py

https://bitbucket.org/cistrome/cistrome-harvard/
  1"""
  2Data providers for tracks visualizations.
  3"""
  4
  5import sys
  6from math import ceil, log
  7import pkg_resources
  8pkg_resources.require( "bx-python" )
  9if sys.version_info[:2] == (2, 4):
 10    pkg_resources.require( "ctypes" )
 11pkg_resources.require( "pysam" )
 12pkg_resources.require( "numpy" )
 13from galaxy.datatypes.util.gff_util import *
 14from galaxy.util.json import from_json_string
 15from bx.interval_index_file import Indexes
from bx.bbi.bigwig_file import BigWigFile
from bx.bbi.bigbed_file import BigBedFile
 17from galaxy.util.lrucache import LRUCache
 18from galaxy.visualization.tracks.summary import *
 19import galaxy_utils.sequence.vcf
 20from galaxy.datatypes.tabular import Vcf
 21from galaxy.datatypes.interval import Bed, Gff, Gtf
 22
 23from pysam import csamtools, ctabix
 24
 25ERROR_MAX_VALS = "Only the first %i %s in this region are displayed."
 26
 27# Return None instead of NaN to pass jQuery 1.4's strict JSON
 28def float_nan(n):
 29    if n != n: # NaN != NaN
 30        return None
 31    else:
 32        return float(n)
 33        
 34def get_bounds( reads, start_pos_index, end_pos_index ):
 35    """
 36    Returns the minimum and maximum position for a set of reads.
 37    """
 38    max_low = sys.maxint
 39    max_high = -sys.maxint
 40    for read in reads:
 41        if read[ start_pos_index ] < max_low:
 42            max_low = read[ start_pos_index ]
 43        if read[ end_pos_index ] > max_high:
 44            max_high = read[ end_pos_index ]
 45    return max_low, max_high
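# Example (illustrative): for reads [ [ 'id1', 10, 50 ], [ 'id2', 5, 40 ] ],
# get_bounds( reads, 1, 2 ) returns ( 5, 50 ).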
 46        
 47class TracksDataProvider( object ):
 48    """ Base class for tracks data providers. """
 49    
    """
    Mapping from column name to payload data; this mapping is used to create
    filters. Key is the column name, value is a dict with mandatory key 'index'
    and optional key 'name'. E.g. this maps the column named 4 to payload
    index 5 under the display name 'Score':

    col_name_data_attr_mapping = { 4 : { 'index': 5, 'name': 'Score' } }
    """
 57    col_name_data_attr_mapping = {}
 58    
 59    def __init__( self, converted_dataset=None, original_dataset=None, dependencies=None ):
 60        """ Create basic data provider. """
 61        self.converted_dataset = converted_dataset
 62        self.original_dataset = original_dataset
 63        self.dependencies = dependencies
 64        
 65    def write_data_to_file( self, chrom, start, end, filename ):
 66        """
 67        Write data in region defined by chrom, start, and end to a file.
 68        """
 69        # Override.
 70        pass
 71        
 72    def valid_chroms( self ):
 73        """
 74        Returns chroms/contigs that the dataset contains
 75        """
 76        return None # by default
 77    
 78    def has_data( self, chrom, start, end, **kwargs ):
 79        """
 80        Returns true if dataset has data in the specified genome window, false
 81        otherwise.
 82        """
 83        # Override.
 84        pass
 85        
 86    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
 87        """ 
 88        Returns data in region defined by chrom, start, and end. start_val and
 89        max_vals are used to denote the data to return: start_val is the first element to 
 90        return and max_vals indicates the number of values to return.
 91        
 92        Return value must be a dictionary with the following attributes:
 93            dataset_type, data
 94        """
 95        # Override.
 96        pass
 97        
 98    def get_filters( self ):
 99        """ 
100        Returns filters for provider's data. Return value is a list of 
101        filters; each filter is a dictionary with the keys 'name', 'index', 'type'.
102        NOTE: This method uses the original dataset's datatype and metadata to 
103        create the filters.
104        """
105        # Get column names.
106        try:
107            column_names = self.original_dataset.datatype.column_names
108        except AttributeError:
109            try:
110                column_names = range( self.original_dataset.metadata.columns )
111            except: # Give up
112                return []
113            
114        # Dataset must have column types; if not, cannot create filters.
115        try:
116            column_types = self.original_dataset.metadata.column_types
117        except AttributeError:
118            return []
119            
120        # Create and return filters.
121        filters = []
122        if self.original_dataset.metadata.viz_filter_cols:
123            for viz_col_index in self.original_dataset.metadata.viz_filter_cols:
124                # Some columns are optional, so can't assume that a filter 
125                # column is in dataset.
126                if viz_col_index >= len( column_names ):
                    continue
128                col_name = column_names[ viz_col_index ]
129                # Make sure that column has a mapped index. If not, do not add filter.
130                try:
131                    attrs = self.col_name_data_attr_mapping[ col_name ]
132                except KeyError:
133                    continue
134                filters.append(
135                    { 'name' : attrs[ 'name' ], 'type' : column_types[viz_col_index], \
136                    'index' : attrs[ 'index' ] } )
137        return filters
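        # Illustrative return value, assuming a subclass whose
        # col_name_data_attr_mapping maps a column named 'Score' to payload
        # index 4 and a dataset whose metadata lists that column in
        # viz_filter_cols with column type 'int':
        #     [ { 'name': 'Score', 'type': 'int', 'index': 4 } ]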
138
139class BedDataProvider( TracksDataProvider ):
140    """
141    Abstract class that processes BED data from text format to payload format.
142    
143    Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
144    """
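    # Illustrative payload, assuming the 12-column BED line
    # "chr1 100 200 feat1 0 + 100 200 0 2 10,20 0,80":
    #     [ <hash of line>, 100, 200, 'feat1', '+', 100, 200,
    #       [ (100, 110), (180, 200) ] ]
    # The score (column 5) is appended only when a 'Score' filter is requested.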
145    
146    def get_iterator( self, chrom, start, end ):
        raise NotImplementedError( "Unimplemented Method" )
148        
149    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
150        iterator = self.get_iterator( chrom, start, end )
151        return self.process_data( iterator, start_val, max_vals, **kwargs )
152    
153    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
        """
        Process lines from the iterator into payloads; returns a dict with
        'data' (the list of payloads) and 'message' (an error/info message or None).
        """
157        # Build data to return. Payload format is:
158        # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>,
159        #   <thick_end>, <blocks> ]
160        # 
161        # First three entries are mandatory, others are optional.
162        #
163        filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
164        no_detail = ( "no_detail" in kwargs )
165        rval = []
166        message = None
167        for count, line in enumerate( iterator ):
168            if count < start_val:
169                continue
170            if max_vals and count-start_val >= max_vals:
171                message = ERROR_MAX_VALS % ( max_vals, "features" )
172                break
173            # TODO: can we use column metadata to fill out payload?
174            # TODO: use function to set payload data
175
176            feature = line.split()
177            length = len(feature)
178            # Unique id is just a hash of the line
179            payload = [ hash(line), int(feature[1]), int(feature[2]) ]
180
181            if no_detail:
182                rval.append( payload )
183                continue
184
185            # Simpler way to add stuff, but type casting is not done.
186            # Name, score, strand, thick start, thick end.
187            #end = min( len( feature ), 8 )
188            #payload.extend( feature[ 3:end ] )
189
190            # Name, strand, thick start, thick end.
191            if length >= 4:
192                payload.append(feature[3])
193            if length >= 6:
194                payload.append(feature[5])
195            if length >= 8:
196                payload.append(int(feature[6]))
197                payload.append(int(feature[7]))
198
199            # Blocks.
200            if length >= 12:
201                block_sizes = [ int(n) for n in feature[10].split(',') if n != '']
202                block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
203                blocks = zip( block_sizes, block_starts )
204                payload.append( [ ( int(feature[1]) + block[1], int(feature[1]) + block[1] + block[0] ) for block in blocks ] )
205
206            # Score (filter data)    
207            if length >= 5 and filter_cols and filter_cols[0] == "Score":
208                payload.append( float(feature[4]) )
209
210            rval.append( payload )
211
212        return { 'data': rval, 'message': message }
213
214    def write_data_to_file( self, chrom, start, end, filename ):
215        iterator = self.get_iterator( chrom, start, end )
216        out = open( filename, "w" )
217        for line in iterator:
218            out.write( "%s\n" % line )
219        out.close()
220            
221class SummaryTreeDataProvider( TracksDataProvider ):
222    """
223    Summary tree data provider for the Galaxy track browser. 
224    """
225    
226    CACHE = LRUCache(20) # Store 20 recently accessed indices for performance
227    
228    def valid_chroms( self ):
229        st = summary_tree_from_file( self.converted_dataset.file_name )
230        return st.chrom_blocks.keys()
231        
232    
233    def get_summary( self, chrom, start, end, **kwargs ):
234        filename = self.converted_dataset.file_name
235        st = self.CACHE[filename]
236        if st is None:
237            st = summary_tree_from_file( self.converted_dataset.file_name )
238            self.CACHE[filename] = st
239
        # If chrom is not found in blocks, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names given as chrXXX and as XXX.
243        if chrom in st.chrom_blocks:
244            pass
245        elif chrom[3:] in st.chrom_blocks:
246            chrom = chrom[3:]
247        else:
248            return None
249
250        resolution = max(1, ceil(float(kwargs['resolution'])))
251
252        level = ceil( log( resolution, st.block_size ) ) - 1
253        level = int(max( level, 0 ))
254        if level <= 1:
255            return "detail"
256
257        stats = st.chrom_stats[chrom]
258        results = st.query(chrom, int(start), int(end), level)
259        if results == "detail" or results == "draw":
260            return results
261        else:
262            return results, stats[level]["max"], stats[level]["avg"], stats[level]["delta"]
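        # Worked example (illustrative): with resolution=100000 and a summary
        # tree block_size of 1000, level = ceil( log( 100000, 1000 ) ) - 1 = 1,
        # so "detail" is returned; a coarser resolution of 10000000 gives
        # level = 2, and the query results plus that level's max/avg/delta
        # stats are returned instead.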
263            
264    def has_data( self, chrom ):
265        """
266        Returns true if dataset has data for this chrom
267        """
268        
269        # Get summary tree.
270        filename = self.converted_dataset.file_name
271        st = self.CACHE[filename]
272        if st is None:
273            st = summary_tree_from_file( self.converted_dataset.file_name )
274            self.CACHE[filename] = st
275            
276        # Check for data.
277        return st.chrom_blocks.get(chrom, None) is not None or (chrom and st.chrom_blocks.get(chrom[3:], None) is not None)
278
279class BamDataProvider( TracksDataProvider ):
280    """
281    Provides access to intervals from a sorted indexed BAM file.
282    """
283    
284    def write_data_to_file( self, chrom, start, end, filename ):
285        """
286        Write reads in [chrom:start-end] to file.
287        """
288        
289        # Open current BAM file using index.
290        start, end = int(start), int(end)
291        bamfile = csamtools.Samfile( filename=self.original_dataset.file_name, mode='rb', \
292                                     index_filename=self.converted_dataset.file_name )
293        try:
294            data = bamfile.fetch(start=start, end=end, reference=chrom)
295        except ValueError, e:
296            # Some BAM files do not prefix chromosome names with chr, try without
297            if chrom.startswith( 'chr' ):
298                try:
299                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
300                except ValueError:
301                    return None
302            else:
303                return None
304        
305        # Write new BAM file.
306        # TODO: write headers as well?
307        new_bamfile = csamtools.Samfile( template=bamfile, filename=filename, mode='wb' )
308        for i, read in enumerate( data ):
309            new_bamfile.write( read )
310        new_bamfile.close()
311        
312        # Cleanup.
313        bamfile.close()
314    
315    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
316        """
317        Fetch reads in the region and additional metadata.
318        
319        Returns a dict with the following attributes:
320            data - a list of reads with the format 
321                    [<guid>, <start>, <end>, <name>, <read_1>, <read_2>] 
322                where <read_1> has the format
323                    [<start>, <end>, <cigar>, ?<read_seq>?]
324                and <read_2> has the format
325                    [<start>, <end>, <cigar>, ?<read_seq>?]
326                For single-end reads, read has format:
327                    [<guid>, <start>, <end>, <name>, cigar, seq] 
328                NOTE: read end and sequence data are not valid for reads outside of
329                requested region and should not be used.
330            
331            max_low - lowest coordinate for the returned reads
332            max_high - highest coordinate for the returned reads
333            message - error/informative message
334        """
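        # Illustrative paired-end entry (values are hypothetical); a 76M cigar
        # is represented by pysam as [ (0, 76) ]:
        #     [ '1000_read1', 1000, 1210, 'read1',
        #       [ 1000, 1076, [ (0, 76) ], 'ACGT...' ],
        #       [ 1134, 1210, [ (0, 76) ], 'TGCA...' ] ]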
335        start, end = int(start), int(end)
336        orig_data_filename = self.original_dataset.file_name
337        index_filename = self.converted_dataset.file_name
338        no_detail = "no_detail" in kwargs
339        
340        # Attempt to open the BAM file with index
341        bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename )
342        message = None
343        try:
344            data = bamfile.fetch(start=start, end=end, reference=chrom)
345        except ValueError, e:
346            # Some BAM files do not prefix chromosome names with chr, try without
347            if chrom.startswith( 'chr' ):
348                try:
349                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
350                except ValueError:
351                    return None
352            else:
353                return None
354                
355        # Encode reads as list of lists.
356        results = []
357        paired_pending = {}
358        for count, read in enumerate( data ):
359            if count < start_val:
360                continue
361            if count-start_val >= max_vals:
362                message = ERROR_MAX_VALS % ( max_vals, "reads" )
363                break
364            qname = read.qname
365            seq = read.seq
366            if read.cigar is not None:
367                read_len = sum( [cig[1] for cig in read.cigar] ) # Use cigar to determine length
368            else:
369                read_len = len(seq) # If no cigar, just use sequence length
370
371            if read.is_proper_pair:
372                if qname in paired_pending: # one in dict is always first
373                    pair = paired_pending[qname]
374                    results.append( [ "%i_%s" % ( pair['start'], qname ), 
375                                      pair['start'], 
376                                      read.pos + read_len, 
377                                      qname, 
378                                      [ pair['start'], pair['end'], pair['cigar'], pair['seq'] ], 
379                                      [ read.pos, read.pos + read_len, read.cigar, seq ] 
380                                     ] )
381                    del paired_pending[qname]
382                else:
383                    paired_pending[qname] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq, 'mate_start': read.mpos, 'rlen': read_len, 'cigar': read.cigar }
384            else:
385                results.append( [ "%i_%s" % ( read.pos, qname ), read.pos, read.pos + read_len, qname, read.cigar, read.seq] )
386                
387        # Take care of reads whose mates are out of range.
388        # TODO: count paired reads when adhering to max_vals?
389        for qname, read in paired_pending.iteritems():
390            if read['mate_start'] < read['start']:
391                # Mate is before read.
392                read_start = read['mate_start']
393                read_end = read['end']
394                # Make read_1 start=end so that length is 0 b/c we don't know
395                # read length.
396                r1 = [ read['mate_start'], read['mate_start'] ]
397                r2 = [ read['start'], read['end'], read['cigar'], read['seq'] ]
398            else:
399                # Mate is after read.
400                read_start = read['start']
401                # Make read_2 start=end so that length is 0 b/c we don't know
402                # read length. Hence, end of read is start of read_2.
403                read_end = read['mate_start']
404                r1 = [ read['start'], read['end'], read['cigar'], read['seq'] ]
405                r2 = [ read['mate_start'], read['mate_start'] ]
406
407            results.append( [ "%i_%s" % ( read_start, qname ), read_start, read_end, qname, r1, r2 ] )
408            
409        # Clean up.
410        bamfile.close()
411        
412        max_low, max_high = get_bounds( results, 1, 2 )
413                
414        return { 'data': results, 'message': message, 'max_low': max_low, 'max_high': max_high }
415
416class BBIDataProvider( TracksDataProvider ):
417    """
418    BBI data provider for the Galaxy track browser. 
419    """
420    def valid_chroms( self ):
421        # No way to return this info as of now
422        return None
423        
424    def has_data( self, chrom ):
425        f, bbi = self._get_dataset()
426        all_dat = bbi.query(chrom, 0, 2147483647, 1)
427        f.close()
428        return all_dat is not None
429        
430    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
        # A bigWig dataset can either be standalone, in which case we use
        # original_dataset, or produced by a wig->bigwig conversion, in which
        # case we use converted_dataset.
433        f, bbi = self._get_dataset()
434        
435        if 'stats' in kwargs:
436            all_dat = bbi.query(chrom, 0, 2147483647, 1)
437            f.close()
438            if all_dat is None:
439                return None
440            
441            all_dat = all_dat[0] # only 1 summary
442            return { 'data' : { 'max': float( all_dat['max'] ), \
443                                'min': float( all_dat['min'] ), \
444                                'total_frequency': float( all_dat['coverage'] ) } \
445                    }
446                     
447        start = int(start)
448        end = int(end)
        # The first zoom level for BBI files is 640. If too many points are
        # requested, each block is examined instead of the summaries. The zoom
        # level used is chosen by comparing it against (end-start)/num_points/2,
        # so the optimal number of points satisfies (end-start)/num_points/2 = 640,
        # i.e. num_points = (end-start) / 1280.
453        num_points = (end-start) / 1280
454        if num_points < 1:
455            num_points = end - start
456        else:
457            num_points = min(num_points, 500)
458
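        # Worked example (illustrative): for a 1,000,000-base window,
        # num_points = 1000000 / 1280 = 781, capped at 500, so step_size is
        # 2000 bases per returned point; for a window smaller than 1280 bases,
        # one point per base is returned.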
459        data = bbi.query(chrom, start, end, num_points)
460        f.close()
461        
462        pos = start
463        step_size = (end - start) / num_points
464        result = []
465        if data:
466            for dat_dict in data:
467                result.append( (pos, float_nan(dat_dict['mean']) ) )
468                pos += step_size
469            
470        return { 'data': result }
471
472class BigBedDataProvider( BBIDataProvider ):
473    def _get_dataset( self ):
474        # Nothing converts to bigBed so we don't consider converted dataset
475        f = open( self.original_dataset.file_name )
476        return f, BigBedFile(file=f)
477
478class BigWigDataProvider (BBIDataProvider ):
479    def _get_dataset( self ):
480        if self.converted_dataset is not None:
481            f = open( self.converted_dataset.file_name )
482        else:
483            f = open( self.original_dataset.file_name )
484        return f, BigWigFile(file=f)
485
class FilterableMixin( object ):
487    def get_filters( self ):
488        """ Returns a dataset's filters. """
489        
490        # is_ functions taken from Tabular.set_meta
491        def is_int( column_text ):
492            try:
493                int( column_text )
494                return True
495            except: 
496                return False
497        def is_float( column_text ):
498            try:
499                float( column_text )
500                return True
501            except: 
502                if column_text.strip().lower() == 'na':
503                    return True #na is special cased to be a float
504                return False
505        
506        #
507        # Get filters.
508        # TODOs: 
509        # (a) might be useful to move this into each datatype's set_meta method;
510        # (b) could look at first N lines to ensure GTF attribute types are consistent.
511        #
512        filters = []
        # HACK: the first 8 payload fields (indices 0-7) are for drawing, so
        # filter columns start at index 8.
        filter_col = 8
515        if isinstance( self.original_dataset.datatype, Gff ):
516            # Can filter by score and GTF attributes.
517            filters = [ { 'name': 'Score', 
518                          'type': 'int', 
519                          'index': filter_col, 
520                          'tool_id': 'Filter1',
521                          'tool_exp_name': 'c6' } ]
522            filter_col += 1
523            if isinstance( self.original_dataset.datatype, Gtf ):
524                # Create filters based on dataset metadata.
525                for name, a_type in self.original_dataset.metadata.attribute_types.items():
526                    if a_type in [ 'int', 'float' ]:
527                        filters.append( 
528                            { 'name': name,
529                              'type': a_type, 
530                              'index': filter_col, 
531                              'tool_id': 'gff_filter_by_attribute',
532                              'tool_exp_name': name } )
533                        filter_col += 1
534
535                '''
536                # Old code: use first line in dataset to find attributes.
537                for i, line in enumerate( open(self.original_dataset.file_name) ):
538                    if not line.startswith('#'):
539                        # Look at first line for attributes and types.
540                        attributes = parse_gff_attributes( line.split('\t')[8] )
541                        for attr, value in attributes.items():
542                            # Get attribute type.
543                            if is_int( value ):
544                                attr_type = 'int'
545                            elif is_float( value ):
546                                attr_type = 'float'
547                            else:
548                                attr_type = 'str'
549                            # Add to filters.
550                            if attr_type is not 'str':
551                                filters.append( { 'name': attr, 'type': attr_type, 'index': filter_col } )
552                                filter_col += 1
553                        break
554                '''
555        elif isinstance( self.original_dataset.datatype, Bed ):
556            # Can filter by score column only.
557            filters = [ { 'name': 'Score', 
558                          'type': 'int', 
559                          'index': filter_col, 
560                          'tool_id': 'Filter1',
561                          'tool_exp_name': 'c5'
562                           } ]
563
564        return filters
565    
566class TabixDataProvider( FilterableMixin, TracksDataProvider ):
567    """
568    Tabix index data provider for the Galaxy track browser.
569    """
570    
571    col_name_data_attr_mapping = { 4 : { 'index': 4 , 'name' : 'Score' } }
572        
573    def get_iterator( self, chrom, start, end ):
574        start, end = int(start), int(end)
575        if end >= (2<<29):
            end = (2 << 29) - 1 # Tabix-enforced maximum
577                    
578        bgzip_fname = self.dependencies['bgzip'].file_name
579        
580        # if os.path.getsize(self.converted_dataset.file_name) == 0:
581            # return { 'kind': messages.ERROR, 'message': "Tabix converted size was 0, meaning the input file had invalid values." }
582        tabix = ctabix.Tabixfile(bgzip_fname, index_filename=self.converted_dataset.file_name)
583        
        # If chrom is not found in the index, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names given as chrXXX and as XXX.
587        chrom = str(chrom)
588        if chrom not in tabix.contigs and chrom.startswith("chr") and (chrom[3:] in tabix.contigs):
589            chrom = chrom[3:]
590        
591        return tabix.fetch(reference=chrom, start=start, end=end)
592        
593    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
594        iterator = self.get_iterator( chrom, start, end )
595        return self.process_data( iterator, start_val, max_vals, **kwargs )
596    
597class IntervalIndexDataProvider( FilterableMixin, TracksDataProvider ):
598    """
    Data provider for interval index files; interval indexes are used only for GFF files.
600    """
601    col_name_data_attr_mapping = { 4 : { 'index': 4 , 'name' : 'Score' } }
602    
603    def write_data_to_file( self, chrom, start, end, filename ):
        start, end = int( start ), int( end )
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        out = open( filename, 'w' )
        for start, end, offset in index.find( chrom, start, end ):
608            source.seek( offset )
609            
610            reader = GFFReaderWrapper( source, fix_strand=True )
611            feature = reader.next()
612            for interval in feature.intervals:
613                out.write(interval.raw_line + '\n')
614        out.close()
615    
616    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
617        start, end = int(start), int(end)
618        source = open( self.original_dataset.file_name )
619        index = Indexes( self.converted_dataset.file_name )
620        results = []
621        message = None
622
        # If chrom is not found in the index, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names given as chrXXX and as XXX.
626        chrom = str(chrom)
627        if chrom not in index.indexes and chrom[3:] in index.indexes:
628            chrom = chrom[3:]
629
630        #
631        # Build data to return. Payload format is:
632        # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>,
633        #   <thick_end>, <blocks> ]
634        # 
635        # First three entries are mandatory, others are optional.
636        #
637        filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
638        no_detail = ( "no_detail" in kwargs )
639        for count, val in enumerate( index.find(chrom, start, end) ):
640            start, end, offset = val[0], val[1], val[2]
641            if count < start_val:
642                continue
643            if count-start_val >= max_vals:
644                message = ERROR_MAX_VALS % ( max_vals, "features" )
645                break
646            source.seek( offset )
647            # TODO: can we use column metadata to fill out payload?
648            
649            # GFF dataset.
650            reader = GFFReaderWrapper( source, fix_strand=True )
651            feature = reader.next()
652            payload = package_gff_feature( feature, no_detail, filter_cols )
653            payload.insert( 0, offset )
654
655            results.append( payload )
656
657        return { 'data': results, 'message': message }
658                
659class VcfDataProvider( TabixDataProvider ):
660    """
661    VCF data provider for the Galaxy track browser.
662
663    Payload format: 
664    [ uid (offset), start, end, ID, reference base(s), alternate base(s), quality score ]
665    """
666
667    col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } }
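    # Illustrative payload, assuming the tab-separated VCF record
    # "chr1  1001  rs123  A  T  29  ...":
    #     [ <hash of line>, 1000, 1001, 'rs123', 'A', 'T', 29.0 ]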
668
669    def process_data( self, iterator, start_val=0, max_vals=sys.maxint, **kwargs ):
670        rval = []
671        message = None
672        
673        for count, line in enumerate( iterator ):
674            if count < start_val:
675                continue
676            if count-start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "features" )
678                break
679            
680            feature = line.split()
681            payload = [ hash(line), int(feature[1])-1, int(feature[1]),
682                        # ID: 
683                        feature[2],
684                        # reference base(s):
685                        feature[3],
686                        # alternative base(s)
687                        feature[4],
688                        # phred quality score
689                        float( feature[5] )]
690            rval.append(payload)
691
692        return { 'data': rval, 'message': message }
693
694class GFFDataProvider( TracksDataProvider ):
695    """
696    Provide data from GFF file.
697    
698    NOTE: this data provider does not use indices, and hence will be very slow
699    for large datasets.
700    """
701    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
702        start, end = int( start ), int( end )
703        source = open( self.original_dataset.file_name )
704        results = []
705        message = None
706        offset = 0
707        
708        for count, feature in enumerate( GFFReaderWrapper( source, fix_strand=True ) ):
709            if count < start_val:
710                continue
711            if count-start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "features" )
713                break
714                
715            feature_start, feature_end = convert_gff_coords_to_bed( [ feature.start, feature.end ] )
716            if feature.chrom != chrom or feature_start < start or feature_end > end:
717                continue
718            payload = package_gff_feature( feature )
719            payload.insert( 0, offset )
720            results.append( payload )
721            offset += feature.raw_size
722            
723        return { 'data': results, 'message': message }
724        
725class BedTabixDataProvider( TabixDataProvider, BedDataProvider ):
726    """
727    Provides data from a BED file indexed via tabix.
728    """
729    pass
730    
731class RawBedDataProvider( BedDataProvider ):
732    """
733    Provide data from BED file.
734
735    NOTE: this data provider does not use indices, and hence will be very slow
736    for large datasets.
737    """
738    
    def get_iterator( self, chrom, start, end ):
        start, end = int( start ), int( end )
        def line_filter_iter():
            for line in open( self.original_dataset.file_name ):
                feature = line.split()
                feature_chrom = feature[0]
                # Compare coordinates as ints, not strings.
                feature_start, feature_end = int( feature[1] ), int( feature[2] )
                if feature_chrom != chrom or feature_start > end or feature_end < start:
                    continue
746                yield line
747        return line_filter_iter()
748       
749#        
750# Helper methods.
751#
752
753# Mapping from dataset type name to a class that can fetch data from a file of that
754# type. First key is converted dataset type; if result is another dict, second key
755# is original dataset type. TODO: This needs to be more flexible.
756dataset_type_name_to_data_provider = {
757    "tabix": { Vcf: VcfDataProvider, Bed: BedTabixDataProvider, "default" : TabixDataProvider },
758    "interval_index": IntervalIndexDataProvider,
759    "bai": BamDataProvider,
760    "summary_tree": SummaryTreeDataProvider,
761    "bigwig": BigWigDataProvider,
762    "bigbed": BigBedDataProvider
763}
764
765def get_data_provider( name=None, original_dataset=None ):
766    """
767    Returns data provider class by name and/or original dataset.
768    """
769    data_provider = None
770    if name:
771        value = dataset_type_name_to_data_provider[ name ]
772        if isinstance( value, dict ):
773            # Get converter by dataset extension; if there is no data provider,
774            # get the default.
775            data_provider = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
776        else:
777            data_provider = value
778    elif original_dataset:
        # Look up data provider from datatype's information.
780        try:
781            # Get data provider mapping and data provider for 'data'. If 
782            # provider available, use it; otherwise use generic provider.
783            _ , data_provider_mapping = original_dataset.datatype.get_track_type()
784            if 'data_standalone' in data_provider_mapping:
785                data_provider_name = data_provider_mapping[ 'data_standalone' ]
786            else:
787                data_provider_name = data_provider_mapping[ 'data' ]
788            if data_provider_name:
789                data_provider = get_data_provider( name=data_provider_name, original_dataset=original_dataset )
790            else: 
791                data_provider = TracksDataProvider
792        except:
793            pass
794    return data_provider
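# Illustrative usage (dataset objects are hypothetical): look up the provider
# class for a summary tree-converted dataset and fetch stats for a window.
#
#     provider_class = get_data_provider( name="summary_tree" )
#     provider = provider_class( converted_dataset=converted, original_dataset=dataset )
#     summary = provider.get_summary( "chr1", 0, 100000, resolution=1000 )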
795
796def package_gff_feature( feature, no_detail=False, filter_cols=[] ):
797    """ Package a GFF feature in an array for data providers. """
798    feature = convert_gff_coords_to_bed( feature )
799    
800    # No detail means only start, end.
801    if no_detail:
802        return [ feature.start, feature.end ]
803    
804    # Return full feature.
805    payload = [ feature.start, 
806                feature.end, 
807                feature.name(), 
808                feature.strand,
809                # No notion of thick start, end in GFF, so make everything
810                # thick.
811                feature.start,
812                feature.end
813                ]
814    
815    # HACK: remove interval with name 'transcript' from feature. 
816    # Cufflinks puts this interval in each of its transcripts, 
817    # and they mess up trackster by covering the feature's blocks.
818    # This interval will always be a feature's first interval,
819    # and the GFF's third column is its feature name. 
820    if feature.intervals[0].fields[2] == 'transcript':
821        feature.intervals = feature.intervals[1:]
822    # Add blocks.
823    block_sizes = [ (interval.end - interval.start ) for interval in feature.intervals ]
824    block_starts = [ ( interval.start - feature.start ) for interval in feature.intervals ]
825    blocks = zip( block_sizes, block_starts )
826    payload.append( [ ( feature.start + block[1], feature.start + block[1] + block[0] ) for block in blocks ] )
827    
828    # Add filter data to payload.
829    for col in filter_cols:
830        if col == "Score":
831            payload.append( feature.score )
832        elif col in feature.attributes:
833            payload.append( feature.attributes[col] )
834        else:
835            # Dummy value.
836            payload.append( "na" )
837    return payload
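# Illustrative result (coordinates are hypothetical): a two-exon feature on
# 1000-2000 with exons at 1000-1200 and 1800-2000 is packaged as
#     [ 1000, 2000, <name>, <strand>, 1000, 2000,
#       [ (1000, 1200), (1800, 2000) ] ]
# with one additional value appended per requested filter column.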