PageRenderTime 195ms CodeModel.GetById 88ms app.highlight 96ms RepoModel.GetById 1ms app.codeStats 1ms

/lib/galaxy/datatypes/interval.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1400 lines | 1261 code | 34 blank | 105 comment | 155 complexity | 73bdc523958f8956b87c680c31b83fec MD5 | raw file

Large files are truncated, but you can click here to view the full file

  1"""
  2Interval datatypes
  3"""
  4
  5import pkg_resources
  6pkg_resources.require( "bx-python" )
  7
  8import logging, os, sys, tempfile
  9import data
 10from galaxy import util
 11from galaxy.datatypes.sniff import *
 12from galaxy.web import url_for
 13import urllib
 14from bx.intervals.io import *
 15from galaxy.datatypes import metadata
 16from galaxy.datatypes.metadata import MetadataElement
 17from galaxy.datatypes.tabular import Tabular
 18from galaxy.datatypes.util.gff_util import parse_gff_attributes
 19import math
 20import dataproviders
 21
 22log = logging.getLogger(__name__)
 23
 24# Contains the meta columns and the words that map to it; list aliases on the
 25# right side of the : in decreasing order of priority
 26alias_spec = {
 27    'chromCol'  : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name' ],
 28    'startCol'  : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)' ],
 29    'endCol'    : [ 'end'   , 'END'  , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)'  ],
 30    'strandCol' : [ 'strand', 'STRAND', 'Strand' ],
 31    'nameCol'   : [ 'name', 'NAME', 'Name', 'name2', 'NAME2', 'Name2', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Peptide ID' ]
 32}
 33
 34# a little faster lookup
 35alias_helper = {}
 36for key, value in alias_spec.items():
 37    for elem in value:
 38        alias_helper[elem] = key
 39
 40# Constants for configuring viewport generation: If a line is greater than
 41# VIEWPORT_MAX_READS_PER_LINE * VIEWPORT_READLINE_BUFFER_SIZE bytes in size,
 42# then we will not generate a viewport for that dataset
 43VIEWPORT_READLINE_BUFFER_SIZE = 1048576 # 1MB
 44VIEWPORT_MAX_READS_PER_LINE = 10
 45
# NOTE(review): Python 2 idioms throughout this class (file(), dict.iteritems(),
# sys.maxint, "except X, e" syntax) -- behavior preserved as-is.
@dataproviders.decorators.has_dataproviders
class Interval( Tabular ):
    """Tab delimited data containing interval information"""
    file_ext = "interval"
    line_class = "region"
    track_type = "FeatureTrack"
    data_sources = { "data": "tabix", "index": "bigwig" }

    """Add metadata elements"""
    # Column positions are 1-based; no_value=0 means "column not present".
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display apps"""
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def init_meta( self, dataset, copy_from=None ):
        # Delegate to Tabular; interval-specific column guesses happen in set_meta().
        Tabular.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
        """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand"""
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
        if dataset.has_data():
            empty_line_count = 0
            num_check_lines = 100 # only check up to this many non empty lines
            for i, line in enumerate( file( dataset.file_name ) ):
                line = line.rstrip( '\r\n' )
                if line:
                    if ( first_line_is_header or line[0] == '#' ):
                        # A header line is present: map recognized header words
                        # (via alias_spec) to 1-based column positions.
                        self.init_meta( dataset )
                        line = line.strip( '#' )
                        elems = line.split( '\t' )
                        for meta_name, header_list in alias_spec.iteritems():
                            for header_val in header_list:
                                if header_val in elems:
                                    #found highest priority header to meta_name
                                    setattr( dataset.metadata, meta_name, elems.index( header_val ) + 1 )
                                    break #next meta_name
                        break  # Our metadata is set, so break out of the outer loop
                    else:
                        # Header lines in Interval files are optional. For example, BED is Interval but has no header.
                        # We'll make a best guess at the location of the metadata columns.
                        metadata_is_set = False
                        elems = line.split( '\t' )
                        if len( elems ) > 2:
                            # NOTE(review): loop variable shadows the builtin `str` inside this block.
                            for str in data.col1_startswith:
                                if line.lower().startswith( str ):
                                    if overwrite or not dataset.metadata.element_is_set( 'chromCol' ):
                                        dataset.metadata.chromCol = 1
                                    try:
                                        int( elems[1] )
                                        if overwrite or not dataset.metadata.element_is_set( 'startCol' ):
                                            dataset.metadata.startCol = 2
                                    except:
                                        pass # Metadata default will be used
                                    try:
                                        int( elems[2] )
                                        if overwrite or not dataset.metadata.element_is_set( 'endCol' ):
                                            dataset.metadata.endCol = 3
                                    except:
                                        pass # Metadata default will be used
                                    #we no longer want to guess that this column is the 'name', name must now be set manually for interval files
                                    #we will still guess at the strand, as we can make a more educated guess
                                    #if len( elems ) > 3:
                                    #    try:
                                    #        int( elems[3] )
                                    #    except:
                                    #        if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                                    #            dataset.metadata.nameCol = 4
                                    if len( elems ) < 6 or elems[5] not in data.valid_strand:
                                        if overwrite or not dataset.metadata.element_is_set(  'strandCol' ):
                                            dataset.metadata.strandCol = 0
                                    else:
                                        if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                            dataset.metadata.strandCol = 6
                                    metadata_is_set = True
                                    break
                        if metadata_is_set or ( i - empty_line_count ) > num_check_lines:
                            break # Our metadata is set or we examined 100 non-empty lines, so break out of the outer loop
                else:
                    empty_line_count += 1
    def displayable( self, dataset ):
        # True only when the dataset is complete (state OK, has data) and the
        # three required interval columns are known; any exception (e.g.
        # missing metadata attributes) is treated as "not displayable".
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines != 0 \
                and dataset.metadata.chromCol \
                and dataset.metadata.startCol \
                and dataset.metadata.endCol
        except:
            return False

    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """Return a chrom, start, stop tuple for viewing a file."""
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if not self.displayable( dataset ):
            return ( None, None, None )
        try:
            # If column indexes were not passed, determine from metadata
            # (metadata columns are 1-based, indexes here are 0-based).
            if chrom_col is None:
                chrom_col = int( dataset.metadata.chromCol ) - 1
            if start_col is None:
                start_col = int( dataset.metadata.startCol ) - 1
            if end_col is None:
                end_col = int( dataset.metadata.endCol ) - 1
            # Scan lines of file to find a reasonable chromosome and range
            chrom = None
            start = sys.maxint
            end = 0
            max_col = max( chrom_col, start_col, end_col )
            # NOTE(review): fh is never explicitly closed; relies on GC.
            fh = open( dataset.file_name )
            while True:
                # Bounded read so a pathological line cannot consume unbounded memory.
                line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                # Stop if at end of file
                if not line:
                    break
                # Skip comment lines
                if not line.startswith( '#' ):
                    try:
                        fields = line.rstrip().split( '\t' )
                        if len( fields ) > max_col:
                            # Only extend the viewport while we remain on the
                            # first chromosome encountered.
                            if chrom is None or chrom == fields[ chrom_col ]:
                                start = min( start, int( fields[ start_col ] ) )
                                end = max( end, int( fields[ end_col ] ) )
                                # Set chrom last, in case start and end are not integers
                                chrom = fields[ chrom_col ]
                            viewport_feature_count -= 1
                    except Exception, e:
                        # Most likely a non-integer field has been encountered
                        # for start / stop. Just ignore and make sure we finish
                        # reading the line and decrementing the counters.
                        pass
                # Make sure we are at the next new line: a line with no newline
                # was truncated by the bounded readline, so keep reading (up to
                # VIEWPORT_MAX_READS_PER_LINE buffers) until the line ends.
                readline_count = VIEWPORT_MAX_READS_PER_LINE
                while line.rstrip( '\n\r' ) == line:
                    assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    readline_count -= 1
                max_line_count -= 1
                if not viewport_feature_count or not max_line_count:
                    #exceeded viewport or total line count to check
                    break
            if chrom is not None:
                return ( chrom, str( start ), str( end ) ) # Necessary to return strings?
        except Exception, e:
            # Unexpected error, possibly missing metadata
            log.exception( "Exception caught attempting to generate viewport for dataset '%d'", dataset.id )
        return ( None, None, None )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Returns file contents with only the bed data"""
        # NOTE(review): the temp file is created, written, and returned open,
        # but never removed from disk here -- confirm the framework cleans it up.
        fd, temp_name = tempfile.mkstemp()
        c, s, e, t, n = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol or 0, dataset.metadata.nameCol or 0
        # Convert 1-based metadata columns to 0-based indexes; unset (0) becomes -1.
        c, s, e, t, n  = int(c)-1, int(s)-1, int(e)-1, int(t)-1, int(n)-1
        if t >= 0: # strand column (should) exists
            for i, elems in enumerate( util.file_iter(dataset.file_name) ):
                strand = "+"
                name = "region_%i" % i
                if n >= 0 and n < len( elems ): name = elems[n]
                if t<len(elems): strand = elems[t]
                tmp = [ elems[c], elems[s], elems[e], name, '0', strand ]
                os.write(fd, '%s\n' % '\t'.join(tmp) )
        elif n >= 0: # name column (should) exists
            for i, elems in enumerate( util.file_iter(dataset.file_name) ):
                name = "region_%i" % i
                if n >= 0 and n < len( elems ): name = elems[n]
                tmp = [ elems[c], elems[s], elems[e], name ]
                os.write(fd, '%s\n' % '\t'.join(tmp) )
        else:
            # Only chrom/start/end are known: emit a minimal BED3.
            for elems in util.file_iter(dataset.file_name):
                tmp = [ elems[c], elems[s], elems[e] ]
                os.write(fd, '%s\n' % '\t'.join(tmp) )
        os.close(fd)
        return open(temp_name)
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol':'Chrom', 'startCol':'Start', 'endCol':'End', 'strandCol':'Strand', 'nameCol':'Name'} )
    def ucsc_links( self, dataset, type, app, base_url ):
        """
        Generate links to UCSC genome browser sites based on the dbkey
        and content of dataset.
        """
        # Filter UCSC sites to only those that are supported by this build and
        # enabled.
        valid_sites = [ ( name, url )
                        for name, url in util.get_ucsc_by_build( dataset.dbkey )
                        if name in app.config.ucsc_display_sites ]
        if not valid_sites:
            return []
        # If there are any valid sites, we need to generate the estimated
        # viewport
        chrom, start, stop = self.get_estimated_display_viewport( dataset )
        if chrom is None:
            return []
        # Accumulate links for valid sites
        ret_val = []
        for site_name, site_url in valid_sites:
            internal_url = url_for( controller='dataset', dataset_id=dataset.id,
                                    action='display_at', filename='ucsc_' + site_name )
            display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at"
                    % (base_url, url_for( controller='root' ), dataset.id, type) )
            redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s"
                    % (site_url, dataset.dbkey, chrom, start, stop ) )
            link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
            ret_val.append( ( site_name, link ) )
        return ret_val
    def validate( self, dataset ):
        """Validate an interval file using the bx GenomicIntervalReader"""
        errors = list()
        # 1-based metadata columns converted to the 0-based indexes bx expects.
        c, s, e, t = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol
        c, s, e, t = int(c)-1, int(s)-1, int(e)-1, int(t)-1
        infile = open(dataset.file_name, "r")
        reader = GenomicIntervalReader(
            infile,
            chrom_col = c,
            start_col = s,
            end_col = e,
            strand_col = t)

        # Collect every ParseError; iteration ends when the reader raises
        # StopIteration, at which point the accumulated errors are returned.
        while True:
            try:
                reader.next()
            except ParseError, e:
                errors.append(e)
            except StopIteration:
                infile.close()
                return errors

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [("lines","Remove erroneous lines")]

    def sniff( self, filename ):
        """
        Checks for 'intervalness'

        This format is mostly used by galaxy itself.  Valid interval files should include
        a valid header comment, but this seems to be loosely regulated.

        >>> fname = get_test_fname( 'test_space.txt' )
        >>> Interval().sniff( fname )
        False
        >>> fname = get_test_fname( 'interval.interval' )
        >>> Interval().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            """
            If we got here, we already know the file is_column_based and is not bed,
            so we'll just look for some valid data.
            """
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len(hdr) < 3:
                        return False
                    try:
                        # Assume chrom start and end are in column positions 1 and 2
                        # respectively ( for 0 based columns )
                        check = int( hdr[1] )
                        check = int( hdr[2] )
                    except:
                        return False
            return True
        except:
            return False

    def get_track_window(self, dataset, data, start, end):
        """
        Assumes the incoming track data is sorted already.
        """
        # Keep every record that overlaps the half-open query range [start, end).
        window = list()
        for record in data:
            fields = record.rstrip("\n\r").split("\t")
            record_chrom = fields[dataset.metadata.chromCol-1]
            record_start = int(fields[dataset.metadata.startCol-1])
            record_end = int(fields[dataset.metadata.endCol-1])
            if record_start < end and record_end > start:
                window.append( (record_chrom, record_start, record_end) )  #Yes I did want to use a generator here, but it doesn't work downstream
        return window

    def get_track_resolution( self, dataset, start, end):
        # Interval tracks have no notion of resolution; subclasses may override.
        return None

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        # Provides (chrom, start, end) tuples from the dataset.
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        # Same as genomic_region_dataprovider, but yields column-named dicts.
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dataprovider( self, dataset, **settings ):
        # Provides full interval records (including strand/name when set).
        return dataproviders.dataset.IntervalDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval-dict',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dict_dataprovider( self, dataset, **settings ):
        # Same as interval_dataprovider, but yields column-named dicts.
        settings[ 'named_columns' ] = True
        return self.interval_dataprovider( dataset, **settings )
358
359
class BedGraph( Interval ):
    """Tab delimited data: chrom, start, end plus a numeric value per interval"""

    file_ext = "bedgraph"
    track_type = "LineTrack"
    data_sources = { "data": "bigwig", "index": "bigwig" }

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
            Hand back the dataset file untouched; bedgraph needs no conversion here.
            TODO: functional stub -- extend as additional bedgraph support is added.
        """
        source_path = dataset.file_name
        return open( source_path )

    def get_estimated_display_viewport( self, dataset, chrom_col = 0, start_col = 1, end_col = 2 ):
        """
            Delegate viewport estimation (first ~100 lines) to Interval, with
            bedgraph's fixed chrom/start/end column positions as defaults.
        """
        return Interval.get_estimated_display_viewport( self, dataset,
                                                        chrom_col = chrom_col,
                                                        start_col = start_col,
                                                        end_col = end_col )
379
class Bed( Interval ):
    """Tab delimited data in BED format"""
    file_ext = "bed"
    data_sources = { "data": "tabix", "index": "bigwig", "feature_search": "fli" }
    track_type = Interval.track_type

    """Add metadata elements"""
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", desc="Score column for visualization", default=[4], param=metadata.ColumnParameter, optional=True, multiple=True )
    ###do we need to repeat these? they are the same as should be inherited from interval type

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Sets the metadata information for datasets previously determined to be in bed format."""
        i = 0
        if dataset.has_data():
            # Scan until the first recognizable BED data line; its position `i`
            # is later used as the number of header lines to skip.
            for i, line in enumerate( file(dataset.file_name) ):
                metadata_set = False
                line = line.rstrip('\r\n')
                if line and not line.startswith('#'):
                    elems = line.split('\t')
                    if len(elems) > 2:
                        for startswith in data.col1_startswith:
                            if line.lower().startswith( startswith ):
                                # BED column positions are fixed: name is col 4,
                                # strand is col 6 (when enough columns exist).
                                if len( elems ) > 3:
                                    if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                                        dataset.metadata.nameCol = 4
                                if len(elems) < 6:
                                    if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                        dataset.metadata.strandCol = 0
                                else:
                                    if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                        dataset.metadata.strandCol = 6
                                metadata_set = True
                                break
                if metadata_set: break
            Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Returns file contents with only the bed data. If bed 6+, treat as interval."""
        # Validate only the first data line against strict browser BED; if it
        # does not conform, fall back to Interval's BED conversion.
        for line in open(dataset.file_name):
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            fields = line.split('\t')
            """check to see if this file doesn't conform to strict genome browser accepted bed"""
            try:
                if len(fields) > 12:
                    return Interval.as_ucsc_display_file(self, dataset) #too many fields
                if len(fields) > 6:
                    # Columns 7-12 (thickStart, thickEnd, itemRgb, blockCount,
                    # blockSizes, blockStarts) must all be integer-valued.
                    int(fields[6])
                    if len(fields) > 7:
                        int(fields[7])
                        if len(fields) > 8:
                            if int(fields[8]) != 0:
                                return Interval.as_ucsc_display_file(self, dataset)
                            if len(fields) > 9:
                                int(fields[9])
                                if len(fields) > 10:
                                    fields2 = fields[10].rstrip(",").split(",") #remove trailing comma and split on comma
                                    for field in fields2:
                                        int(field)
                                    if len(fields) > 11:
                                        fields2 = fields[11].rstrip(",").split(",") #remove trailing comma and split on comma
                                        for field in fields2:
                                            int(field)
            except: return Interval.as_ucsc_display_file(self, dataset)
            #only check first line for proper form
            break

        try: return open(dataset.file_name)
        except: return "This item contains no content"

    def sniff( self, filename ):
        """
        Checks for 'bedness'

        BED lines have three required fields and nine additional optional fields.
        The number of fields per line must be consistent throughout any single set of data in
        an annotation track.  The order of the optional fields is binding: lower-numbered
        fields must always be populated if higher-numbered fields are used.  The data type of
        all 12 columns is:
        1-str, 2-int, 3-int, 4-str, 5-int, 6-str, 7-int, 8-int, 9-int or list, 10-int, 11-list, 12-list

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format1

        >>> fname = get_test_fname( 'test_tab.bed' )
        >>> Bed().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval1.bed' )
        >>> Bed().sniff( fname )
        True
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Bed().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if not headers: return False
            for hdr in headers:
                # Skip blank and comment lines.
                if (hdr[0] == '' or hdr[0].startswith( '#' )):
                    continue
                valid_col1 = False
                if len(hdr) < 3 or len(hdr) > 12:
                    return False
                # NOTE(review): loop variable shadows the builtin `str` here.
                for str in data.col1_startswith:
                    if hdr[0].lower().startswith(str):
                        valid_col1 = True
                        break
                if valid_col1:
                    try:
                        int( hdr[1] )
                        int( hdr[2] )
                    except:
                        return False
                    if len( hdr ) > 4:
                        #hdr[3] is a string, 'name', which defines the name of the BED line - difficult to test for this.
                        #hdr[4] is an int, 'score', a score between 0 and 1000.
                        try:
                            if int( hdr[4] ) < 0 or int( hdr[4] ) > 1000: return False
                        except:
                            return False
                    if len( hdr ) > 5:
                        #hdr[5] is strand
                        if hdr[5] not in data.valid_strand: return False
                    if len( hdr ) > 6:
                        #hdr[6] is thickStart, the starting position at which the feature is drawn thickly.
                        try: int( hdr[6] )
                        except: return False
                    if len( hdr ) > 7:
                        #hdr[7] is thickEnd, the ending position at which the feature is drawn thickly
                        try: int( hdr[7] )
                        except: return False
                    if len( hdr ) > 8:
                        #hdr[8] is itemRgb, an RGB value of the form R,G,B (e.g. 255,0,0).  However, this could also be an int (e.g., 0)
                        try: int( hdr[8] )
                        except:
                            try: hdr[8].split(',')
                            except: return False
                    if len( hdr ) > 9:
                        #hdr[9] is blockCount, the number of blocks (exons) in the BED line.
                        try: block_count = int( hdr[9] )
                        except: return False
                    if len( hdr ) > 10:
                        #hdr[10] is blockSizes - A comma-separated list of the block sizes.
                        #Sometimes the blosck_sizes and block_starts lists end in extra commas
                        try: block_sizes = hdr[10].rstrip(',').split(',')
                        except: return False
                    if len( hdr ) > 11:
                        #hdr[11] is blockStarts - A comma-separated list of block starts.
                        try: block_starts = hdr[11].rstrip(',').split(',')
                        except: return False
                        # Both block lists must contain exactly blockCount entries.
                        if len(block_sizes) != block_count or len(block_starts) != block_count: return False
                else: return False
            return True
        except: return False
539
class BedStrict( Bed ):
    """Tab delimited data in strict BED format - no non-standard columns allowed"""

    file_ext = "bedstrict"

    # Users may not convert this datatype to another one.
    allow_datatype_change = False

    # Every metadata element is read only for strict BED.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", readonly=True, param=metadata.MetadataParameter ) #TODO: start and end should be able to be set to these or the proper thick[start/end]?
    MetadataElement( name="endCol", default=3, desc="End column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__( self, **kwd ):
        Tabular.__init__( self, **kwd )
        # Strict BED relies solely on new-style display applications.
        self.clear_display_apps()

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Assign the fixed BED name/strand columns once the column count is known."""
        Tabular.set_meta( self, dataset, overwrite = overwrite, **kwd) #need column count first
        if dataset.metadata.columns < 4:
            return
        dataset.metadata.nameCol = 4
        if dataset.metadata.columns >= 6:
            dataset.metadata.strandCol = 6

    def sniff( self, filename ):
        # Never sniffed: confirming strictness would require validating the entire file.
        return False
569
class Bed6( BedStrict ):
    """Strict BED data (no non-standard columns) with the column count fixed at 6"""

    file_ext = "bed6"
574
class Bed12( BedStrict ):
    """Strict BED data (no non-standard columns) with the column count fixed at 12"""

    file_ext = "bed12"
579
class _RemoteCallMixin:
    """Mixin for datatypes that can be displayed at external genome browsers."""

    def _get_remote_call_url( self, redirect_url, site_name, dataset, type, app, base_url ):
        """Build the URL used to hand this dataset off to an external site.

        The returned link points at a local Galaxy endpoint that makes the
        data available, then redirects to the remote site together with a
        (quoted) callback URL the remote site uses to fetch the data.
        """
        # Local Galaxy endpoint that serves the data for the remote site.
        local_endpoint = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='%s_%s' % ( type, site_name ) )
        # An admin-configured callback host, if present, overrides base_url.
        callback_base = app.config.get( "display_at_callback", base_url )
        fetch_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % \
                    ( callback_base, url_for( controller='root' ), dataset.id, type )
        quoted_fetch_url = urllib.quote_plus( fetch_url )
        return '%s?redirect_url=%s&display_url=%s' % ( local_endpoint, redirect_url, quoted_fetch_url )
593
594
595@dataproviders.decorators.has_dataproviders
596class Gff( Tabular, _RemoteCallMixin ):
597    """Tab delimited data in Gff format"""
598    file_ext = "gff"
599    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
600    data_sources = { "data": "interval_index", "index": "bigwig", "feature_search": "fli" }
601    track_type = Interval.track_type
602
603    """Add metadata elements"""
604    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
605    MetadataElement( name="column_types", default=['str','str','str','int','int','int','str','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
606
607    MetadataElement( name="attributes", default=0, desc="Number of attributes", readonly=True, visible=False, no_value=0 )
608    MetadataElement( name="attribute_types", default={}, desc="Attribute types", param=metadata.DictParameter, readonly=True, visible=False, no_value=[] )
609
610    def __init__( self, **kwd ):
611        """Initialize datatype, by adding GBrowse display app"""
612        Tabular.__init__(self, **kwd)
613        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
614        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
615
616    def set_attribute_metadata( self, dataset ):
617        """
618        Sets metadata elements for dataset's attributes.
619        """
620
621        # Use first N lines to set metadata for dataset attributes. Attributes
622        # not found in the first N lines will not have metadata.
623        num_lines = 200
624        attribute_types = {}
625        for i, line in enumerate( file ( dataset.file_name ) ):
626            if line and not line.startswith( '#' ):
627                elems = line.split( '\t' )
628                if len( elems ) == 9:
629                    try:
630                        # Loop through attributes to set types.
631                        for name, value in parse_gff_attributes( elems[8] ).items():
632                            # Default type is string.
633                            value_type = "str"
634                            try:
635                                # Try int.
636                                int( value )
637                                value_type = "int"
638                            except:
639                                try:
640                                    # Try float.
641                                    float( value )
642                                    value_type = "float"
643                                except:
644                                    pass
645                            attribute_types[ name ] = value_type
646                    except:
647                        pass
648                if i + 1 == num_lines:
649                    break
650
651        # Set attribute metadata and then set additional metadata.
652        dataset.metadata.attribute_types = attribute_types
653        dataset.metadata.attributes = len( attribute_types )
654
655    def set_meta( self, dataset, overwrite = True, **kwd ):
656        self.set_attribute_metadata( dataset )
657
658        i = 0
659        for i, line in enumerate( file ( dataset.file_name ) ):
660            line = line.rstrip('\r\n')
661            if line and not line.startswith( '#' ):
662                elems = line.split( '\t' )
663                if len(elems) == 9:
664                    try:
665                        int( elems[3] )
666                        int( elems[4] )
667                        break
668                    except:
669                        pass
670        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
671
672    def display_peek( self, dataset ):
673        """Returns formated html of peek"""
674        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
675    def get_estimated_display_viewport( self, dataset ):
676        """
677        Return a chrom, start, stop tuple for viewing a file.  There are slight differences between gff 2 and gff 3
678        formats.  This function should correctly handle both...
679        """
680        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
681        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
682        if self.displayable( dataset ):
683            try:
684                seqid = None
685                start = sys.maxint
686                stop = 0
687                fh = open( dataset.file_name )
688                while True:
689                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
690                    if not line: break #EOF
691                    try:
692                        if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000
693                            elems = line.rstrip( '\n\r' ).split()
694                            if len( elems ) > 3:
695                                # line looks like:
696                                # ##sequence-region   ctg123 1 1497228
697                                seqid = elems[1] # IV
698                                start = int( elems[2] )# 6000000
699                                stop = int( elems[3] ) # 6030000
700                                break #use location declared in file
701                            elif len( elems ) == 2 and elems[1].find( '..' ) > 0:
702                                # line looks like this:
703                                # ##sequence-region X:120000..140000
704                                elems = elems[1].split( ':' )
705                                seqid = elems[0]
706                                start = int( elems[1].split( '..' )[0] )
707                                stop = int( elems[1].split( '..' )[1] )
708                                break #use location declared in file
709                            else:
710                                log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) )
711                                #break #no break, if bad definition, we try another method
712                        elif line.startswith("browser position"):
713                            # Allow UCSC style browser and track info in the GFF file
714                            pos_info = line.split()[-1]
715                            seqid, startend = pos_info.split(":")
716                            start, stop = map( int, startend.split("-") )
717                            break #use location declared in file
718                        elif True not in map( line.startswith, ( '#', 'track', 'browser' ) ):# line.startswith() does not accept iterator in python2.4
719                            viewport_feature_count -= 1
720                            elems = line.rstrip( '\n\r' ).split( '\t' )
721                            if len( elems ) > 3:
722                                if not seqid:
723                                    # We can only set the viewport for a single chromosome
724                                    seqid = elems[0]
725                                if seqid == elems[0]:
726                                    # Make sure we have not spanned chromosomes
727                                    start = min( start, int( elems[3] ) )
728                                    stop = max( stop, int( elems[4] ) )
729                    except:
730                        #most likely start/stop is not an int or not enough fields
731                        pass
732                    #make sure we are at the next new line
733                    readline_count = VIEWPORT_MAX_READS_PER_LINE
734                    while line.rstrip( '\n\r' ) == line:
735                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
736                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
737                        if not line: break #EOF
738                        readline_count -= 1
739                    max_line_count -= 1
740                    if not viewport_feature_count or not max_line_count:
741                        #exceeded viewport or total line count to check
742                        break
743                if seqid is not None:
744                    return ( seqid, str( start ), str( stop ) ) #Necessary to return strings?
745            except Exception, e:
746                #unexpected error
747                log.exception( str( e ) )
748        return ( None, None, None ) #could not determine viewport
749    def ucsc_links( self, dataset, type, app, base_url ):
750        ret_val = []
751        seqid, start, stop = self.get_estimated_display_viewport( dataset )
752        if seqid is not None:
753            for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
754                if site_name in app.config.ucsc_display_sites:
755                    redirect_url = urllib.quote_plus(
756                            "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" %
757                            ( site_url, dataset.dbkey, seqid, start, stop ) )
758                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
759                    ret_val.append( ( site_name, link ) )
760        return ret_val
761    def gbrowse_links( self, dataset, type, app, base_url ):
762        ret_val = []
763        seqid, start, stop = self.get_estimated_display_viewport( dataset )
764        if seqid is not None:
765            for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
766                if site_name in app.config.gbrowse_display_sites:
767                    if seqid.startswith( 'chr' ) and len ( seqid ) > 3:
768                        seqid = seqid[3:]
769                    redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, seqid, start, stop ) )
770                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
771                    ret_val.append( ( site_name, link ) )
772        return ret_val
773    def sniff( self, filename ):
774        """
775        Determines whether the file is in gff format
776
777        GFF lines have nine required fields that must be tab-separated.
778
779        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3
780
781        >>> fname = get_test_fname( 'gff_version_3.gff' )
782        >>> Gff().sniff( fname )
783        False
784        >>> fname = get_test_fname( 'test.gff' )
785        >>> Gff().sniff( fname )
786        True
787        """
788        headers = get_headers( filename, '\t' )
789        try:
790            if len(headers) < 2:
791                return False
792            for hdr in headers:
793                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
794                    return False
795                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
796                    if len(hdr) != 9:
797                        return False
798                    try:
799                        int( hdr[3] )
800                        int( hdr[4] )
801                    except:
802                        return False
803                    if hdr[5] != '.':
804                        try:
805                            score = float( hdr[5] )
806                        except:
807                            return False
808                    if hdr[6] not in data.valid_strand:
809                        return False
810            return True
811        except:
812            return False
813
814    # ------------- Dataproviders
815    # redefine bc super is Tabular
816    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
817                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
818    def genomic_region_dataprovider( self, dataset, **settings ):
819        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 0, 3, 4, **settings )
820
821    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
822                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
823    def genomic_region_dict_dataprovider( self, dataset, **settings ):
824        settings[ 'named_columns' ] = True
825        return self.genomic_region_dataprovider( dataset, **settings )
826
827    @dataproviders.decorators.dataprovider_factory( 'interval',
828                                                    dataproviders.dataset.IntervalDataProvider.settings )
829    def interval_dataprovider( self, dataset, **settings ):
830        return dataproviders.dataset.IntervalDataProvider( dataset, 0, 3, 4, 6, 2, **settings )
831
832    @dataproviders.decorators.dataprovider_factory( 'interval-dict',
833                                                    dataproviders.dataset.IntervalDataProvider.settings )
834    def interval_dict_dataprovider( self, dataset, **settings ):
835        settings[ 'named_columns' ] = True
836        return self.interval_dataprovider( dataset, **settings )
837
838
class Gff3( Gff ):
    """Tab delimited data in Gff3 format"""
    file_ext = "gff3"
    valid_gff3_strand = ['+', '-', '.', '?']
    valid_gff3_phase = ['.', '0', '1', '2']
    column_names = [ 'Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes' ]
    track_type = Interval.track_type

    """Add metadata elements"""
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )

    def __init__(self, **kwd):
        """Initialize datatype, by adding GBrowse display app"""
        Gff.__init__(self, **kwd)

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set attribute metadata, then find the first valid-looking GFF3 data
        line and delegate column metadata detection to Tabular, skipping
        everything before that line."""
        self.set_attribute_metadata( dataset )

        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                if len( elems ) == 9:
                    # Start/end must be integers or the '.' placeholder.
                    # BUGFIX: previously a '.' placeholder left start/end
                    # unbound, so the start < end comparison below raised a
                    # NameError on the first such line (or compared a stale
                    # value carried over from an earlier iteration).
                    start = None
                    end = None
                    valid_start = False
                    valid_end = False
                    try:
                        start = int( elems[3] )
                        valid_start = True
                    except ValueError:
                        if elems[3] == '.':
                            valid_start = True
                    try:
                        end = int( elems[4] )
                        valid_end = True
                    except ValueError:
                        if elems[4] == '.':
                            valid_end = True
                    strand = elems[6]
                    phase = elems[7]
                    # Only compare coordinates when both were numeric.
                    coords_ok = start is None or end is None or start < end
                    if valid_start and valid_end and coords_ok and strand in self.valid_gff3_strand and phase in self.valid_gff3_phase:
                        break
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )

    def sniff( self, filename ):
        """
        Determines whether the file is in gff version 3 format

        GFF 3 format:

        1) adds a mechanism for representing more than one level
           of hierarchical grouping of features and subfeatures.
        2) separates the ideas of group membership and feature name/id
        3) constrains the feature type field to be taken from a controlled
           vocabulary.
        4) allows a single feature, such as an exon, to belong to more than
           one group at a time.
        5) provides an explicit convention for pairwise alignments
        6) provides an explicit convention for features that occupy disjunct regions

        The format consists of 9 columns, separated by tabs (NOT spaces).

        Undefined fields are replaced with the "." character, as described in the original GFF spec.

        For complete details see http://song.sourceforge.net/gff3.shtml

        >>> fname = get_test_fname( 'test.gff' )
        >>> Gff3().sniff( fname )
        False
        >>> fname = get_test_fname('gff_version_3.gff')
        >>> Gff3().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len(headers) < 2:
                return False
            for hdr in headers:
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) >= 0:
                    return True
                elif hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) < 0:
                    return False
                # Header comments may have been stripped, so inspect the data
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        int( hdr[3] )
                    except ValueError:
                        if hdr[3] != '.':
                            return False
                    try:
                        int( hdr[4] )
                    except ValueError:
                        if hdr[4] != '.':
                            return False
                    if hdr[5] != '.':
                        try:
                            score = float( hdr[5] )
                        except ValueError:
                            return False
                    if hdr[6] not in self.valid_gff3_strand:
                        return False
                    if hdr[7] not in self.valid_gff3_phase:
                        return False
            return True
        except Exception:
            return False
945
class Gtf( Gff ):
    """Tab delimited data in Gtf format"""
    file_ext = "gtf"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes' ]
    track_type = Interval.track_type

    """Add metadata elements"""
    # GTF is fixed at nine columns; the ninth is a semicolon-separated attribute list.
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
955
956    def sniff( self, filename ):
957        """
958        Determines whether the file is in gtf format
959
960        GTF lines have nine required fields that must be tab-separated. The first eight GTF fields are the same as GFF.
961        The group field has been expanded into a list of attributes. Each attribute consists of a type/value pair.
962        Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space.
963        The attribute list must begin with the two mandatory attributes:
964
965            gene_id value - A globally unique identifier for the genomic source of the sequence.
966            transcript_id value - A globally unique identifier for the predicted transcript.
967
968        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format4
969
970        >>> fname = get_test_fname( '1.bed' )
971        >>> Gtf().sniff( fname )
972        False
973        >>> fname = get_test_fname( 'test.gff' )
974        >>> Gtf().sniff( fname )
975        False
976        >>> fname = get_test_fname( 'test.gtf' )
977        >>> Gtf().sniff( fname )
978        True
979        """
980        headers = get_headers( filename, '\t' )
981        try:
982            if len(headers) < 2:
983                return False
984            for hdr in headers:
985                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
986                    return False
987       

Large files are truncated, but you can click here to view the full file