PageRenderTime 139ms CodeModel.GetById 24ms app.highlight 102ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/datatypes/interval.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1400 lines | 1261 code | 34 blank | 105 comment | 155 complexity | 73bdc523958f8956b87c680c31b83fec MD5 | raw file
   1"""
   2Interval datatypes
   3"""
   4
   5import pkg_resources
   6pkg_resources.require( "bx-python" )
   7
   8import logging, os, sys, tempfile
   9import data
  10from galaxy import util
  11from galaxy.datatypes.sniff import *
  12from galaxy.web import url_for
  13import urllib
  14from bx.intervals.io import *
  15from galaxy.datatypes import metadata
  16from galaxy.datatypes.metadata import MetadataElement
  17from galaxy.datatypes.tabular import Tabular
  18from galaxy.datatypes.util.gff_util import parse_gff_attributes
  19import math
  20import dataproviders
  21
  22log = logging.getLogger(__name__)
  23
  24# Contains the meta columns and the words that map to it; list aliases on the
  25# right side of the : in decreasing order of priority
  26alias_spec = {
  27    'chromCol'  : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name' ],
  28    'startCol'  : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)' ],
  29    'endCol'    : [ 'end'   , 'END'  , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)'  ],
  30    'strandCol' : [ 'strand', 'STRAND', 'Strand' ],
  31    'nameCol'   : [ 'name', 'NAME', 'Name', 'name2', 'NAME2', 'Name2', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Peptide ID' ]
  32}
  33
  34# a little faster lookup
  35alias_helper = {}
  36for key, value in alias_spec.items():
  37    for elem in value:
  38        alias_helper[elem] = key
  39
  40# Constants for configuring viewport generation: If a line is greater than
  41# VIEWPORT_MAX_READS_PER_LINE * VIEWPORT_READLINE_BUFFER_SIZE bytes in size,
  42# then we will not generate a viewport for that dataset
  43VIEWPORT_READLINE_BUFFER_SIZE = 1048576 # 1MB
  44VIEWPORT_MAX_READS_PER_LINE = 10
  45
@dataproviders.decorators.has_dataproviders
class Interval( Tabular ):
    """Tab delimited data containing interval information"""
    file_ext = "interval"
    line_class = "region"
    track_type = "FeatureTrack"
    data_sources = { "data": "tabix", "index": "bigwig" }

    """Add metadata elements"""
    # Column numbers are 1-based; strandCol/nameCol use no_value=0 to mean "not set".
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
  61
    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display apps"""
        Tabular.__init__(self, **kwd)
        # Register the 'display at UCSC' external display app; links are built by ucsc_links
        # and the exported file body by as_ucsc_display_file.
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def init_meta( self, dataset, copy_from=None ):
        # Delegate to Tabular; interval-specific column guessing happens later in set_meta.
        Tabular.init_meta( self, dataset, copy_from=copy_from )
  68    def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
  69        """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand"""
  70        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
  71        if dataset.has_data():
  72            empty_line_count = 0
  73            num_check_lines = 100 # only check up to this many non empty lines
  74            for i, line in enumerate( file( dataset.file_name ) ):
  75                line = line.rstrip( '\r\n' )
  76                if line:
  77                    if ( first_line_is_header or line[0] == '#' ):
  78                        self.init_meta( dataset )
  79                        line = line.strip( '#' )
  80                        elems = line.split( '\t' )
  81                        for meta_name, header_list in alias_spec.iteritems():
  82                            for header_val in header_list:
  83                                if header_val in elems:
  84                                    #found highest priority header to meta_name
  85                                    setattr( dataset.metadata, meta_name, elems.index( header_val ) + 1 )
  86                                    break #next meta_name
  87                        break  # Our metadata is set, so break out of the outer loop
  88                    else:
  89                        # Header lines in Interval files are optional. For example, BED is Interval but has no header.
  90                        # We'll make a best guess at the location of the metadata columns.
  91                        metadata_is_set = False
  92                        elems = line.split( '\t' )
  93                        if len( elems ) > 2:
  94                            for str in data.col1_startswith:
  95                                if line.lower().startswith( str ):
  96                                    if overwrite or not dataset.metadata.element_is_set( 'chromCol' ):
  97                                        dataset.metadata.chromCol = 1
  98                                    try:
  99                                        int( elems[1] )
 100                                        if overwrite or not dataset.metadata.element_is_set( 'startCol' ):
 101                                            dataset.metadata.startCol = 2
 102                                    except:
 103                                        pass # Metadata default will be used
 104                                    try:
 105                                        int( elems[2] )
 106                                        if overwrite or not dataset.metadata.element_is_set( 'endCol' ):
 107                                            dataset.metadata.endCol = 3
 108                                    except:
 109                                        pass # Metadata default will be used
 110                                    #we no longer want to guess that this column is the 'name', name must now be set manually for interval files
 111                                    #we will still guess at the strand, as we can make a more educated guess
 112                                    #if len( elems ) > 3:
 113                                    #    try:
 114                                    #        int( elems[3] )
 115                                    #    except:
 116                                    #        if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
 117                                    #            dataset.metadata.nameCol = 4
 118                                    if len( elems ) < 6 or elems[5] not in data.valid_strand:
 119                                        if overwrite or not dataset.metadata.element_is_set(  'strandCol' ):
 120                                            dataset.metadata.strandCol = 0
 121                                    else:
 122                                        if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
 123                                            dataset.metadata.strandCol = 6
 124                                    metadata_is_set = True
 125                                    break
 126                        if metadata_is_set or ( i - empty_line_count ) > num_check_lines:
 127                            break # Our metadata is set or we examined 100 non-empty lines, so break out of the outer loop
 128                else:
 129                    empty_line_count += 1
    def displayable( self, dataset ):
        # A dataset can be shown at an external browser only when it is in the OK
        # state, non-empty, and the chrom/start/end metadata columns are all set
        # (strandCol/nameCol are optional and not required here).
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines != 0 \
                and dataset.metadata.chromCol \
                and dataset.metadata.startCol \
                and dataset.metadata.endCol
        except:
            # Missing or broken metadata means the dataset is not displayable
            return False
 141
    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """Return a chrom, start, stop tuple for viewing a file."""
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if not self.displayable( dataset ):
            return ( None, None, None )
        try:
            # If column indexes were not passed, determine from metadata (convert 1-based to 0-based)
            if chrom_col is None:
                chrom_col = int( dataset.metadata.chromCol ) - 1
            if start_col is None:
                start_col = int( dataset.metadata.startCol ) - 1
            if end_col is None:
                end_col = int( dataset.metadata.endCol ) - 1
            # Scan lines of file to find a reasonable chromosome and range
            chrom = None
            start = sys.maxint
            end = 0
            max_col = max( chrom_col, start_col, end_col )
            fh = open( dataset.file_name )
            while True:
                # Bounded read so a single pathological line cannot consume unlimited memory
                line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                # Stop if at end of file
                if not line:
                    break
                # Skip comment lines
                if not line.startswith( '#' ):
                    try:
                        fields = line.rstrip().split( '\t' )
                        if len( fields ) > max_col:
                            # Only grow the viewport while we stay on the first chromosome seen
                            if chrom is None or chrom == fields[ chrom_col ]:
                                start = min( start, int( fields[ start_col ] ) )
                                end = max( end, int( fields[ end_col ] ) )
                                # Set chrom last, in case start and end are not integers
                                chrom = fields[ chrom_col ]
                            viewport_feature_count -= 1
                    except Exception, e:
                        # Most likely a non-integer field has been encountered
                        # for start / stop. Just ignore and make sure we finish
                        # reading the line and decrementing the counters.
                        pass
                # Make sure we are at the next new line: keep reading buffered chunks
                # until the line ends; give up after VIEWPORT_MAX_READS_PER_LINE reads.
                readline_count = VIEWPORT_MAX_READS_PER_LINE
                while line.rstrip( '\n\r' ) == line:
                    assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    readline_count -= 1
                max_line_count -= 1
                if not viewport_feature_count or not max_line_count:
                    #exceeded viewport or total line count to check
                    break
            if chrom is not None:
                return ( chrom, str( start ), str( end ) ) # Necessary to return strings?
        except Exception, e:
            # Unexpected error, possibly missing metadata
            log.exception( "Exception caught attempting to generate viewport for dataset '%d'", dataset.id )
        return ( None, None, None )
 200
 201    def as_ucsc_display_file( self, dataset, **kwd ):
 202        """Returns file contents with only the bed data"""
 203        fd, temp_name = tempfile.mkstemp()
 204        c, s, e, t, n = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol or 0, dataset.metadata.nameCol or 0
 205        c, s, e, t, n  = int(c)-1, int(s)-1, int(e)-1, int(t)-1, int(n)-1
 206        if t >= 0: # strand column (should) exists
 207            for i, elems in enumerate( util.file_iter(dataset.file_name) ):
 208                strand = "+"
 209                name = "region_%i" % i
 210                if n >= 0 and n < len( elems ): name = elems[n]
 211                if t<len(elems): strand = elems[t]
 212                tmp = [ elems[c], elems[s], elems[e], name, '0', strand ]
 213                os.write(fd, '%s\n' % '\t'.join(tmp) )
 214        elif n >= 0: # name column (should) exists
 215            for i, elems in enumerate( util.file_iter(dataset.file_name) ):
 216                name = "region_%i" % i
 217                if n >= 0 and n < len( elems ): name = elems[n]
 218                tmp = [ elems[c], elems[s], elems[e], name ]
 219                os.write(fd, '%s\n' % '\t'.join(tmp) )
 220        else:
 221            for elems in util.file_iter(dataset.file_name):
 222                tmp = [ elems[c], elems[s], elems[e] ]
 223                os.write(fd, '%s\n' % '\t'.join(tmp) )
 224        os.close(fd)
 225        return open(temp_name)
    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        # Label the metadata-designated columns in the peek table header
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol':'Chrom', 'startCol':'Start', 'endCol':'End', 'strandCol':'Strand', 'nameCol':'Name'} )
 229    def ucsc_links( self, dataset, type, app, base_url ):
 230        """
 231        Generate links to UCSC genome browser sites based on the dbkey
 232        and content of dataset.
 233        """
 234        # Filter UCSC sites to only those that are supported by this build and
 235        # enabled.
 236        valid_sites = [ ( name, url )
 237                        for name, url in util.get_ucsc_by_build( dataset.dbkey )
 238                        if name in app.config.ucsc_display_sites ]
 239        if not valid_sites:
 240            return []
 241        # If there are any valid sites, we need to generate the estimated
 242        # viewport
 243        chrom, start, stop = self.get_estimated_display_viewport( dataset )
 244        if chrom is None:
 245            return []
 246        # Accumulate links for valid sites
 247        ret_val = []
 248        for site_name, site_url in valid_sites:
 249            internal_url = url_for( controller='dataset', dataset_id=dataset.id,
 250                                    action='display_at', filename='ucsc_' + site_name )
 251            display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at"
 252                    % (base_url, url_for( controller='root' ), dataset.id, type) )
 253            redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s"
 254                    % (site_url, dataset.dbkey, chrom, start, stop ) )
 255            link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
 256            ret_val.append( ( site_name, link ) )
 257        return ret_val
    def validate( self, dataset ):
        """Validate an interval file using the bx GenomicIntervalReader"""
        errors = list()
        # Convert 1-based metadata column numbers to the 0-based indexes bx expects
        c, s, e, t = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol
        c, s, e, t = int(c)-1, int(s)-1, int(e)-1, int(t)-1
        infile = open(dataset.file_name, "r")
        reader = GenomicIntervalReader(
            infile,
            chrom_col = c,
            start_col = s,
            end_col = e,
            strand_col = t)

        while True:
            try:
                reader.next()
            except ParseError, e:
                # Collect parse errors rather than aborting on the first one
                errors.append(e)
            except StopIteration:
                # End of file: return all accumulated errors (empty list means valid)
                infile.close()
                return errors
 279
 280    def repair_methods( self, dataset ):
 281        """Return options for removing errors along with a description"""
 282        return [("lines","Remove erroneous lines")]
 283
 284    def sniff( self, filename ):
 285        """
 286        Checks for 'intervalness'
 287
 288        This format is mostly used by galaxy itself.  Valid interval files should include
 289        a valid header comment, but this seems to be loosely regulated.
 290
 291        >>> fname = get_test_fname( 'test_space.txt' )
 292        >>> Interval().sniff( fname )
 293        False
 294        >>> fname = get_test_fname( 'interval.interval' )
 295        >>> Interval().sniff( fname )
 296        True
 297        """
 298        headers = get_headers( filename, '\t' )
 299        try:
 300            """
 301            If we got here, we already know the file is_column_based and is not bed,
 302            so we'll just look for some valid data.
 303            """
 304            for hdr in headers:
 305                if hdr and not hdr[0].startswith( '#' ):
 306                    if len(hdr) < 3:
 307                        return False
 308                    try:
 309                        # Assume chrom start and end are in column positions 1 and 2
 310                        # respectively ( for 0 based columns )
 311                        check = int( hdr[1] )
 312                        check = int( hdr[2] )
 313                    except:
 314                        return False
 315            return True
 316        except:
 317            return False
 318
 319    def get_track_window(self, dataset, data, start, end):
 320        """
 321        Assumes the incoming track data is sorted already.
 322        """
 323        window = list()
 324        for record in data:
 325            fields = record.rstrip("\n\r").split("\t")
 326            record_chrom = fields[dataset.metadata.chromCol-1]
 327            record_start = int(fields[dataset.metadata.startCol-1])
 328            record_end = int(fields[dataset.metadata.endCol-1])
 329            if record_start < end and record_end > start:
 330                window.append( (record_chrom, record_start, record_end) )  #Yes I did want to use a generator here, but it doesn't work downstream
 331        return window
 332
    def get_track_resolution( self, dataset, start, end):
        # Interval tracks have no notion of resolution; always None
        return None
 335
    # ------------- Dataproviders
    # Factory methods exposing this datatype's contents to the dataprovider framework.
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        # Provides (chrom, start, end) style records
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        # Same as genomic_region_dataprovider, but records are keyed by column name
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dataprovider( self, dataset, **settings ):
        return dataproviders.dataset.IntervalDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval-dict',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dict_dataprovider( self, dataset, **settings ):
        # Named-column (dict) variant of interval_dataprovider
        settings[ 'named_columns' ] = True
        return self.interval_dataprovider( dataset, **settings )
 358
 359
class BedGraph( Interval ):
    """Tab delimited chrom/start/end/datavalue dataset"""

    file_ext = "bedgraph"
    track_type = "LineTrack"
    data_sources = { "data": "bigwig", "index": "bigwig" }

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
            Returns file contents as is with no modifications.
            TODO: this is a functional stub and will need to be enhanced moving forward to provide additional support for bedgraph.
        """
        return open( dataset.file_name )

    def get_estimated_display_viewport( self, dataset, chrom_col = 0, start_col = 1, end_col = 2 ):
        """
            Set viewport based on dataset's first 100 lines.
        """
        # bedgraph columns are fixed, so pass explicit 0-based indexes
        # instead of relying on the dataset metadata.
        return Interval.get_estimated_display_viewport( self, dataset, chrom_col = chrom_col, start_col = start_col, end_col = end_col )
 379
class Bed( Interval ):
    """Tab delimited data in BED format"""
    file_ext = "bed"
    data_sources = { "data": "tabix", "index": "bigwig", "feature_search": "fli" }
    track_type = Interval.track_type

    """Add metadata elements"""
    # Column numbers are 1-based; strandCol uses no_value=0 to mean "not set".
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", desc="Score column for visualization", default=[4], param=metadata.ColumnParameter, optional=True, multiple=True )
    ###do we need to repeat these? they are the same as should be inherited from interval type
 394
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Sets the metadata information for datasets previously determined to be in bed format."""
        i = 0
        if dataset.has_data():
            # Scan for the first recognizable data line to decide whether the
            # optional name (col 4) and strand (col 6) columns are present.
            # 'i' ends as the index of that line and is passed to Tabular.set_meta
            # as the number of leading lines to skip when counting columns.
            for i, line in enumerate( file(dataset.file_name) ):
                metadata_set = False
                line = line.rstrip('\r\n')
                if line and not line.startswith('#'):
                    elems = line.split('\t')
                    if len(elems) > 2:
                        for startswith in data.col1_startswith:
                            if line.lower().startswith( startswith ):
                                if len( elems ) > 3:
                                    if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                                        dataset.metadata.nameCol = 4
                                if len(elems) < 6:
                                    # No strand column; 0 marks it unset
                                    if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                        dataset.metadata.strandCol = 0
                                else:
                                    if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                        dataset.metadata.strandCol = 6
                                metadata_set = True
                                break
                if metadata_set: break
            Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
 420
    def as_ucsc_display_file( self, dataset, **kwd ):
        """Returns file contents with only the bed data. If bed 6+, treat as interval."""
        # Inspect only the first data line: if it conforms to strict browser BED,
        # hand back the file untouched; otherwise fall back to the Interval
        # exporter which rebuilds a minimal BED from the metadata columns.
        for line in open(dataset.file_name):
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            fields = line.split('\t')
            """check to see if this file doesn't conform to strict genome browser accepted bed"""
            try:
                if len(fields) > 12:
                    return Interval.as_ucsc_display_file(self, dataset) #too many fields
                if len(fields) > 6:
                    # thickStart must be an integer
                    int(fields[6])
                    if len(fields) > 7:
                        # thickEnd must be an integer
                        int(fields[7])
                        if len(fields) > 8:
                            # strict BED requires itemRgb to be 0 here
                            if int(fields[8]) != 0:
                                return Interval.as_ucsc_display_file(self, dataset)
                            if len(fields) > 9:
                                # blockCount must be an integer
                                int(fields[9])
                                if len(fields) > 10:
                                    fields2 = fields[10].rstrip(",").split(",") #remove trailing comma and split on comma
                                    for field in fields2:
                                        int(field)
                                    if len(fields) > 11:
                                        fields2 = fields[11].rstrip(",").split(",") #remove trailing comma and split on comma
                                        for field in fields2:
                                            int(field)
            except: return Interval.as_ucsc_display_file(self, dataset)
            #only check first line for proper form
            break

        try: return open(dataset.file_name)
        except: return "This item contains no content"
 455
 456    def sniff( self, filename ):
 457        """
 458        Checks for 'bedness'
 459
 460        BED lines have three required fields and nine additional optional fields.
 461        The number of fields per line must be consistent throughout any single set of data in
 462        an annotation track.  The order of the optional fields is binding: lower-numbered
 463        fields must always be populated if higher-numbered fields are used.  The data type of
 464        all 12 columns is:
 465        1-str, 2-int, 3-int, 4-str, 5-int, 6-str, 7-int, 8-int, 9-int or list, 10-int, 11-list, 12-list
 466
 467        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format1
 468
 469        >>> fname = get_test_fname( 'test_tab.bed' )
 470        >>> Bed().sniff( fname )
 471        True
 472        >>> fname = get_test_fname( 'interval1.bed' )
 473        >>> Bed().sniff( fname )
 474        True
 475        >>> fname = get_test_fname( 'complete.bed' )
 476        >>> Bed().sniff( fname )
 477        True
 478        """
 479        headers = get_headers( filename, '\t' )
 480        try:
 481            if not headers: return False
 482            for hdr in headers:
 483                if (hdr[0] == '' or hdr[0].startswith( '#' )):
 484                    continue
 485                valid_col1 = False
 486                if len(hdr) < 3 or len(hdr) > 12:
 487                    return False
 488                for str in data.col1_startswith:
 489                    if hdr[0].lower().startswith(str):
 490                        valid_col1 = True
 491                        break
 492                if valid_col1:
 493                    try:
 494                        int( hdr[1] )
 495                        int( hdr[2] )
 496                    except:
 497                        return False
 498                    if len( hdr ) > 4:
 499                        #hdr[3] is a string, 'name', which defines the name of the BED line - difficult to test for this.
 500                        #hdr[4] is an int, 'score', a score between 0 and 1000.
 501                        try:
 502                            if int( hdr[4] ) < 0 or int( hdr[4] ) > 1000: return False
 503                        except:
 504                            return False
 505                    if len( hdr ) > 5:
 506                        #hdr[5] is strand
 507                        if hdr[5] not in data.valid_strand: return False
 508                    if len( hdr ) > 6:
 509                        #hdr[6] is thickStart, the starting position at which the feature is drawn thickly.
 510                        try: int( hdr[6] )
 511                        except: return False
 512                    if len( hdr ) > 7:
 513                        #hdr[7] is thickEnd, the ending position at which the feature is drawn thickly
 514                        try: int( hdr[7] )
 515                        except: return False
 516                    if len( hdr ) > 8:
 517                        #hdr[8] is itemRgb, an RGB value of the form R,G,B (e.g. 255,0,0).  However, this could also be an int (e.g., 0)
 518                        try: int( hdr[8] )
 519                        except:
 520                            try: hdr[8].split(',')
 521                            except: return False
 522                    if len( hdr ) > 9:
 523                        #hdr[9] is blockCount, the number of blocks (exons) in the BED line.
 524                        try: block_count = int( hdr[9] )
 525                        except: return False
 526                    if len( hdr ) > 10:
 527                        #hdr[10] is blockSizes - A comma-separated list of the block sizes.
 528                        #Sometimes the blosck_sizes and block_starts lists end in extra commas
 529                        try: block_sizes = hdr[10].rstrip(',').split(',')
 530                        except: return False
 531                    if len( hdr ) > 11:
 532                        #hdr[11] is blockStarts - A comma-separated list of block starts.
 533                        try: block_starts = hdr[11].rstrip(',').split(',')
 534                        except: return False
 535                        if len(block_sizes) != block_count or len(block_starts) != block_count: return False
 536                else: return False
 537            return True
 538        except: return False
 539
class BedStrict( Bed ):
    """Tab delimited data in strict BED format - no non-standard columns allowed"""

    file_ext = "bedstrict"

    #no user change of datatype allowed
    allow_datatype_change = False

    #Read only metadata elements
    MetadataElement( name="chromCol", default=1, desc="Chrom column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", readonly=True, param=metadata.MetadataParameter ) #TODO: start and end should be able to be set to these or the proper thick[start/end]?
    MetadataElement( name="endCol", default=3, desc="End column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__( self, **kwd ):
        # Note: intentionally skips Bed/Interval __init__ and goes straight to Tabular,
        # so the old-style UCSC display app is never registered.
        Tabular.__init__( self, **kwd )
        self.clear_display_apps() #only new style display applications for this datatype

    def set_meta( self, dataset, overwrite = True, **kwd ):
        Tabular.set_meta( self, dataset, overwrite = overwrite, **kwd) #need column count first
        # Strict BED column positions are fixed, so derive name/strand directly
        # from the column count instead of scanning the data.
        if dataset.metadata.columns >= 4:
            dataset.metadata.nameCol = 4
            if dataset.metadata.columns >= 6:
                dataset.metadata.strandCol = 6

    def sniff( self, filename ):
        return False #NOTE: This would require aggressively validating the entire file
 569
class Bed6( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 6"""

    # Only the extension differs from BedStrict; nothing in this class itself
    # enforces the 6-column shape.
    # NOTE(review): enforcement presumably happens wherever data is converted
    # to this datatype -- confirm.
    file_ext = "bed6"
 574
class Bed12( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 12"""

    # Only the extension differs from BedStrict; nothing in this class itself
    # enforces the 12-column shape.
    # NOTE(review): enforcement presumably happens wherever data is converted
    # to this datatype -- confirm.
    file_ext = "bed12"
 579
 580class _RemoteCallMixin:
 581    def _get_remote_call_url( self, redirect_url, site_name, dataset, type, app, base_url ):
 582        """Retrieve the URL to call out to an external site and retrieve data.
 583        This routes our external URL through a local galaxy instance which makes
 584        the data available, followed by redirecting to the remote site with a
 585        link back to the available information.
 586        """
 587        internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='%s_%s' % ( type, site_name ) )
 588        base_url = app.config.get( "display_at_callback", base_url )
 589        display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % \
 590                                         ( base_url, url_for( controller='root' ), dataset.id, type ) )
 591        link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
 592        return link
 593
 594
@dataproviders.decorators.has_dataproviders
class Gff( Tabular, _RemoteCallMixin ):
    """Tab delimited data in GFF format.

    Nine tab-separated fields per feature line; attribute keys found in the
    ninth (group) field are parsed into the dataset's attribute metadata.
    """
    file_ext = "gff"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
    data_sources = { "data": "interval_index", "index": "bigwig", "feature_search": "fli" }
    track_type = Interval.track_type

    """Add metadata elements"""
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','int','str','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )

    MetadataElement( name="attributes", default=0, desc="Number of attributes", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="attribute_types", default={}, desc="Attribute types", param=metadata.DictParameter, readonly=True, visible=False, no_value=[] )

    def __init__( self, **kwd ):
        """Initialize datatype, adding the UCSC and GBrowse display apps."""
        Tabular.__init__(self, **kwd)
        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )

    def set_attribute_metadata( self, dataset ):
        """
        Sets metadata elements for dataset's attributes.
        """

        # Use first N lines to set metadata for dataset attributes. Attributes
        # not found in the first N lines will not have metadata.
        num_lines = 200
        attribute_types = {}
        for i, line in enumerate( file ( dataset.file_name ) ):
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                if len( elems ) == 9:
                    try:
                        # Loop through attributes to set types.
                        for name, value in parse_gff_attributes( elems[8] ).items():
                            # Default type is string.
                            value_type = "str"
                            try:
                                # Try int.
                                int( value )
                                value_type = "int"
                            except:
                                try:
                                    # Try float.
                                    float( value )
                                    value_type = "float"
                                except:
                                    pass
                            # If an attribute appears on several lines with
                            # different inferred types, the last line wins.
                            attribute_types[ name ] = value_type
                    except:
                        pass
                if i + 1 == num_lines:
                    break

        # Set attribute metadata and then set additional metadata.
        dataset.metadata.attribute_types = attribute_types
        dataset.metadata.attributes = len( attribute_types )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Skip to the first structurally valid feature line (9 fields with
        integer start/end), then delegate column typing to Tabular.set_meta
        with that many leading lines skipped."""
        self.set_attribute_metadata( dataset )

        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                if len(elems) == 9:
                    try:
                        # A data line must have integer start/end coordinates.
                        int( elems[3] )
                        int( elems[4] )
                        break
                    except:
                        pass
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
    def get_estimated_display_viewport( self, dataset ):
        """
        Return a chrom, start, stop tuple for viewing a file.  There are slight differences between gff 2 and gff 3
        formats.  This function should correctly handle both...
        """
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if self.displayable( dataset ):
            try:
                seqid = None
                start = sys.maxint
                stop = 0
                fh = open( dataset.file_name )
                while True:
                    # Bounded readline so a single enormous line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    try:
                        if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000
                            elems = line.rstrip( '\n\r' ).split()
                            if len( elems ) > 3:
                                # line looks like:
                                # ##sequence-region   ctg123 1 1497228
                                seqid = elems[1] # IV
                                start = int( elems[2] )# 6000000
                                stop = int( elems[3] ) # 6030000
                                break #use location declared in file
                            elif len( elems ) == 2 and elems[1].find( '..' ) > 0:
                                # line looks like this:
                                # ##sequence-region X:120000..140000
                                elems = elems[1].split( ':' )
                                seqid = elems[0]
                                start = int( elems[1].split( '..' )[0] )
                                stop = int( elems[1].split( '..' )[1] )
                                break #use location declared in file
                            else:
                                # NOTE(review): log.exception outside an except
                                # block records a spurious traceback; log.warning
                                # looks like what was intended here -- confirm.
                                log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) )
                                #break #no break, if bad definition, we try another method
                        elif line.startswith("browser position"):
                            # Allow UCSC style browser and track info in the GFF file
                            pos_info = line.split()[-1]
                            seqid, startend = pos_info.split(":")
                            start, stop = map( int, startend.split("-") )
                            break #use location declared in file
                        elif True not in map( line.startswith, ( '#', 'track', 'browser' ) ):# line.startswith() does not accept iterator in python2.4
                            # An actual feature line: grow the viewport to cover it.
                            viewport_feature_count -= 1
                            elems = line.rstrip( '\n\r' ).split( '\t' )
                            if len( elems ) > 3:
                                if not seqid:
                                    # We can only set the viewport for a single chromosome
                                    seqid = elems[0]
                                if seqid == elems[0]:
                                    # Make sure we have not spanned chromosomes
                                    start = min( start, int( elems[3] ) )
                                    stop = max( stop, int( elems[4] ) )
                    except:
                        #most likely start/stop is not an int or not enough fields
                        pass
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not viewport_feature_count or not max_line_count:
                        #exceeded viewport or total line count to check
                        break
                if seqid is not None:
                    return ( seqid, str( start ), str( stop ) ) #Necessary to return strings?
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def ucsc_links( self, dataset, type, app, base_url ):
        """Build (site_name, url) link pairs for configured UCSC sites matching this dbkey."""
        ret_val = []
        seqid, start, stop = self.get_estimated_display_viewport( dataset )
        if seqid is not None:
            for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
                if site_name in app.config.ucsc_display_sites:
                    redirect_url = urllib.quote_plus(
                            "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" %
                            ( site_url, dataset.dbkey, seqid, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def gbrowse_links( self, dataset, type, app, base_url ):
        """Build (site_name, url) link pairs for configured GBrowse sites matching this dbkey."""
        ret_val = []
        seqid, start, stop = self.get_estimated_display_viewport( dataset )
        if seqid is not None:
            for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
                if site_name in app.config.gbrowse_display_sites:
                    # GBrowse expects chromosome names without the UCSC 'chr' prefix.
                    if seqid.startswith( 'chr' ) and len ( seqid ) > 3:
                        seqid = seqid[3:]
                    redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, seqid, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def sniff( self, filename ):
        """
        Determines whether the file is in gff format

        GFF lines have nine required fields that must be tab-separated.

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3

        >>> fname = get_test_fname( 'gff_version_3.gff' )
        >>> Gff().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gff' )
        >>> Gff().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len(headers) < 2:
                return False
            for hdr in headers:
                # A gff-version pragma not mentioning '2' rules this out
                # (version 3 is handled by the Gff3 subclass).
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
                    return False
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        # Start and end coordinates must be integers.
                        int( hdr[3] )
                        int( hdr[4] )
                    except:
                        return False
                    if hdr[5] != '.':
                        try:
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in data.valid_strand:
                        return False
            return True
        except:
            return False

    # ------------- Dataproviders
    # redefine bc super is Tabular
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        # Columns 0/3/4 are seqname/start/end in GFF.
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 0, 3, 4, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dataprovider( self, dataset, **settings ):
        # Column indices 0, 3, 4, 6, 2 -- presumably chrom/start/end/strand/name;
        # confirm against IntervalDataProvider's signature.
        return dataproviders.dataset.IntervalDataProvider( dataset, 0, 3, 4, 6, 2, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval-dict',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.interval_dataprovider( dataset, **settings )
 837
 838
 839class Gff3( Gff ):
 840    """Tab delimited data in Gff3 format"""
 841    file_ext = "gff3"
 842    valid_gff3_strand = ['+', '-', '.', '?']
 843    valid_gff3_phase = ['.', '0', '1', '2']
 844    column_names = [ 'Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes' ]
 845    track_type = Interval.track_type
 846
 847    """Add metadata elements"""
 848    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
 849
 850    def __init__(self, **kwd):
 851        """Initialize datatype, by adding GBrowse display app"""
 852        Gff.__init__(self, **kwd)
 853    def set_meta( self, dataset, overwrite = True, **kwd ):
 854        self.set_attribute_metadata( dataset )
 855
 856        i = 0
 857        for i, line in enumerate( file ( dataset.file_name ) ):
 858            line = line.rstrip('\r\n')
 859            if line and not line.startswith( '#' ):
 860                elems = line.split( '\t' )
 861                valid_start = False
 862                valid_end = False
 863                if len( elems ) == 9:
 864                    try:
 865                        start = int( elems[3] )
 866                        valid_start = True
 867                    except:
 868                        if elems[3] == '.':
 869                            valid_start = True
 870                    try:
 871                        end = int( elems[4] )
 872                        valid_end = True
 873                    except:
 874                        if elems[4] == '.':
 875                            valid_end = True
 876                    strand = elems[6]
 877                    phase = elems[7]
 878                    if valid_start and valid_end and start < end and strand in self.valid_gff3_strand and phase in self.valid_gff3_phase:
 879                        break
 880        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
 881    def sniff( self, filename ):
 882        """
 883        Determines whether the file is in gff version 3 format
 884
 885        GFF 3 format:
 886
 887        1) adds a mechanism for representing more than one level
 888           of hierarchical grouping of features and subfeatures.
 889        2) separates the ideas of group membership and feature name/id
 890        3) constrains the feature type field to be taken from a controlled
 891           vocabulary.
 892        4) allows a single feature, such as an exon, to belong to more than
 893           one group at a time.
 894        5) provides an explicit convention for pairwise alignments
 895        6) provides an explicit convention for features that occupy disjunct regions
 896
 897        The format consists of 9 columns, separated by tabs (NOT spaces).
 898
 899        Undefined fields are replaced with the "." character, as described in the original GFF spec.
 900
 901        For complete details see http://song.sourceforge.net/gff3.shtml
 902
 903        >>> fname = get_test_fname( 'test.gff' )
 904        >>> Gff3().sniff( fname )
 905        False
 906        >>> fname = get_test_fname('gff_version_3.gff')
 907        >>> Gff3().sniff( fname )
 908        True
 909        """
 910        headers = get_headers( filename, '\t' )
 911        try:
 912            if len(headers) < 2:
 913                return False
 914            for hdr in headers:
 915                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) >= 0:
 916                    return True
 917                elif hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) < 0:
 918                    return False
 919                # Header comments may have been stripped, so inspect the data
 920                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
 921                    if len(hdr) != 9:
 922                        return False
 923                    try:
 924                        int( hdr[3] )
 925                    except:
 926                        if hdr[3] != '.':
 927                            return False
 928                    try:
 929                        int( hdr[4] )
 930                    except:
 931                        if hdr[4] != '.':
 932                            return False
 933                    if hdr[5] != '.':
 934                        try:
 935                            score = float( hdr[5] )
 936                        except:
 937                            return False
 938                    if hdr[6] not in self.valid_gff3_strand:
 939                        return False
 940                    if hdr[7] not in self.valid_gff3_phase:
 941                        return False
 942            return True
 943        except:
 944            return False
 945
 946class Gtf( Gff ):
 947    """Tab delimited data in Gtf format"""
 948    file_ext = "gtf"
 949    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes' ]
 950    track_type = Interval.track_type
 951
 952    """Add metadata elements"""
 953    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
 954    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
 955
 956    def sniff( self, filename ):
 957        """
 958        Determines whether the file is in gtf format
 959
 960        GTF lines have nine required fields that must be tab-separated. The first eight GTF fields are the same as GFF.
 961        The group field has been expanded into a list of attributes. Each attribute consists of a type/value pair.
 962        Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space.
 963        The attribute list must begin with the two mandatory attributes:
 964
 965            gene_id value - A globally unique identifier for the genomic source of the sequence.
 966            transcript_id value - A globally unique identifier for the predicted transcript.
 967
 968        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format4
 969
 970        >>> fname = get_test_fname( '1.bed' )
 971        >>> Gtf().sniff( fname )
 972        False
 973        >>> fname = get_test_fname( 'test.gff' )
 974        >>> Gtf().sniff( fname )
 975        False
 976        >>> fname = get_test_fname( 'test.gtf' )
 977        >>> Gtf().sniff( fname )
 978        True
 979        """
 980        headers = get_headers( filename, '\t' )
 981        try:
 982            if len(headers) < 2:
 983                return False
 984            for hdr in headers:
 985                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
 986                    return False
 987                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
 988                    if len(hdr) != 9:
 989                        return False
 990                    try:
 991                        int( hdr[3] )
 992                        int( hdr[4] )
 993                    except:
 994                        return False
 995                    if hdr[5] != '.':
 996                        try:
 997                            score = float( hdr[5] )
 998                        except:
 999                            return False
1000                    if hdr[6] not in data.valid_strand:
1001                        return False
1002
1003                    # Check attributes for gene_id, transcript_id
1004                    attributes = parse_gff_attributes( hdr[8] )
1005                    if len( attributes ) >= 2:
1006                        if 'gene_id' not in attributes:
1007                            return False
1008                        if 'transcript_id' not in attributes:
1009                            return False
1010                    else:
1011                        return False
1012            return True
1013        except:
1014            return False
1015
1016@dataproviders.decorators.has_dataproviders
1017class Wiggle( Tabular, _RemoteCallMixin ):
1018    """Tab delimited data in wiggle format"""
1019    file_ext = "wig"
1020    track_type = "LineTrack"
1021    data_sources = { "data": "bigwig", "index": "bigwig" }
1022
1023    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
1024
1025    def __init__( self, **kwd ):
1026        Tabular.__init__( self, **kwd )
1027        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
1028        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
    def get_estimated_display_viewport( self, dataset ):
        """Return a chrom, start, stop tuple for viewing a file.

        Understands UCSC 'browser' position lines as well as variableStep/
        fixedStep declaration lines; otherwise grows the window from the
        data lines themselves. Returns (None, None, None) when no viewport
        can be determined.
        """
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if self.displayable( dataset ):
            try:
                chrom = None
                start = sys.maxint
                end = 0
                span = 1
                step = None
                fh = open( dataset.file_name )
                while True:
                    # Bounded readline so a single enormous line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    try:
                        if line.startswith( "browser" ):
                            # A UCSC browser line declares the position outright.
                            chr_info = line.rstrip( '\n\r' ).split()[-1]
                            chrom, coords = chr_info.split( ":" )
                            start, end = map( int, coords.split( "-" ) )
                            break # use the browser line
                        # variableStep chrom=chr20
                        if line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                            if chrom is not None: break #different chrom or different section of the chrom
                            chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                            if 'span=' in line:
                                span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                            if 'step=' in line:
                                # fixedStep: position advances by 'step' for each value line.
                                step = int( line.rstrip( '\n\r' ).split("step=")[1].split()[0] )
                                start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                        else:
                            fields = line.rstrip( '\n\r' ).split()
                            if fields:
                                if step is not None:
                                    # fixedStep data: extend the window arithmetically.
                                    if not end:
                                        end = start + span
                                    else:
                                        end += step
                                else:
                                    # variableStep/BED-style data: first field is a position.
                                    start = min( int( fields[0] ), start )
                                    end = max( end, int( fields[0] ) + span )
                                viewport_feature_count -= 1
                    except:
                        # Malformed line (non-integer position, missing fields); skip it.
                        pass
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not viewport_feature_count or not max_line_count:
                        #exceeded viewport or total line count to check
                        break
                if chrom is not None:
                    return ( chrom, str( start ), str( end ) ) #Necessary to return strings?
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
1090    def gbrowse_links( self, dataset, type, app, base_url ):
1091        ret_val = []
1092        chrom, start, stop = self.get_estimated_display_viewport( dataset )
1093        if chrom is not None:
1094            for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
1095                if site_name in app.config.gbrowse_display_sites:
1096                    if chrom.startswith( 'chr' ) and len ( chrom ) > 3:
1097                        chrom = chrom[3:]
1098                    redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, chrom, start, stop ) )
1099                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
1100                    ret_val.append( ( site_name, link ) )
1101        return ret_val
1102    def ucsc_links( self, dataset, type, app, base_url ):
1103        ret_val = []
1104        chrom, start, stop = self.get_estimated_display_viewport( dataset )
1105        if chrom is not None:
1106            for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
1107                if site_name in app.config.ucsc_display_sites:
1108                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % ( site_url, dataset.dbkey, chrom, start, stop ) )
1109                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
1110                    ret_val.append( ( site_name, link ) )
1111        return ret_val
1112    def display_peek( self, dataset ):
1113        """Returns formated html of peek"""
1114        return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
1115    def set_meta( self, dataset, overwrite = True, **kwd ):
1116        max_data_lines = None
1117        i = 0
1118        for i, line in enumerate( file ( dataset.file_name ) ):
1119            line = line.rstrip('\r\n')
1120            if line and not line.startswith( '#' ):
1121                elems = line.split( '\t' )
1122                try:
1123                    float( elems[0] ) #"Wiggle track data values can be integer or real, positive or negative values"
1124                    break
1125                except:
1126                    do_break = False
1127                    for col_startswith in data.col1_startswith:
1128                        if elems[0].lower().startswith( col_startswith ):
1129                            do_break = True
1130                            break
1131                    if do_break:
1132                        break
1133        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
1134            #we'll arbitrarily only use the first 100 data lines in this wig file to calculate tabular attributes (column types)
1135            #this should be sufficient, except when we have mixed wig track types (bed, variable, fixed),
1136            #    but those cases are not a single table that would have consistant column definitions
1137            #optional metadata values set in Tabular class will be 'None'
1138            max_data_lines = 100
1139        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i, max_data_lines = max_data_lines )
1140    def sniff( self, filename ):
1141        """
1142        Determines wether the file is in wiggle format
1143
1144        The .wig format is line-oriented. Wiggle data is preceeded by a track definition line,
1145        which adds a number of options for controlling the default display of this track.
1146        Following the track definition line is the track data, which can be entered in several
1147        different formats.
1148
1149        The track definition line begins with the word 'track' followed by the track type.
1150        The track type with version is REQUIRED, and it currently must be wiggle_0.  For example,
1151        track type=wiggle_0...
1152
1153        For complete details see http://genome.ucsc.edu/goldenPath/help/wiggle.html
1154
1155        >>> fname = get_test_fname( 'interval1.bed' )
1156        >>> Wiggle().sniff( fname )
1157        False
1158        >>> fname = get_test_fname( 'wiggle.wig' )
1159        >>> Wiggle().sniff( fname )
1160        True
1161        """
1162        headers = get_headers( filename, None )
1163        try:
1164            for hdr in headers:
1165                if len(hdr) > 1 and hdr[0] == 'track' and hdr[1].startswith('type=wiggle'):
1166                    return True
1167            return False
1168        except:
1169            return False
1170    def get_track_window(self, dataset, data, start, end):
1171        """
1172        Assumes we have a numpy file.
1173        """
1174        # Maybe if we import here people will still be able to use Galaxy when numpy kills it
1175        pkg_resources.require("numpy>=1.2.1")
1176        #from numpy.lib import format
1177        import numpy
1178
1179        range = end - start
1180        # Determine appropriate resolution to plot ~1000 points
1181        resolution = ( 10 ** math.ceil( math.log10( range / 1000 ) ) )
1182        # Restrict to valid range
1183        resolution = min( resolution, 100000 )
1184        resolution = max( resolution, 1 )
1185        # Memory map the array (don't load all the data)
1186        data = numpy.load( data )
1187        # Grab just what we need
1188        t_start = math.floor( start / resolution )
1189        t_end = math.ceil( end / resolution )
1190        x = numpy.arange( t_start, t_end ) * resolution
1191        y = data[ t_start : t_end ]
1192
1193        return zip(x.tolist(), y.tolist())
1194    def get_track_resolution( self, dataset, start, end):
1195        range = end - start
1196        # Determine appropriate resolution to plot ~1000 points
1197        resolution = math.ceil( 10 ** math.ceil( math.log10( range / 1000 ) ) )
1198        # Restrict to valid range
1199        resolution = min( resolution, 100000 )
1200        resolution = max( resolution, 1 )
1201        return resolution
1202
1203    # ------------- Dataproviders
1204    @dataproviders.decorators.dataprovider_factory( 'wiggle', dataproviders.dataset.WiggleDataProvider.settings )
1205    def wiggle_dataprovider( self, dataset, **settings ):
1206        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
1207        return dataproviders.dataset.WiggleDataProvider( dataset_source, **settings )
1208
1209    @dataproviders.decorators.dataprovider_factory( 'wiggle-dict', dataproviders.dataset.WiggleDataProvider.settings )
1210    def wiggle_dict_dataprovider( self, dataset, **settings ):
1211        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
1212        settings[ 'named_columns' ] = True
1213        return dataproviders.dataset.WiggleDataProvider( dataset_source, **settings )
1214
1215
class CustomTrack ( Tabular ):
    """
    UCSC CustomTrack: a tabular (BED- or WIG-style) file whose first line is
    a 'track ...' definition line.
    """
    file_ext = "customtrack"

    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display app"""
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set column metadata, skipping the leading track definition line."""
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """
        Return a chrom, start, stop tuple (of strings) for viewing a file, or
        ( None, None, None ) if no viewport could be determined.
        """
        #FIXME: only BED and WIG custom tracks are currently supported
        #As per previously existing behavior, viewport will only be over the first intervals
        max_line_count = 100 # maximum number of lines to check; includes comment lines
        variable_step_wig = False
        chrom = None
        span = 1
        if self.displayable( dataset ):
            try:
                # Use a context manager so the file handle is closed on every
                # exit path -- it previously leaked on each early return.
                with open( dataset.file_name ) as fh:
                    while True:
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        if not line.startswith( '#' ):
                            try:
                                if variable_step_wig:
                                    # variableStep data line: "<start> <value>"
                                    fields = line.rstrip().split()
                                    if len( fields ) == 2:
                                        start = int( fields[ 0 ] )
                                        return ( chrom, str( start ), str( start + span ) )
                                elif line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                                    # WIG declaration line carries chrom= and optionally span=/start=
                                    chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                                    if 'span=' in line:
                                        span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                                    if 'start=' in line:
                                        start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                                        return ( chrom, str( start ), str( start + span )  )
                                    else:
                                        # variableStep: start comes from the next data line
                                        variable_step_wig = True
                                else:
                                    # BED-style line: chrom, start, end in the first three columns
                                    fields = line.rstrip().split( '\t' )
                                    if len( fields ) >= 3:
                                        chrom = fields[ 0 ]
                                        start = int( fields[ 1 ] )
                                        end = int( fields[ 2 ] )
                                        return ( chrom, str( start ), str( end ) )
                            except Exception:
                                #most likely a non-integer field has been encountered for start / stop
                                continue
                        #make sure we are at the next new line
                        readline_count = VIEWPORT_MAX_READS_PER_LINE
                        while line.rstrip( '\n\r' ) == line:
                            assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                            line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                            if not line: break #EOF
                            readline_count -= 1
                        max_line_count -= 1
                        if not max_line_count:
                            #exceeded viewport or total line count to check
                            break
            # 'except Exception as e' is valid on Python 2.6+ and 3, unlike
            # the old 'except Exception, e' form used elsewhere in this file.
            except Exception as e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def ucsc_links( self, dataset, type, app, base_url ):
        """Return ( site_name, link ) pairs for displaying this dataset at UCSC."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport(dataset)
        if chrom is not None:
            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
                if site_name in app.config.ucsc_display_sites:
                    internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
                    display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type) )
                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop ) )
                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
                    ret_val.append( (site_name, link) )
        return ret_val
    def sniff( self, filename ):
        """
        Determines whether the file is in customtrack format.

        CustomTrack files are built within Galaxy and are basically bed or interval files with the first line looking
        something like this.

        track name="User Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1

        >>> fname = get_test_fname( 'complete.bed' )
        >>> CustomTrack().sniff( fname )
        False
        >>> fname = get_test_fname( 'ucsc.customtrack' )
        >>> CustomTrack().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        first_line = True
        for hdr in headers:
            if first_line:
                # First line must be a 'track' line carrying both a color
                # and a visibility attribute.
                first_line = False
                try:
                    if hdr[0].startswith('track'):
                        color_found = False
                        visibility_found = False
                        for elem in hdr[1:]:
                            if elem.startswith('color'): color_found = True
                            if elem.startswith('visibility'): visibility_found = True
                            if color_found and visibility_found: break
                        if not color_found or not visibility_found: return False
                    else: return False
                # Narrowed from a bare 'except:' so that e.g.
                # KeyboardInterrupt is no longer swallowed.
                except Exception: return False
            else:
                # Subsequent lines must look like intervals: at least three
                # columns with integer start/end.
                try:
                    if hdr[0] and not hdr[0].startswith( '#' ):
                        if len( hdr ) < 3:
                            return False
                        try:
                            int( hdr[1] )
                            int( hdr[2] )
                        except Exception:
                            return False
                except Exception:
                    return False
        return True
1341
1342
class ENCODEPeak( Interval ):
    '''
    Human ENCODE peak format. There are both broad and narrow peak formats.
    Formats are very similar; narrow peak has an additional column, though.

    Broad peak ( http://genome.ucsc.edu/FAQ/FAQformat#format13 ):
    This format is used to provide called regions of signal enrichment based
    on pooled, normalized (interpreted) data. It is a BED 6+3 format.

    Narrow peak http://genome.ucsc.edu/FAQ/FAQformat#format12 and :
    This format is used to provide called peaks of signal enrichment based on
    pooled, normalized (interpreted) data. It is a BED6+4 format.
    '''

    file_ext = "encodepeak"
    # Display labels for the ten narrow-peak columns; broad peak lacks 'Peak'.
    column_names = [ 'Chrom', 'Start', 'End', 'Name', 'Score', 'Strand', 'SignalValue', 'pValue', 'qValue', 'Peak' ]
    # NOTE(review): presumably consumed by track/visualization machinery to
    # pick tabix/bigwig backing data -- confirm against callers.
    data_sources = { "data": "tabix", "index": "bigwig" }

    """Add metadata elements"""
    # Defaults use 1-based column positions: chrom/start/end map to the first
    # three columns, matching column_names above.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    # Strand is optional; no_value=0 marks "no strand column selected".
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def sniff( self, filename ):
        # Never auto-detected: always returns False, so this datatype must be
        # assigned explicitly by the user or tool.
        return False
1370
1371
class ChromatinInteractions( Interval ):
    '''
    Chromatin interactions obtained from 3C/5C/Hi-C experiments.
    '''

    file_ext = "chrint"
    track_type = "DiagonalHeatmapTrack"
    # NOTE(review): presumably consumed by track/visualization machinery to
    # pick tabix/bigwig backing data -- confirm against callers.
    data_sources = { "data": "tabix", "index": "bigwig" }

    # Display labels: two genomic anchors (chrom/start/end each) plus a value.
    column_names = [ 'Chrom1', 'Start1', 'End1', 'Chrom2', 'Start2', 'End2', 'Value' ]

    """Add metadata elements"""
    # Defaults use 1-based column positions matching column_names above.
    MetadataElement( name="chrom1Col", default=1, desc="Chrom1 column", param=metadata.ColumnParameter )
    MetadataElement( name="start1Col", default=2, desc="Start1 column", param=metadata.ColumnParameter )
    MetadataElement( name="end1Col", default=3, desc="End1 column", param=metadata.ColumnParameter )
    MetadataElement( name="chrom2Col", default=4, desc="Chrom2 column", param=metadata.ColumnParameter )
    MetadataElement( name="start2Col", default=5, desc="Start2 column", param=metadata.ColumnParameter )
    MetadataElement( name="end2Col", default=6, desc="End2 column", param=metadata.ColumnParameter )
    MetadataElement( name="valueCol", default=7, desc="Value column", param=metadata.ColumnParameter )

    MetadataElement( name="columns", default=7, desc="Number of columns", readonly=True, visible=False )

    def sniff( self, filename ):
        # Never auto-detected: always returns False, so this datatype must be
        # assigned explicitly by the user or tool.
        return False
1396
1397
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings (e.g. sniff()).
    import doctest
    import sys
    doctest.testmod( sys.modules[ __name__ ] )