PageRenderTime 60ms CodeModel.GetById 29ms app.highlight 25ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/datatypes/util/gff_util.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 431 lines | 424 code | 2 blank | 5 comment | 0 complexity | 5db39e67f201339020b7afc4e14b789a MD5 | raw file
  1"""
  2Provides utilities for working with GFF files.
  3"""
  4
  5import copy
  6import pkg_resources; pkg_resources.require( "bx-python" )
  7from bx.intervals.io import *
  8from bx.tabular.io import Header, Comment
  9from galaxy.util.odict import odict
 10
 11class GFFInterval( GenomicInterval ):
 12    """
 13    A GFF interval, including attributes. If file is strictly a GFF file,
 14    only attribute is 'group.'
 15    """
 16    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
 17                  strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
 18        # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
 19        # temporarily set strand and then unset after initing GenomicInterval.
 20        unknown_strand = False
 21        if not fix_strand and fields[ strand_col ] == '.':
 22            unknown_strand = True
 23            fields[ strand_col ] = '+'
 24        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
 25                                  default_strand, fix_strand=fix_strand )
 26        if unknown_strand:
 27            self.strand = '.'
 28            self.fields[ strand_col ] = '.'
 29
 30        # Handle feature, score column.
 31        self.feature_col = feature_col
 32        if self.feature_col >= self.nfields:
 33            raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
 34        self.feature = self.fields[ self.feature_col ]
 35        self.score_col = score_col
 36        if self.score_col >= self.nfields:
 37            raise MissingFieldError( "No field for score_col (%d)" % score_col )
 38        self.score = self.fields[ self.score_col ]
 39
 40        # GFF attributes.
 41        self.attributes = parse_gff_attributes( fields[8] )
 42
 43    def copy( self ):
 44        return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
 45                           self.end_col, self.strand_col, self.score_col, self.strand)
 46
 47class GFFFeature( GFFInterval ):
 48    """
 49    A GFF feature, which can include multiple intervals.
 50    """
 51    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
 52                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
 53                  raw_size=0 ):
 54        # Use copy so that first interval and feature do not share fields.
 55        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
 56                              start_col, end_col, strand_col, score_col, default_strand, \
 57                              fix_strand=fix_strand )
 58        self.intervals = intervals
 59        self.raw_size = raw_size
 60        # Use intervals to set feature attributes.
 61        for interval in self.intervals:
 62            # Error checking. NOTE: intervals need not share the same strand.
 63            if interval.chrom != self.chrom:
 64                raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
 65                                  ( interval.chrom, self.chrom ) )
 66            # Set start, end of interval.
 67            if interval.start < self.start:
 68                self.start = interval.start
 69            if interval.end > self.end:
 70                self.end = interval.end
 71
 72    def name( self ):
 73        """ Returns feature's name. """
 74        name = None
 75        # Preference for name: GTF, GFF3, GFF.
 76        for attr_name in [
 77                           # GTF:
 78                           'gene_id', 'transcript_id',
 79                           # GFF3:
 80                           'ID', 'id',
 81                           # GFF (TODO):
 82                           'group' ]:
 83            name = self.attributes.get( attr_name, None )
 84            if name is not None:
 85                break
 86        return name
 87
 88    def copy( self ):
 89        intervals_copy = []
 90        for interval in self.intervals:
 91            intervals_copy.append( interval.copy() )
 92        return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
 93                          self.score_col, self.strand, intervals=intervals_copy )
 94
 95    def lines( self ):
 96        lines = []
 97        for interval in self.intervals:
 98            lines.append( '\t'.join( interval.fields ) )
 99        return lines
100
101
102class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
103    """
104    Reader wrapper that reads GFF intervals/lines and automatically converts
105    them to BED format.
106    """
107
108    def parse_row( self, line ):
109        # HACK: this should return a GFF interval, but bx-python operations
110        # require GenomicInterval objects and subclasses will not work.
111        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
112                                    self.end_col, self.strand_col, self.default_strand, \
113                                    fix_strand=self.fix_strand )
114        interval = convert_gff_coords_to_bed( interval )
115        return interval
116
117class GFFReaderWrapper( NiceReaderWrapper ):
118    """
119    Reader wrapper for GFF files.
120
121    Wrapper has two major functions:
122
123    1. group entries for GFF file (via group column), GFF3 (via id attribute),
124       or GTF (via gene_id/transcript id);
125    2. convert coordinates from GFF format--starting and ending coordinates
126       are 1-based, closed--to the 'traditional'/BED interval format--0 based,
127       half-open. This is useful when using GFF files as inputs to tools that
128       expect traditional interval format.
129    """
130
131    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
132                  end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
133        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
134                                    strand_col=strand_col, fix_strand=fix_strand, **kwargs )
135        self.feature_col = feature_col
136        self.score_col = score_col
137        self.convert_to_bed_coord = convert_to_bed_coord
138        self.last_line = None
139        self.cur_offset = 0
140        self.seed_interval = None
141        self.seed_interval_line_len = 0
142
143    def parse_row( self, line ):
144        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
145                                self.start_col, self.end_col, self.strand_col, self.score_col, \
146                                self.default_strand, fix_strand=self.fix_strand )
147        return interval
148
149    def next( self ):
150        """ Returns next GFFFeature. """
151
152        #
153        # Helper function.
154        #
155
156        def handle_parse_error( parse_error ):
157            """ Actions to take when ParseError found. """
158            if self.outstream:
159               if self.print_delegate and hasattr(self.print_delegate,"__call__"):
160                   self.print_delegate( self.outstream, e, self )
161            self.skipped += 1
162            # no reason to stuff an entire bad file into memmory
163            if self.skipped < 10:
164               self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
165
166            # For debugging, uncomment this to propogate parsing exceptions up.
167            # I.e. the underlying reason for an unexpected StopIteration exception
168            # can be found by uncommenting this.
169            # raise e
170
171        #
172        # Get next GFFFeature
173        #
174        raw_size = self.seed_interval_line_len
175
176        # If there is no seed interval, set one. Also, if there are no more
177        # intervals to read, this is where iterator dies.
178        if not self.seed_interval:
179            while not self.seed_interval:
180                try:
181                    self.seed_interval = GenomicIntervalReader.next( self )
182                except ParseError, e:
183                    handle_parse_error( e )
184                # TODO: When no longer supporting python 2.4 use finally:
185                #finally:
186                raw_size += len( self.current_line )
187
188        # If header or comment, clear seed interval and return it with its size.
189        if isinstance( self.seed_interval, ( Header, Comment ) ):
190            return_val = self.seed_interval
191            return_val.raw_size = len( self.current_line )
192            self.seed_interval = None
193            self.seed_interval_line_len = 0
194            return return_val
195
196        # Initialize feature identifier from seed.
197        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
198        # For GFF3
199        feature_id = self.seed_interval.attributes.get( 'ID', None )
200        feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
201        # For GTF.
202        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
203        feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
204
205        # Read all intervals associated with seed.
206        feature_intervals = []
207        feature_intervals.append( self.seed_interval )
208        while True:
209            try:
210                interval = GenomicIntervalReader.next( self )
211                raw_size += len( self.current_line )
212            except StopIteration, e:
213                # No more intervals to read, but last feature needs to be
214                # returned.
215                interval = None
216                raw_size += len( self.current_line )
217                break
218            except ParseError, e:
219                handle_parse_error( e )
220                raw_size += len( self.current_line )
221                continue
222            # TODO: When no longer supporting python 2.4 use finally:
223            #finally:
224            #raw_size += len( self.current_line )
225
226            # Ignore comments.
227            if isinstance( interval, Comment ):
228                continue
229
230            # Determine if interval is part of feature.
231            part_of = False
232            group = interval.attributes.get( 'group', None )
233            # GFF test:
234            if group and feature_group == group:
235                part_of = True
236            # GFF3 test:
237            parent_id = interval.attributes.get( 'Parent', None )
238            cur_id = interval.attributes.get( 'ID', None )
239            if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
240                part_of = True
241            # GTF test:
242            transcript_id = interval.attributes.get( 'transcript_id', None )
243            if transcript_id and transcript_id == feature_transcript_id:
244                part_of = True
245
246            # If interval is not part of feature, clean up and break.
247            if not part_of:
248                # Adjust raw size because current line is not part of feature.
249                raw_size -= len( self.current_line )
250                break
251
252            # Interval associated with feature.
253            feature_intervals.append( interval )
254
255        # Last interval read is the seed for the next interval.
256        self.seed_interval = interval
257        self.seed_interval_line_len = len( self.current_line )
258
259        # Return feature.
260        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
261                              self.end_col, self.strand_col, self.score_col, \
262                              self.default_strand, fix_strand=self.fix_strand, \
263                              intervals=feature_intervals, raw_size=raw_size )
264
265        # Convert to BED coords?
266        if self.convert_to_bed_coord:
267            convert_gff_coords_to_bed( feature )
268
269        return feature
270
271def convert_bed_coords_to_gff( interval ):
272    """
273    Converts an interval object's coordinates from BED format to GFF format.
274    Accepted object types include GenomicInterval and list (where the first
275    element in the list is the interval's start, and the second element is
276    the interval's end).
277    """
278    if isinstance( interval, GenomicInterval ):
279        interval.start += 1
280        if isinstance( interval, GFFFeature ):
281            for subinterval in interval.intervals:
282                convert_bed_coords_to_gff( subinterval )
283    elif type ( interval ) is list:
284        interval[ 0 ] += 1
285    return interval
286
287def convert_gff_coords_to_bed( interval ):
288    """
289    Converts an interval object's coordinates from GFF format to BED format.
290    Accepted object types include GFFFeature, GenomicInterval, and list (where
291    the first element in the list is the interval's start, and the second
292    element is the interval's end).
293    """
294    if isinstance( interval, GenomicInterval ):
295        interval.start -= 1
296        if isinstance( interval, GFFFeature ):
297            for subinterval in interval.intervals:
298                convert_gff_coords_to_bed( subinterval )
299    elif type ( interval ) is list:
300        interval[ 0 ] -= 1
301    return interval
302
303def parse_gff_attributes( attr_str ):
304    """
305    Parses a GFF/GTF attribute string and returns a dictionary of name-value
306    pairs. The general format for a GFF3 attributes string is
307
308        name1=value1;name2=value2
309
310    The general format for a GTF attribute string is
311
312        name1 "value1" ; name2 "value2"
313
314    The general format for a GFF attribute string is a single string that
315    denotes the interval's group; in this case, method returns a dictionary
316    with a single key-value pair, and key name is 'group'
317    """
318    attributes_list = attr_str.split(";")
319    attributes = {}
320    for name_value_pair in attributes_list:
321        # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
322        # attribute; next, try double quotes for GTF.
323        pair = name_value_pair.strip().split("=")
324        if len( pair ) == 1:
325            pair = name_value_pair.strip().split("\"")
326        if len( pair ) == 1:
327            # Could not split for some reason -- raise exception?
328            continue
329        if pair == '':
330            continue
331        name = pair[0].strip()
332        if name == '':
333            continue
334        # Need to strip double quote from values
335        value = pair[1].strip(" \"")
336        attributes[ name ] = value
337
338    if len( attributes ) == 0:
339        # Could not split attributes string, so entire string must be
340        # 'group' attribute. This is the case for strictly GFF files.
341        attributes['group'] = attr_str
342    return attributes
343
344def gff_attributes_to_str( attrs, gff_format ):
345    """
346    Convert GFF attributes to string. Supported formats are GFF3, GTF.
347    """
348    if gff_format == 'GTF':
349        format_string = '%s "%s"'
350        # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
351        id_attr = None
352        if 'group' in attrs:
353            id_attr = 'group'
354        elif 'ID' in attrs:
355            id_attr = 'ID'
356        elif 'Parent' in attrs:
357            id_attr = 'Parent'
358        if id_attr:
359            attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
360    elif gff_format == 'GFF3':
361        format_string = '%s=%s'
362    attrs_strs = []
363    for name, value in attrs.items():
364        attrs_strs.append( format_string % ( name, value ) )
365    return " ; ".join( attrs_strs )
366
367def read_unordered_gtf( iterator, strict=False ):
368    """
369    Returns GTF features found in an iterator. GTF lines need not be ordered
370    or clustered for reader to work. Reader returns GFFFeature objects sorted
371    by transcript_id, chrom, and start position.
372    """
373
374    # -- Get function that generates line/feature key. --
375
376    get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
377    if strict:
378        # Strict GTF parsing uses transcript_id only to group lines into feature.
379        key_fn = get_transcript_id
380    else:
381        # Use lenient parsing where chromosome + transcript_id is the key. This allows
382        # transcripts with same ID on different chromosomes; this occurs in some popular
383        # datasources, such as RefGenes in UCSC.
384        key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
385
386
387    # Aggregate intervals by transcript_id and collect comments.
388    feature_intervals = odict()
389    comments = []
390    for count, line in enumerate( iterator ):
391        if line.startswith( '#' ):
392            comments.append( Comment( line ) )
393            continue
394
395        line_key = key_fn( line.split('\t') )
396        if line_key in feature_intervals:
397            feature = feature_intervals[ line_key ]
398        else:
399            feature = []
400            feature_intervals[ line_key ] = feature
401        feature.append( GFFInterval( None, line.split( '\t' ) ) )
402
403    # Create features.
404    chroms_features = {}
405    for count, intervals in enumerate( feature_intervals.values() ):
406        # Sort intervals by start position.
407        intervals.sort( lambda a,b: cmp( a.start, b.start ) )
408        feature = GFFFeature( None, intervals=intervals )
409        if feature.chrom not in chroms_features:
410            chroms_features[ feature.chrom ] = []
411        chroms_features[ feature.chrom ].append( feature )
412
413    # Sort features by chrom, start position.
414    chroms_features_sorted = []
415    for chrom_features in chroms_features.values():
416        chroms_features_sorted.append( chrom_features )
417    chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
418    for features in chroms_features_sorted:
419        features.sort( lambda a,b: cmp( a.start, b.start ) )
420
421    # Yield comments first, then features.
422    # FIXME: comments can appear anywhere in file, not just the beginning.
423    # Ideally, then comments would be associated with features and output
424    # just before feature/line.
425    for comment in comments:
426        yield comment
427
428    for chrom_features in chroms_features_sorted:
429        for feature in chrom_features:
430            yield feature
431