PageRenderTime 31ms CodeModel.GetById 14ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/filters/gff_to_bed_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 133 lines | 78 code | 18 blank | 37 comment | 27 complexity | c5910f4d713d4d0fb80cbb5740a679e6 MD5 | raw file
  1#!/usr/bin/env python
  2import sys
  3from galaxy import eggs
  4from galaxy.datatypes.util.gff_util import parse_gff_attributes
  5
  6assert sys.version_info[:2] >= ( 2, 4 )
  7
  8def get_bed_line( chrom, name, strand, blocks ):
  9    """ Returns a BED line for given data. """
 10
 11    
 12    if len( blocks ) == 1:
 13        # Use simple BED format if there is only a single block:
 14        #   chrom, chromStart, chromEnd, name, score, strand
 15        #
 16        start, end = blocks[0]
 17        return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )
 18
 19    #
 20    # Build lists for transcript blocks' starts, sizes.
 21    #
 22    
 23    # Get transcript start, end.
 24    t_start = sys.maxint
 25    t_end = -1
 26    for block_start, block_end in blocks:
 27        if block_start < t_start:
 28            t_start = block_start
 29        if block_end > t_end:
 30            t_end = block_end
 31            
 32    # Get block starts, sizes.
 33    block_starts = []
 34    block_sizes = []
 35    for block_start, block_end in blocks:
 36        block_starts.append( str( block_start - t_start ) )
 37        block_sizes.append( str( block_end - block_start ) )
 38    
 39    #
 40    # Create BED entry.
 41    # Bed format: chrom, chromStart, chromEnd, name, score, strand, \
 42    #               thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
 43    #
 44    # Render complete feature with thick blocks. There's no clear way to do this unless
 45    # we analyze the block names, but making everything thick makes more sense than
 46    # making everything thin.
 47    #
 48    return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
 49            ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ), 
 50                ",".join( block_sizes ), ",".join( block_starts ) )
 51
 52def __main__():
 53    input_name = sys.argv[1]
 54    output_name = sys.argv[2]
 55    skipped_lines = 0
 56    first_skipped_line = 0
 57    out = open( output_name, 'w' )
 58    i = 0
 59    cur_transcript_chrom = None
 60    cur_transcript_id = None
 61    cur_transcript_strand = None
 62    cur_transcripts_blocks = [] # (start, end) for each block.
 63    for i, line in enumerate( file( input_name ) ):
 64        line = line.rstrip( '\r\n' )
 65        if line and not line.startswith( '#' ):
 66            try:
 67                # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes
 68                elems = line.split( '\t' )
 69                start = str( long( elems[3] ) - 1 )
 70                coords = [ long( start ), long( elems[4] ) ]
 71                strand = elems[6]
 72                if strand not in ['+', '-']:
 73                    strand = '+'
 74                attributes = parse_gff_attributes( elems[8] )
 75                t_id = attributes.get( "transcript_id", None )
 76                    
 77                if not t_id:
 78                    #
 79                    # No transcript ID, so write last transcript and write current line as its own line.
 80                    #
 81                    
 82                    # Write previous transcript.
 83                    if cur_transcript_id:
 84                        # Write BED entry.
 85                        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
 86                    
 87                    # Replace any spaces in the name with underscores so UCSC will not complain.
 88                    name = elems[2].replace(" ", "_")
 89                    out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
 90                    continue
 91                
 92                # There is a transcript ID, so process line at transcript level.
 93                if t_id == cur_transcript_id:
 94                    # Line is element of transcript and will be a block in the BED entry.
 95                    cur_transcripts_blocks.append( coords )
 96                    continue
 97                    
 98                #
 99                # Line is part of new transcript; write previous transcript and start
100                # new transcript.
101                #
102                
103                # Write previous transcript.
104                if cur_transcript_id:
105                    # Write BED entry.
106                    out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
107
108                # Start new transcript.
109                cur_transcript_chrome = elems[0]
110                cur_transcript_id = t_id
111                cur_transcript_strand = strand
112                cur_transcripts_blocks = []
113                cur_transcripts_blocks.append( coords )    
114            except:
115                skipped_lines += 1
116                if not first_skipped_line:
117                    first_skipped_line = i + 1
118        else:
119            skipped_lines += 1
120            if not first_skipped_line:
121                first_skipped_line = i + 1
122    
123    # Write last transcript.
124    if cur_transcript_id:
125        # Write BED entry.
126        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
127    out.close()
128    info_msg = "%i lines converted to BED.  " % ( i + 1 - skipped_lines )
129    if skipped_lines > 0:
130        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
131    print info_msg
132
133if __name__ == "__main__": __main__()