gff_to_bed_converter.py - This Python script converts a GFF…

/tools/filters/gff_to_bed_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 133 lines · 78 code · 18 blank · 37 comment · 20 complexity · c5910f4d713d4d0fb80cbb5740a679e6 MD5 · raw file


#!/usr/bin/env python
import sys
from galaxy import eggs
from galaxy.datatypes.util.gff_util import parse_gff_attributes

assert sys.version_info[:2] >= ( 2, 4 )

def get_bed_line( chrom, name, strand, blocks ):
    """ Returns a BED line for given data. """

    
    if len( blocks ) == 1:
        # Use simple BED format if there is only a single block:
        #   chrom, chromStart, chromEnd, name, score, strand
        #
        start, end = blocks[0]
        return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )

    #
    # Build lists for transcript blocks' starts, sizes.
    #
    
    # Get transcript start, end.
    t_start = sys.maxint
    t_end = -1
    for block_start, block_end in blocks:
        if block_start < t_start:
            t_start = block_start
        if block_end > t_end:
            t_end = block_end
            
    # Get block starts, sizes.
    block_starts = []
    block_sizes = []
    for block_start, block_end in blocks:
        block_starts.append( str( block_start - t_start ) )
        block_sizes.append( str( block_end - block_start ) )
    
    #
    # Create BED entry.
    # Bed format: chrom, chromStart, chromEnd, name, score, strand, \
    #               thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
    #
    # Render complete feature with thick blocks. There's no clear way to do this unless
    # we analyze the block names, but making everything thick makes more sense than
    # making everything thin.
    #
    return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
            ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ), 
                ",".join( block_sizes ), ",".join( block_starts ) )

def __main__():
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    skipped_lines = 0
    first_skipped_line = 0
    out = open( output_name, 'w' )
    i = 0
    cur_transcript_chrom = None
    cur_transcript_id = None
    cur_transcript_strand = None
    cur_transcripts_blocks = [] # (start, end) for each block.
    for i, line in enumerate( file( input_name ) ):
        line = line.rstrip( '\r\n' )
        if line and not line.startswith( '#' ):
            try:
                # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes
                elems = line.split( '\t' )
                start = str( long( elems[3] ) - 1 )
                coords = [ long( start ), long( elems[4] ) ]
                strand = elems[6]
                if strand not in ['+', '-']:
                    strand = '+'
                attributes = parse_gff_attributes( elems[8] )
                t_id = attributes.get( "transcript_id", None )
                    
                if not t_id:
                    #
                    # No transcript ID, so write last transcript and write current line as its own line.
                    #
                    
                    # Write previous transcript.
                    if cur_transcript_id:
                        # Write BED entry.
                        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
                    
                    # Replace any spaces in the name with underscores so UCSC will not complain.
                    name = elems[2].replace(" ", "_")
                    out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
                    continue
                
                # There is a transcript ID, so process line at transcript level.
                if t_id == cur_transcript_id:
                    # Line is element of transcript and will be a block in the BED entry.
                    cur_transcripts_blocks.append( coords )
                    continue
                    
                #
                # Line is part of new transcript; write previous transcript and start
                # new transcript.
                #
                
                # Write previous transcript.
                if cur_transcript_id:
                    # Write BED entry.
                    out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )

                # Start new transcript.
                cur_transcript_chrome = elems[0]
                cur_transcript_id = t_id
                cur_transcript_strand = strand
                cur_transcripts_blocks = []
                cur_transcripts_blocks.append( coords )    
            except:
                skipped_lines += 1
                if not first_skipped_line:
                    first_skipped_line = i + 1
        else:
            skipped_lines += 1
            if not first_skipped_line:
                first_skipped_line = i + 1
    
    # Write last transcript.
    if cur_transcript_id:
        # Write BED entry.
        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
    out.close()
    info_msg = "%i lines converted to BED.  " % ( i + 1 - skipped_lines )
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
    print info_msg

if __name__ == "__main__": __main__()

Summary ✨

This Python script converts a GFF file to BED format, which is a simple text-based format for representing genomic features such as genes and transcripts. The script takes two command-line arguments: the input GFF file and the output BED file. It reads the input GFF file line by line, skipping blank or comment lines, and converts each feature to a BED entry. Each BED entry consists of 9 tab-delimited fields representing the chromosome name, start position, end position, name, score, strand, thickStart, thickEnd, and itemRgb. The script also replaces any spaces in the name with underscores to avoid UCSC complaints.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (4)

Complexity hotspot; lines 65 to 66 (total complexity: 3)
65 66
'try:' Ensure try blocks have corresponding except or finally blocks
66
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
114