bed_to_gff_converter.py - This Python script converts a BED file to a GFF version 2 file

/lib/galaxy/datatypes/converters/bed_to_gff_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 73 lines · 67 code · 3 blank · 3 comment · 22 complexity · 784cb409f233a9aa1036937a7d36b388 MD5 · raw file


#!/usr/bin/env python
# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters
import sys

# Refuse to run on interpreters older than Python 2.4.  An explicit raise is
# used instead of a bare `assert` so the check is not stripped by `python -O`;
# the exception type is kept as AssertionError for backward compatibility.
if sys.version_info[:2] < ( 2, 4 ):
    raise AssertionError( "bed_to_gff_converter.py requires Python 2.4 or later" )

def _bed_fields_to_gff_records( elems, line_number ):
    """
    Build the GFF version 2 records for one tab-split BED line.

    elems       -- the columns of the BED line, as strings
    line_number -- 1-based line number of the BED line in the input; used to
                   synthesise 'feature<N>' / 'group<N>' names when the line
                   has no name column (column 4)

    Returns a list of complete GFF lines (each ending in a newline): one
    feature record, followed by one 'exon' record per block when the line has
    exactly 12 columns.

    Raises IndexError when the start/end columns or a counted block entry are
    missing, and ValueError when a coordinate, block count, block size or
    block start is not an integer.
    """
    complete_bed = len( elems ) == 12
    has_name = len( elems ) > 3
    chrom = elems[0]
    # BED starts are 0-based while GFF starts are 1-based, hence the +1;
    # the end coordinate is carried over unchanged.
    start = int( elems[1] ) + 1
    end = int( elems[2] )
    if complete_bed:
        feature = "mRNA"
    elif has_name:
        feature = elems[3]
    else:
        feature = 'feature%d' % line_number
    if has_name:
        group = elems[3]
    else:
        group = 'group%d' % line_number
    # Optional columns fall back to fixed defaults when absent.
    if len( elems ) > 4:
        score = elems[4]
    else:
        score = '0'
    if len( elems ) > 5:
        strand = elems[5]
    else:
        strand = '+'
    if not complete_bed:
        return [ '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) ]
    records = [ '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) ]
    # A 12-column line carries the block (exon) layout: column 10 is the
    # block count, columns 11 and 12 are comma-separated sizes and starts,
    # with the starts given relative to the feature start.
    block_count = int( elems[9] )
    block_sizes = elems[10].split( ',' )
    block_starts = elems[11].split( ',' )
    for j in range( block_count ):
        exon_start = start + int( block_starts[j] )
        exon_end = exon_start + int( block_sizes[j] ) - 1
        records.append( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) )
    return records


def __main__():
    """
    Convert the BED file named by sys.argv[1] into a GFF version 2 file
    written to sys.argv[2].

    The output starts with two '##' header lines.  Each input line that is
    not blank and does not start with '#', 'track' or 'browser' is converted
    to one GFF record (plus one 'exon' record per block for 12-column lines).
    Lines that cannot be parsed are skipped as a whole: nothing is written
    for them.  A one-line summary with the number of converted lines and,
    if any, the number of skipped lines and the line number of the first
    one is printed to standard output.

    I/O errors are not caught and propagate to the caller.
    """
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    converted_lines = 0
    skipped_lines = 0
    first_skipped_line = 0  # 0 means "nothing skipped yet"; real line numbers start at 1
    with open( output_name, 'w' ) as out:
        out.write( "##gff-version 2\n" )
        out.write( "##bed_to_gff_converter.py\n\n" )
        with open( input_name ) as bed:
            for line_number, line in enumerate( bed, 1 ):
                line = line.rstrip( '\r\n' )
                records = None
                if line and not line.startswith( ( '#', 'track', 'browser' ) ):
                    try:
                        # All records for the line are built before anything
                        # is written, so a line that fails half-way leaves
                        # no partial output behind.
                        records = _bed_fields_to_gff_records( line.split( '\t' ), line_number )
                    except ( IndexError, ValueError ):
                        records = None  # malformed line; counted as skipped below
                if records is None:
                    skipped_lines += 1
                    if not first_skipped_line:
                        first_skipped_line = line_number
                else:
                    out.writelines( records )
                    converted_lines += 1
    info_msg = "%i lines converted to GFF version 2.  " % converted_lines
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line )
    print( info_msg )

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()

Summary ✨

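This Python script converts a BED file into a GFF version 2 (GFF2) file; both are tab-delimited text formats for describing genomic features. The script takes two command-line arguments: the path of the input BED file and the path of the output GFF2 file. It reads the BED file line by line, skips blank lines, `#` comment lines, `track`/`browser` header lines, and lines that cannot be parsed, and converts every other line into a GFF2 record (a full 12-column BED line additionally yields one `exon` record per block). When finished, it prints the number of lines converted and, if any lines were skipped, how many were skipped and the line number of the first one.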

Tech Fingerprint

Standard Library: OS Interaction

Alerts (8)

'open(' Use 'with open()' to ensure Files are properly closed
12
Complexity hotspot; lines 19 to 20 (total complexity: 5)
19 20
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
30 36 40 44 59