PageRenderTime 29ms CodeModel.GetById 15ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/datatypes/converters/bed_to_gff_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 73 lines | 67 code | 3 blank | 3 comment | 21 complexity | 784cb409f233a9aa1036937a7d36b388 MD5 | raw file
 1#!/usr/bin/env python
 2# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters
 3import sys
 4
 5assert sys.version_info[:2] >= ( 2, 4 )
 6
 7def __main__():
 8    input_name = sys.argv[1]
 9    output_name = sys.argv[2]
10    skipped_lines = 0
11    first_skipped_line = 0
12    out = open( output_name, 'w' )
13    out.write( "##gff-version 2\n" )
14    out.write( "##bed_to_gff_converter.py\n\n" )
15    i = 0
16    for i, line in enumerate( file( input_name ) ):
17        complete_bed = False
18        line = line.rstrip( '\r\n' )
19        if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ):
20            try:
21                elems = line.split( '\t' )
22                if len( elems ) == 12:
23                    complete_bed = True
24                chrom = elems[0]
25                if complete_bed:
26                    feature = "mRNA"
27                else:
28                    try:
29                        feature = elems[3]
30                    except:
31                        feature = 'feature%d' % ( i + 1 )
32                start = int( elems[1] ) + 1
33                end = int( elems[2] )
34                try:
35                    score = elems[4]
36                except:
37                    score = '0'
38                try:
39                    strand = elems[5]
40                except:
41                    strand = '+'
42                try:
43                    group = elems[3]
44                except:
45                    group = 'group%d' % ( i + 1 )
46                if complete_bed:
47                    out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group  ) )
48                else:
49                    out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group  ) )
50                if complete_bed:
51                    # We have all the info necessary to annotate exons for genes and mRNAs
52                    block_count = int( elems[9] )
53                    block_sizes = elems[10].split( ',' )
54                    block_starts = elems[11].split( ',' )
55                    for j in range( block_count ):
56                        exon_start = int( start ) + int( block_starts[j] )
57                        exon_end = exon_start + int( block_sizes[j] ) - 1
58                        out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) )
59            except:
60                skipped_lines += 1
61                if not first_skipped_line:
62                    first_skipped_line = i + 1
63        else:
64            skipped_lines += 1
65            if not first_skipped_line:
66                first_skipped_line = i + 1
67    out.close()
68    info_msg = "%i lines converted to GFF version 2.  " % ( i + 1 - skipped_lines )
69    if skipped_lines > 0:
70        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
71    print info_msg
72
# Run the conversion only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    __main__()