PageRenderTime 45ms CodeModel.GetById 19ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/filters/gtf_to_bedgraph_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 80 lines | 69 code | 6 blank | 5 comment | 11 complexity | fffc489c0adff4d8a42850499e31652c MD5 | raw file
 1#!/usr/bin/env python
 2import os, sys, tempfile
 3
 4assert sys.version_info[:2] >= ( 2, 4 )
 5
 6def __main__():
 7    # Read parms.
 8    input_name = sys.argv[1]
 9    output_name = sys.argv[2]
10    attribute_name = sys.argv[3]
11    
12    # Create temp files.
13    tmp_name1 = tempfile.NamedTemporaryFile().name
14    tmp_name2 = tempfile.NamedTemporaryFile().name
15    
16    # Do conversion.
17    skipped_lines = 0
18    first_skipped_line = 0
19    out = open( tmp_name1, 'w' )
20    
21    # Write track data to temporary file.
22    i = 0
23    for i, line in enumerate( file( input_name ) ):
24        line = line.rstrip( '\r\n' )
25        
26        if line and not line.startswith( '#' ):
27            try:
28                elems = line.split( '\t' )
29                start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based.
30                strand = elems[6]
31                if strand not in ['+', '-']:
32                    strand = '+'
33                attributes_list = elems[8].split(";")
34                attributes = {}
35                for name_value_pair in attributes_list:
36                    pair = name_value_pair.strip().split(" ")
37                    name = pair[0].strip()
38                    if name == '':
39                        continue
40                    # Need to strip double quote from values
41                    value = pair[1].strip(" \"")
42                    attributes[name] = value
43                value = attributes[ attribute_name ]
44                # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
45                # BedGraph format: chrom, chromStart, chromEnd, value
46                out.write( "%s\t%s\t%s\t%s\n" %( elems[0], start, elems[4], value ) )
47            except:
48                skipped_lines += 1
49                if not first_skipped_line:
50                    first_skipped_line = i + 1
51        else:
52            skipped_lines += 1
53            if not first_skipped_line:
54                first_skipped_line = i + 1
55    out.close()
56    
57    # Sort tmp file by chromosome name and chromosome start to create ordered track data.
58    cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 )
59    try:
60        os.system(cmd)
61        os.remove(tmp_name1)
62    except Exception, ex:
63        sys.stderr.write( "%s\n" % ex )
64        sys.exit(1)
65        
66    # Create bedgraph file by combining track definition with ordered track data.
67    cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name )
68    try:
69        os.system(cmd)
70        os.remove(tmp_name2)
71    except Exception, ex:
72        sys.stderr.write( "%s\n" % ex )
73        sys.exit(1)
74    
75    info_msg = "%i lines converted to BEDGraph.  " % ( i + 1 - skipped_lines )
76    if skipped_lines > 0:
77        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
78    print info_msg
79
80if __name__ == "__main__": __main__()