PageRenderTime 39ms CodeModel.GetById 29ms app.highlight 8ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ngs_rna/filter_transcripts_via_tracking.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 70 lines | 49 code | 6 blank | 15 comment | 5 complexity | 34756a9ee27636884e723c75347a3633 MD5 | raw file
 1#!/usr/bin/env python
 2import os, sys, tempfile
 3
 4assert sys.version_info[:2] >= ( 2, 4 )
 5
 6def __main__():
 7    """
 8    Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts
 9    produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only
10    transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for
11    a particular sample.
12    """
13    # Read parms.
14    tracking_file_name = sys.argv[1]
15    transcripts_file_name = sys.argv[2]
16    output_file_name = sys.argv[3]
17    sample_number = int ( sys.argv[4] )
18
19    # Open files.
20    transcripts_file = open( transcripts_file_name, 'r' )
21    output_file = open( output_file_name, 'w' )
22    
23    # Read transcript IDs from tracking file.
24    transcript_ids = {}
25    for i, line in enumerate( file( tracking_file_name ) ) :
26        # Split line into elements. Line format is 
27        # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>]
28        line = line.rstrip( '\r\n' )
29        elems = line.split( '\t' )
30        
31        # Get transcript info.
32        if sample_number == 1:
33            transcript_info = elems[4]
34        elif sample_number == 2:
35            transcript_info = elems[5]
36        if not transcript_info.startswith('q'):
37            # No transcript for this sample.
38            continue
39        
40        # Get and store transcript id.
41        transcript_id = transcript_info.split('|')[1]
42        transcript_id = transcript_id.strip('"')
43        transcript_ids[transcript_id] = ""
44        
45    # Filter transcripts file using transcript_ids
46    for i, line in enumerate( file( transcripts_file_name ) ):
47        # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
48        elems = line.split( '\t' )
49        
50        # Get attributes.
51        attributes_list = elems[8].split(";")
52        attributes = {}
53        for name_value_pair in attributes_list:
54            pair = name_value_pair.strip().split(" ")
55            name = pair[0].strip()
56            if name == '':
57                continue
58            # Need to strip double quote from values
59            value = pair[1].strip(" \"")
60            attributes[name] = value
61            
62        # Get element's transcript id.
63        transcript_id = attributes['transcript_id']
64        if transcript_id in transcript_ids:
65            output_file.write(line)
66        
67    # Clean up.
68    output_file.close()
69    
70if __name__ == "__main__": __main__()