PageRenderTime 33ms CodeModel.GetById 17ms app.highlight 12ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/metag_tools/blat_coverage_report.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 107 lines | 100 code | 6 blank | 1 comment | 8 complexity | 6594fb924f05b45df06ec572d8ce6d53 MD5 | raw file
  1#!/usr/bin/env python
  2
  3import os, sys
  4
  5assert sys.version_info[:2] >= ( 2, 4 )
  6
  7def stop_err( msg ):
  8    sys.stderr.write( "%s\n" % msg )
  9    sys.exit()
 10
 11def reverse_complement(s):
 12    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
 13    reversed_s = []
 14    for i in s:
 15        reversed_s.append(complement_dna[i])
 16    reversed_s.reverse()
 17    return "".join(reversed_s)
 18
 19def __main__():
 20    nuc_index = {'a':0,'t':1,'c':2,'g':3}
 21    diff_hash = {}    # key = (chrom, index)
 22    infile = sys.argv[1]
 23    outfile = sys.argv[2]
 24    invalid_lines = 0
 25    invalid_chars = 0
 26    data_id = ''
 27    data_seq = ''
 28
 29    for i, line in enumerate( open( infile ) ):
 30        line = line.rstrip( '\r\n' )
 31        if not line or line.startswith( '#' ):
 32            continue
 33        fields = line.split()
 34        if len(fields) != 23:    # standard number of pslx columns
 35            invalid_lines += 1
 36            continue
 37        if not fields[0].isdigit():
 38            invalid_lines += 1
 39            continue
 40        read_id = fields[9]
 41        chrom = fields[13]
 42        try:
 43            block_count = int(fields[17])
 44        except:
 45            invalid_lines += 1
 46            continue
 47        block_size = fields[18].split(',')
 48        read_start = fields[19].split(',')
 49        chrom_start = fields[20].split(',')
 50        read_seq = fields[21].split(',')
 51        chrom_seq = fields[22].split(',')
 52
 53        for j in range(block_count):
 54            try:
 55                this_block_size = int(block_size[j])
 56                this_read_start = int(read_start[j])
 57                this_chrom_start = int(chrom_start[j])
 58            except:
 59                invalid_lines += 1
 60                break
 61            this_read_seq = read_seq[j]
 62            this_chrom_seq = chrom_seq[j]
 63            
 64            if not this_read_seq.isalpha():
 65                continue
 66            if not this_chrom_seq.isalpha():
 67                continue
 68            
 69            # brut force to check coverage                
 70            for k in range(this_block_size):
 71                cur_index = this_chrom_start+k
 72                sub_a = this_read_seq[k:(k+1)].lower()
 73                sub_b = this_chrom_seq[k:(k+1)].lower()
 74                if not diff_hash.has_key((chrom, cur_index)):
 75                    try:
 76                        diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()]    # a, t, c, g, ref. nuc.
 77                    except Exception, e:
 78                        stop_err( str( e ) )
 79                if sub_a in ['a','t','c','g']:
 80                    diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1
 81                else:
 82                    invalid_chars += 1
 83                        
 84    outputfh = open(outfile, 'w')
 85    outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" )
 86    keys = diff_hash.keys()
 87    keys.sort()
 88    for i in keys:
 89        (chrom, location) = i
 90        sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ]    # did not include N's
 91        if sum == 0:
 92            continue
 93        ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum
 94        ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum
 95        ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum
 96        ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum
 97        (title_head, title_tail) = os.path.split(chrom)
 98        result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G ) 
 99        outputfh.write(result)
100    outputfh.close()
101
102    if invalid_lines:
103        print 'Skipped %d invalid lines. ' % ( invalid_lines )
104    if invalid_chars:
105        print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars)
106        
107if __name__ == '__main__': __main__()