/tools/metag_tools/blat_coverage_report.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 107 lines · 93 code · 12 blank · 2 comment · 22 complexity · 6594fb924f05b45df06ec572d8ce6d53 MD5 · raw file

  1. #!/usr/bin/env python
  2. import os, sys
  3. assert sys.version_info[:2] >= ( 2, 4 )
  4. def stop_err( msg ):
  5. sys.stderr.write( "%s\n" % msg )
  6. sys.exit()
  7. def reverse_complement(s):
  8. complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
  9. reversed_s = []
  10. for i in s:
  11. reversed_s.append(complement_dna[i])
  12. reversed_s.reverse()
  13. return "".join(reversed_s)
  14. def __main__():
  15. nuc_index = {'a':0,'t':1,'c':2,'g':3}
  16. diff_hash = {} # key = (chrom, index)
  17. infile = sys.argv[1]
  18. outfile = sys.argv[2]
  19. invalid_lines = 0
  20. invalid_chars = 0
  21. data_id = ''
  22. data_seq = ''
  23. for i, line in enumerate( open( infile ) ):
  24. line = line.rstrip( '\r\n' )
  25. if not line or line.startswith( '#' ):
  26. continue
  27. fields = line.split()
  28. if len(fields) != 23: # standard number of pslx columns
  29. invalid_lines += 1
  30. continue
  31. if not fields[0].isdigit():
  32. invalid_lines += 1
  33. continue
  34. read_id = fields[9]
  35. chrom = fields[13]
  36. try:
  37. block_count = int(fields[17])
  38. except:
  39. invalid_lines += 1
  40. continue
  41. block_size = fields[18].split(',')
  42. read_start = fields[19].split(',')
  43. chrom_start = fields[20].split(',')
  44. read_seq = fields[21].split(',')
  45. chrom_seq = fields[22].split(',')
  46. for j in range(block_count):
  47. try:
  48. this_block_size = int(block_size[j])
  49. this_read_start = int(read_start[j])
  50. this_chrom_start = int(chrom_start[j])
  51. except:
  52. invalid_lines += 1
  53. break
  54. this_read_seq = read_seq[j]
  55. this_chrom_seq = chrom_seq[j]
  56. if not this_read_seq.isalpha():
  57. continue
  58. if not this_chrom_seq.isalpha():
  59. continue
  60. # brut force to check coverage
  61. for k in range(this_block_size):
  62. cur_index = this_chrom_start+k
  63. sub_a = this_read_seq[k:(k+1)].lower()
  64. sub_b = this_chrom_seq[k:(k+1)].lower()
  65. if not diff_hash.has_key((chrom, cur_index)):
  66. try:
  67. diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()] # a, t, c, g, ref. nuc.
  68. except Exception, e:
  69. stop_err( str( e ) )
  70. if sub_a in ['a','t','c','g']:
  71. diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1
  72. else:
  73. invalid_chars += 1
  74. outputfh = open(outfile, 'w')
  75. outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" )
  76. keys = diff_hash.keys()
  77. keys.sort()
  78. for i in keys:
  79. (chrom, location) = i
  80. sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ] # did not include N's
  81. if sum == 0:
  82. continue
  83. ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum
  84. ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum
  85. ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum
  86. ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum
  87. (title_head, title_tail) = os.path.split(chrom)
  88. result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G )
  89. outputfh.write(result)
  90. outputfh.close()
  91. if invalid_lines:
  92. print 'Skipped %d invalid lines. ' % ( invalid_lines )
  93. if invalid_chars:
  94. print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars)
  95. if __name__ == '__main__': __main__()