/tools/indels/indel_analysis.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 227 lines · 173 code · 7 blank · 47 comment · 73 complexity · 3fb3a2ec9975bf037f9e32060f9cd7c6 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Given an input sam file, provides analysis of the indels..
  4. usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
  5. -i, --input=i: The sam file to analyze
  6. -t, --threshold=t: The deletion frequency threshold
  7. -I, --out_ins=I: The interval output file showing insertions
  8. -D, --out_del=D: The interval output file showing deletions
  9. """
  10. import re, sys
  11. from galaxy import eggs
  12. import pkg_resources; pkg_resources.require( "bx-python" )
  13. from bx.cookbook import doc_optparse
  14. def stop_err( msg ):
  15. sys.stderr.write( '%s\n' % msg )
  16. sys.exit()
  17. def add_to_mis_matches( mis_matches, pos, bases ):
  18. """
  19. Adds the bases and counts to the mis_matches dict
  20. """
  21. for j, base in enumerate( bases ):
  22. try:
  23. mis_matches[ pos + j ][ base ] += 1
  24. except KeyError:
  25. try:
  26. mis_matches[ pos + j ][ base ] = 1
  27. except KeyError:
  28. mis_matches[ pos + j ] = { base: 1 }
  29. def __main__():
  30. #Parse Command Line
  31. options, args = doc_optparse.parse( __doc__ )
  32. # prep output files
  33. out_ins = open( options.out_ins, 'wb' )
  34. out_del = open( options.out_del, 'wb' )
  35. # patterns
  36. pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
  37. pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
  38. # for tracking occurences at each pos of ref
  39. mis_matches = {}
  40. indels = {}
  41. multi_indel_lines = 0
  42. # go through all lines in input file
  43. for i,line in enumerate( open( options.input, 'rb' ) ):
  44. if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
  45. split_line = line.split( '\t' )
  46. chrom = split_line[2].strip()
  47. pos = int( split_line[3].strip() )
  48. cigar = split_line[5].strip()
  49. bases = split_line[9].strip()
  50. # if not an indel or match, exit
  51. if chrom == '*':
  52. continue
  53. # find matches like 3M2D7M or 7M3I10M
  54. match = {}
  55. m = pat.match( cigar )
  56. # unprocessable CIGAR
  57. if not m:
  58. m = pat_multi.match( cigar )
  59. # skip this line if no match
  60. if not m:
  61. continue
  62. # account for multiple indels or operations we don't process
  63. else:
  64. multi_indel_lines += 1
  65. # get matching parts for the indel or full match if matching
  66. else:
  67. if not mis_matches.has_key( chrom ):
  68. mis_matches[ chrom ] = {}
  69. indels[ chrom ] = { 'D': {}, 'I': {} }
  70. parts = m.groupdict()
  71. if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
  72. match = parts
  73. # see if matches meet filter requirements
  74. if match:
  75. # match/mismatch
  76. if parts[ 'match_width' ]:
  77. add_to_mis_matches( mis_matches[ chrom ], pos, bases )
  78. # indel
  79. else:
  80. # pieces of CIGAR string
  81. left = int( match[ 'lmatch' ] )
  82. middle = int( match[ 'ins_del_width' ] )
  83. right = int( match[ 'rmatch' ] )
  84. left_bases = bases[ : left ]
  85. if match[ 'ins_del' ] == 'I':
  86. middle_bases = bases[ left : left + middle ]
  87. else:
  88. middle_bases = ''
  89. right_bases = bases[ -right : ]
  90. start = pos + left
  91. # add data to ref_pos dict for match/mismatch bases on left and on right
  92. add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
  93. if match[ 'ins_del' ] == 'I':
  94. add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
  95. else:
  96. add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
  97. # for insertions, count instances of particular inserted bases
  98. if match[ 'ins_del' ] == 'I':
  99. if indels[ chrom ][ 'I' ].has_key( start ):
  100. try:
  101. indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
  102. except KeyError:
  103. indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
  104. else:
  105. indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
  106. # for deletions, count number of deletions bases
  107. else:
  108. if indels[ chrom ][ 'D' ].has_key( start ):
  109. try:
  110. indels[ chrom ][ 'D' ][ start ][ middle ] += 1
  111. except KeyError:
  112. indels[ chrom ][ 'D' ][ start ][ middle ] = 1
  113. else:
  114. indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
  115. # compute deletion frequencies and insertion frequencies for checking against threshold
  116. freqs = {}
  117. ins_freqs = {}
  118. chroms = mis_matches.keys()
  119. chroms.sort()
  120. for chrom in chroms:
  121. freqs[ chrom ] = {}
  122. ins_freqs[ chrom ] = {}
  123. poses = mis_matches[ chrom ].keys()
  124. poses.extend( indels[ chrom ][ 'D' ].keys() )
  125. poses.extend( indels[ chrom ][ 'I' ].keys() )
  126. poses = list( set( poses ) )
  127. for pos in poses:
  128. # all reads touching this particular position
  129. freqs[ chrom ][ pos ] = {}
  130. sum_counts = 0.0
  131. sum_counts_end = 0.0
  132. # get basic counts (match/mismatch)
  133. try:
  134. sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
  135. except KeyError:
  136. pass
  137. try:
  138. sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
  139. except KeyError:
  140. pass
  141. # add deletions also touching this position
  142. try:
  143. sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
  144. except KeyError:
  145. pass
  146. try:
  147. sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
  148. except KeyError:
  149. pass
  150. freqs[ chrom ][ pos ][ 'total' ] = sum_counts
  151. # calculate actual frequencies
  152. # deletions
  153. # frequencies for deletions
  154. try:
  155. for d in indels[ chrom ][ 'D' ][ pos ].keys():
  156. freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
  157. except KeyError:
  158. pass
  159. # frequencies for matches/mismatches
  160. try:
  161. for base in mis_matches[ chrom ][ pos ].keys():
  162. try:
  163. prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
  164. freqs[ chrom ][ pos ][ base ] = prop
  165. except ZeroDivisionError:
  166. freqs[ chrom ][ pos ][ base ] = 0.0
  167. except KeyError:
  168. pass
  169. # insertions
  170. try:
  171. for bases in indels[ chrom ][ 'I' ][ pos ].keys():
  172. prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
  173. try:
  174. prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
  175. except ZeroDivisionError:
  176. prop_end = 0.0
  177. try:
  178. ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
  179. except KeyError:
  180. ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
  181. except KeyError, e:
  182. pass
  183. # output to files if meet threshold requirement
  184. threshold = float( options.threshold )
  185. #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
  186. #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
  187. for chrom in chroms:
  188. # deletions file
  189. poses = indels[ chrom ][ 'D' ].keys()
  190. poses.sort()
  191. for pos in poses:
  192. start = pos
  193. dels = indels[ chrom ][ 'D' ][ start ].keys()
  194. dels.sort()
  195. for d in dels:
  196. end = start + d
  197. prop = freqs[ chrom ][ start ][ d ]
  198. if prop > threshold :
  199. out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
  200. # insertions file
  201. poses = indels[ chrom ][ 'I' ].keys()
  202. poses.sort()
  203. for pos in poses:
  204. start = pos
  205. end = pos + 1
  206. ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
  207. ins_bases.sort()
  208. for bases in ins_bases:
  209. prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
  210. prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
  211. if prop_start > threshold or prop_end > threshold:
  212. out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
  213. # close out files
  214. out_del.close()
  215. out_ins.close()
  216. # if skipped lines because of more than one indel, output message
  217. if multi_indel_lines > 0:
  218. sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
  219. if __name__=="__main__": __main__()