PageRenderTime 35ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/indels/indel_sam2interval.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 161 lines | 125 code | 9 blank | 27 comment | 36 complexity | 01984c112db9dfd2aaa4374cbb4a8b35 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. Allows user to filter out non-indels from SAM.
  4. usage: %prog [options]
  5. -i, --input=i: The input SAM file
  6. -u, --include_base=u: Whether or not to include the base for insertions
  7. -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown
  8. -o, --int_out=o: The interval output file for the converted SAM file
  9. -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
  10. -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
  11. """
  12. import re, sys
  13. from galaxy import eggs
  14. import pkg_resources; pkg_resources.require( "bx-python" )
  15. from bx.cookbook import doc_optparse
  16. def stop_err( msg ):
  17. sys.stderr.write( '%s\n' % msg )
  18. sys.exit()
  19. def numeric_sort( text1, text2 ):
  20. """
  21. For two items containing space-separated text, compares equivalent pieces
  22. numerically if both numeric or as text otherwise
  23. """
  24. pieces1 = text1.split()
  25. pieces2 = text2.split()
  26. if len( pieces1 ) == 0:
  27. return 1
  28. if len( pieces2 ) == 0:
  29. return -1
  30. for i, pc1 in enumerate( pieces1 ):
  31. if i == len( pieces2 ):
  32. return 1
  33. if not pieces2[i].isdigit():
  34. if pc1.isdigit():
  35. return -1
  36. else:
  37. if pc1 > pieces2[i]:
  38. return 1
  39. elif pc1 < pieces2[i]:
  40. return -1
  41. else:
  42. if not pc1.isdigit():
  43. return 1
  44. else:
  45. if int( pc1 ) > int( pieces2[i] ):
  46. return 1
  47. elif int( pc1 ) < int( pieces2[i] ):
  48. return -1
  49. if i < len( pieces2 ) - 1:
  50. return -1
  51. return 0
  52. def __main__():
  53. #Parse Command Line
  54. options, args = doc_optparse.parse( __doc__ )
  55. # open up output files
  56. output = open( options.int_out, 'wb' )
  57. if options.bed_ins_out != 'None':
  58. output_bed_ins = open( options.bed_ins_out, 'wb' )
  59. else:
  60. output_bed_ins = None
  61. if options.bed_del_out != 'None':
  62. output_bed_del = open( options.bed_del_out, 'wb' )
  63. else:
  64. output_bed_del = None
  65. # the pattern to match, assuming just one indel per cigar string
  66. pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
  67. pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
  68. # go through all lines in input file
  69. out_data = {}
  70. multi_indel_lines = 0
  71. for line in open( options.input, 'rb' ):
  72. if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
  73. split_line = line.split( '\t' )
  74. if split_line < 12:
  75. continue
  76. # grab relevant pieces
  77. cigar = split_line[5].strip()
  78. pos = int( split_line[3] )
  79. chr = split_line[2]
  80. base_string = split_line[9]
  81. # parse cigar string
  82. m = pat_indel.match( cigar )
  83. if not m:
  84. m = pat_multi.match( cigar )
  85. # skip this line if no match
  86. if not m:
  87. continue
  88. # account for multiple indels or operations we don't process
  89. else:
  90. multi_indel_lines += 1
  91. continue
  92. else:
  93. match = m.groupdict()
  94. left = int( match[ 'lmatch' ] )
  95. middle = int( match[ 'ins_del_width' ] )
  96. middle_type = match[ 'ins_del' ]
  97. bases = base_string[ left : left + middle ]
  98. # calculate start and end positions, and output to insertion or deletion file
  99. start = left + pos
  100. if middle_type == 'D':
  101. end = start + middle
  102. data = [ chr, start, end, 'D' ]
  103. if options.include_base == "true":
  104. data.append( '-' )
  105. else:
  106. end = start + 1
  107. data = [ chr, start, end, 'I' ]
  108. if options.include_base == "true":
  109. data.append( bases )
  110. location = '\t'.join( [ '%s' % d for d in data ] )
  111. try:
  112. out_data[ location ] += 1
  113. except KeyError:
  114. out_data[ location ] = 1
  115. # output to interval file
  116. # get all locations and sort
  117. locations = out_data.keys()
  118. locations.sort( numeric_sort )
  119. last_line = ''
  120. # output each location, either with counts or each occurrence
  121. for loc in locations:
  122. sp_loc = loc.split( '\t' )
  123. cur_line = '\t'.join( sp_loc[:3] )
  124. if options.collapse == 'true':
  125. output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
  126. if output_bed_del and sp_loc[3] == 'D':
  127. output_bed_del.write( '%s\n' % cur_line )
  128. if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
  129. output_bed_ins.write( '%s\n' % cur_line )
  130. last_line = cur_line
  131. else:
  132. for i in range( out_data[ loc ] ):
  133. output.write( '%s\n' % loc )
  134. if output_bed_del or output_bed_ins:
  135. if output_bed_del and sp_loc[3] == 'D':
  136. output_bed_del.write( '%s\n' % cur_line )
  137. if output_bed_ins and sp_loc[3] == 'I':
  138. output_bed_ins.write( '%s\n' % cur_line )
  139. # cleanup, close files
  140. if output_bed_ins:
  141. output_bed_ins.close()
  142. if output_bed_del:
  143. output_bed_del.close()
  144. output.close()
  145. # if skipped lines because of more than one indel, output message
  146. if multi_indel_lines > 0:
  147. sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
  148. if __name__=="__main__": __main__()