/tools/samtools/sam2interval.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 96 lines · 75 code · 19 blank · 2 comment · 12 complexity · f1b2daf2f7e80d203894ee2c7e59dbe2 MD5 · raw file

  1. #!/usr/bin/env python
  2. import sys
  3. import optparse
  4. import re
  5. def stop_err( msg ):
  6. sys.stderr.write( msg )
  7. sys.exit()
  8. def main():
  9. usage = """%prog [options]
  10. options (listed below) default to 'None' if omitted
  11. """
  12. parser = optparse.OptionParser(usage=usage)
  13. parser.add_option(
  14. '-f','--input_sam_file',
  15. metavar="INPUT_SAM_FILE",
  16. dest='input_sam',
  17. default = False,
  18. help='Name of the SAM file to be filtered. STDIN is default')
  19. parser.add_option(
  20. '-c','--flag_column',
  21. dest='flag_col',
  22. default = '2',
  23. help='Column containing SAM bitwise flag. 1-based')
  24. parser.add_option(
  25. '-s','--start_column',
  26. dest='start_col',
  27. default = '4',
  28. help='Column containing position. 1-based')
  29. parser.add_option(
  30. '-g','--cigar_column',
  31. dest='cigar_col',
  32. default = '6',
  33. help='Column containing CIGAR or extended CIGAR string')
  34. parser.add_option(
  35. '-r','--ref_column',
  36. dest='ref_col',
  37. default = '3',
  38. help='Column containing name of the reference sequence coordinate. 1-based')
  39. parser.add_option(
  40. '-e','--read_column',
  41. dest='read_col',
  42. default = '1',
  43. help='Column containing read name. 1-based')
  44. parser.add_option(
  45. '-p','--print_all',
  46. dest='prt_all',
  47. action='store_true',
  48. default = False,
  49. help='Print coordinates and original SAM?')
  50. options, args = parser.parse_args()
  51. if options.input_sam:
  52. infile = open ( options.input_sam, 'r')
  53. else:
  54. infile = sys.stdin
  55. cigar = re.compile( '\d+M|\d+N|\d+D|\d+P' )
  56. print '#chrom\tstart\tend\tstrand\tread_name' # provide a (partial) header so that strand is automatically set in metadata
  57. for line in infile:
  58. line = line.rstrip( '\r\n' )
  59. if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
  60. fields = line.split( '\t' )
  61. start = int( fields[ int( options.start_col ) - 1 ] ) - 1
  62. end = 0
  63. for op in cigar.findall( fields[ int( options.cigar_col) - 1 ] ):
  64. end += int( op[ 0:len( op ) - 1 ] )
  65. strand = '+'
  66. if bool( int( fields[ int( options.flag_col ) - 1 ] ) & 0x0010 ):
  67. strand = '-'
  68. read_name = fields[ int( options.read_col ) - 1 ]
  69. ref_name = fields[ int( options.ref_col ) - 1 ]
  70. if ref_name != '*':
  71. # Do not print lines with unmapped reads that contain '*' instead of chromosome name
  72. if options.prt_all:
  73. print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, line)
  74. else:
  75. print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, read_name)
  76. if __name__ == "__main__": main()