PageRenderTime 22ms CodeModel.GetById 7ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/samtools/sam2interval.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 96 lines | 92 code | 3 blank | 1 comment | 0 complexity | f1b2daf2f7e80d203894ee2c7e59dbe2 MD5 | raw file
 1#!/usr/bin/env python
 2
 3import sys
 4import optparse
 5import re
 6
 7def stop_err( msg ):
 8    sys.stderr.write( msg )
 9    sys.exit()
10
11def main():
12    usage = """%prog [options]
13    
14options (listed below) default to 'None' if omitted
15    """
16    parser = optparse.OptionParser(usage=usage)
17
18    parser.add_option(
19        '-f','--input_sam_file',
20        metavar="INPUT_SAM_FILE",
21        dest='input_sam',
22        default = False,
23        help='Name of the SAM file to be filtered. STDIN is default')
24            
25    parser.add_option(
26        '-c','--flag_column',
27        dest='flag_col',
28        default = '2',
29        help='Column containing SAM bitwise flag. 1-based')
30        
31    parser.add_option(
32        '-s','--start_column',
33        dest='start_col',
34        default = '4',
35        help='Column containing position. 1-based')
36
37    parser.add_option(
38        '-g','--cigar_column',
39        dest='cigar_col',
40        default = '6',
41        help='Column containing CIGAR or extended CIGAR string')
42
43    parser.add_option(
44        '-r','--ref_column',
45        dest='ref_col',
46        default = '3',
47        help='Column containing name of the reference sequence coordinate. 1-based')
48        
49    parser.add_option(
50        '-e','--read_column',
51        dest='read_col',
52        default = '1',
53        help='Column containing read name. 1-based')
54
55    parser.add_option(
56        '-p','--print_all',
57        dest='prt_all',
58        action='store_true',
59        default = False,
60        help='Print coordinates and original SAM?')
61    
62    options, args = parser.parse_args()
63
64    if options.input_sam:
65        infile = open ( options.input_sam, 'r')
66    else:
67        infile = sys.stdin
68
69    cigar = re.compile( '\d+M|\d+N|\d+D|\d+P' )
70
71    print '#chrom\tstart\tend\tstrand\tread_name' # provide a (partial) header so that strand is automatically set in metadata
72
73    for line in infile:
74        line = line.rstrip( '\r\n' )
75        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
76            fields = line.split( '\t' )
77            start = int( fields[ int( options.start_col ) - 1 ] ) - 1
78            end = 0
79            for op in cigar.findall( fields[ int( options.cigar_col) - 1 ] ):
80                end += int( op[ 0:len( op ) - 1 ] )
81                
82            strand = '+' 
83            if bool( int( fields[ int( options.flag_col ) - 1 ] ) & 0x0010 ):
84                strand = '-'
85            read_name = fields[ int( options.read_col ) - 1 ]
86            ref_name  = fields[ int( options.ref_col ) - 1 ]
87            
88            if ref_name != '*':
89                # Do not print lines with unmapped reads that contain '*' instead of chromosome name        
90                if options.prt_all: 
91                    print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, line)
92                else:
93                    print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, read_name)
94
95if __name__ == "__main__": main()
96