sam2interval.py - This Python script takes a SAM file as in…

/tools/samtools/sam2interval.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 96 lines · 75 code · 19 blank · 2 comment · 12 complexity · f1b2daf2f7e80d203894ee2c7e59dbe2 MD5 · raw file


#!/usr/bin/env python

import sys
import optparse
import re

def stop_err( msg ):
    """Write *msg* to standard error and terminate with a failing status.

    The message is written verbatim (no newline is appended).  The process
    then exits by raising ``SystemExit`` with status 1.
    """
    sys.stderr.write( msg )
    # A bare sys.exit() reports status 0, which tells the calling shell or
    # workflow engine that the run succeeded; an error path must exit non-zero.
    sys.exit( 1 )

# CIGAR operations that advance along the reference sequence.  Per the SAM
# specification these are M, D, N, '=' and 'X'.  Insertions (I), clipping
# (S, H) and padding (P) do not consume reference bases, so they must not
# contribute to the end coordinate.
_REF_CONSUMING_OP = re.compile(r'(\d+)[MDN=X]')


def _reference_span(cigar_string):
    """Return the number of reference bases covered by *cigar_string*.

    A CIGAR with no reference-consuming operation (including '*') yields 0.
    """
    return sum(int(length) for length in _REF_CONSUMING_OP.findall(cigar_string))


def main():
    """Convert SAM alignments into tab-separated interval lines on stdout.

    Command-line options select the input file (standard input when ``-f`` is
    omitted) and the 1-based columns holding the flag, position, CIGAR,
    reference name and read name.  A header line is printed first.  For every
    alignment whose reference name is not '*', one line is printed with:
    reference name, 0-based start, end (start plus the reference span of the
    CIGAR), strand ('-' when flag bit 0x10 is set, otherwise '+'), and either
    the read name or, with ``-p``, the whole original SAM line.

    Blank lines and lines starting with '#' or '@' are skipped.
    """
    usage = """%prog [options]
    
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-f', '--input_sam_file',
        metavar="INPUT_SAM_FILE",
        dest='input_sam',
        default=False,
        help='Name of the SAM file to be filtered. STDIN is default')

    parser.add_option(
        '-c', '--flag_column',
        dest='flag_col',
        default='2',
        help='Column containing SAM bitwise flag. 1-based')

    parser.add_option(
        '-s', '--start_column',
        dest='start_col',
        default='4',
        help='Column containing position. 1-based')

    parser.add_option(
        '-g', '--cigar_column',
        dest='cigar_col',
        default='6',
        help='Column containing CIGAR or extended CIGAR string')

    parser.add_option(
        '-r', '--ref_column',
        dest='ref_col',
        default='3',
        help='Column containing name of the reference sequence coordinate. 1-based')

    parser.add_option(
        '-e', '--read_column',
        dest='read_col',
        default='1',
        help='Column containing read name. 1-based')

    parser.add_option(
        '-p', '--print_all',
        dest='prt_all',
        action='store_true',
        default=False,
        help='Print coordinates and original SAM?')

    options, args = parser.parse_args()

    # The column options are 1-based strings; convert them to 0-based list
    # indices once instead of on every input line.
    flag_idx = int(options.flag_col) - 1
    start_idx = int(options.start_col) - 1
    cigar_idx = int(options.cigar_col) - 1
    ref_idx = int(options.ref_col) - 1
    read_idx = int(options.read_col) - 1

    if options.input_sam:
        infile = open(options.input_sam, 'r')
    else:
        infile = sys.stdin

    try:
        # provide a (partial) header so that strand is automatically set in metadata
        sys.stdout.write('#chrom\tstart\tend\tstrand\tread_name\n')

        for line in infile:
            line = line.rstrip('\r\n')
            if not line or line.startswith(('#', '@')):
                continue

            fields = line.split('\t')

            ref_name = fields[ref_idx]
            if ref_name == '*':
                # Unmapped reads carry '*' instead of a chromosome name and
                # have no interval to report.
                continue

            # SAM positions are 1-based; the output start is 0-based.
            start = int(fields[start_idx]) - 1
            end = start + _reference_span(fields[cigar_idx])

            # Flag bit 0x10 marks a read aligned to the reverse strand.
            if int(fields[flag_idx]) & 0x0010:
                strand = '-'
            else:
                strand = '+'

            if options.prt_all:
                last_column = line
            else:
                last_column = fields[read_idx]

            sys.stdout.write('%s\t%s\t%s\t%s\t%s\n' % (
                ref_name, start, end, strand, last_column))
    finally:
        # Close the file we opened ourselves, but never close stdin.
        if infile is not sys.stdin:
            infile.close()

# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Summary ✨

This Python script takes a SAM file as input and outputs a tab-separated interval file (chromosome, start, end, strand, read name) with the coordinates of mapped reads. The 0-based start comes from the SAM position column, and a regular expression over the CIGAR string is used to compute the end position of each read. The script also filters out unmapped reads, which contain '*' instead of a chromosome name in the reference sequence column.

Tech Fingerprint

Alerts (2)

'def' Ensure functions have docstrings for documentation
7
Complexity hotspot; line 75 (total complexity: 3)
75