PageRenderTime 50ms CodeModel.GetById 13ms app.highlight 30ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/regVariation/getIndels.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 123 lines | 101 code | 9 blank | 13 comment | 11 complexity | 8ba383530dc9cf8ab9b0634b3461c174 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Estimate INDELs for pair-wise alignments.
  5
  6usage: %prog maf_input out_file1 out_file2
  7"""
  8
  9from __future__ import division
 10from galaxy import eggs
 11import pkg_resources 
 12pkg_resources.require( "bx-python" )
 13try:
 14    pkg_resources.require("numpy")
 15except:
 16    pass
 17import psyco_full
 18import sys
 19from bx.cookbook import doc_optparse
 20from galaxy.tools.exception_handling import *
 21import bx.align.maf
 22
 23assert sys.version_info[:2] >= ( 2, 4 )
 24
 25def main():   
 26    # Parsing Command Line here
 27    options, args = doc_optparse.parse( __doc__ )
 28    
 29    try:
 30        inp_file, out_file1 = args    
 31    except:
 32        print >> sys.stderr, "Tool initialization error."
 33        sys.exit()
 34    
 35    try:
 36        fin = open(inp_file,'r')
 37    except:
 38        print >> sys.stderr, "Unable to open input file"
 39        sys.exit()
 40    try:
 41        fout1 = open(out_file1,'w')
 42        #fout2 = open(out_file2,'w')
 43    except:
 44        print >> sys.stderr, "Unable to open output file"
 45        sys.exit()
 46
 47    try:
 48        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
 49    except:
 50        print >>sys.stderr, "Your MAF file appears to be malformed."
 51        sys.exit()
 52    maf_count = 0
 53    
 54    print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
 55    for block_ind, block in enumerate(maf_reader):
 56        if len(block.components) < 2:
 57            continue
 58        seq1 = block.components[0].text
 59        src1 = block.components[0].src
 60        start1 = block.components[0].start
 61        if len(block.components) == 2:
 62            seq2 = block.components[1].text
 63            src2 = block.components[1].src
 64            start2 = block.components[1].start
 65            #for pos in range(len(seq1)):
 66            nt_pos1 = start1-1    #position of the nucleotide (without counting gaps)
 67            nt_pos2 = start2-1
 68            pos = 0        #character column position
 69            gaplen1 = 0
 70            gaplen2 = 0
 71            prev_pos_gap1 = 0
 72            prev_pos_gap2 = 0
 73            while pos < len(seq1):
 74                if prev_pos_gap1 == 0:
 75                    gaplen1 = 0
 76                if prev_pos_gap2 == 0:
 77                    gaplen2 = 0
 78                    
 79                if seq1[pos] == '-':
 80                    if seq2[pos] != '-':
 81                        nt_pos2 += 1
 82                        gaplen1 += 1
 83                        prev_pos_gap1 = 1
 84                        #write 2
 85                        if prev_pos_gap2 == 1:
 86                            prev_pos_gap2 = 0
 87                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
 88                        if pos == len(seq1)-1:
 89                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
 90                    else:
 91                        prev_pos_gap1 = 0
 92                        prev_pos_gap2 = 0
 93                        """
 94                        if prev_pos_gap1 == 1:
 95                            prev_pos_gap1 = 0
 96                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
 97                        elif prev_pos_gap2 == 1:
 98                            prev_pos_gap2 = 0
 99                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
100                        """
101                else:
102                    nt_pos1 += 1
103                    if seq2[pos] != '-':
104                        nt_pos2 += 1
105                        #write both
106                        if prev_pos_gap1 == 1:
107                            prev_pos_gap1 = 0
108                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
109                        elif prev_pos_gap2 == 1:
110                            prev_pos_gap2 = 0
111                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
112                    else:
113                        gaplen2 += 1
114                        prev_pos_gap2 = 1
115                        #write 1
116                        if prev_pos_gap1 == 1:
117                            prev_pos_gap1 = 0
118                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
119                        if pos == len(seq1)-1:
120                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
121                pos += 1
122if __name__ == "__main__":
123    main()