/tools/regVariation/getIndels.py
Python | 123 lines | 101 code | 9 blank | 13 comment | 16 complexity | 8ba383530dc9cf8ab9b0634b3461c174 MD5 | raw file
- #!/usr/bin/env python
- """
- Estimate INDELs for pair-wise alignments.
- usage: %prog maf_input out_file1 out_file2
- """
- from __future__ import division
- from galaxy import eggs
- import pkg_resources
- pkg_resources.require( "bx-python" )
- try:
- pkg_resources.require("numpy")
- except:
- pass
- import psyco_full
- import sys
- from bx.cookbook import doc_optparse
- from galaxy.tools.exception_handling import *
- import bx.align.maf
- assert sys.version_info[:2] >= ( 2, 4 )
- def main():
- # Parsing Command Line here
- options, args = doc_optparse.parse( __doc__ )
-
- try:
- inp_file, out_file1 = args
- except:
- print >> sys.stderr, "Tool initialization error."
- sys.exit()
-
- try:
- fin = open(inp_file,'r')
- except:
- print >> sys.stderr, "Unable to open input file"
- sys.exit()
- try:
- fout1 = open(out_file1,'w')
- #fout2 = open(out_file2,'w')
- except:
- print >> sys.stderr, "Unable to open output file"
- sys.exit()
- try:
- maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
- except:
- print >>sys.stderr, "Your MAF file appears to be malformed."
- sys.exit()
- maf_count = 0
-
- print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
- for block_ind, block in enumerate(maf_reader):
- if len(block.components) < 2:
- continue
- seq1 = block.components[0].text
- src1 = block.components[0].src
- start1 = block.components[0].start
- if len(block.components) == 2:
- seq2 = block.components[1].text
- src2 = block.components[1].src
- start2 = block.components[1].start
- #for pos in range(len(seq1)):
- nt_pos1 = start1-1 #position of the nucleotide (without counting gaps)
- nt_pos2 = start2-1
- pos = 0 #character column position
- gaplen1 = 0
- gaplen2 = 0
- prev_pos_gap1 = 0
- prev_pos_gap2 = 0
- while pos < len(seq1):
- if prev_pos_gap1 == 0:
- gaplen1 = 0
- if prev_pos_gap2 == 0:
- gaplen2 = 0
-
- if seq1[pos] == '-':
- if seq2[pos] != '-':
- nt_pos2 += 1
- gaplen1 += 1
- prev_pos_gap1 = 1
- #write 2
- if prev_pos_gap2 == 1:
- prev_pos_gap2 = 0
- print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
- if pos == len(seq1)-1:
- print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
- else:
- prev_pos_gap1 = 0
- prev_pos_gap2 = 0
- """
- if prev_pos_gap1 == 1:
- prev_pos_gap1 = 0
- print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
- elif prev_pos_gap2 == 1:
- prev_pos_gap2 = 0
- print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
- """
- else:
- nt_pos1 += 1
- if seq2[pos] != '-':
- nt_pos2 += 1
- #write both
- if prev_pos_gap1 == 1:
- prev_pos_gap1 = 0
- print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
- elif prev_pos_gap2 == 1:
- prev_pos_gap2 = 0
- print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
- else:
- gaplen2 += 1
- prev_pos_gap2 = 1
- #write 1
- if prev_pos_gap1 == 1:
- prev_pos_gap1 = 0
- print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
- if pos == len(seq1)-1:
- print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
- pos += 1
- if __name__ == "__main__":
- main()