PageRenderTime 51ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/regVariation/getIndels.py

https://bitbucket.org/ialbert/galaxy-genetrack
Python | 123 lines | 101 code | 9 blank | 13 comment | 16 complexity | b1f500c99fd0ade8644a4b7202e23910 MD5 | raw file
  1. #!/usr/bin/python
  2. """
  3. Estimate INDELs for pair-wise alignments.
  4. usage: %prog maf_input out_file1 out_file2
  5. """
  6. from __future__ import division
  7. from galaxy import eggs
  8. import pkg_resources
  9. pkg_resources.require( "bx-python" )
  10. try:
  11. pkg_resources.require("numpy")
  12. except:
  13. pass
  14. import psyco_full
  15. import sys
  16. from bx.cookbook import doc_optparse
  17. from galaxy.tools.exception_handling import *
  18. import bx.align.maf
  19. assert sys.version_info[:2] >= ( 2, 4 )
  20. def main():
  21. # Parsing Command Line here
  22. options, args = doc_optparse.parse( __doc__ )
  23. try:
  24. inp_file, out_file1 = args
  25. except:
  26. print >> sys.stderr, "Tool initialization error."
  27. sys.exit()
  28. try:
  29. fin = open(inp_file,'r')
  30. except:
  31. print >> sys.stderr, "Unable to open input file"
  32. sys.exit()
  33. try:
  34. fout1 = open(out_file1,'w')
  35. #fout2 = open(out_file2,'w')
  36. except:
  37. print >> sys.stderr, "Unable to open output file"
  38. sys.exit()
  39. try:
  40. maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
  41. except:
  42. print >>sys.stderr, "Your MAF file appears to be malformed."
  43. sys.exit()
  44. maf_count = 0
  45. print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
  46. for block_ind, block in enumerate(maf_reader):
  47. if len(block.components) < 2:
  48. continue
  49. seq1 = block.components[0].text
  50. src1 = block.components[0].src
  51. start1 = block.components[0].start
  52. if len(block.components) == 2:
  53. seq2 = block.components[1].text
  54. src2 = block.components[1].src
  55. start2 = block.components[1].start
  56. #for pos in range(len(seq1)):
  57. nt_pos1 = start1-1 #position of the nucleotide (without counting gaps)
  58. nt_pos2 = start2-1
  59. pos = 0 #character column position
  60. gaplen1 = 0
  61. gaplen2 = 0
  62. prev_pos_gap1 = 0
  63. prev_pos_gap2 = 0
  64. while pos < len(seq1):
  65. if prev_pos_gap1 == 0:
  66. gaplen1 = 0
  67. if prev_pos_gap2 == 0:
  68. gaplen2 = 0
  69. if seq1[pos] == '-':
  70. if seq2[pos] != '-':
  71. nt_pos2 += 1
  72. gaplen1 += 1
  73. prev_pos_gap1 = 1
  74. #write 2
  75. if prev_pos_gap2 == 1:
  76. prev_pos_gap2 = 0
  77. print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
  78. if pos == len(seq1)-1:
  79. print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
  80. else:
  81. prev_pos_gap1 = 0
  82. prev_pos_gap2 = 0
  83. """
  84. if prev_pos_gap1 == 1:
  85. prev_pos_gap1 = 0
  86. print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
  87. elif prev_pos_gap2 == 1:
  88. prev_pos_gap2 = 0
  89. print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
  90. """
  91. else:
  92. nt_pos1 += 1
  93. if seq2[pos] != '-':
  94. nt_pos2 += 1
  95. #write both
  96. if prev_pos_gap1 == 1:
  97. prev_pos_gap1 = 0
  98. print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
  99. elif prev_pos_gap2 == 1:
  100. prev_pos_gap2 = 0
  101. print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
  102. else:
  103. gaplen2 += 1
  104. prev_pos_gap2 = 1
  105. #write 1
  106. if prev_pos_gap1 == 1:
  107. prev_pos_gap1 = 0
  108. print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
  109. if pos == len(seq1)-1:
  110. print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
  111. pos += 1
  112. if __name__ == "__main__":
  113. main()