/tools/regVariation/substitutions.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 87 lines · 69 code · 13 blank · 5 comment · 18 complexity · f2dc7ae5ec0865e260dbee185438ff97 MD5 · raw file

  1. #!/usr/bin/env python
  2. #Guruprasad ANanda
  3. """
  4. Fetches substitutions from pairwise alignments.
  5. """
  6. from galaxy import eggs
  7. from galaxy.tools.util import maf_utilities
  8. import bx.align.maf
  9. import sys
  10. import os, fileinput
  11. def stop_err(msg):
  12. sys.stderr.write(msg)
  13. sys.exit()
  14. if len(sys.argv) < 3:
  15. stop_err("Incorrect number of arguments.")
  16. inp_file = sys.argv[1]
  17. out_file = sys.argv[2]
  18. fout = open(out_file, 'w')
  19. def fetchSubs(block):
  20. src1 = block.components[0].src
  21. sequence1 = block.components[0].text
  22. start1 = block.components[0].start
  23. end1 = block.components[0].end
  24. len1 = int(end1)-int(start1)
  25. len1_withgap = len(sequence1)
  26. for seq in range (1,len(block.components)):
  27. src2 = block.components[seq].src
  28. sequence2 = block.components[seq].text
  29. start2 = block.components[seq].start
  30. end2 = block.components[seq].end
  31. len2 = int(end2)-int(start2)
  32. sub_begin = None
  33. sub_end = None
  34. begin = False
  35. for nt in range(len1_withgap):
  36. if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
  37. if sequence1[nt].upper() != sequence2[nt].upper():
  38. if not(begin):
  39. sub_begin = nt
  40. begin = True
  41. sub_end = nt
  42. else:
  43. if begin:
  44. print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),start1+sub_end-sequence1[0:sub_end].count('-'))
  45. print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),start2+sub_end-sequence2[0:sub_end].count('-'))
  46. begin = False
  47. else:
  48. if begin:
  49. print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),end1+sub_end-sequence1[0:sub_end].count('-'))
  50. print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),end2+sub_end-sequence2[0:sub_end].count('-'))
  51. begin = False
  52. ended = False
  53. def main():
  54. skipped = 0
  55. not_pairwise = 0
  56. try:
  57. maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
  58. except:
  59. stop_err("Your MAF file appears to be malformed.")
  60. print >>fout, "#Chr\tStart\tEnd"
  61. for block in maf_reader:
  62. if len(block.components) != 2:
  63. not_pairwise += 1
  64. continue
  65. try:
  66. fetchSubs(block)
  67. except:
  68. skipped += 1
  69. if not_pairwise:
  70. print "Skipped %d non-pairwise blocks" %(not_pairwise)
  71. if skipped:
  72. print "Skipped %d blocks" %(skipped)
  73. if __name__ == "__main__":
  74. main()