PageRenderTime 50ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/regVariation/getIndelRates_3way.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 249 lines | 221 code | 25 blank | 3 comment | 65 complexity | c4f2f0d5ad6752264faa2638d35856cf MD5 | raw file
  1. #!/usr/bin/env python
  2. #Guruprasad Ananda
  3. from galaxy import eggs
  4. import pkg_resources
  5. pkg_resources.require( "bx-python" )
  6. import sys, os, tempfile
  7. import traceback
  8. import fileinput
  9. from warnings import warn
  10. from galaxy.tools.util.galaxyops import *
  11. from bx.intervals.io import *
  12. from bx.intervals.operations import quicksect
  13. def stop_err(msg):
  14. sys.stderr.write(msg)
  15. sys.exit()
  16. def counter(node, start, end, sort_col):
  17. global full, blk_len, blk_list
  18. if node.start < start:
  19. if node.right:
  20. counter(node.right, start, end, sort_col)
  21. elif start <= node.start <= end and start <= node.end <= end:
  22. full += 1
  23. if node.other[0] not in blk_list:
  24. blk_list.append(node.other[0])
  25. blk_len += int(node.other[sort_col+2])
  26. if node.left and node.left.maxend > start:
  27. counter(node.left, start, end, sort_col)
  28. if node.right:
  29. counter(node.right, start, end, sort_col)
  30. elif node.start > end:
  31. if node.left:
  32. counter(node.left, start, end, sort_col)
  33. infile = sys.argv[1]
  34. fout = open(sys.argv[2],'w')
  35. int_file = sys.argv[3]
  36. if int_file != "None": #User has specified an interval file
  37. try:
  38. fint = open(int_file, 'r')
  39. dbkey_i = sys.argv[4]
  40. chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
  41. except:
  42. stop_err("Unable to open input Interval file")
  43. def main():
  44. for i, line in enumerate( file ( infile )):
  45. line = line.rstrip('\r\n')
  46. if len( line )>0 and not line.startswith( '#' ):
  47. elems = line.split( '\t' )
  48. break
  49. if i == 30:
  50. break # Hopefully we'll never get here...
  51. if len( elems ) != 18:
  52. stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
  53. for i, line in enumerate( file ( infile )):
  54. line = line.rstrip('\r\n')
  55. elems = line.split('\t')
  56. try:
  57. assert int(elems[0])
  58. assert len(elems) == 18
  59. if int_file != "None":
  60. if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
  61. stop_err("The species build corresponding to your interval file is not present in the Indel file.")
  62. if dbkey_i in elems[3]:
  63. sort_col = 4
  64. elif dbkey_i in elems[8]:
  65. sort_col = 9
  66. elif dbkey_i in elems[13]:
  67. sort_col = 14
  68. else:
  69. species = []
  70. species.append( elems[3].split('.')[0] )
  71. species.append( elems[8].split('.')[0] )
  72. species.append( elems[13].split('.')[0] )
  73. sort_col = 0 #Based on block numbers
  74. break
  75. except:
  76. continue
  77. fin = open(infile, 'r')
  78. skipped = 0
  79. if int_file == "None":
  80. sorted_infile = tempfile.NamedTemporaryFile()
  81. cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
  82. try:
  83. os.system(cmdline)
  84. except:
  85. stop_err("Encountered error while sorting the input file.")
  86. print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2])
  87. prev_bnum = -1
  88. sorted_infile.seek(0)
  89. for line in sorted_infile.readlines():
  90. line = line.rstrip('\r\n')
  91. elems = line.split('\t')
  92. try:
  93. assert int(elems[0])
  94. assert len(elems) == 18
  95. new_bnum = int(elems[0])
  96. if new_bnum != prev_bnum:
  97. if prev_bnum != -1:
  98. irate = []
  99. drate = []
  100. for i,elem in enumerate(inserts):
  101. try:
  102. irate.append(str("%.2e" %(inserts[i]/blen[i])))
  103. except:
  104. irate.append('0')
  105. try:
  106. drate.append(str("%.2e" %(deletes[i]/blen[i])))
  107. except:
  108. drate.append('0')
  109. print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
  110. inserts = [0.0, 0.0, 0.0]
  111. deletes = [0.0, 0.0, 0.0]
  112. blen = []
  113. blen.append( int(elems[6]) )
  114. blen.append( int(elems[11]) )
  115. blen.append( int(elems[16]) )
  116. line_sp = elems[1].split('.')[0]
  117. sp_ind = species.index(line_sp)
  118. if elems[1].endswith('insert'):
  119. inserts[sp_ind] += 1
  120. elif elems[1].endswith('delete'):
  121. deletes[sp_ind] += 1
  122. prev_bnum = new_bnum
  123. except Exception, ei:
  124. #print >>sys.stderr, ei
  125. continue
  126. irate = []
  127. drate = []
  128. for i,elem in enumerate(inserts):
  129. try:
  130. irate.append(str("%.2e" %(inserts[i]/blen[i])))
  131. except:
  132. irate.append('0')
  133. try:
  134. drate.append(str("%.2e" %(deletes[i]/blen[i])))
  135. except:
  136. drate.append('0')
  137. print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
  138. sys.exit()
  139. inf = open(infile, 'r')
  140. start_met = False
  141. end_met = False
  142. sp_file = tempfile.NamedTemporaryFile()
  143. for n, line in enumerate(inf):
  144. line = line.rstrip('\r\n')
  145. elems = line.split('\t')
  146. try:
  147. assert int(elems[0])
  148. assert len(elems) == 18
  149. if dbkey_i not in elems[1]:
  150. if not(start_met):
  151. continue
  152. else:
  153. sp_end = n
  154. break
  155. else:
  156. print >>sp_file, line
  157. if not(start_met):
  158. start_met = True
  159. sp_start = n
  160. except:
  161. continue
  162. try:
  163. assert sp_end
  164. except:
  165. sp_end = n+1
  166. sp_file.seek(0)
  167. win = NiceReaderWrapper( fileinput.FileInput( int_file ),
  168. chrom_col=chr_col_i,
  169. start_col=start_col_i,
  170. end_col=end_col_i,
  171. strand_col=strand_col_i,
  172. fix_strand=True)
  173. indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
  174. chrom_col=1,
  175. start_col=sort_col,
  176. end_col=sort_col+1,
  177. strand_col=-1,
  178. fix_strand=True)
  179. indelTree = quicksect.IntervalTree()
  180. for item in indel:
  181. if type( item ) is GenomicInterval:
  182. indelTree.insert( item, indel.linenum, item.fields )
  183. result=[]
  184. global full, blk_len, blk_list
  185. for interval in win:
  186. if type( interval ) is Header:
  187. pass
  188. if type( interval ) is Comment:
  189. pass
  190. elif type( interval ) == GenomicInterval:
  191. chrom = interval.chrom
  192. start = int(interval.start)
  193. end = int(interval.end)
  194. if start > end:
  195. warn( "Interval start after end!" )
  196. ins_chr = "%s.%s_insert" %(dbkey_i,chrom)
  197. del_chr = "%s.%s_delete" %(dbkey_i,chrom)
  198. irate = 0
  199. drate = 0
  200. if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
  201. pass
  202. else:
  203. if ins_chr in indelTree.chroms:
  204. full = 0.0
  205. blk_len = 0
  206. blk_list = []
  207. root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree
  208. counter(root, start, end, sort_col)
  209. if blk_len:
  210. irate = full/blk_len
  211. if del_chr in indelTree.chroms:
  212. full = 0.0
  213. blk_len = 0
  214. blk_list = []
  215. root = indelTree.chroms[del_chr] #root node for the chrom insertion tree
  216. counter(root, start, end, sort_col)
  217. if blk_len:
  218. drate = full/blk_len
  219. interval.fields.append(str("%.2e" %irate))
  220. interval.fields.append(str("%.2e" %drate))
  221. print >>fout, "\t".join(interval.fields)
  222. fout.flush()
  223. if __name__ == "__main__":
  224. main()