/tools/regVariation/microsats_mutability.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 489 lines · 480 code · 6 blank · 3 comment · 31 complexity · 2266701224b961d46a7cef7392d6e42f MD5 · raw file

  1. #!/usr/bin/env python
  2. #Guruprasad Ananda
  3. """
  4. This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool.
  5. """
  6. from galaxy import eggs
  7. import sys, string, re, commands, tempfile, os, fileinput
  8. from galaxy.tools.util.galaxyops import *
  9. from bx.intervals.io import *
  10. from bx.intervals.operations import quicksect
  11. fout = open(sys.argv[2],'w')
  12. p_group = int(sys.argv[3]) #primary "group-by" feature
  13. p_bin_size = int(sys.argv[4])
  14. s_group = int(sys.argv[5]) #sub-group by feature
  15. s_bin_size = int(sys.argv[6])
  16. mono_threshold = 9
  17. non_mono_threshold = 4
  18. p_group_cols = [p_group, p_group+7]
  19. s_group_cols = [s_group, s_group+7]
  20. num_generations = int(sys.argv[7])
  21. region = sys.argv[8]
  22. int_file = sys.argv[9]
  23. if int_file != "None": #User has specified an interval file
  24. try:
  25. fint = open(int_file, 'r')
  26. dbkey_i = sys.argv[10]
  27. chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
  28. except:
  29. stop_err("Unable to open input Interval file")
  30. def stop_err(msg):
  31. sys.stderr.write(msg)
  32. sys.exit()
  33. def reverse_complement(text):
  34. DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
  35. comp = [ch for ch in text.translate(DNA_COMP)]
  36. comp.reverse()
  37. return "".join(comp)
  38. def get_unique_elems(elems):
  39. seen=set()
  40. return[x for x in elems if x not in seen and not seen.add(x)]
  41. def get_binned_lists(uniqlist, binsize):
  42. binnedlist=[]
  43. uniqlist.sort()
  44. start = int(uniqlist[0])
  45. bin_ind=0
  46. l_ind=0
  47. binnedlist.append([])
  48. while l_ind < len(uniqlist):
  49. elem = int(uniqlist[l_ind])
  50. if elem in range(start,start+binsize):
  51. binnedlist[bin_ind].append(elem)
  52. else:
  53. start += binsize
  54. bin_ind += 1
  55. binnedlist.append([])
  56. binnedlist[bin_ind].append(elem)
  57. l_ind += 1
  58. return binnedlist
  59. def fetch_weight(H,C,t):
  60. if (H-(C-H)) < t:
  61. return 2.0
  62. else:
  63. return 1.0
  64. def mutabilityEstimator(repeats1,repeats2,thresholds):
  65. mut_num = 0.0 #Mutability Numerator
  66. mut_den = 0.0 #Mutability denominator
  67. for ind,H in enumerate(repeats1):
  68. C = repeats2[ind]
  69. t = thresholds[ind]
  70. w = fetch_weight(H,C,t)
  71. mut_num += ((H-C)*(H-C)*w)
  72. mut_den += w
  73. return [mut_num, mut_den]
  74. def output_writer(blk, blk_lines):
  75. global winspecies, speciesind
  76. all_elems_1=[]
  77. all_elems_2=[]
  78. all_s_elems_1=[]
  79. all_s_elems_2=[]
  80. for bline in blk_lines:
  81. if not(bline):
  82. continue
  83. items = bline.split('\t')
  84. seq1 = items[1]
  85. start1 = items[2]
  86. end1 = items[3]
  87. seq2 = items[8]
  88. start2 = items[9]
  89. end2 = items[10]
  90. if p_group_cols[0] == 6:
  91. items[p_group_cols[0]] = int(items[p_group_cols[0]])
  92. items[p_group_cols[1]] = int(items[p_group_cols[1]])
  93. if s_group_cols[0] == 6:
  94. items[s_group_cols[0]] = int(items[s_group_cols[0]])
  95. items[s_group_cols[1]] = int(items[s_group_cols[1]])
  96. all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1
  97. all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2
  98. if s_group_cols[0] != -1: #sub-group is not None
  99. all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1
  100. all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2
  101. uniq_elems_1 = get_unique_elems(all_elems_1)
  102. uniq_elems_2 = get_unique_elems(all_elems_2)
  103. if s_group_cols[0] != -1:
  104. uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
  105. uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
  106. mut1={}
  107. mut2={}
  108. count1 = {}
  109. count2 = {}
  110. """
  111. if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc)
  112. uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
  113. """
  114. if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number.
  115. uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
  116. uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)
  117. if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number.
  118. uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
  119. uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)
  120. for pitem1 in uniq_elems_1:
  121. #repeats1 = []
  122. #repeats2 = []
  123. thresholds = []
  124. if s_group_cols[0] != -1: #Sub-group by feature is not None
  125. for sitem1 in uniq_s_elems_1:
  126. repeats1 = []
  127. repeats2 = []
  128. if type(sitem1) == type(''):
  129. sitem1 = sitem1.strip()
  130. for bline in blk_lines:
  131. belems = bline.split('\t')
  132. if type(pitem1) == list:
  133. if p_group_cols[0] == 6:
  134. belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
  135. if belems[p_group_cols[0]] in pitem1:
  136. if belems[s_group_cols[0]]==sitem1:
  137. repeats1.append(int(belems[6]))
  138. repeats2.append(int(belems[13]))
  139. if belems[4] == 'mononucleotide':
  140. thresholds.append(mono_threshold)
  141. else:
  142. thresholds.append(non_mono_threshold)
  143. mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
  144. if region == 'align':
  145. count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
  146. else:
  147. if winspecies == 1:
  148. count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
  149. elif winspecies == 2:
  150. count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
  151. else:
  152. if type(sitem1) == list:
  153. if s_group_cols[0] == 6:
  154. belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
  155. if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
  156. repeats1.append(int(belems[6]))
  157. repeats2.append(int(belems[13]))
  158. if belems[4] == 'mononucleotide':
  159. thresholds.append(mono_threshold)
  160. else:
  161. thresholds.append(non_mono_threshold)
  162. mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
  163. if region == 'align':
  164. count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
  165. else:
  166. if winspecies == 1:
  167. count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
  168. elif winspecies == 2:
  169. count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
  170. else:
  171. if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
  172. repeats1.append(int(belems[6]))
  173. repeats2.append(int(belems[13]))
  174. if belems[4] == 'mononucleotide':
  175. thresholds.append(mono_threshold)
  176. else:
  177. thresholds.append(non_mono_threshold)
  178. mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
  179. if region == 'align':
  180. count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
  181. else:
  182. if winspecies == 1:
  183. count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
  184. elif winspecies == 2:
  185. count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
  186. else: #Sub-group by feature is None
  187. for bline in blk_lines:
  188. belems = bline.split('\t')
  189. if type(pitem1) == list:
  190. #print >>sys.stderr, "item: " + str(item1)
  191. if p_group_cols[0] == 6:
  192. belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
  193. if belems[p_group_cols[0]] in pitem1:
  194. repeats1.append(int(belems[6]))
  195. repeats2.append(int(belems[13]))
  196. if belems[4] == 'mononucleotide':
  197. thresholds.append(mono_threshold)
  198. else:
  199. thresholds.append(non_mono_threshold)
  200. else:
  201. if belems[p_group_cols[0]]==pitem1:
  202. repeats1.append(int(belems[6]))
  203. repeats2.append(int(belems[13]))
  204. if belems[4] == 'mononucleotide':
  205. thresholds.append(mono_threshold)
  206. else:
  207. thresholds.append(non_mono_threshold)
  208. mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
  209. if region == 'align':
  210. count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
  211. else:
  212. if winspecies == 1:
  213. count1[str(pitem1)]=sum(repeats1)
  214. elif winspecies == 2:
  215. count1[str(pitem1)]=sum(repeats2)
  216. for pitem2 in uniq_elems_2:
  217. #repeats1 = []
  218. #repeats2 = []
  219. thresholds = []
  220. if s_group_cols[0] != -1: #Sub-group by feature is not None
  221. for sitem2 in uniq_s_elems_2:
  222. repeats1 = []
  223. repeats2 = []
  224. if type(sitem2)==type(''):
  225. sitem2 = sitem2.strip()
  226. for bline in blk_lines:
  227. belems = bline.split('\t')
  228. if type(pitem2) == list:
  229. if p_group_cols[0] == 6:
  230. belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
  231. if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
  232. repeats2.append(int(belems[13]))
  233. repeats1.append(int(belems[6]))
  234. if belems[4] == 'mononucleotide':
  235. thresholds.append(mono_threshold)
  236. else:
  237. thresholds.append(non_mono_threshold)
  238. mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
  239. #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
  240. if region == 'align':
  241. count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
  242. else:
  243. if winspecies == 1:
  244. count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
  245. elif winspecies == 2:
  246. count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
  247. else:
  248. if type(sitem2) == list:
  249. if s_group_cols[0] == 6:
  250. belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
  251. if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
  252. repeats2.append(int(belems[13]))
  253. repeats1.append(int(belems[6]))
  254. if belems[4] == 'mononucleotide':
  255. thresholds.append(mono_threshold)
  256. else:
  257. thresholds.append(non_mono_threshold)
  258. mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
  259. if region == 'align':
  260. count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
  261. else:
  262. if winspecies == 1:
  263. count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
  264. elif winspecies == 2:
  265. count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
  266. else:
  267. if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
  268. repeats1.append(int(belems[13]))
  269. repeats2.append(int(belems[6]))
  270. if belems[4] == 'mononucleotide':
  271. thresholds.append(mono_threshold)
  272. else:
  273. thresholds.append(non_mono_threshold)
  274. mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
  275. if region == 'align':
  276. count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
  277. else:
  278. if winspecies == 1:
  279. count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
  280. elif winspecies == 2:
  281. count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
  282. else: #Sub-group by feature is None
  283. for bline in blk_lines:
  284. belems = bline.split('\t')
  285. if type(pitem2) == list:
  286. if p_group_cols[0] == 6:
  287. belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
  288. if belems[p_group_cols[1]] in pitem2:
  289. repeats2.append(int(belems[13]))
  290. repeats1.append(int(belems[6]))
  291. if belems[4] == 'mononucleotide':
  292. thresholds.append(mono_threshold)
  293. else:
  294. thresholds.append(non_mono_threshold)
  295. else:
  296. if belems[p_group_cols[1]]==pitem2:
  297. repeats2.append(int(belems[13]))
  298. repeats1.append(int(belems[6]))
  299. if belems[4] == 'mononucleotide':
  300. thresholds.append(mono_threshold)
  301. else:
  302. thresholds.append(non_mono_threshold)
  303. mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
  304. if region == 'align':
  305. count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
  306. else:
  307. if winspecies == 1:
  308. count2["%s" %(pitem2)]=sum(repeats2)
  309. elif winspecies == 2:
  310. count2["%s" %(pitem2)]=sum(repeats1)
  311. for key in mut1.keys():
  312. if key in mut2.keys():
  313. mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
  314. count = count1[key]
  315. del mut2[key]
  316. else:
  317. unit_found = False
  318. if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
  319. if p_group_cols[0] == 7:
  320. this,other = 0,1
  321. else:
  322. this,other = 1,0
  323. groups1 = key.split('\t')
  324. mutn = mut1[key][0]
  325. mutd = mut1[key][1]
  326. count = 0
  327. for key2 in mut2.keys():
  328. groups2 = key2.split('\t')
  329. if groups1[other] == groups2[other]:
  330. if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2:
  331. #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
  332. mutn += mut2[key2][0]
  333. mutd += mut2[key2][1]
  334. count += int(count2[key2])
  335. unit_found = True
  336. del mut2[key2]
  337. #break
  338. if unit_found:
  339. mut = mutn/mutd
  340. else:
  341. mut = mut1[key][0]/mut1[key][1]
  342. count = count1[key]
  343. mut = "%.2e" %(mut/num_generations)
  344. if region == 'align':
  345. print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
  346. elif region == 'win':
  347. fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
  348. fout.flush()
  349. #catch any remaining repeats, for instance if the orthologous position contained different repeat units
  350. for remaining_key in mut2.keys():
  351. mut = mut2[remaining_key][0]/mut2[remaining_key][1]
  352. mut = "%.2e" %(mut/num_generations)
  353. count = count2[remaining_key]
  354. if region == 'align':
  355. print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
  356. elif region == 'win':
  357. fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
  358. fout.flush()
  359. #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
  360. def counter(node, start, end, report_func):
  361. if start <= node.start < end and start < node.end <= end:
  362. report_func(node)
  363. if node.right:
  364. counter(node.right, start, end, report_func)
  365. if node.left:
  366. counter(node.left, start, end, report_func)
  367. elif node.start < start and node.right:
  368. counter(node.right, start, end, report_func)
  369. elif node.start >= end and node.left and node.left.maxend > start:
  370. counter(node.left, start, end, report_func)
  371. def main():
  372. infile = sys.argv[1]
  373. for i, line in enumerate( file ( infile )):
  374. line = line.rstrip('\r\n')
  375. if len( line )>0 and not line.startswith( '#' ):
  376. elems = line.split( '\t' )
  377. break
  378. if i == 30:
  379. break # Hopefully we'll never get here...
  380. if len( elems ) != 15:
  381. stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
  382. global winspecies, speciesind
  383. if region == 'win':
  384. if dbkey_i in elems[1]:
  385. winspecies = 1
  386. speciesind = 1
  387. elif dbkey_i in elems[8]:
  388. winspecies = 2
  389. speciesind = 8
  390. else:
  391. stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.")
  392. fin = open(infile, 'r')
  393. skipped = 0
  394. blk=0
  395. win=0
  396. linestr=""
  397. if region == 'win':
  398. msats = NiceReaderWrapper( fileinput.FileInput( infile ),
  399. chrom_col = speciesind,
  400. start_col = speciesind+1,
  401. end_col = speciesind+2,
  402. strand_col = -1,
  403. fix_strand = True)
  404. msatTree = quicksect.IntervalTree()
  405. for item in msats:
  406. if type( item ) is GenomicInterval:
  407. msatTree.insert( item, msats.linenum, item.fields )
  408. for iline in fint:
  409. try:
  410. iline = iline.rstrip('\r\n')
  411. if not(iline) or iline == "":
  412. continue
  413. ielems = iline.strip("\r\n").split('\t')
  414. ichr = ielems[chr_col_i]
  415. istart = int(ielems[start_col_i])
  416. iend = int(ielems[end_col_i])
  417. isrc = "%s.%s" %(dbkey_i,ichr)
  418. if isrc not in msatTree.chroms:
  419. continue
  420. result = []
  421. root = msatTree.chroms[isrc] #root node for the chrom
  422. counter(root, istart, iend, lambda node: result.append( node ))
  423. if not(result):
  424. continue
  425. tmpfile1 = tempfile.NamedTemporaryFile('wb+')
  426. for node in result:
  427. tmpfile1.write("%s\n" % "\t".join( node.other ))
  428. tmpfile1.seek(0)
  429. output_writer(iline, tmpfile1.readlines())
  430. except:
  431. skipped+=1
  432. if skipped:
  433. print "Skipped %d intervals as invalid." %(skipped)
  434. elif region == 'align':
  435. if s_group_cols[0] != -1:
  436. print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
  437. else:
  438. print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
  439. prev_bnum = -1
  440. try:
  441. for line in fin:
  442. line = line.strip("\r\n")
  443. if not(line) or line == "":
  444. continue
  445. elems = line.split('\t')
  446. try:
  447. assert int(elems[0])
  448. assert len(elems) == 15
  449. except:
  450. continue
  451. new_bnum = int(elems[0])
  452. if new_bnum != prev_bnum:
  453. if prev_bnum != -1:
  454. output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
  455. linestr = line + "\n"
  456. else:
  457. linestr += line
  458. linestr += "\n"
  459. prev_bnum = new_bnum
  460. output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
  461. except Exception, ea:
  462. print >>sys.stderr, ea
  463. skipped += 1
  464. if skipped:
  465. print "Skipped %d lines as invalid." %(skipped)
  466. if __name__ == "__main__":
  467. main()