PageRenderTime 72ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/regVariation/microsatellite_birthdeath.pl

https://bitbucket.org/chapmanb/galaxy-central/
Perl | 4333 lines | 2916 code | 609 blank | 808 comment | 702 complexity | d1bf4746bc2b8d229099df9b29475ffa MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. #!/usr/bin/perl -w
  2. use strict;
  3. use warnings;
  4. use Term::ANSIColor;
  5. use Pod::Checker;
  6. use File::Basename;
  7. use IO::Handle;
  8. use Cwd;
  9. use File::Path qw(make_path remove_tree);
  10. use File::Temp qw/ tempfile tempdir /;
  11. my $tdir = tempdir( CLEANUP => 1 );
  12. chdir $tdir;
  13. my $dir = getcwd;
  14. # print "current dit=$dir\n";
  15. #<STDIN>;
  16. use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags
  17. $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species
  18. $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags
  19. $mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG);
  20. use FileHandle;
  21. use IO::Handle; # 5.004 or higher
  22. #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
  23. #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
  24. #"10","0.8");
  25. my @ar = @ARGV;
  26. my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
  27. $SIMILARITY_THRESH=$SIMILARITY_THRESH/100;
  28. #########################
  29. $SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
  30. my $EDGE_DISTANCE = 10;
  31. my $COMPLEXITY_SUPPORT = 20;
  32. load_thresholds("9_10_12_12");
  33. my $FLANKINDEL_MAXTHRESH = 0.3;
  34. my $mono_flanksimplicityRepno=6;
  35. my $di_flanksimplicityRepno=10;
  36. my $prop_of_seq_allowedtoAT=0.5;
  37. my $prop_of_seq_allowedtoCG=0.66;
  38. #########################
  39. my $tspecies_set = $species_set;
  40. my %speciesReplacement = ();
  41. my %speciesReplacementTag = ();
  42. my %replacementArr= ();
  43. my %replacementArrTag= ();
  44. my %backReplacementArr= ();
  45. my %backReplacementArrTag= ();
  46. $tree_definition=~s/\s+//g;
  47. my $tree_definition_split = $tree_definition;
  48. $tree_definition_split =~ s/[\(\)]//g;
  49. my @gotSpecies = ($tree_definition_split =~ /(,)/g);
  50. # print "gotSpecies = @gotSpecies\n";
  51. if (scalar(@gotSpecies)+1 ==5){
  52. $speciesReplacement{1}="calJac1";
  53. $speciesReplacement{2}="rheMac2";
  54. $speciesReplacement{3}="ponAbe2";
  55. $speciesReplacement{4}="panTro2";
  56. $speciesReplacement{5}="hg18";
  57. $speciesReplacementTag{1}="M";
  58. $speciesReplacementTag{2}="R";
  59. $speciesReplacementTag{3}="O";
  60. $speciesReplacementTag{4}="C";
  61. $speciesReplacementTag{5}="H";
  62. $species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1";
  63. }
  64. if (scalar(@gotSpecies)+1 ==4){
  65. $speciesReplacement{1}="rheMac2";
  66. $speciesReplacement{2}="ponAbe2";
  67. $speciesReplacement{3}="panTro2";
  68. $speciesReplacement{4}="hg18";
  69. $speciesReplacementTag{1}="R";
  70. $speciesReplacementTag{2}="O";
  71. $speciesReplacementTag{3}="C";
  72. $speciesReplacementTag{4}="H";
  73. $species_set="hg18,panTro2,ponAbe2,rheMac2";
  74. }
  75. # $tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)";
  76. my $tree_definition_copy = $tree_definition;
  77. my $tree_definition_orig = $tree_definition;
  78. my $brackets = 0;
  79. while (1){
  80. #last if $tree_definition_copy !~ /\(/;
  81. $brackets++;
  82. # print "brackets = $brackets\n";
  83. last if $brackets > 6;
  84. $tree_definition_copy =~ s/^\(//g;
  85. $tree_definition_copy =~ s/\)$//g;
  86. # print "tree_definition_copy = $tree_definition_copy\n";
  87. my @arr = ();
  88. if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
  89. @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
  90. # print "arr = @arr\n";
  91. $tree_definition_copy = $2;
  92. $replacementArr{$1} = $speciesReplacement{$brackets};
  93. $backReplacementArr{$speciesReplacement{$brackets}}=$1;
  94. $replacementArrTag{$1} = $speciesReplacementTag{$brackets};
  95. $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1;
  96. # print "replacing $1 with $replacementArr{$1}\n";
  97. $sp_ident[$brackets-1] = $1;
  98. }
  99. elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
  100. @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
  101. # print "arr = @arr\n";
  102. $tree_definition_copy = $1;
  103. $replacementArr{$2} = $speciesReplacement{$brackets};
  104. $backReplacementArr{$speciesReplacement{$brackets}}=$2;
  105. $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
  106. $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
  107. # print "replacing $2 with $replacementArr{$2}\n";
  108. $sp_ident[$brackets-1] = $2;
  109. }
  110. elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
  111. @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/;
  112. # print "arr = @arr .. TERMINAL\n";
  113. $tree_definition_copy = $1;
  114. $replacementArr{$2} = $speciesReplacement{$brackets};
  115. $replacementArr{$1} = $speciesReplacement{$brackets+1};
  116. $backReplacementArr{$speciesReplacement{$brackets}}=$2;
  117. $backReplacementArr{$speciesReplacement{$brackets+1}}=$1;
  118. $replacementArrTag{$1} = $speciesReplacementTag{$brackets+1};
  119. $backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1;
  120. $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
  121. $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
  122. # print "replacing $1 with $replacementArr{$1}\n";
  123. # print "replacing $2 with $replacementArr{$2}\n";
  124. # print "replacing $1 with $replacementArrTag{$1}\n";
  125. # print "replacing $2 with $replacementArrTag{$2}\n";
  126. $sp_ident[$brackets-1] = $2;
  127. $sp_ident[$brackets] = $1;
  128. last;
  129. }
  130. elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){
  131. $tree_definition_copy =~ s/^\(//g;
  132. $tree_definition_copy =~ s/\)$//g;
  133. $brackets--;
  134. }
  135. }
  136. foreach my $key (keys %replacementArr){
  137. my $replacement = $replacementArr{$key};
  138. $tree_definition =~ s/$key/$replacement/g;
  139. }
  140. @sp_ident = reverse(@sp_ident);
  141. # print "modified tree_definition = $tree_definition\n";
  142. # print "done .. tree_definition = $tree_definition\n";
  143. # print "sp_ident = @sp_ident\n";
  144. #<STDIN>;
  145. my $complexity=int($COMPLEXITY_SUPPORT * (1/40));
  146. #print "complexity=$complexity\n";
  147. #<STDIN>;
  148. $printer = 1;
  149. my $rando = int(rand(1000));
  150. my $localdate = `date`;
  151. $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
  152. my $info = $rando.$1.$2.$3;
  153. #---------------------------------------------------------------------------
  154. # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES
  155. my @thresharr = (0, split(/,/,$thresholds));
  156. my $randno=int(rand(100000));
  157. my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
  158. my $megamatchlck = $megamatch.".lck";
  159. unlink $megamatchlck;
  160. my $selected= $orth;
  161. #my $eventfile = $orth;
  162. $selected = $selected."_SELECTED";
  163. #$selected = $selected."_".$SIMILARITY_THRESH;
  164. #my $runtime = $selected.".runtime";
  165. my $inputtags = "H:C:O:R:M";
  166. $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;
  167. my @all_tags = split(/:/, $inputtags);
  168. my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
  169. $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
  170. #@sp_ident = split(/:/,$inputsp);
  171. my $junkfile = $orth."_junk";
  172. my $sh = load_sameHash(1);
  173. my $rh = load_revHash(1);
  174. #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";}
  175. #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
  176. open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
  177. #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
  178. #my $ctlfile = "baseml\.ctl"; #$ARGV[4];
  179. #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK
  180. my %registeredTrees = ();
  181. my @removalReasons =
  182. ("microsatellite is compound",
  183. "complex structure",
  184. "if no. if micros is more than no. of species",
  185. "if more than one micro per species ",
  186. "if microsat contains N",
  187. "different motif than required ",
  188. "more than zero interruptions",
  189. "microsat could not form key ",
  190. "orthologous microsats of different motif size ",
  191. "orthologous microsats of different motifs ",
  192. "microsats belong to different alignment blocks altogether",
  193. "microsat near edge",
  194. "microsat in low complexity region",
  195. "microsat flanks dont align well",
  196. "phylogeny not informative");
  197. my %allowedhash=();
  198. #---------------------------------------------------------------------------
  199. # WORKING ON MAKING THE MEGAMATCH FILE
  200. my $chromt=int(rand(10000));
  201. my $p_chr=$chromt;
  202. my $tree_definition_orig_copy = $tree_definition_orig;
  203. $tree_definition=~s/,/, /g;
  204. $tree_definition =~ s/, +/, /g;
  205. $tree_definition_orig=~s/,/, /g;
  206. $tree_definition_orig =~ s/, +/, /g;
  207. my @exactspeciesset_unarranged = split(/,/,$species_set);
  208. my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set);
  209. my $largesttree = "$tree_definition;";
  210. my $largesttree_orig = "$tree_definition_orig;";
  211. # print "largesttree = $largesttree\n";
  212. $tree_definition =~ s/\(//g;
  213. $tree_definition =~ s/\)//g;
  214. $tree_definition=~s/[\)\(, ]/\t/g;
  215. $tree_definition =~ s/\t+/\t/g;
  216. $tree_definition_orig =~ s/\(//g;
  217. $tree_definition_orig =~ s/\)//g;
  218. $tree_definition_orig =~s/[\)\(, ]/\t/g;
  219. $tree_definition_orig =~ s/\t+/\t/g;
  220. # print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n";
  221. my @treespecies=split(/\t+/,$tree_definition);
  222. my @treespecies_orig=split(/\t+/,$tree_definition_orig);
  223. # print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n";
  224. #<STDIN>;
  225. foreach my $spec (@treespecies){
  226. foreach my $espec (@exactspeciesset_unarranged){
  227. # print "spec=$spec and espec=$espec\n";
  228. push @exactspecies, $spec if $spec eq $espec;
  229. }
  230. }
  231. foreach my $spec (@treespecies_orig){
  232. foreach my $espec (@exactspeciesset_unarranged_orig){
  233. # print "spec=$spec and espec=$espec\n";
  234. push @exactspecies_orig, $spec if $spec eq $espec;
  235. }
  236. }
  237. my $focalspec = $exactspecies[0];
  238. my $focalspec_orig = $exactspecies_orig[0];
  239. # print "exactspecies=@exactspecies ... focalspec=$focalspec\n";
  240. # print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n";
  241. #<STDIN>;
  242. my $arranged_species_set = join(".",@exactspecies);
  243. my $arranged_species_set_orig = join(".",@exactspecies_orig);
  244. @exacttags=@exactspecies;
  245. my @exacttags_orig=@exactspecies_orig;
  246. foreach my $extag (@exacttags){
  247. $extag =~ s/hg18/H/g;
  248. $extag =~ s/panTro2/C/g;
  249. $extag =~ s/ponAbe2/O/g;
  250. $extag =~ s/rheMac2/R/g;
  251. $extag =~ s/calJac1/M/g;
  252. }
  253. foreach my $extag (@exacttags_orig){
  254. $extag =~ s/hg18/H/g;
  255. $extag =~ s/panTro2/C/g;
  256. $extag =~ s/ponAbe2/O/g;
  257. $extag =~ s/rheMac2/R/g;
  258. $extag =~ s/calJac1/M/g;
  259. }
  260. my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
  261. #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
  262. maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set);
  263. #print "made files\n";
  264. my @filterseqfiles= ($chr_name);
  265. $largesttree =~ s/hg18/H/g;
  266. $largesttree =~ s/panTro2/C/g;
  267. $largesttree =~ s/ponAbe2/O/g;
  268. $largesttree =~ s/rheMac2/R/g;
  269. $largesttree =~ s/calJac1/M/g;
  270. #<STDIN>;
  271. #---------------------------------------------------------------------------
  272. my ($lagestnodes, $largestbranches) = get_nodes($largesttree);
  273. shift (@$lagestnodes);
  274. my @extendedtitle=();
  275. my $title = ();
  276. my $parttitle = ();
  277. my @titlearr = ();
  278. my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold");
  279. my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details);
  280. my @fititle=();
  281. foreach my $spec (split(",",$tspecies_set)){
  282. push @fititle, $spec;
  283. foreach my $name (@finames){
  284. push @fititle, $spec.".".$name;
  285. }
  286. }
  287. my @othertitle=qw(somechr somestart somened event source);
  288. my @fnames = ();
  289. push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ;
  290. push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ;
  291. push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ;
  292. push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ;
  293. push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num);
  294. my @fullnames = ();
  295. # print "revising\n";
  296. # print "H = $backReplacementArrTag{H}\n";
  297. # print "C = $backReplacementArrTag{C}\n";
  298. # print "O = $backReplacementArrTag{O}\n";
  299. # print "R = $backReplacementArrTag{R}\n";
  300. # print "M = $backReplacementArrTag{M}\n";
  301. foreach my $lnode (@$lagestnodes){
  302. my @pair = @$lnode;
  303. my @nodemutarr = ();
  304. for my $p (@pair){
  305. # print "p = $p\n";
  306. $p =~ s/[\(\), ]+//g;
  307. $p =~ s/([A-Z])/$1./g;
  308. $p =~ s/\.$//g;
  309. $p =~ s/H/$backReplacementArrTag{H}/g;
  310. $p =~ s/C/$backReplacementArrTag{C}/g;
  311. $p =~ s/O/$backReplacementArrTag{O}/g;
  312. $p =~ s/R/$backReplacementArrTag{R}/g;
  313. $p =~ s/M/$backReplacementArrTag{M}/g;
  314. foreach my $n (@fnames) { push @fullnames, $p.".".$n;}
  315. }
  316. }
  317. #print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle);
  318. #print SUMMARY "\t",join("\t", @fullnames);
  319. my $header = join("\t",@firsttitle, @fititle, @othertitle, @fullnames, @fnames, "tree", "cleancase");
  320. # print "header= $header\n";
  321. #<STDIN>;
  322. #print SUMMARY "\t",join("\t", @fnames);
  323. #$title= $title."\t".join("\t", @fnames);
  324. #print SUMMARY "\t","tree","\t", "cleancase", "\n";
  325. #$title= $title."\t"."tree"."\t"."cleancase". "\n";
  326. #print $title; #<STDIN>;
  327. #print "all_tags = @all_tags\n";
  328. for my $no (3 ... $#all_tags+1){
  329. # print "no=$no\n"; #<STDIN>;
  330. @tags = @all_tags[0 ... $no-1];
  331. # print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>;
  332. %template=();
  333. my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  334. #next if scalar(@tags) < 4;
  335. # print "now doing tags = @tags, no = $no\n";
  336. open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!";
  337. # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion);
  338. ##################### T E M P O R A R Y #####################
  339. my @finaltitle=();
  340. my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13);
  341. my $endtitle = ();
  342. foreach my $tag (@tags){
  343. my @tempsingle = ();
  344. foreach my $single (@singletitle){
  345. push @tempsingle, $tag.$single;
  346. }
  347. @finaltitle = (@finaltitle, @tempsingle);
  348. }
  349. # print SUMMARY join("\t",@finaltitle),"\n";
  350. #############################################################
  351. #---------------------------------------------------------------------------
  352. # GET THE TREE FROM TREE FILE
  353. my $tree = ();
  354. $tree = "((H, C), O)" if $no == 3;
  355. $tree = "(((H, C), O), R)" if $no == 4;
  356. $tree = "((((H, C), O), R), M)" if $no == 5;
  357. # $tree=~s/;$//g;
  358. # print "our tree = $tree\n";
  359. #---------------------------------------------------------------------------
  360. # LOADING HASH CONTAINING ALL POSSIBLE TREES:
  361. $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt";
  362. %template=();
  363. %alternate=();
  364. load_allPossibleTrees($tree_decipherer, \%template, \%alternate);
  365. #---------------------------------------------------------------------------
  366. # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS
  367. %treesToReject=();
  368. %treesToIgnore=();
  369. load_treesToReject(@tags);
  370. load_treesToIgnore(@tags);
  371. #---------------------------------------------------------------------------
  372. # LOADING INPUT DATA INTO HASHES AND ARRAYS
  373. #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS
  374. #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF
  375. #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES).
  376. my %orths=();
  377. my $counterm = 0;
  378. my $loaded = 0;
  379. my %seen = ();
  380. my @allowedchrs = ();
  381. # print "no = $no\n"; #<STDIN>;
  382. while (my $line = <ORTH>){
  383. # print "line=$line\n";
  384. my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g;
  385. my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g;
  386. my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g;
  387. my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g;
  388. my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4];
  389. # print "line = $line\n"; <STDIN>;
  390. # next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags);
  391. my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros
  392. #print "micros=",printarr(@micros),"\n"; #<STDIN>;
  393. shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY
  394. $no_of_species = adjustCoordinates($micros[0]);
  395. # print "A: $no_of_species\n";
  396. next if $no_of_species != $no;
  397. # print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>;
  398. $counterm++;
  399. #------------------------------------------------
  400. $nextcounter[0]++ if $line =~ /compound/;
  401. next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS
  402. #------------------------------------------------
  403. #next if $line =~ /[A-Za-z]>[a-zA-Z]/;
  404. #------------------------------------------------
  405. chomp $line;
  406. my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER
  407. #print "number of species = $match_count\n";
  408. my $stopper = 0;
  409. foreach my $mic (@micros){
  410. my @local = split(/\t/,$mic);
  411. if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++;
  412. last; }
  413. # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM
  414. }
  415. next if $stopper ==1;
  416. #------------------------------------------------
  417. $nextcounter[2]++ if (scalar(@micros) >$no_of_species);
  418. next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET.
  419. #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY
  420. #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING
  421. #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD.
  422. #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl
  423. my $nexter = 0;
  424. foreach my $tag (@tags){
  425. my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g);
  426. if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1;
  427. next;
  428. }
  429. }
  430. if ($nexter == 1){
  431. $nextcounter[3]++;
  432. next;
  433. }
  434. #------------------------------------------------
  435. foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM
  436. my @local = split(/\t/,$mic);
  437. if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++;
  438. last;}
  439. }
  440. next if $stopper ==1;
  441. #print "till here 1\n"; #<STDIN>;
  442. #------------------------------------------------
  443. my @micros_copy = @micros;
  444. my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST
  445. #2 MICROSAT IN THE CLUSTER.
  446. my @tempfields = split(/\t/,$tempmicro);
  447. my $prevtype = $tempfields[$typecord];
  448. my $tempmotif = $tempfields[$motifcord];
  449. my $tempfirstmotif = ();
  450. if (scalar(@tempfields) > $microsatcord + 2){
  451. if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS
  452. #2 IN THE FIRST MICROSAT OF THE CLUSTER
  453. $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
  454. }
  455. }
  456. if ($nexter == 1){
  457. $nextcounter[6]++;
  458. next;
  459. } #1 DONE OBTAINING INFORMATION REGARDING
  460. #2 THE FIRST MICROSAT FROM THE CLUSTER
  461. if ($tempmotif =~ /^\[/){
  462. $tempmotif =~ s/^\[//g;
  463. $tempmotif =~ /([a-zA-Z]+)\].*/;
  464. $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT
  465. }
  466. else {$tempfirstmotif = $tempmotif;}
  467. my $prevmotif = $tempfirstmotif;
  468. my $key = ();
  469. # print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n";
  470. # print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>;
  471. if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  472. # print "B: $no_of_species\n";
  473. $key = join("_",$2, $3, $4, $5);
  474. }
  475. else{
  476. # print "counld not form a key for temp\n"; # if $printer == 1;
  477. $nextcounter[7]++;
  478. next;
  479. }
  480. #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT
  481. #2 THE FIRST MICROSAT IN THE CLUSTER, THE
  482. #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS
  483. #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW)
  484. foreach my $micro (@micros_copy){
  485. my @fields = split(/\t/,$micro);
  486. #-----------------
  487. if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS
  488. if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
  489. $nextcounter[6]++;
  490. last; }
  491. }
  492. #-----------------
  493. if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) {
  494. $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1;
  495. $nextcounter[8]++;
  496. last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
  497. #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.)
  498. $prevtype = $fields[$typecord];
  499. my $motif = $fields[$motifcord];
  500. my $firstmotif = ();
  501. if ($motif =~ /^\[/){
  502. $motif =~ s/^\[//g;
  503. $motif =~ /([a-zA-Z]+)\].*/;
  504. $firstmotif = $1;
  505. }
  506. else {$firstmotif = $motif;}
  507. my $motifpattern = $firstmotif.$firstmotif;
  508. my $prevmotifpattern = $prevmotif.$prevmotif;
  509. if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) {
  510. $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1;
  511. $nextcounter[9]++;
  512. last;
  513. } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
  514. #2 TO DIFFERENT MOTIFS
  515. my $prevmotif = $firstmotif;
  516. #-----------------
  517. for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG
  518. #2 DIFFERENT ALIGNMENT BLOCKS
  519. if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  520. my $key2 = join("_",$2, $3, $4, $5);
  521. # print "key = $key .. key2 = $key2\n"; #<STDIN>;
  522. if ($key2 ne $key){
  523. # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1;
  524. $nextcounter[10]++;
  525. $nexter = 1; last;
  526. }
  527. }
  528. else{
  529. # print "counld not form a key for $line\n"; # if $printer == 1;
  530. #<STDIN>;
  531. $nexter = 1; last;
  532. }
  533. }
  534. }
  535. #print "D2: $no_of_species\n";
  536. #####################
  537. if ($nexter == 1){
  538. # print "nexting\n"; # if $printer == 1;
  539. next;
  540. }
  541. else{
  542. # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1;
  543. push (@{$orths{$key}},$line);
  544. $loaded++;
  545. if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
  546. # print "$line\n" if $printer == 1; #if $line =~ /Contig/;
  547. # print "################ ################\n" if $printer == 1;
  548. push @allowedchrs, $2 if !exists $allowedhash{$2};
  549. $allowedhash{$2} = 1;
  550. my $key = join("\t",$1, $2, $3, $4);
  551. # print "C: $no_of_species .. key = $key\n";#<STDIN>;
  552. # print "print the shit: $key\n" ; #if $printer == 1;
  553. $seen{$key} = 1;
  554. }
  555. else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n";
  556. }
  557. }
  558. }
  559. close ORTH;
  560. # print "now studying where we lost microsatellites: @nextcounter\n";
  561. for my $reason (0 ... $#nextcounter){
  562. #print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
  563. }
  564. # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n";
  565. # print "done filtering .. counterm = $counterm and loaded = $loaded\n";
  566. #----------------------------------------------------------------------------------------------------------------
  567. # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY.
  568. # print "adding files @filterseqfiles \n";
  569. #<STDIN>;
  570. while (1){
  571. if (-e $megamatchlck){
  572. # print "waiting to write into $megamatchlck\n";
  573. sleep 10;
  574. }
  575. else{
  576. open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!";
  577. open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!";
  578. last;
  579. }
  580. }
  581. foreach my $seqfile (@filterseqfiles){
  582. my $fullpath = $seqfile;
  583. open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!";
  584. my $matchlines = 0;
  585. while (my $line = <MATCH>) {
  586. #print "checking $line";
  587. if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
  588. my $key = join("\t",$1, $2, $3, $4);
  589. # print "key = $key\n";
  590. #print "------------------------------------------------------\n";
  591. #print "asking $line\n";
  592. if (exists $seen{$key}){
  593. #print "seen $line \n"; <STDIN>;
  594. while (1){
  595. $matchlines++;
  596. print MEGAM $line;
  597. $line = <MATCH>;
  598. print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/;
  599. last if $line !~/[0-9a-zA-Z]/;
  600. }
  601. }
  602. else{
  603. # print "not seen\n";
  604. }
  605. }
  606. }
  607. # print "matchlines = $matchlines\n";
  608. close MATCH;
  609. }
  610. close MEGAMLCK;
  611. unlink $megamatchlck;
  612. close MEGAM;
  613. undef %seen;
  614. # print "done writitn to $megamatch\n";#<STDIN>;
  615. #----------------------------------------------------------------------------------------------------------------
  616. #<STDIN>;
  617. #---------------------------------------------------------------------------
  618. # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO
  619. # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE
  620. # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER
  621. #$printer = 1;
  622. my $microreadcounter=0;
  623. my $contigsentered=0;
  624. my $contignotrightcounter=0;
  625. my $keynotformedcounter=0;
  626. my $keynotfoundcounter= 0;
  627. my $dotcounter = 0;
  628. # print "opening $megamatch\n";
  629. open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!";
  630. # print "doing $megamatch\n " ;
  631. #<STDIN>;
  632. while (my $line = <BO>){
  633. # print $line; #<STDIN>;
  634. # print "." if $dotcounter % 100 ==0;
  635. # print "\n" if $dotcounter % 5000 ==0;
  636. # print "dotcounter = $dotcounter\n " if $printer == 1;
  637. next if $line !~ /^[0-9]+/;
  638. $dotcounter++;
  639. # print colored ['green'], "~" x 60, "\n" if $printer == 1;
  640. # print colored ['green'], $line;# if $printer == 1;
  641. chomp $line;
  642. my @fields2 = split(/\t/,$line);
  643. my $key2 = ();
  644. my $alignment_no = (); #1 TEMPORARY
  645. if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  646. # $key2 = join("\t",$1, $2, $4, $5);
  647. $key2 = join("_",$2, $3, $4, $5);
  648. # print "key = $key2\n";
  649. # print "key = $key2\n";
  650. $alignment_no=$1;
  651. }
  652. else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;}
  653. $no_of_species = adjustCoordinates($line);
  654. $contignotrightcounter++ if $no_of_species != $no;
  655. # print "contignotrightcounter=$contignotrightcounter\n";
  656. # print "no_of_species=$no_of_species\n";
  657. # print "no=$no\n";
  658. next if $no_of_species != $no;
  659. # print "D: $no_of_species\n";
  660. # print "E: $no_of_species\n";
  661. #<STDIN>;
  662. # print "key = $key2\n" if $printer == 1;
  663. my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS
  664. #2 ALIGNMENT BLOCK
  665. if (exists($orths{$key2})){
  666. @clusters = @{$orths{$key2}};
  667. $contigsentered++;
  668. delete $orths{$key2};
  669. }
  670. else{
  671. # print "orth does not exist\n";
  672. $keynotfoundcounter++;
  673. next;
  674. }
  675. my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK
  676. my $humseq = ();
  677. foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES
  678. my $seq = <BO>; #2 OF ALL SPECIES.
  679. chomp $seq;
  680. $sequences{$tag} = " ".$seq;
  681. #print "sequences = $sequences{$tag}\n" if $printer == 1;
  682. $humseq = $seq if $tag =~ /H/;
  683. }
  684. foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS
  685. # print "x" x 60, "\n" if $printer == 1;
  686. # print colored ['red'],"cluster = $cluster\n";
  687. $largesttree =~ s/hg18/H/g;
  688. $largesttree =~ s/panTro2/C/g;
  689. $largesttree =~ s/ponAbe2/O/g;
  690. $largesttree =~ s/rheMac2/R/g;
  691. $largesttree =~ s/calJac1/M/g;
  692. $microreadcounter++;
  693. my @micros = split(/>/,$cluster);
  694. shift @micros;
  695. my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND
  696. #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK
  697. my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES
  698. my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME
  699. #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES.
  700. my %microhash=();
  701. my %microsathash=();
  702. my %nonmicrosathash=();
  703. my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1
  704. # print "tags=@tags\n";
  705. for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE
  706. #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER
  707. #3 THE VARIABLE $TAG;
  708. my $tag = $tags[$i];
  709. # print $seq;
  710. my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES.
  711. #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL"
  712. foreach my $micro (@micros){
  713. # print "micro=$micro, tag=$tag\n";
  714. if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND..
  715. $locus = $micro;
  716. my @fields = split(/\t/,$micro);
  717. $motif = $fields[$motifcord];
  718. $microsathash{$tag}=$fields[$microsatcord];
  719. # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
  720. push(@starts, $fields[$startcord]);
  721. push(@ends, $fields[$endcord]);
  722. $start_hash{$tag}=$fields[$startcord];
  723. $end_hash{$tag}=$fields[$endcord];
  724. last;
  725. }
  726. else{$microsathash{$tag}="NULL"}
  727. }
  728. $microhash{$tag}=$locus;
  729. }
  730. my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
  731. my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN
  732. #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS.
  733. # print "starts=@starts... ends=@ends\n";
  734. my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES
  735. my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES
  736. my %up_largeflanks = ();
  737. my %down_largeflanks = ();
  738. my %locusandflanks = ();
  739. my %locusandlargeflanks = ();
  740. my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the
  741. #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
  742. my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the
  743. #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES
  744. my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
  745. #2 EXTEME VALUES.
  746. my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
  747. #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
  748. #3 WIHIN THE ALIGNMENT
  749. next if !defined $extreme_start;
  750. next if !defined $extreme_end;
  751. next if $extreme_start > length($sequences{$tags[0]});
  752. next if $extreme_start < 0;
  753. next if $extreme_end > length($sequences{$tags[0]});
  754. for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
  755. #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
  756. #3 AS WELL AS THE EXTREME COORDINATES OF THE
  757. #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
  758. #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
  759. #6 MORE DETAIL.
  760. my $tag = $tags[$i];
  761. # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
  762. my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM
  763. #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
  764. #3 CLUSTER.
  765. # print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ;
  766. my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g);
  767. if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){
  768. $edge_microsat=1;
  769. last;
  770. }
  771. else{
  772. $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT);
  773. $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT);
  774. $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT);
  775. $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT);
  776. $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
  777. $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag};
  778. $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag};
  779. if ($microhash{$tag} ne "NULL"){
  780. $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start);
  781. $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag});
  782. $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start;
  783. # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1;
  784. }
  785. else{
  786. $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
  787. }
  788. # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1;
  789. }
  790. }
  791. $nextcounter[11]++ if $edge_microsat==1;
  792. next if $edge_microsat==1;
  793. my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS
  794. #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE
  795. #3 FUNCTION sub test_complexity
  796. for my $i (0 ... $#tags){
  797. # print "i = $tags[$i]\n" if $printer == 1;
  798. if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){
  799. # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1;
  800. $low_complexity =1; last;
  801. }
  802. }
  803. $nextcounter[12]++ if $low_complexity==1;
  804. next if $low_complexity == 1;
  805. my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY
  806. #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN
  807. #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD
  808. #4 AS DESCRIBED IN FUNCTION sub sequence_similarity
  809. my %donepair = ();
  810. for my $i (0 ... $#tags){
  811. # print "i = $tags[$i]\n" if $printer == 1;
  812. # next if $i == 0;
  813. # print colored ['magenta'],"THIS IS UP\n" if $printer == 1;
  814. for my $b (0 ... $#tags){
  815. next if $b == $i;
  816. my $pair = ();
  817. $pair = $i."_".$b if $i < $b;
  818. $pair = $b."_".$i if $b < $i;
  819. next if exists $donepair{$pair};
  820. my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
  821. my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
  822. $donepair{$pair} = $up_similarity."_".$down_similarity;
  823. # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n";
  824. if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){
  825. $sequence_dissimilarity =1;
  826. last;
  827. }
  828. }
  829. }
  830. $nextcounter[13]++ if $sequence_dissimilarity==1;
  831. next if $sequence_dissimilarity == 1;
  832. my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq);
  833. # print "simplified_microsat=$simplified_microsat\n";
  834. my ($tree_analysis, $conformation) = treeStudy($simplified_microsat);
  835. # print "tree_analysis = $tree_analysis .. conformation=$conformation\n";
  836. #<STDIN>;
  837. # print SELECT "\"$conformation\"\t$tree_analysis\n";
  838. next if $tree_analysis =~ /DISCARD/;
  839. if (exists $treesToReject{$tree_analysis}){
  840. $nextcounter[14]++;
  841. next;
  842. }
  843. # print "F: $no_of_species\n";
  844. # my $adjuster=();
  845. # if ($no_of_species == 4){
  846. # my @sields = split(/\t/,$simplified_microsat);
  847. # my $somend = pop(@sields);
  848. # my $somestart = pop(@sields);
  849. # my $somechr = pop(@sields);
  850. # $adjuster = "NA\t" x 13 ;
  851. # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
  852. # }
  853. # if ($no_of_species == 3){
  854. # my @sields = split(/\t/,$simplified_microsat);
  855. # my $somend = pop(@sields);
  856. # my $somestart = pop(@sields);
  857. # my $somechr = pop(@sields);
  858. # $adjuster = "NA\t" x 26 ;
  859. # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
  860. # }
  861. #
  862. $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis};
  863. $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis};
  864. if (exists $treesToIgnore{$tree_analysis}){
  865. my @appendarr = ();
  866. # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
  867. #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
  868. # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n";
  869. foreach my $lnode (@$lagestnodes){
  870. my @pair = @$lnode;
  871. my @nodemutarr = ();
  872. for my $p (@pair){
  873. my @mutinfoarray1 = ();
  874. for (1 ... 38){
  875. # push (@mutinfoarray1, "NA")
  876. }
  877. # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  878. # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  879. }
  880. }
  881. for (1 ... 38){
  882. push (@appendarr, "NA")
  883. }
  884. # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
  885. # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
  886. # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
  887. next;
  888. }
  889. # print colored ['blue'],"cluster = $cluster\n";
  890. my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
  891. if ($mutations_array eq "NULL"){
  892. # print "cluster = $cluster \n"; <STDIN>;
  893. my @appendarr = ();
  894. # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t";
  895. # foreach my $lnode (@$lagestnodes){
  896. # my @pair = @$lnode;
  897. # my @nodemutarr = ();
  898. # for my $p (@pair){
  899. # my @mutinfoarray1 = ();
  900. # for (1 ... 38){
  901. # push (@mutinfoarray1, "NA")
  902. # }
  903. # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  904. # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  905. # }
  906. # }
  907. # for (1 ... 38){
  908. # push (@appendarr, "NA")
  909. # }
  910. # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
  911. # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
  912. # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
  913. next;
  914. }
  915. # print "sent: \n" if $printer == 1;
  916. # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1;
  917. my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
  918. # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1;
  919. my @finalmutations_array= ();
  920. @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL";
  921. @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL";
  922. # print "alt = $alternate{$conformation}\n";
  923. my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array);
  924. my $cleancase = "UNCLEAN";
  925. $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/;
  926. $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL";
  927. $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL";
  928. $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ;
  929. # print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>;
  930. my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H",
  931. "+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO");
  932. next if (oneOf(@selects, $besttree) == 0);
  933. if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){
  934. $besttree = "$besttree / $tree_analysis";
  935. }
  936. $besttree = "NULL" if $treescore <= 0;
  937. while ($besttree =~ /[A-Z][A-Z]/){
  938. $besttree =~ s/([A-Z])([A-Z])/$1:$2/g;
  939. }
  940. if ($besttree !~ /NULL/){
  941. my @elements = ($besttree =~ /([A-Z])/g);
  942. foreach my $ele (@elements){
  943. # print "replacing $ele with $backReplacementArrTag{$ele}\n";
  944. $besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele};
  945. }
  946. }
  947. my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t";
  948. next if $endendstate =~ /NA\tNA\tNA/;
  949. print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t";
  950. # print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ;
  951. my @mutinfoarray =();
  952. foreach my $lnode (@$lagestnodes){
  953. my @pair = @$lnode;
  954. my $joint = "(".join(", ",@pair).")";
  955. my @nodemutarr = ();
  956. for my $p (@pair){
  957. foreach my $mut (@finalmutations_array){
  958. $mut =~ /node=([A-Z, \(\)]+)/;
  959. push @nodemutarr, $mut if $p eq $1;
  960. }
  961. @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree);
  962. # print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
  963. # print join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
  964. }
  965. }
  966. # print "G: $no_of_species\n";
  967. my @alignmentarr = ();
  968. foreach my $key (keys %alignment){
  969. push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key};
  970. }
  971. # print "alignmentarr = @alignmentarr"; <STDIN>;
  972. @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree);
  973. print SUMMARY join ("\t", @mutinfoarray ),"\t";
  974. print SUMMARY join(",",@alignmentarr),"\n";
  975. # print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5;
  976. # <STDIN> if scalar(@tags) < 5;
  977. # print $cleancase, "\n";
  978. # print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>;
  979. # print "summarized\n"; #<STDIN>;
  980. my %indelcatch = ();
  981. my %substcatch = ();
  982. my %typecatch = ();
  983. my %nodescatch = ();
  984. my $mutconcat = join("\t", @finalmutations_array)."\n";
  985. my %indelposcatch = ();
  986. my %subsposcatch = ();
  987. foreach my $fmut ( @finalmutations_array){
  988. # next if $fmut !~ /indeltype=[a-zA-Z]+/;
  989. #print RUN $fmut, "\n";
  990. $fmut =~ /node=([a-zA-Z, \(\)]+)/;
  991. my $lnode = $1;
  992. $nodescatch{$1}=1;
  993. if ($fmut =~ /type=substitution/){
  994. # print "fmut=$fmut\n";
  995. $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/;
  996. my $from=$1;
  997. # print "from=$from\n";
  998. my $to=$2;
  999. # print "to=$to\n";
  1000. push @{$substcatch{$lnode}} , ("from:".$from." to:".$to);
  1001. $fmut =~ /position=([0-9]+)/;
  1002. push @{$subsposcatch{$lnode}}, $1;
  1003. }
  1004. if ($fmut =~ /insertion=[a-zA-Z\-]+/){
  1005. $fmut =~ /insertion=([a-zA-Z\-]+)/;
  1006. push @{$indelcatch{$lnode}} , $1;
  1007. $fmut =~ /indeltype=([a-zA-Z]+)/;
  1008. push @{$typecatch{$lnode}}, $1;
  1009. $fmut =~ /position=([0-9]+)/;
  1010. push @{$indelposcatch{$lnode}}, $1;
  1011. }
  1012. if ($fmut =~ /deletion=[a-zA-Z\-]+/){
  1013. $fmut =~ /deletion=([a-zA-Z\-]+)/;
  1014. push @{$indelcatch{$lnode}} , $1;
  1015. $fmut =~ /indeltype=([a-zA-Z]+)/;
  1016. push @{$typecatch{$lnode}}, $1;
  1017. $fmut =~ /position=([0-9]+)/;
  1018. push @{$indelposcatch{$lnode}}, $1;
  1019. }
  1020. }
  1021. # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1;
  1022. # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1;
  1023. # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1;
  1024. # #print RUN ".";
  1025. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1026. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1027. # print colored ['red'],"finalmutations_array=\n" if $printer == 1;
  1028. # foreach (@finalmutations_array) {
  1029. # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ;
  1030. # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ;
  1031. # }# if $line =~ /cal/;# && $line =~ /chr4/;
  1032. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1033. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1034. # print "tree analysis = $tree_analysis\n" if $printer == 1;
  1035. # my $mutations = "@$mutations_array";
  1036. next;
  1037. for my $keys (@$nodes) {foreach my $key (@$keys){
  1038. #print "key = $key, => $branches_hash->{$key}\n";
  1039. }
  1040. # print "x" x 50, "\n";
  1041. }
  1042. my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat);
  1043. }
  1044. }
  1045. close BO;
  1046. # print "now studying where we lost microsatellites:\n";
  1047. # print "x" x 60,"\n";
  1048. for my $reason (0 ... $#nextcounter){
  1049. # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
  1050. }
  1051. # print "x" x 60,"\n";
  1052. # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n";
  1053. # print " we lost $keynotformedcounter contigs as they did not form the key, \n";
  1054. # print "$contignotrightcounter contigs as they were not of the right species configuration\n";
  1055. # print "$keynotfoundcounter contigs as they did not contain the microsats\n";
  1056. # print "... In total we went through a file that had $dotcounter contigs...\n";
  1057. # print join ("\n","remaining orth keys = ", (keys %orths),"");
  1058. # print "------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ \n";
  1059. # print "now printing counted trees: \n";
  1060. if (scalar(keys %registeredTrees) > 0){
  1061. foreach my $keyb ( sort (keys %registeredTrees) )
  1062. {
  1063. # print "$keyb : $registeredTrees{$keyb}\n";
  1064. }
  1065. }
  1066. }
  1067. close SUMMARY;
  1068. my @summarizarr = ("+C=+C +R.+C -HCOR,+C",
  1069. "+H=+H +R.+H -HCOR,+H",
  1070. "-C=-C -R.-C +HCOR,-C",
  1071. "-H=-H -R.-H +HCOR,-H",
  1072. "+HC=+HC",
  1073. "-HC=-HC",
  1074. "+O=+O -HCOR,+O",
  1075. "-O=-O +HCOR,-O",
  1076. "+HCO=+HCO",
  1077. "-HCO=-HCO",
  1078. "+R=+R +R.+C +R.+H",
  1079. "-R=-R -R.-C -R.-H");
  1080. foreach my $line (@summarizarr){
  1081. next if $line !~ /[A-Za-z0-9]/;
  1082. # print $line;
  1083. chomp $line;
  1084. my @fields = split(/=/,$line);
  1085. # print "title = $fields[0]\n";
  1086. my @parts=split(/ +/, $fields[1]);
  1087. my %par

Large files files are truncated, but you can click here to view the full file