PageRenderTime 41ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/regVariation/microsatellite_birthdeath.pl

https://bitbucket.org/chapmanb/galaxy-central/
Perl | 4333 lines | 2916 code | 609 blank | 808 comment | 702 complexity | d1bf4746bc2b8d229099df9b29475ffa MD5 | raw file
  1. #!/usr/bin/perl -w
  2. use strict;
  3. use warnings;
  4. use Term::ANSIColor;
  5. use Pod::Checker;
  6. use File::Basename;
  7. use IO::Handle;
  8. use Cwd;
  9. use File::Path qw(make_path remove_tree);
  10. use File::Temp qw/ tempfile tempdir /;
  11. my $tdir = tempdir( CLEANUP => 1 );
  12. chdir $tdir;
  13. my $dir = getcwd;
  14. # print "current dit=$dir\n";
  15. #<STDIN>;
  16. use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags
  17. $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species
  18. $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags
  19. $mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG);
  20. use FileHandle;
  21. use IO::Handle; # 5.004 or higher
  22. #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
  23. #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
  24. #"10","0.8");
  25. my @ar = @ARGV;
  26. my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
  27. $SIMILARITY_THRESH=$SIMILARITY_THRESH/100;
  28. #########################
  29. $SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
  30. my $EDGE_DISTANCE = 10;
  31. my $COMPLEXITY_SUPPORT = 20;
  32. load_thresholds("9_10_12_12");
  33. my $FLANKINDEL_MAXTHRESH = 0.3;
  34. my $mono_flanksimplicityRepno=6;
  35. my $di_flanksimplicityRepno=10;
  36. my $prop_of_seq_allowedtoAT=0.5;
  37. my $prop_of_seq_allowedtoCG=0.66;
  38. #########################
  39. my $tspecies_set = $species_set;
  40. my %speciesReplacement = ();
  41. my %speciesReplacementTag = ();
  42. my %replacementArr= ();
  43. my %replacementArrTag= ();
  44. my %backReplacementArr= ();
  45. my %backReplacementArrTag= ();
  46. $tree_definition=~s/\s+//g;
  47. my $tree_definition_split = $tree_definition;
  48. $tree_definition_split =~ s/[\(\)]//g;
  49. my @gotSpecies = ($tree_definition_split =~ /(,)/g);
  50. # print "gotSpecies = @gotSpecies\n";
  51. if (scalar(@gotSpecies)+1 ==5){
  52. $speciesReplacement{1}="calJac1";
  53. $speciesReplacement{2}="rheMac2";
  54. $speciesReplacement{3}="ponAbe2";
  55. $speciesReplacement{4}="panTro2";
  56. $speciesReplacement{5}="hg18";
  57. $speciesReplacementTag{1}="M";
  58. $speciesReplacementTag{2}="R";
  59. $speciesReplacementTag{3}="O";
  60. $speciesReplacementTag{4}="C";
  61. $speciesReplacementTag{5}="H";
  62. $species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1";
  63. }
  64. if (scalar(@gotSpecies)+1 ==4){
  65. $speciesReplacement{1}="rheMac2";
  66. $speciesReplacement{2}="ponAbe2";
  67. $speciesReplacement{3}="panTro2";
  68. $speciesReplacement{4}="hg18";
  69. $speciesReplacementTag{1}="R";
  70. $speciesReplacementTag{2}="O";
  71. $speciesReplacementTag{3}="C";
  72. $speciesReplacementTag{4}="H";
  73. $species_set="hg18,panTro2,ponAbe2,rheMac2";
  74. }
  75. # $tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)";
  76. my $tree_definition_copy = $tree_definition;
  77. my $tree_definition_orig = $tree_definition;
  78. my $brackets = 0;
  79. while (1){
  80. #last if $tree_definition_copy !~ /\(/;
  81. $brackets++;
  82. # print "brackets = $brackets\n";
  83. last if $brackets > 6;
  84. $tree_definition_copy =~ s/^\(//g;
  85. $tree_definition_copy =~ s/\)$//g;
  86. # print "tree_definition_copy = $tree_definition_copy\n";
  87. my @arr = ();
  88. if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
  89. @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
  90. # print "arr = @arr\n";
  91. $tree_definition_copy = $2;
  92. $replacementArr{$1} = $speciesReplacement{$brackets};
  93. $backReplacementArr{$speciesReplacement{$brackets}}=$1;
  94. $replacementArrTag{$1} = $speciesReplacementTag{$brackets};
  95. $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1;
  96. # print "replacing $1 with $replacementArr{$1}\n";
  97. $sp_ident[$brackets-1] = $1;
  98. }
  99. elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
  100. @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
  101. # print "arr = @arr\n";
  102. $tree_definition_copy = $1;
  103. $replacementArr{$2} = $speciesReplacement{$brackets};
  104. $backReplacementArr{$speciesReplacement{$brackets}}=$2;
  105. $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
  106. $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
  107. # print "replacing $2 with $replacementArr{$2}\n";
  108. $sp_ident[$brackets-1] = $2;
  109. }
  110. elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
  111. @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/;
  112. # print "arr = @arr .. TERMINAL\n";
  113. $tree_definition_copy = $1;
  114. $replacementArr{$2} = $speciesReplacement{$brackets};
  115. $replacementArr{$1} = $speciesReplacement{$brackets+1};
  116. $backReplacementArr{$speciesReplacement{$brackets}}=$2;
  117. $backReplacementArr{$speciesReplacement{$brackets+1}}=$1;
  118. $replacementArrTag{$1} = $speciesReplacementTag{$brackets+1};
  119. $backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1;
  120. $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
  121. $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
  122. # print "replacing $1 with $replacementArr{$1}\n";
  123. # print "replacing $2 with $replacementArr{$2}\n";
  124. # print "replacing $1 with $replacementArrTag{$1}\n";
  125. # print "replacing $2 with $replacementArrTag{$2}\n";
  126. $sp_ident[$brackets-1] = $2;
  127. $sp_ident[$brackets] = $1;
  128. last;
  129. }
  130. elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){
  131. $tree_definition_copy =~ s/^\(//g;
  132. $tree_definition_copy =~ s/\)$//g;
  133. $brackets--;
  134. }
  135. }
  136. foreach my $key (keys %replacementArr){
  137. my $replacement = $replacementArr{$key};
  138. $tree_definition =~ s/$key/$replacement/g;
  139. }
  140. @sp_ident = reverse(@sp_ident);
  141. # print "modified tree_definition = $tree_definition\n";
  142. # print "done .. tree_definition = $tree_definition\n";
  143. # print "sp_ident = @sp_ident\n";
  144. #<STDIN>;
  145. my $complexity=int($COMPLEXITY_SUPPORT * (1/40));
  146. #print "complexity=$complexity\n";
  147. #<STDIN>;
  148. $printer = 1;
  149. my $rando = int(rand(1000));
  150. my $localdate = `date`;
  151. $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
  152. my $info = $rando.$1.$2.$3;
  153. #---------------------------------------------------------------------------
  154. # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES
  155. my @thresharr = (0, split(/,/,$thresholds));
  156. my $randno=int(rand(100000));
  157. my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
  158. my $megamatchlck = $megamatch.".lck";
  159. unlink $megamatchlck;
  160. my $selected= $orth;
  161. #my $eventfile = $orth;
  162. $selected = $selected."_SELECTED";
  163. #$selected = $selected."_".$SIMILARITY_THRESH;
  164. #my $runtime = $selected.".runtime";
  165. my $inputtags = "H:C:O:R:M";
  166. $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;
  167. my @all_tags = split(/:/, $inputtags);
  168. my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
  169. $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
  170. #@sp_ident = split(/:/,$inputsp);
  171. my $junkfile = $orth."_junk";
  172. my $sh = load_sameHash(1);
  173. my $rh = load_revHash(1);
  174. #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";}
  175. #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
  176. open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
  177. #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
  178. #my $ctlfile = "baseml\.ctl"; #$ARGV[4];
  179. #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK
  180. my %registeredTrees = ();
  181. my @removalReasons =
  182. ("microsatellite is compound",
  183. "complex structure",
  184. "if no. if micros is more than no. of species",
  185. "if more than one micro per species ",
  186. "if microsat contains N",
  187. "different motif than required ",
  188. "more than zero interruptions",
  189. "microsat could not form key ",
  190. "orthologous microsats of different motif size ",
  191. "orthologous microsats of different motifs ",
  192. "microsats belong to different alignment blocks altogether",
  193. "microsat near edge",
  194. "microsat in low complexity region",
  195. "microsat flanks dont align well",
  196. "phylogeny not informative");
  197. my %allowedhash=();
  198. #---------------------------------------------------------------------------
  199. # WORKING ON MAKING THE MEGAMATCH FILE
  200. my $chromt=int(rand(10000));
  201. my $p_chr=$chromt;
  202. my $tree_definition_orig_copy = $tree_definition_orig;
  203. $tree_definition=~s/,/, /g;
  204. $tree_definition =~ s/, +/, /g;
  205. $tree_definition_orig=~s/,/, /g;
  206. $tree_definition_orig =~ s/, +/, /g;
  207. my @exactspeciesset_unarranged = split(/,/,$species_set);
  208. my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set);
  209. my $largesttree = "$tree_definition;";
  210. my $largesttree_orig = "$tree_definition_orig;";
  211. # print "largesttree = $largesttree\n";
  212. $tree_definition =~ s/\(//g;
  213. $tree_definition =~ s/\)//g;
  214. $tree_definition=~s/[\)\(, ]/\t/g;
  215. $tree_definition =~ s/\t+/\t/g;
  216. $tree_definition_orig =~ s/\(//g;
  217. $tree_definition_orig =~ s/\)//g;
  218. $tree_definition_orig =~s/[\)\(, ]/\t/g;
  219. $tree_definition_orig =~ s/\t+/\t/g;
  220. # print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n";
  221. my @treespecies=split(/\t+/,$tree_definition);
  222. my @treespecies_orig=split(/\t+/,$tree_definition_orig);
  223. # print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n";
  224. #<STDIN>;
  225. foreach my $spec (@treespecies){
  226. foreach my $espec (@exactspeciesset_unarranged){
  227. # print "spec=$spec and espec=$espec\n";
  228. push @exactspecies, $spec if $spec eq $espec;
  229. }
  230. }
  231. foreach my $spec (@treespecies_orig){
  232. foreach my $espec (@exactspeciesset_unarranged_orig){
  233. # print "spec=$spec and espec=$espec\n";
  234. push @exactspecies_orig, $spec if $spec eq $espec;
  235. }
  236. }
  237. my $focalspec = $exactspecies[0];
  238. my $focalspec_orig = $exactspecies_orig[0];
  239. # print "exactspecies=@exactspecies ... focalspec=$focalspec\n";
  240. # print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n";
  241. #<STDIN>;
  242. my $arranged_species_set = join(".",@exactspecies);
  243. my $arranged_species_set_orig = join(".",@exactspecies_orig);
  244. @exacttags=@exactspecies;
  245. my @exacttags_orig=@exactspecies_orig;
  246. foreach my $extag (@exacttags){
  247. $extag =~ s/hg18/H/g;
  248. $extag =~ s/panTro2/C/g;
  249. $extag =~ s/ponAbe2/O/g;
  250. $extag =~ s/rheMac2/R/g;
  251. $extag =~ s/calJac1/M/g;
  252. }
  253. foreach my $extag (@exacttags_orig){
  254. $extag =~ s/hg18/H/g;
  255. $extag =~ s/panTro2/C/g;
  256. $extag =~ s/ponAbe2/O/g;
  257. $extag =~ s/rheMac2/R/g;
  258. $extag =~ s/calJac1/M/g;
  259. }
  260. my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
  261. #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
  262. maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set);
  263. #print "made files\n";
  264. my @filterseqfiles= ($chr_name);
  265. $largesttree =~ s/hg18/H/g;
  266. $largesttree =~ s/panTro2/C/g;
  267. $largesttree =~ s/ponAbe2/O/g;
  268. $largesttree =~ s/rheMac2/R/g;
  269. $largesttree =~ s/calJac1/M/g;
  270. #<STDIN>;
  271. #---------------------------------------------------------------------------
  272. my ($lagestnodes, $largestbranches) = get_nodes($largesttree);
  273. shift (@$lagestnodes);
  274. my @extendedtitle=();
  275. my $title = ();
  276. my $parttitle = ();
  277. my @titlearr = ();
  278. my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold");
  279. my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details);
  280. my @fititle=();
  281. foreach my $spec (split(",",$tspecies_set)){
  282. push @fititle, $spec;
  283. foreach my $name (@finames){
  284. push @fititle, $spec.".".$name;
  285. }
  286. }
  287. my @othertitle=qw(somechr somestart somened event source);
  288. my @fnames = ();
  289. push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ;
  290. push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ;
  291. push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ;
  292. push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ;
  293. push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num);
  294. my @fullnames = ();
  295. # print "revising\n";
  296. # print "H = $backReplacementArrTag{H}\n";
  297. # print "C = $backReplacementArrTag{C}\n";
  298. # print "O = $backReplacementArrTag{O}\n";
  299. # print "R = $backReplacementArrTag{R}\n";
  300. # print "M = $backReplacementArrTag{M}\n";
  301. foreach my $lnode (@$lagestnodes){
  302. my @pair = @$lnode;
  303. my @nodemutarr = ();
  304. for my $p (@pair){
  305. # print "p = $p\n";
  306. $p =~ s/[\(\), ]+//g;
  307. $p =~ s/([A-Z])/$1./g;
  308. $p =~ s/\.$//g;
  309. $p =~ s/H/$backReplacementArrTag{H}/g;
  310. $p =~ s/C/$backReplacementArrTag{C}/g;
  311. $p =~ s/O/$backReplacementArrTag{O}/g;
  312. $p =~ s/R/$backReplacementArrTag{R}/g;
  313. $p =~ s/M/$backReplacementArrTag{M}/g;
  314. foreach my $n (@fnames) { push @fullnames, $p.".".$n;}
  315. }
  316. }
  317. #print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle);
  318. #print SUMMARY "\t",join("\t", @fullnames);
  319. my $header = join("\t",@firsttitle, @fititle, @othertitle, @fullnames, @fnames, "tree", "cleancase");
  320. # print "header= $header\n";
  321. #<STDIN>;
  322. #print SUMMARY "\t",join("\t", @fnames);
  323. #$title= $title."\t".join("\t", @fnames);
  324. #print SUMMARY "\t","tree","\t", "cleancase", "\n";
  325. #$title= $title."\t"."tree"."\t"."cleancase". "\n";
  326. #print $title; #<STDIN>;
  327. #print "all_tags = @all_tags\n";
  328. for my $no (3 ... $#all_tags+1){
  329. # print "no=$no\n"; #<STDIN>;
  330. @tags = @all_tags[0 ... $no-1];
  331. # print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>;
  332. %template=();
  333. my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  334. #next if scalar(@tags) < 4;
  335. # print "now doing tags = @tags, no = $no\n";
  336. open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!";
  337. # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion);
  338. ##################### T E M P O R A R Y #####################
  339. my @finaltitle=();
  340. my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13);
  341. my $endtitle = ();
  342. foreach my $tag (@tags){
  343. my @tempsingle = ();
  344. foreach my $single (@singletitle){
  345. push @tempsingle, $tag.$single;
  346. }
  347. @finaltitle = (@finaltitle, @tempsingle);
  348. }
  349. # print SUMMARY join("\t",@finaltitle),"\n";
  350. #############################################################
  351. #---------------------------------------------------------------------------
  352. # GET THE TREE FROM TREE FILE
  353. my $tree = ();
  354. $tree = "((H, C), O)" if $no == 3;
  355. $tree = "(((H, C), O), R)" if $no == 4;
  356. $tree = "((((H, C), O), R), M)" if $no == 5;
  357. # $tree=~s/;$//g;
  358. # print "our tree = $tree\n";
  359. #---------------------------------------------------------------------------
  360. # LOADING HASH CONTAINING ALL POSSIBLE TREES:
  361. $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt";
  362. %template=();
  363. %alternate=();
  364. load_allPossibleTrees($tree_decipherer, \%template, \%alternate);
  365. #---------------------------------------------------------------------------
  366. # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS
  367. %treesToReject=();
  368. %treesToIgnore=();
  369. load_treesToReject(@tags);
  370. load_treesToIgnore(@tags);
  371. #---------------------------------------------------------------------------
  372. # LOADING INPUT DATA INTO HASHES AND ARRAYS
  373. #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS
  374. #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF
  375. #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES).
  376. my %orths=();
  377. my $counterm = 0;
  378. my $loaded = 0;
  379. my %seen = ();
  380. my @allowedchrs = ();
  381. # print "no = $no\n"; #<STDIN>;
  382. while (my $line = <ORTH>){
  383. # print "line=$line\n";
  384. my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g;
  385. my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g;
  386. my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g;
  387. my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g;
  388. my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4];
  389. # print "line = $line\n"; <STDIN>;
  390. # next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags);
  391. my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros
  392. #print "micros=",printarr(@micros),"\n"; #<STDIN>;
  393. shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY
  394. $no_of_species = adjustCoordinates($micros[0]);
  395. # print "A: $no_of_species\n";
  396. next if $no_of_species != $no;
  397. # print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>;
  398. $counterm++;
  399. #------------------------------------------------
  400. $nextcounter[0]++ if $line =~ /compound/;
  401. next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS
  402. #------------------------------------------------
  403. #next if $line =~ /[A-Za-z]>[a-zA-Z]/;
  404. #------------------------------------------------
  405. chomp $line;
  406. my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER
  407. #print "number of species = $match_count\n";
  408. my $stopper = 0;
  409. foreach my $mic (@micros){
  410. my @local = split(/\t/,$mic);
  411. if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++;
  412. last; }
  413. # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM
  414. }
  415. next if $stopper ==1;
  416. #------------------------------------------------
  417. $nextcounter[2]++ if (scalar(@micros) >$no_of_species);
  418. next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET.
  419. #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY
  420. #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING
  421. #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD.
  422. #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl
  423. my $nexter = 0;
  424. foreach my $tag (@tags){
  425. my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g);
  426. if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1;
  427. next;
  428. }
  429. }
  430. if ($nexter == 1){
  431. $nextcounter[3]++;
  432. next;
  433. }
  434. #------------------------------------------------
  435. foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM
  436. my @local = split(/\t/,$mic);
  437. if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++;
  438. last;}
  439. }
  440. next if $stopper ==1;
  441. #print "till here 1\n"; #<STDIN>;
  442. #------------------------------------------------
  443. my @micros_copy = @micros;
  444. my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST
  445. #2 MICROSAT IN THE CLUSTER.
  446. my @tempfields = split(/\t/,$tempmicro);
  447. my $prevtype = $tempfields[$typecord];
  448. my $tempmotif = $tempfields[$motifcord];
  449. my $tempfirstmotif = ();
  450. if (scalar(@tempfields) > $microsatcord + 2){
  451. if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS
  452. #2 IN THE FIRST MICROSAT OF THE CLUSTER
  453. $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
  454. }
  455. }
  456. if ($nexter == 1){
  457. $nextcounter[6]++;
  458. next;
  459. } #1 DONE OBTAINING INFORMATION REGARDING
  460. #2 THE FIRST MICROSAT FROM THE CLUSTER
  461. if ($tempmotif =~ /^\[/){
  462. $tempmotif =~ s/^\[//g;
  463. $tempmotif =~ /([a-zA-Z]+)\].*/;
  464. $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT
  465. }
  466. else {$tempfirstmotif = $tempmotif;}
  467. my $prevmotif = $tempfirstmotif;
  468. my $key = ();
  469. # print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n";
  470. # print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>;
  471. if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  472. # print "B: $no_of_species\n";
  473. $key = join("_",$2, $3, $4, $5);
  474. }
  475. else{
  476. # print "counld not form a key for temp\n"; # if $printer == 1;
  477. $nextcounter[7]++;
  478. next;
  479. }
  480. #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT
  481. #2 THE FIRST MICROSAT IN THE CLUSTER, THE
  482. #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS
  483. #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW)
  484. foreach my $micro (@micros_copy){
  485. my @fields = split(/\t/,$micro);
  486. #-----------------
  487. if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS
  488. if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
  489. $nextcounter[6]++;
  490. last; }
  491. }
  492. #-----------------
  493. if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) {
  494. $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1;
  495. $nextcounter[8]++;
  496. last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
  497. #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.)
  498. $prevtype = $fields[$typecord];
  499. my $motif = $fields[$motifcord];
  500. my $firstmotif = ();
  501. if ($motif =~ /^\[/){
  502. $motif =~ s/^\[//g;
  503. $motif =~ /([a-zA-Z]+)\].*/;
  504. $firstmotif = $1;
  505. }
  506. else {$firstmotif = $motif;}
  507. my $motifpattern = $firstmotif.$firstmotif;
  508. my $prevmotifpattern = $prevmotif.$prevmotif;
  509. if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) {
  510. $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1;
  511. $nextcounter[9]++;
  512. last;
  513. } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
  514. #2 TO DIFFERENT MOTIFS
  515. my $prevmotif = $firstmotif;
  516. #-----------------
  517. for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG
  518. #2 DIFFERENT ALIGNMENT BLOCKS
  519. if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  520. my $key2 = join("_",$2, $3, $4, $5);
  521. # print "key = $key .. key2 = $key2\n"; #<STDIN>;
  522. if ($key2 ne $key){
  523. # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1;
  524. $nextcounter[10]++;
  525. $nexter = 1; last;
  526. }
  527. }
  528. else{
  529. # print "counld not form a key for $line\n"; # if $printer == 1;
  530. #<STDIN>;
  531. $nexter = 1; last;
  532. }
  533. }
  534. }
  535. #print "D2: $no_of_species\n";
  536. #####################
  537. if ($nexter == 1){
  538. # print "nexting\n"; # if $printer == 1;
  539. next;
  540. }
  541. else{
  542. # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1;
  543. push (@{$orths{$key}},$line);
  544. $loaded++;
  545. if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
  546. # print "$line\n" if $printer == 1; #if $line =~ /Contig/;
  547. # print "################ ################\n" if $printer == 1;
  548. push @allowedchrs, $2 if !exists $allowedhash{$2};
  549. $allowedhash{$2} = 1;
  550. my $key = join("\t",$1, $2, $3, $4);
  551. # print "C: $no_of_species .. key = $key\n";#<STDIN>;
  552. # print "print the shit: $key\n" ; #if $printer == 1;
  553. $seen{$key} = 1;
  554. }
  555. else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n";
  556. }
  557. }
  558. }
  559. close ORTH;
  560. # print "now studying where we lost microsatellites: @nextcounter\n";
  561. for my $reason (0 ... $#nextcounter){
  562. #print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
  563. }
  564. # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n";
  565. # print "done filtering .. counterm = $counterm and loaded = $loaded\n";
  566. #----------------------------------------------------------------------------------------------------------------
  567. # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY.
  568. # print "adding files @filterseqfiles \n";
  569. #<STDIN>;
  570. while (1){
  571. if (-e $megamatchlck){
  572. # print "waiting to write into $megamatchlck\n";
  573. sleep 10;
  574. }
  575. else{
  576. open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!";
  577. open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!";
  578. last;
  579. }
  580. }
  581. foreach my $seqfile (@filterseqfiles){
  582. my $fullpath = $seqfile;
  583. open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!";
  584. my $matchlines = 0;
  585. while (my $line = <MATCH>) {
  586. #print "checking $line";
  587. if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
  588. my $key = join("\t",$1, $2, $3, $4);
  589. # print "key = $key\n";
  590. #print "------------------------------------------------------\n";
  591. #print "asking $line\n";
  592. if (exists $seen{$key}){
  593. #print "seen $line \n"; <STDIN>;
  594. while (1){
  595. $matchlines++;
  596. print MEGAM $line;
  597. $line = <MATCH>;
  598. print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/;
  599. last if $line !~/[0-9a-zA-Z]/;
  600. }
  601. }
  602. else{
  603. # print "not seen\n";
  604. }
  605. }
  606. }
  607. # print "matchlines = $matchlines\n";
  608. close MATCH;
  609. }
  610. close MEGAMLCK;
  611. unlink $megamatchlck;
  612. close MEGAM;
  613. undef %seen;
  614. # print "done writitn to $megamatch\n";#<STDIN>;
  615. #----------------------------------------------------------------------------------------------------------------
  616. #<STDIN>;
  617. #---------------------------------------------------------------------------
  618. # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO
  619. # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE
  620. # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER
  621. #$printer = 1;
  622. my $microreadcounter=0;
  623. my $contigsentered=0;
  624. my $contignotrightcounter=0;
  625. my $keynotformedcounter=0;
  626. my $keynotfoundcounter= 0;
  627. my $dotcounter = 0;
  628. # print "opening $megamatch\n";
  629. open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!";
  630. # print "doing $megamatch\n " ;
  631. #<STDIN>;
  632. while (my $line = <BO>){
  633. # print $line; #<STDIN>;
  634. # print "." if $dotcounter % 100 ==0;
  635. # print "\n" if $dotcounter % 5000 ==0;
  636. # print "dotcounter = $dotcounter\n " if $printer == 1;
  637. next if $line !~ /^[0-9]+/;
  638. $dotcounter++;
  639. # print colored ['green'], "~" x 60, "\n" if $printer == 1;
  640. # print colored ['green'], $line;# if $printer == 1;
  641. chomp $line;
  642. my @fields2 = split(/\t/,$line);
  643. my $key2 = ();
  644. my $alignment_no = (); #1 TEMPORARY
  645. if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  646. # $key2 = join("\t",$1, $2, $4, $5);
  647. $key2 = join("_",$2, $3, $4, $5);
  648. # print "key = $key2\n";
  649. # print "key = $key2\n";
  650. $alignment_no=$1;
  651. }
  652. else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;}
  653. $no_of_species = adjustCoordinates($line);
  654. $contignotrightcounter++ if $no_of_species != $no;
  655. # print "contignotrightcounter=$contignotrightcounter\n";
  656. # print "no_of_species=$no_of_species\n";
  657. # print "no=$no\n";
  658. next if $no_of_species != $no;
  659. # print "D: $no_of_species\n";
  660. # print "E: $no_of_species\n";
  661. #<STDIN>;
  662. # print "key = $key2\n" if $printer == 1;
  663. my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS
  664. #2 ALIGNMENT BLOCK
  665. if (exists($orths{$key2})){
  666. @clusters = @{$orths{$key2}};
  667. $contigsentered++;
  668. delete $orths{$key2};
  669. }
  670. else{
  671. # print "orth does not exist\n";
  672. $keynotfoundcounter++;
  673. next;
  674. }
  675. my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK
  676. my $humseq = ();
  677. foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES
  678. my $seq = <BO>; #2 OF ALL SPECIES.
  679. chomp $seq;
  680. $sequences{$tag} = " ".$seq;
  681. #print "sequences = $sequences{$tag}\n" if $printer == 1;
  682. $humseq = $seq if $tag =~ /H/;
  683. }
  684. foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS
  685. # print "x" x 60, "\n" if $printer == 1;
  686. # print colored ['red'],"cluster = $cluster\n";
  687. $largesttree =~ s/hg18/H/g;
  688. $largesttree =~ s/panTro2/C/g;
  689. $largesttree =~ s/ponAbe2/O/g;
  690. $largesttree =~ s/rheMac2/R/g;
  691. $largesttree =~ s/calJac1/M/g;
  692. $microreadcounter++;
  693. my @micros = split(/>/,$cluster);
  694. shift @micros;
  695. my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND
  696. #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK
  697. my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES
  698. my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME
  699. #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES.
  700. my %microhash=();
  701. my %microsathash=();
  702. my %nonmicrosathash=();
  703. my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1
  704. # print "tags=@tags\n";
  705. for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE
  706. #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER
  707. #3 THE VARIABLE $TAG;
  708. my $tag = $tags[$i];
  709. # print $seq;
  710. my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES.
  711. #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL"
  712. foreach my $micro (@micros){
  713. # print "micro=$micro, tag=$tag\n";
  714. if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND..
  715. $locus = $micro;
  716. my @fields = split(/\t/,$micro);
  717. $motif = $fields[$motifcord];
  718. $microsathash{$tag}=$fields[$microsatcord];
  719. # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
  720. push(@starts, $fields[$startcord]);
  721. push(@ends, $fields[$endcord]);
  722. $start_hash{$tag}=$fields[$startcord];
  723. $end_hash{$tag}=$fields[$endcord];
  724. last;
  725. }
  726. else{$microsathash{$tag}="NULL"}
  727. }
  728. $microhash{$tag}=$locus;
  729. }
  730. my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
  731. my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN
  732. #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS.
  733. # print "starts=@starts... ends=@ends\n";
  734. my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES
  735. my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES
  736. my %up_largeflanks = ();
  737. my %down_largeflanks = ();
  738. my %locusandflanks = ();
  739. my %locusandlargeflanks = ();
  740. my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the
  741. #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
  742. my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the
  743. #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES
  744. my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
  745. #2 EXTEME VALUES.
  746. my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
  747. #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
  748. #3 WIHIN THE ALIGNMENT
  749. next if !defined $extreme_start;
  750. next if !defined $extreme_end;
  751. next if $extreme_start > length($sequences{$tags[0]});
  752. next if $extreme_start < 0;
  753. next if $extreme_end > length($sequences{$tags[0]});
  754. for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
  755. #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
  756. #3 AS WELL AS THE EXTREME COORDINATES OF THE
  757. #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
  758. #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
  759. #6 MORE DETAIL.
  760. my $tag = $tags[$i];
  761. # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
  762. my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM
  763. #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
  764. #3 CLUSTER.
  765. # print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ;
  766. my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g);
  767. if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){
  768. $edge_microsat=1;
  769. last;
  770. }
  771. else{
  772. $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT);
  773. $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT);
  774. $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT);
  775. $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT);
  776. $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
  777. $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag};
  778. $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag};
  779. if ($microhash{$tag} ne "NULL"){
  780. $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start);
  781. $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag});
  782. $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start;
  783. # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1;
  784. }
  785. else{
  786. $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
  787. }
  788. # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1;
  789. }
  790. }
  791. $nextcounter[11]++ if $edge_microsat==1;
  792. next if $edge_microsat==1;
  793. my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS
  794. #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE
  795. #3 FUNCTION sub test_complexity
  796. for my $i (0 ... $#tags){
  797. # print "i = $tags[$i]\n" if $printer == 1;
  798. if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){
  799. # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1;
  800. $low_complexity =1; last;
  801. }
  802. }
  803. $nextcounter[12]++ if $low_complexity==1;
  804. next if $low_complexity == 1;
  805. my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY
  806. #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN
  807. #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD
  808. #4 AS DESCRIBED IN FUNCTION sub sequence_similarity
  809. my %donepair = ();
  810. for my $i (0 ... $#tags){
  811. # print "i = $tags[$i]\n" if $printer == 1;
  812. # next if $i == 0;
  813. # print colored ['magenta'],"THIS IS UP\n" if $printer == 1;
  814. for my $b (0 ... $#tags){
  815. next if $b == $i;
  816. my $pair = ();
  817. $pair = $i."_".$b if $i < $b;
  818. $pair = $b."_".$i if $b < $i;
  819. next if exists $donepair{$pair};
  820. my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
  821. my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
  822. $donepair{$pair} = $up_similarity."_".$down_similarity;
  823. # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n";
  824. if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){
  825. $sequence_dissimilarity =1;
  826. last;
  827. }
  828. }
  829. }
  830. $nextcounter[13]++ if $sequence_dissimilarity==1;
  831. next if $sequence_dissimilarity == 1;
  832. my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq);
  833. # print "simplified_microsat=$simplified_microsat\n";
  834. my ($tree_analysis, $conformation) = treeStudy($simplified_microsat);
  835. # print "tree_analysis = $tree_analysis .. conformation=$conformation\n";
  836. #<STDIN>;
  837. # print SELECT "\"$conformation\"\t$tree_analysis\n";
  838. next if $tree_analysis =~ /DISCARD/;
  839. if (exists $treesToReject{$tree_analysis}){
  840. $nextcounter[14]++;
  841. next;
  842. }
  843. # print "F: $no_of_species\n";
  844. # my $adjuster=();
  845. # if ($no_of_species == 4){
  846. # my @sields = split(/\t/,$simplified_microsat);
  847. # my $somend = pop(@sields);
  848. # my $somestart = pop(@sields);
  849. # my $somechr = pop(@sields);
  850. # $adjuster = "NA\t" x 13 ;
  851. # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
  852. # }
  853. # if ($no_of_species == 3){
  854. # my @sields = split(/\t/,$simplified_microsat);
  855. # my $somend = pop(@sields);
  856. # my $somestart = pop(@sields);
  857. # my $somechr = pop(@sields);
  858. # $adjuster = "NA\t" x 26 ;
  859. # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
  860. # }
  861. #
  862. $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis};
  863. $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis};
  864. if (exists $treesToIgnore{$tree_analysis}){
  865. my @appendarr = ();
  866. # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
  867. #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
  868. # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n";
  869. foreach my $lnode (@$lagestnodes){
  870. my @pair = @$lnode;
  871. my @nodemutarr = ();
  872. for my $p (@pair){
  873. my @mutinfoarray1 = ();
  874. for (1 ... 38){
  875. # push (@mutinfoarray1, "NA")
  876. }
  877. # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  878. # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  879. }
  880. }
  881. for (1 ... 38){
  882. push (@appendarr, "NA")
  883. }
  884. # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
  885. # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
  886. # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
  887. next;
  888. }
  889. # print colored ['blue'],"cluster = $cluster\n";
  890. my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
  891. if ($mutations_array eq "NULL"){
  892. # print "cluster = $cluster \n"; <STDIN>;
  893. my @appendarr = ();
  894. # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t";
  895. # foreach my $lnode (@$lagestnodes){
  896. # my @pair = @$lnode;
  897. # my @nodemutarr = ();
  898. # for my $p (@pair){
  899. # my @mutinfoarray1 = ();
  900. # for (1 ... 38){
  901. # push (@mutinfoarray1, "NA")
  902. # }
  903. # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  904. # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
  905. # }
  906. # }
  907. # for (1 ... 38){
  908. # push (@appendarr, "NA")
  909. # }
  910. # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
  911. # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
  912. # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
  913. next;
  914. }
  915. # print "sent: \n" if $printer == 1;
  916. # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1;
  917. my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
  918. # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1;
  919. my @finalmutations_array= ();
  920. @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL";
  921. @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL";
  922. # print "alt = $alternate{$conformation}\n";
  923. my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array);
  924. my $cleancase = "UNCLEAN";
  925. $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/;
  926. $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL";
  927. $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL";
  928. $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ;
  929. # print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>;
  930. my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H",
  931. "+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO");
  932. next if (oneOf(@selects, $besttree) == 0);
  933. if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){
  934. $besttree = "$besttree / $tree_analysis";
  935. }
  936. $besttree = "NULL" if $treescore <= 0;
  937. while ($besttree =~ /[A-Z][A-Z]/){
  938. $besttree =~ s/([A-Z])([A-Z])/$1:$2/g;
  939. }
  940. if ($besttree !~ /NULL/){
  941. my @elements = ($besttree =~ /([A-Z])/g);
  942. foreach my $ele (@elements){
  943. # print "replacing $ele with $backReplacementArrTag{$ele}\n";
  944. $besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele};
  945. }
  946. }
  947. my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t";
  948. next if $endendstate =~ /NA\tNA\tNA/;
  949. print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t";
  950. # print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ;
  951. my @mutinfoarray =();
  952. foreach my $lnode (@$lagestnodes){
  953. my @pair = @$lnode;
  954. my $joint = "(".join(", ",@pair).")";
  955. my @nodemutarr = ();
  956. for my $p (@pair){
  957. foreach my $mut (@finalmutations_array){
  958. $mut =~ /node=([A-Z, \(\)]+)/;
  959. push @nodemutarr, $mut if $p eq $1;
  960. }
  961. @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree);
  962. # print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
  963. # print join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
  964. }
  965. }
  966. # print "G: $no_of_species\n";
  967. my @alignmentarr = ();
  968. foreach my $key (keys %alignment){
  969. push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key};
  970. }
  971. # print "alignmentarr = @alignmentarr"; <STDIN>;
  972. @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree);
  973. print SUMMARY join ("\t", @mutinfoarray ),"\t";
  974. print SUMMARY join(",",@alignmentarr),"\n";
  975. # print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5;
  976. # <STDIN> if scalar(@tags) < 5;
  977. # print $cleancase, "\n";
  978. # print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>;
  979. # print "summarized\n"; #<STDIN>;
  980. my %indelcatch = ();
  981. my %substcatch = ();
  982. my %typecatch = ();
  983. my %nodescatch = ();
  984. my $mutconcat = join("\t", @finalmutations_array)."\n";
  985. my %indelposcatch = ();
  986. my %subsposcatch = ();
  987. foreach my $fmut ( @finalmutations_array){
  988. # next if $fmut !~ /indeltype=[a-zA-Z]+/;
  989. #print RUN $fmut, "\n";
  990. $fmut =~ /node=([a-zA-Z, \(\)]+)/;
  991. my $lnode = $1;
  992. $nodescatch{$1}=1;
  993. if ($fmut =~ /type=substitution/){
  994. # print "fmut=$fmut\n";
  995. $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/;
  996. my $from=$1;
  997. # print "from=$from\n";
  998. my $to=$2;
  999. # print "to=$to\n";
  1000. push @{$substcatch{$lnode}} , ("from:".$from." to:".$to);
  1001. $fmut =~ /position=([0-9]+)/;
  1002. push @{$subsposcatch{$lnode}}, $1;
  1003. }
  1004. if ($fmut =~ /insertion=[a-zA-Z\-]+/){
  1005. $fmut =~ /insertion=([a-zA-Z\-]+)/;
  1006. push @{$indelcatch{$lnode}} , $1;
  1007. $fmut =~ /indeltype=([a-zA-Z]+)/;
  1008. push @{$typecatch{$lnode}}, $1;
  1009. $fmut =~ /position=([0-9]+)/;
  1010. push @{$indelposcatch{$lnode}}, $1;
  1011. }
  1012. if ($fmut =~ /deletion=[a-zA-Z\-]+/){
  1013. $fmut =~ /deletion=([a-zA-Z\-]+)/;
  1014. push @{$indelcatch{$lnode}} , $1;
  1015. $fmut =~ /indeltype=([a-zA-Z]+)/;
  1016. push @{$typecatch{$lnode}}, $1;
  1017. $fmut =~ /position=([0-9]+)/;
  1018. push @{$indelposcatch{$lnode}}, $1;
  1019. }
  1020. }
  1021. # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1;
  1022. # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1;
  1023. # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1;
  1024. # #print RUN ".";
  1025. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1026. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1027. # print colored ['red'],"finalmutations_array=\n" if $printer == 1;
  1028. # foreach (@finalmutations_array) {
  1029. # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ;
  1030. # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ;
  1031. # }# if $line =~ /cal/;# && $line =~ /chr4/;
  1032. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1033. # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
  1034. # print "tree analysis = $tree_analysis\n" if $printer == 1;
  1035. # my $mutations = "@$mutations_array";
  1036. next;
  1037. for my $keys (@$nodes) {foreach my $key (@$keys){
  1038. #print "key = $key, => $branches_hash->{$key}\n";
  1039. }
  1040. # print "x" x 50, "\n";
  1041. }
  1042. my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat);
  1043. }
  1044. }
  1045. close BO;
  1046. # print "now studying where we lost microsatellites:\n";
  1047. # print "x" x 60,"\n";
  1048. for my $reason (0 ... $#nextcounter){
  1049. # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
  1050. }
  1051. # print "x" x 60,"\n";
  1052. # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n";
  1053. # print " we lost $keynotformedcounter contigs as they did not form the key, \n";
  1054. # print "$contignotrightcounter contigs as they were not of the right species configuration\n";
  1055. # print "$keynotfoundcounter contigs as they did not contain the microsats\n";
  1056. # print "... In total we went through a file that had $dotcounter contigs...\n";
  1057. # print join ("\n","remaining orth keys = ", (keys %orths),"");
  1058. # print "------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ \n";
  1059. # print "now printing counted trees: \n";
  1060. if (scalar(keys %registeredTrees) > 0){
  1061. foreach my $keyb ( sort (keys %registeredTrees) )
  1062. {
  1063. # print "$keyb : $registeredTrees{$keyb}\n";
  1064. }
  1065. }
  1066. }
  1067. close SUMMARY;
  1068. my @summarizarr = ("+C=+C +R.+C -HCOR,+C",
  1069. "+H=+H +R.+H -HCOR,+H",
  1070. "-C=-C -R.-C +HCOR,-C",
  1071. "-H=-H -R.-H +HCOR,-H",
  1072. "+HC=+HC",
  1073. "-HC=-HC",
  1074. "+O=+O -HCOR,+O",
  1075. "-O=-O +HCOR,-O",
  1076. "+HCO=+HCO",
  1077. "-HCO=-HCO",
  1078. "+R=+R +R.+C +R.+H",
  1079. "-R=-R -R.-C -R.-H");
  1080. foreach my $line (@summarizarr){
  1081. next if $line !~ /[A-Za-z0-9]/;
  1082. # print $line;
  1083. chomp $line;
  1084. my @fields = split(/=/,$line);
  1085. # print "title = $fields[0]\n";
  1086. my @parts=split(/ +/, $fields[1]);
  1087. my %partshash = ();
  1088. foreach my $part (@parts){$partshash{$part}=1;}
  1089. my $count=0;
  1090. foreach my $key ( sort keys %registeredTrees ){
  1091. next if !exists $partshash{$key};
  1092. # print "now adding $registeredTrees{$key} from $key\n";
  1093. $count+=$registeredTrees{$key};
  1094. }
  1095. # print "$fields[0] : $count\n";
  1096. }
  1097. my $rootdir = $dir;
  1098. $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
  1099. chdir $rootdir;
  1100. remove_tree($dir);
  1101. #--------------------------------------------------------------------------------------------------------
  1102. #--------------------------------------------------------------------------------------------------------
  1103. #--------------------------------------------------------------------------------------------------------
  1104. #--------------------------------------------------------------------------------------------------------
  1105. #--------------------------------------------------------------------------------------------------------
  1106. sub largest_number{
  1107. my $counter = 0;
  1108. my($max) = shift(@_);
  1109. foreach my $temp (@_) {
  1110. #print "finding largest array: $maxcounter \n";
  1111. if($temp > $max){
  1112. $max = $temp;
  1113. }
  1114. }
  1115. return($max);
  1116. }
  1117. sub smallest_number{
  1118. my $counter = 0;
  1119. my($min) = shift(@_);
  1120. foreach my $temp (@_) {
  1121. #print "finding largest array: $maxcounter \n";
  1122. if($temp < $min){
  1123. $min = $temp;
  1124. }
  1125. }
  1126. return($min);
  1127. }
  1128. #--------------------------------------------------------------------------------------------------------
  1129. #--------------------------------------------------------------------------------------------------------
  1130. #--------------------------------------------------------------------------------------------------------
  1131. #--------------------------------------------------------------------------------------------------------
  1132. #--------------------------------------------------------------------------------------------------------
  1133. sub baseml_parser{
  1134. my $outputfile = $_[0];
  1135. open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!";
  1136. my @info = ();
  1137. my @branchields = ();
  1138. my @distanceields = ();
  1139. my @bout = <BOUT>;
  1140. #print colored ['red'], @bout ,"\n";
  1141. for my $b (0 ... $#bout){
  1142. my $bine=$bout[$b];
  1143. #print colored ['yellow'], "sentence = ",$bine;
  1144. if ($bine =~ /TREE/){
  1145. $bine=$bout[$b++];
  1146. $bine=$bout[$b++];
  1147. $bine=$bout[$b++];
  1148. #print "FOUND",$bine;
  1149. chomp $bine;
  1150. $bine =~ s/^\s+//g;
  1151. @branchields = split(/\s+/,$bine);
  1152. $bine=$bout[$b++];
  1153. chomp $bine;
  1154. $bine =~ s/^\s+//g;
  1155. @distanceields = split(/\s+/,$bine);
  1156. #print "LASTING..............\n";
  1157. last;
  1158. }
  1159. else{
  1160. }
  1161. }
  1162. close BOUT;
  1163. # print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1;
  1164. my %distance_hash=();
  1165. for my $d (0 ... $#branchields){
  1166. $distance_hash{$branchields[$d]} = $distanceields[$d];
  1167. }
  1168. $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"};
  1169. $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"};
  1170. $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"};
  1171. $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"};
  1172. # print "\nsending back: @info\n" if $printer == 1;
  1173. return join("\t",@info);
  1174. }
  1175. #--------------------------------------------------------------------------------------------------------
  1176. #--------------------------------------------------------------------------------------------------------
  1177. #--------------------------------------------------------------------------------------------------------
  1178. #--------------------------------------------------------------------------------------------------------
  1179. #--------------------------------------------------------------------------------------------------------
  1180. sub test_complexity{
  1181. my $printer = 0;
  1182. my $sequence = $_[0];
  1183. #print "sequence = $sequence\n";
  1184. my $COMPLEXITY_SUPPORT = $_[1];
  1185. my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY.
  1186. #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON
  1187. #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY
  1188. #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF
  1189. #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER
  1190. #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY
  1191. #7 THE USER.
  1192. #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU
  1193. my $As = ($sequence=~ s/A/A/gi);
  1194. my $Ts = ($sequence=~ s/T/T/gi);
  1195. my $Gs = ($sequence=~ s/G/G/gi);
  1196. my $Cs = ($sequence=~ s/C/C/gi);
  1197. my $dashes = ($sequence=~ s/\-/-/gi);
  1198. $dashes = 0 if $sequence !~ /\-/;
  1199. # print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs, dashes=$dashes\n";
  1200. return "LOW" if $dashes > length($sequence)/2;
  1201. my $ans = ();
  1202. return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity;
  1203. my @nts = ("A","T","G","C","-");
  1204. my $lowcomplex = 0;
  1205. foreach my $nt (@nts){
  1206. $lowcomplex =1 if $sequence =~ /(($nt\-*){$mono_flanksimplicityRepno,})/i;
  1207. $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){$di_flanksimplicityRepno,})/i;
  1208. $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){$di_flanksimplicityRepno,})/i;
  1209. my $nont = ($sequence=~ s/$nt/$nt/gi);
  1210. $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoAT) && ($nt =~ /[AT\-]/);
  1211. $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoCG) && ($nt =~ /[CG]/);
  1212. }
  1213. # print "leaving for now.. $sequence\n" if $printer == 1 && $lowcomplex == 0;
  1214. #<STDIN>;
  1215. return "HIGH" if $lowcomplex == 0;
  1216. return "LOW" ;
  1217. }
  1218. #--------------------------------------------------------------------------------------------------------
  1219. #--------------------------------------------------------------------------------------------------------
  1220. #--------------------------------------------------------------------------------------------------------
  1221. #--------------------------------------------------------------------------------------------------------
  1222. #--------------------------------------------------------------------------------------------------------
  1223. sub sequence_similarity{
  1224. my $printer = 0;
  1225. my @seq1 = split(/\s*/, $_[0]);
  1226. my @seq2 = split(/\s*/, $_[1]);
  1227. my $similarity_thresh = $_[2];
  1228. my $info = $_[3];
  1229. # print "input = @_\n" if $printer == 1;
  1230. my $seq1str = $_[0];
  1231. my $seq2str = $_[1];
  1232. $seq1str=~s/\-//g; $seq2str=~s/\-//g;
  1233. my $similarity=0;
  1234. my $nucdiffs=0;
  1235. my $nucsims=0;
  1236. my $indeldiffs=0;
  1237. for my $i (0...$#seq1){
  1238. $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ;
  1239. $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i);
  1240. $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i);
  1241. $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i;
  1242. }
  1243. my $sim = $similarity/length($_[0]);
  1244. return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh;
  1245. }
  1246. #--------------------------------------------------------------------------------------------------------
  1247. #--------------------------------------------------------------------------------------------------------
  1248. #--------------------------------------------------------------------------------------------------------
  1249. sub load_treesToReject{
  1250. my @rejectlist = ();
  1251. my $alltags = join("",@_);
  1252. @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM";
  1253. @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR";
  1254. @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO";
  1255. %treesToReject=();
  1256. $treesToReject{$_} = $_ foreach (@rejectlist);
  1257. #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>;
  1258. }
  1259. #--------------------------------------------------------------------------------------------------------
  1260. sub load_treesToIgnore{
  1261. my @rejectlist = ();
  1262. my $alltags = join("",@_);
  1263. @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM";
  1264. @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR";
  1265. @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO";
  1266. %treesToIgnore=();
  1267. $treesToIgnore{$_} = $_ foreach (@rejectlist);
  1268. #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist);
  1269. }
  1270. #--------------------------------------------------------------------------------------------------------
  1271. sub load_thresholds{
  1272. my @threshold_array=split(/[,_]/,$_[0]);
  1273. unshift @threshold_array, "0";
  1274. for my $size (1 ... 4){
  1275. $thresholdhash{$size}=$threshold_array[$size];
  1276. }
  1277. }
  1278. #--------------------------------------------------------------------------------------------------------
  1279. sub load_allPossibleTrees{
  1280. #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE
  1281. #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M
  1282. #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET
  1283. #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE
  1284. #5 EVOLUTION OF EACH LOCUS. WE WILL THEN DISCARD SOME
  1285. #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON
  1286. #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS
  1287. #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF
  1288. #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER
  1289. #10 SPECIES IN THE TREE.
  1290. my $tree_list = $_[0];
  1291. # print "file to be loaded: $tree_list\n";
  1292. my @trarr = ();
  1293. @trarr = ("#H C O CONCLUSION ALTERNATE",
  1294. "+ + + +HCO NA",
  1295. "+ _ _ +H NA",
  1296. "_ + _ +C NA",
  1297. "_ _ + -HC|+O NA",
  1298. "+ _ + -C +H",
  1299. "_ + + -H +C",
  1300. "+ + _ +HC|-O NA",
  1301. "_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/;
  1302. @trarr = ("#H C O R CONCLUSION ALTERNATE",
  1303. "_ _ _ _ -HCOR NA",
  1304. "+ + + + +HCOR NA",
  1305. "+ + + _ +HCO|-R +H.+C.+O",
  1306. "+ + _ _ +HC +H.+C;-O",
  1307. "+ _ _ _ +H +HC,-C;+HC,-C",
  1308. "_ + _ _ +C +HC,-H;+HC,-H",
  1309. "_ _ + _ +O -HC|-H.-C",
  1310. "_ _ + + -HC -H.-C",
  1311. "+ _ _ + +H|-C.-O +HC,-C",
  1312. "_ + _ + +C -H.-O",
  1313. "_ + + _ -H +C.+O",
  1314. "_ _ _ + -HCO|+R NA",
  1315. "+ _ + _ +H.+O|-C NA",
  1316. "_ + + + -H -HC,+C",
  1317. "+ _ + + -C -HC,+H",
  1318. "+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/;
  1319. @trarr = ("#H C O R M CONCLUSION ALTERNATE",
  1320. "+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)",
  1321. "+ _ + + + -C -HC,+H;+HCO,(+H.+O)",
  1322. "_ + + + + -H -HC,+C;-HCO,(+C.+O)",
  1323. "_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)",
  1324. "_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)",
  1325. "+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)",
  1326. "+ + + _ _ +HCO +H.+C.+O",
  1327. "_ _ _ + + -HCO -HC.-O;-H.-C.-O",
  1328. "+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)",
  1329. "_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)",
  1330. "_ + + _ _ +HCO,-H|+O.+C NA",
  1331. "+ _ + _ _ +HCO,-C|+O.+H NA",
  1332. "_ _ + + + -HC -H.-C|-HCO,+O",
  1333. "+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)",
  1334. "+ + + + + +HCORM NA",
  1335. "_ _ + _ + DISCARD +O;+HCO,-HC;+HCO,(-H.-C)",
  1336. "_ + _ _ + +C +HC,-H;+HCO,(-H.-O)",
  1337. "+ _ _ _ + +H +HC,-C;+HCO,(-C.-O)",
  1338. "+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)",
  1339. "+ _ + _ + DISCARD -R.-C|+HCO,-C|+H.+O NA",
  1340. "_ + + _ + DISCARD -R.-H|+HCO,-H|+C.+O NA",
  1341. "_ _ _ _ + DISCARD -HCOR NA",
  1342. "_ _ _ + _ DISCARD +R;-HC.-O;-H.-C.-O",
  1343. "+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)",
  1344. "+ + + + _ +HCOR NA",
  1345. "+ + + _ + DISCARD -R;+HCO;+HC.+O;+H.+C.+O",
  1346. "+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)",
  1347. "_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)",
  1348. "_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC",
  1349. "_ + _ + _ +C +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)",
  1350. "+ _ _ + _ +H +R.+H|-C.-O +HCO,(-C.-O)"
  1351. ) if $tree_list =~ /_HCORM\.txt/;
  1352. my $template_p = $_[1];
  1353. my $alternate_p = $_[2];
  1354. #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE
  1355. #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY
  1356. #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS
  1357. #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT
  1358. #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS
  1359. #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M".
  1360. #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE
  1361. #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE
  1362. #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG
  1363. #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED
  1364. #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES
  1365. #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS
  1366. #13 MENTIONED ABOVE.
  1367. my @keys_array=();
  1368. foreach my $line (@trarr){
  1369. # print $line,"\n";
  1370. next if $line =~ /^#/;
  1371. chomp $line;
  1372. my @fields = split("\t", $line);
  1373. push @keys_array, $fields[0];
  1374. # print "loading: $fields[0]\n";
  1375. $template_p->{$fields[0]}[0] = $fields[1];
  1376. $template_p->{$fields[0]}[1] = 0;
  1377. $alternate_p->{$fields[0]} = $fields[2];
  1378. # $alternate_p->{$fields[1]} = $fields[2];
  1379. # print "loading alternate_p $fields[1] $fields[2]\n"; #<STDIN> if $fields[1] eq "+H";
  1380. }
  1381. # print "loaded the trees with keys: @keys_array\n";
  1382. return $template_p, \@keys_array, $alternate_p;
  1383. }
  1384. #--------------------------------------------------------------------------------------------------------
  1385. #--------------------------------------------------------------------------------------------------------
  1386. #--------------------------------------------------------------------------------------------------------
  1387. #--------------------------------------------------------------------------------------------------------
  1388. #--------------------------------------------------------------------------------------------------------
  1389. sub checkCleanCase{
  1390. my $printer = 0;
  1391. my $tree = $_[0];
  1392. my $finalalignment = $_[1];
  1393. #print "IN checkCleanCase: @_\n";
  1394. #<STDIN>;
  1395. my @indivspecies = $tree =~ /[A-Z]/g;
  1396. $finalalignment =~ s/\./_/g;
  1397. my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g;
  1398. my $unclean = 0;
  1399. foreach my $sp (@indivspecies){
  1400. foreach my $cap (@captured){
  1401. $cap =~ s/:[A-Za-z\-]+//g;
  1402. my @sps = $cap =~ /[A-Z]+/g;
  1403. my $spsc = join("", @sps);
  1404. # print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1;
  1405. if ($spsc =~ /$sp/){
  1406. # print "foind : $sp\n";
  1407. $unclean = 1; last;
  1408. }
  1409. }
  1410. last if $unclean == 1;
  1411. }
  1412. #<STDIN>;
  1413. return "CLEAN" if $unclean == 0;
  1414. return "UNCLEAN";
  1415. }
  1416. #--------------------------------------------------------------------------------------------------------
  1417. #--------------------------------------------------------------------------------------------------------
  1418. #--------------------------------------------------------------------------------------------------------
  1419. #--------------------------------------------------------------------------------------------------------
  1420. #--------------------------------------------------------------------------------------------------------
  1421. sub adjustCoordinates{
  1422. my $line = $_[0];
  1423. return 0 if !defined $line;
  1424. #print "------x------x------x------x------x------x------x------x------\n";
  1425. #print $line,"\n\n";
  1426. my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig;
  1427. #print $line,"\n";
  1428. #print "------x------x------x------x------x------x------x------x------\n\n\n";
  1429. # my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g);
  1430. # print "line = $line\n";
  1431. $infocord = 2 + (4*$no_of_species) - 1;
  1432. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  1433. $motifcord = 2 + (4*$no_of_species) + 2 - 1;
  1434. $gapcord = $motifcord+1;
  1435. $startcord = $gapcord+1;
  1436. $strandcord = $startcord+1;
  1437. $endcord = $strandcord + 1;
  1438. $microsatcord = $endcord + 1;
  1439. $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  1440. $interr_poscord = $microsatcord + 3;
  1441. $no_of_interruptionscord = $microsatcord + 4;
  1442. $interrcord = $microsatcord + 2;
  1443. # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i;
  1444. return $no_of_species;
  1445. }
  1446. sub printhash{
  1447. my $alivehash = $_[0];
  1448. my @tags = @$_[1];
  1449. # print "print hash\n";
  1450. foreach my $tag (@tags){
  1451. # print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag};
  1452. }
  1453. return "\n"
  1454. }
  1455. sub peel_onion{
  1456. my $printer = 0;
  1457. # print "received: @_\n" ; #<STDIN>;
  1458. $printer = 0;
  1459. my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_;
  1460. # print "in peel onion.. tree = $tree \n" if $printer == 1;
  1461. my %sequence_hash=();
  1462. # for my $i (0 ... $#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; }
  1463. my %node_sequences=();
  1464. my %node_alignments = (); #NEW, Nov 28 2008
  1465. my @tags=();
  1466. my @locus_sequences=();
  1467. my %alivehash=();
  1468. foreach my $tag (@$tagarray) {
  1469. #print "adding: $tag\n";
  1470. push(@tags, $tag);
  1471. $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL";
  1472. $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL";
  1473. $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL";
  1474. $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ;
  1475. push @locus_sequences, $node_sequences{$tag};
  1476. # print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n";
  1477. }
  1478. #<STDIN>;
  1479. my ($nodes_arr, $branches_hash) = get_nodes($tree);
  1480. my @nodes=@$nodes_arr;
  1481. # print "recieved nodes = " if $printer == 1;
  1482. # foreach my $key (@nodes) {print "@$key " if $printer == 1;}
  1483. # print "\n" if $printer == 1;
  1484. #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS
  1485. foreach my $keys (@nodes){
  1486. my @pair = @$keys;
  1487. my $joint = "(".join(", ",@pair).")";
  1488. my $copykey = join "", @pair;
  1489. $copykey =~ s/[\W ]+//g;
  1490. # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1;
  1491. my $livestatus = 1;
  1492. foreach my $copy (split(/\s*/,$copykey)){
  1493. $livestatus = 0 if !exists $alivehash{$copy};
  1494. }
  1495. $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1;
  1496. # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1;
  1497. }
  1498. @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT.
  1499. my @mutations_array=();
  1500. my $joint = ();
  1501. foreach my $node (@nodes){
  1502. my @pair = @$node;
  1503. # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
  1504. $joint = "(".join(", ",@pair).")";
  1505. my @pair_sequences=();
  1506. foreach my $tag (@pair){
  1507. # print "$tag: $node_alignments{$tag}\n" if $printer == 1;
  1508. # print $node_alignments{$tag},"\n" if $printer == 1;
  1509. push @pair_sequences, $node_alignments{$tag};
  1510. }
  1511. # print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1;
  1512. my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
  1513. $node_alignments{$joint}=$compared;
  1514. push( @mutations_array,split(/:/,$substitutions_list));
  1515. # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1;
  1516. }
  1517. my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
  1518. return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0;
  1519. return ("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0;
  1520. }
  1521. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1522. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1523. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1524. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1525. sub get_nodes{
  1526. my $printer = 0;
  1527. my $tree=$_[0];
  1528. #$tree =~ s/ +//g;
  1529. $tree =~ s/\t+//g;
  1530. $tree=~s/;//g;
  1531. # print "tree=$tree\n" if $printer == 1;
  1532. my @nodes = ();
  1533. my @onions=($tree);
  1534. my %branches=();
  1535. foreach my $bite (@onions){
  1536. $bite=~ s/^\(|\)$//g;
  1537. chomp $bite;
  1538. # print "tree = $bite \n";
  1539. # <STDIN>;
  1540. $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/;
  1541. #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/;
  1542. my @raw_nodes = ($1, $2);
  1543. # print "raw nodes = $1 and $2\n" if $printer == 1;
  1544. push(@nodes, [@raw_nodes]);
  1545. foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;}
  1546. foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; }
  1547. # print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1;
  1548. }
  1549. $printer = 0;
  1550. return \@nodes, \%branches;
  1551. }
  1552. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1553. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1554. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1555. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1556. sub analyze_mutations{
  1557. my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_;
  1558. my $locuslength = length($alignment->{$tags->[0]});
  1559. my $printer = 0;
  1560. # print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1;
  1561. my %mutation_hash=();
  1562. my %froms_megahash=();
  1563. my %tos_megahash=();
  1564. my %position_hash=();
  1565. my @solutions_array=();
  1566. foreach my $mutation (@$mutations_array){
  1567. # print "loadin mutation: $mutation\n" if $printer == 1;
  1568. my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g;
  1569. $mutation_hash{$localhash{"position"}} = {%localhash};
  1570. push @{$position_hash{$localhash{"position"}}},$localhash{"node"};
  1571. # print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1;
  1572. $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"};
  1573. $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"};
  1574. # print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1;
  1575. # print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1;
  1576. # print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1;
  1577. }
  1578. # print "now going through each position in loculength:\n" if $printer == 1;
  1579. ## <STDIN> if $printer == 1;
  1580. for my $pos (0 ... $locuslength-1){
  1581. # print "at position: $pos\n" if $printer == 1;
  1582. if (exists($mutation_hash{$pos})){
  1583. my @local_nodes=@{$position_hash{$pos}};
  1584. # print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1;
  1585. foreach my $local_node (@local_nodes){
  1586. # print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1;
  1587. my $open_insertion=();
  1588. my $open_deletion=();
  1589. my $open_to_substitution=();
  1590. my $open_from_substitution=();
  1591. if ($froms_megahash{$pos}{$local_node} eq "-"){
  1592. # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
  1593. # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1;
  1594. #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
  1595. $open_insertion=$tos_megahash{$pos}{$local_node};
  1596. for my $posnext ($pos+1 ... $locuslength-1){
  1597. # print "in first if .... studying posnext: $posnext\n" if $printer == 1;
  1598. last if !exists ($froms_megahash{$posnext}{$local_node});
  1599. # print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1;
  1600. $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-";
  1601. # print "now open_insertion=$open_insertion\n" if $printer == 1;
  1602. delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-";
  1603. }
  1604. # print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1;
  1605. push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="));
  1606. }
  1607. elsif ($tos_megahash{$pos}{$local_node} eq "-"){
  1608. # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
  1609. # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
  1610. #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
  1611. $open_deletion=$froms_megahash{$pos}{$local_node};
  1612. for my $posnext ($pos+1 ... $locuslength-1){
  1613. # print "in 1st elsif studying posnext: $posnext\n" if $printer == 1;
  1614. # print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1;
  1615. last if !exists ($tos_megahash{$posnext}{$local_node});
  1616. # print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1;
  1617. $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-";
  1618. delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-";
  1619. }
  1620. # print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1;
  1621. push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"));
  1622. }
  1623. elsif ($tos_megahash{$pos}{$local_node} ne "-"){
  1624. # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
  1625. # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
  1626. #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
  1627. # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1;
  1628. next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node};
  1629. $open_to_substitution=$tos_megahash{$pos}{$local_node};
  1630. $open_from_substitution=$froms_megahash{$pos}{$local_node};
  1631. # print "open from substitution: $open_from_substitution \n" if $printer == 1;
  1632. for my $posnext ($pos+1 ... $locuslength-1){
  1633. #print "in last elsif studying posnext: $posnext\n";
  1634. last if !exists ($tos_megahash{$posnext}{$local_node});
  1635. # print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1;
  1636. $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-";
  1637. $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-";
  1638. delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ;
  1639. }
  1640. # print "open from substitution: $open_from_substitution \n" if $printer == 1;
  1641. #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE?
  1642. my @motif_parts=split(/\s*/,$motif);
  1643. #GENERATING THE FLEXIBLE LEFT END
  1644. my $left_query=();
  1645. for my $k (1 ... $#motif_parts) {
  1646. $left_query= $motif_parts[$k]."|)";
  1647. $left_query="(".$left_query;
  1648. }
  1649. $left_query=$left_query."?";
  1650. # print "left_quewry = $left_query\n" if $printer == 1;
  1651. #GENERATING THE FLEXIBLE RIGHT END
  1652. my $right_query=();
  1653. for my $k (0 ... ($#motif_parts-1)) {
  1654. $right_query= "(|".$motif_parts[$k];
  1655. $right_query=$right_query.")";
  1656. }
  1657. $right_query=$right_query."?";
  1658. # print "right_query = $right_query\n" if $printer == 1;
  1659. # print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1;
  1660. my $motifcomb=$motif x 50;
  1661. # print "motifcomb = $motifcomb\n" if $printer == 1;
  1662. if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){
  1663. # print "sequence microsat-like\n" if $printer == 1;
  1664. my $all_microsat_like = 0;
  1665. # print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1;
  1666. push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"));
  1667. # print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1;
  1668. push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="));
  1669. }
  1670. else{
  1671. # print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1;
  1672. push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="));
  1673. }
  1674. #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE?
  1675. }
  1676. #<STDIN> if $printer ==1;
  1677. }
  1678. #<STDIN> if $printer ==1;
  1679. }
  1680. }
  1681. # print "\n", "#" x 50, "\n" if $printer == 1;
  1682. foreach my $tag (@$tags){
  1683. # print "$tag: $alignment->{$tag}\n" if $printer == 1;
  1684. }
  1685. # print "\n", "#" x 50, "\n" if $printer == 1;
  1686. # print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1;
  1687. #print "end\n";
  1688. #<STDIN> if
  1689. return \@solutions_array;
  1690. }
  1691. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1692. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1693. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1694. #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
  1695. sub base_by_base_simple{
  1696. my $printer = 0;
  1697. my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_;
  1698. my @seq_array=();
  1699. # print "IN SUBROUTUNE base_by_base_simple.. information received = @_\n" if $printer == 1;
  1700. # print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1;
  1701. my @example=split(/\./,$locus->[0]);
  1702. # print "example, for length = @example\n" if $printer == 1;
  1703. for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); }
  1704. my @compared_sequence=();
  1705. my @substitutions_list;
  1706. for my $i (0...scalar(@example)-1){
  1707. #print "i = $i\n" if $printer == 1;
  1708. #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1;
  1709. if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){
  1710. my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" );
  1711. # print "ancestral = $resolution\n" if $printer == 1;
  1712. if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){
  1713. push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution );
  1714. }
  1715. elsif ( $resolution !~ /!/){
  1716. push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution);
  1717. }
  1718. push @compared_sequence,$resolution;
  1719. }
  1720. elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){
  1721. my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" );
  1722. # print "ancestral = $resolution\n" if $printer == 1;
  1723. if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){
  1724. push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution);
  1725. }
  1726. elsif ( $resolution !~ /!/){
  1727. push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution);
  1728. }
  1729. push @compared_sequence,$resolution;
  1730. }
  1731. elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){
  1732. push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint );
  1733. }
  1734. else{
  1735. if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){
  1736. push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i];
  1737. }
  1738. else{
  1739. # print "perfect match\n" if $printer == 1;
  1740. push @compared_sequence, $seq_array[0][$i];
  1741. }
  1742. }
  1743. }
  1744. # print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1;
  1745. return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0;
  1746. return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0;
  1747. }
  1748. sub resolve_base{
  1749. my $printer = 0;
  1750. # print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1;
  1751. my ($optional, $single, $singlesp, $arg) = @_;
  1752. my @options=split(/!/,$optional);
  1753. foreach my $option(@options) {
  1754. $option=~s/[A-Z\(\) ,]+://g;
  1755. if ($option =~ /$single/i){
  1756. # print "option = $option , returning single: $single\n" if $printer == 1;
  1757. return $single;
  1758. }
  1759. }
  1760. # print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1;
  1761. # print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1;
  1762. return $optional."!".$singlesp.":".$single if $arg eq "keep";
  1763. return $singlesp.":".$single."!".$optional if $arg eq "invert";
  1764. }
  1765. sub same_length{
  1766. my $printer = 0;
  1767. my @locus = @_;
  1768. my $temp = shift @locus;
  1769. $temp=~s/-|,//g;
  1770. foreach my $l (@locus){
  1771. $l=~s/-|,//g;
  1772. return 0 if length($l) != length($temp);
  1773. $temp = $l;
  1774. }
  1775. return 1;
  1776. }
  1777. sub treeStudy{
  1778. my $printer = 1;
  1779. # print "template DEFINED.. received: @_\n" if defined %template;
  1780. # print "only received = @_" if !defined %template;
  1781. my $stopper = 0;
  1782. # if (!defined %template){ TEMP MASKED OCT 18 2012
  1783. $stopper = 1;
  1784. %template=();
  1785. # print "tree decipherer = $tree_decipherer\n" if $printer == 1;
  1786. my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template);
  1787. # print "return = $template_ref and @{$keys_array}\n" if $printer == 1;
  1788. foreach my $key (@$keys_array){
  1789. # print "addding : $template_ref->{$key} for $key\n" if $printer == 1;
  1790. $template{$key} = $template_ref->{$key};
  1791. }
  1792. # } TEMP MASK OCT 18 2012 END
  1793. # <STDIN>;
  1794. for my $templet ( keys %template ) {
  1795. # print "$templet => @{$template{$templet}}\n";
  1796. }
  1797. # <STDIN> if !defined %template;
  1798. my $strict = 0;
  1799. my $H = 0;
  1800. my $Hchr = 1;
  1801. my $Hstart = 2;
  1802. my $Hend = 3;
  1803. my $Hmotif = 4;
  1804. my $Hmotiflen = 5;
  1805. my $Hmicro = 6;
  1806. my $Hstrand = 7;
  1807. my $Hmicrolen = 8;
  1808. my $Hinterpos = 9;
  1809. my $Hrelativepos = 10;
  1810. my $Hinter = 11;
  1811. my $Hinterlen = 12;
  1812. my $C = 13;
  1813. my $Cchr = 14;
  1814. my $Cstart = 15;
  1815. my $Cend = 16;
  1816. my $Cmotif = 17;
  1817. my $Cmotiflen = 18;
  1818. my $Cmicro = 19;
  1819. my $Cstrand = 20;
  1820. my $Cmicrolen = 21;
  1821. my $Cinterpos = 22;
  1822. my $Crelativepos = 23;
  1823. my $Cinter = 24;
  1824. my $Cinterlen = 25;
  1825. my $O = 26;
  1826. my $Ochr = 27;
  1827. my $Ostart = 28;
  1828. my $Oend = 29;
  1829. my $Omotif = 30;
  1830. my $Omotiflen = 31;
  1831. my $Omicro = 32;
  1832. my $Ostrand = 33;
  1833. my $Omicrolen = 34;
  1834. my $Ointerpos = 35;
  1835. my $Orelativepos = 36;
  1836. my $Ointer = 37;
  1837. my $Ointerlen = 38;
  1838. my $R = 39;
  1839. my $Rchr = 40;
  1840. my $Rstart = 41;
  1841. my $Rend = 42;
  1842. my $Rmotif = 43;
  1843. my $Rmotiflen = 44;
  1844. my $Rmicro = 45;
  1845. my $Rstrand = 46;
  1846. my $Rmicrolen = 47;
  1847. my $Rinterpos = 48;
  1848. my $Rrelativepos = 49;
  1849. my $Rinter = 50;
  1850. my $Rinterlen = 51;
  1851. my $Mchr = 52;
  1852. my $Mstart = 53;
  1853. my $Mend = 54;
  1854. my $M = 55;
  1855. my $Mmotif = 56;
  1856. my $Mmotiflen = 57;
  1857. my $Mmicro = 58;
  1858. my $Mstrand = 59;
  1859. my $Mmicrolen = 60;
  1860. my $Minterpos = 61;
  1861. my $Mrelativepos = 62;
  1862. my $Minter = 63;
  1863. my $Minterlen = 64;
  1864. #-------------------------------------------------------------------------------#
  1865. my @analysis=();
  1866. my %speciesOrder = ();
  1867. $speciesOrder{"H"} = 0;
  1868. $speciesOrder{"C"} = 1;
  1869. $speciesOrder{"O"} = 2;
  1870. $speciesOrder{"R"} = 3;
  1871. $speciesOrder{"M"} = 4;
  1872. #-------------------------------------------------------------------------------#
  1873. my $line = $_[0];
  1874. chomp $line;
  1875. my @f = split(/\t/,$line);
  1876. # print "received array : @f.. recieved tags = @tags\n" if $printer == 1;
  1877. # collect all motifs
  1878. my @motifs=();
  1879. @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/;
  1880. @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/;
  1881. @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/;
  1882. # print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;;
  1883. # print "motifs = @motifs\n" if $printer == 1;
  1884. my @translation = ();
  1885. foreach my $motif (@motifs){
  1886. push(@translation, "_") if $motif eq "NA";
  1887. push(@translation, "+") if $motif ne "NA";
  1888. }
  1889. my $translate = join(" ", @translation);
  1890. # print "translate = >$translate< and analysis = $template{$translate}[0].. on the other hand, ",$template{"- - +"}[0],"\n";
  1891. my @analyses = split(/\|/,$template{$translate}[0]);
  1892. # print "motifs = @motifs, analyses = @analyses\n" if $printer == 1;
  1893. if (scalar(@analyses) == 1) {
  1894. #print "analysis = $analyses[0]\n";
  1895. if ($analyses[0] !~ /,|\./ ){
  1896. if ($analyses[0] =~ /\+/){
  1897. my $analysis = $analyses[0];
  1898. $analysis =~ s/\+|\-//g;
  1899. my @species = split(/\s*/,$analysis);
  1900. my @currentMotifs = ();
  1901. foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;
  1902. }
  1903. # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
  1904. $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL";
  1905. $template{$translate}[1]++ if $strict == 0;
  1906. # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
  1907. }
  1908. else{
  1909. my $analysis = $analyses[0];
  1910. $analysis =~ s/\+|\-//g;
  1911. my @species = split(/\s*/,$analysis);
  1912. my @currentMotifs = ();
  1913. my @complementarySpecies = ();
  1914. my $allSpecies = join("",@tags);
  1915. foreach my $specie (@species){ $allSpecies =~ s/$specie//g; }
  1916. foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;;
  1917. }
  1918. # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
  1919. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
  1920. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
  1921. # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
  1922. }
  1923. }
  1924. elsif ($analyses[0] =~ /,/) {
  1925. my @events = split(/,/,$analyses[0]);
  1926. # print "events = @events \n " if $printer == 1;
  1927. if ($events[0] =~ /\+/){
  1928. my $analysis1 = $events[0];
  1929. $analysis1 =~ s/\+|\-//g;
  1930. my $analysis2 = $events[1];
  1931. $analysis2 =~ s/\+|\-//g;
  1932. my @nSpecies = split(/\s*/,$analysis2);
  1933. # print "original anslysis = $analysis1 " if $printer == 1;
  1934. foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;}
  1935. # print "processed anslysis = $analysis1 \n" if $printer == 1;
  1936. my @currentMotifs = ();
  1937. foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
  1938. # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
  1939. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
  1940. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
  1941. # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
  1942. }
  1943. else{
  1944. my $analysis1 = $events[0];
  1945. $analysis1 =~ s/\+|\-//g;
  1946. my $analysis2 = $events[1];
  1947. $analysis2 =~ s/\+|\-//g;
  1948. my @pSpecies = split(/\s*/,$analysis2);
  1949. my @currentMotifs = ();
  1950. foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
  1951. # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
  1952. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
  1953. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
  1954. # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
  1955. }
  1956. }
  1957. elsif ($analyses[0] =~ /\./) {
  1958. my @events = split(/\./,$analyses[0]);
  1959. foreach my $event (@events){
  1960. # print "event = $event \n" if $printer == 1;
  1961. if ($event =~ /\+/){
  1962. my $analysis = $event;
  1963. $analysis =~ s/\+|\-//g;
  1964. my @species = split(/\s*/,$analysis);
  1965. my @currentMotifs = ();
  1966. foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
  1967. #print consistency(@currentMotifs),"<- \n";
  1968. # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
  1969. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
  1970. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
  1971. # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
  1972. }
  1973. else{
  1974. my $analysis = $event;
  1975. $analysis =~ s/\+|\-//g;
  1976. my @species = split(/\s*/,$analysis);
  1977. my @currentMotifs = ();
  1978. my @complementarySpecies = ();
  1979. my $allSpecies = join("",@tags);
  1980. foreach my $specie (@species){ $allSpecies =~ s/$specie//g; }
  1981. foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
  1982. #print consistency(@currentMotifs),"<- \n";
  1983. # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
  1984. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
  1985. $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
  1986. # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
  1987. }
  1988. }
  1989. }
  1990. }
  1991. else{
  1992. my $finalanalysis = ();
  1993. $template{$translate}[1]++;
  1994. foreach my $analysis (@analyses){ ;}
  1995. }
  1996. # test if motifs where microsats are present, as indeed of same the motif composition
  1997. for my $templet ( keys %template ) {
  1998. if (@{ $template{$templet} }[1] > 0){
  1999. $template{$templet}[1] = 0;
  2000. # print "now returning: @{$template{$templet}}[0], $templet\n";
  2001. return (@{$template{$templet}}[0], $templet);
  2002. }
  2003. }
  2004. undef %template;
  2005. # print "sending NULL\n" if $printer == 1;
  2006. return ("NULL", "NULL");
  2007. }
  2008. sub consistency{
  2009. my @motifs = @_;
  2010. # print "in consistency \n" if $printer == 1;
  2011. # print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1;
  2012. return $motifs[0] if scalar(@motifs) == 1;
  2013. my $prevmotif = shift(@motifs);
  2014. my $stopper = 0;
  2015. for my $i (0 ... $#motifs){
  2016. next if $motifs[$i] eq "NA";
  2017. my $templet = $motifs[$i].$motifs[$i];
  2018. if ($templet !~ /$prevmotif/i){
  2019. $stopper = 1; last;
  2020. }
  2021. }
  2022. return $prevmotif if $stopper == 0;
  2023. return "NULL" if $stopper == 1;
  2024. }
  2025. sub summarize_microsat{
  2026. my $printer = 1;
  2027. my $line = $_[0];
  2028. my $humseq = $_[1];
  2029. my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g;
  2030. my @starts = $line =~ /[0-9]+\t[\+\-]/g;
  2031. my @ends = $line =~ /[\+\-]\t[0-9]+/g;
  2032. # print "starts = @starts\tends = @ends\n" if $printer == 1;
  2033. for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;}
  2034. for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;}
  2035. for my $i (0 ... $#ends) {$ends[$i] =~ s/[\+\-]\t//g;}
  2036. my $minstart = array_smallest_number(@starts);
  2037. my $maxend = array_largest_number(@ends);
  2038. my $humupstream_st = substr($humseq, 0, $minstart);
  2039. my $humupstream_en = substr($humseq, 0, $maxend);
  2040. my $no_of_gaps_to_start = 0;
  2041. my $no_of_gaps_to_end = 0;
  2042. $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/;
  2043. $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/;
  2044. my $locusmotif = ();
  2045. # print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1;
  2046. #return "NULL" if $line =~ /compound/;
  2047. my $Hstart = "NA";
  2048. my $Hend = "NA";
  2049. chomp $line;
  2050. my $match_count = ($line =~ s/>/>/g);
  2051. #print "number of species = $match_count\n";
  2052. my @micros = split(/>/,$line);
  2053. shift @micros;
  2054. my $stopper = 0;
  2055. foreach my $mic (@micros){
  2056. my @local = split(/\t/,$mic);
  2057. if ($local[$microsatcord] =~ /N/) {$stopper =1; last;}
  2058. }
  2059. return "NULL" if $stopper ==1;
  2060. #------------------------------------------------------
  2061. my @arranged = ();
  2062. for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';}
  2063. foreach my $micro (@micros){
  2064. for my $i (0 ... $#exacttags){
  2065. if ($micro =~ /^$exacttags[$i]/){
  2066. $arranged[$i] = $micro;
  2067. last;
  2068. }
  2069. }
  2070. }
  2071. # print "arranged = @arranged \n" ; <STDIN>;;
  2072. my @endstatement = ();
  2073. my $turn = 0;
  2074. my $species_counter = 0;
  2075. # print scalar(@arranged),"\n";
  2076. my $species_no=0;
  2077. my $orthHchr = 0;
  2078. foreach my $micro (@arranged) {
  2079. $micro =~ s/\t\t/\t \t/g;
  2080. $micro =~ s/\t,/\t ,/g;
  2081. $micro =~ s/,\t/, \t/g;
  2082. # print "------------------------------------------------------------------------------------------\n" if $printer == 1;
  2083. chomp $micro;
  2084. if ($micro eq '0'){
  2085. push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" ));
  2086. $species_counter++;
  2087. # print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1;
  2088. next;
  2089. }
  2090. # print $micro,"\n";
  2091. # print "micro = $micro \n" if $printer == 1;
  2092. my @fields = split(/\t/,$micro);
  2093. my $microcopy = $fields[$microsatcord];
  2094. $microcopy =~ s/\[|\]|-//g;
  2095. my $microsatlength = length($microcopy);
  2096. # print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1;
  2097. # print "sp_ident = @sp_ident.. species_no=$species_no\n";
  2098. $micro =~ /$sp_ident[$species_no]\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/;
  2099. # print "$micro =~ /$sp_ident[$species_no] ([0-9a-zA-Z_]+) ([0-9]+) ([0-9]+)/\n";
  2100. my $sp_chr=$1;
  2101. my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord];
  2102. my $sp_end= $sp_start + $microsatlength - 1;
  2103. $species_no++;
  2104. $micro =~ /$focalspec_orig\s(\S+)\s([0-9]+)\s([0-9]+)/;
  2105. $orthHchr=$1;
  2106. $Hstart=$2+$minstart-$no_of_gaps_to_start;
  2107. $Hend=$2+$maxend-$no_of_gaps_to_end;
  2108. # print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1;
  2109. my $motif = $fields[$motifcord];
  2110. my $firstmotif = ();
  2111. my $strand = $fields[$strandcord];
  2112. # print "strand = $strand\n";
  2113. if ($motif =~ /^\[/){
  2114. $motif =~ s/^\[//g;
  2115. $motif =~ /([a-zA-Z]+)\].*/;
  2116. $firstmotif = $1;
  2117. }
  2118. else {$firstmotif = $motif;}
  2119. # print "firstmotif =$firstmotif : \n" if $printer == 1;
  2120. $firstmotif = allCaps($firstmotif);
  2121. if (exists $revHash{$firstmotif} && $turn == 0) {
  2122. $turn=1 if $species_counter==0;
  2123. $firstmotif = $revHash{$firstmotif};
  2124. }
  2125. elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;}
  2126. # print "changed firstmotif =$firstmotif\n" if $printer == 1;
  2127. # <STDIN>;
  2128. $locusmotif = $firstmotif;
  2129. if (scalar(@fields) > $microsatcord + 2){
  2130. # print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1;
  2131. my @interposes = ();
  2132. @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/;
  2133. $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ;
  2134. # print "interposes=@interposes\n" if $printer == 1;
  2135. my @relativeposes = ();
  2136. my @interruptions = ();
  2137. @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/;
  2138. $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/;
  2139. my @interlens = ();
  2140. for my $i (0 ... $#interposes){
  2141. my $interpos = $interposes[$i];
  2142. my $nexter = 0;
  2143. my $interruption = $interruptions[$i];
  2144. my $interlen = length($interruption);
  2145. push (@interlens, $interlen);
  2146. my $relativepos = (100 * $interpos) / $microsatlength;
  2147. # print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1;
  2148. $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50;
  2149. # print "--> = $relativepos\n" if $printer == 1;
  2150. $interruption = "IND" if length($interruption) < 1;
  2151. if ($turn == 1){
  2152. $fields[$microsatcord] = switch_micro($fields[$microsatcord]);
  2153. $interruption = switch_nucl($interruption) unless $interruption eq "IND";
  2154. $interpos = ($microsatlength - $interpos) - $interlen + 2;
  2155. # print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1;
  2156. $relativepos = (100 * $interpos) / $microsatlength;
  2157. $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50;
  2158. $strand = '+' if $strand eq '-';
  2159. $strand = '-' if $strand eq '+';
  2160. }
  2161. # print "final relativepos = $relativepos\n" if $printer == 1;
  2162. push(@relativeposes, $relativepos);
  2163. }
  2164. push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens))));
  2165. }
  2166. else{
  2167. push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA"));
  2168. }
  2169. $species_counter++;
  2170. }
  2171. $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif};
  2172. $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif};
  2173. my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend);
  2174. # print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1;
  2175. return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif));
  2176. }
  2177. sub switch_nucl{
  2178. my @strand = split(/\s*/,$_[0]);
  2179. for my $i (0 ... $#strand){
  2180. if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;}
  2181. if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;}
  2182. if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;}
  2183. if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;}
  2184. }
  2185. return join("",@strand);
  2186. }
  2187. sub switch_micro{
  2188. my $micro = reverse($_[0]);
  2189. my @strand = split(/\s*/,$micro);
  2190. for my $i (0 ... $#strand){
  2191. if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;}
  2192. if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;}
  2193. if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;}
  2194. if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;}
  2195. if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
  2196. if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
  2197. }
  2198. return join("",@strand);
  2199. }
  2200. sub decipher_history{
  2201. my $printer = 0;
  2202. my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_;
  2203. my %mutations_hash=();
  2204. foreach my $mutation (@$mutations_array){
  2205. # print "mutation = $mutation\n" if $printer == 1;
  2206. my %local = $mutation =~ /([\S ]+)=([\S ]+)/g;
  2207. push @{$mutations_hash{$local{"node"}}},$mutation;
  2208. # print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1;
  2209. }
  2210. my @nodes;
  2211. my @birth_steps=();
  2212. my @death_steps=();
  2213. my @tags=split(/\s*/,$tags_string);
  2214. my @confirmation=split(/\s+/,$confirmation_string);
  2215. my %info=();
  2216. for my $i (0 ... $#tags){
  2217. $info{$tags[$i]}=$confirmation[$i];
  2218. # print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1;
  2219. }
  2220. for my $keys (@$nodes) {
  2221. foreach my $key (@$keys){
  2222. # print "current key = $key\n";
  2223. my $copykey = $key;
  2224. $copykey =~ s/[\W ]+//g;
  2225. my @copykeys=split(/\s*/,$copykey);
  2226. my $states=();
  2227. foreach my $copy (@copykeys){
  2228. $states=$states.$info{$copy};
  2229. }
  2230. # print "reduced key = $copykey and state = $states\n" if $printer == 1;
  2231. if (exists $mutations_hash{$key}) {
  2232. if ($states=~/\+/){
  2233. push @birth_steps, @{$mutations_hash{$key}};
  2234. $birth_steps[$#birth_steps] =~ s/\S+=//g;
  2235. delete $mutations_hash{$key};
  2236. }
  2237. else{
  2238. push @death_steps, @{$mutations_hash{$key}};
  2239. $death_steps[$#death_steps] =~ s/\S+=//g;
  2240. delete $mutations_hash{$key};
  2241. }
  2242. }
  2243. }
  2244. }
  2245. # print "conformation = $confirmation_string\n" if $printer == 1;
  2246. push (@birth_steps, "NULL") if scalar(@birth_steps) == 0;
  2247. push (@death_steps, "NULL") if scalar(@death_steps) == 0;
  2248. # print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1;
  2249. return \@birth_steps, \@death_steps;
  2250. }
  2251. sub fillAlignmentGaps{
  2252. my $printer = 0;
  2253. # print "received: @_\n" if $printer == 1;
  2254. my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_;
  2255. # print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1;
  2256. my %sequence_hash=();
  2257. my @phases = ();
  2258. my $concat = $motif.$motif;
  2259. my $motifsize = length($motif);
  2260. for my $i (1 ... $motifsize){
  2261. push @phases, substr($concat, $i, $motifsize);
  2262. }
  2263. my $concatalignment = ();
  2264. foreach my $tag (@tags){
  2265. $concatalignment = $concatalignment.$alignment->{$tag};
  2266. }
  2267. # print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/;
  2268. return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/;
  2269. my %node_sequences_temp=();
  2270. my %node_alignments_temp =(); #NEW, Nov 28 2008
  2271. my @tags=();
  2272. my @locus_sequences=();
  2273. my %alivehash=();
  2274. # print "IN fillAlignmentGaps\n";# <STDIN>;
  2275. my %fillrecord = ();
  2276. my $change = 0;
  2277. foreach my $tag (@$tagarray) {
  2278. #print "adding: $tag\n";
  2279. push(@tags, $tag);
  2280. if (exists $microsathash->{$tag}){
  2281. my $micro = $microsathash->{$tag};
  2282. my $orig_micro = $micro;
  2283. ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
  2284. $change = 1 if uc($micro) ne uc($orig_micro);
  2285. $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL";
  2286. }
  2287. if (exists $nonmicrosathash->{$tag}){
  2288. my $micro = $nonmicrosathash->{$tag};
  2289. my $orig_micro = $micro;
  2290. ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
  2291. $change = 1 if uc($micro) ne uc($orig_micro);
  2292. $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL";
  2293. }
  2294. if (exists $alignment->{$tag}){
  2295. my $micro = $alignment->{$tag};
  2296. my $orig_micro = $micro;
  2297. ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
  2298. $change = 1 if uc($micro) ne uc($orig_micro);
  2299. $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL";
  2300. }
  2301. #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1;
  2302. #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1;
  2303. }
  2304. my %node_sequences=();
  2305. my %node_alignments =(); #NEW, Nov 28 2008
  2306. foreach my $tag (@$tagarray) {
  2307. $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag});
  2308. $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag});
  2309. }
  2310. # print "\n", "#" x 50, "\n" if $printer == 1;
  2311. foreach my $tag (@tags){
  2312. # print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1;
  2313. }
  2314. # print "\n", "#" x 50, "\n" if $printer == 1;
  2315. # print "change = $change\n";
  2316. #<STDIN> if $concatalignment=~/\-/;
  2317. # <STDIN> if $printer == 1 && $concatalignment =~ /\-/;
  2318. return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0;
  2319. my ($nodes_arr, $branches_hash) = get_nodes($tree);
  2320. my @nodes=@$nodes_arr;
  2321. # print "recieved nodes = @nodes\n" if $printer == 1;
  2322. #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS
  2323. foreach my $keys (@nodes){
  2324. my @pair = @$keys;
  2325. my $joint = "(".join(", ",@pair).")";
  2326. my $copykey = join "", @pair;
  2327. $copykey =~ s/[\W ]+//g;
  2328. # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1;
  2329. my $livestatus = 1;
  2330. foreach my $copy (split(/\s*/,$copykey)){
  2331. $livestatus = 0 if !exists $alivehash{$copy};
  2332. }
  2333. $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1;
  2334. # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1;
  2335. }
  2336. @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT.
  2337. my @mutations_array=();
  2338. my $joint = ();
  2339. foreach my $node (@nodes){
  2340. my @pair = @$node;
  2341. # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
  2342. $joint = "(".join(", ",@pair).")";
  2343. # print "joint = $joint \n" if $printer == 1;
  2344. my @pair_sequences=();
  2345. foreach my $tag (@pair){
  2346. # print "tag = $tag: " if $printer == 1;
  2347. # print $node_alignments{$tag},"\n" if $printer == 1;
  2348. push @pair_sequences, $node_alignments{$tag};
  2349. }
  2350. # print "fillgap\n";
  2351. my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
  2352. $node_alignments{$joint}=$compared;
  2353. push( @mutations_array,split(/:/,$substitutions_list));
  2354. # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1;
  2355. }
  2356. # print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1;
  2357. # <STDIN> if $printer == 1;
  2358. my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
  2359. # print "returningt: ", $analayzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;;
  2360. # print "returningy: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1;
  2361. # print "final node alignment after filling for $joint= " if $printer == 1;
  2362. # print "$node_alignments{$joint}\n" if $printer == 1;
  2363. return 1, $analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ;
  2364. return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0;
  2365. }
  2366. sub add_mutation{
  2367. my $printer = 0;
  2368. # print "IN SUBROUTUNE add_mutation.. information received = @_\n" if $printer == 1;
  2369. my ($i , $bite, $to, $from) = @_;
  2370. # print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1;
  2371. # print "to=$to\n" if $printer == 1;
  2372. # print "tis split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1;
  2373. my @toields = split "!",$to;
  2374. # print "toilds = @toields\n" if $printer == 1;
  2375. my @mutations=();
  2376. foreach my $toield (@toields){
  2377. my @toinfo=split(":",$toield);
  2378. # print " at toinfo=@toinfo \n" if $printer == 1;
  2379. next if $toinfo[1] =~ /$from/i;
  2380. my @mutation = @toinfo if $toinfo[1] !~ /$from/i;
  2381. # print "adding to mutaton list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1;
  2382. push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="));
  2383. }
  2384. return @mutations;
  2385. }
  2386. sub add_bases{
  2387. my $printer = 0;
  2388. # print "IN SUBROUTUNE add_bases.. information received = @_\n" if $printer == 1;
  2389. my ($optional0, $optional1, $pair0, $pair1,$joint) = @_;
  2390. my $total_list=();
  2391. my @total_list0=split(/!/,$optional0);
  2392. my @total_list1=split(/!/,$optional1);
  2393. my @all_list=();
  2394. my %total_hash0=();
  2395. foreach my $entry (@total_list0) {
  2396. $entry = uc $entry;
  2397. $entry =~ /(\S+):(\S+)/;
  2398. $total_hash0{$2}=$1;
  2399. push @all_list, $2;
  2400. }
  2401. my %total_hash1=();
  2402. foreach my $entry (@total_list1) {
  2403. $entry = uc $entry;
  2404. $entry =~ /(\S+):(\S+)/;
  2405. $total_hash1{$2}=$1;
  2406. push @all_list, $2;
  2407. }
  2408. my %alphabetical_hash=();
  2409. my @return_options=();
  2410. for my $i (0 ... $#all_list){
  2411. my $alph = $all_list[$i];
  2412. if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){
  2413. push(@return_options, $joint.":".$alph);
  2414. delete $total_hash0{$alph}; delete $total_hash1{$alph};
  2415. }
  2416. if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){
  2417. push(@return_options, $pair0.":".$alph);
  2418. delete $total_hash0{$alph};
  2419. }
  2420. if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){
  2421. push(@return_options, $pair1.":".$alph);
  2422. delete $total_hash1{$alph};
  2423. }
  2424. }
  2425. # print "returning ",join "!",@return_options,"\n" if $printer == 1;
  2426. return join "!",@return_options;
  2427. }
  2428. sub fillgaps{
  2429. # print "IN fillgaps: @_\n";
  2430. my ($micro, $phasesinput) = @_;
  2431. #print "in microsathash ,,.. micro = $micro\n";
  2432. return $micro if $micro !~ /\-/;
  2433. my $orig_micro = $micro;
  2434. my @phases = @$phasesinput;
  2435. my %tested_patterns = ();
  2436. foreach my $phase (@phases){
  2437. # print "considering phase: $phase\n";
  2438. my @phase_prefixes = ();
  2439. my @prephase_left_contexts = ();
  2440. my @prephase_right_contexts = ();
  2441. my @pregapsize = ();
  2442. my @prepostfilins = ();
  2443. my @phase_suffixes;
  2444. my @suffphase_left_contexts;
  2445. my @suffphase_right_contexts;
  2446. my @suffgapsize;
  2447. my @suffpostfilins;
  2448. my @postfilins = ();
  2449. my $motifsize = length($phases[0]);
  2450. my $change = 0;
  2451. for my $u (0 ... $motifsize-1){
  2452. my $concat = $phase.$phase.$phase.$phase;
  2453. my @concatarr = split(/\s*/, $concat);
  2454. my $l = 0;
  2455. while ($l < $u){
  2456. shift @concatarr;
  2457. $l++;
  2458. }
  2459. $concat = join ("", @concatarr);
  2460. for my $t (0 ... $motifsize-1){
  2461. for my $k (1 ... $motifsize-1){
  2462. push @phase_prefixes, substr($concat, $motifsize+$t, $k);
  2463. push @prephase_left_contexts, substr ($concat, $t, $motifsize);
  2464. push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1);
  2465. push @pregapsize, $k;
  2466. push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k));
  2467. # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n";
  2468. # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n";
  2469. # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n";
  2470. # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n";
  2471. # print "pregapsize = $pregapsize[$#pregapsize]\n";
  2472. # print "prepostfilins = $prepostfilins[$#prepostfilins]\n";
  2473. }
  2474. }
  2475. }
  2476. # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n";
  2477. if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){
  2478. # print "micro: $micro needs further gap removal: $1\n";
  2479. while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){
  2480. # print "micro: $micro needs further gap removal: $1\n";
  2481. # print "phase being considered = $phase\n";
  2482. my $num = ();
  2483. $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i;
  2484. $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i;
  2485. # print "num = $num\n";
  2486. $change = 1 if $num == 1;
  2487. }
  2488. }
  2489. elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){
  2490. while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){
  2491. # print "checking lengths of $1 and $3 for $micro... \n";
  2492. my $num = ();
  2493. if (length($1) >= length($3)){
  2494. # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n";
  2495. $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ;
  2496. }
  2497. if (length($1) < length($3)){
  2498. # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n";
  2499. $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ;
  2500. }
  2501. # print "micro changed to $micro\n";
  2502. }
  2503. }
  2504. elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){
  2505. while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){
  2506. # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n";
  2507. my $num = 0;
  2508. $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ;
  2509. }
  2510. }
  2511. elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){
  2512. while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){
  2513. # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n";
  2514. my $num = 0;
  2515. $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ;
  2516. }
  2517. }
  2518. # print "$orig_micro to $micro\n";
  2519. #s <STDIN>;
  2520. for my $h (0 ... $#phase_prefixes){
  2521. # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n";
  2522. my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h];
  2523. # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern};
  2524. if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){
  2525. return $orig_micro if exists $tested_patterns{$pattern};
  2526. while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){
  2527. $tested_patterns{$pattern} = $pattern;
  2528. # print "micro: $micro needs further gap removal: $1\n";
  2529. # print "prefix being considered = $phase_prefixes[$h]\n";
  2530. my $num = ();
  2531. $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ;
  2532. # print "num = $num, micro = $micro\n";
  2533. $change = 1 if $num == 1;
  2534. return $orig_micro if $num > 1;
  2535. }
  2536. }
  2537. }
  2538. }
  2539. return $orig_micro if length($micro) != length($orig_micro);
  2540. return $micro;
  2541. }
  2542. sub selectMutationArray{
  2543. my $printer =0;
  2544. my $oldmutspt = $_[0];
  2545. my $newmutspt = $_[1];
  2546. my $tagstringpt = $_[2];
  2547. my $alivehashpt = $_[3];
  2548. my $alignmentpt = $_[4];
  2549. my $motif = $_[5];
  2550. my @alivehasharr=();
  2551. my @tags = @$tagstringpt;
  2552. my $alignmentln = length($alignmentpt->{$tags[0]});
  2553. foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; }
  2554. my %newside = ();
  2555. my %oldside = ();
  2556. my %newmuts = ();
  2557. my %commons = ();
  2558. my %olds = ();
  2559. foreach my $old (@$oldmutspt){
  2560. $olds{$old} = 1;
  2561. }
  2562. foreach my $new (@$newmutspt){
  2563. $commons{$new} = 1 if exists $olds{$new};;
  2564. }
  2565. foreach my $pos ( 0 ... $alignmentln){
  2566. #print "pos = $pos\n" if $printer == 1;
  2567. my $newyes = 0;
  2568. foreach my $mut (@$newmutspt){
  2569. $newmuts{$mut} = 1;
  2570. chomp $mut;
  2571. $newyes++;
  2572. $mut =~ s/=\t/= \t/g;
  2573. $mut =~ s/=$/= /g;
  2574. $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/;
  2575. my $node = $1;
  2576. next if $3 != $pos;
  2577. # print "new mut = $mut\n" if $printer == 1;
  2578. # print "node = $node, pos = $3 ... and alivehasharr = >@alivehasharr<\n" if $printer == 1;
  2579. my $alivenode = 0;
  2580. foreach my $key (@alivehasharr){
  2581. $alivenode = 1 if $key =~ /$node/;
  2582. }
  2583. # next if $alivenode == 0;
  2584. my $indel_type = " ";
  2585. if ($2 eq "insertion" || $2 eq "deletion"){
  2586. my $thisindel = ();
  2587. $thisindel = $6 if $2 eq "insertion";
  2588. $thisindel = $7 if $2 eq "deletion";
  2589. $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion";
  2590. $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion";
  2591. $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif);
  2592. }
  2593. # print "indeltype = $indel_type\n" if $printer == 1;
  2594. my $added = 0;
  2595. if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){
  2596. # print "we have a preexisting one for $pos\n" if $printer == 1;
  2597. my @preexisting = @{$newside{$pos}};
  2598. foreach my $pre (@preexisting){
  2599. # print "looking at $pre\n" if $printer == 1;
  2600. next if $pre !~ /node=$node/;
  2601. next if $pre !~ /indeltype=([a-z]+)/;
  2602. my $currtype = $1;
  2603. if ($currtype =~ /inon/ && $indel_type =~ /dmot/){
  2604. delete $newside{$pos};
  2605. push @{$newside{$pos}}, $pre;
  2606. $added = 1;
  2607. }
  2608. if ($currtype =~ /dnon/ && $indel_type =~ /imot/){
  2609. delete $newside{$pos};
  2610. push @{$newside{$pos}}, $pre;
  2611. $added = 1;
  2612. }
  2613. if ($currtype =~ /dmot/ && $indel_type =~ /inon/){
  2614. delete $newside{$pos};
  2615. push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
  2616. $added = 1;
  2617. }
  2618. if ($currtype =~ /imot/ && $indel_type =~ /dnon/){
  2619. delete $newside{$pos};
  2620. push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
  2621. $added = 1;
  2622. }
  2623. }
  2624. }
  2625. # print "added = $added\n" if $printer == 1;
  2626. push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0;
  2627. # print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1;
  2628. }
  2629. }
  2630. foreach my $pos ( 0 ... $alignmentln){
  2631. my $oldyes = 0;
  2632. foreach my $mut (@$oldmutspt){
  2633. chomp $mut;
  2634. $oldyes++;
  2635. $mut =~ s/=\t/= \t/g;
  2636. $mut =~ s/=$/= /g;
  2637. $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/;
  2638. my $node = $1;
  2639. next if $3 != $pos;
  2640. # print "old mut = $mut\n" if $printer == 1;
  2641. my $alivenode = 0;
  2642. foreach my $key (@alivehasharr){
  2643. $alivenode = 1 if $key =~ /$node/;
  2644. }
  2645. #next if $alivenode == 0;
  2646. my $indel_type = " ";
  2647. if ($2 eq "insertion" || $2 eq "deletion"){
  2648. $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion";
  2649. $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion";
  2650. next if $indel_type =~/non/;
  2651. }
  2652. else{ next;}
  2653. my $imp=0;
  2654. $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0;
  2655. $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1;
  2656. if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){
  2657. my @preexisting = @{$newside{$pos}};
  2658. # print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1;
  2659. next if $imp == 0;
  2660. if (scalar(@preexisting) == 1){
  2661. my $foundmut = $preexisting[0];
  2662. $foundmut=~ /node=([A-Z, \(\)]+)/;
  2663. next if $1 eq $node;
  2664. if (exists $oldside{$pos} || exists $commons{$foundmut}){
  2665. # print "not replacing, but just adding\n" if $printer == 1;
  2666. push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
  2667. push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
  2668. next;
  2669. }
  2670. delete $newside{$pos};
  2671. push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
  2672. push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
  2673. # print "now new one is : @{$newside{$pos}}\n" if $printer == 1;
  2674. }
  2675. # print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1;
  2676. next;
  2677. }
  2678. my @news = @{$newside{$pos}} if exists $newside{$pos};
  2679. # print "mut = $mut and news = @news\n" if $printer == 1;
  2680. push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
  2681. push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
  2682. }
  2683. }
  2684. # print "in the end, our collected mutations = \n" if $printer == 1;
  2685. my @returnarr = ();
  2686. foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};}
  2687. # print join("\n", @returnarr),"\n" if $printer == 1;
  2688. #<STDIN>;
  2689. return @returnarr;
  2690. }
  2691. sub checkIndelType{
  2692. my $printer = 0;
  2693. my $node = $_[0];
  2694. my $indel = $_[1];
  2695. my $motif = $_[2];
  2696. my $alignmentpt = $_[3];
  2697. my $posit = $_[4];
  2698. my $type = $_[5];
  2699. my @phases =();
  2700. my %prephases = ();
  2701. my %postphases = ();
  2702. #print "motif = $motif\n";
  2703. # print "IN checkIndelType ... received: @_\n" if $printer == 1;
  2704. my $concat = $motif.$motif.$motif.$motif;
  2705. my $motiflength = length($motif);
  2706. if ($motiflength > length ($indel)){
  2707. return "non" if $motif !~ /$indel/i;
  2708. return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type);
  2709. }
  2710. my $firstpass = 0;
  2711. for my $y (0 ... $motiflength-1){
  2712. my $phase = substr($concat, $motiflength+$y, $motiflength);
  2713. push @phases, $phase;
  2714. $firstpass = 1 if $indel =~ /$phase/i;
  2715. for my $k (0 ... length($motif)-1){
  2716. # print "at: motiflength=$motiflength , y=$y , k=$k.. for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1;
  2717. my $pre = substr($concat, $motiflength+$y-$k, $k );
  2718. my $post = substr($concat, $motiflength+$y+$motiflength, $k);
  2719. # print "adding to phases : $phase - $pre and $post\n" if $printer == 1;
  2720. push @{$prephases{$phase}} , $pre;
  2721. push @{$postphases{$phase}} , $post;
  2722. }
  2723. }
  2724. # print "firstpass 1= $firstpass\n" if $printer == 1;
  2725. return "non" if $firstpass ==0;
  2726. $firstpass =0;
  2727. foreach my $phase (@phases){
  2728. my @pres = @{$prephases{$phase}};
  2729. my @posts = @{$postphases{$phase}};
  2730. foreach my $pre (@pres){
  2731. foreach my $post (@posts){
  2732. $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif));
  2733. $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif));
  2734. # print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1;
  2735. last if $firstpass == 1;
  2736. }
  2737. last if $firstpass == 1;
  2738. }
  2739. last if $firstpass == 1;
  2740. }
  2741. # print "firstpass 2= $firstpass\n" if $printer == 1;
  2742. return "non" if $firstpass ==0;
  2743. return "mot" if $firstpass ==1;
  2744. }
  2745. sub checkIndelType_ComplexAnalysis{
  2746. my $printer = 0;
  2747. my $node = $_[0];
  2748. my $indel = $_[1];
  2749. my $motif = $_[2];
  2750. my $alignmentpt = $_[3];
  2751. my $pos = $_[4];
  2752. my $type = $_[5];
  2753. my @speciesinvolved = $node =~ /[A-Z]+/g;
  2754. my @seqs = ();
  2755. my $residualseq = length($motif) - length($indel);
  2756. # print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1;
  2757. # print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1;
  2758. # print "residualseq = $residualseq\n" if $printer == 1;
  2759. # print "pos=$pos... got: @_\n" if $printer == 1;
  2760. foreach my $sp (@speciesinvolved){
  2761. my $spseq = $alignmentpt->{$sp};
  2762. #print "orig spseq = $spseq\n";
  2763. my $subseq = ();
  2764. if ($type eq "deletion"){
  2765. my @indelparts = split(/\s*/,$indel);
  2766. my @seqparts = split(/\s*/,$spseq);
  2767. for my $p ($pos ... $pos+length($indel)-1){
  2768. $seqparts[$p] = shift @indelparts;
  2769. }
  2770. $spseq = join("",@seqparts);
  2771. }
  2772. #print "mod spseq = $spseq\n";
  2773. # $spseq=~ s/\-//g if $type !~ /deletion/;
  2774. # print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1;
  2775. # print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1;
  2776. # print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1;
  2777. $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ;
  2778. $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0;
  2779. $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ;
  2780. # print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1;
  2781. #<STDIN> if $subseq !~ /[a-z\-]/i;
  2782. $subseq =~ s/\-/$indel/g if $type =~ /insertion/;
  2783. push @seqs, $subseq;
  2784. # print "seqs = @seqs\n" if $printer == 1;
  2785. }
  2786. return "non" if checkIfSeqsIdentical(@seqs) eq "NO";
  2787. # print "checking for $seqs[0] \n" if $printer == 1;
  2788. my @phases =();
  2789. my %prephases = ();
  2790. my %postphases = ();
  2791. my $concat = $motif.$motif.$motif.$motif;
  2792. my $motiflength = length($motif);
  2793. my $firstpass = 0;
  2794. for my $y (0 ... $motiflength-1){
  2795. my $phase = substr($concat, $motiflength+$y, $motiflength);
  2796. push @phases, $phase;
  2797. $firstpass = 1 if $seqs[0] =~ /$phase/i;
  2798. for my $k (0 ... length($motif)-1){
  2799. my $pre = substr($concat, $motiflength+$y-$k, $k );
  2800. my $post = substr($concat, $motiflength+$y+$motiflength, $k);
  2801. # print "adding to phases : $phase - $pre and $post\n" if $printer == 1;
  2802. push @{$prephases{$phase}} , $pre;
  2803. push @{$postphases{$phase}} , $post;
  2804. }
  2805. }
  2806. # print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1;
  2807. return "non" if $firstpass ==0;
  2808. $firstpass =0;
  2809. foreach my $phase (@phases){
  2810. $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0;
  2811. if (((length($seqs[0]))%(length($motif))) != 0){
  2812. my @pres = @{$prephases{$phase}};
  2813. my @posts = @{$postphases{$phase}};
  2814. foreach my $pre (@pres){
  2815. foreach my $post (@posts){
  2816. next if $pre !~ /\S/ && $post !~ /\S/;
  2817. $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i);
  2818. # print "caught with $pre $phase $post\n" if $printer == 1;
  2819. last if $firstpass == 1;
  2820. }
  2821. last if $firstpass == 1;
  2822. }
  2823. }
  2824. last if $firstpass == 1;
  2825. }
  2826. #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1;
  2827. #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0;
  2828. #<STDIN>;# if $firstpass ==1;
  2829. return "non" if $firstpass ==0;
  2830. return "mot" if $firstpass ==1;
  2831. }
  2832. sub checkIfSeqsIdentical{
  2833. my @seqs = @_;
  2834. my $identical = 1;
  2835. for my $j (1 ... $#seqs){
  2836. $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]);
  2837. }
  2838. return "NO" if $identical == 0;
  2839. return "YES" if $identical == 1;
  2840. }
  2841. sub summarizeMutations{
  2842. my $mutspt = $_[0];
  2843. my @muts = @$mutspt;
  2844. my $tree = $_[1];
  2845. my @returnarr = ();
  2846. for (1 ... 38){
  2847. push @returnarr, "NA";
  2848. }
  2849. push @returnarr, "NULL";
  2850. return @returnarr if $tree eq "NULL" || scalar(@muts) < 1;
  2851. my @bspecies = ();
  2852. my @dspecies = ();
  2853. my $treecopy = $tree;
  2854. $treecopy =~ s/[\(\)]//g;
  2855. my @treeparts = split(/[\.,]+/, $treecopy);
  2856. for my $part (@treeparts){
  2857. if ($part =~ /\+/){
  2858. $part =~ s/\+//g;
  2859. #my @sp = split(/\s*/, $part);
  2860. #foreach my $p (@sp) {push @bspecies, $p;}
  2861. push @bspecies, $part;
  2862. }
  2863. if ($part =~ /\-/){
  2864. $part =~ s/\-//g;
  2865. #my @sp = split(/\s*/, $part);
  2866. #foreach my $p (@sp) {push @dspecies, $p;}
  2867. push @dspecies, $part;
  2868. }
  2869. }
  2870. #print "-------------------------------------------------------\n";
  2871. my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0);
  2872. my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0);
  2873. my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0);
  2874. my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0);
  2875. my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA");
  2876. my @insertionsarr = (" ");
  2877. my @deletionsarr = (" ");
  2878. my @substitutionsarr = (" ");
  2879. foreach my $mut (@muts){
  2880. # print "mut = $mut\n";
  2881. chomp $mut;
  2882. $mut =~ s/=\t/= /g;
  2883. $mut =~ s/=$/= /g;
  2884. my %mhash = ();
  2885. my @mields = split(/\t/,$mut);
  2886. foreach my $m (@mields){
  2887. my @fields = split(/=/,$m);
  2888. next if $fields[1] eq " ";
  2889. $mhash{$fields[0]} = $fields[1];
  2890. }
  2891. my $myutype = ();
  2892. my $decided = 0;
  2893. my $localnode = $mhash{"node"};
  2894. $localnode =~ s/[\(\)\. ,]//g;
  2895. foreach my $s (@bspecies){
  2896. if ($localnode eq $s) {
  2897. $decided = 1; $myutype = "b";
  2898. }
  2899. }
  2900. foreach my $s (@dspecies){
  2901. if ($localnode eq $s) {
  2902. $decided = 1; $myutype = "d";
  2903. }
  2904. }
  2905. $myutype = "n" if $decided != 1;
  2906. # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n";
  2907. # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b";
  2908. if ($mhash{"type"} eq "substitution"){
  2909. $substitutions++;
  2910. $bsubstitutions++ if $myutype eq "b";
  2911. $dsubstitutions++ if $myutype eq "d";
  2912. $nsubstitutions++ if $myutype eq "n";
  2913. # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n";
  2914. push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b";
  2915. push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d";
  2916. push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n";
  2917. # print "substitutionsarr = @substitutionsarr\n";
  2918. # <STDIN>;
  2919. }
  2920. else{
  2921. #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. indeltype=$mhash{indeltype}\n";
  2922. if ($mhash{"type"} eq "deletion"){
  2923. $deletions++;
  2924. $motdeletions++ if $mhash{"indeltype"} =~ /dmot/;
  2925. $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/;
  2926. $nondeletions++ if $mhash{"indeltype"} =~ /dnon/;
  2927. $bdeletions++ if $myutype eq "b";
  2928. $ddeletions++ if $myutype eq "d";
  2929. $ndeletions++ if $myutype eq "n";
  2930. $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b";
  2931. $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b";
  2932. $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b";
  2933. $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d";
  2934. $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d";
  2935. $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d";
  2936. $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n";
  2937. $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n";
  2938. $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n";
  2939. push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b";
  2940. push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d";
  2941. push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n";
  2942. }
  2943. if ($mhash{"type"} eq "insertion"){
  2944. $insertions++;
  2945. $motinsertions++ if $mhash{"indeltype"} =~ /imot/;
  2946. $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/;
  2947. $noninsertions++ if $mhash{"indeltype"} =~ /inon/;
  2948. $binsertions++ if $myutype eq "b";
  2949. $dinsertions++ if $myutype eq "d";
  2950. $ninsertions++ if $myutype eq "n";
  2951. $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b";
  2952. $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b";
  2953. $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b";
  2954. $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d";
  2955. $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d";
  2956. $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d";
  2957. $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n";
  2958. $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n";
  2959. $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n";
  2960. push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b";
  2961. push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d";
  2962. push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n";
  2963. }
  2964. }
  2965. }
  2966. $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ;
  2967. $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1;
  2968. $indels =~ s/ //g;
  2969. $subs =~ s/ //g ;
  2970. #print "indels = $indels, subs=$subs\n";
  2971. ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ;
  2972. #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n";
  2973. my @returnarray = ();
  2974. push (@returnarray, $indels, $subs) ;
  2975. push @returnarray, $tree;
  2976. my @copy = @returnarray;
  2977. #print "\n\nreturnarray = @returnarray ... binsertions=$binsertions dinsertions=$dinsertions bsubstitutions=$bsubstitutions dsubstitutions=$dsubstitutions\n";
  2978. #<STDIN>;
  2979. return (@returnarray);
  2980. }
  2981. sub selectBetterTree{
  2982. my $printer = 1;
  2983. my $treestudy = $_[0];
  2984. my $alt = $_[1];
  2985. my $mutspt = $_[2];
  2986. my @muts = @$mutspt;
  2987. my @trees = (); my @alternatetrees=();
  2988. @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/;
  2989. @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/;
  2990. $trees[0] = $treestudy if $treestudy !~ /\|/;
  2991. $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/;
  2992. my @alltrees = (@trees, @alternatetrees);
  2993. # push(@alltrees,@alternatetrees);
  2994. my %mutspecies = ();
  2995. # print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1;
  2996. #<STDIN>;
  2997. foreach my $mut (@muts){
  2998. # print colored ['green'],"mut = $mut\n" if $printer == 1;
  2999. $mut =~ /node=([A-Z,\(\) ]+)/;
  3000. my $node = $1;
  3001. $node =~s/[,\(\) ]+//g;
  3002. my @indivspecies = $node =~ /[A-Z]+/g;
  3003. #print "adding node: $node\n" if $printer == 1;
  3004. $mutspecies{$node} = $node;
  3005. }
  3006. my @treerecords = ();
  3007. my $treecount = -1;
  3008. foreach my $tree (@alltrees){
  3009. # print "checking with tree $tree\n" if $printer == 1;
  3010. $treecount++;
  3011. $treerecords[$treecount] = 0;
  3012. my @indivspecies = ($tree =~ /[A-Z]+/g);
  3013. # print "indivspecies=@indivspecies\n" if $printer == 1;
  3014. foreach my $species (@indivspecies){
  3015. # print "checkin if exists species: $species\n" if $printer == 1;
  3016. $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/;
  3017. $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/;
  3018. $treerecords[$treecount]-- if !exists $mutspecies{$species};
  3019. }
  3020. # print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1;
  3021. }
  3022. my @best_tree = array_largest_number_arrayPosition(@treerecords);
  3023. # print "treerecords = @treerecords. hence, best tree = @best_tree = $alltrees[$best_tree[0]], $treerecords[$best_tree[0]]\n" if $printer == 1;
  3024. #<STDIN>;
  3025. return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1;
  3026. # print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1;
  3027. return ("NULL", -1) if $treerecords[$best_tree[0]] < 1;
  3028. my $rando = int(rand($#trees));
  3029. return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1;
  3030. }
  3031. sub load_sameHash{
  3032. #my $g = %$_[0];
  3033. $sameHash{"CAGT"}="AGTC";
  3034. $sameHash{"ATGA"}="AATG";
  3035. $sameHash{"CAAC"}="AACC";
  3036. $sameHash{"GGAA"}="AAGG";
  3037. $sameHash{"TAAG"}="AAGT";
  3038. $sameHash{"CGAG"}="AGCG";
  3039. $sameHash{"TAGG"}="AGGT";
  3040. $sameHash{"GCAG"}="AGGC";
  3041. $sameHash{"TAGA"}="ATAG";
  3042. $sameHash{"TGA"}="ATG";
  3043. $sameHash{"CAAG"}="AAGC";
  3044. $sameHash{"CTAA"}="AACT";
  3045. $sameHash{"CAAT"}="AATC";
  3046. $sameHash{"GTAG"}="AGGT";
  3047. $sameHash{"GAAG"}="AAGG";
  3048. $sameHash{"CGA"}="ACG";
  3049. $sameHash{"GTAA"}="AAGT";
  3050. $sameHash{"ACAA"}="AAAC";
  3051. $sameHash{"GCGG"}="GGGC";
  3052. $sameHash{"ATCA"}="AATC";
  3053. $sameHash{"TAAC"}="AACT";
  3054. $sameHash{"GGCA"}="AGGC";
  3055. $sameHash{"TGAG"}="AGTG";
  3056. $sameHash{"AACA"}="AAAC";
  3057. $sameHash{"GAGC"}="AGCG";
  3058. $sameHash{"ACCA"}="AACC";
  3059. $sameHash{"TGAA"}="AATG";
  3060. $sameHash{"ACA"}="AAC";
  3061. $sameHash{"GAAC"}="AACG";
  3062. $sameHash{"GCA"}="AGC";
  3063. $sameHash{"CCAC"}="ACCC";
  3064. $sameHash{"CATA"}="ATAC";
  3065. $sameHash{"CAC"}="ACC";
  3066. $sameHash{"TACA"}="ATAC";
  3067. $sameHash{"GGAC"}="ACGG";
  3068. $sameHash{"AGA"}="AAG";
  3069. $sameHash{"ATAA"}="AAAT";
  3070. $sameHash{"CA"}="AC";
  3071. $sameHash{"CCCA"}="ACCC";
  3072. $sameHash{"TCAA"}="AATC";
  3073. $sameHash{"CAGA"}="AGAC";
  3074. $sameHash{"AATA"}="AAAT";
  3075. $sameHash{"CCA"}="ACC";
  3076. $sameHash{"AGAA"}="AAAG";
  3077. $sameHash{"AGTA"}="AAGT";
  3078. $sameHash{"GACG"}="ACGG";
  3079. $sameHash{"TCAG"}="AGTC";
  3080. $sameHash{"ACGA"}="AACG";
  3081. $sameHash{"CGCA"}="ACGC";
  3082. $sameHash{"GAGT"}="AGTG";
  3083. $sameHash{"GA"}="AG";
  3084. $sameHash{"TA"}="AT";
  3085. $sameHash{"TAA"}="AAT";
  3086. $sameHash{"CAG"}="AGC";
  3087. $sameHash{"GATA"}="ATAG";
  3088. $sameHash{"GTA"}="AGT";
  3089. $sameHash{"CCAA"}="AACC";
  3090. $sameHash{"TAG"}="AGT";
  3091. $sameHash{"CAAA"}="AAAC";
  3092. $sameHash{"AAGA"}="AAAG";
  3093. $sameHash{"CACG"}="ACGC";
  3094. $sameHash{"GTCA"}="AGTC";
  3095. $sameHash{"GGA"}="AGG";
  3096. $sameHash{"GGAT"}="ATGG";
  3097. $sameHash{"CGGG"}="GGGC";
  3098. $sameHash{"CGGA"}="ACGG";
  3099. $sameHash{"AGGA"}="AAGG";
  3100. $sameHash{"TAAA"}="AAAT";
  3101. $sameHash{"GAGA"}="AGAG";
  3102. $sameHash{"ACTA"}="AACT";
  3103. $sameHash{"GCGA"}="AGCG";
  3104. $sameHash{"CACA"}="ACAC";
  3105. $sameHash{"AGAT"}="ATAG";
  3106. $sameHash{"GAGG"}="AGGG";
  3107. $sameHash{"CGAC"}="ACCG";
  3108. $sameHash{"GGAG"}="AGGG";
  3109. $sameHash{"GCCA"}="AGCC";
  3110. $sameHash{"CCAG"}="AGCC";
  3111. $sameHash{"GAAA"}="AAAG";
  3112. $sameHash{"CAGG"}="AGGC";
  3113. $sameHash{"GAC"}="ACG";
  3114. $sameHash{"CAA"}="AAC";
  3115. $sameHash{"GACC"}="ACCG";
  3116. $sameHash{"GGCG"}="GGGC";
  3117. $sameHash{"GGTA"}="AGGT";
  3118. $sameHash{"AGCA"}="AAGC";
  3119. $sameHash{"GATG"}="ATGG";
  3120. $sameHash{"GTGA"}="AGTG";
  3121. $sameHash{"ACAG"}="AGAC";
  3122. $sameHash{"CGG"}="GGC";
  3123. $sameHash{"ATA"}="AAT";
  3124. $sameHash{"GACA"}="AGAC";
  3125. $sameHash{"GCAA"}="AAGC";
  3126. $sameHash{"CAGC"}="AGCC";
  3127. $sameHash{"GGGA"}="AGGG";
  3128. $sameHash{"GAG"}="AGG";
  3129. $sameHash{"ACAT"}="ATAC";
  3130. $sameHash{"GAAT"}="AATG";
  3131. $sameHash{"CACC"}="ACCC";
  3132. $sameHash{"GAT"}="ATG";
  3133. $sameHash{"GCG"}="GGC";
  3134. $sameHash{"GCAC"}="ACGC";
  3135. $sameHash{"GAA"}="AAG";
  3136. $sameHash{"TGGA"}="ATGG";
  3137. $sameHash{"CCGA"}="ACCG";
  3138. $sameHash{"CGAA"}="AACG";
  3139. }
  3140. sub load_revHash{
  3141. $revHash{"CTGA"}="AGTC";
  3142. $revHash{"TCTT"}="AAAG";
  3143. $revHash{"CTAG"}="AGCT";
  3144. $revHash{"GGTG"}="ACCC";
  3145. $revHash{"GCC"}="GGC";
  3146. $revHash{"GCTT"}="AAGC";
  3147. $revHash{"GCGT"}="ACGC";
  3148. $revHash{"GTTG"}="AACC";
  3149. $revHash{"CTCC"}="AGGG";
  3150. $revHash{"ATC"}="ATG";
  3151. $revHash{"CGAT"}="ATCG";
  3152. $revHash{"TTAA"}="AATT";
  3153. $revHash{"GTTC"}="AACG";
  3154. $revHash{"CTGC"}="AGGC";
  3155. $revHash{"TCGA"}="ATCG";
  3156. $revHash{"ATCT"}="ATAG";
  3157. $revHash{"GGTT"}="AACC";
  3158. $revHash{"CTTA"}="AAGT";
  3159. $revHash{"TGGC"}="AGCC";
  3160. $revHash{"CCG"}="GGC";
  3161. $revHash{"CGGC"}="GGCC";
  3162. $revHash{"TTAG"}="AACT";
  3163. $revHash{"GTG"}="ACC";
  3164. $revHash{"CTTT"}="AAAG";
  3165. $revHash{"TGCA"}="ATGC";
  3166. $revHash{"CGCT"}="AGCG";
  3167. $revHash{"TTCC"}="AAGG";
  3168. $revHash{"CT"}="AG";
  3169. $revHash{"C"}="G";
  3170. $revHash{"CTCT"}="AGAG";
  3171. $revHash{"ACTT"}="AAGT";
  3172. $revHash{"GGTC"}="ACCG";
  3173. $revHash{"ATTC"}="AATG";
  3174. $revHash{"GGGT"}="ACCC";
  3175. $revHash{"CCTA"}="AGGT";
  3176. $revHash{"CGCG"}="GCGC";
  3177. $revHash{"GTGT"}="ACAC";
  3178. $revHash{"GCCC"}="GGGC";
  3179. $revHash{"GTCG"}="ACCG";
  3180. $revHash{"TCCC"}="AGGG";
  3181. $revHash{"TTCA"}="AATG";
  3182. $revHash{"AGTT"}="AACT";
  3183. $revHash{"CCCT"}="AGGG";
  3184. $revHash{"CCGC"}="GGGC";
  3185. $revHash{"CTT"}="AAG";
  3186. $revHash{"TTGG"}="AACC";
  3187. $revHash{"ATT"}="AAT";
  3188. $revHash{"TAGC"}="AGCT";
  3189. $revHash{"ACTG"}="AGTC";
  3190. $revHash{"TCAC"}="AGTG";
  3191. $revHash{"CTGT"}="AGAC";
  3192. $revHash{"TGTG"}="ACAC";
  3193. $revHash{"ATCC"}="ATGG";
  3194. $revHash{"GTGG"}="ACCC";
  3195. $revHash{"TGGG"}="ACCC";
  3196. $revHash{"TCGG"}="ACCG";
  3197. $revHash{"CGGT"}="ACCG";
  3198. $revHash{"GCTC"}="AGCG";
  3199. $revHash{"TACG"}="ACGT";
  3200. $revHash{"GTTT"}="AAAC";
  3201. $revHash{"CAT"}="ATG";
  3202. $revHash{"CATG"}="ATGC";
  3203. $revHash{"GTTA"}="AACT";
  3204. $revHash{"CACT"}="AGTG";
  3205. $revHash{"TCAT"}="AATG";
  3206. $revHash{"TTA"}="AAT";
  3207. $revHash{"TGTA"}="ATAC";
  3208. $revHash{"TTTC"}="AAAG";
  3209. $revHash{"TACT"}="AAGT";
  3210. $revHash{"TGTT"}="AAAC";
  3211. $revHash{"CTA"}="AGT";
  3212. $revHash{"GACT"}="AGTC";
  3213. $revHash{"TTGC"}="AAGC";
  3214. $revHash{"TTC"}="AAG";
  3215. $revHash{"GCT"}="AGC";
  3216. $revHash{"GCAT"}="ATGC";
  3217. $revHash{"TGGT"}="AACC";
  3218. $revHash{"CCT"}="AGG";
  3219. $revHash{"CATC"}="ATGG";
  3220. $revHash{"CCAT"}="ATGG";
  3221. $revHash{"CCCG"}="GGGC";
  3222. $revHash{"TGCC"}="AGGC";
  3223. $revHash{"TG"}="AC";
  3224. $revHash{"TGCT"}="AAGC";
  3225. $revHash{"GCCG"}="GGCC";
  3226. $revHash{"TCTG"}="AGAC";
  3227. $revHash{"TGT"}="AAC";
  3228. $revHash{"TTAT"}="AAAT";
  3229. $revHash{"TAGT"}="AACT";
  3230. $revHash{"TATG"}="ATAC";
  3231. $revHash{"TTTA"}="AAAT";
  3232. $revHash{"CGTA"}="ACGT";
  3233. $revHash{"TA"}="AT";
  3234. $revHash{"TGTC"}="AGAC";
  3235. $revHash{"CTAT"}="ATAG";
  3236. $revHash{"TATA"}="ATAT";
  3237. $revHash{"TAC"}="AGT";
  3238. $revHash{"TC"}="AG";
  3239. $revHash{"CATT"}="AATG";
  3240. $revHash{"TCG"}="ACG";
  3241. $revHash{"ATTT"}="AAAT";
  3242. $revHash{"CGTG"}="ACGC";
  3243. $revHash{"CTG"}="AGC";
  3244. $revHash{"TCGT"}="AACG";
  3245. $revHash{"TCCG"}="ACGG";
  3246. $revHash{"GTT"}="AAC";
  3247. $revHash{"ATGT"}="ATAC";
  3248. $revHash{"CTTG"}="AAGC";
  3249. $revHash{"CCTT"}="AAGG";
  3250. $revHash{"GATC"}="ATCG";
  3251. $revHash{"CTGG"}="AGCC";
  3252. $revHash{"TTCT"}="AAAG";
  3253. $revHash{"CGTC"}="ACGG";
  3254. $revHash{"CG"}="GC";
  3255. $revHash{"TATT"}="AAAT";
  3256. $revHash{"CTCG"}="AGCG";
  3257. $revHash{"TCTC"}="AGAG";
  3258. $revHash{"TCCT"}="AAGG";
  3259. $revHash{"TGG"}="ACC";
  3260. $revHash{"ACTC"}="AGTG";
  3261. $revHash{"CTC"}="AGG";
  3262. $revHash{"CGC"}="GGC";
  3263. $revHash{"TTG"}="AAC";
  3264. $revHash{"ACCT"}="AGGT";
  3265. $revHash{"TCTA"}="ATAG";
  3266. $revHash{"GTAC"}="ACGT";
  3267. $revHash{"TTGA"}="AATC";
  3268. $revHash{"GTCC"}="ACGG";
  3269. $revHash{"GATT"}="AATC";
  3270. $revHash{"T"}="A";
  3271. $revHash{"CGTT"}="AACG";
  3272. $revHash{"GTC"}="ACG";
  3273. $revHash{"GCCT"}="AGGC";
  3274. $revHash{"TGC"}="AGC";
  3275. $revHash{"TTTG"}="AAAC";
  3276. $revHash{"GGCT"}="AGCC";
  3277. $revHash{"TCA"}="ATG";
  3278. $revHash{"GTGC"}="ACGC";
  3279. $revHash{"TGAT"}="AATC";
  3280. $revHash{"TAT"}="AAT";
  3281. $revHash{"CTAC"}="AGGT";
  3282. $revHash{"TGCG"}="ACGC";
  3283. $revHash{"CTCA"}="AGTG";
  3284. $revHash{"CTTC"}="AAGG";
  3285. $revHash{"GCTG"}="AGCC";
  3286. $revHash{"TATC"}="ATAG";
  3287. $revHash{"TAAT"}="AATT";
  3288. $revHash{"ACT"}="AGT";
  3289. $revHash{"TCGC"}="AGCG";
  3290. $revHash{"GGT"}="ACC";
  3291. $revHash{"TCC"}="AGG";
  3292. $revHash{"TTGT"}="AAAC";
  3293. $revHash{"TGAC"}="AGTC";
  3294. $revHash{"TTAC"}="AAGT";
  3295. $revHash{"CGT"}="ACG";
  3296. $revHash{"ATTA"}="AATT";
  3297. $revHash{"ATTG"}="AATC";
  3298. $revHash{"CCTC"}="AGGG";
  3299. $revHash{"CCGG"}="GGCC";
  3300. $revHash{"CCGT"}="ACGG";
  3301. $revHash{"TCCA"}="ATGG";
  3302. $revHash{"CGCC"}="GGGC";
  3303. $revHash{"GT"}="AC";
  3304. $revHash{"TTCG"}="AACG";
  3305. $revHash{"CCTG"}="AGGC";
  3306. $revHash{"TCT"}="AAG";
  3307. $revHash{"GTAT"}="ATAC";
  3308. $revHash{"GTCT"}="AGAC";
  3309. $revHash{"GCTA"}="AGCT";
  3310. $revHash{"TACC"}="AGGT";
  3311. }
  3312. sub allCaps{
  3313. my $motif = $_[0];
  3314. $motif =~ s/a/A/g;
  3315. $motif =~ s/c/C/g;
  3316. $motif =~ s/t/T/g;
  3317. $motif =~ s/g/G/g;
  3318. return $motif;
  3319. }
  3320. sub all_caps{
  3321. my @strand = split(/\s*/,$_[0]);
  3322. for my $i (0 ... $#strand){
  3323. if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
  3324. if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
  3325. if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
  3326. if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
  3327. }
  3328. return join("",@strand);
  3329. }
  3330. sub array_mean{
  3331. return "NA" if scalar(@_) == 0;
  3332. my $sum = 0;
  3333. foreach my $val (@_){
  3334. $sum = $sum + $val;
  3335. }
  3336. return ($sum/scalar(@_));
  3337. }
  3338. sub array_sum{
  3339. return "NA" if scalar(@_) == 0;
  3340. my $sum = 0;
  3341. foreach my $val (@_){
  3342. $sum = $sum + $val;
  3343. }
  3344. return ($sum);
  3345. }
  3346. sub variance{
  3347. return "NA" if scalar(@_) == 0;
  3348. return 0 if scalar(@_) == 1;
  3349. my $mean = array_mean(@_);
  3350. my $num = 0;
  3351. return 0 if scalar(@_) == 1;
  3352. # print "mean = $mean .. array = >@_<\n";
  3353. foreach my $ele (@_){
  3354. # print "$num = $num + ($ele-$mean)*($ele-$mean)\n";
  3355. $num = $num + ($ele-$mean)*($ele-$mean);
  3356. }
  3357. my $var = $num / scalar(@_);
  3358. return $var;
  3359. }
  3360. sub array_95confIntervals{
  3361. return "NA" if scalar(@_) <= 0;
  3362. my @sorted = sort { $a <=> $b } @_;
  3363. # print "@sorted=",scalar(@sorted), "\n";
  3364. my $aDeechNo = int((scalar(@sorted) * 2.5) / 100);
  3365. my $saaDeNo = int((scalar(@sorted) * 97.5) / 100);
  3366. return ($sorted[$aDeechNo], $sorted[$saaDeNo]);
  3367. }
  3368. sub array_median{
  3369. return "NA" if scalar(@_) == 0;
  3370. return $_[0] if scalar(@_) == 1;
  3371. my @sorted = sort { $a <=> $b } @_;
  3372. my $totalno = scalar(@sorted);
  3373. #print "sorted = @sorted\n";
  3374. my $pick = ();
  3375. if ($totalno % 2 == 1){
  3376. #print "odd set .. totalno = $totalno\n";
  3377. my $mid = $totalno / 2;
  3378. my $onehalfno = $mid - $mid % 1;
  3379. my $secondhalfno = $onehalfno + 1;
  3380. my $onehalf = $sorted[$onehalfno-1];
  3381. my $secondhalf = $sorted[$secondhalfno-1];
  3382. #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n";
  3383. $pick = $secondhalf;
  3384. }
  3385. else{
  3386. #print "even set .. totalno = $totalno\n";
  3387. my $mid = $totalno / 2;
  3388. my $onehalfno = $mid;
  3389. my $secondhalfno = $onehalfno + 1;
  3390. my $onehalf = $sorted[$onehalfno-1];
  3391. my $secondhalf = $sorted[$secondhalfno-1];
  3392. #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n";
  3393. $pick = ($onehalf + $secondhalf )/2;
  3394. }
  3395. #print "pick = $pick..\n";
  3396. return $pick;
  3397. }
  3398. sub array_numerical_sort{
  3399. return "NA" if scalar(@_) == 0;
  3400. my @sorted = sort { $a <=> $b } @_;
  3401. return (@sorted);
  3402. }
  3403. sub array_smallest_number{
  3404. return "NA" if scalar(@_) == 0;
  3405. return $_[0] if scalar(@_) == 1;
  3406. my @sorted = sort { $a <=> $b } @_;
  3407. return $sorted[0];
  3408. }
  3409. sub array_largest_number{
  3410. return "NA" if scalar(@_) == 0;
  3411. return $_[0] if scalar(@_) == 1;
  3412. my @sorted = sort { $a <=> $b } @_;
  3413. return $sorted[$#sorted];
  3414. }
  3415. sub array_largest_number_arrayPosition{
  3416. return "NA" if scalar(@_) == 0;
  3417. return 0 if scalar(@_) == 1;
  3418. my $maxpos = 0;
  3419. my @maxposes = ();
  3420. my @maxvals = ();
  3421. my $maxval = array_smallest_number(@_);
  3422. for my $i (0 ... $#_){
  3423. if ($_[$i] > $maxval){
  3424. $maxval = $_[$i];
  3425. $maxpos = $i;
  3426. }
  3427. if ($_[$i] == $maxval){
  3428. $maxval = $_[$i];
  3429. if (scalar(@maxposes) == 0){
  3430. push @maxposes, $i;
  3431. push @maxvals, $_[$i];
  3432. }
  3433. elsif ($maxvals[0] == $maxval){
  3434. push @maxposes, $i;
  3435. push @maxvals, $_[$i];
  3436. }
  3437. else{
  3438. @maxposes = (); @maxvals = ();
  3439. push @maxposes, $i;
  3440. push @maxvals, $_[$i];
  3441. }
  3442. }
  3443. }
  3444. return $maxpos if scalar(@maxposes) < 2;
  3445. return (@maxposes);
  3446. }
  3447. sub array_smallest_number_arrayPosition{
  3448. return "NA" if scalar(@_) == 0;
  3449. return 0 if scalar(@_) == 1;
  3450. my $minpos = 0;
  3451. my @minposes = ();
  3452. my @minvals = ();
  3453. my $minval = array_largest_number(@_);
  3454. my $maxval = array_smallest_number(@_);
  3455. #print "starting with $maxval, ending with $minval\n";
  3456. for my $i (0 ... $#_){
  3457. if ($_[$i] < $minval){
  3458. $minval = $_[$i];
  3459. $minpos = $i;
  3460. }
  3461. if ($_[$i] == $minval){
  3462. $minval = $_[$i];
  3463. if (scalar(@minposes) == 0){
  3464. push @minposes, $i;
  3465. push @minvals, $_[$i];
  3466. }
  3467. elsif ($minvals[0] == $minval){
  3468. push @minposes, $i;
  3469. push @minvals, $_[$i];
  3470. }
  3471. else{
  3472. @minposes = (); @minvals = ();
  3473. push @minposes, $i;
  3474. push @minvals, $_[$i];
  3475. }
  3476. }
  3477. }
  3478. #print "minposes=@minposes\n";
  3479. return $minpos if scalar(@minposes) < 2;
  3480. return (@minposes);
  3481. }
  3482. sub basic_stats{
  3483. my @arr = @_;
  3484. # print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n";
  3485. return ":";
  3486. }
  3487. #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
  3488. sub maftoAxt_multispecies {
  3489. my $printer = 0;
  3490. #print "in maftoAxt_multispecies : got @_\n";
  3491. my $fname=$_[0];
  3492. open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
  3493. my $treedefinition = $_[1];
  3494. #print "treedefinition= $treedefinition\n";
  3495. my @treedefinitions = MakeTrees($treedefinition);
  3496. open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
  3497. my $counter = 0;
  3498. my $exactspeciesset = $_[3];
  3499. my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
  3500. $treedefinition=~s/[\)\(, ]/\t/g;
  3501. my @species=split(/\t+/,$treedefinition);
  3502. @exactspeciesset_unarranged = @species;
  3503. # print "species=@species\n";
  3504. my @exactspeciesarr=();
  3505. foreach my $def (@treedefinitions){
  3506. $def=~s/[\)\(, ]/\t/g;
  3507. my @specs = split(/\t+/,$def);
  3508. my @exactspecies=();
  3509. foreach my $spec (@specs){
  3510. foreach my $espec (@exactspeciesset_unarranged){
  3511. # print "pushing >$spec< nd >$espec<\n" if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/;
  3512. push @exactspecies, $spec if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/;
  3513. }
  3514. }
  3515. #print "exactspecies = >@exactspecies<\n";
  3516. push @exactspeciesarr, [@exactspecies];
  3517. }
  3518. #<STDIN>;
  3519. #print "exactspeciesarr=@exactspeciesarr\n";
  3520. ###########
  3521. my $select = 2;
  3522. #select = 1 if all species need sequences to be present for each block otherwise, it is 0
  3523. #select = 2 only the allowed set make up the alignment. use the removeset
  3524. # information to detect alignmenets that have other important genomes aligned.
  3525. ###########
  3526. my @allowedset = ();
  3527. @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
  3528. @allowedset = join("_",0,@species) if $select == 1;
  3529. #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
  3530. foreach my $set (@exactspeciesarr){
  3531. my @openset = @$set;
  3532. push @allowedset, join("_",0,@openset) if $select == 2;
  3533. # print "openset = >@openset<, allowedset = @allowedset and exactspecies = @exactspecies\n";
  3534. }
  3535. # <STDIN>;
  3536. my $start = 0;
  3537. my @sequences = ();
  3538. my @titles = ();
  3539. my $species_counter = "0";
  3540. my $countermatch = 0;
  3541. my $outsideSpecies=0;
  3542. while(my $line = <IN>){
  3543. next if $line =~ /^#/;
  3544. next if $line =~ /^i/;
  3545. # print "$line .. species = @species\n";
  3546. chomp $line;
  3547. my @fields = split(/\s+/,$line);
  3548. chomp $line;
  3549. if ($line =~ /^a /){
  3550. $start = 1;
  3551. }
  3552. if ($line =~ /^s /){
  3553. # print "fields1 = $fields[1] , start = $start\n";
  3554. foreach my $sp (@species){
  3555. if ($fields[1] =~ /$sp/){
  3556. $species_counter = $species_counter."_".$sp;
  3557. push(@sequences, $fields[6]);
  3558. my @sp_info = split(/\./,$fields[1]);
  3559. my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
  3560. push(@titles, $title);
  3561. }
  3562. }
  3563. }
  3564. if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
  3565. my $arranged = reorderSpecies($species_counter, @species);
  3566. # print "species_counter=$species_counter .. arranged = $arranged\n";
  3567. my $stopper = 1;
  3568. my $arrno = 0;
  3569. foreach my $set (@allowedset){
  3570. # print "checking $set with $arranged\n";
  3571. if ($arranged eq $set){
  3572. # print "checked $set with $arranged\n";
  3573. $stopper = 0; last;
  3574. }
  3575. $arrno++;
  3576. }
  3577. if ($stopper == 0) {
  3578. # print " accepted\n";
  3579. @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
  3580. @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
  3581. my $filteredseq = filter_gaps(@sequences);
  3582. if ($filteredseq ne "SHORT"){
  3583. $counter++;
  3584. print OUT join (" ",$counter, @titles), "\n";
  3585. print OUT $filteredseq, "\n";
  3586. print OUT "\n";
  3587. $countermatch++;
  3588. }
  3589. # my @filtered_seq = split(/\t/,filter_gaps(@sequences) );
  3590. }
  3591. else{#print "\n";
  3592. }
  3593. @sequences = (); @titles = (); $start = 0;$species_counter = "0";
  3594. next;
  3595. }
  3596. }
  3597. close IN;
  3598. close OUT;
  3599. # print "countermatch = $countermatch\n";
  3600. # <STDIN>;
  3601. }
  3602. sub reorderSpecies{
  3603. my @inarr=@_;
  3604. my $currSpecies = shift (@inarr);
  3605. my $ordered_species = 0;
  3606. my @species=@inarr;
  3607. foreach my $order (@species){
  3608. $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
  3609. }
  3610. return $ordered_species;
  3611. }
  3612. sub filter_gaps{
  3613. my @sequences = @_;
  3614. # print "sequences sent are @sequences\n";
  3615. my $seq_length = length($sequences[0]);
  3616. my $seq_no = scalar(@sequences);
  3617. my $allgaps = ();
  3618. for (1 ... $seq_no){
  3619. $allgaps = $allgaps."-";
  3620. }
  3621. my @seq_array = ();
  3622. my $seq_counter = 0;
  3623. foreach my $seq (@sequences){
  3624. # my @sequence = split(/\s*/,$seq);
  3625. $seq_array[$seq_counter] = [split(/\s*/,$seq)];
  3626. # push @seq_array, [@sequence];
  3627. $seq_counter++;
  3628. }
  3629. my $g = 0;
  3630. while ( $g < $seq_length){
  3631. last if (!exists $seq_array[0][$g]);
  3632. my $bases = ();
  3633. for my $u (0 ... $#seq_array){
  3634. $bases = $bases.$seq_array[$u][$g];
  3635. }
  3636. # print $bases, "\n";
  3637. if ($bases eq $allgaps){
  3638. # print "bases are $bases, position is $g \n";
  3639. for my $seq (@seq_array){
  3640. splice(@$seq , $g, 1);
  3641. }
  3642. }
  3643. else {
  3644. $g++;
  3645. }
  3646. }
  3647. my @outs = ();
  3648. foreach my $seq (@seq_array){
  3649. push(@outs, join("",@$seq));
  3650. }
  3651. return "SHORT" if length($outs[0]) <=100;
  3652. return (join("\n", @outs));
  3653. }
  3654. sub allowedSetOfSpecies{
  3655. my @allowed_species = split(/_/,$_[0]);
  3656. unshift @allowed_species, 0;
  3657. # print "allowed set = @allowed_species \n";
  3658. my @output = ();
  3659. for (0 ... scalar(@allowed_species) - 4){
  3660. push(@output, join("_",@allowed_species));
  3661. pop @allowed_species;
  3662. }
  3663. return join(";",reverse(@output));
  3664. }
  3665. sub orderInfo{
  3666. my @info = split(/;/,$_[0]);
  3667. # print "info = @info";
  3668. my @old = split(/_/,$_[1]);
  3669. my @new = split(/_/,$_[2]);
  3670. shift @old; shift @new;
  3671. my @outinfo = ();
  3672. foreach my $spe (@new){
  3673. for my $no (0 ... $#old){
  3674. if ($spe eq $old[$no]){
  3675. push(@outinfo, $info[$no]);
  3676. }
  3677. }
  3678. }
  3679. # print "outinfo = @outinfo \n";
  3680. return join(";", @outinfo);
  3681. }
  3682. #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
  3683. sub printarr {
  3684. # print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n";
  3685. # foreach my $line (@_) {print "$line\n";}
  3686. # print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n";
  3687. }
  3688. sub oneOf{
  3689. my @arr = @_;
  3690. my $element = $arr[$#arr];
  3691. @arr = @arr[0 ... $#arr-1];
  3692. my $present = 0;
  3693. foreach my $el (@arr){
  3694. $present = 1 if $el eq $element;
  3695. }
  3696. return $present;
  3697. }
  3698. #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx
  3699. sub MakeTrees{
  3700. my $tree = $_[0];
  3701. # my @parts=($tree);
  3702. my @parts=();
  3703. # print "parts=@parts\n";
  3704. while (1){
  3705. $tree =~ s/^\(//g;
  3706. $tree =~ s/\)$//g;
  3707. my @arr = ();
  3708. if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
  3709. @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
  3710. push @parts, "(".$tree.")";
  3711. $tree = $2;
  3712. }
  3713. elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
  3714. @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
  3715. push @parts, "(".$tree.")";
  3716. $tree = $1;
  3717. }
  3718. elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
  3719. last;
  3720. }
  3721. }
  3722. #print "parts=@parts\n";
  3723. return @parts;
  3724. }