microsatellite_birthdeath.pl - The Perl code reads a file, …

/tools/regVariation/microsatellite_birthdeath.pl

https://bitbucket.org/chapmanb/galaxy-central/ · Perl · 4333 lines · 2916 code · 609 blank · 808 comment · 701 complexity · d1bf4746bc2b8d229099df9b29475ffa MD5 · raw file
Large files are truncated click here to view the full file

#!/usr/bin/perl -w
use strict;
use warnings;
use Term::ANSIColor;
use Pod::Checker; 
use File::Basename;
use IO::Handle;
use Cwd;
use File::Path qw(make_path remove_tree);
use File::Temp qw/ tempfile tempdir /;
my $tdir = tempdir( CLEANUP => 1 );
chdir $tdir;
my $dir = getcwd;  
# print "current dit=$dir\n"; 
#<STDIN>;
use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags 
$infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species 
$gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags
$mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG);
use FileHandle;
use IO::Handle;                     # 5.004 or higher



#my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
#"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
#"10","0.8");
my @ar = @ARGV;
my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
$SIMILARITY_THRESH=$SIMILARITY_THRESH/100;
#########################
$SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
my $EDGE_DISTANCE = 10; 
my $COMPLEXITY_SUPPORT = 20;
load_thresholds("9_10_12_12");
my $FLANKINDEL_MAXTHRESH = 0.3;

my $mono_flanksimplicityRepno=6;
my $di_flanksimplicityRepno=10;
my $prop_of_seq_allowedtoAT=0.5;
my $prop_of_seq_allowedtoCG=0.66;

#########################
my $tspecies_set = $species_set;

my %speciesReplacement = ();
my %speciesReplacementTag = ();
my %replacementArr= ();
my %replacementArrTag= ();
my %backReplacementArr= ();
my %backReplacementArrTag= ();
$tree_definition=~s/\s+//g;

my $tree_definition_split = $tree_definition;
$tree_definition_split =~ s/[\(\)]//g;
my @gotSpecies = ($tree_definition_split =~ /(,)/g);
# print "gotSpecies = @gotSpecies\n";

if (scalar(@gotSpecies)+1 ==5){

	$speciesReplacement{1}="calJac1";
	$speciesReplacement{2}="rheMac2";
	$speciesReplacement{3}="ponAbe2";
	$speciesReplacement{4}="panTro2";
	$speciesReplacement{5}="hg18";

	$speciesReplacementTag{1}="M";
	$speciesReplacementTag{2}="R";
	$speciesReplacementTag{3}="O";
	$speciesReplacementTag{4}="C";
	$speciesReplacementTag{5}="H";
	$species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1";

}
if (scalar(@gotSpecies)+1 ==4){

	$speciesReplacement{1}="rheMac2";
	$speciesReplacement{2}="ponAbe2";
	$speciesReplacement{3}="panTro2";
	$speciesReplacement{4}="hg18";

	$speciesReplacementTag{1}="R";
	$speciesReplacementTag{2}="O";
	$speciesReplacementTag{3}="C";
	$speciesReplacementTag{4}="H";
	$species_set="hg18,panTro2,ponAbe2,rheMac2";

}

#	$tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)";
my $tree_definition_copy = $tree_definition;
my $tree_definition_orig = $tree_definition;
my $brackets = 0;

while (1){
	#last if $tree_definition_copy !~ /\(/;
	$brackets++;
# print "brackets = $brackets\n";
	last if $brackets > 6;
	$tree_definition_copy =~ s/^\(//g;
	$tree_definition_copy =~ s/\)$//g;
# print "tree_definition_copy = $tree_definition_copy\n";
	my @arr = ();

	if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
		@arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
# print "arr = @arr\n";
		$tree_definition_copy = $2;
		$replacementArr{$1} = $speciesReplacement{$brackets};
		$backReplacementArr{$speciesReplacement{$brackets}}=$1;
		
		$replacementArrTag{$1} = $speciesReplacementTag{$brackets};
		$backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1;
# print "replacing $1 with $replacementArr{$1}\n";
		
		$sp_ident[$brackets-1] = $1;
	
	}
	elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
		@arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
# print "arr = @arr\n";
		$tree_definition_copy = $1;
		$replacementArr{$2} = $speciesReplacement{$brackets};
		$backReplacementArr{$speciesReplacement{$brackets}}=$2;

		$replacementArrTag{$2} = $speciesReplacementTag{$brackets};
		$backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
# print "replacing $2 with $replacementArr{$2}\n";

		$sp_ident[$brackets-1] = $2;
	}
	elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
		@arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/;
# print "arr = @arr .. TERMINAL\n";
		$tree_definition_copy = $1;
		$replacementArr{$2} = $speciesReplacement{$brackets};
		$replacementArr{$1} = $speciesReplacement{$brackets+1};
		$backReplacementArr{$speciesReplacement{$brackets}}=$2;
		$backReplacementArr{$speciesReplacement{$brackets+1}}=$1;

		$replacementArrTag{$1} = $speciesReplacementTag{$brackets+1};
		$backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1;

		$replacementArrTag{$2} = $speciesReplacementTag{$brackets};
		$backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
# print "replacing $1 with $replacementArr{$1}\n";
# print "replacing $2 with $replacementArr{$2}\n";
# print "replacing $1 with $replacementArrTag{$1}\n";
# print "replacing $2 with $replacementArrTag{$2}\n";

		$sp_ident[$brackets-1] = $2;
		$sp_ident[$brackets] = $1;


		last;

	}
	elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){
		$tree_definition_copy =~ s/^\(//g;
		$tree_definition_copy =~ s/\)$//g;
		$brackets--;
	}		
}

foreach my $key (keys %replacementArr){
	my $replacement = $replacementArr{$key};
	$tree_definition =~ s/$key/$replacement/g;
}
@sp_ident = reverse(@sp_ident);
# print "modified tree_definition = $tree_definition\n";
# print "done .. tree_definition = $tree_definition\n";
# print "sp_ident = @sp_ident\n";
#<STDIN>;


my $complexity=int($COMPLEXITY_SUPPORT * (1/40));

#print "complexity=$complexity\n";
#<STDIN>;

$printer = 1;

my $rando = int(rand(1000));
my $localdate = `date`;
$localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
my $info = $rando.$1.$2.$3;

#---------------------------------------------------------------------------
# GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES


my @thresharr = (0, split(/,/,$thresholds));
my $randno=int(rand(100000));
my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
my $megamatchlck = $megamatch.".lck";
unlink $megamatchlck;

my $selected= $orth;
#my $eventfile = $orth;
 $selected = $selected."_SELECTED";
#$selected = $selected."_".$SIMILARITY_THRESH;
#my $runtime = $selected.".runtime";

my $inputtags = "H:C:O:R:M";
$inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;

my @all_tags = split(/:/, $inputtags);
my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
$inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
#@sp_ident = split(/:/,$inputsp);
my $junkfile = $orth."_junk";

my $sh = load_sameHash(1);
my $rh = load_revHash(1);

#print "inputs are : \n"; foreach(@ARGV){print $_,"\n";} 
#open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
#open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
#my $ctlfile = "baseml\.ctl"; #$ARGV[4];
#my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; 	#1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK 
my %registeredTrees = ();
my @removalReasons = 
("microsatellite is compound", 
"complex structure", 
"if no. if micros is more than no. of species", 
"if more than one micro per species ", 
"if microsat contains N", 
"different motif than required ", 
"more than zero interruptions", 
"microsat could not form key ", 
"orthologous microsats of different motif size ",
"orthologous microsats of different motifs ", 
"microsats belong to different alignment blocks altogether", 
"microsat near edge", 
"microsat in low complexity region", 
"microsat flanks dont align well", 
"phylogeny not informative");
my %allowedhash=();
#---------------------------------------------------------------------------
# WORKING ON MAKING THE MEGAMATCH FILE
my $chromt=int(rand(10000));
my $p_chr=$chromt;

my $tree_definition_orig_copy = $tree_definition_orig;

$tree_definition=~s/,/, /g;
$tree_definition =~ s/, +/, /g;
$tree_definition_orig=~s/,/, /g;
$tree_definition_orig =~ s/, +/, /g;
my @exactspeciesset_unarranged = split(/,/,$species_set);
my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set);
my $largesttree = "$tree_definition;";
my $largesttree_orig = "$tree_definition_orig;";
# print "largesttree = $largesttree\n";
$tree_definition =~ s/\(//g;
$tree_definition =~ s/\)//g;
$tree_definition=~s/[\)\(, ]/\t/g;
$tree_definition =~ s/\t+/\t/g;		

$tree_definition_orig =~ s/\(//g;
$tree_definition_orig =~ s/\)//g;
$tree_definition_orig =~s/[\)\(, ]/\t/g;
$tree_definition_orig =~ s/\t+/\t/g;		
# print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n";

my @treespecies=split(/\t+/,$tree_definition);
my @treespecies_orig=split(/\t+/,$tree_definition_orig);
# print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n";
#<STDIN>;

foreach my $spec (@treespecies){
	foreach my $espec (@exactspeciesset_unarranged){
# print "spec=$spec and espec=$espec\n";
		push @exactspecies, $spec if $spec eq $espec;
	}
}

foreach my $spec (@treespecies_orig){
	foreach my $espec (@exactspeciesset_unarranged_orig){
# print "spec=$spec and espec=$espec\n";
		push @exactspecies_orig, $spec if $spec eq $espec;
	}
}

my $focalspec = $exactspecies[0];
my $focalspec_orig = $exactspecies_orig[0];
# print "exactspecies=@exactspecies ... focalspec=$focalspec\n";
# print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n";
#<STDIN>;
my $arranged_species_set = join(".",@exactspecies);
my $arranged_species_set_orig = join(".",@exactspecies_orig);

@exacttags=@exactspecies;
my @exacttags_orig=@exactspecies_orig;

foreach my $extag (@exacttags){
	$extag =~ s/hg18/H/g;
	$extag =~ s/panTro2/C/g;
	$extag =~ s/ponAbe2/O/g;
	$extag =~ s/rheMac2/R/g;
	$extag =~ s/calJac1/M/g;
}

foreach my $extag (@exacttags_orig){
	$extag =~ s/hg18/H/g;
	$extag =~ s/panTro2/C/g;
	$extag =~ s/ponAbe2/O/g;
	$extag =~ s/rheMac2/R/g;
	$extag =~ s/calJac1/M/g;
}

my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
#print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; 

maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set);
#print "made files\n";
my @filterseqfiles= ($chr_name);
$largesttree =~ s/hg18/H/g;
$largesttree =~ s/panTro2/C/g;
$largesttree =~ s/ponAbe2/O/g;
$largesttree =~ s/rheMac2/R/g;
$largesttree =~ s/calJac1/M/g;
#<STDIN>;
#---------------------------------------------------------------------------

my ($lagestnodes, $largestbranches) = get_nodes($largesttree); 
shift (@$lagestnodes);
my @extendedtitle=();

my $title = ();
my $parttitle = ();
my @titlearr = ();
my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold");

my @finames= qw(chr	start	end	motif	motifsize	microsat	mutation	mutation.position	mutation.from	mutation.to	insertion.details	deletion.details);

my @fititle=();

foreach my $spec (split(",",$tspecies_set)){
	push @fititle, $spec;
	foreach my $name (@finames){
		push @fititle, $spec.".".$name;
	}
}


my @othertitle=qw(somechr somestart	somened	event source);

my @fnames = ();
push @fnames, qw(insertions_num  deletions_num  motinsertions_num  motinsertionsf_num  motdeletions_num  motdeletionsf_num  noninsertions_num  nondeletions_num) ;
push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ;
push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ;
push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ;
push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num);

my @fullnames = ();
# print "revising\n";
# print "H = $backReplacementArrTag{H}\n";
# print "C = $backReplacementArrTag{C}\n";
# print "O = $backReplacementArrTag{O}\n";
# print "R = $backReplacementArrTag{R}\n";
# print "M = $backReplacementArrTag{M}\n";

foreach my $lnode (@$lagestnodes){
	my @pair = @$lnode;
	my @nodemutarr = ();
	for my $p (@pair){
# print "p = $p\n";
		$p =~ s/[\(\), ]+//g;
		$p =~ s/([A-Z])/$1./g;
		$p =~ s/\.$//g;
		
		$p =~ s/H/$backReplacementArrTag{H}/g;
		$p =~ s/C/$backReplacementArrTag{C}/g;
		$p =~ s/O/$backReplacementArrTag{O}/g;
		$p =~ s/R/$backReplacementArrTag{R}/g;
		$p =~ s/M/$backReplacementArrTag{M}/g;
		foreach my $n (@fnames) {	push @fullnames, $p.".".$n;}
	}
}

#print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle);

#print SUMMARY "\t",join("\t", @fullnames);
my $header = join("\t",@firsttitle, @fititle, @othertitle,  @fullnames, @fnames, "tree", "cleancase");
# print "header= $header\n";
#<STDIN>;

#print SUMMARY "\t",join("\t", @fnames);
#$title=  $title."\t".join("\t", @fnames);

#print SUMMARY "\t","tree","\t", "cleancase", "\n";
#$title=  $title."\t"."tree"."\t"."cleancase". "\n";

#print $title; #<STDIN>;

#print "all_tags = @all_tags\n";

for my $no (3 ... $#all_tags+1){
#	print "no=$no\n"; #<STDIN>;
	@tags = @all_tags[0 ... $no-1];
# print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>;
	%template=();
	my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	#next if scalar(@tags) < 4;
# print "now doing tags = @tags, no = $no\n"; 
	open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!";
	
#	print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion);
	
	
	##################### T E M P O R A R Y #####################
	my @finaltitle=();
	my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13);
	my $endtitle = ();
	foreach my $tag (@tags){
		my @tempsingle = ();
		
		foreach my $single (@singletitle){
			push @tempsingle, $tag.$single;
		}
		@finaltitle = (@finaltitle, @tempsingle);	
	}

#	print SUMMARY join("\t",@finaltitle),"\n";
	
	#############################################################
	
	#---------------------------------------------------------------------------
	# GET THE TREE FROM TREE FILE
	my $tree = ();
	$tree = "((H, C), O)" if $no == 3;
	$tree = "(((H, C), O), R)" if $no == 4;
	$tree = "((((H, C), O), R), M)" if $no == 5;
#	$tree=~s/;$//g;
#	print "our tree = $tree\n";
	#---------------------------------------------------------------------------
	# LOADING HASH CONTAINING ALL POSSIBLE TREES:
	$tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt";
	%template=();
	%alternate=();
	load_allPossibleTrees($tree_decipherer, \%template, \%alternate);
	
	#---------------------------------------------------------------------------
	# LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS
	%treesToReject=();
	%treesToIgnore=();
	load_treesToReject(@tags); 
	load_treesToIgnore(@tags);
	#---------------------------------------------------------------------------
	# LOADING INPUT DATA INTO HASHES AND ARRAYS
	
	
	#1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS 
	#2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF
	#3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES).
	
	my %orths=();
	my $counterm = 0;
	my $loaded = 0;
	my %seen = ();
	my @allowedchrs = ();
#	print "no = $no\n"; #<STDIN>;

	while (my $line = <ORTH>){
	#	print "line=$line\n";
		my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g;
		my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g;
		my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g;
		my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g;
		my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4];
		
	#	print "line = $line\n"; <STDIN>;
		
		
	#	next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags);
		my @micros = split(/>/,$line); 									# LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros
		#print "micros=",printarr(@micros),"\n"; #<STDIN>;
		shift @micros; 													# EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY
		
		$no_of_species = adjustCoordinates($micros[0]);
#		print "A: $no_of_species\n";
		next if $no_of_species != $no;
# print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>;
		$counterm++;
		#------------------------------------------------
		$nextcounter[0]++  if $line =~ /compound/;
		next if $line =~ /compound/; 									# GETTING RID OF COMPOUND MICROSATS
		#------------------------------------------------
		#next if $line =~ /[A-Za-z]>[a-zA-Z]/;
		#------------------------------------------------
		chomp $line;
		my $match_count = ($line =~ s/>/>/g); 							# COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER
		#print "number of species = $match_count\n";
		my $stopper = 0;	
		foreach my $mic (@micros){
			my @local = split(/\t/,$mic);
			if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++;
			last; } 
																		# REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM
		}
		next if $stopper ==1;
		#------------------------------------------------
		$nextcounter[2]++ if (scalar(@micros) >$no_of_species);
	
		next if (scalar(@micros) >$no_of_species); 						#1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET.
																		#2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY
																		#3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING
																		#4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD.
																		#5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl
		my $nexter = 0;
		foreach my $tag (@tags){
			my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g);
			if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1; 
				next; 
			}
		}
		
		if ($nexter == 1){
			$nextcounter[3]++;
			next;
		}
		#------------------------------------------------
		foreach my $mic (@micros){										#1	REMOVING MICROSATELLITES WITH ANY 'N's IN THEM
			my @local = split(/\t/,$mic);								
			if ($local[$microsatcord] =~ /N/) {$stopper =1; 		$nextcounter[4]++;
			last;}			
		}
		next if $stopper ==1;
		#print "till here 1\n"; #<STDIN>;
		#------------------------------------------------
		my @micros_copy = @micros;
		
		my $tempmicro = shift(@micros_copy);							#1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST 
																		#2 MICROSAT IN THE CLUSTER.
		my @tempfields = split(/\t/,$tempmicro);
		my $prevtype = $tempfields[$typecord];
		my $tempmotif = $tempfields[$motifcord];
		
		my $tempfirstmotif = ();
		if (scalar(@tempfields) > $microsatcord + 2){
			if ($tempfields[$no_of_interruptionscord] >= 1) {			#1	DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS
																		#2	IN THE FIRST MICROSAT OF THE CLUSTER
				$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; 
			}
		}
		if ($nexter == 1){
			$nextcounter[6]++;
			next;
		}															#1	DONE OBTAINING INFORMATION REGARDING 
																		#2	THE FIRST MICROSAT FROM THE CLUSTER
		
		if ($tempmotif =~ /^\[/){
			$tempmotif =~ s/^\[//g;
			$tempmotif =~ /([a-zA-Z]+)\].*/;
			$tempfirstmotif = $1;										#1 OBTAINING THE FIRTS MOTIF OF MICROSAT
		}
		else {$tempfirstmotif = $tempmotif;}
		my $prevmotif = $tempfirstmotif;
		
		my $key = ();
	#	print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n";
	#	print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>;
		if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
	#		print "B: $no_of_species\n";
			$key = join("_",$2, $3,  $4, $5);
		}
		else{
# print "counld not form  a key for temp\n"; # if $printer == 1;
			$nextcounter[7]++;
			next;
		}
		#-----------------												#1	NOW, AFTER OBTAINING INFORMATION ABOUT
																		#2	THE FIRST MICROSAT IN THE CLUSTER, THE
																		#3	FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS
																		#4	TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW)
	
		foreach my $micro (@micros_copy){
			my @fields = split(/\t/,$micro);
			#-----------------	
			if (scalar(@fields) > $microsatcord + 2){					#1	DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS
				if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; 
				$nextcounter[6]++;
				last; }											
			}
			#-----------------	
			if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) { 
				$nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1; 
				$nextcounter[8]++;
				last; }														#1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
			#-----------------											#2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.)
			$prevtype = $fields[$typecord];
			
			my $motif = $fields[$motifcord];
			my $firstmotif = ();
		
			if ($motif =~ /^\[/){
				$motif =~ s/^\[//g;
				$motif =~ /([a-zA-Z]+)\].*/;
				$firstmotif = $1;
			}
			else {$firstmotif = $motif;}
			
			my $motifpattern = $firstmotif.$firstmotif;
			my $prevmotifpattern = $prevmotif.$prevmotif;
			
			if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) {  
				$nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1;
				$nextcounter[9]++;
				last; 
			}														#1	DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
																	#2	TO DIFFERENT MOTIFS
			my $prevmotif = $firstmotif;
			#-----------------			
			
			for my $t (0 ... $#tags){								#1	DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG
																	#2	DIFFERENT ALIGNMENT BLOCKS
				if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
					my $key2 = join("_",$2, $3, $4, $5);
# print "key = $key .. key2 = $key2\n"; #<STDIN>;
					if ($key2 ne $key){
#						print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1;
						$nextcounter[10]++;
						$nexter = 1; last;
					}
				}
				else{
# print "counld not form  a key for $line\n"; # if $printer == 1; 
					#<STDIN>;
					$nexter = 1; last;
				}
			}
		}
		#print "D2: $no_of_species\n";

		#####################
		if ($nexter == 1){
# print "nexting\n"; # if $printer == 1; 
				next;
			}
		else{
# print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n"  if $printer == 1;
			push (@{$orths{$key}},$line);	
			$loaded++;
			if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
				
#				print "$line\n"  if $printer == 1; #if $line =~ /Contig/;
#				print "################ ################\n" if $printer == 1;
				push @allowedchrs, $2 if !exists $allowedhash{$2};
				$allowedhash{$2} = 1; 
				my $key = join("\t",$1, $2, $3, $4);
#				print "C: $no_of_species .. key = $key\n";#<STDIN>;
# print "print the shit: $key\n" ; #if $printer  == 1;
				$seen{$key} = 1;
			}
			else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n";
			}
		}
	}
	close ORTH;
# print "now studying where we lost microsatellites: @nextcounter\n";
	for my $reason (0 ... $#nextcounter){
		#print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
	}
# print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n";
# print "done filtering .. counterm = $counterm and loaded = $loaded\n"; 
	#----------------------------------------------------------------------------------------------------------------
	# NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY.
# print "adding files @filterseqfiles \n";
	#<STDIN>;
	
	while (1){
		if (-e $megamatchlck){
# print "waiting to write into $megamatchlck\n";
			sleep 10;
		}
		else{
			open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!";	
			open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!";
			last;
		}
	}
	foreach my $seqfile (@filterseqfiles){
		my $fullpath = $seqfile;
		open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!";
		my $matchlines = 0;

		while (my $line = <MATCH>)	{
			#print "checking $line";
			if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
				my $key = join("\t",$1, $2, $3, $4);
# print "key = $key\n";
				#print "------------------------------------------------------\n";
				#print "asking $line\n";
				if (exists $seen{$key}){
				
				#print "seen $line \n"; <STDIN>;
					while (1){
						$matchlines++;
						print MEGAM $line;
						$line = <MATCH>;
						print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/;
						last if $line !~/[0-9a-zA-Z]/;
					}
				}
				else{
# print "not seen\n";
				}
			}
		}
#		print "matchlines = $matchlines\n";
		close MATCH;
	}
	close MEGAMLCK;
	
	unlink $megamatchlck;
	close MEGAM;
	undef %seen;
# print "done writitn to $megamatch\n";#<STDIN>;
	#----------------------------------------------------------------------------------------------------------------
	#<STDIN>;
	#---------------------------------------------------------------------------
	# NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO
	# THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE 
	# FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER
	#$printer = 1;
	
	my $microreadcounter=0;
	my $contigsentered=0;
	my $contignotrightcounter=0;
	my $keynotformedcounter=0;
	my $keynotfoundcounter= 0;
	my $dotcounter = 0;

#	print "opening $megamatch\n";

	open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!";
# print "doing  $megamatch\n " ;

	#<STDIN>;
	
	while (my $line = <BO>){
#		print $line; #<STDIN>;
# print "." if $dotcounter % 100 ==0;
#		print "\n" if $dotcounter % 5000 ==0;
# print "dotcounter = $dotcounter\n " if $printer == 1;
		next if $line !~ /^[0-9]+/;
		$dotcounter++;
#		print colored ['green'], "~" x 60, "\n" if $printer == 1;
#		print colored ['green'], $line;# if $printer == 1;
		chomp $line;
		my @fields2 = split(/\t/,$line);
		my $key2 = ();
		my $alignment_no = ();										#1 TEMPORARY
		if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
#			$key2 = join("\t",$1, $2,  $4, $5);
			$key2 = join("_",$2, $3,  $4, $5);
#			print "key = $key2\n";

# print "key = $key2\n";
			$alignment_no=$1;
		}
		else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;}
		
		$no_of_species = adjustCoordinates($line);

		$contignotrightcounter++  if $no_of_species != $no;
# print "contignotrightcounter=$contignotrightcounter\n";
# print "no_of_species=$no_of_species\n";
# print "no=$no\n";
		
		next if $no_of_species != $no;
#		print "D: $no_of_species\n";
#		print "E: $no_of_species\n";
		#<STDIN>;
	#	print "key = $key2\n" if $printer == 1;
		my @clusters = ();											#1 EXTRACTING MICROSATS CORRESPONDING TO THIS 
																	#2 ALIGNMENT BLOCK
		if (exists($orths{$key2})){
			@clusters = @{$orths{$key2}};
			$contigsentered++;
			delete $orths{$key2};
		}
		else{
#			 print "orth does not exist\n";
			$keynotfoundcounter++;
			next;
		}
		
		my %sequences=();											#1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK
		my $humseq = ();
		foreach my $tag (@tags){									#1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES
			my $seq = <BO>;											#2 OF ALL SPECIES.
			chomp $seq;
			$sequences{$tag} =  " ".$seq;
			#print "sequences = $sequences{$tag}\n" if $printer == 1;
			$humseq = $seq if $tag =~ /H/;
		}
		
		
		foreach my $cluster (@clusters){							#1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS
# print "x" x 60, "\n" if $printer == 1;
#			print colored ['red'],"cluster = $cluster\n";
			$largesttree =~ s/hg18/H/g;
			$largesttree =~ s/panTro2/C/g;
			$largesttree =~ s/ponAbe2/O/g;
			$largesttree =~ s/rheMac2/R/g;
			$largesttree =~ s/calJac1/M/g;

			$microreadcounter++;
			my @micros = split(/>/,$cluster);
			
			
			
			
			
			
			shift @micros;
	
			my $edge_microsat=0;									#1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND
																	#2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK
			
			my @starts= ();	my %start_hash=();						#1 STORES THE START AND END COORDINATES OF MICROSATELLITES
			my @ends = ();	my %end_hash=();						#2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME 
																	#3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES.
			
			my %microhash=();
			my %microsathash=();
			my %nonmicrosathash=();
			my $motif=();											#1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1
# print "tags=@tags\n";
			for my $i (0 ... $#tags){								#1	FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE
																	#2	CORRESPONDING TO THE PARTICULAR SPECIES (AS PER 
																	#3	THE VARIABLE $TAG;
				my $tag = $tags[$i];
			#		print $seq;
				my $locus="NULL";										#1	THIS WILL STORE THE MICROSAT OF THIS SPECIES.
																		#2	IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL"
					
				foreach my $micro (@micros){	
				#	print "micro=$micro, tag=$tag\n";
					if ($micro =~ /^$tag/){								#1	MICROSAT OF THIS SPECIES FOUND..
						$locus  = $micro;
						my @fields = split(/\t/,$micro);
						$motif = $fields[$motifcord];
						$microsathash{$tag}=$fields[$microsatcord];
				#		print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
						push(@starts, $fields[$startcord]);
						push(@ends, $fields[$endcord]);
						$start_hash{$tag}=$fields[$startcord];
						$end_hash{$tag}=$fields[$endcord];
						last;
					}
					else{$microsathash{$tag}="NULL"}
				}
				$microhash{$tag}=$locus;
		
			}	
			
			
			
			my $extreme_start  = smallest_number(@starts);		#1 THESE TWO ARE THE EXTREME COORDINATES OF THE 
			my $extreme_end  = largest_number(@ends);			#2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN 
																#3 WHOM IT IS FOUND TO BE ORTHOLOGOUS.
			
#			print "starts=@starts... ends=@ends\n";
				
			my %up_flanks = ();									#1	CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES
			my %down_flanks = ();								#1	CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES
			
			my %up_largeflanks = ();
			my %down_largeflanks = ();
			
			my %locusandflanks = ();
			my %locusandlargeflanks = ();			
			
			my %up_internal_flanks=();							#1	CONTAINS SEQUENCE BETWEEN THE $extreme_start and the 
																#2	ACTUAL START OF MICROSATELLITE IN THE SPECIES
			my %down_internal_flanks=();						#1	CONTAINS SEQUENCE BETWEEN THE $extreme_end and the 
																#2	ACTUAL end OF MICROSATELLITE IN THE SPECIES
			
			my %alignment=();									#1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
																#2 EXTEME VALUES.
			
			my %microsatstarts=();										#1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
																#2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
																#3 WIHIN THE ALIGNMENT
			next if !defined $extreme_start;
			next if !defined $extreme_end;
			next if $extreme_start > length($sequences{$tags[0]});
			next if $extreme_start < 0;
			next if $extreme_end > length($sequences{$tags[0]});
			
			for my $i (0 ... $#tags){							#1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
																#2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
																#3 AS WELL AS THE EXTREME COORDINATES OF THE 
																#4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
																#5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN 
																#6 MORE DETAIL.
				my $tag = $tags[$i];		
#				print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
				my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g);		#1	NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM 
																										#2	AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
																										#3	CLUSTER.
#				print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ;	
				my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g);
				if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) <  $EDGE_DISTANCE){
					$edge_microsat=1;
					
					last;
				}
				else{
					$up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT);
					$down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT);

					$up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT);
					$down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT);


					$alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
					$locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag};
					$locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag};
					
					if ($microhash{$tag} ne "NULL"){
						$up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start);
						$down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag});
						$microsatstarts{$tag}=$start_hash{$tag}-$extreme_start;
#						print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1;
					}
					else{
						$nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
					
					}
			#		print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1;
										
				}
	
			}
			$nextcounter[11]++  if $edge_microsat==1;
			next if $edge_microsat==1;
			
			
			my $low_complexity = 0; 								#1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS
																	#2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE
																	#3 FUNCTION sub test_complexity
			
			
			for my $i (0 ... $#tags){
#				print "i = $tags[$i]\n" if $printer == 1;
				if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){
#					print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), "  and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1;
					$low_complexity =1; last;
				}
			}
			
			$nextcounter[12]++  if $low_complexity==1;
			next if $low_complexity == 1;
			
			
			my $sequence_dissimilarity = 0;										#1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY
																	#2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN
																	#3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD
																	#4 AS DESCRIBED IN FUNCTION sub sequence_similarity
			my %donepair = ();
			for my $i (0 ... $#tags){
			#	print "i = $tags[$i]\n" if $printer == 1;
#				next if $i == 0;
			#	print colored ['magenta'],"THIS IS UP\n" if $printer == 1;

				for my $b (0 ... $#tags){
					next if $b == $i;
					my $pair = ();
					$pair = $i."_".$b if $i < $b;
					$pair = $b."_".$i if $b < $i;
					next if exists $donepair{$pair};
					my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
					my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);				
					$donepair{$pair} = $up_similarity."_".$down_similarity;
					
#					print RUN "$up_similarity	$upnucdiffs	$upindeldiffs	$down_similarity	$downnucdiffs	$downindeldiffs\n";
					
					if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){
						$sequence_dissimilarity =1; 
						last;
					}
				}
			}
			$nextcounter[13]++  if $sequence_dissimilarity==1;

			next if $sequence_dissimilarity == 1;
			my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq);
#			print "simplified_microsat=$simplified_microsat\n"; 
			my ($tree_analysis,  $conformation) = treeStudy($simplified_microsat);
# print "tree_analysis = $tree_analysis .. conformation=$conformation\n"; 
			#<STDIN>;
			
#			print SELECT "\"$conformation\"\t$tree_analysis\n";
			
			next if $tree_analysis =~ /DISCARD/;
			if (exists $treesToReject{$tree_analysis}){
				$nextcounter[14]++;
				next;
			}

#			print "F: $no_of_species\n";

# 			my $adjuster=();
# 			if ($no_of_species == 4){
# 				my @sields = split(/\t/,$simplified_microsat);
# 				my $somend = pop(@sields);
# 				my $somestart = pop(@sields);
# 				my $somechr = pop(@sields);
# 				$adjuster = "NA\t" x 13 ;
# 				$simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
# 			}
# 			if ($no_of_species == 3){
# 				my @sields = split(/\t/,$simplified_microsat);
# 				my $somend = pop(@sields);
# 				my $somestart = pop(@sields);
# 				my $somechr = pop(@sields);
# 				$adjuster = "NA\t" x 26 ;
# 				$simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
# 			}
# 			
			$registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis};
			$registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis};
				
			if (exists $treesToIgnore{$tree_analysis}){
				my @appendarr = ();

#				print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
				#print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
#				print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n";
				
				foreach my $lnode (@$lagestnodes){
					my @pair = @$lnode;
					my @nodemutarr = ();
					for my $p (@pair){
						my @mutinfoarray1 = ();
						for (1 ... 38){
#							push (@mutinfoarray1, "NA")
						}
#						print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; 			
#						print  join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; 			
					}
	
				}
				for (1 ... 38){
					push (@appendarr, "NA")
				}
#				print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; 
#				print  join ("\t", @appendarr,"NULL", "NULL"),"\n"; 
		#		print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
				next;
			}
#			print colored ['blue'],"cluster = $cluster\n";
			
			my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
	
			if ($mutations_array eq "NULL"){
			#	print "cluster = $cluster \n"; <STDIN>;
				my @appendarr = ();

			#	print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t";

			#	foreach my $lnode (@$lagestnodes){
			#		my @pair = @$lnode;
			#		my @nodemutarr = ();
			#		for my $p (@pair){
			#			my @mutinfoarray1 = ();
			#			for (1 ... 38){
			#				push (@mutinfoarray1, "NA")
			#			}
			#			print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; 			
			#			print  join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; 			
			#		}
			#	}
			#	for (1 ... 38){
			#		push (@appendarr, "NA")
			#	}
			#	print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; 
			#	print  join ("\t", @appendarr,"NULL", "NULL"),"\n"; 
			#	print  join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
				next;
			}
			
			
#			print "sent: \n" if $printer == 1;
#			print "nodes = @$nodes,  branches array:\n" if $mutations_array ne "NULL" &&  $printer == 1;

			my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
#			print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" &&  $printer == 1;
			my @finalmutations_array= ();
			@finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL";
			@finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL";
# print "alt = $alternate{$conformation}\n";

			my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array);
			my $cleancase = "UNCLEAN";
			$cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/;
			$cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL";
			$cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL";
			$cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ;
# print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>;

			my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H",
			"+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO");
			next if (oneOf(@selects, $besttree) == 0);
			if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){
				$besttree = "$besttree / $tree_analysis";				
			}

			$besttree = "NULL" if $treescore <= 0;
			
			while ($besttree =~ /[A-Z][A-Z]/){
				$besttree =~ s/([A-Z])([A-Z])/$1:$2/g;
			}
			
			if ($besttree !~ /NULL/){			
				my @elements = ($besttree =~ /([A-Z])/g);
				
				foreach my $ele (@elements){
#					 print "replacing $ele with $backReplacementArrTag{$ele}\n";
					$besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele};
				}
			}
			my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t";
			next if $endendstate =~ /NA\tNA\tNA/;
			
			print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t";
# print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ;
			
			my @mutinfoarray =();
			
			foreach my $lnode (@$lagestnodes){
				my @pair = @$lnode;
				my $joint = "(".join(", ",@pair).")";
				my @nodemutarr = ();

				for my $p (@pair){
						foreach my $mut (@finalmutations_array){
						$mut =~ /node=([A-Z, \(\)]+)/;
						push @nodemutarr, $mut if $p eq $1;
					}
					@mutinfoarray = summarizeMutations(\@nodemutarr, $besttree);
					
				#	print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; 			
				#	print  join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; 			
				}
			}
			
#			print "G: $no_of_species\n";
			
			my @alignmentarr = ();
			
			foreach my $key (keys %alignment){
				push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key};
				
			}
#			print "alignmentarr = @alignmentarr"; <STDIN>;
			
			@mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree);
			print SUMMARY join ("\t", @mutinfoarray ),"\t"; 			
			print SUMMARY join(",",@alignmentarr),"\n";
#			print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5;
#			<STDIN> if scalar(@tags) < 5;
#			print   $cleancase, "\n";
#			print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>; 			
# print "summarized\n"; #<STDIN>;
	
	
	
			my %indelcatch = ();
			my %substcatch = ();
			my %typecatch = ();
			my %nodescatch = ();
			my $mutconcat = join("\t", @finalmutations_array)."\n";
			my %indelposcatch = ();
			my %subsposcatch = ();
			
				foreach my $fmut ( @finalmutations_array){
#					next if $fmut !~ /indeltype=[a-zA-Z]+/;
					#print RUN $fmut, "\n";
					$fmut =~ /node=([a-zA-Z, \(\)]+)/;
					my $lnode = $1;
					$nodescatch{$1}=1;
					
					if ($fmut =~ /type=substitution/){
		#				print "fmut=$fmut\n";
						$fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/;
						my $from=$1;
		#				print "from=$from\n";
						my $to=$2;
		#				print "to=$to\n";	
						push @{$substcatch{$lnode}} , ("from:".$from." to:".$to);
						$fmut =~ /position=([0-9]+)/;
						push @{$subsposcatch{$lnode}}, $1;
					}
					
					if ($fmut =~ /insertion=[a-zA-Z\-]+/){
						$fmut =~ /insertion=([a-zA-Z\-]+)/;
						push @{$indelcatch{$lnode}} , $1;
						$fmut =~ /indeltype=([a-zA-Z]+)/;
						push @{$typecatch{$lnode}}, $1;
						$fmut =~ /position=([0-9]+)/;
						push @{$indelposcatch{$lnode}}, $1;
					}
					if ($fmut =~ /deletion=[a-zA-Z\-]+/){
						$fmut =~ /deletion=([a-zA-Z\-]+)/;
						push @{$indelcatch{$lnode}} , $1;
						$fmut =~ /indeltype=([a-zA-Z]+)/;
						push @{$typecatch{$lnode}}, $1;
						$fmut =~ /position=([0-9]+)/;
						push @{$indelposcatch{$lnode}}, $1;
					}
				}
							
		#	print  $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1;
		#	print join ("<\t>", @mutinfoarray),"\n" if $printer == 1; 
		#	print "where mutinfoarray = @mutinfoarray\n"  if $printer == 1;
		#	#print RUN ".";
			
		#	print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
		#	print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;

		#	print colored ['red'],"finalmutations_array=\n" if $printer == 1;
#			foreach (@finalmutations_array) {
#				print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1  ;
#				print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1  ;
				
#			}# if $line =~ /cal/;# && $line =~ /chr4/;
			
#			print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
#			print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
#			print "tree analysis = $tree_analysis\n" if $printer == 1;
			
		#	my $mutations = "@$mutations_array";


			next;
			for my $keys (@$nodes) {foreach my $key (@$keys){
										#print "key = $key, => $branches_hash->{$key}\n";
									} 
								#	print "x" x 50, "\n";
								}
			my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat);
		}
	}
	close BO;
# print "now studying where we lost microsatellites:\n";
# print "x" x 60,"\n";
	for my $reason (0 ... $#nextcounter){
#		print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
	}
# print "x" x 60,"\n";
# print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n";
# print " we lost $keynotformedcounter contigs as they did not form the key, \n";
# print "$contignotrightcounter contigs as they were not of the right species configuration\n";  
# print "$keynotfoundcounter contigs as they did not contain the microsats\n";  
# print "... In total we went through a file that had $dotcounter contigs...\n";  
#	print join ("\n","remaining orth keys = ", (keys %orths),"");
# print "------   ------   ------   ------   ------   ------   ------   ------   ------   ------   ------   \n";
#	print "now printing counted trees: \n";
		if (scalar(keys %registeredTrees) > 0){
		foreach my $keyb ( sort (keys %registeredTrees) )
		{
#			print "$keyb : $registeredTrees{$keyb}\n";
		}
	}
	
	
}	
close SUMMARY;

my @summarizarr = ("+C=+C +R.+C -HCOR,+C",
"+H=+H +R.+H -HCOR,+H",
"-C=-C -R.-C +HCOR,-C",
"-H=-H -R.-H +HCOR,-H",
"+HC=+HC",
"-HC=-HC",
"+O=+O -HCOR,+O",
"-O=-O +HCOR,-O",
"+HCO=+HCO",
"-HCO=-HCO",
"+R=+R +R.+C +R.+H",
"-R=-R -R.-C -R.-H");

foreach my $line (@summarizarr){
	next if $line !~ /[A-Za-z0-9]/;
#	print $line;
	chomp $line;
	my @fields = split(/=/,$line);
#	print "title = $fields[0]\n";
	my @parts=split(/ +/, $fields[1]);
	my %par…
Summary ✨

The Perl code reads a file, processes its contents, and outputs a series of strings. It appears to be a bioinformatics tool that aligns multiple genome sequences against each other, generating a tree-like structure representing their relationships. The output is a string representation of the tree, with nodes labeled as species names or accession numbers.
Alerts (4)

Complexity hotspot; lines 1101 to 1104 (total complexity: 12)
1101 1102 1103 1104