multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl

/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl

https://bitbucket.org/chapmanb/galaxy-central/ · Perl · 5606 lines · 4007 code · 748 blank · 851 comment · 803 complexity · e64ba560e907a6b8d7818123ba23613a MD5 · raw file
Large files are truncated click here to view the full file

#!/usr/bin/perl
use strict;
use warnings;
use Term::ANSIColor;
use File::Basename;
use IO::Handle;
use Cwd;
use File::Path;
use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
use File::Path qw(make_path remove_tree);
use File::Temp qw/ tempfile tempdir /;
my $tdir = tempdir( CLEANUP => 1 );
chdir $tdir;
my $dir = getcwd;  
#print "dir = $dir\n";

#$ENV{'PATH'} .= ':' . dirname($0);
my $date = `date`;

my ($mafile, $mafile_sputt, $orthfile, $threshold_array,  $allspeciesin, $tree_definition_all, $separation) = @ARGV;
if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }

$tree_definition_all =~ s/\s+//g;
$threshold_array =~ s/\s+//g;
$allspeciesin =~ s/\s+//g;
#-------------------------------------------------------------------------------
# WHICH SPUTNIK USED?
my $sputnikpath = ();
$sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
#$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
#print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
#exit if !-e $sputnikpath;
#$sputnikpath = "bx-sputnik" ;
#print "ARGV input = @ARGV\n";
#print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n  species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
#-------------------------------------------------------------------------------
# RUNFILE
#-------------------------------------------------------------------------------
$distance = 1; #bp
$distance++;
my @tree_definitions=MakeTrees($tree_definition_all);
my $allspeciesset = $tree_definition_all;
$allspeciesset =~ s/[\(\) ]+//g;
@allspecies = split(/,/,$allspeciesset);

my @outputfiles = ();
my $round = 0;
#my $tdir = tempdir( CLEANUP => 0 );
#chdir $tdir;

foreach my $tree_definition (@tree_definitions){
	my @commas = ($tree_definition =~ /,/g) ;
	#print "commas = @commas\n"; <STDIN>;
	next if scalar(@commas) <= 1;
	#print "species_set = $species_set\n";
	$treeSpeciesNum = scalar(@commas) + 1;
	$species_set = $tree_definition;
	$species_set =~ s/[\)\( ;]+//g;
	#print "species_set = $species_set\n"; <STDIN>;

	$round++;
	#-------------------------------------------------------------------------------
	# MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
	$threshold_array=~ s/,/_/g;
	my @thresharr = split("_",$threshold_array);
	@thresholds=@thresharr;
	#my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
	#print "current dit=$dir\n";
	#-------------------------------------------------------------------------------
	# CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
	my @chrfiles=();
	
	#my $mafile =  "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
	my $chromt=int(rand(10000));
	my $p_chr=$chromt;
	
	
	my @exactspeciesset_unarranged = split(/,/,$species_set);
	$tree_definition=~s/[\)\(, ]/\t/g;
	my @treespecies=split(/\t+/,$tree_definition);
	my @exactspecies=();
	
	foreach my $spec (@treespecies){
		foreach my $espec (@exactspeciesset_unarranged){
			push @exactspecies, $spec if $spec eq $espec;
		}
	}
	#print "exactspecies=@exactspecies\n";
	$focalspec = $exactspecies[0];
	my $arranged_species_set=join(".",@exactspecies);
	my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
	my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
	#print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; 
	maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
	maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
	#print "done maf to axt conversion\n";
	my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
	artificial_axdata_inverter ($chr_name, $reverse_chr_name);
	#print "reverse_chr_name=$reverse_chr_name\n"; 
	#-------------------------------------------------------------------------------
	# FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
	foreach my $direct ("reverse_direction","forward_direction"){
		$p_chr=$chromt;
		#print "direction = $direct\n";
		$p_chr = $p_chr."r" if $direct eq "reverse_direction";
		$p_chr = $p_chr if $direct eq "forward_direction";
		my $config = $species_set;
		$config=~s/,/./g;
		my @orgs = split(/\./,$arranged_species_set);
		#print "ORGS= @orgs\n";
		my @tag=@orgs;
			
		
		my $tags = join(",", @tag);
		my @tags=@tag;
		chomp $p_chr;
		$tags = join("_", split(/,/, $tags));
		my $pchr = "chr".$p_chr;
		
		my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
		my @sp_tags = ();
		
#		print "$ptag _ orthfile\n"; <STDIN>;
		#print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
		foreach my $sp (@tag){
			push(@sp_tags, ($sp.".".$ptag));
		}
	
		my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
		my @presp_tags = ();
		
		foreach my $sp (@tag){
			push(@presp_tags, ($sp.".".$preptag));
		}
	
		my $resultdir = "";
		my $orthdir = "";
		my $filtereddir = "";
		my $pipedir = "";
		
		my @title_queries = ();
		push(@title_queries, "^[0-9]+");
		my $sep="\\s";
		for my $or (0 ... $#orgs){
			my $title =  join($sep, ($orgs[$or],  "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
			#$title =~ s/chr\\+\\s+\+/chr/g;
			push(@title_queries, $title);
		}
		my $title_query = join($sep, @title_queries);
		#print "title_queries=@title_queries\n";
		#print "query = >$title_query<\n"; 
		#print "orgs = @orgs\n"; 
		#-------------------------------------------------------------------------------
		# GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
		my $t1input = $pchr.".".$arranged_species_set.".net.axt";
		
		my @t1outputs = ();
		
		foreach my $sp (@presp_tags){
			push(@t1outputs, $sp."_gap_op");
		}
		
		
		
		multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query); 
		#print "t1outputs=@t1outputs\n";
		#print "done t1\n"; <STDIN>;
		#-------------------------------------------------------------------------------
		#START T2.PL
		
		my $stag  = (); my $tag1 = (); my $tag2 = ();  my $schrs = ();
		
		for my $t (0 ... scalar(@tags)-1){
			multi_species_t2($t1outputs[$t], $tag[$t]);		
		}
		#-------------------------------------------------------------------------------
		#START T2.2.PL
		
		my @temp_tags = @tag;
		
		foreach my $sp (@presp_tags){
			my $t2input =  $sp."_nogap_op_unrand";
			multi_species_t2_2($t2input, shift(@temp_tags));
		}
		undef (@temp_tags);
		
		#-------------------------------------------------------------------------------
		#START SPUTNIK
		
		my @jobIDs = ();
		@temp_tags = @tag;
		my @sput_filelist = ();
		
		foreach my $sp (@presp_tags){
			#print "sp = $sp\n";
			my $sputnikoutput = $pipedir.$sp."_sput_op0";
			my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
			push(@sput_filelist, $sputnikinput);
			my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
		#	print "$sputnikcommand\n";
			my @sputnikcommand_system = $sputnikcommand;
			system(@sputnikcommand_system);
		}
	
		#-------------------------------------------------------------------------------
		#START SPUTNIK OUTPUT CORRECTOR
		
		foreach my $sp (@presp_tags){
			my $corroutput = $pipedir.$sp."_sput_op1";
			my $corrinput = $pipedir.$sp."_sput_op0";
			sputnikoutput_corrector($corrinput,$corroutput);
			
			my $t4output = $pipedir.$sp."_sput_op2";
			multi_species_t4($corroutput,$t4output);
		
			my $t5output = $pipedir.$sp."_sput_op3";
			multi_species_t5($t4output,$t5output);
			#print "done t5.pl for $sp\n";
		
			my $t6output = $pipedir.$sp."_sput_op4";
			multi_species_t6($t5output,$t6output,scalar(@orgs));
		}
		#-------------------------------------------------------------------------------
		#START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
		
		foreach my $sp (@presp_tags){
			my $t9output = $pipedir.$sp."_gap_op_unrand_match";
			my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
			my $t9micro = $pipedir.$sp."_sput_op4";
			t9($t9micro,$t9sequence,$t9output);
			
			my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
			my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
			t9($t9micro,$t9sequence2,$t9output2);
		}
		#print "done both t9.pl for all orgs\n";
	
		#-------------------------------------------------------------------------------
		# FIND COMPOUND MICROSATELLITES
		
		@jobIDs = ();
		my $species_counter = 0;
		
		foreach my $sp (@presp_tags){
			my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
			my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
			my $input_micro = $pipedir.$sp."_sput_op4";
			my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
			multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
			$species_counter++;
		}
		
		#-------------------------------------------------------------------------------
		# READING  AND FILTERING SIMPLE MICROSATELLITES
		my $spcounter2=0;
		foreach my $sp (@sp_tags){
			my $presp = $presp_tags[$spcounter2];
			$spcounter2++;
			my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
			my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
			my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
			multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
		}
		
		#-------------------------------------------------------------------------------
		# ANALYZE  COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
		
		$species_counter = 0;
		foreach my $sp (@sp_tags){
			my $presp = $presp_tags[$species_counter];
			my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
			my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
			my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
			my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
			multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
			$species_counter++;
		}
		#-------------------------------------------------------------------------------
		# REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM.. 
		$species_counter = 0;
		
		foreach my $sp (@sp_tags){
			my $presp = $presp_tags[$species_counter];
			my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
			my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
			my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
			my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
			multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
			$species_counter++;
		}
		#-------------------------------------------------------------------------------
		# READING  AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
		$species_counter = 0;
		
		foreach my $sp (@sp_tags){
			my $presp = $presp_tags[$species_counter];
		
			my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
			my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
			my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
			multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
		
			my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
			my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
			my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
			multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
		
			my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
			my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
			my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
			multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
			$species_counter++;
		}
		#print "done filtering both simple and compound microsatellites \n";
		
		#-------------------------------------------------------------------------------
		
		my @combinedarray = ();
		my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
		my @combinedarray_tags = ("mono", "di", "tri", "tetra");
		$species_counter = 0;
		
		foreach my $sp (@sp_tags){
			my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
			push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
			$species_counter++;
		}
		
		#-------------------------------------------------------------------------------
		# PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION	
		my $sp_counter = 0;
		foreach my $sp (@sp_tags){	
			my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
			my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
			my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
			my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
			`cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
			for my $i (0 ... 3){
				`grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;	
			}
			$sp_counter++;		
		}
		#print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
		
		#-------------------------------------------------------------------------------
		# BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
		# ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
		@temp_tags = @sp_tags;
		my $increment = 1000000;
		my @splist = ();
		my $targetdir = $pipedir;
		$species_counter=0;
		
		foreach my $sp (@sp_tags){
			my $presp = $presp_tags[$species_counter];
			$species_counter++;
			my $localtag = shift @temp_tags;
			my $locallist = $targetdir.$localtag."_".$p_chr."_list";
			push(@splist, $locallist);
			my $input = $pipedir.$presp."_nogap_op_unrand2_match";
			chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
		}
		
	
		my @unionarray = ();
		#print "splist=@splist\n";
		#-------------------------------------------------------------------------------
		# FIND INTERRUPTED MICROSATELLITES
		
		$species_counter = 0;		
		
		for my $i (0 .. $#combinedarray){
			
			@jobIDs = ();
			open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
			
			while (my $sp1  = <JLIST1>){
				#print "$splist[$i]: sp1=$sp1\n";
				chomp $sp1;
				
				for my $j (0 ... $#combinedarray_tags){
					my $interr  = $sp1."_interr_".$combinedarray_tags[$j];
					my $simple  = $sp1."_simple_".$combinedarray_tags[$j];
					push @{$unionarray[$i]}, $interr, $simple;	
					multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8"); 
				}		
			}
			$species_counter++;
		}
		close JLIST1;	
		#-------------------------------------------------------------------------------
		#	REUNION AND ZIPPING BEFORE T10.PL
		
		my @allarray = ();
		
		for my $i (0 ... $#sp_tags){	
			my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
			unlink $localfile if -e $localfile;
			push(@allarray, $localfile);
		
			my $unfiltered_localfile= $localfile."_unfiltered";
			my $residue_localfile= $localfile."_residue";
			
			unlink $unfiltered_localfile;
			#unlink $unfiltered_localfile;
			for my $j (0 ... $#{$unionarray[$i]}){
				#print "listing files for species $i  and list number $j= \n$unionarray[$i][$j] \n";
				`cat $unionarray[$i][$j] >> $unfiltered_localfile`;
				unlink $unionarray[$i][$j];
			}
	
			multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
			my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
			my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
			my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
			
			`cat $analyzed_compound >> $localfile`;
		}
		#-------------------------------------------------------------------------------
		# MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
		
		my $toescape=0;
		
		
		for my $i (0 ... $#sp_tags){	
			my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
			$localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
			my $direction = $1;
			#print "localfile = $localfile , direction = $direction\n"; 
	#		`gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
			$toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
			last if $direction =~ /chr[0-9a-zA-Z]+r/;
			my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
			my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
			my $reverselocal = $localfile;
			$reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
			merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags)); 
			#-------------------------------------------------------------------------------
			my $forward_separate = $localfile."_separate";
			my $reverse_separate = $reverselocal."_separate";
			my $diff = $forward_separate."_diff";
			my $miss = $forward_separate."_miss";
			my $common = $forward_separate."_common";
			forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags)); 
			#-------------------------------------------------------------------------------
			my $symmetrical_file = $localfile."_symmetrical";
			my $merged_file = $localfile."_merged";
			#print "cating: $merged_file $common  into -> $symmetrical_file \n"; 
			`cat $merged_file $common > $symmetrical_file`;
			#-------------------------------------------------------------------------------
			my $t10output = $symmetrical_file."_fin_hit_all_2";
			new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
			#-------------------------------------------------------------------------------		
		}
		next if $toescape == 1;
		#------------------------------------------------------------------------------------------------
		# BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
		
		
		my @micros_array = ();
		my $sampletag = ();
		for my $i (0 ... $#sp_tags){
			my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
			push(@micros_array, $finhitFile);
			$sampletag = $sp_tags[$i];
		}
		#$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
		#$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
		my $orthfiletemp = $ptag."_orthfile";
		my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);

		my $maskedorthfiletemp = $ptag."_orthfile_masked";
		qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);

		push @outputfiles , $maskedorthfiletemp;
	}
	$date = `date`;
}

`cat @outputfiles > $orthfile`;

my $rootdir = $dir;
$rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
chdir $rootdir;
remove_tree($dir);

#print "date = $date\n";
#remove_tree($tdir);
#------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------

#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx 

sub maftoAxt_multispecies {
	#print "in maftoAxt_multispecies : got @_\n";
	my $fname=$_[0];
	open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
	my $treedefinition = $_[1];
	open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
	my $counter = 0;
	my $exactspeciesset = $_[3];
	my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
	
	$treedefinition=~s/[\)\(, ]/\t/g;
	my @species=split(/\t+/,$treedefinition);
	my @exactspecies=();
	
	foreach my $spec (@species){
		foreach my $espec (@exactspeciesset_unarranged){
			push @exactspecies, $spec if $spec eq $espec;
		}
	}
	#print "exactspecies=@exactspecies\n";
	
	###########
	my $select = 2;  
	#select = 1 if all species need sequences to be present for each block otherwise, it is 0
	#select = 2 only the allowed set make up the alignment. use the removeset
	# information to detect alignmenets that have other important genomes aligned.
	###########
	my @allowedset = ();
	@allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
	@allowedset = join("_",0,@species) if $select == 1;
	#print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; 
	@allowedset = join("_",0,@exactspecies) if $select == 2;
	#print "allowedset = @allowedset and exactspecies = @exactspecies\n";
	
	my $start = 0;
	my @sequences = ();
	my @titles = ();
	my $species_counter = "0";
	my $countermatch = 0;
	my $outsideSpecies=0;
	
	while(my $line = <IN>){
#		print $line;
		next if $line =~ /^#/;
		next if $line =~ /^i/;
		chomp $line;
		my @fields = split(/\s+/,$line);
		chomp $line;
		if ($line =~ /^a /){
			$start = 1;
		}
		
		if ($line =~ /^s /){
		
			foreach my $sp (@allspecies){
#				print "checking species $sp\n";
				if ($fields[1] =~ /$sp/){
					$species_counter = $species_counter."_".$sp;
					push(@sequences, $fields[6]);
					my @sp_info = split(/\./,$fields[1]);
					my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
					push(@titles, $title);				
#					print "species_counter = $species_counter\n";
				}
			}
		}
		
		if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
#			print "species_counter = $species_counter\n"; 
			my $arranged = reorderSpecies($species_counter, @allspecies);
			my $stopper = 1;
			my $arrno = 0;
			
#			print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
			if (scalar(@sequences) == scalar(@exactspecies)){
				foreach my $set (@allowedset){
#					print "testing $arranged against $set\n";
					if ($arranged eq $set){
						$stopper = 0; last;
					}
					$arrno++;
				}
			}
			else{
				$stopper = 1;
			}
			
			
			if ($stopper == 0) {
				@titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;				
				@sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;				
				my $filteredseq = filter_gaps(@sequences);
				
				if ($filteredseq ne "SHORT"){
					#print "printing"; <STDIN>;
					$counter++;
					print OUT join (" ",$counter, @titles), "\n";
					print OUT $filteredseq, "\n";
					print OUT "\n"; 
					$countermatch++;
				}
			}
			else{ #print "nexting\n";<STDIN>;
			}
	
			@sequences = (); @titles = (); $start = 0;$species_counter = "0";
			next;		
			
		}
	}
#	print "countermatch = $countermatch\n";
}

sub reorderSpecies{
	my @inarr=@_;
	my $currSpecies = shift (@inarr);
	my $ordered_species = 0;
	my @species=@inarr;
	#print "species = @species\n";
	foreach my $order (@species){
		$ordered_species = $ordered_species."_".$order	if	$currSpecies=~ /$order/;
	}
	return $ordered_species;

}

sub filter_gaps{
	my @sequences = @_;
#	print "sequences sent are @sequences\n";
	my $seq_length = length($sequences[0]);
	my $seq_no = scalar(@sequences);
	my $allgaps = ();
	for (1 ... $seq_no){
		$allgaps = $allgaps."-";
	}
	
	my @seq_array = ();
	my $seq_counter = 0;
	foreach my $seq (@sequences){
#		my @sequence = split(/\s*/,$seq);
		$seq_array[$seq_counter] = [split(/\s*/,$seq)];
#		push @seq_array, [@sequence];
		$seq_counter++;
	}
	my $g = 0;
	while ( $g < $seq_length){
		last if (!exists $seq_array[0][$g]);
		my $bases = ();
		for my $u (0 ... $#seq_array){
			$bases = $bases.$seq_array[$u][$g];
		}	
#		print $bases, "\n";
		if ($bases eq $allgaps){
#			print "bases are $bases, position is $g \n";
			for my $seq (@seq_array){
				splice(@$seq , $g, 1);
			}
		}
		else {
			$g++;
		}
	}
	
	my @outs = ();
	
	foreach my $seq (@seq_array){
		push(@outs, join("",@$seq));
	}
	return "SHORT" if length($outs[0]) <=100;
	return (join("\n", @outs));	
}


sub allowedSetOfSpecies{
	my @allowed_species = split(/_/,$_[0]);
	unshift @allowed_species, 0;
#	print "allowed set = @allowed_species \n";
	my @output = ();
	for (0 ... scalar(@allowed_species) - 4){
		push(@output, join("_",@allowed_species));
		pop @allowed_species;
	}
	return join(";",reverse(@output));

}


sub orderInfo{
	my @info = split(/;/,$_[0]);
#	print "info = @info";
	my @old = split(/_/,$_[1]);
	my @new = split(/_/,$_[2]);
	shift @old; shift @new;
	my @outinfo = ();
	foreach my $spe (@new){
		for my $no (0 ... $#old){
			if ($spe eq $old[$no]){
				push(@outinfo, $info[$no]);
			}
		}
	}
#	print "outinfo = @outinfo \n"; 
	return join(";", @outinfo);
}

#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx 

#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx 
sub artificial_axdata_inverter{
	open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
	open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
	my $linecounter=0;
	while (my $line = <IN>){
		$linecounter++;
		#print "$linecounter\n";
		chomp $line;	
		my $final_line = $line;	
		my $trycounter = 0;
		if ($line =~ /^[a-zA-Z\-]/){
		#	while ($final_line eq $line){
				my @fields = split(/\s*/,$line);
				
				$final_line = join("",reverse(@fields));
		#		print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
		#		$trycounter++;
		#		print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
		#	}
		}
		
	#	print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
		if ($line =~ /^[0-9]/){
			$line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
			$final_line = $line;
		}
		print OUT $final_line,"\n";
		#print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
	}
	close OUT;
}
#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx 


#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx 

sub multi_species_t1 {

	my $input1 = $_[0];
	#print "@_\n"; <STDIN>;
	my @tags = split(/_/, $_[1]);
	my @outputs = split(/,/, $_[2]);
	my $title_query = $_[3];
	my @handles = ();
	
	open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
	my $i = 0;
	foreach my $path (@outputs){
		$handles[$i] = IO::Handle->new();
		open ($handles[$i], ">$path") or die "Can't open $path : $!";
		$i++;
	}
	
	my $curdef;
	my $start = 0;
	
	while (my $line = <FILEB> ) {
		if ($line =~ /^\d/){
			$line =~ s/ +/\t/g;
			my @fields = split(/\s+/, $line);
			if (($line =~ /$title_query/)){
				my $title = $line;
				my $counter = 0;
				foreach my $tag (@tags){
					$line = <FILEB>;
					print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;  	
					$counter++;
				}
			}
			else{
					foreach my $tag (@tags){
					my $tine = <FILEB>;
				}		
			}
		
		}
	}
	
	foreach my $hand (@handles){
		$hand->close(); 
	}
	
	close FILEB;
}

#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx 

#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx 

sub multi_species_t2{
	
	my $input = $_[0];
	my $species = $_[1];
	my $output1 = $input."_unr";
	
	#------------------------------------------------------------------------------------------
	open (FILEF1, "<$input") or die "Cannot open file $input :$!";
	open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
	
	my $line1 = <FILEF1>;
	
	while($line1){
	{
	#    chomp($line);
		if ($line1 =~ (m/^\>$species/)){
		chomp($line1);
		print FILEF2 $line1;
		$line1 = <FILEF1>;
		chomp($line1);
		print FILEF2 "\t", $line1,"\n";
	   }   
	}
	$line1 = <FILEF1>;
	}
	
	close FILEF1;
	close FILEF2;
	#------------------------------------------------------------------------------------------
	
	my $output2 = $output1."and";
	my $output3 = $output1."and2";
	open(IN,"<$output1");
	open (FILEF3, ">$output2");
	open (FILEF4, ">$output3");
	
	
	while (<IN>){
		my $line = $_;
		chomp($line);
		my @fields=split (/\t/, $line);
	  #  print $line,"\n"; <STDIN>;
		if($line !~ /random/){
			print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
			print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";	
		}
	}
	
	
	close IN;
	close FILEF3;
	close FILEF4;
	unlink $output1;
	
	#------------------------------------------------------------------------------------------
	# OLD T3.PL RUDIMENT
	
	my $t3output = $output2;
	$t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
	
	open(IN,"<$output2");
	open(OUTA,">$t3output");
	
	
	while (<IN>){
		s/-//g unless /^>/;
		print OUTA;
	}
	
	close IN;
	close OUTA;
	#------------------------------------------------------------------------------------------
}
#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx 


#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx 
sub multi_species_t2_2{
	#print "IN multi_species_t2_2 : @_\n";
	my $input = $_[0];
	my $species = $_[1];
	my $output1 = $input."2";
	
	
	open (FILEF1, "<$input");
	open (FILEF2, ">$output1");
	
	my $line1 = <FILEF1>;
	
	while($line1){
	{
	#    chomp($line);
		if ($line1 =~ (m/^\>$species/)){
		chomp($line1);
		print FILEF2 $line1;
		$line1 = <FILEF1>;
		chomp($line1);
		print FILEF2 "\t", $line1,"\n";
	   }   
	}
	$line1 = <FILEF1>;
	}
	
	close FILEF1;
	close FILEF2;
}

#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx 


#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx 
sub sputnikoutput_corrector{
	my $input = $_[0];
	my $output = $_[1];
	open(IN,"<$input") or die "Cannot open file $input :$!";
	open(OUT,">$output") or die "Cannot open file $output :$!";
	my $tine;
	while (my $line=<IN>){
		if($line =~/length /){
			$tine = $line;
			$tine =~ s/\s+/\t/g;
			my @fields = split(/\t/,$tine);	
			if ($fields[6] > 60){
				print OUT $line;
				$line = <IN>;
			
				while (($line !~ /nucleotide/) && ($line !~ /^>/)){
					chomp $line;
					print OUT $line;
					$line = <IN>;
				}
				print OUT "\n";
				print OUT $line;
			}
			else{
				print OUT $line;
			}
		}
		else{
			print OUT $line;
		}
	}
	close IN;
	close OUT;
}
#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx 


#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx 
sub multi_species_t4{
#	print "multi_species_t4 : @_\n";
	my $input = $_[0];
	my $output = $_[1];
	open (FILEA, "<$input");
	open (FILEB, ">$output");
	
	my $line = <FILEA>;
	
	while ($line) {
	   # chomp $line;
		if ($line =~ />/) {
			chomp $line;
			print FILEB $line, "\n"; 
		}
	
		
		if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
		chomp $line;
		print FILEB $line, " " ;
		$line = <FILEA>;
		chomp $line;
		print FILEB $line,"\n";
		}
	
		$line = <FILEA>;
	}
	
	
	close FILEA;
	close FILEB;

}

#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx 


#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx 
sub multi_species_t5{
	
	my $input = $_[0];
	my $output = $_[1];
	
	open(FILEB,"<$input");
	open(FILEC,">$output");
	
	my $curdef;
	
	while (my $line = <FILEB> ) {
	
		if ($line =~ /^>/){
		chomp $line;
		$curdef = $line;
		next;
	}
	
	if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
		print  FILEC $curdef," ",$line;
	}
	
	}
	
	
	close FILEB;
	close FILEC;

}
#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx 


#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx 
sub multi_species_t6{
	my $input = $_[0];
	my $output = $_[1];	
	my $focalstrand=$_[3];
#	print "inpput = @_\n"; 
	open (FILE, "<$input");
	open (FILE_MICRO, ">$output");
	my $linecounter=0;	
	while (my $line = <FILE>){
		$linecounter++;
		chomp $line;
		#print "line = $line\n";
		#MONO#
		$line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
		my $strand=$1;
		my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
		#print "line = $line\n";
		my $specfieldsend = 2 + ($no_of_species*4) - 1;
		my @fields = split(/\s+/, $line);
		my @speciesdata = @fields[0 ... $specfieldsend];
		$line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
		my ($tide, $start, $end) = ($1, $2, $3);
		#print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
		if($line  =~ /mononucleotide/){
			print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields],  mono($fields[$#fields]),),"\n";
		}
		#DI#	
		elsif($line =~ /dinucleotide/){
			print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields],  di($fields[$#fields]),),"\n";
		}
		#TRI#	
		elsif($line =~ /trinucleotide/ ){
			print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields],  tri($fields[$#fields]),),"\n";
		}
		#TETRA#
		elsif($line =~ /tetranucleotide/){
			print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields],  tetra($fields[$#fields]),),"\n";
		}
		#PENTA#
		elsif($line =~ /pentanucleotide/){
			#print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields],  penta($fields[$#fields]),),"\n";
		}
		else{
		#	print "not: @fields\n";
		}
	}
#	print "linecounter=$linecounter\n"; 
	close FILE;
	close FILE_MICRO;
}

sub mono {
	my $st = $_[0];
	my $tp = unpack "A1"x(length($st)/1),$st;
	my $var1 = substr($tp, 0, 1);
	return join ("\t", $var1);
}
sub di {
	my $st = $_[0];
	my $tp = unpack "A2"x(length($st)/2),$st;
	my $var1 = substr($tp, 0, 2);
	return join ("\t", $var1);
}
sub tri {
	my $st = $_[0];
	my $tp = unpack "A3"x(length($st)/3),$st;
	my $var1 = substr($tp, 0, 3);
	return join ("\t", $var1);
}
sub tetra {
	my $st = $_[0];
	my $tp = unpack "A4"x(length($st)/4),$st;
	my $var1 = substr($tp, 0, 4);
	return join ("\t", $var1);
}
sub penta {
	my $st = $_[0];
	my $tp = unpack "A5"x(length($st)/5),$st;
	my $var1 = substr($tp, 0, 5);
	return join ("\t", $var1);
}

#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx 


#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx 
sub t9{
	my $input1 = $_[0];
	my $input2 = $_[1];
	my $output = $_[2];
	
	
	open(IN1,"<$input1") if -e $input1; 
	open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
	open(OUT,">$output") or die "cannot open file $_[2] : $!"; 
	
	
	my %seen = ();
	my $prevkey = 0;
	
	if (-e $input1){
		while (my $line = <IN1>){
			chomp($line);
			my @fields = split(/\t/,$line);
			my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
		#	print "key in t9 = $key1\n";
			$seen{$key1}++	if ($prevkey ne $key1) ;
			$prevkey = $key1;
		}
#		print "done first hash\n";
		close IN1;
	}
	
	while (my $line = <IN2>){
	#	print $line, "**\n";
		if (-e $input1){
			chomp($line);
			my @fields = split(/\t/,$line);
			my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
			if (exists $seen{$key2}){
				print OUT "$line\n"	;
				delete $seen{$key2};
			}
		}
		else {
			print OUT "$line\n"	;		
#			print  "$line\n"	;		
		}
	}
	
	close IN2;
	close OUT;
}
#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx 


#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx  multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx  multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx 


sub multiSpecies_compound_microsat_hunter3{	
	
	my $input1 = $_[0];  ###### the *_sput_op4_ii file
	my $input2 = $_[1];  ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
	my $output1 = $_[2]; ###### plain microsatellite file
	my $output2 = $_[3]; ###### compound microsatellite file
	my $org = $_[4]; ###### 1 or 2
	$no_of_species = $_[5];
	#print "IN multiSpecies_compound_microsat_hunter3: @_\n"; 
	#my @tags = split(/\t/,$info);
	sub compoundify;
	open(IN,"<$input1") or die "Cannot open file $input1 $!";
	open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
	open(OUT,">$output1") or die "Cannot open file $output1 $!";
	open(OUT2,">$output2") or die "Cannot open file $output2 $!";
	$infocord = 2 + (4*$no_of_species) - 1;
	$startcord = 2 + (4*$no_of_species) + 2 - 1;
	$strandcord = 2 + (4*$no_of_species) + 3 - 1;
	$endcord = 2 + (4*$no_of_species) + 4 - 1;
	$microsatcord = 2 + (4*$no_of_species) + 5 - 1;
	$motifcord = 2 + (4*$no_of_species) + 6 - 1;
	my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; 
	
	my @thresholds = ("0");
	push(@thresholds, split(/_/,$_[6]));
	sub thresholdCheck;
	my %micros = ();
	while (my $line = <IN>){
	#	print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
		next if $line =~ /\t\t/;
		if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
			my $key = join("\t",$1, $2, $3, $4, $5);
		#	print $key, "#-#-#-#-#-#-#-#\n";
			push (@{$micros{$key}},$line);		
		}
		else{
		}
	}
	close IN;
	my @deletedlines = ();
	
	my $linecount = 0;
	
	while(my $sine = <SEQ>){
		my %microstart=();
		my %microend=();
	
		my @sields = split(/\t/,$sine);
	
		my $key = ();
	
		if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
			$key = join("\t",$1, $2, $3, $4, $5);
		#	print $key, "<-<-<-<-<-<-<-<\n";		
		}
		else{
		}
	
		if (exists $micros{$key}){
			$linecount++;	
			my @microstring = @{$micros{$key}};
			my @tempmicrostring = @{$micros{$key}};
			
			foreach my $line (@tempmicrostring){
				my @fields = split(/\t/,$line);
				my $start = $fields[$startcord];
				my $end = $fields[$endcord];
				push (@{$microstart{$start}},$line);
				push (@{$microend{$end}},$line);	
			}
			my $firstflag = 'down';
			while( my $line =shift(@microstring)){
	#			print "-----------\nline = $line ";
				chomp $line;
				my @fields = split(/\t/,$line);
				my $start = $fields[$startcord];
				my $end = $fields[$endcord];
				my $startmicro = $line;
				my $endmicro = $line;
	
			#	print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
	
				delete ($microstart{$start});
				delete ($microend{$end});
				my $flag = 'down';	
				my $startflag = 'down';
				my $endflag = 'down';
				my $prestart = $start - $distance;
				my $postend = $end + $distance;
				my @compoundlines = ();
				my %compoundhash = ();
				push (@compoundlines, $line);
				push (@{$compoundhash{$line}},$line);
				my $startrank = 1;
				my $endrank = 1;
				
				while( ($startflag eq "down") || ($endflag eq "down") ){
				if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
#					print "coming to the end of sequence,prestart = $prestart &  post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
					last;
				}
					
				$firstflag = "up";
				if ($startflag eq "down"){		
					for my $i ($prestart ... $start){
					
						if(exists $microend{$i}){	
							chomp $microend{$i}[0];
							if(exists $compoundhash{$microend{$i}[0]}) {next;}
	#						print "sending from microend $startmicro, $microend{$i}[0] |||\n";
							if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
								push(@compoundlines, $microend{$i}[0]);
	#							print "accepted\n";
								my @tields = split(/\t/,$microend{$i}[0]);
								$startmicro = $microend{$i}[0];
								chomp $startmicro;
								$start = $tields[$startcord];
								$flag = 'down';
								$startrank++;
	#							print "startcompund = $microend{$i}[0]\n";
								delete $microend{$i};
								delete $microstart{$start};
								$startflag = 'down';
								$prestart = $start - $distance;
								last;
							}	
							else{
								$flag = 'up';
								$startflag = 'up';							
							}
						}
						else{
							$flag = 'up';
							$startflag = 'up';
						}
					}
				}
					
				$endrank = $startrank;
				
				if ($endflag eq "down"){				
					for my $i ($end ... $postend){
					
						if(exists $microstart{$i} ){
							chomp $microstart{$i}[0];
							if(exists $compoundhash{$microstart{$i}[0]}) {next;}	
	#						print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
	
							if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
								push(@compoundlines, $microstart{$i}[0]);
	#								print "accepted\n";
								my @tields = split(/\t/,$microstart{$i}[0]);
								$end = $tields[$endcord]-0;
								$endmicro = $microstart{$i}[0];
								$endrank++;
								chomp $endmicro;
								$flag = 'down';
	#							print "endcompund = $microstart{$i}[0]\n";
								delete $microstart{$i};
								delete $microend{$end};
								shift @microstring;
								$postend = $end + $distance;
								$endflag = 'down';
								last;								
							}
							else{
								$flag = 'up';
								$endflag = 'up';							
							}
						}
						else{
							$flag = 'up';
							$endflag = 'up';
						}
					}
				}
	#			print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
			} 														#end while( $flag eq "down")
	#			print "compoundlines = @compoundlines \n";
			if (scalar (@compoundlines) == 1){
				print OUT $line,"\n";			
			}
			if (scalar (@compoundlines) > 1){
				my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
	#				print $compoundline,"\n";
				print OUT2 $compoundline,"\n";
			}
			} #end foreach my $line (@microstring){
		}	#if (exists $micros{$key}){
	
	
	}
	
	close OUT;
	close OUT2;
}


#------------------------------------------------------------------------------------------------
sub compoundify{
	my ($compoundlines, $sequence)  = @_;
#	print "\nfound to compound : @$compoundlines and$sequence \n";
	my $noOfComps = @$compoundlines;
#	print "Number of elements in hash is $noOfComps\n";
	my @starts;
	my @ends;
	foreach my $line (@$compoundlines){
#		print "compoundify.. line = $line \n";
		chomp $line;
		my @fields = split(/\t/,$line);
		my $start = $fields[$startcord];
		my $end = $fields[$endcord];
	#	print "start = $start, end = $end \n";
		push(@starts, $start);
		push(@ends,$end);		
	}
	my @temp = @$compoundlines;
	my $startline=$temp[0];
	my @mields  = split(/\t/,$startline);
	my $startcoord = $mields[$startcord];
	my $startgapsign=$mields[$endcord];
	my @startsorted = sort { $a <=> $b } @starts;
	my @endsorted = sort { $a <=> $b } @ends;
	my @intervals;
	for my $end (0 ... (scalar(@endsorted)-2)){
		my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
		push(@intervals,$interval);
	#	print "interval = $interval =\n";
	#	print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
	}
	push(@intervals,"");
	my $compoundmicrosat=();
	my $multiunit="";
	foreach my $line (@$compoundlines){
		my @fields = split(/\t/,$line);
		my $component="[".$fields[$microsatcord]."]".shift(@intervals);
		$compoundmicrosat=$compoundmicrosat.$component;
		$multiunit=$multiunit."[".$fields[$motifcord]."]";
#		print "multiunit = $multiunit\n";
	}
	my $compoundcopy = $compoundmicrosat;
	$compoundcopy =~ s/\[|\]//g;
	my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
	
	
	my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
	return $compoundline;
}	

#------------------------------------------------------------------------------------------------

sub identityMatch_thresholdCheck{
	my $line1 = $_[0];
	my $line2 = $_[1];
	my $rank = $_[2];
	my @lields1 = split(/\t/,$line1);
	my @lields2 = split(/\t/,$line2);
#	print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
	
	if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
		my $probe = $lields1[$motifcord].$lields1[$motifcord];
			#print "$probe :: $lields2[$motifcord]\n";
		return "proceed" if $probe =~ /$lields2[$motifcord]/;
			#print "line recieved\n";
		if ($rank ==1){
			return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
		}
		else {
			return "proceed" if thresholdCheck($line2) eq "proceed";
			return "stop";
		}
	}
	else{
		if ($rank ==1){
			return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
		}
		else {
			return "proceed" if thresholdCheck($line2) eq "proceed";
			return "stop";
		}
	}
	return "stop";
}
#------------------------------------------------------------------------------------------------

sub thresholdCheck{
	my @checkthresholds=(0,@thresholds);
	#print "IN thresholdCheck: @_\n";
	my $line = $_[0];
	my @lields = split(/\t/,$line);
	return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
	return "stop";
}
#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx  multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx  multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx 


#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx  multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx  multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx 

sub multiSpecies_filtering_interrupted_microsats{
#	print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
	my $unfiltered = $_[0];
	my $filtered = $_[1];
	my $residue = $_[2];
	my $no_of_species = $_[5];
	open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
	open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
	open(RES,">$residue") or die "Cannot open file $residue: $!";
		
	$infocord = 2 + (4*$no_of_species) - 1;
	$startcord = 2 + (4*$no_of_species) + 2 - 1;
	$strandcord = 2 + (4*$no_of_species) + 3 - 1;
	$endcord = 2 + (4*$no_of_species) + 4 - 1;
	$microsatcord = 2 + (4*$no_of_species) + 5 - 1;
	$motifcord = 2 + (4*$no_of_species) + 6 - 1;
	
	
	my @sub_thresholds = (0);
	
	push(@sub_thresholds, split(/_/,$_[3]));
	my @thresholds = (0);
	
	push(@thresholds, split(/_/,$_[4]));
	
	while (my $line = <UNF>) {
		next if $line !~ /[a-z]/;
		#print $line;
		chomp $line;
		my @fields = split(/\t/,$line);
		my $motif = $fields[$motifcord];
		my $realmotif = $motif;
		#print "motif = $motif\n";
		if ($motif =~ /^\[/){
			$motif =~ s/^\[//g;
			my @motifs = split(/\]/,$motif);
			$realmotif = $motifs[0];
		}
#		print "realmotif = $realmotif";
		my $motif_size = length($realmotif);
		
		my $microsat = $fields[$microsatcord];
#		print "microsat = $microsat\n";
		$microsat =~ s/^\[|\]$//sg;
		my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
		
		$microsat = join("",@microsats);
		if (length($microsat) < $thresholds[$motif_size]) {
		#	print length($microsat)," < ",$thresholds[$motif_size],"\n"; 
			print RES $line,"\n"; next;
		}
		my @lengths = ();
		foreach my $mic (@microsats){
			push(@lengths, length($mic));
		}
		if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
	#		print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n"; 
			print RES $line,"\n"; next;}
			else {print FIL $line,"\n"; next;
		}
	}
	close FIL;
	close RES;

}

sub largest_microsat{
	my $counter = 0;
	my($max) = shift(@_);
    foreach my $temp (@_) {
    	#print "finding largest array: $maxcounter \n";
    	if($temp > $max){
        	$max = $temp;
        }
    }
    return($max);
}

#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx  multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx  multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx 


#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx  multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx  multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx…
Summary ✨

The Perl code appears to be a bioinformatics pipeline that filters and annotates genomic data. It reads in sequence files, identifies regions of interest (e.g., focal species), and removes masked areas from the sequences. The output is written to new files, including a filtered out file and a masked orthologous gene file.
Alerts (2)

Complexity hotspot; lines 1248 to 1249 (total complexity: 5)
1248 1249