/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl
Perl | 5606 lines | 5065 code | 245 blank | 296 comment | 163 complexity | e64ba560e907a6b8d7818123ba23613a MD5 | raw file
- #!/usr/bin/perl
- use strict;
- use warnings;
- use Term::ANSIColor;
- use File::Basename;
- use IO::Handle;
- use Cwd;
- use File::Path;
- use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
- use File::Path qw(make_path remove_tree);
- use File::Temp qw/ tempfile tempdir /;
- my $tdir = tempdir( CLEANUP => 1 );
- chdir $tdir;
- my $dir = getcwd;
- #print "dir = $dir\n";
- #$ENV{'PATH'} .= ':' . dirname($0);
- my $date = `date`;
- my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV;
- if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }
- $tree_definition_all =~ s/\s+//g;
- $threshold_array =~ s/\s+//g;
- $allspeciesin =~ s/\s+//g;
- #-------------------------------------------------------------------------------
- # WHICH SPUTNIK USED?
- my $sputnikpath = ();
- $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
- #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
- #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
- #exit if !-e $sputnikpath;
- #$sputnikpath = "bx-sputnik" ;
- #print "ARGV input = @ARGV\n";
- #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
- #-------------------------------------------------------------------------------
- # RUNFILE
- #-------------------------------------------------------------------------------
- $distance = 1; #bp
- $distance++;
- my @tree_definitions=MakeTrees($tree_definition_all);
- my $allspeciesset = $tree_definition_all;
- $allspeciesset =~ s/[\(\) ]+//g;
- @allspecies = split(/,/,$allspeciesset);
- my @outputfiles = ();
- my $round = 0;
- #my $tdir = tempdir( CLEANUP => 0 );
- #chdir $tdir;
- foreach my $tree_definition (@tree_definitions){
- my @commas = ($tree_definition =~ /,/g) ;
- #print "commas = @commas\n"; <STDIN>;
- next if scalar(@commas) <= 1;
- #print "species_set = $species_set\n";
- $treeSpeciesNum = scalar(@commas) + 1;
- $species_set = $tree_definition;
- $species_set =~ s/[\)\( ;]+//g;
- #print "species_set = $species_set\n"; <STDIN>;
- $round++;
- #-------------------------------------------------------------------------------
- # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
- $threshold_array=~ s/,/_/g;
- my @thresharr = split("_",$threshold_array);
- @thresholds=@thresharr;
- #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
- #print "current dit=$dir\n";
- #-------------------------------------------------------------------------------
- # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
- my @chrfiles=();
-
- #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
- my $chromt=int(rand(10000));
- my $p_chr=$chromt;
-
-
- my @exactspeciesset_unarranged = split(/,/,$species_set);
- $tree_definition=~s/[\)\(, ]/\t/g;
- my @treespecies=split(/\t+/,$tree_definition);
- my @exactspecies=();
-
- foreach my $spec (@treespecies){
- foreach my $espec (@exactspeciesset_unarranged){
- push @exactspecies, $spec if $spec eq $espec;
- }
- }
- #print "exactspecies=@exactspecies\n";
- $focalspec = $exactspecies[0];
- my $arranged_species_set=join(".",@exactspecies);
- my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
- my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
- #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
- maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
- maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
- #print "done maf to axt conversion\n";
- my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
- artificial_axdata_inverter ($chr_name, $reverse_chr_name);
- #print "reverse_chr_name=$reverse_chr_name\n";
- #-------------------------------------------------------------------------------
- # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
- foreach my $direct ("reverse_direction","forward_direction"){
- $p_chr=$chromt;
- #print "direction = $direct\n";
- $p_chr = $p_chr."r" if $direct eq "reverse_direction";
- $p_chr = $p_chr if $direct eq "forward_direction";
- my $config = $species_set;
- $config=~s/,/./g;
- my @orgs = split(/\./,$arranged_species_set);
- #print "ORGS= @orgs\n";
- my @tag=@orgs;
-
-
- my $tags = join(",", @tag);
- my @tags=@tag;
- chomp $p_chr;
- $tags = join("_", split(/,/, $tags));
- my $pchr = "chr".$p_chr;
-
- my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
- my @sp_tags = ();
-
- # print "$ptag _ orthfile\n"; <STDIN>;
- #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
- foreach my $sp (@tag){
- push(@sp_tags, ($sp.".".$ptag));
- }
-
- my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
- my @presp_tags = ();
-
- foreach my $sp (@tag){
- push(@presp_tags, ($sp.".".$preptag));
- }
-
- my $resultdir = "";
- my $orthdir = "";
- my $filtereddir = "";
- my $pipedir = "";
-
- my @title_queries = ();
- push(@title_queries, "^[0-9]+");
- my $sep="\\s";
- for my $or (0 ... $#orgs){
- my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
- #$title =~ s/chr\\+\\s+\+/chr/g;
- push(@title_queries, $title);
- }
- my $title_query = join($sep, @title_queries);
- #print "title_queries=@title_queries\n";
- #print "query = >$title_query<\n";
- #print "orgs = @orgs\n";
- #-------------------------------------------------------------------------------
- # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
- my $t1input = $pchr.".".$arranged_species_set.".net.axt";
-
- my @t1outputs = ();
-
- foreach my $sp (@presp_tags){
- push(@t1outputs, $sp."_gap_op");
- }
-
-
-
- multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query);
- #print "t1outputs=@t1outputs\n";
- #print "done t1\n"; <STDIN>;
- #-------------------------------------------------------------------------------
- #START T2.PL
-
- my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = ();
-
- for my $t (0 ... scalar(@tags)-1){
- multi_species_t2($t1outputs[$t], $tag[$t]);
- }
- #-------------------------------------------------------------------------------
- #START T2.2.PL
-
- my @temp_tags = @tag;
-
- foreach my $sp (@presp_tags){
- my $t2input = $sp."_nogap_op_unrand";
- multi_species_t2_2($t2input, shift(@temp_tags));
- }
- undef (@temp_tags);
-
- #-------------------------------------------------------------------------------
- #START SPUTNIK
-
- my @jobIDs = ();
- @temp_tags = @tag;
- my @sput_filelist = ();
-
- foreach my $sp (@presp_tags){
- #print "sp = $sp\n";
- my $sputnikoutput = $pipedir.$sp."_sput_op0";
- my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
- push(@sput_filelist, $sputnikinput);
- my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
- # print "$sputnikcommand\n";
- my @sputnikcommand_system = $sputnikcommand;
- system(@sputnikcommand_system);
- }
-
- #-------------------------------------------------------------------------------
- #START SPUTNIK OUTPUT CORRECTOR
-
- foreach my $sp (@presp_tags){
- my $corroutput = $pipedir.$sp."_sput_op1";
- my $corrinput = $pipedir.$sp."_sput_op0";
- sputnikoutput_corrector($corrinput,$corroutput);
-
- my $t4output = $pipedir.$sp."_sput_op2";
- multi_species_t4($corroutput,$t4output);
-
- my $t5output = $pipedir.$sp."_sput_op3";
- multi_species_t5($t4output,$t5output);
- #print "done t5.pl for $sp\n";
-
- my $t6output = $pipedir.$sp."_sput_op4";
- multi_species_t6($t5output,$t6output,scalar(@orgs));
- }
- #-------------------------------------------------------------------------------
- #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
-
- foreach my $sp (@presp_tags){
- my $t9output = $pipedir.$sp."_gap_op_unrand_match";
- my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
- my $t9micro = $pipedir.$sp."_sput_op4";
- t9($t9micro,$t9sequence,$t9output);
-
- my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
- my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
- t9($t9micro,$t9sequence2,$t9output2);
- }
- #print "done both t9.pl for all orgs\n";
-
- #-------------------------------------------------------------------------------
- # FIND COMPOUND MICROSATELLITES
-
- @jobIDs = ();
- my $species_counter = 0;
-
- foreach my $sp (@presp_tags){
- my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
- my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
- my $input_micro = $pipedir.$sp."_sput_op4";
- my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
- multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
- $species_counter++;
- }
-
- #-------------------------------------------------------------------------------
- # READING AND FILTERING SIMPLE MICROSATELLITES
- my $spcounter2=0;
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$spcounter2];
- $spcounter2++;
- my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
- my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
- my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
- }
-
- #-------------------------------------------------------------------------------
- # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
-
- $species_counter = 0;
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
- my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
- my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
- my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
- multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
- $species_counter++;
- }
- #-------------------------------------------------------------------------------
- # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM..
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
- my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
- my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
- multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
- $species_counter++;
- }
- #-------------------------------------------------------------------------------
- # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
-
- my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
- my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
- my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
- my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
- multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
- $species_counter++;
- }
- #print "done filtering both simple and compound microsatellites \n";
-
- #-------------------------------------------------------------------------------
-
- my @combinedarray = ();
- my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
- my @combinedarray_tags = ("mono", "di", "tri", "tetra");
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
- push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
- $species_counter++;
- }
-
- #-------------------------------------------------------------------------------
- # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION
- my $sp_counter = 0;
- foreach my $sp (@sp_tags){
- my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
- my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
- `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
- for my $i (0 ... 3){
- `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;
- }
- $sp_counter++;
- }
- #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
-
- #-------------------------------------------------------------------------------
- # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
- # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
- @temp_tags = @sp_tags;
- my $increment = 1000000;
- my @splist = ();
- my $targetdir = $pipedir;
- $species_counter=0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- $species_counter++;
- my $localtag = shift @temp_tags;
- my $locallist = $targetdir.$localtag."_".$p_chr."_list";
- push(@splist, $locallist);
- my $input = $pipedir.$presp."_nogap_op_unrand2_match";
- chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
- }
-
-
- my @unionarray = ();
- #print "splist=@splist\n";
- #-------------------------------------------------------------------------------
- # FIND INTERRUPTED MICROSATELLITES
-
- $species_counter = 0;
-
- for my $i (0 .. $#combinedarray){
-
- @jobIDs = ();
- open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
-
- while (my $sp1 = <JLIST1>){
- #print "$splist[$i]: sp1=$sp1\n";
- chomp $sp1;
-
- for my $j (0 ... $#combinedarray_tags){
- my $interr = $sp1."_interr_".$combinedarray_tags[$j];
- my $simple = $sp1."_simple_".$combinedarray_tags[$j];
- push @{$unionarray[$i]}, $interr, $simple;
- multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8");
- }
- }
- $species_counter++;
- }
- close JLIST1;
- #-------------------------------------------------------------------------------
- # REUNION AND ZIPPING BEFORE T10.PL
-
- my @allarray = ();
-
- for my $i (0 ... $#sp_tags){
- my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
- unlink $localfile if -e $localfile;
- push(@allarray, $localfile);
-
- my $unfiltered_localfile= $localfile."_unfiltered";
- my $residue_localfile= $localfile."_residue";
-
- unlink $unfiltered_localfile;
- #unlink $unfiltered_localfile;
- for my $j (0 ... $#{$unionarray[$i]}){
- #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n";
- `cat $unionarray[$i][$j] >> $unfiltered_localfile`;
- unlink $unionarray[$i][$j];
- }
-
- multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
- my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
- my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
- my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
-
- `cat $analyzed_compound >> $localfile`;
- }
- #-------------------------------------------------------------------------------
- # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
-
- my $toescape=0;
-
-
- for my $i (0 ... $#sp_tags){
- my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
- $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
- my $direction = $1;
- #print "localfile = $localfile , direction = $direction\n";
- # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
- $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
- last if $direction =~ /chr[0-9a-zA-Z]+r/;
- my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
- my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
- my $reverselocal = $localfile;
- $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
- merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags));
- #-------------------------------------------------------------------------------
- my $forward_separate = $localfile."_separate";
- my $reverse_separate = $reverselocal."_separate";
- my $diff = $forward_separate."_diff";
- my $miss = $forward_separate."_miss";
- my $common = $forward_separate."_common";
- forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags));
- #-------------------------------------------------------------------------------
- my $symmetrical_file = $localfile."_symmetrical";
- my $merged_file = $localfile."_merged";
- #print "cating: $merged_file $common into -> $symmetrical_file \n";
- `cat $merged_file $common > $symmetrical_file`;
- #-------------------------------------------------------------------------------
- my $t10output = $symmetrical_file."_fin_hit_all_2";
- new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
- #-------------------------------------------------------------------------------
- }
- next if $toescape == 1;
- #------------------------------------------------------------------------------------------------
- # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
-
-
- my @micros_array = ();
- my $sampletag = ();
- for my $i (0 ... $#sp_tags){
- my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
- push(@micros_array, $finhitFile);
- $sampletag = $sp_tags[$i];
- }
- #$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
- #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
- my $orthfiletemp = $ptag."_orthfile";
- my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);
- my $maskedorthfiletemp = $ptag."_orthfile_masked";
- qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);
- push @outputfiles , $maskedorthfiletemp;
- }
- $date = `date`;
- }
- `cat @outputfiles > $orthfile`;
- my $rootdir = $dir;
- $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
- chdir $rootdir;
- remove_tree($dir);
- #print "date = $date\n";
- #remove_tree($tdir);
- #------------------------------------------------------------------------------------------------
- #------------------------------------------------------------------------------------------------
- #------------------------------------------------------------------------------------------------
- #------------------------------------------------------------------------------------------------
- #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
- sub maftoAxt_multispecies {
- #print "in maftoAxt_multispecies : got @_\n";
- my $fname=$_[0];
- open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
- my $treedefinition = $_[1];
- open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
- my $counter = 0;
- my $exactspeciesset = $_[3];
- my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
-
- $treedefinition=~s/[\)\(, ]/\t/g;
- my @species=split(/\t+/,$treedefinition);
- my @exactspecies=();
-
- foreach my $spec (@species){
- foreach my $espec (@exactspeciesset_unarranged){
- push @exactspecies, $spec if $spec eq $espec;
- }
- }
- #print "exactspecies=@exactspecies\n";
-
- ###########
- my $select = 2;
- #select = 1 if all species need sequences to be present for each block otherwise, it is 0
- #select = 2 only the allowed set make up the alignment. use the removeset
- # information to detect alignmenets that have other important genomes aligned.
- ###########
- my @allowedset = ();
- @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
- @allowedset = join("_",0,@species) if $select == 1;
- #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
- @allowedset = join("_",0,@exactspecies) if $select == 2;
- #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
-
- my $start = 0;
- my @sequences = ();
- my @titles = ();
- my $species_counter = "0";
- my $countermatch = 0;
- my $outsideSpecies=0;
-
- while(my $line = <IN>){
- # print $line;
- next if $line =~ /^#/;
- next if $line =~ /^i/;
- chomp $line;
- my @fields = split(/\s+/,$line);
- chomp $line;
- if ($line =~ /^a /){
- $start = 1;
- }
-
- if ($line =~ /^s /){
-
- foreach my $sp (@allspecies){
- # print "checking species $sp\n";
- if ($fields[1] =~ /$sp/){
- $species_counter = $species_counter."_".$sp;
- push(@sequences, $fields[6]);
- my @sp_info = split(/\./,$fields[1]);
- my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
- push(@titles, $title);
- # print "species_counter = $species_counter\n";
- }
- }
- }
-
- if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
- # print "species_counter = $species_counter\n";
- my $arranged = reorderSpecies($species_counter, @allspecies);
- my $stopper = 1;
- my $arrno = 0;
-
- # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
- if (scalar(@sequences) == scalar(@exactspecies)){
- foreach my $set (@allowedset){
- # print "testing $arranged against $set\n";
- if ($arranged eq $set){
- $stopper = 0; last;
- }
- $arrno++;
- }
- }
- else{
- $stopper = 1;
- }
-
-
- if ($stopper == 0) {
- @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
- @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
- my $filteredseq = filter_gaps(@sequences);
-
- if ($filteredseq ne "SHORT"){
- #print "printing"; <STDIN>;
- $counter++;
- print OUT join (" ",$counter, @titles), "\n";
- print OUT $filteredseq, "\n";
- print OUT "\n";
- $countermatch++;
- }
- }
- else{ #print "nexting\n";<STDIN>;
- }
-
- @sequences = (); @titles = (); $start = 0;$species_counter = "0";
- next;
-
- }
- }
- # print "countermatch = $countermatch\n";
- }
- sub reorderSpecies{
- my @inarr=@_;
- my $currSpecies = shift (@inarr);
- my $ordered_species = 0;
- my @species=@inarr;
- #print "species = @species\n";
- foreach my $order (@species){
- $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
- }
- return $ordered_species;
- }
- sub filter_gaps{
- my @sequences = @_;
- # print "sequences sent are @sequences\n";
- my $seq_length = length($sequences[0]);
- my $seq_no = scalar(@sequences);
- my $allgaps = ();
- for (1 ... $seq_no){
- $allgaps = $allgaps."-";
- }
-
- my @seq_array = ();
- my $seq_counter = 0;
- foreach my $seq (@sequences){
- # my @sequence = split(/\s*/,$seq);
- $seq_array[$seq_counter] = [split(/\s*/,$seq)];
- # push @seq_array, [@sequence];
- $seq_counter++;
- }
- my $g = 0;
- while ( $g < $seq_length){
- last if (!exists $seq_array[0][$g]);
- my $bases = ();
- for my $u (0 ... $#seq_array){
- $bases = $bases.$seq_array[$u][$g];
- }
- # print $bases, "\n";
- if ($bases eq $allgaps){
- # print "bases are $bases, position is $g \n";
- for my $seq (@seq_array){
- splice(@$seq , $g, 1);
- }
- }
- else {
- $g++;
- }
- }
-
- my @outs = ();
-
- foreach my $seq (@seq_array){
- push(@outs, join("",@$seq));
- }
- return "SHORT" if length($outs[0]) <=100;
- return (join("\n", @outs));
- }
- sub allowedSetOfSpecies{
- my @allowed_species = split(/_/,$_[0]);
- unshift @allowed_species, 0;
- # print "allowed set = @allowed_species \n";
- my @output = ();
- for (0 ... scalar(@allowed_species) - 4){
- push(@output, join("_",@allowed_species));
- pop @allowed_species;
- }
- return join(";",reverse(@output));
- }
- sub orderInfo{
- my @info = split(/;/,$_[0]);
- # print "info = @info";
- my @old = split(/_/,$_[1]);
- my @new = split(/_/,$_[2]);
- shift @old; shift @new;
- my @outinfo = ();
- foreach my $spe (@new){
- for my $no (0 ... $#old){
- if ($spe eq $old[$no]){
- push(@outinfo, $info[$no]);
- }
- }
- }
- # print "outinfo = @outinfo \n";
- return join(";", @outinfo);
- }
- #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
- #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
- sub artificial_axdata_inverter{
- open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
- open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
- my $linecounter=0;
- while (my $line = <IN>){
- $linecounter++;
- #print "$linecounter\n";
- chomp $line;
- my $final_line = $line;
- my $trycounter = 0;
- if ($line =~ /^[a-zA-Z\-]/){
- # while ($final_line eq $line){
- my @fields = split(/\s*/,$line);
-
- $final_line = join("",reverse(@fields));
- # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
- # $trycounter++;
- # print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
- # }
- }
-
- # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
- if ($line =~ /^[0-9]/){
- $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
- $final_line = $line;
- }
- print OUT $final_line,"\n";
- #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
- }
- close OUT;
- }
- #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
- #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
- sub multi_species_t1 {
- my $input1 = $_[0];
- #print "@_\n"; <STDIN>;
- my @tags = split(/_/, $_[1]);
- my @outputs = split(/,/, $_[2]);
- my $title_query = $_[3];
- my @handles = ();
-
- open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
- my $i = 0;
- foreach my $path (@outputs){
- $handles[$i] = IO::Handle->new();
- open ($handles[$i], ">$path") or die "Can't open $path : $!";
- $i++;
- }
-
- my $curdef;
- my $start = 0;
-
- while (my $line = <FILEB> ) {
- if ($line =~ /^\d/){
- $line =~ s/ +/\t/g;
- my @fields = split(/\s+/, $line);
- if (($line =~ /$title_query/)){
- my $title = $line;
- my $counter = 0;
- foreach my $tag (@tags){
- $line = <FILEB>;
- print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;
- $counter++;
- }
- }
- else{
- foreach my $tag (@tags){
- my $tine = <FILEB>;
- }
- }
-
- }
- }
-
- foreach my $hand (@handles){
- $hand->close();
- }
-
- close FILEB;
- }
- #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
- #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
- sub multi_species_t2{
-
- my $input = $_[0];
- my $species = $_[1];
- my $output1 = $input."_unr";
-
- #------------------------------------------------------------------------------------------
- open (FILEF1, "<$input") or die "Cannot open file $input :$!";
- open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
-
- my $line1 = <FILEF1>;
-
- while($line1){
- {
- # chomp($line);
- if ($line1 =~ (m/^\>$species/)){
- chomp($line1);
- print FILEF2 $line1;
- $line1 = <FILEF1>;
- chomp($line1);
- print FILEF2 "\t", $line1,"\n";
- }
- }
- $line1 = <FILEF1>;
- }
-
- close FILEF1;
- close FILEF2;
- #------------------------------------------------------------------------------------------
-
- my $output2 = $output1."and";
- my $output3 = $output1."and2";
- open(IN,"<$output1");
- open (FILEF3, ">$output2");
- open (FILEF4, ">$output3");
-
-
- while (<IN>){
- my $line = $_;
- chomp($line);
- my @fields=split (/\t/, $line);
- # print $line,"\n"; <STDIN>;
- if($line !~ /random/){
- print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
- print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";
- }
- }
-
-
- close IN;
- close FILEF3;
- close FILEF4;
- unlink $output1;
-
- #------------------------------------------------------------------------------------------
- # OLD T3.PL RUDIMENT
-
- my $t3output = $output2;
- $t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
-
- open(IN,"<$output2");
- open(OUTA,">$t3output");
-
-
- while (<IN>){
- s/-//g unless /^>/;
- print OUTA;
- }
-
- close IN;
- close OUTA;
- #------------------------------------------------------------------------------------------
- }
- #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
- #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx
- sub multi_species_t2_2{
- #print "IN multi_species_t2_2 : @_\n";
- my $input = $_[0];
- my $species = $_[1];
- my $output1 = $input."2";
-
-
- open (FILEF1, "<$input");
- open (FILEF2, ">$output1");
-
- my $line1 = <FILEF1>;
-
- while($line1){
- {
- # chomp($line);
- if ($line1 =~ (m/^\>$species/)){
- chomp($line1);
- print FILEF2 $line1;
- $line1 = <FILEF1>;
- chomp($line1);
- print FILEF2 "\t", $line1,"\n";
- }
- }
- $line1 = <FILEF1>;
- }
-
- close FILEF1;
- close FILEF2;
- }
- #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx
- #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
- sub sputnikoutput_corrector{
- my $input = $_[0];
- my $output = $_[1];
- open(IN,"<$input") or die "Cannot open file $input :$!";
- open(OUT,">$output") or die "Cannot open file $output :$!";
- my $tine;
- while (my $line=<IN>){
- if($line =~/length /){
- $tine = $line;
- $tine =~ s/\s+/\t/g;
- my @fields = split(/\t/,$tine);
- if ($fields[6] > 60){
- print OUT $line;
- $line = <IN>;
-
- while (($line !~ /nucleotide/) && ($line !~ /^>/)){
- chomp $line;
- print OUT $line;
- $line = <IN>;
- }
- print OUT "\n";
- print OUT $line;
- }
- else{
- print OUT $line;
- }
- }
- else{
- print OUT $line;
- }
- }
- close IN;
- close OUT;
- }
- #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
- #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
- sub multi_species_t4{
- # print "multi_species_t4 : @_\n";
- my $input = $_[0];
- my $output = $_[1];
- open (FILEA, "<$input");
- open (FILEB, ">$output");
-
- my $line = <FILEA>;
-
- while ($line) {
- # chomp $line;
- if ($line =~ />/) {
- chomp $line;
- print FILEB $line, "\n";
- }
-
-
- if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
- chomp $line;
- print FILEB $line, " " ;
- $line = <FILEA>;
- chomp $line;
- print FILEB $line,"\n";
- }
-
- $line = <FILEA>;
- }
-
-
- close FILEA;
- close FILEB;
- }
- #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
- #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
- sub multi_species_t5{
-
- my $input = $_[0];
- my $output = $_[1];
-
- open(FILEB,"<$input");
- open(FILEC,">$output");
-
- my $curdef;
-
- while (my $line = <FILEB> ) {
-
- if ($line =~ /^>/){
- chomp $line;
- $curdef = $line;
- next;
- }
-
- if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
- print FILEC $curdef," ",$line;
- }
-
- }
-
-
- close FILEB;
- close FILEC;
- }
- #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
- #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
- sub multi_species_t6{
- my $input = $_[0];
- my $output = $_[1];
- my $focalstrand=$_[3];
- # print "inpput = @_\n";
- open (FILE, "<$input");
- open (FILE_MICRO, ">$output");
- my $linecounter=0;
- while (my $line = <FILE>){
- $linecounter++;
- chomp $line;
- #print "line = $line\n";
- #MONO#
- $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
- my $strand=$1;
- my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
- #print "line = $line\n";
- my $specfieldsend = 2 + ($no_of_species*4) - 1;
- my @fields = split(/\s+/, $line);
- my @speciesdata = @fields[0 ... $specfieldsend];
- $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
- my ($tide, $start, $end) = ($1, $2, $3);
- #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
- if($line =~ /mononucleotide/){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n";
- }
- #DI#
- elsif($line =~ /dinucleotide/){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n";
- }
- #TRI#
- elsif($line =~ /trinucleotide/ ){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n";
- }
- #TETRA#
- elsif($line =~ /tetranucleotide/){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n";
- }
- #PENTA#
- elsif($line =~ /pentanucleotide/){
- #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n";
- }
- else{
- # print "not: @fields\n";
- }
- }
- # print "linecounter=$linecounter\n";
- close FILE;
- close FILE_MICRO;
- }
- sub mono {
- my $st = $_[0];
- my $tp = unpack "A1"x(length($st)/1),$st;
- my $var1 = substr($tp, 0, 1);
- return join ("\t", $var1);
- }
- sub di {
- my $st = $_[0];
- my $tp = unpack "A2"x(length($st)/2),$st;
- my $var1 = substr($tp, 0, 2);
- return join ("\t", $var1);
- }
- sub tri {
- my $st = $_[0];
- my $tp = unpack "A3"x(length($st)/3),$st;
- my $var1 = substr($tp, 0, 3);
- return join ("\t", $var1);
- }
- sub tetra {
- my $st = $_[0];
- my $tp = unpack "A4"x(length($st)/4),$st;
- my $var1 = substr($tp, 0, 4);
- return join ("\t", $var1);
- }
- sub penta {
- my $st = $_[0];
- my $tp = unpack "A5"x(length($st)/5),$st;
- my $var1 = substr($tp, 0, 5);
- return join ("\t", $var1);
- }
- #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
- #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
- sub t9{
- my $input1 = $_[0];
- my $input2 = $_[1];
- my $output = $_[2];
-
-
- open(IN1,"<$input1") if -e $input1;
- open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
- open(OUT,">$output") or die "cannot open file $_[2] : $!";
-
-
- my %seen = ();
- my $prevkey = 0;
-
- if (-e $input1){
- while (my $line = <IN1>){
- chomp($line);
- my @fields = split(/\t/,$line);
- my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
- # print "key in t9 = $key1\n";
- $seen{$key1}++ if ($prevkey ne $key1) ;
- $prevkey = $key1;
- }
- # print "done first hash\n";
- close IN1;
- }
-
- while (my $line = <IN2>){
- # print $line, "**\n";
- if (-e $input1){
- chomp($line);
- my @fields = split(/\t/,$line);
- my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
- if (exists $seen{$key2}){
- print OUT "$line\n" ;
- delete $seen{$key2};
- }
- }
- else {
- print OUT "$line\n" ;
- # print "$line\n" ;
- }
- }
-
- close IN2;
- close OUT;
- }
- #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
- sub multiSpecies_compound_microsat_hunter3{
-
- my $input1 = $_[0]; ###### the *_sput_op4_ii file
- my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
- my $output1 = $_[2]; ###### plain microsatellite file
- my $output2 = $_[3]; ###### compound microsatellite file
- my $org = $_[4]; ###### 1 or 2
- $no_of_species = $_[5];
- #print "IN multiSpecies_compound_microsat_hunter3: @_\n";
- #my @tags = split(/\t/,$info);
- sub compoundify;
- open(IN,"<$input1") or die "Cannot open file $input1 $!";
- open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
- open(OUT,">$output1") or die "Cannot open file $output1 $!";
- open(OUT2,">$output2") or die "Cannot open file $output2 $!";
- $infocord = 2 + (4*$no_of_species) - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
-
- my @thresholds = ("0");
- push(@thresholds, split(/_/,$_[6]));
- sub thresholdCheck;
- my %micros = ();
- while (my $line = <IN>){
- # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
- next if $line =~ /\t\t/;
- if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $3, $4, $5);
- # print $key, "#-#-#-#-#-#-#-#\n";
- push (@{$micros{$key}},$line);
- }
- else{
- }
- }
- close IN;
- my @deletedlines = ();
-
- my $linecount = 0;
-
- while(my $sine = <SEQ>){
- my %microstart=();
- my %microend=();
-
- my @sields = split(/\t/,$sine);
-
- my $key = ();
-
- if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $key = join("\t",$1, $2, $3, $4, $5);
- # print $key, "<-<-<-<-<-<-<-<\n";
- }
- else{
- }
-
- if (exists $micros{$key}){
- $linecount++;
- my @microstring = @{$micros{$key}};
- my @tempmicrostring = @{$micros{$key}};
-
- foreach my $line (@tempmicrostring){
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- push (@{$microstart{$start}},$line);
- push (@{$microend{$end}},$line);
- }
- my $firstflag = 'down';
- while( my $line =shift(@microstring)){
- # print "-----------\nline = $line ";
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- my $startmicro = $line;
- my $endmicro = $line;
-
- # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
-
- delete ($microstart{$start});
- delete ($microend{$end});
- my $flag = 'down';
- my $startflag = 'down';
- my $endflag = 'down';
- my $prestart = $start - $distance;
- my $postend = $end + $distance;
- my @compoundlines = ();
- my %compoundhash = ();
- push (@compoundlines, $line);
- push (@{$compoundhash{$line}},$line);
- my $startrank = 1;
- my $endrank = 1;
-
- while( ($startflag eq "down") || ($endflag eq "down") ){
- if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
- # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
- last;
- }
-
- $firstflag = "up";
- if ($startflag eq "down"){
- for my $i ($prestart ... $start){
-
- if(exists $microend{$i}){
- chomp $microend{$i}[0];
- if(exists $compoundhash{$microend{$i}[0]}) {next;}
- # print "sending from microend $startmicro, $microend{$i}[0] |||\n";
- if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
- push(@compoundlines, $microend{$i}[0]);
- # print "accepted\n";
- my @tields = split(/\t/,$microend{$i}[0]);
- $startmicro = $microend{$i}[0];
- chomp $startmicro;
- $start = $tields[$startcord];
- $flag = 'down';
- $startrank++;
- # print "startcompund = $microend{$i}[0]\n";
- delete $microend{$i};
- delete $microstart{$start};
- $startflag = 'down';
- $prestart = $start - $distance;
- last;
- }
- else{
- $flag = 'up';
- $startflag = 'up';
- }
- }
- else{
- $flag = 'up';
- $startflag = 'up';
- }
- }
- }
-
- $endrank = $startrank;
-
- if ($endflag eq "down"){
- for my $i ($end ... $postend){
-
- if(exists $microstart{$i} ){
- chomp $microstart{$i}[0];
- if(exists $compoundhash{$microstart{$i}[0]}) {next;}
- # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
-
- if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
- push(@compoundlines, $microstart{$i}[0]);
- # print "accepted\n";
- my @tields = split(/\t/,$microstart{$i}[0]);
- $end = $tields[$endcord]-0;
- $endmicro = $microstart{$i}[0];
- $endrank++;
- chomp $endmicro;
- $flag = 'down';
- # print "endcompund = $microstart{$i}[0]\n";
- delete $microstart{$i};
- delete $microend{$end};
- shift @microstring;
- $postend = $end + $distance;
- $endflag = 'down';
- last;
- }
- else{
- $flag = 'up';
- $endflag = 'up';
- }
- }
- else{
- $flag = 'up';
- $endflag = 'up';
- }
- }
- }
- # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
- } #end while( $flag eq "down")
- # print "compoundlines = @compoundlines \n";
- if (scalar (@compoundlines) == 1){
- print OUT $line,"\n";
- }
- if (scalar (@compoundlines) > 1){
- my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
- # print $compoundline,"\n";
- print OUT2 $compoundline,"\n";
- }
- } #end foreach my $line (@microstring){
- } #if (exists $micros{$key}){
-
-
- }
-
- close OUT;
- close OUT2;
- }
- #------------------------------------------------------------------------------------------------
- sub compoundify{
- my ($compoundlines, $sequence) = @_;
- # print "\nfound to compound : @$compoundlines and$sequence \n";
- my $noOfComps = @$compoundlines;
- # print "Number of elements in hash is $noOfComps\n";
- my @starts;
- my @ends;
- foreach my $line (@$compoundlines){
- # print "compoundify.. line = $line \n";
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- # print "start = $start, end = $end \n";
- push(@starts, $start);
- push(@ends,$end);
- }
- my @temp = @$compoundlines;
- my $startline=$temp[0];
- my @mields = split(/\t/,$startline);
- my $startcoord = $mields[$startcord];
- my $startgapsign=$mields[$endcord];
- my @startsorted = sort { $a <=> $b } @starts;
- my @endsorted = sort { $a <=> $b } @ends;
- my @intervals;
- for my $end (0 ... (scalar(@endsorted)-2)){
- my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
- push(@intervals,$interval);
- # print "interval = $interval =\n";
- # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
- }
- push(@intervals,"");
- my $compoundmicrosat=();
- my $multiunit="";
- foreach my $line (@$compoundlines){
- my @fields = split(/\t/,$line);
- my $component="[".$fields[$microsatcord]."]".shift(@intervals);
- $compoundmicrosat=$compoundmicrosat.$component;
- $multiunit=$multiunit."[".$fields[$motifcord]."]";
- # print "multiunit = $multiunit\n";
- }
- my $compoundcopy = $compoundmicrosat;
- $compoundcopy =~ s/\[|\]//g;
- my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
-
-
- my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
- return $compoundline;
- }
- #------------------------------------------------------------------------------------------------
- sub identityMatch_thresholdCheck{
- my $line1 = $_[0];
- my $line2 = $_[1];
- my $rank = $_[2];
- my @lields1 = split(/\t/,$line1);
- my @lields2 = split(/\t/,$line2);
- # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
-
- if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
- my $probe = $lields1[$motifcord].$lields1[$motifcord];
- #print "$probe :: $lields2[$motifcord]\n";
- return "proceed" if $probe =~ /$lields2[$motifcord]/;
- #print "line recieved\n";
- if ($rank ==1){
- return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
- }
- else {
- return "proceed" if thresholdCheck($line2) eq "proceed";
- return "stop";
- }
- }
- else{
- if ($rank ==1){
- return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
- }
- else {
- return "proceed" if thresholdCheck($line2) eq "proceed";
- return "stop";
- }
- }
- return "stop";
- }
- #------------------------------------------------------------------------------------------------
- sub thresholdCheck{
- my @checkthresholds=(0,@thresholds);
- #print "IN thresholdCheck: @_\n";
- my $line = $_[0];
- my @lields = split(/\t/,$line);
- return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
- return "stop";
- }
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
- sub multiSpecies_filtering_interrupted_microsats{
- # print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
- my $unfiltered = $_[0];
- my $filtered = $_[1];
- my $residue = $_[2];
- my $no_of_species = $_[5];
- open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
- open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
- open(RES,">$residue") or die "Cannot open file $residue: $!";
-
- $infocord = 2 + (4*$no_of_species) - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
-
-
- my @sub_thresholds = (0);
-
- push(@sub_thresholds, split(/_/,$_[3]));
- my @thresholds = (0);
-
- push(@thresholds, split(/_/,$_[4]));
-
- while (my $line = <UNF>) {
- next if $line !~ /[a-z]/;
- #print $line;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $motif = $fields[$motifcord];
- my $realmotif = $motif;
- #print "motif = $motif\n";
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- my @motifs = split(/\]/,$motif);
- $realmotif = $motifs[0];
- }
- # print "realmotif = $realmotif";
- my $motif_size = length($realmotif);
-
- my $microsat = $fields[$microsatcord];
- # print "microsat = $microsat\n";
- $microsat =~ s/^\[|\]$//sg;
- my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
-
- $microsat = join("",@microsats);
- if (length($microsat) < $thresholds[$motif_size]) {
- # print length($microsat)," < ",$thresholds[$motif_size],"\n";
- print RES $line,"\n"; next;
- }
- my @lengths = ();
- foreach my $mic (@microsats){
- push(@lengths, length($mic));
- }
- if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
- # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n";
- print RES $line,"\n"; next;}
- else {print FIL $line,"\n"; next;
- }
- }
- close FIL;
- close RES;
- }
- sub largest_microsat{
- my $counter = 0;
- my($max) = shift(@_);
- foreach my $temp (@_) {
- #print "finding largest array: $maxcounter \n";
- if($temp > $max){
- $max = $temp;
- }
- }
- return($max);
- }
- #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
- sub multiSpecies_compound_microsat_analyzer{
- ####### PARAMETER ########
- ##########################
-
- my $input1 = $_[0]; ###### the *_sput_op4_ii file
- my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
- my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
- my $output2 = $_[3]; ###### the pure compound microsatellites
- my $org = $_[4];
- my $no_of_species = $_[5];
- # print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n";
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
-
- open(IN,"<$input1") or die "Cannot open file $input1 $!";
- open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
-
- open(OUT,">$output1") or die "Cannot open file $output1 $!";
- open(OUT2,">$output2") or die "Cannot open file $output2 $!";
-
-
- # print "opened files \n";
- my %micros = ();
- my $keycounter=0;
- my $linecounter=0;
- while (my $line = <IN>){
- $linecounter++;
- if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
- push (@{$micros{$key}},$line);
- $keycounter++;
- }
- else{
- # print "no key\n";
- }
- }
- close IN;
- my @deletedlines = ();
- # print "done hash . linecounter=$linecounter, keycounter=$keycounter\n";
- #---------------------------------------------------------------------------------------------------
- # NOW READING THE SEQUENCE FILE
- my $keyfound=0;
- my $keyexists=0;
- my $inter=0;
- my $pure=0;
-
- while(my $sine = <SEQ>){
- my %microstart=();
- my %microend=();
- my @sields = split(/\t/,$sine);
- my $key = 0;
- if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
- # print $sine;
- # print $key;
- $keyfound++;
- }
- else{
-
- }
- #<STDIN> if !defined $key;
- if (exists $micros{$key}){
- $keyexists++;
- my @microstring = @{$micros{$key}};
-
- my @filteredmicrostring;
-
- foreach my $line (@microstring){
- chomp $line;
- my $copy_line = $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- # FOR COMPOUND MICROSATELLITES
- if ($fields[$typecord] eq "compound"){
- $line = compound_microsat_analyser($line);
- if ($line eq "NULL") {
- print OUT2 "$copy_line\n";
- $pure++;
- next;
- }
- else{
- print OUT "$line\n";
- $inter++;
- next;
- }
- }
- }
-
- } #if (exists $micros{$key}){
- }
- close OUT;
- close OUT2;
- # print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n";
- }
-
- sub compound_microsat_analyser{
- my $line = $_[0];
- my @fields = split(/\t/,$line);
- my $motifline = $fields[$motifcord];
- my $microsat = $fields[$microsatcord];
- $motifline =~ s/^\[|\]$//g;
- $microsat =~ s/^\[|\]$//g;
- $microsat =~ s/-//g;
- my @interruptions = ();
- my @motields = split(/\]\[/,$motifline);
- my @microields = split(/\][a-zA-Z|-]*\[/,$microsat);
- my @inields = split(/[.*]/,$microsat);
- shift @inields;
- my @motifcount = scalar(@motields);
- my $prevmotif = $motields[0];
- my $prevmicro = $microields[0];
- my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0]));
- my $localflag = 'down';
- my @infoarray = ();
-
- for my $l (1 ... (scalar(@motields)-1)){
- my $probe = $prevmotif.$prevmotif;
- if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;}
-
- if ($probe =~ /$motields[$l]/i){
- my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l]));
- my $curr_startphase = substr($microields[$l],0,length($motields[$l]));
- if ($curr_startphase =~ /$prevphase/i) {
- $infoarray[$l-1] = "insertion";
- }
- else {
- $infoarray[$l-1] = "indel/substitution";
- }
-
- $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase;
- next;
- }
- else {$localflag = "up"; last;}
- }
- if ($localflag eq 'up') {return "NULL";}
-
- if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";}
- if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";}
- if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";}
- if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";}
- if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";}
- @microields = split(/[\[|\]]/,$microsat);
- my @microsats = ();
- my @positions = ();
- my $lengthtracker = 0;
- for my $i (0 ... (scalar(@microields ) - 1)){
- if ($i%2 == 0){
- push(@microsats,$microields[$i]);
- $lengthtracker = $lengthtracker + length($microields[$i]);
- }
- else{
- push(@interruptions,$microields[$i]);
- push(@positions, $lengthtracker+1);
- $lengthtracker = $lengthtracker + length($microields[$i]);
- }
-
- }
- my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions)));
- return($returnline);
- }
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
- sub multiSpecies_compoundClarifyer{
- # print "IN multiSpecies_compoundClarifyer: @_\n";
- my $input1 = $_[0]; ###### the *_sput_compound
- my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
- my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
- my $output2 = $_[3]; ###### compound file
- my $org = $_[4];
- my $no_of_species = $_[5];
- @thresholds = "0";
- push(@thresholds, split(/_/,$_[6]));
-
-
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
-
- $interr_poscord = $motifcord + 3;
- $no_of_interruptionscord = $motifcord + 4;
- $interrcord = $motifcord + 2;
- $interrtypecord = $motifcord + 1;
-
-
- open(IN,"<$input1") or die "Cannot open file $input1 $!";
- open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
-
- open(INT,">$output1") or die "Cannot open file $output2 $!";
- open(COMP,">$output2") or die "Cannot open file $output2 $!";
- #open(CH,">changed") or die "Cannot open file changed $!";
-
- # print "opened files \n";
- my $linecounter = 0;
- my $microcounter = 0;
-
- my %micros = ();
- while (my $line = <IN>){
- # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
- $linecounter++;
- if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
- my $key = join("\t",$1, $2, $3, $4);
- # print $key, "#-#-#-#-#-#-#-#\n";
- # print "key = $key\n";
- push (@{$micros{$key}},$line);
- $microcounter++;
- }
- else {#print $line," key not made\n"; <STDIN>;
- }
- }
- # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
- close IN;
- my @deletedlines = ();
- # print "done hash \n";
- $linecounter = 0;
- #---------------------------------------------------------------------------------------------------
- # NOW READING THE SEQUENCE FILE
- my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide);
- $printer = 0;
-
- while(my $sine = <SEQ>){
- my %microstart=();
- my %microend=();
- my @sields = split(/\t/,$sine);
- my $key = ();
-
- # print "sine = $sine. focalspec = $focalspec \n"; #<STDIN>;
-
- if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
-
- # if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) {
- $key = join("\t",$1, $2, $3, $4);
- # print "key = $key\n";
- }
- else{
- # print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n";
- }
-
- if (exists $micros{$key}){
- my @microstring = @{$micros{$key}};
- delete $micros{$key};
-
- foreach my $line (@microstring){
- # print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1;
- # print "microsat = $line" if $printer == 1;
- $linecounter++;
- my $copy_line = $line;
- my @mields = split(/\t/,$line);
- my @fields = @mields;
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- my $microsat = $fields[$microsatcord];
- my $motifline = $fields[$motifcord];
- my $microsatcopy = $microsat;
- my $positioner = $microsat;
- $positioner =~ s/[a-zA-Z|-]/_/g;
- $microsatcopy =~ s/^\[|\]$//gs;
- chomp $microsatcopy;
- my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
- my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
- my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start);
- # print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1;
- shift @inields;
- #print "inields =@inields<\n";
- $motifline =~ s/^\[|\]$//gs;
- chomp $motifline;
- #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n";
- my @motields = split(/\]\[/,$motifline);
- my $seq = $microsatcopy;
- $seq =~ s/\[|\]//g;
- my $seqlen = length($seq);
- $seq = " ".$seq;
-
- my $longestmotif_no = longest_array_element(@motields);
- my $shortestmotif_no = shortest_array_element(@motields);
- #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n";
-
- my $search = $motields[$longestmotif_no].$motields[$longestmotif_no];
- if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){
- print COMP $line;
- next;
- }
-
- my @shortestmotif_nos = ();
- for my $m (0 ... $#motields){
- push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) );
- }
- ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
- my $newleft =();
- my $leftstopper = 0; my $rightstopper = 0;
- foreach my $shortmotif_no (@shortestmotif_nos){
- next if $shortmotif_no == 0;
- my $last_left = $shortmotif_no; #$#motields;
- my $last_hitter = 0;
- for (my $i =($shortmotif_no-1); $i>=0; $i--){
- my $search = $motields[$shortmotif_no];
- if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
- if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
- $last_hitter = 1;
- $last_left = $i+1; last;
- }
- my $probe = $motields[$i];
- if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
-
- if ($probe !~ /$search/){
- $last_hitter = 1;
- $last_left = $i+1;
- # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n";
- last;
- }
- $last_left--;$last_hitter = 1;
- # print "passed tests, last left = $last_left\n";
- }
- # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n";
- if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;}
- else {$leftstopper = 1;
- # print "leftstopper = 1\n";
- }
- }
-
- ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
- my $newright =();
- foreach my $shortmotif_no (@shortestmotif_nos){
- next if $shortmotif_no == $#motields;
- my $last_right = $shortmotif_no;# -1;
- for my $i ($shortmotif_no+1 ... $#motields){
- my $search = $motields[$shortmotif_no];
- if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
- if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
- $last_right = $i-1; last;
- }
- my $probe = $motields[$i];
- if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
- if ( $probe !~ /$search/){
- $last_right = $i-1; last;
- }
- $last_right++;
- }
- if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n";
- }
- else {$rightstopper = 1;
- }
- }
-
-
- if ($rightstopper == 1 && $leftstopper == 1){
- print COMP $line;
- # print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1;
- next;
- }
-
- # print "pased initial testing phase \n" if $printer == 1;
- my @outputs = ();
- my @orig_starts = ();
- my @orig_ends = ();
- for my $mic (0 ... $#microields){
- my $miclen = length($microields[$mic]);
- my $microleftlen = 0;
- #print "\nmic = $mic\n";
- if($mic > 0){
- for my $submin (0 ... $mic-1){
- my $interval = ();
- if (!exists $inields[$submin]) {$interval = "";}
- else {$interval = $inields[$submin];}
- #print "inield =$interval< and microield =$microields[$submin]<\n ";
- $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
- }
- }
- push(@orig_starts,($microleftlen+1));
- push(@orig_ends, ($microleftlen+1 + $miclen -1));
- }
-
- ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
-
-
- for my $mic (0 ... $#microields){
- my $miclen = length($microields[$mic]);
- my $microleftlen = 0;
- if($mic > 0){
- for my $submin (0 ... $mic-1){
- # if(!exists $inields[$submin]) {$inields[$submin] = "";}
- my $interval = ();
- if (!exists $inields[$submin]) {$interval = "";}
- else {$interval = $inields[$submin];}
- #print "inield =$interval< and microield =$microields[$submin]<\n ";
- $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
- }
- }
- $fields[$startcord] = $microleftlen+1;
- $fields[$endcord] = $fields[$startcord] + $miclen -1;
- $fields[$typecord] = $microsat_types[length($motields[$mic])];
- $fields[$microsatcord] = $microields[$mic];
- $fields[$motifcord] = $motields[$mic];
- my $templine = join("\t", (@fields[0 .. $motifcord]) );
- my $orig_templine = join("\t", (@fields[0 .. $motifcord]) );
- my $newline;
- my $lefter = 1; my $righter = 1;
- if ( $fields[$startcord] < 2){$lefter = 0;}
- if ($fields[$endcord] == $seqlen){$righter = 0;}
-
- while($lefter == 1){
- $newline = left_extender($templine, $seq,$org);
- # print "returned line from left extender= $newline \n" if $printer == 1;
- if ($newline eq $templine){$templine = $newline; last;}
- else {$templine = $newline;}
-
- if (left_extention_permission_giver($templine) eq "no") {last;}
- }
- while($righter == 1){
- $newline = right_extender($templine, $seq,$org);
- # print "returned line from right extender= $newline \n" if $printer == 1;
- if ($newline eq $templine){$templine = $newline; last;}
- else {$templine = $newline;}
- if (right_extention_permission_giver($templine) eq "no") {last;}
- }
- my @tempfields = split(/\t/,$templine);
- $tempfields[$microsatcord] =~ s/\]|\[//g;
- $tempfields[$motifcord] =~ s/^\[|\]$//gs;
- my @tempmotields = split(/\]\[/,$tempfields[$motifcord]);
-
- if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) {
- # print "scalar ( tempmotields) = 1\n" if $printer == 1;
- next;
- }
- my $prevmotif = shift(@tempmotields);
- my $stopper = 0;
-
- foreach my $tempmot (@tempmotields){
- if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;}
- my $search = $prevmotif.$prevmotif;
- if ($search !~ /$tempmot/) {$stopper = 1; last;}
- $prevmotif = $tempmot;
- }
- if ( $stopper == 1) {
- # print "length tempmot != length prevmotif\n" if $printer == 1;
- next;
- }
- my $lastend = 0;
- #----------------------------------------------------------
- my $left_captured = (); my $right_captured = ();
- my $left_bp = (); my $right_bp = ();
- # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n";
- for my $o (0 ... $#orig_starts){
- # print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1;
- # print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1;
-
- if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord])
- # print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1;
- $left_captured = $o;
- $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1;
- }
- elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord])
- # print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1;
- $right_captured = $o;
- $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1;
- }
- $lastend = $orig_ends[$o]
- }
- # print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1;
- my $leftmotif = (); my $left_trashed = ();
- if ($tempfields[$startcord] != $fields[$startcord]) {
- $leftmotif = $motields[$left_captured];
- # print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1;
- if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n";
- }
- $left_trashed = length($microields[$left_captured]) - $left_bp;
- }
- my $rightmotif = (); my $right_trashed = ();
- if ($tempfields[$endcord] != $fields[$endcord]) {
- # print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1;
- $rightmotif = $motields[$right_captured];
- $right_trashed = length($microields[$right_captured]) - $right_bp;
- }
-
- ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
- $stopper = 0;
- my $deletioner = 0;
- #if($tempfields[$startcord] != $fields[$startcord]){
- # print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1;
- if ($left_captured != 0){
- # print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1;
- for my $e (0 ... $left_captured-1){
- if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){
- # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
- $deletioner++; last;
- }
- if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
- # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
- $deletioner++; last;
- }
- if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
- # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
- $deletioner++; last;
- }
- }
- }
- #}
- # print "after left search, deletioner = $deletioner\n" if $printer == 1;
- if ($deletioner >= 1) {
- # print "deletioner = $deletioner\n" if $printer == 1;
- next;
- }
-
- $deletioner = 0;
-
- #if($tempfields[$endcord] != $fields[$endcord]){
- # print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1;
- if ($right_captured != $#microields){
- # print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1;
- for my $e ($right_captured+1 ... $#microields){
- if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){
- # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
- $deletioner++; last;
- }
- if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
- # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
- $deletioner++; last;
- }
- if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
- # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
- $deletioner++; last;
- }
- }
- }
- #}
- # print "deletioner = $deletioner\n" if $printer == 1;
- if ($deletioner >= 1) {
- next;
- }
- my $leftMotifs_notCaptured = ();
- my $rightMotifs_notCaptured = ();
-
- if ($tempfields[$startcord] != $fields[$startcord] ){
- #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n";
- if (length($leftmotif) == 1 && $left_trashed > 3){
- # print "invaded left motif is long mononucleotide" if $printer == 1;
- next;
-
- }
- elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){
- # print "invaded left motif too long" if $printer == 1;
- next;
- }
- }
- if ($tempfields[$endcord] != $fields[$endcord] ){
- #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n";
- if (length($rightmotif)==1 && $right_trashed){
- # print "invaded right motif is long mononucleotide" if $printer == 1;
- next;
-
- }
- elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){
- # print "invaded right motif too long" if $printer == 1;
- next;
-
- }
- }
- push @outputs, $templine;
- }
- if (scalar(@outputs) == 0){ print COMP $line; next;}
- # print "outputs are:", join("\n",@outputs),"\n";
- if (scalar(@outputs) == 1){
- my @oields = split(/\t/,$outputs[0]);
- my $start = $oields[$startcord]+$mields[$startcord]-1;
- my $end = $start+($oields[$endcord]-$oields[$startcord]);
- $oields[$startcord] = $start; $oields[$endcord] = $end;
- print INT join("\t",@oields), "\n";
- # print CH $line,;
- }
- if (scalar(@outputs) > 1){
- my $motif_min = 10;
- my $chosen_one = $outputs[0];
- foreach my $micro (@outputs){
- my @oields = split(/\t/,$micro);
- my $tempmotif = $oields[$motifcord];
- $tempmotif =~ s/^\[|\]$//gs;
- my @omots = split(/\]\[/, $tempmotif);
- # print "motif_min = $motif_min, current motif = $tempmotif\n";
- my $start = $oields[$startcord]+$mields[$startcord]-1;
- my $end = $start+($oields[$endcord]-$oields[$startcord]);
- $oields[$startcord] = $start; $oields[$endcord] = $end;
- if(length($omots[0]) < $motif_min) {
- $chosen_one = join("\t",@oields);
- $motif_min = length($omots[0]);
- }
- }
- print INT $chosen_one, "\n";
- # print "chosen one is ".$chosen_one, "\n";
- # print CH $line;
-
-
- }
-
- }
-
- } #if (exists $micros{$key}){
- else{
- }
- }
- close INT;
- close COMP;
- }
- sub left_extender{
- #print "left extender\n";
- my ($line, $seq, $org) = @_;
- # print "in left extender... line passed = $line and sequence is $seq\n";
- chomp $line;
- my @fields = split(/\t/,$line);
- my $rstart = $fields[$startcord];
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/\[|\]//g;
- my $rend = $rstart + length($microsat)-1;
- $microsat =~ s/-//g;
- my $motif = $fields[$motifcord];
- my $firstmotif = ();
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- }
- else {$firstmotif = $motif;}
-
- #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n";
- my $leftphase = substr($microsat, 0,length($firstmotif));
- my $phaser = $leftphase.$leftphase;
- my @phase = split(/\s*/,$leftphase);
- my @phases;
- my @copy_phases = @phases;
- my $crawler=0;
- for (0 ... (length($leftphase)-1)){
- push(@phases, substr($phaser, $crawler, length($leftphase)));
- $crawler++;
- }
- my $start = $rstart;
- my $end = $rend;
-
- my $leftseq = substr($seq, 0, $start);
- # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n";
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $firstmotif_length = length($firstmotif);
- foreach my $phase (@phases){
- # print "left phase\t",substr($leftseq, -10),"\t$phase\n";
- # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n";
- if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
- # print "in left pattern\n";
- my $trapped = $1;
- my $trappedpos = length($leftseq)-length($trapped);
- my $interval = $3;
- my $intervalpos = index($trapped, $interval) + 1;
- # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n";
- my $extention = substr($trapped, 0, length($trapped)-length($interval));
- my $leftpeep = substr($seq, 0, ($start-length($trapped)));
- my @passed_overhangs;
-
- for my $i (1 ... length($phase)-1){
- my $overhang = substr($phase, -length($phase)+$i);
- # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n";
- #TEMPORARY... BETTER METHOD NEEDED
- $leftpeep =~ s/-//g;
- if ($leftpeep =~ /$overhang$/i){
- push(@passed_overhangs,$overhang);
- # print "l overhang\n";
- }
- }
-
- if(scalar(@passed_overhangs)>0){
- my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
- $extention = $overhang.$extention;
- $trapped = $overhang.$trapped;
- #print "trapped extended to $trapped \n";
- $trappedpos = length($leftseq)-length($trapped);
- }
-
- push(@extentions,$extention);
- # print "extentions = @extentions \n";
- push(@trappeds,$trapped );
- push(@intervalposs,length($extention)+1);
- push(@trappedposs, $trappedpos);
- # print "trappeds = @trappeds\n";
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds == 0)) {return $line;}
-
- my $nikaal = shortest_array_element(@intervals);
-
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
- ##print "new fields 9 = $fields[9]\n";
- $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
-
- $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
-
- if (exists ($fields[$motifcord+1])){
- $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- ##print "new fields 14 = $fields[14]\n";
-
- if (exists ($fields[$motifcord+2])){
- $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- my @seventeen=();
- if (exists ($fields[$motifcord+3])){
- @seventeen = split(/,/,$fields[$motifcord+3]);
- # #print "scalarseventeen =@seventeen<-\n";
- for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
- $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- }
-
- else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
-
- ##print "new fields 16 = $fields[16]\n";
- ##print "new fields 17 = $fields[17]\n";
-
-
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_compoundClarifyer_merge($returnline);
- }
- return $returnline;
- }
- sub right_extender{
- my ($line, $seq, $org) = @_;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $rstart = $fields[$startcord];
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/\[|\]//g;
- my $rend = $rstart + length($microsat)-1;
- $microsat =~ s/-//g;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- if ($motif =~ /\]$/s){
- $motif =~ s/\]$//sg;
- $motif =~ /.*\[([a-zA-Z]+)/;
- $temp_lastmotif = $1;
- }
- else {$temp_lastmotif = $motif;}
- my $lastmotif = substr($microsat,-length($temp_lastmotif));
- ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n";
- my $rightphase = substr($microsat, -length($lastmotif));
- my $phaser = $rightphase.$rightphase;
- my @phase = split(/\s*/,$rightphase);
- my @phases;
- my @copy_phases = @phases;
- my $crawler=0;
- for (0 ... (length($rightphase)-1)){
- push(@phases, substr($phaser, $crawler, length($rightphase)));
- $crawler++;
- }
- my $start = $rstart;
- my $end = $rend;
-
- my $rightseq = substr($seq, $end+1);
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $lastmotif_length = length($lastmotif);
- foreach my $phase (@phases){
- if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
- my $trapped = $1;
- my $trappedpos = $end+1;
- my $interval = $2;
- my $intervalpos = index($trapped, $interval) + 1;
- my $extention = substr($trapped, length($interval));
- my $rightpeep = substr($seq, ($end+length($trapped))+1);
- my @passed_overhangs = "";
-
- #TEMPORARY... BETTER METHOD NEEDED
- $rightpeep =~ s/-//g;
- for my $i (1 ... length($phase)-1){
- my $overhang = substr($phase,0, $i);
- # #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n";
- if ($rightpeep =~ /^$overhang/i){
- push(@passed_overhangs, $overhang);
- # #print "r overhang\n";
- }
- }
- if (scalar(@passed_overhangs) > 0){
- my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
- $extention = $extention.$overhang;
- $trapped = $trapped.$overhang;
- # #print "trapped extended to $trapped \n";
- }
-
- push(@extentions,$extention);
- ##print "extentions = @extentions \n";
- push(@trappeds,$trapped );
- push(@intervalposs,$intervalpos);
- push(@trappedposs, $trappedpos);
- # #print "trappeds = @trappeds\n";
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds == 0)) {return $line;}
-
- # my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
-
- # #print "longest element found = $nikaal \n";
-
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
- ##print "new fields 9 = $fields[9]";
- $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
- ##print "new fields 11 = $fields[11]\n";
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
-
- $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
- ##print "new fields 12 = $fields[12]\n";
-
- ##print "scalar of fields = ",scalar(@fields),"\n";
- if (exists ($fields[$motifcord+1])){
- # print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1];
- # <STDIN> if !exists $fields[$motifcord+1];
- $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- ##print "new fields 14 = $fields[14]\n";
-
- if (exists ($fields[$motifcord+2])){
- $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- ##print "new fields 15 = $fields[15]\n";
- my @seventeen=();
- if (exists ($fields[$motifcord+3])){
- ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n";
- # print " print fields = @fields\n " if !exists $fields[$motifcord+3];
- <STDIN> if !exists $fields[$motifcord+3];
- my $currpos = length($microsat)+$intervalposs[$nikaal];
- $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- }
-
- else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
-
- ##print "new fields 16 = $fields[16]\n";
-
- ##print "new fields 17 = $fields[17]\n";
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_compoundClarifyer_merge($returnline);
- }
- #print "finally right-extended line = ",$returnline,"\n";
- return $returnline;
- }
- sub longest_array_element{
- my $counter = 0;
- my($max) = shift(@_);
- my $maxcounter = 0;
- foreach my $temp (@_) {
- $counter++;
- #print "finding largest array: $maxcounter \n" if $prinkter == 1;
- if(length($temp) > length($max)){
- $max = $temp;
- $maxcounter = $counter;
- }
- }
- return($maxcounter);
- }
- sub shortest_array_element{
- my $counter = 0;
- my($min) = shift(@_);
- my $mincounter = 0;
- foreach my $temp (@_) {
- $counter++;
- #print "finding largest array: $mincounter \n" if $prinkter == 1;
- if(length($temp) < length($min)){
- $min = $temp;
- $mincounter = $counter;
- }
- }
- return($mincounter);
- }
- sub left_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/(^\[)|-//g;
- my $motif = $fields[$motifcord];
- my $firstmotif = ();
- my $firststretch = ();
- my @stretches=();
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- @stretches = split(/\]/,$microsat);
- $firststretch = $stretches[0];
- ##print "firststretch = $firststretch\n";
- }
- else {$firstmotif = $motif;$firststretch = $microsat;}
-
- if (length($firststretch) < $thresholds[length($firstmotif)]){
- return "no";
- }
- else {return "yes";}
- }
- sub right_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/-|(\]$)//sg;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- my $laststretch = ();
- my @stretches=();
- if ($motif =~ /\]/){
- $motif =~ s/\]$//gs;
- $motif =~ /.*\[([a-zA-Z]+)$/;
- $temp_lastmotif = $1;
- @stretches = split(/\[/,$microsat);
- $laststretch = pop(@stretches);
- ##print "last stretch = $laststretch\n";
- }
- else {$temp_lastmotif = $motif; $laststretch = $microsat;}
- if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
- return "no";
- }
- else { return "yes";}
- }
- sub multiSpecies_compoundClarifyer_merge{
- my $line = $_[0];
- #print "sent for mering: $line \n";
- my @mields = split(/\t/,$line);
- my @fields = @mields;
- my $microsat = $fields[$microsatcord];
- my $motifline = $fields[$motifcord];
- my $microsatcopy = $microsat;
- $microsatcopy =~ s/^\[|\]$//sg;
- my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
- my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
- shift @inields;
- #print "inields =@inields<\n";
- $motifline =~ s/^\[|\]$//sg;
- my @motields = split(/\]\[/,$motifline);
- my @firstmotifs = ();
- my @lastmotifs = ();
- for my $i (0 ... $#microields){
- $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
- $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
- }
- #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n";
- my @mergelist = ();
- my @inter_poses = split(/,/,$fields[$interr_poscord]);
- my $no_of_interruptions = $fields[$no_of_interruptionscord];
- my @interruptions = split(/,/,$fields[$interrcord]);
- my @interrtypes = split(/,/,$fields[$interrtypecord]);
- my $stopper = 0;
- for my $i (0 ... $#motields-1){
- #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n";
- if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){
- $stopper = 1;
- push(@mergelist, ($i)."_".($i+1));
- }
- }
-
- return $line if scalar(@mergelist) == 0;
-
- foreach my $merging (@mergelist){
- my @sets = split(/_/, $merging);
- my @tempmicro = ();
- my @tempmot = ();
- for my $i (0 ... $sets[0]-1){
- push(@tempmicro, "[".$microields[$i]."]");
- push(@tempmicro, $inields[$i]);
- push(@tempmot, "[".$motields[$i]."]");
- #print "adding pre-motifs number $i\n";
- }
- my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
- push (@tempmicro, $pusher);
- push(@tempmot, "[".$motields[$sets[0]]."]");
- my $outcoming = -2;
- for my $i ($sets[1]+1 ... $#microields-1){
- push(@tempmicro, "[".$microields[$i]."]");
- push(@tempmicro, $inields[$i]);
- push(@tempmot, "[".$motields[$i]."]");
- #print "adding post-motifs number $i\n";
- $outcoming = $i;
- }
- if ($outcoming != -2){
- #print "outcoming = $outcoming \n";
- push(@tempmicro, "[".$microields[$outcoming+1 ]."]");
- push(@tempmot,"[". $motields[$outcoming+1]."]");
- }
- $fields[$microsatcord] = join("",@tempmicro);
- $fields[$motifcord] = join("",@tempmot);
-
- splice(@interrtypes, $sets[0], 1);
- $fields[$interrtypecord] = join(",",@interrtypes);
- splice(@interruptions, $sets[0], 1);
- $fields[$interrcord] = join(",",@interruptions);
- splice(@inter_poses, $sets[0], 1);
- $fields[$interr_poscord] = join(",",@inter_poses);
- $no_of_interruptions = $no_of_interruptions - 1;
- }
- if ($no_of_interruptions == 0){
- $fields[$microsatcord] =~ s/^\[|\]$//sg;
- $fields[$motifcord] =~ s/^\[|\]$//sg;
- $line = join("\t", @fields[0 ... $motifcord]);
- }
- else{
- $line = join("\t", @fields);
- }
- return $line;
- }
- sub thrashallow{
- my $motif = $_[0];
- return 4 if length($motif) == 2;
- return 6 if length($motif) == 3;
- return 8 if length($motif) == 4;
-
- }
- #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
- sub multispecies_filtering_compound_microsats{
- my $unfiltered = $_[0];
- my $filtered = $_[1];
- my $residue = $_[2];
- my $no_of_species = $_[5];
- open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
- open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
- open(RES,">$residue") or die "Cannot open file $residue: $!";
-
- $infocord = 2 + (4*$no_of_species) - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
-
- my @sub_thresholds = ("0");
- push(@sub_thresholds, split(/_/,$_[3]));
- my @thresholds = ("0");
- push(@thresholds, split(/_/,$_[4]));
-
- while (my $line = <UNF>) {
- if ($line !~ /compound/){
- print FIL $line,"\n"; next;
- }
- chomp $line;
- my @fields = split(/\t/,$line);
- my $motifline = $fields[$motifcord];
- $motifline =~ s/^\[|\]$//g;
- my @motifs = split(/\]\[/,$motifline);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/^\[|\]$|-//g;
- my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
-
- my $stopper = 0;
- for my $i (0 ... $#motifs){
- my @common = ();
- my $probe = $motifs[$i].$motifs[$i];
- my $motif_size = length($motifs[$i]);
-
- for my $j (0 ... $#motifs){
- next if length($motifs[$i]) != length($motifs[$j]);
- push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i;
- }
-
- if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;}
- else {next;}
- }
-
- if ($stopper == 1){
- print RES $line,"\n";
- }
- else { print FIL $line,"\n"; }
- }
- close FIL;
- close RES;
- }
- #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
- sub chromosome_unrand_breaker{
- # print "IN chromosome_unrand_breaker: @_\n ";
- my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
- my $dir = $_[1]; ###### directory where subsets are put
- my $output2 = $_[2]; ###### list of subset files
- my $increment = $_[3];
- my $info = $_[4];
- my $chr = $_[5];
- open(SEQ,"<$input1") or die "Cannot open file $input1 $!";
-
- open(OUT,">$output2") or die "Cannot open file $output2 $!";
-
- #---------------------------------------------------------------------------------------------------
- # NOW READING THE SEQUENCE FILE
-
- my $seed = 0;
- my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
- print OUT $subset,"\n";
- open(SUB,">$subset");
-
- while(my $sine = <SEQ>){
- $seed++;
- print SUB $sine;
-
- if ($seed%$increment == 0 ){
- close SUB;
- $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
- open(SUB,">$subset");
- print SUB $sine;
- print OUT $subset,"\n";
- # print $subset,"\n";
- }
- }
- close OUT;
- close SUB;
- }
- #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
- sub multiSpecies_interruptedMicrosatHunter{
- # print "IN multiSpecies_interruptedMicrosatHunter: @_\n";
- my $input1 = $_[0]; ###### the *_sput_op4_ii file
- my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
- my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
- my $output2 = $_[3]; ###### uninterrupted microsatellite file
- my $org = $_[4];
- my $no_of_species = $_[5];
-
- my @thresholds = "0";
- push(@thresholds, split(/_/,$_[6]));
-
- # print "thresholds = @thresholds \n";
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
-
- $interr_poscord = $motifcord + 3;
- $no_of_interruptionscord = $motifcord + 4;
- $interrcord = $motifcord + 2;
- $interrtypecord = $motifcord + 1;
-
-
- $prinkter = 0;
- # print "prionkytet = $prinkter\n";
-
- open(IN,"<$input1") or die "Cannot open file $input1 $!";
- open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
-
- open(INT,">$output1") or die "Cannot open file $output2 $!";
- open(UNINT,">$output2") or die "Cannot open file $output2 $!";
-
- # print "opened files !!\n";
- my $linecounter = 0;
- my $microcounter = 0;
-
- my %micros = ();
- while (my $line = <IN>){
- # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
- $linecounter++;
- if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $3, $4, $5);
- # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1;
- push (@{$micros{$key}},$line);
- $microcounter++;
- }
- else {#print $line if $prinkter == 1;
- }
- }
- # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
- close IN;
- my @deletedlines = ();
- # print "done hash \n";
- $linecounter = 0;
- #---------------------------------------------------------------------------------------------------
- # NOW READING THE SEQUENCE FILE
- while(my $sine = <SEQ>){
- #print $linecounter,"\n" if $linecounter % 1000 == 0;
- my %microstart=();
- my %microend=();
- my @sields = split(/\t/,$sine);
- my $key = ();
- if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $key = join("\t",$1, $2, $3, $4, $5);
- # print $key, "<-<-<-<-<-<-<-<\n";
- }
-
- # $prinkter = 1 if $sine =~ /^>H\t499\t/;
-
- if (exists $micros{$key}){
- my @microstring = @{$micros{$key}};
- delete $micros{$key};
- my @filteredmicrostring;
- # print "sequence = $sields[$sequencepos]" if $prinkter == 1;
- foreach my $line (@microstring){
- $linecounter++;
- my $copy_line = $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
-
- # print $line if $prinkter == 1;
- #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE
- my $newline;
- while(1){
- # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1;
- if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
-
- $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org);
- if ($newline eq $line){$line = $newline; last;}
- else {$line = $newline;}
-
- if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
- # print "returned line from left extender= $line \n" if $prinkter == 1;
- }
- while(1){
- # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1;
- if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
-
- $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org);
- if ($newline eq $line){$line = $newline; last;}
- else {$line = $newline;}
-
- if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
- # print "returned line from right extender= $line \n" if $prinkter == 1;
- }
- # print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1;
-
- my @tempfields = split(/\t/,$line);
- if ($tempfields[$microsatcord] =~ /\[/){
- print INT $line,"\n";
- }
- else{
- print UNINT $line,"\n";
- }
-
- if ($line =~ /NULL/){ next; }
- push(@filteredmicrostring, $line);
- push (@{$microstart{$start}},$line);
- push (@{$microend{$end}},$line);
- }
-
- my $firstflag = 'down';
-
- } #if (exists $micros{$key}){
- }
- close INT;
- close UNINT;
- # print "final number of lines = $linecounter\n";
- }
- sub multiSpecies_interruptedMicrosatHunter_left_extender{
- my ($line, $seq, $org) = @_;
- # print "left extender, like passed = $line\n" if $prinkter == 1;
- # print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $rstart = $fields[$startcord];
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/\[|\]//g;
- my $rend = $rstart + length($microsat)-1;
- $microsat =~ s/-//g;
- my $motif = $fields[$motifcord];
- my $firstmotif = ();
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- }
- else {$firstmotif = $motif;}
-
- # print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1;
- my $leftphase = substr($microsat, 0,length($firstmotif));
- my $phaser = $leftphase.$leftphase;
- my @phase = split(/\s*/,$leftphase);
- my @phases;
- my @copy_phases = @phases;
- my $crawler=0;
- for (0 ... (length($leftphase)-1)){
- push(@phases, substr($phaser, $crawler, length($leftphase)));
- $crawler++;
- }
- my $start = $rstart;
- my $end = $rend;
-
- my $leftseq = substr($seq, 0, $start);
- # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1;
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $firstmotif_length = length($firstmotif);
- foreach my $phase (@phases){
- # print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1;
- # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1;
- if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
- # print "in left pattern\n" if $prinkter == 1;
- my $trapped = $1;
- my $trappedpos = length($leftseq)-length($trapped);
- my $interval = $3;
- my $intervalpos = index($trapped, $interval) + 1;
- # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1;
- my $extention = substr($trapped, 0, length($trapped)-length($interval));
- my $leftpeep = substr($seq, 0, ($start-length($trapped)));
- my @passed_overhangs;
-
- for my $i (1 ... length($phase)-1){
- my $overhang = substr($phase, -length($phase)+$i);
- # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1;
- #TEMPORARY... BETTER METHOD NEEDED
- $leftpeep =~ s/-//g;
- if ($leftpeep =~ /$overhang$/i){
- push(@passed_overhangs,$overhang);
- # print "l overhang\n" if $prinkter == 1;
- }
- }
-
- if(scalar(@passed_overhangs)>0){
- my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
- $extention = $overhang.$extention;
- $trapped = $overhang.$trapped;
- # print "trapped extended to $trapped \n" if $prinkter == 1;
- $trappedpos = length($leftseq)-length($trapped);
- }
-
- push(@extentions,$extention);
- # print "extentions = @extentions \n" if $prinkter == 1;
- push(@trappeds,$trapped );
- push(@intervalposs,length($extention)+1);
- push(@trappedposs, $trappedpos);
- # print "trappeds = @trappeds\n" if $prinkter == 1;
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds == 0)) {return $line;}
-
- ############################ my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
-
- # print "longest element found = $nikaal \n" if $prinkter == 1;
-
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
- #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
- $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
- #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
-
- $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
- #print "new fields 14 = $fields[12]\n" if $prinkter == 1;
-
- #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1;
-
- if (scalar(@fields) > $motifcord+1){
- $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- #print "new fields 14 = $fields[14]\n" if $prinkter == 1;
-
- if (scalar(@fields)>$motifcord+2){
- $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- #print "new fields 15 = $fields[15]\n" if $prinkter == 1;
- my @seventeen=();
-
- if (scalar(@fields)>$motifcord+3){
- @seventeen = split(/,/,$fields[$motifcord+3]);
- # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1;
- for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
- $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- }
-
- else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
-
- #print "new fields 16 = $fields[16]\n" if $prinkter == 1;
- #print "new fields 17 = $fields[17]\n" if $prinkter == 1;
-
- # return join("\t",@fields);
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
- }
- # print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1;
- return $returnline;
- }
- sub multiSpecies_interruptedMicrosatHunter_right_extender{
- # print "right extender\n" if $prinkter == 1;
- my ($line, $seq, $org) = @_;
- # print "in right extender... line passed = $line\n" if $prinkter == 1;
- # print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $rstart = $fields[$startcord];
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/\[|\]//g;
- my $rend = $rstart + length($microsat)-1;
- $microsat =~ s/-//g;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- if ($motif =~ /\]$/){
- $motif =~ s/\]$//g;
- $motif =~ /.*\[([a-zA-Z]+)/;
- $temp_lastmotif = $1;
- }
- else {$temp_lastmotif = $motif;}
- my $lastmotif = substr($microsat,-length($temp_lastmotif));
- # print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1;
- my $rightphase = substr($microsat, -length($lastmotif));
- my $phaser = $rightphase.$rightphase;
- my @phase = split(/\s*/,$rightphase);
- my @phases;
- my @copy_phases = @phases;
- my $crawler=0;
- for (0 ... (length($rightphase)-1)){
- push(@phases, substr($phaser, $crawler, length($rightphase)));
- $crawler++;
- }
- my $start = $rstart;
- my $end = $rend;
-
- my $rightseq = substr($seq, $end+1);
- # print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1;
- # print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1;
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $lastmotif_length = length($lastmotif);
- foreach my $phase (@phases){
- # print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1;
- # print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1;
- if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
- # print "in right pattern\n" if $prinkter == 1;
- my $trapped = $1;
- my $trappedpos = $end+1;
- my $interval = $2;
- my $intervalpos = index($trapped, $interval) + 1;
- # print "trapped = $trapped, interval = $interval\n" if $prinkter == 1;
- my $extention = substr($trapped, length($interval));
- my $rightpeep = substr($seq, ($end+length($trapped))+1);
- my @passed_overhangs = "";
-
- #TEMPORARY... BETTER METHOD NEEDED
- $rightpeep =~ s/-//g;
- for my $i (1 ... length($phase)-1){
- my $overhang = substr($phase,0, $i);
- # print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1;
- if ($rightpeep =~ /^$overhang/i){
- push(@passed_overhangs, $overhang);
- # print "r overhang\n" if $prinkter == 1;
- }
- }
- if (scalar(@passed_overhangs) > 0){
- my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
- $extention = $extention.$overhang;
- $trapped = $trapped.$overhang;
- # print "trapped extended to $trapped \n" if $prinkter == 1;
- }
-
- push(@extentions,$extention);
- #print "extentions = @extentions \n" if $prinkter == 1;
- push(@trappeds,$trapped );
- push(@intervalposs,$intervalpos);
- push(@trappedposs, $trappedpos);
- # print "trappeds = @trappeds\n" if $prinkter == 1;
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds == 0)) {return $line;}
-
- ################################### my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
-
- # print "longest element found = $nikaal \n" if $prinkter == 1;
-
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
- $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
-
- $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
-
-
- if (scalar(@fields) > $motifcord+1){
- $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
-
- if (scalar(@fields)>$motifcord+2){
- $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- my @seventeen=();
- if (scalar(@fields)>$motifcord+3){
- #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1;
- my $currpos = length($microsat)+$intervalposs[$nikaal];
- $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- }
-
- else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
-
- # print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1;
- # return join("\t",@fields);
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
- }
- # print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1;
- return $returnline;
- }
- sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/(^\[)|-//sg;
- my $motif = $fields[$motifcord];
- chomp $motif;
- # print $motif, "\n" if $motif !~ /^\[/;
- my $firstmotif = ();
- my $firststretch = ();
- my @stretches=();
-
- # print "motif = $motif, microsat = $microsat\n" if $prinkter == 1;
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//sg;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- @stretches = split(/\]/,$microsat);
- $firststretch = $stretches[0];
- #print "firststretch = $firststretch\n" if $prinkter == 1;
- }
- else {$firstmotif = $motif;$firststretch = $microsat;}
- # print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1;
- if (length($firststretch) < $thresholds[length($firstmotif)]){
- return "no";
- }
- else {return "yes";}
- }
- sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/-|(\]$)//sg;
- my $motif = $fields[$motifcord];
- chomp $motif;
- my $temp_lastmotif = ();
- my $laststretch = ();
- my @stretches=();
- if ($motif =~ /\]/){
- $motif =~ s/\]$//sg;
- $motif =~ /.*\[([a-zA-Z]+)$/;
- $temp_lastmotif = $1;
- @stretches = split(/\[/,$microsat);
- $laststretch = pop(@stretches);
- #print "last stretch = $laststretch\n" if $prinkter == 1;
- }
- else {$temp_lastmotif = $motif; $laststretch = $microsat;}
- if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
- return "no";
- }
- else { return "yes";}
- }
- sub checking_substitutions{
-
- my ($line, $seq, $startprobes, $endprobes) = @_;
- #print "sequence = $seq \n" if $prinkter == 1;
- #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n";
- # <STDIN>;
- my @seqarray = split(/\s*/,$seq);
- my @startsubst_probes = split(/\|/,$startprobes);
- my @endsubst_probes = split(/\|/,$endprobes);
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[11] - $fields[10];
- my $end = $fields[13] - $fields[10];
- my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9]
- $motif =~ s/\[|\]//g;
- my $microsat = $fields[14];
- $microsat =~ s/\[|\]//g;
- #------------------------------------------------------------------------
- # GETTING START AND END PHASES
- my $startphase = substr($microsat,0, length($motif));
- my $endphase = substr($microsat,-length($motif), length($motif));
- #print "start and end phases are - $startphase and $endphase\n";
- my $startflag = 'down';
- my $endflag = 'down';
- my $substitution_distance = length($motif);
- my $prestart = $start - $substitution_distance;
- my $postend = $end + $substitution_distance;
- my @endadds = ();
- my @startadds = ();
- if (($prestart < 0) || ($postend > scalar(@seqarray))) {
- last;
- }
- #------------------------------------------------------------------------#------------------------------------------------------------------------
- # CHECKING FOR SUBSTITUTION PROBES NOW
-
- if ($fields[8] ne "mononucleotide"){
- while ($startflag eq "down"){
- my $search = join("",@seqarray[$prestart...($start-1)]);
- #print "search is from $prestart...($start-1) = $search\n";
- foreach my $probe (@startsubst_probes){
- #print "\t\tprobe = $probe\n";
- if ($search =~ /^$probe/){
- #print "\tfound addition to the left - $search \n";
- my $copyprobe = $probe;
- my $type;
- my $subspos = 0;
- my $interruption = "";
- if ($search eq $startphase) { $type = "NONE";}
- else{
- $copyprobe =~ s/\[a-zA-Z\]/^/g;
- $subspos = index($copyprobe,"^") + 1;
- $type = "substitution";
- $interruption = substr($search, $subspos,1);
- }
- my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos);
- #print "adding information: $addinfo \n";
- push(@startadds, $addinfo);
- $prestart = $prestart - $substitution_distance;
- $start = $start-$substitution_distance;
- $startflag = 'down';
-
- last;
- }
- else{
- $startflag = 'up';
- }
- }
- }
- #<STDIN>;
- while ($endflag eq "down"){
- my $search = join("",@seqarray[($end+1)...$postend]);
- #print "search is from ($end+1)...$postend] = $search\n";
- foreach my $probe (@endsubst_probes){
- #print "\t\tprobe = $probe\n";
- if ($search =~ /$probe$/){
- my $copyprobe = $probe;
- my $type;
- my $subspos = 0;
- my $interruption = "";
- if ($search eq $endphase) { $type = "NONE";}
- else{
- $copyprobe =~ s/\[a-zA-Z\]/^/g;
- $subspos = index($copyprobe,"^") + 1;
- $type = "substitution";
- $interruption = substr($search, $subspos,1);
- }
- my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos);
- #print "adding information: $addinfo \n";
- push(@endadds, $addinfo);
- $postend = $postend + $substitution_distance;
- $end = $end+$substitution_distance;
- push(@endadds, $search);
- $endflag = 'down';
- last;
- }
- else{
- $endflag = 'up';
- }
- }
- }
- #print "startadds = @startadds, endadds = @endadds \n";
-
- }
- }
- sub microsat_packer{
- my $microsat = $_[0];
- my $addition = $_[1];
- }
- sub multiSpecies_interruptedMicrosatHunter_merge{
- $prinkter = 0;
- # print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n";
- my $line = $_[0];
- # print "sent for mering: $line \n" if $prinkter ==1;
- my @mields = split(/\t/,$line);
- my @fields = @mields;
- my $microsat = allCaps($fields[$microsatcord]);
- my $motifline = allCaps($fields[$motifcord]);
- my $microsatcopy = $microsat;
- # print "microsat = $microsat|\n" if $prinkter ==1;
- $microsatcopy =~ s/^\[|\]$//sg;
- chomp $microsatcopy;
- my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
- my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
- shift @inields;
- # print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1;
- $motifline =~ s/^\[|\]$//sg;
- my @motields = split(/\]\[/,$motifline);
- my @firstmotifs = ();
- my @lastmotifs = ();
- for my $i (0 ... $#microields){
- $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
- $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
- }
- # print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1;
- my @mergelist = ();
- my @inter_poses = split(/,/,$fields[$interr_poscord]);
- my $no_of_interruptions = $fields[$no_of_interruptionscord];
- my @interruptions = split(/,/,$fields[$interrcord]);
- my @interrtypes = split(/,/,$fields[$interrtypecord]);
- my $stopper = 0;
- for my $i (0 ... $#motields-1){
- # print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1;
- if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){
- $stopper = 1;
- push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1;
- }
- }
-
- # print "mergelist = @mergelist\n" if $prinkter ==1;
- return $line if scalar(@mergelist) == 0;
- # print "merging @mergelist\n" if $prinkter ==1;
- # <STDIN> if $prinkter ==1;
-
- foreach my $merging (@mergelist){
- my @sets = split(/_/, $merging);
- # print "sets = @sets\n" if $prinkter ==1;
- my @tempmicro = ();
- my @tempmot = ();
- # print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1;
- for my $i (0 ... $sets[0]-1){
- # print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1;
- push(@tempmicro, "[".$microields[$i]."]");
- push(@tempmicro, $inields[$i]);
- push(@tempmot, "[".$motields[$i]."]");
- # print "adding pre-motifs number $i\n" if $prinkter ==1;
- # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
- }
- # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
- # print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1;
- my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
- # print "middle is, from @motields - @sets, number 0 which is is\n";
- # print ": $motields[$sets[0]]\n";
- push (@tempmicro, $pusher);
- push(@tempmot, "[".$motields[$sets[0]]."]");
- push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]];
- my $outcoming = -2;
- # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
- # print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1;
- for my $i ($sets[1]+1 ... $#microields){
- # print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1;
- push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i];
- push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i];
- push(@tempmot, "[".$motields[$i]."]");
- # print "adding post-motifs number $i\n" if $prinkter ==1;
- $outcoming = $i;
- }
- # print "____________________________________________________________________________\n";
- $prinkter = 0;
- $fields[$microsatcord] = join("",@tempmicro);
- $fields[$motifcord] = join("",@tempmot);
- # print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1;
-
- splice(@interrtypes, $sets[0], 1);
- $fields[$interrtypecord] = join(",",@interrtypes);
- splice(@interruptions, $sets[0], 1);
- $fields[$interrcord] = join(",",@interruptions);
- splice(@inter_poses, $sets[0], 1);
- $fields[$interr_poscord] = join(",",@inter_poses);
- $no_of_interruptions = $no_of_interruptions - 1;
- }
- if ($no_of_interruptions == 0 && $line !~ /compound/){
- $fields[$microsatcord] =~ s/^\[|\]$//sg;
- $fields[$motifcord] =~ s/^\[|\]$//sg;
- $line = join("\t", @fields[0 ... $motifcord]);
- }
- else{
- $line = join("\t", @fields);
- }
- # print "post merging, the line is $line\n" if $prinkter ==1;
- #<STDIN> if $stopper ==1;
- return $line;
- }
- sub interval_asseser{
- my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3];
- }
- #---------------------------------------------------------------------------------------------------
- sub allCaps{
- my $motif = $_[0];
- $motif =~ s/a/A/g;
- $motif =~ s/c/C/g;
- $motif =~ s/t/T/g;
- $motif =~ s/g/G/g;
- return $motif;
- }
- #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
- sub merge_interruptedMicrosats{
- # print "IN merge_interruptedMicrosats: @_\n";
- my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
- my $input1 = $_[1]; ###### the *_sput_op4_ii file
- my $input2 = $_[2]; ###### the *_sput_op4_ii file
- $no_of_species = $_[3];
-
- my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward
- my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse
- my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward
- #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse
- #my $info = $_[4];
- #my @tags = split(/\t/,$info);
-
- open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
- open(INF,"<$input1") or die "Cannot open file $input1 $!";
- open(INR,"<$input2") or die "Cannot open file $input2 $!";
- open(OUTF,">$output1") or die "Cannot open file $output1 $!";
- open(OUTR,">$output2") or die "Cannot open file $output2 $!";
- open(MER,">$output3") or die "Cannot open file $output3 $!";
- #open(MERR,">$output4") or die "Cannot open file $output4 $!";
-
-
-
- $printer = 0;
-
- # print "files opened \n";
- $infocord = 2 + (4*$no_of_species) - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- $typecord = $infocord + 1;
- my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
-
- $interrtypecord = $motifcord + 1;
- $interrcord = $motifcord + 2;
- $interr_poscord = $motifcord + 3;
- $no_of_interruptionscord = $motifcord + 4;
- $mergestarts = $no_of_interruptionscord+ 1;
- $mergeends = $no_of_interruptionscord+ 2;
- $mergemicros = $no_of_interruptionscord+ 3;
-
- # NOW ADDING FORWARD MICROSATELLITES TO HASH
- my %fmicros = ();
- my $microcounter=0;
- my $linecounter = 0;
- while (my $line = <INF>){
- # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
- $linecounter++;
- if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $4, $5);
- # print $key, "#-#-#-#-#-#-#-#\n";
- push (@{$fmicros{$key}},$line);
- $microcounter++;
- }
- else {
- #print $line;
- }
- }
- # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
- close INF;
- my @deletedlines = ();
- # print "done forward hash \n";
- $linecounter = 0;
- #---------------------------------------------------------------------------------------------------
- # NOW ADDING REVERSE MICROSATELLITES TO HASH
- my %rmicros = ();
- $microcounter=0;
- while (my $line = <INR>){
- # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
- $linecounter++;
- if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $4, $5);
- # print $key, "#-#-#-#-#-#-#-#\n";
- push (@{$rmicros{$key}},$line);
- $microcounter++;
- }
- else {
- #print "cant make key\n";
- }
- }
- # print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
- close INR;
- # print "done reverse hash \n";
- $linecounter = 0;
-
- #------------------------------------------------------------------------------------------------
-
- while(my $sine = <SEQ>){
- #<STDIN> if $sine =~ /16349128/;
- next if $sine !~ /[a-zA-Z0-9]/;
- # print "-" x 150, "\n" if $printer == 1;
- my @sields = split(/\t/,$sine);
- my @merged = ();
-
- my $key = ();
-
- if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $key = join("\t",$1, $2, $4, $5);
- # print $key, "<-<-<-<-<-<-<-<\n";
- }
- # print "key = $key\n";
-
- my @sets1;
- my @sets2;
- chomp $sields[$sequencepos];
- my $rev_sequence = reverse($sields[$sequencepos]);
- $rev_sequence =~ s/ //g;
- $rev_sequence = " ".$rev_sequence;
- next if (!exists $fmicros{$key} && !exists $rmicros{$key});
-
- if (exists $fmicros{$key}){
- # print "line no : $linecount\n";
- my @raw_microstring = @{$fmicros{$key}};
- my %starts = (); my %ends = ();
- # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
- my @microstring=();
- for my $u (0 ... $#raw_microstring){
- my @tields = split(/\t/,$raw_microstring[$u]);
- next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
- push(@microstring, $raw_microstring[$u]);
- $starts{$tields[$startcord]} = $tields[$startcord];
- $ends{$tields[$endcord]} = $tields[$endcord];
- }
-
- # print "founf microstring in forward\n: @microstring\n";
- chomp @microstring;
- my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos]));
- @sets1 = split("\=", $clusterresult);
- my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp);
- # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>;
- } #if (exists $micros{$key}){
-
- if (exists $rmicros{$key}){
- # print "line no : $linecount\n";
- my @raw_microstring = @{$rmicros{$key}};
- my %starts = (); my %ends = ();
- # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
- my @microstring=();
- for my $u (0 ... $#raw_microstring){
- my @tields = split(/\t/,$raw_microstring[$u]);
- next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
- push(@microstring, $raw_microstring[$u]);
- $starts{$tields[$startcord]} = $tields[$startcord];
- $ends{$tields[$endcord]} = $tields[$endcord];
- }
- # print "founf microstring in reverse\n: @microstring\n"; <STDIN>;
- chomp @microstring;
- # print "sending reversed sequence\n";
- my $clusterresult = (find_clusters(@microstring, $rev_sequence ) );
- @sets2 = split("\=", $clusterresult);
- my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp);
- } #if (exists $micros{$key}){
-
- my @popout1 = ();
- my @popout2 = ();
- my @forwardset = ();
- if (exists $sets2[1] ){
- if(exists $sets1[0]) {
- push (@popout1, $sets1[0],$sets2[1]);
- my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));#
- print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n";
- my @localmerged = split("_X0X_", $forwardset[1]);
- my $sequence = $sields[$sequencepos];
- $sequence =~ s/ //g;
- # print "\nforwardset = @forwardset\n";
- for my $j (0 ... $#localmerged){
- $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
- }
-
- push (@merged, @localmerged);
-
- }
- else{
- my @localmerged = split("_X0X_", $sets2[1]);
- my $sequence = $sields[$sequencepos];
- $sequence =~ s/ //g;
- for my $j (0 ... $#localmerged){
- # print "\nlocalmerged = @localmerged\n";
- $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
- }
-
- push (@merged, @localmerged);
- }
- }
- elsif (exists $sets1[0]){
- print OUTF join("\n",split("_X0X_", $sets1[0])), "\n";
- }
-
- my @reverseset= ();
- if (exists $sets1[1]){
- if (exists $sets2[0]){
- push (@popout2, $sets2[0],$sets1[1]);
- # print "popout2 = @popout2\n";
- my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos]));
- #print "reverseset = $reverseset[1] < --- reverseset1\n";
- print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n";
- push(@merged, (split("_X0X_", $reverseset[1])));
- }
- else{
- push(@merged, (split("_X0X_", $sets1[1])));
- }
- }
- elsif (exists $sets2[0]){
- print OUTR join("\n",split("_X0X_", $sets2[0])), "\n";
-
- }
-
- if (scalar @merged > 0){
- my @filtered_merged = split("__",(filterDuplicates_merged(@merged)));
- print MER join("\n", @filtered_merged),"\n";
- }
- # <STDIN> if $sine =~ /16349128/;
-
- }
- close(SEQ);
- close(INF);
- close(INR);
- close(OUTF);
- close(OUTR);
- close(MER);
- }
- sub find_clusters{
- my @input = @_;
- my $sequence = pop(@input);
- $sequence =~ s/ //g;
- my @microstring0 = @input;
- # print "IN: find_clusters:\n";
- my %microstart=();
- my %microend=();
- my @nonmerged = ();
- my @mergedSet = ();
- # print "set of microsats = @microstring \n";
- my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0;
- # print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1;
- #<STDIN> if $printer == 1;
- my @tempmicrostring = @microstring;
- foreach my $line (@tempmicrostring){
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/;
- # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
- push (@{$microstart{$start}},$line);
- push (@{$microend{$end}},$line);
- }
- my $firstflag = 'down';
- while( my $line =shift(@microstring)){
- # print "-----------\nline = $line \n" if $printer == 1;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ;
- my $startmicro = $line;
- my $endmicro = $line;
- # print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
- delete ($microstart{$start});
- delete ($microend{$end});
- my $flag = 'down';
- my $startflag = 'down';
- my $endflag = 'down';
- my $prestart = $start - $distance;
- my $postend = $end + $distance;
- my @compoundlines = ();
- my %compoundhash = ();
- push (@compoundlines, $line);
- push (@{$compoundhash{$line}},$line);
- my $startrank = 1;
- my $endrank = 1;
- while( ($startflag eq "down") || ($endflag eq "down") ){
- # print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1;
- if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){
- # print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1;
- last;
- }
-
- $firstflag = "up";
- if ($startflag eq "down"){
- for my $i ($prestart ... $end){
- if(exists $microend{$i}){
- chomp $microend{$i}[0];
- if(exists $compoundhash{$microend{$i}[0]}) {next;}
- chomp $microend{$i}[0];
- push(@compoundlines, $microend{$i}[0]);
- my @tields = split(/\t/,$microend{$i}[0]);
- $startmicro = $microend{$i}[0];
- chomp $startmicro;
- $flag = 'down';
- $startrank++;
- # print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1;
- delete $microend{$i};
- delete $microstart{$tields[$startcord]};
- $end = $tields[$endcord];
- $startflag = 'down';
- $prestart = $tields[$startcord] - $distance;
- last;
- }
- else{
- $flag = 'up';
- $startflag = 'up';
- }
- }
- }
-
- if ($endflag eq "down"){
- for my $i ($start ... $postend){
- # print "$start ----> $i -----> $postend\n" if $printer == 1;
- if(exists $microstart{$i} ){
- chomp $microstart{$i}[0];
- if(exists $compoundhash{$microstart{$i}[0]}) {next;}
- chomp $microstart{$i}[0];
- push(@compoundlines, $microstart{$i}[0]);
- my @tields = split(/\t/,$microstart{$i}[0]);
- $endmicro = $microstart{$i}[0];
- $endrank++;
- chomp $endmicro;
- $flag = 'down';
- # print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1;
-
- delete $microstart{$i} if exists $microstart{$i} ;
- delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]};
- # print "done\n" if $printer == 1;
-
- shift @microstring;
- $end = $tields[$endcord];
- $postend = $tields[$endcord] + $distance;
- $endflag = 'down';
- last;
- }
- else{
- $flag = 'up';
- $endflag = 'up';
- }
- # print "out of the if\n" if $printer == 1;
- }
- # print "out of the for\n" if $printer == 1;
- }
- # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
- } #end while( $flag eq "down")
- # print "compoundlines = @compoundlines \n" if $printer == 1;
- if (scalar (@compoundlines) == 1){
- push(@nonmerged, $line);
- }
- if (scalar (@compoundlines) > 1){
- # print "FROM CLUSTERER\n" if $printer == 1;
- push(@mergedSet,merge_microsats(@compoundlines, $sequence) );
- }
- } #end foreach my $line (@microstring){
- # print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1;
- #<STDIN> if scalar(@mergedSet) > 0;
- # print "EXIT: find_clusters\n";
- return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet));
- }
- sub custom {
- $a->[$startcord+1] <=> $b->[$startcord+1];
- }
- sub popOuter {
- # print "\nIN: popOuter @_\n"; <STDIN>;
- my @all = split ("_X0X_",$_[0]);
- # <STDIN> if !defined $_[0];
- my @merged = split ("_X0X_",$_[1]);
- my $sequence = $_[2];
- my $seqlen = length($sequence);
- my %microstart=();
- my %microend=();
- my @mergedSet = ();
- my @nonmerged = ();
-
- foreach my $line (@all){
- my @fields = split(/\t/,$line);
- my $start = $seqlen - $fields[$startcord]+ 1;
- my $end = $seqlen - $fields[$endcord] + 1;
- push (@{$microstart{$start}},$line);
- push (@{$microend{$end}},$line);
- }
- my $firstflag = 'down';
- my %forPopouting = ();
- while( my $line =shift(@merged)){
- # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- my $startmicro = $line;
- my $endmicro = $line;
-
-
- delete ($microstart{$start});
- delete ($microend{$end});
- my $flag = 'down';
- my $startflag = 'down';
- my $endflag = 'down';
- my $prestart = $start - $distance;
- my $postend = $end + $distance;
- my @compoundlines = ();
- my %compoundhash = ();
- push (@compoundlines, $line);
- my $startrank = 1;
- my $endrank = 1;
-
- # print "\nstart = $start, end = $end\n";
- # <STDIN>;
- for my $i ($start ... $end){
- if(exists $microend{$i}){
- # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n";
- chomp $microend{$i}[0];
- my @fields = split(/\t/,$microend{$i}[0]);
- delete $microstart{$seqlen - $fields[$startcord] + 1};
- my $invertseq = $sequence;
- $invertseq =~ s/ //g;
- push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq ));
- delete $microend{$i};
-
- }
- if(exists $microstart{$i} ){
- # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n";
- chomp $microstart{$i}[0];
- my @fields = split(/\t/,$microstart{$i}[0]);
- delete $microend{$seqlen - $fields[$endcord] + 1};
- my $invertseq = $sequence;
- $invertseq =~ s/ //g;
- push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) );
- delete $microstart{$i};
- }
- }
-
- if (scalar (@compoundlines) == 1){
- push(@mergedSet,join("\t",@compoundlines) );
- }
- else {
- # print "FROM POPOUTER\n" if $printer == 1;
- push(@mergedSet, merge_microsats(@compoundlines, $sequence) );
- }
- }
-
- foreach my $key (sort keys %microstart) {
- push(@nonmerged,$microstart{$key}[0]);
- }
-
- return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) );
- }
- sub invert_justCoordinates{
- my $microsat = $_[0];
- # print "IN invert_justCoordinates ... @_\n" ; <STDIN>;
- chomp $microsat;
- my $seqLength = $_[1];
- my @fields = split(/\t/,$microsat);
- my $start = $seqLength - $fields[$endcord] + 1;
- my $end = $seqLength - $fields[$startcord] + 1;
- $fields[$startcord] = $start;
- $fields[$endcord] = $end;
- $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
- # print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1;
- return join("\t",@fields);
- }
- sub largest_number{
- my $counter = 0;
- my($max) = shift(@_);
- foreach my $temp (@_) {
- #print "finding largest array: $maxcounter \n";
- if($temp > $max){
- $max = $temp;
- }
- }
- return($max);
- }
- sub smallest_number{
- my $counter = 0;
- my($min) = shift(@_);
- foreach my $temp (@_) {
- #print "finding largest array: $maxcounter \n";
- if($temp < $min){
- $min = $temp;
- }
- }
- return($min);
- }
- sub filterDuplicates_merged{
- my @merged = @_;
- my %revmerged = ();
- my @fmerged = ();
- foreach my $micro (@merged) {
- my @fields = split(/\t/,$micro);
- if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){
- my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
- # print "adding ... \n$key\n$micro\n";
- push(@{$revmerged{$key}}, $micro);
- }
- else{
- # print "pushing.. $micro\n";
- push(@fmerged, $micro);
- }
- }
- # print "\n";
- foreach my $micro (@fmerged) {
- my @fields = split(/\t/,$micro);
- my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
- # print "searching for key $key\n";
- if (exists $revmerged{$key}){
- # print "deleting $revmerged{$key}[0]\n";
- delete $revmerged{$key};
- }
- }
- foreach my $key (sort keys %revmerged) {
- push(@fmerged,$revmerged{$key}[0]);
- }
- # print "returning ", join("\n", @fmerged),"\n" ;
- return join("__", @fmerged);
- }
- sub invert_microsat{
- my $micro = $_[0];
- chomp $micro;
- if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;}
- else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; }
- my $sequence = $_[1];
- $sequence =~ s/ //g;
- my $seqlen = length($sequence);
- my @fields = split(/\t/,$micro);
- my $start = $seqlen - $fields[$endcord] +1;
- my $end = $seqlen - $fields[$startcord] +1;
- $fields[$startcord] = $start;
- $fields[$endcord] = $end;
- $fields[$motifcord] = reverse_micro($fields[$motifcord]);
- $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
- if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){
- my @intertypes = split(/,/,$fields[$interrtypecord]);
- my @inters = split(/,/,$fields[$interrcord]);
- my @interposes = split(/,/,$fields[$interr_poscord]);
- $fields[$interrtypecord] = join(",",reverse(@intertypes));
- $fields[$no_of_interruptionscord] = scalar(@interposes);
- for my $i (0 ... $fields[$no_of_interruptionscord]-1){
- if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){
- $inters[$i] = reverse($inters[$i]);
- $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1;
- }
- else{
- $inters[$i] = "";
- $interposes[$i] = $interposes[$i] - 1;
- }
- $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1;
- }
-
- $fields[$interrcord] = join(",",reverse(@inters));
- $fields[$interr_poscord] = join(",",reverse(@interposes));
- }
-
- my $finalmicrosat = join("\t", @fields);
- return $finalmicrosat;
- }
- sub reverse_micro{
- my $micro = reverse($_[0]);
- my @strand = split(/\s*/,$micro);
- for my $i (0 ... $#strand){
- if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
- if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
- }
- return join("",@strand);
- }
- #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
- sub forward_reverse_sputoutput_comparer {
- # print "IN forward_reverse_sputoutput_comparer: @_\n";
- my $input0 = $_[0]; ###### the *nogap_unrand_match file
- my $input1 = $_[1]; ###### the real file, *sput* data
- my $input2 = $_[2]; ###### the reverse file, *sput* data
- my $output1 = $_[3]; ###### microsats different in real file
- my $output2 = $_[4]; ###### microsats missing in real file
- my $output3 = $_[5]; ###### microsats common among real and reverse file
- my $no_of_species = $_[6];
-
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
- $interrtypecord = $motifcord + 1;
- $interrcord = $motifcord + 2;
- $interr_poscord = $motifcord + 3;
- $no_of_interruptionscord = $motifcord + 4;
- $mergestarts = $no_of_interruptionscord+ 1;
- $mergeends = $no_of_interruptionscord+ 2;
- $mergemicros = $no_of_interruptionscord+ 3;
-
-
- open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
- open(INF,"<$input1") or die "Cannot open file $input1 $!";
- open(INR,"<$input2") or die "Cannot open file $input2 $!";
-
- open(DIFF,">$output1") or die "Cannot open file $output1 $!";
- #open(MISS,">$output2") or die "Cannot open file $output2 $!";
- open(SAME,">$output3") or die "Cannot open file $output3 $!";
-
-
- # print "opened files \n";
- my $linecounter = 0;
- my $fcounter = 0;
- my $rcounter = 0;
-
- $printer = 0;
- #---------------------------------------------------------------------------------------------------
- # NOW ADDING FORWARD MICROSATELLITES TO HASH
- my %fmicros = ();
- my $microcounter=0;
- while (my $line = <INF>){
- $linecounter++;
- if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
- # print $key, "#-#-#-#-#-#-#-#\n";
- push (@{$fmicros{$key}},$line);
- $microcounter++;
- }
- else {
- #print $line;
- }
- }
- # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
- close INF;
- my @deletedlines = ();
- # print "done forward hash \n";
- $linecounter = 0;
- #---------------------------------------------------------------------------------------------------
- # NOW ADDING REVERSE MICROSATELLITES TO HASH
- my %rmicros = ();
- $microcounter=0;
- while (my $line = <INR>){
- $linecounter++;
- if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
- # print $key, "#-#-#-#-#-#-#-#\n";
- push (@{$rmicros{$key}},$line);
- $microcounter++;
- }
- else {}
- }
- # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
- close INR;
- # print "done reverse hash \n";
- $linecounter = 0;
- #---------------------------------------------------------------------------------------------------
- #---------------------------------------------------------------------------------------------------
- # NOW READING THE SEQUENCE FILE
- while(my $sine = <SEQ>){
- my %microstart=();
- my %microend=();
- my @sields = split(/\t/,$sine);
- my $key = ();
- if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
- }
- else {
- next;
- }
- $printer = 0;
- my $sequence = $sields[$sequencepos];
- chomp $sequence;
- $sequence =~ s/ //g;
- my @localfs = ();
- my @localrs = ();
-
- if (exists $fmicros{$key}){
- @localfs = @{$fmicros{$key}};
- delete $fmicros{$key};
- }
-
- my %forwardstarts = ();
- my %forwardends = ();
-
- foreach my $f (@localfs){
- my @fields = split(/\t/,$f);
- push (@{$forwardstarts{$fields[$startcord]}},$f);
- push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]);
- }
-
- if (exists $rmicros{$key}){
- @localrs = @{$rmicros{$key}};
- delete $rmicros{$key};
- }
- else{
- }
-
- foreach my $r (@localrs){
- chomp $r;
- my @rields = split(/\t/,$r);
- # print "rields = @rields\n" if $printer == 1;
- my $reciprocalstart = length($sequence) - $rields[$endcord] + 1;
- my $reciprocalend = length($sequence) - $rields[$startcord] + 1;
- # print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1;
- my $microsat = reverse_micro(all_caps($rields[$microsatcord]));
- my @localcollection=();
- for my $i ($reciprocalstart+1 ... $reciprocalend-1){
- if (exists $forwardstarts{$i}){
- push(@localcollection, $forwardstarts{$i}[0] );
- delete $forwardstarts{$i};
- }
- if (exists $forwardends{$i}){
- next if !exists $forwardstarts{$forwardends{$i}[0]};
- push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] );
- }
- }
- if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);}
-
- if (scalar(@localcollection) == 0){
- print SAME invert_microsat($r,($sequence) ), "\n";
- }
-
- elsif (scalar(@localcollection) == 1){
- # print "f microsat = $localcollection[0]\n" if $printer == 1;
- my @lields = split(/\t/,$localcollection[0]);
- $lields[$microsatcord]=all_caps($lields[$microsatcord]);
- # print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1;
- # print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1;
- if ($microsat eq $lields[$microsatcord]){
- chomp $localcollection[0];
- print SAME $localcollection[0], "\n";
- }
- if ($microsat ne $lields[$microsatcord]){
- chomp $localcollection[0];
- my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence);
- # print "newmicro = $newmicro\n" if $printer == 1;
- if ($newmicro =~ /[a-zA-Z]/){
- print SAME $newmicro,"\n";
- }
- else{
- print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n";
- # print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1;
- # print "@rields\n@lields\n" if $printer == 1;
- }
- }
- }
- else{
- # print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1;
- }
- }
- }
-
- close(SEQ);
- close(INF);
- close(INR);
- close(DIFF);
- close(SAME);
-
- }
- sub all_caps{
- my @strand = split(/\s*/,$_[0]);
- for my $i (0 ... $#strand){
- if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
- if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
- if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
- if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
- }
- return join("",@strand);
- }
- sub microsatChooser{
- my $forward = $_[0];
- my $reverse = $_[1];
- my $sequence = $_[2];
- my $seqLength = length($sequence);
- $sequence =~ s/ //g;
- my @fields = split(/\t/,$forward);
- my @rields = split(/\t/,$reverse);
- my $r_start = $seqLength - $rields[$endcord] + 1;
- my $r_end = $seqLength - $rields[$startcord] + 1;
-
-
- my $f_microsat = $fields[$microsatcord];
- my $r_microsat = $rields[$microsatcord];
-
- if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) {
- return $forward if length($f_microsat) >= length($r_microsat);
- return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat);
- }
- return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord];
- my $f_microsat_copy = $f_microsat;
- my $r_microsat_copy = $r_microsat;
- $f_microsat_copy =~ s/^\[|\]$//g;
- $r_microsat_copy =~ s/^\[|\]$//g;
- my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
- my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
- my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
- my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
-
- my $f_motif = $fields[$motifcord];
- my $r_motif = $rields[$motifcord];
- my $f_motif_copy = $f_motif;
- my $r_motif_copy = $r_motif;
- $f_motif_copy =~ s/^\[|\]$//g;
- $r_motif_copy =~ s/^\[|\]$//g;
- my @f_motields = split(/\]\[/,$f_motif_copy);
- my @r_motields = split(/\]\[/,$r_motif_copy);
- my $f_purestretch = join("",@f_microields);
- my $r_purestretch = join("",@r_microields);
- if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){
- # print "now.. studying $forward\n$reverse\n" if $printer == 1;
- if ($fields[$typecord] eq $rields[$typecord]){
- # print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
- if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
- my $subset_answer = isSubset($forward, $reverse, $seqLength);
- # print "subset answer = $subset_answer\n" if $printer == 1;
- return $forward if $subset_answer == 1;
- return invert_microsat($reverse, $sequence) if $subset_answer == 2;
- return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
- return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
- return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch);
- return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch);
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ;
- }
- elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
- return $forward;
- }
- elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
- return invert_microsat($reverse, $sequence);
- }
- }
- else{
- my $fmotlen = ();
- my $rmotlen = ();
- $fmotlen =1 if $fields[$typecord] eq "mononucleotide";
- $fmotlen =2 if $fields[$typecord] eq "dinucleotide";
- $fmotlen =3 if $fields[$typecord] eq "trinucleotide";
- $fmotlen =4 if $fields[$typecord] eq "tetranucleotide";
- $rmotlen =1 if $rields[$typecord] eq "mononucleotide";
- $rmotlen =2 if $rields[$typecord] eq "dinucleotide";
- $rmotlen =3 if $rields[$typecord] eq "trinucleotide";
- $rmotlen =4 if $rields[$typecord] eq "tetranucleotide";
-
- if ($fmotlen < $rmotlen){
- if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){
- return $forward;
- }
- else{
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- }
- if ($fmotlen > $rmotlen){
- if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){
- return invert_microsat($reverse, $sequence);
- }
- else{
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- }
- }
- }
- if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){
- # print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
- if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
- my $subset_answer = isSubset($forward, $reverse, $seqLength);
- # print "subset answer = $subset_answer\n" if $printer == 1;
- return $forward if $subset_answer == 1;
- return invert_microsat($reverse, $sequence) if $subset_answer == 2;
- # print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1;
- return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
- return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
- if ($subset_answer == 3){
- if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){
- if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){
- return $forward;
- }
- else{
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- }
- if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){
- if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){
- return invert_microsat($reverse, $sequence);
- }
- else{
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- }
- }
- }
- elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
- return $forward;
- }
- elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
- return invert_microsat($reverse, $sequence);
- }
-
- }
-
- if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){
- # print "one compound, one nucleotide\n" if $printer == 1;
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){
- # print "one compound, one nucleotide\n" if $printer == 1;
- return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
- }
- }
- sub isSubset{
- my $forward = $_[0]; my @fields = split(/\t/,$forward);
- my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
- my $seqLength = $_[2];
- my $r_start = $seqLength - $rields[$endcord] + 1;
- my $r_end = $seqLength - $rields[$startcord] + 1;
- # print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1;
- return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end;
- return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end;
- return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord];
- return "3";
- }
- sub motifBYmotif_match{
- my $forward = $_[0];
- my $reverse = $_[1];
- $forward =~ s/^\[|\]$//g;
- $reverse =~ s/^\[|\]$//g;
- my @f_motields=split(/\]\[/, $forward);
- my @r_motields=split(/\]\[/, $reverse);
- my $finalresult = 0;
- if (scalar(@f_motields) != scalar(@r_motields)){
- my $subresult = 0;
- my @mega = (); my @sub = ();
- @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields);
- @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields);
- @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields);
- @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields);
-
- for my $i (0 ... $#sub){
- my $probe = $sub[$i].$sub[$i];
- # print "probing $probe and $mega[$i]\n" if $printer == 1;
- if ($probe =~ /$mega[$i]/) {$subresult = 1; }
- else {$subresult = 0; last; }
- }
-
- return 0 if $subresult == 0;
- return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f
- return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse
-
- }
- else{
- for my $i (0 ... $#f_motields){
- my $probe = $f_motields[$i].$f_motields[$i];
- if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;}
- else {$finalresult = 0 ;last;}
- }
- }
- # print "finalresult = $finalresult\n" if $printer == 1;
- return $finalresult;
- }
- sub merge_microsats{
- my @input = @_;
- my $sequence = pop(@input);
- $sequence =~ s/ //g;
- my @seq_string = @input;
- # print "IN: merge_microsats\n";
- # print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n";
- my $start;
- my $end;
- my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string;
- # print "\nrearranged into @micros \n";
- my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = ();
- my @fields = ();
- for my $i (0 ... $#micros){
- chomp $micros[$i];
- @fields = split(/\t/,$micros[$i]);
- push(@types, $fields[$typecord]);
- push(@motifs, $fields[$motifcord]);
- if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);}
- else { push(@interruptiontypes, "NA"); }
- if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);}
- else { push(@interruptions, "NA"); }
- if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);}
- else { push(@interrposes, "NA"); }
- if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);}
- else { push(@no_of_interruptions, "NA"); }
- if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));}
- else { push(@mergestart, $fields[$startcord]); }
- if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));}
- else { push(@mergeend, $fields[$endcord]); }
- if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);}
- else { push(@mergemicro, $fields[$microsatcord]); }
-
- }
- $start = smallest_number(@mergestart);
- $end = largest_number(@mergeend);
- my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]";
- my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro));
- return $microsat;
- }
- sub slided_microsat{
- my $forward = $_[0]; my @fields = split(/\t/,$forward);
- my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
- my $seqLength = $_[2];
- my $r_start = $seqLength - $rields[$endcord] + 1;
- my $r_end = $seqLength - $rields[$startcord] + 1;
- my $motlen =();
- $motlen =1 if $fields[$typecord] eq "mononucleotide";
- $motlen =2 if $fields[$typecord] eq "dinucleotide";
- $motlen =3 if $fields[$typecord] eq "trinucleotide";
- $motlen =4 if $fields[$typecord] eq "tetranucleotide";
-
- if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) {
- return 0;
- }
- else{
- return 1;
- }
-
- }
- #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
- sub new_multispecies_t10{
- my $input1 = $_[0]; #gap_op_unrand_match
- my $input2 = $_[1]; #sput
- my $output = $_[2]; #output
- my $bin = $output."_bin";
- my $orgs = join("|",split(/\./,$_[3]));
- my @organisms = split(/\./,$_[3]);
- my $no_of_species = scalar(@organisms); #3
- my $t10info = $output."_info";
- $prinkter = 0;
-
- open (MATCH, "<$input1");
- open (SPUT, "<$input2");
- open (OUT, ">$output");
- open (INFO, ">$t10info");
- sub microsat_bracketer;
- sub custom;
- my %seen = ();
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
- #---------------------------------------------------------------------------------------------------------------#
- # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES #
- #---------------------------------------------------------------------------------------------------------------#
- my $linecounter = 0;
- my $microcounter = 0;
- while (my $line = <SPUT>){
- chomp $line;
- # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
- next if $line !~ /[0-9a-z]+/;
- $linecounter++;
- # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
- # print $key, "#-#-#-#-#-#-#-#\n";
- if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $3, $4, $5);
- # print "key = $key\n" if $prinkter == 1;
- push (@{$seen{$key}},$line);
- $microcounter++;
- }
- else {
- #print "could not make ker in SPUT : \n$line \n";
- }
- }
- # print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
- # print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
- close SPUT;
-
- #----------------------------------------------------------------------------------------------------------------
-
- #-------------------------------------------------------------------------------------------------------#
- # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY #
- # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN #
- # SECOND FILE FOLLOWED BY #
- # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION #
- #-------------------------------------------------------------------------------------------------------#
- my $ref = 0;
- my $ref2 = 0;
- my $ref3 = 0;
- my $ref4 = 0;
- my $deletes= 0;
- my $duplicates = 0;
- my $neighbors = 0;
- my $tooshort = 0;
- my $prevmicrol=();
- my $startnotfound = 0;
- my $matchkeysformed = 0;
- my $keysused = 0;
-
- while (my $line = <MATCH>) {
- # print colored ['magenta'], $line if $prinkter == 1;
- next if $line !~ /[a-zA-Z0-9]/;
- chomp $line;
- my @fields2 = split(/\t/,$line);
- my $key2 = ();
- # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
- if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $matchkeysformed++;
- $key2 = join("\t",$1, $2, $3, $4, $5);
- # print "key = $key2 \n" if $prinkter == 1;
- }
- else{
- # print "could not make ker in SEQ : $line\n";
- next;
- }
- my $sequence = $fields2[$sequencepos];
- $sequence =~ s/\*/-/g;
- my $count = 0;
- if (exists $seen{$key2}){
- $keysused++;
- my @unsorted_raw = @{$seen{$key2}};
- delete $seen{$key2};
- my @sequencearr = split(/\s*/, $sequence);
-
- # print "sequencearr = @sequencearr\n" if $prinkter == 1;
-
- my $counter;
-
- my %start_database = ();
- my %end_database = ();
- foreach my $uns (@unsorted_raw){
- my @uields = split(/\t/,$uns);
- $start_database{$uields[$startcord]} = $uns;
- $end_database{$uields[$endcord]} = $uns;
- }
-
- my @unsorted = ();
- my %starts = (); my %ends = ();
- # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;}
- for my $u (0 ... $#unsorted_raw){
- my @tields = split(/\t/,$unsorted_raw[$u]);
- next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
- push(@unsorted, $unsorted_raw[$u]);
- $starts{$tields[$startcord]} = $unsorted_raw[$u];
- # print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1;
- }
-
- my $basecounter= 0;
- my $gapcounter = 0;
- my $poscounter = 0;
-
- for my $s (@sequencearr){
-
- $poscounter++;
- if ($s eq "-"){
- $gapcounter++; next;
- }
- else{
- $basecounter++;
- }
-
-
- #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
- #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
- #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1;
-
- if (exists $starts{$basecounter}){
- my $locus = $starts{$basecounter};
- # print "locus identified = $locus\n" if $prinkter == 1;
- my @fields3 = split(/\t/,$locus);
- my $start = $fields3[$startcord];
- my $end = $fields3[$endcord];
- my $motif = $fields3[$motifcord];
- my $microsat = $fields3[$microsatcord];
- my @leftbracketpos = ();
- my @rightbracketpos = ();
- my $bracket_picker = 'no';
- my $leftbrackets=();
- my $rightbrackets = ();
- my $micro_cpy = $microsat;
- # print "microsat = $microsat\n" if $prinkter == 1;
- while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';}
- while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);}
- $microsat =~ s/[\[\]\-\*]//g;
- # print "microsat = $microsat\n" if $prinkter == 1;
- my $human_search = join '-*', split //, $microsat;
- my $temp = substr($sequence, $poscounter-1);
- # print "with poscounter = $poscounter\n" if $prinkter == 1;
- my $search_result = ();
- my $posnow = ();
-
- # print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search;
- # <STDIN> if !defined $temp || !defined $human_search;
-
- while ($temp =~ /($human_search)/gi){
- $search_result = $1;
- # $posnow = pos($temp);
- last;
- }
-
- my @gapspos = ();
- next if !defined $search_result;
-
- while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); }
- my $gaps = join("__",@gapspos);
-
- my $final_microsat = $search_result;
- if ($bracket_picker eq "yes"){
- $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets);
- }
-
- my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat);
-
- if ($bracket_picker eq "yes") {
- $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]);
- }
- print OUT $outsentence,"\n";
- }
- }
- }
- }
- my $unusedkeys = scalar(keys %seen);
- # print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
- # print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
- # print INFO "microsatellites too short = $tooshort\n";
- # print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
-
- #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
- #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
- #print "microsatellites too short = $tooshort\n";
- #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
- #print "unused keys = \n",join("\n", (keys %seen)),"\n";
- close (MATCH);
- close (SPUT);
- close (OUT);
- close (INFO);
- }
- sub microsat_bracketer{
- # print "in bracketer: @_\n";
- my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_;
- my @gaps = split(/__/,$gapspos);
- my @lefts = split(/__/,$leftbracketpos);
- my @rights = split(/__/,$rightbracketpos);
- my @new=();
- my $pure = $microsat;
- $pure =~ s/-//g;
- my $off = 0;
- my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights);
- push(@gaps, 0);
- push(@lefts,0);
- push(@rights,0);
-
- for my $i (1 ... $finallength){
- # print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n";
- if($rights[0] == $i){
- # print "pushed a ]\n";
- push(@new, "]");
- shift(@rights);
- push(@rights,0);
- for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
- next;
- }
- if($gaps[0] == $i){
- # print "pushed a -\n";
- push(@new, "-");
- shift(@gaps);
- push(@gaps, 0);
- for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;}
- for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;}
- next;
- }
- if($lefts[0] == $i){
- # print "pushed a [\n";
- push(@new, "[");
- shift(@lefts);
- push(@lefts,0);
- for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
- next;
- }
- else{
- my $pushed = substr($pure,$off,1);
- $off++;
- push(@new,$pushed );
- # print "pushed an alphabet, now new = @new, pushed = $pushed\n";
- next;
- }
- }
- my $returnmicrosat = join("",@new);
- # print "final microsat = $returnmicrosat \n";
- return($returnmicrosat);
- }
- #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
- sub multiSpecies_orthFinder4{
- #print "IN multiSpecies_orthFinder4: @_\n";
- my @handles = ();
- #1 SEPT 30TH 2008
- #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER
- #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL
- #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST
- #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED
- #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY
- #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES.
- #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE
- #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE
- #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN
- #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl
-
- #1 trial run:
- #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50
-
- $prinkter=0;
-
- #############
- my $CLUSTER_DIST = $_[4];
- #############
-
-
- my $aligns = $_[0];
- my @micros = split(/:/, $_[1]);
- my $orth = $_[2];
- #my $not_orth = "notorth";
- @tags = split(/:/, $_[3]);
-
- $no_of_species=scalar(@tags);
- my $junkfile = $orth."_junk";
- #open(JUNK,">$junkfile");
-
- #my $info = $output1."_info";
- #print "inputs are : \n"; foreach(@micros){print $_,"\n";}
- #print "info = @_\n";
-
-
- open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
- open (ORTH, ">$orth");
- my $output=$orth."_out";
- open (OUTP, ">$output");
-
-
- #open (NORTH, ">$not_orth");
- #open (INF, ">$info");
- my $i = 0;
- foreach my $path (@micros){
- $handles[$i] = IO::Handle->new();
- open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!";
- $i++;
- }
-
- #print "Opened files\n";
-
-
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $motifcord = $typecord + 1;
- $gapcord = $motifcord+1;
- $startcord = $gapcord + 1;
- $strandcord = $startcord + 1;
- $endcord = $strandcord + 1;
- $microsatcord = $endcord + 1;
- $sequencepos = 2 + (4*$no_of_species) + 1 -1 ;
- #$sequencepos = 17;
- # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES
- #----------------------------------------------------------------------------------------------------------------
- my @hasharr = ();
- foreach my $path (@micros){
- open(READ, "<$path") or die "Cannot open file $path :$!";
- my %single_hash = ();
- my $key = ();
- my $counter = 0;
- while (my $line = <READ>){
- $counter++;
- # print $line;
- chomp $line;
- my @fields1 = split(/\t/,$line);
- if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
- $key = join("\t",$1, $2, $4, $5);
-
- # print "key = : $key\n" if $prinkter == 1;
-
- # print $line if $prinkter == 1;
- push (@{$single_hash{$key}},$line);
- }
- else{
- # print "microsat line incompatible\n";
- }
- }
- push @hasharr, {%single_hash};
- # print "@{$single_hash{$key}} \n";
- # print "done $path: counter = $counter\n" if $prinkter == 1;
- close READ;
- }
- # print "Done hashes\n";
- #----------------------------------------------------------------------------------------------------------------
- my $question=();
- #----------------------------------------------------------------------------------------------------------------
- my @contigstarts = ();
- my @contigends = ();
-
- my %contigclusters = ();
- my %contigclustersFirstStartOnly=();
- my %contigclustersLastEndOnly=();
- my %contigclustersLastEndLengthOnly=();
- my %contigclustersFirstStartLengthOnly=();
- my %contigpath=();
- my $dotcounter = 0;
- while (my $line = <BO>){
- # print "x" x 60, "\n" if $prinkter == 1;
- $dotcounter++;
- # print "." if $dotcounter % 100 ==0;
- # print "\n" if $dotcounter % 5000 ==0;
- next if $line !~ /^[0-9]+/;
- # print $line if $prinkter == 1;
- chomp $line;
- my @fields2 = split(/\t/,$line);
- my $key2 = ();
- if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
- $key2 = join("\t",$1, $2, $4, $5);
- }
- else {
- # print "seq line $line incompatible\n" if $prinkter == 1;
- next;}
-
-
-
-
-
-
- my @sequences = ();
- for (0 ... $#tags){
- my $seq = <BO>;
- # print $seq;
- chomp $seq;
- push(@sequences , " ".$seq);
- }
- my @origsequences = @sequences;
- my $seqcopy = $sequences[0];
- my @strings = ();
- $seqcopy =~ s/[a-zA-Z]|-/x/g;
- my @string = split(/\s*/,$seqcopy);
-
- for my $s (0 ... $#tags){
- $sequences[$s] =~ s/-//g;
- $sequences[$s] =~ s/[a-zA-Z]/x/g;
- # print "length of sequence = ",length($sequences[$s]),"\n";
- my @tempstring = split(/\s*/,$sequences[$s]);
- push(@strings, [@tempstring])
-
- }
-
- my @species_list = ();
- my @micro_count = 0;
- my @starthash = ();
- my $stopper = 1;
- my @endhash = ();
-
- my @currentcontigstarts=();
- my @currentcontigends=();
- my @currentcontigchrs=();
-
- for my $i (0 ... $#tags){
- # print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1;
- my @temparr = ();
-
- if (exists $hasharr[$i]{$key2}){
- @temparr = @{$hasharr[$i]{$key2}};
-
- $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/;
- ## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
- # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
- my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
- my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
- $contigstarts[$i]{$startkey}= $key2;
- $contigends[$i]{$endkey}= $key2;
- # print "confirming existance: \n" if $prinkter == 1;
- # print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1;
- # print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1;
- $currentcontigchrs[$i]=$1;
- $currentcontigstarts[$i]=$2;
- $currentcontigends[$i]=$3;
-
- } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
- else {
- push (@starthash, {0 => "0"});
- push (@endhash, {0 => "0"});
- $currentcontigchrs[$i] = 0;
- next;
- }
- $stopper = 0;
- # print "exists: @temparr\n" if $prinkter == 1;
- push(@micro_count, scalar(@temparr));
- push(@species_list, [@temparr]);
- my @tempstart = (); my @tempend = ();
- my %localends = ();
- my %localhash = ();
- # print "---------------------------\n";
-
- foreach my $templine (@temparr){
- # print "templine = $templine\n" if $prinkter == 1;
- my @tields = split(/\t/,$templine);
- my $start = $tields[$startcord]; # - $tields[$gapcord];
- my $end = $tields[$endcord]; #- $tields[$gapcord];
- my $realstart = $tields[$startcord]- $tields[$gapcord];
- my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g);
- $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/;
- # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n";
- my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat;
- # print "real start = $realstart, realend = $realend \n";
- for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;}
- push(@tempstart, $start);
- push(@tempend, $end);
- $localhash{$start."-".$end} = $templine;
- }
- push @starthash, {%localhash};
- my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST);
-
- # print "foundclusters = $foundclusters\n";
-
- my @clusters = split(/_/,$foundclusters);
-
- my $clustno = 0;
-
- foreach my $cluster (@clusters) {
- my @constituenst = split(/,/,$cluster);
- # print "clusters returned: @constituenst\n" if $prinkter == 1;
- }
-
- @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters));
-
-
- }
- next if $stopper == 1;
-
- # print colored ['blue'],"FINAL:\n" if $prinkter == 1;
- my $finalclusters =findClusters(join("!",@string), 1);
- # print "finalclusters = $finalclusters\n";
- # print colored ['blue'],"----------------------\n" if $prinkter == 1;
- my @clusters = split(/,/,$finalclusters);
- # print "@string\n" if $prinkter == 1;
- # print "@clusters\n" if $prinkter == 1;
- # print "------------------------------------------------------------------\n" if $prinkter == 1;
-
- my $clustno = 0;
-
- # foreach my $cluster (@clusters) {
- # my @constituenst = split(/,/,$cluster);
- # print "clusters returned: @constituenst\n";
- # }
-
- next if (scalar @clusters == 0);
-
- my @contigcluster=();
- my $clusterno=0;
- my @contigClusterstarts=();
- my @contigClusterends = ();
-
- foreach my $clust (@clusters){
- # print "cluster: $clust\n";
- $clusterno++;
- my @localclust = split(/\./, $clust);
- my @result = ();
- my @starts = ();
- my @ends = ();
-
- for my $i (0 ... $#localclust){
- # print "localclust[$i]: $localclust[$i]\n";
- my @pattern = split(/:/, $localclust[$i]);
- my @cords = split(/-/, $pattern[1]);
- push (@starts, $cords[0]);
- push (@ends, $cords[1]);
- }
-
- my $extremestart = smallest_number(@starts);
- my $extremeend = largest_number(@ends);
- push(@contigClusterstarts, $extremestart);
- push(@contigClusterends, $extremeend);
- # print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ;
-
- foreach my $clustparts (@localclust){
- my @pattern = split(/:/, $clustparts);
- # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n";
- push (@result, $starthash[$pattern[0]]{$pattern[1]});
- }
- push(@contigcluster, join("\t", @result));
- # print join("\t", @result),"<-result \n" if $prinkter == 1 ;
- }
-
-
- my $firstclusterstart = smallest_number(@contigClusterstarts);
- my $lastclusterend = largest_number(@contigClusterends);
-
-
- $contigclustersFirstStartOnly{$key2}=$firstclusterstart;
- $contigclustersLastEndOnly{$key2} = $lastclusterend;
- $contigclusters{$key2}=[ @contigcluster ];
- # print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1;
- for my $i (0 ... $#tags){
- #1 check if there exists adjacent alignment block wrt coordinates of this species.
- next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
- #2 no need to worry about proximity of anything in adjacent block!
-
- #1 BELOW, the following is really to calclate the distance between the end coordinate of the
- #2 cluster and the end of the gap-free sequence of each species. this is so that if an
- #3 adjacent alignment block is found lateron, the exact distance between the potentially
- #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
- #5 immediately below.
- # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1;
-
- my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart);
- my $species_endsubstring = ();
-
- if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";}
- else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);}
-
- # print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1;
- # print "for species: $tags[$i]: \n" if $prinkter == 1;
-
- $species_startsubstring =~ s/-| //g;
- $species_endsubstring =~ s/-| //g;
- $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring);
- $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring);
-
-
-
- # print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1;
- # print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1;
- # print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1;
-
- # print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1;
-
- }
-
-
- }
- # print "\ndone the job of filling... \n";
- #///////////////////////////////////////////////////////////////////////////////////////
- #///////////////////////////////////////////////////////////////////////////////////////
- #///////////////////////////////////////////////////////////////////////////////////////
- #///////////////////////////////////////////////////////////////////////////////////////
- $prinkter=0;
- open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
-
- my %clusteringpaths=();
- my %clustersholder=();
- my %foundkeys=();
- my %clusteringpathsRev=();
-
-
- my $totalcount=();
- my $founkeys_enteredcount=();
- my $transfered=0;
- my $complete_transfered=0;
- my $plain_transfered=0;
- my $existing_removed=0;
-
- while (my $line = <BO>){
- # print "x" x 60, "\n" if $prinkter == 1;
- next if $line !~ /^[0-9]+/;
- #print $line;
- chomp $line;
- my @fields2 = split(/\t/,$line);
- my $key2 = ();
- if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) {
- $key2 = join("\t",$1, $2, $4, $5);
- }
-
- else {
- # print "seq line $line incompatible\n";
- next;
- }
- # print "KEY = : $key2\n" if $prinkter == 1;
-
-
- my @currentcontigstarts=();
- my @currentcontigends=();
- my @currentcontigchrs=();
- my @clusters = ();
- my @clusterscopy=();
- if (exists $contigclusters{$key2}){
- @clusters = @{$contigclusters{$key2}};
- @clusterscopy=@clusters;
- for my $i (0 ... $#tags){
- # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
- if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){
- # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
- my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
- my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
- $currentcontigchrs[$i]=$1;
- $currentcontigstarts[$i]=$2;
- $currentcontigends[$i]=$3;
- }
- else {
- $currentcontigchrs[$i] = 0;
- # print "no microsat clusters for $key2\n" if $prinkter == 1; next;
- }
- }
- } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
-
- my @sequences = ();
- for (0 ... $#tags){
- my $seq = <BO>;
- # print $seq;
- chomp $seq;
- push(@sequences , " ".$seq);
- }
-
- next if scalar @currentcontigchrs == 0;
-
- # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1;
- my %visitedcontigs=();
-
- for my $i (0 ... $#tags){
- #1 check if there exists adjacent alignment block wrt coordinates of this species.
- next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
- #2 no need to worry about proximity of anything in adjacent block!
- @clusters=@clusterscopy;
- #1 BELOW, the following is really to calclate the distance between the end coordinate of the
- #2 cluster and the end of the gap-free sequence of each species. this is so that if an
- #3 adjacent alignment block is found lateron, the exact distance between the potentially
- #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
- #5 immediately below.
- my $firstclusterstart = $contigclustersFirstStartOnly{$key2};
- my $lastclusterend = $contigclustersLastEndOnly{$key2};
-
- my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]);
- # print "check if exists $key3 in contigends for $i\n" if $prinkter == 1;
-
- if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){
- $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies#
- # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1;
-
- #1 extract coordinates of the last cluster of this found alignment block
- # print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1;
- # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1;
- # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
- # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
- my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}};
- my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}};
- my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=();
-
- for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){
- # print "val = $val\n" if $prinkter == 1;
- if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){
- push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/;
- }
- }
- # print "-----\n" if $prinkter == 1;
- for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){
- # print "val = $val\n" if $prinkter == 1;
- if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){
- push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/;
- }
- }
-
-
- # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1;
- # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1;
-
- # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){
- if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){
- my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}};
- $regurgitate[$#regurgitate]=~s/\n//g;
- $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters);
- delete $contigclusters{$contigends[$i]{$key3}};
- $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ];
- delete $contigclusters{$key2};
- $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0;
- $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0;
-
- if (scalar(@clusters) < 1){
- # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2};
- $clusteringpaths{$key2}=$contigends[$i]{$key3};
- $clusteringpathsRev{$contigends[$i]{$key3}}=$key2;
- print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n";
- # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n";
- $founkeys_enteredcount-- if exists $foundkeys{$key2};
- $existing_removed++ if exists $foundkeys{$key2};
- # print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1;
- delete $foundkeys{$key2} if exists $foundkeys{$key2};
- $complete_transfered++;
- }
- else{
- print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2};
- $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2};
- $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2};
-
- $founkeys_enteredcount++ if !exists $foundkeys{$key2};
- $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2};
- # print "adding foundkeys entry $foundkeys{$key2}\n";
- $transfered++;
- }
- #$contigclusters{$key2}=[ @contigcluster ];
- }
- }
- else{
- # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1;
- $plain_transfered++;
- print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
- $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2};
- $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
- $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
- $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
- # print "adding foundkeys entry $foundkeys{$key2}\n";
-
- }
- $totalcount++;
-
- }
-
-
- }
- close BO;
- #close (NORTH);
- #///////////////////////////////////////////////////////////////////////////////////////
- #///////////////////////////////////////////////////////////////////////////////////////
- #///////////////////////////////////////////////////////////////////////////////////////
- #///////////////////////////////////////////////////////////////////////////////////////
-
- my $founkeys_count=();
- my $nopath_count=();
- my $pathed_count=0;
- foreach my $key2 (keys %foundkeys){
- #print "x" x 60, "\n";
- # print "x" if $dotcounter % 100 ==0;
- # print "\n" if $dotcounter % 5000 ==0;
- $founkeys_count++;
- my $key = $key2;
- # print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1;
- if ($clusteringpaths{$key} eq $key){
- # print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1;
- $nopath_count++;
- delete $foundkeys{$key2};
- print ORTH join ("\n",@{$contigclusters{$key2}}),"\n";
- }
- else{
- my @pool=();
- my $key3=();
- $pathed_count++;
- # print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1;
- # print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1;
- if ($clusteringpathsRev{$key} eq "0") {
- next;
- }
- else{
- my $yek3 = $clusteringpathsRev{$key};
- my $yek = $key;
- # print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1;
- while ($yek3 ne "0"){
- # print "$yek->$yek3," if $prinkter == 1;
- $yek = $yek3;
- $yek3 = $clusteringpathsRev{$yek};
- }
- # print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1;
- $key3 = $clusteringpaths{$yek};
- $key = $yek;
- }
-
- # print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1;
-
- while($key ne $key3){
- # print "KEEY $key->$key3\n" if $prinkter == 1;
- # print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1;
-
- if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}};
- # print "now pool = @pool\n" if $prinkter == 1;
- }
- delete $foundkeys{$key3};
- $key=$key3;
- $key3=$clusteringpaths{$key};
- }
- # print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1;
- my @firstcontig= @{$contigclusters{$key}};
- delete $foundkeys{$key2} if exists $foundkeys{$key2} ;
- delete $foundkeys{$key} if exists $foundkeys{$key};
-
- unshift @pool, pop @firstcontig;
- # print join("\t",@pool),"\n" if $prinkter == 1;
- print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0;
- print ORTH join ("\t",@pool),"\n";
- # join();
- }
-
- }
- #close (NORTH);
- # print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1;
- close (BO);
- close (ORTH);
- close (OUTP);
- return 1;
-
- }
- sub stringPainter{
- my @string = split(/_C0C_/,$_[0]);
- # print $_[0], " <- in stringPainter\n";
- # print $_[1], " <- in clusters\n";
-
- my @clusters = split(/,/, $_[1]);
- for my $i (0 ... $#clusters){
- my $cluster = $clusters[$i];
- # print "cluster = $cluster\n";
- my @parts = split(/\./,$cluster);
- my @cord = split(/:|-/,shift(@parts));
- my $minstart = $cord[1];
- my $maxend = $cord[2];
- # print "minstart = $minstart , maxend = $maxend\n";
-
- for my $j (0 ... $#parts){
- # print "oing thri $parts[$j]\n";
- my @cord = split(/:|-/,$parts[$j]);
- $minstart = $cord[1] if $cord[1] < $minstart;
- $maxend = $cord[2] if $cord[2] > $maxend;
- }
- # print "minstart = $minstart , maxend = $maxend\n";
- for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;}
-
-
- }
- # print "@string <-done from function stringPainter\n";
- return join("_S0S_",@string);
- }
- sub findClusters{
- my $continue = 0;
- my @mapped_clusters = ();
- my $clusterdist = $_[1];
- my $previous = 'x';
- my @localcluster = ();
- my $cluster_starts = ();
- my $cluster_ends = ();
- my $localcluster_start = ();
- my $localcluster_end = ();
- my @record_cluster = ();
- my @string = split(/\!/, $_[0]);
- my $zerolength=0;
-
- for my $pos_pos (1 ... $#string){
- my $pos = $string[$pos_pos];
- # print $pos, "\n";
- if ($continue == 0 && $pos eq "x") {next;}
-
- if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){
- if ($zerolength == 0) {$localcluster_end = $pos_pos-1};
- $zerolength++;
- $continue = 1;
- }
- if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) {
- $zerolength = 0;
- $continue = 0;
- my %seen;
- my @uniqed = grep !$seen{$_}++, @localcluster;
- # print "caught cluster : @uniqed \n";
- push(@mapped_clusters, [@uniqed]);
- # print "clustered:\n@uniqed\n";
- @localcluster = ();
- @record_cluster = ();
-
- }
-
- if ($pos ne "x"){
- $zerolength = 0;
- $continue = 1;
- $pos =~ s/x,//g;
- my @entries = split(/,/,$pos);
- $localcluster_end = 0;
- $localcluster_start = 0;
- push(@record_cluster,$pos);
-
- if ($continue == 0){
- @localcluster = ();
- @localcluster = (@localcluster, @entries);
- $localcluster_start = $pos_pos;
- }
-
- if ($continue == 1 ) {
- @localcluster = (@localcluster, @entries);
- }
- }
- }
-
- if (scalar(@localcluster) > 0){
- my %seen;
- my @uniqed = grep !$seen{$_}++, @localcluster;
- # print "caught cluster : @uniqed \n";
- push(@mapped_clusters, [@uniqed]);
- # print "clustered:\n@uniqed\n";
- @localcluster = ();
- @record_cluster = ();
- }
- my @returner = ();
-
- foreach my $clust (@mapped_clusters){
- my @localclust = @$clust;
- my @result = ();
- foreach my $clustparts (@localclust){
- push(@result,$clustparts);
- }
- push(@returner , join(".",@result));
- }
- # print "returnig: ", join(",",@returner), "\n";
- return join(",",@returner);
- }
- #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx
- sub MakeTrees{
- my $tree = $_[0];
- my @parts=($tree);
- # my @parts=();
-
- while (1){
- $tree =~ s/^\(//g;
- $tree =~ s/\)$//g;
- my @arr = ();
-
- if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
- @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
- $tree = $2;
- push @parts, $tree;
- }
- elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
- @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
- $tree = $1;
- push @parts, $tree;
- }
- elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
- last;
- }
- }
- return @parts;
- }
- #xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx
- sub qualityFilter{
- my $unmaskedorthfile = $_[0];
- my $seqfile = $_[1];
- my $maskedorthfile = $_[2];
- my $filteredout = $maskedorthfile."_residue";
- open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!";
- my %keyhash = ();
-
- while (my $line = <PMORTH>){
- my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/;
- push @{$keyhash{$key}}, $line;
- }
-
- open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!";
- open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!";
- open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!";
-
- while (my $line = <SEQ>){
- chomp $line;
- if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){
- my $key = join("\t", $1,$2,$3,$4);
- next if !exists $keyhash{$key};
- my @orths = @{$keyhash{$key}} if exists $keyhash{$key};
- delete $keyhash{$key};
-
- my $sine = <SEQ>;
-
- foreach my $orth (@orths){
- #print "-----------------------------------------------------------------\n";
- #print $orth;
- my $orthcopy = $orth;
- $orth =~ s/^>//;
- my @parts = split(/>/,$orth);
-
- my @starts = ();
- my @ends = ();
-
- foreach my $part (@parts){
- my $no_of_species = adjustCoordinates($part);
- my @pields = split(/\t/,$part);
-
- # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n";
-
- push @starts, $pields[$startcord];
- push @ends, $pields[$endcord];
- }
-
- #print "starts = @starts ... ends = @ends\n";
-
- my $leftend = smallest_number(@starts)-10;
- my $rightend = largest_number(@ends)+10;
- my $maskarea = substr($sine, $leftend, $rightend-$leftend+1);
- print RES $orth if $maskarea =~ /#/;
-
-
- next if $maskarea =~ /#/;
-
- print MORTH $orthcopy;
- }
- }
- else{
- next;
- }
-
-
- }
-
- # print "UNDONE: ", scalar(keys %keyhash),"\n";
- # print MORTH "UNDONE: ", scalar(keys %keyhash),"\n";
-
- }
- sub adjustCoordinates{
- my $line = $_[0];
- my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig;
- my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g);
- # print "line = $line\n";
- $infocord = 2 + (4*$no_of_species) - 1;
- $typecord = 2 + (4*$no_of_species) + 1 - 1;
- $motifcord = 2 + (4*$no_of_species) + 2 - 1;
- $gapcord = $motifcord+1;
- $startcord = $gapcord+1;
- $strandcord = $startcord+1;
- $endcord = $strandcord + 1;
- $microsatcord = $endcord + 1;
- $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
- $interr_poscord = $microsatcord + 3;
- $no_of_interruptionscord = $microsatcord + 4;
- $interrcord = $microsatcord + 2;
- # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i;
- return $no_of_species;
- }