/tools/regVariation/microsatellite_birthdeath.pl
Perl | 4333 lines | 2916 code | 609 blank | 808 comment | 702 complexity | d1bf4746bc2b8d229099df9b29475ffa MD5 | raw file
Large files files are truncated, but you can click here to view the full file
- #!/usr/bin/perl -w
- use strict;
- use warnings;
- use Term::ANSIColor;
- use Pod::Checker;
- use File::Basename;
- use IO::Handle;
- use Cwd;
- use File::Path qw(make_path remove_tree);
- use File::Temp qw/ tempfile tempdir /;
- my $tdir = tempdir( CLEANUP => 1 );
- chdir $tdir;
- my $dir = getcwd;
- # print "current dit=$dir\n";
- #<STDIN>;
- use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags
- $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species
- $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags
- $mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG);
- use FileHandle;
- use IO::Handle; # 5.004 or higher
- #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
- #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
- #"10","0.8");
- my @ar = @ARGV;
- my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
- $SIMILARITY_THRESH=$SIMILARITY_THRESH/100;
- #########################
- $SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
- my $EDGE_DISTANCE = 10;
- my $COMPLEXITY_SUPPORT = 20;
- load_thresholds("9_10_12_12");
- my $FLANKINDEL_MAXTHRESH = 0.3;
- my $mono_flanksimplicityRepno=6;
- my $di_flanksimplicityRepno=10;
- my $prop_of_seq_allowedtoAT=0.5;
- my $prop_of_seq_allowedtoCG=0.66;
- #########################
- my $tspecies_set = $species_set;
- my %speciesReplacement = ();
- my %speciesReplacementTag = ();
- my %replacementArr= ();
- my %replacementArrTag= ();
- my %backReplacementArr= ();
- my %backReplacementArrTag= ();
- $tree_definition=~s/\s+//g;
- my $tree_definition_split = $tree_definition;
- $tree_definition_split =~ s/[\(\)]//g;
- my @gotSpecies = ($tree_definition_split =~ /(,)/g);
- # print "gotSpecies = @gotSpecies\n";
- if (scalar(@gotSpecies)+1 ==5){
- $speciesReplacement{1}="calJac1";
- $speciesReplacement{2}="rheMac2";
- $speciesReplacement{3}="ponAbe2";
- $speciesReplacement{4}="panTro2";
- $speciesReplacement{5}="hg18";
- $speciesReplacementTag{1}="M";
- $speciesReplacementTag{2}="R";
- $speciesReplacementTag{3}="O";
- $speciesReplacementTag{4}="C";
- $speciesReplacementTag{5}="H";
- $species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1";
- }
- if (scalar(@gotSpecies)+1 ==4){
- $speciesReplacement{1}="rheMac2";
- $speciesReplacement{2}="ponAbe2";
- $speciesReplacement{3}="panTro2";
- $speciesReplacement{4}="hg18";
- $speciesReplacementTag{1}="R";
- $speciesReplacementTag{2}="O";
- $speciesReplacementTag{3}="C";
- $speciesReplacementTag{4}="H";
- $species_set="hg18,panTro2,ponAbe2,rheMac2";
- }
- # $tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)";
- my $tree_definition_copy = $tree_definition;
- my $tree_definition_orig = $tree_definition;
- my $brackets = 0;
- while (1){
- #last if $tree_definition_copy !~ /\(/;
- $brackets++;
- # print "brackets = $brackets\n";
- last if $brackets > 6;
- $tree_definition_copy =~ s/^\(//g;
- $tree_definition_copy =~ s/\)$//g;
- # print "tree_definition_copy = $tree_definition_copy\n";
- my @arr = ();
- if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
- @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
- # print "arr = @arr\n";
- $tree_definition_copy = $2;
- $replacementArr{$1} = $speciesReplacement{$brackets};
- $backReplacementArr{$speciesReplacement{$brackets}}=$1;
-
- $replacementArrTag{$1} = $speciesReplacementTag{$brackets};
- $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1;
- # print "replacing $1 with $replacementArr{$1}\n";
-
- $sp_ident[$brackets-1] = $1;
-
- }
- elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
- @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
- # print "arr = @arr\n";
- $tree_definition_copy = $1;
- $replacementArr{$2} = $speciesReplacement{$brackets};
- $backReplacementArr{$speciesReplacement{$brackets}}=$2;
- $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
- $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
- # print "replacing $2 with $replacementArr{$2}\n";
- $sp_ident[$brackets-1] = $2;
- }
- elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
- @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/;
- # print "arr = @arr .. TERMINAL\n";
- $tree_definition_copy = $1;
- $replacementArr{$2} = $speciesReplacement{$brackets};
- $replacementArr{$1} = $speciesReplacement{$brackets+1};
- $backReplacementArr{$speciesReplacement{$brackets}}=$2;
- $backReplacementArr{$speciesReplacement{$brackets+1}}=$1;
- $replacementArrTag{$1} = $speciesReplacementTag{$brackets+1};
- $backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1;
- $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
- $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
- # print "replacing $1 with $replacementArr{$1}\n";
- # print "replacing $2 with $replacementArr{$2}\n";
- # print "replacing $1 with $replacementArrTag{$1}\n";
- # print "replacing $2 with $replacementArrTag{$2}\n";
- $sp_ident[$brackets-1] = $2;
- $sp_ident[$brackets] = $1;
- last;
- }
- elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){
- $tree_definition_copy =~ s/^\(//g;
- $tree_definition_copy =~ s/\)$//g;
- $brackets--;
- }
- }
- foreach my $key (keys %replacementArr){
- my $replacement = $replacementArr{$key};
- $tree_definition =~ s/$key/$replacement/g;
- }
- @sp_ident = reverse(@sp_ident);
- # print "modified tree_definition = $tree_definition\n";
- # print "done .. tree_definition = $tree_definition\n";
- # print "sp_ident = @sp_ident\n";
- #<STDIN>;
- my $complexity=int($COMPLEXITY_SUPPORT * (1/40));
- #print "complexity=$complexity\n";
- #<STDIN>;
- $printer = 1;
- my $rando = int(rand(1000));
- my $localdate = `date`;
- $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
- my $info = $rando.$1.$2.$3;
- #---------------------------------------------------------------------------
- # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES
- my @thresharr = (0, split(/,/,$thresholds));
- my $randno=int(rand(100000));
- my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
- my $megamatchlck = $megamatch.".lck";
- unlink $megamatchlck;
- my $selected= $orth;
- #my $eventfile = $orth;
- $selected = $selected."_SELECTED";
- #$selected = $selected."_".$SIMILARITY_THRESH;
- #my $runtime = $selected.".runtime";
- my $inputtags = "H:C:O:R:M";
- $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;
- my @all_tags = split(/:/, $inputtags);
- my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
- $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
- #@sp_ident = split(/:/,$inputsp);
- my $junkfile = $orth."_junk";
- my $sh = load_sameHash(1);
- my $rh = load_revHash(1);
- #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";}
- #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
- open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
- #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
- #my $ctlfile = "baseml\.ctl"; #$ARGV[4];
- #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK
- my %registeredTrees = ();
- my @removalReasons =
- ("microsatellite is compound",
- "complex structure",
- "if no. if micros is more than no. of species",
- "if more than one micro per species ",
- "if microsat contains N",
- "different motif than required ",
- "more than zero interruptions",
- "microsat could not form key ",
- "orthologous microsats of different motif size ",
- "orthologous microsats of different motifs ",
- "microsats belong to different alignment blocks altogether",
- "microsat near edge",
- "microsat in low complexity region",
- "microsat flanks dont align well",
- "phylogeny not informative");
- my %allowedhash=();
- #---------------------------------------------------------------------------
- # WORKING ON MAKING THE MEGAMATCH FILE
- my $chromt=int(rand(10000));
- my $p_chr=$chromt;
- my $tree_definition_orig_copy = $tree_definition_orig;
- $tree_definition=~s/,/, /g;
- $tree_definition =~ s/, +/, /g;
- $tree_definition_orig=~s/,/, /g;
- $tree_definition_orig =~ s/, +/, /g;
- my @exactspeciesset_unarranged = split(/,/,$species_set);
- my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set);
- my $largesttree = "$tree_definition;";
- my $largesttree_orig = "$tree_definition_orig;";
- # print "largesttree = $largesttree\n";
- $tree_definition =~ s/\(//g;
- $tree_definition =~ s/\)//g;
- $tree_definition=~s/[\)\(, ]/\t/g;
- $tree_definition =~ s/\t+/\t/g;
- $tree_definition_orig =~ s/\(//g;
- $tree_definition_orig =~ s/\)//g;
- $tree_definition_orig =~s/[\)\(, ]/\t/g;
- $tree_definition_orig =~ s/\t+/\t/g;
- # print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n";
- my @treespecies=split(/\t+/,$tree_definition);
- my @treespecies_orig=split(/\t+/,$tree_definition_orig);
- # print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n";
- #<STDIN>;
- foreach my $spec (@treespecies){
- foreach my $espec (@exactspeciesset_unarranged){
- # print "spec=$spec and espec=$espec\n";
- push @exactspecies, $spec if $spec eq $espec;
- }
- }
- foreach my $spec (@treespecies_orig){
- foreach my $espec (@exactspeciesset_unarranged_orig){
- # print "spec=$spec and espec=$espec\n";
- push @exactspecies_orig, $spec if $spec eq $espec;
- }
- }
- my $focalspec = $exactspecies[0];
- my $focalspec_orig = $exactspecies_orig[0];
- # print "exactspecies=@exactspecies ... focalspec=$focalspec\n";
- # print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n";
- #<STDIN>;
- my $arranged_species_set = join(".",@exactspecies);
- my $arranged_species_set_orig = join(".",@exactspecies_orig);
- @exacttags=@exactspecies;
- my @exacttags_orig=@exactspecies_orig;
- foreach my $extag (@exacttags){
- $extag =~ s/hg18/H/g;
- $extag =~ s/panTro2/C/g;
- $extag =~ s/ponAbe2/O/g;
- $extag =~ s/rheMac2/R/g;
- $extag =~ s/calJac1/M/g;
- }
- foreach my $extag (@exacttags_orig){
- $extag =~ s/hg18/H/g;
- $extag =~ s/panTro2/C/g;
- $extag =~ s/ponAbe2/O/g;
- $extag =~ s/rheMac2/R/g;
- $extag =~ s/calJac1/M/g;
- }
- my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
- #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
- maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set);
- #print "made files\n";
- my @filterseqfiles= ($chr_name);
- $largesttree =~ s/hg18/H/g;
- $largesttree =~ s/panTro2/C/g;
- $largesttree =~ s/ponAbe2/O/g;
- $largesttree =~ s/rheMac2/R/g;
- $largesttree =~ s/calJac1/M/g;
- #<STDIN>;
- #---------------------------------------------------------------------------
- my ($lagestnodes, $largestbranches) = get_nodes($largesttree);
- shift (@$lagestnodes);
- my @extendedtitle=();
- my $title = ();
- my $parttitle = ();
- my @titlearr = ();
- my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold");
- my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details);
- my @fititle=();
- foreach my $spec (split(",",$tspecies_set)){
- push @fititle, $spec;
- foreach my $name (@finames){
- push @fititle, $spec.".".$name;
- }
- }
- my @othertitle=qw(somechr somestart somened event source);
- my @fnames = ();
- push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ;
- push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ;
- push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ;
- push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ;
- push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num);
- my @fullnames = ();
- # print "revising\n";
- # print "H = $backReplacementArrTag{H}\n";
- # print "C = $backReplacementArrTag{C}\n";
- # print "O = $backReplacementArrTag{O}\n";
- # print "R = $backReplacementArrTag{R}\n";
- # print "M = $backReplacementArrTag{M}\n";
- foreach my $lnode (@$lagestnodes){
- my @pair = @$lnode;
- my @nodemutarr = ();
- for my $p (@pair){
- # print "p = $p\n";
- $p =~ s/[\(\), ]+//g;
- $p =~ s/([A-Z])/$1./g;
- $p =~ s/\.$//g;
-
- $p =~ s/H/$backReplacementArrTag{H}/g;
- $p =~ s/C/$backReplacementArrTag{C}/g;
- $p =~ s/O/$backReplacementArrTag{O}/g;
- $p =~ s/R/$backReplacementArrTag{R}/g;
- $p =~ s/M/$backReplacementArrTag{M}/g;
- foreach my $n (@fnames) { push @fullnames, $p.".".$n;}
- }
- }
- #print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle);
- #print SUMMARY "\t",join("\t", @fullnames);
- my $header = join("\t",@firsttitle, @fititle, @othertitle, @fullnames, @fnames, "tree", "cleancase");
- # print "header= $header\n";
- #<STDIN>;
- #print SUMMARY "\t",join("\t", @fnames);
- #$title= $title."\t".join("\t", @fnames);
- #print SUMMARY "\t","tree","\t", "cleancase", "\n";
- #$title= $title."\t"."tree"."\t"."cleancase". "\n";
- #print $title; #<STDIN>;
- #print "all_tags = @all_tags\n";
- for my $no (3 ... $#all_tags+1){
- # print "no=$no\n"; #<STDIN>;
- @tags = @all_tags[0 ... $no-1];
- # print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>;
- %template=();
- my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- #next if scalar(@tags) < 4;
- # print "now doing tags = @tags, no = $no\n";
- open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!";
-
- # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion);
-
-
- ##################### T E M P O R A R Y #####################
- my @finaltitle=();
- my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13);
- my $endtitle = ();
- foreach my $tag (@tags){
- my @tempsingle = ();
-
- foreach my $single (@singletitle){
- push @tempsingle, $tag.$single;
- }
- @finaltitle = (@finaltitle, @tempsingle);
- }
- # print SUMMARY join("\t",@finaltitle),"\n";
-
- #############################################################
-
- #---------------------------------------------------------------------------
- # GET THE TREE FROM TREE FILE
- my $tree = ();
- $tree = "((H, C), O)" if $no == 3;
- $tree = "(((H, C), O), R)" if $no == 4;
- $tree = "((((H, C), O), R), M)" if $no == 5;
- # $tree=~s/;$//g;
- # print "our tree = $tree\n";
- #---------------------------------------------------------------------------
- # LOADING HASH CONTAINING ALL POSSIBLE TREES:
- $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt";
- %template=();
- %alternate=();
- load_allPossibleTrees($tree_decipherer, \%template, \%alternate);
-
- #---------------------------------------------------------------------------
- # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS
- %treesToReject=();
- %treesToIgnore=();
- load_treesToReject(@tags);
- load_treesToIgnore(@tags);
- #---------------------------------------------------------------------------
- # LOADING INPUT DATA INTO HASHES AND ARRAYS
-
-
- #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS
- #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF
- #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES).
-
- my %orths=();
- my $counterm = 0;
- my $loaded = 0;
- my %seen = ();
- my @allowedchrs = ();
- # print "no = $no\n"; #<STDIN>;
- while (my $line = <ORTH>){
- # print "line=$line\n";
- my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g;
- my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g;
- my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g;
- my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g;
- my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4];
-
- # print "line = $line\n"; <STDIN>;
-
-
- # next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags);
- my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros
- #print "micros=",printarr(@micros),"\n"; #<STDIN>;
- shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY
-
- $no_of_species = adjustCoordinates($micros[0]);
- # print "A: $no_of_species\n";
- next if $no_of_species != $no;
- # print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>;
- $counterm++;
- #------------------------------------------------
- $nextcounter[0]++ if $line =~ /compound/;
- next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS
- #------------------------------------------------
- #next if $line =~ /[A-Za-z]>[a-zA-Z]/;
- #------------------------------------------------
- chomp $line;
- my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER
- #print "number of species = $match_count\n";
- my $stopper = 0;
- foreach my $mic (@micros){
- my @local = split(/\t/,$mic);
- if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++;
- last; }
- # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM
- }
- next if $stopper ==1;
- #------------------------------------------------
- $nextcounter[2]++ if (scalar(@micros) >$no_of_species);
-
- next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET.
- #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY
- #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING
- #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD.
- #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl
- my $nexter = 0;
- foreach my $tag (@tags){
- my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g);
- if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1;
- next;
- }
- }
-
- if ($nexter == 1){
- $nextcounter[3]++;
- next;
- }
- #------------------------------------------------
- foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM
- my @local = split(/\t/,$mic);
- if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++;
- last;}
- }
- next if $stopper ==1;
- #print "till here 1\n"; #<STDIN>;
- #------------------------------------------------
- my @micros_copy = @micros;
-
- my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST
- #2 MICROSAT IN THE CLUSTER.
- my @tempfields = split(/\t/,$tempmicro);
- my $prevtype = $tempfields[$typecord];
- my $tempmotif = $tempfields[$motifcord];
-
- my $tempfirstmotif = ();
- if (scalar(@tempfields) > $microsatcord + 2){
- if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS
- #2 IN THE FIRST MICROSAT OF THE CLUSTER
- $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
- }
- }
- if ($nexter == 1){
- $nextcounter[6]++;
- next;
- } #1 DONE OBTAINING INFORMATION REGARDING
- #2 THE FIRST MICROSAT FROM THE CLUSTER
-
- if ($tempmotif =~ /^\[/){
- $tempmotif =~ s/^\[//g;
- $tempmotif =~ /([a-zA-Z]+)\].*/;
- $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT
- }
- else {$tempfirstmotif = $tempmotif;}
- my $prevmotif = $tempfirstmotif;
-
- my $key = ();
- # print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n";
- # print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>;
- if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
- # print "B: $no_of_species\n";
- $key = join("_",$2, $3, $4, $5);
- }
- else{
- # print "counld not form a key for temp\n"; # if $printer == 1;
- $nextcounter[7]++;
- next;
- }
- #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT
- #2 THE FIRST MICROSAT IN THE CLUSTER, THE
- #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS
- #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW)
-
- foreach my $micro (@micros_copy){
- my @fields = split(/\t/,$micro);
- #-----------------
- if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS
- if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
- $nextcounter[6]++;
- last; }
- }
- #-----------------
- if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) {
- $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1;
- $nextcounter[8]++;
- last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
- #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.)
- $prevtype = $fields[$typecord];
-
- my $motif = $fields[$motifcord];
- my $firstmotif = ();
-
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- }
- else {$firstmotif = $motif;}
-
- my $motifpattern = $firstmotif.$firstmotif;
- my $prevmotifpattern = $prevmotif.$prevmotif;
-
- if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) {
- $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1;
- $nextcounter[9]++;
- last;
- } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
- #2 TO DIFFERENT MOTIFS
- my $prevmotif = $firstmotif;
- #-----------------
-
- for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG
- #2 DIFFERENT ALIGNMENT BLOCKS
- if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
- my $key2 = join("_",$2, $3, $4, $5);
- # print "key = $key .. key2 = $key2\n"; #<STDIN>;
- if ($key2 ne $key){
- # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1;
- $nextcounter[10]++;
- $nexter = 1; last;
- }
- }
- else{
- # print "counld not form a key for $line\n"; # if $printer == 1;
- #<STDIN>;
- $nexter = 1; last;
- }
- }
- }
- #print "D2: $no_of_species\n";
- #####################
- if ($nexter == 1){
- # print "nexting\n"; # if $printer == 1;
- next;
- }
- else{
- # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1;
- push (@{$orths{$key}},$line);
- $loaded++;
- if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
-
- # print "$line\n" if $printer == 1; #if $line =~ /Contig/;
- # print "################ ################\n" if $printer == 1;
- push @allowedchrs, $2 if !exists $allowedhash{$2};
- $allowedhash{$2} = 1;
- my $key = join("\t",$1, $2, $3, $4);
- # print "C: $no_of_species .. key = $key\n";#<STDIN>;
- # print "print the shit: $key\n" ; #if $printer == 1;
- $seen{$key} = 1;
- }
- else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n";
- }
- }
- }
- close ORTH;
- # print "now studying where we lost microsatellites: @nextcounter\n";
- for my $reason (0 ... $#nextcounter){
- #print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
- }
- # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n";
- # print "done filtering .. counterm = $counterm and loaded = $loaded\n";
- #----------------------------------------------------------------------------------------------------------------
- # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY.
- # print "adding files @filterseqfiles \n";
- #<STDIN>;
-
- while (1){
- if (-e $megamatchlck){
- # print "waiting to write into $megamatchlck\n";
- sleep 10;
- }
- else{
- open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!";
- open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!";
- last;
- }
- }
- foreach my $seqfile (@filterseqfiles){
- my $fullpath = $seqfile;
- open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!";
- my $matchlines = 0;
- while (my $line = <MATCH>) {
- #print "checking $line";
- if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
- my $key = join("\t",$1, $2, $3, $4);
- # print "key = $key\n";
- #print "------------------------------------------------------\n";
- #print "asking $line\n";
- if (exists $seen{$key}){
-
- #print "seen $line \n"; <STDIN>;
- while (1){
- $matchlines++;
- print MEGAM $line;
- $line = <MATCH>;
- print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/;
- last if $line !~/[0-9a-zA-Z]/;
- }
- }
- else{
- # print "not seen\n";
- }
- }
- }
- # print "matchlines = $matchlines\n";
- close MATCH;
- }
- close MEGAMLCK;
-
- unlink $megamatchlck;
- close MEGAM;
- undef %seen;
- # print "done writitn to $megamatch\n";#<STDIN>;
- #----------------------------------------------------------------------------------------------------------------
- #<STDIN>;
- #---------------------------------------------------------------------------
- # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO
- # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE
- # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER
- #$printer = 1;
-
- my $microreadcounter=0;
- my $contigsentered=0;
- my $contignotrightcounter=0;
- my $keynotformedcounter=0;
- my $keynotfoundcounter= 0;
- my $dotcounter = 0;
- # print "opening $megamatch\n";
- open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!";
- # print "doing $megamatch\n " ;
- #<STDIN>;
-
- while (my $line = <BO>){
- # print $line; #<STDIN>;
- # print "." if $dotcounter % 100 ==0;
- # print "\n" if $dotcounter % 5000 ==0;
- # print "dotcounter = $dotcounter\n " if $printer == 1;
- next if $line !~ /^[0-9]+/;
- $dotcounter++;
- # print colored ['green'], "~" x 60, "\n" if $printer == 1;
- # print colored ['green'], $line;# if $printer == 1;
- chomp $line;
- my @fields2 = split(/\t/,$line);
- my $key2 = ();
- my $alignment_no = (); #1 TEMPORARY
- if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
- # $key2 = join("\t",$1, $2, $4, $5);
- $key2 = join("_",$2, $3, $4, $5);
- # print "key = $key2\n";
- # print "key = $key2\n";
- $alignment_no=$1;
- }
- else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;}
-
- $no_of_species = adjustCoordinates($line);
- $contignotrightcounter++ if $no_of_species != $no;
- # print "contignotrightcounter=$contignotrightcounter\n";
- # print "no_of_species=$no_of_species\n";
- # print "no=$no\n";
-
- next if $no_of_species != $no;
- # print "D: $no_of_species\n";
- # print "E: $no_of_species\n";
- #<STDIN>;
- # print "key = $key2\n" if $printer == 1;
- my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS
- #2 ALIGNMENT BLOCK
- if (exists($orths{$key2})){
- @clusters = @{$orths{$key2}};
- $contigsentered++;
- delete $orths{$key2};
- }
- else{
- # print "orth does not exist\n";
- $keynotfoundcounter++;
- next;
- }
-
- my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK
- my $humseq = ();
- foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES
- my $seq = <BO>; #2 OF ALL SPECIES.
- chomp $seq;
- $sequences{$tag} = " ".$seq;
- #print "sequences = $sequences{$tag}\n" if $printer == 1;
- $humseq = $seq if $tag =~ /H/;
- }
-
-
- foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS
- # print "x" x 60, "\n" if $printer == 1;
- # print colored ['red'],"cluster = $cluster\n";
- $largesttree =~ s/hg18/H/g;
- $largesttree =~ s/panTro2/C/g;
- $largesttree =~ s/ponAbe2/O/g;
- $largesttree =~ s/rheMac2/R/g;
- $largesttree =~ s/calJac1/M/g;
- $microreadcounter++;
- my @micros = split(/>/,$cluster);
-
-
-
-
-
-
- shift @micros;
-
- my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND
- #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK
-
- my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES
- my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME
- #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES.
-
- my %microhash=();
- my %microsathash=();
- my %nonmicrosathash=();
- my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1
- # print "tags=@tags\n";
- for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE
- #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER
- #3 THE VARIABLE $TAG;
- my $tag = $tags[$i];
- # print $seq;
- my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES.
- #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL"
-
- foreach my $micro (@micros){
- # print "micro=$micro, tag=$tag\n";
- if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND..
- $locus = $micro;
- my @fields = split(/\t/,$micro);
- $motif = $fields[$motifcord];
- $microsathash{$tag}=$fields[$microsatcord];
- # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
- push(@starts, $fields[$startcord]);
- push(@ends, $fields[$endcord]);
- $start_hash{$tag}=$fields[$startcord];
- $end_hash{$tag}=$fields[$endcord];
- last;
- }
- else{$microsathash{$tag}="NULL"}
- }
- $microhash{$tag}=$locus;
-
- }
-
-
-
- my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
- my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN
- #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS.
-
- # print "starts=@starts... ends=@ends\n";
-
- my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES
- my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES
-
- my %up_largeflanks = ();
- my %down_largeflanks = ();
-
- my %locusandflanks = ();
- my %locusandlargeflanks = ();
-
- my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the
- #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
- my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the
- #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES
-
- my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
- #2 EXTEME VALUES.
-
- my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
- #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
- #3 WIHIN THE ALIGNMENT
- next if !defined $extreme_start;
- next if !defined $extreme_end;
- next if $extreme_start > length($sequences{$tags[0]});
- next if $extreme_start < 0;
- next if $extreme_end > length($sequences{$tags[0]});
-
- for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
- #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
- #3 AS WELL AS THE EXTREME COORDINATES OF THE
- #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
- #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
- #6 MORE DETAIL.
- my $tag = $tags[$i];
- # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
- my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM
- #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
- #3 CLUSTER.
- # print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ;
- my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g);
- if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){
- $edge_microsat=1;
-
- last;
- }
- else{
- $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT);
- $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT);
- $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT);
- $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT);
- $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
- $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag};
- $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag};
-
- if ($microhash{$tag} ne "NULL"){
- $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start);
- $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag});
- $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start;
- # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1;
- }
- else{
- $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
-
- }
- # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1;
-
- }
-
- }
- $nextcounter[11]++ if $edge_microsat==1;
- next if $edge_microsat==1;
-
-
- my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS
- #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE
- #3 FUNCTION sub test_complexity
-
-
- for my $i (0 ... $#tags){
- # print "i = $tags[$i]\n" if $printer == 1;
- if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){
- # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1;
- $low_complexity =1; last;
- }
- }
-
- $nextcounter[12]++ if $low_complexity==1;
- next if $low_complexity == 1;
-
-
- my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY
- #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN
- #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD
- #4 AS DESCRIBED IN FUNCTION sub sequence_similarity
- my %donepair = ();
- for my $i (0 ... $#tags){
- # print "i = $tags[$i]\n" if $printer == 1;
- # next if $i == 0;
- # print colored ['magenta'],"THIS IS UP\n" if $printer == 1;
- for my $b (0 ... $#tags){
- next if $b == $i;
- my $pair = ();
- $pair = $i."_".$b if $i < $b;
- $pair = $b."_".$i if $b < $i;
- next if exists $donepair{$pair};
- my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
- my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
- $donepair{$pair} = $up_similarity."_".$down_similarity;
-
- # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n";
-
- if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){
- $sequence_dissimilarity =1;
- last;
- }
- }
- }
- $nextcounter[13]++ if $sequence_dissimilarity==1;
- next if $sequence_dissimilarity == 1;
- my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq);
- # print "simplified_microsat=$simplified_microsat\n";
- my ($tree_analysis, $conformation) = treeStudy($simplified_microsat);
- # print "tree_analysis = $tree_analysis .. conformation=$conformation\n";
- #<STDIN>;
-
- # print SELECT "\"$conformation\"\t$tree_analysis\n";
-
- next if $tree_analysis =~ /DISCARD/;
- if (exists $treesToReject{$tree_analysis}){
- $nextcounter[14]++;
- next;
- }
- # print "F: $no_of_species\n";
- # my $adjuster=();
- # if ($no_of_species == 4){
- # my @sields = split(/\t/,$simplified_microsat);
- # my $somend = pop(@sields);
- # my $somestart = pop(@sields);
- # my $somechr = pop(@sields);
- # $adjuster = "NA\t" x 13 ;
- # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
- # }
- # if ($no_of_species == 3){
- # my @sields = split(/\t/,$simplified_microsat);
- # my $somend = pop(@sields);
- # my $somestart = pop(@sields);
- # my $somechr = pop(@sields);
- # $adjuster = "NA\t" x 26 ;
- # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
- # }
- #
- $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis};
- $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis};
-
- if (exists $treesToIgnore{$tree_analysis}){
- my @appendarr = ();
- # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
- #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
- # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n";
-
- foreach my $lnode (@$lagestnodes){
- my @pair = @$lnode;
- my @nodemutarr = ();
- for my $p (@pair){
- my @mutinfoarray1 = ();
- for (1 ... 38){
- # push (@mutinfoarray1, "NA")
- }
- # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
- # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
- }
-
- }
- for (1 ... 38){
- push (@appendarr, "NA")
- }
- # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
- # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
- # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
- next;
- }
- # print colored ['blue'],"cluster = $cluster\n";
-
- my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
-
- if ($mutations_array eq "NULL"){
- # print "cluster = $cluster \n"; <STDIN>;
- my @appendarr = ();
- # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t";
- # foreach my $lnode (@$lagestnodes){
- # my @pair = @$lnode;
- # my @nodemutarr = ();
- # for my $p (@pair){
- # my @mutinfoarray1 = ();
- # for (1 ... 38){
- # push (@mutinfoarray1, "NA")
- # }
- # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
- # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
- # }
- # }
- # for (1 ... 38){
- # push (@appendarr, "NA")
- # }
- # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
- # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
- # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
- next;
- }
-
-
- # print "sent: \n" if $printer == 1;
- # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1;
- my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
- # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1;
- my @finalmutations_array= ();
- @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL";
- @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL";
- # print "alt = $alternate{$conformation}\n";
- my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array);
- my $cleancase = "UNCLEAN";
- $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/;
- $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL";
- $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL";
- $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ;
- # print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>;
- my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H",
- "+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO");
- next if (oneOf(@selects, $besttree) == 0);
- if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){
- $besttree = "$besttree / $tree_analysis";
- }
- $besttree = "NULL" if $treescore <= 0;
-
- while ($besttree =~ /[A-Z][A-Z]/){
- $besttree =~ s/([A-Z])([A-Z])/$1:$2/g;
- }
-
- if ($besttree !~ /NULL/){
- my @elements = ($besttree =~ /([A-Z])/g);
-
- foreach my $ele (@elements){
- # print "replacing $ele with $backReplacementArrTag{$ele}\n";
- $besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele};
- }
- }
- my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t";
- next if $endendstate =~ /NA\tNA\tNA/;
-
- print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t";
- # print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ;
-
- my @mutinfoarray =();
-
- foreach my $lnode (@$lagestnodes){
- my @pair = @$lnode;
- my $joint = "(".join(", ",@pair).")";
- my @nodemutarr = ();
- for my $p (@pair){
- foreach my $mut (@finalmutations_array){
- $mut =~ /node=([A-Z, \(\)]+)/;
- push @nodemutarr, $mut if $p eq $1;
- }
- @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree);
-
- # print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
- # print join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
- }
- }
-
- # print "G: $no_of_species\n";
-
- my @alignmentarr = ();
-
- foreach my $key (keys %alignment){
- push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key};
-
- }
- # print "alignmentarr = @alignmentarr"; <STDIN>;
-
- @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree);
- print SUMMARY join ("\t", @mutinfoarray ),"\t";
- print SUMMARY join(",",@alignmentarr),"\n";
- # print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5;
- # <STDIN> if scalar(@tags) < 5;
- # print $cleancase, "\n";
- # print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>;
- # print "summarized\n"; #<STDIN>;
-
-
-
- my %indelcatch = ();
- my %substcatch = ();
- my %typecatch = ();
- my %nodescatch = ();
- my $mutconcat = join("\t", @finalmutations_array)."\n";
- my %indelposcatch = ();
- my %subsposcatch = ();
-
- foreach my $fmut ( @finalmutations_array){
- # next if $fmut !~ /indeltype=[a-zA-Z]+/;
- #print RUN $fmut, "\n";
- $fmut =~ /node=([a-zA-Z, \(\)]+)/;
- my $lnode = $1;
- $nodescatch{$1}=1;
-
- if ($fmut =~ /type=substitution/){
- # print "fmut=$fmut\n";
- $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/;
- my $from=$1;
- # print "from=$from\n";
- my $to=$2;
- # print "to=$to\n";
- push @{$substcatch{$lnode}} , ("from:".$from." to:".$to);
- $fmut =~ /position=([0-9]+)/;
- push @{$subsposcatch{$lnode}}, $1;
- }
-
- if ($fmut =~ /insertion=[a-zA-Z\-]+/){
- $fmut =~ /insertion=([a-zA-Z\-]+)/;
- push @{$indelcatch{$lnode}} , $1;
- $fmut =~ /indeltype=([a-zA-Z]+)/;
- push @{$typecatch{$lnode}}, $1;
- $fmut =~ /position=([0-9]+)/;
- push @{$indelposcatch{$lnode}}, $1;
- }
- if ($fmut =~ /deletion=[a-zA-Z\-]+/){
- $fmut =~ /deletion=([a-zA-Z\-]+)/;
- push @{$indelcatch{$lnode}} , $1;
- $fmut =~ /indeltype=([a-zA-Z]+)/;
- push @{$typecatch{$lnode}}, $1;
- $fmut =~ /position=([0-9]+)/;
- push @{$indelposcatch{$lnode}}, $1;
- }
- }
-
- # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1;
- # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1;
- # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1;
- # #print RUN ".";
-
- # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
- # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
- # print colored ['red'],"finalmutations_array=\n" if $printer == 1;
- # foreach (@finalmutations_array) {
- # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ;
- # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ;
-
- # }# if $line =~ /cal/;# && $line =~ /chr4/;
-
- # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
- # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
- # print "tree analysis = $tree_analysis\n" if $printer == 1;
-
- # my $mutations = "@$mutations_array";
- next;
- for my $keys (@$nodes) {foreach my $key (@$keys){
- #print "key = $key, => $branches_hash->{$key}\n";
- }
- # print "x" x 50, "\n";
- }
- my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat);
- }
- }
- close BO;
- # print "now studying where we lost microsatellites:\n";
- # print "x" x 60,"\n";
- for my $reason (0 ... $#nextcounter){
- # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
- }
- # print "x" x 60,"\n";
- # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n";
- # print " we lost $keynotformedcounter contigs as they did not form the key, \n";
- # print "$contignotrightcounter contigs as they were not of the right species configuration\n";
- # print "$keynotfoundcounter contigs as they did not contain the microsats\n";
- # print "... In total we went through a file that had $dotcounter contigs...\n";
- # print join ("\n","remaining orth keys = ", (keys %orths),"");
- # print "------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ \n";
- # print "now printing counted trees: \n";
- if (scalar(keys %registeredTrees) > 0){
- foreach my $keyb ( sort (keys %registeredTrees) )
- {
- # print "$keyb : $registeredTrees{$keyb}\n";
- }
- }
-
-
- }
- close SUMMARY;
- my @summarizarr = ("+C=+C +R.+C -HCOR,+C",
- "+H=+H +R.+H -HCOR,+H",
- "-C=-C -R.-C +HCOR,-C",
- "-H=-H -R.-H +HCOR,-H",
- "+HC=+HC",
- "-HC=-HC",
- "+O=+O -HCOR,+O",
- "-O=-O +HCOR,-O",
- "+HCO=+HCO",
- "-HCO=-HCO",
- "+R=+R +R.+C +R.+H",
- "-R=-R -R.-C -R.-H");
- foreach my $line (@summarizarr){
- next if $line !~ /[A-Za-z0-9]/;
- # print $line;
- chomp $line;
- my @fields = split(/=/,$line);
- # print "title = $fields[0]\n";
- my @parts=split(/ +/, $fields[1]);
- my %par…
Large files files are truncated, but you can click here to view the full file