/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl
Perl | 5606 lines | 5065 code | 245 blank | 296 comment | 163 complexity | e64ba560e907a6b8d7818123ba23613a MD5 | raw file
Large files files are truncated, but you can click here to view the full file
- #!/usr/bin/perl
- use strict;
- use warnings;
- use Term::ANSIColor;
- use File::Basename;
- use IO::Handle;
- use Cwd;
- use File::Path;
- use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
- use File::Path qw(make_path remove_tree);
- use File::Temp qw/ tempfile tempdir /;
- my $tdir = tempdir( CLEANUP => 1 );
- chdir $tdir;
- my $dir = getcwd;
- #print "dir = $dir\n";
- #$ENV{'PATH'} .= ':' . dirname($0);
- my $date = `date`;
- my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV;
- if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }
- $tree_definition_all =~ s/\s+//g;
- $threshold_array =~ s/\s+//g;
- $allspeciesin =~ s/\s+//g;
- #-------------------------------------------------------------------------------
- # WHICH SPUTNIK USED?
- my $sputnikpath = ();
- $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
- #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
- #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
- #exit if !-e $sputnikpath;
- #$sputnikpath = "bx-sputnik" ;
- #print "ARGV input = @ARGV\n";
- #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
- #-------------------------------------------------------------------------------
- # RUNFILE
- #-------------------------------------------------------------------------------
- $distance = 1; #bp
- $distance++;
- my @tree_definitions=MakeTrees($tree_definition_all);
- my $allspeciesset = $tree_definition_all;
- $allspeciesset =~ s/[\(\) ]+//g;
- @allspecies = split(/,/,$allspeciesset);
- my @outputfiles = ();
- my $round = 0;
- #my $tdir = tempdir( CLEANUP => 0 );
- #chdir $tdir;
- foreach my $tree_definition (@tree_definitions){
- my @commas = ($tree_definition =~ /,/g) ;
- #print "commas = @commas\n"; <STDIN>;
- next if scalar(@commas) <= 1;
- #print "species_set = $species_set\n";
- $treeSpeciesNum = scalar(@commas) + 1;
- $species_set = $tree_definition;
- $species_set =~ s/[\)\( ;]+//g;
- #print "species_set = $species_set\n"; <STDIN>;
- $round++;
- #-------------------------------------------------------------------------------
- # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
- $threshold_array=~ s/,/_/g;
- my @thresharr = split("_",$threshold_array);
- @thresholds=@thresharr;
- #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
- #print "current dit=$dir\n";
- #-------------------------------------------------------------------------------
- # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
- my @chrfiles=();
-
- #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
- my $chromt=int(rand(10000));
- my $p_chr=$chromt;
-
-
- my @exactspeciesset_unarranged = split(/,/,$species_set);
- $tree_definition=~s/[\)\(, ]/\t/g;
- my @treespecies=split(/\t+/,$tree_definition);
- my @exactspecies=();
-
- foreach my $spec (@treespecies){
- foreach my $espec (@exactspeciesset_unarranged){
- push @exactspecies, $spec if $spec eq $espec;
- }
- }
- #print "exactspecies=@exactspecies\n";
- $focalspec = $exactspecies[0];
- my $arranged_species_set=join(".",@exactspecies);
- my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
- my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
- #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
- maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
- maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
- #print "done maf to axt conversion\n";
- my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
- artificial_axdata_inverter ($chr_name, $reverse_chr_name);
- #print "reverse_chr_name=$reverse_chr_name\n";
- #-------------------------------------------------------------------------------
- # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
- foreach my $direct ("reverse_direction","forward_direction"){
- $p_chr=$chromt;
- #print "direction = $direct\n";
- $p_chr = $p_chr."r" if $direct eq "reverse_direction";
- $p_chr = $p_chr if $direct eq "forward_direction";
- my $config = $species_set;
- $config=~s/,/./g;
- my @orgs = split(/\./,$arranged_species_set);
- #print "ORGS= @orgs\n";
- my @tag=@orgs;
-
-
- my $tags = join(",", @tag);
- my @tags=@tag;
- chomp $p_chr;
- $tags = join("_", split(/,/, $tags));
- my $pchr = "chr".$p_chr;
-
- my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
- my @sp_tags = ();
-
- # print "$ptag _ orthfile\n"; <STDIN>;
- #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
- foreach my $sp (@tag){
- push(@sp_tags, ($sp.".".$ptag));
- }
-
- my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
- my @presp_tags = ();
-
- foreach my $sp (@tag){
- push(@presp_tags, ($sp.".".$preptag));
- }
-
- my $resultdir = "";
- my $orthdir = "";
- my $filtereddir = "";
- my $pipedir = "";
-
- my @title_queries = ();
- push(@title_queries, "^[0-9]+");
- my $sep="\\s";
- for my $or (0 ... $#orgs){
- my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
- #$title =~ s/chr\\+\\s+\+/chr/g;
- push(@title_queries, $title);
- }
- my $title_query = join($sep, @title_queries);
- #print "title_queries=@title_queries\n";
- #print "query = >$title_query<\n";
- #print "orgs = @orgs\n";
- #-------------------------------------------------------------------------------
- # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
- my $t1input = $pchr.".".$arranged_species_set.".net.axt";
-
- my @t1outputs = ();
-
- foreach my $sp (@presp_tags){
- push(@t1outputs, $sp."_gap_op");
- }
-
-
-
- multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query);
- #print "t1outputs=@t1outputs\n";
- #print "done t1\n"; <STDIN>;
- #-------------------------------------------------------------------------------
- #START T2.PL
-
- my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = ();
-
- for my $t (0 ... scalar(@tags)-1){
- multi_species_t2($t1outputs[$t], $tag[$t]);
- }
- #-------------------------------------------------------------------------------
- #START T2.2.PL
-
- my @temp_tags = @tag;
-
- foreach my $sp (@presp_tags){
- my $t2input = $sp."_nogap_op_unrand";
- multi_species_t2_2($t2input, shift(@temp_tags));
- }
- undef (@temp_tags);
-
- #-------------------------------------------------------------------------------
- #START SPUTNIK
-
- my @jobIDs = ();
- @temp_tags = @tag;
- my @sput_filelist = ();
-
- foreach my $sp (@presp_tags){
- #print "sp = $sp\n";
- my $sputnikoutput = $pipedir.$sp."_sput_op0";
- my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
- push(@sput_filelist, $sputnikinput);
- my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
- # print "$sputnikcommand\n";
- my @sputnikcommand_system = $sputnikcommand;
- system(@sputnikcommand_system);
- }
-
- #-------------------------------------------------------------------------------
- #START SPUTNIK OUTPUT CORRECTOR
-
- foreach my $sp (@presp_tags){
- my $corroutput = $pipedir.$sp."_sput_op1";
- my $corrinput = $pipedir.$sp."_sput_op0";
- sputnikoutput_corrector($corrinput,$corroutput);
-
- my $t4output = $pipedir.$sp."_sput_op2";
- multi_species_t4($corroutput,$t4output);
-
- my $t5output = $pipedir.$sp."_sput_op3";
- multi_species_t5($t4output,$t5output);
- #print "done t5.pl for $sp\n";
-
- my $t6output = $pipedir.$sp."_sput_op4";
- multi_species_t6($t5output,$t6output,scalar(@orgs));
- }
- #-------------------------------------------------------------------------------
- #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
-
- foreach my $sp (@presp_tags){
- my $t9output = $pipedir.$sp."_gap_op_unrand_match";
- my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
- my $t9micro = $pipedir.$sp."_sput_op4";
- t9($t9micro,$t9sequence,$t9output);
-
- my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
- my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
- t9($t9micro,$t9sequence2,$t9output2);
- }
- #print "done both t9.pl for all orgs\n";
-
- #-------------------------------------------------------------------------------
- # FIND COMPOUND MICROSATELLITES
-
- @jobIDs = ();
- my $species_counter = 0;
-
- foreach my $sp (@presp_tags){
- my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
- my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
- my $input_micro = $pipedir.$sp."_sput_op4";
- my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
- multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
- $species_counter++;
- }
-
- #-------------------------------------------------------------------------------
- # READING AND FILTERING SIMPLE MICROSATELLITES
- my $spcounter2=0;
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$spcounter2];
- $spcounter2++;
- my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
- my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
- my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
- }
-
- #-------------------------------------------------------------------------------
- # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
-
- $species_counter = 0;
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
- my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
- my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
- my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
- multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
- $species_counter++;
- }
- #-------------------------------------------------------------------------------
- # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM..
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
- my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
- my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
- multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
- $species_counter++;
- }
- #-------------------------------------------------------------------------------
- # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
-
- my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
- my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
- my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
- my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
- multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
- $species_counter++;
- }
- #print "done filtering both simple and compound microsatellites \n";
-
- #-------------------------------------------------------------------------------
-
- my @combinedarray = ();
- my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
- my @combinedarray_tags = ("mono", "di", "tri", "tetra");
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
- push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
- $species_counter++;
- }
-
- #-------------------------------------------------------------------------------
- # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION
- my $sp_counter = 0;
- foreach my $sp (@sp_tags){
- my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
- my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
- `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
- for my $i (0 ... 3){
- `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;
- }
- $sp_counter++;
- }
- #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
-
- #-------------------------------------------------------------------------------
- # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
- # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
- @temp_tags = @sp_tags;
- my $increment = 1000000;
- my @splist = ();
- my $targetdir = $pipedir;
- $species_counter=0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- $species_counter++;
- my $localtag = shift @temp_tags;
- my $locallist = $targetdir.$localtag."_".$p_chr."_list";
- push(@splist, $locallist);
- my $input = $pipedir.$presp."_nogap_op_unrand2_match";
- chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
- }
-
-
- my @unionarray = ();
- #print "splist=@splist\n";
- #-------------------------------------------------------------------------------
- # FIND INTERRUPTED MICROSATELLITES
-
- $species_counter = 0;
-
- for my $i (0 .. $#combinedarray){
-
- @jobIDs = ();
- open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
-
- while (my $sp1 = <JLIST1>){
- #print "$splist[$i]: sp1=$sp1\n";
- chomp $sp1;
-
- for my $j (0 ... $#combinedarray_tags){
- my $interr = $sp1."_interr_".$combinedarray_tags[$j];
- my $simple = $sp1."_simple_".$combinedarray_tags[$j];
- push @{$unionarray[$i]}, $interr, $simple;
- multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8");
- }
- }
- $species_counter++;
- }
- close JLIST1;
- #-------------------------------------------------------------------------------
- # REUNION AND ZIPPING BEFORE T10.PL
-
- my @allarray = ();
-
- for my $i (0 ... $#sp_tags){
- my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
- unlink $localfile if -e $localfile;
- push(@allarray, $localfile);
-
- my $unfiltered_localfile= $localfile."_unfiltered";
- my $residue_localfile= $localfile."_residue";
-
- unlink $unfiltered_localfile;
- #unlink $unfiltered_localfile;
- for my $j (0 ... $#{$unionarray[$i]}){
- #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n";
- `cat $unionarray[$i][$j] >> $unfiltered_localfile`;
- unlink $unionarray[$i][$j];
- }
-
- multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
- my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
- my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
- my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
-
- `cat $analyzed_compound >> $localfile`;
- }
- #-------------------------------------------------------------------------------
- # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
-
- my $toescape=0;
-
-
- for my $i (0 ... $#sp_tags){
- my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
- $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
- my $direction = $1;
- #print "localfile = $localfile , direction = $direction\n";
- # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
- $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
- last if $direction =~ /chr[0-9a-zA-Z]+r/;
- my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
- my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
- my $reverselocal = $localfile;
- $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
- merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags));
- #-------------------------------------------------------------------------------
- my $forward_separate = $localfile."_separate";
- my $reverse_separate = $reverselocal."_separate";
- my $diff = $forward_separate."_diff";
- my $miss = $forward_separate."_miss";
- my $common = $forward_separate."_common";
- forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags));
- #-------------------------------------------------------------------------------
- my $symmetrical_file = $localfile."_symmetrical";
- my $merged_file = $localfile."_merged";
- #print "cating: $merged_file $common into -> $symmetrical_file \n";
- `cat $merged_file $common > $symmetrical_file`;
- #-------------------------------------------------------------------------------
- my $t10output = $symmetrical_file."_fin_hit_all_2";
- new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
- #-------------------------------------------------------------------------------
- }
- next if $toescape == 1;
- #------------------------------------------------------------------------------------------------
- # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
-
-
- my @micros_array = ();
- my $sampletag = ();
- for my $i (0 ... $#sp_tags){
- my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
- push(@micros_array, $finhitFile);
- $sampletag = $sp_tags[$i];
- }
- #$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
- #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
- my $orthfiletemp = $ptag."_orthfile";
- my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);
- my $maskedorthfiletemp = $ptag."_orthfile_masked";
- qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);
- push @outputfiles , $maskedorthfiletemp;
- }
- $date = `date`;
- }
- `cat @outputfiles > $orthfile`;
- my $rootdir = $dir;
- $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
- chdir $rootdir;
- remove_tree($dir);
- #print "date = $date\n";
- #remove_tree($tdir);
- #------------------------------------------------------------------------------------------------
- #------------------------------------------------------------------------------------------------
- #------------------------------------------------------------------------------------------------
- #------------------------------------------------------------------------------------------------
- #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
- sub maftoAxt_multispecies {
- #print "in maftoAxt_multispecies : got @_\n";
- my $fname=$_[0];
- open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
- my $treedefinition = $_[1];
- open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
- my $counter = 0;
- my $exactspeciesset = $_[3];
- my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
-
- $treedefinition=~s/[\)\(, ]/\t/g;
- my @species=split(/\t+/,$treedefinition);
- my @exactspecies=();
-
- foreach my $spec (@species){
- foreach my $espec (@exactspeciesset_unarranged){
- push @exactspecies, $spec if $spec eq $espec;
- }
- }
- #print "exactspecies=@exactspecies\n";
-
- ###########
- my $select = 2;
- #select = 1 if all species need sequences to be present for each block otherwise, it is 0
- #select = 2 only the allowed set make up the alignment. use the removeset
- # information to detect alignmenets that have other important genomes aligned.
- ###########
- my @allowedset = ();
- @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
- @allowedset = join("_",0,@species) if $select == 1;
- #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
- @allowedset = join("_",0,@exactspecies) if $select == 2;
- #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
-
- my $start = 0;
- my @sequences = ();
- my @titles = ();
- my $species_counter = "0";
- my $countermatch = 0;
- my $outsideSpecies=0;
-
- while(my $line = <IN>){
- # print $line;
- next if $line =~ /^#/;
- next if $line =~ /^i/;
- chomp $line;
- my @fields = split(/\s+/,$line);
- chomp $line;
- if ($line =~ /^a /){
- $start = 1;
- }
-
- if ($line =~ /^s /){
-
- foreach my $sp (@allspecies){
- # print "checking species $sp\n";
- if ($fields[1] =~ /$sp/){
- $species_counter = $species_counter."_".$sp;
- push(@sequences, $fields[6]);
- my @sp_info = split(/\./,$fields[1]);
- my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
- push(@titles, $title);
- # print "species_counter = $species_counter\n";
- }
- }
- }
-
- if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
- # print "species_counter = $species_counter\n";
- my $arranged = reorderSpecies($species_counter, @allspecies);
- my $stopper = 1;
- my $arrno = 0;
-
- # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
- if (scalar(@sequences) == scalar(@exactspecies)){
- foreach my $set (@allowedset){
- # print "testing $arranged against $set\n";
- if ($arranged eq $set){
- $stopper = 0; last;
- }
- $arrno++;
- }
- }
- else{
- $stopper = 1;
- }
-
-
- if ($stopper == 0) {
- @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
- @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
- my $filteredseq = filter_gaps(@sequences);
-
- if ($filteredseq ne "SHORT"){
- #print "printing"; <STDIN>;
- $counter++;
- print OUT join (" ",$counter, @titles), "\n";
- print OUT $filteredseq, "\n";
- print OUT "\n";
- $countermatch++;
- }
- }
- else{ #print "nexting\n";<STDIN>;
- }
-
- @sequences = (); @titles = (); $start = 0;$species_counter = "0";
- next;
-
- }
- }
- # print "countermatch = $countermatch\n";
- }
- sub reorderSpecies{
- my @inarr=@_;
- my $currSpecies = shift (@inarr);
- my $ordered_species = 0;
- my @species=@inarr;
- #print "species = @species\n";
- foreach my $order (@species){
- $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
- }
- return $ordered_species;
- }
- sub filter_gaps{
- my @sequences = @_;
- # print "sequences sent are @sequences\n";
- my $seq_length = length($sequences[0]);
- my $seq_no = scalar(@sequences);
- my $allgaps = ();
- for (1 ... $seq_no){
- $allgaps = $allgaps."-";
- }
-
- my @seq_array = ();
- my $seq_counter = 0;
- foreach my $seq (@sequences){
- # my @sequence = split(/\s*/,$seq);
- $seq_array[$seq_counter] = [split(/\s*/,$seq)];
- # push @seq_array, [@sequence];
- $seq_counter++;
- }
- my $g = 0;
- while ( $g < $seq_length){
- last if (!exists $seq_array[0][$g]);
- my $bases = ();
- for my $u (0 ... $#seq_array){
- $bases = $bases.$seq_array[$u][$g];
- }
- # print $bases, "\n";
- if ($bases eq $allgaps){
- # print "bases are $bases, position is $g \n";
- for my $seq (@seq_array){
- splice(@$seq , $g, 1);
- }
- }
- else {
- $g++;
- }
- }
-
- my @outs = ();
-
- foreach my $seq (@seq_array){
- push(@outs, join("",@$seq));
- }
- return "SHORT" if length($outs[0]) <=100;
- return (join("\n", @outs));
- }
- sub allowedSetOfSpecies{
- my @allowed_species = split(/_/,$_[0]);
- unshift @allowed_species, 0;
- # print "allowed set = @allowed_species \n";
- my @output = ();
- for (0 ... scalar(@allowed_species) - 4){
- push(@output, join("_",@allowed_species));
- pop @allowed_species;
- }
- return join(";",reverse(@output));
- }
- sub orderInfo{
- my @info = split(/;/,$_[0]);
- # print "info = @info";
- my @old = split(/_/,$_[1]);
- my @new = split(/_/,$_[2]);
- shift @old; shift @new;
- my @outinfo = ();
- foreach my $spe (@new){
- for my $no (0 ... $#old){
- if ($spe eq $old[$no]){
- push(@outinfo, $info[$no]);
- }
- }
- }
- # print "outinfo = @outinfo \n";
- return join(";", @outinfo);
- }
- #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
- #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
- sub artificial_axdata_inverter{
- open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
- open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
- my $linecounter=0;
- while (my $line = <IN>){
- $linecounter++;
- #print "$linecounter\n";
- chomp $line;
- my $final_line = $line;
- my $trycounter = 0;
- if ($line =~ /^[a-zA-Z\-]/){
- # while ($final_line eq $line){
- my @fields = split(/\s*/,$line);
-
- $final_line = join("",reverse(@fields));
- # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
- # $trycounter++;
- # print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
- # }
- }
-
- # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
- if ($line =~ /^[0-9]/){
- $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
- $final_line = $line;
- }
- print OUT $final_line,"\n";
- #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
- }
- close OUT;
- }
- #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
- #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
- sub multi_species_t1 {
- my $input1 = $_[0];
- #print "@_\n"; <STDIN>;
- my @tags = split(/_/, $_[1]);
- my @outputs = split(/,/, $_[2]);
- my $title_query = $_[3];
- my @handles = ();
-
- open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
- my $i = 0;
- foreach my $path (@outputs){
- $handles[$i] = IO::Handle->new();
- open ($handles[$i], ">$path") or die "Can't open $path : $!";
- $i++;
- }
-
- my $curdef;
- my $start = 0;
-
- while (my $line = <FILEB> ) {
- if ($line =~ /^\d/){
- $line =~ s/ +/\t/g;
- my @fields = split(/\s+/, $line);
- if (($line =~ /$title_query/)){
- my $title = $line;
- my $counter = 0;
- foreach my $tag (@tags){
- $line = <FILEB>;
- print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;
- $counter++;
- }
- }
- else{
- foreach my $tag (@tags){
- my $tine = <FILEB>;
- }
- }
-
- }
- }
-
- foreach my $hand (@handles){
- $hand->close();
- }
-
- close FILEB;
- }
- #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
- #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
- sub multi_species_t2{
-
- my $input = $_[0];
- my $species = $_[1];
- my $output1 = $input."_unr";
-
- #------------------------------------------------------------------------------------------
- open (FILEF1, "<$input") or die "Cannot open file $input :$!";
- open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
-
- my $line1 = <FILEF1>;
-
- while($line1){
- {
- # chomp($line);
- if ($line1 =~ (m/^\>$species/)){
- chomp($line1);
- print FILEF2 $line1;
- $line1 = <FILEF1>;
- chomp($line1);
- print FILEF2 "\t", $line1,"\n";
- }
- }
- $line1 = <FILEF1>;
- }
-
- close FILEF1;
- close FILEF2;
- #------------------------------------------------------------------------------------------
-
- my $output2 = $output1."and";
- my $output3 = $output1."and2";
- open(IN,"<$output1");
- open (FILEF3, ">$output2");
- open (FILEF4, ">$output3");
-
-
- while (<IN>){
- my $line = $_;
- chomp($line);
- my @fields=split (/\t/, $line);
- # print $line,"\n"; <STDIN>;
- if($line !~ /random/){
- print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
- print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";
- }
- }
-
-
- close IN;
- close FILEF3;
- close FILEF4;
- unlink $output1;
-
- #------------------------------------------------------------------------------------------
- # OLD T3.PL RUDIMENT
-
- my $t3output = $output2;
- $t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
-
- open(IN,"<$output2");
- open(OUTA,">$t3output");
-
-
- while (<IN>){
- s/-//g unless /^>/;
- print OUTA;
- }
-
- close IN;
- close OUTA;
- #------------------------------------------------------------------------------------------
- }
- #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
- #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx
- sub multi_species_t2_2{
- #print "IN multi_species_t2_2 : @_\n";
- my $input = $_[0];
- my $species = $_[1];
- my $output1 = $input."2";
-
-
- open (FILEF1, "<$input");
- open (FILEF2, ">$output1");
-
- my $line1 = <FILEF1>;
-
- while($line1){
- {
- # chomp($line);
- if ($line1 =~ (m/^\>$species/)){
- chomp($line1);
- print FILEF2 $line1;
- $line1 = <FILEF1>;
- chomp($line1);
- print FILEF2 "\t", $line1,"\n";
- }
- }
- $line1 = <FILEF1>;
- }
-
- close FILEF1;
- close FILEF2;
- }
- #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx
- #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
- sub sputnikoutput_corrector{
- my $input = $_[0];
- my $output = $_[1];
- open(IN,"<$input") or die "Cannot open file $input :$!";
- open(OUT,">$output") or die "Cannot open file $output :$!";
- my $tine;
- while (my $line=<IN>){
- if($line =~/length /){
- $tine = $line;
- $tine =~ s/\s+/\t/g;
- my @fields = split(/\t/,$tine);
- if ($fields[6] > 60){
- print OUT $line;
- $line = <IN>;
-
- while (($line !~ /nucleotide/) && ($line !~ /^>/)){
- chomp $line;
- print OUT $line;
- $line = <IN>;
- }
- print OUT "\n";
- print OUT $line;
- }
- else{
- print OUT $line;
- }
- }
- else{
- print OUT $line;
- }
- }
- close IN;
- close OUT;
- }
- #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
- #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
- sub multi_species_t4{
- # print "multi_species_t4 : @_\n";
- my $input = $_[0];
- my $output = $_[1];
- open (FILEA, "<$input");
- open (FILEB, ">$output");
-
- my $line = <FILEA>;
-
- while ($line) {
- # chomp $line;
- if ($line =~ />/) {
- chomp $line;
- print FILEB $line, "\n";
- }
-
-
- if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
- chomp $line;
- print FILEB $line, " " ;
- $line = <FILEA>;
- chomp $line;
- print FILEB $line,"\n";
- }
-
- $line = <FILEA>;
- }
-
-
- close FILEA;
- close FILEB;
- }
- #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
- #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
- sub multi_species_t5{
-
- my $input = $_[0];
- my $output = $_[1];
-
- open(FILEB,"<$input");
- open(FILEC,">$output");
-
- my $curdef;
-
- while (my $line = <FILEB> ) {
-
- if ($line =~ /^>/){
- chomp $line;
- $curdef = $line;
- next;
- }
-
- if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
- print FILEC $curdef," ",$line;
- }
-
- }
-
-
- close FILEB;
- close FILEC;
- }
- #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
- #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
- sub multi_species_t6{
- my $input = $_[0];
- my $output = $_[1];
- my $focalstrand=$_[3];
- # print "inpput = @_\n";
- open (FILE, "<$input");
- open (FILE_MICRO, ">$output");
- my $linecounter=0;
- while (my $line = <FILE>){
- $linecounter++;
- chomp $line;
- #print "line = $line\n";
- #MONO#
- $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
- my $strand=$1;
- my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
- #print "line = $line\n";
- my $specfieldsend = 2 + ($no_of_species*4) - 1;
- my @fields = split(/\s+/, $line);
- my @speciesdata = @fields[0 ... $specfieldsend];
- $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
- my ($tide, $start, $end) = ($1, $2, $3);
- #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
- if($line =~ /mononucleotide/){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n";
- }
- #DI#
- elsif($line =~ /dinucleotide/){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n";
- }
- #TRI#
- elsif($line =~ /trinucleotide/ ){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n";
- }
- #TETRA#
- elsif($line =~ /tetranucleotide/){
- print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n";
- }
- #PENTA#
- elsif($line =~ /pentanucleotide/){
- #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n";
- }
- else{
- # print "not: @fields\n";
- }
- }
- # print "linecounter=$linecounter\n";
- close FILE;
- close FILE_MICRO;
- }
- sub mono {
- my $st = $_[0];
- my $tp = unpack "A1"x(length($st)/1),$st;
- my $var1 = substr($tp, 0, 1);
- return join ("\t", $var1);
- }
- sub di {
- my $st = $_[0];
- my $tp = unpack "A2"x(length($st)/2),$st;
- my $var1 = substr($tp, 0, 2);
- return join ("\t", $var1);
- }
- sub tri {
- my $st = $_[0];
- my $tp = unpack "A3"x(length($st)/3),$st;
- my $var1 = substr($tp, 0, 3);
- return join ("\t", $var1);
- }
- sub tetra {
- my $st = $_[0];
- my $tp = unpack "A4"x(length($st)/4),$st;
- my $var1 = substr($tp, 0, 4);
- return join ("\t", $var1);
- }
- sub penta {
- my $st = $_[0];
- my $tp = unpack "A5"x(length($st)/5),$st;
- my $var1 = substr($tp, 0, 5);
- return join ("\t", $var1);
- }
- #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
- #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
- sub t9{
- my $input1 = $_[0];
- my $input2 = $_[1];
- my $output = $_[2];
-
-
- open(IN1,"<$input1") if -e $input1;
- open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
- open(OUT,">$output") or die "cannot open file $_[2] : $!";
-
-
- my %seen = ();
- my $prevkey = 0;
-
- if (-e $input1){
- while (my $line = <IN1>){
- chomp($line);
- my @fields = split(/\t/,$line);
- my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
- # print "key in t9 = $key1\n";
- $seen{$key1}++ if ($prevkey ne $key1) ;
- $prevkey = $key1;
- }
- # print "done first hash\n";
- close IN1;
- }
-
- while (my $line = <IN2>){
- # print $line, "**\n";
- if (-e $input1){
- chomp($line);
- my @fields = split(/\t/,$line);
- my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
- if (exists $seen{$key2}){
- print OUT "$line\n" ;
- delete $seen{$key2};
- }
- }
- else {
- print OUT "$line\n" ;
- # print "$line\n" ;
- }
- }
-
- close IN2;
- close OUT;
- }
- #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
- sub multiSpecies_compound_microsat_hunter3{
-
- my $input1 = $_[0]; ###### the *_sput_op4_ii file
- my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
- my $output1 = $_[2]; ###### plain microsatellite file
- my $output2 = $_[3]; ###### compound microsatellite file
- my $org = $_[4]; ###### 1 or 2
- $no_of_species = $_[5];
- #print "IN multiSpecies_compound_microsat_hunter3: @_\n";
- #my @tags = split(/\t/,$info);
- sub compoundify;
- open(IN,"<$input1") or die "Cannot open file $input1 $!";
- open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
- open(OUT,">$output1") or die "Cannot open file $output1 $!";
- open(OUT2,">$output2") or die "Cannot open file $output2 $!";
- $infocord = 2 + (4*$no_of_species) - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
- my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
-
- my @thresholds = ("0");
- push(@thresholds, split(/_/,$_[6]));
- sub thresholdCheck;
- my %micros = ();
- while (my $line = <IN>){
- # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
- next if $line =~ /\t\t/;
- if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- my $key = join("\t",$1, $2, $3, $4, $5);
- # print $key, "#-#-#-#-#-#-#-#\n";
- push (@{$micros{$key}},$line);
- }
- else{
- }
- }
- close IN;
- my @deletedlines = ();
-
- my $linecount = 0;
-
- while(my $sine = <SEQ>){
- my %microstart=();
- my %microend=();
-
- my @sields = split(/\t/,$sine);
-
- my $key = ();
-
- if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
- $key = join("\t",$1, $2, $3, $4, $5);
- # print $key, "<-<-<-<-<-<-<-<\n";
- }
- else{
- }
-
- if (exists $micros{$key}){
- $linecount++;
- my @microstring = @{$micros{$key}};
- my @tempmicrostring = @{$micros{$key}};
-
- foreach my $line (@tempmicrostring){
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- push (@{$microstart{$start}},$line);
- push (@{$microend{$end}},$line);
- }
- my $firstflag = 'down';
- while( my $line =shift(@microstring)){
- # print "-----------\nline = $line ";
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- my $startmicro = $line;
- my $endmicro = $line;
-
- # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
-
- delete ($microstart{$start});
- delete ($microend{$end});
- my $flag = 'down';
- my $startflag = 'down';
- my $endflag = 'down';
- my $prestart = $start - $distance;
- my $postend = $end + $distance;
- my @compoundlines = ();
- my %compoundhash = ();
- push (@compoundlines, $line);
- push (@{$compoundhash{$line}},$line);
- my $startrank = 1;
- my $endrank = 1;
-
- while( ($startflag eq "down") || ($endflag eq "down") ){
- if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
- # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
- last;
- }
-
- $firstflag = "up";
- if ($startflag eq "down"){
- for my $i ($prestart ... $start){
-
- if(exists $microend{$i}){
- chomp $microend{$i}[0];
- if(exists $compoundhash{$microend{$i}[0]}) {next;}
- # print "sending from microend $startmicro, $microend{$i}[0] |||\n";
- if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
- push(@compoundlines, $microend{$i}[0]);
- # print "accepted\n";
- my @tields = split(/\t/,$microend{$i}[0]);
- $startmicro = $microend{$i}[0];
- chomp $startmicro;
- $start = $tields[$startcord];
- $flag = 'down';
- $startrank++;
- # print "startcompund = $microend{$i}[0]\n";
- delete $microend{$i};
- delete $microstart{$start};
- $startflag = 'down';
- $prestart = $start - $distance;
- last;
- }
- else{
- $flag = 'up';
- $startflag = 'up';
- }
- }
- else{
- $flag = 'up';
- $startflag = 'up';
- }
- }
- }
-
- $endrank = $startrank;
-
- if ($endflag eq "down"){
- for my $i ($end ... $postend){
-
- if(exists $microstart{$i} ){
- chomp $microstart{$i}[0];
- if(exists $compoundhash{$microstart{$i}[0]}) {next;}
- # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
-
- if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
- push(@compoundlines, $microstart{$i}[0]);
- # print "accepted\n";
- my @tields = split(/\t/,$microstart{$i}[0]);
- $end = $tields[$endcord]-0;
- $endmicro = $microstart{$i}[0];
- $endrank++;
- chomp $endmicro;
- $flag = 'down';
- # print "endcompund = $microstart{$i}[0]\n";
- delete $microstart{$i};
- delete $microend{$end};
- shift @microstring;
- $postend = $end + $distance;
- $endflag = 'down';
- last;
- }
- else{
- $flag = 'up';
- $endflag = 'up';
- }
- }
- else{
- $flag = 'up';
- $endflag = 'up';
- }
- }
- }
- # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
- } #end while( $flag eq "down")
- # print "compoundlines = @compoundlines \n";
- if (scalar (@compoundlines) == 1){
- print OUT $line,"\n";
- }
- if (scalar (@compoundlines) > 1){
- my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
- # print $compoundline,"\n";
- print OUT2 $compoundline,"\n";
- }
- } #end foreach my $line (@microstring){
- } #if (exists $micros{$key}){
-
-
- }
-
- close OUT;
- close OUT2;
- }
- #------------------------------------------------------------------------------------------------
- sub compoundify{
- my ($compoundlines, $sequence) = @_;
- # print "\nfound to compound : @$compoundlines and$sequence \n";
- my $noOfComps = @$compoundlines;
- # print "Number of elements in hash is $noOfComps\n";
- my @starts;
- my @ends;
- foreach my $line (@$compoundlines){
- # print "compoundify.. line = $line \n";
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[$startcord];
- my $end = $fields[$endcord];
- # print "start = $start, end = $end \n";
- push(@starts, $start);
- push(@ends,$end);
- }
- my @temp = @$compoundlines;
- my $startline=$temp[0];
- my @mields = split(/\t/,$startline);
- my $startcoord = $mields[$startcord];
- my $startgapsign=$mields[$endcord];
- my @startsorted = sort { $a <=> $b } @starts;
- my @endsorted = sort { $a <=> $b } @ends;
- my @intervals;
- for my $end (0 ... (scalar(@endsorted)-2)){
- my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
- push(@intervals,$interval);
- # print "interval = $interval =\n";
- # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
- }
- push(@intervals,"");
- my $compoundmicrosat=();
- my $multiunit="";
- foreach my $line (@$compoundlines){
- my @fields = split(/\t/,$line);
- my $component="[".$fields[$microsatcord]."]".shift(@intervals);
- $compoundmicrosat=$compoundmicrosat.$component;
- $multiunit=$multiunit."[".$fields[$motifcord]."]";
- # print "multiunit = $multiunit\n";
- }
- my $compoundcopy = $compoundmicrosat;
- $compoundcopy =~ s/\[|\]//g;
- my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
-
-
- my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
- return $compoundline;
- }
- #------------------------------------------------------------------------------------------------
- sub identityMatch_thresholdCheck{
- my $line1 = $_[0];
- my $line2 = $_[1];
- my $rank = $_[2];
- my @lields1 = split(/\t/,$line1);
- my @lields2 = split(/\t/,$line2);
- # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
-
- if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
- my $probe = $lields1[$motifcord].$lields1[$motifcord];
- #print "$probe :: $lields2[$motifcord]\n";
- return "proceed" if $probe =~ /$lields2[$motifcord]/;
- #print "line recieved\n";
- if ($rank ==1){
- return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
- }
- else {
- return "proceed" if thresholdCheck($line2) eq "proceed";
- return "stop";
- }
- }
- else{
- if ($rank ==1){
- return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
- }
- else {
- return "proceed" if thresholdCheck($line2) eq "proceed";
- return "stop";
- }
- }
- return "stop";
- }
- #------------------------------------------------------------------------------------------------
- sub thresholdCheck{
- my @checkthresholds=(0,@thresholds);
- #print "IN thresholdCheck: @_\n";
- my $line = $_[0];
- my @lields = split(/\t/,$line);
- return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
- return "stop";
- }
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
- sub multiSpecies_filtering_interrupted_microsats{
- # print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
- my $unfiltered = $_[0];
- my $filtered = $_[1];
- my $residue = $_[2];
- my $no_of_species = $_[5];
- open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
- open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
- open(RES,">$residue") or die "Cannot open file $residue: $!";
-
- $infocord = 2 + (4*$no_of_species) - 1;
- $startcord = 2 + (4*$no_of_species) + 2 - 1;
- $strandcord = 2 + (4*$no_of_species) + 3 - 1;
- $endcord = 2 + (4*$no_of_species) + 4 - 1;
- $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
- $motifcord = 2 + (4*$no_of_species) + 6 - 1;
-
-
- my @sub_thresholds = (0);
-
- push(@sub_thresholds, split(/_/,$_[3]));
- my @thresholds = (0);
-
- push(@thresholds, split(/_/,$_[4]));
-
- while (my $line = <UNF>) {
- next if $line !~ /[a-z]/;
- #print $line;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $motif = $fields[$motifcord];
- my $realmotif = $motif;
- #print "motif = $motif\n";
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- my @motifs = split(/\]/,$motif);
- $realmotif = $motifs[0];
- }
- # print "realmotif = $realmotif";
- my $motif_size = length($realmotif);
-
- my $microsat = $fields[$microsatcord];
- # print "microsat = $microsat\n";
- $microsat =~ s/^\[|\]$//sg;
- my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
-
- $microsat = join("",@microsats);
- if (length($microsat) < $thresholds[$motif_size]) {
- # print length($microsat)," < ",$thresholds[$motif_size],"\n";
- print RES $line,"\n"; next;
- }
- my @lengths = ();
- foreach my $mic (@microsats){
- push(@lengths, length($mic));
- }
- if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
- # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n";
- print RES $line,"\n"; next;}
- else {print FIL $line,"\n"; next;
- }
- }
- close FIL;
- close RES;
- }
- sub largest_microsat{
- my $counter = 0;
- my($max) = shift(@_);
- foreach my $temp (@_) {
- #print "finding largest array: $maxcounter \n";
- if($temp > $max){
- $max = $temp;
- }
- }
- return($max);
- }
- #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
- #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx…
Large files files are truncated, but you can click here to view the full file