PageRenderTime 44ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl

https://bitbucket.org/chapmanb/galaxy-central/
Perl | 5606 lines | 5065 code | 245 blank | 296 comment | 163 complexity | e64ba560e907a6b8d7818123ba23613a MD5 | raw file
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use Term::ANSIColor;
  5. use File::Basename;
  6. use IO::Handle;
  7. use Cwd;
  8. use File::Path;
  9. use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
  10. use File::Path qw(make_path remove_tree);
  11. use File::Temp qw/ tempfile tempdir /;
  12. my $tdir = tempdir( CLEANUP => 1 );
  13. chdir $tdir;
  14. my $dir = getcwd;
  15. #print "dir = $dir\n";
  16. #$ENV{'PATH'} .= ':' . dirname($0);
  17. my $date = `date`;
  18. my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV;
  19. if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }
  20. $tree_definition_all =~ s/\s+//g;
  21. $threshold_array =~ s/\s+//g;
  22. $allspeciesin =~ s/\s+//g;
  23. #-------------------------------------------------------------------------------
  24. # WHICH SPUTNIK USED?
  25. my $sputnikpath = ();
  26. $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
  27. #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
  28. #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
  29. #exit if !-e $sputnikpath;
  30. #$sputnikpath = "bx-sputnik" ;
  31. #print "ARGV input = @ARGV\n";
  32. #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
  33. #-------------------------------------------------------------------------------
  34. # RUNFILE
  35. #-------------------------------------------------------------------------------
  36. $distance = 1; #bp
  37. $distance++;
  38. my @tree_definitions=MakeTrees($tree_definition_all);
  39. my $allspeciesset = $tree_definition_all;
  40. $allspeciesset =~ s/[\(\) ]+//g;
  41. @allspecies = split(/,/,$allspeciesset);
  42. my @outputfiles = ();
  43. my $round = 0;
  44. #my $tdir = tempdir( CLEANUP => 0 );
  45. #chdir $tdir;
  46. foreach my $tree_definition (@tree_definitions){
  47. my @commas = ($tree_definition =~ /,/g) ;
  48. #print "commas = @commas\n"; <STDIN>;
  49. next if scalar(@commas) <= 1;
  50. #print "species_set = $species_set\n";
  51. $treeSpeciesNum = scalar(@commas) + 1;
  52. $species_set = $tree_definition;
  53. $species_set =~ s/[\)\( ;]+//g;
  54. #print "species_set = $species_set\n"; <STDIN>;
  55. $round++;
  56. #-------------------------------------------------------------------------------
  57. # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
  58. $threshold_array=~ s/,/_/g;
  59. my @thresharr = split("_",$threshold_array);
  60. @thresholds=@thresharr;
  61. #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
  62. #print "current dit=$dir\n";
  63. #-------------------------------------------------------------------------------
  64. # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
  65. my @chrfiles=();
  66. #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
  67. my $chromt=int(rand(10000));
  68. my $p_chr=$chromt;
  69. my @exactspeciesset_unarranged = split(/,/,$species_set);
  70. $tree_definition=~s/[\)\(, ]/\t/g;
  71. my @treespecies=split(/\t+/,$tree_definition);
  72. my @exactspecies=();
  73. foreach my $spec (@treespecies){
  74. foreach my $espec (@exactspeciesset_unarranged){
  75. push @exactspecies, $spec if $spec eq $espec;
  76. }
  77. }
  78. #print "exactspecies=@exactspecies\n";
  79. $focalspec = $exactspecies[0];
  80. my $arranged_species_set=join(".",@exactspecies);
  81. my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
  82. my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
  83. #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
  84. maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
  85. maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
  86. #print "done maf to axt conversion\n";
  87. my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
  88. artificial_axdata_inverter ($chr_name, $reverse_chr_name);
  89. #print "reverse_chr_name=$reverse_chr_name\n";
  90. #-------------------------------------------------------------------------------
  91. # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
  92. foreach my $direct ("reverse_direction","forward_direction"){
  93. $p_chr=$chromt;
  94. #print "direction = $direct\n";
  95. $p_chr = $p_chr."r" if $direct eq "reverse_direction";
  96. $p_chr = $p_chr if $direct eq "forward_direction";
  97. my $config = $species_set;
  98. $config=~s/,/./g;
  99. my @orgs = split(/\./,$arranged_species_set);
  100. #print "ORGS= @orgs\n";
  101. my @tag=@orgs;
  102. my $tags = join(",", @tag);
  103. my @tags=@tag;
  104. chomp $p_chr;
  105. $tags = join("_", split(/,/, $tags));
  106. my $pchr = "chr".$p_chr;
  107. my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
  108. my @sp_tags = ();
  109. # print "$ptag _ orthfile\n"; <STDIN>;
  110. #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
  111. foreach my $sp (@tag){
  112. push(@sp_tags, ($sp.".".$ptag));
  113. }
  114. my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
  115. my @presp_tags = ();
  116. foreach my $sp (@tag){
  117. push(@presp_tags, ($sp.".".$preptag));
  118. }
  119. my $resultdir = "";
  120. my $orthdir = "";
  121. my $filtereddir = "";
  122. my $pipedir = "";
  123. my @title_queries = ();
  124. push(@title_queries, "^[0-9]+");
  125. my $sep="\\s";
  126. for my $or (0 ... $#orgs){
  127. my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
  128. #$title =~ s/chr\\+\\s+\+/chr/g;
  129. push(@title_queries, $title);
  130. }
  131. my $title_query = join($sep, @title_queries);
  132. #print "title_queries=@title_queries\n";
  133. #print "query = >$title_query<\n";
  134. #print "orgs = @orgs\n";
  135. #-------------------------------------------------------------------------------
  136. # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
  137. my $t1input = $pchr.".".$arranged_species_set.".net.axt";
  138. my @t1outputs = ();
  139. foreach my $sp (@presp_tags){
  140. push(@t1outputs, $sp."_gap_op");
  141. }
  142. multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query);
  143. #print "t1outputs=@t1outputs\n";
  144. #print "done t1\n"; <STDIN>;
  145. #-------------------------------------------------------------------------------
  146. #START T2.PL
  147. my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = ();
  148. for my $t (0 ... scalar(@tags)-1){
  149. multi_species_t2($t1outputs[$t], $tag[$t]);
  150. }
  151. #-------------------------------------------------------------------------------
  152. #START T2.2.PL
  153. my @temp_tags = @tag;
  154. foreach my $sp (@presp_tags){
  155. my $t2input = $sp."_nogap_op_unrand";
  156. multi_species_t2_2($t2input, shift(@temp_tags));
  157. }
  158. undef (@temp_tags);
  159. #-------------------------------------------------------------------------------
  160. #START SPUTNIK
  161. my @jobIDs = ();
  162. @temp_tags = @tag;
  163. my @sput_filelist = ();
  164. foreach my $sp (@presp_tags){
  165. #print "sp = $sp\n";
  166. my $sputnikoutput = $pipedir.$sp."_sput_op0";
  167. my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
  168. push(@sput_filelist, $sputnikinput);
  169. my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
  170. # print "$sputnikcommand\n";
  171. my @sputnikcommand_system = $sputnikcommand;
  172. system(@sputnikcommand_system);
  173. }
  174. #-------------------------------------------------------------------------------
  175. #START SPUTNIK OUTPUT CORRECTOR
  176. foreach my $sp (@presp_tags){
  177. my $corroutput = $pipedir.$sp."_sput_op1";
  178. my $corrinput = $pipedir.$sp."_sput_op0";
  179. sputnikoutput_corrector($corrinput,$corroutput);
  180. my $t4output = $pipedir.$sp."_sput_op2";
  181. multi_species_t4($corroutput,$t4output);
  182. my $t5output = $pipedir.$sp."_sput_op3";
  183. multi_species_t5($t4output,$t5output);
  184. #print "done t5.pl for $sp\n";
  185. my $t6output = $pipedir.$sp."_sput_op4";
  186. multi_species_t6($t5output,$t6output,scalar(@orgs));
  187. }
  188. #-------------------------------------------------------------------------------
  189. #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
  190. foreach my $sp (@presp_tags){
  191. my $t9output = $pipedir.$sp."_gap_op_unrand_match";
  192. my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
  193. my $t9micro = $pipedir.$sp."_sput_op4";
  194. t9($t9micro,$t9sequence,$t9output);
  195. my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
  196. my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
  197. t9($t9micro,$t9sequence2,$t9output2);
  198. }
  199. #print "done both t9.pl for all orgs\n";
  200. #-------------------------------------------------------------------------------
  201. # FIND COMPOUND MICROSATELLITES
  202. @jobIDs = ();
  203. my $species_counter = 0;
  204. foreach my $sp (@presp_tags){
  205. my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
  206. my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
  207. my $input_micro = $pipedir.$sp."_sput_op4";
  208. my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
  209. multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
  210. $species_counter++;
  211. }
  212. #-------------------------------------------------------------------------------
  213. # READING AND FILTERING SIMPLE MICROSATELLITES
  214. my $spcounter2=0;
  215. foreach my $sp (@sp_tags){
  216. my $presp = $presp_tags[$spcounter2];
  217. $spcounter2++;
  218. my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
  219. my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
  220. my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
  221. multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
  222. }
  223. #-------------------------------------------------------------------------------
  224. # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
  225. $species_counter = 0;
  226. foreach my $sp (@sp_tags){
  227. my $presp = $presp_tags[$species_counter];
  228. my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
  229. my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
  230. my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
  231. my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
  232. multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
  233. $species_counter++;
  234. }
  235. #-------------------------------------------------------------------------------
  236. # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM..
  237. $species_counter = 0;
  238. foreach my $sp (@sp_tags){
  239. my $presp = $presp_tags[$species_counter];
  240. my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
  241. my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
  242. my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
  243. my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
  244. multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
  245. $species_counter++;
  246. }
  247. #-------------------------------------------------------------------------------
  248. # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
  249. $species_counter = 0;
  250. foreach my $sp (@sp_tags){
  251. my $presp = $presp_tags[$species_counter];
  252. my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
  253. my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
  254. my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
  255. multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
  256. my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
  257. my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
  258. my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
  259. multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
  260. my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
  261. my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
  262. my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
  263. multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
  264. $species_counter++;
  265. }
  266. #print "done filtering both simple and compound microsatellites \n";
  267. #-------------------------------------------------------------------------------
  268. my @combinedarray = ();
  269. my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
  270. my @combinedarray_tags = ("mono", "di", "tri", "tetra");
  271. $species_counter = 0;
  272. foreach my $sp (@sp_tags){
  273. my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
  274. push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
  275. $species_counter++;
  276. }
  277. #-------------------------------------------------------------------------------
  278. # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION
  279. my $sp_counter = 0;
  280. foreach my $sp (@sp_tags){
  281. my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
  282. my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
  283. my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
  284. my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
  285. `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
  286. for my $i (0 ... 3){
  287. `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;
  288. }
  289. $sp_counter++;
  290. }
  291. #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
  292. #-------------------------------------------------------------------------------
  293. # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
  294. # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
  295. @temp_tags = @sp_tags;
  296. my $increment = 1000000;
  297. my @splist = ();
  298. my $targetdir = $pipedir;
  299. $species_counter=0;
  300. foreach my $sp (@sp_tags){
  301. my $presp = $presp_tags[$species_counter];
  302. $species_counter++;
  303. my $localtag = shift @temp_tags;
  304. my $locallist = $targetdir.$localtag."_".$p_chr."_list";
  305. push(@splist, $locallist);
  306. my $input = $pipedir.$presp."_nogap_op_unrand2_match";
  307. chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
  308. }
  309. my @unionarray = ();
  310. #print "splist=@splist\n";
  311. #-------------------------------------------------------------------------------
  312. # FIND INTERRUPTED MICROSATELLITES
  313. $species_counter = 0;
  314. for my $i (0 .. $#combinedarray){
  315. @jobIDs = ();
  316. open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
  317. while (my $sp1 = <JLIST1>){
  318. #print "$splist[$i]: sp1=$sp1\n";
  319. chomp $sp1;
  320. for my $j (0 ... $#combinedarray_tags){
  321. my $interr = $sp1."_interr_".$combinedarray_tags[$j];
  322. my $simple = $sp1."_simple_".$combinedarray_tags[$j];
  323. push @{$unionarray[$i]}, $interr, $simple;
  324. multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8");
  325. }
  326. }
  327. $species_counter++;
  328. }
  329. close JLIST1;
  330. #-------------------------------------------------------------------------------
  331. # REUNION AND ZIPPING BEFORE T10.PL
  332. my @allarray = ();
  333. for my $i (0 ... $#sp_tags){
  334. my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
  335. unlink $localfile if -e $localfile;
  336. push(@allarray, $localfile);
  337. my $unfiltered_localfile= $localfile."_unfiltered";
  338. my $residue_localfile= $localfile."_residue";
  339. unlink $unfiltered_localfile;
  340. #unlink $unfiltered_localfile;
  341. for my $j (0 ... $#{$unionarray[$i]}){
  342. #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n";
  343. `cat $unionarray[$i][$j] >> $unfiltered_localfile`;
  344. unlink $unionarray[$i][$j];
  345. }
  346. multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
  347. my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
  348. my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
  349. my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
  350. `cat $analyzed_compound >> $localfile`;
  351. }
  352. #-------------------------------------------------------------------------------
  353. # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
  354. my $toescape=0;
  355. for my $i (0 ... $#sp_tags){
  356. my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
  357. $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
  358. my $direction = $1;
  359. #print "localfile = $localfile , direction = $direction\n";
  360. # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
  361. $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
  362. last if $direction =~ /chr[0-9a-zA-Z]+r/;
  363. my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
  364. my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
  365. my $reverselocal = $localfile;
  366. $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
  367. merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags));
  368. #-------------------------------------------------------------------------------
  369. my $forward_separate = $localfile."_separate";
  370. my $reverse_separate = $reverselocal."_separate";
  371. my $diff = $forward_separate."_diff";
  372. my $miss = $forward_separate."_miss";
  373. my $common = $forward_separate."_common";
  374. forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags));
  375. #-------------------------------------------------------------------------------
  376. my $symmetrical_file = $localfile."_symmetrical";
  377. my $merged_file = $localfile."_merged";
  378. #print "cating: $merged_file $common into -> $symmetrical_file \n";
  379. `cat $merged_file $common > $symmetrical_file`;
  380. #-------------------------------------------------------------------------------
  381. my $t10output = $symmetrical_file."_fin_hit_all_2";
  382. new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
  383. #-------------------------------------------------------------------------------
  384. }
  385. next if $toescape == 1;
  386. #------------------------------------------------------------------------------------------------
  387. # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
  388. my @micros_array = ();
  389. my $sampletag = ();
  390. for my $i (0 ... $#sp_tags){
  391. my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
  392. push(@micros_array, $finhitFile);
  393. $sampletag = $sp_tags[$i];
  394. }
  395. #$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
  396. #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
  397. my $orthfiletemp = $ptag."_orthfile";
  398. my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);
  399. my $maskedorthfiletemp = $ptag."_orthfile_masked";
  400. qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);
  401. push @outputfiles , $maskedorthfiletemp;
  402. }
  403. $date = `date`;
  404. }
  405. `cat @outputfiles > $orthfile`;
  406. my $rootdir = $dir;
  407. $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
  408. chdir $rootdir;
  409. remove_tree($dir);
  410. #print "date = $date\n";
  411. #remove_tree($tdir);
  412. #------------------------------------------------------------------------------------------------
  413. #------------------------------------------------------------------------------------------------
  414. #------------------------------------------------------------------------------------------------
  415. #------------------------------------------------------------------------------------------------
  416. #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
  417. sub maftoAxt_multispecies {
  418. #print "in maftoAxt_multispecies : got @_\n";
  419. my $fname=$_[0];
  420. open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
  421. my $treedefinition = $_[1];
  422. open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
  423. my $counter = 0;
  424. my $exactspeciesset = $_[3];
  425. my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
  426. $treedefinition=~s/[\)\(, ]/\t/g;
  427. my @species=split(/\t+/,$treedefinition);
  428. my @exactspecies=();
  429. foreach my $spec (@species){
  430. foreach my $espec (@exactspeciesset_unarranged){
  431. push @exactspecies, $spec if $spec eq $espec;
  432. }
  433. }
  434. #print "exactspecies=@exactspecies\n";
  435. ###########
  436. my $select = 2;
  437. #select = 1 if all species need sequences to be present for each block otherwise, it is 0
  438. #select = 2 only the allowed set make up the alignment. use the removeset
  439. # information to detect alignmenets that have other important genomes aligned.
  440. ###########
  441. my @allowedset = ();
  442. @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
  443. @allowedset = join("_",0,@species) if $select == 1;
  444. #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
  445. @allowedset = join("_",0,@exactspecies) if $select == 2;
  446. #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
  447. my $start = 0;
  448. my @sequences = ();
  449. my @titles = ();
  450. my $species_counter = "0";
  451. my $countermatch = 0;
  452. my $outsideSpecies=0;
  453. while(my $line = <IN>){
  454. # print $line;
  455. next if $line =~ /^#/;
  456. next if $line =~ /^i/;
  457. chomp $line;
  458. my @fields = split(/\s+/,$line);
  459. chomp $line;
  460. if ($line =~ /^a /){
  461. $start = 1;
  462. }
  463. if ($line =~ /^s /){
  464. foreach my $sp (@allspecies){
  465. # print "checking species $sp\n";
  466. if ($fields[1] =~ /$sp/){
  467. $species_counter = $species_counter."_".$sp;
  468. push(@sequences, $fields[6]);
  469. my @sp_info = split(/\./,$fields[1]);
  470. my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
  471. push(@titles, $title);
  472. # print "species_counter = $species_counter\n";
  473. }
  474. }
  475. }
  476. if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
  477. # print "species_counter = $species_counter\n";
  478. my $arranged = reorderSpecies($species_counter, @allspecies);
  479. my $stopper = 1;
  480. my $arrno = 0;
  481. # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
  482. if (scalar(@sequences) == scalar(@exactspecies)){
  483. foreach my $set (@allowedset){
  484. # print "testing $arranged against $set\n";
  485. if ($arranged eq $set){
  486. $stopper = 0; last;
  487. }
  488. $arrno++;
  489. }
  490. }
  491. else{
  492. $stopper = 1;
  493. }
  494. if ($stopper == 0) {
  495. @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
  496. @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
  497. my $filteredseq = filter_gaps(@sequences);
  498. if ($filteredseq ne "SHORT"){
  499. #print "printing"; <STDIN>;
  500. $counter++;
  501. print OUT join (" ",$counter, @titles), "\n";
  502. print OUT $filteredseq, "\n";
  503. print OUT "\n";
  504. $countermatch++;
  505. }
  506. }
  507. else{ #print "nexting\n";<STDIN>;
  508. }
  509. @sequences = (); @titles = (); $start = 0;$species_counter = "0";
  510. next;
  511. }
  512. }
  513. # print "countermatch = $countermatch\n";
  514. }
  515. sub reorderSpecies{
  516. my @inarr=@_;
  517. my $currSpecies = shift (@inarr);
  518. my $ordered_species = 0;
  519. my @species=@inarr;
  520. #print "species = @species\n";
  521. foreach my $order (@species){
  522. $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
  523. }
  524. return $ordered_species;
  525. }
  526. sub filter_gaps{
  527. my @sequences = @_;
  528. # print "sequences sent are @sequences\n";
  529. my $seq_length = length($sequences[0]);
  530. my $seq_no = scalar(@sequences);
  531. my $allgaps = ();
  532. for (1 ... $seq_no){
  533. $allgaps = $allgaps."-";
  534. }
  535. my @seq_array = ();
  536. my $seq_counter = 0;
  537. foreach my $seq (@sequences){
  538. # my @sequence = split(/\s*/,$seq);
  539. $seq_array[$seq_counter] = [split(/\s*/,$seq)];
  540. # push @seq_array, [@sequence];
  541. $seq_counter++;
  542. }
  543. my $g = 0;
  544. while ( $g < $seq_length){
  545. last if (!exists $seq_array[0][$g]);
  546. my $bases = ();
  547. for my $u (0 ... $#seq_array){
  548. $bases = $bases.$seq_array[$u][$g];
  549. }
  550. # print $bases, "\n";
  551. if ($bases eq $allgaps){
  552. # print "bases are $bases, position is $g \n";
  553. for my $seq (@seq_array){
  554. splice(@$seq , $g, 1);
  555. }
  556. }
  557. else {
  558. $g++;
  559. }
  560. }
  561. my @outs = ();
  562. foreach my $seq (@seq_array){
  563. push(@outs, join("",@$seq));
  564. }
  565. return "SHORT" if length($outs[0]) <=100;
  566. return (join("\n", @outs));
  567. }
  568. sub allowedSetOfSpecies{
  569. my @allowed_species = split(/_/,$_[0]);
  570. unshift @allowed_species, 0;
  571. # print "allowed set = @allowed_species \n";
  572. my @output = ();
  573. for (0 ... scalar(@allowed_species) - 4){
  574. push(@output, join("_",@allowed_species));
  575. pop @allowed_species;
  576. }
  577. return join(";",reverse(@output));
  578. }
  579. sub orderInfo{
  580. my @info = split(/;/,$_[0]);
  581. # print "info = @info";
  582. my @old = split(/_/,$_[1]);
  583. my @new = split(/_/,$_[2]);
  584. shift @old; shift @new;
  585. my @outinfo = ();
  586. foreach my $spe (@new){
  587. for my $no (0 ... $#old){
  588. if ($spe eq $old[$no]){
  589. push(@outinfo, $info[$no]);
  590. }
  591. }
  592. }
  593. # print "outinfo = @outinfo \n";
  594. return join(";", @outinfo);
  595. }
  596. #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
  597. #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
  598. sub artificial_axdata_inverter{
  599. open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
  600. open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
  601. my $linecounter=0;
  602. while (my $line = <IN>){
  603. $linecounter++;
  604. #print "$linecounter\n";
  605. chomp $line;
  606. my $final_line = $line;
  607. my $trycounter = 0;
  608. if ($line =~ /^[a-zA-Z\-]/){
  609. # while ($final_line eq $line){
  610. my @fields = split(/\s*/,$line);
  611. $final_line = join("",reverse(@fields));
  612. # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
  613. # $trycounter++;
  614. # print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
  615. # }
  616. }
  617. # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
  618. if ($line =~ /^[0-9]/){
  619. $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
  620. $final_line = $line;
  621. }
  622. print OUT $final_line,"\n";
  623. #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
  624. }
  625. close OUT;
  626. }
  627. #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
  628. #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
  629. sub multi_species_t1 {
  630. my $input1 = $_[0];
  631. #print "@_\n"; <STDIN>;
  632. my @tags = split(/_/, $_[1]);
  633. my @outputs = split(/,/, $_[2]);
  634. my $title_query = $_[3];
  635. my @handles = ();
  636. open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
  637. my $i = 0;
  638. foreach my $path (@outputs){
  639. $handles[$i] = IO::Handle->new();
  640. open ($handles[$i], ">$path") or die "Can't open $path : $!";
  641. $i++;
  642. }
  643. my $curdef;
  644. my $start = 0;
  645. while (my $line = <FILEB> ) {
  646. if ($line =~ /^\d/){
  647. $line =~ s/ +/\t/g;
  648. my @fields = split(/\s+/, $line);
  649. if (($line =~ /$title_query/)){
  650. my $title = $line;
  651. my $counter = 0;
  652. foreach my $tag (@tags){
  653. $line = <FILEB>;
  654. print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;
  655. $counter++;
  656. }
  657. }
  658. else{
  659. foreach my $tag (@tags){
  660. my $tine = <FILEB>;
  661. }
  662. }
  663. }
  664. }
  665. foreach my $hand (@handles){
  666. $hand->close();
  667. }
  668. close FILEB;
  669. }
  670. #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
  671. #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
  672. sub multi_species_t2{
  673. my $input = $_[0];
  674. my $species = $_[1];
  675. my $output1 = $input."_unr";
  676. #------------------------------------------------------------------------------------------
  677. open (FILEF1, "<$input") or die "Cannot open file $input :$!";
  678. open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
  679. my $line1 = <FILEF1>;
  680. while($line1){
  681. {
  682. # chomp($line);
  683. if ($line1 =~ (m/^\>$species/)){
  684. chomp($line1);
  685. print FILEF2 $line1;
  686. $line1 = <FILEF1>;
  687. chomp($line1);
  688. print FILEF2 "\t", $line1,"\n";
  689. }
  690. }
  691. $line1 = <FILEF1>;
  692. }
  693. close FILEF1;
  694. close FILEF2;
  695. #------------------------------------------------------------------------------------------
  696. my $output2 = $output1."and";
  697. my $output3 = $output1."and2";
  698. open(IN,"<$output1");
  699. open (FILEF3, ">$output2");
  700. open (FILEF4, ">$output3");
  701. while (<IN>){
  702. my $line = $_;
  703. chomp($line);
  704. my @fields=split (/\t/, $line);
  705. # print $line,"\n"; <STDIN>;
  706. if($line !~ /random/){
  707. print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
  708. print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";
  709. }
  710. }
  711. close IN;
  712. close FILEF3;
  713. close FILEF4;
  714. unlink $output1;
  715. #------------------------------------------------------------------------------------------
  716. # OLD T3.PL RUDIMENT
  717. my $t3output = $output2;
  718. $t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
  719. open(IN,"<$output2");
  720. open(OUTA,">$t3output");
  721. while (<IN>){
  722. s/-//g unless /^>/;
  723. print OUTA;
  724. }
  725. close IN;
  726. close OUTA;
  727. #------------------------------------------------------------------------------------------
  728. }
  729. #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
  730. #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx
  731. sub multi_species_t2_2{
  732. #print "IN multi_species_t2_2 : @_\n";
  733. my $input = $_[0];
  734. my $species = $_[1];
  735. my $output1 = $input."2";
  736. open (FILEF1, "<$input");
  737. open (FILEF2, ">$output1");
  738. my $line1 = <FILEF1>;
  739. while($line1){
  740. {
  741. # chomp($line);
  742. if ($line1 =~ (m/^\>$species/)){
  743. chomp($line1);
  744. print FILEF2 $line1;
  745. $line1 = <FILEF1>;
  746. chomp($line1);
  747. print FILEF2 "\t", $line1,"\n";
  748. }
  749. }
  750. $line1 = <FILEF1>;
  751. }
  752. close FILEF1;
  753. close FILEF2;
  754. }
  755. #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx
  756. #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
  757. sub sputnikoutput_corrector{
  758. my $input = $_[0];
  759. my $output = $_[1];
  760. open(IN,"<$input") or die "Cannot open file $input :$!";
  761. open(OUT,">$output") or die "Cannot open file $output :$!";
  762. my $tine;
  763. while (my $line=<IN>){
  764. if($line =~/length /){
  765. $tine = $line;
  766. $tine =~ s/\s+/\t/g;
  767. my @fields = split(/\t/,$tine);
  768. if ($fields[6] > 60){
  769. print OUT $line;
  770. $line = <IN>;
  771. while (($line !~ /nucleotide/) && ($line !~ /^>/)){
  772. chomp $line;
  773. print OUT $line;
  774. $line = <IN>;
  775. }
  776. print OUT "\n";
  777. print OUT $line;
  778. }
  779. else{
  780. print OUT $line;
  781. }
  782. }
  783. else{
  784. print OUT $line;
  785. }
  786. }
  787. close IN;
  788. close OUT;
  789. }
  790. #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
  791. #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
  792. sub multi_species_t4{
  793. # print "multi_species_t4 : @_\n";
  794. my $input = $_[0];
  795. my $output = $_[1];
  796. open (FILEA, "<$input");
  797. open (FILEB, ">$output");
  798. my $line = <FILEA>;
  799. while ($line) {
  800. # chomp $line;
  801. if ($line =~ />/) {
  802. chomp $line;
  803. print FILEB $line, "\n";
  804. }
  805. if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
  806. chomp $line;
  807. print FILEB $line, " " ;
  808. $line = <FILEA>;
  809. chomp $line;
  810. print FILEB $line,"\n";
  811. }
  812. $line = <FILEA>;
  813. }
  814. close FILEA;
  815. close FILEB;
  816. }
  817. #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
  818. #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
  819. sub multi_species_t5{
  820. my $input = $_[0];
  821. my $output = $_[1];
  822. open(FILEB,"<$input");
  823. open(FILEC,">$output");
  824. my $curdef;
  825. while (my $line = <FILEB> ) {
  826. if ($line =~ /^>/){
  827. chomp $line;
  828. $curdef = $line;
  829. next;
  830. }
  831. if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
  832. print FILEC $curdef," ",$line;
  833. }
  834. }
  835. close FILEB;
  836. close FILEC;
  837. }
  838. #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
  839. #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
  840. sub multi_species_t6{
  841. my $input = $_[0];
  842. my $output = $_[1];
  843. my $focalstrand=$_[3];
  844. # print "inpput = @_\n";
  845. open (FILE, "<$input");
  846. open (FILE_MICRO, ">$output");
  847. my $linecounter=0;
  848. while (my $line = <FILE>){
  849. $linecounter++;
  850. chomp $line;
  851. #print "line = $line\n";
  852. #MONO#
  853. $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
  854. my $strand=$1;
  855. my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
  856. #print "line = $line\n";
  857. my $specfieldsend = 2 + ($no_of_species*4) - 1;
  858. my @fields = split(/\s+/, $line);
  859. my @speciesdata = @fields[0 ... $specfieldsend];
  860. $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
  861. my ($tide, $start, $end) = ($1, $2, $3);
  862. #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
  863. if($line =~ /mononucleotide/){
  864. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n";
  865. }
  866. #DI#
  867. elsif($line =~ /dinucleotide/){
  868. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n";
  869. }
  870. #TRI#
  871. elsif($line =~ /trinucleotide/ ){
  872. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n";
  873. }
  874. #TETRA#
  875. elsif($line =~ /tetranucleotide/){
  876. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n";
  877. }
  878. #PENTA#
  879. elsif($line =~ /pentanucleotide/){
  880. #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n";
  881. }
  882. else{
  883. # print "not: @fields\n";
  884. }
  885. }
  886. # print "linecounter=$linecounter\n";
  887. close FILE;
  888. close FILE_MICRO;
  889. }
  890. sub mono {
  891. my $st = $_[0];
  892. my $tp = unpack "A1"x(length($st)/1),$st;
  893. my $var1 = substr($tp, 0, 1);
  894. return join ("\t", $var1);
  895. }
  896. sub di {
  897. my $st = $_[0];
  898. my $tp = unpack "A2"x(length($st)/2),$st;
  899. my $var1 = substr($tp, 0, 2);
  900. return join ("\t", $var1);
  901. }
  902. sub tri {
  903. my $st = $_[0];
  904. my $tp = unpack "A3"x(length($st)/3),$st;
  905. my $var1 = substr($tp, 0, 3);
  906. return join ("\t", $var1);
  907. }
  908. sub tetra {
  909. my $st = $_[0];
  910. my $tp = unpack "A4"x(length($st)/4),$st;
  911. my $var1 = substr($tp, 0, 4);
  912. return join ("\t", $var1);
  913. }
  914. sub penta {
  915. my $st = $_[0];
  916. my $tp = unpack "A5"x(length($st)/5),$st;
  917. my $var1 = substr($tp, 0, 5);
  918. return join ("\t", $var1);
  919. }
  920. #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
  921. #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
  922. sub t9{
  923. my $input1 = $_[0];
  924. my $input2 = $_[1];
  925. my $output = $_[2];
  926. open(IN1,"<$input1") if -e $input1;
  927. open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
  928. open(OUT,">$output") or die "cannot open file $_[2] : $!";
  929. my %seen = ();
  930. my $prevkey = 0;
  931. if (-e $input1){
  932. while (my $line = <IN1>){
  933. chomp($line);
  934. my @fields = split(/\t/,$line);
  935. my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
  936. # print "key in t9 = $key1\n";
  937. $seen{$key1}++ if ($prevkey ne $key1) ;
  938. $prevkey = $key1;
  939. }
  940. # print "done first hash\n";
  941. close IN1;
  942. }
  943. while (my $line = <IN2>){
  944. # print $line, "**\n";
  945. if (-e $input1){
  946. chomp($line);
  947. my @fields = split(/\t/,$line);
  948. my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
  949. if (exists $seen{$key2}){
  950. print OUT "$line\n" ;
  951. delete $seen{$key2};
  952. }
  953. }
  954. else {
  955. print OUT "$line\n" ;
  956. # print "$line\n" ;
  957. }
  958. }
  959. close IN2;
  960. close OUT;
  961. }
  962. #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
  963. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
  964. sub multiSpecies_compound_microsat_hunter3{
  965. my $input1 = $_[0]; ###### the *_sput_op4_ii file
  966. my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
  967. my $output1 = $_[2]; ###### plain microsatellite file
  968. my $output2 = $_[3]; ###### compound microsatellite file
  969. my $org = $_[4]; ###### 1 or 2
  970. $no_of_species = $_[5];
  971. #print "IN multiSpecies_compound_microsat_hunter3: @_\n";
  972. #my @tags = split(/\t/,$info);
  973. sub compoundify;
  974. open(IN,"<$input1") or die "Cannot open file $input1 $!";
  975. open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
  976. open(OUT,">$output1") or die "Cannot open file $output1 $!";
  977. open(OUT2,">$output2") or die "Cannot open file $output2 $!";
  978. $infocord = 2 + (4*$no_of_species) - 1;
  979. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  980. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  981. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  982. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  983. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  984. my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  985. my @thresholds = ("0");
  986. push(@thresholds, split(/_/,$_[6]));
  987. sub thresholdCheck;
  988. my %micros = ();
  989. while (my $line = <IN>){
  990. # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
  991. next if $line =~ /\t\t/;
  992. if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  993. my $key = join("\t",$1, $2, $3, $4, $5);
  994. # print $key, "#-#-#-#-#-#-#-#\n";
  995. push (@{$micros{$key}},$line);
  996. }
  997. else{
  998. }
  999. }
  1000. close IN;
  1001. my @deletedlines = ();
  1002. my $linecount = 0;
  1003. while(my $sine = <SEQ>){
  1004. my %microstart=();
  1005. my %microend=();
  1006. my @sields = split(/\t/,$sine);
  1007. my $key = ();
  1008. if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  1009. $key = join("\t",$1, $2, $3, $4, $5);
  1010. # print $key, "<-<-<-<-<-<-<-<\n";
  1011. }
  1012. else{
  1013. }
  1014. if (exists $micros{$key}){
  1015. $linecount++;
  1016. my @microstring = @{$micros{$key}};
  1017. my @tempmicrostring = @{$micros{$key}};
  1018. foreach my $line (@tempmicrostring){
  1019. my @fields = split(/\t/,$line);
  1020. my $start = $fields[$startcord];
  1021. my $end = $fields[$endcord];
  1022. push (@{$microstart{$start}},$line);
  1023. push (@{$microend{$end}},$line);
  1024. }
  1025. my $firstflag = 'down';
  1026. while( my $line =shift(@microstring)){
  1027. # print "-----------\nline = $line ";
  1028. chomp $line;
  1029. my @fields = split(/\t/,$line);
  1030. my $start = $fields[$startcord];
  1031. my $end = $fields[$endcord];
  1032. my $startmicro = $line;
  1033. my $endmicro = $line;
  1034. # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
  1035. delete ($microstart{$start});
  1036. delete ($microend{$end});
  1037. my $flag = 'down';
  1038. my $startflag = 'down';
  1039. my $endflag = 'down';
  1040. my $prestart = $start - $distance;
  1041. my $postend = $end + $distance;
  1042. my @compoundlines = ();
  1043. my %compoundhash = ();
  1044. push (@compoundlines, $line);
  1045. push (@{$compoundhash{$line}},$line);
  1046. my $startrank = 1;
  1047. my $endrank = 1;
  1048. while( ($startflag eq "down") || ($endflag eq "down") ){
  1049. if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
  1050. # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
  1051. last;
  1052. }
  1053. $firstflag = "up";
  1054. if ($startflag eq "down"){
  1055. for my $i ($prestart ... $start){
  1056. if(exists $microend{$i}){
  1057. chomp $microend{$i}[0];
  1058. if(exists $compoundhash{$microend{$i}[0]}) {next;}
  1059. # print "sending from microend $startmicro, $microend{$i}[0] |||\n";
  1060. if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
  1061. push(@compoundlines, $microend{$i}[0]);
  1062. # print "accepted\n";
  1063. my @tields = split(/\t/,$microend{$i}[0]);
  1064. $startmicro = $microend{$i}[0];
  1065. chomp $startmicro;
  1066. $start = $tields[$startcord];
  1067. $flag = 'down';
  1068. $startrank++;
  1069. # print "startcompund = $microend{$i}[0]\n";
  1070. delete $microend{$i};
  1071. delete $microstart{$start};
  1072. $startflag = 'down';
  1073. $prestart = $start - $distance;
  1074. last;
  1075. }
  1076. else{
  1077. $flag = 'up';
  1078. $startflag = 'up';
  1079. }
  1080. }
  1081. else{
  1082. $flag = 'up';
  1083. $startflag = 'up';
  1084. }
  1085. }
  1086. }
  1087. $endrank = $startrank;
  1088. if ($endflag eq "down"){
  1089. for my $i ($end ... $postend){
  1090. if(exists $microstart{$i} ){
  1091. chomp $microstart{$i}[0];
  1092. if(exists $compoundhash{$microstart{$i}[0]}) {next;}
  1093. # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
  1094. if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
  1095. push(@compoundlines, $microstart{$i}[0]);
  1096. # print "accepted\n";
  1097. my @tields = split(/\t/,$microstart{$i}[0]);
  1098. $end = $tields[$endcord]-0;
  1099. $endmicro = $microstart{$i}[0];
  1100. $endrank++;
  1101. chomp $endmicro;
  1102. $flag = 'down';
  1103. # print "endcompund = $microstart{$i}[0]\n";
  1104. delete $microstart{$i};
  1105. delete $microend{$end};
  1106. shift @microstring;
  1107. $postend = $end + $distance;
  1108. $endflag = 'down';
  1109. last;
  1110. }
  1111. else{
  1112. $flag = 'up';
  1113. $endflag = 'up';
  1114. }
  1115. }
  1116. else{
  1117. $flag = 'up';
  1118. $endflag = 'up';
  1119. }
  1120. }
  1121. }
  1122. # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
  1123. } #end while( $flag eq "down")
  1124. # print "compoundlines = @compoundlines \n";
  1125. if (scalar (@compoundlines) == 1){
  1126. print OUT $line,"\n";
  1127. }
  1128. if (scalar (@compoundlines) > 1){
  1129. my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
  1130. # print $compoundline,"\n";
  1131. print OUT2 $compoundline,"\n";
  1132. }
  1133. } #end foreach my $line (@microstring){
  1134. } #if (exists $micros{$key}){
  1135. }
  1136. close OUT;
  1137. close OUT2;
  1138. }
  1139. #------------------------------------------------------------------------------------------------
  1140. sub compoundify{
  1141. my ($compoundlines, $sequence) = @_;
  1142. # print "\nfound to compound : @$compoundlines and$sequence \n";
  1143. my $noOfComps = @$compoundlines;
  1144. # print "Number of elements in hash is $noOfComps\n";
  1145. my @starts;
  1146. my @ends;
  1147. foreach my $line (@$compoundlines){
  1148. # print "compoundify.. line = $line \n";
  1149. chomp $line;
  1150. my @fields = split(/\t/,$line);
  1151. my $start = $fields[$startcord];
  1152. my $end = $fields[$endcord];
  1153. # print "start = $start, end = $end \n";
  1154. push(@starts, $start);
  1155. push(@ends,$end);
  1156. }
  1157. my @temp = @$compoundlines;
  1158. my $startline=$temp[0];
  1159. my @mields = split(/\t/,$startline);
  1160. my $startcoord = $mields[$startcord];
  1161. my $startgapsign=$mields[$endcord];
  1162. my @startsorted = sort { $a <=> $b } @starts;
  1163. my @endsorted = sort { $a <=> $b } @ends;
  1164. my @intervals;
  1165. for my $end (0 ... (scalar(@endsorted)-2)){
  1166. my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
  1167. push(@intervals,$interval);
  1168. # print "interval = $interval =\n";
  1169. # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
  1170. }
  1171. push(@intervals,"");
  1172. my $compoundmicrosat=();
  1173. my $multiunit="";
  1174. foreach my $line (@$compoundlines){
  1175. my @fields = split(/\t/,$line);
  1176. my $component="[".$fields[$microsatcord]."]".shift(@intervals);
  1177. $compoundmicrosat=$compoundmicrosat.$component;
  1178. $multiunit=$multiunit."[".$fields[$motifcord]."]";
  1179. # print "multiunit = $multiunit\n";
  1180. }
  1181. my $compoundcopy = $compoundmicrosat;
  1182. $compoundcopy =~ s/\[|\]//g;
  1183. my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
  1184. my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
  1185. return $compoundline;
  1186. }
  1187. #------------------------------------------------------------------------------------------------
  1188. sub identityMatch_thresholdCheck{
  1189. my $line1 = $_[0];
  1190. my $line2 = $_[1];
  1191. my $rank = $_[2];
  1192. my @lields1 = split(/\t/,$line1);
  1193. my @lields2 = split(/\t/,$line2);
  1194. # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
  1195. if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
  1196. my $probe = $lields1[$motifcord].$lields1[$motifcord];
  1197. #print "$probe :: $lields2[$motifcord]\n";
  1198. return "proceed" if $probe =~ /$lields2[$motifcord]/;
  1199. #print "line recieved\n";
  1200. if ($rank ==1){
  1201. return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
  1202. }
  1203. else {
  1204. return "proceed" if thresholdCheck($line2) eq "proceed";
  1205. return "stop";
  1206. }
  1207. }
  1208. else{
  1209. if ($rank ==1){
  1210. return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
  1211. }
  1212. else {
  1213. return "proceed" if thresholdCheck($line2) eq "proceed";
  1214. return "stop";
  1215. }
  1216. }
  1217. return "stop";
  1218. }
  1219. #------------------------------------------------------------------------------------------------
  1220. sub thresholdCheck{
  1221. my @checkthresholds=(0,@thresholds);
  1222. #print "IN thresholdCheck: @_\n";
  1223. my $line = $_[0];
  1224. my @lields = split(/\t/,$line);
  1225. return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
  1226. return "stop";
  1227. }
  1228. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
  1229. #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
  1230. sub multiSpecies_filtering_interrupted_microsats{
  1231. # print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
  1232. my $unfiltered = $_[0];
  1233. my $filtered = $_[1];
  1234. my $residue = $_[2];
  1235. my $no_of_species = $_[5];
  1236. open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
  1237. open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
  1238. open(RES,">$residue") or die "Cannot open file $residue: $!";
  1239. $infocord = 2 + (4*$no_of_species) - 1;
  1240. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  1241. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  1242. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  1243. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  1244. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  1245. my @sub_thresholds = (0);
  1246. push(@sub_thresholds, split(/_/,$_[3]));
  1247. my @thresholds = (0);
  1248. push(@thresholds, split(/_/,$_[4]));
  1249. while (my $line = <UNF>) {
  1250. next if $line !~ /[a-z]/;
  1251. #print $line;
  1252. chomp $line;
  1253. my @fields = split(/\t/,$line);
  1254. my $motif = $fields[$motifcord];
  1255. my $realmotif = $motif;
  1256. #print "motif = $motif\n";
  1257. if ($motif =~ /^\[/){
  1258. $motif =~ s/^\[//g;
  1259. my @motifs = split(/\]/,$motif);
  1260. $realmotif = $motifs[0];
  1261. }
  1262. # print "realmotif = $realmotif";
  1263. my $motif_size = length($realmotif);
  1264. my $microsat = $fields[$microsatcord];
  1265. # print "microsat = $microsat\n";
  1266. $microsat =~ s/^\[|\]$//sg;
  1267. my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
  1268. $microsat = join("",@microsats);
  1269. if (length($microsat) < $thresholds[$motif_size]) {
  1270. # print length($microsat)," < ",$thresholds[$motif_size],"\n";
  1271. print RES $line,"\n"; next;
  1272. }
  1273. my @lengths = ();
  1274. foreach my $mic (@microsats){
  1275. push(@lengths, length($mic));
  1276. }
  1277. if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
  1278. # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n";
  1279. print RES $line,"\n"; next;}
  1280. else {print FIL $line,"\n"; next;
  1281. }
  1282. }
  1283. close FIL;
  1284. close RES;
  1285. }
  1286. sub largest_microsat{
  1287. my $counter = 0;
  1288. my($max) = shift(@_);
  1289. foreach my $temp (@_) {
  1290. #print "finding largest array: $maxcounter \n";
  1291. if($temp > $max){
  1292. $max = $temp;
  1293. }
  1294. }
  1295. return($max);
  1296. }
  1297. #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
  1298. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
  1299. sub multiSpecies_compound_microsat_analyzer{
  1300. ####### PARAMETER ########
  1301. ##########################
  1302. my $input1 = $_[0]; ###### the *_sput_op4_ii file
  1303. my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
  1304. my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
  1305. my $output2 = $_[3]; ###### the pure compound microsatellites
  1306. my $org = $_[4];
  1307. my $no_of_species = $_[5];
  1308. # print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n";
  1309. $infocord = 2 + (4*$no_of_species) - 1;
  1310. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  1311. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  1312. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  1313. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  1314. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  1315. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  1316. open(IN,"<$input1") or die "Cannot open file $input1 $!";
  1317. open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
  1318. open(OUT,">$output1") or die "Cannot open file $output1 $!";
  1319. open(OUT2,">$output2") or die "Cannot open file $output2 $!";
  1320. # print "opened files \n";
  1321. my %micros = ();
  1322. my $keycounter=0;
  1323. my $linecounter=0;
  1324. while (my $line = <IN>){
  1325. $linecounter++;
  1326. if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  1327. my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
  1328. push (@{$micros{$key}},$line);
  1329. $keycounter++;
  1330. }
  1331. else{
  1332. # print "no key\n";
  1333. }
  1334. }
  1335. close IN;
  1336. my @deletedlines = ();
  1337. # print "done hash . linecounter=$linecounter, keycounter=$keycounter\n";
  1338. #---------------------------------------------------------------------------------------------------
  1339. # NOW READING THE SEQUENCE FILE
  1340. my $keyfound=0;
  1341. my $keyexists=0;
  1342. my $inter=0;
  1343. my $pure=0;
  1344. while(my $sine = <SEQ>){
  1345. my %microstart=();
  1346. my %microend=();
  1347. my @sields = split(/\t/,$sine);
  1348. my $key = 0;
  1349. if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  1350. $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
  1351. # print $sine;
  1352. # print $key;
  1353. $keyfound++;
  1354. }
  1355. else{
  1356. }
  1357. #<STDIN> if !defined $key;
  1358. if (exists $micros{$key}){
  1359. $keyexists++;
  1360. my @microstring = @{$micros{$key}};
  1361. my @filteredmicrostring;
  1362. foreach my $line (@microstring){
  1363. chomp $line;
  1364. my $copy_line = $line;
  1365. my @fields = split(/\t/,$line);
  1366. my $start = $fields[$startcord];
  1367. my $end = $fields[$endcord];
  1368. # FOR COMPOUND MICROSATELLITES
  1369. if ($fields[$typecord] eq "compound"){
  1370. $line = compound_microsat_analyser($line);
  1371. if ($line eq "NULL") {
  1372. print OUT2 "$copy_line\n";
  1373. $pure++;
  1374. next;
  1375. }
  1376. else{
  1377. print OUT "$line\n";
  1378. $inter++;
  1379. next;
  1380. }
  1381. }
  1382. }
  1383. } #if (exists $micros{$key}){
  1384. }
  1385. close OUT;
  1386. close OUT2;
  1387. # print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n";
  1388. }
  1389. sub compound_microsat_analyser{
  1390. my $line = $_[0];
  1391. my @fields = split(/\t/,$line);
  1392. my $motifline = $fields[$motifcord];
  1393. my $microsat = $fields[$microsatcord];
  1394. $motifline =~ s/^\[|\]$//g;
  1395. $microsat =~ s/^\[|\]$//g;
  1396. $microsat =~ s/-//g;
  1397. my @interruptions = ();
  1398. my @motields = split(/\]\[/,$motifline);
  1399. my @microields = split(/\][a-zA-Z|-]*\[/,$microsat);
  1400. my @inields = split(/[.*]/,$microsat);
  1401. shift @inields;
  1402. my @motifcount = scalar(@motields);
  1403. my $prevmotif = $motields[0];
  1404. my $prevmicro = $microields[0];
  1405. my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0]));
  1406. my $localflag = 'down';
  1407. my @infoarray = ();
  1408. for my $l (1 ... (scalar(@motields)-1)){
  1409. my $probe = $prevmotif.$prevmotif;
  1410. if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;}
  1411. if ($probe =~ /$motields[$l]/i){
  1412. my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l]));
  1413. my $curr_startphase = substr($microields[$l],0,length($motields[$l]));
  1414. if ($curr_startphase =~ /$prevphase/i) {
  1415. $infoarray[$l-1] = "insertion";
  1416. }
  1417. else {
  1418. $infoarray[$l-1] = "indel/substitution";
  1419. }
  1420. $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase;
  1421. next;
  1422. }
  1423. else {$localflag = "up"; last;}
  1424. }
  1425. if ($localflag eq 'up') {return "NULL";}
  1426. if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";}
  1427. if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";}
  1428. if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";}
  1429. if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";}
  1430. if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";}
  1431. @microields = split(/[\[|\]]/,$microsat);
  1432. my @microsats = ();
  1433. my @positions = ();
  1434. my $lengthtracker = 0;
  1435. for my $i (0 ... (scalar(@microields ) - 1)){
  1436. if ($i%2 == 0){
  1437. push(@microsats,$microields[$i]);
  1438. $lengthtracker = $lengthtracker + length($microields[$i]);
  1439. }
  1440. else{
  1441. push(@interruptions,$microields[$i]);
  1442. push(@positions, $lengthtracker+1);
  1443. $lengthtracker = $lengthtracker + length($microields[$i]);
  1444. }
  1445. }
  1446. my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions)));
  1447. return($returnline);
  1448. }
  1449. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
  1450. #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
  1451. sub multiSpecies_compoundClarifyer{
  1452. # print "IN multiSpecies_compoundClarifyer: @_\n";
  1453. my $input1 = $_[0]; ###### the *_sput_compound
  1454. my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
  1455. my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
  1456. my $output2 = $_[3]; ###### compound file
  1457. my $org = $_[4];
  1458. my $no_of_species = $_[5];
  1459. @thresholds = "0";
  1460. push(@thresholds, split(/_/,$_[6]));
  1461. $infocord = 2 + (4*$no_of_species) - 1;
  1462. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  1463. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  1464. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  1465. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  1466. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  1467. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  1468. $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  1469. $interr_poscord = $motifcord + 3;
  1470. $no_of_interruptionscord = $motifcord + 4;
  1471. $interrcord = $motifcord + 2;
  1472. $interrtypecord = $motifcord + 1;
  1473. open(IN,"<$input1") or die "Cannot open file $input1 $!";
  1474. open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
  1475. open(INT,">$output1") or die "Cannot open file $output2 $!";
  1476. open(COMP,">$output2") or die "Cannot open file $output2 $!";
  1477. #open(CH,">changed") or die "Cannot open file changed $!";
  1478. # print "opened files \n";
  1479. my $linecounter = 0;
  1480. my $microcounter = 0;
  1481. my %micros = ();
  1482. while (my $line = <IN>){
  1483. # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
  1484. $linecounter++;
  1485. if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
  1486. my $key = join("\t",$1, $2, $3, $4);
  1487. # print $key, "#-#-#-#-#-#-#-#\n";
  1488. # print "key = $key\n";
  1489. push (@{$micros{$key}},$line);
  1490. $microcounter++;
  1491. }
  1492. else {#print $line," key not made\n"; <STDIN>;
  1493. }
  1494. }
  1495. # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
  1496. close IN;
  1497. my @deletedlines = ();
  1498. # print "done hash \n";
  1499. $linecounter = 0;
  1500. #---------------------------------------------------------------------------------------------------
  1501. # NOW READING THE SEQUENCE FILE
  1502. my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide);
  1503. $printer = 0;
  1504. while(my $sine = <SEQ>){
  1505. my %microstart=();
  1506. my %microend=();
  1507. my @sields = split(/\t/,$sine);
  1508. my $key = ();
  1509. # print "sine = $sine. focalspec = $focalspec \n"; #<STDIN>;
  1510. if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
  1511. # if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) {
  1512. $key = join("\t",$1, $2, $3, $4);
  1513. # print "key = $key\n";
  1514. }
  1515. else{
  1516. # print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n";
  1517. }
  1518. if (exists $micros{$key}){
  1519. my @microstring = @{$micros{$key}};
  1520. delete $micros{$key};
  1521. foreach my $line (@microstring){
  1522. # print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1;
  1523. # print "microsat = $line" if $printer == 1;
  1524. $linecounter++;
  1525. my $copy_line = $line;
  1526. my @mields = split(/\t/,$line);
  1527. my @fields = @mields;
  1528. my $start = $fields[$startcord];
  1529. my $end = $fields[$endcord];
  1530. my $microsat = $fields[$microsatcord];
  1531. my $motifline = $fields[$motifcord];
  1532. my $microsatcopy = $microsat;
  1533. my $positioner = $microsat;
  1534. $positioner =~ s/[a-zA-Z|-]/_/g;
  1535. $microsatcopy =~ s/^\[|\]$//gs;
  1536. chomp $microsatcopy;
  1537. my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
  1538. my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
  1539. my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start);
  1540. # print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1;
  1541. shift @inields;
  1542. #print "inields =@inields<\n";
  1543. $motifline =~ s/^\[|\]$//gs;
  1544. chomp $motifline;
  1545. #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n";
  1546. my @motields = split(/\]\[/,$motifline);
  1547. my $seq = $microsatcopy;
  1548. $seq =~ s/\[|\]//g;
  1549. my $seqlen = length($seq);
  1550. $seq = " ".$seq;
  1551. my $longestmotif_no = longest_array_element(@motields);
  1552. my $shortestmotif_no = shortest_array_element(@motields);
  1553. #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n";
  1554. my $search = $motields[$longestmotif_no].$motields[$longestmotif_no];
  1555. if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){
  1556. print COMP $line;
  1557. next;
  1558. }
  1559. my @shortestmotif_nos = ();
  1560. for my $m (0 ... $#motields){
  1561. push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) );
  1562. }
  1563. ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
  1564. my $newleft =();
  1565. my $leftstopper = 0; my $rightstopper = 0;
  1566. foreach my $shortmotif_no (@shortestmotif_nos){
  1567. next if $shortmotif_no == 0;
  1568. my $last_left = $shortmotif_no; #$#motields;
  1569. my $last_hitter = 0;
  1570. for (my $i =($shortmotif_no-1); $i>=0; $i--){
  1571. my $search = $motields[$shortmotif_no];
  1572. if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
  1573. if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
  1574. $last_hitter = 1;
  1575. $last_left = $i+1; last;
  1576. }
  1577. my $probe = $motields[$i];
  1578. if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
  1579. if ($probe !~ /$search/){
  1580. $last_hitter = 1;
  1581. $last_left = $i+1;
  1582. # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n";
  1583. last;
  1584. }
  1585. $last_left--;$last_hitter = 1;
  1586. # print "passed tests, last left = $last_left\n";
  1587. }
  1588. # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n";
  1589. if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;}
  1590. else {$leftstopper = 1;
  1591. # print "leftstopper = 1\n";
  1592. }
  1593. }
  1594. ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
  1595. my $newright =();
  1596. foreach my $shortmotif_no (@shortestmotif_nos){
  1597. next if $shortmotif_no == $#motields;
  1598. my $last_right = $shortmotif_no;# -1;
  1599. for my $i ($shortmotif_no+1 ... $#motields){
  1600. my $search = $motields[$shortmotif_no];
  1601. if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
  1602. if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
  1603. $last_right = $i-1; last;
  1604. }
  1605. my $probe = $motields[$i];
  1606. if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
  1607. if ( $probe !~ /$search/){
  1608. $last_right = $i-1; last;
  1609. }
  1610. $last_right++;
  1611. }
  1612. if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n";
  1613. }
  1614. else {$rightstopper = 1;
  1615. }
  1616. }
  1617. if ($rightstopper == 1 && $leftstopper == 1){
  1618. print COMP $line;
  1619. # print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1;
  1620. next;
  1621. }
  1622. # print "pased initial testing phase \n" if $printer == 1;
  1623. my @outputs = ();
  1624. my @orig_starts = ();
  1625. my @orig_ends = ();
  1626. for my $mic (0 ... $#microields){
  1627. my $miclen = length($microields[$mic]);
  1628. my $microleftlen = 0;
  1629. #print "\nmic = $mic\n";
  1630. if($mic > 0){
  1631. for my $submin (0 ... $mic-1){
  1632. my $interval = ();
  1633. if (!exists $inields[$submin]) {$interval = "";}
  1634. else {$interval = $inields[$submin];}
  1635. #print "inield =$interval< and microield =$microields[$submin]<\n ";
  1636. $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
  1637. }
  1638. }
  1639. push(@orig_starts,($microleftlen+1));
  1640. push(@orig_ends, ($microleftlen+1 + $miclen -1));
  1641. }
  1642. ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
  1643. for my $mic (0 ... $#microields){
  1644. my $miclen = length($microields[$mic]);
  1645. my $microleftlen = 0;
  1646. if($mic > 0){
  1647. for my $submin (0 ... $mic-1){
  1648. # if(!exists $inields[$submin]) {$inields[$submin] = "";}
  1649. my $interval = ();
  1650. if (!exists $inields[$submin]) {$interval = "";}
  1651. else {$interval = $inields[$submin];}
  1652. #print "inield =$interval< and microield =$microields[$submin]<\n ";
  1653. $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
  1654. }
  1655. }
  1656. $fields[$startcord] = $microleftlen+1;
  1657. $fields[$endcord] = $fields[$startcord] + $miclen -1;
  1658. $fields[$typecord] = $microsat_types[length($motields[$mic])];
  1659. $fields[$microsatcord] = $microields[$mic];
  1660. $fields[$motifcord] = $motields[$mic];
  1661. my $templine = join("\t", (@fields[0 .. $motifcord]) );
  1662. my $orig_templine = join("\t", (@fields[0 .. $motifcord]) );
  1663. my $newline;
  1664. my $lefter = 1; my $righter = 1;
  1665. if ( $fields[$startcord] < 2){$lefter = 0;}
  1666. if ($fields[$endcord] == $seqlen){$righter = 0;}
  1667. while($lefter == 1){
  1668. $newline = left_extender($templine, $seq,$org);
  1669. # print "returned line from left extender= $newline \n" if $printer == 1;
  1670. if ($newline eq $templine){$templine = $newline; last;}
  1671. else {$templine = $newline;}
  1672. if (left_extention_permission_giver($templine) eq "no") {last;}
  1673. }
  1674. while($righter == 1){
  1675. $newline = right_extender($templine, $seq,$org);
  1676. # print "returned line from right extender= $newline \n" if $printer == 1;
  1677. if ($newline eq $templine){$templine = $newline; last;}
  1678. else {$templine = $newline;}
  1679. if (right_extention_permission_giver($templine) eq "no") {last;}
  1680. }
  1681. my @tempfields = split(/\t/,$templine);
  1682. $tempfields[$microsatcord] =~ s/\]|\[//g;
  1683. $tempfields[$motifcord] =~ s/^\[|\]$//gs;
  1684. my @tempmotields = split(/\]\[/,$tempfields[$motifcord]);
  1685. if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) {
  1686. # print "scalar ( tempmotields) = 1\n" if $printer == 1;
  1687. next;
  1688. }
  1689. my $prevmotif = shift(@tempmotields);
  1690. my $stopper = 0;
  1691. foreach my $tempmot (@tempmotields){
  1692. if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;}
  1693. my $search = $prevmotif.$prevmotif;
  1694. if ($search !~ /$tempmot/) {$stopper = 1; last;}
  1695. $prevmotif = $tempmot;
  1696. }
  1697. if ( $stopper == 1) {
  1698. # print "length tempmot != length prevmotif\n" if $printer == 1;
  1699. next;
  1700. }
  1701. my $lastend = 0;
  1702. #----------------------------------------------------------
  1703. my $left_captured = (); my $right_captured = ();
  1704. my $left_bp = (); my $right_bp = ();
  1705. # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n";
  1706. for my $o (0 ... $#orig_starts){
  1707. # print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1;
  1708. # print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1;
  1709. if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord])
  1710. # print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1;
  1711. $left_captured = $o;
  1712. $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1;
  1713. }
  1714. elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord])
  1715. # print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1;
  1716. $right_captured = $o;
  1717. $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1;
  1718. }
  1719. $lastend = $orig_ends[$o]
  1720. }
  1721. # print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1;
  1722. my $leftmotif = (); my $left_trashed = ();
  1723. if ($tempfields[$startcord] != $fields[$startcord]) {
  1724. $leftmotif = $motields[$left_captured];
  1725. # print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1;
  1726. if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n";
  1727. }
  1728. $left_trashed = length($microields[$left_captured]) - $left_bp;
  1729. }
  1730. my $rightmotif = (); my $right_trashed = ();
  1731. if ($tempfields[$endcord] != $fields[$endcord]) {
  1732. # print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1;
  1733. $rightmotif = $motields[$right_captured];
  1734. $right_trashed = length($microields[$right_captured]) - $right_bp;
  1735. }
  1736. ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
  1737. $stopper = 0;
  1738. my $deletioner = 0;
  1739. #if($tempfields[$startcord] != $fields[$startcord]){
  1740. # print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1;
  1741. if ($left_captured != 0){
  1742. # print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1;
  1743. for my $e (0 ... $left_captured-1){
  1744. if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){
  1745. # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
  1746. $deletioner++; last;
  1747. }
  1748. if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
  1749. # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
  1750. $deletioner++; last;
  1751. }
  1752. if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
  1753. # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
  1754. $deletioner++; last;
  1755. }
  1756. }
  1757. }
  1758. #}
  1759. # print "after left search, deletioner = $deletioner\n" if $printer == 1;
  1760. if ($deletioner >= 1) {
  1761. # print "deletioner = $deletioner\n" if $printer == 1;
  1762. next;
  1763. }
  1764. $deletioner = 0;
  1765. #if($tempfields[$endcord] != $fields[$endcord]){
  1766. # print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1;
  1767. if ($right_captured != $#microields){
  1768. # print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1;
  1769. for my $e ($right_captured+1 ... $#microields){
  1770. if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){
  1771. # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
  1772. $deletioner++; last;
  1773. }
  1774. if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
  1775. # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
  1776. $deletioner++; last;
  1777. }
  1778. if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
  1779. # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
  1780. $deletioner++; last;
  1781. }
  1782. }
  1783. }
  1784. #}
  1785. # print "deletioner = $deletioner\n" if $printer == 1;
  1786. if ($deletioner >= 1) {
  1787. next;
  1788. }
  1789. my $leftMotifs_notCaptured = ();
  1790. my $rightMotifs_notCaptured = ();
  1791. if ($tempfields[$startcord] != $fields[$startcord] ){
  1792. #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n";
  1793. if (length($leftmotif) == 1 && $left_trashed > 3){
  1794. # print "invaded left motif is long mononucleotide" if $printer == 1;
  1795. next;
  1796. }
  1797. elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){
  1798. # print "invaded left motif too long" if $printer == 1;
  1799. next;
  1800. }
  1801. }
  1802. if ($tempfields[$endcord] != $fields[$endcord] ){
  1803. #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n";
  1804. if (length($rightmotif)==1 && $right_trashed){
  1805. # print "invaded right motif is long mononucleotide" if $printer == 1;
  1806. next;
  1807. }
  1808. elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){
  1809. # print "invaded right motif too long" if $printer == 1;
  1810. next;
  1811. }
  1812. }
  1813. push @outputs, $templine;
  1814. }
  1815. if (scalar(@outputs) == 0){ print COMP $line; next;}
  1816. # print "outputs are:", join("\n",@outputs),"\n";
  1817. if (scalar(@outputs) == 1){
  1818. my @oields = split(/\t/,$outputs[0]);
  1819. my $start = $oields[$startcord]+$mields[$startcord]-1;
  1820. my $end = $start+($oields[$endcord]-$oields[$startcord]);
  1821. $oields[$startcord] = $start; $oields[$endcord] = $end;
  1822. print INT join("\t",@oields), "\n";
  1823. # print CH $line,;
  1824. }
  1825. if (scalar(@outputs) > 1){
  1826. my $motif_min = 10;
  1827. my $chosen_one = $outputs[0];
  1828. foreach my $micro (@outputs){
  1829. my @oields = split(/\t/,$micro);
  1830. my $tempmotif = $oields[$motifcord];
  1831. $tempmotif =~ s/^\[|\]$//gs;
  1832. my @omots = split(/\]\[/, $tempmotif);
  1833. # print "motif_min = $motif_min, current motif = $tempmotif\n";
  1834. my $start = $oields[$startcord]+$mields[$startcord]-1;
  1835. my $end = $start+($oields[$endcord]-$oields[$startcord]);
  1836. $oields[$startcord] = $start; $oields[$endcord] = $end;
  1837. if(length($omots[0]) < $motif_min) {
  1838. $chosen_one = join("\t",@oields);
  1839. $motif_min = length($omots[0]);
  1840. }
  1841. }
  1842. print INT $chosen_one, "\n";
  1843. # print "chosen one is ".$chosen_one, "\n";
  1844. # print CH $line;
  1845. }
  1846. }
  1847. } #if (exists $micros{$key}){
  1848. else{
  1849. }
  1850. }
  1851. close INT;
  1852. close COMP;
  1853. }
  1854. sub left_extender{
  1855. #print "left extender\n";
  1856. my ($line, $seq, $org) = @_;
  1857. # print "in left extender... line passed = $line and sequence is $seq\n";
  1858. chomp $line;
  1859. my @fields = split(/\t/,$line);
  1860. my $rstart = $fields[$startcord];
  1861. my $microsat = $fields[$microsatcord];
  1862. $microsat =~ s/\[|\]//g;
  1863. my $rend = $rstart + length($microsat)-1;
  1864. $microsat =~ s/-//g;
  1865. my $motif = $fields[$motifcord];
  1866. my $firstmotif = ();
  1867. if ($motif =~ /^\[/){
  1868. $motif =~ s/^\[//g;
  1869. $motif =~ /([a-zA-Z]+)\].*/;
  1870. $firstmotif = $1;
  1871. }
  1872. else {$firstmotif = $motif;}
  1873. #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n";
  1874. my $leftphase = substr($microsat, 0,length($firstmotif));
  1875. my $phaser = $leftphase.$leftphase;
  1876. my @phase = split(/\s*/,$leftphase);
  1877. my @phases;
  1878. my @copy_phases = @phases;
  1879. my $crawler=0;
  1880. for (0 ... (length($leftphase)-1)){
  1881. push(@phases, substr($phaser, $crawler, length($leftphase)));
  1882. $crawler++;
  1883. }
  1884. my $start = $rstart;
  1885. my $end = $rend;
  1886. my $leftseq = substr($seq, 0, $start);
  1887. # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n";
  1888. my @extentions = ();
  1889. my @trappeds = ();
  1890. my @intervalposs = ();
  1891. my @trappedposs = ();
  1892. my @trappedphases = ();
  1893. my @intervals = ();
  1894. my $firstmotif_length = length($firstmotif);
  1895. foreach my $phase (@phases){
  1896. # print "left phase\t",substr($leftseq, -10),"\t$phase\n";
  1897. # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n";
  1898. if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
  1899. # print "in left pattern\n";
  1900. my $trapped = $1;
  1901. my $trappedpos = length($leftseq)-length($trapped);
  1902. my $interval = $3;
  1903. my $intervalpos = index($trapped, $interval) + 1;
  1904. # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n";
  1905. my $extention = substr($trapped, 0, length($trapped)-length($interval));
  1906. my $leftpeep = substr($seq, 0, ($start-length($trapped)));
  1907. my @passed_overhangs;
  1908. for my $i (1 ... length($phase)-1){
  1909. my $overhang = substr($phase, -length($phase)+$i);
  1910. # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n";
  1911. #TEMPORARY... BETTER METHOD NEEDED
  1912. $leftpeep =~ s/-//g;
  1913. if ($leftpeep =~ /$overhang$/i){
  1914. push(@passed_overhangs,$overhang);
  1915. # print "l overhang\n";
  1916. }
  1917. }
  1918. if(scalar(@passed_overhangs)>0){
  1919. my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
  1920. $extention = $overhang.$extention;
  1921. $trapped = $overhang.$trapped;
  1922. #print "trapped extended to $trapped \n";
  1923. $trappedpos = length($leftseq)-length($trapped);
  1924. }
  1925. push(@extentions,$extention);
  1926. # print "extentions = @extentions \n";
  1927. push(@trappeds,$trapped );
  1928. push(@intervalposs,length($extention)+1);
  1929. push(@trappedposs, $trappedpos);
  1930. # print "trappeds = @trappeds\n";
  1931. push(@trappedphases, substr($extention,0,length($phase)));
  1932. push(@intervals, $interval);
  1933. }
  1934. }
  1935. if (scalar(@trappeds == 0)) {return $line;}
  1936. my $nikaal = shortest_array_element(@intervals);
  1937. if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
  1938. $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
  1939. ##print "new fields 9 = $fields[9]\n";
  1940. $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
  1941. if($fields[$microsatcord] !~ /^\[/i){
  1942. $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
  1943. }
  1944. $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
  1945. if (exists ($fields[$motifcord+1])){
  1946. $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
  1947. }
  1948. else{$fields[$motifcord+1] = "indel/deletion";}
  1949. ##print "new fields 14 = $fields[14]\n";
  1950. if (exists ($fields[$motifcord+2])){
  1951. $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
  1952. }
  1953. else{$fields[$motifcord+2] = $intervals[$nikaal];}
  1954. my @seventeen=();
  1955. if (exists ($fields[$motifcord+3])){
  1956. @seventeen = split(/,/,$fields[$motifcord+3]);
  1957. # #print "scalarseventeen =@seventeen<-\n";
  1958. for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
  1959. $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
  1960. $fields[$motifcord+4] = $fields[$motifcord+4]+1;
  1961. }
  1962. else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
  1963. ##print "new fields 16 = $fields[16]\n";
  1964. ##print "new fields 17 = $fields[17]\n";
  1965. my $returnline = join("\t",@fields);
  1966. my $pastline = $returnline;
  1967. if ($fields[$microsatcord] =~ /\[/){
  1968. $returnline = multiSpecies_compoundClarifyer_merge($returnline);
  1969. }
  1970. return $returnline;
  1971. }
  1972. sub right_extender{
  1973. my ($line, $seq, $org) = @_;
  1974. chomp $line;
  1975. my @fields = split(/\t/,$line);
  1976. my $rstart = $fields[$startcord];
  1977. my $microsat = $fields[$microsatcord];
  1978. $microsat =~ s/\[|\]//g;
  1979. my $rend = $rstart + length($microsat)-1;
  1980. $microsat =~ s/-//g;
  1981. my $motif = $fields[$motifcord];
  1982. my $temp_lastmotif = ();
  1983. if ($motif =~ /\]$/s){
  1984. $motif =~ s/\]$//sg;
  1985. $motif =~ /.*\[([a-zA-Z]+)/;
  1986. $temp_lastmotif = $1;
  1987. }
  1988. else {$temp_lastmotif = $motif;}
  1989. my $lastmotif = substr($microsat,-length($temp_lastmotif));
  1990. ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n";
  1991. my $rightphase = substr($microsat, -length($lastmotif));
  1992. my $phaser = $rightphase.$rightphase;
  1993. my @phase = split(/\s*/,$rightphase);
  1994. my @phases;
  1995. my @copy_phases = @phases;
  1996. my $crawler=0;
  1997. for (0 ... (length($rightphase)-1)){
  1998. push(@phases, substr($phaser, $crawler, length($rightphase)));
  1999. $crawler++;
  2000. }
  2001. my $start = $rstart;
  2002. my $end = $rend;
  2003. my $rightseq = substr($seq, $end+1);
  2004. my @extentions = ();
  2005. my @trappeds = ();
  2006. my @intervalposs = ();
  2007. my @trappedposs = ();
  2008. my @trappedphases = ();
  2009. my @intervals = ();
  2010. my $lastmotif_length = length($lastmotif);
  2011. foreach my $phase (@phases){
  2012. if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
  2013. my $trapped = $1;
  2014. my $trappedpos = $end+1;
  2015. my $interval = $2;
  2016. my $intervalpos = index($trapped, $interval) + 1;
  2017. my $extention = substr($trapped, length($interval));
  2018. my $rightpeep = substr($seq, ($end+length($trapped))+1);
  2019. my @passed_overhangs = "";
  2020. #TEMPORARY... BETTER METHOD NEEDED
  2021. $rightpeep =~ s/-//g;
  2022. for my $i (1 ... length($phase)-1){
  2023. my $overhang = substr($phase,0, $i);
  2024. # #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n";
  2025. if ($rightpeep =~ /^$overhang/i){
  2026. push(@passed_overhangs, $overhang);
  2027. # #print "r overhang\n";
  2028. }
  2029. }
  2030. if (scalar(@passed_overhangs) > 0){
  2031. my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
  2032. $extention = $extention.$overhang;
  2033. $trapped = $trapped.$overhang;
  2034. # #print "trapped extended to $trapped \n";
  2035. }
  2036. push(@extentions,$extention);
  2037. ##print "extentions = @extentions \n";
  2038. push(@trappeds,$trapped );
  2039. push(@intervalposs,$intervalpos);
  2040. push(@trappedposs, $trappedpos);
  2041. # #print "trappeds = @trappeds\n";
  2042. push(@trappedphases, substr($extention,0,length($phase)));
  2043. push(@intervals, $interval);
  2044. }
  2045. }
  2046. if (scalar(@trappeds == 0)) {return $line;}
  2047. # my $nikaal = longest_array_element(@trappeds);
  2048. my $nikaal = shortest_array_element(@intervals);
  2049. # #print "longest element found = $nikaal \n";
  2050. if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
  2051. $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
  2052. ##print "new fields 9 = $fields[9]";
  2053. $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
  2054. ##print "new fields 11 = $fields[11]\n";
  2055. if($fields[$microsatcord] !~ /^\[/i){
  2056. $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
  2057. }
  2058. $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
  2059. ##print "new fields 12 = $fields[12]\n";
  2060. ##print "scalar of fields = ",scalar(@fields),"\n";
  2061. if (exists ($fields[$motifcord+1])){
  2062. # print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1];
  2063. # <STDIN> if !exists $fields[$motifcord+1];
  2064. $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
  2065. }
  2066. else{$fields[$motifcord+1] = "indel/deletion";}
  2067. ##print "new fields 14 = $fields[14]\n";
  2068. if (exists ($fields[$motifcord+2])){
  2069. $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
  2070. }
  2071. else{$fields[$motifcord+2] = $intervals[$nikaal];}
  2072. ##print "new fields 15 = $fields[15]\n";
  2073. my @seventeen=();
  2074. if (exists ($fields[$motifcord+3])){
  2075. ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n";
  2076. # print " print fields = @fields\n " if !exists $fields[$motifcord+3];
  2077. <STDIN> if !exists $fields[$motifcord+3];
  2078. my $currpos = length($microsat)+$intervalposs[$nikaal];
  2079. $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
  2080. $fields[$motifcord+4] = $fields[$motifcord+4]+1;
  2081. }
  2082. else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
  2083. ##print "new fields 16 = $fields[16]\n";
  2084. ##print "new fields 17 = $fields[17]\n";
  2085. my $returnline = join("\t",@fields);
  2086. my $pastline = $returnline;
  2087. if ($fields[$microsatcord] =~ /\[/){
  2088. $returnline = multiSpecies_compoundClarifyer_merge($returnline);
  2089. }
  2090. #print "finally right-extended line = ",$returnline,"\n";
  2091. return $returnline;
  2092. }
  2093. sub longest_array_element{
  2094. my $counter = 0;
  2095. my($max) = shift(@_);
  2096. my $maxcounter = 0;
  2097. foreach my $temp (@_) {
  2098. $counter++;
  2099. #print "finding largest array: $maxcounter \n" if $prinkter == 1;
  2100. if(length($temp) > length($max)){
  2101. $max = $temp;
  2102. $maxcounter = $counter;
  2103. }
  2104. }
  2105. return($maxcounter);
  2106. }
  2107. sub shortest_array_element{
  2108. my $counter = 0;
  2109. my($min) = shift(@_);
  2110. my $mincounter = 0;
  2111. foreach my $temp (@_) {
  2112. $counter++;
  2113. #print "finding largest array: $mincounter \n" if $prinkter == 1;
  2114. if(length($temp) < length($min)){
  2115. $min = $temp;
  2116. $mincounter = $counter;
  2117. }
  2118. }
  2119. return($mincounter);
  2120. }
  2121. sub left_extention_permission_giver{
  2122. my @fields = split(/\t/,$_[0]);
  2123. my $microsat = $fields[$microsatcord];
  2124. $microsat =~ s/(^\[)|-//g;
  2125. my $motif = $fields[$motifcord];
  2126. my $firstmotif = ();
  2127. my $firststretch = ();
  2128. my @stretches=();
  2129. if ($motif =~ /^\[/){
  2130. $motif =~ s/^\[//g;
  2131. $motif =~ /([a-zA-Z]+)\].*/;
  2132. $firstmotif = $1;
  2133. @stretches = split(/\]/,$microsat);
  2134. $firststretch = $stretches[0];
  2135. ##print "firststretch = $firststretch\n";
  2136. }
  2137. else {$firstmotif = $motif;$firststretch = $microsat;}
  2138. if (length($firststretch) < $thresholds[length($firstmotif)]){
  2139. return "no";
  2140. }
  2141. else {return "yes";}
  2142. }
  2143. sub right_extention_permission_giver{
  2144. my @fields = split(/\t/,$_[0]);
  2145. my $microsat = $fields[$microsatcord];
  2146. $microsat =~ s/-|(\]$)//sg;
  2147. my $motif = $fields[$motifcord];
  2148. my $temp_lastmotif = ();
  2149. my $laststretch = ();
  2150. my @stretches=();
  2151. if ($motif =~ /\]/){
  2152. $motif =~ s/\]$//gs;
  2153. $motif =~ /.*\[([a-zA-Z]+)$/;
  2154. $temp_lastmotif = $1;
  2155. @stretches = split(/\[/,$microsat);
  2156. $laststretch = pop(@stretches);
  2157. ##print "last stretch = $laststretch\n";
  2158. }
  2159. else {$temp_lastmotif = $motif; $laststretch = $microsat;}
  2160. if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
  2161. return "no";
  2162. }
  2163. else { return "yes";}
  2164. }
  2165. sub multiSpecies_compoundClarifyer_merge{
  2166. my $line = $_[0];
  2167. #print "sent for mering: $line \n";
  2168. my @mields = split(/\t/,$line);
  2169. my @fields = @mields;
  2170. my $microsat = $fields[$microsatcord];
  2171. my $motifline = $fields[$motifcord];
  2172. my $microsatcopy = $microsat;
  2173. $microsatcopy =~ s/^\[|\]$//sg;
  2174. my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
  2175. my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
  2176. shift @inields;
  2177. #print "inields =@inields<\n";
  2178. $motifline =~ s/^\[|\]$//sg;
  2179. my @motields = split(/\]\[/,$motifline);
  2180. my @firstmotifs = ();
  2181. my @lastmotifs = ();
  2182. for my $i (0 ... $#microields){
  2183. $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
  2184. $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
  2185. }
  2186. #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n";
  2187. my @mergelist = ();
  2188. my @inter_poses = split(/,/,$fields[$interr_poscord]);
  2189. my $no_of_interruptions = $fields[$no_of_interruptionscord];
  2190. my @interruptions = split(/,/,$fields[$interrcord]);
  2191. my @interrtypes = split(/,/,$fields[$interrtypecord]);
  2192. my $stopper = 0;
  2193. for my $i (0 ... $#motields-1){
  2194. #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n";
  2195. if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){
  2196. $stopper = 1;
  2197. push(@mergelist, ($i)."_".($i+1));
  2198. }
  2199. }
  2200. return $line if scalar(@mergelist) == 0;
  2201. foreach my $merging (@mergelist){
  2202. my @sets = split(/_/, $merging);
  2203. my @tempmicro = ();
  2204. my @tempmot = ();
  2205. for my $i (0 ... $sets[0]-1){
  2206. push(@tempmicro, "[".$microields[$i]."]");
  2207. push(@tempmicro, $inields[$i]);
  2208. push(@tempmot, "[".$motields[$i]."]");
  2209. #print "adding pre-motifs number $i\n";
  2210. }
  2211. my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
  2212. push (@tempmicro, $pusher);
  2213. push(@tempmot, "[".$motields[$sets[0]]."]");
  2214. my $outcoming = -2;
  2215. for my $i ($sets[1]+1 ... $#microields-1){
  2216. push(@tempmicro, "[".$microields[$i]."]");
  2217. push(@tempmicro, $inields[$i]);
  2218. push(@tempmot, "[".$motields[$i]."]");
  2219. #print "adding post-motifs number $i\n";
  2220. $outcoming = $i;
  2221. }
  2222. if ($outcoming != -2){
  2223. #print "outcoming = $outcoming \n";
  2224. push(@tempmicro, "[".$microields[$outcoming+1 ]."]");
  2225. push(@tempmot,"[". $motields[$outcoming+1]."]");
  2226. }
  2227. $fields[$microsatcord] = join("",@tempmicro);
  2228. $fields[$motifcord] = join("",@tempmot);
  2229. splice(@interrtypes, $sets[0], 1);
  2230. $fields[$interrtypecord] = join(",",@interrtypes);
  2231. splice(@interruptions, $sets[0], 1);
  2232. $fields[$interrcord] = join(",",@interruptions);
  2233. splice(@inter_poses, $sets[0], 1);
  2234. $fields[$interr_poscord] = join(",",@inter_poses);
  2235. $no_of_interruptions = $no_of_interruptions - 1;
  2236. }
  2237. if ($no_of_interruptions == 0){
  2238. $fields[$microsatcord] =~ s/^\[|\]$//sg;
  2239. $fields[$motifcord] =~ s/^\[|\]$//sg;
  2240. $line = join("\t", @fields[0 ... $motifcord]);
  2241. }
  2242. else{
  2243. $line = join("\t", @fields);
  2244. }
  2245. return $line;
  2246. }
  2247. sub thrashallow{
  2248. my $motif = $_[0];
  2249. return 4 if length($motif) == 2;
  2250. return 6 if length($motif) == 3;
  2251. return 8 if length($motif) == 4;
  2252. }
  2253. #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
  2254. #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
  2255. sub multispecies_filtering_compound_microsats{
  2256. my $unfiltered = $_[0];
  2257. my $filtered = $_[1];
  2258. my $residue = $_[2];
  2259. my $no_of_species = $_[5];
  2260. open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
  2261. open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
  2262. open(RES,">$residue") or die "Cannot open file $residue: $!";
  2263. $infocord = 2 + (4*$no_of_species) - 1;
  2264. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  2265. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  2266. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  2267. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  2268. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  2269. my @sub_thresholds = ("0");
  2270. push(@sub_thresholds, split(/_/,$_[3]));
  2271. my @thresholds = ("0");
  2272. push(@thresholds, split(/_/,$_[4]));
  2273. while (my $line = <UNF>) {
  2274. if ($line !~ /compound/){
  2275. print FIL $line,"\n"; next;
  2276. }
  2277. chomp $line;
  2278. my @fields = split(/\t/,$line);
  2279. my $motifline = $fields[$motifcord];
  2280. $motifline =~ s/^\[|\]$//g;
  2281. my @motifs = split(/\]\[/,$motifline);
  2282. my $microsat = $fields[$microsatcord];
  2283. $microsat =~ s/^\[|\]$|-//g;
  2284. my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
  2285. my $stopper = 0;
  2286. for my $i (0 ... $#motifs){
  2287. my @common = ();
  2288. my $probe = $motifs[$i].$motifs[$i];
  2289. my $motif_size = length($motifs[$i]);
  2290. for my $j (0 ... $#motifs){
  2291. next if length($motifs[$i]) != length($motifs[$j]);
  2292. push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i;
  2293. }
  2294. if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;}
  2295. else {next;}
  2296. }
  2297. if ($stopper == 1){
  2298. print RES $line,"\n";
  2299. }
  2300. else { print FIL $line,"\n"; }
  2301. }
  2302. close FIL;
  2303. close RES;
  2304. }
  2305. #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
  2306. #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
  2307. sub chromosome_unrand_breaker{
  2308. # print "IN chromosome_unrand_breaker: @_\n ";
  2309. my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
  2310. my $dir = $_[1]; ###### directory where subsets are put
  2311. my $output2 = $_[2]; ###### list of subset files
  2312. my $increment = $_[3];
  2313. my $info = $_[4];
  2314. my $chr = $_[5];
  2315. open(SEQ,"<$input1") or die "Cannot open file $input1 $!";
  2316. open(OUT,">$output2") or die "Cannot open file $output2 $!";
  2317. #---------------------------------------------------------------------------------------------------
  2318. # NOW READING THE SEQUENCE FILE
  2319. my $seed = 0;
  2320. my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
  2321. print OUT $subset,"\n";
  2322. open(SUB,">$subset");
  2323. while(my $sine = <SEQ>){
  2324. $seed++;
  2325. print SUB $sine;
  2326. if ($seed%$increment == 0 ){
  2327. close SUB;
  2328. $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
  2329. open(SUB,">$subset");
  2330. print SUB $sine;
  2331. print OUT $subset,"\n";
  2332. # print $subset,"\n";
  2333. }
  2334. }
  2335. close OUT;
  2336. close SUB;
  2337. }
  2338. #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
  2339. #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
  2340. sub multiSpecies_interruptedMicrosatHunter{
  2341. # print "IN multiSpecies_interruptedMicrosatHunter: @_\n";
  2342. my $input1 = $_[0]; ###### the *_sput_op4_ii file
  2343. my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
  2344. my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
  2345. my $output2 = $_[3]; ###### uninterrupted microsatellite file
  2346. my $org = $_[4];
  2347. my $no_of_species = $_[5];
  2348. my @thresholds = "0";
  2349. push(@thresholds, split(/_/,$_[6]));
  2350. # print "thresholds = @thresholds \n";
  2351. $infocord = 2 + (4*$no_of_species) - 1;
  2352. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  2353. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  2354. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  2355. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  2356. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  2357. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  2358. $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  2359. $interr_poscord = $motifcord + 3;
  2360. $no_of_interruptionscord = $motifcord + 4;
  2361. $interrcord = $motifcord + 2;
  2362. $interrtypecord = $motifcord + 1;
  2363. $prinkter = 0;
  2364. # print "prionkytet = $prinkter\n";
  2365. open(IN,"<$input1") or die "Cannot open file $input1 $!";
  2366. open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
  2367. open(INT,">$output1") or die "Cannot open file $output2 $!";
  2368. open(UNINT,">$output2") or die "Cannot open file $output2 $!";
  2369. # print "opened files !!\n";
  2370. my $linecounter = 0;
  2371. my $microcounter = 0;
  2372. my %micros = ();
  2373. while (my $line = <IN>){
  2374. # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
  2375. $linecounter++;
  2376. if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) {
  2377. my $key = join("\t",$1, $2, $3, $4, $5);
  2378. # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1;
  2379. push (@{$micros{$key}},$line);
  2380. $microcounter++;
  2381. }
  2382. else {#print $line if $prinkter == 1;
  2383. }
  2384. }
  2385. # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
  2386. close IN;
  2387. my @deletedlines = ();
  2388. # print "done hash \n";
  2389. $linecounter = 0;
  2390. #---------------------------------------------------------------------------------------------------
  2391. # NOW READING THE SEQUENCE FILE
  2392. while(my $sine = <SEQ>){
  2393. #print $linecounter,"\n" if $linecounter % 1000 == 0;
  2394. my %microstart=();
  2395. my %microend=();
  2396. my @sields = split(/\t/,$sine);
  2397. my $key = ();
  2398. if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  2399. $key = join("\t",$1, $2, $3, $4, $5);
  2400. # print $key, "<-<-<-<-<-<-<-<\n";
  2401. }
  2402. # $prinkter = 1 if $sine =~ /^>H\t499\t/;
  2403. if (exists $micros{$key}){
  2404. my @microstring = @{$micros{$key}};
  2405. delete $micros{$key};
  2406. my @filteredmicrostring;
  2407. # print "sequence = $sields[$sequencepos]" if $prinkter == 1;
  2408. foreach my $line (@microstring){
  2409. $linecounter++;
  2410. my $copy_line = $line;
  2411. my @fields = split(/\t/,$line);
  2412. my $start = $fields[$startcord];
  2413. my $end = $fields[$endcord];
  2414. # print $line if $prinkter == 1;
  2415. #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE
  2416. my $newline;
  2417. while(1){
  2418. # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1;
  2419. if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
  2420. $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org);
  2421. if ($newline eq $line){$line = $newline; last;}
  2422. else {$line = $newline;}
  2423. if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
  2424. # print "returned line from left extender= $line \n" if $prinkter == 1;
  2425. }
  2426. while(1){
  2427. # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1;
  2428. if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
  2429. $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org);
  2430. if ($newline eq $line){$line = $newline; last;}
  2431. else {$line = $newline;}
  2432. if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
  2433. # print "returned line from right extender= $line \n" if $prinkter == 1;
  2434. }
  2435. # print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1;
  2436. my @tempfields = split(/\t/,$line);
  2437. if ($tempfields[$microsatcord] =~ /\[/){
  2438. print INT $line,"\n";
  2439. }
  2440. else{
  2441. print UNINT $line,"\n";
  2442. }
  2443. if ($line =~ /NULL/){ next; }
  2444. push(@filteredmicrostring, $line);
  2445. push (@{$microstart{$start}},$line);
  2446. push (@{$microend{$end}},$line);
  2447. }
  2448. my $firstflag = 'down';
  2449. } #if (exists $micros{$key}){
  2450. }
  2451. close INT;
  2452. close UNINT;
  2453. # print "final number of lines = $linecounter\n";
  2454. }
  2455. sub multiSpecies_interruptedMicrosatHunter_left_extender{
  2456. my ($line, $seq, $org) = @_;
  2457. # print "left extender, like passed = $line\n" if $prinkter == 1;
  2458. # print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1;
  2459. chomp $line;
  2460. my @fields = split(/\t/,$line);
  2461. my $rstart = $fields[$startcord];
  2462. my $microsat = $fields[$microsatcord];
  2463. $microsat =~ s/\[|\]//g;
  2464. my $rend = $rstart + length($microsat)-1;
  2465. $microsat =~ s/-//g;
  2466. my $motif = $fields[$motifcord];
  2467. my $firstmotif = ();
  2468. if ($motif =~ /^\[/){
  2469. $motif =~ s/^\[//g;
  2470. $motif =~ /([a-zA-Z]+)\].*/;
  2471. $firstmotif = $1;
  2472. }
  2473. else {$firstmotif = $motif;}
  2474. # print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1;
  2475. my $leftphase = substr($microsat, 0,length($firstmotif));
  2476. my $phaser = $leftphase.$leftphase;
  2477. my @phase = split(/\s*/,$leftphase);
  2478. my @phases;
  2479. my @copy_phases = @phases;
  2480. my $crawler=0;
  2481. for (0 ... (length($leftphase)-1)){
  2482. push(@phases, substr($phaser, $crawler, length($leftphase)));
  2483. $crawler++;
  2484. }
  2485. my $start = $rstart;
  2486. my $end = $rend;
  2487. my $leftseq = substr($seq, 0, $start);
  2488. # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1;
  2489. my @extentions = ();
  2490. my @trappeds = ();
  2491. my @intervalposs = ();
  2492. my @trappedposs = ();
  2493. my @trappedphases = ();
  2494. my @intervals = ();
  2495. my $firstmotif_length = length($firstmotif);
  2496. foreach my $phase (@phases){
  2497. # print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1;
  2498. # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1;
  2499. if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
  2500. # print "in left pattern\n" if $prinkter == 1;
  2501. my $trapped = $1;
  2502. my $trappedpos = length($leftseq)-length($trapped);
  2503. my $interval = $3;
  2504. my $intervalpos = index($trapped, $interval) + 1;
  2505. # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1;
  2506. my $extention = substr($trapped, 0, length($trapped)-length($interval));
  2507. my $leftpeep = substr($seq, 0, ($start-length($trapped)));
  2508. my @passed_overhangs;
  2509. for my $i (1 ... length($phase)-1){
  2510. my $overhang = substr($phase, -length($phase)+$i);
  2511. # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1;
  2512. #TEMPORARY... BETTER METHOD NEEDED
  2513. $leftpeep =~ s/-//g;
  2514. if ($leftpeep =~ /$overhang$/i){
  2515. push(@passed_overhangs,$overhang);
  2516. # print "l overhang\n" if $prinkter == 1;
  2517. }
  2518. }
  2519. if(scalar(@passed_overhangs)>0){
  2520. my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
  2521. $extention = $overhang.$extention;
  2522. $trapped = $overhang.$trapped;
  2523. # print "trapped extended to $trapped \n" if $prinkter == 1;
  2524. $trappedpos = length($leftseq)-length($trapped);
  2525. }
  2526. push(@extentions,$extention);
  2527. # print "extentions = @extentions \n" if $prinkter == 1;
  2528. push(@trappeds,$trapped );
  2529. push(@intervalposs,length($extention)+1);
  2530. push(@trappedposs, $trappedpos);
  2531. # print "trappeds = @trappeds\n" if $prinkter == 1;
  2532. push(@trappedphases, substr($extention,0,length($phase)));
  2533. push(@intervals, $interval);
  2534. }
  2535. }
  2536. if (scalar(@trappeds == 0)) {return $line;}
  2537. ############################ my $nikaal = longest_array_element(@trappeds);
  2538. my $nikaal = shortest_array_element(@intervals);
  2539. # print "longest element found = $nikaal \n" if $prinkter == 1;
  2540. if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
  2541. $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
  2542. #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
  2543. $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
  2544. #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
  2545. if($fields[$microsatcord] !~ /^\[/i){
  2546. $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
  2547. }
  2548. $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
  2549. #print "new fields 14 = $fields[12]\n" if $prinkter == 1;
  2550. #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1;
  2551. if (scalar(@fields) > $motifcord+1){
  2552. $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
  2553. }
  2554. else{$fields[$motifcord+1] = "indel/deletion";}
  2555. #print "new fields 14 = $fields[14]\n" if $prinkter == 1;
  2556. if (scalar(@fields)>$motifcord+2){
  2557. $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
  2558. }
  2559. else{$fields[$motifcord+2] = $intervals[$nikaal];}
  2560. #print "new fields 15 = $fields[15]\n" if $prinkter == 1;
  2561. my @seventeen=();
  2562. if (scalar(@fields)>$motifcord+3){
  2563. @seventeen = split(/,/,$fields[$motifcord+3]);
  2564. # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1;
  2565. for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
  2566. $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
  2567. $fields[$motifcord+4] = $fields[$motifcord+4]+1;
  2568. }
  2569. else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
  2570. #print "new fields 16 = $fields[16]\n" if $prinkter == 1;
  2571. #print "new fields 17 = $fields[17]\n" if $prinkter == 1;
  2572. # return join("\t",@fields);
  2573. my $returnline = join("\t",@fields);
  2574. my $pastline = $returnline;
  2575. if ($fields[$microsatcord] =~ /\[/){
  2576. $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
  2577. }
  2578. # print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1;
  2579. return $returnline;
  2580. }
  2581. sub multiSpecies_interruptedMicrosatHunter_right_extender{
  2582. # print "right extender\n" if $prinkter == 1;
  2583. my ($line, $seq, $org) = @_;
  2584. # print "in right extender... line passed = $line\n" if $prinkter == 1;
  2585. # print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1;
  2586. chomp $line;
  2587. my @fields = split(/\t/,$line);
  2588. my $rstart = $fields[$startcord];
  2589. my $microsat = $fields[$microsatcord];
  2590. $microsat =~ s/\[|\]//g;
  2591. my $rend = $rstart + length($microsat)-1;
  2592. $microsat =~ s/-//g;
  2593. my $motif = $fields[$motifcord];
  2594. my $temp_lastmotif = ();
  2595. if ($motif =~ /\]$/){
  2596. $motif =~ s/\]$//g;
  2597. $motif =~ /.*\[([a-zA-Z]+)/;
  2598. $temp_lastmotif = $1;
  2599. }
  2600. else {$temp_lastmotif = $motif;}
  2601. my $lastmotif = substr($microsat,-length($temp_lastmotif));
  2602. # print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1;
  2603. my $rightphase = substr($microsat, -length($lastmotif));
  2604. my $phaser = $rightphase.$rightphase;
  2605. my @phase = split(/\s*/,$rightphase);
  2606. my @phases;
  2607. my @copy_phases = @phases;
  2608. my $crawler=0;
  2609. for (0 ... (length($rightphase)-1)){
  2610. push(@phases, substr($phaser, $crawler, length($rightphase)));
  2611. $crawler++;
  2612. }
  2613. my $start = $rstart;
  2614. my $end = $rend;
  2615. my $rightseq = substr($seq, $end+1);
  2616. # print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1;
  2617. # print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1;
  2618. my @extentions = ();
  2619. my @trappeds = ();
  2620. my @intervalposs = ();
  2621. my @trappedposs = ();
  2622. my @trappedphases = ();
  2623. my @intervals = ();
  2624. my $lastmotif_length = length($lastmotif);
  2625. foreach my $phase (@phases){
  2626. # print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1;
  2627. # print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1;
  2628. if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
  2629. # print "in right pattern\n" if $prinkter == 1;
  2630. my $trapped = $1;
  2631. my $trappedpos = $end+1;
  2632. my $interval = $2;
  2633. my $intervalpos = index($trapped, $interval) + 1;
  2634. # print "trapped = $trapped, interval = $interval\n" if $prinkter == 1;
  2635. my $extention = substr($trapped, length($interval));
  2636. my $rightpeep = substr($seq, ($end+length($trapped))+1);
  2637. my @passed_overhangs = "";
  2638. #TEMPORARY... BETTER METHOD NEEDED
  2639. $rightpeep =~ s/-//g;
  2640. for my $i (1 ... length($phase)-1){
  2641. my $overhang = substr($phase,0, $i);
  2642. # print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1;
  2643. if ($rightpeep =~ /^$overhang/i){
  2644. push(@passed_overhangs, $overhang);
  2645. # print "r overhang\n" if $prinkter == 1;
  2646. }
  2647. }
  2648. if (scalar(@passed_overhangs) > 0){
  2649. my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
  2650. $extention = $extention.$overhang;
  2651. $trapped = $trapped.$overhang;
  2652. # print "trapped extended to $trapped \n" if $prinkter == 1;
  2653. }
  2654. push(@extentions,$extention);
  2655. #print "extentions = @extentions \n" if $prinkter == 1;
  2656. push(@trappeds,$trapped );
  2657. push(@intervalposs,$intervalpos);
  2658. push(@trappedposs, $trappedpos);
  2659. # print "trappeds = @trappeds\n" if $prinkter == 1;
  2660. push(@trappedphases, substr($extention,0,length($phase)));
  2661. push(@intervals, $interval);
  2662. }
  2663. }
  2664. if (scalar(@trappeds == 0)) {return $line;}
  2665. ################################### my $nikaal = longest_array_element(@trappeds);
  2666. my $nikaal = shortest_array_element(@intervals);
  2667. # print "longest element found = $nikaal \n" if $prinkter == 1;
  2668. if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
  2669. $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
  2670. $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
  2671. if($fields[$microsatcord] !~ /^\[/i){
  2672. $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
  2673. }
  2674. $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
  2675. if (scalar(@fields) > $motifcord+1){
  2676. $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
  2677. }
  2678. else{$fields[$motifcord+1] = "indel/deletion";}
  2679. if (scalar(@fields)>$motifcord+2){
  2680. $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
  2681. }
  2682. else{$fields[$motifcord+2] = $intervals[$nikaal];}
  2683. my @seventeen=();
  2684. if (scalar(@fields)>$motifcord+3){
  2685. #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1;
  2686. my $currpos = length($microsat)+$intervalposs[$nikaal];
  2687. $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
  2688. $fields[$motifcord+4] = $fields[$motifcord+4]+1;
  2689. }
  2690. else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
  2691. # print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1;
  2692. # return join("\t",@fields);
  2693. my $returnline = join("\t",@fields);
  2694. my $pastline = $returnline;
  2695. if ($fields[$microsatcord] =~ /\[/){
  2696. $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
  2697. }
  2698. # print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1;
  2699. return $returnline;
  2700. }
  2701. sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{
  2702. my @fields = split(/\t/,$_[0]);
  2703. my $microsat = $fields[$microsatcord];
  2704. $microsat =~ s/(^\[)|-//sg;
  2705. my $motif = $fields[$motifcord];
  2706. chomp $motif;
  2707. # print $motif, "\n" if $motif !~ /^\[/;
  2708. my $firstmotif = ();
  2709. my $firststretch = ();
  2710. my @stretches=();
  2711. # print "motif = $motif, microsat = $microsat\n" if $prinkter == 1;
  2712. if ($motif =~ /^\[/){
  2713. $motif =~ s/^\[//sg;
  2714. $motif =~ /([a-zA-Z]+)\].*/;
  2715. $firstmotif = $1;
  2716. @stretches = split(/\]/,$microsat);
  2717. $firststretch = $stretches[0];
  2718. #print "firststretch = $firststretch\n" if $prinkter == 1;
  2719. }
  2720. else {$firstmotif = $motif;$firststretch = $microsat;}
  2721. # print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1;
  2722. if (length($firststretch) < $thresholds[length($firstmotif)]){
  2723. return "no";
  2724. }
  2725. else {return "yes";}
  2726. }
  2727. sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{
  2728. my @fields = split(/\t/,$_[0]);
  2729. my $microsat = $fields[$microsatcord];
  2730. $microsat =~ s/-|(\]$)//sg;
  2731. my $motif = $fields[$motifcord];
  2732. chomp $motif;
  2733. my $temp_lastmotif = ();
  2734. my $laststretch = ();
  2735. my @stretches=();
  2736. if ($motif =~ /\]/){
  2737. $motif =~ s/\]$//sg;
  2738. $motif =~ /.*\[([a-zA-Z]+)$/;
  2739. $temp_lastmotif = $1;
  2740. @stretches = split(/\[/,$microsat);
  2741. $laststretch = pop(@stretches);
  2742. #print "last stretch = $laststretch\n" if $prinkter == 1;
  2743. }
  2744. else {$temp_lastmotif = $motif; $laststretch = $microsat;}
  2745. if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
  2746. return "no";
  2747. }
  2748. else { return "yes";}
  2749. }
  2750. sub checking_substitutions{
  2751. my ($line, $seq, $startprobes, $endprobes) = @_;
  2752. #print "sequence = $seq \n" if $prinkter == 1;
  2753. #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n";
  2754. # <STDIN>;
  2755. my @seqarray = split(/\s*/,$seq);
  2756. my @startsubst_probes = split(/\|/,$startprobes);
  2757. my @endsubst_probes = split(/\|/,$endprobes);
  2758. chomp $line;
  2759. my @fields = split(/\t/,$line);
  2760. my $start = $fields[11] - $fields[10];
  2761. my $end = $fields[13] - $fields[10];
  2762. my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9]
  2763. $motif =~ s/\[|\]//g;
  2764. my $microsat = $fields[14];
  2765. $microsat =~ s/\[|\]//g;
  2766. #------------------------------------------------------------------------
  2767. # GETTING START AND END PHASES
  2768. my $startphase = substr($microsat,0, length($motif));
  2769. my $endphase = substr($microsat,-length($motif), length($motif));
  2770. #print "start and end phases are - $startphase and $endphase\n";
  2771. my $startflag = 'down';
  2772. my $endflag = 'down';
  2773. my $substitution_distance = length($motif);
  2774. my $prestart = $start - $substitution_distance;
  2775. my $postend = $end + $substitution_distance;
  2776. my @endadds = ();
  2777. my @startadds = ();
  2778. if (($prestart < 0) || ($postend > scalar(@seqarray))) {
  2779. last;
  2780. }
  2781. #------------------------------------------------------------------------#------------------------------------------------------------------------
  2782. # CHECKING FOR SUBSTITUTION PROBES NOW
  2783. if ($fields[8] ne "mononucleotide"){
  2784. while ($startflag eq "down"){
  2785. my $search = join("",@seqarray[$prestart...($start-1)]);
  2786. #print "search is from $prestart...($start-1) = $search\n";
  2787. foreach my $probe (@startsubst_probes){
  2788. #print "\t\tprobe = $probe\n";
  2789. if ($search =~ /^$probe/){
  2790. #print "\tfound addition to the left - $search \n";
  2791. my $copyprobe = $probe;
  2792. my $type;
  2793. my $subspos = 0;
  2794. my $interruption = "";
  2795. if ($search eq $startphase) { $type = "NONE";}
  2796. else{
  2797. $copyprobe =~ s/\[a-zA-Z\]/^/g;
  2798. $subspos = index($copyprobe,"^") + 1;
  2799. $type = "substitution";
  2800. $interruption = substr($search, $subspos,1);
  2801. }
  2802. my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos);
  2803. #print "adding information: $addinfo \n";
  2804. push(@startadds, $addinfo);
  2805. $prestart = $prestart - $substitution_distance;
  2806. $start = $start-$substitution_distance;
  2807. $startflag = 'down';
  2808. last;
  2809. }
  2810. else{
  2811. $startflag = 'up';
  2812. }
  2813. }
  2814. }
  2815. #<STDIN>;
  2816. while ($endflag eq "down"){
  2817. my $search = join("",@seqarray[($end+1)...$postend]);
  2818. #print "search is from ($end+1)...$postend] = $search\n";
  2819. foreach my $probe (@endsubst_probes){
  2820. #print "\t\tprobe = $probe\n";
  2821. if ($search =~ /$probe$/){
  2822. my $copyprobe = $probe;
  2823. my $type;
  2824. my $subspos = 0;
  2825. my $interruption = "";
  2826. if ($search eq $endphase) { $type = "NONE";}
  2827. else{
  2828. $copyprobe =~ s/\[a-zA-Z\]/^/g;
  2829. $subspos = index($copyprobe,"^") + 1;
  2830. $type = "substitution";
  2831. $interruption = substr($search, $subspos,1);
  2832. }
  2833. my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos);
  2834. #print "adding information: $addinfo \n";
  2835. push(@endadds, $addinfo);
  2836. $postend = $postend + $substitution_distance;
  2837. $end = $end+$substitution_distance;
  2838. push(@endadds, $search);
  2839. $endflag = 'down';
  2840. last;
  2841. }
  2842. else{
  2843. $endflag = 'up';
  2844. }
  2845. }
  2846. }
  2847. #print "startadds = @startadds, endadds = @endadds \n";
  2848. }
  2849. }
  2850. sub microsat_packer{
  2851. my $microsat = $_[0];
  2852. my $addition = $_[1];
  2853. }
  2854. sub multiSpecies_interruptedMicrosatHunter_merge{
  2855. $prinkter = 0;
  2856. # print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n";
  2857. my $line = $_[0];
  2858. # print "sent for mering: $line \n" if $prinkter ==1;
  2859. my @mields = split(/\t/,$line);
  2860. my @fields = @mields;
  2861. my $microsat = allCaps($fields[$microsatcord]);
  2862. my $motifline = allCaps($fields[$motifcord]);
  2863. my $microsatcopy = $microsat;
  2864. # print "microsat = $microsat|\n" if $prinkter ==1;
  2865. $microsatcopy =~ s/^\[|\]$//sg;
  2866. chomp $microsatcopy;
  2867. my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
  2868. my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
  2869. shift @inields;
  2870. # print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1;
  2871. $motifline =~ s/^\[|\]$//sg;
  2872. my @motields = split(/\]\[/,$motifline);
  2873. my @firstmotifs = ();
  2874. my @lastmotifs = ();
  2875. for my $i (0 ... $#microields){
  2876. $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
  2877. $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
  2878. }
  2879. # print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1;
  2880. my @mergelist = ();
  2881. my @inter_poses = split(/,/,$fields[$interr_poscord]);
  2882. my $no_of_interruptions = $fields[$no_of_interruptionscord];
  2883. my @interruptions = split(/,/,$fields[$interrcord]);
  2884. my @interrtypes = split(/,/,$fields[$interrtypecord]);
  2885. my $stopper = 0;
  2886. for my $i (0 ... $#motields-1){
  2887. # print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1;
  2888. if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){
  2889. $stopper = 1;
  2890. push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1;
  2891. }
  2892. }
  2893. # print "mergelist = @mergelist\n" if $prinkter ==1;
  2894. return $line if scalar(@mergelist) == 0;
  2895. # print "merging @mergelist\n" if $prinkter ==1;
  2896. # <STDIN> if $prinkter ==1;
  2897. foreach my $merging (@mergelist){
  2898. my @sets = split(/_/, $merging);
  2899. # print "sets = @sets\n" if $prinkter ==1;
  2900. my @tempmicro = ();
  2901. my @tempmot = ();
  2902. # print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1;
  2903. for my $i (0 ... $sets[0]-1){
  2904. # print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1;
  2905. push(@tempmicro, "[".$microields[$i]."]");
  2906. push(@tempmicro, $inields[$i]);
  2907. push(@tempmot, "[".$motields[$i]."]");
  2908. # print "adding pre-motifs number $i\n" if $prinkter ==1;
  2909. # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
  2910. }
  2911. # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
  2912. # print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1;
  2913. my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
  2914. # print "middle is, from @motields - @sets, number 0 which is is\n";
  2915. # print ": $motields[$sets[0]]\n";
  2916. push (@tempmicro, $pusher);
  2917. push(@tempmot, "[".$motields[$sets[0]]."]");
  2918. push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]];
  2919. my $outcoming = -2;
  2920. # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
  2921. # print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1;
  2922. for my $i ($sets[1]+1 ... $#microields){
  2923. # print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1;
  2924. push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i];
  2925. push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i];
  2926. push(@tempmot, "[".$motields[$i]."]");
  2927. # print "adding post-motifs number $i\n" if $prinkter ==1;
  2928. $outcoming = $i;
  2929. }
  2930. # print "____________________________________________________________________________\n";
  2931. $prinkter = 0;
  2932. $fields[$microsatcord] = join("",@tempmicro);
  2933. $fields[$motifcord] = join("",@tempmot);
  2934. # print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1;
  2935. splice(@interrtypes, $sets[0], 1);
  2936. $fields[$interrtypecord] = join(",",@interrtypes);
  2937. splice(@interruptions, $sets[0], 1);
  2938. $fields[$interrcord] = join(",",@interruptions);
  2939. splice(@inter_poses, $sets[0], 1);
  2940. $fields[$interr_poscord] = join(",",@inter_poses);
  2941. $no_of_interruptions = $no_of_interruptions - 1;
  2942. }
  2943. if ($no_of_interruptions == 0 && $line !~ /compound/){
  2944. $fields[$microsatcord] =~ s/^\[|\]$//sg;
  2945. $fields[$motifcord] =~ s/^\[|\]$//sg;
  2946. $line = join("\t", @fields[0 ... $motifcord]);
  2947. }
  2948. else{
  2949. $line = join("\t", @fields);
  2950. }
  2951. # print "post merging, the line is $line\n" if $prinkter ==1;
  2952. #<STDIN> if $stopper ==1;
  2953. return $line;
  2954. }
  2955. sub interval_asseser{
  2956. my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3];
  2957. }
  2958. #---------------------------------------------------------------------------------------------------
  2959. sub allCaps{
  2960. my $motif = $_[0];
  2961. $motif =~ s/a/A/g;
  2962. $motif =~ s/c/C/g;
  2963. $motif =~ s/t/T/g;
  2964. $motif =~ s/g/G/g;
  2965. return $motif;
  2966. }
  2967. #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
  2968. #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
  2969. sub merge_interruptedMicrosats{
  2970. # print "IN merge_interruptedMicrosats: @_\n";
  2971. my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
  2972. my $input1 = $_[1]; ###### the *_sput_op4_ii file
  2973. my $input2 = $_[2]; ###### the *_sput_op4_ii file
  2974. $no_of_species = $_[3];
  2975. my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward
  2976. my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse
  2977. my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward
  2978. #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse
  2979. #my $info = $_[4];
  2980. #my @tags = split(/\t/,$info);
  2981. open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
  2982. open(INF,"<$input1") or die "Cannot open file $input1 $!";
  2983. open(INR,"<$input2") or die "Cannot open file $input2 $!";
  2984. open(OUTF,">$output1") or die "Cannot open file $output1 $!";
  2985. open(OUTR,">$output2") or die "Cannot open file $output2 $!";
  2986. open(MER,">$output3") or die "Cannot open file $output3 $!";
  2987. #open(MERR,">$output4") or die "Cannot open file $output4 $!";
  2988. $printer = 0;
  2989. # print "files opened \n";
  2990. $infocord = 2 + (4*$no_of_species) - 1;
  2991. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  2992. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  2993. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  2994. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  2995. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  2996. $typecord = $infocord + 1;
  2997. my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  2998. $interrtypecord = $motifcord + 1;
  2999. $interrcord = $motifcord + 2;
  3000. $interr_poscord = $motifcord + 3;
  3001. $no_of_interruptionscord = $motifcord + 4;
  3002. $mergestarts = $no_of_interruptionscord+ 1;
  3003. $mergeends = $no_of_interruptionscord+ 2;
  3004. $mergemicros = $no_of_interruptionscord+ 3;
  3005. # NOW ADDING FORWARD MICROSATELLITES TO HASH
  3006. my %fmicros = ();
  3007. my $microcounter=0;
  3008. my $linecounter = 0;
  3009. while (my $line = <INF>){
  3010. # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
  3011. $linecounter++;
  3012. if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3013. my $key = join("\t",$1, $2, $4, $5);
  3014. # print $key, "#-#-#-#-#-#-#-#\n";
  3015. push (@{$fmicros{$key}},$line);
  3016. $microcounter++;
  3017. }
  3018. else {
  3019. #print $line;
  3020. }
  3021. }
  3022. # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
  3023. close INF;
  3024. my @deletedlines = ();
  3025. # print "done forward hash \n";
  3026. $linecounter = 0;
  3027. #---------------------------------------------------------------------------------------------------
  3028. # NOW ADDING REVERSE MICROSATELLITES TO HASH
  3029. my %rmicros = ();
  3030. $microcounter=0;
  3031. while (my $line = <INR>){
  3032. # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
  3033. $linecounter++;
  3034. if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3035. my $key = join("\t",$1, $2, $4, $5);
  3036. # print $key, "#-#-#-#-#-#-#-#\n";
  3037. push (@{$rmicros{$key}},$line);
  3038. $microcounter++;
  3039. }
  3040. else {
  3041. #print "cant make key\n";
  3042. }
  3043. }
  3044. # print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
  3045. close INR;
  3046. # print "done reverse hash \n";
  3047. $linecounter = 0;
  3048. #------------------------------------------------------------------------------------------------
  3049. while(my $sine = <SEQ>){
  3050. #<STDIN> if $sine =~ /16349128/;
  3051. next if $sine !~ /[a-zA-Z0-9]/;
  3052. # print "-" x 150, "\n" if $printer == 1;
  3053. my @sields = split(/\t/,$sine);
  3054. my @merged = ();
  3055. my $key = ();
  3056. if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3057. $key = join("\t",$1, $2, $4, $5);
  3058. # print $key, "<-<-<-<-<-<-<-<\n";
  3059. }
  3060. # print "key = $key\n";
  3061. my @sets1;
  3062. my @sets2;
  3063. chomp $sields[$sequencepos];
  3064. my $rev_sequence = reverse($sields[$sequencepos]);
  3065. $rev_sequence =~ s/ //g;
  3066. $rev_sequence = " ".$rev_sequence;
  3067. next if (!exists $fmicros{$key} && !exists $rmicros{$key});
  3068. if (exists $fmicros{$key}){
  3069. # print "line no : $linecount\n";
  3070. my @raw_microstring = @{$fmicros{$key}};
  3071. my %starts = (); my %ends = ();
  3072. # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
  3073. my @microstring=();
  3074. for my $u (0 ... $#raw_microstring){
  3075. my @tields = split(/\t/,$raw_microstring[$u]);
  3076. next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
  3077. push(@microstring, $raw_microstring[$u]);
  3078. $starts{$tields[$startcord]} = $tields[$startcord];
  3079. $ends{$tields[$endcord]} = $tields[$endcord];
  3080. }
  3081. # print "founf microstring in forward\n: @microstring\n";
  3082. chomp @microstring;
  3083. my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos]));
  3084. @sets1 = split("\=", $clusterresult);
  3085. my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp);
  3086. # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>;
  3087. } #if (exists $micros{$key}){
  3088. if (exists $rmicros{$key}){
  3089. # print "line no : $linecount\n";
  3090. my @raw_microstring = @{$rmicros{$key}};
  3091. my %starts = (); my %ends = ();
  3092. # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
  3093. my @microstring=();
  3094. for my $u (0 ... $#raw_microstring){
  3095. my @tields = split(/\t/,$raw_microstring[$u]);
  3096. next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
  3097. push(@microstring, $raw_microstring[$u]);
  3098. $starts{$tields[$startcord]} = $tields[$startcord];
  3099. $ends{$tields[$endcord]} = $tields[$endcord];
  3100. }
  3101. # print "founf microstring in reverse\n: @microstring\n"; <STDIN>;
  3102. chomp @microstring;
  3103. # print "sending reversed sequence\n";
  3104. my $clusterresult = (find_clusters(@microstring, $rev_sequence ) );
  3105. @sets2 = split("\=", $clusterresult);
  3106. my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp);
  3107. } #if (exists $micros{$key}){
  3108. my @popout1 = ();
  3109. my @popout2 = ();
  3110. my @forwardset = ();
  3111. if (exists $sets2[1] ){
  3112. if(exists $sets1[0]) {
  3113. push (@popout1, $sets1[0],$sets2[1]);
  3114. my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));#
  3115. print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n";
  3116. my @localmerged = split("_X0X_", $forwardset[1]);
  3117. my $sequence = $sields[$sequencepos];
  3118. $sequence =~ s/ //g;
  3119. # print "\nforwardset = @forwardset\n";
  3120. for my $j (0 ... $#localmerged){
  3121. $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
  3122. }
  3123. push (@merged, @localmerged);
  3124. }
  3125. else{
  3126. my @localmerged = split("_X0X_", $sets2[1]);
  3127. my $sequence = $sields[$sequencepos];
  3128. $sequence =~ s/ //g;
  3129. for my $j (0 ... $#localmerged){
  3130. # print "\nlocalmerged = @localmerged\n";
  3131. $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
  3132. }
  3133. push (@merged, @localmerged);
  3134. }
  3135. }
  3136. elsif (exists $sets1[0]){
  3137. print OUTF join("\n",split("_X0X_", $sets1[0])), "\n";
  3138. }
  3139. my @reverseset= ();
  3140. if (exists $sets1[1]){
  3141. if (exists $sets2[0]){
  3142. push (@popout2, $sets2[0],$sets1[1]);
  3143. # print "popout2 = @popout2\n";
  3144. my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos]));
  3145. #print "reverseset = $reverseset[1] < --- reverseset1\n";
  3146. print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n";
  3147. push(@merged, (split("_X0X_", $reverseset[1])));
  3148. }
  3149. else{
  3150. push(@merged, (split("_X0X_", $sets1[1])));
  3151. }
  3152. }
  3153. elsif (exists $sets2[0]){
  3154. print OUTR join("\n",split("_X0X_", $sets2[0])), "\n";
  3155. }
  3156. if (scalar @merged > 0){
  3157. my @filtered_merged = split("__",(filterDuplicates_merged(@merged)));
  3158. print MER join("\n", @filtered_merged),"\n";
  3159. }
  3160. # <STDIN> if $sine =~ /16349128/;
  3161. }
  3162. close(SEQ);
  3163. close(INF);
  3164. close(INR);
  3165. close(OUTF);
  3166. close(OUTR);
  3167. close(MER);
  3168. }
  3169. sub find_clusters{
  3170. my @input = @_;
  3171. my $sequence = pop(@input);
  3172. $sequence =~ s/ //g;
  3173. my @microstring0 = @input;
  3174. # print "IN: find_clusters:\n";
  3175. my %microstart=();
  3176. my %microend=();
  3177. my @nonmerged = ();
  3178. my @mergedSet = ();
  3179. # print "set of microsats = @microstring \n";
  3180. my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0;
  3181. # print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1;
  3182. #<STDIN> if $printer == 1;
  3183. my @tempmicrostring = @microstring;
  3184. foreach my $line (@tempmicrostring){
  3185. my @fields = split(/\t/,$line);
  3186. my $start = $fields[$startcord];
  3187. my $end = $fields[$endcord];
  3188. next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/;
  3189. # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
  3190. push (@{$microstart{$start}},$line);
  3191. push (@{$microend{$end}},$line);
  3192. }
  3193. my $firstflag = 'down';
  3194. while( my $line =shift(@microstring)){
  3195. # print "-----------\nline = $line \n" if $printer == 1;
  3196. chomp $line;
  3197. my @fields = split(/\t/,$line);
  3198. my $start = $fields[$startcord];
  3199. my $end = $fields[$endcord];
  3200. next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ;
  3201. my $startmicro = $line;
  3202. my $endmicro = $line;
  3203. # print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
  3204. delete ($microstart{$start});
  3205. delete ($microend{$end});
  3206. my $flag = 'down';
  3207. my $startflag = 'down';
  3208. my $endflag = 'down';
  3209. my $prestart = $start - $distance;
  3210. my $postend = $end + $distance;
  3211. my @compoundlines = ();
  3212. my %compoundhash = ();
  3213. push (@compoundlines, $line);
  3214. push (@{$compoundhash{$line}},$line);
  3215. my $startrank = 1;
  3216. my $endrank = 1;
  3217. while( ($startflag eq "down") || ($endflag eq "down") ){
  3218. # print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1;
  3219. if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){
  3220. # print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1;
  3221. last;
  3222. }
  3223. $firstflag = "up";
  3224. if ($startflag eq "down"){
  3225. for my $i ($prestart ... $end){
  3226. if(exists $microend{$i}){
  3227. chomp $microend{$i}[0];
  3228. if(exists $compoundhash{$microend{$i}[0]}) {next;}
  3229. chomp $microend{$i}[0];
  3230. push(@compoundlines, $microend{$i}[0]);
  3231. my @tields = split(/\t/,$microend{$i}[0]);
  3232. $startmicro = $microend{$i}[0];
  3233. chomp $startmicro;
  3234. $flag = 'down';
  3235. $startrank++;
  3236. # print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1;
  3237. delete $microend{$i};
  3238. delete $microstart{$tields[$startcord]};
  3239. $end = $tields[$endcord];
  3240. $startflag = 'down';
  3241. $prestart = $tields[$startcord] - $distance;
  3242. last;
  3243. }
  3244. else{
  3245. $flag = 'up';
  3246. $startflag = 'up';
  3247. }
  3248. }
  3249. }
  3250. if ($endflag eq "down"){
  3251. for my $i ($start ... $postend){
  3252. # print "$start ----> $i -----> $postend\n" if $printer == 1;
  3253. if(exists $microstart{$i} ){
  3254. chomp $microstart{$i}[0];
  3255. if(exists $compoundhash{$microstart{$i}[0]}) {next;}
  3256. chomp $microstart{$i}[0];
  3257. push(@compoundlines, $microstart{$i}[0]);
  3258. my @tields = split(/\t/,$microstart{$i}[0]);
  3259. $endmicro = $microstart{$i}[0];
  3260. $endrank++;
  3261. chomp $endmicro;
  3262. $flag = 'down';
  3263. # print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1;
  3264. delete $microstart{$i} if exists $microstart{$i} ;
  3265. delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]};
  3266. # print "done\n" if $printer == 1;
  3267. shift @microstring;
  3268. $end = $tields[$endcord];
  3269. $postend = $tields[$endcord] + $distance;
  3270. $endflag = 'down';
  3271. last;
  3272. }
  3273. else{
  3274. $flag = 'up';
  3275. $endflag = 'up';
  3276. }
  3277. # print "out of the if\n" if $printer == 1;
  3278. }
  3279. # print "out of the for\n" if $printer == 1;
  3280. }
  3281. # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
  3282. } #end while( $flag eq "down")
  3283. # print "compoundlines = @compoundlines \n" if $printer == 1;
  3284. if (scalar (@compoundlines) == 1){
  3285. push(@nonmerged, $line);
  3286. }
  3287. if (scalar (@compoundlines) > 1){
  3288. # print "FROM CLUSTERER\n" if $printer == 1;
  3289. push(@mergedSet,merge_microsats(@compoundlines, $sequence) );
  3290. }
  3291. } #end foreach my $line (@microstring){
  3292. # print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1;
  3293. #<STDIN> if scalar(@mergedSet) > 0;
  3294. # print "EXIT: find_clusters\n";
  3295. return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet));
  3296. }
  3297. sub custom {
  3298. $a->[$startcord+1] <=> $b->[$startcord+1];
  3299. }
  3300. sub popOuter {
  3301. # print "\nIN: popOuter @_\n"; <STDIN>;
  3302. my @all = split ("_X0X_",$_[0]);
  3303. # <STDIN> if !defined $_[0];
  3304. my @merged = split ("_X0X_",$_[1]);
  3305. my $sequence = $_[2];
  3306. my $seqlen = length($sequence);
  3307. my %microstart=();
  3308. my %microend=();
  3309. my @mergedSet = ();
  3310. my @nonmerged = ();
  3311. foreach my $line (@all){
  3312. my @fields = split(/\t/,$line);
  3313. my $start = $seqlen - $fields[$startcord]+ 1;
  3314. my $end = $seqlen - $fields[$endcord] + 1;
  3315. push (@{$microstart{$start}},$line);
  3316. push (@{$microend{$end}},$line);
  3317. }
  3318. my $firstflag = 'down';
  3319. my %forPopouting = ();
  3320. while( my $line =shift(@merged)){
  3321. # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ;
  3322. chomp $line;
  3323. my @fields = split(/\t/,$line);
  3324. my $start = $fields[$startcord];
  3325. my $end = $fields[$endcord];
  3326. my $startmicro = $line;
  3327. my $endmicro = $line;
  3328. delete ($microstart{$start});
  3329. delete ($microend{$end});
  3330. my $flag = 'down';
  3331. my $startflag = 'down';
  3332. my $endflag = 'down';
  3333. my $prestart = $start - $distance;
  3334. my $postend = $end + $distance;
  3335. my @compoundlines = ();
  3336. my %compoundhash = ();
  3337. push (@compoundlines, $line);
  3338. my $startrank = 1;
  3339. my $endrank = 1;
  3340. # print "\nstart = $start, end = $end\n";
  3341. # <STDIN>;
  3342. for my $i ($start ... $end){
  3343. if(exists $microend{$i}){
  3344. # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n";
  3345. chomp $microend{$i}[0];
  3346. my @fields = split(/\t/,$microend{$i}[0]);
  3347. delete $microstart{$seqlen - $fields[$startcord] + 1};
  3348. my $invertseq = $sequence;
  3349. $invertseq =~ s/ //g;
  3350. push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq ));
  3351. delete $microend{$i};
  3352. }
  3353. if(exists $microstart{$i} ){
  3354. # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n";
  3355. chomp $microstart{$i}[0];
  3356. my @fields = split(/\t/,$microstart{$i}[0]);
  3357. delete $microend{$seqlen - $fields[$endcord] + 1};
  3358. my $invertseq = $sequence;
  3359. $invertseq =~ s/ //g;
  3360. push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) );
  3361. delete $microstart{$i};
  3362. }
  3363. }
  3364. if (scalar (@compoundlines) == 1){
  3365. push(@mergedSet,join("\t",@compoundlines) );
  3366. }
  3367. else {
  3368. # print "FROM POPOUTER\n" if $printer == 1;
  3369. push(@mergedSet, merge_microsats(@compoundlines, $sequence) );
  3370. }
  3371. }
  3372. foreach my $key (sort keys %microstart) {
  3373. push(@nonmerged,$microstart{$key}[0]);
  3374. }
  3375. return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) );
  3376. }
  3377. sub invert_justCoordinates{
  3378. my $microsat = $_[0];
  3379. # print "IN invert_justCoordinates ... @_\n" ; <STDIN>;
  3380. chomp $microsat;
  3381. my $seqLength = $_[1];
  3382. my @fields = split(/\t/,$microsat);
  3383. my $start = $seqLength - $fields[$endcord] + 1;
  3384. my $end = $seqLength - $fields[$startcord] + 1;
  3385. $fields[$startcord] = $start;
  3386. $fields[$endcord] = $end;
  3387. $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
  3388. # print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1;
  3389. return join("\t",@fields);
  3390. }
  3391. sub largest_number{
  3392. my $counter = 0;
  3393. my($max) = shift(@_);
  3394. foreach my $temp (@_) {
  3395. #print "finding largest array: $maxcounter \n";
  3396. if($temp > $max){
  3397. $max = $temp;
  3398. }
  3399. }
  3400. return($max);
  3401. }
  3402. sub smallest_number{
  3403. my $counter = 0;
  3404. my($min) = shift(@_);
  3405. foreach my $temp (@_) {
  3406. #print "finding largest array: $maxcounter \n";
  3407. if($temp < $min){
  3408. $min = $temp;
  3409. }
  3410. }
  3411. return($min);
  3412. }
  3413. sub filterDuplicates_merged{
  3414. my @merged = @_;
  3415. my %revmerged = ();
  3416. my @fmerged = ();
  3417. foreach my $micro (@merged) {
  3418. my @fields = split(/\t/,$micro);
  3419. if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){
  3420. my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
  3421. # print "adding ... \n$key\n$micro\n";
  3422. push(@{$revmerged{$key}}, $micro);
  3423. }
  3424. else{
  3425. # print "pushing.. $micro\n";
  3426. push(@fmerged, $micro);
  3427. }
  3428. }
  3429. # print "\n";
  3430. foreach my $micro (@fmerged) {
  3431. my @fields = split(/\t/,$micro);
  3432. my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
  3433. # print "searching for key $key\n";
  3434. if (exists $revmerged{$key}){
  3435. # print "deleting $revmerged{$key}[0]\n";
  3436. delete $revmerged{$key};
  3437. }
  3438. }
  3439. foreach my $key (sort keys %revmerged) {
  3440. push(@fmerged,$revmerged{$key}[0]);
  3441. }
  3442. # print "returning ", join("\n", @fmerged),"\n" ;
  3443. return join("__", @fmerged);
  3444. }
  3445. sub invert_microsat{
  3446. my $micro = $_[0];
  3447. chomp $micro;
  3448. if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;}
  3449. else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; }
  3450. my $sequence = $_[1];
  3451. $sequence =~ s/ //g;
  3452. my $seqlen = length($sequence);
  3453. my @fields = split(/\t/,$micro);
  3454. my $start = $seqlen - $fields[$endcord] +1;
  3455. my $end = $seqlen - $fields[$startcord] +1;
  3456. $fields[$startcord] = $start;
  3457. $fields[$endcord] = $end;
  3458. $fields[$motifcord] = reverse_micro($fields[$motifcord]);
  3459. $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
  3460. if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){
  3461. my @intertypes = split(/,/,$fields[$interrtypecord]);
  3462. my @inters = split(/,/,$fields[$interrcord]);
  3463. my @interposes = split(/,/,$fields[$interr_poscord]);
  3464. $fields[$interrtypecord] = join(",",reverse(@intertypes));
  3465. $fields[$no_of_interruptionscord] = scalar(@interposes);
  3466. for my $i (0 ... $fields[$no_of_interruptionscord]-1){
  3467. if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){
  3468. $inters[$i] = reverse($inters[$i]);
  3469. $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1;
  3470. }
  3471. else{
  3472. $inters[$i] = "";
  3473. $interposes[$i] = $interposes[$i] - 1;
  3474. }
  3475. $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1;
  3476. }
  3477. $fields[$interrcord] = join(",",reverse(@inters));
  3478. $fields[$interr_poscord] = join(",",reverse(@interposes));
  3479. }
  3480. my $finalmicrosat = join("\t", @fields);
  3481. return $finalmicrosat;
  3482. }
  3483. sub reverse_micro{
  3484. my $micro = reverse($_[0]);
  3485. my @strand = split(/\s*/,$micro);
  3486. for my $i (0 ... $#strand){
  3487. if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
  3488. if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
  3489. }
  3490. return join("",@strand);
  3491. }
  3492. #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
  3493. #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
  3494. sub forward_reverse_sputoutput_comparer {
  3495. # print "IN forward_reverse_sputoutput_comparer: @_\n";
  3496. my $input0 = $_[0]; ###### the *nogap_unrand_match file
  3497. my $input1 = $_[1]; ###### the real file, *sput* data
  3498. my $input2 = $_[2]; ###### the reverse file, *sput* data
  3499. my $output1 = $_[3]; ###### microsats different in real file
  3500. my $output2 = $_[4]; ###### microsats missing in real file
  3501. my $output3 = $_[5]; ###### microsats common among real and reverse file
  3502. my $no_of_species = $_[6];
  3503. $infocord = 2 + (4*$no_of_species) - 1;
  3504. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  3505. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  3506. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  3507. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  3508. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  3509. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  3510. $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  3511. $interrtypecord = $motifcord + 1;
  3512. $interrcord = $motifcord + 2;
  3513. $interr_poscord = $motifcord + 3;
  3514. $no_of_interruptionscord = $motifcord + 4;
  3515. $mergestarts = $no_of_interruptionscord+ 1;
  3516. $mergeends = $no_of_interruptionscord+ 2;
  3517. $mergemicros = $no_of_interruptionscord+ 3;
  3518. open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
  3519. open(INF,"<$input1") or die "Cannot open file $input1 $!";
  3520. open(INR,"<$input2") or die "Cannot open file $input2 $!";
  3521. open(DIFF,">$output1") or die "Cannot open file $output1 $!";
  3522. #open(MISS,">$output2") or die "Cannot open file $output2 $!";
  3523. open(SAME,">$output3") or die "Cannot open file $output3 $!";
  3524. # print "opened files \n";
  3525. my $linecounter = 0;
  3526. my $fcounter = 0;
  3527. my $rcounter = 0;
  3528. $printer = 0;
  3529. #---------------------------------------------------------------------------------------------------
  3530. # NOW ADDING FORWARD MICROSATELLITES TO HASH
  3531. my %fmicros = ();
  3532. my $microcounter=0;
  3533. while (my $line = <INF>){
  3534. $linecounter++;
  3535. if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3536. my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
  3537. # print $key, "#-#-#-#-#-#-#-#\n";
  3538. push (@{$fmicros{$key}},$line);
  3539. $microcounter++;
  3540. }
  3541. else {
  3542. #print $line;
  3543. }
  3544. }
  3545. # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
  3546. close INF;
  3547. my @deletedlines = ();
  3548. # print "done forward hash \n";
  3549. $linecounter = 0;
  3550. #---------------------------------------------------------------------------------------------------
  3551. # NOW ADDING REVERSE MICROSATELLITES TO HASH
  3552. my %rmicros = ();
  3553. $microcounter=0;
  3554. while (my $line = <INR>){
  3555. $linecounter++;
  3556. if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3557. my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
  3558. # print $key, "#-#-#-#-#-#-#-#\n";
  3559. push (@{$rmicros{$key}},$line);
  3560. $microcounter++;
  3561. }
  3562. else {}
  3563. }
  3564. # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
  3565. close INR;
  3566. # print "done reverse hash \n";
  3567. $linecounter = 0;
  3568. #---------------------------------------------------------------------------------------------------
  3569. #---------------------------------------------------------------------------------------------------
  3570. # NOW READING THE SEQUENCE FILE
  3571. while(my $sine = <SEQ>){
  3572. my %microstart=();
  3573. my %microend=();
  3574. my @sields = split(/\t/,$sine);
  3575. my $key = ();
  3576. if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3577. $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
  3578. }
  3579. else {
  3580. next;
  3581. }
  3582. $printer = 0;
  3583. my $sequence = $sields[$sequencepos];
  3584. chomp $sequence;
  3585. $sequence =~ s/ //g;
  3586. my @localfs = ();
  3587. my @localrs = ();
  3588. if (exists $fmicros{$key}){
  3589. @localfs = @{$fmicros{$key}};
  3590. delete $fmicros{$key};
  3591. }
  3592. my %forwardstarts = ();
  3593. my %forwardends = ();
  3594. foreach my $f (@localfs){
  3595. my @fields = split(/\t/,$f);
  3596. push (@{$forwardstarts{$fields[$startcord]}},$f);
  3597. push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]);
  3598. }
  3599. if (exists $rmicros{$key}){
  3600. @localrs = @{$rmicros{$key}};
  3601. delete $rmicros{$key};
  3602. }
  3603. else{
  3604. }
  3605. foreach my $r (@localrs){
  3606. chomp $r;
  3607. my @rields = split(/\t/,$r);
  3608. # print "rields = @rields\n" if $printer == 1;
  3609. my $reciprocalstart = length($sequence) - $rields[$endcord] + 1;
  3610. my $reciprocalend = length($sequence) - $rields[$startcord] + 1;
  3611. # print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1;
  3612. my $microsat = reverse_micro(all_caps($rields[$microsatcord]));
  3613. my @localcollection=();
  3614. for my $i ($reciprocalstart+1 ... $reciprocalend-1){
  3615. if (exists $forwardstarts{$i}){
  3616. push(@localcollection, $forwardstarts{$i}[0] );
  3617. delete $forwardstarts{$i};
  3618. }
  3619. if (exists $forwardends{$i}){
  3620. next if !exists $forwardstarts{$forwardends{$i}[0]};
  3621. push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] );
  3622. }
  3623. }
  3624. if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);}
  3625. if (scalar(@localcollection) == 0){
  3626. print SAME invert_microsat($r,($sequence) ), "\n";
  3627. }
  3628. elsif (scalar(@localcollection) == 1){
  3629. # print "f microsat = $localcollection[0]\n" if $printer == 1;
  3630. my @lields = split(/\t/,$localcollection[0]);
  3631. $lields[$microsatcord]=all_caps($lields[$microsatcord]);
  3632. # print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1;
  3633. # print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1;
  3634. if ($microsat eq $lields[$microsatcord]){
  3635. chomp $localcollection[0];
  3636. print SAME $localcollection[0], "\n";
  3637. }
  3638. if ($microsat ne $lields[$microsatcord]){
  3639. chomp $localcollection[0];
  3640. my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence);
  3641. # print "newmicro = $newmicro\n" if $printer == 1;
  3642. if ($newmicro =~ /[a-zA-Z]/){
  3643. print SAME $newmicro,"\n";
  3644. }
  3645. else{
  3646. print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n";
  3647. # print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1;
  3648. # print "@rields\n@lields\n" if $printer == 1;
  3649. }
  3650. }
  3651. }
  3652. else{
  3653. # print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1;
  3654. }
  3655. }
  3656. }
  3657. close(SEQ);
  3658. close(INF);
  3659. close(INR);
  3660. close(DIFF);
  3661. close(SAME);
  3662. }
  3663. sub all_caps{
  3664. my @strand = split(/\s*/,$_[0]);
  3665. for my $i (0 ... $#strand){
  3666. if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
  3667. if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
  3668. if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
  3669. if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
  3670. }
  3671. return join("",@strand);
  3672. }
  3673. sub microsatChooser{
  3674. my $forward = $_[0];
  3675. my $reverse = $_[1];
  3676. my $sequence = $_[2];
  3677. my $seqLength = length($sequence);
  3678. $sequence =~ s/ //g;
  3679. my @fields = split(/\t/,$forward);
  3680. my @rields = split(/\t/,$reverse);
  3681. my $r_start = $seqLength - $rields[$endcord] + 1;
  3682. my $r_end = $seqLength - $rields[$startcord] + 1;
  3683. my $f_microsat = $fields[$microsatcord];
  3684. my $r_microsat = $rields[$microsatcord];
  3685. if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) {
  3686. return $forward if length($f_microsat) >= length($r_microsat);
  3687. return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat);
  3688. }
  3689. return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord];
  3690. my $f_microsat_copy = $f_microsat;
  3691. my $r_microsat_copy = $r_microsat;
  3692. $f_microsat_copy =~ s/^\[|\]$//g;
  3693. $r_microsat_copy =~ s/^\[|\]$//g;
  3694. my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
  3695. my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
  3696. my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
  3697. my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
  3698. my $f_motif = $fields[$motifcord];
  3699. my $r_motif = $rields[$motifcord];
  3700. my $f_motif_copy = $f_motif;
  3701. my $r_motif_copy = $r_motif;
  3702. $f_motif_copy =~ s/^\[|\]$//g;
  3703. $r_motif_copy =~ s/^\[|\]$//g;
  3704. my @f_motields = split(/\]\[/,$f_motif_copy);
  3705. my @r_motields = split(/\]\[/,$r_motif_copy);
  3706. my $f_purestretch = join("",@f_microields);
  3707. my $r_purestretch = join("",@r_microields);
  3708. if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){
  3709. # print "now.. studying $forward\n$reverse\n" if $printer == 1;
  3710. if ($fields[$typecord] eq $rields[$typecord]){
  3711. # print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
  3712. if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
  3713. my $subset_answer = isSubset($forward, $reverse, $seqLength);
  3714. # print "subset answer = $subset_answer\n" if $printer == 1;
  3715. return $forward if $subset_answer == 1;
  3716. return invert_microsat($reverse, $sequence) if $subset_answer == 2;
  3717. return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
  3718. return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
  3719. return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch);
  3720. return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch);
  3721. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ;
  3722. }
  3723. elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
  3724. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3725. }
  3726. elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
  3727. return $forward;
  3728. }
  3729. elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
  3730. return invert_microsat($reverse, $sequence);
  3731. }
  3732. }
  3733. else{
  3734. my $fmotlen = ();
  3735. my $rmotlen = ();
  3736. $fmotlen =1 if $fields[$typecord] eq "mononucleotide";
  3737. $fmotlen =2 if $fields[$typecord] eq "dinucleotide";
  3738. $fmotlen =3 if $fields[$typecord] eq "trinucleotide";
  3739. $fmotlen =4 if $fields[$typecord] eq "tetranucleotide";
  3740. $rmotlen =1 if $rields[$typecord] eq "mononucleotide";
  3741. $rmotlen =2 if $rields[$typecord] eq "dinucleotide";
  3742. $rmotlen =3 if $rields[$typecord] eq "trinucleotide";
  3743. $rmotlen =4 if $rields[$typecord] eq "tetranucleotide";
  3744. if ($fmotlen < $rmotlen){
  3745. if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){
  3746. return $forward;
  3747. }
  3748. else{
  3749. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3750. }
  3751. }
  3752. if ($fmotlen > $rmotlen){
  3753. if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){
  3754. return invert_microsat($reverse, $sequence);
  3755. }
  3756. else{
  3757. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3758. }
  3759. }
  3760. }
  3761. }
  3762. if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){
  3763. # print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
  3764. if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
  3765. my $subset_answer = isSubset($forward, $reverse, $seqLength);
  3766. # print "subset answer = $subset_answer\n" if $printer == 1;
  3767. return $forward if $subset_answer == 1;
  3768. return invert_microsat($reverse, $sequence) if $subset_answer == 2;
  3769. # print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1;
  3770. return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
  3771. return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
  3772. if ($subset_answer == 3){
  3773. if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){
  3774. if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){
  3775. return $forward;
  3776. }
  3777. else{
  3778. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3779. }
  3780. }
  3781. if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){
  3782. if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){
  3783. return invert_microsat($reverse, $sequence);
  3784. }
  3785. else{
  3786. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3787. }
  3788. }
  3789. }
  3790. }
  3791. elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
  3792. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3793. }
  3794. elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
  3795. return $forward;
  3796. }
  3797. elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
  3798. return invert_microsat($reverse, $sequence);
  3799. }
  3800. }
  3801. if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){
  3802. # print "one compound, one nucleotide\n" if $printer == 1;
  3803. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3804. }
  3805. if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){
  3806. # print "one compound, one nucleotide\n" if $printer == 1;
  3807. return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
  3808. }
  3809. }
  3810. sub isSubset{
  3811. my $forward = $_[0]; my @fields = split(/\t/,$forward);
  3812. my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
  3813. my $seqLength = $_[2];
  3814. my $r_start = $seqLength - $rields[$endcord] + 1;
  3815. my $r_end = $seqLength - $rields[$startcord] + 1;
  3816. # print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1;
  3817. return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end;
  3818. return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end;
  3819. return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord];
  3820. return "3";
  3821. }
  3822. sub motifBYmotif_match{
  3823. my $forward = $_[0];
  3824. my $reverse = $_[1];
  3825. $forward =~ s/^\[|\]$//g;
  3826. $reverse =~ s/^\[|\]$//g;
  3827. my @f_motields=split(/\]\[/, $forward);
  3828. my @r_motields=split(/\]\[/, $reverse);
  3829. my $finalresult = 0;
  3830. if (scalar(@f_motields) != scalar(@r_motields)){
  3831. my $subresult = 0;
  3832. my @mega = (); my @sub = ();
  3833. @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields);
  3834. @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields);
  3835. @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields);
  3836. @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields);
  3837. for my $i (0 ... $#sub){
  3838. my $probe = $sub[$i].$sub[$i];
  3839. # print "probing $probe and $mega[$i]\n" if $printer == 1;
  3840. if ($probe =~ /$mega[$i]/) {$subresult = 1; }
  3841. else {$subresult = 0; last; }
  3842. }
  3843. return 0 if $subresult == 0;
  3844. return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f
  3845. return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse
  3846. }
  3847. else{
  3848. for my $i (0 ... $#f_motields){
  3849. my $probe = $f_motields[$i].$f_motields[$i];
  3850. if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;}
  3851. else {$finalresult = 0 ;last;}
  3852. }
  3853. }
  3854. # print "finalresult = $finalresult\n" if $printer == 1;
  3855. return $finalresult;
  3856. }
  3857. sub merge_microsats{
  3858. my @input = @_;
  3859. my $sequence = pop(@input);
  3860. $sequence =~ s/ //g;
  3861. my @seq_string = @input;
  3862. # print "IN: merge_microsats\n";
  3863. # print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n";
  3864. my $start;
  3865. my $end;
  3866. my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string;
  3867. # print "\nrearranged into @micros \n";
  3868. my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = ();
  3869. my @fields = ();
  3870. for my $i (0 ... $#micros){
  3871. chomp $micros[$i];
  3872. @fields = split(/\t/,$micros[$i]);
  3873. push(@types, $fields[$typecord]);
  3874. push(@motifs, $fields[$motifcord]);
  3875. if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);}
  3876. else { push(@interruptiontypes, "NA"); }
  3877. if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);}
  3878. else { push(@interruptions, "NA"); }
  3879. if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);}
  3880. else { push(@interrposes, "NA"); }
  3881. if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);}
  3882. else { push(@no_of_interruptions, "NA"); }
  3883. if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));}
  3884. else { push(@mergestart, $fields[$startcord]); }
  3885. if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));}
  3886. else { push(@mergeend, $fields[$endcord]); }
  3887. if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);}
  3888. else { push(@mergemicro, $fields[$microsatcord]); }
  3889. }
  3890. $start = smallest_number(@mergestart);
  3891. $end = largest_number(@mergeend);
  3892. my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]";
  3893. my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro));
  3894. return $microsat;
  3895. }
  3896. sub slided_microsat{
  3897. my $forward = $_[0]; my @fields = split(/\t/,$forward);
  3898. my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
  3899. my $seqLength = $_[2];
  3900. my $r_start = $seqLength - $rields[$endcord] + 1;
  3901. my $r_end = $seqLength - $rields[$startcord] + 1;
  3902. my $motlen =();
  3903. $motlen =1 if $fields[$typecord] eq "mononucleotide";
  3904. $motlen =2 if $fields[$typecord] eq "dinucleotide";
  3905. $motlen =3 if $fields[$typecord] eq "trinucleotide";
  3906. $motlen =4 if $fields[$typecord] eq "tetranucleotide";
  3907. if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) {
  3908. return 0;
  3909. }
  3910. else{
  3911. return 1;
  3912. }
  3913. }
  3914. #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
  3915. #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
  3916. sub new_multispecies_t10{
  3917. my $input1 = $_[0]; #gap_op_unrand_match
  3918. my $input2 = $_[1]; #sput
  3919. my $output = $_[2]; #output
  3920. my $bin = $output."_bin";
  3921. my $orgs = join("|",split(/\./,$_[3]));
  3922. my @organisms = split(/\./,$_[3]);
  3923. my $no_of_species = scalar(@organisms); #3
  3924. my $t10info = $output."_info";
  3925. $prinkter = 0;
  3926. open (MATCH, "<$input1");
  3927. open (SPUT, "<$input2");
  3928. open (OUT, ">$output");
  3929. open (INFO, ">$t10info");
  3930. sub microsat_bracketer;
  3931. sub custom;
  3932. my %seen = ();
  3933. $infocord = 2 + (4*$no_of_species) - 1;
  3934. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  3935. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  3936. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  3937. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  3938. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  3939. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  3940. $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  3941. #---------------------------------------------------------------------------------------------------------------#
  3942. # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES #
  3943. #---------------------------------------------------------------------------------------------------------------#
  3944. my $linecounter = 0;
  3945. my $microcounter = 0;
  3946. while (my $line = <SPUT>){
  3947. chomp $line;
  3948. # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
  3949. next if $line !~ /[0-9a-z]+/;
  3950. $linecounter++;
  3951. # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
  3952. # print $key, "#-#-#-#-#-#-#-#\n";
  3953. if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3954. my $key = join("\t",$1, $2, $3, $4, $5);
  3955. # print "key = $key\n" if $prinkter == 1;
  3956. push (@{$seen{$key}},$line);
  3957. $microcounter++;
  3958. }
  3959. else {
  3960. #print "could not make ker in SPUT : \n$line \n";
  3961. }
  3962. }
  3963. # print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
  3964. # print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
  3965. close SPUT;
  3966. #----------------------------------------------------------------------------------------------------------------
  3967. #-------------------------------------------------------------------------------------------------------#
  3968. # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY #
  3969. # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN #
  3970. # SECOND FILE FOLLOWED BY #
  3971. # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION #
  3972. #-------------------------------------------------------------------------------------------------------#
  3973. my $ref = 0;
  3974. my $ref2 = 0;
  3975. my $ref3 = 0;
  3976. my $ref4 = 0;
  3977. my $deletes= 0;
  3978. my $duplicates = 0;
  3979. my $neighbors = 0;
  3980. my $tooshort = 0;
  3981. my $prevmicrol=();
  3982. my $startnotfound = 0;
  3983. my $matchkeysformed = 0;
  3984. my $keysused = 0;
  3985. while (my $line = <MATCH>) {
  3986. # print colored ['magenta'], $line if $prinkter == 1;
  3987. next if $line !~ /[a-zA-Z0-9]/;
  3988. chomp $line;
  3989. my @fields2 = split(/\t/,$line);
  3990. my $key2 = ();
  3991. # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
  3992. if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  3993. $matchkeysformed++;
  3994. $key2 = join("\t",$1, $2, $3, $4, $5);
  3995. # print "key = $key2 \n" if $prinkter == 1;
  3996. }
  3997. else{
  3998. # print "could not make ker in SEQ : $line\n";
  3999. next;
  4000. }
  4001. my $sequence = $fields2[$sequencepos];
  4002. $sequence =~ s/\*/-/g;
  4003. my $count = 0;
  4004. if (exists $seen{$key2}){
  4005. $keysused++;
  4006. my @unsorted_raw = @{$seen{$key2}};
  4007. delete $seen{$key2};
  4008. my @sequencearr = split(/\s*/, $sequence);
  4009. # print "sequencearr = @sequencearr\n" if $prinkter == 1;
  4010. my $counter;
  4011. my %start_database = ();
  4012. my %end_database = ();
  4013. foreach my $uns (@unsorted_raw){
  4014. my @uields = split(/\t/,$uns);
  4015. $start_database{$uields[$startcord]} = $uns;
  4016. $end_database{$uields[$endcord]} = $uns;
  4017. }
  4018. my @unsorted = ();
  4019. my %starts = (); my %ends = ();
  4020. # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;}
  4021. for my $u (0 ... $#unsorted_raw){
  4022. my @tields = split(/\t/,$unsorted_raw[$u]);
  4023. next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
  4024. push(@unsorted, $unsorted_raw[$u]);
  4025. $starts{$tields[$startcord]} = $unsorted_raw[$u];
  4026. # print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1;
  4027. }
  4028. my $basecounter= 0;
  4029. my $gapcounter = 0;
  4030. my $poscounter = 0;
  4031. for my $s (@sequencearr){
  4032. $poscounter++;
  4033. if ($s eq "-"){
  4034. $gapcounter++; next;
  4035. }
  4036. else{
  4037. $basecounter++;
  4038. }
  4039. #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
  4040. #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
  4041. #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1;
  4042. if (exists $starts{$basecounter}){
  4043. my $locus = $starts{$basecounter};
  4044. # print "locus identified = $locus\n" if $prinkter == 1;
  4045. my @fields3 = split(/\t/,$locus);
  4046. my $start = $fields3[$startcord];
  4047. my $end = $fields3[$endcord];
  4048. my $motif = $fields3[$motifcord];
  4049. my $microsat = $fields3[$microsatcord];
  4050. my @leftbracketpos = ();
  4051. my @rightbracketpos = ();
  4052. my $bracket_picker = 'no';
  4053. my $leftbrackets=();
  4054. my $rightbrackets = ();
  4055. my $micro_cpy = $microsat;
  4056. # print "microsat = $microsat\n" if $prinkter == 1;
  4057. while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';}
  4058. while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);}
  4059. $microsat =~ s/[\[\]\-\*]//g;
  4060. # print "microsat = $microsat\n" if $prinkter == 1;
  4061. my $human_search = join '-*', split //, $microsat;
  4062. my $temp = substr($sequence, $poscounter-1);
  4063. # print "with poscounter = $poscounter\n" if $prinkter == 1;
  4064. my $search_result = ();
  4065. my $posnow = ();
  4066. # print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search;
  4067. # <STDIN> if !defined $temp || !defined $human_search;
  4068. while ($temp =~ /($human_search)/gi){
  4069. $search_result = $1;
  4070. # $posnow = pos($temp);
  4071. last;
  4072. }
  4073. my @gapspos = ();
  4074. next if !defined $search_result;
  4075. while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); }
  4076. my $gaps = join("__",@gapspos);
  4077. my $final_microsat = $search_result;
  4078. if ($bracket_picker eq "yes"){
  4079. $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets);
  4080. }
  4081. my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat);
  4082. if ($bracket_picker eq "yes") {
  4083. $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]);
  4084. }
  4085. print OUT $outsentence,"\n";
  4086. }
  4087. }
  4088. }
  4089. }
  4090. my $unusedkeys = scalar(keys %seen);
  4091. # print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
  4092. # print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
  4093. # print INFO "microsatellites too short = $tooshort\n";
  4094. # print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
  4095. #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
  4096. #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
  4097. #print "microsatellites too short = $tooshort\n";
  4098. #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
  4099. #print "unused keys = \n",join("\n", (keys %seen)),"\n";
  4100. close (MATCH);
  4101. close (SPUT);
  4102. close (OUT);
  4103. close (INFO);
  4104. }
  4105. sub microsat_bracketer{
  4106. # print "in bracketer: @_\n";
  4107. my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_;
  4108. my @gaps = split(/__/,$gapspos);
  4109. my @lefts = split(/__/,$leftbracketpos);
  4110. my @rights = split(/__/,$rightbracketpos);
  4111. my @new=();
  4112. my $pure = $microsat;
  4113. $pure =~ s/-//g;
  4114. my $off = 0;
  4115. my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights);
  4116. push(@gaps, 0);
  4117. push(@lefts,0);
  4118. push(@rights,0);
  4119. for my $i (1 ... $finallength){
  4120. # print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n";
  4121. if($rights[0] == $i){
  4122. # print "pushed a ]\n";
  4123. push(@new, "]");
  4124. shift(@rights);
  4125. push(@rights,0);
  4126. for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
  4127. next;
  4128. }
  4129. if($gaps[0] == $i){
  4130. # print "pushed a -\n";
  4131. push(@new, "-");
  4132. shift(@gaps);
  4133. push(@gaps, 0);
  4134. for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;}
  4135. for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;}
  4136. next;
  4137. }
  4138. if($lefts[0] == $i){
  4139. # print "pushed a [\n";
  4140. push(@new, "[");
  4141. shift(@lefts);
  4142. push(@lefts,0);
  4143. for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
  4144. next;
  4145. }
  4146. else{
  4147. my $pushed = substr($pure,$off,1);
  4148. $off++;
  4149. push(@new,$pushed );
  4150. # print "pushed an alphabet, now new = @new, pushed = $pushed\n";
  4151. next;
  4152. }
  4153. }
  4154. my $returnmicrosat = join("",@new);
  4155. # print "final microsat = $returnmicrosat \n";
  4156. return($returnmicrosat);
  4157. }
  4158. #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
  4159. #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
  4160. sub multiSpecies_orthFinder4{
  4161. #print "IN multiSpecies_orthFinder4: @_\n";
  4162. my @handles = ();
  4163. #1 SEPT 30TH 2008
  4164. #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER
  4165. #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL
  4166. #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST
  4167. #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED
  4168. #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY
  4169. #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES.
  4170. #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE
  4171. #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE
  4172. #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN
  4173. #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl
  4174. #1 trial run:
  4175. #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50
  4176. $prinkter=0;
  4177. #############
  4178. my $CLUSTER_DIST = $_[4];
  4179. #############
  4180. my $aligns = $_[0];
  4181. my @micros = split(/:/, $_[1]);
  4182. my $orth = $_[2];
  4183. #my $not_orth = "notorth";
  4184. @tags = split(/:/, $_[3]);
  4185. $no_of_species=scalar(@tags);
  4186. my $junkfile = $orth."_junk";
  4187. #open(JUNK,">$junkfile");
  4188. #my $info = $output1."_info";
  4189. #print "inputs are : \n"; foreach(@micros){print $_,"\n";}
  4190. #print "info = @_\n";
  4191. open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
  4192. open (ORTH, ">$orth");
  4193. my $output=$orth."_out";
  4194. open (OUTP, ">$output");
  4195. #open (NORTH, ">$not_orth");
  4196. #open (INF, ">$info");
  4197. my $i = 0;
  4198. foreach my $path (@micros){
  4199. $handles[$i] = IO::Handle->new();
  4200. open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!";
  4201. $i++;
  4202. }
  4203. #print "Opened files\n";
  4204. $infocord = 2 + (4*$no_of_species) - 1;
  4205. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  4206. $motifcord = $typecord + 1;
  4207. $gapcord = $motifcord+1;
  4208. $startcord = $gapcord + 1;
  4209. $strandcord = $startcord + 1;
  4210. $endcord = $strandcord + 1;
  4211. $microsatcord = $endcord + 1;
  4212. $sequencepos = 2 + (4*$no_of_species) + 1 -1 ;
  4213. #$sequencepos = 17;
  4214. # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES
  4215. #----------------------------------------------------------------------------------------------------------------
  4216. my @hasharr = ();
  4217. foreach my $path (@micros){
  4218. open(READ, "<$path") or die "Cannot open file $path :$!";
  4219. my %single_hash = ();
  4220. my $key = ();
  4221. my $counter = 0;
  4222. while (my $line = <READ>){
  4223. $counter++;
  4224. # print $line;
  4225. chomp $line;
  4226. my @fields1 = split(/\t/,$line);
  4227. if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  4228. $key = join("\t",$1, $2, $4, $5);
  4229. # print "key = : $key\n" if $prinkter == 1;
  4230. # print $line if $prinkter == 1;
  4231. push (@{$single_hash{$key}},$line);
  4232. }
  4233. else{
  4234. # print "microsat line incompatible\n";
  4235. }
  4236. }
  4237. push @hasharr, {%single_hash};
  4238. # print "@{$single_hash{$key}} \n";
  4239. # print "done $path: counter = $counter\n" if $prinkter == 1;
  4240. close READ;
  4241. }
  4242. # print "Done hashes\n";
  4243. #----------------------------------------------------------------------------------------------------------------
  4244. my $question=();
  4245. #----------------------------------------------------------------------------------------------------------------
  4246. my @contigstarts = ();
  4247. my @contigends = ();
  4248. my %contigclusters = ();
  4249. my %contigclustersFirstStartOnly=();
  4250. my %contigclustersLastEndOnly=();
  4251. my %contigclustersLastEndLengthOnly=();
  4252. my %contigclustersFirstStartLengthOnly=();
  4253. my %contigpath=();
  4254. my $dotcounter = 0;
  4255. while (my $line = <BO>){
  4256. # print "x" x 60, "\n" if $prinkter == 1;
  4257. $dotcounter++;
  4258. # print "." if $dotcounter % 100 ==0;
  4259. # print "\n" if $dotcounter % 5000 ==0;
  4260. next if $line !~ /^[0-9]+/;
  4261. # print $line if $prinkter == 1;
  4262. chomp $line;
  4263. my @fields2 = split(/\t/,$line);
  4264. my $key2 = ();
  4265. if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
  4266. $key2 = join("\t",$1, $2, $4, $5);
  4267. }
  4268. else {
  4269. # print "seq line $line incompatible\n" if $prinkter == 1;
  4270. next;}
  4271. my @sequences = ();
  4272. for (0 ... $#tags){
  4273. my $seq = <BO>;
  4274. # print $seq;
  4275. chomp $seq;
  4276. push(@sequences , " ".$seq);
  4277. }
  4278. my @origsequences = @sequences;
  4279. my $seqcopy = $sequences[0];
  4280. my @strings = ();
  4281. $seqcopy =~ s/[a-zA-Z]|-/x/g;
  4282. my @string = split(/\s*/,$seqcopy);
  4283. for my $s (0 ... $#tags){
  4284. $sequences[$s] =~ s/-//g;
  4285. $sequences[$s] =~ s/[a-zA-Z]/x/g;
  4286. # print "length of sequence = ",length($sequences[$s]),"\n";
  4287. my @tempstring = split(/\s*/,$sequences[$s]);
  4288. push(@strings, [@tempstring])
  4289. }
  4290. my @species_list = ();
  4291. my @micro_count = 0;
  4292. my @starthash = ();
  4293. my $stopper = 1;
  4294. my @endhash = ();
  4295. my @currentcontigstarts=();
  4296. my @currentcontigends=();
  4297. my @currentcontigchrs=();
  4298. for my $i (0 ... $#tags){
  4299. # print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1;
  4300. my @temparr = ();
  4301. if (exists $hasharr[$i]{$key2}){
  4302. @temparr = @{$hasharr[$i]{$key2}};
  4303. $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/;
  4304. ## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
  4305. # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
  4306. my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
  4307. my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
  4308. $contigstarts[$i]{$startkey}= $key2;
  4309. $contigends[$i]{$endkey}= $key2;
  4310. # print "confirming existance: \n" if $prinkter == 1;
  4311. # print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1;
  4312. # print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1;
  4313. $currentcontigchrs[$i]=$1;
  4314. $currentcontigstarts[$i]=$2;
  4315. $currentcontigends[$i]=$3;
  4316. } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
  4317. else {
  4318. push (@starthash, {0 => "0"});
  4319. push (@endhash, {0 => "0"});
  4320. $currentcontigchrs[$i] = 0;
  4321. next;
  4322. }
  4323. $stopper = 0;
  4324. # print "exists: @temparr\n" if $prinkter == 1;
  4325. push(@micro_count, scalar(@temparr));
  4326. push(@species_list, [@temparr]);
  4327. my @tempstart = (); my @tempend = ();
  4328. my %localends = ();
  4329. my %localhash = ();
  4330. # print "---------------------------\n";
  4331. foreach my $templine (@temparr){
  4332. # print "templine = $templine\n" if $prinkter == 1;
  4333. my @tields = split(/\t/,$templine);
  4334. my $start = $tields[$startcord]; # - $tields[$gapcord];
  4335. my $end = $tields[$endcord]; #- $tields[$gapcord];
  4336. my $realstart = $tields[$startcord]- $tields[$gapcord];
  4337. my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g);
  4338. $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/;
  4339. # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n";
  4340. my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat;
  4341. # print "real start = $realstart, realend = $realend \n";
  4342. for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;}
  4343. push(@tempstart, $start);
  4344. push(@tempend, $end);
  4345. $localhash{$start."-".$end} = $templine;
  4346. }
  4347. push @starthash, {%localhash};
  4348. my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST);
  4349. # print "foundclusters = $foundclusters\n";
  4350. my @clusters = split(/_/,$foundclusters);
  4351. my $clustno = 0;
  4352. foreach my $cluster (@clusters) {
  4353. my @constituenst = split(/,/,$cluster);
  4354. # print "clusters returned: @constituenst\n" if $prinkter == 1;
  4355. }
  4356. @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters));
  4357. }
  4358. next if $stopper == 1;
  4359. # print colored ['blue'],"FINAL:\n" if $prinkter == 1;
  4360. my $finalclusters =findClusters(join("!",@string), 1);
  4361. # print "finalclusters = $finalclusters\n";
  4362. # print colored ['blue'],"----------------------\n" if $prinkter == 1;
  4363. my @clusters = split(/,/,$finalclusters);
  4364. # print "@string\n" if $prinkter == 1;
  4365. # print "@clusters\n" if $prinkter == 1;
  4366. # print "------------------------------------------------------------------\n" if $prinkter == 1;
  4367. my $clustno = 0;
  4368. # foreach my $cluster (@clusters) {
  4369. # my @constituenst = split(/,/,$cluster);
  4370. # print "clusters returned: @constituenst\n";
  4371. # }
  4372. next if (scalar @clusters == 0);
  4373. my @contigcluster=();
  4374. my $clusterno=0;
  4375. my @contigClusterstarts=();
  4376. my @contigClusterends = ();
  4377. foreach my $clust (@clusters){
  4378. # print "cluster: $clust\n";
  4379. $clusterno++;
  4380. my @localclust = split(/\./, $clust);
  4381. my @result = ();
  4382. my @starts = ();
  4383. my @ends = ();
  4384. for my $i (0 ... $#localclust){
  4385. # print "localclust[$i]: $localclust[$i]\n";
  4386. my @pattern = split(/:/, $localclust[$i]);
  4387. my @cords = split(/-/, $pattern[1]);
  4388. push (@starts, $cords[0]);
  4389. push (@ends, $cords[1]);
  4390. }
  4391. my $extremestart = smallest_number(@starts);
  4392. my $extremeend = largest_number(@ends);
  4393. push(@contigClusterstarts, $extremestart);
  4394. push(@contigClusterends, $extremeend);
  4395. # print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ;
  4396. foreach my $clustparts (@localclust){
  4397. my @pattern = split(/:/, $clustparts);
  4398. # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n";
  4399. push (@result, $starthash[$pattern[0]]{$pattern[1]});
  4400. }
  4401. push(@contigcluster, join("\t", @result));
  4402. # print join("\t", @result),"<-result \n" if $prinkter == 1 ;
  4403. }
  4404. my $firstclusterstart = smallest_number(@contigClusterstarts);
  4405. my $lastclusterend = largest_number(@contigClusterends);
  4406. $contigclustersFirstStartOnly{$key2}=$firstclusterstart;
  4407. $contigclustersLastEndOnly{$key2} = $lastclusterend;
  4408. $contigclusters{$key2}=[ @contigcluster ];
  4409. # print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1;
  4410. for my $i (0 ... $#tags){
  4411. #1 check if there exists adjacent alignment block wrt coordinates of this species.
  4412. next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
  4413. #2 no need to worry about proximity of anything in adjacent block!
  4414. #1 BELOW, the following is really to calclate the distance between the end coordinate of the
  4415. #2 cluster and the end of the gap-free sequence of each species. this is so that if an
  4416. #3 adjacent alignment block is found lateron, the exact distance between the potentially
  4417. #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
  4418. #5 immediately below.
  4419. # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1;
  4420. my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart);
  4421. my $species_endsubstring = ();
  4422. if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";}
  4423. else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);}
  4424. # print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1;
  4425. # print "for species: $tags[$i]: \n" if $prinkter == 1;
  4426. $species_startsubstring =~ s/-| //g;
  4427. $species_endsubstring =~ s/-| //g;
  4428. $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring);
  4429. $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring);
  4430. # print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1;
  4431. # print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1;
  4432. # print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1;
  4433. # print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1;
  4434. }
  4435. }
  4436. # print "\ndone the job of filling... \n";
  4437. #///////////////////////////////////////////////////////////////////////////////////////
  4438. #///////////////////////////////////////////////////////////////////////////////////////
  4439. #///////////////////////////////////////////////////////////////////////////////////////
  4440. #///////////////////////////////////////////////////////////////////////////////////////
  4441. $prinkter=0;
  4442. open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
  4443. my %clusteringpaths=();
  4444. my %clustersholder=();
  4445. my %foundkeys=();
  4446. my %clusteringpathsRev=();
  4447. my $totalcount=();
  4448. my $founkeys_enteredcount=();
  4449. my $transfered=0;
  4450. my $complete_transfered=0;
  4451. my $plain_transfered=0;
  4452. my $existing_removed=0;
  4453. while (my $line = <BO>){
  4454. # print "x" x 60, "\n" if $prinkter == 1;
  4455. next if $line !~ /^[0-9]+/;
  4456. #print $line;
  4457. chomp $line;
  4458. my @fields2 = split(/\t/,$line);
  4459. my $key2 = ();
  4460. if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) {
  4461. $key2 = join("\t",$1, $2, $4, $5);
  4462. }
  4463. else {
  4464. # print "seq line $line incompatible\n";
  4465. next;
  4466. }
  4467. # print "KEY = : $key2\n" if $prinkter == 1;
  4468. my @currentcontigstarts=();
  4469. my @currentcontigends=();
  4470. my @currentcontigchrs=();
  4471. my @clusters = ();
  4472. my @clusterscopy=();
  4473. if (exists $contigclusters{$key2}){
  4474. @clusters = @{$contigclusters{$key2}};
  4475. @clusterscopy=@clusters;
  4476. for my $i (0 ... $#tags){
  4477. # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
  4478. if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){
  4479. # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
  4480. my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
  4481. my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
  4482. $currentcontigchrs[$i]=$1;
  4483. $currentcontigstarts[$i]=$2;
  4484. $currentcontigends[$i]=$3;
  4485. }
  4486. else {
  4487. $currentcontigchrs[$i] = 0;
  4488. # print "no microsat clusters for $key2\n" if $prinkter == 1; next;
  4489. }
  4490. }
  4491. } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
  4492. my @sequences = ();
  4493. for (0 ... $#tags){
  4494. my $seq = <BO>;
  4495. # print $seq;
  4496. chomp $seq;
  4497. push(@sequences , " ".$seq);
  4498. }
  4499. next if scalar @currentcontigchrs == 0;
  4500. # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1;
  4501. my %visitedcontigs=();
  4502. for my $i (0 ... $#tags){
  4503. #1 check if there exists adjacent alignment block wrt coordinates of this species.
  4504. next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
  4505. #2 no need to worry about proximity of anything in adjacent block!
  4506. @clusters=@clusterscopy;
  4507. #1 BELOW, the following is really to calclate the distance between the end coordinate of the
  4508. #2 cluster and the end of the gap-free sequence of each species. this is so that if an
  4509. #3 adjacent alignment block is found lateron, the exact distance between the potentially
  4510. #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
  4511. #5 immediately below.
  4512. my $firstclusterstart = $contigclustersFirstStartOnly{$key2};
  4513. my $lastclusterend = $contigclustersLastEndOnly{$key2};
  4514. my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]);
  4515. # print "check if exists $key3 in contigends for $i\n" if $prinkter == 1;
  4516. if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){
  4517. $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies#
  4518. # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1;
  4519. #1 extract coordinates of the last cluster of this found alignment block
  4520. # print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1;
  4521. # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1;
  4522. # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
  4523. # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
  4524. my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}};
  4525. my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}};
  4526. my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=();
  4527. for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){
  4528. # print "val = $val\n" if $prinkter == 1;
  4529. if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){
  4530. push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/;
  4531. }
  4532. }
  4533. # print "-----\n" if $prinkter == 1;
  4534. for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){
  4535. # print "val = $val\n" if $prinkter == 1;
  4536. if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){
  4537. push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/;
  4538. }
  4539. }
  4540. # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1;
  4541. # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1;
  4542. # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){
  4543. if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){
  4544. my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}};
  4545. $regurgitate[$#regurgitate]=~s/\n//g;
  4546. $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters);
  4547. delete $contigclusters{$contigends[$i]{$key3}};
  4548. $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ];
  4549. delete $contigclusters{$key2};
  4550. $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0;
  4551. $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0;
  4552. if (scalar(@clusters) < 1){
  4553. # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2};
  4554. $clusteringpaths{$key2}=$contigends[$i]{$key3};
  4555. $clusteringpathsRev{$contigends[$i]{$key3}}=$key2;
  4556. print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n";
  4557. # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n";
  4558. $founkeys_enteredcount-- if exists $foundkeys{$key2};
  4559. $existing_removed++ if exists $foundkeys{$key2};
  4560. # print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1;
  4561. delete $foundkeys{$key2} if exists $foundkeys{$key2};
  4562. $complete_transfered++;
  4563. }
  4564. else{
  4565. print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2};
  4566. $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2};
  4567. $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2};
  4568. $founkeys_enteredcount++ if !exists $foundkeys{$key2};
  4569. $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2};
  4570. # print "adding foundkeys entry $foundkeys{$key2}\n";
  4571. $transfered++;
  4572. }
  4573. #$contigclusters{$key2}=[ @contigcluster ];
  4574. }
  4575. }
  4576. else{
  4577. # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1;
  4578. $plain_transfered++;
  4579. print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
  4580. $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2};
  4581. $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
  4582. $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
  4583. $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
  4584. # print "adding foundkeys entry $foundkeys{$key2}\n";
  4585. }
  4586. $totalcount++;
  4587. }
  4588. }
  4589. close BO;
  4590. #close (NORTH);
  4591. #///////////////////////////////////////////////////////////////////////////////////////
  4592. #///////////////////////////////////////////////////////////////////////////////////////
  4593. #///////////////////////////////////////////////////////////////////////////////////////
  4594. #///////////////////////////////////////////////////////////////////////////////////////
  4595. my $founkeys_count=();
  4596. my $nopath_count=();
  4597. my $pathed_count=0;
  4598. foreach my $key2 (keys %foundkeys){
  4599. #print "x" x 60, "\n";
  4600. # print "x" if $dotcounter % 100 ==0;
  4601. # print "\n" if $dotcounter % 5000 ==0;
  4602. $founkeys_count++;
  4603. my $key = $key2;
  4604. # print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1;
  4605. if ($clusteringpaths{$key} eq $key){
  4606. # print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1;
  4607. $nopath_count++;
  4608. delete $foundkeys{$key2};
  4609. print ORTH join ("\n",@{$contigclusters{$key2}}),"\n";
  4610. }
  4611. else{
  4612. my @pool=();
  4613. my $key3=();
  4614. $pathed_count++;
  4615. # print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1;
  4616. # print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1;
  4617. if ($clusteringpathsRev{$key} eq "0") {
  4618. next;
  4619. }
  4620. else{
  4621. my $yek3 = $clusteringpathsRev{$key};
  4622. my $yek = $key;
  4623. # print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1;
  4624. while ($yek3 ne "0"){
  4625. # print "$yek->$yek3," if $prinkter == 1;
  4626. $yek = $yek3;
  4627. $yek3 = $clusteringpathsRev{$yek};
  4628. }
  4629. # print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1;
  4630. $key3 = $clusteringpaths{$yek};
  4631. $key = $yek;
  4632. }
  4633. # print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1;
  4634. while($key ne $key3){
  4635. # print "KEEY $key->$key3\n" if $prinkter == 1;
  4636. # print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1;
  4637. if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}};
  4638. # print "now pool = @pool\n" if $prinkter == 1;
  4639. }
  4640. delete $foundkeys{$key3};
  4641. $key=$key3;
  4642. $key3=$clusteringpaths{$key};
  4643. }
  4644. # print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1;
  4645. my @firstcontig= @{$contigclusters{$key}};
  4646. delete $foundkeys{$key2} if exists $foundkeys{$key2} ;
  4647. delete $foundkeys{$key} if exists $foundkeys{$key};
  4648. unshift @pool, pop @firstcontig;
  4649. # print join("\t",@pool),"\n" if $prinkter == 1;
  4650. print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0;
  4651. print ORTH join ("\t",@pool),"\n";
  4652. # join();
  4653. }
  4654. }
  4655. #close (NORTH);
  4656. # print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1;
  4657. close (BO);
  4658. close (ORTH);
  4659. close (OUTP);
  4660. return 1;
  4661. }
  4662. sub stringPainter{
  4663. my @string = split(/_C0C_/,$_[0]);
  4664. # print $_[0], " <- in stringPainter\n";
  4665. # print $_[1], " <- in clusters\n";
  4666. my @clusters = split(/,/, $_[1]);
  4667. for my $i (0 ... $#clusters){
  4668. my $cluster = $clusters[$i];
  4669. # print "cluster = $cluster\n";
  4670. my @parts = split(/\./,$cluster);
  4671. my @cord = split(/:|-/,shift(@parts));
  4672. my $minstart = $cord[1];
  4673. my $maxend = $cord[2];
  4674. # print "minstart = $minstart , maxend = $maxend\n";
  4675. for my $j (0 ... $#parts){
  4676. # print "oing thri $parts[$j]\n";
  4677. my @cord = split(/:|-/,$parts[$j]);
  4678. $minstart = $cord[1] if $cord[1] < $minstart;
  4679. $maxend = $cord[2] if $cord[2] > $maxend;
  4680. }
  4681. # print "minstart = $minstart , maxend = $maxend\n";
  4682. for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;}
  4683. }
  4684. # print "@string <-done from function stringPainter\n";
  4685. return join("_S0S_",@string);
  4686. }
  4687. sub findClusters{
  4688. my $continue = 0;
  4689. my @mapped_clusters = ();
  4690. my $clusterdist = $_[1];
  4691. my $previous = 'x';
  4692. my @localcluster = ();
  4693. my $cluster_starts = ();
  4694. my $cluster_ends = ();
  4695. my $localcluster_start = ();
  4696. my $localcluster_end = ();
  4697. my @record_cluster = ();
  4698. my @string = split(/\!/, $_[0]);
  4699. my $zerolength=0;
  4700. for my $pos_pos (1 ... $#string){
  4701. my $pos = $string[$pos_pos];
  4702. # print $pos, "\n";
  4703. if ($continue == 0 && $pos eq "x") {next;}
  4704. if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){
  4705. if ($zerolength == 0) {$localcluster_end = $pos_pos-1};
  4706. $zerolength++;
  4707. $continue = 1;
  4708. }
  4709. if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) {
  4710. $zerolength = 0;
  4711. $continue = 0;
  4712. my %seen;
  4713. my @uniqed = grep !$seen{$_}++, @localcluster;
  4714. # print "caught cluster : @uniqed \n";
  4715. push(@mapped_clusters, [@uniqed]);
  4716. # print "clustered:\n@uniqed\n";
  4717. @localcluster = ();
  4718. @record_cluster = ();
  4719. }
  4720. if ($pos ne "x"){
  4721. $zerolength = 0;
  4722. $continue = 1;
  4723. $pos =~ s/x,//g;
  4724. my @entries = split(/,/,$pos);
  4725. $localcluster_end = 0;
  4726. $localcluster_start = 0;
  4727. push(@record_cluster,$pos);
  4728. if ($continue == 0){
  4729. @localcluster = ();
  4730. @localcluster = (@localcluster, @entries);
  4731. $localcluster_start = $pos_pos;
  4732. }
  4733. if ($continue == 1 ) {
  4734. @localcluster = (@localcluster, @entries);
  4735. }
  4736. }
  4737. }
  4738. if (scalar(@localcluster) > 0){
  4739. my %seen;
  4740. my @uniqed = grep !$seen{$_}++, @localcluster;
  4741. # print "caught cluster : @uniqed \n";
  4742. push(@mapped_clusters, [@uniqed]);
  4743. # print "clustered:\n@uniqed\n";
  4744. @localcluster = ();
  4745. @record_cluster = ();
  4746. }
  4747. my @returner = ();
  4748. foreach my $clust (@mapped_clusters){
  4749. my @localclust = @$clust;
  4750. my @result = ();
  4751. foreach my $clustparts (@localclust){
  4752. push(@result,$clustparts);
  4753. }
  4754. push(@returner , join(".",@result));
  4755. }
  4756. # print "returnig: ", join(",",@returner), "\n";
  4757. return join(",",@returner);
  4758. }
  4759. #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
  4760. #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx
  4761. sub MakeTrees{
  4762. my $tree = $_[0];
  4763. my @parts=($tree);
  4764. # my @parts=();
  4765. while (1){
  4766. $tree =~ s/^\(//g;
  4767. $tree =~ s/\)$//g;
  4768. my @arr = ();
  4769. if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
  4770. @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
  4771. $tree = $2;
  4772. push @parts, $tree;
  4773. }
  4774. elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
  4775. @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
  4776. $tree = $1;
  4777. push @parts, $tree;
  4778. }
  4779. elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
  4780. last;
  4781. }
  4782. }
  4783. return @parts;
  4784. }
  4785. #xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx
  4786. sub qualityFilter{
  4787. my $unmaskedorthfile = $_[0];
  4788. my $seqfile = $_[1];
  4789. my $maskedorthfile = $_[2];
  4790. my $filteredout = $maskedorthfile."_residue";
  4791. open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!";
  4792. my %keyhash = ();
  4793. while (my $line = <PMORTH>){
  4794. my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/;
  4795. push @{$keyhash{$key}}, $line;
  4796. }
  4797. open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!";
  4798. open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!";
  4799. open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!";
  4800. while (my $line = <SEQ>){
  4801. chomp $line;
  4802. if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){
  4803. my $key = join("\t", $1,$2,$3,$4);
  4804. next if !exists $keyhash{$key};
  4805. my @orths = @{$keyhash{$key}} if exists $keyhash{$key};
  4806. delete $keyhash{$key};
  4807. my $sine = <SEQ>;
  4808. foreach my $orth (@orths){
  4809. #print "-----------------------------------------------------------------\n";
  4810. #print $orth;
  4811. my $orthcopy = $orth;
  4812. $orth =~ s/^>//;
  4813. my @parts = split(/>/,$orth);
  4814. my @starts = ();
  4815. my @ends = ();
  4816. foreach my $part (@parts){
  4817. my $no_of_species = adjustCoordinates($part);
  4818. my @pields = split(/\t/,$part);
  4819. # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n";
  4820. push @starts, $pields[$startcord];
  4821. push @ends, $pields[$endcord];
  4822. }
  4823. #print "starts = @starts ... ends = @ends\n";
  4824. my $leftend = smallest_number(@starts)-10;
  4825. my $rightend = largest_number(@ends)+10;
  4826. my $maskarea = substr($sine, $leftend, $rightend-$leftend+1);
  4827. print RES $orth if $maskarea =~ /#/;
  4828. next if $maskarea =~ /#/;
  4829. print MORTH $orthcopy;
  4830. }
  4831. }
  4832. else{
  4833. next;
  4834. }
  4835. }
  4836. # print "UNDONE: ", scalar(keys %keyhash),"\n";
  4837. # print MORTH "UNDONE: ", scalar(keys %keyhash),"\n";
  4838. }
  4839. sub adjustCoordinates{
  4840. my $line = $_[0];
  4841. my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig;
  4842. my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g);
  4843. # print "line = $line\n";
  4844. $infocord = 2 + (4*$no_of_species) - 1;
  4845. $typecord = 2 + (4*$no_of_species) + 1 - 1;
  4846. $motifcord = 2 + (4*$no_of_species) + 2 - 1;
  4847. $gapcord = $motifcord+1;
  4848. $startcord = $gapcord+1;
  4849. $strandcord = $startcord+1;
  4850. $endcord = $strandcord + 1;
  4851. $microsatcord = $endcord + 1;
  4852. $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  4853. $interr_poscord = $microsatcord + 3;
  4854. $no_of_interruptionscord = $microsatcord + 4;
  4855. $interrcord = $microsatcord + 2;
  4856. # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i;
  4857. return $no_of_species;
  4858. }