PageRenderTime 71ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl

https://bitbucket.org/chapmanb/galaxy-central/
Perl | 5606 lines | 5065 code | 245 blank | 296 comment | 163 complexity | e64ba560e907a6b8d7818123ba23613a MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4. use Term::ANSIColor;
  5. use File::Basename;
  6. use IO::Handle;
  7. use Cwd;
  8. use File::Path;
  9. use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
  10. use File::Path qw(make_path remove_tree);
  11. use File::Temp qw/ tempfile tempdir /;
  12. my $tdir = tempdir( CLEANUP => 1 );
  13. chdir $tdir;
  14. my $dir = getcwd;
  15. #print "dir = $dir\n";
  16. #$ENV{'PATH'} .= ':' . dirname($0);
  17. my $date = `date`;
  18. my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV;
  19. if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }
  20. $tree_definition_all =~ s/\s+//g;
  21. $threshold_array =~ s/\s+//g;
  22. $allspeciesin =~ s/\s+//g;
  23. #-------------------------------------------------------------------------------
  24. # WHICH SPUTNIK USED?
  25. my $sputnikpath = ();
  26. $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
  27. #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
  28. #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
  29. #exit if !-e $sputnikpath;
  30. #$sputnikpath = "bx-sputnik" ;
  31. #print "ARGV input = @ARGV\n";
  32. #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
  33. #-------------------------------------------------------------------------------
  34. # RUNFILE
  35. #-------------------------------------------------------------------------------
  36. $distance = 1; #bp
  37. $distance++;
  38. my @tree_definitions=MakeTrees($tree_definition_all);
  39. my $allspeciesset = $tree_definition_all;
  40. $allspeciesset =~ s/[\(\) ]+//g;
  41. @allspecies = split(/,/,$allspeciesset);
  42. my @outputfiles = ();
  43. my $round = 0;
  44. #my $tdir = tempdir( CLEANUP => 0 );
  45. #chdir $tdir;
  46. foreach my $tree_definition (@tree_definitions){
  47. my @commas = ($tree_definition =~ /,/g) ;
  48. #print "commas = @commas\n"; <STDIN>;
  49. next if scalar(@commas) <= 1;
  50. #print "species_set = $species_set\n";
  51. $treeSpeciesNum = scalar(@commas) + 1;
  52. $species_set = $tree_definition;
  53. $species_set =~ s/[\)\( ;]+//g;
  54. #print "species_set = $species_set\n"; <STDIN>;
  55. $round++;
  56. #-------------------------------------------------------------------------------
  57. # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
  58. $threshold_array=~ s/,/_/g;
  59. my @thresharr = split("_",$threshold_array);
  60. @thresholds=@thresharr;
  61. #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
  62. #print "current dit=$dir\n";
  63. #-------------------------------------------------------------------------------
  64. # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
  65. my @chrfiles=();
  66. #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
  67. my $chromt=int(rand(10000));
  68. my $p_chr=$chromt;
  69. my @exactspeciesset_unarranged = split(/,/,$species_set);
  70. $tree_definition=~s/[\)\(, ]/\t/g;
  71. my @treespecies=split(/\t+/,$tree_definition);
  72. my @exactspecies=();
  73. foreach my $spec (@treespecies){
  74. foreach my $espec (@exactspeciesset_unarranged){
  75. push @exactspecies, $spec if $spec eq $espec;
  76. }
  77. }
  78. #print "exactspecies=@exactspecies\n";
  79. $focalspec = $exactspecies[0];
  80. my $arranged_species_set=join(".",@exactspecies);
  81. my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
  82. my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
  83. #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
  84. maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
  85. maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
  86. #print "done maf to axt conversion\n";
  87. my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
  88. artificial_axdata_inverter ($chr_name, $reverse_chr_name);
  89. #print "reverse_chr_name=$reverse_chr_name\n";
  90. #-------------------------------------------------------------------------------
  91. # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
  92. foreach my $direct ("reverse_direction","forward_direction"){
  93. $p_chr=$chromt;
  94. #print "direction = $direct\n";
  95. $p_chr = $p_chr."r" if $direct eq "reverse_direction";
  96. $p_chr = $p_chr if $direct eq "forward_direction";
  97. my $config = $species_set;
  98. $config=~s/,/./g;
  99. my @orgs = split(/\./,$arranged_species_set);
  100. #print "ORGS= @orgs\n";
  101. my @tag=@orgs;
  102. my $tags = join(",", @tag);
  103. my @tags=@tag;
  104. chomp $p_chr;
  105. $tags = join("_", split(/,/, $tags));
  106. my $pchr = "chr".$p_chr;
  107. my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
  108. my @sp_tags = ();
  109. # print "$ptag _ orthfile\n"; <STDIN>;
  110. #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
  111. foreach my $sp (@tag){
  112. push(@sp_tags, ($sp.".".$ptag));
  113. }
  114. my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
  115. my @presp_tags = ();
  116. foreach my $sp (@tag){
  117. push(@presp_tags, ($sp.".".$preptag));
  118. }
  119. my $resultdir = "";
  120. my $orthdir = "";
  121. my $filtereddir = "";
  122. my $pipedir = "";
  123. my @title_queries = ();
  124. push(@title_queries, "^[0-9]+");
  125. my $sep="\\s";
  126. for my $or (0 ... $#orgs){
  127. my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
  128. #$title =~ s/chr\\+\\s+\+/chr/g;
  129. push(@title_queries, $title);
  130. }
  131. my $title_query = join($sep, @title_queries);
  132. #print "title_queries=@title_queries\n";
  133. #print "query = >$title_query<\n";
  134. #print "orgs = @orgs\n";
  135. #-------------------------------------------------------------------------------
  136. # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
  137. my $t1input = $pchr.".".$arranged_species_set.".net.axt";
  138. my @t1outputs = ();
  139. foreach my $sp (@presp_tags){
  140. push(@t1outputs, $sp."_gap_op");
  141. }
  142. multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query);
  143. #print "t1outputs=@t1outputs\n";
  144. #print "done t1\n"; <STDIN>;
  145. #-------------------------------------------------------------------------------
  146. #START T2.PL
  147. my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = ();
  148. for my $t (0 ... scalar(@tags)-1){
  149. multi_species_t2($t1outputs[$t], $tag[$t]);
  150. }
  151. #-------------------------------------------------------------------------------
  152. #START T2.2.PL
  153. my @temp_tags = @tag;
  154. foreach my $sp (@presp_tags){
  155. my $t2input = $sp."_nogap_op_unrand";
  156. multi_species_t2_2($t2input, shift(@temp_tags));
  157. }
  158. undef (@temp_tags);
  159. #-------------------------------------------------------------------------------
  160. #START SPUTNIK
  161. my @jobIDs = ();
  162. @temp_tags = @tag;
  163. my @sput_filelist = ();
  164. foreach my $sp (@presp_tags){
  165. #print "sp = $sp\n";
  166. my $sputnikoutput = $pipedir.$sp."_sput_op0";
  167. my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
  168. push(@sput_filelist, $sputnikinput);
  169. my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
  170. # print "$sputnikcommand\n";
  171. my @sputnikcommand_system = $sputnikcommand;
  172. system(@sputnikcommand_system);
  173. }
  174. #-------------------------------------------------------------------------------
  175. #START SPUTNIK OUTPUT CORRECTOR
  176. foreach my $sp (@presp_tags){
  177. my $corroutput = $pipedir.$sp."_sput_op1";
  178. my $corrinput = $pipedir.$sp."_sput_op0";
  179. sputnikoutput_corrector($corrinput,$corroutput);
  180. my $t4output = $pipedir.$sp."_sput_op2";
  181. multi_species_t4($corroutput,$t4output);
  182. my $t5output = $pipedir.$sp."_sput_op3";
  183. multi_species_t5($t4output,$t5output);
  184. #print "done t5.pl for $sp\n";
  185. my $t6output = $pipedir.$sp."_sput_op4";
  186. multi_species_t6($t5output,$t6output,scalar(@orgs));
  187. }
  188. #-------------------------------------------------------------------------------
  189. #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
  190. foreach my $sp (@presp_tags){
  191. my $t9output = $pipedir.$sp."_gap_op_unrand_match";
  192. my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
  193. my $t9micro = $pipedir.$sp."_sput_op4";
  194. t9($t9micro,$t9sequence,$t9output);
  195. my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
  196. my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
  197. t9($t9micro,$t9sequence2,$t9output2);
  198. }
  199. #print "done both t9.pl for all orgs\n";
  200. #-------------------------------------------------------------------------------
  201. # FIND COMPOUND MICROSATELLITES
  202. @jobIDs = ();
  203. my $species_counter = 0;
  204. foreach my $sp (@presp_tags){
  205. my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
  206. my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
  207. my $input_micro = $pipedir.$sp."_sput_op4";
  208. my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
  209. multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
  210. $species_counter++;
  211. }
  212. #-------------------------------------------------------------------------------
  213. # READING AND FILTERING SIMPLE MICROSATELLITES
  214. my $spcounter2=0;
  215. foreach my $sp (@sp_tags){
  216. my $presp = $presp_tags[$spcounter2];
  217. $spcounter2++;
  218. my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
  219. my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
  220. my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
  221. multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
  222. }
  223. #-------------------------------------------------------------------------------
  224. # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
  225. $species_counter = 0;
  226. foreach my $sp (@sp_tags){
  227. my $presp = $presp_tags[$species_counter];
  228. my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
  229. my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
  230. my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
  231. my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
  232. multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
  233. $species_counter++;
  234. }
  235. #-------------------------------------------------------------------------------
  236. # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM..
  237. $species_counter = 0;
  238. foreach my $sp (@sp_tags){
  239. my $presp = $presp_tags[$species_counter];
  240. my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
  241. my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
  242. my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
  243. my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
  244. multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
  245. $species_counter++;
  246. }
  247. #-------------------------------------------------------------------------------
  248. # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
  249. $species_counter = 0;
  250. foreach my $sp (@sp_tags){
  251. my $presp = $presp_tags[$species_counter];
  252. my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
  253. my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
  254. my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
  255. multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
  256. my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
  257. my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
  258. my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
  259. multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
  260. my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
  261. my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
  262. my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
  263. multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
  264. $species_counter++;
  265. }
  266. #print "done filtering both simple and compound microsatellites \n";
  267. #-------------------------------------------------------------------------------
  268. my @combinedarray = ();
  269. my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
  270. my @combinedarray_tags = ("mono", "di", "tri", "tetra");
  271. $species_counter = 0;
  272. foreach my $sp (@sp_tags){
  273. my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
  274. push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
  275. $species_counter++;
  276. }
  277. #-------------------------------------------------------------------------------
  278. # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION
  279. my $sp_counter = 0;
  280. foreach my $sp (@sp_tags){
  281. my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
  282. my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
  283. my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
  284. my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
  285. `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
  286. for my $i (0 ... 3){
  287. `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;
  288. }
  289. $sp_counter++;
  290. }
  291. #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
  292. #-------------------------------------------------------------------------------
  293. # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
  294. # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
  295. @temp_tags = @sp_tags;
  296. my $increment = 1000000;
  297. my @splist = ();
  298. my $targetdir = $pipedir;
  299. $species_counter=0;
  300. foreach my $sp (@sp_tags){
  301. my $presp = $presp_tags[$species_counter];
  302. $species_counter++;
  303. my $localtag = shift @temp_tags;
  304. my $locallist = $targetdir.$localtag."_".$p_chr."_list";
  305. push(@splist, $locallist);
  306. my $input = $pipedir.$presp."_nogap_op_unrand2_match";
  307. chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
  308. }
  309. my @unionarray = ();
  310. #print "splist=@splist\n";
  311. #-------------------------------------------------------------------------------
  312. # FIND INTERRUPTED MICROSATELLITES
  313. $species_counter = 0;
  314. for my $i (0 .. $#combinedarray){
  315. @jobIDs = ();
  316. open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
  317. while (my $sp1 = <JLIST1>){
  318. #print "$splist[$i]: sp1=$sp1\n";
  319. chomp $sp1;
  320. for my $j (0 ... $#combinedarray_tags){
  321. my $interr = $sp1."_interr_".$combinedarray_tags[$j];
  322. my $simple = $sp1."_simple_".$combinedarray_tags[$j];
  323. push @{$unionarray[$i]}, $interr, $simple;
  324. multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8");
  325. }
  326. }
  327. $species_counter++;
  328. }
  329. close JLIST1;
  330. #-------------------------------------------------------------------------------
  331. # REUNION AND ZIPPING BEFORE T10.PL
  332. my @allarray = ();
  333. for my $i (0 ... $#sp_tags){
  334. my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
  335. unlink $localfile if -e $localfile;
  336. push(@allarray, $localfile);
  337. my $unfiltered_localfile= $localfile."_unfiltered";
  338. my $residue_localfile= $localfile."_residue";
  339. unlink $unfiltered_localfile;
  340. #unlink $unfiltered_localfile;
  341. for my $j (0 ... $#{$unionarray[$i]}){
  342. #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n";
  343. `cat $unionarray[$i][$j] >> $unfiltered_localfile`;
  344. unlink $unionarray[$i][$j];
  345. }
  346. multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
  347. my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
  348. my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
  349. my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
  350. `cat $analyzed_compound >> $localfile`;
  351. }
  352. #-------------------------------------------------------------------------------
  353. # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
  354. my $toescape=0;
  355. for my $i (0 ... $#sp_tags){
  356. my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
  357. $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
  358. my $direction = $1;
  359. #print "localfile = $localfile , direction = $direction\n";
  360. # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
  361. $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
  362. last if $direction =~ /chr[0-9a-zA-Z]+r/;
  363. my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
  364. my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
  365. my $reverselocal = $localfile;
  366. $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
  367. merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags));
  368. #-------------------------------------------------------------------------------
  369. my $forward_separate = $localfile."_separate";
  370. my $reverse_separate = $reverselocal."_separate";
  371. my $diff = $forward_separate."_diff";
  372. my $miss = $forward_separate."_miss";
  373. my $common = $forward_separate."_common";
  374. forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags));
  375. #-------------------------------------------------------------------------------
  376. my $symmetrical_file = $localfile."_symmetrical";
  377. my $merged_file = $localfile."_merged";
  378. #print "cating: $merged_file $common into -> $symmetrical_file \n";
  379. `cat $merged_file $common > $symmetrical_file`;
  380. #-------------------------------------------------------------------------------
  381. my $t10output = $symmetrical_file."_fin_hit_all_2";
  382. new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
  383. #-------------------------------------------------------------------------------
  384. }
  385. next if $toescape == 1;
  386. #------------------------------------------------------------------------------------------------
  387. # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
  388. my @micros_array = ();
  389. my $sampletag = ();
  390. for my $i (0 ... $#sp_tags){
  391. my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
  392. push(@micros_array, $finhitFile);
  393. $sampletag = $sp_tags[$i];
  394. }
  395. #$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
  396. #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
  397. my $orthfiletemp = $ptag."_orthfile";
  398. my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);
  399. my $maskedorthfiletemp = $ptag."_orthfile_masked";
  400. qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);
  401. push @outputfiles , $maskedorthfiletemp;
  402. }
  403. $date = `date`;
  404. }
  405. `cat @outputfiles > $orthfile`;
  406. my $rootdir = $dir;
  407. $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
  408. chdir $rootdir;
  409. remove_tree($dir);
  410. #print "date = $date\n";
  411. #remove_tree($tdir);
  412. #------------------------------------------------------------------------------------------------
  413. #------------------------------------------------------------------------------------------------
  414. #------------------------------------------------------------------------------------------------
  415. #------------------------------------------------------------------------------------------------
  416. #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
  417. sub maftoAxt_multispecies {
  418. #print "in maftoAxt_multispecies : got @_\n";
  419. my $fname=$_[0];
  420. open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
  421. my $treedefinition = $_[1];
  422. open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
  423. my $counter = 0;
  424. my $exactspeciesset = $_[3];
  425. my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
  426. $treedefinition=~s/[\)\(, ]/\t/g;
  427. my @species=split(/\t+/,$treedefinition);
  428. my @exactspecies=();
  429. foreach my $spec (@species){
  430. foreach my $espec (@exactspeciesset_unarranged){
  431. push @exactspecies, $spec if $spec eq $espec;
  432. }
  433. }
  434. #print "exactspecies=@exactspecies\n";
  435. ###########
  436. my $select = 2;
  437. #select = 1 if all species need sequences to be present for each block otherwise, it is 0
  438. #select = 2 only the allowed set make up the alignment. use the removeset
  439. # information to detect alignmenets that have other important genomes aligned.
  440. ###########
  441. my @allowedset = ();
  442. @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
  443. @allowedset = join("_",0,@species) if $select == 1;
  444. #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
  445. @allowedset = join("_",0,@exactspecies) if $select == 2;
  446. #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
  447. my $start = 0;
  448. my @sequences = ();
  449. my @titles = ();
  450. my $species_counter = "0";
  451. my $countermatch = 0;
  452. my $outsideSpecies=0;
  453. while(my $line = <IN>){
  454. # print $line;
  455. next if $line =~ /^#/;
  456. next if $line =~ /^i/;
  457. chomp $line;
  458. my @fields = split(/\s+/,$line);
  459. chomp $line;
  460. if ($line =~ /^a /){
  461. $start = 1;
  462. }
  463. if ($line =~ /^s /){
  464. foreach my $sp (@allspecies){
  465. # print "checking species $sp\n";
  466. if ($fields[1] =~ /$sp/){
  467. $species_counter = $species_counter."_".$sp;
  468. push(@sequences, $fields[6]);
  469. my @sp_info = split(/\./,$fields[1]);
  470. my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
  471. push(@titles, $title);
  472. # print "species_counter = $species_counter\n";
  473. }
  474. }
  475. }
  476. if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
  477. # print "species_counter = $species_counter\n";
  478. my $arranged = reorderSpecies($species_counter, @allspecies);
  479. my $stopper = 1;
  480. my $arrno = 0;
  481. # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
  482. if (scalar(@sequences) == scalar(@exactspecies)){
  483. foreach my $set (@allowedset){
  484. # print "testing $arranged against $set\n";
  485. if ($arranged eq $set){
  486. $stopper = 0; last;
  487. }
  488. $arrno++;
  489. }
  490. }
  491. else{
  492. $stopper = 1;
  493. }
  494. if ($stopper == 0) {
  495. @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
  496. @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
  497. my $filteredseq = filter_gaps(@sequences);
  498. if ($filteredseq ne "SHORT"){
  499. #print "printing"; <STDIN>;
  500. $counter++;
  501. print OUT join (" ",$counter, @titles), "\n";
  502. print OUT $filteredseq, "\n";
  503. print OUT "\n";
  504. $countermatch++;
  505. }
  506. }
  507. else{ #print "nexting\n";<STDIN>;
  508. }
  509. @sequences = (); @titles = (); $start = 0;$species_counter = "0";
  510. next;
  511. }
  512. }
  513. # print "countermatch = $countermatch\n";
  514. }
  515. sub reorderSpecies{
  516. my @inarr=@_;
  517. my $currSpecies = shift (@inarr);
  518. my $ordered_species = 0;
  519. my @species=@inarr;
  520. #print "species = @species\n";
  521. foreach my $order (@species){
  522. $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
  523. }
  524. return $ordered_species;
  525. }
  526. sub filter_gaps{
  527. my @sequences = @_;
  528. # print "sequences sent are @sequences\n";
  529. my $seq_length = length($sequences[0]);
  530. my $seq_no = scalar(@sequences);
  531. my $allgaps = ();
  532. for (1 ... $seq_no){
  533. $allgaps = $allgaps."-";
  534. }
  535. my @seq_array = ();
  536. my $seq_counter = 0;
  537. foreach my $seq (@sequences){
  538. # my @sequence = split(/\s*/,$seq);
  539. $seq_array[$seq_counter] = [split(/\s*/,$seq)];
  540. # push @seq_array, [@sequence];
  541. $seq_counter++;
  542. }
  543. my $g = 0;
  544. while ( $g < $seq_length){
  545. last if (!exists $seq_array[0][$g]);
  546. my $bases = ();
  547. for my $u (0 ... $#seq_array){
  548. $bases = $bases.$seq_array[$u][$g];
  549. }
  550. # print $bases, "\n";
  551. if ($bases eq $allgaps){
  552. # print "bases are $bases, position is $g \n";
  553. for my $seq (@seq_array){
  554. splice(@$seq , $g, 1);
  555. }
  556. }
  557. else {
  558. $g++;
  559. }
  560. }
  561. my @outs = ();
  562. foreach my $seq (@seq_array){
  563. push(@outs, join("",@$seq));
  564. }
  565. return "SHORT" if length($outs[0]) <=100;
  566. return (join("\n", @outs));
  567. }
  568. sub allowedSetOfSpecies{
  569. my @allowed_species = split(/_/,$_[0]);
  570. unshift @allowed_species, 0;
  571. # print "allowed set = @allowed_species \n";
  572. my @output = ();
  573. for (0 ... scalar(@allowed_species) - 4){
  574. push(@output, join("_",@allowed_species));
  575. pop @allowed_species;
  576. }
  577. return join(";",reverse(@output));
  578. }
  579. sub orderInfo{
  580. my @info = split(/;/,$_[0]);
  581. # print "info = @info";
  582. my @old = split(/_/,$_[1]);
  583. my @new = split(/_/,$_[2]);
  584. shift @old; shift @new;
  585. my @outinfo = ();
  586. foreach my $spe (@new){
  587. for my $no (0 ... $#old){
  588. if ($spe eq $old[$no]){
  589. push(@outinfo, $info[$no]);
  590. }
  591. }
  592. }
  593. # print "outinfo = @outinfo \n";
  594. return join(";", @outinfo);
  595. }
  596. #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
  597. #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
  598. sub artificial_axdata_inverter{
  599. open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
  600. open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
  601. my $linecounter=0;
  602. while (my $line = <IN>){
  603. $linecounter++;
  604. #print "$linecounter\n";
  605. chomp $line;
  606. my $final_line = $line;
  607. my $trycounter = 0;
  608. if ($line =~ /^[a-zA-Z\-]/){
  609. # while ($final_line eq $line){
  610. my @fields = split(/\s*/,$line);
  611. $final_line = join("",reverse(@fields));
  612. # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
  613. # $trycounter++;
  614. # print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
  615. # }
  616. }
  617. # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
  618. if ($line =~ /^[0-9]/){
  619. $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
  620. $final_line = $line;
  621. }
  622. print OUT $final_line,"\n";
  623. #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
  624. }
  625. close OUT;
  626. }
  627. #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
  628. #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
  629. sub multi_species_t1 {
  630. my $input1 = $_[0];
  631. #print "@_\n"; <STDIN>;
  632. my @tags = split(/_/, $_[1]);
  633. my @outputs = split(/,/, $_[2]);
  634. my $title_query = $_[3];
  635. my @handles = ();
  636. open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
  637. my $i = 0;
  638. foreach my $path (@outputs){
  639. $handles[$i] = IO::Handle->new();
  640. open ($handles[$i], ">$path") or die "Can't open $path : $!";
  641. $i++;
  642. }
  643. my $curdef;
  644. my $start = 0;
  645. while (my $line = <FILEB> ) {
  646. if ($line =~ /^\d/){
  647. $line =~ s/ +/\t/g;
  648. my @fields = split(/\s+/, $line);
  649. if (($line =~ /$title_query/)){
  650. my $title = $line;
  651. my $counter = 0;
  652. foreach my $tag (@tags){
  653. $line = <FILEB>;
  654. print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;
  655. $counter++;
  656. }
  657. }
  658. else{
  659. foreach my $tag (@tags){
  660. my $tine = <FILEB>;
  661. }
  662. }
  663. }
  664. }
  665. foreach my $hand (@handles){
  666. $hand->close();
  667. }
  668. close FILEB;
  669. }
  670. #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
  671. #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
  672. sub multi_species_t2{
  673. my $input = $_[0];
  674. my $species = $_[1];
  675. my $output1 = $input."_unr";
  676. #------------------------------------------------------------------------------------------
  677. open (FILEF1, "<$input") or die "Cannot open file $input :$!";
  678. open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
  679. my $line1 = <FILEF1>;
  680. while($line1){
  681. {
  682. # chomp($line);
  683. if ($line1 =~ (m/^\>$species/)){
  684. chomp($line1);
  685. print FILEF2 $line1;
  686. $line1 = <FILEF1>;
  687. chomp($line1);
  688. print FILEF2 "\t", $line1,"\n";
  689. }
  690. }
  691. $line1 = <FILEF1>;
  692. }
  693. close FILEF1;
  694. close FILEF2;
  695. #------------------------------------------------------------------------------------------
  696. my $output2 = $output1."and";
  697. my $output3 = $output1."and2";
  698. open(IN,"<$output1");
  699. open (FILEF3, ">$output2");
  700. open (FILEF4, ">$output3");
  701. while (<IN>){
  702. my $line = $_;
  703. chomp($line);
  704. my @fields=split (/\t/, $line);
  705. # print $line,"\n"; <STDIN>;
  706. if($line !~ /random/){
  707. print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
  708. print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";
  709. }
  710. }
  711. close IN;
  712. close FILEF3;
  713. close FILEF4;
  714. unlink $output1;
  715. #------------------------------------------------------------------------------------------
  716. # OLD T3.PL RUDIMENT
  717. my $t3output = $output2;
  718. $t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
  719. open(IN,"<$output2");
  720. open(OUTA,">$t3output");
  721. while (<IN>){
  722. s/-//g unless /^>/;
  723. print OUTA;
  724. }
  725. close IN;
  726. close OUTA;
  727. #------------------------------------------------------------------------------------------
  728. }
  729. #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
  730. #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx
  731. sub multi_species_t2_2{
  732. #print "IN multi_species_t2_2 : @_\n";
  733. my $input = $_[0];
  734. my $species = $_[1];
  735. my $output1 = $input."2";
  736. open (FILEF1, "<$input");
  737. open (FILEF2, ">$output1");
  738. my $line1 = <FILEF1>;
  739. while($line1){
  740. {
  741. # chomp($line);
  742. if ($line1 =~ (m/^\>$species/)){
  743. chomp($line1);
  744. print FILEF2 $line1;
  745. $line1 = <FILEF1>;
  746. chomp($line1);
  747. print FILEF2 "\t", $line1,"\n";
  748. }
  749. }
  750. $line1 = <FILEF1>;
  751. }
  752. close FILEF1;
  753. close FILEF2;
  754. }
  755. #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx
  756. #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
  757. sub sputnikoutput_corrector{
  758. my $input = $_[0];
  759. my $output = $_[1];
  760. open(IN,"<$input") or die "Cannot open file $input :$!";
  761. open(OUT,">$output") or die "Cannot open file $output :$!";
  762. my $tine;
  763. while (my $line=<IN>){
  764. if($line =~/length /){
  765. $tine = $line;
  766. $tine =~ s/\s+/\t/g;
  767. my @fields = split(/\t/,$tine);
  768. if ($fields[6] > 60){
  769. print OUT $line;
  770. $line = <IN>;
  771. while (($line !~ /nucleotide/) && ($line !~ /^>/)){
  772. chomp $line;
  773. print OUT $line;
  774. $line = <IN>;
  775. }
  776. print OUT "\n";
  777. print OUT $line;
  778. }
  779. else{
  780. print OUT $line;
  781. }
  782. }
  783. else{
  784. print OUT $line;
  785. }
  786. }
  787. close IN;
  788. close OUT;
  789. }
  790. #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
  791. #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
  792. sub multi_species_t4{
  793. # print "multi_species_t4 : @_\n";
  794. my $input = $_[0];
  795. my $output = $_[1];
  796. open (FILEA, "<$input");
  797. open (FILEB, ">$output");
  798. my $line = <FILEA>;
  799. while ($line) {
  800. # chomp $line;
  801. if ($line =~ />/) {
  802. chomp $line;
  803. print FILEB $line, "\n";
  804. }
  805. if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
  806. chomp $line;
  807. print FILEB $line, " " ;
  808. $line = <FILEA>;
  809. chomp $line;
  810. print FILEB $line,"\n";
  811. }
  812. $line = <FILEA>;
  813. }
  814. close FILEA;
  815. close FILEB;
  816. }
  817. #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
  818. #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
  819. sub multi_species_t5{
  820. my $input = $_[0];
  821. my $output = $_[1];
  822. open(FILEB,"<$input");
  823. open(FILEC,">$output");
  824. my $curdef;
  825. while (my $line = <FILEB> ) {
  826. if ($line =~ /^>/){
  827. chomp $line;
  828. $curdef = $line;
  829. next;
  830. }
  831. if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
  832. print FILEC $curdef," ",$line;
  833. }
  834. }
  835. close FILEB;
  836. close FILEC;
  837. }
  838. #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
  839. #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
  840. sub multi_species_t6{
  841. my $input = $_[0];
  842. my $output = $_[1];
  843. my $focalstrand=$_[3];
  844. # print "inpput = @_\n";
  845. open (FILE, "<$input");
  846. open (FILE_MICRO, ">$output");
  847. my $linecounter=0;
  848. while (my $line = <FILE>){
  849. $linecounter++;
  850. chomp $line;
  851. #print "line = $line\n";
  852. #MONO#
  853. $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
  854. my $strand=$1;
  855. my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
  856. #print "line = $line\n";
  857. my $specfieldsend = 2 + ($no_of_species*4) - 1;
  858. my @fields = split(/\s+/, $line);
  859. my @speciesdata = @fields[0 ... $specfieldsend];
  860. $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
  861. my ($tide, $start, $end) = ($1, $2, $3);
  862. #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
  863. if($line =~ /mononucleotide/){
  864. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n";
  865. }
  866. #DI#
  867. elsif($line =~ /dinucleotide/){
  868. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n";
  869. }
  870. #TRI#
  871. elsif($line =~ /trinucleotide/ ){
  872. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n";
  873. }
  874. #TETRA#
  875. elsif($line =~ /tetranucleotide/){
  876. print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n";
  877. }
  878. #PENTA#
  879. elsif($line =~ /pentanucleotide/){
  880. #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n";
  881. }
  882. else{
  883. # print "not: @fields\n";
  884. }
  885. }
  886. # print "linecounter=$linecounter\n";
  887. close FILE;
  888. close FILE_MICRO;
  889. }
  890. sub mono {
  891. my $st = $_[0];
  892. my $tp = unpack "A1"x(length($st)/1),$st;
  893. my $var1 = substr($tp, 0, 1);
  894. return join ("\t", $var1);
  895. }
  896. sub di {
  897. my $st = $_[0];
  898. my $tp = unpack "A2"x(length($st)/2),$st;
  899. my $var1 = substr($tp, 0, 2);
  900. return join ("\t", $var1);
  901. }
  902. sub tri {
  903. my $st = $_[0];
  904. my $tp = unpack "A3"x(length($st)/3),$st;
  905. my $var1 = substr($tp, 0, 3);
  906. return join ("\t", $var1);
  907. }
  908. sub tetra {
  909. my $st = $_[0];
  910. my $tp = unpack "A4"x(length($st)/4),$st;
  911. my $var1 = substr($tp, 0, 4);
  912. return join ("\t", $var1);
  913. }
  914. sub penta {
  915. my $st = $_[0];
  916. my $tp = unpack "A5"x(length($st)/5),$st;
  917. my $var1 = substr($tp, 0, 5);
  918. return join ("\t", $var1);
  919. }
  920. #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
  921. #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
  922. sub t9{
  923. my $input1 = $_[0];
  924. my $input2 = $_[1];
  925. my $output = $_[2];
  926. open(IN1,"<$input1") if -e $input1;
  927. open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
  928. open(OUT,">$output") or die "cannot open file $_[2] : $!";
  929. my %seen = ();
  930. my $prevkey = 0;
  931. if (-e $input1){
  932. while (my $line = <IN1>){
  933. chomp($line);
  934. my @fields = split(/\t/,$line);
  935. my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
  936. # print "key in t9 = $key1\n";
  937. $seen{$key1}++ if ($prevkey ne $key1) ;
  938. $prevkey = $key1;
  939. }
  940. # print "done first hash\n";
  941. close IN1;
  942. }
  943. while (my $line = <IN2>){
  944. # print $line, "**\n";
  945. if (-e $input1){
  946. chomp($line);
  947. my @fields = split(/\t/,$line);
  948. my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
  949. if (exists $seen{$key2}){
  950. print OUT "$line\n" ;
  951. delete $seen{$key2};
  952. }
  953. }
  954. else {
  955. print OUT "$line\n" ;
  956. # print "$line\n" ;
  957. }
  958. }
  959. close IN2;
  960. close OUT;
  961. }
  962. #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
  963. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
  964. sub multiSpecies_compound_microsat_hunter3{
  965. my $input1 = $_[0]; ###### the *_sput_op4_ii file
  966. my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
  967. my $output1 = $_[2]; ###### plain microsatellite file
  968. my $output2 = $_[3]; ###### compound microsatellite file
  969. my $org = $_[4]; ###### 1 or 2
  970. $no_of_species = $_[5];
  971. #print "IN multiSpecies_compound_microsat_hunter3: @_\n";
  972. #my @tags = split(/\t/,$info);
  973. sub compoundify;
  974. open(IN,"<$input1") or die "Cannot open file $input1 $!";
  975. open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
  976. open(OUT,">$output1") or die "Cannot open file $output1 $!";
  977. open(OUT2,">$output2") or die "Cannot open file $output2 $!";
  978. $infocord = 2 + (4*$no_of_species) - 1;
  979. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  980. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  981. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  982. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  983. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  984. my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
  985. my @thresholds = ("0");
  986. push(@thresholds, split(/_/,$_[6]));
  987. sub thresholdCheck;
  988. my %micros = ();
  989. while (my $line = <IN>){
  990. # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
  991. next if $line =~ /\t\t/;
  992. if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  993. my $key = join("\t",$1, $2, $3, $4, $5);
  994. # print $key, "#-#-#-#-#-#-#-#\n";
  995. push (@{$micros{$key}},$line);
  996. }
  997. else{
  998. }
  999. }
  1000. close IN;
  1001. my @deletedlines = ();
  1002. my $linecount = 0;
  1003. while(my $sine = <SEQ>){
  1004. my %microstart=();
  1005. my %microend=();
  1006. my @sields = split(/\t/,$sine);
  1007. my $key = ();
  1008. if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
  1009. $key = join("\t",$1, $2, $3, $4, $5);
  1010. # print $key, "<-<-<-<-<-<-<-<\n";
  1011. }
  1012. else{
  1013. }
  1014. if (exists $micros{$key}){
  1015. $linecount++;
  1016. my @microstring = @{$micros{$key}};
  1017. my @tempmicrostring = @{$micros{$key}};
  1018. foreach my $line (@tempmicrostring){
  1019. my @fields = split(/\t/,$line);
  1020. my $start = $fields[$startcord];
  1021. my $end = $fields[$endcord];
  1022. push (@{$microstart{$start}},$line);
  1023. push (@{$microend{$end}},$line);
  1024. }
  1025. my $firstflag = 'down';
  1026. while( my $line =shift(@microstring)){
  1027. # print "-----------\nline = $line ";
  1028. chomp $line;
  1029. my @fields = split(/\t/,$line);
  1030. my $start = $fields[$startcord];
  1031. my $end = $fields[$endcord];
  1032. my $startmicro = $line;
  1033. my $endmicro = $line;
  1034. # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
  1035. delete ($microstart{$start});
  1036. delete ($microend{$end});
  1037. my $flag = 'down';
  1038. my $startflag = 'down';
  1039. my $endflag = 'down';
  1040. my $prestart = $start - $distance;
  1041. my $postend = $end + $distance;
  1042. my @compoundlines = ();
  1043. my %compoundhash = ();
  1044. push (@compoundlines, $line);
  1045. push (@{$compoundhash{$line}},$line);
  1046. my $startrank = 1;
  1047. my $endrank = 1;
  1048. while( ($startflag eq "down") || ($endflag eq "down") ){
  1049. if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
  1050. # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
  1051. last;
  1052. }
  1053. $firstflag = "up";
  1054. if ($startflag eq "down"){
  1055. for my $i ($prestart ... $start){
  1056. if(exists $microend{$i}){
  1057. chomp $microend{$i}[0];
  1058. if(exists $compoundhash{$microend{$i}[0]}) {next;}
  1059. # print "sending from microend $startmicro, $microend{$i}[0] |||\n";
  1060. if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
  1061. push(@compoundlines, $microend{$i}[0]);
  1062. # print "accepted\n";
  1063. my @tields = split(/\t/,$microend{$i}[0]);
  1064. $startmicro = $microend{$i}[0];
  1065. chomp $startmicro;
  1066. $start = $tields[$startcord];
  1067. $flag = 'down';
  1068. $startrank++;
  1069. # print "startcompund = $microend{$i}[0]\n";
  1070. delete $microend{$i};
  1071. delete $microstart{$start};
  1072. $startflag = 'down';
  1073. $prestart = $start - $distance;
  1074. last;
  1075. }
  1076. else{
  1077. $flag = 'up';
  1078. $startflag = 'up';
  1079. }
  1080. }
  1081. else{
  1082. $flag = 'up';
  1083. $startflag = 'up';
  1084. }
  1085. }
  1086. }
  1087. $endrank = $startrank;
  1088. if ($endflag eq "down"){
  1089. for my $i ($end ... $postend){
  1090. if(exists $microstart{$i} ){
  1091. chomp $microstart{$i}[0];
  1092. if(exists $compoundhash{$microstart{$i}[0]}) {next;}
  1093. # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
  1094. if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
  1095. push(@compoundlines, $microstart{$i}[0]);
  1096. # print "accepted\n";
  1097. my @tields = split(/\t/,$microstart{$i}[0]);
  1098. $end = $tields[$endcord]-0;
  1099. $endmicro = $microstart{$i}[0];
  1100. $endrank++;
  1101. chomp $endmicro;
  1102. $flag = 'down';
  1103. # print "endcompund = $microstart{$i}[0]\n";
  1104. delete $microstart{$i};
  1105. delete $microend{$end};
  1106. shift @microstring;
  1107. $postend = $end + $distance;
  1108. $endflag = 'down';
  1109. last;
  1110. }
  1111. else{
  1112. $flag = 'up';
  1113. $endflag = 'up';
  1114. }
  1115. }
  1116. else{
  1117. $flag = 'up';
  1118. $endflag = 'up';
  1119. }
  1120. }
  1121. }
  1122. # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
  1123. } #end while( $flag eq "down")
  1124. # print "compoundlines = @compoundlines \n";
  1125. if (scalar (@compoundlines) == 1){
  1126. print OUT $line,"\n";
  1127. }
  1128. if (scalar (@compoundlines) > 1){
  1129. my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
  1130. # print $compoundline,"\n";
  1131. print OUT2 $compoundline,"\n";
  1132. }
  1133. } #end foreach my $line (@microstring){
  1134. } #if (exists $micros{$key}){
  1135. }
  1136. close OUT;
  1137. close OUT2;
  1138. }
  1139. #------------------------------------------------------------------------------------------------
  1140. sub compoundify{
  1141. my ($compoundlines, $sequence) = @_;
  1142. # print "\nfound to compound : @$compoundlines and$sequence \n";
  1143. my $noOfComps = @$compoundlines;
  1144. # print "Number of elements in hash is $noOfComps\n";
  1145. my @starts;
  1146. my @ends;
  1147. foreach my $line (@$compoundlines){
  1148. # print "compoundify.. line = $line \n";
  1149. chomp $line;
  1150. my @fields = split(/\t/,$line);
  1151. my $start = $fields[$startcord];
  1152. my $end = $fields[$endcord];
  1153. # print "start = $start, end = $end \n";
  1154. push(@starts, $start);
  1155. push(@ends,$end);
  1156. }
  1157. my @temp = @$compoundlines;
  1158. my $startline=$temp[0];
  1159. my @mields = split(/\t/,$startline);
  1160. my $startcoord = $mields[$startcord];
  1161. my $startgapsign=$mields[$endcord];
  1162. my @startsorted = sort { $a <=> $b } @starts;
  1163. my @endsorted = sort { $a <=> $b } @ends;
  1164. my @intervals;
  1165. for my $end (0 ... (scalar(@endsorted)-2)){
  1166. my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
  1167. push(@intervals,$interval);
  1168. # print "interval = $interval =\n";
  1169. # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
  1170. }
  1171. push(@intervals,"");
  1172. my $compoundmicrosat=();
  1173. my $multiunit="";
  1174. foreach my $line (@$compoundlines){
  1175. my @fields = split(/\t/,$line);
  1176. my $component="[".$fields[$microsatcord]."]".shift(@intervals);
  1177. $compoundmicrosat=$compoundmicrosat.$component;
  1178. $multiunit=$multiunit."[".$fields[$motifcord]."]";
  1179. # print "multiunit = $multiunit\n";
  1180. }
  1181. my $compoundcopy = $compoundmicrosat;
  1182. $compoundcopy =~ s/\[|\]//g;
  1183. my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
  1184. my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
  1185. return $compoundline;
  1186. }
  1187. #------------------------------------------------------------------------------------------------
  1188. sub identityMatch_thresholdCheck{
  1189. my $line1 = $_[0];
  1190. my $line2 = $_[1];
  1191. my $rank = $_[2];
  1192. my @lields1 = split(/\t/,$line1);
  1193. my @lields2 = split(/\t/,$line2);
  1194. # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
  1195. if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
  1196. my $probe = $lields1[$motifcord].$lields1[$motifcord];
  1197. #print "$probe :: $lields2[$motifcord]\n";
  1198. return "proceed" if $probe =~ /$lields2[$motifcord]/;
  1199. #print "line recieved\n";
  1200. if ($rank ==1){
  1201. return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
  1202. }
  1203. else {
  1204. return "proceed" if thresholdCheck($line2) eq "proceed";
  1205. return "stop";
  1206. }
  1207. }
  1208. else{
  1209. if ($rank ==1){
  1210. return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
  1211. }
  1212. else {
  1213. return "proceed" if thresholdCheck($line2) eq "proceed";
  1214. return "stop";
  1215. }
  1216. }
  1217. return "stop";
  1218. }
  1219. #------------------------------------------------------------------------------------------------
  1220. sub thresholdCheck{
  1221. my @checkthresholds=(0,@thresholds);
  1222. #print "IN thresholdCheck: @_\n";
  1223. my $line = $_[0];
  1224. my @lields = split(/\t/,$line);
  1225. return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
  1226. return "stop";
  1227. }
  1228. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
  1229. #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
  1230. sub multiSpecies_filtering_interrupted_microsats{
  1231. # print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
  1232. my $unfiltered = $_[0];
  1233. my $filtered = $_[1];
  1234. my $residue = $_[2];
  1235. my $no_of_species = $_[5];
  1236. open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
  1237. open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
  1238. open(RES,">$residue") or die "Cannot open file $residue: $!";
  1239. $infocord = 2 + (4*$no_of_species) - 1;
  1240. $startcord = 2 + (4*$no_of_species) + 2 - 1;
  1241. $strandcord = 2 + (4*$no_of_species) + 3 - 1;
  1242. $endcord = 2 + (4*$no_of_species) + 4 - 1;
  1243. $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
  1244. $motifcord = 2 + (4*$no_of_species) + 6 - 1;
  1245. my @sub_thresholds = (0);
  1246. push(@sub_thresholds, split(/_/,$_[3]));
  1247. my @thresholds = (0);
  1248. push(@thresholds, split(/_/,$_[4]));
  1249. while (my $line = <UNF>) {
  1250. next if $line !~ /[a-z]/;
  1251. #print $line;
  1252. chomp $line;
  1253. my @fields = split(/\t/,$line);
  1254. my $motif = $fields[$motifcord];
  1255. my $realmotif = $motif;
  1256. #print "motif = $motif\n";
  1257. if ($motif =~ /^\[/){
  1258. $motif =~ s/^\[//g;
  1259. my @motifs = split(/\]/,$motif);
  1260. $realmotif = $motifs[0];
  1261. }
  1262. # print "realmotif = $realmotif";
  1263. my $motif_size = length($realmotif);
  1264. my $microsat = $fields[$microsatcord];
  1265. # print "microsat = $microsat\n";
  1266. $microsat =~ s/^\[|\]$//sg;
  1267. my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
  1268. $microsat = join("",@microsats);
  1269. if (length($microsat) < $thresholds[$motif_size]) {
  1270. # print length($microsat)," < ",$thresholds[$motif_size],"\n";
  1271. print RES $line,"\n"; next;
  1272. }
  1273. my @lengths = ();
  1274. foreach my $mic (@microsats){
  1275. push(@lengths, length($mic));
  1276. }
  1277. if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
  1278. # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n";
  1279. print RES $line,"\n"; next;}
  1280. else {print FIL $line,"\n"; next;
  1281. }
  1282. }
  1283. close FIL;
  1284. close RES;
  1285. }
  1286. sub largest_microsat{
  1287. my $counter = 0;
  1288. my($max) = shift(@_);
  1289. foreach my $temp (@_) {
  1290. #print "finding largest array: $maxcounter \n";
  1291. if($temp > $max){
  1292. $max = $temp;
  1293. }
  1294. }
  1295. return($max);
  1296. }
  1297. #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
  1298. #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx

Large files files are truncated, but you can click here to view the full file