PageRenderTime 69ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/Bio/fm/filters/blast_xml.pm

https://github.com/sauloal/perlscripts
Perl | 944 lines | 702 code | 99 blank | 143 comment | 92 complexity | 23dc9a6449580a83065bc8687dd1b5a7 MD5 | raw file
  1. #!/usr/bin/perl -w
  2. use strict;
  3. use Bio::SearchIO;
  4. $| = 1;
  5. my $inputFile = $ARGV[0];
  6. my $verbose = 1;
  7. my $Log;
  8. if ($verbose) { unlink("blast_xml.log"); };
  9. my $hypothetical = 0;
  10. my %priority =
  11. (
  12. "BLASTN" => 1
  13. );
  14. my %minident =
  15. (
  16. "BLASTN" => 80
  17. );
  18. my %minconsv =
  19. (
  20. "BLASTN" => 80
  21. );
  22. my %minsig =
  23. (
  24. "BLASTN" => 1e-25
  25. );
  26. my %minsize =
  27. (
  28. "BLASTN" => 20
  29. );
  30. # $inputFile = "answer_WM276GBFF10_06_R265_c_neoformans_db_tblastx.bls";
  31. # $inputFile = "answer_WM276GBFF10_06_R265_c_neoformans_db_blastn.bls";
  32. # $inputFile = "answer_WM276GBFF10_06_R265_c_neoformans_db_blastn_MIX.bls";
  33. # $inputFile = "answer_WM276GBFF10_06_R265_c_neoformans_db_blastn.bls";
  34. $inputFile = "../blast/1_query_1.fa_1_db_db_blastn.blast";
  35. #&fixBlast($inputFile);
  36. my @answers;
  37. my ($Gtotal, $GtotalG, %Gpositives, %Gblasts, %Gnegatives, %GnoIdent, %GnoSig, %Gshort, %GnoHit, %Gstat);
  38. @answers = &start($inputFile);
  39. $Gtotal = $answers[0];
  40. $GtotalG = $answers[1];
  41. %Gpositives = %{$answers[2]};
  42. %Gblasts = %{$answers[3]};
  43. %Gnegatives = %{$answers[4]};
  44. %GnoIdent = %{$answers[5]};
  45. %GnoSig = %{$answers[6]};
  46. %Gshort = %{$answers[7]};
  47. %GnoHit = %{$answers[8]};
  48. %Gstat = %{$answers[9]};
  49. &printStat($Gtotal, $GtotalG, \%Gpositives, \%Gblasts, \%Gnegatives, \%GnoIdent, \%GnoSig, \%Gshort, \%GnoHit, \%Gstat);
  50. die;
  51. my %GbyChrom = %{&genDB(\%Gpositives, \%Gblasts)};
  52. &save_chromossome(\%GbyChrom, $inputFile);
  53. &commitLog();
  54. sub commitLog
  55. {
  56. open FILE, ">blast_xml.log" or die "COULD NOT CREATE BLAST_XML.LOG";
  57. print FILE $Log;
  58. close FILE;
  59. }
  60. sub Log
  61. {
  62. if ($verbose)
  63. {
  64. print $_[0];
  65. }
  66. $Log .= $_[0];
  67. }
  68. sub fixBlast
  69. {
  70. my $input = $_[0];
  71. open INFILE, "<$input" or die "COULD NOT OPEN INPUT FILE $input FOR FIXING\n";
  72. open OUTFILE, ">$input.tmp" or die "COULD NOT OPEN INPUT FILE $input.tmp FOR FIXING\n";
  73. my $on = 0;
  74. my $query = "";
  75. while (<INFILE>)
  76. {
  77. chomp;
  78. if ((/^Length=/) && ($on))
  79. {
  80. $query =~ s/\_+/\_/g;
  81. s/\_+/\_/g;
  82. print OUTFILE "Query=$query\n";
  83. print OUTFILE "$_\n";
  84. $on = 0;
  85. $query = "";
  86. }
  87. elsif (( ! $on) && ( ! (/^Query=\s*(\S+)/)))
  88. {
  89. s/\_+/\_/g;
  90. print OUTFILE "$_\n";
  91. }
  92. if ($on)
  93. {
  94. $query .= $_;
  95. }
  96. elsif (/^Query=\s*(\S+)/)
  97. {
  98. $query = $1;
  99. $on = 1;
  100. }
  101. }
  102. close INFILE;
  103. close OUTFILE;
  104. rename $input, "$input.old";
  105. rename "$input.tmp", $input;
  106. }
  107. sub save_chromossome
  108. {
  109. my %db = %{$_[0]};
  110. my $inputFile = $_[1];
  111. my $log;
  112. mkdir "./xml";
  113. &Log("EXPORTING XML FILE: $inputFile.xml\n");
  114. my $count = 0;
  115. my %files;
  116. open FILE, ">$inputFile.xml" or die "COULD NOT OPEN inputFile.xml XML FILE: $!";
  117. print FILE "<root id=\"$inputFile\" type=\"blast\">\n";
  118. foreach my $chrom (sort keys %db)
  119. {
  120. &Log("CHROMOSSOME $chrom\n\tSTART\n");
  121. print FILE "\t<table id=\"$chrom\" type=\"chromossome\">\n";
  122. my $startT = 0;
  123. foreach my $start (sort { $a <=> $b } keys %{$db{$chrom}})
  124. {
  125. &Log("\t$start");
  126. $startT++;
  127. $count++;
  128. my %data = %{$db{$chrom}{$start}};
  129. my $out = "\t\t<row id=\"$count\">\n";
  130. $out .= "\t\t\t<id>$count</id>\n";
  131. foreach my $qual (sort keys %data)
  132. {
  133. if ($qual ne "chromR")
  134. {
  135. my $value = $data{$qual};
  136. $out .= "\t\t\t<$qual>$value</$qual>\n";
  137. }
  138. }
  139. $out .= "\t\t</row>\n";
  140. print FILE $out;
  141. } #end foreach my start
  142. &Log("\tTOTAL $startT\n\n");
  143. print FILE "\t</table>\n";
  144. } #end foreach my chrom
  145. print FILE "</root>";
  146. close FILE;
  147. &Log("EXPORTED $count REGISTERS TO XML FILE\n");
  148. $count = 0;
  149. foreach my $key (sort keys %db)
  150. {
  151. my $genes = (keys %{$db{$key}});
  152. &Log("CHROMOSSOME $key HAS $genes GENES MAPPED\n");
  153. }
  154. } #end sub parse snps
  155. # <root id="NAME" type="[snp|gap|blast]">
  156. # <table id="NAME" type="[program|chromossome]">
  157. # <row>
  158. # <colum> </colum>
  159. # <row>
  160. # </table>
  161. # </root>
  162. sub genDB
  163. {
  164. my @hashes = @_;
  165. my %byChrom;
  166. my $countG = 0;
  167. my $countU = 0;
  168. my %genCoun;
  169. my %faulty;
  170. print "" . @hashes . " HASHES\n";
  171. foreach my $hashRef (@hashes)
  172. {
  173. my %hash = %{$hashRef};
  174. &Log("\t THERE ARE " . (keys %hash) . " METHODS IN THIS HASH\n");
  175. foreach my $method (sort keys %priority)
  176. {
  177. if ( ! defined %{$hash{$method}} ) { last;};
  178. &Log("\t\t THE METHOD $method HAS " . (keys %{$hash{$method}}) . " GENES\n");
  179. foreach my $gene (sort keys %{$hash{$method}})
  180. {
  181. my %data = %{$hash{$method}{$gene}};
  182. my $chromR = $data{"chromR"};
  183. my $start = $data{"start"};
  184. my $end = $data{"end"};
  185. my $sign = $data{"sign"};
  186. my $ident = $data{"ident"};
  187. my $consv = $data{"consv"};
  188. my $gaps = $data{"gaps"};
  189. my $strand = $data{"strand"};
  190. my $prop = abs(1-($data{"Qrylen"} / $data{"Hlength"}));
  191. my $size;
  192. my $point;
  193. if ($strand == 1) { $point = $start; $size = $end - $start+1; }
  194. elsif ($strand == -1) { $point = $end; $size = $start - $end+1; }
  195. else { die "NO START POSITION FOUND"};
  196. $data{"method"} = $method;
  197. $data{"gene"} = $gene;
  198. $data{"size"} = $size;
  199. if ( ! defined $byChrom{$chromR}{$point} )
  200. {
  201. $countU++;
  202. if ( ! $genCoun{$gene} ) # to fix, order of execution shouldnt affect result
  203. {
  204. $byChrom{$chromR}{$point} = \%data;
  205. }
  206. $genCoun{$gene} += 1;
  207. }
  208. else
  209. {
  210. $faulty{$gene}++;
  211. my %dataOld = %{$byChrom{$chromR}{$point}};
  212. my $methodOld = $dataOld{"method"};
  213. my $endOld = $dataOld{"end"};
  214. my $geneOld = $dataOld{"gene"};
  215. my $signOld = $dataOld{"sign"};
  216. my $identOld = $dataOld{"ident"};
  217. my $consvOld = $dataOld{"consv"};
  218. my $gapsOld = $dataOld{"gaps"};
  219. my $propOld = abs(1-($dataOld{"Qrylen"} / $dataOld{"Hlength"}));
  220. my $update = 0;
  221. if ($priority{$method} < $priority{$methodOld})
  222. {
  223. &Log(print "GENE $gene ON $chromR WAS UPDATED\n");
  224. ${$byChrom{$chromR}{$point}}[0] = \%data;
  225. }
  226. elsif ($priority{$methodOld} == $priority{$method})
  227. {
  228. my $old = 0; my $new = 0; my $draw = 0;
  229. if ($signOld < $sign) { $old +=2; } elsif ($signOld > $sign) { $new +=2; } else { $draw++; };
  230. if ($propOld < $prop) { $old++; } elsif ($propOld > $prop) { $new++; } else { $draw++; };
  231. if ($gapsOld < $gaps) { $old++; } elsif ($gapsOld > $gaps) { $new++; } else { $draw++; };
  232. if ($identOld > $ident) { $old++; } elsif ($identOld < $ident) { $new++; } else { $draw++; };
  233. if ($consvOld > $consv) { $old++; } elsif ($consvOld < $consv) { $new++; } else { $draw++; };
  234. if ((20*$signOld) < $sign) { $old++; } elsif ((20*$sign) > $signOld) { $new++; };
  235. if (( 2*$propOld) < $prop) { $old++; } elsif (( 2*$prop) > $propOld) { $new++; };
  236. if ( $identOld < (1.1*$ident)) { $old++; } elsif ( $ident > (1.1*$identOld)) { $new++; };
  237. if ( $consvOld < (1.2*$consv)) { $old++; } elsif ( $consv > (1.2*$consvOld)) { $new++; };
  238. if ($old == $new) { if (int(rand(1))) { $old++; } else { $new++; }; };
  239. if ($geneOld eq $gene)
  240. {
  241. &Log("DUPLICATION... FIND OUT WHERE IS IT YOUR LAZY FAT ASS\n");
  242. &Log("\t\t$chromR BY ($methodOld vs $method) STARTING AT $point, ENDING AT $endOld AND $end ($old vs $new vs $draw)\n");
  243. &Log("\t\tSIG\t$signOld\tvs\t$sign\n");
  244. &Log("\t\tIDENT\t$identOld\tvs\t$ident\n");
  245. &Log("\t\tPROP\t$propOld\tvs\t$prop\n");
  246. &Log("\t\tGAPS\t$gapsOld\tvs\t$gaps\n");
  247. &Log("\t\tCONSV\t$consvOld\tvs\t$consv\n");
  248. &Log("\t\t$geneOld\n\t\t$gene\n\n\n");
  249. }
  250. elsif ($new > $old)
  251. {
  252. # print "KEEPING NEW DATA $old vs $new\n";
  253. # print "\t\t$chromR BY ($methodOld vs $method) STARTING AT $point, ENDING AT $endOld AND $end ($old vs $new vs $draw)\n";
  254. # print "\t\tSIG\t$signOld\tvs\t$sign\n";
  255. # print "\t\tIDENT\t$identOld\tvs\t$ident\n";
  256. # print "\t\tPROP\t$propOld\tvs\t$prop\n";
  257. # print "\t\tGAPS\t$gapsOld\tvs\t$gaps\n";
  258. # print "\t\tCONSV\t$consvOld\tvs\t$consv\n";
  259. # print "\t\t$geneOld\n\t\t$gene\n\n\n";
  260. $byChrom{$chromR}{$point} = \%data;
  261. }
  262. elsif ($old > $new)
  263. {
  264. # print "KEEPING OLD DATA $old vs $new\n";
  265. # print "\t\t$chromR BY ($methodOld vs $method) STARTING AT $point, ENDING AT $endOld AND $end ($old vs $new vs $draw)\n";
  266. # print "\t\tSIG\t$signOld\tvs\t$sign\n";
  267. # print "\t\tIDENT\t$identOld\tvs\t$ident\n";
  268. # print "\t\tPROP\t$propOld\tvs\t$prop\n";
  269. # print "\t\tGAPS\t$gapsOld\tvs\t$gaps\n";
  270. # print "\t\tCONSV\t$consvOld\tvs\t$consv\n";
  271. # print "\t\t$geneOld\n\t\t$gene\n\n\n";
  272. }
  273. else
  274. {
  275. &Log("SOMETHING CRAZY HAS HAPPENED: TWO GENES MAPPED TO THE SAME PLACE BY THE SAME METHOD HAVING EVERYTHING EQUAL\n");
  276. &Log("\t\t$chromR BY ($methodOld vs $method) STARTING AT $point, ENDING AT $endOld AND $end ($old vs $new vs $draw)\n");
  277. &Log("\t\tSIG\t$signOld\tvs\t$sign\n");
  278. &Log("\t\tIDENT\t$identOld\tvs\t$ident\n");
  279. &Log("\t\tPROP\t$propOld\tvs\t$prop\n");
  280. &Log("\t\tGAPS\t$gapsOld\tvs\t$gaps\n");
  281. &Log("\t\tCONSV\t$consvOld\tvs\t$consv\n");
  282. &Log("\t\t$geneOld\n\t\t$gene\n\n\n");
  283. $byChrom{$chromR}{$point} = \%data;
  284. } # end if ($new > $old)
  285. } # end if ($priority{$method} < $priority{$methodOld})
  286. } # end if ( defined @{$byChrom{$chromR}{$point}} )
  287. $countG++;
  288. } # end foreach my $gene (sort keys %{$hash{$method}})
  289. } # end foreach my $method (sort keys %hash)
  290. } # end foreach my $hashRef (@hashes)
  291. &Log("$countG GENES OF INPUT BEING $countU UNIQUES\n");
  292. return (\%byChrom);
  293. }
  294. sub getChar
  295. {
  296. my $method = $_[0];
  297. my $query_name = $_[1];
  298. my $valid = $_[2];
  299. my %data = ();
  300. if (exists $Gpositives{$method}{$query_name})
  301. {
  302. %data = %{$Gpositives{$method}{$query_name}};
  303. }
  304. elsif (exists $Gblasts{$method}{$query_name})
  305. {
  306. %data = %{$Gblasts{$method}{$query_name}};
  307. }
  308. elsif (exists $Gnegatives{$method}{$query_name})
  309. {
  310. %data = %{$Gnegatives{$method}{$query_name}};
  311. }
  312. else
  313. {
  314. # %data = %{$Gnegatives{$method}{$query_name}};
  315. # print "*"x20 . "\nTHERE'S NO DATA FOR $query_name IN METHOD $method\n" . "*"x20 . "\n";
  316. }
  317. return \%data;
  318. }
  319. sub start
  320. {
  321. my $inputFile = $_[0];
  322. if ( -f $inputFile ) { &Log("LOADING FILE $inputFile\n"); } else { die "COULD NOT OPEN $inputFile: $!"};
  323. my $in = new Bio::SearchIO(-format => 'blast',
  324. -file => $inputFile);
  325. $| =0;
  326. my %positives = ();
  327. my %blasts = ();
  328. my %negatives = ();
  329. my %noIdent = ();
  330. my %noSig = ();
  331. my %short = ();
  332. my %noHit = ();
  333. my %stat = ();
  334. my $Positives = 0;
  335. my $Negatives = 0;
  336. my $Blasts = 0;
  337. my $NoHits = 0;
  338. my $total = 0;
  339. my $totalG = 0;
  340. # my %totalG;
  341. # my %POSG;
  342. # my %NEGG;
  343. # my %NOHITG;
  344. while( my $result = $in->next_result )
  345. {
  346. # print "METHOD " . $result->algorithm . "\n";
  347. my $method = $result->algorithm;
  348. my $minSize = $minsize{$method};
  349. my $minSig = $minsig{$method};
  350. my $minIdent = $minident{$method};
  351. my $minConsv = $minconsv{$method};
  352. # print ".";
  353. # print "Query Name = " . $result->query_name . "\n";
  354. # print "Query Length = " . $result->query_length . "\n";
  355. my $num_hits = $result->num_hits;
  356. # print "Query Hits = " . $result->num_hits . "\n";
  357. my $hitC = 0;
  358. $totalG++;
  359. # $totalG{$result->query_name}++;
  360. if ( $result->num_hits )
  361. {
  362. while( my $hit = $result->next_hit )
  363. {
  364. $hitC++;
  365. my $hspC = 0;
  366. # print "\tHit #$hitC/$num_hits = " . $hit->name . "\n",
  367. # "\tHit Significance = " . $hit->significance . "\n",
  368. # "\tHit hsps = " . $hit->num_hsps . "\n";
  369. while( my $hsp = $hit->next_hsp )
  370. {
  371. $hspC++;
  372. if( $hsp->length('total') > $minSize )
  373. {
  374. if (( $hsp->percent_identity >= ($minIdent/100) ) || ($hsp->frac_conserved >= ($minConsv/100) ))
  375. {
  376. if ( $hit->significance <= $minSig )
  377. {
  378. # if (($hsp->rank == 1) && ($hspC == 1) && ($hitC == 1))
  379. # {
  380. $stat{$method}{$hsp->evalue}++; $total++;
  381. # if (($result->num_hits == 1) && ($hit->num_hsps == 1))
  382. # {
  383. # print ":";
  384. $positives{$method}{$result->query_name}{"ident"} = &roundFrac($hsp->frac_identical *100);
  385. $positives{$method}{$result->query_name}{"consv"} = &roundFrac($hsp->frac_conserved *100);
  386. $positives{$method}{$result->query_name}{"sign"} = $hit->significance;
  387. $positives{$method}{$result->query_name}{"Hlength"} = $hsp->length('hit');
  388. $positives{$method}{$result->query_name}{"Qlength"} = $hsp->length('query');
  389. $positives{$method}{$result->query_name}{"Tlength"} = $hsp->length('total');
  390. $positives{$method}{$result->query_name}{"Qrylen"} = $result->query_length;
  391. $positives{$method}{$result->query_name}{"start"} = $hsp->start('hit');
  392. $positives{$method}{$result->query_name}{"end"} = $hsp->end('hit');
  393. $positives{$method}{$result->query_name}{"strand"} = $hsp->strand('hit');
  394. my $name = $hit->name; $name =~ s/lcl\|//;
  395. $positives{$method}{$result->query_name}{"chromR"} = $name;
  396. $positives{$method}{$result->query_name}{"gaps"} = $hsp->gaps;
  397. # $Positives++;
  398. # $POSG{$result->query_name}++;
  399. # }
  400. # else
  401. # {
  402. # print ".";
  403. # $blasts{$method}{$result->query_name}{"ident"} = &roundFrac($hsp->frac_identical *100);
  404. # $blasts{$method}{$result->query_name}{"consv"} = &roundFrac($hsp->frac_conserved *100);
  405. # $blasts{$method}{$result->query_name}{"sign"} = $hit->significance;
  406. # $blasts{$method}{$result->query_name}{"Hlength"} = $hsp->length('hit');
  407. # $blasts{$method}{$result->query_name}{"Qlength"} = $hsp->length('query');
  408. # $blasts{$method}{$result->query_name}{"Tlength"} = $hsp->length('total');
  409. # $blasts{$method}{$result->query_name}{"Qrylen"} = $result->query_length;
  410. # $blasts{$method}{$result->query_name}{"start"} = $hsp->start('hit');
  411. # $blasts{$method}{$result->query_name}{"end"} = $hsp->end('hit');
  412. # $blasts{$method}{$result->query_name}{"strand"} = $hsp->strand('hit');
  413. # my $name = $hit->name; $name =~ s/lcl\|//;
  414. # $blasts{$method}{$result->query_name}{"chromR"} = $name;
  415. # $blasts{$method}{$result->query_name}{"gaps"} = $hsp->gaps;
  416. # $Blasts++;
  417. # $POSG{$result->query_name}++;
  418. # } #end if else num hits = 1
  419. # }; # end if hank = 1
  420. # print "\t\tHsp Length = " . $hsp->length('total') . "\n",
  421. # "\t\tHsp Percent_id = " . $hsp->percent_identity . "\n",
  422. # "\t\tHsp e-value = " . $hsp->evalue . "\n",
  423. # "\t\tHsp Frac_Identic = " . $hsp->frac_identical . "\n",
  424. # "\t\tHsp Frac_conserv = " . $hsp->frac_conserved . "\n",
  425. # "\t\tHsp Rank = " . $hsp->rank . "\n",
  426. # "\t\tHsp Gaps = " . $hsp->gaps . "\n",
  427. # "\t\tHsp Range @ = " . join ("\t", $hsp->range('hit')) . "\n",
  428. #
  429. # "\t\tHsp Query Strand = " . $hsp->strand('query') . "\n",
  430. # "\t\tHsp Query Start = " . $hsp->start('query') . "\n",
  431. # "\t\tHsp Query End = " . $hsp->end('query') . "\n",
  432. # "\t\tHsp Query Length = " . $hsp->length('query') . "\n",
  433. # # "\t\tHsp Query String = " . $hsp->query_string . "\n",
  434. #
  435. # "\t\tHsp Hit Strand = " . $hsp->strand('hit') . "\n",
  436. # "\t\tHsp Hit Start = " . $hsp->start('hit') . "\n",
  437. # "\t\tHsp Hit End = " . $hsp->end('hit') . "\n",
  438. # "\t\tHsp Hit Length = " . $hsp->length('hit') . "\n",
  439. # # "\t\tHsp Hit String = " . $hsp->hit_string . "\n",
  440. # # "\t\tHsp Homology Str = " . $hsp->homology_string . "\n",
  441. ;
  442. } # end if sig > 1e-20
  443. else
  444. {
  445. if (($hsp->rank == 1) && ($hspC == 1) && ($hitC == 1))
  446. {
  447. # print ",";
  448. $noSig{$method}{$result->query_name}++;
  449. $negatives{$method}{$result->query_name}{"ident"} = &roundFrac($hsp->frac_identical *100);
  450. $negatives{$method}{$result->query_name}{"consv"} = &roundFrac($hsp->frac_conserved *100);
  451. $negatives{$method}{$result->query_name}{"sign"} = $hit->significance;
  452. $negatives{$method}{$result->query_name}{"Hlength"} = $hsp->length('hit');
  453. $negatives{$method}{$result->query_name}{"Qlength"} = $hsp->length('query');
  454. $negatives{$method}{$result->query_name}{"Tlength"} = $hsp->length('total');
  455. $negatives{$method}{$result->query_name}{"Qrylen"} = $result->query_length;
  456. $negatives{$method}{$result->query_name}{"start"} = $hsp->start('hit');
  457. $negatives{$method}{$result->query_name}{"end"} = $hsp->end('hit');
  458. $negatives{$method}{$result->query_name}{"strand"} = $hsp->strand('hit');
  459. $negatives{$method}{$result->query_name}{"hits"} = $result->num_hits;
  460. my $name = $hit->name; $name =~ s/lcl\|//;
  461. $negatives{$method}{$result->query_name}{"chromR"} = $name;
  462. $negatives{$method}{$result->query_name}{"gaps"} = $hsp->gaps;
  463. # $Negatives++;
  464. # $NEGG{$result->query_name}++;
  465. };
  466. };
  467. } # end if ident < 80
  468. else
  469. {
  470. if (($hsp->rank == 1) && ($hspC == 1) && ($hitC == 1))
  471. {
  472. # print ",";
  473. $noIdent{$method}{$result->query_name}++;
  474. $negatives{$method}{$result->query_name}{"ident"} = &roundFrac($hsp->frac_identical *100);
  475. $negatives{$method}{$result->query_name}{"consv"} = &roundFrac($hsp->frac_conserved *100);
  476. $negatives{$method}{$result->query_name}{"sign"} = $hit->significance;
  477. $negatives{$method}{$result->query_name}{"Hlength"} = $hsp->length('hit');
  478. $negatives{$method}{$result->query_name}{"Qlength"} = $hsp->length('query');
  479. $negatives{$method}{$result->query_name}{"Tlength"} = $hsp->length('total');
  480. $negatives{$method}{$result->query_name}{"Qrylen"} = $result->query_length;
  481. $negatives{$method}{$result->query_name}{"start"} = $hsp->start('hit');
  482. $negatives{$method}{$result->query_name}{"end"} = $hsp->end('hit');
  483. $negatives{$method}{$result->query_name}{"strand"} = $hsp->strand('hit');
  484. $negatives{$method}{$result->query_name}{"hits"} = $result->num_hits;
  485. my $name = $hit->name; $name =~ s/lcl\|//;
  486. $negatives{$method}{$result->query_name}{"chromR"} = $name;
  487. $negatives{$method}{$result->query_name}{"gaps"} = $hsp->gaps;
  488. # $Negatives++;
  489. # $NEGG{$result->query_name}++;
  490. };
  491. };
  492. } # end if length < 50
  493. else
  494. {
  495. if (($hsp->rank == 1) && ($hspC == 1) && ($hitC == 1))
  496. {
  497. # print ",";
  498. $short{$method}{$result->query_name}++;
  499. $negatives{$method}{$result->query_name}{"ident"} = &roundFrac($hsp->frac_identical *100);
  500. $negatives{$method}{$result->query_name}{"consv"} = &roundFrac($hsp->frac_conserved *100);
  501. $negatives{$method}{$result->query_name}{"sign"} = $hit->significance;
  502. $negatives{$method}{$result->query_name}{"Hlength"} = $hsp->length('hit');
  503. $negatives{$method}{$result->query_name}{"Qlength"} = $hsp->length('query');
  504. $negatives{$method}{$result->query_name}{"Tlength"} = $hsp->length('total');
  505. $negatives{$method}{$result->query_name}{"Qrylen"} = $result->query_length;
  506. $negatives{$method}{$result->query_name}{"start"} = $hsp->start('hit');
  507. $negatives{$method}{$result->query_name}{"end"} = $hsp->end('hit');
  508. $negatives{$method}{$result->query_name}{"strand"} = $hsp->strand('hit');
  509. $negatives{$method}{$result->query_name}{"hits"} = $result->num_hits;
  510. my $name = $hit->name; $name =~ s/lcl\|//;
  511. $negatives{$method}{$result->query_name}{"chromR"} = $name;
  512. $negatives{$method}{$result->query_name}{"gaps"} = $hsp->gaps;
  513. # $Negatives++;
  514. # $NEGG{$result->query_name}++;
  515. };
  516. };
  517. }; # end while hsp
  518. }; # end while result
  519. } # end if hits
  520. else
  521. {
  522. # print " ";
  523. $noHit{$method}{$result->query_name}++;
  524. # $NoHits++;
  525. # $NOHITG{$result->query_name}++;
  526. };
  527. }; # end if $result->num_hits
  528. $totalG = ($totalG/(keys %stat));
  529. # foreach my $name (keys %POSG)
  530. # {
  531. # if ($POSG{$name} > 1)
  532. # {
  533. # print "*"x20 . "\n$name IS DUPLICATED ON POSITIVE\n" . "*"x20;
  534. # }
  535. # }
  536. #
  537. # foreach my $name (keys %NEGG)
  538. # {
  539. # if ($NEGG{$name} > 1)
  540. # {
  541. # print "*"x20 . "\n$name IS DUPLICATED ON NEGATIVE\n" . "*"x20;
  542. # }
  543. # }
  544. #
  545. # foreach my $name (keys %NOHITG)
  546. # {
  547. # if ($NOHITG{$name} > 1)
  548. # {
  549. # print "*"x20 . "\n$name IS DUPLICATED ON NO HIT\n" . "*"x20;
  550. # }
  551. # }
  552. # my $dv = $Positives+$Blasts+$Negatives+$NoHits;
  553. # my $dv2 = (keys %totalG);
  554. # my $dv3 = (keys %POSG);
  555. # my $dv4 = (keys %NEGG);
  556. # my $dv5 = (keys %NOHITG);
  557. # print "POS $Positives + BLAST $Blasts = " . ($Positives+$Blasts) . " ($dv3)\n";
  558. # print "NEG $Negatives + NOHIT $NoHits = " . ($Negatives+$NoHits) . " ($dv4 | $dv5)\n";
  559. # print "VALID " . ($Positives+$Blasts) . " + INVALID " . ($Negatives+$NoHits) . " = $dv\n";
  560. # print "TOTAL $totalG (DV1 $dv | DV2 $dv2)\n";
  561. $| =1;
  562. &Log("$inputFile LOADED\n");
  563. return ($total, $totalG, \%positives, \%blasts, \%negatives, \%noIdent, \%noSig, \%short, \%noHit, \%stat)
  564. }
  565. sub printStat
  566. {
  567. my $total = $_[0];
  568. my $totalG = $_[1];
  569. my %positives = %{$_[2]};
  570. my %blasts = %{$_[3]};
  571. my %negatives = %{$_[4]};
  572. my %noIdent = %{$_[5]};
  573. my %noSig = %{$_[6]};
  574. my %short = %{$_[7]};
  575. my %noHit = %{$_[8]};
  576. my %stat = %{$_[9]};
  577. foreach my $method (sort {$a cmp $b} keys %stat)
  578. {
  579. my $minSize = $minsize{$method};
  580. my $minSig = $minsig{$method};
  581. my $minIdent = $minident{$method};
  582. my $minConsv = $minconsv{$method};
  583. my @resume = (0)x4;
  584. my $total2;
  585. foreach my $key (sort {$a <=> $b} keys %{$stat{$method}})
  586. {
  587. if ($key == 0.0)
  588. {
  589. $resume[0] += $stat{$method}{$key};
  590. $total2 += $stat{$method}{$key};
  591. }
  592. elsif (($key > 0) && ($key <= 1e-35))
  593. {
  594. $resume[1] += $stat{$method}{$key};
  595. $total2 += $stat{$method}{$key};
  596. }
  597. elsif (($key > 1e-35) && ($key <= 1e-30))
  598. {
  599. $resume[2] += $stat{$method}{$key};
  600. $total2 += $stat{$method}{$key};
  601. }
  602. elsif (($key > 1e-30) && ($key <= $minSig))
  603. {
  604. $resume[3] += $stat{$method}{$key};
  605. $total2 += $stat{$method}{$key};
  606. }
  607. elsif ($key > $minSig)
  608. {
  609. $resume[4] += $stat{$method}{$key};
  610. $total2 += $stat{$method}{$key};
  611. }
  612. else
  613. {
  614. die "ERROR COUNTING KEY: $key";
  615. }
  616. } # end foreach my key
  617. my $cum = 0;
  618. my $cumP = 0;
  619. if ($total && $total2)
  620. {
  621. my $noSig = (keys %{$noSig{$method}});
  622. my $noIdent = (keys %{$noIdent{$method}});
  623. my $short = (keys %{$short{$method}});
  624. my $noHit = (keys %{$noHit{$method}});
  625. my $positives = (keys %{$positives{$method}});
  626. my $blasts = (keys %{$blasts{$method}});
  627. my $negatives = (keys %{$negatives{$method}});
  628. my $invalid = $noSig+$noIdent+$short+$noHit;
  629. # if ($invalid != ($totalG-$total2)) { die "INVALIDS ($invalid) IS DIFFERENT FROM THE SUBTRATION OF TOTAL ($totalG) MINUS IDENTIFIED ($total2)." };
  630. # if ($invalid != ($negatives+$noHit)) { die "INVALIDS ($invalid) IS DIFFERENT FROM NEGATIVES ($negatives) PLUS NOHITS ($noHit) [" . ($negatives + $noHit) . "]." };
  631. &Log("FILENAME : $inputFile\n");
  632. &Log("METHOD : $method\n");
  633. &Log("TOTAL GLOBAL : $totalG\n");
  634. &Log("TOTAL INDENTIFIED: $total2" . "\t( " . &percentage($total2 ,$totalG) . "% )\n");
  635. &Log(" UNIQUES : $positives" . "\t( " . &percentage($positives,$totalG) . "% )\n");
  636. &Log(" MULTI RESULTS : $blasts" . "\t( " . &percentage($blasts ,$totalG) . "% )\n");
  637. &Log("INVALIDS : $invalid" . "\t( " . &percentage($invalid ,$totalG) . "% )\n");
  638. &Log(" NEGATIVES : $negatives" . "\t( " . &percentage($negatives,$totalG) . "% )\n");
  639. &Log(" NO HITS : $noHit" . "\t( " . &percentage($noHit ,$totalG) . "% )\n");
  640. &Log("\nRESUME \n");
  641. $cum += $resume[0];
  642. &Log(" VALIDS $total2 (" . &percentage($total2,$totalG) . "%)\n"); $cum += $noSig;
  643. &Log(" E-VALUE == 0.0 : " . $resume[0] . "\t( " . &percentage($resume[0],$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG). "%] )\n"); $cum += $resume[1];
  644. &Log(" 0.0 < E-VALUE <= 1e-35 : " . $resume[1] . "\t( " . &percentage($resume[1],$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG). "%] )\n"); $cum += $resume[2];
  645. &Log(" 1e-35 < E-VALUE <= 1e-30 : " . $resume[2] . "\t( " . &percentage($resume[2],$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG). "%] )\n"); $cum += $resume[3];
  646. &Log(" 1e-30 < E-VALUE <= $minSig : " . $resume[3] . "\t( " . &percentage($resume[3],$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG). "%] )\n"); $cum += $resume[4];
  647. &Log(" E-VALUE > $minSig : " . $resume[4] . "\t( " . &percentage($resume[3],$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG). "%] )\n");
  648. &Log("\n");
  649. &Log(" INVALIDS $invalid (" . &percentage($invalid,$totalG) . "%)\n"); $cum += $noSig;
  650. &Log(" E-VALUE > " . $minSig . "\t\t: " . $noSig . "\t( " . &percentage($noSig ,$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG) . "%] )\n"); $cum += $noIdent;
  651. &Log(" ID < $minIdent% & SIM < $minConsv%\t: " . $noIdent . "\t( " . &percentage($noIdent,$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG) . "%] )\n"); $cum += $short;
  652. &Log(" LENGTH < " . $minSize . "\t\t: " . $short . "\t( " . &percentage($short ,$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG) . "%] )\n"); $cum += $noHit;
  653. &Log(" NO HIT \t" . "" . "\t\t: " . $noHit . "\t( " . &percentage($noHit ,$totalG) . "% CUM $cum [ " . &percentage($cum,$totalG) . "%] )\n");
  654. &Log("\n\n" . "-"x50 . "\n\n");
  655. &Log("E-VALUE > " . "$minSig \t(" . (keys %{$noSig{$method}}) . ") \t:\n"); &printHash(\%noSig, $method);
  656. &Log("ID < $minIdent% & SIM < $minConsv\t(" . (keys %{$noIdent{$method}}) . ")" ."\t:\n"); &printHash(\%noIdent,$method);
  657. &Log("LENGTH < " . "$minSize \t(" . (keys %{$short{$method}}) . ") \t:\n"); &printHash(\%short, $method);
  658. &Log("NO HIT \t(" . "" . (keys %{$noHit{$method}}) . ") \t:\n"); &printHash(\%noHit, $method);
  659. &Log("\n\n" . "="x50 . "\n\n");
  660. }
  661. else
  662. {
  663. &Log("COULD NOT RETRIEVE RESULTS\n\n");
  664. }
  665. } # end foreach my method
  666. &printHashSumary(\%noSig,\%noIdent,\%short,\%noHit);
  667. }
  668. sub getSumaryStr
  669. {
  670. my @hashes = @_;
  671. my %sumary;
  672. my %methodC;
  673. foreach my $hashRef (@hashes)
  674. {
  675. my %hash = %{$hashRef};
  676. foreach my $method (sort keys %hash)
  677. {
  678. foreach my $gene (sort keys %{$hash{$method}})
  679. {
  680. my $data = &getCharStr($method,$gene);
  681. if (! $data ) {die "NO DATA RETRIEVED"; };
  682. my $value = $hash{$method}{$gene};
  683. $sumary{$gene}{"count"} += 1;
  684. push(@{$sumary{$gene}{"char"}}, $data);
  685. $methodC{$method}++;
  686. }
  687. }
  688. }
  689. return (\%sumary, \%methodC);
  690. }
  691. sub printHashSumary
  692. {
  693. my @result = &getSumaryStr(@_);
  694. my %sumary = %{$result[0]};
  695. my %methodC = %{$result[1]};
  696. my %stat;
  697. if ((keys %methodC) > 1)
  698. {
  699. &Log("\n\n" . "#"x50 . "\n" . "#"x50 . "\n" . "#"x50 . "\n");
  700. &Log("NOT IDENTIFIED GENES : \n");
  701. &Log("\n\n" . "#"x50 . "\n" . "#"x50 . "\n" . "#"x50 . "\n");
  702. my $countRec = 0;
  703. my $countLost = 0;
  704. my $outRec = "";
  705. my $outLost = "";
  706. foreach my $gene (sort keys %sumary)
  707. {
  708. my $value = $sumary{$gene}{"count"};
  709. my @data = @{$sumary{$gene}{"char"}};
  710. my $valid = 1;
  711. if ( ( ! $hypothetical ) && ( $gene =~ /hypothetical.protein/sig ) && ( ! ($gene =~ /conserved/sig ) ) ) { $valid = 0; };
  712. if ($value == 1) # found in only one of the lists, so, was a match in the other
  713. {
  714. my $data = $data[0];
  715. $countRec++;
  716. if ($valid) { $outRec .= "\t$countRec\t$data\t$gene\n"};
  717. $stat{"rec"}++;
  718. }
  719. elsif ($value > 1) # found in both lists, so was a mismatch every time
  720. {
  721. $countLost++;
  722. $outLost .= "\t$countLost" if ($valid);
  723. my $second = 0;
  724. foreach my $data (@data)
  725. {
  726. if ($valid)
  727. {
  728. if ($second++) { $outLost .= "\t"; };
  729. $outLost .= "\t$data\t$gene\n";
  730. }
  731. }
  732. $stat{"lost"}++;
  733. }
  734. }
  735. my $total = $countRec+$countLost;
  736. &Log(
  737. "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n",
  738. "X LOST X\n",
  739. "X===============================X\n",
  740. "X LOST BEFORE AFTER X\n");
  741. foreach my $prog (keys %methodC)
  742. {
  743. my $value = $methodC{$prog};
  744. &Log("X $prog\t$value\t$countLost\tX\n");
  745. }
  746. &Log(
  747. "X===============================X\n",
  748. "X RECOVERED X\n",
  749. "X===============================X\n",
  750. "X LOST AFTER X\n");
  751. foreach my $prog (keys %methodC)
  752. {
  753. my $value = $methodC{$prog};
  754. my $minus = $value - $countLost;
  755. &Log("X $prog\t\t$minus\tX\n");
  756. }
  757. &Log(
  758. "X===============================X\n",
  759. "X TOTAL X\n",
  760. "X===============================X\n",
  761. "X LOST $countLost\tX\n",
  762. "X RECOVERED $countRec\tX\n",
  763. "X TOTAL $total\tX\n",
  764. "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n");
  765. &Log("\n\tTOTAL $total\n");
  766. &Log("\tRECUPERED $countRec\n");
  767. &Log("\tCOUNT\tMETHOD\tHITS\tIDENT\tCONSV\tSIGN\tHLENGTH\tQLENGTH\tTLENGTH\tSTART\tEND\tSTRAND\tNAME\n");
  768. &Log($outRec);
  769. &Log("\n" . "-"x50 . "\n");
  770. &Log("\n\tLOST $countLost\n");
  771. &Log("\tCOUNT\tMETHOD\tHITS\tIDENT\tCONSV\tSIGN\tHLENGTH\tQLENGTH\tTLENGTH\tSTART\tEND\tSTRAND\tNAME\n");
  772. &Log($outLost);
  773. &Log("\n\n");
  774. } #end if keys methodC > 1
  775. }
  776. sub printHash
  777. {
  778. my %hash = %{$_[0]};
  779. my $method = $_[1];
  780. my @methods;
  781. if (defined $method) { $methods[0] = $method; } else { @methods = (keys %hash); };
  782. my $count = 1;
  783. &Log("\tCOUNT\tMETHOD\tHITS\tIDENT\tCONSV\tSIGN\tHLENGTH\tQLENGTH\tTLENGTH\tSTART\tEND\tSTRAND\tNAME\n");
  784. foreach my $method (sort @methods)
  785. {
  786. foreach my $name (sort keys %{$hash{$method}})
  787. {
  788. my $valid = 1;
  789. my $value = $hash{$method}{$name};
  790. my $data = &getCharStr($method,$name);
  791. if ( ( ! $hypothetical ) && ( $name =~ /hypothetical.protein/sig ) && ( ! ( $name =~ /conserved/sig ) ) ) { $valid = 0; };
  792. if ($value == 1)
  793. {
  794. if ($valid) { &Log("\t" . $count++ . "\t$data\t$name\n") };
  795. }
  796. elsif ($value >= 2)
  797. {
  798. if ($valid) { &Log("\t" . $count++ . "\t$data\t$name\n") };
  799. }
  800. else
  801. {
  802. &Log("GENE $name SHOUDNT BE HERE\n");
  803. }
  804. }
  805. }
  806. }
  807. sub getCharStr
  808. {
  809. my $method = $_[0];
  810. my $query_name = $_[1];
  811. my $result;
  812. my $colums = 10;
  813. my %data = %{&getChar($method,$query_name)};
  814. if (%data)
  815. {
  816. my $ident = $data{"ident"};
  817. my $consv = $data{"consv"};
  818. $result = "$method\t" . $data{"hits"} . "\t" . $ident . "\t" . $consv . "\t" . $data{"sign"} . "\t" . $data{"Hlength"} . "\t" . $data{"Qlength"} . "\t" . $data{"Tlength"} . "\t" . $data{"start"} . "\t" . $data{"end"} . "\t" . $data{"strand"};
  819. }
  820. else
  821. {
  822. $result = "\t"x$colums;
  823. }
  824. return $result;
  825. }
  826. sub roundFrac
  827. {
  828. my $number = $_[0];
  829. $number = sprintf("%03.1f", $number);
  830. return $number;
  831. }
  832. sub percentage
  833. {
  834. my $value = $_[0];
  835. my $total = $_[1];
  836. my $result;# = sprintf("%.1f", (($value/$total)*100));
  837. $value = (($value/$total)*100);
  838. # if ($value < 10)
  839. # {
  840. # $result = sprintf("%.1f", $value);
  841. $result = sprintf("%05.1f", $value);
  842. # }
  843. return $result;
  844. }
  845. 1;