PageRenderTime 69ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/WormBase/API/Service/blast_blat.pm

https://bitbucket.org/tharris/wormbase/
Perl | 1351 lines | 932 code | 273 blank | 146 comment | 124 complexity | 0d02883a921c51535268311c8e2e385c MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause
  1. package WormBase::API::Service::blast_blat;
  2. use Carp;
  3. use File::Temp;
  4. use Bio::SearchIO;
  5. use Bio::Tools::Blat;
  6. use Bio::Graphics;
  7. use Bio::SeqFeature::Generic;
  8. use Bio::SeqIO;
  9. use File::Slurp qw(slurp);
  10. use GD::Simple;
  11. use Moose;
  12. with 'WormBase::API::Role::Object';
  13. has 'file_dir' => (
  14. is => 'ro',
  15. lazy => 1,
  16. default => sub {
  17. return shift->tmp_dir('blast_blat');
  18. }
  19. );
  20. has 'image_dir' => (
  21. is => 'ro',
  22. lazy => 1,
  23. default => sub {
  24. return shift->tmp_image_dir('blast_blat');
  25. }
  26. );
  27. has 'BLAST_DATABASES' => (
  28. is => 'ro',
  29. lazy => 1,
  30. default => sub {
  31. my $self=shift;
  32. my $hash;
  33. my $BLAST_DIR = $self->pre_compile->{base}.$self->ace_dsn->version.$self->pre_compile->{blast};
  34. foreach my $species (split /\s+|\t/, $self->pre_compile->{b_genome}){
  35. (my $type = $species) =~ s/.*_//;
  36. $hash->{$type."_genome"} = {
  37. type => "nucl",
  38. location => "$BLAST_DIR/$species/genomic.fa"
  39. } ;
  40. $hash->{$type."_protein"} = {
  41. type => "prot",
  42. location => "$BLAST_DIR/$species/peptide.fa"
  43. } if(grep /$species/, split /\s+|\t/, $self->pre_compile->{b_protein}) ;
  44. $hash->{$type."_gene"} = {
  45. type => "nucl",
  46. location => "$BLAST_DIR/$species/genes.fa"
  47. } if(grep /$species/, split /\s+|\t/,$self->pre_compile->{b_gene}) ;
  48. $hash->{$type."_est"} = {
  49. type => "nucl",
  50. location => "$BLAST_DIR/$species/ests.fa"
  51. } if(grep /$species/, split /\s+|\t/,$self->pre_compile->{b_est}) ;
  52. }
  53. foreach my $archive (split /\s+|\t/,$self->pre_compile->{ARCHIVES}) {
  54. my $BLAST_DIR = $self->pre_compile->{base}.$archive.$self->pre_compile->{blast};
  55. $hash->{"elegans_genome_$archive"} = {
  56. type => "nucl",
  57. location => "$BLAST_DIR/c_elegans/genomic.fa"
  58. };
  59. $hash->{"elegans_protein_$archive"} = {
  60. type => "prot",
  61. location => "$BLAST_DIR/c_elegans/peptide.fa"
  62. };
  63. }
  64. return $hash;
  65. }
  66. );
  67. our %BLAST_APPLICATIONS = (
  68. blastn => {query_type => "nucl", database_type => "nucl"},
  69. blastp => {query_type => "prot", database_type => "prot"},
  70. blastx => {query_type => "nucl", database_type => "prot"},
  71. tblastn => {query_type => "prot", database_type => "nucl"},
  72. );
  73. our %BLAT_DATABASES = (
  74. elegans_genome => {type => "N", port => "2008", location => "/"},
  75. briggsae_genome => {type => "N", port => "2009", location => "/"},
  76. #brenneri_genome => {type => "N", port => "2010", location => "/"},
  77. #japonica_genome => {type => "N", port => "2011", location => "/"},
  78. #remanei_genome => {type => "N", port => "2012", location => "/"},
  79. #malayi_genome => {type => "N", port => "2013", location => "/"},
  80. #hapla_genome => {type => "N", port => "2014", location => "/"},
  81. #incognita_genome => {type => "N", port => "2015", location => "/"}
  82. );
  83. our %BLAST_FILTERS = ("filter" => "-F T",);
  84. sub index {
  85. my ($self,$params) = @_;
  86. $self->autoload($params) if $params->{'autoload'};
  87. my %data = (
  88. query_sequence => $params->{query_sequence} || '',
  89. check_query_type_nucl => '',
  90. check_query_type_prot => 'checked="1"',
  91. selected_blastn => '',
  92. selected_blastp => '',
  93. selected_elegans_protein => '',
  94. selected_elegans_genome => '',
  95. );
  96. if ($params->{query_type} and $params->{query_type} eq 'nucl') {
  97. @data{qw(check_query_type_nucl check_query_type_prot)}
  98. = @data{qw(check_query_type_prot check_query_type_nucl)} # flip them
  99. }
  100. if ($params->{blast_app}) {
  101. if ($params->{blast_app} eq 'blastn') {
  102. $data{selected_blastn} = 'selected="1"';
  103. }
  104. elsif ($params->{blast_app} eq 'blastp') {
  105. $data{selected_blastp} = 'selected="1"';
  106. }
  107. }
  108. if ($params->{database}) {
  109. if ($params->{database} eq 'elegans_genome') {
  110. $data{selected_elegans_genome} = 'selected="1"';
  111. }
  112. elsif ($params->{database} eq 'elegans_protein') {
  113. $data{selected_elegans_protein} = 'selected="1"';
  114. }
  115. }
  116. return \%data;
  117. }
  118. sub run {
  119. my ($self,$param) = @_;
  120. my ($query_file, $query_type, $result_file) = $self->process_input($param);
  121. return $self->display_results($param, $query_file, $query_type, $result_file);
  122. }
  123. sub error {
  124. return 0;
  125. }
  126. sub message {
  127. return 0;
  128. }
  129. sub process_input {
  130. my ($self,$cgi) = @_;
  131. my $query_sequence = $cgi->{"query_sequence"};
  132. my $query_type = $cgi->{"query_type"};
  133. if (!$query_sequence) {
  134. message("A query sequence is required!");
  135. }
  136. my $query_file = $self->process_query_sequence($query_sequence);
  137. my $temp_file = File::Temp->new(
  138. TEMPLATE => "out_XXXXX",
  139. DIR => $self->file_dir,
  140. SUFFIX => ".out",
  141. UNLINK => 0,
  142. );
  143. # my $address = $ENV{REMOTE_ADDR};
  144. # my $out_file = ($address eq "127.0.0.1") ? 'localhost-debug' : $temp_file->filename;
  145. my $out_file = $temp_file->filename;
  146. my $database = $cgi->{"database"};
  147. my $search_type = $cgi->{"search_type"};
  148. my $command_line;
  149. if ($search_type eq "blast") {
  150. my $blast_app = $cgi->{"blast_app"};
  151. if (!$BLAST_APPLICATIONS{$blast_app}) {
  152. error("Invalid BLAST application ($blast_app)!");
  153. }
  154. my $blast_e_value = $cgi->{"blast_e_value"};
  155. if ($blast_e_value =~ /[^\dE+\-]/) {
  156. error("Invalid BLAST e-value ($blast_e_value)!");
  157. }
  158. if (!$self->BLAST_DATABASES->{$database}) {
  159. error("Invalid BLAST database ($database)!");
  160. }
  161. my $database_type = $self->BLAST_DATABASES->{$database}->{type};
  162. my $database_location = $self->BLAST_DATABASES->{$database}->{location};
  163. my $expected_query_type = $BLAST_APPLICATIONS{$blast_app}{query_type};
  164. my $expected_database_type =
  165. $BLAST_APPLICATIONS{$blast_app}{database_type};
  166. if ( $query_type ne $expected_query_type
  167. or $database_type ne $expected_database_type) {
  168. error("Incompatible query($query_type)/database($database_type)");
  169. }
  170. my $blast_filter = "-F F";
  171. if ($cgi->{"process_query"}) {
  172. my $process_query_type =
  173. $cgi->{"process_query_type"}; # filter, mask
  174. my $blast_filter_identifier = join(":", $process_query_type);
  175. $blast_filter = $BLAST_FILTERS{$blast_filter_identifier};
  176. if (!$blast_filter) {
  177. error(
  178. "Invalid BLAST filter selection ($blast_filter_identifier)!"
  179. );
  180. }
  181. }
  182. $command_line =
  183. $self->pre_compile->{BLAST_EXEC_DIR}.qq[/bin/blastall -p $blast_app -d $database_location -i $query_file -e $blast_e_value $blast_filter -o $out_file >& $out_file.err];
  184. }
  185. elsif ($search_type eq "blat") {
  186. if (!$BLAT_DATABASES{$database}) {
  187. error("Invalid BLAT database ($database)!");
  188. }
  189. my $database_type = $BLAT_DATABASES{$database}{type};
  190. my $database_port = $BLAT_DATABASES{$database}{port};
  191. my $database_location = $BLAT_DATABASES{$database}{location};
  192. # if ($query_type ne $database_type) {
  193. # error("Incompatible query($query_type)/database($database_type)");
  194. # }
  195. # Currently only DNA searches are supported, if expanded adjust query and db types accordingly
  196. $command_line =
  197. $self->pre_compile->{BLAT_CLIENT}.qq[ -out=blast -t=dna -q=dna localhost $database_port $database_location $query_file $out_file >& $out_file.err];
  198. }
  199. else {
  200. error("Unknown search type ($search_type)!");
  201. }
  202. # Run search
  203. my $result = system($command_line);
  204. if ($result) {
  205. warn "$out_file.err";
  206. my $err = slurp("$out_file.err");
  207. my %seen;
  208. my @err = split(/\n+/, $err);
  209. my @nr_err;
  210. foreach (@err) {
  211. push @nr_err, $_ unless $seen{$_};
  212. $seen{$_}++;
  213. }
  214. my $nr_err = join('<br/>', @nr_err);
  215. $self->log->debug( "$command_line\n" );
  216. error("<b>BLAST/BLAT search failed!</b><br/><br/>$!<br/>$nr_err");
  217. }
  218. return ($query_file, $query_type, $out_file);
  219. }
  220. sub process_query_sequence {
  221. my ($self,$query_sequence) = @_;
  222. # Check query sequence
  223. check_query_sequence($query_sequence);
  224. # Clean query sequence of extra white space
  225. $query_sequence =~ s/^\s+//;
  226. $query_sequence =~ s/\s+$//;
  227. # if ($query_sequence =~ /^>/ && $query_sequence !~ /^>[^\n]*\n+[^\n]+/) {
  228. # message("Empty sequence!");
  229. # }
  230. # Add a header line if not available
  231. if ($query_sequence !~ />/) {
  232. $query_sequence = ">Query Sequence\n" . $query_sequence;
  233. }
  234. # Write query sequence to a temp file
  235. my $temp_file = File::Temp->new(
  236. TEMPLATE => "query_XXXXX",
  237. DIR => $self->file_dir,
  238. SUFFIX => ".seq",
  239. UNLINK => 0,
  240. );
  241. my $file_name = $temp_file->filename;
  242. open(SEQ, ">$file_name")
  243. or error("Cannot write temp query sequence file ($file_name)!");
  244. print SEQ $query_sequence;
  245. close SEQ;
  246. return $file_name;
  247. }
  248. sub check_query_sequence {
  249. my ($query_sequence) = @_;
  250. $query_sequence =~ s/^>[^\n\r]*[\n\r]+//;
  251. $query_sequence =~ s/[\n\r]//g;
  252. $query_sequence =~ s/\s+//g;
  253. $query_sequence = lc($query_sequence);
  254. if (length($query_sequence) < 10) {
  255. error("At least 10 residues are required!");
  256. }
  257. return 1;
  258. }
  259. sub display_results {
  260. my ($self,$cgi, $query_file, $query_type, $result_file) = @_;
  261. my $search_type = $cgi->{"search_type"};
  262. my $blast_app = $cgi->{"blast_app"} || '';
  263. # Generate (i)Alignment Image URL and (ii)Karyotype Viewer URL
  264. # my ($alignment_image, $kviewer_adds_ref, $genome_links_ref,
  265. # $expand_links_ref, $image_map_pieces_ref
  266. # )
  267. my $data = $self->process_result_file(
  268. $query_file, $query_type, $result_file,
  269. $search_type, $blast_app
  270. );
  271. my $pattern;
  272. my @result_file_array;
  273. # warn "$address done slurping" if DEBUG;
  274. my $stream = slurp($result_file);
  275. #note!!!!!!! this will not work currately if the format of the out put change.
  276. $blast_app=uc($blast_app);
  277. foreach my $file (split /$blast_app/, $stream) {
  278. next unless($file);
  279. push @result_file_array, $blast_app.$file;
  280. }
  281. my @final;
  282. foreach (@$data) {
  283. my ($query_name, $alignment_image, $score_key_image, $kviewer_adds_ref, $genome_links_ref,
  284. $expand_links_ref, $image_map_pieces_ref) = @$_;
  285. my ($alignment_image_file_name) = $alignment_image =~ /([^\/]+)$/;
  286. my ($score_key_image_file_name) = $score_key_image =~ /([^\/]+)$/;
  287. my $alignment_image_url = $self->tmp_image_uri($self->image_dir."/$alignment_image_file_name");
  288. my $score_key_image_url = $self->tmp_image_uri($self->image_dir."/$score_key_image_file_name");
  289. # Retrieve kviewer image and imagemap
  290. my $kviewer_image_content;
  291. my $kviewer_imagemap;
  292. my $address = $ENV{REMOTE_ADDR};
  293. # Slurp result file
  294. my $result_file_content =
  295. $self->result2html(shift @result_file_array, $genome_links_ref, $expand_links_ref);
  296. $self->log->debug( "$address: processing results file: done" );
  297. my $vars = {
  298. query_name => $query_name,
  299. kviewer_image_content => $kviewer_image_content,
  300. kviewer_imagemap => $kviewer_imagemap,
  301. score_key_image_url => $score_key_image_url,
  302. alignment_image_url => $alignment_image_url,
  303. image_map_pieces => $image_map_pieces_ref,
  304. result_file_content => $result_file_content,
  305. hsp_genome_link_part_limit => $self->pre_compile->{HSP_GENOME_LINK_PART_LIMIT},
  306. hsp_alignment_image_limit => $self->pre_compile->{HSP_ALIGNMENT_IMAGE_LIMIT},
  307. };
  308. push @final, $vars;
  309. }
  310. return {data=>\@final};
  311. }
  312. sub result2html {
  313. my ($self,$result, $genome_links_ref, $expand_links_ref) = @_;
  314. my $address = $ENV{REMOTE_ADDR};
  315. # warn "$address: the result file is: $result_file" if DEBUG;
  316. # warn "$address done slurping" if DEBUG;
  317. # BLAST outformat of BLAT contains hard-coded sequece/bp count
  318. if ($result =~ /BLAT - The BLAST-like alignment tool/) {
  319. $result =~ s/23 sequences; 3,000,000,000 total letters//;
  320. }
  321. $result =~ s/\n{3,}/\n\n/g;
  322. $result =~ s/\n/<br\/>\n/g;
  323. $result =~ s/ /&nbsp;/g;
  324. my @content;
  325. my $current_idx = 0;
  326. my $current_hit = "";
  327. my $current_type = "header";
  328. my $current_sub_type = "";
  329. my $end_header_parsing = 0;
  330. my $end_parsing = 0;
  331. my $anchor_counter = 0;
  332. my $expand_area_counter = 0;
  333. # WARNING! THIS IS INCREDIBLY BRITTLE.
  334. # IF THE FORMAT OF OUR FASTA HEADERS CHANGES
  335. # THIS PARSING CODE WILL LIKELY BREAK.
  336. # CHECK BELOW FOR "FASTA WARNING"
  337. # Reformat the summary table of the BLAST report
  338. my ($hash);
  339. foreach my $line (split("<br/>\n", $result)) {
  340. my $link;
  341. #print STDERR "\n\nPROCESSING $line\n" if DEBUG;
  342. if ($current_type eq 'header' && $current_sub_type eq 'summary') {
  343. if ($line =~ /^WARNING/) {
  344. $end_header_parsing = 1;
  345. }
  346. # FASTA WARNING!
  347. # if (!$end_header_parsing && $line =~ /^([^&>]+?)\#/) {
  348. if (!$end_header_parsing && $line =~ /^([^&>]+?)[\#&]+?/) {
  349. my $hit = $1;
  350. # print STDERR "HERE: hit is $hit\n" if DEBUG;
  351. # Try and fasta headers that contain re-unfriendly characters.
  352. $hit =~ s/[\[\]]/--/g;
  353. $expand_area_counter++;
  354. my ($sequence_link, $gene_link, $protein_link,
  355. $corr_gene_link) = $self->make_links($hit);
  356. my $expand_link = $expand_links_ref->{$hit};
  357. my $alignment_link = "#hit_anchor_${hit}";
  358. my $genome_link;
  359. $genome_link = {class=>'genomic_location', id=>$genome_links_ref->{$hit} , label=>'[Genome View]'} if $genome_links_ref->{$hit};
  360. # Replace the Gene name in the report with a link to the sequence.
  361. $line =~ s/^$hit//;
  362. $line =~ s/\#/ /g;
  363. # Highlight the hit.
  364. =pod
  365. $line = qq[<font class="bold">] . $line . qq[</font>];
  366. $line .= qq[<br/>\n];
  367. $line .=
  368. qq[<img class="expand_button" expand_area_count="$expand_area_counter" src="/blast_blat/blank.png"/>]
  369. if $expand_link;
  370. $line .= qq[$alignment_link];
  371. $line .= qq[$genome_link] if $genome_links_ref->{$hit};
  372. $line .= qq[$gene_link] if $gene_link;
  373. $line .= qq[$protein_link] if $protein_link;
  374. $line .= qq[$corr_gene_link] if $corr_gene_link;
  375. $line .= qq[<br/>\n];
  376. $line .=
  377. qq[<div class="expand_area" expand_area_count="$expand_area_counter" expand_link="$expand_link"></div>\n]
  378. if $expand_link;
  379. =cut
  380. $link= $hash->{$hit} = {
  381. #expand_area_counter=>$expand_area_counter,
  382. sequence_link=> $sequence_link,
  383. expand_link=>$expand_link,
  384. alignment_link=>$alignment_link,
  385. genome_link=>$genome_link,
  386. gene_link=>$gene_link,
  387. protein_link=>$protein_link,
  388. corr_gene_link=>$corr_gene_link
  389. };
  390. }
  391. }
  392. # print STDERR "TYPE: $current_type; SUBTYPE: $current_sub_type\n" if DEBUG;
  393. if ($current_type eq 'hit' && $line !~ /^>/) {
  394. $line =~ s!/([^&=]+)=!<font class="red">$1</font>=!g;
  395. }
  396. if ($end_parsing) {
  397. # Do nothing
  398. } elsif ($line =~ /^Sequences&nbsp;producing&nbsp;/) { # Do not create a new type/section
  399. $current_sub_type = "summary";
  400. # FASTA WARNING!
  401. # } elsif ($line =~ /^>([^&>].+?)\#/) {
  402. # } elsif ($line =~ /^>([^&>]+?)[\#&]?/) { --this one doesn't work...
  403. } elsif ($line =~ /^>([^&>]+?)([\#&]|$)/) {
  404. $current_hit = $1;
  405. # print STDERR "CURRENT HIT: $current_hit\n" if DEBUG;
  406. $line =~ s!/([^&=]+)=!<font class="red">$1</font>=!g;
  407. # my ($sequence_link, $gene_link, $protein_link, $corr_gene_link) =
  408. # $self->make_links($current_hit);
  409. $current_hit =~ s/[\[\]]/--/g;
  410. $line =~ s/^>$current_hit//;
  411. $line =~ s/\#/ /g;
  412. $line =~ s/\n/ /g;
  413. $line =~ s/<br\/>/ /g;
  414. $current_sub_type = ""; # Close summary sub_type
  415. $current_idx++;
  416. $current_type = "hit";
  417. # push @{$content[$current_idx]->{html}},
  418. $link= {hit_sequence_link=> $hash->{$current_hit}->{sequence_link},current_hit=>$current_hit };
  419. # qq[<a name="hit_anchor_${current_hit}"></a>\n];
  420. } elsif ($line =~ /Length\&nbsp;=/) {
  421. # $line .= qq[<br/>\n];
  422. =pod
  423. my ($sequence_link, $gene_link, $protein_link, $corr_gene_link) =
  424. $self->make_links($current_hit);
  425. $expand_area_counter++;
  426. my $expand_link = $expand_links_ref->{$current_hit};
  427. my $genome_link = a(
  428. $genome_links_ref->{$current_hit} ,
  429. '[Genome View]'
  430. );
  431. $line .= qq[<br/>\n];
  432. $line .=
  433. qq[<img class="expand_button" expand_area_count="$expand_area_counter" src="/blast_blat/blank.png"/>]
  434. if $expand_link;
  435. $line .= qq[$genome_link] if $genome_links_ref->{$current_hit};
  436. $line .= qq[$gene_link] if $gene_link;
  437. $line .= qq[$protein_link] if $protein_link;
  438. $line .= qq[$corr_gene_link] if $corr_gene_link;
  439. $line .= qq[<br/>\n];
  440. $line .=
  441. qq[<div class="expand_area" expand_area_count="$expand_area_counter" expand_link="$expand_link"></div>\n]
  442. if $expand_link;
  443. =cut
  444. $link= $hash->{$current_hit};
  445. } elsif ($line =~ /Strand\&nbsp;HSPs:/) {
  446. # $current_idx++;
  447. # $current_type = "info";
  448. next;
  449. } elsif ($line =~ /^\&nbsp;Score/) {
  450. # $current_idx++;
  451. $current_type = "hit";
  452. $anchor_counter++;
  453. push @{$content[$current_idx]->{html}},
  454. qq[<a name="anchor${anchor_counter}"></a>\n];
  455. } elsif ($line =~ /^Parameters:/ or $line =~ /&nbsp;&nbsp;Database:/) {
  456. $current_idx++;
  457. $current_type = "footer";
  458. $end_parsing = 1;
  459. }
  460. $content[$current_idx]->{type} = $current_type;
  461. $line={ line=>$line,link=>$link} if($link) ;
  462. push @{$content[$current_idx]->{html}}, $line;
  463. }
  464. # warn "$address: done parsing blast report" if DEBUG;
  465. return \@content;
  466. }
  467. sub make_links {
  468. my ($self,$hit) = @_;
  469. my ($id1,$id2,$id3,$discard) = split /\#/, $hit;
  470. # $id1 = 0;
  471. if ($id1){
  472. $hit = $id1;
  473. }
  474. elsif ($id2){
  475. $hit = $id2;
  476. }
  477. else {
  478. $hit = $id3;
  479. }
  480. # check_output($hit);
  481. my $sequence_link;
  482. my $gene_link;
  483. my $protein_link;
  484. my $corr_gene_link;
  485. # elegans
  486. if ($hit =~ /^(CHROMOSOME_|)([IVX]+|MtDNA)/) {
  487. $sequence_link = { class=>"sequence", id=>"CHROMOSOME_$2", label=>$hit};
  488. }
  489. # cb3
  490. elsif ($hit =~ /^chr/) {
  491. $sequence_link = $hit;
  492. }
  493. # pb2801
  494. elsif ($hit =~ /^contig/i) {
  495. $sequence_link = $hit;
  496. }
  497. # cb25
  498. elsif ($hit =~ /^cb25/) {
  499. $sequence_link = $hit;
  500. }
  501. # briggpep
  502. elsif ($hit =~ /^BP:/i) {
  503. $sequence_link = { class=>"sequence", id=>$hit, label=>$hit};
  504. $gene_link = { class=>"gene", id=>$hit, label=>'[Gene Summary]'};
  505. $protein_link = { class=>"protein", id=>$hit, label=>'[Protein Summary]'};
  506. }
  507. # remanei
  508. elsif ($hit =~ /sctg/i) {
  509. $sequence_link = "$hit";
  510. $gene_link = '';
  511. $protein_link = '';
  512. }
  513. # wormpep
  514. elsif ($hit =~ /\./i && $hit !~ /^yk/) {
  515. $sequence_link = { class=>"sequence", id=>$hit, label=>$hit};
  516. $gene_link = { class=>"gene", id=>$hit, label=>'[Gene Summary]'};
  517. $protein_link = { class=>"protein", id=>$hit, label=>'[Protein Summary]'};
  518. }
  519. # est
  520. else {
  521. $sequence_link = { class=>"sequence", id=>$hit, label=>$hit};
  522. my ($object) = $self->dsn->{acedb}->fetch('Sequence' => $hit);
  523. my ($corr_gene) = $object->Gene if $object;
  524. if (!$corr_gene) {
  525. my ($matching_cds) = $object->Matching_cds if $object;
  526. ($corr_gene) = $matching_cds->Gene if $matching_cds;
  527. }
  528. my $corr_gene_name = $self->common_name($corr_gene) if $corr_gene;
  529. $corr_gene_link = { class=>"gene", id=>$corr_gene_name, label=>"[Corr. Gene: $corr_gene_name]"}
  530. if $corr_gene_name;
  531. }
  532. return ($sequence_link, $gene_link, $protein_link, $corr_gene_link);
  533. }
  534. sub process_result_file {
  535. my ($self,$query_file, $query_type, $result_file, $search_type, $blast_app) =
  536. @_;
  537. my @data;
  538. my $searchio;
  539. if ($search_type eq "blast" or $search_type eq "blat") {
  540. $searchio = new Bio::SearchIO(
  541. -format => "blast",
  542. -file => $result_file
  543. );
  544. }
  545. else {
  546. error("Unreconized search type ($search_type)!");
  547. }
  548. # Determine query length
  549. my $in = Bio::SeqIO->new(-file => $query_file, -format => "Fasta");
  550. for (my $count=1;my $query_seq = $in->next_seq;$count++) {
  551. my $query_length = $query_seq->length;
  552. my $query_name = $query_seq->id;
  553. # Generate score key colors
  554. my @score_key_colors;
  555. for my $i (0 .. 4) {
  556. my $h_value = 251 * (255 / 360);
  557. my $s_value = $i * 25 * (255 / 100);
  558. my $v_value = 100 * (255 / 100);
  559. my @rgb_values = GD::Simple->HSVtoRGB($h_value, $s_value, $v_value);
  560. push @score_key_colors, sprintf("#%.2x%.2x%.2x", @rgb_values);
  561. }
  562. # Store hits
  563. my @piece_refs;
  564. # Store genome links/positions during processing
  565. my %genome_links;
  566. my %expand_links;
  567. # Create image panels
  568. my $panel = Bio::Graphics::Panel->new(
  569. -length => $query_length,
  570. -width => 500,
  571. -pad_left => 10,
  572. -pad_right => 10,
  573. -pad_top => 10,
  574. -pad_bottom => 10,
  575. -key_style => "left",
  576. );
  577. my $score_key = Bio::Graphics::Panel->new(
  578. -length => $query_length,
  579. -width => 532,
  580. -pad_left => 1,
  581. -pad_right => 1,
  582. -pad_top => 1,
  583. -pad_bottom => 1,
  584. -key_style => "left",
  585. );
  586. # Score key
  587. my @score_key;
  588. for my $i (0 .. 19) {
  589. my $display_name = ' ';
  590. $display_name = '0 bits' if $i == 0;
  591. $display_name = '800+ bits' if $i == 18;
  592. my $feature = Bio::SeqFeature::Generic->new(
  593. -score => $i * (800 / 19),
  594. -start => $i ? ($query_length / 20) * $i : 1,
  595. -end => ($query_length / 20) * $i + ($query_length / 20),
  596. -display_name => $display_name,
  597. );
  598. push @score_key, $feature;
  599. }
  600. my $track = $score_key->add_track(
  601. -glyph => 'graded_segments',
  602. -key => ' SCORE KEY: ',
  603. -fgcolor => 'gray',
  604. -max_score => 800,
  605. -min_score => 0,
  606. -sort_order => 'high_score',
  607. -bgcolor => 'blue',
  608. -label => 1,
  609. -height => 16,
  610. -bump => 0,
  611. );
  612. $track->add_feature(@score_key);
  613. # Query sequence
  614. my $query = Bio::SeqFeature::Generic->new(
  615. -start => 1,
  616. -end => $query_length,
  617. -display_name => "QUERY: $query_name",
  618. );
  619. $panel->add_track(
  620. $query,
  621. -glyph => "arrow",
  622. -tick => 2,
  623. -fgcolor => "black",
  624. -double => 1,
  625. -label => 1,
  626. );
  627. # Process BLAST/BLAT output - both are provided in blast format
  628. my $result = $searchio->next_result;
  629. if (!$result && $search_type eq "blat")
  630. { # Blat does not provide output when no hits
  631. error("No hits were found!");
  632. }
  633. my $hsp_counter = 0;
  634. $result->rewind;
  635. while (my $hit = $result->next_hit) {
  636. next
  637. if $hit->num_hsps < 1; # Work-around, BLAT producing blank hits
  638. my @hsps_in_track;
  639. my $clustered_plus_strand_hsps = Bio::SeqFeature::Generic->new;
  640. my $clustered_minus_strand_hsps = Bio::SeqFeature::Generic->new;
  641. my $name = $hit->name;
  642. my ($formatted_name) = $name =~ /([^\/]+)$/;
  643. if (length($formatted_name) > 20) {
  644. $formatted_name =~ s/^(.{16})(.*)/$1 .../;
  645. }
  646. $formatted_name = sprintf("%20s", $formatted_name);
  647. my $description = $hit->description;
  648. $description =~ s!^/!!;
  649. my @formatted_parts;
  650. foreach my $part (split(" /", $description)) {
  651. my ($key, $value) = split("=", $part);
  652. if ($key eq "id" or $key eq "tentative_id") {
  653. push @formatted_parts, "/$key=$value";
  654. }
  655. }
  656. my $formatted_parts = join " ", @formatted_parts;
  657. my $formatted_description =
  658. length($formatted_parts) < 96
  659. ? $formatted_parts
  660. : substr($formatted_parts, 0, 92) . " ...";
  661. # my $start = $hit->start;
  662. # my $end = $hit->end;
  663. my ($piece_refs_ref, $hit_genome_link, $hit_expand_link) =
  664. $self->extract_hit_info($hit, $query_name, $query_type);
  665. push @piece_refs, @{$piece_refs_ref} if @{$piece_refs_ref};
  666. $genome_links{$hit->name} = $hit_genome_link;
  667. $expand_links{$hit->name} = $hit_expand_link;
  668. $self->sort_hits($hit, $blast_app); # Sort hits and add a cluster tag
  669. my $hsp_count_per_hit = 0;
  670. $hit->rewind;
  671. while (my $hsp = $hit->next_hsp) {
  672. $hsp_counter++;
  673. next if $hsp_count_per_hit > $self->pre_compile->{HSP_ALIGNMENT_IMAGE_LIMIT};
  674. $hsp_count_per_hit++;
  675. my $hsp_strand =
  676. $hsp->strand('hit') ne $hsp->strand('query') ? -1 : 1;
  677. $hsp_strand = $hsp->strand('hit')
  678. if $blast_app =~ /^tblastn/; # Exception
  679. $hsp_strand = $hsp->strand('query')
  680. if $blast_app =~ /^blastx/; # Exception
  681. my ($hsp_cluster) = $hsp->get_tag_values('cluster');
  682. my $feature = Bio::SeqFeature::Generic->new(
  683. -display_name => $hsp->hit->start . '..'
  684. . $hsp->hit->end, # . '(' . $hsp->score . ')',
  685. -score => $hsp->bits, # $hsp->score,
  686. -start => $hsp->start,
  687. -end => $hsp->end,
  688. -strand => $hsp_strand,
  689. -tag => {anchor => "anchor$hsp_counter"},
  690. );
  691. if ($hsp_cluster && $hsp_strand eq '1') {
  692. $clustered_plus_strand_hsps->add_sub_SeqFeature(
  693. $feature,
  694. 'EXPAND'
  695. );
  696. $clustered_plus_strand_hsps->score(
  697. $clustered_plus_strand_hsps->score + $feature->score);
  698. }
  699. elsif ($hsp_cluster && $hsp_strand eq '-1') {
  700. $clustered_minus_strand_hsps->add_sub_SeqFeature(
  701. $feature,
  702. 'EXPAND'
  703. );
  704. $clustered_minus_strand_hsps->score(
  705. $clustered_minus_strand_hsps->score +
  706. $feature->score);
  707. }
  708. else {
  709. push @hsps_in_track, $feature;
  710. }
  711. }
  712. my $track = $panel->add_track(
  713. -glyph => 'graded_segments',
  714. -connector => 'dashed',
  715. -max_score => 800,
  716. -min_score => 0,
  717. -sort_order => 'high_score',
  718. -key => "$formatted_name ",
  719. -bgcolor => 'blue',
  720. -strand_arrow => 1,
  721. -height => 7,
  722. -label => 0,
  723. -pad_bottom => 3,
  724. );
  725. $track->add_feature($clustered_plus_strand_hsps)
  726. if $clustered_plus_strand_hsps->start;
  727. $track->add_feature($clustered_minus_strand_hsps)
  728. if $clustered_minus_strand_hsps->start;
  729. $track->add_feature(@hsps_in_track) if @hsps_in_track;
  730. }
  731. # Write image to a temp file
  732. my $temp_file = File::Temp->new(
  733. TEMPLATE => "alignment_XXXXX",
  734. DIR => $self->image_dir,
  735. SUFFIX => ".png",
  736. UNLINK => 0,
  737. );
  738. my $alignment_image = $temp_file->filename;
  739. $temp_file = File::Temp->new(
  740. TEMPLATE => "score_key_XXXXX",
  741. DIR => $self->image_dir,
  742. SUFFIX => ".png",
  743. UNLINK => 0,
  744. );
  745. my $score_key_image = $temp_file->filename;
  746. # Print out images
  747. open(IMAGE, ">$alignment_image")
  748. or error("Cannot write file ($alignment_image): $!");
  749. print IMAGE $panel->png;
  750. $panel->finished;
  751. close IMAGE;
  752. open(IMAGE, ">$score_key_image")
  753. or error("Cannot write file ($score_key_image): $!");
  754. print IMAGE $score_key->png;
  755. $score_key->finished;
  756. close IMAGE;
  757. # Generate image map
  758. my @image_map_pieces;
  759. # my @boxes = $panel->boxes;
  760. # foreach my $map_component (@boxes) {
  761. # my ($feature, $x1, $y1, $x2, $y2, $track) = @{$map_component};
  762. #
  763. # my ($anchor) = $feature->get_tag_values('anchor');
  764. # $x1 += 142;
  765. # $x2 += 142;
  766. # push @image_map_pieces, { coords => qq[$x1, $y1, $x2, $y2], href => qq[#$anchor] } if $anchor; # Padding (temporary)
  767. # }
  768. # kviewer_string
  769. my %kviewer_adds;
  770. my $kviewer_adds = 0;
  771. foreach (@piece_refs) {
  772. next unless $_->{pos};
  773. $kviewer_adds++;
  774. my $kviewer_add;
  775. my $chr = $_->{chr};
  776. my $name = $_->{name};
  777. my $type = $_->{type};
  778. my $pos = $_->{pos};
  779. my $link = CGI::escape($_->{genome_link}); # ***
  780. my $name_string = join(",", @{$name});
  781. $kviewer_add =
  782. qq[$type ${name_string}-$kviewer_adds ${pos}]
  783. ; # Zero-length markers do not work
  784. push @{$kviewer_adds{$chr}}, $kviewer_add;
  785. }
  786. push @data, [$query_name, $alignment_image, $score_key_image,
  787. \%kviewer_adds, \%genome_links, \%expand_links,
  788. \@image_map_pieces];
  789. }
  790. return \@data;
  791. }
  792. sub extract_hit_info {
  793. my ($self,$hit, $query_name, $query_type) = @_;
  794. #my $piece_ref;
  795. my @piece_refs;
  796. my $hit_genome_link;
  797. my $hit_expand_link;
  798. my $hit_description = $hit->description;
  799. my $hit_name = $hit->name;
  800. my $gbrowse_root_id;
  801. # TH 7/2008: WARNING! This is hard-coded and will break if we have new IDs.
  802. if ($hit_name =~ /^cb25/) {
  803. $gbrowse_root_id = 'c_briggsae_cb25';
  804. }
  805. elsif ($hit_name =~ /^chr/) {
  806. $gbrowse_root_id = 'c_briggsae';
  807. }
  808. elsif ($hit_name =~ /^(CHROMOSOME_|)([IVX]+|MtDNA)/) {
  809. $gbrowse_root_id = 'c_elegans';
  810. }
  811. my $gbrowse_root = qq[$gbrowse_root_id/]
  812. if $gbrowse_root_id;
  813. my $expand_root =
  814. qq[$gbrowse_root_id/]
  815. if $gbrowse_root_id;
  816. # Scenario 1 - decription contains chromosome location; C. elegans
  817. if ($hit_description =~ /\/map=(CHROMOSOME_|)([^\/]+)\/(\d+),(\d+)/) {
  818. my ($chr, $start, $end) = ($2, $3, $4);
  819. my $counter;
  820. $hit->rewind;
  821. my @hsp_genome_link_parts;
  822. while (my $hsp = $hit->next_hsp) {
  823. $counter++;
  824. next if $query_type eq 'P'; # No HSPs for protein hits
  825. my $hsp_start = $hsp->start('hit');
  826. my $hsp_end = $hsp->end('hit');
  827. my $hsp_strand =
  828. $hsp->strand('hit') ne $hsp->strand('query') ? -1 : 1;
  829. my $hsp_genome_link_part = qq[add=]
  830. . qq[${hit_name}+Hits+HSP($counter)+$hsp_start-$hsp_end];
  831. push @hsp_genome_link_parts, $hsp_genome_link_part
  832. if @hsp_genome_link_parts < $self->pre_compile->{HSP_GENOME_LINK_PART_LIMIT};
  833. }
  834. $hit_genome_link =
  835. $gbrowse_root . qq[?]
  836. . join(";", qq[name=${hit_name}], @hsp_genome_link_parts)
  837. if $gbrowse_root;
  838. $hit_expand_link = $expand_root . qq[?]
  839. . join(
  840. ";",
  841. # 'type=CG;width=450', qq[name=${hit_name}], @hsp_genome_link_parts
  842. 'width=450', qq[name=${hit_name}], @hsp_genome_link_parts
  843. )
  844. if $expand_root;
  845. my $piece_ref = {
  846. chr => $chr,
  847. name => [$hit_name],
  848. type => 'hit',
  849. pos => "$start..$end",
  850. genome_link => undef,
  851. expand_link => undef,
  852. };
  853. push @piece_refs, $piece_ref;
  854. }
  855. # Scenario 2 - map to chromosome; C. elegans
  856. elsif ($hit_name =~ /CHROMOSOME_([^\/]+)/
  857. || $hit_name =~ /^([IVX]+|MtDNA)/) {
  858. my ($chr) = ($1);
  859. my $counter;
  860. my $top_hsp_start;
  861. my $top_hsp_end;
  862. my $top_hsp_strand;
  863. $hit->rewind;
  864. my @hsp_genome_link_parts;
  865. while (my $hsp = $hit->next_hsp) {
  866. $counter++;
  867. next if $query_type eq 'P'; # No HSPs for protein hits
  868. my $hsp_start = $hsp->start('hit');
  869. my $hsp_end = $hsp->end('hit');
  870. my $hsp_strand =
  871. $hsp->strand('hit') ne $hsp->strand('query') ? -1 : 1;
  872. my $hsp_genome_link_part =
  873. qq[add=] . qq[${chr}+Hits+HSP($counter)+$hsp_start..$hsp_end];
  874. push @hsp_genome_link_parts, $hsp_genome_link_part
  875. if @hsp_genome_link_parts < $self->pre_compile->{HSP_GENOME_LINK_PART_LIMIT};
  876. if ($counter == 1) {
  877. $top_hsp_start = $hsp_start;
  878. $top_hsp_end = $hsp_end;
  879. $top_hsp_strand = $hsp_strand;
  880. }
  881. my $piece_ref = {
  882. chr => $chr,
  883. name => [$hit_name],
  884. type => 'hsp',
  885. pos => "$hsp_start..$hsp_end",
  886. genome_link => undef,
  887. expand_link => undef,
  888. };
  889. push @piece_refs, $piece_ref;
  890. }
  891. my $view_start;
  892. my $view_end;
  893. if ($top_hsp_end > $top_hsp_start) {
  894. $view_start = $top_hsp_start - 3000;
  895. $view_end = $top_hsp_end + 3000;
  896. }
  897. else {
  898. $view_start = $top_hsp_start + 3000;
  899. $view_end = $top_hsp_end - 3000;
  900. }
  901. $hit_genome_link = $gbrowse_root . qq[?]
  902. . join(
  903. ";",
  904. qq[name=${chr}:${view_start}..${view_end}],
  905. @hsp_genome_link_parts
  906. )
  907. if $gbrowse_root;
  908. $hit_expand_link = $expand_root . qq[?]
  909. . join(
  910. ";",
  911. # 'type=CG;width=450',
  912. 'width=450',
  913. qq[name=${hit_name}:${view_start}..${view_end}],
  914. @hsp_genome_link_parts
  915. )
  916. if $expand_root;
  917. }
  918. # Scenario 3 - neither
  919. else {
  920. my $counter;
  921. my $top_hsp_start;
  922. my $top_hsp_end;
  923. my $top_hsp_strand;
  924. $hit->rewind;
  925. my @hsp_genome_link_parts;
  926. while (my $hsp = $hit->next_hsp) {
  927. $counter++;
  928. next if $query_type eq 'P'; # No HSPs for protein hits
  929. my $hsp_start = $hsp->start('hit');
  930. my $hsp_end = $hsp->end('hit');
  931. my $hsp_strand =
  932. $hsp->strand('hit') ne $hsp->strand('query') ? -1 : 1;
  933. my $hsp_genome_link_part = qq[add=]
  934. . qq[${hit_name}+Hits+HSP($counter)+$hsp_start-$hsp_end];
  935. push @hsp_genome_link_parts, $hsp_genome_link_part
  936. if @hsp_genome_link_parts < $self->pre_compile->{HSP_GENOME_LINK_PART_LIMIT};
  937. if ($counter == 1) {
  938. $top_hsp_start = $hsp_start;
  939. $top_hsp_end = $hsp_end;
  940. $top_hsp_strand = $hsp_strand;
  941. }
  942. my $piece_ref = {
  943. chr => undef,
  944. name => undef,
  945. type => undef,
  946. pos => undef,
  947. genome_link => undef,
  948. expand_link => undef,
  949. };
  950. push @piece_refs, $piece_ref;
  951. }
  952. my $view_start;
  953. my $view_end;
  954. if ($top_hsp_end > $top_hsp_start) {
  955. $view_start = $top_hsp_start - 3000;
  956. $view_end = $top_hsp_end + 3000;
  957. }
  958. else {
  959. $view_start = $top_hsp_start + 3000;
  960. $view_end = $top_hsp_end - 3000;
  961. }
  962. $hit_genome_link = $gbrowse_root . qq[?]
  963. . join(
  964. ";",
  965. qq[name=${hit_name}:${view_start}..${view_end}],
  966. @hsp_genome_link_parts
  967. )
  968. if $gbrowse_root;
  969. $hit_expand_link = $expand_root . qq[?]
  970. . join(
  971. ";",
  972. 'width=450', qq[name=${hit_name}], @hsp_genome_link_parts
  973. # 'type=CG;width=450', qq[name=${hit_name}], @hsp_genome_link_parts
  974. )
  975. if $expand_root;
  976. }
  977. return (\@piece_refs, $hit_genome_link, $hit_expand_link);
  978. }
  979. sub sort_hits {
  980. my ($self,$hit, $blast_app) = @_;
  981. my @all_hsps;
  982. # Mark with unique identifiers and copy into a new array as refs
  983. $hit->rewind;
  984. my $hsp_counter = -1;
  985. while (my $hsp = $hit->next_hsp) {
  986. next if $hsp_counter > $self->pre_compile->{HSP_ALIGNMENT_IMAGE_LIMIT};
  987. $hsp_counter++;
  988. $hsp->add_tag_value('id', $hsp_counter);
  989. $hsp->add_tag_value('cluster', 1);
  990. $all_hsps[$hsp_counter] = $hsp;
  991. }
  992. # Mark ones to be clustered
  993. foreach my $i (0 .. $#all_hsps) {
  994. my ($ith_cluster) = $all_hsps[$i]->get_tag_values('cluster');
  995. my $ith_score = $all_hsps[$i]->bits;
  996. my $ith_start = $all_hsps[$i]->start('query');
  997. my $ith_end = $all_hsps[$i]->end('query');
  998. my $ith_strand =
  999. $all_hsps[$i]->strand('hit') ne $all_hsps[$i]->strand('query')
  1000. ? -1
  1001. : 1;
  1002. $ith_strand = $all_hsps[$i]->strand('hit')
  1003. if $blast_app =~ /^tblastn/; # Exception
  1004. $ith_strand = $all_hsps[$i]->strand('query')
  1005. if $blast_app =~ /^blastx/; # Exception
  1006. foreach my $j (0 .. $#all_hsps) {
  1007. next if $i == $j;
  1008. my ($jth_cluster) = $all_hsps[$j]->get_tag_values('cluster');
  1009. my $jth_score = $all_hsps[$j]->bits;
  1010. my $jth_start = $all_hsps[$j]->start('query');
  1011. my $jth_end = $all_hsps[$j]->end('query');
  1012. my $jth_strand =
  1013. $all_hsps[$j]->strand('hit') ne $all_hsps[$j]->strand('query')
  1014. ? -1
  1015. : 1;
  1016. $jth_strand = $all_hsps[$j]->strand('hit')
  1017. if $blast_app =~ /^tblastn/; # Exception
  1018. $jth_strand = $all_hsps[$j]->strand('query')
  1019. if $blast_app =~ /^blastx/; # Exception
  1020. next if $ith_strand ne $jth_strand;
  1021. # Remove overlaps
  1022. if (
  1023. ( ($ith_start >= $jth_start)
  1024. && ($ith_start <= $jth_end)
  1025. && ((($jth_end - $ith_start) / ($ith_end - $ith_start)) >
  1026. 0.8)
  1027. )
  1028. # (($ith_start >= $jth_start) && ($ith_start <= $jth_end) && (((2 * $ith_end - $ith_start - $jth_end) / ($ith_end - $ith_start)) > 0.8))
  1029. || ( ($ith_end >= $jth_start)
  1030. && ($ith_end <= $jth_end)
  1031. && ($ith_end >= $jth_end)
  1032. && (
  1033. ( (2 * $jth_end - $jth_start - $ith_end) /
  1034. ($jth_end - $jth_start)
  1035. ) > 0.8
  1036. )
  1037. )
  1038. ) {
  1039. if ($ith_score > $jth_score) {
  1040. $all_hsps[$j]->remove_tag('cluster');
  1041. $all_hsps[$j]->add_tag_value('cluster', 0);
  1042. }
  1043. else {
  1044. $all_hsps[$i]->remove_tag('cluster');
  1045. $all_hsps[$i]->add_tag_value('cluster', 0);
  1046. }
  1047. }
  1048. }
  1049. }
  1050. return 1;
  1051. }
  1052. sub autoload {
  1053. my ($self,$param) = @_;
  1054. my $object = $param->{'autoload'};
  1055. my ($id, $species, $db) = $object =~ /^Blast (\S+) against (\S+) (\S+)/;
  1056. return unless $id && $db && $species;
  1057. my ($obj) = $self->ace_dsn->fetch(CDS => $id);
  1058. ($obj) = $self->ace_dsn->fetch(Sequence => $id) unless $obj;
  1059. return unless $obj;
  1060. $param->{'search_type'} = 'blast';
  1061. if ($db =~ /protein/) {
  1062. $param->{'blast_app'} = 'blastp';
  1063. $param->{'query_sequence'} = $obj->asPeptide;
  1064. $param->{'query_type'} = 'prot';
  1065. $param->{'database'} = lc($species.'_'.$db);
  1066. }
  1067. else {
  1068. $param->{'blast_app'} = 'blastn';
  1069. $param->{'query_sequence'} = $obj->asDNA;
  1070. $param->{'query_type'} = 'nucl';
  1071. $param->{'database'} = lc($species.'_'.$db);
  1072. }
  1073. }
  1074. 1;