PageRenderTime 87ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/code_lic/en_gbk2gbk_map2gen.pl

http://research-code-base-animesh.googlecode.com/
Perl | 415 lines | 357 code | 22 blank | 36 comment | 35 complexity | ab3b0dd42e3680677dc67656efb96564 MD5 | raw file
  1. # This program is free software: you can redistribute it and/or modify
  2. # it under the terms of the GNU General Public License as published by
  3. # the Free Software Foundation, either version 3 of the License, or
  4. # (at your option) any later version.
  5. #
  6. # This program is distributed in the hope that it will be useful,
  7. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. # GNU General Public License for more details.
  10. #
  11. # You should have received a copy of the GNU General Public License
  12. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. #
  14. # Code base of Animesh Sharma [ sharma.animesh@gmail.com ]
  15. #!/usr/bin/perl
  16. use Bio::SeqIO;
  17. use strict;
  18. my $main_file_pattern=shift @ARGV;chomp $main_file_pattern;
  19. my $other_file_pattern=shift @ARGV;chomp $other_file_pattern;
  20. my $l_utr=500;
  21. my $l_dutr=500;
  22. my $sim_thresh=90;
  23. my $blast_thresh=90;
  24. my $blast_len_thresh=0.50;
  25. my $blast_len_thresh=0.01;
  26. my $n_other_source;
  27. my %other_source_sequence_name;
  28. my %other_source_sequence;
  29. my $tfl1;
  30. my $tfl2;
  31. my @other_file;
  32. my $total_seq_cnt_threshold=100;
  33. my $n_gene_threshold=100;
  34. my $total_seq_cnt_threshold=100;
  35. my $n_gene_threshold=100;
  36. my $total_seq_cnt;
  37. my $total_file_name="total.fas";
  38. my $total_file_name_utr="total.utr.fas";
  39. my $total_file_name_dutr="total.dutr.fas";
  40. my $foobr="total_blastreport_utr.txt";
  41. my $n_gene;
  42. my %h_seq_name;
  43. my %h_seq;
  44. my %h_seq_utr;
  45. my %h_seq_dutr;
  46. my %total_seq_name;
  47. my %total_seq;
  48. my %total_seq_utr;
  49. my %total_seq_dutr;
  50. my $other_total_seq_cnt;
  51. my $other_total_file_name="other_total.fas";
  52. my $other_total_file_name_utr="other_total.utr.fas";
  53. my $other_total_file_name_dutr="other_total.dutr.fas";
  54. system("ls -1 $other_file_pattern*.gb* > tempfile1");
  55. open(FT2,"tempfile1");
  56. while($tfl2=<FT2>){
  57. chomp $tfl2;
  58. if($tfl2=~/gb$|gbk$/){
  59. push(@other_file,$tfl2);
  60. }
  61. }
  62. close FT2;
  63. system("ls -1 $main_file_pattern*.gb* > tempfile1");
  64. open(FT1,"tempfile1");
  65. open(FT,">temp.txt");
  66. open(BLASTREP,">$foobr");
  67. open(FTTL,">$total_file_name");
  68. open(FTTLUTR,">$total_file_name_utr");
  69. open(FTTLDUTR,">$total_file_name_dutr");
  70. open(OFTTL,">$other_total_file_name");
  71. open(OFTTLUTR,">$other_total_file_name_utr");
  72. open(OFTTLDUTR,">$other_total_file_name_dutr");
  73. while($tfl1=<FT1>){
  74. chomp $tfl1;
  75. if($tfl1=~/asn1$/){
  76. my @temp=split(/\./,$tfl1);
  77. my $foo=@temp[0].".gb";
  78. system("asn2gb.Win32 -i $tfl1 -o $foo -f b");
  79. select_utr($foo);
  80. #print "$tfl1\n";
  81. }
  82. else{
  83. my $foo=$tfl1;;
  84. select_utr($foo);
  85. }
  86. }
  87. close FT1;
  88. close FTTL;
  89. close FT;
  90. system("rm -rf tempfile1");
  91. sub select_utr{
  92. my $gb_file=shift;
  93. my $seqio_object = Bio::SeqIO->new(-file => $gb_file, '-format' => 'GenBank');
  94. my $seq_object = $seqio_object->next_seq;
  95. my $select_utr_no;
  96. print "$gb_file\n";
  97. for my $feat_object ($seq_object->get_SeqFeatures) {
  98. if ($feat_object->primary_tag eq "CDS") {
  99. #if ($feat_object->primary_tag eq "gene") {
  100. my $start = $feat_object->location->start;
  101. my $end = $feat_object->location->end;
  102. my $strand = $feat_object->location->strand;$strand+=0;
  103. my $seq = $feat_object->spliced_seq->seq;
  104. my $sequence_string = $feat_object->entire_seq->seq;
  105. my $seq_utr;my $seq_tag;my $seq_dutr;
  106. my $l_seq_complete=length($sequence_string);
  107. my $l_seq=length($seq);
  108. my $seq_name;
  109. my $al_utr;
  110. my $al_dutr;
  111. my @product_name=$feat_object->get_tag_values('product');
  112. if(@product_name[0]=~/hypothetical/){
  113. #print $feat_object->get_tag_values('product');
  114. #print "\t";
  115. next;
  116. }
  117. for my $tag ($feat_object->get_all_tags) {
  118. if(($tag eq "translation") or ($tag eq "codon_start")){
  119. next;
  120. }
  121. else{
  122. for my $value ($feat_object->get_tag_values($tag)){
  123. $seq_name.="$value ";
  124. }
  125. }
  126. }
  127. #print $feat_object->spliced_seq->seq,"\n";
  128. #$n_gene++;
  129. #$select_utr_no++;
  130. if(($strand == -1) && (($end+$l_utr)<$l_seq_complete) && (($start-$l_dutr)>0)){
  131. $n_gene++;
  132. $select_utr_no++;
  133. print "R->$n_gene\t$start-$end-$l_seq_complete-$l_utr-$l_dutr [$strand]\t";
  134. $seq_utr = substr($sequence_string,$end,$l_utr);
  135. $seq_utr=reverse($seq_utr);
  136. $seq_utr=~tr/ATGC/TACG/d;
  137. $seq_dutr = substr($sequence_string,($start-$l_dutr-1),$l_dutr);
  138. $seq_dutr=reverse($seq_dutr);
  139. $seq_dutr=~tr/ATGC/TACG/d;
  140. $al_utr=length($seq_utr);
  141. $al_dutr=length($seq_dutr);
  142. $seq_name.="$start-$end($l_seq) $strand UTR ($al_dutr-$al_utr)";
  143. $seq_name="$gb_file ($select_utr_no-$n_gene)".$seq_name;
  144. $h_seq_name{$n_gene}=$seq_name;
  145. $h_seq{$n_gene}=$seq;
  146. $h_seq_utr{$n_gene}=$seq_utr;
  147. $h_seq_dutr{$n_gene}=$seq_dutr;
  148. }
  149. elsif(($strand == 1) && (($start-$l_utr)>0) && (($end+$l_dutr)<$l_seq_complete)){
  150. $n_gene++;
  151. $select_utr_no++;
  152. print "F->$n_gene\t$start-$end-$l_seq_complete-$l_utr-$l_dutr [$strand]\t";
  153. $seq_utr = substr($sequence_string,($start-$l_utr-1),$l_utr);
  154. $al_utr=length($seq_utr);
  155. $seq_dutr = substr($sequence_string,$end,$l_dutr);
  156. $al_dutr=length($seq_dutr);
  157. $seq_name.="$start-$end($l_seq) $strand UTR ($al_utr-$al_dutr)";
  158. $seq_name="$gb_file ($select_utr_no-$n_gene)".$seq_name;
  159. $h_seq_name{$n_gene}=$seq_name;
  160. $h_seq{$n_gene}=$seq;
  161. $h_seq_utr{$n_gene}=$seq_utr;
  162. $h_seq_dutr{$n_gene}=$seq_dutr;
  163. }
  164. if($n_gene > $n_gene_threshold){
  165. print "\nTotal Gene, $n_gene, is > then $n_gene_threshold\n";
  166. select_utr_threshold($gb_file);
  167. die;
  168. }
  169. }
  170. }
  171. print "\n";
  172. }
  173. sub select_utr_threshold{
  174. my $gb_file=shift;
  175. my @t=split(/\./,$gb_file);
  176. my $gb_file_out=@t[0].".utr.txt";
  177. #open(FW,">$gb_file_out");
  178. foreach my $o (sort {$a <=> $b} keys %h_seq_name) {
  179. my $per_sim; my $max;my $max_seq_name;
  180. foreach my $i (sort {$a <=> $b} keys %h_seq_name){
  181. if($i<$o){
  182. open(F1,">file1");
  183. open(F2,">file2");
  184. print F1">$h_seq_name{$i}\n$h_seq{$i}\n";
  185. print F2">$h_seq_name{$o}\n$h_seq{$o}\n";
  186. print "Aligning seq $o and seq $i with ";
  187. system("needle file1 file2 -gapopen=10 -gapext=0.5 -outfile=file3");
  188. open(FN,"file3");
  189. my $line;
  190. while($line=<FN>){
  191. @t=split(/\s+/,$line);
  192. if(@t[1] eq "Identity:"){
  193. @t=split(/\(|\)/,$line);
  194. @t[1]=~s/\%|\s+//g;
  195. $per_sim=@t[1]+0;
  196. if($max<$per_sim){
  197. $max=$per_sim;
  198. $max_seq_name=$i;
  199. }
  200. print FT"$i\t$o\t$per_sim\t$max\n";
  201. print "$i\t$o\t$per_sim\t$max\n";
  202. }
  203. }
  204. close FN;
  205. close F1;
  206. close F2;
  207. }
  208. #print "\n";
  209. if($max>$sim_thresh){last;}
  210. }
  211. print FT"\n$o - $max_seq_name - $max - $per_sim - $sim_thresh\n";
  212. if(($total_seq_cnt >= $total_seq_cnt_threshold)){
  213. my $other_seq_no;
  214. foreach (@other_file){
  215. $other_seq_no++;
  216. print "$_\t$other_seq_no\n";
  217. get_other_source($_,$other_seq_no);
  218. }
  219. other_seq_comp($other_file_pattern,1);
  220. die"Total UTR Seq Count, $total_seq_cnt, reached $total_seq_cnt_threshold\n";
  221. }
  222. if(($max<$sim_thresh)){
  223. print FT"\t$o-$max_seq_name-$max-$per_sim\n";
  224. $total_seq_cnt++;
  225. #print FTTLUTR">$h_seq_name{$o}\n$h_seq_utr{$o}\n";
  226. #print FTTLDUTR">$h_seq_name{$o}\n$h_seq_dutr{$o}\n";
  227. #print FTTL">$h_seq_name{$o}\n$h_seq{$o}\n";
  228. $total_seq_name{$total_seq_cnt}=$h_seq_name{$o};
  229. $total_seq_utr{$total_seq_cnt}=$h_seq_utr{$o};
  230. $total_seq_dutr{$total_seq_cnt}=$h_seq_dutr{$o};
  231. $total_seq{$total_seq_cnt}=$h_seq{$o};
  232. next;
  233. }
  234. }
  235. }
  236. sub get_other_source{
  237. my $foofile=shift;
  238. my $foofileno=shift;
  239. print "$foofile File Number $foofileno\n";
  240. my $seqio_object = Bio::SeqIO->new(-file => $foofile, '-format' => 'GenBank');
  241. my $seq_object = $seqio_object->next_seq;
  242. for my $feat_object ($seq_object->get_SeqFeatures) {
  243. if ($feat_object->primary_tag eq "source") {
  244. my $start = $feat_object->location->start;
  245. my $end = $feat_object->location->end;
  246. my $sequence = $feat_object->entire_seq->seq;
  247. my $length_sequence=length($sequence);
  248. my $seq_name;
  249. for my $tag ($feat_object->get_all_tags) {
  250. for my $value ($feat_object->get_tag_values($tag)){
  251. $seq_name.="$value ";
  252. }
  253. }
  254. $n_other_source++;
  255. $seq_name.="$start-$end($length_sequence)";
  256. $seq_name="$foofile ($n_other_source)".$seq_name;
  257. $other_source_sequence_name{$n_other_source}=$seq_name;
  258. $other_source_sequence{$n_other_source}=$sequence;
  259. }
  260. }
  261. }
  262. sub other_seq_comp{
  263. my $foofile=shift;
  264. my $foonumber=shift;
  265. print "$foofile File Number $foonumber\n";
  266. my $foonw=$foofile."_".$foonumber."_total_other_utr_nw.txt";
  267. my $foosw=$foofile."_".$foonumber."_total_other_utr_sw.txt";
  268. my $fooms=$foofile."_".$foonumber."_total_other_utr_ms.txt";
  269. my $foobs=$foofile."_".$foonumber."_total_other_utr_bs.txt";
  270. my $fooeg=$foofile."_".$foonumber."_total_other_utr_eg.txt";
  271. open(FWOUTRN,">$foonw");
  272. open(FWOUTRS,">$foosw");
  273. open(FWOUTRM,">$fooms");
  274. open(FWOUTRB,">$foobs");
  275. open(FWOUTRE,">$fooeg");
  276. foreach my $o (sort {$a <=> $b} keys %total_seq) {
  277. foreach my $i (sort {$a <=> $b} keys %other_source_sequence){
  278. #if($i==$o){
  279. #if($other_source_sequence_name{$i} =~ /dispar102e08/){
  280. #true_string_match($o,$i);
  281. #water_string_match($o,$i);
  282. #needle_string_match($o,$i);
  283. #blast2seq($o,$i,$foofile);
  284. map2gen($o,$i,$foofile);
  285. #}
  286. #}
  287. }
  288. }
  289. close FWOUTRN;
  290. close FWOUTRM;
  291. close FWOUTRB;
  292. close FWOUTRS;
  293. close FWOUTRE;
  294. }
  295. sub map2gen{
  296. my $o=shift;
  297. my $i=shift;
  298. my $max_seq_name;
  299. my $max=90;
  300. my $seq_o=$total_seq{$o};
  301. my $seq_i=$other_source_sequence{$i};
  302. my $other_sequence_string=$seq_i;
  303. $seq_i=~s/\-/N/g;
  304. $seq_o=~s/\-/N/g;
  305. my $seq_o_name=$total_seq_name{$o};
  306. my $seq_i_name=$other_source_sequence_name{$i};
  307. my $seq_o_length=length($total_seq{$o});
  308. my $seq_i_length=length($other_source_sequence{$i});
  309. my $ol_seq_complete=$seq_i_length;
  310. open(F1,">file1");
  311. open(F2,">file2");
  312. print F1">$seq_o_name\n$seq_o\n";
  313. print F2">$seq_i_name\n$seq_i\n";
  314. print "Aligning seq $o and seq $i with ";
  315. system("est2genome file1 file2 -outfile=file3");
  316. open(FN,"file3");
  317. my $length;
  318. my $lnoeg;
  319. my @tnote;
  320. my @t;
  321. my $length;
  322. my $per_sim;
  323. my $other_start;
  324. my $other_end;
  325. while(my $line=<FN>){
  326. chomp $line;
  327. $lnoeg++;
  328. if(($lnoeg==1) and ($line=~/^Note/)){
  329. @tnote=split(/\s+/,$line);
  330. }
  331. if($line=~/^Span/){
  332. @t=split(/\s+/,$line);
  333. $length=@t[1];
  334. $per_sim=@t[2]+0;
  335. $other_start=@t[3]+0;
  336. $other_end=@t[4]+0;
  337. print FWOUTRE"$o-$i $seq_o_name\t$seq_i_name\t$per_sim\t$seq_o_length\t$seq_i_length\tLength-$length\tPer-$per_sim\tS-$other_start\tE-$other_end\n$line\n";
  338. }
  339. }
  340. close FN;
  341. close F1;
  342. close F2;
  343. if($per_sim>$sim_thresh and $seq_o_length==$length){
  344. if($other_total_seq_cnt>=$total_seq_cnt_threshold){die"Other Sequence count has reached $other_total_seq_cnt";}
  345. if((@tnote[5] eq "reversed") && (($other_end+$l_utr)<$ol_seq_complete) && (($other_start-$l_dutr)>0)){
  346. $other_total_seq_cnt++;
  347. my $oseq = substr($other_sequence_string,($other_start-1),($other_end-$other_start+1));
  348. $oseq=reverse($oseq);
  349. $oseq=~tr/ATGC/TACG/d;
  350. my $oseq_utr = substr($other_sequence_string,$other_end,$l_utr);
  351. $oseq_utr=reverse($oseq_utr);
  352. $oseq_utr=~tr/ATGC/TACG/d;
  353. my $oseq_dutr = substr($other_sequence_string,($other_start-$l_dutr-1),$l_dutr);
  354. $oseq_dutr=reverse($oseq_dutr);
  355. $oseq_dutr=~tr/ATGC/TACG/d;
  356. my $oal_utr=length($oseq_utr);
  357. my $oal_dutr=length($oseq_dutr);
  358. my $oal=length($oseq);
  359. my $oseq_name_utr.="$seq_i_name $other_start-$other_end UTR [$oal_utr]";
  360. my $oseq_name_dutr.="$seq_i_name $other_start-$other_end DUTR [$oal_utr]";
  361. print "R->$other_total_seq_cnt\t$other_start-$other_end-$ol_seq_complete-$l_utr-$l_dutr [@tnote[5]]\t";
  362. print OFTTL">$seq_i_name\t$other_start-$other_end\t$ol_seq_complete [$oal]\n$oseq\n";
  363. print OFTTLUTR">$oseq_name_utr\n$oseq_utr\n";
  364. print OFTTLDUTR">$oseq_name_dutr\n$oseq_dutr\n";
  365. print FTTLUTR">$total_seq_name{$o}\tUTR [$l_utr]\n$total_seq_utr{$o}\n";
  366. print FTTLDUTR">$total_seq_name{$o}\tDUTR [$l_dutr]\n$total_seq_dutr{$o}\n";
  367. print FTTL">$total_seq_name{$o}\n$total_seq{$o}\n";
  368. last;
  369. return;
  370. }
  371. elsif((@tnote[5] eq "forward") && (($other_start-$l_utr)>0) && (($other_end+$l_dutr)<$ol_seq_complete)){
  372. $other_total_seq_cnt++;
  373. my $oseq = substr($other_sequence_string,($other_start-1),($other_end-$other_start+1));
  374. my $oseq_utr = substr($other_sequence_string,($other_start-$l_utr-1),$l_utr);
  375. my $oseq_dutr = substr($other_sequence_string,$other_end,$l_dutr);
  376. my $oal_utr=length($oseq_utr);
  377. my $oal_dutr=length($oseq_dutr);
  378. my $oal=length($oseq);
  379. my $oseq_name_utr.="$seq_i_name $other_start-$other_end UTR [$oal_utr]";
  380. my $oseq_name_dutr.="$seq_i_name $other_start-$other_end DUTR [$oal_dutr]";
  381. print "F->$other_total_seq_cnt\t$other_start-$other_end-$ol_seq_complete-$l_utr-$l_dutr [@tnote[5]]\t";
  382. print OFTTL">$seq_i_name\t$other_start-$other_end\t$ol_seq_complete [$oal]\n$oseq\n";
  383. print OFTTLUTR">$oseq_name_utr\n$oseq_utr\n";
  384. print OFTTLDUTR">$oseq_name_dutr\n$oseq_dutr\n";
  385. print FTTLUTR">$total_seq_name{$o}\tUTR [$l_utr]\n$total_seq_utr{$o}\n";
  386. print FTTLDUTR">$total_seq_name{$o}\tDUTR [$l_dutr]\n$total_seq_dutr{$o}\n";
  387. print FTTL">$total_seq_name{$o}\n$total_seq{$o}\n";
  388. last;
  389. return;
  390. }
  391. }
  392. #return($max,$max_seq_name);
  393. }