PageRenderTime 73ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 1ms

/scripts/batch_ltrfinder.pl

http://dawg-paws.googlecode.com/
Perl | 1305 lines | 899 code | 195 blank | 211 comment | 42 complexity | 7c4e445dc5288d163469c5e9f3c9b652 MD5 | raw file
  1. #!/usr/bin/perl -w
  2. #-----------------------------------------------------------+
  3. # |
  4. # batch_ltrfinder.pl - Run ltr_finder in batch mode |
  5. # |
  6. #-----------------------------------------------------------+
  7. # |
  8. # AUTHOR: James C. Estill |
  9. # CONTACT: JamesEstill_@_gmail.com |
  10. # STARTED: 10/03/2007 |
  11. # UPDATED: 10/03/2007 |
  12. # |
  13. # DESCRIPTION: |
  14. # Run the ltr_finder program in batch mode. |
  15. # |
  16. # VERSION: $Rev$ |
  17. # |
  18. # LICENSE: |
  19. # GNU General Public License, Version 3 |
  20. # http://www.gnu.org/licenses/gpl.html |
  21. # |
  22. #-----------------------------------------------------------+
  23. # TO DO: -Reset vals to null and only print output when expected vals
  24. # are not null
  25. # -No gff output if no hits in ltr_finder
  26. # -Add possiblity to extract sequence data
  27. package DAWGPAWS;
  28. #-----------------------------+
  29. # INCLUDES |
  30. #-----------------------------+
  31. use strict;
  32. use Getopt::Long;
  33. #-----------------------------+
  34. # PROGRAM VARIABLES |
  35. #-----------------------------+
  36. my ($VERSION) = q$Rev$ =~ /(\d+)/;
  37. #-----------------------------+
  38. # VARIABLE SCOPE |
  39. #-----------------------------+
  40. my $indir; # Input dir of fasta files
  41. my $outdir; # Base output dir
  42. my $config_file; # Configuration file
  43. # LTR Finder Parameters
  44. my @lf_params = (); # 2d Array to hold the find_ltr parameters
  45. # Booleans
  46. my $quiet = 0;
  47. my $verbose = 0;
  48. my $show_help = 0;
  49. my $show_usage = 0;
  50. my $show_man = 0;
  51. my $show_version = 0;
  52. my $do_gff_convert = 0;
  53. # Counters/Index Vals
  54. my $i = 0; # Array index val
  55. my $file_num = 0; # File number
  56. my $proc_num = 0; # Process number
  57. # Vars needing
  58. my $name_root;
  59. my $gff_dir;
  60. my $ls_out;
  61. my $fl_gff_outpath;
  62. # Some options that would be useful to make ENV options
  63. my $trna_db = $ENV{TRNA_DB};
  64. my $prosite_dir = $ENV{PROSITE_DIR};
  65. my $lf_path = $ENV{LTR_FINDER};
  66. #-----------------------------+
  67. # COMMAND LINE OPTIONS |
  68. #-----------------------------+
  69. my $ok = GetOptions(# REQUIRED OPTIONS
  70. "i|indir=s" => \$indir,
  71. "o|outdir=s" => \$outdir,
  72. "c|config=s" => \$config_file,
  73. # ADDITIONAL OPTIONS
  74. "ltr-finder=s" => \$lf_path, # LTR Finder Path
  75. "s|trna-db=s" => \$trna_db, # s as per program
  76. "a|prosite=s" => \$prosite_dir, # a as per program
  77. "g|gff" => \$do_gff_convert,
  78. "q|quiet" => \$quiet,
  79. "verbose" => \$verbose,
  80. # ADDITIONAL INFORMATION
  81. "usage" => \$show_usage,
  82. "version" => \$show_version,
  83. "man" => \$show_man,
  84. "h|help" => \$show_help,);
  85. #-----------------------------+
  86. # SHOW REQUESTED HELP |
  87. #-----------------------------+
  88. if ($show_usage) {
  89. print_help("");
  90. }
  91. if ($show_help || (!$ok) ) {
  92. print_help("full");
  93. }
  94. if ($show_version) {
  95. print "\n$0:\nVersion: $VERSION\n\n";
  96. exit;
  97. }
  98. if ($show_man) {
  99. # User perldoc to generate the man documentation.
  100. system("perldoc $0");
  101. exit($ok ? 0 : 2);
  102. }
  103. # Throw error if required options not paseed
  104. # Throw error if required arguments not present
  105. if ( (!$indir) || (!$outdir) || (!$config_file) ) {
  106. print "\a";
  107. print STDERR "ERROR: An input directory must be specified" if !$indir;
  108. print STDERR "ERROR: An output directory must be specified" if !$outdir;
  109. print STDERR "ERROR: A config file must be specified" if !$config_file;
  110. print_help("full");
  111. exit;
  112. }
  113. #-----------------------------------------------------------+
  114. # MAIN PROGRAM BODY |
  115. #-----------------------------------------------------------+
  116. #-----------------------------+
  117. # CHECK FOR SLASH IN DIR |
  118. # VARIABLES |
  119. #-----------------------------+
  120. # If the indir does not end in a slash then append one
  121. # TO DO: Allow for backslash
  122. unless ($indir =~ /\/$/ ) {
  123. $indir = $indir."/";
  124. }
  125. unless ($outdir =~ /\/$/ ) {
  126. $outdir = $outdir."/";
  127. }
  128. #-----------------------------+
  129. # CREATE THE OUT DIR |
  130. # IF IT DOES NOT EXIST |
  131. #-----------------------------+
  132. unless (-e $outdir) {
  133. print "Creating output dir ...\n" if $verbose;
  134. mkdir $outdir ||
  135. die "Could not create the output directory:\n$outdir";
  136. }
  137. #-----------------------------+
  138. # LOAD THE CONFIG FILE |
  139. #-----------------------------+
  140. $i=0;
  141. my $config_line_num=0;
  142. open (CONFIG, "<$config_file") ||
  143. die "ERROR Can not open the config file:\n $config_file";
  144. while (<CONFIG>) {
  145. chomp;
  146. $config_line_num++;
  147. unless (m/^\#/) {
  148. my @in_line = split (/\t/); # Implicit split of $_ by tab
  149. my $num_in_line = @in_line;
  150. # Can have just a name for default
  151. # or can have two columns with additional parameter options
  152. if ($num_in_line < 3) {
  153. $lf_params[$i][0] = $in_line[0] || "NULL"; # Name
  154. $lf_params[$i][1] = $in_line[1] || ""; # Suffix
  155. # $lf_params[$i][2] = $in_line[2] || "NULL"; # MIN-MEM-DIST
  156. # $lf_params[$i][3] = $in_line[3] || "NULL"; # MAX-MEM-DIST
  157. # $lf_params[$i][4] = $in_line[4] || "NULL"; # MAX-MEM-GAP
  158. # $lf_params[$i][5] = $in_line[5] || "NULL"; # MIN-LEN-LTR
  159. # $lf_params[$i][6] = $in_line[6] || "NULL"; # MAX-LEN-LTR
  160. # $lf_params[$i][7] = $in_line[7] || "NULL"; # RANGE-BIN
  161. # $lf_params[$i][8] = $in_line[8] || "NULL"; # MIN LEN ORF
  162. # $lf_params[$i][9] = $in_line[9] || "NULL"; # MAX E Value HMM
  163. $i++;
  164. } # End of if $num_in_line is 10
  165. else {
  166. print "\a";
  167. print STDERR "WARNING: Unexpected number of line in config".
  168. " file line $config_line_num\n$config_file\n";
  169. }
  170. } # End of unless comment line
  171. } # End of while CONFIG file
  172. close CONFIG;
  173. # Number of parameter sets specified in the config file
  174. my $num_par_sets = $i;
  175. if ($num_par_sets == 0) {
  176. print "\a";
  177. print "ERROR: No parameter sets were found in the config file:\n".
  178. "$config_file\n";
  179. exit;
  180. }
  181. #-----------------------------+
  182. # Get the FASTA files from the|
  183. # directory provided by the |
  184. # var $indir |
  185. #-----------------------------+
  186. opendir( DIR, $indir ) ||
  187. die "Can't open directory:\n$indir";
  188. my @fasta_files = grep /\.fasta$|\.fa$/, readdir DIR ;
  189. closedir( DIR );
  190. my $num_files = @fasta_files;
  191. if ($num_files == 0) {
  192. print "\a";
  193. print "ERROR: No input fasta files were found in the directroy:\n$indir\n";
  194. print "Fasta files must end with the fasta or fa extension\n";
  195. exit;
  196. }
  197. my $num_proc_total = $num_files * $num_par_sets;
  198. print STDERR "$num_proc_total find_ltr runs to process\n" if $verbose;
  199. for my $ind_file (@fasta_files) {
  200. $file_num++;
  201. # Get root file name
  202. if ($ind_file =~ m/(.*)\.fasta$/ ) {
  203. $name_root = "$1";
  204. }
  205. elsif ($ind_file =~ m/(.*)\.fa$/ ) {
  206. $name_root = "$1";
  207. }
  208. else {
  209. $name_root = "UNDEFINED";
  210. }
  211. # The following added for temp work with gff output
  212. # 09/28/2007
  213. # if ($file_num == 5) {exit;}
  214. # print "Processing: $name_root\n";
  215. #-----------------------------+
  216. # CREATE ROOT NAME DIR |
  217. #-----------------------------+
  218. my $name_root_dir = $outdir.$name_root."/";
  219. unless (-e $name_root_dir) {
  220. mkdir $name_root_dir ||
  221. die "Could not create dir:\n$name_root_dir\n"
  222. }
  223. #-----------------------------+
  224. # CREATE LTR_FINDER OUTDIR |
  225. #-----------------------------+
  226. # Dir to hold gene prediction output from local software
  227. my $ltrfinder_dir = $name_root_dir."ltr_finder/";
  228. unless (-e $ltrfinder_dir) {
  229. mkdir $ltrfinder_dir ||
  230. die "Could not create ltr_finder out dir:\n$ltrfinder_dir\n";
  231. }
  232. #-----------------------------+
  233. # CREATE GFF OUTDIR |
  234. #-----------------------------+
  235. if ($do_gff_convert) {
  236. $gff_dir = $name_root_dir."gff/";
  237. unless (-e $gff_dir) {
  238. mkdir $gff_dir ||
  239. die "Could not create genscan out dir:\n$gff_dir\n";
  240. }
  241. }
  242. #-----------------------------------------------------------+
  243. # RUN LTR_FINDER FOR EACH SET OF PARAM VALS IN CONFIG FILE |
  244. #-----------------------------------------------------------+
  245. for ($i=0; $i<$num_par_sets; $i++) {
  246. $proc_num++;
  247. # Load parameters from array refs to name vars
  248. my $lf_gff_suffix = $lf_params[$i][0]; # Name of the parameter set
  249. my $lf_cmd_suffix = $lf_params[$i][1]; # Name of the parameter set
  250. print STDERR "#\n#------------------------------------\n" if $verbose;
  251. print STDERR "#Processing: $name_root $lf_gff_suffix\n" if $verbose;
  252. print STDERR "#------------------------------------\n" if $verbose;
  253. # lf_out is path of the ltrfinder output file
  254. my $lf_inseqpath = $indir.$ind_file;
  255. my $lf_out = $ltrfinder_dir.$name_root."_".$lf_gff_suffix.
  256. "_ltr_finder.txt";
  257. my $lf_gff_outpath = $gff_dir.$name_root."_".$lf_gff_suffix.
  258. "_ltr_finder.gff";
  259. my $lf_cmd = "$lf_path $lf_inseqpath".
  260. " -s $trna_db".
  261. " -a $prosite_dir".
  262. " $lf_cmd_suffix".
  263. " > $lf_out";
  264. print STDERR "#\tCMD:$lf_cmd\n" if $verbose;
  265. system ($lf_cmd);
  266. #-----------------------------+
  267. # CONVERT OUTPUT TO GFF |
  268. #-----------------------------+
  269. if ( (-e $lf_out) && ($do_gff_convert)) {
  270. ltrfinder2gff ( $name_root, $lf_out, $lf_gff_outpath,
  271. 0, $lf_gff_suffix);
  272. }
  273. }
  274. } # End of for each individual fasta file
  275. exit;
  276. #-----------------------------------------------------------+
  277. # SUBFUNCTIONS |
  278. #-----------------------------------------------------------+
  279. sub print_help {
  280. # Print requested help or exit.
  281. # Options are to just print the full
  282. my ($opt) = @_;
  283. my $usage = "USAGE:\n".
  284. "MyProg.pl -i InFile -o OutFile";
  285. my $args = "REQUIRED ARGUMENTS:\n".
  286. " --infile # Path to the input file\n".
  287. " --outfile # Path to the output file\n".
  288. "\n".
  289. "OPTIONS::\n".
  290. " --version # Show the program version\n".
  291. " --usage # Show program usage\n".
  292. " --help # Show this help message\n".
  293. " --man # Open full program manual\n".
  294. " --quiet # Run program with minimal output\n";
  295. if ($opt =~ "full") {
  296. print "\n$usage\n\n";
  297. print "$args\n\n";
  298. }
  299. else {
  300. print "\n$usage\n\n";
  301. }
  302. exit;
  303. }
  304. sub ltrfinder2gff {
  305. # seq_id could be extracted from the ltrfinder result
  306. # but passing it to the subfunction directly allows for cases
  307. # where the id assigned by ltr_struc differs from the way
  308. # the user is referring to the assembly
  309. # MAY WANT TO ALLOW FOR USING THE $ls_seq_id
  310. my ($seq_id, $lf_infile, $gffout, $do_append, $gff_suffix) = @_;
  311. # The gff src id
  312. my $gff_src = "ltr_finder:".$gff_suffix;
  313. my $gff_str_out; # A single out string line of gff out
  314. # LTF FINDER COUTNERS/INDICES
  315. my $lf_id_num = 0; # Incremented for every predicted model
  316. # LTR_FINDER BOOLEANS
  317. my $in_emp_details = 0; # Exact match pairs details
  318. my $in_ltr_align = 0; # Details of the LTR alignment
  319. my $in_pbs_align = 0;
  320. my $in_ppt;
  321. #
  322. my $lf_prog_name; # LTR Finder program name
  323. my $lf_seq_id; # Seq id
  324. my $lf_seq_len; # Sequence length
  325. my $lf_version; # Version of LTR finder being parsed
  326. my $lf_trna_db; # tRNA database used
  327. my $lf_strand; # Strand of the LTR
  328. my $lf_span_start; # Location start
  329. my $lf_span_end; # Location end
  330. my $lf_length; # Length
  331. my $lf_score; # Score
  332. my $lf_ltr_similarity; # Similarity of the LTRs
  333. # Status strings
  334. my $has_5ltr_tg; # TG in 5' END of 5' LTR
  335. my $has_5ltr_ca; # CA in 3' END of 5' LTR
  336. my $has_3ltr_tg; # TG in 5' END of 3' LTR
  337. my $has_3ltr_ca; # CA in 3' END of 3' LTR
  338. my $has_tsr; # Has Target Site Replication
  339. my $has_pbs; # Has Primer Binding Site
  340. my $has_ppt; # Has Poly Purine Tract
  341. my $has_rt; # Has Reverse Transcriptase
  342. my $has_in_core; # Has Integrase Core
  343. my $has_in_cterm; # Has Integrase C-term
  344. my $has_rh; # Has RNAseH
  345. my $lf_ltr_id; # Id number assigned to the LTR retrotransposon
  346. # LTR COORDINATES
  347. my $lf_5ltr_start;
  348. my $lf_5ltr_end;
  349. my $lf_5ltr_len;
  350. my $lf_3ltr_start;
  351. my $lf_3ltr_end;
  352. my $lf_3ltr_len;
  353. # TSR COORDINATES
  354. my $lf_5tsr_start; # Start of the 5' TSR
  355. my $lf_5tsr_end; # End of the 5' TSR
  356. my $lf_3tsr_start; # Start of the 3' TSR
  357. my $lf_3tsr_end; # End of the 3' TSR
  358. my $lf_tsr_string; # String of bases in the TSR
  359. # BOUNDARY SHARPNESS
  360. my $lf_sharp_5; # Sharpness of 5' Boundary
  361. my $lf_sharp_3; # Sharpness of 3' Boundary
  362. # PBS
  363. my $lf_pbs_num_match; # Number of matched bases in the PBS
  364. my $lf_pbs_aln_len; # PBS alignment length
  365. my $lf_pbs_start; # Start of the PBS signal
  366. my $lf_pbs_end; # End of the PBS signal
  367. my $lf_pbs_trna; # PBS tRNA type and anti-codon
  368. # PPT
  369. my $lf_ppt_num_match; # Number of matched based in the PPT
  370. my $lf_ppt_aln_len; # PPT alignment length
  371. my $lf_ppt_start; # Start of the PPT
  372. my $lf_ppt_end; # End of the PPT
  373. #-----------------------------+
  374. # DOMAIN DATA |
  375. #-----------------------------+
  376. # GENERAL DOMAIN VARS
  377. my $lf_domain_dom_start;
  378. my $lf_domain_dom_end;
  379. my $lf_domain_orf_start;
  380. my $lf_domain_orf_end;
  381. my $lf_domain_name; # Type of the domain
  382. # INTEGRASE CORE
  383. #my $has_in_core = 0; # Boolean for has integrase core
  384. my $lf_in_core_dom_start;
  385. my $lf_in_core_dom_end;
  386. my $lf_in_core_orf_start;
  387. my $lf_in_core_orf_end;
  388. # INTEGRASE C-TERM
  389. #my $has_in_cterm = 0;
  390. my $lf_in_cterm_dom_start;
  391. my $lf_in_cterm_dom_end;
  392. my $lf_in_cterm_orf_start;
  393. my $lf_in_cterm_orf_end;
  394. # RNASEH
  395. #my $has_rh = 0;
  396. my $lf_rh_dom_start;
  397. my $lf_rh_dom_end;
  398. my $lf_rh_orf_start;
  399. my $lf_rh_orf_end;
  400. # RT
  401. #my $has_rt = 0;
  402. my $lf_rt_dom_start;
  403. my $lf_rt_dom_end;
  404. my $lf_rt_orf_start;
  405. my $lf_rt_orf_end;
  406. #-----------------------------+
  407. # OPEN GFF OUTFILE |
  408. #-----------------------------+
  409. if ($do_append) {
  410. open (GFFOUT, ">>$gffout") ||
  411. die "ERROR: Can not open gff outfile:\n $gffout\n";
  412. }
  413. else {
  414. open (GFFOUT,">$gffout") ||
  415. die "ERROR: Can not open gff outfile:\n $gffout\n";
  416. }
  417. #-----------------------------+
  418. # OPEN INPUT FILE |
  419. #-----------------------------+
  420. open (INFILE, "<$lf_infile") ||
  421. die "ERROR: Can not open LTR_FINDER result file\n $lf_infile\n";
  422. while (<INFILE>) {
  423. chomp;
  424. # print $_."\n";
  425. # CHECK BOOLEANS
  426. #
  427. if (m/Nothing Header(.*)/) {
  428. }
  429. #///////////////////////////////////////
  430. # PAY ATTENTION BELOW THIS MAY BE THE
  431. # PLACE TO PRINT OUT SAVED GFF DATA
  432. #\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
  433. # IN NEW REC, GET ID
  434. elsif (m/^\[(.*)\]/) {
  435. #-----------------------------+
  436. # PRINT STORED GFF OUTPUT |
  437. #-----------------------------+
  438. unless ($1 == 1 ) {
  439. # Two alternatives here
  440. # keep appending to gff_lout and then print set
  441. # or print line at a time .. currently printing
  442. # a line at a time. JCE 10/02/2007
  443. # FULL SPAN
  444. $gff_str_out = "$lf_seq_id\t". # Seq ID
  445. "$gff_src\t". # Source
  446. "LTR_retrotransposon\t". # Data type
  447. "$lf_span_start\t". # Start
  448. "$lf_span_end\t". # End
  449. "$lf_score\t". # Score
  450. "$lf_strand\t". # Strand
  451. ".\t". # Frame
  452. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  453. print STDOUT $gff_str_out;
  454. print GFFOUT $gff_str_out;
  455. if ($has_pbs) {
  456. $gff_str_out = "$lf_seq_id\t". # Seq ID
  457. "$gff_src\t". # Source
  458. "primer_binding_site\t". # Data type
  459. "$lf_pbs_start\t" . # Start
  460. "$lf_pbs_end\t". # End
  461. "$lf_score\t". # Score
  462. "$lf_strand\t". # Strand
  463. ".\t". # Frame
  464. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  465. print STDOUT $gff_str_out;
  466. print GFFOUT $gff_str_out;
  467. }
  468. if ($has_ppt) {
  469. $gff_str_out = "$lf_seq_id\t". # Seq ID
  470. # print STDOUT "$lf_seq_id\t". # Seq ID
  471. "$gff_src\t". # Source
  472. "RR_tract\t". # Data type
  473. "$lf_ppt_start\t". # Start
  474. "$lf_ppt_end\t". # End
  475. "$lf_score\t". # Score
  476. "$lf_strand\t". # Strand
  477. ".\t". # Frame
  478. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  479. print STDOUT $gff_str_out;
  480. print GFFOUT $gff_str_out;
  481. }
  482. $gff_str_out = "$lf_seq_id\t". # Seq ID
  483. # print STDOUT "$lf_seq_id\t". # Seq ID
  484. "$gff_src\t". # Source
  485. "five_prime_LTR\t". # Data type
  486. "$lf_5ltr_start\t". # Start
  487. "$lf_5ltr_end\t". # End
  488. "$lf_score\t". # Score
  489. "$lf_strand\t". # Strand
  490. ".\t". # Frame
  491. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  492. print STDOUT $gff_str_out;
  493. print GFFOUT $gff_str_out;
  494. $gff_str_out = "$lf_seq_id\t". # Seq ID
  495. # print STDOUT "$lf_seq_id\t". # Seq ID
  496. "$gff_src\t". # Source
  497. "three_prime_LTR\t". # Data type
  498. "$lf_3ltr_start\t". # Start
  499. "$lf_3ltr_end\t". # End
  500. "$lf_score\t". # Score
  501. "$lf_strand\t". # Strand
  502. ".\t". # Frame
  503. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  504. print STDOUT $gff_str_out;
  505. print GFFOUT $gff_str_out;
  506. if ($has_tsr) {
  507. $gff_str_out = "$lf_seq_id\t". # Seq ID
  508. # print STDOUT "$lf_seq_id\t". # Seq ID
  509. "$gff_src\t". # Source
  510. "target_site_duplication\t". # Data type
  511. "$lf_5tsr_start\t". # Start
  512. "$lf_5tsr_end\t". # End
  513. "$lf_score\t". # Score
  514. "$lf_strand\t". # Strand
  515. ".\t". # Frame
  516. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  517. print STDOUT $gff_str_out;
  518. print GFFOUT $gff_str_out;
  519. $gff_str_out = "$lf_seq_id\t". # Seq ID
  520. "$gff_src\t". # Source
  521. "target_site_duplication\t". # Data type
  522. "$lf_3tsr_start\t". # Start
  523. "$lf_3tsr_end\t". # End
  524. "$lf_score\t". # Score
  525. "$lf_strand\t". # Strand
  526. ".\t". # Frame
  527. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  528. print STDOUT $gff_str_out;
  529. print GFFOUT $gff_str_out;
  530. }
  531. # Integrase Core
  532. if ($has_in_core) {
  533. #/////////
  534. # NOT SONG
  535. #\\\\\\\\\
  536. $gff_str_out = "$lf_seq_id\t". # Seq ID
  537. "$gff_src\t". # Source
  538. "integrase_core_domain\t". # Data type
  539. "$lf_in_core_dom_start\t". # Start
  540. "$lf_in_core_dom_end\t". # End
  541. "$lf_score\t". # Score
  542. "$lf_strand\t". # Strand
  543. ".\t". # Frame
  544. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  545. print STDOUT $gff_str_out;
  546. print GFFOUT $gff_str_out;
  547. #/////////
  548. # NOT SONG
  549. #\\\\\\\\\
  550. $gff_str_out = "$lf_seq_id\t". # Seq ID
  551. "$gff_src\t". # Source
  552. "integrase_core_orf\t". # Data type
  553. "$lf_in_core_orf_start\t". # Start
  554. "$lf_in_core_orf_end\t". # End
  555. "$lf_score\t". # Score
  556. "$lf_strand\t". # Strand
  557. ".\t". # Frame
  558. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  559. print STDOUT $gff_str_out;
  560. print GFFOUT $gff_str_out;
  561. } # End of has in_core
  562. if ($has_in_cterm) {
  563. #/////////
  564. # NOT SONG
  565. #\\\\\\\\\
  566. $gff_str_out = "$lf_seq_id\t". # Seq ID
  567. "$gff_src\t". # Source
  568. "integrase_cterm_domain\t". # Data type
  569. "$lf_in_cterm_dom_start\t". # Start
  570. "$lf_in_cterm_dom_end\t". # End
  571. "$lf_score\t". # Score
  572. "$lf_strand\t". # Strand
  573. ".\t". # Frame
  574. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  575. print STDOUT $gff_str_out;
  576. print GFFOUT $gff_str_out;
  577. $gff_str_out = "$lf_seq_id\t". # Seq ID
  578. "$gff_src\t". # Source
  579. "integrase_cterm_orf\t". # Data type
  580. "$lf_in_cterm_orf_start\t". # Start
  581. "$lf_in_cterm_orf_end\t". # End
  582. "$lf_score\t". # Score
  583. "$lf_strand\t". # Strand
  584. ".\t". # Frame
  585. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  586. print STDOUT $gff_str_out;
  587. print GFFOUT $gff_str_out;
  588. } # End of has_in_cterm
  589. if ($has_rh) {
  590. #/////////
  591. # NOT SONG
  592. #\\\\\\\\\
  593. $gff_str_out = "$lf_seq_id\t". # Seq ID
  594. "$gff_src\t". # Source
  595. "rnaseh_domain\t". # Data type
  596. "$lf_rh_dom_start\t". # Start
  597. "$lf_rh_dom_end\t". # End
  598. "$lf_score\t". # Score
  599. "$lf_strand\t". # Strand
  600. ".\t". # Frame
  601. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  602. print STDOUT $gff_str_out;
  603. print GFFOUT $gff_str_out;
  604. $gff_str_out = "$lf_seq_id\t". # Seq ID
  605. "$gff_src\t". # Source
  606. "rnaseh_orf\t". # Data type
  607. "$lf_rh_orf_start\t". # Start
  608. "$lf_rh_orf_end\t". # End
  609. "$lf_score\t". # Score
  610. "$lf_strand\t". # Strand
  611. ".\t". # Frame
  612. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  613. print STDOUT $gff_str_out;
  614. print GFFOUT $gff_str_out;
  615. }
  616. if ($has_rt) {
  617. #/////////
  618. # NOT SONG
  619. #\\\\\\\\\
  620. $gff_str_out = "$lf_seq_id\t". # Seq ID
  621. "$gff_src\t". # Source
  622. "rt_domain\t". # Data type
  623. "$lf_rt_dom_start\t". # Start
  624. "$lf_rt_dom_end\t". # End
  625. "$lf_score\t". # Score
  626. "$lf_strand\t". # Strand
  627. ".\t". # Frame
  628. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  629. print STDOUT $gff_str_out;
  630. print GFFOUT $gff_str_out;
  631. $gff_str_out = "$lf_seq_id\t". # Seq ID
  632. "$gff_src\t". # Source
  633. "rt_orf\t". # Data type
  634. "$lf_rt_orf_start\t". # Start
  635. "$lf_rt_orf_end\t". # End
  636. "$lf_score\t". # Score
  637. "$lf_strand\t". # Strand
  638. ".\t". # Frame
  639. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  640. print STDOUT $gff_str_out;
  641. print GFFOUT $gff_str_out;
  642. } # End of has_reverse_transcriptase
  643. } # End of unless this is the first record
  644. #-----------------------------+
  645. # RESET VARS TO NULL |
  646. #-----------------------------+
  647. # May not need to reset vars since existence of the vars
  648. # is designated by the do_* variables.
  649. #-----------------------------+
  650. # LOAD ID VAR |
  651. #-----------------------------+
  652. $lf_ltr_id = $1;
  653. }
  654. # SEQ ID AND LENGTH
  655. elsif (m/>Sequence: (.*) Len:(.*)/){
  656. $lf_seq_id = $1;
  657. $lf_seq_len = $2;
  658. }
  659. # SPAN LOCATION, LENGTH, AND STRAND
  660. elsif (m/^Location : (\d*) - (\d*) Len: (\d*) Strand:(.)/){
  661. $lf_span_start = $1;
  662. $lf_span_end = $2;
  663. $lf_length = $3;
  664. $lf_strand = $4;
  665. }
  666. # SCORE SIMILARITY
  667. elsif (m/^Score : (.*) \[LTR region similarity:(.*)\]/){
  668. $lf_score = $1;
  669. $lf_ltr_similarity = $2;
  670. }
  671. # STATUS SET
  672. elsif (m/^Status : (\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)/){
  673. # Since this is a binary string, it can be split as digits
  674. # and used to load the $has_* booleans
  675. $has_5ltr_tg = $1;
  676. $has_5ltr_ca = $2;
  677. $has_3ltr_tg = $3;
  678. $has_3ltr_ca = $4;
  679. $has_tsr = $5;
  680. $has_pbs = $6;
  681. $has_ppt = $7;
  682. $has_rt = $8;
  683. $has_in_core = $9;
  684. $has_in_cterm = $10;
  685. $has_rh = $11;
  686. }
  687. # 5' LTR
  688. elsif (m/^5\'-LTR : (\d*) - (\d*) Len: (\d*)/){
  689. $lf_5ltr_start = $1;
  690. $lf_5ltr_end = $2;
  691. $lf_5ltr_len = $3;
  692. }
  693. # 3' LTR
  694. elsif (m/^3\'-LTR : (\d*) - (\d*) Len: (\d*)/){
  695. $lf_3ltr_start = $1;
  696. $lf_3ltr_end = $2;
  697. $lf_3ltr_len = $3;
  698. }
  699. # TARGET SITE REPLICATION
  700. elsif (m/TSR : (\d*) - (\d*) , (\d*) - (\d*) \[(.*)\]/){
  701. $lf_5tsr_start = $1;
  702. $lf_5tsr_end = $2;
  703. $lf_3tsr_start = $3;
  704. $lf_3tsr_end = $4;
  705. $lf_tsr_string = $5;
  706. }
  707. # SHARPNESS METRIC
  708. elsif (m/^Sharpness: (.*),(.*)/){
  709. $lf_sharp_5 = $1;
  710. $lf_sharp_3 = $2;
  711. }
  712. # PBS
  713. elsif (m/PBS : \[(\d*)\/(\d*)\] (\d*) - (\d*) \((.*)\)/) {
  714. $lf_pbs_num_match = $1;
  715. $lf_pbs_aln_len = $2;
  716. $lf_pbs_start = $3;
  717. $lf_pbs_end = $4;
  718. $lf_pbs_trna = $5;
  719. }
  720. # PPT
  721. elsif (m/PPT : \[(\d*)\/(\d*)\] (\d*) - (\d*)/) {
  722. $lf_ppt_num_match = $1;
  723. $lf_ppt_aln_len = $2;
  724. $lf_ppt_start = $3;
  725. $lf_ppt_end = $4;
  726. }
  727. # PROTEIN DOMAINS
  728. # This will need to be modified and checked after another run
  729. # using ps_scan to get the additional domains
  730. #
  731. #Domain: 56796 - 57326 [possible ORF:56259-59144, (IN (core))]
  732. elsif (m/Domain: (\d*) - (\d*) \[possible ORF:(\d*)-(\d*), \((.*)\)\]/) {
  733. $lf_domain_dom_start = $1;
  734. $lf_domain_dom_end = $2;
  735. $lf_domain_orf_start = $3;
  736. $lf_domain_orf_end = $4;
  737. $lf_domain_name = $5;
  738. # Temp while I work with this data
  739. #print "DOMAIN:".$lf_domain_name."\n";
  740. if ($lf_domain_name =~ 'IN \(core\)') {
  741. $lf_in_core_dom_start = $lf_domain_dom_start;
  742. $lf_in_core_dom_end = $lf_domain_dom_end;
  743. $lf_in_core_orf_start = $lf_domain_orf_start;
  744. $lf_in_core_orf_end = $lf_domain_orf_end;
  745. # Temp while debug
  746. # This is to check the regexp vars can be fetched here
  747. #print "\tDom Start: $lf_in_core_dom_start\n";
  748. #print "\tDom End: $lf_in_core_dom_end\n";
  749. #print "\tOrf Start: $lf_in_core_orf_start\n";
  750. #print "\tOrf End: $lf_in_core_orf_end\n";
  751. }
  752. elsif ($lf_domain_name =~ 'IN \(c-term\)') {
  753. $lf_in_cterm_dom_start = $lf_domain_dom_start;
  754. $lf_in_cterm_dom_end = $lf_domain_dom_end;
  755. $lf_in_cterm_orf_start = $lf_domain_orf_start;
  756. $lf_in_cterm_orf_end = $lf_domain_orf_end;
  757. }
  758. elsif ($lf_domain_name =~ 'RH') {
  759. $lf_rh_dom_start = $lf_domain_dom_start;
  760. $lf_rh_dom_end = $lf_domain_dom_end;
  761. $lf_rh_orf_start = $lf_domain_orf_start;
  762. $lf_rh_orf_end = $lf_domain_orf_end;
  763. }
  764. elsif ($lf_domain_name =~ 'RT') {
  765. $lf_rt_dom_start = $lf_domain_dom_start;
  766. $lf_rt_dom_end = $lf_domain_dom_end;
  767. $lf_rt_orf_start = $lf_domain_orf_start;
  768. $lf_rt_orf_end = $lf_domain_orf_end;
  769. }
  770. else {
  771. print "\a";
  772. print STDERR "Unknown domain type: $lf_domain_name\n";
  773. }
  774. } # End of elsif Domain
  775. #-----------------------------+
  776. # FILE HEADER INFORMATION |
  777. #-----------------------------+
  778. # PROGRAM NAME
  779. elsif (m/^Program : (.*)/) {
  780. $lf_prog_name = $1;
  781. }
  782. # PROGRAM VERSION
  783. elsif (m/^Version : (.*)/) {
  784. $lf_version = $1;
  785. }
  786. }
  787. close INFILE;
  788. #-----------------------------+
  789. # PRINT LAST GFFOUT |
  790. #-----------------------------+
  791. # FULL SPAN
  792. $gff_str_out = "$lf_seq_id\t". # Seq ID
  793. "$gff_src\t". # Source
  794. "LTR_retrotransposon\t". # Data type
  795. "$lf_span_start\t". # Start
  796. "$lf_span_end\t". # End
  797. "$lf_score\t". # Score
  798. "$lf_strand\t". # Strand
  799. ".\t". # Frame
  800. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  801. print STDOUT $gff_str_out;
  802. print GFFOUT $gff_str_out;
  803. if ($has_pbs) {
  804. $gff_str_out = "$lf_seq_id\t". # Seq ID
  805. "$gff_src\t". # Source
  806. "primer_binding_site\t". # Data type
  807. "$lf_pbs_start\t" . # Start
  808. "$lf_pbs_end\t". # End
  809. "$lf_score\t". # Score
  810. "$lf_strand\t". # Strand
  811. ".\t". # Frame
  812. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  813. print STDOUT $gff_str_out;
  814. print GFFOUT $gff_str_out;
  815. }
  816. if ($has_ppt) {
  817. $gff_str_out = "$lf_seq_id\t". # Seq ID
  818. # print STDOUT "$lf_seq_id\t". # Seq ID
  819. "$gff_src\t". # Source
  820. "RR_tract\t". # Data type
  821. "$lf_ppt_start\t". # Start
  822. "$lf_ppt_end\t". # End
  823. "$lf_score\t". # Score
  824. "$lf_strand\t". # Strand
  825. ".\t". # Frame
  826. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  827. print STDOUT $gff_str_out;
  828. print GFFOUT $gff_str_out;
  829. }
  830. $gff_str_out = "$lf_seq_id\t". # Seq ID
  831. # print STDOUT "$lf_seq_id\t". # Seq ID
  832. "$gff_src\t". # Source
  833. "five_prime_LTR\t". # Data type
  834. "$lf_5ltr_start\t". # Start
  835. "$lf_5ltr_end\t". # End
  836. "$lf_score\t". # Score
  837. "$lf_strand\t". # Strand
  838. ".\t". # Frame
  839. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  840. print STDOUT $gff_str_out;
  841. print GFFOUT $gff_str_out;
  842. # Getting an error beow
  843. $gff_str_out = "$lf_seq_id\t". # Seq ID
  844. # print STDOUT "$lf_seq_id\t". # Seq ID
  845. "$gff_src\t". # Source
  846. "three_prime_LTR\t". # Data type
  847. "$lf_3ltr_start\t". # Start
  848. "$lf_3ltr_end\t". # End
  849. "$lf_score\t". # Score
  850. "$lf_strand\t". # Strand
  851. ".\t". # Frame
  852. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  853. print STDOUT $gff_str_out;
  854. print GFFOUT $gff_str_out;
  855. if ($has_tsr) {
  856. $gff_str_out = "$lf_seq_id\t". # Seq ID
  857. # print STDOUT "$lf_seq_id\t". # Seq ID
  858. "$gff_src\t". # Source
  859. "target_site_duplication\t". # Data type
  860. "$lf_5tsr_start\t". # Start
  861. "$lf_5tsr_end\t". # End
  862. "$lf_score\t". # Score
  863. "$lf_strand\t". # Strand
  864. ".\t". # Frame
  865. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  866. print STDOUT $gff_str_out;
  867. print GFFOUT $gff_str_out;
  868. $gff_str_out = "$lf_seq_id\t". # Seq ID
  869. "$gff_src\t". # Source
  870. "target_site_duplication\t". # Data type
  871. "$lf_3tsr_start\t". # Start
  872. "$lf_3tsr_end\t". # End
  873. "$lf_score\t". # Score
  874. "$lf_strand\t". # Strand
  875. ".\t". # Frame
  876. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  877. print STDOUT $gff_str_out;
  878. print GFFOUT $gff_str_out;
  879. }
  880. # Integrase Core
  881. if ($has_in_core) {
  882. #/////////
  883. # NOT SONG
  884. #\\\\\\\\\
  885. $gff_str_out = "$lf_seq_id\t". # Seq ID
  886. "$gff_src\t". # Source
  887. "integrase_core_domain\t". # Data type
  888. "$lf_in_core_dom_start\t". # Start
  889. "$lf_in_core_dom_end\t". # End
  890. "$lf_score\t". # Score
  891. "$lf_strand\t". # Strand
  892. ".\t". # Frame
  893. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  894. print STDOUT $gff_str_out;
  895. print GFFOUT $gff_str_out;
  896. #/////////
  897. # NOT SONG
  898. #\\\\\\\\\
  899. $gff_str_out = "$lf_seq_id\t". # Seq ID
  900. "$gff_src\t". # Source
  901. "integrase_core_orf\t". # Data type
  902. "$lf_in_core_orf_start\t". # Start
  903. "$lf_in_core_orf_end\t". # End
  904. "$lf_score\t". # Score
  905. "$lf_strand\t". # Strand
  906. ".\t". # Frame
  907. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  908. print STDOUT $gff_str_out;
  909. print GFFOUT $gff_str_out;
  910. } # End of has in_core
  911. if ($has_in_cterm) {
  912. #/////////
  913. # NOT SONG
  914. #\\\\\\\\\
  915. $gff_str_out = "$lf_seq_id\t". # Seq ID
  916. "$gff_src\t". # Source
  917. "integrase_cterm_domain\t". # Data type
  918. "$lf_in_cterm_dom_start\t". # Start
  919. "$lf_in_cterm_dom_end\t". # End
  920. "$lf_score\t". # Score
  921. "$lf_strand\t". # Strand
  922. ".\t". # Frame
  923. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  924. print STDOUT $gff_str_out;
  925. print GFFOUT $gff_str_out;
  926. $gff_str_out = "$lf_seq_id\t". # Seq ID
  927. "$gff_src\t". # Source
  928. "integrase_cterm_orf\t". # Data type
  929. "$lf_in_cterm_orf_start\t". # Start
  930. "$lf_in_cterm_orf_end\t". # End
  931. "$lf_score\t". # Score
  932. "$lf_strand\t". # Strand
  933. ".\t". # Frame
  934. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  935. print STDOUT $gff_str_out;
  936. print GFFOUT $gff_str_out;
  937. } # End of has_in_cterm
  938. if ($has_rh) {
  939. #/////////
  940. # NOT SONG
  941. #\\\\\\\\\
  942. $gff_str_out = "$lf_seq_id\t". # Seq ID
  943. "$gff_src\t". # Source
  944. "rnaseh_domain\t". # Data type
  945. "$lf_rh_dom_start\t". # Start
  946. "$lf_rh_dom_end\t". # End
  947. "$lf_score\t". # Score
  948. "$lf_strand\t". # Strand
  949. ".\t". # Frame
  950. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  951. print STDOUT $gff_str_out;
  952. print GFFOUT $gff_str_out;
  953. $gff_str_out = "$lf_seq_id\t". # Seq ID
  954. "$gff_src\t". # Source
  955. "rnaseh_orf\t". # Data type
  956. "$lf_rh_orf_start\t". # Start
  957. "$lf_rh_orf_end\t". # End
  958. "$lf_score\t". # Score
  959. "$lf_strand\t". # Strand
  960. ".\t". # Frame
  961. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  962. print STDOUT $gff_str_out;
  963. print GFFOUT $gff_str_out;
  964. }
  965. if ($has_rt) {
  966. #/////////
  967. # NOT SONG
  968. #\\\\\\\\\
  969. $gff_str_out = "$lf_seq_id\t". # Seq ID
  970. "$gff_src\t". # Source
  971. "rt_domain\t". # Data type
  972. "$lf_rt_dom_start\t". # Start
  973. "$lf_rt_dom_end\t". # End
  974. "$lf_score\t". # Score
  975. "$lf_strand\t". # Strand
  976. ".\t". # Frame
  977. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  978. print STDOUT $gff_str_out;
  979. print GFFOUT $gff_str_out;
  980. $gff_str_out = "$lf_seq_id\t". # Seq ID
  981. "$gff_src\t". # Source
  982. "rt_orf\t". # Data type
  983. "$lf_rt_orf_start\t". # Start
  984. "$lf_rt_orf_end\t". # End
  985. "$lf_score\t". # Score
  986. "$lf_strand\t". # Strand
  987. ".\t". # Frame
  988. "ltr_finder_$lf_ltr_id\n"; # Retro ID
  989. print STDOUT $gff_str_out;
  990. print GFFOUT $gff_str_out;
  991. } # End of has_reverse_transcriptase
  992. close GFFOUT;
  993. } # End of ltrfinder2gff subfunction
  994. =head1 NAME
  995. Name.pl - Short program description.
  996. =head1 VERSION
  997. This documentation refers to program version 0.1
  998. =head1 SYNOPSIS
  999. USAGE:
  1000. Name.pl -i InFile -o OutFile
  1001. --infile # Path to the input file
  1002. --outfie # Path to the output file
  1003. =head1 DESCRIPTION
  1004. This is what the program does
  1005. =head1 COMMAND LINE ARGUMENTS
  1006. =head 2 Required Arguments
  1007. =over 2
  1008. =item -i,--infile
  1009. Path of the input file.
  1010. =item -o,--outfile
  1011. Path of the output file.
  1012. =back
  1013. =head1 Additional Options
  1014. =over
  1015. =over 2
  1016. =item --usage
  1017. Short overview of how to use program from command line.
  1018. =item --help
  1019. Show program usage with summary of options.
  1020. =item --version
  1021. Show program version.
  1022. =item --man
  1023. Show the full program manual. This uses the perldoc command to print the
  1024. POD documentation for the program.
  1025. =item -q,--quiet
  1026. Run the program with minimal output.
  1027. =back
  1028. =head1 DIAGNOSTICS
  1029. The list of error messages that can be generated,
  1030. explanation of the problem
  1031. one or more causes
  1032. suggested remedies
  1033. list exit status associated with each error
  1034. =head1 CONFIGURATION AND ENVIRONMENT
  1035. Names and locations of config files
  1036. environmental variables
  1037. or properties that can be set.
  1038. =head1 DEPENDENCIES
  1039. Other modules or software that the program is dependent on.
  1040. =head1 BUGS AND LIMITATIONS
  1041. Any known bugs and limitations will be listed here.
  1042. =head1 LICENSE
  1043. GNU LESSER GENERAL PUBLIC LICENSE
  1044. http://www.gnu.org/licenses/lgpl.html
  1045. =head1 AUTHOR
  1046. James C. Estill E<lt>JamesEstill at gmail.comE<gt>
  1047. =head1 HISTORY
  1048. STARTED: 10/02/2007
  1049. UPDATED: 10/02/2007
  1050. VERSION: $Rev$
  1051. =cut
  1052. #-----------------------------------------------------------+
  1053. # HISTORY |
  1054. #-----------------------------------------------------------+
  1055. #
  1056. # 10/02/2007
  1057. # - Started program from general template
  1058. # - Added the ltrfinder2gff subfunction from the
  1059. # cnv_ltrfinder2gff program.