PageRenderTime 53ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/batch_mask.pl

http://dawg-paws.googlecode.com/
Perl | 1264 lines | 1136 code | 34 blank | 94 comment | 13 complexity | c6914b895e7116833e3b6949de14a570 MD5 | raw file
  1. #!/usr/bin/perl -w
  2. #-----------------------------------------------------------+
  3. # |
  4. # batch_mask.pl - Run RepeatMasker in batch mode |
  5. # |
  6. #-----------------------------------------------------------+
  7. # |
  8. # AUTHOR: James C. Estill |
  9. # CONTACT: JamesEstill_at_sourceforge.net |
  10. # STARTED: 04/10/2006 |
  11. # UPDATED: 09/11/2007 |
  12. # |
  13. # DESCRIPTION: |
  14. # Runs the RepeatMasker program for a set of input |
  15. # FASTA files against a set of repeat library files & |
  16. # then converts the repeat masker *.out file into the |
  17. # GFF format and then to the game XML format for |
  18. # visualization by the Apollo genome anotation program. |
  19. # |
  20. # USAGE: |
  21. # batch_mask.pl -i InDir -o OutDir -c ConfigFile.txt |
  22. # |
  23. # batch_mask.pl --man |
  24. # |
  25. # LICENSE: |
  26. # GNU General Public License, Version 3 |
  27. # http://www.gnu.org/licenses/gpl.html |
  28. # |
  29. #-----------------------------------------------------------+
  30. package DAWGPAWS;
  31. print "\n";
  32. #-----------------------------+
  33. # INCLUDES |
  34. #-----------------------------+
  35. use strict;
  36. use File::Copy;
  37. use Getopt::Long;
  38. #-----------------------------+
  39. # PROGRAM VARIABLES |
  40. #-----------------------------+
  41. my ($VERSION) = q$Rev: 248 $ =~ /(\d+)/;
  42. #//////////////////////
  43. my $file_num_max = 1;
  44. #\\\\\\\\\\\\\\\\\\\\\\
  45. #-----------------------------------------------------------+
  46. # VARIABLE SCOPE |
  47. #-----------------------------------------------------------+
  48. # VARS WITH DEFAULT VALUES
  49. my $engine = "crossmatch";
  50. # GENERAL PROGRAM VARS
  51. my @inline; # Parse of an input line
  52. my $msg; # Message printed to the log file
  53. # DIR PATHS
  54. my $indir; # Directory containing the seq files to process
  55. my $outdir; # Directory to hold the output
  56. my $bac_out_dir; # Dir for each sequence being masked
  57. my $bac_rep_out_dir; # Dir to hold BAC repeat masker output
  58. # $bac_out_dir/rm
  59. my $gff_out_dir; # Dir for the gff output
  60. # $bac_out_dir/gff
  61. # FILE PATHS
  62. my $logfile; # Path to a logfile to log error info
  63. my $rm_path; # Full path to the repeatmasker binary
  64. my $ap_path; # Full path to the apollo program
  65. my $rep_db_path; # Path to an indivual repeat database
  66. my $file_to_mask; # The fasta file to be masked
  67. my $repmask_outfile; # Repeat masked outfile
  68. my $repmask_catfile; # Concatenated repeat masked outfile
  69. my $config_file; # Full path to the configuration file
  70. # This includes the db names and paths
  71. # of the fasta files to use for masking
  72. my $repmask_cat_cp; # Path to the copy of the RepMask
  73. my $search_name; # Name searched for in grep command
  74. my $name_root; # Root name to be used for output etc
  75. my $repmask_log;
  76. my $repmask_log_cp;
  77. my $repmask_tbl_file;
  78. my $repmask_masked_file;
  79. my $gff_alldb_out;
  80. my $gff_el_out;
  81. my $xml_alldb_out;
  82. my $xml_el_out;
  83. #my $repmask_cat_cp;
  84. # FINAL FILE LOCATIONS
  85. my $repmask_masked_cp; # Masked fasta file
  86. my $repmask_local_cp; # Copy of masked in fasta file
  87. my $repmask_tbl_cp; # Repmask Table
  88. my $repmask_el_cp; # Individual
  89. my $repmask_xml_el_cp;
  90. my $repmask_out_cp; # Repmask out file copy
  91. # REPEAT DB VARS
  92. my $rep_db_name; # Name of the repeat database
  93. my $ind_lib; # Vars for an individual library
  94. # BOOLEANS
  95. my $show_help = 0; # Show program help
  96. my $show_version = 0; # Show program version
  97. my $show_man = 0; # Show program manual page using peldoc
  98. my $show_usage = 0; # Show program usage command
  99. my $quiet = 0; # Boolean for reduced output to STOUT
  100. my $apollo = 0; # Path to apollo and apollo variables
  101. my $test = 0;
  102. my $verbose = 0;
  103. my $debug = 0; # Run the program in debug mode
  104. # PROGRAM COMMAND STRINGS
  105. my $cmd_repmask; # Command to run RepeatMasker
  106. my $cmd_make_gff_db; # Make the gff file for an individual database
  107. my $cmd_make_gff_el; # Appears to not be used
  108. # COUNTERS AND INDEX VARS
  109. my $num_proc = 1; # Number of processors to use
  110. my $i; # Used to index the config file
  111. my $proc_num = 0; # Counter for processes
  112. my $file_num = 0; # Counter for fasta files being processed
  113. # ARRAYS
  114. my @mask_libs = ();
  115. #-----------------------------+
  116. # COMMAND LINE OPTIONS |
  117. #-----------------------------+
  118. my $ok = GetOptions(
  119. # REQUIRED ARGUMENTS
  120. "i|indir=s" => \$indir,
  121. "o|outdir=s" => \$outdir,
  122. "c|config=s", => \$config_file,
  123. # ADDITIONAL OPTIONS
  124. "rm-path=s" => \$rm_path,
  125. "ap-path=s", => \$ap_path,
  126. "logfile=s" => \$logfile,
  127. "p|num-proc=s" => \$num_proc,
  128. "engine=s" => \$engine,
  129. # BOOLEANS
  130. "apollo" => \$apollo,
  131. "verbose" => \$verbose,
  132. "debug" => \$debug,
  133. "test" => \$test,
  134. "usage" => \$show_usage,
  135. "version" => \$show_version,
  136. "man" => \$show_man,
  137. "h|help" => \$show_help,
  138. "q|quiet" => \$quiet,);
  139. my $bac_parent_dir = $outdir;
  140. my ( @rep_libs );
  141. #-----------------------------+
  142. # SHOW REQUESTED HELP |
  143. #-----------------------------+
  144. if ($show_usage) {
  145. print_help("");
  146. }
  147. if ($show_man) {
  148. # User perldoc to generate the man documentation.
  149. system ("perldoc $0");
  150. exit($ok ? 0 : 2);
  151. }
  152. if ($show_help || (!$ok) ) {
  153. print_help("full");
  154. }
  155. if ($show_version) {
  156. print "\nbatch_mask.pl:\n".
  157. "Version: $VERSION\n\n";
  158. exit;
  159. }
  160. # Show full help when required options
  161. # are not present
  162. if ( (!$indir) || (!$outdir) || (!$config_file) ) {
  163. print "\a";
  164. print "ERROR: An input directory was not specified with the -i flag\n"
  165. if !$indir;
  166. print "ERROR: An output directory was not specified with the -o flag\n"
  167. if !$outdir;
  168. print "ERROR: A config file was not specified with the -c flag\n"
  169. if !$config_file;
  170. print_help("full");
  171. }
  172. #-----------------------------+
  173. # OPEN THE LOG FILE |
  174. #-----------------------------+
  175. if ($logfile) {
  176. # Open file for appending
  177. open ( LOG, ">>$logfile" ) ||
  178. die "Can not open logfile:\n$logfile\n";
  179. my $time_now = time;
  180. print LOG "==================================\n";
  181. print LOG " batch_mask.pl\n";
  182. print LOG " JOB: $time_now\n";
  183. print LOG "==================================\n";
  184. }
  185. #-----------------------------+
  186. # CHECK FOR SLASH IN DIR |
  187. # VARIABLES |
  188. #-----------------------------+
  189. # If the indir does not end in a slash then append one
  190. # TO DO: Allow for backslash
  191. unless ($indir =~ /\/$/ ) {
  192. $indir = $indir."/";
  193. }
  194. unless ($outdir =~ /\/$/ ) {
  195. $outdir = $outdir."/";
  196. }
  197. #-----------------------------+
  198. # Get the FASTA files from the|
  199. # directory provided by the |
  200. # var $indir |
  201. #-----------------------------+
  202. opendir( DIR, $indir ) ||
  203. die "Can't open directory:\n$indir";
  204. my @fasta_files = grep /\.fasta$|\.fa$/, readdir DIR ;
  205. closedir( DIR );
  206. #-----------------------------+
  207. # REPEAT LIBRARY INFORMATION |
  208. #-----------------------------+
  209. open (CONFIGFILE, "<$config_file")
  210. || die "Could not open the config file:\n$config_file\n";
  211. $i = 0;
  212. my $config_line_num = 0;
  213. print "Parsing config file...\n" if $verbose;
  214. while (<CONFIGFILE>) {
  215. chomp;
  216. $config_line_num++;
  217. unless (m/^\#/) {
  218. # Split input by tab
  219. my @in_line = split(/\t/, $_);
  220. my $num_in_line = @in_line;
  221. print "\tConfig line $config_line_num\t".
  222. "$num_in_line sections\n" if $verbose;
  223. # Only try to parse the inline if it has the
  224. # expected number of componenets
  225. if ($num_in_line == 2) {
  226. $mask_libs[$i][0] = $in_line[0];
  227. $mask_libs[$i][1] = $in_line[1];
  228. # Only print for debug runs
  229. if ($debug) {
  230. print STDERR "INLINE SPLIT, i:$i \n";
  231. print STDERR "\t\t".$in_line[0]."\n";
  232. print STDERR "\t\t".$in_line[1]."\n";
  233. print "VAL1 IS:";
  234. print $mask_libs[$i][0]."\n";
  235. print "VAL2 IS:";
  236. print $mask_libs[$i][1]."\n";
  237. } # End of print for debug runs
  238. $i++;
  239. }
  240. } # End of unless comment line
  241. } # End of while CONFIGFILE
  242. close CONFIGFILE;
  243. my $num_libs = @mask_libs;
  244. my $num_files = @fasta_files;
  245. my $num_proc_total = $num_libs * $num_files;
  246. #-----------------------------+
  247. # SHOW ERROR IF NO LIBS IN |
  248. # THE CONFIG FILE |
  249. #-----------------------------+
  250. if ($num_libs == 0) {
  251. print "\a";
  252. print STDERR "\nERROR: No library files were indicated in the ".
  253. "config file\n$config_file\n";
  254. exit;
  255. }
  256. #-----------------------------+
  257. # SHOW ERROR IF NO FILES |
  258. # WERE FOUND IN THE INPUT DIR |
  259. #-----------------------------+
  260. if ($num_files == 0) {
  261. print "\a";
  262. print "\nERROR: No fasta files were found in the input directory\n".
  263. "$indir\n".
  264. "Fasta files must have the fasta or fa extension.\n\n";
  265. exit;
  266. }
  267. if ($verbose) {
  268. print STDERR "\n";
  269. print STDERR "NUM FILES: $num_files\n";
  270. print STDERR "NUM LIBS: $num_libs\n";
  271. print STDERR "NUM PROC: $num_proc_total\n";
  272. print STDERR "\n";
  273. }
  274. #-----------------------------+
  275. # SHOW ERROR IF ONE OF THE |
  276. # MASK LIBS DOES NOT EXIST |
  277. #-----------------------------+
  278. print STDERR "Checking mask libs ...\n" if $verbose;
  279. for ($i=0; $i<$num_libs; $i++) {
  280. print "i $i\n";
  281. $rep_db_name = $mask_libs[$i][0];
  282. $rep_db_path = $mask_libs[$i][1];
  283. unless (-e $rep_db_path) {
  284. print "\a";
  285. print "\nERROR: The following masking library could not be found:\n".
  286. $rep_db_path."\n";
  287. exit;
  288. }
  289. }
  290. #-----------------------------+
  291. # CREATE THE OUT DIR |
  292. # IF IT DOES NOT EXIST |
  293. #-----------------------------+
  294. print "Creating output dir ...\n" unless $quiet;
  295. unless (-e $outdir) {
  296. mkdir $outdir ||
  297. die "\aERROR: Could not create the output directory:\n$outdir\n";
  298. }
  299. #-----------------------------+
  300. # RUN REPEAT MASKER AND PARSE |
  301. # RESULTS FOR EACH SEQ IN THE |
  302. # fasta_files ARRAY FOR EACH |
  303. # REPEAT LIBRARY IN THE |
  304. # rep_libs ARRAY |
  305. #-----------------------------+
  306. for my $ind_file (@fasta_files)
  307. {
  308. $file_num++;
  309. # Reset search name to null
  310. $search_name = "";
  311. if ($ind_file =~ m/(.*)\.fasta$/ ) {
  312. $name_root = "$1";
  313. }
  314. elsif ($ind_file =~ m/(.*)\.fasta$/ ) {
  315. $name_root = "$1";
  316. }
  317. else {
  318. $name_root = "UNDEFINED";
  319. }
  320. # The following names are the names as produced by
  321. # RepeatMasker
  322. $file_to_mask = $indir.$ind_file;
  323. $repmask_outfile = $file_to_mask.".out";
  324. $repmask_catfile = $file_to_mask.".cat";
  325. $repmask_tbl_file = $file_to_mask.".tbl";
  326. $repmask_masked_file = $file_to_mask.".masked";
  327. $proc_num++;
  328. print LOG "\n\nProcess $proc_num of $num_proc_total.\n" if $logfile;
  329. print "\n\n+-----------------------------------------------------------+\n"
  330. unless $quiet;
  331. print "| Process $proc_num of $num_proc_total.\n" unless $quiet;
  332. print "+-----------------------------------------------------------+\n"
  333. unless $quiet;
  334. #-----------------------------+
  335. # SHOW BASIC RUN INFO
  336. #-----------------------------+
  337. print "\n";
  338. print "\tINFILE: $ind_file\n";
  339. print "\t ROOT: $name_root\n";
  340. #-----------------------------+
  341. # MAKE OUTPUT DIR |
  342. #-----------------------------+
  343. # The base output dir for the BAC
  344. $bac_out_dir = $outdir.$name_root."/";
  345. mkdir $bac_out_dir, 0777 unless (-e $bac_out_dir);
  346. #-----------------------------+
  347. # MAKE RM OUTPUT DIR |
  348. #-----------------------------+
  349. # dir for the repeat maske output
  350. $bac_rep_out_dir = "$bac_out_dir"."rm/";
  351. mkdir $bac_rep_out_dir, 0777 unless (-e $bac_rep_out_dir);
  352. #-----------------------------+
  353. # MAKE GFF OUTPUT DIR |
  354. #-----------------------------+
  355. $gff_out_dir = "$bac_out_dir"."gff/";
  356. mkdir $gff_out_dir, 0777 unless (-e $gff_out_dir);
  357. #-----------------------------+
  358. # FOR EACH DB IN THE |
  359. # mask_libs ARRAY |
  360. #-----------------------------+
  361. for ($i=0; $i<$num_libs; $i++) {
  362. $rep_db_name = $mask_libs[$i][0];
  363. $rep_db_path = $mask_libs[$i][1];
  364. #-----------------------------+
  365. # GET THE STRING TO SEARCH |
  366. # FOR IN THE RM OUTPUT |
  367. #-----------------------------+
  368. # The name used by Repeat Masker is taken from the FASTA header
  369. # Only the first twenty characters of the FASTA header are used
  370. open (IN, $file_to_mask);
  371. while (<IN>) {
  372. chomp;
  373. if (/^\>(.*)/) {
  374. print "\tFASTA HEADER:\n\t$_\n" if $verbose;
  375. print "\tSEQ_ID:\n\t$1\n" if $verbose;
  376. $search_name = $1;
  377. my $test_len = 20;
  378. my $cur_len = length ($search_name);
  379. if ( $cur_len > $test_len ) {
  380. $search_name = substr ($_, 0, 20);
  381. }
  382. } # End of in fasta header file
  383. }
  384. close IN;
  385. #$gff_el_out = $indir.$rep_db_name."_".$ind_file."_EL.gff";
  386. #$xml_el_out = $indir.$rep_db_name."_".$ind_file."_EL.game.xml";
  387. #$gff_alldb_out = $indir."ALLDB_".$ind_file.".gff";
  388. #$xml_alldb_out = $indir."ALLDB_".$ind_file."game.xml";
  389. # Renamed 09/11/2007
  390. $gff_el_out = $indir.$name_root."_".$rep_db_name.".gff";
  391. $xml_el_out = $indir.$name_root."_".$rep_db_name.".game.xml";
  392. $gff_alldb_out = $indir.$name_root."_ALLDB.gff";
  393. $xml_alldb_out = $indir.$name_root."_ALLDB.game.xml";
  394. if ($rm_path) {
  395. $cmd_repmask = $rm_path.
  396. " -lib ".$rep_db_path.
  397. " -pa ".$num_proc.
  398. " -engine ".$engine.
  399. " -xsmall".
  400. " $file_to_mask";
  401. }
  402. else {
  403. $cmd_repmask = "RepeatMasker".
  404. " -lib ".$rep_db_path.
  405. " -pa ".$num_proc.
  406. " -engine ".$engine.
  407. " -xsmall".
  408. " $file_to_mask";
  409. }
  410. #-----------------------------+
  411. # SHOW THE USER THE COMMANDS |
  412. # THAT WILL BE USED |
  413. #-----------------------------+
  414. if ($verbose) {
  415. print "\n";
  416. print "+-----------------------------+\n";
  417. print "| CONVERT COMMANDS |\n";
  418. print "+-----------------------------+\n";
  419. print "\tSEARCH: ".$search_name."\n";
  420. print "\tOUTFILE: ".$repmask_outfile."\n";
  421. print "\tDB-NAME: ".$rep_db_name."\n";
  422. print "\tGFF-FILE: ".$gff_alldb_out."\n";
  423. print "\n";
  424. print "+-----------------------------+\n";
  425. print "| REPEATMASKER COMMANDS |\n";
  426. print "+-----------------------------+\n";
  427. print "\tLIB-NAME: ".$rep_db_name."\n";
  428. print "\tLIB-PATH: ".$rep_db_path."\n";
  429. print "\tEL-OUT: ".$gff_el_out."\n";
  430. print "\tREPCMD: ".$cmd_repmask."\n";
  431. print "\n\n";
  432. }
  433. #-----------------------------+
  434. # PRINT INFO TO LOG FILE |
  435. #-----------------------------+
  436. if ($logfile) {
  437. print LOG "\tLib Name: ".$rep_db_name."\n";
  438. print LOG "\tLib Path: ".$rep_db_path."\n";
  439. print LOG "\tEL Out: ".$gff_el_out."\n";
  440. print LOG "\tRepCmd: ".$cmd_repmask."\n";
  441. print LOG "\n\n";
  442. }
  443. unless ( $test ) {
  444. $msg = "\nERROR:\n".
  445. "Could not complete system cmd\n$cmd_repmask\n";
  446. system ( $cmd_repmask );
  447. }
  448. unless ( $test ) {
  449. rmout_to_gff($repmask_outfile, $gff_el_out, ">");
  450. }
  451. unless ( $test ) {
  452. rmout_to_gff( $repmask_outfile, $gff_alldb_out, ">>");
  453. }
  454. #-----------------------------+
  455. # CONVERT THE FILES FROM GFF |
  456. # FORMAT TO A MORE USABLE |
  457. # FORMAT FOR APOLLO |
  458. #-----------------------------+
  459. if ($apollo) {
  460. print "\n\n\n\nCONVERTING GFF TO GAME\n\n\n" if $verbose;
  461. &apollo_convert ( $gff_el_out, "gff", $xml_el_out, "game",
  462. $file_to_mask, "none" );
  463. }
  464. #-----------------------------+
  465. # FILES MOVED TO HERE |
  466. #-----------------------------+
  467. $repmask_out_cp = $bac_rep_out_dir.$name_root."_".$rep_db_name.
  468. ".rm.out";
  469. $repmask_cat_cp = $bac_rep_out_dir.$name_root."_".$rep_db_name.
  470. ".rm.cat";
  471. $repmask_tbl_cp = $bac_rep_out_dir.$name_root."_".$rep_db_name.
  472. ".rm.tbl";
  473. $repmask_masked_cp = $bac_rep_out_dir.$name_root."_".$rep_db_name.
  474. ".masked.fasta";
  475. $repmask_el_cp = $bac_rep_out_dir.$name_root."_".$rep_db_name.
  476. ".gff";
  477. $repmask_xml_el_cp = $bac_rep_out_dir.$name_root."_".$rep_db_name.
  478. ".game.xml";
  479. #///////////////////////////////////////
  480. # This is another copy of the masked file
  481. # this will allow me to put all of the masked files in
  482. # a single location and have a shorter name
  483. # These will all be placed in the $outdir
  484. $repmask_local_cp = $outdir.$name_root.".masked.fasta";
  485. #\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
  486. # THE FOLLOWING ADDED 09/28/2006
  487. # REmoved 09/11/2007
  488. # $repmask_log = $indir.$rep_db_name."_".$ind_file.".log";
  489. # $repmask_log_cp = $bac_rep_out_dir.$rep_db_name."_".$ind_file.".log";
  490. # $msg = "\nERRORCan not move\n\t".$repmask_log."\n\t".
  491. # $repmask_log_cp."\n";
  492. # move ( $repmask_log, $repmask_log_cp) ||
  493. # print STDERR $msg;
  494. #-----------------------------+
  495. # MAKE A COPY OF THE MASKED |
  496. # FASTA FILE TO A SINGLE DIR |
  497. #-----------------------------+
  498. $msg = "\nERROR: Can not copy ".$repmask_masked_file." to\n".
  499. $repmask_local_cp."\n";
  500. copy ( $repmask_masked_file, $repmask_local_cp ) ||
  501. print STDERR $msg;
  502. #-----------------------------+
  503. # MOVE THE RM OUTPUT FILES TO |
  504. # THE TARGET DIR |
  505. #-----------------------------+
  506. $msg = "\nERROR: Can not move ".$repmask_outfile.
  507. " to\n ".$repmask_cat_cp."\n";
  508. move ( $repmask_outfile, $repmask_cat_cp ) ||
  509. print STDERR $msg;
  510. $msg = "\nERROR: Can not move ".$repmask_catfile."\n";
  511. move ( $repmask_catfile, $repmask_cat_cp ) ||
  512. print STDERR $msg;
  513. # $msg = "\nERROR: The table file could not be moved from".
  514. # "$repmask_tbl_file to $repmask_tbl_cp";
  515. # move ( $repmask_tbl_file, $repmask_tbl_cp ) ||
  516. # print STDERR $msg;
  517. $msg = "\nERROR: Can not move ".$repmask_masked_file."\n";
  518. move ( $repmask_masked_file, $repmask_masked_cp ) ||
  519. print STDERR $msg;
  520. $msg = "\nERROR: Can not move ".$gff_el_out."\n";
  521. move ( $gff_el_out , $repmask_el_cp ) ||
  522. print STDERR $msg;
  523. if ($apollo) {
  524. $msg = "\nERROR: Can not move ".$xml_el_out."\n";
  525. move ( $xml_el_out, $repmask_xml_el_cp ) ||
  526. print STDERR $msg;
  527. }
  528. } # End of for LibData
  529. #-----------------------------+
  530. # THE APOLLO CONVERT FOR THE |
  531. # ENTIRE SET OF REPEAT LIBS |
  532. # FOR A GIVEN SEQUENCE FILE |
  533. # THAT IS BEING MASKED. |
  534. #-----------------------------+
  535. if ($apollo) {
  536. apollo_convert ( $gff_alldb_out, "gff", $xml_alldb_out , "game",
  537. $file_to_mask, "none" );
  538. }
  539. my $repmask_all_gff_cp = $bac_rep_out_dir.$name_root."_ALLDB.rm.gff";
  540. my $repmask_all_xml_cp = $bac_rep_out_dir.$name_root."_ALLDB.rm.game.xml";
  541. $msg = "\nCan not move ".$gff_alldb_out."\n";
  542. move ( $gff_alldb_out, $repmask_all_gff_cp ) ||
  543. print STDERR $msg;
  544. if ($apollo) {
  545. $msg = "\nCan not move ".$xml_alldb_out."\n";
  546. move ( $xml_alldb_out, $repmask_all_xml_cp ) ||
  547. print STDERR $msg;
  548. }
  549. # TEMP EXIT FOR DEBUG, WIll JUST RUN FIRST FILE TO BE MASKED
  550. if ($debug) {
  551. if ($file_num > $file_num_max ) {
  552. print "\nDebug run finished\n\n";
  553. exit;
  554. }
  555. }
  556. } # End of for each file in the input folder
  557. close LOG if $logfile;
  558. exit;
  559. #-----------------------------------------------------------+
  560. # SUBFUNCTIONS |
  561. #-----------------------------------------------------------+
  562. sub apollo_convert {
  563. #-----------------------------+
  564. # CONVERT AMONG FILE FORMATS |
  565. # USING THE APOLLO PROGRAM |
  566. #-----------------------------+
  567. # Converts among the various data formats that can be used
  568. # from the command line in tbe Apollo program. For example
  569. # can convert GFF format files into the game XML format.
  570. # NOTES:
  571. # - Currently assumes that the input file is in the correct
  572. # coordinate system.
  573. # - GFF files will require a sequence file
  574. # - ChadoDB format will require a db password
  575. # ApPath - the path of dir with the Apollo binary
  576. # Specifying the path will allow for cases
  577. # where the program is not in the PATHS
  578. # ApCmd - the apollo commands to run
  579. print "Converting output to Apollo game.xml format\n"
  580. unless $quiet;
  581. my $InFile = $_[0]; # Input file path
  582. my $InForm = $_[1]; # Output file format:
  583. # game|gff|gb|chadoxml|backup
  584. my $OutFile = $_[2]; # Output file path
  585. my $OutForm = $_[3]; # Ouput file foramt
  586. # chadoDB|game|chadoxml|genbank|gff|backup
  587. my $SeqFile = $_[4]; # The path of the sequence file
  588. # This is only required for GFF foramt files
  589. # When not required this can be passed as na
  590. my $DbPass = $_[5]; # Database password for logging on to the
  591. # chado database for reading or writing.
  592. my ( $ApPath, $ApCmd );
  593. #$ApPath = "/home/jestill/Apps/Apollo/";
  594. $ApPath = "/home/jestill/Apps/Apollo_1.6.5/apollo/bin/apollo";
  595. # Set the base command line. More may need to be added for different
  596. # formats. For example, GFF in requires a sequence file and the CHADO
  597. # format will require a database password.
  598. # $ApPath = "/home/jestill/Apps/Apollo/";
  599. # $ApCmd = $ApPath."Apollo -i ".$InForm." -f ".$InFile.
  600. # " -o ".$OutForm." -w ".$OutFile;
  601. $ApCmd = $ApPath." -i ".$InForm." -f ".$InFile.
  602. " -o ".$OutForm." -w ".$OutFile;
  603. # Make sure that that input output formats are in lowercase
  604. # may need to add something here to avoid converting chadoDB
  605. $InForm = lc($InForm);
  606. $OutForm = lc($OutForm);
  607. # Determine the proper command to use based on the input format
  608. # since GFF file also require a sequence file
  609. if ($InForm =~ "gff" ) {
  610. $ApCmd = $ApCmd." -s ".$SeqFile;
  611. }
  612. if ($InForm =~ "chadodb") {
  613. $ApCmd = $ApCmd." -D ".$DbPass;
  614. }
  615. # Do the apollo command
  616. system ( $ApCmd );
  617. }
  618. sub print_help {
  619. # Print requested help or exit.
  620. # Options are to just print the full
  621. my ($opt) = @_;
  622. my $usage = "USAGE:\n".
  623. " batch_mask.pl -i DirToProcess -o OutDir";
  624. my $args = "REQUIRED ARGUMENTS:\n".
  625. " --indir # Path to the directory containing the sequences\n".
  626. " # to process. The files must have one of the\n".
  627. " # following file extensions:\n".
  628. " # [fasta|fa]\n".
  629. " --outdir # Path to the output directory\n".
  630. " --config # Path to database list config file\n".
  631. "\n".
  632. "ADDITIONAL OPTIONS:\n".
  633. " --rm-path # Full path to repeatmasker binary\n".
  634. " --engine # The repeatmasker engine to use:\n".
  635. " # [crossmatch|wublast|decypher]\n".
  636. " # default is to use crossmatch\n".
  637. " --num-proc # Number of processors to use for RepeatMasker\n".
  638. " # default is one.\n".
  639. " --apollo # Convert output to game.xml using apollo\n".
  640. " # default is not to use apollo\n".
  641. " --quiet # Run program with minimal output\n".
  642. " --test # Run the program in test mode\n".
  643. " --logfile # Path to file to use for logfile\n".
  644. "\n".
  645. "ADDITIONAL INFORMATION\n".
  646. " --version # Show the program version\n".
  647. " --usage # Show program usage\n".
  648. " --help # Show this help message\n".
  649. " --man # Open full program manual\n";
  650. if ($opt =~ "full") {
  651. print "\n$usage\n\n";
  652. print "$args\n\n";
  653. }
  654. else {
  655. print "\n$usage\n\n";
  656. }
  657. exit;
  658. }
  659. sub rmout_to_gff {
  660. # Subfunction to convert repeatmasker out file
  661. # to gff format for apollo
  662. # $rm_file = path to the repeat masker file
  663. # $gff_file = path to the gff output file
  664. # $pre is the way that the output file will
  665. # be made, it should be either > or >>
  666. # > for overwrite
  667. # >> for concatenate
  668. # the default will be to concatenate when the prefix
  669. # variable can not be undertood
  670. my ( $rm_file, $gff_file, $pre ) = @_;
  671. my $IN;
  672. my $OUT;
  673. my $strand;
  674. #-----------------------------------------------------------+
  675. # REPEATMASKER OUT FILE CONTAINS
  676. #-----------------------------------------------------------+
  677. # 0 Smith-Waterman score of the match
  678. # 1 % substitutions in matching region compared to the consensus
  679. # 2 % of bases opposite a gap in the query sequence (deleted bp)
  680. # 3 % of bases opposite a gap in the repeat consensus (inserted bp)
  681. # 4 name of query sequence
  682. # 5 starting position of match in query sequence
  683. # 6 ending position of match in query sequence
  684. # 7 no. of bases in query sequence past the ending position of match
  685. # in parenthesis
  686. # 8 match is with the Complement of the consensus sequence in the database
  687. # C
  688. # 9 name of the matching interspersed repeat
  689. #10 class of the repeat, in this case a DNA transposon
  690. #11 no. of bases in (complement of) the repeat consensus sequence
  691. # in parenthesis
  692. #12 starting position of match in database sequence
  693. # (using top-strand numbering)
  694. #13 ending position of match in database sequence
  695. open ( RM_IN, $rm_file ) ||
  696. die "Could not open the RM infile\n";
  697. # open ( RM_OUT, ">".$gff_file) ||
  698. open ( RM_OUT, $pre.$gff_file) ||
  699. die "Could not open the GFF outfile\n";
  700. while (<RM_IN>) {
  701. chomp;
  702. my @rmout = split;
  703. my $line_len = @rmout;
  704. my $cur_strand = $rmout[8];
  705. if ($cur_strand =~ "C" ) {
  706. $strand = "-";
  707. }
  708. else {
  709. $strand = "+";
  710. }
  711. #-----------------------------+
  712. # The following uses the TREP |
  713. # hits as names this is done |
  714. # by calling the attributes |
  715. # exons..a strange kluge |
  716. #-----------------------------+
  717. # This places the RepMask output in the same frame as
  718. # the hit was found in the database.
  719. #print RM_OUT "$rmout[4]\t"; # qry sequence name
  720. #print RM_OUT "repeatmasker:trep9\t"; # software used
  721. #print RM_OUT "exon\t"; # attribute name
  722. #print RM_OUT "$rmout[5]\t"; # start
  723. #print RM_OUT "$rmout[6]\t"; # stop
  724. #print RM_OUT "$rmout[0]\t"; # smith waterman score"
  725. #print RM_OUT "$strand\t"; # strand
  726. #print RM_OUT ".\t"; # frame
  727. #print RM_OUT "$rmout[9]"; # attribute
  728. #print RM_OUT "\n";
  729. #-----------------------------+
  730. # THE FOLLOWING MAKES THE HITS|
  731. # CUMULATIVE IN BOTH STRANDS |
  732. # OR JUST THE POSITVE STRAND |
  733. #-----------------------------+
  734. print RM_OUT "$rmout[4]\t"; # qry sequence name
  735. print RM_OUT "repeatmasker:trep9\t"; # software used
  736. print RM_OUT "exon\t"; # attribute name
  737. print RM_OUT "$rmout[5]\t"; # start
  738. print RM_OUT "$rmout[6]\t"; # stop
  739. print RM_OUT "$rmout[0]\t"; # smith waterman score"
  740. print RM_OUT "+\t"; # Postive strand
  741. print RM_OUT ".\t"; # frame
  742. print RM_OUT "$rmout[9]"; # attribute
  743. print RM_OUT "\n";
  744. #print RM_OUT "$rmout[4]\t"; # qry sequence name
  745. #print RM_OUT "repeatmasker:trep9\t"; # software used
  746. #print RM_OUT "exon\t"; # attribute name
  747. #print RM_OUT "$rmout[5]\t"; # start
  748. #print RM_OUT "$rmout[6]\t"; # stop
  749. #print RM_OUT "$rmout[0]\t"; # smith waterman score"
  750. #print RM_OUT "-\t"; # Negative strand
  751. #print RM_OUT ".\t"; # frame
  752. #print RM_OUT "$rmout[9]"; # attribute
  753. #print RM_OUT "\n";
  754. }
  755. return;
  756. }
  757. =head1 NAME
  758. batch_mask.pl - Run RepeatMasker and parse results to a gff format file.
  759. =head1 VERSION
  760. This documentation refers to program version $Rev: 248 $
  761. =head1 SYNOPSIS
  762. USAGE:
  763. batch_mask.pl -i DirToProcess -o OutDir -c ConfigFile
  764. =head1 DESCRIPTION
  765. Runs the RepeatMasker program for a set of input
  766. FASTA files against a set of repeat library files &
  767. then converts the repeat masker *.out file into the
  768. GFF format and then to the game XML format for
  769. visualization by the Apollo genome anotation program.
  770. =head1 COMMAND LINE ARGUMENTS
  771. =head2 Required Arguments
  772. =over 2
  773. =item -i,--indir
  774. Path of the directory containing the sequences to process.
  775. =item -o,--outdir
  776. Path of the directory to place the program output.
  777. =item -c, --config
  778. Configuration file that lists the database names and paths of the
  779. fasta files to use as masking databases.
  780. =back
  781. =head2 Additional Options
  782. =over 2
  783. =item -p,--num-proc
  784. The number of processors to use for RepeatMasker. Default is one.
  785. =item --engine
  786. The repeatmasker engine to use: [crossmatch|wublast|decypher].
  787. The default is to use the crossmatch engine.
  788. =item --apollo
  789. Use the apollo program to convert the file from gff to game xml.
  790. The default is not to use apollo.
  791. =item --rm-path
  792. The full path to the RepeatMasker binary.
  793. =item --logfile
  794. Path to a file that will be used to log program status.
  795. If the file already exists, additional information will be concatenated
  796. to the existing file.
  797. =item -q,--quiet
  798. Run the program with minimal output.
  799. =item --test
  800. Run the program without doing the system commands.
  801. =back
  802. =head2 Additional Program Information
  803. =over 2
  804. =item --usage
  805. Short overview of how to use program from command line.
  806. =item --help
  807. Show program usage with summary of options.
  808. =item --version
  809. Show program version.
  810. =item --man
  811. Show the full program manual. This uses the perldoc command to print the
  812. POD documentation for the program.
  813. =back
  814. =head1 DIAGNOSTICS
  815. The error messages that can be generated will be listed here.
  816. =head1 CONFIGURATION AND ENVIRONMENT
  817. The major configuration file for this program is the list of
  818. datbases indicated by the -c flag.
  819. =head2 Databases Config File
  820. This file is a tab delimited text file. Line beginning with # are ignored.
  821. B<EXAMPLE>
  822. #-------------------------------------------------------------
  823. # DBNAME DB_PATH
  824. #-------------------------------------------------------------
  825. TREP_9 /db/repeats/Trep9.nr.fasta
  826. TIGR_Trit /db/repeats/TIGR_Triticum_GSS_Repeats.v2.fasta
  827. # END
  828. The columns above represent the following
  829. =over 2
  830. =item Col. 1
  831. The name of the repeat library
  832. This will be used to name the output files from the analysis
  833. and to name the data tracks that will be used by Apollo.
  834. =item Col. 2
  835. The path to the fasta format file containing the repeats.
  836. =back
  837. =head1 DEPENDENCIES
  838. =head2 Required Software
  839. =over
  840. =item *
  841. RepeatMasker
  842. (http://www.repeatmasker.org/)
  843. =item *
  844. Apollo (Genome Annotation Curation Tool)
  845. http://www.fruitfly.org/annot/apollo/
  846. =back
  847. =head2 Required Perl Modules
  848. =over
  849. =item *
  850. File::Copy
  851. =item *
  852. Getopt::Long
  853. =back
  854. =head1 BUGS AND LIMITATIONS
  855. =head2 TO DO
  856. =over 2
  857. =item *
  858. Make the results compatable for an upload to a chado
  859. database.
  860. =item *
  861. Make it a variable to possible to put the gff output (1) all in positive
  862. strand, (2) all in negative strand, (3) alignment to positive or
  863. negative strand, (4) cumulative in both positive and negative strand.
  864. Current behavior is to do number 1 above.
  865. =back
  866. =head2 Limitations
  867. =over
  868. =item *
  869. This program has been tested with RepeatMasker v 3.1.6
  870. =back
  871. =head1 LICENSE
  872. GNU LESSER GENERAL PUBLIC LICENSE
  873. http://www.gnu.org/licenses/lgpl.html
  874. =head1 AUTHOR
  875. James C. Estill E<lt>JamesEstill at gmail.comE<gt>
  876. =head1 HISTORY
  877. STARTED: 04/10/2006
  878. UPDATED: 09/11/2007
  879. VERSION: $Rev: 248 $
  880. =cut
  881. #-------------------------------------------------------+
  882. # HISTORY |
  883. #-------------------------------------------------------+
  884. #
  885. # 4/10/2006
  886. # - Program started
  887. # - Parsing of RepeatMasker *.out file to an Apollo
  888. # compatible *.GFF file done. Working copy for both
  889. # a single track for each repeat class, and a data
  890. # track with all of the results from a Repeat Library
  891. # grouped toether.
  892. # - Two dimensional array containing the repeat library
  893. # database informaiton.
  894. #
  895. # 4/11/2006
  896. # - Additional code comments and reformat
  897. # - Code to cycle through a set of FASTA files and store
  898. # the output data in a separate named folder for each
  899. # of the FASTA flies.
  900. # - Added the -fixed to the command to run RepeatMasker
  901. # to make sure that the text format is the same for all
  902. # - Since Apollo does not allow additional GFF files to be
  903. # added later, I added code to create a GFF file that has
  904. # the outcome for all of the RepeatMask runs in one
  905. # database.
  906. #
  907. # 4/12/2006
  908. # - Added the AllRepeats to the dataset to make sure that
  909. # that the repeat characterization is cumulative
  910. # - ConvertGFF2Chado subfunction
  911. # - Adding Smith Waterman score from RepeatMasker to the
  912. # GFF output file. This should be the first column in the
  913. # *.out file and the 6th column in the GFF file.
  914. #
  915. # 4/16/2005
  916. # - Added array for list of fasta files to process and testing
  917. # with small set of sequences.
  918. # - The fasta file was manually edited to use just the GB
  919. # ID in the FASTA header. This could be done using the
  920. # the seq object PERL module from bioperl.
  921. # - The fasta files should be read from an input directory
  922. # and then the output should be copied to an output dir.
  923. #
  924. # 4/17/2006
  925. # - Working out variables names
  926. #
  927. # 4/25/2006
  928. # - Adding ability to get the input set of FASTA files
  929. # from a directory
  930. #
  931. # 4/26/2006
  932. # - Working out the copy of the relevant output files to
  933. # a central directory for the FASTA file (ie. all
  934. # programatic output from the FASTA file goes to
  935. # a dir named for the file.)
  936. # - Added use of File::Copy module for cp commands
  937. #
  938. # 5/24/2006
  939. # - Added process log to the program. This will allow
  940. # me to launch the program at the lab and then
  941. # monitor the process at home.
  942. #
  943. # 5/31/2006
  944. # - Made local copy dir a variable that can be set at the top
  945. #
  946. # 09/26/2006
  947. # - A few changes made to the base code to make this
  948. # work on the altix. Using databases at
  949. # /scratch/jestill/repmask/
  950. #
  951. # 09/28/2006
  952. # - Changes make to get this to work on the altix
  953. # - Changed the format of the repeat databases list
  954. # This is currently a two-d array
  955. #
  956. # 07/13/2007
  957. # - Added POD documentation.
  958. # - Renamed to automask.pl
  959. # - Made this the official 1.0 release
  960. # - Adding command line variables
  961. # - Added print_help subfunction
  962. # - Modified ApolloConvert subfunction to
  963. # apollo_convert
  964. # - Getting rid of the die commands and replacing
  965. # them with the write to logfile commands. This
  966. # should prvent the program from dying when
  967. # working on really large batch files.
  968. # - Added cmd line options for:
  969. # - Number of processors
  970. # - Engine to use in repeatmasker
  971. # - apollo variable at cmd line
  972. # initially will be used as boolean but can be
  973. # used to pass apollo version, path and
  974. # desired output (ie. chado, game.xml etc)
  975. #
  976. # 07/15/2007
  977. # - Added the ability to show help when the
  978. # required options $indir and $outdir are not
  979. # present.
  980. #
  981. # 07/16/2007
  982. # - Added error message when no fasta files were
  983. # found in the input directory.
  984. # - Added the ability to append a slash to the end
  985. # of the $indir and $outdir variables if they
  986. # are not already present
  987. #
  988. # 07/17/2007
  989. # - Added the ability to get the search_name directly
  990. # from the fasta file header
  991. # - Getting root name for the output files and folders
  992. # by stripping off the fasta extension with regexp
  993. # - Changed output dir to just the root name
  994. # - Adding rmout_to_gff subfunction to convert the repeat
  995. # masker output to gff format. This will get rid
  996. # of any dependence on grep and awk and will provide
  997. # a more stable program
  998. #
  999. # 07/18/2007
  1000. # - rmout_to_gff working now ... strangely had to make
  1001. # the type exon to draw properly in apollo
  1002. # - Got rid of previous code that used grep and awk
  1003. # to do this conversion
  1004. # - Added command line variable to specify the full
  1005. # path of the RepeatMasker program. This should
  1006. # help this to run machines without having to
  1007. # make any changes to the user's environment.
  1008. # (ie. don't have to add RepeatMaker to user's path)
  1009. # - Renamed program again to batch_mask.pl
  1010. #
  1011. # 09/07/2007
  1012. # - Moving POD documentation to the end of the program
  1013. # - Changing to use a config file instead of internal 2-d array
  1014. # - Modified all variable names to lowercase
  1015. # - Added use strict
  1016. #
  1017. # 09/11/2007
  1018. # - Getting rid of LOG, all printing to STDERR
  1019. # - Dropped attempts to move *.tbl file.