PageRenderTime 69ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 1ms

/Bio/Pipeline/InputCreate/setup_file.pm

https://github.com/bioperl/bioperl-pipeline
Perl | 562 lines | 458 code | 89 blank | 15 comment | 28 complexity | 1d07bcd8099bc8a6fcf27a4d36e8b573 MD5 | raw file
Possible License(s): LGPL-2.0
  1. #
  2. # BioPerl module for Bio::Pipeline::InputCreate::setup_file
  3. #
  4. # Please direct questions and support issues to <bioperl-l@bioperl.org>
  5. #
  6. # Cared for by Shawn Hoon <shawnh@fugu-sg.org>
  7. #
  8. #
  9. # You may distribute this module under the same terms as perl itself
  10. #
  11. # POD documentation - main docs before the code
  12. #
  13. =head1 NAME
  14. Bio::Pipeline::InputCreate::setup_file
  15. =head1 SYNOPSIS
  16. my $inc = Bio::Pipeline::InputCreate::setup_file->new(-runnable=>"Bio::Pipeline::Runnable::Blast",
  17. -informat=>"fasta",
  18. -input_file=>"/data0/blast.fa",
  19. -result_dir=>"/data/blast_results",
  20. -chop_nbr=>1);
  21. $inc->run;
  22. =head1 DESCRIPTION
  23. This input create is a generic flat file setup module for the pipeline. It allows
  24. files to be chopped up into smaller pieces to be split into jobs.
  25. Currently works with
  26. Blast,
  27. Clustalw
  28. DnaBlockAligner
  29. and, in theory, any program that takes sequence files as input.
  30. =head1 FEEDBACK
  31. =head2 Mailing Lists
  32. User feedback is an integral part of the evolution of this and other
  33. Bioperl modules. Send your comments and suggestions preferably to one
  34. of the Bioperl mailing lists. Your participation is much appreciated.
  35. bioperl-l@bioperl.org - General discussion
  36. http://bioperl.org/wiki/Mailing_lists - About the mailing lists
  37. =head2 Support
  38. Please direct usage questions or support issues to the mailing list:
  39. L<bioperl-l@bioperl.org>
  40. rather than to the module maintainer directly. Many experienced and
  41. responsive experts will be able to look at the problem and quickly
  42. address it. Please include a thorough description of the problem
  43. with code and data examples if at all possible.
  44. =head2 Reporting Bugs
  45. Report bugs to the Bioperl bug tracking system to help us keep track
  46. of the bugs and their resolution. Bug reports can be submitted via email
  47. or the web:
  48. bioperl-bugs@bio.perl.org
  49. http://bio.perl.org/bioperl-bugs/
  50. =head1 AUTHOR - Shawn Hoon
  51. Email shawnh@fugu-sg.org
  52. =head1 APPENDIX
  53. The rest of the documentation details each of the object methods. Internal
  54. methods are usually preceded with a _
  55. =cut
  56. package Bio::Pipeline::InputCreate::setup_file;
  57. use vars qw(@ISA);
  58. use strict;
  59. use Bio::Pipeline::InputCreate;
  60. use Bio::Pipeline::Runnable::Blast;
  61. use Bio::Pipeline::DataType;
  62. use Bio::SeqIO;
  63. use Bio::Root::IO;
  64. use File::Copy;
  65. use Cwd;
  66. @ISA = qw(Bio::Pipeline::InputCreate);
  67. sub _initialize {
  68. my ($self,@args) = @_;
  69. $self->SUPER::_initialize(@args);
  70. my ($runnable,
  71. $informat,
  72. $outformat,
  73. $tag,
  74. $input_file,
  75. $input_dir,
  76. $chop_size,
  77. $workdir,
  78. $result_dir,
  79. $full_path,
  80. $format_db,
  81. $format_db_exe,
  82. $format_db_arg) = $self->_rearrange([qw(RUNNABLE
  83. INFORMAT
  84. OUTFORMAT
  85. TAG
  86. INPUT_FILE
  87. INPUT_DIR
  88. CHOP_NBR
  89. WORKDIR
  90. RESULT_DIR
  91. FULL_PATH
  92. FORMAT_DB
  93. FORMAT_DB_EXE
  94. FORMAT_DB_ARG)],@args);
  95. $runnable || $self->throw("Need an runnable name");
  96. $self->runnable($runnable);
  97. $input_dir || $input_file|| $self->throw("Need a input file or directory");
  98. $self->input_file($input_file) if $input_file;
  99. $self->input_dir($input_dir) if $input_dir;
  100. $informat ||='fasta';
  101. $self->informat($informat);
  102. $outformat ||='fasta';
  103. $self->outformat($outformat);
  104. $chop_size ||= 400;
  105. $self->chop_size($chop_size);
  106. $workdir ||= '/tmp';
  107. $self->workdir($workdir);
  108. $result_dir ||= Bio::Root::IO->catfile($workdir,"results");
  109. $self->result_dir($result_dir);
  110. $self->tag($tag) if $tag;
  111. $self->full_path($full_path) if $full_path;
  112. #standalone blast works with ncbi blast only anyway
  113. $format_db_exe ||='formatdb';
  114. $self->format_db_exe($format_db_exe);
  115. $format_db_arg && $self->format_db_arg($format_db_arg);
  116. if($self->runnable =~/Blast/i && $format_db){
  117. $self->_setup_blastdb();
  118. }
  119. }
  120. =head2 input_file
  121. Title : input_file
  122. Usage : $self->input_file()
  123. Function: get/sets of the input_file
  124. Returns :
  125. Args :
  126. =cut
  127. sub input_file{
  128. my ($self,$arg) = @_;
  129. if($arg){
  130. $self->{'_input_file'} = $arg;
  131. }
  132. return $self->{'_input_file'};
  133. }
  134. =head2 input_dir
  135. Title : input_dir
  136. Usage : $self->input_dir()
  137. Function: get/sets of the input_dir
  138. Returns :
  139. Args :
  140. =cut
  141. sub input_dir{
  142. my ($self,$arg) = @_;
  143. if($arg){
  144. $self->{'_input_dir'} = $arg;
  145. }
  146. return $self->{'_input_dir'};
  147. }
  148. sub full_path{
  149. my ($self,$arg) = @_;
  150. if($arg){
  151. $self->{'_full_path'} = $arg;
  152. }
  153. return $self->{'_full_path'};
  154. }
  155. sub tag {
  156. my ($self,$val) = @_;
  157. if($val){
  158. $self->{'_tag'} = $val;
  159. }
  160. return $self->{'_tag'};
  161. }
  162. =head2 informat
  163. Title : informat
  164. Usage : $self->informat()
  165. Function: get/sets of the informat
  166. Returns :
  167. Args :
  168. =cut
  169. sub informat{
  170. my ($self,$arg) = @_;
  171. if($arg){
  172. $self->{'_informat'} = $arg;
  173. }
  174. return $self->{'_informat'};
  175. }
  176. =head2 outformat
  177. Title : outformat
  178. Usage : $self->outformat()
  179. Function: get/sets of the outformat
  180. Returns :
  181. Args :
  182. =cut
  183. sub outformat{
  184. my ($self,$arg) = @_;
  185. if($arg){
  186. $self->{'_outformat'} = $arg;
  187. }
  188. return $self->{'_outformat'};
  189. }
  190. =head2 runnable
  191. Title : runnable
  192. Usage : $self->runnable()
  193. Function: get/sets of the runnable
  194. Returns :
  195. Args :
  196. =cut
  197. sub runnable{
  198. my ($self,$arg) = @_;
  199. if($arg){
  200. $self->{'_runnable'} = $arg;
  201. }
  202. return $self->{'_runnable'};
  203. }
  204. =head2 format_db_exe
  205. Title : format_db_exe
  206. Usage : $self->format_db_exe()
  207. Function: get/sets of the format_db_exe
  208. Returns :
  209. Args :
  210. =cut
  211. sub format_db_exe{
  212. my ($self,$arg) = @_;
  213. if($arg){
  214. $self->{'_format_db_exe'} = $arg;
  215. }
  216. return $self->{'_format_db_exe'};
  217. }
  218. =head2 format_db_arg
  219. Title : format_db_arg
  220. Usage : $self->format_db_arg()
  221. Function: get/sets of the format_db_arg
  222. Returns :
  223. Args :
  224. =cut
  225. sub format_db_arg{
  226. my ($self,$arg) = @_;
  227. if($arg){
  228. $self->{'_format_db_arg'} = $arg;
  229. }
  230. return $self->{'_format_db_arg'};
  231. }
  232. =head2 chop_size
  233. Title : chop_size
  234. Usage : $self->chop_size()
  235. Function: get/set number of files that input_file is to chopped into
  236. Returns :
  237. Args :
  238. =cut
  239. sub chop_size {
  240. my ($self,$arg) = @_;
  241. if($arg){
  242. $self->{'_chop_size'} = $arg;
  243. }
  244. return $self->{'_chop_size'};
  245. }
  246. =head2 workdir
  247. Title : workdir
  248. Usage : $self->workdir()
  249. Function: get/set of the working dir
  250. Returns :
  251. Args :
  252. =cut
  253. sub workdir {
  254. my ($self,$arg) = @_;
  255. if($arg){
  256. $self->{'_workdir'} = $arg;
  257. }
  258. return $self->{'_workdir'};
  259. }
  260. =head2 result_dir
  261. Title : result_dir
  262. Usage : $self->result_dir()
  263. Function: get/set of the result dir
  264. Returns :
  265. Args :
  266. =cut
  267. sub result_dir {
  268. my ($self,$arg) = @_;
  269. if($arg){
  270. $self->{'_result_dir'} = $arg;
  271. }
  272. return $self->{'_result_dir'};
  273. }
  274. =head2 datatypes
  275. Title : datatypes
  276. Usage : $self->datatypes()
  277. Function: get/set of the datatypes required for this input create
  278. Returns :
  279. Args :
  280. =cut
  281. sub datatypes {
  282. my ($self) = @_;
  283. return;
  284. }
  285. sub _get_file_from_dir {
  286. my ($self,) = @_;
  287. my $dir = $self->input_dir;
  288. opendir(DIR,$dir);
  289. my @files = grep(!/^\./,readdir(DIR));
  290. closedir DIR;
  291. my @file_fullpath = map{Bio::Root::IO->catfile($dir,$_)}@files;
  292. if($self->workdir){
  293. my $workdir = $self->workdir;
  294. if($workdir !~/^\/./){#is relative path
  295. #make absolute
  296. $workdir = Bio::Root::IO->catfile(cwd,$workdir);
  297. }
  298. mkdir($workdir,0755) || $self->warn("$workdir: $!");
  299. #move files to workdir
  300. foreach my $f(@file_fullpath){
  301. my $filename = (split /\//, $f)[-1];
  302. copy($f,Bio::Root::IO->catfile($workdir,$filename)) || $self->throw("Can't write to dir $workdir");
  303. }
  304. }
  305. if($self->full_path){
  306. @files = @file_fullpath;
  307. }
  308. return @files;
  309. }
  310. =head2 run
  311. Title : run
  312. Usage : $self->run($next_anal,$input)
  313. Function: creates the jobs for genewise
  314. Returns :
  315. Args : L<Bio::Pipeline::Analysis>, Hash reference
  316. =cut
  317. sub run {
  318. my ($self,$next_anal) = @_;
  319. my @file_names;
  320. if($self->input_file){
  321. @file_names = $self->_chop_files;
  322. }
  323. elsif($self->input_dir) {
  324. @file_names = $self->_get_file_from_dir;
  325. }
  326. my $runnable = $self->runnable;
  327. if($runnable !~/Bio::Pipeline::Runnable/){
  328. $runnable = "Bio::Pipeline::Runnable::".ucfirst $runnable;
  329. }
  330. $runnable =~s/\::/\//g;
  331. eval {
  332. require "${runnable}.pm";
  333. };
  334. if($@){
  335. $self->throw("Problems finding $runnable in setup_file.pm");
  336. }
  337. $runnable =~s/\//\::/g;
  338. my $runn = "${runnable}"->new();
  339. my %dt = $runn->datatypes;
  340. foreach my $file(@file_names){
  341. my @input;
  342. METHOD: foreach my $method(keys %dt){
  343. if($self->tag){
  344. if($method eq $self->tag){
  345. push @input ,$self->create_input($file,'',$self->tag);
  346. }
  347. else {
  348. next METHOD;
  349. }
  350. }
  351. else {
  352. push @input ,$self->create_input($file,'',$self->tag);
  353. }
  354. }
  355. my $job = $self->create_job($next_anal,\@input);
  356. $self->dbadaptor->get_JobAdaptor->store($job);
  357. }
  358. return;
  359. }
  360. sub _setup_blastdb {
  361. my ($self) = @_;
  362. my $input_file = $self->input_file;
  363. -e $input_file.".phr" && return;
  364. Bio::Root::IO->exists_exe($self->format_db_exe) || return;
  365. my $cmd = $self->format_db_exe." ". $self->format_db_arg." -i ".$input_file;
  366. my $status = system($cmd);
  367. $self->throw("Problems formatting db $input_file $!") if $status > 0;
  368. return;
  369. }
  370. #internal method for chopping up peptide files into bitesize chunks for blasting
  371. #taken from chopper script by Anton Enright and Philip Lijnzaad
  372. sub _chop_files {
  373. my ($self) = @_;
  374. my $filename= $self->input_file;
  375. my $workdir = $self->workdir;
  376. my $resultdir = $self->result_dir;
  377. my $n_chunks = $self->chop_size;
  378. my $informat = $self->informat;
  379. my $outformat = $self->outformat;
  380. my @filenames;
  381. if($workdir){
  382. mkdir($workdir,0755) || $self->warn("$workdir: $!");
  383. }
  384. if($resultdir){
  385. mkdir($resultdir,0755) || $self->warn("$resultdir: $!");
  386. }
  387. #chop peptide files into digestible parts
  388. my $sio = Bio::SeqIO->new(-file=>$filename,-format=>$informat);
  389. my @seq;
  390. while(my $seq = $sio->next_seq){
  391. push @seq, $seq;
  392. }
  393. my $split = int(scalar(@seq)/$n_chunks);
  394. $split = scalar(@seq) if $split ==0;
  395. NEW_FILE:
  396. my $index = 1;
  397. $filename = (split /\//, $filename)[-1]; #get the filename only
  398. my $file = Bio::Root::IO->catfile($workdir,"$filename.$index");
  399. if($self->full_path){
  400. push @filenames, "$file";
  401. }
  402. else {
  403. push @filenames, "$filename.$index";
  404. }
  405. $sio = Bio::SeqIO->new(-file=>">$file",-format=>$outformat);
  406. my $count = 0;
  407. while ($index <= $n_chunks){
  408. if($count == $split) {
  409. $index == $n_chunks && last;
  410. $count=0;
  411. $index++;
  412. $file = "$workdir/$filename.$index";
  413. $sio->close;
  414. $sio = Bio::SeqIO->new(-file=>">$file",-format=>$outformat);
  415. if($self->full_path){
  416. push @filenames, "$file";
  417. }
  418. else {
  419. push @filenames, "$filename.$index";
  420. }
  421. }
  422. my $seq = shift @seq;
  423. $sio->write_seq($seq);
  424. last if $#seq < 0; #no more seq
  425. $count++;
  426. }
  427. #write out the remaining ones to last file
  428. while($#seq >= 0){
  429. $sio->write_seq(shift @seq);
  430. }
  431. $sio->close();
  432. return @filenames;
  433. }
  434. sub _setup_blastdb {
  435. my ($self) = @_;
  436. my $input_file = $self->input_file;
  437. -e $input_file.".phr" && return;
  438. Bio::Root::IO->exists_exe($self->format_db_exe) || return;
  439. my $cmd = $self->format_db_exe." ". $self->format_db_arg." -i ".$input_file;
  440. my $status = system($cmd);
  441. $self->throw("Problems formatting db $input_file $!") if $status > 0;
  442. return;
  443. }
  444. 1;