
/lib/RunnerLSF.pm

https://github.com/sanger-pathogens/vr-codebase
Perl | 601 lines | 495 code | 62 blank | 44 comment
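
RunnerLSF.pm implements the LSF back end used by the Runner pipelines in this repository: it submits job arrays with bsub, tracks their state via bjobs -l and the LSF output files, and moves long-running jobs to queues with larger run-time limits via bswitch.
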
package RunnerLSF;

use strict;
use warnings;
use Carp;
use DateTime;

# How many of the most recent job-array submissions to inspect in the jids
# file (used by get_jobs below). The default here is an assumption; adjust
# as needed.
our $Max_jobs_to_check = 1_000;

sub new
{
    my ($class,@args) = @_;
    my $self = @args ? {@args} : {};
    bless $self, ref($class) || $class;

    # Bit-mask status flags used throughout the module
    $$self{Running} = 1;
    $$self{Error}   = 2;
    $$self{Zombi}   = 4;
    $$self{No}      = 8;
    $$self{Done}    = 16;
    $$self{Waiting} = 32;

    # Map LSF job states onto the flags above
    $$self{lsf_status_codes} =
    {
        DONE  => $$self{Done},
        PEND  => $$self{Running} | $$self{Waiting},
        WAIT  => $$self{Running} | $$self{Waiting},
        EXIT  => $$self{Error},
        ZOMBI => $$self{Zombi},
        RUN   => $$self{Running},
        UNKWN => $$self{Running},
        SSUSP => $$self{Running}
    };

    # runtime and queue_limits are in minutes
    $$self{default_limits} = { runtime=>40, memory=>1_000, queue=>'normal' };
    $$self{queue_limits}   = { basement=>1e9, long=>72*60, normal=>12*60, small=>30 };

    $self->_set_lsf_limits_unit();
    $self->_init_zombies();

    return $self;
}

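# Status query helpers: each task hash returned by get_jobs() carries a
# bit-mask 'status' and a failure count 'nfailures'.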
sub job_running
{
    my ($self,$task) = @_;
    return $$task{status} & $$self{Running};
}

sub job_done
{
    my ($self, $task) = @_;
    return $$task{status} & $$self{Done};
}

sub job_failed
{
    my ($self, $task) = @_;
    return $$task{status} & $$self{Error};
}

sub job_nfailures
{
    my ($self, $task) = @_;
    return $$task{nfailures} ? $$task{nfailures} : 0;
}

sub set_max_jobs
{
    my ($self,$nmax) = @_;
    $$self{nmax_jobs} = $nmax;
}

# runtime and queue_limits are in minutes
sub set_limits
{
    my ($self,%limits) = @_;
    $$self{limits} = { %{$$self{default_limits}}, %limits };
    if ( exists($limits{queues}) )
    {
        $$self{queue_limits} = { %{$limits{queues}} };
    }
}

# Nothing to clean up for plain LSF jobs; this method is a no-op here.
sub clean_jobs
{
    my ($self,$wfile,$ids) = @_;
}

sub kill_job
{
    my ($self,$job) = @_;
    if ( !exists($$job{lsf_id}) ) { return; }
    my $cmd = "bkill -s7 -b '$$job{lsf_id}'";
    warn("$cmd\n");
    `$cmd`;
}

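# Job arrays listed in the LSF_ZOMBI_IS_DEAD environment variable are treated
# as dead rather than running. The value is a dot-separated list of arrays with
# comma-separated indexes or index ranges, e.g. (hypothetical IDs):
#   LSF_ZOMBI_IS_DEAD='123456[1-5,8].123789[2]'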
sub _init_zombies
{
    my ($self) = @_;
    if ( !exists($ENV{LSF_ZOMBI_IS_DEAD}) ) { return; }
    my @arrays = split(/\./,$ENV{LSF_ZOMBI_IS_DEAD});
    for my $arr (@arrays)
    {
        if ( !($arr=~/^(\d+)\[(.+)\]$/) ) { confess("Could not parse LSF_ZOMBI_IS_DEAD=$ENV{LSF_ZOMBI_IS_DEAD}\n"); }
        my $id  = $1;
        my $ids = $2;
        my @items = split(/,/,$ids);
        for my $item (@items)
        {
            if ( $item=~/^(\d+)$/ ) { $$self{ignore_zombies}{"${id}[$1]"} = 1; next; }
            my ($from,$to) = split(/-/,$item);
            for (my $i=$from; $i<=$to; $i++)
            {
                $$self{ignore_zombies}{"${id}[$i]"} = 1;
            }
        }
    }
}

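# For each runner ID in @$ids return a hash with 'status' and 'nfailures'.
# Jobs still known to LSF are classified from the bjobs -l listing; jobs that
# LSF has already forgotten are classified from their output files.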
sub get_jobs
{
    my ($self, $jids_file, $ids) = @_;

    # For each input job id create a hash with info: status, number of failures
    my @jobs_out = ();
    for my $id (@$ids) { push @jobs_out, {status=>$$self{No}, nfailures=>0}; }
    if ( ! -e $jids_file ) { return \@jobs_out; }

    # The same file with job ids may contain many job array runs. Check
    # the jobs in reverse order in case there were many failures.
    # The last success counts, failures may be discarded in such a case.
    #
    open(my $fh, '-|', "tail -n ".$Max_jobs_to_check." ".$jids_file) or confess("$jids_file: $!");
    my $path;
    my @jids = ();
    while (my $line=<$fh>)
    {
        if ( !($line=~/^(\d+)\s+([^\t]*)/) ) { confess("Uh, could not parse \"$line\".\n") }
        push @jids, $1;     # LSF array ID
        if ( !defined $path ) { $path = $2; }
        if ( $path ne $2 ) { confess("$path ne $2\n"); }
    }

    # ZOMBI jobs need special care, we cannot be 100% sure that the non-responsive
    # node stopped writing to network disks. Let the user decide if they can be
    # safely ignored.
    my %zombi_warning = ();

    # Get info from bjobs -l: iterate over the LSF array IDs we remember for this task in the jids_file
    for (my $i=@jids-1; $i>=0; $i--)
    {
        my $info = $self->_parse_bjobs_l($jids[$i]);
        if ( !defined $info ) { next; }

        # Check if the time limits are still OK for all running jobs. Switch queues if not.
        for my $job_l (values %$info)
        {
            if ( $$job_l{status} eq 'ZOMBI' && !$$self{ignore_zombies}{$$job_l{lsf_id}} ) { $zombi_warning{$$job_l{lsf_id}} = 1; }
            $self->_check_job($job_l,$jids_file);
        }

        # Update status of input jobs present in the bjobs -l listing. Note that the failed jobs
        # get their status from the output files as otherwise we wouldn't know how many times
        # they failed already.
        for (my $j=0; $j<@$ids; $j++)
        {
            my $id = $$ids[$j];
            if ( !exists($$info{$id}) ) { next; }
            if ( $jobs_out[$j]{status} ne $$self{No} ) { next; }    # the job was submitted multiple times and already has a status

            if ( $$info{$id}{status} & $$self{Done} )
            {
                $jobs_out[$j]{status} = $$self{Done};
            }
            elsif ( $$info{$id}{status} & $$self{Running} )
            {
                $jobs_out[$j]{status} = $$self{Running};
                $jobs_out[$j]{lsf_id} = $$info{$id}{lsf_id};
            }
            elsif ( $$info{$id}{status} & $$self{Zombi} )
            {
                # Set as failed if the ZOMBI should be ignored, otherwise say it's running.
                my $lsf_id = $$info{$id}{lsf_id};
                if ( $$self{ignore_zombies}{$lsf_id} ) { $jobs_out[$j]{status} = $$self{Error}; }
                else { $jobs_out[$j]{status} = $$self{Running}; }
                $jobs_out[$j]{lsf_id} = $lsf_id;
            }
        }
    }

    if ( scalar keys %zombi_warning )
    {
        my %arrays = ();
        for my $lsf_id (keys %zombi_warning)
        {
            if ( $lsf_id =~ s/^(\d+)\[(\d+)\]$// ) { push @{$arrays{$1}},$2; }
        }
        my @id_strings = ();
        for my $id (keys %arrays)
        {
            while ( @{$arrays{$id}} )
            {
                push @id_strings, "${id}[". $self->_create_bsub_ids_string($id,$arrays{$id}) . "]";
            }
        }
        warn(
            "\n\n----\n\n" .
            "WARNING: Some jobs were found in ZOMBI state and are still considered as\n" .
            "   running by the pipeline. To ignore these jobs, set the environment variable\n" .
            "       LSF_ZOMBI_IS_DEAD='" .join('.',@id_strings). "'\n" .
            "   and restart the pipeline. See also \"$jids_file\".\n" .
            "----\n\n"
        );
    }

    # For jobs which are not present in the bjobs -l listing we get the info from the output files
    my $ntodo = 0;
    for (my $i=0; $i<@$ids; $i++)
    {
        if ( $jobs_out[$i]{status} & $$self{Running} || $jobs_out[$i]{status} & $$self{Error} ) { $ntodo++; }
        if ( $$self{nmax_jobs} && $ntodo >= $$self{nmax_jobs} ) { last; }
        if ( $jobs_out[$i]{status} ne $$self{No} ) { next; }

        my $info = $self->_parse_output($$ids[$i], $path);
        if ( defined $info )
        {
            $jobs_out[$i]{status}    = $$info{status};
            $jobs_out[$i]{nfailures} = $$info{nfailures};
        }
    }
    return \@jobs_out;
}

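# Parse the line-wrapped output of `bjobs -l <array id>` into a hash keyed by
# array index, recording status, queue, command, CPU/wall time, memory and exit
# code where available. Returns undef when bjobs prints nothing for the id.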
sub _parse_bjobs_l
{
    my ($self,$jid) = @_;

    my @lines;
    for (my $i=0; $i<3; $i++)
    {
        @lines = `bjobs -l $jid 2>/dev/null`;
        if ( $? ) { sleep 5; next; }
        if ( !scalar @lines ) { return undef; }
        last;
    }

    my %months = qw(Jan 1 Feb 2 Mar 3 Apr 4 May 5 Jun 6 Jul 7 Aug 8 Sep 9 Oct 10 Nov 11 Dec 12);
    my $year = (gmtime())[5] + 1900;

    my $info = {};
    my $job;
    for (my $i=0; $i<@lines; $i++)
    {
        if ( $lines[$i]=~/^\s*$/ ) { next; }

        if ( $lines[$i]=~/^Job <(\d+)(.*)$/ )
        {
            # Runner's ID is $id, LSF job ID is $lsf_id
            my $id = $1;
            my $rest = $2;
            my $lsf_id = $id;
            if ( $rest=~/^\[(\d+)/ )
            {
                $lsf_id = "$id\[$1\]";
                $id = $1;
            }
            if ( scalar keys %$job ) { $$info{$$job{id}} = $job; }
            $job = { id=>$id, lsf_id=>$lsf_id, cpus=>1 };

            my $job_info = $lines[$i];
            chomp($job_info);
            $i++;
            while ( $i<@lines && $lines[$i]=~/^\s{21}?(.*)$/ )
            {
                $job_info .= $1;
                chomp($job_info);
                $i++;
            }
            if ( !($job_info=~/,\s*Status <([^>]+)>/) ) { confess("Could not determine the status: [$job_info]"); }
            $$job{status} = $1;
            if ( !($job_info=~/,\s*Queue <([^>]+)>/) ) { confess("Could not determine the queue: [$job_info]"); }
            $$job{queue} = $1;
            if ( !($job_info=~/,\s*Command <([^>]+)>/) ) { confess("Could not determine the command: [$job_info]"); }
            $$job{command} = $1;
        }

        # Collect also checkpoint data for LSFCR to avoid code duplication: checkpoint directory, memory, status
        #   Wed Mar 19 10:14:17: Submitted from host <vr-2-2-02>...
        if ( $lines[$i]=~/^\w+\s+\w+\s+\d+ \d+:\d+:\d+:\s*Submitted from/ )
        {
            my $job_info = $lines[$i];
            chomp($job_info);
            $i++;
            while ( $i<@lines && $lines[$i]=~/^\s{21}?(.*)$/ )
            {
                $job_info .= $1;
                chomp($job_info);
                $i++;
            }
            if ( $job_info=~/,\s*Checkpoint directory <([^>]+)>/ ) { $$job{chkpnt_dir} = $1; }
            if ( $job_info=~/\srusage\[mem=(\d+)/ )
            {
                $$job{mem_usage} = $1;
                if ( $$self{lsf_limits_unit} eq 'kB' ) { $$job{mem_usage} /= 1000.0; }
            }
        }
        # elsif ( $lines[$i]=~/^\w+\s+\w+\s+\d+ \d+:\d+:\d+:\s*Completed <exit>; TERM_CHKPNT/ )
        # {
        #     $$job{status} = 'EXIT';
        # }
        #   Tue Mar 19 13:00:35: [685] started on <uk10k-4-1-07>...
        #   Tue Dec 24 13:12:00: [1] started on 8 Hosts/Processors <8*vr-1-1-05>...
        elsif ( $lines[$i]=~/^\w+\s+(\w+)\s+(\d+) (\d+):(\d+):(\d+):.+ started on/ )
        {
            $$job{started} = DateTime->new(month=>$months{$1}, day=>$2, hour=>$3, minute=>$4, year=>$year)->epoch;
        }
        elsif ( $lines[$i]=~/^\w+\s+(\w+)\s+(\d+) (\d+):(\d+):(\d+):.+ dispatched to/ )   # associated with underrun status
        {
            $$job{started} = DateTime->new(month=>$months{$1}, day=>$2, hour=>$3, minute=>$4, year=>$year)->epoch;
        }
        #   Tue Mar 19 13:58:23: Resource usage collected...
        elsif ( $lines[$i]=~/^\w+\s+(\w+)\s+(\d+) (\d+):(\d+):(\d+):\s+Resource usage collected/ )
        {
            if ( !exists($$job{started}) ) { confess("No wall time for job $$job{id}??", @lines); }
            my $wall_time = DateTime->new(month=>$months{$1}, day=>$2, hour=>$3, minute=>$4, year=>$year)->epoch - $$job{started};
            if ( !exists($$job{cpu_time}) or $$job{cpu_time} < $wall_time ) { $$job{cpu_time} = $wall_time; }
        }
        if ( $lines[$i]=~/The CPU time used is (\d+) seconds./ )
        {
            if ( !exists($$job{cpu_time}) or $$job{cpu_time} < $1 ) { $$job{cpu_time} = $1; }
        }
        if ( $lines[$i]=~/started on (\d+) Hosts\/Processors/ )
        {
            $$job{cpus} = $1;
        }
        if ( $lines[$i]=~/Exited with exit code (\d+)\./ )
        {
            $$job{exit_code} = $1;
        }
    }
    if ( scalar keys %$job )
    {
        if ( $$job{command}=~/^cr_restart/ && exists($$job{exit_code}) && $$job{exit_code} eq '16' )
        {
            # temporary failure (e.g. pid in use) of cr_restart, ignore this failure
            return $info;
        }
        $$info{$$job{id}} = $job;
    }
    return $info;
}

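# Translate the raw LSF state into the module's bit-mask flags and, for running
# jobs, bswitch to a queue with a larger run-time limit when the projected run
# time (CPU time per CPU plus the expected wake-up interval) no longer fits the
# current queue.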
sub _check_job
{
    my ($self,$job,$jids_file) = @_;
    my $status = $$self{lsf_status_codes};
    if ( !exists($$status{$$job{status}}) )
    {
        confess("Todo: $$job{status} $$job{lsf_id}\n");
    }
    $$job{status} = $$status{$$job{status}};
    if ( $$job{status}==$$self{Running} )
    {
        if ( !exists($$job{cpu_time}) ) { $$job{cpu_time} = 0; }

        # Estimate how long it might take before we are called again, plus 5 minutes
        # to be safe, and bswitch to a longer queue if necessary.
        my $wakeup_interval = $$self{limits}{wakeup_interval} ? $$self{limits}{wakeup_interval} + 300 : 300;
        my $time_mins = ($$job{cpu_time} / $$job{cpus} + $wakeup_interval) / 60.;
        my $new_queue = $self->_get_queue($time_mins);
        my $cur_queue = $$job{queue};
        if ( defined $new_queue && $new_queue ne $cur_queue && $$self{queue_limits}{$new_queue} > $$self{queue_limits}{$cur_queue} )
        {
            warn("Switching job $$job{lsf_id} from queue $cur_queue to $new_queue\n");
            `bswitch $new_queue '$$job{lsf_id}'`;
            if ( $? ) { warn("Could not switch queues: $$job{lsf_id}"); }
            else { $$job{queue} = $new_queue; }
        }
    }
}

# time is in minutes
sub _get_queue
{
    my ($self,$time) = @_;
    my $queue = exists($$self{limits}{queue}) ? $$self{limits}{queue} : $$self{default_limits}{queue};
    if ( $$self{queue_limits}{$queue} >= $time ) { return $queue; }

    for my $q (sort {$$self{queue_limits}{$a} <=> $$self{queue_limits}{$b}} keys %{$$self{queue_limits}})
    {
        if ( $time > $$self{queue_limits}{$q} ) { next; }
        return $q;
    }
    return undef;
}

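# Determine a job's fate from its LSF output file "<prefix>.<id>.o": "Done" and
# "Exited" report lines set the status and failure count, while TERM_CHKPNT,
# TERM_OWNER and non-critical cr_restart exits (code 16) are not counted as
# failures. A missing file returns undef; an empty file is treated as a job
# that is still running.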
sub _parse_output
{
    my ($self,$jid,$output) = @_;
    my $fname = "$output.$jid.o";
    if ( !-e $fname ) { return undef; }

    # if the output file is empty, assume the job is running
    my $out = { status=>$$self{Running} };

    # collect command lines and exit status to detect non-critical
    # cr_restart exits
    my @attempts = ();

    open(my $fh,'<',$fname) or confess("$fname: $!");
    while (my $line=<$fh>)
    {
        # Subject: Job 822187: <_2215_1_graphs> Done
        if ( $line =~ /^Subject: Job.+\s+(\S+)$/ )
        {
            if ( $1 eq 'Exited' ) { $$out{status} = $$self{Error}; $$out{nfailures}++; }
            if ( $1 eq 'Done' ) { $$out{status} = $$self{Done}; $$out{nfailures} = 0; }
        }
        if ( $line =~ /^# LSBATCH:/ )
        {
            $line = <$fh>;
            my $cmd = substr($line,0,10);
            push @attempts, { cmd=>$cmd };
            next;
        }
        if ( $line =~ /^Exited with exit code (\d+)\./ )
        {
            if ( !scalar @attempts or exists($attempts[-1]{exit}) ) { warn("Uh, unable to parse $output.$jid.o\n"); next; }
            $attempts[-1]{exit} = $1;
        }
        # Do not count checkpoint and owner kills as a failure.
        if ( $line =~ /^TERM_CHKPNT/ && $$out{nfailures} ) { $$out{nfailures}--; }
        if ( $line =~ /^TERM_OWNER/ && $$out{nfailures} ) { $$out{nfailures}--; }
    }
    close($fh);

    for (my $i=0; $i<@attempts; $i++)
    {
        # cr_restart exited with a non-critical error
        if ( $attempts[$i]{cmd} eq 'cr_restart' && exists($attempts[$i]{exit}) && $attempts[$i]{exit} eq '16' )
        {
            $$out{nfailures}--;
        }
    }
    return $out;
}

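# Scan the LSF output file for resource usage: returns the peak 'memory' in MB
# and, if the job was killed by TERM_MEMLIMIT, the same value under 'MEMLIMIT'.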
sub past_limits
{
    my ($self,$jid,$output) = @_;
    my $fname = "$output.$jid.o";
    if ( ! -e $fname ) { return (); }
    open(my $fh,'<',$fname) or confess("$fname: $!");
    my (%out,$killed,$mem);
    while (my $line=<$fh>)
    {
        if ( $line=~/^TERM_MEMLIMIT:/ ) { $killed = 1; }
        elsif ( $line=~/^\s+Max Memory\s+:\s+(\S+)\s+(\S+)/ )
        {
            $mem = $1;
            if ($2 eq 'KB') { $mem /= 1024; }
            elsif ($2 eq 'GB') { $mem *= 1024; }

            if ( !exists($out{memory}) or $out{memory}<$mem )
            {
                $out{memory} = $mem;
                if ( $killed ) { $out{MEMLIMIT} = $mem; }
                else { delete($out{MEMLIMIT}); }
            }
        }
    }
    close($fh);
    return %out;
}

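# Ask lsadmin whether this LSF installation interprets memory limits in MB or
# kB (LSF_UNIT_FOR_LIMITS), retrying with a growing delay when lsadmin is
# temporarily unavailable.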
sub _set_lsf_limits_unit
{
    my ($self) = @_;
    if ( exists($$self{lsf_limits_unit}) ) { return; }
    for (my $i=2; $i<15; $i++)
    {
        my @units = grep { /LSF_UNIT_FOR_LIMITS/ } `lsadmin showconf lim 2>/dev/null`;
        if ( $? )
        {
            # lsadmin may be temporarily unavailable and return confusing errors:
            #   "Bad host name" or "ls_gethostinfo(): A socket operation has failed: Address already in use"
            print STDERR "lsadmin failed, trying again in $i sec...\n";
            sleep $i;
            next;
        }
        if ( @units && $units[0]=~/\s+MB$/ ) { $$self{lsf_limits_unit} = 'MB'; }
        else { $$self{lsf_limits_unit} = 'kB'; }
        return $$self{lsf_limits_unit};
    }
    confess("lsadmin showconf lim failed repeatedly");
}

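# Build the resource part of the bsub command line from the current limits:
# memory (-M plus select/rusage), queue (-q, chosen from the requested runtime)
# and the number of CPUs on a single host (-n with span[hosts=1]).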
sub _create_bsub_opts_string
{
    my ($self) = @_;

    # Set bsub options. By default request 1GB of memory, the queues require mem to be set explicitly
    my $bsub_opts = '';
    my $mem     = $$self{limits}{memory} ? int($$self{limits}{memory}) : $$self{default_limits}{memory};
    my $lmem    = $$self{lsf_limits_unit} eq 'kB' ? $mem*1000 : $mem;
    my $runtime = $$self{limits}{runtime} ? $$self{limits}{runtime} : $$self{default_limits}{runtime};
    my $queue   = $self->_get_queue($runtime);
    if ( !defined $queue ) { $queue = $$self{default_limits}{queue}; }

    $bsub_opts  = sprintf " -M%d -R 'select[type==X86_64 && mem>%d] rusage[mem=%d]'", $lmem,$mem,$mem;
    $bsub_opts .= " -q $queue";
    if ( defined($$self{limits}{cpus}) )
    {
        $bsub_opts .= " -n $$self{limits}{cpus} -R 'span[hosts=1]'";
    }
    return $bsub_opts;
}

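# Compress a list of runner IDs into an LSF job-array index string, for example
# 1,2,3,5,7,8,9 becomes "1-3,5,7-9". Ranges that would not fit within the LSF
# job-name length limit are put back into @$ids for a later submission.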
sub _create_bsub_ids_string
{
    my ($self,$job_name,$ids) = @_;

    # Process the list of IDs. The maximum job name length is 255 characters.
    my @ids = sort { $a<=>$b } @$ids;
    my @bsub_ids;
    my $from = $ids[0];
    my $prev = $from;
    for (my $i=1; $i<@ids; $i++)
    {
        my $id = $ids[$i];
        if ( $id != $prev+1 )
        {
            if ( $prev>$from ) { push @bsub_ids, "$from-$prev"; }
            else { push @bsub_ids, $from; }
            $from = $id;
            $prev = $id;
        }
        $prev = $id;
    }
    if ( $prev>$from ) { push @bsub_ids, "$from-$prev"; }
    else { push @bsub_ids, $from; }

    my $bsub_ids = join(',', @bsub_ids);
    my @skipped_bsub_ids;
    while ( length($job_name) + length($bsub_ids) > 250 && scalar @bsub_ids )
    {
        push @skipped_bsub_ids, pop(@bsub_ids);
        $bsub_ids = join(',', @bsub_ids);
    }
    @$ids = ();
    foreach my $bsub_id (@skipped_bsub_ids)
    {
        if ($bsub_id =~ m/(\d+)-(\d+)/) { push @$ids, ($1..$2); }
        else { push @$ids, $bsub_id; }
    }
    return $bsub_ids;
}

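# Run the prepared bsub command, check that it reported "Job <id> is submitted"
# and append a record (LSF id, job name, full bsub command) to the jids file.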
sub _bsub_command
{
    my ($self,$jids_file,$job_name,$bsub_cmd,$cmd) = @_;

    print STDERR "$bsub_cmd\n";
    my @out = `$bsub_cmd`;
    if ( scalar @out!=1 || !($out[0]=~/^Job <(\d+)> is submitted/) )
    {
        my $cwd = `pwd`;
        confess("Expected different output from bsub. The command was:\n\t$cmd\nThe bsub_command was:\n\t$bsub_cmd\nThe working directory was:\n\t$cwd\nThe output was:\n", @out);
    }

    # Write down info about the submitted command
    my $jid = $1;
    open(my $jids_fh, '>>', $jids_file) or confess("$jids_file: $!");
    print $jids_fh "$jid\t$job_name\t$bsub_cmd\n";
    close $jids_fh;
}

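# Submit $cmd as one or more LSF job arrays, one array index per runner ID.
# The {JOB_INDEX} placeholder in $cmd is replaced with $LSB_JOBINDEX; if the
# index string does not fit into the job name, the remaining IDs are submitted
# in further bsub calls.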
sub run_jobs
{
    my ($self,$jids_file,$job_name,$cmd,$ids) = @_;

    if ( !scalar @$ids ) { confess("No IDs given??\n"); }

    $cmd =~ s/{JOB_INDEX}/\$LSB_JOBINDEX/g;
    my $bsub_opts = $self->_create_bsub_opts_string();

    my @ids = @$ids;
    while ( @ids )
    {
        my $bsub_ids = $self->_create_bsub_ids_string($job_name,\@ids);

        # Do not allow the system to requeue jobs automatically, we would lose track of the job ID: -rn
        my $bsub_cmd = qq[bsub -rn -J '${job_name}[$bsub_ids]' -e $job_name.\%I.e -o $job_name.\%I.o $bsub_opts '$cmd'];

        # Submit to LSF
        $self->_bsub_command($jids_file,$job_name,$bsub_cmd,$cmd);
    }
}

1;
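
__END__

=head1 NAME

RunnerLSF - LSF back end for the Runner pipelines

=head1 SYNOPSIS

A minimal usage sketch. The file names, job name, command and limits below are
made up for illustration; the calling convention follows the methods defined
above.

    use RunnerLSF;

    my $lsf = RunnerLSF->new();
    $lsf->set_limits(runtime=>120, memory=>2_000, cpus=>2);

    # Submit tasks 1..10 as a job array; '{JOB_INDEX}' becomes $LSB_JOBINDEX
    $lsf->run_jobs('merge.jid', 'merge', 'process-chunk {JOB_INDEX}', [1..10]);

    # Later, poll the status of the same tasks
    my $jobs = $lsf->get_jobs('merge.jid', [1..10]);
    for my $job (@$jobs)
    {
        if ( $lsf->job_done($job) )    { next; }
        if ( $lsf->job_failed($job) )  { printf "failed %d times\n", $lsf->job_nfailures($job); }
        if ( $lsf->job_running($job) ) { print "still running\n"; }
    }

=cut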