/src/plugins/accounting_storage/filetxt/filetxt_jobacct_process.c

https://github.com/cfenoy/slurm · C · 1593 lines · 1341 code · 133 blank · 119 comment · 218 complexity · 1d8740044769f121300ff93a1a2c6bbf MD5 · raw file

  1. /*****************************************************************************\
  2. * filetxt_jobacct_process.c - functions for the processing of
  3. * information from the filetxt jobacct
  4. * storage.
  5. *****************************************************************************
  6. * Copyright (C) 2004-2007 The Regents of the University of California.
  7. * Copyright (C) 2008-2009 Lawrence Livermore National Security.
  8. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  9. * Written by Danny Auble <da@llnl.gov>
  10. *
  11. * This file is part of SLURM, a resource management program.
  12. * For details, see <http://www.schedmd.com/slurmdocs/>.
  13. * Please also read the included file: DISCLAIMER.
  14. *
  15. * SLURM is free software; you can redistribute it and/or modify it under
  16. * the terms of the GNU General Public License as published by the Free
  17. * Software Foundation; either version 2 of the License, or (at your option)
  18. * any later version.
  19. *
  20. * In addition, as a special exception, the copyright holders give permission
  21. * to link the code of portions of this program with the OpenSSL library under
  22. * certain conditions as described in each individual source file, and
  23. * distribute linked combinations including the two. You must obey the GNU
  24. * General Public License in all respects for all of the code used other than
  25. * OpenSSL. If you modify file(s) with this exception, you may extend this
  26. * exception to your version of the file(s), but you are not obligated to do
  27. * so. If you do not wish to do so, delete this exception statement from your
  28. * version. If you delete this exception statement from all source files in
  29. * the program, then also delete it here.
  30. *
  31. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  32. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  33. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  34. * details.
  35. *
  36. * You should have received a copy of the GNU General Public License along
  37. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  38. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  39. *
  40. * This file is patterned after jobcomp_linux.c, written by Morris Jette and
  41. * Copyright (C) 2002 The Regents of the University of California.
  42. \*****************************************************************************/
  43. #include <sys/resource.h> /* for struct rusage */
  44. #include <stdlib.h>
  45. #include <ctype.h>
  46. #include <sys/stat.h>
  47. #include "src/common/xstring.h"
  48. #include "src/common/xmalloc.h"
  49. #include "src/common/list.h"
  50. #include "filetxt_jobacct_process.h"
  51. #include "src/slurmctld/slurmctld.h"
  52. #include "src/slurmdbd/read_config.h"
  /*
   * Constants used while reading the accounting text file.
   *
   * Historical note kept from the original source: slurmd uses
   * "(uint32_t) -2" to track data for batch allocations which have no
   * logical jobsteps.
   */

  /* job_submit value that _find_job_record() treats as "not yet known" and
   * overwrites with the submit time of the record being looked up. */
  #define BATCH_JOB_TIMESTAMP 0

  /* NOTE(review): not referenced in the visible part of this file; presumably
   * a read-chunk size for the expire/archive code -- confirm. */
  #define EXPIRE_READ_LENGTH 10

  /* Upper bound on the number of fields split out of one record line. */
  #define MAX_RECORD_FIELDS 100
  /*
   * One entry in the table of expired jobs.
   */
  typedef struct expired_rec {
      uint32_t job;		/* job id; primary key in _cmp_jrec() */
      time_t job_submit;	/* submit time; compared when job ids match */
      char *line;		/* owned string, released by _destroy_exp() */
  } expired_rec_t;
  /*
   * Fields that open every record in the accounting file.  _parse_header()
   * fills in everything except rec_type.
   */
  typedef struct header {
      uint32_t jobnum;		/* job id */
      char *partition;		/* owned copy of the partition field */
      char *blockid;		/* owned copy of the BlockID field */
      time_t job_submit;	/* job submit time */
      time_t timestamp;		/* time stamp of this record */
      uint32_t uid;
      uint32_t gid;
      uint16_t rec_type;	/* record type code */
  } filetxt_header_t;
  74. typedef struct {
  75. uint32_t job_start_seen, /* useful flags */
  76. job_step_seen,
  77. job_terminated_seen,
  78. jobnum_superseded; /* older jobnum was reused */
  79. filetxt_header_t header;
  80. uint16_t show_full;
  81. char *nodes;
  82. char *jobname;
  83. uint16_t track_steps;
  84. int32_t priority;
  85. uint32_t ncpus;
  86. uint32_t ntasks;
  87. enum job_states status;
  88. int32_t exitcode;
  89. uint32_t elapsed;
  90. time_t end;
  91. uint32_t tot_cpu_sec;
  92. uint32_t tot_cpu_usec;
  93. struct rusage rusage;
  94. slurmdb_stats_t stats;
  95. List steps;
  96. char *account;
  97. uint32_t requid;
  98. } filetxt_job_rec_t;
  99. typedef struct {
  100. filetxt_header_t header;
  101. uint32_t stepnum; /* job's step number */
  102. char *nodes;
  103. char *stepname;
  104. enum job_states status;
  105. int32_t exitcode;
  106. uint32_t ntasks;
  107. uint32_t ncpus;
  108. uint32_t elapsed;
  109. time_t end;
  110. uint32_t tot_cpu_sec;
  111. uint32_t tot_cpu_usec;
  112. struct rusage rusage;
  113. slurmdb_stats_t stats;
  114. char *account;
  115. uint32_t requid;
  116. } filetxt_step_rec_t;
  /*
   * Positions of the whitespace-separated fields within a record.  Every
   * record starts with the HEADER_LENGTH common fields; the type-specific
   * fields follow and are numbered from HEADER_LENGTH.
   */

  /* Fields common to all records */
  enum {
      F_JOB = 0,
      F_PARTITION,
      F_JOB_SUBMIT,
      F_TIMESTAMP,
      F_UID,
      F_GID,
      F_BLOCKID,
      F_RESERVED2,
      F_RECTYPE,
      HEADER_LENGTH		/* number of common fields */
  };

  /* JOB_START fields */
  enum {
      F_JOBNAME = HEADER_LENGTH,
      F_TRACK_STEPS,
      F_PRIORITY,
      F_NCPUS,
      F_NODES,
      F_JOB_ACCOUNT,
      JOB_START_LENGTH	/* one past the last JOB_START field */
  };

  /* JOB_STEP fields */
  enum {
      F_JOBSTEP = HEADER_LENGTH,
      F_STATUS,
      F_EXITCODE,
      F_NTASKS,
      F_STEPNCPUS,
      F_ELAPSED,
      /* cpu time totals */
      F_CPU_SEC,
      F_CPU_USEC,
      F_USER_SEC,
      F_USER_USEC,
      F_SYS_SEC,
      F_SYS_USEC,
      /* struct rusage counters */
      F_RSS,
      F_IXRSS,
      F_IDRSS,
      F_ISRSS,
      F_MINFLT,
      F_MAJFLT,
      F_NSWAP,
      F_INBLOCKS,
      F_OUBLOCKS,
      F_MSGSND,
      F_MSGRCV,
      F_NSIGNALS,
      F_NVCSW,
      F_NIVCSW,
      /* per-step statistics */
      F_MAX_VSIZE,
      F_MAX_VSIZE_TASK,
      F_AVE_VSIZE,
      F_MAX_RSS,
      F_MAX_RSS_TASK,
      F_AVE_RSS,
      F_MAX_PAGES,
      F_MAX_PAGES_TASK,
      F_AVE_PAGES,
      F_MIN_CPU,
      F_MIN_CPU_TASK,
      F_AVE_CPU,
      F_STEPNAME,
      F_STEPNODES,
      /* _parse_line() treats the fields below as optional */
      F_MAX_VSIZE_NODE,
      F_MAX_RSS_NODE,
      F_MAX_PAGES_NODE,
      F_MIN_CPU_NODE,
      F_STEP_ACCOUNT,
      F_STEP_REQUID,
      JOB_STEP_LENGTH	/* one past the last JOB_STEP field */
  };

  /* JOB_TERM / JOB_SUSPEND fields */
  enum {
      F_TOT_ELAPSED = HEADER_LENGTH,
      F_TERM_STATUS,
      F_JOB_REQUID,
      F_JOB_EXITCODE,
      JOB_TERM_LENGTH	/* one past the last JOB_TERM field */
  };
  194. static void _destroy_exp(void *object)
  195. {
  196. expired_rec_t *exp_rec = (expired_rec_t *)object;
  197. if(exp_rec) {
  198. xfree(exp_rec->line);
  199. xfree(exp_rec);
  200. }
  201. }
  202. static void _free_filetxt_header(void *object)
  203. {
  204. filetxt_header_t *header = (filetxt_header_t *)object;
  205. if(header) {
  206. xfree(header->partition);
  207. #ifdef HAVE_BG
  208. xfree(header->blockid);
  209. #endif
  210. }
  211. }
  212. static void _destroy_filetxt_job_rec(void *object)
  213. {
  214. filetxt_job_rec_t *job = (filetxt_job_rec_t *)object;
  215. if (job) {
  216. if(job->steps)
  217. list_destroy(job->steps);
  218. _free_filetxt_header(&job->header);
  219. xfree(job->jobname);
  220. xfree(job->account);
  221. xfree(job->nodes);
  222. xfree(job);
  223. }
  224. }
  225. static void _destroy_filetxt_step_rec(void *object)
  226. {
  227. filetxt_step_rec_t *step = (filetxt_step_rec_t *)object;
  228. if (step) {
  229. _free_filetxt_header(&step->header);
  230. xfree(step->stepname);
  231. xfree(step->nodes);
  232. xfree(step->account);
  233. xfree(step);
  234. }
  235. }
  236. static slurmdb_step_rec_t *_slurmdb_create_step_rec(
  237. filetxt_step_rec_t *filetxt_step)
  238. {
  239. slurmdb_step_rec_t *slurmdb_step = slurmdb_create_step_rec();
  240. slurmdb_step->elapsed = filetxt_step->elapsed;
  241. slurmdb_step->end = filetxt_step->header.timestamp;
  242. slurmdb_step->exitcode = filetxt_step->exitcode;
  243. slurmdb_step->ncpus = filetxt_step->ncpus;
  244. if(filetxt_step->nodes) {
  245. hostlist_t hl = hostlist_create(filetxt_step->nodes);
  246. slurmdb_step->nnodes = hostlist_count(hl);
  247. hostlist_destroy(hl);
  248. }
  249. slurmdb_step->nodes = xstrdup(filetxt_step->nodes);
  250. slurmdb_step->requid = filetxt_step->requid;
  251. memcpy(&slurmdb_step->stats, &filetxt_step->stats,
  252. sizeof(slurmdb_stats_t));
  253. slurmdb_step->start = filetxt_step->header.timestamp -
  254. slurmdb_step->elapsed;
  255. slurmdb_step->state = filetxt_step->status;
  256. slurmdb_step->stepid = filetxt_step->stepnum;
  257. slurmdb_step->stepname = xstrdup(filetxt_step->stepname);
  258. slurmdb_step->sys_cpu_sec = filetxt_step->rusage.ru_stime.tv_sec;
  259. slurmdb_step->sys_cpu_usec = filetxt_step->rusage.ru_stime.tv_usec;
  260. slurmdb_step->tot_cpu_sec = filetxt_step->tot_cpu_sec;
  261. slurmdb_step->tot_cpu_usec = filetxt_step->tot_cpu_usec;
  262. slurmdb_step->user_cpu_sec = filetxt_step->rusage.ru_utime.tv_sec;
  263. slurmdb_step->user_cpu_usec = filetxt_step->rusage.ru_utime.tv_usec;
  264. return slurmdb_step;
  265. }
  266. static slurmdb_job_rec_t *_slurmdb_create_job_rec(
  267. filetxt_job_rec_t *filetxt_job, slurmdb_job_cond_t *job_cond)
  268. {
  269. slurmdb_job_rec_t *slurmdb_job = NULL;
  270. ListIterator itr = NULL;
  271. filetxt_step_rec_t *filetxt_step = NULL;
  272. if(!job_cond)
  273. goto no_cond;
  274. if (job_cond->state_list
  275. && list_count(job_cond->state_list)) {
  276. char *object = NULL;
  277. itr = list_iterator_create(job_cond->state_list);
  278. while((object = list_next(itr))) {
  279. if (atoi(object) == filetxt_job->status) {
  280. list_iterator_destroy(itr);
  281. goto foundstate;
  282. }
  283. }
  284. list_iterator_destroy(itr);
  285. return NULL; /* no match */
  286. }
  287. foundstate:
  288. no_cond:
  289. slurmdb_job = slurmdb_create_job_rec();
  290. slurmdb_job->associd = 0;
  291. slurmdb_job->account = xstrdup(filetxt_job->account);
  292. slurmdb_job->blockid = xstrdup(filetxt_job->header.blockid);
  293. slurmdb_job->cluster = NULL;
  294. slurmdb_job->elapsed = filetxt_job->elapsed;
  295. slurmdb_job->eligible = filetxt_job->header.job_submit;
  296. slurmdb_job->end = filetxt_job->header.timestamp;
  297. slurmdb_job->exitcode = filetxt_job->exitcode;
  298. slurmdb_job->gid = filetxt_job->header.gid;
  299. slurmdb_job->jobid = filetxt_job->header.jobnum;
  300. slurmdb_job->jobname = xstrdup(filetxt_job->jobname);
  301. slurmdb_job->partition = xstrdup(filetxt_job->header.partition);
  302. slurmdb_job->req_cpus = filetxt_job->ncpus;
  303. slurmdb_job->alloc_cpus = filetxt_job->ncpus;
  304. if(filetxt_job->nodes) {
  305. hostlist_t hl = hostlist_create(filetxt_job->nodes);
  306. slurmdb_job->alloc_nodes = hostlist_count(hl);
  307. hostlist_destroy(hl);
  308. }
  309. slurmdb_job->nodes = xstrdup(filetxt_job->nodes);
  310. slurmdb_job->priority = filetxt_job->priority;
  311. slurmdb_job->requid = filetxt_job->requid;
  312. memcpy(&slurmdb_job->stats, &filetxt_job->stats,
  313. sizeof(slurmdb_stats_t));
  314. slurmdb_job->show_full = filetxt_job->show_full;
  315. slurmdb_job->start = filetxt_job->header.timestamp -
  316. slurmdb_job->elapsed;
  317. slurmdb_job->state = filetxt_job->status;
  318. slurmdb_job->steps = list_create(slurmdb_destroy_step_rec);
  319. if(filetxt_job->steps) {
  320. itr = list_iterator_create(filetxt_job->steps);
  321. while((filetxt_step = list_next(itr))) {
  322. slurmdb_step_rec_t *step =
  323. _slurmdb_create_step_rec(filetxt_step);
  324. if(step) {
  325. step->job_ptr = slurmdb_job;
  326. if(!slurmdb_job->first_step_ptr)
  327. slurmdb_job->first_step_ptr = step;
  328. list_append(slurmdb_job->steps, step);
  329. }
  330. }
  331. list_iterator_destroy(itr);
  332. }
  333. slurmdb_job->submit = filetxt_job->header.job_submit;
  334. slurmdb_job->sys_cpu_sec = filetxt_job->rusage.ru_stime.tv_sec;
  335. slurmdb_job->sys_cpu_usec = filetxt_job->rusage.ru_stime.tv_usec;
  336. slurmdb_job->tot_cpu_sec = filetxt_job->tot_cpu_sec;
  337. slurmdb_job->tot_cpu_usec = filetxt_job->tot_cpu_usec;
  338. slurmdb_job->track_steps = filetxt_job->track_steps;
  339. slurmdb_job->uid = filetxt_job->header.uid;
  340. slurmdb_job->user = NULL;
  341. slurmdb_job->user_cpu_sec = filetxt_job->rusage.ru_utime.tv_sec;
  342. slurmdb_job->user_cpu_usec = filetxt_job->rusage.ru_utime.tv_usec;
  343. return slurmdb_job;
  344. }
  345. static filetxt_job_rec_t *_create_filetxt_job_rec(filetxt_header_t header)
  346. {
  347. filetxt_job_rec_t *job = xmalloc(sizeof(filetxt_job_rec_t));
  348. memcpy(&job->header, &header, sizeof(filetxt_header_t));
  349. memset(&job->rusage, 0, sizeof(struct rusage));
  350. memset(&job->stats, 0, sizeof(slurmdb_stats_t));
  351. job->stats.cpu_min = NO_VAL;
  352. job->job_start_seen = 0;
  353. job->job_step_seen = 0;
  354. job->job_terminated_seen = 0;
  355. job->jobnum_superseded = 0;
  356. job->jobname = NULL;
  357. job->status = JOB_PENDING;
  358. job->nodes = NULL;
  359. job->jobname = NULL;
  360. job->exitcode = 0;
  361. job->priority = 0;
  362. job->ntasks = 0;
  363. job->ncpus = 0;
  364. job->elapsed = 0;
  365. job->tot_cpu_sec = 0;
  366. job->tot_cpu_usec = 0;
  367. job->steps = list_create(_destroy_filetxt_step_rec);
  368. job->nodes = NULL;
  369. job->track_steps = 0;
  370. job->account = NULL;
  371. job->requid = -1;
  372. return job;
  373. }
  374. static filetxt_step_rec_t *_create_filetxt_step_rec(filetxt_header_t header)
  375. {
  376. filetxt_step_rec_t *step = xmalloc(sizeof(filetxt_job_rec_t));
  377. memcpy(&step->header, &header, sizeof(filetxt_header_t));
  378. memset(&step->rusage, 0, sizeof(struct rusage));
  379. memset(&step->stats, 0, sizeof(slurmdb_stats_t));
  380. step->stepnum = (uint32_t)NO_VAL;
  381. step->nodes = NULL;
  382. step->stepname = NULL;
  383. step->status = NO_VAL;
  384. step->exitcode = NO_VAL;
  385. step->ntasks = (uint32_t)NO_VAL;
  386. step->ncpus = (uint32_t)NO_VAL;
  387. step->elapsed = (uint32_t)NO_VAL;
  388. step->tot_cpu_sec = (uint32_t)NO_VAL;
  389. step->tot_cpu_usec = (uint32_t)NO_VAL;
  390. step->account = NULL;
  391. step->requid = -1;
  392. return step;
  393. }
  /* _prefix_filename() -- insert a prefix in front of the file name part
   * of a path
   *
   * IN: path = path+file name; everything after the last '/' (or the whole
   *            string when there is no '/') is taken as the file name
   *     prefix = the prefix to insert in front of the file name
   * RETURNS: newly allocated string holding the updated path+file name
   */
  static char *_prefix_filename(char *path, char *prefix) {
      char *last_slash = strrchr(path, '/');
      /* length of the directory part, including the trailing '/' */
      int dir_len = last_slash ? (int)(last_slash - path) + 1 : 0;
      char *out = xmalloc(strlen(path) + strlen(prefix) + 1);

      memcpy(out, path, dir_len);
      strcpy(out + dir_len, prefix);
      strcat(out, path + dir_len);

      return out;
  }
  /* _open_log_file() -- open an accounting log file for reading
   *
   * IN: logfile = name of the file to open
   * RETURNS: the open stream; does not return on failure
   *
   * Side effects:
   *	- When the file cannot be opened the reason is reported with
   *	  perror() and the process exits with status 1.
   */
  static FILE *_open_log_file(char *logfile)
  {
      FILE *stream = fopen(logfile, "r");

      if (stream != NULL)
          return stream;

      perror(logfile);
      exit(1);
  }
  436. static char *_convert_type(int rec_type)
  437. {
  438. switch(rec_type) {
  439. case JOB_START:
  440. return "JOB_START";
  441. case JOB_STEP:
  442. return "JOB_STEP";
  443. case JOB_TERMINATED:
  444. return "JOB_TERMINATED";
  445. default:
  446. return "UNKNOWN";
  447. }
  448. }
  449. static int _cmp_jrec(const void *a1, const void *a2) {
  450. expired_rec_t *j1 = (expired_rec_t *) a1;
  451. expired_rec_t *j2 = (expired_rec_t *) a2;
  452. if (j1->job < j2->job)
  453. return -1;
  454. else if (j1->job == j2->job) {
  455. if(j1->job_submit == j2->job_submit)
  456. return 0;
  457. else
  458. return 1;
  459. }
  460. return 1;
  461. }
  /*
   * Print the fields of a record on one line of stderr, for diagnostics.
   *
   * IN: f = NULL-terminated array of field strings
   */
  static void _show_rec(char *f[])
  {
      char **field = f;

      fprintf(stderr, "rec>");
      while (*field) {
          fprintf(stderr, " %s", *field);
          field++;
      }
      fprintf(stderr, "\n");
  }
  471. static void _do_fdump(char* f[], int lc)
  472. {
  473. int i=0, j=0;
  474. char **type;
  475. char *header[] = {"job", /* F_JOB */
  476. "partition", /* F_PARTITION */
  477. "job_submit", /* F_JOB_SUBMIT */
  478. "timestamp", /* F_TIMESTAMP */
  479. "uid", /* F_UIDGID */
  480. "gid", /* F_UIDGID */
  481. "BlockID", /* F_BLOCKID */
  482. "reserved-2",/* F_RESERVED1 */
  483. "recordType",/* F_RECTYPE */
  484. NULL};
  485. char *start[] = {"jobName", /* F_JOBNAME */
  486. "TrackSteps", /* F_TRACK_STEPS */
  487. "priority", /* F_PRIORITY */
  488. "ncpus", /* F_NCPUS */
  489. "nodeList", /* F_NODES */
  490. "account", /* F_JOB_ACCOUNT */
  491. NULL};
  492. char *step[] = {"jobStep", /* F_JOBSTEP */
  493. "status", /* F_STATUS */
  494. "exitcode", /* F_EXITCODE */
  495. "ntasks", /* F_NTASKS */
  496. "ncpus", /* F_STEPNCPUS */
  497. "elapsed", /* F_ELAPSED */
  498. "cpu_sec", /* F_CPU_SEC */
  499. "cpu_usec", /* F_CPU_USEC */
  500. "user_sec", /* F_USER_SEC */
  501. "user_usec", /* F_USER_USEC */
  502. "sys_sec", /* F_SYS_SEC */
  503. "sys_usec", /* F_SYS_USEC */
  504. "rss", /* F_RSS */
  505. "ixrss", /* F_IXRSS */
  506. "idrss", /* F_IDRSS */
  507. "isrss", /* F_ISRSS */
  508. "minflt", /* F_MINFLT */
  509. "majflt", /* F_MAJFLT */
  510. "nswap", /* F_NSWAP */
  511. "inblocks", /* F_INBLOCKS */
  512. "oublocks", /* F_OUTBLOCKS */
  513. "msgsnd", /* F_MSGSND */
  514. "msgrcv", /* F_MSGRCV */
  515. "nsignals", /* F_NSIGNALS */
  516. "nvcsw", /* F_VCSW */
  517. "nivcsw", /* F_NIVCSW */
  518. "max_vsize", /* F_MAX_VSIZE */
  519. "max_vsize_task", /* F_MAX_VSIZE_TASK */
  520. "ave_vsize", /* F_AVE_VSIZE */
  521. "max_rss", /* F_MAX_RSS */
  522. "max_rss_task", /* F_MAX_RSS_TASK */
  523. "ave_rss", /* F_AVE_RSS */
  524. "max_pages", /* F_MAX_PAGES */
  525. "max_pages_task", /* F_MAX_PAGES_TASK */
  526. "ave_pages", /* F_AVE_PAGES */
  527. "min_cputime", /* F_MIN_CPU */
  528. "min_cputime_task", /* F_MIN_CPU_TASK */
  529. "ave_cputime", /* F_AVE_RSS */
  530. "StepName", /* F_STEPNAME */
  531. "StepNodes", /* F_STEPNODES */
  532. "max_vsize_node", /* F_MAX_VSIZE_NODE */
  533. "max_rss_node", /* F_MAX_RSS_NODE */
  534. "max_pages_node", /* F_MAX_PAGES_NODE */
  535. "min_cputime_node", /* F_MIN_CPU_NODE */
  536. "account", /* F_STEP_ACCOUNT */
  537. "requid", /* F_STEP_REQUID */
  538. NULL};
  539. char *suspend[] = {"Suspend/Run time", /* F_TOT_ELAPSED */
  540. "status", /* F_STATUS */
  541. NULL};
  542. char *term[] = {"totElapsed", /* F_TOT_ELAPSED */
  543. "status", /* F_STATUS */
  544. "requid", /* F_JOB_REQUID */
  545. "exitcode", /* F_EXITCODE */
  546. NULL};
  547. i = atoi(f[F_RECTYPE]);
  548. printf("\n------- Line %d %s -------\n", lc, _convert_type(i));
  549. for(j=0; j < HEADER_LENGTH; j++)
  550. printf("%12s: %s\n", header[j], f[j]);
  551. switch(i) {
  552. case JOB_START:
  553. type = start;
  554. j = JOB_START_LENGTH;
  555. break;
  556. case JOB_STEP:
  557. type = step;
  558. j = JOB_STEP_LENGTH;
  559. break;
  560. case JOB_SUSPEND:
  561. type = suspend;
  562. j = JOB_TERM_LENGTH;
  563. case JOB_TERMINATED:
  564. type = term;
  565. j = JOB_TERM_LENGTH;
  566. break;
  567. default:
  568. while(f[j]) {
  569. printf(" Field[%02d]: %s\n", j, f[j]);
  570. j++;
  571. }
  572. return;
  573. }
  574. for(i=HEADER_LENGTH; i < j; i++)
  575. printf("%12s: %s\n", type[i-HEADER_LENGTH], f[i]);
  576. }
  577. static filetxt_job_rec_t *_find_job_record(List job_list,
  578. filetxt_header_t header,
  579. int type)
  580. {
  581. filetxt_job_rec_t *job = NULL;
  582. ListIterator itr = list_iterator_create(job_list);
  583. while((job = (filetxt_job_rec_t *)list_next(itr)) != NULL) {
  584. if (job->header.jobnum == header.jobnum) {
  585. if(job->header.job_submit == 0 && type == JOB_START) {
  586. list_remove(itr);
  587. _destroy_filetxt_job_rec(job);
  588. job = NULL;
  589. break;
  590. }
  591. if(job->header.job_submit == BATCH_JOB_TIMESTAMP) {
  592. job->header.job_submit = header.job_submit;
  593. break;
  594. }
  595. if(job->header.job_submit == header.job_submit)
  596. break;
  597. else {
  598. /* If we're looking for a later
  599. * record with this job number, we
  600. * know that this one is an older,
  601. * duplicate record.
  602. * We assume that the newer record
  603. * will be created if it doesn't
  604. * already exist. */
  605. job->jobnum_superseded = 1;
  606. }
  607. }
  608. }
  609. list_iterator_destroy(itr);
  610. return job;
  611. }
  612. static filetxt_step_rec_t *_find_step_record(filetxt_job_rec_t *job,
  613. long stepnum)
  614. {
  615. filetxt_step_rec_t *step = NULL;
  616. ListIterator itr = NULL;
  617. if(!list_count(job->steps))
  618. return step;
  619. itr = list_iterator_create(job->steps);
  620. while((step = (filetxt_step_rec_t *)list_next(itr)) != NULL) {
  621. if (step->stepnum == stepnum)
  622. break;
  623. }
  624. list_iterator_destroy(itr);
  625. return step;
  626. }
  627. static int _parse_header(char *f[], filetxt_header_t *header)
  628. {
  629. header->jobnum = atoi(f[F_JOB]);
  630. header->partition = xstrdup(f[F_PARTITION]);
  631. header->job_submit = atoi(f[F_JOB_SUBMIT]);
  632. header->timestamp = atoi(f[F_TIMESTAMP]);
  633. header->uid = atoi(f[F_UID]);
  634. header->gid = atoi(f[F_GID]);
  635. header->blockid = xstrdup(f[F_BLOCKID]);
  636. return SLURM_SUCCESS;
  637. }
  638. static int _parse_line(char *f[], void **data, int len)
  639. {
  640. int i = atoi(f[F_RECTYPE]);
  641. filetxt_job_rec_t **job = (filetxt_job_rec_t **)data;
  642. filetxt_step_rec_t **step = (filetxt_step_rec_t **)data;
  643. filetxt_header_t header;
  644. _parse_header(f, &header);
  645. switch(i) {
  646. case JOB_START:
  647. *job = _create_filetxt_job_rec(header);
  648. (*job)->jobname = xstrdup(f[F_JOBNAME]);
  649. (*job)->track_steps = atoi(f[F_TRACK_STEPS]);
  650. (*job)->priority = atoi(f[F_PRIORITY]);
  651. (*job)->ncpus = atoi(f[F_NCPUS]);
  652. (*job)->nodes = xstrdup(f[F_NODES]);
  653. for (i=0; (*job)->nodes[i]; i++) { /* discard trailing <CR> */
  654. if (isspace((*job)->nodes[i]))
  655. (*job)->nodes[i] = '\0';
  656. }
  657. if (!strcmp((*job)->nodes, "(null)")) {
  658. xfree((*job)->nodes);
  659. (*job)->nodes = xstrdup("(unknown)");
  660. }
  661. if (len > F_JOB_ACCOUNT) {
  662. (*job)->account = xstrdup(f[F_JOB_ACCOUNT]);
  663. for (i=0; (*job)->account[i]; i++) {
  664. /* discard trailing <CR> */
  665. if (isspace((*job)->account[i]))
  666. (*job)->account[i] = '\0';
  667. }
  668. }
  669. break;
  670. case JOB_STEP:
  671. *step = _create_filetxt_step_rec(header);
  672. (*step)->stepnum = atoi(f[F_JOBSTEP]);
  673. (*step)->status = atoi(f[F_STATUS]);
  674. (*step)->exitcode = atoi(f[F_EXITCODE]);
  675. (*step)->ntasks = atoi(f[F_NTASKS]);
  676. (*step)->ncpus = atoi(f[F_STEPNCPUS]);
  677. (*step)->elapsed = atoi(f[F_ELAPSED]);
  678. (*step)->tot_cpu_sec = atoi(f[F_CPU_SEC]);
  679. (*step)->tot_cpu_usec = atoi(f[F_CPU_USEC]);
  680. (*step)->rusage.ru_utime.tv_sec = atoi(f[F_USER_SEC]);
  681. (*step)->rusage.ru_utime.tv_usec = atoi(f[F_USER_USEC]);
  682. (*step)->rusage.ru_stime.tv_sec = atoi(f[F_SYS_SEC]);
  683. (*step)->rusage.ru_stime.tv_usec = atoi(f[F_SYS_USEC]);
  684. (*step)->rusage.ru_maxrss = atoi(f[F_RSS]);
  685. (*step)->rusage.ru_ixrss = atoi(f[F_IXRSS]);
  686. (*step)->rusage.ru_idrss = atoi(f[F_IDRSS]);
  687. (*step)->rusage.ru_isrss = atoi(f[F_ISRSS]);
  688. (*step)->rusage.ru_minflt = atoi(f[F_MINFLT]);
  689. (*step)->rusage.ru_majflt = atoi(f[F_MAJFLT]);
  690. (*step)->rusage.ru_nswap = atoi(f[F_NSWAP]);
  691. (*step)->rusage.ru_inblock = atoi(f[F_INBLOCKS]);
  692. (*step)->rusage.ru_oublock = atoi(f[F_OUBLOCKS]);
  693. (*step)->rusage.ru_msgsnd = atoi(f[F_MSGSND]);
  694. (*step)->rusage.ru_msgrcv = atoi(f[F_MSGRCV]);
  695. (*step)->rusage.ru_nsignals = atoi(f[F_NSIGNALS]);
  696. (*step)->rusage.ru_nvcsw = atoi(f[F_NVCSW]);
  697. (*step)->rusage.ru_nivcsw = atoi(f[F_NIVCSW]);
  698. (*step)->stats.vsize_max = atoi(f[F_MAX_VSIZE]);
  699. if(len > F_STEPNODES) {
  700. (*step)->stats.vsize_max_taskid =
  701. atoi(f[F_MAX_VSIZE_TASK]);
  702. (*step)->stats.vsize_ave = atof(f[F_AVE_VSIZE]);
  703. (*step)->stats.rss_max = atoi(f[F_MAX_RSS]);
  704. (*step)->stats.rss_max_taskid =
  705. atoi(f[F_MAX_RSS_TASK]);
  706. (*step)->stats.rss_ave = atof(f[F_AVE_RSS]);
  707. (*step)->stats.pages_max = atoi(f[F_MAX_PAGES]);
  708. (*step)->stats.pages_max_taskid =
  709. atoi(f[F_MAX_PAGES_TASK]);
  710. (*step)->stats.pages_ave = atof(f[F_AVE_PAGES]);
  711. (*step)->stats.cpu_min = atoi(f[F_MIN_CPU]);
  712. (*step)->stats.cpu_min_taskid =
  713. atoi(f[F_MIN_CPU_TASK]);
  714. (*step)->stats.cpu_ave = atof(f[F_AVE_CPU]);
  715. (*step)->stepname = xstrdup(f[F_STEPNAME]);
  716. (*step)->nodes = xstrdup(f[F_STEPNODES]);
  717. } else {
  718. (*step)->stats.vsize_max_taskid = (uint16_t)NO_VAL;
  719. (*step)->stats.vsize_ave = (float)NO_VAL;
  720. (*step)->stats.rss_max = NO_VAL;
  721. (*step)->stats.rss_max_taskid = (uint16_t)NO_VAL;
  722. (*step)->stats.rss_ave = (float)NO_VAL;
  723. (*step)->stats.pages_max = NO_VAL;
  724. (*step)->stats.pages_max_taskid = (uint16_t)NO_VAL;
  725. (*step)->stats.pages_ave = (float)NO_VAL;
  726. (*step)->stats.cpu_min = NO_VAL;
  727. (*step)->stats.cpu_min_taskid = (uint16_t)NO_VAL;
  728. (*step)->stats.cpu_ave = (float)NO_VAL;
  729. (*step)->stepname = NULL;
  730. (*step)->nodes = NULL;
  731. }
  732. if(len > F_MIN_CPU_NODE) {
  733. (*step)->stats.vsize_max_nodeid =
  734. atoi(f[F_MAX_VSIZE_NODE]);
  735. (*step)->stats.rss_max_nodeid =
  736. atoi(f[F_MAX_RSS_NODE]);
  737. (*step)->stats.pages_max_nodeid =
  738. atoi(f[F_MAX_PAGES_NODE]);
  739. (*step)->stats.cpu_min_nodeid =
  740. atoi(f[F_MIN_CPU_NODE]);
  741. } else {
  742. (*step)->stats.vsize_max_nodeid = NO_VAL;
  743. (*step)->stats.rss_max_nodeid = NO_VAL;
  744. (*step)->stats.pages_max_nodeid = NO_VAL;
  745. (*step)->stats.cpu_min_nodeid = NO_VAL;
  746. }
  747. if(len > F_STEP_ACCOUNT)
  748. (*step)->account = xstrdup(f[F_STEP_ACCOUNT]);
  749. if(len > F_STEP_REQUID)
  750. (*step)->requid = atoi(f[F_STEP_REQUID]);
  751. break;
  752. case JOB_SUSPEND:
  753. case JOB_TERMINATED:
  754. *job = _create_filetxt_job_rec(header);
  755. (*job)->elapsed = atoi(f[F_TOT_ELAPSED]);
  756. (*job)->status = atoi(f[F_STATUS]);
  757. if(len > F_JOB_REQUID)
  758. (*job)->requid = atoi(f[F_JOB_REQUID]);
  759. if(len > F_JOB_EXITCODE)
  760. (*job)->exitcode = atoi(f[F_JOB_EXITCODE]);
  761. break;
  762. default:
  763. error("UNKNOWN TYPE %d",i);
  764. break;
  765. }
  766. return SLURM_SUCCESS;
  767. }
  768. static void _process_start(List job_list, char *f[], int lc,
  769. int show_full, int len)
  770. {
  771. filetxt_job_rec_t *job = NULL;
  772. filetxt_job_rec_t *temp = NULL;
  773. _parse_line(f, (void **)&temp, len);
  774. job = _find_job_record(job_list, temp->header, JOB_START);
  775. if (job) {
  776. /* in slurm we can get 2 start records one for submit
  777. * and one for start, so look at the last one */
  778. xfree(job->jobname);
  779. job->jobname = xstrdup(temp->jobname);
  780. job->track_steps = temp->track_steps;
  781. job->priority = temp->priority;
  782. job->ncpus = temp->ncpus;
  783. xfree(job->nodes);
  784. job->nodes = xstrdup(temp->nodes);
  785. xfree(job->account);
  786. job->account = xstrdup(temp->account);
  787. _destroy_filetxt_job_rec(temp);
  788. return;
  789. }
  790. job = temp;
  791. job->show_full = show_full;
  792. list_append(job_list, job);
  793. job->job_start_seen = 1;
  794. }
  795. static void _process_step(List job_list, char *f[], int lc,
  796. int show_full, int len)
  797. {
  798. filetxt_job_rec_t *job = NULL;
  799. filetxt_step_rec_t *step = NULL;
  800. filetxt_step_rec_t *temp = NULL;
  801. _parse_line(f, (void **)&temp, len);
  802. job = _find_job_record(job_list, temp->header, JOB_STEP);
  803. if (temp->stepnum == -2) {
  804. _destroy_filetxt_step_rec(temp);
  805. return;
  806. }
  807. if (!job) { /* fake it for now */
  808. job = _create_filetxt_job_rec(temp->header);
  809. job->jobname = xstrdup("(unknown)");
  810. debug2("Note: JOB_STEP record %u.%u preceded "
  811. "JOB_START record at line %d\n",
  812. temp->header.jobnum, temp->stepnum, lc);
  813. }
  814. job->show_full = show_full;
  815. if ((step = _find_step_record(job, temp->stepnum))) {
  816. if (temp->status == JOB_RUNNING) {
  817. _destroy_filetxt_step_rec(temp);
  818. return;/* if "R" record preceded by F or CD; unusual */
  819. }
  820. if (step->status != JOB_RUNNING) { /* if not JOB_RUNNING */
  821. fprintf(stderr,
  822. "Conflicting JOB_STEP record for "
  823. "jobstep %u.%u at line %d "
  824. "-- ignoring it\n",
  825. step->header.jobnum,
  826. step->stepnum, lc);
  827. _destroy_filetxt_step_rec(temp);
  828. return;
  829. }
  830. step->status = temp->status;
  831. step->exitcode = temp->exitcode;
  832. step->ntasks = temp->ntasks;
  833. step->ncpus = temp->ncpus;
  834. step->elapsed = temp->elapsed;
  835. step->tot_cpu_sec = temp->tot_cpu_sec;
  836. step->tot_cpu_usec = temp->tot_cpu_usec;
  837. job->requid = temp->requid;
  838. step->requid = temp->requid;
  839. memcpy(&step->rusage, &temp->rusage, sizeof(struct rusage));
  840. memcpy(&step->stats, &temp->stats, sizeof(slurmdb_stats_t));
  841. xfree(step->stepname);
  842. step->stepname = xstrdup(temp->stepname);
  843. step->end = temp->header.timestamp;
  844. _destroy_filetxt_step_rec(temp);
  845. goto got_step;
  846. }
  847. step = temp;
  848. temp = NULL;
  849. list_append(job->steps, step);
  850. if(!job->track_steps) {
  851. /* If we don't have track_steps we want to see
  852. if we have multiple steps. If we only have
  853. 1 step check the job name against the step
  854. name in most all cases it will be
  855. different. If it is different print out
  856. the step separate.
  857. */
  858. if(list_count(job->steps) > 1)
  859. job->track_steps = 1;
  860. else if(step && step->stepname && job->jobname) {
  861. if(strcmp(step->stepname, job->jobname))
  862. job->track_steps = 1;
  863. }
  864. }
  865. if(job->header.timestamp == 0)
  866. job->header.timestamp = step->header.timestamp;
  867. job->job_step_seen = 1;
  868. job->ntasks += step->ntasks;
  869. if(!job->nodes || !strcmp(job->nodes, "(unknown)")) {
  870. xfree(job->nodes);
  871. job->nodes = xstrdup(step->nodes);
  872. }
  873. got_step:
  874. if (job->job_terminated_seen == 0) { /* If the job is still running,
  875. this is the most recent
  876. status */
  877. if ( job->exitcode == 0 )
  878. job->exitcode = step->exitcode;
  879. job->status = JOB_RUNNING;
  880. job->elapsed = step->header.timestamp - job->header.timestamp;
  881. }
  882. }
  883. static void _process_suspend(List job_list, char *f[], int lc,
  884. int show_full, int len)
  885. {
  886. filetxt_job_rec_t *job = NULL;
  887. filetxt_job_rec_t *temp = NULL;
  888. _parse_line(f, (void **)&temp, len);
  889. job = _find_job_record(job_list, temp->header, JOB_SUSPEND);
  890. if (!job) { /* fake it for now */
  891. job = _create_filetxt_job_rec(temp->header);
  892. job->jobname = xstrdup("(unknown)");
  893. }
  894. job->show_full = show_full;
  895. if (job->status == JOB_SUSPENDED)
  896. job->elapsed -= temp->elapsed;
  897. //job->header.timestamp = temp->header.timestamp;
  898. job->status = temp->status;
  899. _destroy_filetxt_job_rec(temp);
  900. }
  901. static void _process_terminated(List job_list, char *f[], int lc,
  902. int show_full, int len)
  903. {
  904. filetxt_job_rec_t *job = NULL;
  905. filetxt_job_rec_t *temp = NULL;
  906. _parse_line(f, (void **)&temp, len);
  907. job = _find_job_record(job_list, temp->header, JOB_TERMINATED);
  908. if (!job) { /* fake it for now */
  909. job = _create_filetxt_job_rec(temp->header);
  910. job->jobname = xstrdup("(unknown)");
  911. debug("Note: JOB_TERMINATED record for job "
  912. "%u preceded "
  913. "other job records at line %d\n",
  914. temp->header.jobnum, lc);
  915. } else if (job->job_terminated_seen) {
  916. if (temp->status == JOB_NODE_FAIL) {
  917. /* multiple node failures - extra TERMINATED records */
  918. debug("Note: Duplicate JOB_TERMINATED "
  919. "record (nf) for job %u at "
  920. "line %d\n",
  921. temp->header.jobnum, lc);
  922. /* JOB_TERMINATED/NF records may be preceded
  923. * by a JOB_TERMINATED/CA record; NF is much
  924. * more interesting.
  925. */
  926. job->status = temp->status;
  927. goto finished;
  928. }
  929. fprintf(stderr,
  930. "Conflicting JOB_TERMINATED record (%s) for "
  931. "job %u at line %d -- ignoring it\n",
  932. job_state_string(temp->status),
  933. job->header.jobnum, lc);
  934. goto finished;
  935. }
  936. job->job_terminated_seen = 1;
  937. job->elapsed = temp->elapsed;
  938. job->end = temp->header.timestamp;
  939. job->status = temp->status;
  940. job->requid = temp->requid;
  941. job->exitcode = temp->exitcode;
  942. if(list_count(job->steps) > 1)
  943. job->track_steps = 1;
  944. job->show_full = show_full;
  945. finished:
  946. _destroy_filetxt_job_rec(temp);
  947. }
  948. extern List filetxt_jobacct_process_get_jobs(slurmdb_job_cond_t *job_cond)
  949. {
  950. char line[BUFFER_SIZE];
  951. char *f[MAX_RECORD_FIELDS+1]; /* End list with null entry and,
  952. possibly, more data than we
  953. expected */
  954. char *fptr = NULL, *filein = NULL;
  955. int i;
  956. FILE *fd = NULL;
  957. int lc = 0;
  958. int rec_type = -1;
  959. int job_id = 0, step_id = 0, uid = 0, gid = 0;
  960. filetxt_job_rec_t *filetxt_job = NULL;
  961. slurmdb_selected_step_t *selected_step = NULL;
  962. char *object = NULL;
  963. ListIterator itr = NULL, itr2 = NULL;
  964. int show_full = 0;
  965. int fdump_flag = 0;
  966. List ret_job_list = list_create(slurmdb_destroy_job_rec);
  967. List job_list = list_create(_destroy_filetxt_job_rec);
  968. filein = slurm_get_accounting_storage_loc();
  969. /* we grab the fdump only for the filetxt plug through the
  970. FDUMP_FLAG on the job_cond->duplicates variable. We didn't
  971. add this extra field to the structure since it only applies
  972. to this plugin.
  973. */
  974. if(job_cond) {
  975. fdump_flag = job_cond->duplicates & FDUMP_FLAG;
  976. job_cond->duplicates &= (~FDUMP_FLAG);
  977. if(!job_cond->duplicates)
  978. itr2 = list_iterator_create(ret_job_list);
  979. }
  980. fd = _open_log_file(filein);
  981. while (fgets(line, BUFFER_SIZE, fd)) {
  982. lc++;
  983. fptr = line; /* break the record into NULL-
  984. terminated strings */
  985. for (i = 0; i < MAX_RECORD_FIELDS; i++) {
  986. f[i] = fptr;
  987. fptr = strstr(fptr, " ");
  988. if (fptr == NULL) {
  989. fptr = strstr(f[i], "\n");
  990. if (fptr)
  991. *fptr = 0;
  992. break;
  993. } else {
  994. *fptr++ = 0;
  995. }
  996. }
  997. if (i < MAX_RECORD_FIELDS)
  998. i++;
  999. f[i] = 0;
  1000. if (i < HEADER_LENGTH) {
  1001. continue;
  1002. }
  1003. rec_type = atoi(f[F_RECTYPE]);
  1004. job_id = atoi(f[F_JOB]);
  1005. uid = atoi(f[F_UID]);
  1006. gid = atoi(f[F_GID]);
  1007. if(rec_type == JOB_STEP)
  1008. step_id = atoi(f[F_JOBSTEP]);
  1009. else
  1010. step_id = NO_VAL;
  1011. if(!job_cond) {
  1012. show_full = 1;
  1013. goto no_cond;
  1014. }
  1015. if (job_cond->userid_list
  1016. && list_count(job_cond->userid_list)) {
  1017. itr = list_iterator_create(job_cond->userid_list);
  1018. while((object = list_next(itr))) {
  1019. if (atoi(object) == uid) {
  1020. list_iterator_destroy(itr);
  1021. goto founduid;
  1022. }
  1023. }
  1024. list_iterator_destroy(itr);
  1025. continue; /* no match */
  1026. }
  1027. founduid:
  1028. if (job_cond->groupid_list
  1029. && list_count(job_cond->groupid_list)) {
  1030. itr = list_iterator_create(job_cond->groupid_list);
  1031. while((object = list_next(itr))) {
  1032. if (atoi(object) == gid) {
  1033. list_iterator_destroy(itr);
  1034. goto foundgid;
  1035. }
  1036. }
  1037. list_iterator_destroy(itr);
  1038. continue; /* no match */
  1039. }
  1040. foundgid:
  1041. if (job_cond->jobname_list
  1042. && list_count(job_cond->jobname_list)) {
  1043. itr = list_iterator_create(job_cond->jobname_list);
  1044. while((object = list_next(itr))) {
  1045. if (!strcasecmp(f[F_JOBNAME], object)) {
  1046. list_iterator_destroy(itr);
  1047. goto foundjobname;
  1048. }
  1049. }
  1050. list_iterator_destroy(itr);
  1051. continue; /* no match */
  1052. }
  1053. foundjobname:
  1054. if (job_cond->step_list
  1055. && list_count(job_cond->step_list)) {
  1056. itr = list_iterator_create(job_cond->step_list);
  1057. while((selected_step = list_next(itr))) {
  1058. if (selected_step->jobid != job_id)
  1059. continue;
  1060. /* job matches; does the step? */
  1061. if(selected_step->stepid == NO_VAL) {
  1062. show_full = 1;
  1063. list_iterator_destroy(itr);
  1064. goto foundjob;
  1065. } else if (rec_type != JOB_STEP
  1066. || selected_step->stepid
  1067. == step_id) {
  1068. list_iterator_destroy(itr);
  1069. goto foundjob;
  1070. }
  1071. }
  1072. list_iterator_destroy(itr);
  1073. continue; /* no match */
  1074. } else {
  1075. show_full = 1;
  1076. }
  1077. foundjob:
  1078. if (job_cond->partition_list
  1079. && list_count(job_cond->partition_list)) {
  1080. itr = list_iterator_create(job_cond->partition_list);
  1081. while((object = list_next(itr)))
  1082. if (!strcasecmp(f[F_PARTITION], object)) {
  1083. list_iterator_destroy(itr);
  1084. goto foundp;
  1085. }
  1086. list_iterator_destroy(itr);
  1087. continue; /* no match */
  1088. }
  1089. foundp:
  1090. if (fdump_flag) {
  1091. _do_fdump(f, lc);
  1092. continue;
  1093. }
  1094. no_cond:
  1095. /* Build suitable tables with all the data */
  1096. switch(rec_type) {
  1097. case JOB_START:
  1098. if(i < F_JOB_ACCOUNT) {
  1099. error("Bad data on a Job Start");
  1100. _show_rec(f);
  1101. } else
  1102. _process_start(job_list, f, lc, show_full, i);
  1103. break;
  1104. case JOB_STEP:
  1105. if(i < F_MAX_VSIZE) {
  1106. error("Bad data on a Step entry");
  1107. _show_rec(f);
  1108. } else
  1109. _process_step(job_list, f, lc, show_full, i);
  1110. break;
  1111. case JOB_SUSPEND:
  1112. if(i < F_JOB_REQUID) {
  1113. error("Bad data on a Suspend entry");
  1114. _show_rec(f);
  1115. } else
  1116. _process_suspend(job_list, f, lc,
  1117. show_full, i);
  1118. break;
  1119. case JOB_TERMINATED:
  1120. if(i < F_JOB_REQUID) {
  1121. error("Bad data on a Job Term");
  1122. _show_rec(f);
  1123. } else
  1124. _process_terminated(job_list, f, lc,
  1125. show_full, i);
  1126. break;
  1127. default:
  1128. debug("Invalid record at line %d of input file", lc);
  1129. _show_rec(f);
  1130. break;
  1131. }
  1132. }
  1133. if (ferror(fd)) {
  1134. perror(filein);
  1135. exit(1);
  1136. }
  1137. fclose(fd);
  1138. itr = list_iterator_create(job_list);
  1139. while((filetxt_job = list_next(itr))) {
  1140. slurmdb_job_rec_t *slurmdb_job =
  1141. _slurmdb_create_job_rec(filetxt_job, job_cond);
  1142. if(slurmdb_job) {
  1143. slurmdb_job_rec_t *curr_job = NULL;
  1144. if(itr2) {
  1145. list_iterator_reset(itr2);
  1146. while((curr_job = list_next(itr2))) {
  1147. if (curr_job->jobid ==
  1148. slurmdb_job->jobid) {
  1149. list_delete_item(itr2);
  1150. info("removing job %d",
  1151. slurmdb_job->jobid);
  1152. break;
  1153. }
  1154. }
  1155. }
  1156. list_append(ret_job_list, slurmdb_job);
  1157. }
  1158. }
  1159. if(itr2)
  1160. list_iterator_destroy(itr2);
  1161. list_iterator_destroy(itr);
  1162. list_destroy(job_list);
  1163. xfree(filein);
  1164. return ret_job_list;
  1165. }
  1166. extern int filetxt_jobacct_process_archive(slurmdb_archive_cond_t *arch_cond)
  1167. {
  1168. char line[BUFFER_SIZE],
  1169. *f[EXPIRE_READ_LENGTH],
  1170. *fptr = NULL,
  1171. *logfile_name = NULL,
  1172. *old_logfile_name = NULL,
  1173. *filein = NULL,
  1174. *object = NULL;
  1175. int file_err=0,
  1176. new_file,
  1177. i = 0,
  1178. rc = SLURM_ERROR;
  1179. expired_rec_t *exp_rec = NULL;
  1180. expired_rec_t *exp_rec2 = NULL;
  1181. List keep_list = list_create(_destroy_exp);
  1182. List exp_list = list_create(_destroy_exp);
  1183. List other_list = list_create(_destroy_exp);
  1184. struct stat statbuf;
  1185. mode_t prot = 0600;
  1186. uid_t uid;
  1187. gid_t gid;
  1188. FILE *expired_logfile = NULL,
  1189. *new_logfile = NULL;
  1190. FILE *fd = NULL;
  1191. int lc=0;
  1192. int rec_type = -1;
  1193. ListIterator itr = NULL;
  1194. ListIterator itr2 = NULL;
  1195. slurmdb_job_cond_t *job_cond = NULL;
  1196. /* Figure out our expiration date */
  1197. time_t expiry;
  1198. if(!arch_cond || !arch_cond->job_cond) {
  1199. error("no job_cond was given for archive");
  1200. return SLURM_ERROR;
  1201. }
  1202. job_cond = arch_cond->job_cond;
  1203. if(!arch_cond->archive_script)
  1204. filein = slurm_get_accounting_storage_loc();
  1205. else
  1206. filein = arch_cond->archive_script;
  1207. expiry = time(NULL) - job_cond->usage_end;
  1208. debug("Purging jobs completed prior to %d", (int)expiry);
  1209. /* Open the current or specified logfile, or quit */
  1210. fd = _open_log_file(filein);
  1211. if (stat(filein, &statbuf)) {
  1212. perror("stat'ing logfile");
  1213. goto finished;
  1214. }
  1215. if ((statbuf.st_mode & S_IFLNK) == S_IFLNK) {
  1216. error("%s is a symbolic link; --expire requires "
  1217. "a hard-linked file name", filein);
  1218. goto finished;
  1219. }
  1220. if (!(statbuf.st_mode & S_IFREG)) {
  1221. error("%s is not a regular file; --expire "
  1222. "only works on accounting log files",
  1223. filein);
  1224. goto finished;
  1225. }
  1226. prot = statbuf.st_mode & 0777;
  1227. gid = statbuf.st_gid;
  1228. uid = statbuf.st_uid;
  1229. old_logfile_name = _prefix_filename(filein, ".old.");
  1230. if (stat(old_logfile_name, &statbuf)) {
  1231. if (errno != ENOENT) {
  1232. fprintf(stderr,"Error checking for %s: ",
  1233. old_logfile_name);
  1234. perror("");
  1235. goto finished;
  1236. }
  1237. } else {
  1238. error("Warning! %s exists -- please remove "
  1239. "or rename it before proceeding",
  1240. old_logfile_name);
  1241. goto finished;
  1242. }
  1243. /* create our initial buffer */
  1244. while (fgets(line, BUFFER_SIZE, fd)) {
  1245. lc++;
  1246. fptr = line; /* break the record into NULL-
  1247. terminated strings */
  1248. exp_rec = xmalloc(sizeof(expired_rec_t));
  1249. exp_rec->line = xstrdup(line);
  1250. for (i = 0; i < EXPIRE_READ_LENGTH; i++) {
  1251. f[i] = fptr;
  1252. fptr = strstr(fptr, " ");
  1253. if (fptr == NULL)
  1254. break;
  1255. else
  1256. *fptr++ = 0;
  1257. }
  1258. exp_rec->job = atoi(f[F_JOB]);
  1259. exp_rec->job_submit = atoi(f[F_JOB_SUBMIT]);
  1260. rec_type = atoi(f[F_RECTYPE]);
  1261. /* Odd, but complain some other time */
  1262. if (rec_type == JOB_TERMINATED) {
  1263. if (expiry < atoi(f[F_TIMESTAMP])) {
  1264. list_append(keep_list, exp_rec);
  1265. continue;
  1266. }
  1267. if (job_cond->partition_list
  1268. && list_count(job_cond->partition_list)) {
  1269. itr = list_iterator_create(
  1270. job_cond->partition_list);
  1271. while((object = list_next(itr)))
  1272. if (!strcasecmp(f[F_PARTITION], object))
  1273. break;
  1274. list_iterator_destroy(itr);
  1275. if(!object)
  1276. continue; /* no match */
  1277. }
  1278. list_append(exp_list, exp_rec);
  1279. debug2("Selected: %8d %d",
  1280. exp_rec->job,
  1281. (int)exp_rec->job_submit);
  1282. } else {
  1283. list_append(other_list, exp_rec);
  1284. }
  1285. }
  1286. if (!list_count(exp_list)) {
  1287. debug3("No job records were purged.");
  1288. goto finished;
  1289. }
  1290. logfile_name = xmalloc(strlen(filein)+sizeof(".expired"));
  1291. sprintf(logfile_name, "%s.expired", filein);
  1292. new_file = stat(logfile_name, &statbuf);
  1293. if ((expired_logfile = fopen(logfile_name, "a"))==NULL) {
  1294. error("Error while opening %s",
  1295. logfile_name);
  1296. perror("");
  1297. xfree(logfile_name);
  1298. goto finished;
  1299. }
  1300. if (new_file) { /* By default, the expired file looks like the log */
  1301. chmod(logfile_name, prot);
  1302. if(chown(logfile_name, uid, gid) == -1)
  1303. error("Couldn't change ownership of %s to %u:%u",
  1304. logfile_name, uid, gid);
  1305. }
  1306. xfree(logfile_name);
  1307. logfile_name = _prefix_filename(filein, ".new.");
  1308. if ((new_logfile = fopen(logfile_name, "w"))==NULL) {
  1309. error("Error while opening %s",
  1310. logfile_name);
  1311. perror("");
  1312. fclose(expired_logfile);
  1313. goto finished;
  1314. }
  1315. chmod(logfile_name, prot); /* preserve file protection */
  1316. if(chown(logfile_name, uid, gid) == -1)/* and ownership */
  1317. error("2 Couldn't change ownership of %s to %u:%u",
  1318. logfile_name, uid, gid);
  1319. /* Use line buffering to allow us to safely write
  1320. * to the log file at the same time as slurmctld. */
  1321. if (setvbuf(new_logfile, NULL, _IOLBF, 0)) {
  1322. perror("setvbuf()");
  1323. fclose(expired_logfile);
  1324. goto finished2;
  1325. }
  1326. list_sort(exp_list, (ListCmpF) _cmp_jrec);
  1327. list_sort(keep_list, (ListCmpF) _cmp_jrec);
  1328. /* if (params->opt_verbose > 2) { */
  1329. /* error("--- contents of exp_list ---"); */
  1330. /* itr = list_iterator_create(exp_list); */
  1331. /* while((exp_rec = list_next(itr))) */
  1332. /* error("%d", exp_rec->job); */
  1333. /* error("---- end of exp_list ---"); */
  1334. /* list_iterator_destroy(itr); */
  1335. /* } */
  1336. /* write the expired file */
  1337. itr = list_iterator_create(exp_list);
  1338. while((exp_rec = list_next(itr))) {
  1339. itr2 = list_iterator_create(other_list);
  1340. while((exp_rec2 = list_next(itr2))) {
  1341. if((exp_rec2->job != exp_rec->job)
  1342. || (exp_rec2->job_submit != exp_rec->job_submit))
  1343. continue;
  1344. if (fputs(exp_rec2->line, expired_logfile)<0) {
  1345. perror("writing expired_logfile");
  1346. list_iterator_destroy(itr2);
  1347. list_iterator_destroy(itr);
  1348. fclose(expired_logfile);
  1349. goto finished2;
  1350. }
  1351. list_remove(itr2);
  1352. _destroy_exp(exp_rec2);
  1353. }
  1354. list_iterator_destroy(itr2);
  1355. if (fputs(exp_rec->line, expired_logfile)<0) {
  1356. perror("writing expired_logfile");
  1357. list_iterator_destroy(itr);
  1358. fclose(expired_logfile);
  1359. goto finished2;
  1360. }
  1361. }
  1362. list_iterator_destroy(itr);
  1363. fclose(expired_logfile);
  1364. /* write the new log */
  1365. itr = list_iterator_create(keep_list);
  1366. while((exp_rec = list_next(itr))) {
  1367. itr2 = list_iterator_create(other_list);
  1368. while((exp_rec2 = list_next(itr2))) {
  1369. if(exp_rec2->job != exp_rec->job)
  1370. continue;
  1371. if (fputs(exp_rec2->line, new_logfile)<0) {
  1372. perror("writing keep_logfile");
  1373. list_iterator_destroy(itr2);
  1374. list_iterator_destroy(itr);
  1375. goto finished2;
  1376. }
  1377. list_remove(itr2);
  1378. _destroy_exp(exp_rec2);
  1379. }
  1380. list_iterator_destroy(itr2);
  1381. if (fputs(exp_rec->line, new_logfile)<0) {
  1382. perror("writing keep_logfile");
  1383. list_iterator_destroy(itr);
  1384. goto finished2;
  1385. }
  1386. }
  1387. list_iterator_destroy(itr);
  1388. /* write records in other_list to new log */
  1389. itr = list_iterator_create(other_list);
  1390. while((exp_rec = list_next(itr))) {
  1391. if (fputs(exp_rec->line, new_logfile)<0) {
  1392. perror("writing keep_logfile");
  1393. list_iterator_destroy(itr);
  1394. goto finished2;
  1395. }
  1396. }
  1397. list_iterator_destroy(itr);
  1398. if (rename(filein, old_logfile_name)) {
  1399. perror("renaming logfile to .old.");
  1400. goto finished2;
  1401. }
  1402. if (rename(logfile_name, filein)) {
  1403. perror("renaming new logfile");
  1404. /* undo it? */
  1405. if (!rename(old_logfile_name, filein))
  1406. error("Please correct the problem "
  1407. "and try again");
  1408. else
  1409. error("SEVERE ERROR: Current accounting "
  1410. "log may have been renamed %s;\n"
  1411. "please rename it to \"%s\" if necessary, "
  1412. "and try again",
  1413. old_logfile_name, filein);
  1414. goto finished2;
  1415. }
  1416. fflush(new_logfile); /* Flush the buffers before forking */
  1417. fflush(fd);
  1418. file_err = slurm_reconfigure();
  1419. if (file_err) {
  1420. file_err = 1;
  1421. error("Error: Attempt to reconfigure SLURM failed.");
  1422. if (rename(old_logfile_name, filein)) {
  1423. perror("renaming logfile from .old.");
  1424. goto finished2;
  1425. }
  1426. }
  1427. if (fseek(fd, 0, SEEK_CUR)) { /* clear EOF */
  1428. perror("looking for late-arriving records");
  1429. goto finished2;
  1430. }
  1431. /* reopen new logfile in append mode, since slurmctld may write it */
  1432. if (freopen(filein, "a", new_logfile) == NULL) {
  1433. perror("reopening new logfile");
  1434. goto finished2;
  1435. }
  1436. while (fgets(line, BUFFER_SIZE, fd)) {
  1437. if (fputs(line, new_logfile)<0) {
  1438. perror("writing final records");
  1439. goto finished2;
  1440. }
  1441. }
  1442. rc = SLURM_SUCCESS;
  1443. printf("%d jobs expired.\n", list_count(exp_list));
  1444. finished2:
  1445. fclose(new_logfile);
  1446. if (!file_err) {
  1447. if (unlink(old_logfile_name) == -1)
  1448. error("Unable to unlink old logfile %s: %m",
  1449. old_logfile_name);
  1450. }
  1451. finished:
  1452. xfree(filein);
  1453. fclose(fd);
  1454. list_destroy(exp_list);
  1455. list_destroy(keep_list);
  1456. list_destroy(other_list);
  1457. xfree(old_logfile_name);
  1458. xfree(logfile_name);
  1459. return rc;
  1460. }