/src/resmom/linux/mom_mach.c

https://github.com/itkovian/torque · C · 5345 lines · 3417 code · 1458 blank · 470 comment · 820 complexity · 8cf49e16f2eb3c479a19208755979b78 MD5 · raw file

  1. #include "license_pbs.h" /* See here for the software license */
  2. #include <pbs_config.h> /* the master config generated by configure */
  3. #include "lib_mom.h" /* header */
  4. #include <assert.h>
  5. #include <limits.h>
  6. #include <stdio.h>
  7. #include <stdlib.h>
  8. #include <unistd.h>
  9. #include <dirent.h>
  10. #include <errno.h>
  11. #include <strings.h>
  12. #include <mntent.h>
  13. #include <asm/types.h>
  14. #include <time.h>
  15. #include <sys/quota.h>
  16. #include <sys/time.h>
  17. #include <sys/procfs.h>
  18. #include <sys/param.h>
  19. #include <sys/stat.h>
  20. #include <sys/vfs.h>
  21. #include <sys/sysmacros.h>
  22. #include <sys/resource.h>
  23. #include <signal.h>
  24. #include <syscall.h>
  25. #include <ctype.h>
  26. #include <string.h>
  27. #include <csv.h>
  28. #include <fcntl.h>
  29. /* needed for oom_adj */
  30. #include <linux/limits.h>
  31. #ifdef Q_6_5_QUOTAON
  32. /* remap dqblk for SUSE 9.0 */
  33. #define dqblk if_dqblk
  34. #endif /* Q_6_5_QUOTAON */
  35. /*
  36. #ifndef dqblk
  37. #include <linux/quotaio_v1.h>
  38. #define dqblk v1_disk_dqblk
  39. #endif
  40. */
  41. #include "pbs_error.h"
  42. #include "portability.h"
  43. #include "list_link.h"
  44. #include "server_limits.h"
  45. #include "attribute.h"
  46. #include "resource.h"
  47. #include "pbs_job.h"
  48. #include "log.h"
  49. #include "mom_mach.h"
  50. #include "mom_func.h"
  51. #include "resmon.h"
  52. #include "utils.h"
  53. #include "../rm_dep.h"
  54. #include "pbs_nodes.h"
  55. #ifdef PENABLE_LINUX26_CPUSETS
  56. #include "pbs_cpuset.h"
  57. #endif
  58. #include "mom_config.h"
  59. /*
  60. ** System dependent code to gather information for the resource
  61. ** monitor for a Linux i386 machine.
  62. **
  63. ** Resources known by this code:
  64. ** cput cpu time for a pid or session
  65. ** mem memory size for a pid or session in KB
  66. ** resi resident memory size for a pid or session in KB
  67. ** sessions list of sessions in the system
  68. ** pids list of pids in a session
  69. ** nsessions number of sessions in the system
  70. ** nusers number of users in the system
  71. ** totmem total memory size in KB
  72. ** availmem available memory size in KB
  73. ** ncpus number of cpus
  74. ** physmem physical memory size in KB
  75. ** size size of a file or filesystem
  76. ** idletime seconds of idle time
  77. ** walltime wall clock time for a pid
  78. ** loadave current load average
  79. ** quota quota information (sizes in kb)
  80. ** netload number of bytes transferred for all interfaces
  81. */
  82. #ifndef MAX_LINE
  83. #define MAX_LINE 1024
  84. #endif
  85. #ifndef TRUE
  86. #define FALSE 0
  87. #define TRUE 1
  88. #endif /* TRUE */
  89. static char procfs[] = "/proc";
  90. static DIR *pdir = NULL;
  91. static int pagesize;
  92. extern char *ret_string;
  93. extern time_t time_now;
  94. #define TBL_INC 200 /* initial proc table */
  95. #define PMEMBUF_SIZE 2048
  96. static proc_stat_t *proc_array = NULL;
  97. static int nproc = 0;
  98. static int max_proc = 0;
  99. /*
  100. ** external functions and data
  101. */
  102. extern tlist_head svr_alljobs;
  103. extern struct config *search(struct config *,char *);
  104. extern struct rm_attribute *momgetattr(char *);
  105. extern long system_ncpus;
  106. #ifdef NUMA_SUPPORT
  107. extern int num_node_boards;
  108. extern nodeboard node_boards[];
  109. extern int numa_index;
  110. #else
  111. extern char path_meminfo[MAX_LINE];
  112. #endif /* NUMA_SUPPORT */
  113. /*
  114. ** local functions and data
  115. */
  116. static const char *resi (struct rm_attribute *);
  117. static const char *totmem (struct rm_attribute *);
  118. static const char *availmem (struct rm_attribute *);
  119. static const char *physmem (struct rm_attribute *);
  120. static const char *ncpus (struct rm_attribute *);
  121. static const char *walltime (struct rm_attribute *);
  122. static const char *quota (struct rm_attribute *);
  123. static const char *netload (struct rm_attribute *);
  124. #ifdef NUMA_SUPPORT
  125. const char *cpuact (struct rm_attribute *);
  126. #endif
  127. #ifdef USELIBMEMACCT
  128. #ifdef __cplusplus
  129. extern "C"
  130. {
  131. #endif
  132. long long get_memacct_resi(pid_t pid);
  133. extern long get_weighted_memory_size(pid_t);
  134. #ifdef __cplusplus
  135. }
  136. #endif
  137. #endif
  138. #ifndef mbool_t
  139. #define mbool_t char
  140. #endif /* mbool_t */
  141. mbool_t ProcIsChild(char *,pid_t,char *);
  142. extern const char *loadave(struct rm_attribute *);
  143. extern const char *nullproc(struct rm_attribute *);
  144. time_t wait_time = 10;
  145. #ifdef NUMA_SUPPORT
  146. typedef struct proc_cpu
  147. {
  148. unsigned long long idle_total;
  149. unsigned long long busy_total;
  150. } proc_cpu_t;
  151. static proc_cpu_t *cpu_array = NULL;
  152. #endif
  153. /*
  154. ** local resource array
  155. */
  156. struct config dependent_config[] =
  157. {
  158. { "resi", {resi} },
  159. { "totmem", {totmem} },
  160. { "availmem", {availmem} },
  161. { "physmem", {physmem} },
  162. { "ncpus", {ncpus} },
  163. #ifdef NUMA_SUPPORT
  164. { "loadave", {cpuact} },
  165. #else
  166. { "loadave", {loadave} },
  167. #endif
  168. { "walltime", {walltime} },
  169. { "quota", {quota} },
  170. { "netload", {netload} },
  171. { "size", {size} },
  172. { NULL, {nullproc} }
  173. };
  174. unsigned linux_time = 0;
  175. /*
  176. * support routine for getting system time -- sets linux_time
  177. */
  178. void proc_get_btime(void)
  179. {
  180. FILE *fp;
  181. char label[256];
  182. if ((fp = fopen("/proc/stat", "r")) == NULL)
  183. {
  184. return;
  185. }
  186. while (!feof(fp))
  187. {
  188. if (fscanf(fp, "%s", label) != 1)
  189. {
  190. fclose(fp);
  191. return;
  192. }
  193. if (strcmp(label, "btime"))
  194. {
  195. if (fscanf(fp, "%*[^\n]%*c") != 0)
  196. {
  197. fclose(fp);
  198. return;
  199. }
  200. }
  201. else
  202. {
  203. if (fscanf(fp, "%u", &linux_time) != 1) {}
  204. fclose(fp);
  205. return;
  206. }
  207. } /* END while (!feof(fp)) */
  208. fclose(fp);
  209. return;
  210. } /* END proc_get_btime() */
  211. /* NOTE: see 'man 5 proc' for /proc/pid/stat format and description */
  212. /* NOTE: leading '*' indicates that field should be ignored */
  213. /* FORMAT: <PID> <COMM> <STATE> <PPID> <PGRP> <SESSION> [<TTY_NR>] [<TPGID>] <FLAGS> [<MINFLT>] [<CMINFLT>] [<MAJFLT>] [<CMAJFLT>] <UTIME> <STIME> <CUTIME> <CSTIME> [<PRIORITY>] [<NICE>] [<0>] [<ITREALVALUE>] <STARTTIME> <VSIZE> <RSS> [<RLIM>] [<STARTCODE>] ... */
  214. static char stat_str[] = " %c %d %d %d %*d %*d %u %*u \
  215. %*u %*u %*u %lu %lu %lu %lu %*ld %*ld %*u %*ld %lu %llu %lld %*lu %*lu \
  216. %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu";
  217. /*
  218. * Convert jiffies to seconds.
  219. *
  220. * Hertz is sysconf(_SC_CLK_TCK) in get_proc_stat()
  221. */
  222. #define JTOS(x) (x) / Hertz;
  223. /*
  224. * Linux /proc status routine.
  225. *
  226. * Returns a pointer to a static proc_stat_t structure given
  227. * a process number, or NULL if there is an error. Takes the
  228. * place of the ioctl call PIOCSTATUS in the irix imp of mom_mach.c
  229. *
  230. */
  231. proc_stat_t *get_proc_stat(
  232. int pid) /* I */
  233. {
  234. static proc_stat_t ps;
  235. static char path[MAXLINE];
  236. static char readbuf[MAXLINE << 2];
  237. static char *lastbracket;
  238. FILE *fd;
  239. unsigned long jstarttime; /* number of jiffies since OS start time when process started */
  240. struct stat sb;
  241. static int Hertz = 0;
  242. int Hertz_errored = 0;
  243. if (Hertz <= 0)
  244. {
  245. Hertz = sysconf(_SC_CLK_TCK); /* returns 0 on error */
  246. if (Hertz <= 0)
  247. {
  248. /* FAILURE */
  249. if (!Hertz_errored)
  250. log_err(errno, "get_proc_stat", "sysconf(_SC_CLK_TCK) failed, unable to monitor processes");
  251. Hertz_errored = 1;
  252. return(NULL);
  253. }
  254. }
  255. Hertz_errored = 0;
  256. sprintf(path, "/proc/%d/stat",
  257. pid);
  258. if ((fd = fopen(path, "r")) == NULL)
  259. {
  260. /* FAILURE */
  261. return(NULL);
  262. }
  263. /* use 'man 5 proc' for /proc/pid/stat format */
  264. if (!fgets(readbuf, sizeof(readbuf), fd))
  265. {
  266. fclose(fd);
  267. return(NULL);
  268. }
  269. lastbracket = strrchr(readbuf, ')');
  270. if (lastbracket == NULL)
  271. {
  272. fclose(fd);
  273. return(NULL);
  274. }
  275. *lastbracket = '\0'; /* We basically split the string here, overwriting the ')'. */
  276. lastbracket++;
  277. if (sscanf(readbuf,"%d (%[^\n]",&ps.pid,path) != 2)
  278. {
  279. /* FAILURE */
  280. fclose(fd);
  281. return(NULL);
  282. }
  283. /* see stat_str[] value for mapping 'stat' format */
  284. if (sscanf(lastbracket,stat_str,
  285. &ps.state, /* state (one of RSDZTW) */
  286. &ps.ppid, /* ppid */
  287. &ps.pgrp, /* pgrp */
  288. &ps.session, /* session id */
  289. &ps.flags, /* flags - kernel flags of the process, see the PF_* in <linux/sched.h> */
  290. &ps.utime, /* utime - jiffies that this process has been scheduled in user mode */
  291. &ps.stime, /* stime - jiffies that this process has been scheduled in kernel mode */
  292. &ps.cutime, /* cutime - jiffies that this process’s waited-for children have been scheduled in user mode */
  293. &ps.cstime, /* cstime - jiffies that this process’s waited-for children have been scheduled in kernel mode */
  294. &jstarttime, /* starttime */
  295. &ps.vsize, /* vsize */
  296. &ps.rss) != 12) /* rss */
  297. {
  298. /* FAILURE */
  299. fclose(fd);
  300. return(NULL);
  301. }
  302. if (fstat(fileno(fd), &sb) == -1)
  303. {
  304. /* FAILURE */
  305. fclose(fd);
  306. return(NULL);
  307. }
  308. ps.uid = sb.st_uid;
  309. ps.start_time = linux_time + JTOS(jstarttime);
  310. ps.name = path;
  311. ps.utime = JTOS(ps.utime);
  312. ps.stime = JTOS(ps.stime);
  313. ps.cutime = JTOS(ps.cutime);
  314. ps.cstime = JTOS(ps.cstime);
  315. /* SUCCESS */
  316. fclose(fd);
  317. return(&ps);
  318. } /* END get_proc_stat() */
  319. #ifdef USELIBMEMACCT
  320. /*
  321. * Retrieve weighted RSS value for process with pid from memacctd.
  322. * Returns the value in bytes on success, returns -1 on failure.
  323. */
  324. long long get_memacct_resi(pid_t pid)
  325. {
  326. long long w_rss;
  327. if ((w_rss = get_weighted_memory_size(pid)) == -1)
  328. {
  329. sprintf(log_buffer, "get_weighted_memory_size(%d) failed", pid);
  330. log_err(errno, __func__, log_buffer);
  331. }
  332. return(w_rss);
  333. } /* END get_memacct_resi() */
  334. #endif
  335. /*
  336. * get_proc_mem_from_path()
  337. * @returns a pointer to a struct containing the memory information
  338. * @pre-cond: path must point to a valid path of a meminfo system file
  339. */
  340. proc_mem_t *get_proc_mem_from_path(
  341. const char *path)
  342. {
  343. proc_mem_t *mm;
  344. FILE *fp;
  345. char str[32];
  346. long long bfsz = -1;
  347. long long casz = -1;
  348. long long fcasz = -1;
  349. if ((fp = fopen(path,"r")) == NULL)
  350. {
  351. return(NULL);
  352. }
  353. mm = (proc_mem_t *)calloc(1, sizeof(proc_mem_t));
  354. if (fscanf(fp,"%30s",str) != 1)
  355. {
  356. fclose(fp);
  357. return(NULL);
  358. }
  359. if (!strncmp(str,"total:",sizeof(str)))
  360. {
  361. /* old format */
  362. if (fscanf(fp,"%*[^\n]%*c") != 0) /* remove text header */
  363. {
  364. fclose(fp);
  365. return(NULL);
  366. }
  367. /* umu vmem patch */
  368. if (fscanf(fp, "%*s %llu %llu %llu %*u %lld %lld",
  369. &mm->mem_total,
  370. &mm->mem_used,
  371. &mm->mem_free,
  372. &bfsz,
  373. &casz) != 5)
  374. {
  375. fclose(fp);
  376. return(NULL);
  377. }
  378. mm->mem_free += casz + bfsz;
  379. if (fscanf(fp, "%*s %llu %llu %llu %*[^\n]%*c",
  380. &mm->swap_total,
  381. &mm->swap_used,
  382. &mm->swap_free) != 3)
  383. {
  384. fclose(fp);
  385. return(NULL);
  386. }
  387. }
  388. else
  389. {
  390. do
  391. {
  392. /* new format (kernel > 2.4) the first 'str' has been read */
  393. if (!strncmp(str, "MemTotal:", sizeof(str)))
  394. {
  395. if (fscanf(fp, "%llu",
  396. &mm->mem_total) != 1)
  397. {
  398. fclose(fp);
  399. return(NULL);
  400. }
  401. mm->mem_total *= 1024; /* the unit is kB */
  402. }
  403. else if (!strncmp(str, "MemFree:", sizeof(str)))
  404. {
  405. if (fscanf(fp, "%llu",
  406. &mm->mem_free) != 1)
  407. {
  408. fclose(fp);
  409. return(NULL);
  410. }
  411. mm->mem_free *= 1024;
  412. }
  413. else if (!strncmp(str, "Buffers:", sizeof(str)))
  414. {
  415. if (fscanf(fp, "%lld",
  416. &bfsz) != 1)
  417. {
  418. fclose(fp);
  419. return(NULL);
  420. }
  421. bfsz *= 1024;
  422. }
  423. else if (!strncmp(str, "Cached:", sizeof(str)))
  424. {
  425. if (fscanf(fp, "%lld",
  426. &casz) != 1)
  427. {
  428. fclose(fp);
  429. return(NULL);
  430. }
  431. casz *= 1024;
  432. }
  433. else if (!strncmp(str, "FilePages:", sizeof(str)))
  434. {
  435. if (fscanf(fp, "%lld",
  436. &fcasz) != 1)
  437. {
  438. fclose(fp);
  439. return(NULL);
  440. }
  441. fcasz *= 1024;
  442. }
  443. else if (!strncmp(str, "SwapTotal:", sizeof(str)))
  444. {
  445. if (fscanf(fp, "%llu",
  446. &mm->swap_total) != 1)
  447. {
  448. fclose(fp);
  449. return(NULL);
  450. }
  451. mm->swap_total *= 1024;
  452. }
  453. else if (!strncmp(str, "SwapFree:", sizeof(str)))
  454. {
  455. if (fscanf(fp, "%llu",
  456. &mm->swap_free) != 1)
  457. {
  458. fclose(fp);
  459. return(NULL);
  460. }
  461. mm->swap_free *= 1024;
  462. }
  463. }
  464. while (fscanf(fp, "%30s", str) == 1);
  465. } /* END else */
  466. fclose(fp);
  467. if (bfsz >= 0 || casz >= 0)
  468. {
  469. if (bfsz > 0)
  470. mm->mem_free += bfsz;
  471. if (casz > 0)
  472. mm->mem_free += casz;
  473. }
  474. else if (fcasz > 0)
  475. {
  476. mm->mem_free += fcasz;
  477. }
  478. return(mm);
  479. } /* END get_proc_mem_from_path() */
  480. proc_mem_t *get_proc_mem(void)
  481. {
  482. static proc_mem_t ret_mm;
  483. #ifdef NUMA_SUPPORT
  484. int i;
  485. #else
  486. proc_mem_t *mem;
  487. #endif
  488. #ifdef NUMA_SUPPORT
  489. ret_mm.mem_total = 0;
  490. ret_mm.mem_used = 0;
  491. ret_mm.mem_free = 0;
  492. ret_mm.swap_total = 0;
  493. ret_mm.swap_used = 0;
  494. ret_mm.swap_free = 0;
  495. for (i = 0; i < node_boards[numa_index].num_nodes; i++)
  496. {
  497. proc_mem_t *node_mem = get_proc_mem_from_path(node_boards[numa_index].path_meminfo[i]);
  498. if (node_mem == NULL)
  499. return(NULL);
  500. ret_mm.mem_total += node_mem->mem_total;
  501. ret_mm.mem_used += node_mem->mem_used;
  502. ret_mm.mem_free += node_mem->mem_free;
  503. ret_mm.swap_total += node_mem->swap_total;
  504. ret_mm.swap_used += node_mem->swap_used;
  505. ret_mm.swap_free += node_mem->swap_free;
  506. free(node_mem);
  507. }
  508. #else
  509. mem = get_proc_mem_from_path(path_meminfo);
  510. if(mem == NULL)
  511. return (NULL);
  512. ret_mm.mem_total = mem->mem_total;
  513. ret_mm.mem_used = mem->mem_used;
  514. ret_mm.mem_free = mem->mem_free;
  515. ret_mm.swap_total = mem->swap_total;
  516. ret_mm.swap_used = mem->swap_used;
  517. ret_mm.swap_free = mem->swap_free;
  518. free(mem);
  519. #endif
  520. return(&ret_mm);
  521. } /* END get_proc_mem() */
#ifdef PNOT
/*
 * Alternate get_proc_mem(), compiled only when PNOT is defined.
 *
 * Opens path_meminfo, skips the first (header) line, then reads three
 * unsigned long values from each of the next two lines and stores the
 * pairwise sums (memory + swap) in a static proc_mem_t, which is returned.
 * Returns NULL if the file cannot be opened.
 *
 * NOTE(review): none of the fscanf() results are checked, so the locals
 * may be used uninitialized if the file is not in the expected layout.
 * NOTE(review): this variant writes mm.total/mm.used/mm.free, whereas the
 * active get_proc_mem() uses mem_total/mem_used/... - presumably this code
 * is stale; confirm before enabling PNOT.
 */
proc_mem_t *get_proc_mem(void)
{
static proc_mem_t mm;
FILE *fp;
unsigned long m_tot, m_use, m_free;   /* memory line: total, used, free */
unsigned long s_tot, s_use, s_free;   /* swap line: total, used, free */
if ((fp = fopen(path_meminfo, "r")) == NULL)
{
return(NULL);
}
fscanf(fp, "%*[^\n]%*c"); /* remove text header */;
/* first data line: skip the label, read three values, drop the rest */
fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c",
&m_tot,
&m_use,
&m_free);
/* second data line, same layout */
fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c",
&s_tot,
&s_use,
&s_free);
mm.total = m_tot + s_tot;
mm.used = m_use + s_use;
mm.free = m_free + s_free;
fclose(fp);
return(&mm);
} /* END get_proc_mem() */
#endif /* PNOT */
  549. /*
  550. * sets oom_adj score for current process
  551. * requires root privileges or CAP_SYS_RESOURCE to succeed
  552. */
  553. static int oom_adj(int score)
  554. {
  555. pid_t pid;
  556. int rc,fd;
  557. char oom_adj_path[PATH_MAX] = "";
  558. char adj_value[128] = "";
  559. /* valid values are -17 to 15 */
  560. if ( score > 15 || score < -17 )
  561. return -1;
  562. pid = getpid();
  563. if ( snprintf(oom_adj_path, sizeof(oom_adj_path), "/proc/%d/oom_adj", pid) < 0 )
  564. return -1;
  565. if ( ( fd = open(oom_adj_path, O_RDWR) ) == -1 )
  566. return -1;
  567. if (snprintf(adj_value,sizeof(adj_value),"%d",score) < 0)
  568. return -1;
  569. rc = write(fd,adj_value,strlen(adj_value));
  570. close(fd);
  571. return rc;
  572. }
  573. void dep_initialize(void)
  574. {
  575. pagesize = getpagesize();
  576. if ((pdir = opendir(procfs)) == NULL)
  577. {
  578. log_err(errno, __func__, "opendir");
  579. return;
  580. }
  581. /* NOTE: /proc/<pid>/oom_adj tunable is linux specific */
  582. /* LKF: make pbs_mom processes immune to oom killer's killing frenzy if requested*/
  583. if (mom_oom_immunize != 0)
  584. {
  585. if (oom_adj(-17) < 0)
  586. {
  587. log_record(
  588. PBSEVENT_SYSTEM,
  589. PBS_EVENTCLASS_SERVER,
  590. __func__,
  591. "failed to make pbs_mom oom-killer immune");
  592. }
  593. else
  594. {
  595. log_record(
  596. PBSEVENT_SYSTEM,
  597. PBS_EVENTCLASS_SERVER,
  598. __func__,
  599. "mom is now oom-killer safe");
  600. }
  601. }
  602. proc_get_btime();
  603. return;
  604. } /* END dep_initialize() */
  605. void dep_cleanup(void)
  606. {
  607. log_record(PBSEVENT_SYSTEM, 0, __func__, "dependent cleanup");
  608. if (pdir)
  609. {
  610. closedir(pdir);
  611. pdir = NULL;
  612. }
  613. return;
  614. }
  615. /*
  616. * This routine is called on each cycle of the main loop.
  617. */
  618. void
  619. dep_main_loop_cycle(void)
  620. {
  621. /* No periodic functions. */
  622. }
  623. /*
  624. * Internal size decoding routine.
  625. *
  626. * Accepts a resource pointer and a pointer to the unsigned long integer
  627. * to receive the decoded value. It returns a PBS error code, and the
  628. * decoded value in the unsigned long integer.
  629. *
  630. * sizeof(word) = sizeof(int)
  631. */
  632. static int mm_getsize(
  633. resource *pres, /* I */
  634. unsigned long *ret) /* O */
  635. {
  636. unsigned long value;
  637. if (pres->rs_value.at_type != ATR_TYPE_SIZE)
  638. {
  639. return(PBSE_ATTRTYPE);
  640. }
  641. value = pres->rs_value.at_val.at_size.atsv_num;
  642. if (pres->rs_value.at_val.at_size.atsv_units == ATR_SV_WORDSZ)
  643. {
  644. if (value > ULONG_MAX / sizeof(int))
  645. {
  646. return(PBSE_BADATVAL);
  647. }
  648. value *= sizeof(int);
  649. }
  650. if (value > (ULONG_MAX >> pres->rs_value.at_val.at_size.atsv_shift))
  651. {
  652. return(PBSE_BADATVAL);
  653. }
  654. *ret = (value << pres->rs_value.at_val.at_size.atsv_shift);
  655. return(PBSE_NONE);
  656. } /* END mm_getsize() */
  657. /*
  658. * Internal time decoding routine.
  659. *
  660. * Accepts a resource pointer and a pointer to the unsigned long integer
  661. * to receive the decoded value. It returns a PBS error code, and the
  662. * decoded value of time in seconds in the unsigned long integer.
  663. */
  664. static int mm_gettime(
  665. resource *pres,
  666. unsigned long *ret)
  667. {
  668. if (pres->rs_value.at_type != ATR_TYPE_LONG)
  669. {
  670. return(PBSE_ATTRTYPE);
  671. }
  672. if (pres->rs_value.at_val.at_long < 0)
  673. {
  674. return(PBSE_BADATVAL);
  675. }
  676. *ret = pres->rs_value.at_val.at_long;
  677. return(PBSE_NONE);
  678. }
  679. static int injob(
  680. job *pjob,
  681. pid_t sid)
  682. {
  683. task *ptask;
  684. pid_t pid;
  685. #ifdef PENABLE_LINUX26_CPUSETS
  686. struct pidl *pids = NULL;
  687. struct pidl *pp;
  688. #else
  689. proc_stat_t *ps;
  690. #endif /* PENABLE_LINUX26_CPUSETS */
  691. for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
  692. ptask != NULL;
  693. ptask = (task *)GET_NEXT(ptask->ti_jobtask))
  694. {
  695. if (ptask->ti_qs.ti_sid <= 1)
  696. continue;
  697. if (ptask->ti_qs.ti_sid == sid)
  698. {
  699. return(TRUE);
  700. }
  701. }
  702. /* processes with a different sessionid are not necessarily not part of the
  703. job: the job can call setsid; need to check whether one of the parent
  704. processes has a sessionid that is in the job */
  705. #ifdef PENABLE_LINUX26_CPUSETS
  706. /* check whether the sid is in the job's cpuset */
  707. pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids);
  708. pp = pids;
  709. while (pp != NULL)
  710. {
  711. pid = pp->pid;
  712. pp = pp->next;
  713. if (pid == sid)
  714. {
  715. free_pidlist(pids);
  716. return(TRUE);
  717. }
  718. }
  719. free_pidlist(pids);
  720. #else
  721. /* get the parent process id of the sid and check whether it is part of
  722. the job; iterate */
  723. pid = sid;
  724. while (pid > 1)
  725. {
  726. if ((ps = get_proc_stat(pid)) == NULL)
  727. {
  728. if (errno != ENOENT)
  729. {
  730. sprintf(log_buffer, "%d: get_proc_stat", pid);
  731. log_err(errno, __func__, log_buffer);
  732. }
  733. return(FALSE);
  734. }
  735. pid = getsid(ps->ppid);
  736. for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
  737. ptask != NULL;
  738. ptask = (task *)GET_NEXT(ptask->ti_jobtask))
  739. {
  740. if (ptask->ti_qs.ti_sid <= 1)
  741. continue;
  742. if (ptask->ti_qs.ti_sid == pid)
  743. {
  744. return(TRUE);
  745. }
  746. }
  747. }
  748. #endif /* PENABLE_LINUX26_CPUSETS */
  749. return(FALSE);
  750. } /* END injob() */
  751. /*
  752. * Internal session CPU time decoding routine.
  753. *
  754. * Accepts a job pointer. Returns the sum of all cpu time
  755. * consumed for all tasks executed by the job, in seconds,
  756. * adjusted by cputfactor.
  757. */
  758. static unsigned long cput_sum(
  759. job *pjob) /* I */
  760. {
  761. ulong cputime;
  762. int nps = 0;
  763. int i;
  764. proc_stat_t *ps;
  765. cputime = 0;
  766. if (LOGLEVEL >= 6)
  767. {
  768. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  769. pjob->ji_qs.ji_jobid);
  770. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  771. }
  772. for (i = 0;i < nproc;i++)
  773. {
  774. ps = &proc_array[i];
  775. if ((LOGLEVEL >= 6) && (ps == NULL))
  776. {
  777. sprintf(log_buffer, "proc_array loop end - nproc=%d, i=%d, ps is null",
  778. nproc,
  779. i);
  780. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  781. }
  782. if (!injob(pjob, ps->session))
  783. continue;
  784. nps++;
  785. cputime += (ps->utime + ps->stime + ps->cutime + ps->cstime);
  786. if (LOGLEVEL >= 6)
  787. {
  788. sprintf(log_buffer, "%s: session=%d pid=%d cputime=%lu (cputfactor=%f)",
  789. __func__,
  790. ps->session,
  791. ps->pid,
  792. cputime,
  793. cputfactor);
  794. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  795. }
  796. } /* END for (i) */
  797. if (nps == 0)
  798. pjob->ji_flags |= MOM_NO_PROC;
  799. else
  800. pjob->ji_flags &= ~MOM_NO_PROC;
  801. return((unsigned long)((double)cputime * cputfactor));
  802. } /* END cput_sum() */
  803. /*
  804. * Return TRUE if any process in the job is over limit for cputime usage.
  805. */
  806. static int overcpu_proc(
  807. job *pjob,
  808. unsigned long limit) /* I */
  809. {
  810. ulong cputime;
  811. pid_t pid;
  812. proc_stat_t *ps;
  813. #ifdef PENABLE_LINUX26_CPUSETS
  814. struct pidl *pids = NULL;
  815. struct pidl *pp;
  816. #else
  817. struct dirent *dent;
  818. #endif /* PENABLE_LINUX26_CPUSETS */
  819. #ifdef PENABLE_LINUX26_CPUSETS
  820. /* Instead of collect stats of all processes running on a large SMP system,
  821. * collect stats of processes running in and below the cpuset of the job, only. */
  822. pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids);
  823. pp = pids;
  824. while (pp != NULL)
  825. {
  826. pid = pp->pid;
  827. pp = pp->next;
  828. #else
  829. rewinddir(pdir);
  830. while ((dent = readdir(pdir)) != NULL)
  831. {
  832. if (!isdigit(dent->d_name[0]))
  833. continue;
  834. pid = atoi(dent->d_name);
  835. #endif /* PENABLE_LINUX26_CPUSETS */
  836. if ((ps = get_proc_stat(pid)) == NULL)
  837. {
  838. if (errno != ENOENT)
  839. {
  840. sprintf(log_buffer, "%d: get_proc_stat", pid);
  841. log_err(errno, __func__, log_buffer);
  842. }
  843. continue;
  844. }
  845. #ifndef PENABLE_LINUX26_CPUSETS
  846. /* if it was in the cpuset, its part of the job, no need to check */
  847. if (!injob(pjob, ps->session))
  848. continue;
  849. #endif /* PENABLE_LINUX26_CPUSETS */
  850. /* change from ps->cutime to ps->utime, and ps->cstime to ps->stime */
  851. cputime = (ulong)((double)(ps->utime + ps->stime) * cputfactor);
  852. if (cputime > limit)
  853. {
  854. #ifdef PENABLE_LINUX26_CPUSETS
  855. free_pidlist(pids);
  856. #endif
  857. return(TRUE);
  858. }
  859. }
  860. #ifdef PENABLE_LINUX26_CPUSETS
  861. free_pidlist(pids);
  862. #endif
  863. return(FALSE);
  864. } /* END overcpu_proc() */
  865. /*
  866. * Internal session virtual memory usage function.
  867. *
  868. * Returns the total number of bytes of address
  869. * space consumed by all current processes within the job.
  870. */
  871. static unsigned long long mem_sum(
  872. job *pjob)
  873. {
  874. int i;
  875. unsigned long long segadd;
  876. proc_stat_t *ps;
  877. segadd = 0;
  878. if (LOGLEVEL >= 6)
  879. {
  880. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  881. pjob->ji_qs.ji_jobid);
  882. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  883. }
  884. for (i = 0;i < nproc;i++)
  885. {
  886. ps = &proc_array[i];
  887. if (!injob(pjob, ps->session))
  888. continue;
  889. segadd += ps->vsize;
  890. if (LOGLEVEL >= 6)
  891. {
  892. sprintf(log_buffer, "%s: session=%d pid=%d vsize=%llu sum=%llu",
  893. __func__,
  894. ps->session,
  895. ps->pid,
  896. ps->vsize,
  897. segadd);
  898. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  899. }
  900. } /* END for (i) */
  901. return(segadd);
  902. } /* END mem_sum() */
  903. /*
  904. * Internal session memory usage function.
  905. *
  906. * Returns the total number of bytes of resident memory
  907. * consumed by all current processes within the job.
  908. */
  909. static unsigned long long resi_sum(
  910. job *pjob)
  911. {
  912. int i;
  913. unsigned long long resisize;
  914. proc_stat_t *ps;
  915. #ifdef USELIBMEMACCT
  916. long long w_rss;
  917. #endif
  918. resisize = 0;
  919. if (LOGLEVEL >= 6)
  920. {
  921. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  922. pjob->ji_qs.ji_jobid);
  923. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  924. }
  925. for (i = 0;i < nproc;i++)
  926. {
  927. ps = &proc_array[i];
  928. if (!injob(pjob, ps->session))
  929. continue;
  930. #ifdef USELIBMEMACCT
  931. /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */
  932. w_rss = get_memacct_resi(ps->pid);
  933. if (w_rss == -1)
  934. resisize += ps->rss * pagesize;
  935. else
  936. resisize += w_rss;
  937. if (LOGLEVEL >= 6)
  938. {
  939. sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu w_rss=%ld sum=%llu",
  940. __func__,
  941. ps->session,
  942. ps->pid,
  943. ps->rss * pagesize,
  944. w_rss,
  945. resisize);
  946. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  947. }
  948. #else
  949. resisize += ps->rss * pagesize;
  950. if (LOGLEVEL >= 6)
  951. {
  952. sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu sum=%llu",
  953. __func__,
  954. ps->session,
  955. ps->pid,
  956. ps->rss * pagesize,
  957. resisize);
  958. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  959. }
  960. #endif
  961. } /* END for (i) */
  962. return(resisize);
  963. } /* END resi_sum() */
  964. /*
  965. * Return TRUE if any process in the job is over limit for virtual memory usage.
  966. */
  967. static int overmem_proc(
  968. job *pjob, /* I */
  969. unsigned long long limit) /* I */
  970. {
  971. int i;
  972. proc_stat_t *ps;
  973. if (LOGLEVEL >= 6)
  974. {
  975. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  976. pjob->ji_qs.ji_jobid);
  977. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  978. }
  979. for (i = 0;i < nproc;i++)
  980. {
  981. ps = &proc_array[i];
  982. if (!injob(pjob, ps->session))
  983. continue;
  984. if (ps->vsize > limit)
  985. {
  986. return(TRUE);
  987. }
  988. } /* END for (i) */
  989. return(FALSE);
  990. } /* END overmem_proc() */
  991. extern char *msg_momsetlim;
/*
 * Internal error routine
 *
 * Prints a message on stderr using msg_momsetlim as the format, with
 * 'string' and the text returned by pbse_to_txt(value) as its two
 * arguments, then flushes stderr.
 *
 * string - label for the message; asserted to be non-NULL and non-empty
 * value  - code passed to pbse_to_txt(); the text it returns is asserted
 *          to be non-NULL and non-empty
 *
 * Returns 'value' unchanged, so callers can write return(error(...)).
 */
int error(
const char *string,
int value)
{
char *message;
assert(string != NULL);
assert(*string != '\0');
message = pbse_to_txt(value);
assert(message != NULL);
assert(*message != '\0');
/* msg_momsetlim is an extern format string declared just above this function */
fprintf(stderr, msg_momsetlim, string, message);
fflush(stderr);
return(value);
} /* END error() */
  1009. /*
  1010. * Establish system-enforced limits for the job.
  1011. *
  1012. * Run through the resource list, checking the values for all items
  1013. * we recognize.
  1014. *
  1015. * If set_mode is SET_LIMIT_SET, then also set hard limits for the
  1016. * system enforced limits (not-polled).
  1017. * If anything goes wrong with the process, return a PBS error code
  1018. * and print a message on standard error. A zero-length resource list
  1019. * is not an error.
  1020. *
  1021. * If set_mode is SET_LIMIT_SET the entry conditions are:
  1022. * 1. MOM has already forked, and we are called from the child.
  1023. * 2. The child is still running as root.
  1024. * 3. Standard error is open to the user's file.
  1025. *
  1026. * If set_mode is SET_LIMIT_ALTER, we are being called to modify
  1027. * existing limits. Cannot alter those set by setrlimit (kernel)
  1028. * because we are the wrong process.
  1029. */
  1030. int mom_set_limits(
  1031. job *pjob, /* I */
  1032. int set_mode) /* SET_LIMIT_SET or SET_LIMIT_ALTER */
  1033. {
  1034. const char *pname = NULL;
  1035. int retval;
  1036. unsigned long value; /* place in which to build resource value */
  1037. resource *pres;
  1038. struct rlimit reslim;
  1039. unsigned long vmem_limit = 0;
  1040. unsigned long mem_limit = 0;
  1041. /* NOTE: log_buffer is exported */
  1042. if (LOGLEVEL >= 2)
  1043. {
  1044. sprintf(log_buffer, "%s(%s,%s) entered",
  1045. __func__,
  1046. (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
  1047. (set_mode == SET_LIMIT_SET) ? "set" : "alter");
  1048. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1049. log_buffer[0] = '\0';
  1050. }
  1051. assert(pjob != NULL);
  1052. assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
  1053. pres = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
  1054. /*
  1055. * cycle through all the resource specifications,
  1056. * setting limits appropriately.
  1057. */
  1058. memset(&reslim, 0, sizeof(reslim));
  1059. /* set oom_adj score for the starting job */
  1060. /* if immunize mode is set to on, we have to set child score to 0 */
  1061. if ( (set_mode == SET_LIMIT_SET) && ( job_oom_score_adjust != 0 || mom_oom_immunize != 0 ) )
  1062. {
  1063. retval = oom_adj(job_oom_score_adjust);
  1064. if ( LOGLEVEL >= 2 )
  1065. {
  1066. sprintf(log_buffer, "setting oom_adj '%s'",
  1067. (retval != -1) ? "succeeded" : "failed");
  1068. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1069. }
  1070. };
  1071. while (pres != NULL)
  1072. {
  1073. if (pres->rs_defin != NULL)
  1074. pname = pres->rs_defin->rs_name;
  1075. else
  1076. pname = NULL;
  1077. if (LOGLEVEL >= 2)
  1078. {
  1079. sprintf(log_buffer, "setting limit for attribute '%s'",
  1080. (pname != NULL) ? pname : "NULL");
  1081. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1082. log_buffer[0] = '\0';
  1083. }
  1084. assert(pres->rs_defin != NULL);
  1085. assert(pname != NULL);
  1086. assert(pname[0] != '\0');
  1087. if (!strcmp(pname, "cput"))
  1088. {
  1089. if (igncput == FALSE)
  1090. {
  1091. /* cpu time - check, if less than pcput use it */
  1092. retval = mm_gettime(pres, &value);
  1093. if (retval != PBSE_NONE)
  1094. {
  1095. sprintf(log_buffer, "cput mm_gettime failed in %s", __func__);
  1096. return(error(pname, retval));
  1097. }
  1098. }
  1099. }
  1100. else if (!strcmp(pname, "pcput"))
  1101. {
  1102. if (igncput == FALSE)
  1103. {
  1104. if (set_mode == SET_LIMIT_SET)
  1105. {
  1106. /* process cpu time - set */
  1107. retval = mm_gettime(pres, &value);
  1108. if (retval != PBSE_NONE)
  1109. {
  1110. sprintf(log_buffer, "pcput mm_gettime failed in %s", __func__);
  1111. return(error(pname, retval));
  1112. }
  1113. reslim.rlim_cur = reslim.rlim_max =
  1114. (unsigned long)((double)value / cputfactor);
  1115. if (LOGLEVEL >= 2)
  1116. {
  1117. sprintf(log_buffer, "setting cpu time limit to %ld for job %s",
  1118. (long int)reslim.rlim_cur,
  1119. pjob->ji_qs.ji_jobid);
  1120. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1121. log_buffer[0] = '\0';
  1122. }
  1123. /* NOTE: some versions of linux have a bug which causes the parent
  1124. process to receive a SIGKILL if the child's cpu limit is exceeded */
  1125. if (setrlimit(RLIMIT_CPU, &reslim) < 0)
  1126. {
  1127. sprintf(log_buffer, "setrlimit for RLIMIT_CPU failed in %s, errno=%d (%s)",
  1128. __func__,
  1129. errno, strerror(errno));
  1130. return(error("RLIMIT_CPU", PBSE_SYSTEM));
  1131. }
  1132. } /* END if (set_mode == SET_LIMIT_SET) */
  1133. }
  1134. }
  1135. else if (!strcmp(pname, "file"))
  1136. {
  1137. /* set */
  1138. if (set_mode == SET_LIMIT_SET)
  1139. {
  1140. retval = mm_getsize(pres, &value);
  1141. if (retval != PBSE_NONE)
  1142. {
  1143. sprintf(log_buffer, "mm_getsize() failed for file in %s",
  1144. __func__);
  1145. return(error(pname, retval));
  1146. }
  1147. if (value > ULONG_MAX)
  1148. {
  1149. if (LOGLEVEL >= 0)
  1150. {
  1151. sprintf(log_buffer, "cannot set file limit to %ld for job %s (value too large)",
  1152. (long int)reslim.rlim_cur,
  1153. pjob->ji_qs.ji_jobid);
  1154. log_err(-1, __func__, log_buffer);
  1155. log_buffer[0] = '\0';
  1156. }
  1157. return(error(pname, PBSE_BADATVAL));
  1158. }
  1159. reslim.rlim_cur = reslim.rlim_max = value;
  1160. if (setrlimit(RLIMIT_FSIZE, &reslim) < 0)
  1161. {
  1162. sprintf(log_buffer, "cannot set file limit to %ld for job %s (setrlimit failed - check default user limits)",
  1163. (long int)reslim.rlim_max,
  1164. pjob->ji_qs.ji_jobid);
  1165. log_err(errno, __func__, log_buffer);
  1166. log_buffer[0] = '\0';
  1167. return(error(pname, PBSE_SYSTEM));
  1168. }
  1169. }
  1170. }
  1171. else if (!strcmp(pname, "vmem"))
  1172. {
  1173. if (ignvmem == FALSE)
  1174. {
  1175. /* check */
  1176. retval = mm_getsize(pres, &value);
  1177. if (retval != PBSE_NONE)
  1178. {
  1179. sprintf(log_buffer, "mm_getsize() failed for vmem in %s", __func__);
  1180. return(error(pname, retval));
  1181. }
  1182. if ((vmem_limit == 0) || (value < vmem_limit))
  1183. vmem_limit = value;
  1184. }
  1185. }
  1186. else if (!strcmp(pname, "pvmem"))
  1187. {
  1188. if (ignvmem == FALSE)
  1189. {
  1190. /* set */
  1191. if (set_mode == SET_LIMIT_SET)
  1192. {
  1193. retval = mm_getsize(pres, &value);
  1194. if (retval != PBSE_NONE)
  1195. {
  1196. sprintf(log_buffer, "mm_getsize() failed for pvmem in %s",
  1197. __func__);
  1198. return(error(pname, retval));
  1199. }
  1200. if (value > ULONG_MAX)
  1201. {
  1202. log_buffer[0] = '\0';
  1203. sprintf(log_buffer, "invalid value returned by mm_getsize() for pvmem in %s",
  1204. __func__);
  1205. return(error(pname, PBSE_BADATVAL));
  1206. }
  1207. if ((vmem_limit == 0) || (value < vmem_limit))
  1208. vmem_limit = value;
  1209. }
  1210. }
  1211. }
  1212. else if ((!strcmp(pname,"mem") && (pjob->ji_numnodes != 1)) ||
  1213. !strcmp(pname,"mppmem"))
  1214. {
  1215. /* ignore. If we ever get rid of support for the UNICOS OS then we can
  1216. remove the ATR_DFLAG_MOM | ATR_DFLAG_ALTRUN flags from mppmem */
  1217. }
  1218. else if ((!strcmp(pname, "mem") && (pjob->ji_numnodes == 1)) ||
  1219. !strcmp(pname, "pmem"))
  1220. {
  1221. if (ignmem == FALSE)
  1222. {
  1223. /* set */
  1224. if (set_mode == SET_LIMIT_SET)
  1225. {
  1226. retval = mm_getsize(pres, &value);
  1227. if (retval != PBSE_NONE)
  1228. {
  1229. sprintf(log_buffer, "mm_getsize() failed for mem/pmem in %s",
  1230. __func__);
  1231. return(error(pname, retval));
  1232. }
  1233. reslim.rlim_cur = reslim.rlim_max = value;
  1234. if (setrlimit(RLIMIT_DATA, &reslim) < 0)
  1235. {
  1236. sprintf(log_buffer, "cannot set data limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1237. (long int)reslim.rlim_max,
  1238. pjob->ji_qs.ji_jobid,
  1239. errno,
  1240. strerror(errno));
  1241. return(error("RLIMIT_DATA", PBSE_SYSTEM));
  1242. }
  1243. if (setrlimit(RLIMIT_RSS, &reslim) < 0)
  1244. {
  1245. sprintf(log_buffer, "cannot set RSS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1246. (long int)reslim.rlim_max,
  1247. pjob->ji_qs.ji_jobid,
  1248. errno,
  1249. strerror(errno));
  1250. return(error("RLIMIT_RSS", PBSE_SYSTEM));
  1251. }
  1252. #ifdef __GATECH
  1253. /* NOTE: best patch may be to change to 'vmem_limit = value;' */
  1254. if (setrlimit(RLIMIT_STACK, &reslim) < 0)
  1255. {
  1256. sprintf(log_buffer, "cannot set stack limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1257. (long int)reslim.rlim_max,
  1258. pjob->ji_qs.ji_jobid,
  1259. errno,
  1260. strerror(errno));
  1261. return(error("RLIMIT_STACK", PBSE_SYSTEM));
  1262. }
  1263. /* set address space */
  1264. if (setrlimit(RLIMIT_AS, &reslim) < 0)
  1265. {
  1266. sprintf(log_buffer, "cannot set AS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1267. (long int)reslim.rlim_max,
  1268. pjob->ji_qs.ji_jobid,
  1269. errno,
  1270. strerror(errno));
  1271. return(error("RLIMIT_AS", PBSE_SYSTEM));
  1272. }
  1273. #endif /* __GATECH */
  1274. mem_limit = value;
  1275. if (getrlimit(RLIMIT_STACK, &reslim) >= 0)
  1276. {
  1277. /* NOTE: mem_limit no longer used with UMU patch in place */
  1278. mem_limit = value + reslim.rlim_cur;
  1279. }
  1280. }
  1281. }
  1282. } /* END else if (!strcmp(pname,"mem") && ... */
  1283. else if (!strcmp(pname, "walltime"))
  1284. {
  1285. /* check */
  1286. retval = mm_gettime(pres, &value);
  1287. if (retval != PBSE_NONE)
  1288. {
  1289. sprintf(log_buffer, "mm_gettime() failed for walltime in %s\n",
  1290. __func__);
  1291. return(error(pname, retval));
  1292. }
  1293. }
  1294. else if (!strcmp(pname, "nice"))
  1295. {
  1296. /* set nice */
  1297. if (set_mode == SET_LIMIT_SET)
  1298. {
  1299. errno = 0;
  1300. if ((nice((int)pres->rs_value.at_val.at_long) == -1) && (errno != 0))
  1301. {
  1302. sprintf(log_buffer, "nice() failed w/errno=%d (%s) in %s\n",
  1303. errno,
  1304. strerror(errno),
  1305. __func__);
  1306. return(error(pname, PBSE_BADATVAL));
  1307. }
  1308. }
  1309. }
  1310. else if (!strcmp(pname, "size"))
  1311. {
  1312. /* ignore */
  1313. /* NO-OP */
  1314. }
  1315. else if (!strcmp(pname, "prologue"))
  1316. {
  1317. }
  1318. else if (!strcmp(pname, "epilogue"))
  1319. {
  1320. }
  1321. else if ((!strcmp(pname, "mppdepth")) ||
  1322. (!strcmp(pname, "mppnodect")) ||
  1323. (!strcmp(pname, "mppwidth")) ||
  1324. (!strcmp(pname, "mppnppn")) ||
  1325. (!strcmp(pname, "mppnodes")) ||
  1326. (!strcmp(pname, "mpplabels")) ||
  1327. (!strcmp(pname, "mpparch")) ||
  1328. (!strcmp(pname, "mpplabel")))
  1329. {
  1330. /* NO-OP */
  1331. }
  1332. else if ((pres->rs_defin->rs_flags & ATR_DFLAG_RMOMIG) == 0)
  1333. {
  1334. /* don't recognize and not marked as ignore by mom */
  1335. sprintf(log_buffer, "do not know how to process resource '%s' in %s\n",
  1336. pname,
  1337. __func__);
  1338. return(error(pname, PBSE_UNKRESC));
  1339. }
  1340. pres = (resource *)GET_NEXT(pres->rs_link);
  1341. }
  1342. if (set_mode == SET_LIMIT_SET)
  1343. {
  1344. /* if either of vmem or pvmem was given, set sys limit to lesser */
  1345. if (vmem_limit != 0)
  1346. {
  1347. /* Don't make (p)vmem < pmem */
  1348. if (mem_limit > vmem_limit)
  1349. {
  1350. vmem_limit = mem_limit;
  1351. }
  1352. reslim.rlim_cur = reslim.rlim_max = vmem_limit;
  1353. if ((ignvmem == 0) && (setrlimit(RLIMIT_AS, &reslim) < 0))
  1354. {
  1355. sprintf(log_buffer, "setrlimit() failed setting AS for vmem_limit mod in %s\n",
  1356. __func__);
  1357. return(error("RLIMIT_AS", PBSE_SYSTEM));
  1358. }
  1359. /* UMU vmem patch sets RLIMIT_AS rather than RLIMIT_DATA and RLIMIT_STACK */
  1360. /*
  1361. reslim.rlim_cur = reslim.rlim_max = mem_limit;
  1362. if (setrlimit(RLIMIT_DATA,&reslim) < 0)
  1363. {
  1364. sprintf(log_buffer,"setrlimit() failed setting data for vmem_limit mod in %s\n",
  1365. id);
  1366. return(error("RLIMIT_DATA",PBSE_SYSTEM));
  1367. }
  1368. if (setrlimit(RLIMIT_STACK,&reslim) < 0)
  1369. {
  1370. sprintf(log_buffer,"setrlimit() failed setting stack for vmem_limit mod in %s\n",
  1371. id);
  1372. return(error("RLIMIT_STACK",PBSE_SYSTEM));
  1373. }
  1374. */
  1375. }
  1376. }
  1377. if (LOGLEVEL >= 5)
  1378. {
  1379. sprintf(log_buffer, "%s(%s,%s) completed",
  1380. __func__,
  1381. (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
  1382. (set_mode == SET_LIMIT_SET) ? "set" : "alter");
  1383. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1384. log_buffer[0] = '\0';
  1385. }
  1386. /* SUCCESS */
  1387. return(PBSE_NONE);
  1388. } /* END mom_set_limits() */
  1389. /*
  1390. * State whether MOM main loop has to poll this job to determine if some
  1391. * limits are being exceeded.
  1392. *
  1393. * Sets flag TRUE if polling is necessary, FALSE otherwise. Actual
  1394. * polling is done using the mom_over_limit machine-dependent function.
  1395. */
  1396. int mom_do_poll(
  1397. job *pjob) /* I */
  1398. {
  1399. const char *pname;
  1400. resource *pres;
  1401. assert(pjob != NULL);
  1402. if (LOGLEVEL >= 4)
  1403. {
  1404. log_record(
  1405. PBSEVENT_JOB,
  1406. PBS_EVENTCLASS_JOB,
  1407. pjob->ji_qs.ji_jobid,
  1408. "evaluating limits for job");
  1409. }
  1410. assert(pjob != NULL);
  1411. assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
  1412. pres = (resource *)GET_NEXT(
  1413. pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
  1414. while (pres != NULL)
  1415. {
  1416. assert(pres->rs_defin != NULL);
  1417. pname = pres->rs_defin->rs_name;
  1418. assert(pname != NULL);
  1419. assert(*pname != '\0');
  1420. if (strcmp(pname, "walltime") == 0 ||
  1421. strcmp(pname, "cput") == 0 ||
  1422. strcmp(pname, "pcput") == 0 ||
  1423. strcmp(pname, "mem") == 0 ||
  1424. strcmp(pname, "pvmem") == 0 ||
  1425. strcmp(pname, "vmem") == 0)
  1426. {
  1427. return(TRUE);
  1428. }
  1429. pres = (resource *)GET_NEXT(pres->rs_link);
  1430. }
  1431. return(FALSE);
  1432. } /* END mom_do_poll() */
  1433. /*
  1434. * Setup for polling.
  1435. *
  1436. * Open kernel device and get namelist info.
  1437. */
  1438. int mom_open_poll(void)
  1439. {
  1440. if (LOGLEVEL >= 6)
  1441. {
  1442. log_record(PBSEVENT_SYSTEM, 0, __func__, "started");
  1443. }
  1444. pagesize = getpagesize();
  1445. proc_array = (proc_stat_t *)calloc(TBL_INC, sizeof(proc_stat_t));
  1446. if (proc_array == NULL)
  1447. {
  1448. log_err(errno, __func__, "calloc");
  1449. return(PBSE_SYSTEM);
  1450. }
  1451. max_proc = TBL_INC;
  1452. return(PBSE_NONE);
  1453. } /* END mom_open_poll() */
  1454. /*
  1455. * Declare start of polling loop.
  1456. *
  1457. * This function caches information about all of processes
  1458. * on the compute node (pbs_mom calls this function). Each process
  1459. * in /proc/ is queried by looking at the 'stat' file. Statistics like
  1460. * CPU usage time, memory consumption, etc. are gathered in the proc_array
  1461. * list. This list is then used throughout the pbs_mom to get information
  1462. * about tasks it is monitoring.
  1463. *
  1464. * This function is called from the main MOM loop once every "check_poll_interval"
  1465. * seconds.
  1466. *
  1467. * @see get_proc_stat() - child
  1468. * @see mom_set_use() - Aggregates data collected here
  1469. *
  1470. * NOTE: populates global 'proc_array[]' variable.
  1471. * NOTE: reallocs proc_array[] as needed to accommodate processes.
  1472. *
  1473. * @see mom_open_poll() - allocs proc_array table.
  1474. * @see mom_close_poll() - frees proc_array.
  1475. * @see setup_program_environment() - parent - called at pbs_mom start
  1476. * @see main_loop() - parent - called once per iteration
  1477. * @see mom_set_use() - populate job structure with usage data for local use or to send to mother superior
  1478. */
  1479. int mom_get_sample(void)
  1480. {
  1481. proc_stat_t *pi;
  1482. proc_stat_t *ps;
  1483. pid_t pid;
  1484. #ifdef PENABLE_LINUX26_CPUSETS
  1485. struct pidl *pids = NULL;
  1486. struct pidl *pp;
  1487. #else
  1488. struct dirent *dent;
  1489. #endif
  1490. if (proc_array == NULL)
  1491. mom_open_poll();
  1492. nproc = 0;
  1493. pi = proc_array;
  1494. if (LOGLEVEL >= 6)
  1495. {
  1496. log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "proc_array load started");
  1497. }
  1498. #ifdef PENABLE_LINUX26_CPUSETS
  1499. /* Instead of collect stats of all processes running on a large SMP system,
  1500. * collect stats of processes running in and below the Torque cpuset, only
  1501. * This relies on reliable process starters for MPI, which bind their tasks
  1502. * to the cpuset of the job. */
  1503. #ifdef USELIBCPUSET
  1504. pids = get_cpuset_pidlist(TTORQUECPUSET_BASE, pids);
  1505. #else
  1506. pids = get_cpuset_pidlist(TTORQUECPUSET_PATH, pids);
  1507. #endif
  1508. pp = pids;
  1509. while (pp != NULL)
  1510. {
  1511. pid = pp->pid;
  1512. pp = pp->next;
  1513. #else
  1514. if (pdir == NULL)
  1515. {
  1516. if ((pdir = opendir(procfs)) == NULL)
  1517. return(PBSE_SYSTEM);
  1518. }
  1519. rewinddir(pdir);
  1520. while ((dent = readdir(pdir)) != NULL)
  1521. {
  1522. if (!isdigit(dent->d_name[0]))
  1523. continue;
  1524. pid = atoi(dent->d_name);
  1525. #endif
  1526. if ((ps = get_proc_stat(pid)) == NULL)
  1527. {
  1528. if (errno != ENOENT)
  1529. {
  1530. sprintf(log_buffer, "%d: get_proc_stat", pid);
  1531. log_err(errno, __func__, log_buffer);
  1532. }
  1533. continue;
  1534. }
  1535. /* nproc++; -- we need to increment AFTER assigning this ps to
  1536. the proc_array--otherwise we could skip it in for loops */
  1537. if ((nproc + 1) >= max_proc)
  1538. {
  1539. proc_stat_t *hold;
  1540. if (LOGLEVEL >= 9)
  1541. {
  1542. log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "alloc more proc_array");
  1543. }
  1544. max_proc *= 2;
  1545. hold = (proc_stat_t *)calloc(1, max_proc * sizeof(proc_stat_t));
  1546. if (hold == NULL)
  1547. {
  1548. log_err(errno, __func__, "unable to realloc space for proc_array sample");
  1549. return(PBSE_SYSTEM);
  1550. }
  1551. memcpy(hold, proc_array, sizeof(proc_stat_t) * max_proc / 2);
  1552. free(proc_array);
  1553. proc_array = hold;
  1554. } /* END if ((nproc+1) == max_proc) */
  1555. pi = &proc_array[nproc++];
  1556. memcpy(pi, ps, sizeof(proc_stat_t));
  1557. } /* END while (...) != NULL) */
  1558. #ifdef PENABLE_LINUX26_CPUSETS
  1559. free_pidlist(pids);
  1560. #endif
  1561. if (LOGLEVEL >= 6)
  1562. {
  1563. sprintf(log_buffer, "proc_array loaded - nproc=%d",
  1564. nproc);
  1565. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  1566. }
  1567. return(PBSE_NONE);
  1568. } /* END mom_get_sample() */
  1569. /*
  1570. * Measure job resource usage and compare with its limits.
  1571. *
  1572. * If it has exceeded any well-formed polled limit return the limit that
  1573. * it exceeded.
  1574. * Otherwise, return PBSE_NONE. log_buffer is populated with failure.
  1575. */
  1576. int mom_over_limit(
  1577. job *pjob) /* I */
  1578. {
  1579. const char *pname;
  1580. int retval;
  1581. unsigned long value;
  1582. unsigned long num;
  1583. unsigned long long numll;
  1584. resource *pres;
  1585. assert(pjob != NULL);
  1586. assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
  1587. pres = (resource *)GET_NEXT(
  1588. pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
  1589. for (;pres != NULL;pres = (resource *)GET_NEXT(pres->rs_link))
  1590. {
  1591. assert(pres->rs_defin != NULL);
  1592. pname = pres->rs_defin->rs_name;
  1593. assert(pname != NULL);
  1594. assert(*pname != '\0');
  1595. if ((igncput == FALSE) && (strcmp(pname, "cput") == 0))
  1596. {
  1597. retval = mm_gettime(pres, &value);
  1598. if (retval != PBSE_NONE)
  1599. continue;
  1600. if ((num = cput_sum(pjob)) > value)
  1601. {
  1602. sprintf(log_buffer, "cput %lu exceeded limit %lu",
  1603. num,
  1604. value);
  1605. return(JOB_EXEC_OVERLIMIT_CPUT);
  1606. }
  1607. }
  1608. else if ((igncput == FALSE) && (strcmp(pname, "pcput") == 0))
  1609. {
  1610. retval = mm_gettime(pres, &value);
  1611. if (retval != PBSE_NONE)
  1612. continue;
  1613. if (overcpu_proc(pjob, value))
  1614. {
  1615. sprintf(log_buffer, "pcput exceeded limit %lu",
  1616. value);
  1617. return(JOB_EXEC_OVERLIMIT_CPUT);
  1618. }
  1619. }
  1620. else if (strcmp(pname, "vmem") == 0)
  1621. {
  1622. retval = mm_getsize(pres, &value);
  1623. if (retval != PBSE_NONE)
  1624. continue;
  1625. if ((ignvmem == 0) && ((numll = mem_sum(pjob)) > value))
  1626. {
  1627. sprintf(log_buffer, "vmem %llu exceeded limit %lu",
  1628. numll,
  1629. value);
  1630. return(JOB_EXEC_OVERLIMIT_MEM);
  1631. }
  1632. }
  1633. else if (strcmp(pname, "pvmem") == 0)
  1634. {
  1635. unsigned long long valuell;
  1636. retval = mm_getsize(pres, &value);
  1637. if (retval != PBSE_NONE)
  1638. continue;
  1639. valuell = (unsigned long long)value;
  1640. if ((ignvmem == 0) && (overmem_proc(pjob, valuell)))
  1641. {
  1642. sprintf(log_buffer, "pvmem exceeded limit %llu",
  1643. valuell);
  1644. return(JOB_EXEC_OVERLIMIT_MEM);
  1645. }
  1646. }
  1647. else if (ignwalltime == 0 && strcmp(pname, "walltime") == 0)
  1648. {
  1649. /* no need to check walltime on sisters, MS will get it */
  1650. if (am_i_mother_superior(*pjob) == false)
  1651. continue;
  1652. retval = mm_gettime(pres, &value);
  1653. if (retval != PBSE_NONE)
  1654. continue;
  1655. num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) *
  1656. wallfactor);
  1657. if (num > value)
  1658. {
  1659. sprintf(log_buffer, "walltime %ld exceeded limit %ld",
  1660. num,
  1661. value);
  1662. return(JOB_EXEC_OVERLIMIT_WT);
  1663. }
  1664. }
  1665. } /* END for (pres) */
  1666. #ifdef PENABLE_LINUX26_CPUSETS
  1667. /* Check memory_pressure */
  1668. if (memory_pressure_threshold > 0)
  1669. {
  1670. /*
  1671. * If last recorded memory_pressure is over threshold, increment counter.
  1672. * If duration is enabled, throw over_limit if counter reaches duration.
  1673. */
  1674. if (pjob->ji_mempressure_curr < memory_pressure_threshold)
  1675. {
  1676. pjob->ji_mempressure_cnt = 0; /* reset */
  1677. }
  1678. else
  1679. {
  1680. pjob->ji_mempressure_cnt++; /* count */
  1681. sprintf(log_buffer, "job %s memory_pressure is over %d for %d (%d) cycles",
  1682. pjob->ji_qs.ji_jobid,
  1683. memory_pressure_threshold,
  1684. pjob->ji_mempressure_cnt,
  1685. memory_pressure_duration);
  1686. log_ext(-1, __func__, log_buffer,LOG_ALERT);
  1687. if (memory_pressure_duration && (pjob->ji_mempressure_cnt >= memory_pressure_duration))
  1688. {
  1689. sprintf(log_buffer, "swap rate due to memory oversubscription is too high");
  1690. return(JOB_EXEC_OVERLIMIT_MEM);
  1691. }
  1692. }
  1693. }
  1694. #endif
  1695. return(PBSE_NONE);
  1696. } /* END mom_over_limit() */
  1697. /*
  1698. * job_expected_resc_found: logs an error if an expected resource was not found
  1699. */
  1700. int job_expected_resc_found(
  1701. const resource *pres,
  1702. const resource_def *rd,
  1703. const char *jobid)
  1704. {
  1705. if (!pres)
  1706. {
  1707. char log_buf[2048];
  1708. snprintf(log_buf, sizeof(log_buf), "job %s missing expected resource %s for resource usage calculation",
  1709. jobid, rd->rs_name);
  1710. log_err(-1, __func__, log_buf);
  1711. return -1;
  1712. }
  1713. return PBSE_NONE;
  1714. }
  1715. /*
  1716. * Update the job attribute for resources used.
  1717. *
  1718. * The first time this function is called for a job,
  1719. * it sets up resource entries for
  1720. * each resource that can be reported for this machine.
  1721. *
  1722. * Subsequent calls update the resource usage information based on
  1723. * stats gathered by the mom_get_sample() function. This function
  1724. * is often called by "im_request()" as a result of POLL_JOB query
  1725. * from the mother superior.
  1726. *
  1727. * @see im_request() - parent - respond to poll_job request from mother superior
  1728. * @see examine_all_running_jobs() - parent - update local use on mother superior
  1729. * @see TMomFinalizeJob1() - parent - update serial job immediately at job start
  1730. *
  1731. * @return An error code if something goes wrong.
  1732. */
  1733. int mom_set_use(
  1734. job *pjob) /* I (modified) */
  1735. {
  1736. resource *pres;
  1737. pbs_attribute *at;
  1738. resource_def *rd;
  1739. unsigned long *lp;
  1740. unsigned long lnum;
  1741. #ifdef PENABLE_LINUX26_CPUSETS
  1742. int inum;
  1743. #endif
  1744. assert(pjob != NULL);
  1745. at = &pjob->ji_wattr[JOB_ATR_resc_used];
  1746. assert(at->at_type == ATR_TYPE_RESC);
  1747. #ifdef USESAVEDRESOURCES
  1748. /* don't update jobs that are marked as recovery */
  1749. if (pjob->ji_flags & MOM_JOB_RECOVERY)
  1750. {
  1751. return(PBSE_NONE);
  1752. }
  1753. #endif /* USESAVEDRESOURCES */
  1754. at->at_flags |= ATR_VFLAG_MODIFY;
  1755. if ((at->at_flags & ATR_VFLAG_SET) == 0)
  1756. {
  1757. /* initialize usage structures */
  1758. at->at_flags |= ATR_VFLAG_SET;
  1759. rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
  1760. assert(rd != NULL);
  1761. pres = add_resource_entry(at, rd);
  1762. pres->rs_value.at_flags |= ATR_VFLAG_SET;
  1763. pres->rs_value.at_type = ATR_TYPE_LONG;
  1764. rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
  1765. assert(rd != NULL);
  1766. pres = add_resource_entry(at, rd);
  1767. pres->rs_value.at_flags |= ATR_VFLAG_SET;
  1768. pres->rs_value.at_type = ATR_TYPE_SIZE;
  1769. pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */
  1770. pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;
  1771. rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
  1772. assert(rd != NULL);
  1773. pres = add_resource_entry(at, rd);
  1774. pres->rs_value.at_flags |= ATR_VFLAG_SET;
  1775. pres->rs_value.at_type = ATR_TYPE_LONG;
  1776. rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
  1777. assert(rd != NULL);
  1778. pres = add_resource_entry(at, rd);
  1779. pres->rs_value.at_flags |= ATR_VFLAG_SET;
  1780. pres->rs_value.at_type = ATR_TYPE_SIZE;
  1781. pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */
  1782. pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;
  1783. } /* END if ((at->at_flags & ATR_VFLAG_SET) == 0) */
  1784. /* get cputime */
  1785. rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
  1786. assert(rd != NULL);
  1787. pres = find_resc_entry(at, rd);
  1788. if (job_expected_resc_found(pres, rd, pjob->ji_qs.ji_jobid))
  1789. return -1;
  1790. lp = (unsigned long *) & pres->rs_value.at_val.at_long;
  1791. lnum = cput_sum(pjob);
  1792. *lp = MAX(*lp, lnum);
  1793. /* get swap */
  1794. rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
  1795. assert(rd != NULL);
  1796. pres = find_resc_entry(at, rd);
  1797. if (job_expected_resc_found(pres, rd, pjob->ji_qs.ji_jobid))
  1798. return -1;
  1799. lp = &pres->rs_value.at_val.at_size.atsv_num;
  1800. lnum = (mem_sum(pjob) + 1023) >> pres->rs_value.at_val.at_size.atsv_shift; /* as KB */
  1801. *lp = MAX(*lp, lnum);
  1802. /* get walltime */
  1803. rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
  1804. assert(rd != NULL);
  1805. pres = find_resc_entry(at, rd);
  1806. if (job_expected_resc_found(pres, rd, pjob->ji_qs.ji_jobid))
  1807. return -1;
  1808. /* NOTE: starting jobs can come through here before stime is recorded */
  1809. if (pjob->ji_qs.ji_stime == 0)
  1810. pres->rs_value.at_val.at_long = 0;
  1811. else
  1812. pres->rs_value.at_val.at_long =
  1813. (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor);
  1814. /* get memory */
  1815. rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
  1816. assert(rd != NULL);
  1817. pres = find_resc_entry(at, rd);
  1818. if (job_expected_resc_found(pres, rd, pjob->ji_qs.ji_jobid))
  1819. return -1;
  1820. lp = &pres->rs_value.at_val.at_size.atsv_num;
  1821. lnum = (resi_sum(pjob) + 1023) >> pres->rs_value.at_val.at_size.atsv_shift; /* as KB */
  1822. *lp = MAX(*lp, lnum);
  1823. #ifdef PENABLE_LINUX26_CPUSETS
  1824. /* get memory_pressure */
  1825. if (memory_pressure_threshold > 0)
  1826. {
  1827. inum = get_cpuset_mempressure(pjob->ji_qs.ji_jobid);
  1828. /* Store if success */
  1829. if (inum != -1)
  1830. pjob->ji_mempressure_curr = inum;
  1831. /* Alert if there is pressure */
  1832. if (inum > 0)
  1833. {
  1834. sprintf(log_buffer, "job %s causes memory_pressure %d", pjob->ji_qs.ji_jobid, inum);
  1835. log_ext(-1, __func__, log_buffer, LOG_ALERT);
  1836. }
  1837. }
  1838. else
  1839. {
  1840. pjob->ji_mempressure_curr = 0;
  1841. }
  1842. #endif
  1843. return(PBSE_NONE);
  1844. } /* END mom_set_use() */
  1845. /**
  1846. * Kill a task session.
  1847. * Call with the task pointer and a signal number.
  1848. *
  1849. * @return number of tasks signalled (0 = failure)
  1850. *
  1851. * @see kill_job() - parent
  1852. *
  1853. * NOTE: should support killpg() or killpidtree() - (NYI)
  1854. * may be required for suspend/resume
  1855. */
  1856. int kill_task(
  1857. task *ptask, /* I */
  1858. int sig, /* I */
  1859. int pg) /* I (1=signal process group, 0=signal master process only) */
  1860. {
  1861. int ct = 0; /* num of processes killed */
  1862. int ctThisIteration = 0;
  1863. int ctCleanIterations = 0;
  1864. int loopCt = 0;
  1865. int NumProcessesFound = 0; /* number of processes found with session ID */
  1866. #ifdef PENABLE_LINUX26_CPUSETS
  1867. struct pidl *pids = NULL;
  1868. struct pidl *pp;
  1869. #else
  1870. struct dirent *dent;
  1871. #endif
  1872. pid_t pid;
  1873. proc_stat_t *ps;
  1874. int sesid;
  1875. pid_t mompid;
  1876. sesid = ptask->ti_qs.ti_sid;
  1877. mompid = getpid();
  1878. if (LOGLEVEL >= 5)
  1879. {
  1880. sprintf(log_buffer, "%s: sending signal %d to task %d, session %d",
  1881. __func__,
  1882. sig,
  1883. ptask->ti_qs.ti_task,
  1884. sesid);
  1885. log_record(
  1886. PBSEVENT_JOB,
  1887. PBS_EVENTCLASS_JOB,
  1888. ptask->ti_job->ji_qs.ji_jobid,
  1889. log_buffer);
  1890. }
  1891. if (sesid <= 1)
  1892. {
  1893. if (LOGLEVEL >= 3)
  1894. {
  1895. sprintf(log_buffer, "cannot send signal %d to task (no session id)",
  1896. sig);
  1897. log_record(
  1898. PBSEVENT_ERROR,
  1899. PBS_EVENTCLASS_JOB,
  1900. ptask->ti_job->ji_qs.ji_jobid,
  1901. log_buffer);
  1902. }
  1903. /* FAILURE */
  1904. return(0);
  1905. }
  1906. do
  1907. {
  1908. ctThisIteration = 0;
  1909. /* NOTE: do not use cached proc-buffer since we need up-to-date info */
  1910. #ifdef PENABLE_LINUX26_CPUSETS
  1911. /* Instead of collecting stats of all processes running on a large SMP system,
  1912. * collect stats of processes running in and below the Torque cpuset, only
  1913. * This relies on reliable process starters for MPI, which bind their tasks
  1914. * to the cpuset of the job. */
  1915. #ifdef USELIBCPUSET
  1916. pids = get_cpuset_pidlist(TTORQUECPUSET_BASE, pids);
  1917. #else
  1918. pids = get_cpuset_pidlist(TTORQUECPUSET_PATH, pids);
  1919. #endif /* USELIBCPUSET */
  1920. pp = pids;
  1921. while (pp != NULL)
  1922. {
  1923. pid = pp->pid;
  1924. pp = pp->next;
  1925. #else
  1926. if (pdir == NULL)
  1927. {
  1928. if ((pdir = opendir(procfs)) == NULL)
  1929. return(PBSE_SYSTEM);
  1930. }
  1931. /* pdir is global */
  1932. rewinddir(pdir);
  1933. while ((dent = readdir(pdir)) != NULL)
  1934. {
  1935. if (!isdigit(dent->d_name[0]))
  1936. continue;
  1937. pid = atoi(dent->d_name);
  1938. #endif /* PENABLE_LINUX26_CPUSETS */
  1939. if ((ps = get_proc_stat(pid)) == NULL)
  1940. {
  1941. if (errno != ENOENT)
  1942. {
  1943. sprintf(log_buffer, "%d: get_proc_stat", pid);
  1944. log_err(errno, __func__, log_buffer);
  1945. }
  1946. continue;
  1947. }
  1948. if ((sesid == ps->session) ||
  1949. (ProcIsChild(procfs,pid,ptask->ti_job->ji_qs.ji_jobid) == TRUE))
  1950. {
  1951. NumProcessesFound++;
  1952. if ((ps->state == 'Z') || (ps->pid == 0))
  1953. {
  1954. /*
  1955. * Killing a zombie is sure death! Its pid is zero,
  1956. * which to kill(2) means 'every process in the process
  1957. * group of the current process'.
  1958. */
  1959. sprintf(log_buffer, "%s: not killing process (pid=%d/state=%c) with sig %d",
  1960. __func__,
  1961. ps->pid,
  1962. ps->state,
  1963. sig);
  1964. log_record(
  1965. PBSEVENT_JOB,
  1966. PBS_EVENTCLASS_JOB,
  1967. ptask->ti_job->ji_qs.ji_jobid,
  1968. log_buffer);
  1969. } /* END if ((ps->state == 'Z') || (ps->pid == 0)) */
  1970. else
  1971. {
  1972. int i = 0;
  1973. if (ps->pid == mompid)
  1974. {
  1975. /*
  1976. * there is a race condition with newly started jobs that
  1977. * can be killed before they've established their own
  1978. * session id. This means the child tasks still have MOM's
  1979. * session id. We check this to make sure MOM doesn't kill
  1980. * herself.
  1981. */
  1982. if (LOGLEVEL >= 3)
  1983. {
  1984. sprintf(log_buffer, "%s: not killing process %d. Avoid sending signal because child task still has MOM's session id", __func__, ps->pid);
  1985. log_record(
  1986. PBSEVENT_JOB,
  1987. PBS_EVENTCLASS_JOB,
  1988. ptask->ti_job->ji_qs.ji_jobid,
  1989. log_buffer);
  1990. }
  1991. if((sig == SIGKILL)||(sig == SIGTERM))
  1992. {
  1993. ++ctThisIteration; //Ultimately this is task that will need to be killed.
  1994. }
  1995. continue;
  1996. } /* END if (ps->pid == mompid) */
  1997. if((sig == SIGKILL)||(sig == SIGTERM))
  1998. {
  1999. ++ctThisIteration; //Only count for killing don't count for any other signal.
  2000. }
  2001. if (sig == SIGKILL)
  2002. {
  2003. struct timespec req;
  2004. req.tv_sec = 0;
  2005. req.tv_nsec = 250000000; /* .25 seconds */
  2006. /* give the process some time to quit gracefully first (up to .25*20=5 seconds) */
  2007. sprintf(log_buffer, "%s: killing pid %d task %d gracefully with sig %d",
  2008. __func__,
  2009. ps->pid,
  2010. ptask->ti_qs.ti_task,
  2011. SIGTERM);
  2012. log_record(
  2013. PBSEVENT_JOB,
  2014. PBS_EVENTCLASS_JOB,
  2015. ptask->ti_job->ji_qs.ji_jobid,
  2016. log_buffer);
  2017. if (pg == 0)
  2018. kill(ps->pid, SIGTERM);
  2019. else
  2020. killpg(ps->pid, SIGTERM);
  2021. for (i = 0;i < 20;i++)
  2022. {
  2023. /* check if process is gone */
  2024. if ((ps = get_proc_stat(ps->pid)) == NULL)
  2025. {
  2026. break;
  2027. }
  2028. else
  2029. {
  2030. sprintf(log_buffer, "%s: process (pid=%d/state=%c) after sig %d",
  2031. __func__,
  2032. ps->pid,
  2033. ps->state,
  2034. SIGTERM);
  2035. log_record(
  2036. PBSEVENT_JOB,
  2037. PBS_EVENTCLASS_JOB,
  2038. ptask->ti_job->ji_qs.ji_jobid,
  2039. log_buffer);
  2040. if (ps->state == 'Z')
  2041. break;
  2042. }
  2043. /* try to kill again */
  2044. if (kill(ps->pid, 0) == -1)
  2045. break;
  2046. nanosleep(&req, NULL);
  2047. } /* END for (i = 0) */
  2048. } /* END if (sig == SIGKILL) */
  2049. else
  2050. {
  2051. i = 20;
  2052. }
  2053. if (i >= 20)
  2054. {
  2055. /* NOTE: handle race-condition where process goes zombie as a result of previous SIGTERM */
  2056. /* update proc info from /proc/<PID>/stat */
  2057. if ((ps = get_proc_stat(ps->pid)) != NULL)
  2058. {
  2059. if (ps->state == 'Z')
  2060. {
  2061. /*
  2062. * Killing a zombie is sure death! Its pid is zero,
  2063. * which to kill(2) means 'every process in the process
  2064. * group of the current process'.
  2065. */
  2066. sprintf(log_buffer, "%s: not killing process (pid=%d/state=%c) with sig %d",
  2067. __func__,
  2068. ps->pid,
  2069. ps->state,
  2070. sig);
  2071. log_record(
  2072. PBSEVENT_JOB,
  2073. PBS_EVENTCLASS_JOB,
  2074. ptask->ti_job->ji_qs.ji_jobid,
  2075. log_buffer);
  2076. } /* END if ((ps->state == 'Z') || (ps->pid == 0)) */
  2077. else
  2078. {
  2079. /* kill process hard */
  2080. /* why is this not killing with SIGKILL? */
  2081. sprintf(log_buffer, "%s: killing pid %d task %d with sig %d",
  2082. __func__,
  2083. ps->pid,
  2084. ptask->ti_qs.ti_task,
  2085. sig);
  2086. log_record(
  2087. PBSEVENT_JOB,
  2088. PBS_EVENTCLASS_JOB,
  2089. ptask->ti_job->ji_qs.ji_jobid,
  2090. log_buffer);
  2091. if (pg == 0)
  2092. kill(ps->pid, sig);
  2093. else
  2094. killpg(ps->pid, sig);
  2095. }
  2096. } /* END if ((ps = get_proc_stat(ps->pid)) != NULL) */
  2097. } /* END if (i >= 20) */
  2098. ++ct;
  2099. } /* END else ((ps->state == 'Z') || (ps->pid == 0)) */
  2100. } /* END if (sesid == ps->session) */
  2101. } /* END while (...) != NULL) */
  2102. #ifdef PENABLE_LINUX26_CPUSETS
  2103. free_pidlist(pids);
  2104. pids = NULL;
  2105. #endif
  2106. if(ctThisIteration == 0)
  2107. {
  2108. ctCleanIterations++;
  2109. }
  2110. else
  2111. {
  2112. ctCleanIterations=0;
  2113. }
  2114. }while((ctCleanIterations <= 5)&&(loopCt++ < 20));
  2115. /* NOTE: to fix bad state situations resulting from a hard crash, the logic
  2116. below should be triggered any time no processes are found (NYI) */
  2117. if (IS_ADOPTED_TASK(ptask->ti_qs.ti_task) && (NumProcessesFound == 0))
  2118. {
  2119. /* no process was found, but for an adopted task this is OK (we don't find
  2120. * out about the adopted task's termination via waitpid()--so we can safely
  2121. * say that we have "killed" the task, even though the task was killed/died
  2122. * some other way */
  2123. ct++;
  2124. /* do code to mark task as finished (borrowed from Linux scan_for_terminated())... */
  2125. ptask->ti_qs.ti_exitstat = 0; /* assume successful completion */
  2126. ptask->ti_qs.ti_status = TI_STATE_EXITED;
  2127. task_save(ptask);
  2128. sprintf(log_buffer,
  2129. "%s: job %s adopted task %d was marked as terminated because task's PID was no longer found, sid=%d",
  2130. __func__,
  2131. ptask->ti_job->ji_qs.ji_jobid,
  2132. ptask->ti_qs.ti_task,
  2133. ptask->ti_qs.ti_sid);
  2134. log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer);
  2135. }
  2136. if ((NumProcessesFound == 0) && (ct <= 0))
  2137. {
  2138. /* we can't find any processes belonging to given session, so we can safely say
  2139. * that we "killed" the task and have TORQUE clean it up */
  2140. ct++;
  2141. /* do code to mark task as finished (borrowed from Linux scan_for_terminated())... */
  2142. ptask->ti_qs.ti_exitstat = 0; /* assume successful completion */
  2143. ptask->ti_qs.ti_status = TI_STATE_EXITED;
  2144. task_save(ptask);
  2145. if (LOGLEVEL >= 5)
  2146. {
  2147. sprintf(log_buffer,
  2148. "%s: could not send signal %d to task %d (session %d)--no process was found with this session ID (marking task as killed)!",
  2149. __func__,
  2150. sig,
  2151. ptask->ti_qs.ti_task,
  2152. sesid);
  2153. log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer);
  2154. }
  2155. }
  2156. /* SUCCESS */
  2157. return(ct);
  2158. } /* END kill_task() */
  2159. /*
  2160. * Clean up everything related to polling.
  2161. */
  2162. int mom_close_poll(void)
  2163. {
  2164. if (LOGLEVEL >= 6)
  2165. {
  2166. log_record(PBSEVENT_SYSTEM, 0, __func__, "entered");
  2167. }
  2168. if (pdir != NULL)
  2169. {
  2170. if (closedir(pdir) != 0)
  2171. {
  2172. log_err(errno, __func__, "closedir");
  2173. return(PBSE_SYSTEM);
  2174. }
  2175. pdir = NULL;
  2176. }
  2177. if (proc_array != NULL)
  2178. {
  2179. free(proc_array);
  2180. proc_array = NULL;
  2181. nproc = 0;
  2182. max_proc = TBL_INC;
  2183. }
  2184. return(PBSE_NONE);
  2185. } /* END mom_close_poll() */
  2186. /*
  2187. * mom_does_checkpoint
  2188. *
  2189. * @returns CST values as described in resmon.h.
  2190. */
  2191. int mom_does_checkpoint(void)
  2192. {
  2193. return(CST_BLCR); /* Use the BLCR checkpointing system. */
  2194. }
  2195. /*
  2196. * Checkpoint the job.
  2197. *
  2198. * If abort is true, kill it too.
  2199. */
  2200. int mach_checkpoint(
  2201. task *ptask, /* I */
  2202. char *file, /* I */
  2203. int abort) /* I */
  2204. {
  2205. return(-1);
  2206. } /* END mach_checkpoint() */
  2207. /*
  2208. * Restart the job from the checkpoint file.
  2209. *
  2210. * Return -1 on error or sid if okay.
  2211. */
  2212. long mach_restart(
  2213. task *ptask,
  2214. char *file)
  2215. {
  2216. return(-1);
  2217. }
  2218. #define dsecs(val) ( (double)(val) )
  2219. char *cput_job(
  2220. pid_t jobid)
  2221. {
  2222. int found = 0;
  2223. int i;
  2224. double cputime, addtime;
  2225. proc_stat_t *ps;
  2226. cputime = 0.0;
  2227. if (LOGLEVEL >= 6)
  2228. {
  2229. sprintf(log_buffer, "proc_array loop start - jobid = %d",
  2230. jobid);
  2231. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  2232. }
  2233. for (i = 0;i < nproc;i++)
  2234. {
  2235. ps = &proc_array[i];
  2236. if (jobid != ps->session)
  2237. continue;
  2238. found = 1;
  2239. /* add utime and stime (AKE) */
  2240. addtime =
  2241. dsecs(ps->utime) +
  2242. dsecs(ps->stime) +
  2243. dsecs(ps->cutime) +
  2244. dsecs(ps->cstime);
  2245. cputime += addtime;
  2246. DBPRT(("%s: total %.2f pid %d %.2f\n",
  2247. __func__,
  2248. cputime,
  2249. ps->pid,
  2250. addtime))
  2251. } /* END for (i) */
  2252. if (!found)
  2253. {
  2254. rm_errno = RM_ERR_EXIST;
  2255. return(NULL);
  2256. }
  2257. sprintf(ret_string, "%.2f",
  2258. cputime * cputfactor);
  2259. return(ret_string);
  2260. } /* END cput_job() */
  2261. char *cput_proc(
  2262. pid_t pid)
  2263. {
  2264. double cputime;
  2265. proc_stat_t *ps;
  2266. cputime = 0.0;
  2267. if ((ps = get_proc_stat(pid)) == NULL)
  2268. {
  2269. if (errno != ENOENT)
  2270. {
  2271. sprintf(log_buffer, "%d: get_proc_stat",
  2272. pid);
  2273. log_err(errno, __func__, log_buffer);
  2274. }
  2275. rm_errno = RM_ERR_SYSTEM;
  2276. return(NULL);
  2277. }
  2278. cputime = dsecs(ps->utime) + dsecs(ps->stime);
  2279. sprintf(ret_string, "%.2f",
  2280. cputime * cputfactor);
  2281. return(ret_string);
  2282. } /* END cput_proc() */
  2283. const char *cput(
  2284. struct rm_attribute *attrib)
  2285. {
  2286. int value;
  2287. if (attrib == NULL)
  2288. {
  2289. log_err(-1, __func__, no_parm);
  2290. rm_errno = RM_ERR_NOPARAM;
  2291. return(NULL);
  2292. }
  2293. if ((value = atoi(attrib->a_value)) == 0)
  2294. {
  2295. sprintf(log_buffer, "bad param: %s",
  2296. attrib->a_value);
  2297. log_err(-1, __func__, log_buffer);
  2298. rm_errno = RM_ERR_BADPARAM;
  2299. return(NULL);
  2300. }
  2301. if (momgetattr(NULL))
  2302. {
  2303. log_err(-1, __func__, extra_parm);
  2304. rm_errno = RM_ERR_BADPARAM;
  2305. return(NULL);
  2306. }
  2307. if (strcmp(attrib->a_qualifier, "session") == 0)
  2308. {
  2309. return(cput_job((pid_t)value));
  2310. }
  2311. if (strcmp(attrib->a_qualifier, "proc") == 0)
  2312. {
  2313. return(cput_proc((pid_t)value));
  2314. }
  2315. rm_errno = RM_ERR_BADPARAM;
  2316. return(NULL);
  2317. } /* END cput() */
  2318. char *mem_job(
  2319. pid_t sid) /* I */
  2320. {
  2321. unsigned long long memsize;
  2322. int i;
  2323. proc_stat_t *ps;
  2324. /* max memsize ??? */
  2325. memsize = 0;
  2326. if (LOGLEVEL >= 6)
  2327. {
  2328. sprintf(log_buffer, "proc_array loop start - sid = %d",
  2329. sid);
  2330. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  2331. }
  2332. for (i = 0;i < nproc;i++)
  2333. {
  2334. ps = &proc_array[i];
  2335. if (sid != ps->session)
  2336. continue;
  2337. memsize += ps->vsize;
  2338. } /* END for (i) */
  2339. if (memsize == 0)
  2340. {
  2341. rm_errno = RM_ERR_EXIST;
  2342. return(NULL);
  2343. }
  2344. sprintf(ret_string, "%llukb",
  2345. memsize >> 10); /* KB */
  2346. return(ret_string);
  2347. } /* END mem_job() */
  2348. char *mem_proc(
  2349. pid_t pid)
  2350. {
  2351. proc_stat_t *ps;
  2352. if ((ps = get_proc_stat(pid)) == NULL)
  2353. {
  2354. if (errno != ENOENT)
  2355. {
  2356. sprintf(log_buffer, "%d: get_proc_stat",
  2357. pid);
  2358. log_err(errno, __func__, log_buffer);
  2359. }
  2360. rm_errno = RM_ERR_SYSTEM;
  2361. return(NULL);
  2362. }
  2363. sprintf(ret_string, "%llukb",
  2364. (unsigned long long)ps->vsize >> 10); /* KB */
  2365. return(ret_string);
  2366. } /* END mem_proc() */
  2367. const char *mem(
  2368. struct rm_attribute *attrib)
  2369. {
  2370. int value;
  2371. if (attrib == NULL)
  2372. {
  2373. log_err(-1, __func__, no_parm);
  2374. rm_errno = RM_ERR_NOPARAM;
  2375. return(NULL);
  2376. }
  2377. if ((value = atoi(attrib->a_value)) == 0)
  2378. {
  2379. sprintf(log_buffer, "bad param: %s",
  2380. attrib->a_value);
  2381. log_err(-1, __func__, log_buffer);
  2382. rm_errno = RM_ERR_BADPARAM;
  2383. return(NULL);
  2384. }
  2385. if (momgetattr(NULL))
  2386. {
  2387. log_err(-1, __func__, extra_parm);
  2388. rm_errno = RM_ERR_BADPARAM;
  2389. return(NULL);
  2390. }
  2391. if (strcmp(attrib->a_qualifier, "session") == 0)
  2392. {
  2393. return(mem_job((pid_t)value));
  2394. }
  2395. else if (strcmp(attrib->a_qualifier, "proc") == 0)
  2396. {
  2397. return(mem_proc((pid_t)value));
  2398. }
  2399. else
  2400. {
  2401. rm_errno = RM_ERR_BADPARAM;
  2402. return(NULL);
  2403. }
  2404. return(NULL);
  2405. } /* END mem() */
  2406. static char *resi_job(
  2407. pid_t jobid)
  2408. {
  2409. int i;
  2410. int found = 0;
  2411. unsigned long long resisize;
  2412. proc_stat_t *ps;
  2413. #ifdef USELIBMEMACCT
  2414. long long w_rss;
  2415. #endif
  2416. resisize = 0;
  2417. if (LOGLEVEL >= 6)
  2418. {
  2419. sprintf(log_buffer, "proc_array loop start - jobid = %d",
  2420. jobid);
  2421. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  2422. }
  2423. for (i = 0;i < nproc;i++)
  2424. {
  2425. ps = &proc_array[i];
  2426. if (jobid != ps->session)
  2427. continue;
  2428. found = 1;
  2429. #ifdef USELIBMEMACCT
  2430. /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */
  2431. w_rss = get_memacct_resi(ps->pid);
  2432. if (w_rss == -1)
  2433. resisize += ps->rss * pagesize;
  2434. else
  2435. resisize += w_rss;
  2436. #else
  2437. resisize += ps->rss;
  2438. #endif
  2439. } /* END for (i) */
  2440. if (found)
  2441. {
  2442. /* in KB */
  2443. #ifdef USELIBMEMACCT
  2444. sprintf(ret_string, "%llukb", resisize >> 10);
  2445. #else
  2446. sprintf(ret_string, "%llukb",
  2447. (resisize * (unsigned long long)pagesize) >> 10);
  2448. #endif
  2449. return(ret_string);
  2450. }
  2451. rm_errno = RM_ERR_EXIST;
  2452. return(NULL);
  2453. } /* END resi_job() */
  2454. static char *resi_proc(
  2455. pid_t pid)
  2456. {
  2457. proc_stat_t *ps;
  2458. #ifdef USELIBMEMACCT
  2459. long long w_rss;
  2460. #endif
  2461. if ((ps = get_proc_stat(pid)) == NULL)
  2462. {
  2463. if (errno != ENOENT)
  2464. {
  2465. sprintf(log_buffer, "%d: get_proc_stat(PIOCPSINFO)",
  2466. pid);
  2467. log_err(errno, __func__, log_buffer);
  2468. }
  2469. rm_errno = RM_ERR_SYSTEM;
  2470. return(NULL);
  2471. }
  2472. #ifdef USELIBMEMACCT
  2473. /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */
  2474. if ((w_rss = get_memacct_resi(ps->pid)) == -1)
  2475. sprintf(ret_string, "%llukb", (ps->rss * (unsigned long long)pagesize) >> 10);
  2476. else
  2477. sprintf(ret_string, "%ldkb", w_rss >> 10);
  2478. #else
  2479. /* in KB */
  2480. sprintf(ret_string, "%lukb",
  2481. ((ulong)ps->rss * (ulong)pagesize) >> 10);
  2482. #endif
  2483. return(ret_string);
  2484. } /* END resi_proc() */
  2485. static const char *resi(
  2486. struct rm_attribute *attrib)
  2487. {
  2488. int value;
  2489. if (attrib == NULL)
  2490. {
  2491. log_err(-1, __func__, no_parm);
  2492. rm_errno = RM_ERR_NOPARAM;
  2493. return(NULL);
  2494. }
  2495. if ((value = atoi(attrib->a_value)) == 0)
  2496. {
  2497. sprintf(log_buffer,
  2498. "bad param: %s",
  2499. attrib->a_value);
  2500. log_err(-1, __func__, log_buffer);
  2501. rm_errno = RM_ERR_BADPARAM;
  2502. return(NULL);
  2503. }
  2504. if (momgetattr(NULL))
  2505. {
  2506. log_err(-1, __func__, extra_parm);
  2507. rm_errno = RM_ERR_BADPARAM;
  2508. return(NULL);
  2509. }
  2510. if (strcmp(attrib->a_qualifier, "session") == 0)
  2511. {
  2512. return(resi_job((pid_t)value));
  2513. }
  2514. if (strcmp(attrib->a_qualifier, "proc") == 0)
  2515. {
  2516. return(resi_proc((pid_t)value));
  2517. }
  2518. rm_errno = RM_ERR_BADPARAM;
  2519. return(NULL);
  2520. } /* END resi() */
  2521. const char *sessions(
  2522. struct rm_attribute *attrib) /* I */
  2523. {
  2524. int nsids = 0;
  2525. pid_t sid;
  2526. char *s;
  2527. #ifdef NUMA_SUPPORT
  2528. char mom_check_name[PBS_MAXSERVERNAME];
  2529. job *pjob;
  2530. task *ptask;
  2531. #else
  2532. proc_stat_t *ps;
  2533. struct pidl *sids = NULL, *sl = NULL, *sp;
  2534. int i;
  2535. #endif
  2536. if (attrib != NULL)
  2537. {
  2538. log_err(-1, __func__, extra_parm);
  2539. rm_errno = RM_ERR_BADPARAM;
  2540. return(NULL);
  2541. }
  2542. ret_string[0] = '\0';
  2543. #ifdef NUMA_SUPPORT
  2544. /* Initialize the node name to check for for this NUMA node */
  2545. strcpy(mom_check_name, mom_host);
  2546. if ((s = strchr(mom_check_name, '.')) != NULL)
  2547. *s = '\0';
  2548. sprintf(mom_check_name + strlen(mom_check_name), "-%d/", numa_index);
  2549. /* Initialize the return string */
  2550. s = ret_string;
  2551. /* Walk through job list, look for jobs running on this NUMA node */
  2552. for (pjob = (job *)GET_NEXT(svr_alljobs);
  2553. pjob != NULL;
  2554. pjob = (job *)GET_NEXT(pjob->ji_alljobs))
  2555. {
  2556. if (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, mom_check_name) == NULL)
  2557. continue;
  2558. /* Show all tasks registered for this job */
  2559. for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
  2560. ptask != NULL;
  2561. ptask = (task *)GET_NEXT(ptask->ti_jobtask))
  2562. {
  2563. if (ptask->ti_qs.ti_status != TI_STATE_RUNNING)
  2564. continue;
  2565. sid = ptask->ti_qs.ti_sid;
  2566. if (LOGLEVEL >= 9)
  2567. {
  2568. sprintf(log_buffer, "%s[%d]: job %s on %s? sid %d",
  2569. __func__,
  2570. nsids,
  2571. pjob->ji_qs.ji_jobid,
  2572. mom_check_name,
  2573. sid);
  2574. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2575. }
  2576. checkret(&s, 100);
  2577. sprintf(s, "%s%d", (ret_string[0] != '\0') ? " " : "", sid);
  2578. s += strlen(s);
  2579. nsids++;
  2580. } /* END for(ptask) */
  2581. } /* END for(pjob) */
  2582. #else
  2583. /* Walk through proc_array, store unique session IDs in the pids list */
  2584. for (i = 0;i < nproc;i++)
  2585. {
  2586. ps = &proc_array[i];
  2587. if (ps->uid == 0)
  2588. continue;
  2589. if ((sid = ps->session) == 0)
  2590. continue;
  2591. if (LOGLEVEL >= 9)
  2592. {
  2593. sprintf(log_buffer, "%s[%d]: pid %d sid %d", __func__, nsids, ps->pid, sid);
  2594. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2595. }
  2596. sp = sids;
  2597. while (sp)
  2598. {
  2599. if (sp->pid == sid) /* found */
  2600. break;
  2601. sp = sp->next;
  2602. }
  2603. if (sp)
  2604. continue;
  2605. /* not found */
  2606. if ((sp = (struct pidl *)calloc(1, sizeof(struct pidl))) == NULL)
  2607. {
  2608. log_err(errno, __func__, "no memory");
  2609. rm_errno = RM_ERR_SYSTEM;
  2610. if (sids)
  2611. free_pidlist(sids);
  2612. return(NULL);
  2613. }
  2614. sp->pid = sid;
  2615. sp->next = NULL;
  2616. nsids++;
  2617. if (sl)
  2618. sl->next = sp;
  2619. else
  2620. sids = sp;
  2621. sl = sp;
  2622. } /* END for(i) */
  2623. /*
  2624. * Assemble return string.
  2625. * Return empty string if no sessions.
  2626. */
  2627. s = ret_string;
  2628. sp = sids;
  2629. while (sp)
  2630. {
  2631. checkret(&s, 100);
  2632. if (sp == sids)
  2633. sprintf(s, "%d", sp->pid);
  2634. else
  2635. sprintf(s, " %d", sp->pid);
  2636. s += strlen(s);
  2637. sp = sp->next;
  2638. } /* END while(sp) */
  2639. /* Done */
  2640. if (sids)
  2641. free_pidlist(sids);
  2642. #endif
  2643. if (LOGLEVEL >= 6)
  2644. {
  2645. sprintf(log_buffer, "nsessions=%d", nsids);
  2646. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2647. }
  2648. return(ret_string);
  2649. }
  2650. const char *nsessions(
  2651. struct rm_attribute *attrib)
  2652. {
  2653. const char *result;
  2654. const char *ch;
  2655. int num;
  2656. if ((result = sessions(attrib)) == NULL)
  2657. return(result);
  2658. if (result[0] == '\0')
  2659. {
  2660. num = 0;
  2661. }
  2662. else
  2663. {
  2664. num = 1;
  2665. for (ch = result;*ch;ch++)
  2666. if (*ch == ' ') /* count blanks */
  2667. num++;
  2668. } /* END for (ch) */
  2669. sprintf(ret_string, "%d",
  2670. num);
  2671. return(ret_string);
  2672. } /* END nsessions() */
  2673. const char *pids(
  2674. struct rm_attribute *attrib) /* I */
  2675. {
  2676. pid_t jobid;
  2677. proc_stat_t *ps;
  2678. char *fmt;
  2679. int i;
  2680. int num_pids = 0;
  2681. if (attrib == NULL)
  2682. {
  2683. log_err(-1, __func__, no_parm);
  2684. rm_errno = RM_ERR_NOPARAM;
  2685. return(NULL);
  2686. }
  2687. if ((jobid = (pid_t)atoi(attrib->a_value)) == 0)
  2688. {
  2689. sprintf(log_buffer, "bad param: %s",
  2690. attrib->a_value);
  2691. log_err(-1, __func__, log_buffer);
  2692. rm_errno = RM_ERR_BADPARAM;
  2693. return(NULL);
  2694. }
  2695. if (momgetattr(NULL))
  2696. {
  2697. log_err(-1, __func__, extra_parm);
  2698. rm_errno = RM_ERR_BADPARAM;
  2699. return(NULL);
  2700. }
  2701. if (strcmp(attrib->a_qualifier, "session") != 0)
  2702. {
  2703. rm_errno = RM_ERR_BADPARAM;
  2704. return(NULL);
  2705. }
  2706. /* Search for members of session */
  2707. fmt = ret_string;
  2708. for (i = 0;i < nproc;i++)
  2709. {
  2710. ps = &proc_array[i];
  2711. if (LOGLEVEL >= 6)
  2712. {
  2713. DBPRT(("%s[%d]: pid: %d sid: %d\n",
  2714. __func__,
  2715. num_pids,
  2716. ps->pid,
  2717. ps->session))
  2718. }
  2719. if (jobid != ps->session)
  2720. continue;
  2721. sprintf(fmt, "%d ",
  2722. ps->pid);
  2723. fmt += strlen(fmt);
  2724. num_pids++;
  2725. } /* END for (i) */
  2726. if (num_pids == 0)
  2727. {
  2728. rm_errno = RM_ERR_EXIST;
  2729. return(NULL);
  2730. }
  2731. return(ret_string);
  2732. } /* END pids() */
  2733. const char *nusers(
  2734. struct rm_attribute *attrib)
  2735. {
  2736. int j;
  2737. int nuids = 0;
  2738. uid_t *uids, *hold;
  2739. static int maxuid = 200;
  2740. register uid_t uid;
  2741. #ifdef NUMA_SUPPORT
  2742. char mom_check_name[PBS_MAXSERVERNAME], *s;
  2743. job *pjob;
  2744. #else
  2745. int i;
  2746. proc_stat_t *ps;
  2747. #endif
  2748. if (attrib != NULL)
  2749. {
  2750. log_err(-1, __func__, extra_parm);
  2751. rm_errno = RM_ERR_BADPARAM;
  2752. return(NULL);
  2753. }
  2754. if ((uids = (uid_t *)calloc(maxuid, sizeof(uid_t))) == NULL)
  2755. {
  2756. log_err(errno, __func__, "no memory");
  2757. rm_errno = RM_ERR_SYSTEM;
  2758. return(NULL);
  2759. }
  2760. #ifdef NUMA_SUPPORT
  2761. /* Initialize the node name to check for for this NUMA node */
  2762. strcpy(mom_check_name, mom_host);
  2763. if ((s = strchr(mom_check_name, '.')) != NULL)
  2764. *s = '\0';
  2765. sprintf(mom_check_name + strlen(mom_check_name), "-%d/", numa_index);
  2766. /* Walk through job list, look for jobs running on this NUMA node */
  2767. for (pjob = (job *)GET_NEXT(svr_alljobs);
  2768. pjob != NULL;
  2769. pjob = (job *)GET_NEXT(pjob->ji_alljobs))
  2770. {
  2771. if (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, mom_check_name) == NULL)
  2772. continue;
  2773. /* Store uid of job owner */
  2774. uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;
  2775. if (LOGLEVEL >= 9)
  2776. {
  2777. sprintf(log_buffer, "%s[%d]: job %s on %s? uid %d",
  2778. __func__,
  2779. nuids,
  2780. pjob->ji_qs.ji_jobid,
  2781. mom_check_name,
  2782. uid);
  2783. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2784. }
  2785. #else
  2786. for (i = 0;i < nproc;i++)
  2787. {
  2788. ps = &proc_array[i];
  2789. if ((uid = ps->uid) == 0)
  2790. continue;
  2791. if (LOGLEVEL >= 9)
  2792. {
  2793. sprintf(log_buffer, "%s[%d]: pid %d uid %d",
  2794. __func__,
  2795. nuids,
  2796. ps->pid,
  2797. uid);
  2798. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2799. }
  2800. #endif
  2801. for (j = 0;j < nuids;j++)
  2802. {
  2803. if (uids[j] == uid)
  2804. break;
  2805. }
  2806. if (j == nuids)
  2807. {
  2808. /* not found */
  2809. if (nuids == maxuid)
  2810. {
  2811. /* need more space */
  2812. maxuid += 100;
  2813. hold = (uid_t *)realloc(uids, maxuid);
  2814. if (hold == NULL)
  2815. {
  2816. log_err(errno, __func__, "realloc");
  2817. rm_errno = RM_ERR_SYSTEM;
  2818. free(uids);
  2819. return(NULL);
  2820. }
  2821. memset(hold+(maxuid-100), 0, 100*sizeof(uid_t));
  2822. if (LOGLEVEL >= 7)
  2823. {
  2824. sprintf(log_buffer, "%s[%d]: need more space: %d", __func__, nuids, maxuid);
  2825. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2826. }
  2827. hold[nuids++] = uid; /* add uid to list */
  2828. uids = hold;
  2829. }
  2830. else
  2831. {
  2832. uids[nuids++] = uid; /* add uid to list */
  2833. }
  2834. }
  2835. } /* END for (i) */
  2836. sprintf(ret_string, "%d",
  2837. nuids);
  2838. free(uids);
  2839. if (LOGLEVEL >= 6)
  2840. {
  2841. sprintf(log_buffer, "nusers=%d", nuids);
  2842. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2843. }
  2844. return(ret_string);
  2845. } /* END nusers() */
  2846. const char *totmem(
  2847. struct rm_attribute *attrib)
  2848. {
  2849. proc_mem_t *mm;
  2850. if (attrib)
  2851. {
  2852. log_err(-1, __func__, extra_parm);
  2853. rm_errno = RM_ERR_BADPARAM;
  2854. return(NULL);
  2855. }
  2856. if ((mm = get_proc_mem()) == NULL)
  2857. {
  2858. log_err(errno, __func__, "get_proc_mem");
  2859. rm_errno = RM_ERR_SYSTEM;
  2860. return(NULL);
  2861. }
  2862. if (LOGLEVEL >= 6)
  2863. {
  2864. sprintf(log_buffer, "%s: total mem=%llu",
  2865. __func__,
  2866. mm->mem_total + mm->swap_total);
  2867. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2868. }
  2869. sprintf(ret_string, "%lukb",
  2870. (ulong)((mm->mem_total >> 10) + (mm->swap_total >> 10))); /* KB */
  2871. return(ret_string);
  2872. } /* END totmem() */
  2873. const char *availmem(
  2874. struct rm_attribute *attrib)
  2875. {
  2876. proc_mem_t *mm;
  2877. if (attrib != NULL)
  2878. {
  2879. log_err(-1, __func__, extra_parm);
  2880. rm_errno = RM_ERR_BADPARAM;
  2881. return(NULL);
  2882. }
  2883. if ((mm = get_proc_mem()) == NULL)
  2884. {
  2885. log_err(errno, __func__, "get_proc_mem");
  2886. rm_errno = RM_ERR_SYSTEM;
  2887. return(NULL);
  2888. } /* END availmem() */
  2889. if (LOGLEVEL >= 6)
  2890. {
  2891. sprintf(log_buffer, "%s: free mem=%llu",
  2892. __func__,
  2893. mm->mem_free + mm->swap_free);
  2894. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  2895. }
  2896. sprintf(ret_string, "%lukb",
  2897. (ulong)((mm->mem_free >> 10) + (mm->swap_free >> 10))); /* KB */
  2898. return(ret_string);
  2899. } /* END availmem() */
  2900. const char *ncpus(
  2901. struct rm_attribute *attrib)
  2902. {
  2903. #ifdef NUMA_SUPPORT
  2904. /* report the configured ncpus for this numa node */
  2905. sprintf(ret_string,"%d",node_boards[numa_index].num_cpus);
  2906. #else
  2907. char label[128];
  2908. FILE *fp;
  2909. int procs;
  2910. if (attrib != NULL)
  2911. {
  2912. log_err(-1, __func__, extra_parm);
  2913. rm_errno = RM_ERR_BADPARAM;
  2914. return(NULL);
  2915. }
  2916. if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
  2917. {
  2918. return(NULL);
  2919. }
  2920. procs = 0;
  2921. while (!feof(fp))
  2922. {
  2923. if (fscanf(fp, "%s %*[^\n]%*c", label) == 0)
  2924. {
  2925. getc(fp); /* must do something to get to eof */
  2926. }
  2927. else if (strcmp("processor", label) == 0)
  2928. procs++;
  2929. }
  2930. sprintf(ret_string, "%d", procs);
  2931. system_ncpus = procs;
  2932. fclose(fp);
  2933. #endif /* NUMA_SUPPORT */
  2934. if (LOGLEVEL >= 6)
  2935. {
  2936. sprintf(log_buffer, "ncpus=%s", ret_string);
  2937. log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, "ncpus", log_buffer);
  2938. }
  2939. return(ret_string);
  2940. } /* END ncpus() */
  2941. /* find_file checks for the existence of filename
  2942. * in the ':' delimited path string
  2943. * Return TRUE if file exists
  2944. * FALSE if file not found
  2945. */
  2946. int find_file(
  2947. char *path,
  2948. char *filename)
  2949. {
  2950. char *ptr1, *ptr2;
  2951. char buf[RETURN_STRING_SIZE];
  2952. int rc;
  2953. struct stat statBuf;
  2954. if (path == NULL)
  2955. {
  2956. return(FALSE);
  2957. }
  2958. if (filename == NULL)
  2959. {
  2960. return(FALSE);
  2961. }
  2962. memset(buf, 0, RETURN_STRING_SIZE);
  2963. ptr1 = path;
  2964. ptr2 = buf;
  2965. do
  2966. {
  2967. *ptr2 = *ptr1;
  2968. ptr1++;
  2969. if (*ptr1 == ':' || *ptr1 == '\0')
  2970. {
  2971. /* check for the forward slash at the end of the path variable */
  2972. if (*ptr2 != '/')
  2973. {
  2974. ptr2++;
  2975. *ptr2 = '/';
  2976. }
  2977. strcat(buf, filename);
  2978. rc = stat(buf, &statBuf);
  2979. if (rc == 0)
  2980. {
  2981. return(TRUE);
  2982. }
  2983. /* Advance the pointer in the path */
  2984. ptr1++;
  2985. /* reset ptr2 to the beginning of buf and get the
  2986. next directory */
  2987. memset(buf, 0, RETURN_STRING_SIZE);
  2988. ptr2 = buf;
  2989. }
  2990. else
  2991. ptr2++; /* advance ptr2 to the next element in buf */
  2992. }while(*ptr1 != '\0');
  2993. return(FALSE);
  2994. }
  2995. static const char *physmem(
  2996. struct rm_attribute *attrib)
  2997. {
  2998. char tmpBuf[PMEMBUF_SIZE];
  2999. char *BPtr;
  3000. int BSpace;
  3001. unsigned long long mem;
  3002. unsigned long long mem_total;
  3003. FILE *fp;
  3004. #ifdef NUMA_SUPPORT
  3005. int i;
  3006. #endif
  3007. if (attrib != NULL)
  3008. {
  3009. log_err(-1, __func__, extra_parm);
  3010. rm_errno = RM_ERR_BADPARAM;
  3011. return(NULL);
  3012. }
  3013. mem_total = 0;
  3014. #ifdef NUMA_SUPPORT
  3015. for (i = 0; i < node_boards[numa_index].num_nodes; i++)
  3016. #endif /* NUMA_SUPPORT */
  3017. {
  3018. #ifdef NUMA_SUPPORT
  3019. if (!(fp = fopen(node_boards[numa_index].path_meminfo[i],"r")))
  3020. #else
  3021. if (!(fp = fopen(path_meminfo, "r")))
  3022. #endif
  3023. {
  3024. rm_errno = RM_ERR_SYSTEM;
  3025. return(NULL);
  3026. }
  3027. BPtr = tmpBuf;
  3028. BSpace = sizeof(tmpBuf);
  3029. BPtr[0] = '\0';
  3030. while (!feof(fp))
  3031. {
  3032. if (fgets(BPtr, BSpace, fp) == NULL)
  3033. {
  3034. break;
  3035. }
  3036. BSpace -= strlen(BPtr);
  3037. BPtr += strlen(BPtr);
  3038. }
  3039. fclose(fp);
  3040. /* FORMAT: '...\nMemTotal: XXX kB\n' */
  3041. if ((BPtr = strstr(tmpBuf, "MemTotal:")) != NULL)
  3042. {
  3043. BPtr += strlen("MemTotal:");
  3044. if (sscanf(BPtr, "%llu",
  3045. &mem) != 1)
  3046. {
  3047. rm_errno = RM_ERR_SYSTEM;
  3048. return(NULL);
  3049. }
  3050. /* value specified in kb */
  3051. }
  3052. else
  3053. {
  3054. /* attempt to load first numeric value */
  3055. if (sscanf(BPtr, "%*s %llu",
  3056. &mem) != 1)
  3057. {
  3058. rm_errno = RM_ERR_SYSTEM;
  3059. return(NULL);
  3060. }
  3061. /* value specified in bytes */
  3062. mem >>= 10;
  3063. }
  3064. mem_total += mem;
  3065. }
  3066. sprintf(ret_string, "%llukb",
  3067. mem_total);
  3068. return(ret_string);
  3069. } /* END physmem() */
  3070. char *size_fs(
  3071. char *param)
  3072. {
  3073. struct statfs fsbuf;
  3074. if (param[0] != '/')
  3075. {
  3076. sprintf(log_buffer, "%s: not full path filesystem name: %s",
  3077. __func__,
  3078. param);
  3079. log_err(-1, __func__, log_buffer);
  3080. rm_errno = RM_ERR_BADPARAM;
  3081. return(NULL);
  3082. }
  3083. if (statfs(param, &fsbuf) == -1)
  3084. {
  3085. log_err(errno, __func__, "statfs");
  3086. rm_errno = RM_ERR_BADPARAM;
  3087. return(NULL);
  3088. }
  3089. #ifdef RPT_BAVAIL
  3090. #define RPT_STATFS_MEMBER f_bavail
  3091. #else
  3092. #define RPT_STATFS_MEMBER f_bfree
  3093. #endif
  3094. sprintf(ret_string, "%lukb:%lukb",
  3095. (ulong)(((double)fsbuf.f_bsize * (double)fsbuf.RPT_STATFS_MEMBER) / 1024.0),
  3096. (ulong)(((double)fsbuf.f_bsize * (double)fsbuf.f_blocks) / 1024.0)); /* KB */
  3097. return(ret_string);
  3098. } /* END size_fs() */
  3099. char *size_file(
  3100. char *param)
  3101. {
  3102. struct stat sbuf;
  3103. if (param[0] != '/')
  3104. {
  3105. sprintf(log_buffer, "%s: not full path filesystem name: %s",
  3106. __func__, param);
  3107. log_err(-1, __func__, log_buffer);
  3108. rm_errno = RM_ERR_BADPARAM;
  3109. return(NULL);
  3110. }
  3111. if (stat(param, &sbuf) == -1)
  3112. {
  3113. log_err(errno, __func__, "stat");
  3114. rm_errno = RM_ERR_BADPARAM;
  3115. return(NULL);
  3116. }
  3117. sprintf(ret_string, "%lukb",
  3118. (unsigned long)sbuf.st_size >> 10); /* KB */
  3119. return(ret_string);
  3120. } /* END size_file() */
  3121. const char *size(
  3122. struct rm_attribute *attrib)
  3123. {
  3124. char *param;
  3125. if (attrib == NULL)
  3126. {
  3127. log_err(-1, __func__, no_parm);
  3128. rm_errno = RM_ERR_NOPARAM;
  3129. return(NULL);
  3130. }
  3131. if (momgetattr(NULL))
  3132. {
  3133. log_err(-1, __func__, extra_parm);
  3134. rm_errno = RM_ERR_BADPARAM;
  3135. return(NULL);
  3136. }
  3137. param = attrib->a_value;
  3138. if (strcmp(attrib->a_qualifier, "file") == 0)
  3139. {
  3140. return(size_file(param));
  3141. }
  3142. if (strcmp(attrib->a_qualifier, "fs") == 0)
  3143. {
  3144. return(size_fs(param));
  3145. }
  3146. rm_errno = RM_ERR_BADPARAM;
  3147. return(NULL);
  3148. } /* END size() */
  3149. /*
  3150. * For a recovering (-p) mom, look through existing tasks in existing
  3151. * jobs for things that have exited that are not owned by us through a
  3152. * parent-child relationship. Otherwise we cannot report back to tm
  3153. * clients when tasks have exited.
  3154. */
  3155. void scan_non_child_tasks(void)
  3156. {
  3157. job *pJob;
  3158. static int first_time = TRUE;
  3159. DIR *pdir; /* use local pdir to prevent race conditions associated w/global pdir (VPAC) */
  3160. pdir = opendir(procfs);
  3161. for (pJob = (job *)(GET_NEXT(svr_alljobs));
  3162. pJob != (job *)NULL;pJob = (job *)(GET_NEXT(pJob->ji_alljobs)))
  3163. {
  3164. task *pTask;
  3165. long job_start_time = 0;
  3166. long job_session_id = 0;
  3167. long session_start_time = 0;
  3168. proc_stat_t *ps = NULL;
  3169. if(pJob->ji_wattr[JOB_ATR_system_start_time].at_flags&ATR_VFLAG_SET)
  3170. {
  3171. job_start_time = pJob->ji_wattr[JOB_ATR_system_start_time].at_val.at_long;
  3172. }
  3173. if(pJob->ji_wattr[JOB_ATR_session_id].at_flags&ATR_VFLAG_SET)
  3174. {
  3175. job_session_id = pJob->ji_wattr[JOB_ATR_session_id].at_val.at_long;
  3176. }
  3177. if((ps = get_proc_stat(job_session_id)) != NULL)
  3178. {
  3179. session_start_time = (long)ps->start_time;
  3180. }
  3181. for (pTask = (task *)(GET_NEXT(pJob->ji_tasks));
  3182. pTask != NULL;
  3183. pTask = (task *)(GET_NEXT(pTask->ti_jobtask)))
  3184. {
  3185. #ifdef PENABLE_LINUX26_CPUSETS
  3186. struct pidl *pids = NULL;
  3187. struct pidl *pp;
  3188. #else
  3189. struct dirent *dent;
  3190. #endif
  3191. pid_t pid;
  3192. int found;
  3193. /*
  3194. * Check for tasks that were exiting when mom went down, set back to
  3195. * running so we can reprocess them and send the obit
  3196. */
  3197. if ((first_time) && (pTask->ti_qs.ti_sid != 0) &&
  3198. ((pTask->ti_qs.ti_status == TI_STATE_EXITED) ||
  3199. (pTask->ti_qs.ti_status == TI_STATE_DEAD)))
  3200. {
  3201. if (LOGLEVEL >= 7)
  3202. {
  3203. sprintf(log_buffer, "marking task %d as TI_STATE_RUNNING was %d",
  3204. pTask->ti_qs.ti_task,
  3205. pTask->ti_qs.ti_status);
  3206. log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pJob->ji_qs.ji_jobid, log_buffer);
  3207. }
  3208. pTask->ti_qs.ti_status = TI_STATE_RUNNING;
  3209. }
  3210. /* only check on tasks that we think should still be around */
  3211. if (pTask->ti_qs.ti_status != TI_STATE_RUNNING)
  3212. continue;
  3213. /* look for processes with this session id */
  3214. found = 0;
  3215. /* NOTE: on linux systems, the session master should have pid == sessionid */
  3216. if (kill(pTask->ti_qs.ti_sid, 0) != -1)
  3217. {
  3218. if((job_start_time != 0)&&
  3219. (session_start_time != 0))
  3220. {
  3221. if(job_start_time == session_start_time)
  3222. {
  3223. found = 1;
  3224. }
  3225. }
  3226. else
  3227. {
  3228. found = 1;
  3229. }
  3230. }
  3231. if(!found)
  3232. {
  3233. /* session master cannot be found, look for other pid in session */
  3234. #ifdef PENABLE_LINUX26_CPUSETS
  3235. pids = get_cpuset_pidlist(pJob->ji_qs.ji_jobid, pids);
  3236. pp = pids;
  3237. while (pp != NULL)
  3238. {
  3239. pid = pp->pid;
  3240. pp = pp->next;
  3241. #else
  3242. if (pdir == NULL)
  3243. {
  3244. if ((pdir = opendir(procfs)) == NULL)
  3245. return;
  3246. }
  3247. rewinddir(pdir);
  3248. while ((dent = readdir(pdir)) != NULL)
  3249. {
  3250. if (!isdigit(dent->d_name[0]))
  3251. continue;
  3252. pid = atoi(dent->d_name);
  3253. #endif /* PENABLE_LINUX26_CPUSETS */
  3254. if ((ps = get_proc_stat(pid)) == NULL)
  3255. continue;
  3256. if (ps->session == pTask->ti_qs.ti_sid)
  3257. {
  3258. if(pJob->ji_wattr[JOB_ATR_system_start_time].at_flags&ATR_VFLAG_SET)
  3259. {
  3260. proc_stat_t *ts = get_proc_stat(ps->session);
  3261. if(ts == NULL)
  3262. continue;
  3263. if(ts->start_time == (unsigned long)pJob->ji_wattr[JOB_ATR_system_start_time].at_val.at_long)
  3264. {
  3265. found = 1;
  3266. break;
  3267. }
  3268. }
  3269. else
  3270. {
  3271. found = 1;
  3272. break;
  3273. }
  3274. }
  3275. } /* END while ((dent) != NULL) */
  3276. #ifdef PENABLE_LINUX26_CPUSETS
  3277. free_pidlist(pids);
  3278. #endif
  3279. }
  3280. if (!found)
  3281. {
  3282. char buf[MAXLINE];
  3283. extern int exiting_tasks;
  3284. sprintf(buf, "found exited session %d for task %d in job %s",
  3285. pTask->ti_qs.ti_sid,
  3286. pTask->ti_qs.ti_task,
  3287. pJob->ji_qs.ji_jobid);
  3288. log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, buf);
  3289. pTask->ti_qs.ti_exitstat = 0; /* actually unknown */
  3290. pTask->ti_qs.ti_status = TI_STATE_EXITED;
  3291. task_save(pTask);
  3292. #ifdef USESAVEDRESOURCES
  3293. if (first_time)
  3294. {
  3295. pJob->ji_flags |= MOM_JOB_RECOVERY;
  3296. if (LOGLEVEL >= 7)
  3297. {
  3298. sprintf(buf, "marking job as MOM_JOB_RECOVERY for task %d",
  3299. pTask->ti_qs.ti_task);
  3300. log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pJob->ji_qs.ji_jobid, buf);
  3301. }
  3302. }
  3303. #endif /* USESAVEDRESOURCES */
  3304. exiting_tasks = 1;
  3305. }
  3306. }
  3307. } /* END for (job = GET_NEXT(svr_alljobs)) */
  3308. if (pdir != NULL)
  3309. closedir(pdir);
  3310. first_time = FALSE;
  3311. return;
  3312. } /* END scan_non_child_tasks() */
  3313. time_t maxtm;
  3314. void setmax(
  3315. const char *dev)
  3316. {
  3317. struct stat sb;
  3318. if (stat(dev, &sb) == -1)
  3319. {
  3320. return;
  3321. }
  3322. if (maxtm < sb.st_atime)
  3323. maxtm = sb.st_atime;
  3324. return;
  3325. } /* END setmax() */
  3326. const char *idletime(
  3327. struct rm_attribute *attrib)
  3328. {
  3329. DIR *dp;
  3330. struct dirent *de;
  3331. char ttyname[50];
  3332. time_t curtm;
  3333. if (attrib)
  3334. {
  3335. log_err(-1, __func__, extra_parm);
  3336. rm_errno = RM_ERR_BADPARAM;
  3337. return(NULL);
  3338. }
  3339. if ((dp = opendir("/dev")) == NULL)
  3340. {
  3341. log_err(errno, __func__, "opendir /dev");
  3342. rm_errno = RM_ERR_SYSTEM;
  3343. return(NULL);
  3344. }
  3345. maxtm = 0;
  3346. curtm = time(NULL);
  3347. setmax("/dev/mouse");
  3348. while ((de = readdir(dp)) != NULL)
  3349. {
  3350. if (maxtm >= curtm)
  3351. break;
  3352. if (strncmp(de->d_name, "tty", 3))
  3353. continue;
  3354. sprintf(ttyname, "/dev/%s",
  3355. de->d_name);
  3356. setmax(ttyname);
  3357. }
  3358. closedir(dp);
  3359. sprintf(ret_string, "%ld",
  3360. (long)MAX(0, curtm - maxtm));
  3361. return(ret_string);
  3362. } /* END idletime() */
  3363. static const char *walltime(
  3364. struct rm_attribute *attrib)
  3365. {
  3366. int value, job, found = 0;
  3367. time_t now, start;
  3368. proc_stat_t *ps;
  3369. int i;
  3370. if (attrib == NULL)
  3371. {
  3372. log_err(-1, __func__, no_parm);
  3373. rm_errno = RM_ERR_NOPARAM;
  3374. return(NULL);
  3375. }
  3376. if ((value = atoi(attrib->a_value)) == 0)
  3377. {
  3378. sprintf(log_buffer, "bad param: %s",
  3379. attrib->a_value);
  3380. log_err(-1, __func__, log_buffer);
  3381. rm_errno = RM_ERR_BADPARAM;
  3382. return(NULL);
  3383. }
  3384. if (momgetattr(NULL))
  3385. {
  3386. log_err(-1, __func__, extra_parm);
  3387. rm_errno = RM_ERR_BADPARAM;
  3388. return(NULL);
  3389. }
  3390. if (strcmp(attrib->a_qualifier, "proc") == 0)
  3391. {
  3392. job = 0;
  3393. }
  3394. else if (strcmp(attrib->a_qualifier, "session") == 0)
  3395. {
  3396. job = 1;
  3397. }
  3398. else
  3399. {
  3400. rm_errno = RM_ERR_BADPARAM;
  3401. return(NULL);
  3402. }
  3403. if ((now = time(NULL)) <= 0)
  3404. {
  3405. log_err(errno, __func__, "time");
  3406. rm_errno = RM_ERR_SYSTEM;
  3407. return(NULL);
  3408. }
  3409. start = now;
  3410. for (i = 0;i < nproc;i++)
  3411. {
  3412. ps = &proc_array[i];
  3413. if (job != 0)
  3414. {
  3415. if (value != ps->session)
  3416. continue;
  3417. }
  3418. else
  3419. {
  3420. if (value != ps->pid)
  3421. continue;
  3422. }
  3423. found = 1;
  3424. start = MIN((unsigned)start, ps->start_time);
  3425. } /* END for (i) */
  3426. if (found)
  3427. {
  3428. sprintf(ret_string, "%ld",
  3429. (long)((double)(now - start) * wallfactor));
  3430. return(ret_string);
  3431. }
  3432. rm_errno = RM_ERR_EXIST;
  3433. return(NULL);
  3434. } /* END walltime() */
  3435. /* Get the load average for this node */
  3436. int get_la(
  3437. double *rv) /* O */
  3438. {
  3439. FILE *fp;
  3440. float load;
  3441. if ((fp = fopen("/proc/loadavg", "r")) == NULL)
  3442. {
  3443. rm_errno = RM_ERR_SYSTEM;
  3444. return(rm_errno);
  3445. }
  3446. if (fscanf(fp, "%f",
  3447. &load) != 1)
  3448. {
  3449. log_err(errno, __func__, "fscanf of load in /proc/loadavg");
  3450. fclose(fp);
  3451. rm_errno = RM_ERR_SYSTEM;
  3452. return(rm_errno);
  3453. }
  3454. *rv = (double)load;
  3455. fclose(fp);
  3456. return(0);
  3457. } /* END get_la() */
  3458. #ifdef NUMA_SUPPORT
  3459. /*
  3460. * Calculate cpu activities for numa nodeboards.
  3461. *
  3462. * This is a very preliminary attempt to provide useful load data for NUMA nodeboards.
  3463. * Instead of a load average, we report the cpu activities of all cpus of a NUMA board.
  3464. * Calculated numbers range from 0.0 (no CPU activity) to the number of
  3465. * CPUs of a NUMA board (all CPUs are busy to 100%).
  3466. *
  3467. * Note that this is NOT the load average. However, it almost looks the same.
  3468. *
  3469. * The activity of a cpu is calculated from the content of /proc/stat like done
  3470. * by top and related tools.
  3471. */
  3472. void collect_cpuact(void)
  3473. {
  3474. FILE *fp;
  3475. char label[128];
  3476. long procs;
  3477. int cpu_id;
  3478. int i;
  3479. unsigned long long usr, nice, sys, idle, wait;
  3480. unsigned long long totidle, totbusy, prevtot;
  3481. unsigned long long dtot, dbusy;
  3482. /*
  3483. * Allocate cpu_array, if not already done.
  3484. * Need to figure out number of cpus in the system, first.
  3485. */
  3486. if (cpu_array == NULL)
  3487. {
  3488. if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
  3489. /* Failure */
  3490. return;
  3491. procs = 0;
  3492. while (! feof(fp))
  3493. {
  3494. if (fscanf(fp, "%s %*[^\n]%*c", label) == 0)
  3495. getc(fp);
  3496. else if (strcmp("processor", label) == 0)
  3497. procs++;
  3498. }
  3499. fclose(fp);
  3500. system_ncpus = procs;
  3501. sprintf(log_buffer, "system contains %ld CPUs", system_ncpus);
  3502. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  3503. if (system_ncpus)
  3504. {
  3505. if ((cpu_array = (proc_cpu_t *)calloc(system_ncpus, sizeof(proc_cpu_t))) == NULL)
  3506. {
  3507. log_err(errno, __func__, "failed to allocate memory");
  3508. return;
  3509. }
  3510. }
  3511. }
  3512. /* Zero out cpu_array */
  3513. memset(cpu_array, 0, system_ncpus * sizeof(proc_cpu_t));
  3514. /* Parse CPU counters from /proc/stat */
  3515. if ((fp = fopen("/proc/stat", "r")) != NULL)
  3516. {
  3517. while (! feof(fp))
  3518. {
  3519. if (fscanf(fp, "%s", label) != 1)
  3520. /* Format error */
  3521. break;
  3522. if (sscanf(label, "cpu%d", &cpu_id) != 1)
  3523. /* Line does not report cpu activities */
  3524. continue;
  3525. if (cpu_id >= system_ncpus)
  3526. /* Ups, more cpus than found in /proc/cpuinfo */
  3527. break;
  3528. if (fscanf(fp, " %llu %llu %llu %llu %llu", &usr, &nice, &sys, &idle, &wait) != 5)
  3529. /* Format error */
  3530. break;
  3531. cpu_array[cpu_id].idle_total = idle;
  3532. cpu_array[cpu_id].busy_total = usr + nice + sys + wait;
  3533. }
  3534. fclose(fp);
  3535. } /* END if (fp) */
  3536. /* Calculate cpu activity for each nodeboard */
  3537. for (i = 0; i < num_node_boards; i++)
  3538. {
  3539. /* Sum up cpu counters of relevant CPUs */
  3540. totidle = totbusy = 0;
  3541. hwloc_bitmap_foreach_begin(cpu_id, node_boards[i].cpuset)
  3542. {
  3543. totidle += cpu_array[cpu_id].idle_total;
  3544. totbusy += cpu_array[cpu_id].busy_total;
  3545. }
  3546. hwloc_bitmap_foreach_end();
  3547. /* If there are counters from a previous call, evaluate */
  3548. if ((prevtot = node_boards[i].pstat_idle + node_boards[i].pstat_busy) != 0)
  3549. {
  3550. dbusy = totbusy - node_boards[i].pstat_busy; /* diff busy counter sum */
  3551. dtot = totbusy + totidle - prevtot; /* diff total counter sum */
  3552. node_boards[i].cpuact = (float)(node_boards[i].num_cpus * dbusy / (double)dtot);
  3553. }
  3554. else
  3555. {
  3556. node_boards[i].cpuact = 0;
  3557. }
  3558. /* Remember counter sums */
  3559. node_boards[i].pstat_idle = totidle;
  3560. node_boards[i].pstat_busy = totbusy;
  3561. } /* END for(i) */
  3562. return;
  3563. } /* END collect_cpuact() */
  3564. const char *cpuact(
  3565. struct rm_attribute *attrib)
  3566. {
  3567. if (attrib != NULL)
  3568. {
  3569. log_err(-1, __func__, extra_parm);
  3570. rm_errno = RM_ERR_BADPARAM;
  3571. return(NULL);
  3572. }
  3573. sprintf(ret_string, "%.2f", node_boards[numa_index].cpuact);
  3574. if (LOGLEVEL >= 6)
  3575. {
  3576. sprintf(log_buffer, "cpuact=%s", ret_string);
  3577. log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, __func__, log_buffer);
  3578. }
  3579. return(ret_string);
  3580. } /* END cpuact() */
  3581. #endif
  3582. u_long gracetime(
  3583. u_long secs)
  3584. {
  3585. time_t now = time((time_t *)NULL);
  3586. if (secs > (u_long)now) /* time is in the future */
  3587. return(secs - now);
  3588. return(0);
  3589. }
  3590. static const char *quota(
  3591. struct rm_attribute *attrib)
  3592. {
  3593. int type;
  3594. dev_t dirdev;
  3595. uid_t uid;
  3596. struct stat sb;
  3597. struct mntent *me;
  3598. struct dqblk qi;
  3599. FILE *m;
  3600. struct passwd *pw;
  3601. static const char *type_array[] =
  3602. {
  3603. "harddata",
  3604. "softdata",
  3605. "currdata",
  3606. "hardfile",
  3607. "softfile",
  3608. "currfile",
  3609. "timedata",
  3610. "timefile",
  3611. NULL
  3612. };
  3613. enum type_name
  3614. {
  3615. harddata,
  3616. softdata,
  3617. currdata,
  3618. hardfile,
  3619. softfile,
  3620. currfile,
  3621. timedata,
  3622. timefile,
  3623. type_end
  3624. };
  3625. if (attrib == NULL)
  3626. {
  3627. log_err(-1, __func__, no_parm);
  3628. rm_errno = RM_ERR_NOPARAM;
  3629. return(NULL);
  3630. }
  3631. if (strcmp(attrib->a_qualifier, "type"))
  3632. {
  3633. sprintf(log_buffer, "unknown qualifier %s",
  3634. attrib->a_qualifier);
  3635. log_err(-1, __func__, log_buffer);
  3636. rm_errno = RM_ERR_BADPARAM;
  3637. return(NULL);
  3638. }
  3639. for (type = 0;type < type_end;type++)
  3640. {
  3641. if (strcmp(attrib->a_value, type_array[type]) == 0)
  3642. break;
  3643. }
  3644. if (type == type_end)
  3645. {
  3646. /* check to see if command is legal */
  3647. sprintf(log_buffer, "bad param: %s=%s",
  3648. attrib->a_qualifier,
  3649. attrib->a_value);
  3650. log_err(-1, __func__, log_buffer);
  3651. rm_errno = RM_ERR_BADPARAM;
  3652. return(NULL);
  3653. }
  3654. if ((attrib = momgetattr(NULL)) == NULL)
  3655. {
  3656. log_err(-1, __func__, no_parm);
  3657. rm_errno = RM_ERR_NOPARAM;
  3658. return(NULL);
  3659. }
  3660. if (strcmp(attrib->a_qualifier, "dir") != 0)
  3661. {
  3662. sprintf(log_buffer, "bad param: %s=%s",
  3663. attrib->a_qualifier,
  3664. attrib->a_value);
  3665. log_err(-1, __func__, log_buffer);
  3666. rm_errno = RM_ERR_BADPARAM;
  3667. return(NULL);
  3668. }
  3669. if (attrib->a_value[0] != '/') /* must be absolute path */
  3670. {
  3671. sprintf(log_buffer,
  3672. "not an absolute path: %s", attrib->a_value);
  3673. log_err(-1, __func__, log_buffer);
  3674. rm_errno = RM_ERR_BADPARAM;
  3675. return NULL;
  3676. }
  3677. if (stat(attrib->a_value, &sb) == -1)
  3678. {
  3679. sprintf(log_buffer, "stat: %s", attrib->a_value);
  3680. log_err(errno, __func__, log_buffer);
  3681. rm_errno = RM_ERR_EXIST;
  3682. return NULL;
  3683. }
  3684. dirdev = (dev_t)sb.st_dev;
  3685. DBPRT(("dir has devnum %d\n", (int)dirdev))
  3686. if ((m = setmntent(MOUNTED, "r")) == NULL)
  3687. {
  3688. log_err(errno, __func__, "setmntent");
  3689. rm_errno = RM_ERR_SYSTEM;
  3690. return NULL;
  3691. }
  3692. while ((me = getmntent(m)) != NULL)
  3693. {
  3694. if (strcmp(me->mnt_type, MNTTYPE_IGNORE) == 0)
  3695. continue;
  3696. if (stat(me->mnt_dir, &sb) == -1)
  3697. {
  3698. sprintf(log_buffer, "stat: %s", me->mnt_dir);
  3699. log_err(errno, __func__, log_buffer);
  3700. continue;
  3701. }
  3702. if (LOGLEVEL >= 6)
  3703. DBPRT(("%s\t%s\t%d\n", me->mnt_fsname, me->mnt_dir, (int)dirdev))
  3704. if (!memcmp(&sb.st_dev, &dirdev, sizeof(dev_t)))
  3705. break;
  3706. }
  3707. endmntent(m);
  3708. if (me == NULL)
  3709. {
  3710. sprintf(log_buffer,
  3711. "filesystem %s not found", attrib->a_value);
  3712. log_err(-1, __func__, log_buffer);
  3713. rm_errno = RM_ERR_EXIST;
  3714. return NULL;
  3715. }
  3716. #if defined(MNTOPT_NOQUOTA)
  3717. if (hasmntopt(me, MNTOPT_NOQUOTA) != NULL)
  3718. {
  3719. sprintf(log_buffer,
  3720. "no quotas on filesystem %s", me->mnt_dir);
  3721. log_err(-1, __func__, log_buffer);
  3722. rm_errno = RM_ERR_EXIST;
  3723. return NULL;
  3724. }
  3725. #endif /* MNTOPT_NOQUOTA */
  3726. if ((attrib = momgetattr(NULL)) == NULL)
  3727. {
  3728. log_err(-1, __func__, no_parm);
  3729. rm_errno = RM_ERR_NOPARAM;
  3730. return NULL;
  3731. }
  3732. if (strcmp(attrib->a_qualifier, "user") != 0)
  3733. {
  3734. sprintf(log_buffer, "bad param: %s=%s",
  3735. attrib->a_qualifier, attrib->a_value);
  3736. log_err(-1, __func__, log_buffer);
  3737. rm_errno = RM_ERR_BADPARAM;
  3738. return NULL;
  3739. }
  3740. if ((uid = (uid_t)atoi(attrib->a_value)) == 0)
  3741. {
  3742. if ((pw = getpwnam_ext(attrib->a_value)) == NULL)
  3743. {
  3744. sprintf(log_buffer,
  3745. "user not found: %s", attrib->a_value);
  3746. log_err(-1, __func__, log_buffer);
  3747. rm_errno = RM_ERR_EXIST;
  3748. return NULL;
  3749. }
  3750. uid = pw->pw_uid;
  3751. }
  3752. if (syscall(
  3753. SYS_quotactl,
  3754. QCMD(Q_GETQUOTA, USRQUOTA),
  3755. me->mnt_fsname,
  3756. uid,
  3757. (caddr_t)&qi) == -1)
  3758. {
  3759. log_err(errno, __func__, "quotactl");
  3760. rm_errno = RM_ERR_SYSTEM;
  3761. return(NULL);
  3762. }
  3763. /* sizes in KB */
  3764. switch (type)
  3765. {
  3766. case harddata:
  3767. sprintf(ret_string, "%lukb",
  3768. (u_long)qi.dqb_bhardlimit >> 10);
  3769. break;
  3770. case softdata:
  3771. sprintf(ret_string, "%lukb",
  3772. (u_long)qi.dqb_bsoftlimit >> 10);
  3773. break;
  3774. case currdata:
  3775. #if defined(TENABLEQUOTA)
  3776. #if _LINUX_QUOTA_VERSION < 2
  3777. sprintf(ret_string, "%lukb",
  3778. (u_long)qi.dqb_curblocks >> 10);
  3779. #else /* _LINUX_QUOTA_VERSION < 2 */
  3780. sprintf(ret_string, "%lukb",
  3781. (u_long)qi.dqb_curspace >> 10);
  3782. #endif /* _LINUX_QUOTA_VERSION < 2 */
  3783. #endif /* TENABLEQUOTA */
  3784. break;
  3785. case hardfile:
  3786. sprintf(ret_string, "%lu",
  3787. (u_long)qi.dqb_ihardlimit);
  3788. break;
  3789. case softfile:
  3790. sprintf(ret_string, "%lu",
  3791. (u_long)qi.dqb_isoftlimit);
  3792. break;
  3793. case currfile:
  3794. sprintf(ret_string, "%lu",
  3795. (u_long)qi.dqb_curinodes);
  3796. break;
  3797. case timedata:
  3798. sprintf(ret_string, "%lu",
  3799. gracetime((u_long)qi.dqb_btime));
  3800. break;
  3801. case timefile:
  3802. sprintf(ret_string, "%lu",
  3803. gracetime((u_long)qi.dqb_itime));
  3804. break;
  3805. } /* END switch() */
  3806. return(ret_string);
  3807. } /* END quota() */
  3808. /* tested for linux 2.4 kernel (not tested on 2.6) */
  3809. #define MAX_INTERFACES 10 /*the maximum number of interfaces*/
  3810. #define HEADER_STR "%*[^\n]\n%*[^\n]\n"
  3811. #define INTERFACE_STR "%[^:]:%lu %*d %*d %*d %*d %*d %*d %*d %lu %*d %*d %*d %*d %*d %*d %*d\n"
  3812. static const char *netload(
  3813. struct rm_attribute *attrib)
  3814. {
  3815. #ifdef NUMA_SUPPORT
  3816. /* there's no way to determine these numbers for a numa node */
  3817. return(NULL);
  3818. #else
  3819. FILE *fp;
  3820. int rc; /*read count*/
  3821. char interfaceName[MAX_INTERFACES][32];
  3822. unsigned long int bytesRX[MAX_INTERFACES + 1];
  3823. unsigned long int bytesTX[MAX_INTERFACES + 1];
  3824. int interface = 0;
  3825. /* int ethNum = 0; */
  3826. if ((fp = fopen("/proc/net/dev", "r")) == NULL)
  3827. {
  3828. rm_errno = RM_ERR_SYSTEM;
  3829. return(NULL);
  3830. }
  3831. rc = fscanf(fp, HEADER_STR); /*strip off header lines*/
  3832. if (rc < 0)
  3833. {
  3834. log_err(errno, __func__, "fscanf of header lines in /proc/net/dev");
  3835. fclose(fp);
  3836. rm_errno = RM_ERR_SYSTEM;
  3837. return(NULL);
  3838. }
  3839. /* read in interface stats until we can't */
  3840. /* sum all interface stats, excluding 'lo'*/
  3841. memset(bytesRX, 0, sizeof(bytesRX));
  3842. memset(bytesTX, 0, sizeof(bytesTX));
  3843. for (interface = 0;interface < MAX_INTERFACES;interface++)
  3844. {
  3845. rc = fscanf(fp, INTERFACE_STR,
  3846. interfaceName[interface],
  3847. &bytesRX[interface],
  3848. &bytesTX[interface]);
  3849. if (rc != 3)
  3850. {
  3851. interface++; /*adjust counter for future decrement*/
  3852. break;
  3853. }
  3854. if (strcmp(interfaceName[interface], "lo") != 0) /* don't count 'lo' interfaces' stats */
  3855. {
  3856. /* For singling out ethernet interfaces */
  3857. /*
  3858. if (strncmp(interfaceName[interface],"eth",3) == 0)
  3859. {
  3860. rc = sscanf(interfaceName[interface],"eth%d",
  3861. &ethNum);
  3862. }
  3863. */
  3864. bytesRX[MAX_INTERFACES] += bytesRX[interface];
  3865. bytesTX[MAX_INTERFACES] += bytesTX[interface];
  3866. }
  3867. } /* END for (interface) */
  3868. /* remove lo from interface count */
  3869. --interface;
  3870. fclose(fp);
  3871. sprintf(ret_string, "%lu",
  3872. bytesRX[MAX_INTERFACES] + bytesTX[MAX_INTERFACES]);
  3873. return(ret_string);
  3874. #endif /* NUMA_SUPPORT */
  3875. } /* END netload() */
  3876. mbool_t ProcIsChild(
  3877. char *Dir, /* I */
  3878. pid_t PID, /* I */
  3879. char *JobID) /* I */
  3880. {
  3881. return(FALSE);
  3882. } /* END ProcIsChild() */
  3883. /* END mom_mach.c */