PageRenderTime 51ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/src/resmom/linux/mom_mach.c

https://github.com/itkovian/torque
C | 5345 lines | 3417 code | 1458 blank | 470 comment | 820 complexity | 8cf49e16f2eb3c479a19208755979b78 MD5 | raw file
Possible License(s): LGPL-2.1

Large files are truncated, but you can click here to view the full file

  1. #include "license_pbs.h" /* See here for the software license */
  2. #include <pbs_config.h> /* the master config generated by configure */
  3. #include "lib_mom.h" /* header */
  4. #include <assert.h>
  5. #include <limits.h>
  6. #include <stdio.h>
  7. #include <stdlib.h>
  8. #include <unistd.h>
  9. #include <dirent.h>
  10. #include <errno.h>
  11. #include <strings.h>
  12. #include <mntent.h>
  13. #include <asm/types.h>
  14. #include <time.h>
  15. #include <sys/quota.h>
  16. #include <sys/time.h>
  17. #include <sys/procfs.h>
  18. #include <sys/param.h>
  19. #include <sys/stat.h>
  20. #include <sys/vfs.h>
  21. #include <sys/sysmacros.h>
  22. #include <sys/resource.h>
  23. #include <signal.h>
  24. #include <syscall.h>
  25. #include <ctype.h>
  26. #include <string.h>
  27. #include <csv.h>
  28. #include <fcntl.h>
  29. /* needed for oom_adj */
  30. #include <linux/limits.h>
  31. #ifdef Q_6_5_QUOTAON
  32. /* remap dqblk for SUSE 9.0 */
  33. #define dqblk if_dqblk
  34. #endif /* Q_6_5_QUOTAON */
  35. /*
  36. #ifndef dqblk
  37. #include <linux/quotaio_v1.h>
  38. #define dqblk v1_disk_dqblk
  39. #endif
  40. */
  41. #include "pbs_error.h"
  42. #include "portability.h"
  43. #include "list_link.h"
  44. #include "server_limits.h"
  45. #include "attribute.h"
  46. #include "resource.h"
  47. #include "pbs_job.h"
  48. #include "log.h"
  49. #include "mom_mach.h"
  50. #include "mom_func.h"
  51. #include "resmon.h"
  52. #include "utils.h"
  53. #include "../rm_dep.h"
  54. #include "pbs_nodes.h"
  55. #ifdef PENABLE_LINUX26_CPUSETS
  56. #include "pbs_cpuset.h"
  57. #endif
  58. #include "mom_config.h"
  59. /*
  60. ** System dependent code to gather information for the resource
  61. ** monitor for a Linux i386 machine.
  62. **
  63. ** Resources known by this code:
  64. ** cput cpu time for a pid or session
  65. ** mem memory size for a pid or session in KB
  66. ** resi resident memory size for a pid or session in KB
  67. ** sessions list of sessions in the system
  68. ** pids list of pids in a session
  69. ** nsessions number of sessions in the system
  70. ** nusers number of users in the system
  71. ** totmem total memory size in KB
  72. ** availmem available memory size in KB
  73. ** ncpus number of cpus
  74. ** physmem physical memory size in KB
  75. ** size size of a file or filesystem
  76. ** idletime seconds of idle time
  77. ** walltime wall clock time for a pid
  78. ** loadave current load average
  79. ** quota quota information (sizes in kb)
  80. ** netload number of bytes transferred for all interfaces
  81. */
  82. #ifndef MAX_LINE
  83. #define MAX_LINE 1024
  84. #endif
  85. #ifndef TRUE
  86. #define FALSE 0
  87. #define TRUE 1
  88. #endif /* TRUE */
  89. static char procfs[] = "/proc";
  90. static DIR *pdir = NULL;
  91. static int pagesize;
  92. extern char *ret_string;
  93. extern time_t time_now;
  94. #define TBL_INC 200 /* initial proc table */
  95. #define PMEMBUF_SIZE 2048
  96. static proc_stat_t *proc_array = NULL;
  97. static int nproc = 0;
  98. static int max_proc = 0;
  99. /*
  100. ** external functions and data
  101. */
  102. extern tlist_head svr_alljobs;
  103. extern struct config *search(struct config *,char *);
  104. extern struct rm_attribute *momgetattr(char *);
  105. extern long system_ncpus;
  106. #ifdef NUMA_SUPPORT
  107. extern int num_node_boards;
  108. extern nodeboard node_boards[];
  109. extern int numa_index;
  110. #else
  111. extern char path_meminfo[MAX_LINE];
  112. #endif /* NUMA_SUPPORT */
  113. /*
  114. ** local functions and data
  115. */
  116. static const char *resi (struct rm_attribute *);
  117. static const char *totmem (struct rm_attribute *);
  118. static const char *availmem (struct rm_attribute *);
  119. static const char *physmem (struct rm_attribute *);
  120. static const char *ncpus (struct rm_attribute *);
  121. static const char *walltime (struct rm_attribute *);
  122. static const char *quota (struct rm_attribute *);
  123. static const char *netload (struct rm_attribute *);
  124. #ifdef NUMA_SUPPORT
  125. const char *cpuact (struct rm_attribute *);
  126. #endif
  127. #ifdef USELIBMEMACCT
  128. #ifdef __cplusplus
  129. extern "C"
  130. {
  131. #endif
  132. long long get_memacct_resi(pid_t pid);
  133. extern long get_weighted_memory_size(pid_t);
  134. #ifdef __cplusplus
  135. }
  136. #endif
  137. #endif
  138. #ifndef mbool_t
  139. #define mbool_t char
  140. #endif /* mbool_t */
  141. mbool_t ProcIsChild(char *,pid_t,char *);
  142. extern const char *loadave(struct rm_attribute *);
  143. extern const char *nullproc(struct rm_attribute *);
  /* Exported wait interval; initial value is 10. */
  time_t wait_time = 10;

  #ifdef NUMA_SUPPORT
  /* Pair of running cpu counters, kept in cpu_array for NUMA builds. */
  typedef struct proc_cpu
    {
    unsigned long long idle_total;
    unsigned long long busy_total;
    } proc_cpu_t;

  static proc_cpu_t *cpu_array = NULL;
  #endif
  153. /*
  154. ** local resource array
  155. */
  156. struct config dependent_config[] =
  157. {
  158. { "resi", {resi} },
  159. { "totmem", {totmem} },
  160. { "availmem", {availmem} },
  161. { "physmem", {physmem} },
  162. { "ncpus", {ncpus} },
  163. #ifdef NUMA_SUPPORT
  164. { "loadave", {cpuact} },
  165. #else
  166. { "loadave", {loadave} },
  167. #endif
  168. { "walltime", {walltime} },
  169. { "quota", {quota} },
  170. { "netload", {netload} },
  171. { "size", {size} },
  172. { NULL, {nullproc} }
  173. };
  /* Value of the "btime" entry of /proc/stat; 0 until proc_get_btime() succeeds. */
  unsigned linux_time = 0;

  /*
   * proc_get_btime() - support routine for getting system time.
   *
   * Scans /proc/stat one line at a time looking for the line whose first
   * token is "btime" and stores the number that follows in linux_time.
   * If the file cannot be opened, the label is never found, or the value
   * cannot be parsed, linux_time is left unchanged.
   */
  void proc_get_btime(void)

    {
    FILE *fp;
    char  label[256];

    if ((fp = fopen("/proc/stat", "r")) == NULL)
      {
      return;
      }

    /* The field width keeps an over-long token from overrunning label[]. */
    while (fscanf(fp, "%255s", label) == 1)
      {
      if (strcmp(label, "btime") == 0)
        {
        if (fscanf(fp, "%u", &linux_time) != 1)
          {
          /* unparsable value: keep the previous linux_time */
          }

        break;
        }

      /* Not the line we want: discard the rest of it.  Both directives
       * are suppressed, so anything but 0 here means end of file. */
      if (fscanf(fp, "%*[^\n]%*c") != 0)
        {
        break;
        }
      }  /* END while (fscanf()) */

    fclose(fp);

    return;
    }  /* END proc_get_btime() */
  /* NOTE: see 'man 5 proc' for /proc/pid/stat format and description */
  /* NOTE: leading '*' indicates that field should be ignored */
  /* FORMAT: <PID> <COMM> <STATE> <PPID> <PGRP> <SESSION> [<TTY_NR>] [<TPGID>] <FLAGS> [<MINFLT>] [<CMINFLT>] [<MAJFLT>] [<CMAJFLT>] <UTIME> <STIME> <CUTIME> <CSTIME> [<PRIORITY>] [<NICE>] [<0>] [<ITREALVALUE>] <STARTTIME> <VSIZE> <RSS> [<RLIM>] [<STARTCODE>] ... */
  /* This format is applied to the text after the closing ')' of <COMM>;
   * it converts twelve values: state, ppid, pgrp, session, flags, utime,
   * stime, cutime, cstime, starttime, vsize and rss. */
  static char stat_str[] =
    " %c %d %d %d %*d %*d %u %*u "
    "%*u %*u %*u %lu %lu %lu %lu %*ld %*ld %*u %*ld %lu %llu %lld %*lu %*lu "
    "%*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu";
  /*
   * Convert jiffies to seconds.
   *
   * Hertz is sysconf(_SC_CLK_TCK) in get_proc_stat(); the macro expands to
   * a reference to whatever 'Hertz' is in scope at the point of use.
   *
   * The expansion is fully parenthesised and carries no trailing ';', so
   * it can be used inside a larger expression as well as on its own.
   */
  #define JTOS(x) ((x) / Hertz)
  223. /*
  224. * Linux /proc status routine.
  225. *
  226. * Returns a pointer to a static proc_stat_t structure given
  227. * a process number, or NULL if there is an error. Takes the
  228. * place of the ioctl call PIOCSTATUS in the irix imp of mom_mach.c
  229. *
  230. */
  231. proc_stat_t *get_proc_stat(
  232. int pid) /* I */
  233. {
  234. static proc_stat_t ps;
  235. static char path[MAXLINE];
  236. static char readbuf[MAXLINE << 2];
  237. static char *lastbracket;
  238. FILE *fd;
  239. unsigned long jstarttime; /* number of jiffies since OS start time when process started */
  240. struct stat sb;
  241. static int Hertz = 0;
  242. int Hertz_errored = 0;
  243. if (Hertz <= 0)
  244. {
  245. Hertz = sysconf(_SC_CLK_TCK); /* returns 0 on error */
  246. if (Hertz <= 0)
  247. {
  248. /* FAILURE */
  249. if (!Hertz_errored)
  250. log_err(errno, "get_proc_stat", "sysconf(_SC_CLK_TCK) failed, unable to monitor processes");
  251. Hertz_errored = 1;
  252. return(NULL);
  253. }
  254. }
  255. Hertz_errored = 0;
  256. sprintf(path, "/proc/%d/stat",
  257. pid);
  258. if ((fd = fopen(path, "r")) == NULL)
  259. {
  260. /* FAILURE */
  261. return(NULL);
  262. }
  263. /* use 'man 5 proc' for /proc/pid/stat format */
  264. if (!fgets(readbuf, sizeof(readbuf), fd))
  265. {
  266. fclose(fd);
  267. return(NULL);
  268. }
  269. lastbracket = strrchr(readbuf, ')');
  270. if (lastbracket == NULL)
  271. {
  272. fclose(fd);
  273. return(NULL);
  274. }
  275. *lastbracket = '\0'; /* We basically split the string here, overwriting the ')'. */
  276. lastbracket++;
  277. if (sscanf(readbuf,"%d (%[^\n]",&ps.pid,path) != 2)
  278. {
  279. /* FAILURE */
  280. fclose(fd);
  281. return(NULL);
  282. }
  283. /* see stat_str[] value for mapping 'stat' format */
  284. if (sscanf(lastbracket,stat_str,
  285. &ps.state, /* state (one of RSDZTW) */
  286. &ps.ppid, /* ppid */
  287. &ps.pgrp, /* pgrp */
  288. &ps.session, /* session id */
  289. &ps.flags, /* flags - kernel flags of the process, see the PF_* in <linux/sched.h> */
  290. &ps.utime, /* utime - jiffies that this process has been scheduled in user mode */
  291. &ps.stime, /* stime - jiffies that this process has been scheduled in kernel mode */
  292. &ps.cutime, /* cutime - jiffies that this process’s waited-for children have been scheduled in user mode */
  293. &ps.cstime, /* cstime - jiffies that this process’s waited-for children have been scheduled in kernel mode */
  294. &jstarttime, /* starttime */
  295. &ps.vsize, /* vsize */
  296. &ps.rss) != 12) /* rss */
  297. {
  298. /* FAILURE */
  299. fclose(fd);
  300. return(NULL);
  301. }
  302. if (fstat(fileno(fd), &sb) == -1)
  303. {
  304. /* FAILURE */
  305. fclose(fd);
  306. return(NULL);
  307. }
  308. ps.uid = sb.st_uid;
  309. ps.start_time = linux_time + JTOS(jstarttime);
  310. ps.name = path;
  311. ps.utime = JTOS(ps.utime);
  312. ps.stime = JTOS(ps.stime);
  313. ps.cutime = JTOS(ps.cutime);
  314. ps.cstime = JTOS(ps.cstime);
  315. /* SUCCESS */
  316. fclose(fd);
  317. return(&ps);
  318. } /* END get_proc_stat() */
  319. #ifdef USELIBMEMACCT
  320. /*
  321. * Retrieve weighted RSS value for process with pid from memacctd.
  322. * Returns the value in bytes on success, returns -1 on failure.
  323. */
  324. long long get_memacct_resi(pid_t pid)
  325. {
  326. long long w_rss;
  327. if ((w_rss = get_weighted_memory_size(pid)) == -1)
  328. {
  329. sprintf(log_buffer, "get_weighted_memory_size(%d) failed", pid);
  330. log_err(errno, __func__, log_buffer);
  331. }
  332. return(w_rss);
  333. } /* END get_memacct_resi() */
  334. #endif
  335. /*
  336. * get_proc_mem_from_path()
  337. * @returns a pointer to a struct containing the memory information
  338. * @pre-cond: path must point to a valid path of a meminfo system file
  339. */
  340. proc_mem_t *get_proc_mem_from_path(
  341. const char *path)
  342. {
  343. proc_mem_t *mm;
  344. FILE *fp;
  345. char str[32];
  346. long long bfsz = -1;
  347. long long casz = -1;
  348. long long fcasz = -1;
  349. if ((fp = fopen(path,"r")) == NULL)
  350. {
  351. return(NULL);
  352. }
  353. mm = (proc_mem_t *)calloc(1, sizeof(proc_mem_t));
  354. if (fscanf(fp,"%30s",str) != 1)
  355. {
  356. fclose(fp);
  357. return(NULL);
  358. }
  359. if (!strncmp(str,"total:",sizeof(str)))
  360. {
  361. /* old format */
  362. if (fscanf(fp,"%*[^\n]%*c") != 0) /* remove text header */
  363. {
  364. fclose(fp);
  365. return(NULL);
  366. }
  367. /* umu vmem patch */
  368. if (fscanf(fp, "%*s %llu %llu %llu %*u %lld %lld",
  369. &mm->mem_total,
  370. &mm->mem_used,
  371. &mm->mem_free,
  372. &bfsz,
  373. &casz) != 5)
  374. {
  375. fclose(fp);
  376. return(NULL);
  377. }
  378. mm->mem_free += casz + bfsz;
  379. if (fscanf(fp, "%*s %llu %llu %llu %*[^\n]%*c",
  380. &mm->swap_total,
  381. &mm->swap_used,
  382. &mm->swap_free) != 3)
  383. {
  384. fclose(fp);
  385. return(NULL);
  386. }
  387. }
  388. else
  389. {
  390. do
  391. {
  392. /* new format (kernel > 2.4) the first 'str' has been read */
  393. if (!strncmp(str, "MemTotal:", sizeof(str)))
  394. {
  395. if (fscanf(fp, "%llu",
  396. &mm->mem_total) != 1)
  397. {
  398. fclose(fp);
  399. return(NULL);
  400. }
  401. mm->mem_total *= 1024; /* the unit is kB */
  402. }
  403. else if (!strncmp(str, "MemFree:", sizeof(str)))
  404. {
  405. if (fscanf(fp, "%llu",
  406. &mm->mem_free) != 1)
  407. {
  408. fclose(fp);
  409. return(NULL);
  410. }
  411. mm->mem_free *= 1024;
  412. }
  413. else if (!strncmp(str, "Buffers:", sizeof(str)))
  414. {
  415. if (fscanf(fp, "%lld",
  416. &bfsz) != 1)
  417. {
  418. fclose(fp);
  419. return(NULL);
  420. }
  421. bfsz *= 1024;
  422. }
  423. else if (!strncmp(str, "Cached:", sizeof(str)))
  424. {
  425. if (fscanf(fp, "%lld",
  426. &casz) != 1)
  427. {
  428. fclose(fp);
  429. return(NULL);
  430. }
  431. casz *= 1024;
  432. }
  433. else if (!strncmp(str, "FilePages:", sizeof(str)))
  434. {
  435. if (fscanf(fp, "%lld",
  436. &fcasz) != 1)
  437. {
  438. fclose(fp);
  439. return(NULL);
  440. }
  441. fcasz *= 1024;
  442. }
  443. else if (!strncmp(str, "SwapTotal:", sizeof(str)))
  444. {
  445. if (fscanf(fp, "%llu",
  446. &mm->swap_total) != 1)
  447. {
  448. fclose(fp);
  449. return(NULL);
  450. }
  451. mm->swap_total *= 1024;
  452. }
  453. else if (!strncmp(str, "SwapFree:", sizeof(str)))
  454. {
  455. if (fscanf(fp, "%llu",
  456. &mm->swap_free) != 1)
  457. {
  458. fclose(fp);
  459. return(NULL);
  460. }
  461. mm->swap_free *= 1024;
  462. }
  463. }
  464. while (fscanf(fp, "%30s", str) == 1);
  465. } /* END else */
  466. fclose(fp);
  467. if (bfsz >= 0 || casz >= 0)
  468. {
  469. if (bfsz > 0)
  470. mm->mem_free += bfsz;
  471. if (casz > 0)
  472. mm->mem_free += casz;
  473. }
  474. else if (fcasz > 0)
  475. {
  476. mm->mem_free += fcasz;
  477. }
  478. return(mm);
  479. } /* END get_proc_mem_from_path() */
  480. proc_mem_t *get_proc_mem(void)
  481. {
  482. static proc_mem_t ret_mm;
  483. #ifdef NUMA_SUPPORT
  484. int i;
  485. #else
  486. proc_mem_t *mem;
  487. #endif
  488. #ifdef NUMA_SUPPORT
  489. ret_mm.mem_total = 0;
  490. ret_mm.mem_used = 0;
  491. ret_mm.mem_free = 0;
  492. ret_mm.swap_total = 0;
  493. ret_mm.swap_used = 0;
  494. ret_mm.swap_free = 0;
  495. for (i = 0; i < node_boards[numa_index].num_nodes; i++)
  496. {
  497. proc_mem_t *node_mem = get_proc_mem_from_path(node_boards[numa_index].path_meminfo[i]);
  498. if (node_mem == NULL)
  499. return(NULL);
  500. ret_mm.mem_total += node_mem->mem_total;
  501. ret_mm.mem_used += node_mem->mem_used;
  502. ret_mm.mem_free += node_mem->mem_free;
  503. ret_mm.swap_total += node_mem->swap_total;
  504. ret_mm.swap_used += node_mem->swap_used;
  505. ret_mm.swap_free += node_mem->swap_free;
  506. free(node_mem);
  507. }
  508. #else
  509. mem = get_proc_mem_from_path(path_meminfo);
  510. if(mem == NULL)
  511. return (NULL);
  512. ret_mm.mem_total = mem->mem_total;
  513. ret_mm.mem_used = mem->mem_used;
  514. ret_mm.mem_free = mem->mem_free;
  515. ret_mm.swap_total = mem->swap_total;
  516. ret_mm.swap_used = mem->swap_used;
  517. ret_mm.swap_free = mem->swap_free;
  518. free(mem);
  519. #endif
  520. return(&ret_mm);
  521. } /* END get_proc_mem() */
  #ifdef PNOT

  /*
   * Alternate get_proc_mem(), compiled only when PNOT is defined.
   *
   * Reads the header line, a memory line and a swap line from
   * path_meminfo and reports memory plus swap combined in a static
   * proc_mem_t.  Returns NULL if the file cannot be opened.
   *
   * The counters start at zero so that a line which fails to parse
   * contributes nothing instead of an indeterminate value.
   */
  proc_mem_t *get_proc_mem(void)

    {
    static proc_mem_t mm;
    FILE             *fp;
    unsigned long     m_tot = 0, m_use = 0, m_free = 0;
    unsigned long     s_tot = 0, s_use = 0, s_free = 0;

    if ((fp = fopen(path_meminfo, "r")) == NULL)
      {
      return(NULL);
      }

    fscanf(fp, "%*[^\n]%*c");  /* remove text header */

    fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c",
           &m_tot,
           &m_use,
           &m_free);

    fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c",
           &s_tot,
           &s_use,
           &s_free);

    mm.total = m_tot + s_tot;

    mm.used = m_use + s_use;

    mm.free = m_free + s_free;

    fclose(fp);

    return(&mm);
    }  /* END get_proc_mem() */

  #endif /* PNOT */
  /*
   * sets oom_adj score for current process
   * requires root privileges or CAP_SYS_RESOURCE to succeed
   *
   * @param score  value to write to /proc/<pid>/oom_adj; accepted range
   *               is -17 to 15 inclusive
   * @return       the result of write() (bytes written) on success, or -1
   *               if score is out of range, a string could not be
   *               formatted, the file could not be opened, or the write
   *               failed
   *
   * Both strings are built before the file is opened so that no failure
   * path can return with the descriptor still open.
   */
  static int oom_adj(int score)

    {
    char oom_adj_path[PATH_MAX] = "";
    char adj_value[128] = "";
    int  len;
    int  fd;
    int  rc;

    /* valid values are -17 to 15 */

    if (score > 15 || score < -17)
      return -1;

    len = snprintf(oom_adj_path, sizeof(oom_adj_path), "/proc/%d/oom_adj", (int)getpid());

    if (len < 0 || (size_t)len >= sizeof(oom_adj_path))
      return -1;

    len = snprintf(adj_value, sizeof(adj_value), "%d", score);

    if (len < 0 || (size_t)len >= sizeof(adj_value))
      return -1;

    if ((fd = open(oom_adj_path, O_RDWR)) == -1)
      return -1;

    rc = write(fd, adj_value, (size_t)len);

    close(fd);

    return rc;
    }
  573. void dep_initialize(void)
  574. {
  575. pagesize = getpagesize();
  576. if ((pdir = opendir(procfs)) == NULL)
  577. {
  578. log_err(errno, __func__, "opendir");
  579. return;
  580. }
  581. /* NOTE: /proc/<pid>/oom_adj tunable is linux specific */
  582. /* LKF: make pbs_mom processes immune to oom killer's killing frenzy if requested*/
  583. if (mom_oom_immunize != 0)
  584. {
  585. if (oom_adj(-17) < 0)
  586. {
  587. log_record(
  588. PBSEVENT_SYSTEM,
  589. PBS_EVENTCLASS_SERVER,
  590. __func__,
  591. "failed to make pbs_mom oom-killer immune");
  592. }
  593. else
  594. {
  595. log_record(
  596. PBSEVENT_SYSTEM,
  597. PBS_EVENTCLASS_SERVER,
  598. __func__,
  599. "mom is now oom-killer safe");
  600. }
  601. }
  602. proc_get_btime();
  603. return;
  604. } /* END dep_initialize() */
  605. void dep_cleanup(void)
  606. {
  607. log_record(PBSEVENT_SYSTEM, 0, __func__, "dependent cleanup");
  608. if (pdir)
  609. {
  610. closedir(pdir);
  611. pdir = NULL;
  612. }
  613. return;
  614. }
  /*
   * This routine is called on each cycle of the main loop.
   * The Linux implementation has no periodic work, so it does nothing.
   */
  void dep_main_loop_cycle(void)

    {
    /* No periodic functions. */

    return;
    }
  623. /*
  624. * Internal size decoding routine.
  625. *
  626. * Accepts a resource pointer and a pointer to the unsigned long integer
  627. * to receive the decoded value. It returns a PBS error code, and the
  628. * decoded value in the unsigned long integer.
  629. *
  630. * sizeof(word) = sizeof(int)
  631. */
  632. static int mm_getsize(
  633. resource *pres, /* I */
  634. unsigned long *ret) /* O */
  635. {
  636. unsigned long value;
  637. if (pres->rs_value.at_type != ATR_TYPE_SIZE)
  638. {
  639. return(PBSE_ATTRTYPE);
  640. }
  641. value = pres->rs_value.at_val.at_size.atsv_num;
  642. if (pres->rs_value.at_val.at_size.atsv_units == ATR_SV_WORDSZ)
  643. {
  644. if (value > ULONG_MAX / sizeof(int))
  645. {
  646. return(PBSE_BADATVAL);
  647. }
  648. value *= sizeof(int);
  649. }
  650. if (value > (ULONG_MAX >> pres->rs_value.at_val.at_size.atsv_shift))
  651. {
  652. return(PBSE_BADATVAL);
  653. }
  654. *ret = (value << pres->rs_value.at_val.at_size.atsv_shift);
  655. return(PBSE_NONE);
  656. } /* END mm_getsize() */
  657. /*
  658. * Internal time decoding routine.
  659. *
  660. * Accepts a resource pointer and a pointer to the unsigned long integer
  661. * to receive the decoded value. It returns a PBS error code, and the
  662. * decoded value of time in seconds in the unsigned long integer.
  663. */
  664. static int mm_gettime(
  665. resource *pres,
  666. unsigned long *ret)
  667. {
  668. if (pres->rs_value.at_type != ATR_TYPE_LONG)
  669. {
  670. return(PBSE_ATTRTYPE);
  671. }
  672. if (pres->rs_value.at_val.at_long < 0)
  673. {
  674. return(PBSE_BADATVAL);
  675. }
  676. *ret = pres->rs_value.at_val.at_long;
  677. return(PBSE_NONE);
  678. }
  679. static int injob(
  680. job *pjob,
  681. pid_t sid)
  682. {
  683. task *ptask;
  684. pid_t pid;
  685. #ifdef PENABLE_LINUX26_CPUSETS
  686. struct pidl *pids = NULL;
  687. struct pidl *pp;
  688. #else
  689. proc_stat_t *ps;
  690. #endif /* PENABLE_LINUX26_CPUSETS */
  691. for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
  692. ptask != NULL;
  693. ptask = (task *)GET_NEXT(ptask->ti_jobtask))
  694. {
  695. if (ptask->ti_qs.ti_sid <= 1)
  696. continue;
  697. if (ptask->ti_qs.ti_sid == sid)
  698. {
  699. return(TRUE);
  700. }
  701. }
  702. /* processes with a different sessionid are not necessarily not part of the
  703. job: the job can call setsid; need to check whether one of the parent
  704. processes has a sessionid that is in the job */
  705. #ifdef PENABLE_LINUX26_CPUSETS
  706. /* check whether the sid is in the job's cpuset */
  707. pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids);
  708. pp = pids;
  709. while (pp != NULL)
  710. {
  711. pid = pp->pid;
  712. pp = pp->next;
  713. if (pid == sid)
  714. {
  715. free_pidlist(pids);
  716. return(TRUE);
  717. }
  718. }
  719. free_pidlist(pids);
  720. #else
  721. /* get the parent process id of the sid and check whether it is part of
  722. the job; iterate */
  723. pid = sid;
  724. while (pid > 1)
  725. {
  726. if ((ps = get_proc_stat(pid)) == NULL)
  727. {
  728. if (errno != ENOENT)
  729. {
  730. sprintf(log_buffer, "%d: get_proc_stat", pid);
  731. log_err(errno, __func__, log_buffer);
  732. }
  733. return(FALSE);
  734. }
  735. pid = getsid(ps->ppid);
  736. for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
  737. ptask != NULL;
  738. ptask = (task *)GET_NEXT(ptask->ti_jobtask))
  739. {
  740. if (ptask->ti_qs.ti_sid <= 1)
  741. continue;
  742. if (ptask->ti_qs.ti_sid == pid)
  743. {
  744. return(TRUE);
  745. }
  746. }
  747. }
  748. #endif /* PENABLE_LINUX26_CPUSETS */
  749. return(FALSE);
  750. } /* END injob() */
  751. /*
  752. * Internal session CPU time decoding routine.
  753. *
  754. * Accepts a job pointer. Returns the sum of all cpu time
  755. * consumed for all tasks executed by the job, in seconds,
  756. * adjusted by cputfactor.
  757. */
  758. static unsigned long cput_sum(
  759. job *pjob) /* I */
  760. {
  761. ulong cputime;
  762. int nps = 0;
  763. int i;
  764. proc_stat_t *ps;
  765. cputime = 0;
  766. if (LOGLEVEL >= 6)
  767. {
  768. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  769. pjob->ji_qs.ji_jobid);
  770. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  771. }
  772. for (i = 0;i < nproc;i++)
  773. {
  774. ps = &proc_array[i];
  775. if ((LOGLEVEL >= 6) && (ps == NULL))
  776. {
  777. sprintf(log_buffer, "proc_array loop end - nproc=%d, i=%d, ps is null",
  778. nproc,
  779. i);
  780. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  781. }
  782. if (!injob(pjob, ps->session))
  783. continue;
  784. nps++;
  785. cputime += (ps->utime + ps->stime + ps->cutime + ps->cstime);
  786. if (LOGLEVEL >= 6)
  787. {
  788. sprintf(log_buffer, "%s: session=%d pid=%d cputime=%lu (cputfactor=%f)",
  789. __func__,
  790. ps->session,
  791. ps->pid,
  792. cputime,
  793. cputfactor);
  794. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  795. }
  796. } /* END for (i) */
  797. if (nps == 0)
  798. pjob->ji_flags |= MOM_NO_PROC;
  799. else
  800. pjob->ji_flags &= ~MOM_NO_PROC;
  801. return((unsigned long)((double)cputime * cputfactor));
  802. } /* END cput_sum() */
  803. /*
  804. * Return TRUE if any process in the job is over limit for cputime usage.
  805. */
  806. static int overcpu_proc(
  807. job *pjob,
  808. unsigned long limit) /* I */
  809. {
  810. ulong cputime;
  811. pid_t pid;
  812. proc_stat_t *ps;
  813. #ifdef PENABLE_LINUX26_CPUSETS
  814. struct pidl *pids = NULL;
  815. struct pidl *pp;
  816. #else
  817. struct dirent *dent;
  818. #endif /* PENABLE_LINUX26_CPUSETS */
  819. #ifdef PENABLE_LINUX26_CPUSETS
  820. /* Instead of collect stats of all processes running on a large SMP system,
  821. * collect stats of processes running in and below the cpuset of the job, only. */
  822. pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids);
  823. pp = pids;
  824. while (pp != NULL)
  825. {
  826. pid = pp->pid;
  827. pp = pp->next;
  828. #else
  829. rewinddir(pdir);
  830. while ((dent = readdir(pdir)) != NULL)
  831. {
  832. if (!isdigit(dent->d_name[0]))
  833. continue;
  834. pid = atoi(dent->d_name);
  835. #endif /* PENABLE_LINUX26_CPUSETS */
  836. if ((ps = get_proc_stat(pid)) == NULL)
  837. {
  838. if (errno != ENOENT)
  839. {
  840. sprintf(log_buffer, "%d: get_proc_stat", pid);
  841. log_err(errno, __func__, log_buffer);
  842. }
  843. continue;
  844. }
  845. #ifndef PENABLE_LINUX26_CPUSETS
  846. /* if it was in the cpuset, its part of the job, no need to check */
  847. if (!injob(pjob, ps->session))
  848. continue;
  849. #endif /* PENABLE_LINUX26_CPUSETS */
  850. /* change from ps->cutime to ps->utime, and ps->cstime to ps->stime */
  851. cputime = (ulong)((double)(ps->utime + ps->stime) * cputfactor);
  852. if (cputime > limit)
  853. {
  854. #ifdef PENABLE_LINUX26_CPUSETS
  855. free_pidlist(pids);
  856. #endif
  857. return(TRUE);
  858. }
  859. }
  860. #ifdef PENABLE_LINUX26_CPUSETS
  861. free_pidlist(pids);
  862. #endif
  863. return(FALSE);
  864. } /* END overcpu_proc() */
  865. /*
  866. * Internal session virtual memory usage function.
  867. *
  868. * Returns the total number of bytes of address
  869. * space consumed by all current processes within the job.
  870. */
  871. static unsigned long long mem_sum(
  872. job *pjob)
  873. {
  874. int i;
  875. unsigned long long segadd;
  876. proc_stat_t *ps;
  877. segadd = 0;
  878. if (LOGLEVEL >= 6)
  879. {
  880. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  881. pjob->ji_qs.ji_jobid);
  882. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  883. }
  884. for (i = 0;i < nproc;i++)
  885. {
  886. ps = &proc_array[i];
  887. if (!injob(pjob, ps->session))
  888. continue;
  889. segadd += ps->vsize;
  890. if (LOGLEVEL >= 6)
  891. {
  892. sprintf(log_buffer, "%s: session=%d pid=%d vsize=%llu sum=%llu",
  893. __func__,
  894. ps->session,
  895. ps->pid,
  896. ps->vsize,
  897. segadd);
  898. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  899. }
  900. } /* END for (i) */
  901. return(segadd);
  902. } /* END mem_sum() */
  903. /*
  904. * Internal session memory usage function.
  905. *
  906. * Returns the total number of bytes of resident memory
  907. * consumed by all current processes within the job.
  908. */
  909. static unsigned long long resi_sum(
  910. job *pjob)
  911. {
  912. int i;
  913. unsigned long long resisize;
  914. proc_stat_t *ps;
  915. #ifdef USELIBMEMACCT
  916. long long w_rss;
  917. #endif
  918. resisize = 0;
  919. if (LOGLEVEL >= 6)
  920. {
  921. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  922. pjob->ji_qs.ji_jobid);
  923. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  924. }
  925. for (i = 0;i < nproc;i++)
  926. {
  927. ps = &proc_array[i];
  928. if (!injob(pjob, ps->session))
  929. continue;
  930. #ifdef USELIBMEMACCT
  931. /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */
  932. w_rss = get_memacct_resi(ps->pid);
  933. if (w_rss == -1)
  934. resisize += ps->rss * pagesize;
  935. else
  936. resisize += w_rss;
  937. if (LOGLEVEL >= 6)
  938. {
  939. sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu w_rss=%ld sum=%llu",
  940. __func__,
  941. ps->session,
  942. ps->pid,
  943. ps->rss * pagesize,
  944. w_rss,
  945. resisize);
  946. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  947. }
  948. #else
  949. resisize += ps->rss * pagesize;
  950. if (LOGLEVEL >= 6)
  951. {
  952. sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu sum=%llu",
  953. __func__,
  954. ps->session,
  955. ps->pid,
  956. ps->rss * pagesize,
  957. resisize);
  958. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  959. }
  960. #endif
  961. } /* END for (i) */
  962. return(resisize);
  963. } /* END resi_sum() */
  964. /*
  965. * Return TRUE if any process in the job is over limit for virtual memory usage.
  966. */
  967. static int overmem_proc(
  968. job *pjob, /* I */
  969. unsigned long long limit) /* I */
  970. {
  971. int i;
  972. proc_stat_t *ps;
  973. if (LOGLEVEL >= 6)
  974. {
  975. sprintf(log_buffer, "proc_array loop start - jobid = %s",
  976. pjob->ji_qs.ji_jobid);
  977. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  978. }
  979. for (i = 0;i < nproc;i++)
  980. {
  981. ps = &proc_array[i];
  982. if (!injob(pjob, ps->session))
  983. continue;
  984. if (ps->vsize > limit)
  985. {
  986. return(TRUE);
  987. }
  988. } /* END for (i) */
  989. return(FALSE);
  990. } /* END overmem_proc() */
  /* printf-style template used by error(); defined elsewhere. */
  extern char *msg_momsetlim;

  /*
   * Internal error routine
   *
   * Looks up the text for PBS error code 'value', prints it together
   * with 'string' on standard error using msg_momsetlim, and hands
   * 'value' back so callers can write "return(error(name, rc))".
   *
   * 'string' must be a non-empty string and the code must map to a
   * non-empty message; both are enforced with assert().
   */
  int error(

    const char *string,
    int         value)

    {
    char *message;

    /* caller contract */
    assert(string != NULL);
    assert(*string != '\0');

    message = pbse_to_txt(value);

    /* every code is expected to have text */
    assert(message != NULL);
    assert(*message != '\0');

    fprintf(stderr, msg_momsetlim, string, message);
    fflush(stderr);

    return(value);
    }  /* END error() */
  1009. /*
  1010. * Establish system-enforced limits for the job.
  1011. *
  1012. * Run through the resource list, checking the values for all items
  1013. * we recognize.
  1014. *
  1015. * If set_mode is SET_LIMIT_SET, then also set hard limits for the
  1016. * system enforced limits (not-polled).
  1017. * If anything goes wrong with the process, return a PBS error code
  1018. * and print a message on standard error. A zero-length resource list
  1019. * is not an error.
  1020. *
  1021. * If set_mode is SET_LIMIT_SET the entry conditions are:
  1022. * 1. MOM has already forked, and we are called from the child.
  1023. * 2. The child is still running as root.
  1024. * 3. Standard error is open to the user's file.
  1025. *
  1026. * If set_mode is SET_LIMIT_ALTER, we are being called to modify
  1027. * existing limits. Cannot alter those set by setrlimit (kernel)
  1028. * because we are the wrong process.
  1029. */
  1030. int mom_set_limits(
  1031. job *pjob, /* I */
  1032. int set_mode) /* SET_LIMIT_SET or SET_LIMIT_ALTER */
  1033. {
  1034. const char *pname = NULL;
  1035. int retval;
  1036. unsigned long value; /* place in which to build resource value */
  1037. resource *pres;
  1038. struct rlimit reslim;
  1039. unsigned long vmem_limit = 0;
  1040. unsigned long mem_limit = 0;
  1041. /* NOTE: log_buffer is exported */
  1042. if (LOGLEVEL >= 2)
  1043. {
  1044. sprintf(log_buffer, "%s(%s,%s) entered",
  1045. __func__,
  1046. (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
  1047. (set_mode == SET_LIMIT_SET) ? "set" : "alter");
  1048. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1049. log_buffer[0] = '\0';
  1050. }
  1051. assert(pjob != NULL);
  1052. assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
  1053. pres = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
  1054. /*
  1055. * cycle through all the resource specifications,
  1056. * setting limits appropriately.
  1057. */
  1058. memset(&reslim, 0, sizeof(reslim));
  1059. /* set oom_adj score for the starting job */
  1060. /* if immunize mode is set to on, we have to set child score to 0 */
  1061. if ( (set_mode == SET_LIMIT_SET) && ( job_oom_score_adjust != 0 || mom_oom_immunize != 0 ) )
  1062. {
  1063. retval = oom_adj(job_oom_score_adjust);
  1064. if ( LOGLEVEL >= 2 )
  1065. {
  1066. sprintf(log_buffer, "setting oom_adj '%s'",
  1067. (retval != -1) ? "succeeded" : "failed");
  1068. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1069. }
  1070. };
  1071. while (pres != NULL)
  1072. {
  1073. if (pres->rs_defin != NULL)
  1074. pname = pres->rs_defin->rs_name;
  1075. else
  1076. pname = NULL;
  1077. if (LOGLEVEL >= 2)
  1078. {
  1079. sprintf(log_buffer, "setting limit for attribute '%s'",
  1080. (pname != NULL) ? pname : "NULL");
  1081. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1082. log_buffer[0] = '\0';
  1083. }
  1084. assert(pres->rs_defin != NULL);
  1085. assert(pname != NULL);
  1086. assert(pname[0] != '\0');
  1087. if (!strcmp(pname, "cput"))
  1088. {
  1089. if (igncput == FALSE)
  1090. {
  1091. /* cpu time - check, if less than pcput use it */
  1092. retval = mm_gettime(pres, &value);
  1093. if (retval != PBSE_NONE)
  1094. {
  1095. sprintf(log_buffer, "cput mm_gettime failed in %s", __func__);
  1096. return(error(pname, retval));
  1097. }
  1098. }
  1099. }
  1100. else if (!strcmp(pname, "pcput"))
  1101. {
  1102. if (igncput == FALSE)
  1103. {
  1104. if (set_mode == SET_LIMIT_SET)
  1105. {
  1106. /* process cpu time - set */
  1107. retval = mm_gettime(pres, &value);
  1108. if (retval != PBSE_NONE)
  1109. {
  1110. sprintf(log_buffer, "pcput mm_gettime failed in %s", __func__);
  1111. return(error(pname, retval));
  1112. }
  1113. reslim.rlim_cur = reslim.rlim_max =
  1114. (unsigned long)((double)value / cputfactor);
  1115. if (LOGLEVEL >= 2)
  1116. {
  1117. sprintf(log_buffer, "setting cpu time limit to %ld for job %s",
  1118. (long int)reslim.rlim_cur,
  1119. pjob->ji_qs.ji_jobid);
  1120. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1121. log_buffer[0] = '\0';
  1122. }
  1123. /* NOTE: some versions of linux have a bug which causes the parent
  1124. process to receive a SIGKILL if the child's cpu limit is exceeded */
  1125. if (setrlimit(RLIMIT_CPU, &reslim) < 0)
  1126. {
  1127. sprintf(log_buffer, "setrlimit for RLIMIT_CPU failed in %s, errno=%d (%s)",
  1128. __func__,
  1129. errno, strerror(errno));
  1130. return(error("RLIMIT_CPU", PBSE_SYSTEM));
  1131. }
  1132. } /* END if (set_mode == SET_LIMIT_SET) */
  1133. }
  1134. }
  1135. else if (!strcmp(pname, "file"))
  1136. {
  1137. /* set */
  1138. if (set_mode == SET_LIMIT_SET)
  1139. {
  1140. retval = mm_getsize(pres, &value);
  1141. if (retval != PBSE_NONE)
  1142. {
  1143. sprintf(log_buffer, "mm_getsize() failed for file in %s",
  1144. __func__);
  1145. return(error(pname, retval));
  1146. }
  1147. if (value > ULONG_MAX)
  1148. {
  1149. if (LOGLEVEL >= 0)
  1150. {
  1151. sprintf(log_buffer, "cannot set file limit to %ld for job %s (value too large)",
  1152. (long int)reslim.rlim_cur,
  1153. pjob->ji_qs.ji_jobid);
  1154. log_err(-1, __func__, log_buffer);
  1155. log_buffer[0] = '\0';
  1156. }
  1157. return(error(pname, PBSE_BADATVAL));
  1158. }
  1159. reslim.rlim_cur = reslim.rlim_max = value;
  1160. if (setrlimit(RLIMIT_FSIZE, &reslim) < 0)
  1161. {
  1162. sprintf(log_buffer, "cannot set file limit to %ld for job %s (setrlimit failed - check default user limits)",
  1163. (long int)reslim.rlim_max,
  1164. pjob->ji_qs.ji_jobid);
  1165. log_err(errno, __func__, log_buffer);
  1166. log_buffer[0] = '\0';
  1167. return(error(pname, PBSE_SYSTEM));
  1168. }
  1169. }
  1170. }
  1171. else if (!strcmp(pname, "vmem"))
  1172. {
  1173. if (ignvmem == FALSE)
  1174. {
  1175. /* check */
  1176. retval = mm_getsize(pres, &value);
  1177. if (retval != PBSE_NONE)
  1178. {
  1179. sprintf(log_buffer, "mm_getsize() failed for vmem in %s", __func__);
  1180. return(error(pname, retval));
  1181. }
  1182. if ((vmem_limit == 0) || (value < vmem_limit))
  1183. vmem_limit = value;
  1184. }
  1185. }
  1186. else if (!strcmp(pname, "pvmem"))
  1187. {
  1188. if (ignvmem == FALSE)
  1189. {
  1190. /* set */
  1191. if (set_mode == SET_LIMIT_SET)
  1192. {
  1193. retval = mm_getsize(pres, &value);
  1194. if (retval != PBSE_NONE)
  1195. {
  1196. sprintf(log_buffer, "mm_getsize() failed for pvmem in %s",
  1197. __func__);
  1198. return(error(pname, retval));
  1199. }
  1200. if (value > ULONG_MAX)
  1201. {
  1202. log_buffer[0] = '\0';
  1203. sprintf(log_buffer, "invalid value returned by mm_getsize() for pvmem in %s",
  1204. __func__);
  1205. return(error(pname, PBSE_BADATVAL));
  1206. }
  1207. if ((vmem_limit == 0) || (value < vmem_limit))
  1208. vmem_limit = value;
  1209. }
  1210. }
  1211. }
  1212. else if ((!strcmp(pname,"mem") && (pjob->ji_numnodes != 1)) ||
  1213. !strcmp(pname,"mppmem"))
  1214. {
  1215. /* ignore. If we ever get rid of support for the UNICOS OS then we can
  1216. remove the ATR_DFLAG_MOM | ATR_DFLAG_ALTRUN flags from mppmem */
  1217. }
  1218. else if ((!strcmp(pname, "mem") && (pjob->ji_numnodes == 1)) ||
  1219. !strcmp(pname, "pmem"))
  1220. {
  1221. if (ignmem == FALSE)
  1222. {
  1223. /* set */
  1224. if (set_mode == SET_LIMIT_SET)
  1225. {
  1226. retval = mm_getsize(pres, &value);
  1227. if (retval != PBSE_NONE)
  1228. {
  1229. sprintf(log_buffer, "mm_getsize() failed for mem/pmem in %s",
  1230. __func__);
  1231. return(error(pname, retval));
  1232. }
  1233. reslim.rlim_cur = reslim.rlim_max = value;
  1234. if (setrlimit(RLIMIT_DATA, &reslim) < 0)
  1235. {
  1236. sprintf(log_buffer, "cannot set data limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1237. (long int)reslim.rlim_max,
  1238. pjob->ji_qs.ji_jobid,
  1239. errno,
  1240. strerror(errno));
  1241. return(error("RLIMIT_DATA", PBSE_SYSTEM));
  1242. }
  1243. if (setrlimit(RLIMIT_RSS, &reslim) < 0)
  1244. {
  1245. sprintf(log_buffer, "cannot set RSS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1246. (long int)reslim.rlim_max,
  1247. pjob->ji_qs.ji_jobid,
  1248. errno,
  1249. strerror(errno));
  1250. return(error("RLIMIT_RSS", PBSE_SYSTEM));
  1251. }
  1252. #ifdef __GATECH
  1253. /* NOTE: best patch may be to change to 'vmem_limit = value;' */
  1254. if (setrlimit(RLIMIT_STACK, &reslim) < 0)
  1255. {
  1256. sprintf(log_buffer, "cannot set stack limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1257. (long int)reslim.rlim_max,
  1258. pjob->ji_qs.ji_jobid,
  1259. errno,
  1260. strerror(errno));
  1261. return(error("RLIMIT_STACK", PBSE_SYSTEM));
  1262. }
  1263. /* set address space */
  1264. if (setrlimit(RLIMIT_AS, &reslim) < 0)
  1265. {
  1266. sprintf(log_buffer, "cannot set AS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
  1267. (long int)reslim.rlim_max,
  1268. pjob->ji_qs.ji_jobid,
  1269. errno,
  1270. strerror(errno));
  1271. return(error("RLIMIT_AS", PBSE_SYSTEM));
  1272. }
  1273. #endif /* __GATECH */
  1274. mem_limit = value;
  1275. if (getrlimit(RLIMIT_STACK, &reslim) >= 0)
  1276. {
  1277. /* NOTE: mem_limit no longer used with UMU patch in place */
  1278. mem_limit = value + reslim.rlim_cur;
  1279. }
  1280. }
  1281. }
  1282. } /* END else if (!strcmp(pname,"mem") && ... */
  1283. else if (!strcmp(pname, "walltime"))
  1284. {
  1285. /* check */
  1286. retval = mm_gettime(pres, &value);
  1287. if (retval != PBSE_NONE)
  1288. {
  1289. sprintf(log_buffer, "mm_gettime() failed for walltime in %s\n",
  1290. __func__);
  1291. return(error(pname, retval));
  1292. }
  1293. }
  1294. else if (!strcmp(pname, "nice"))
  1295. {
  1296. /* set nice */
  1297. if (set_mode == SET_LIMIT_SET)
  1298. {
  1299. errno = 0;
  1300. if ((nice((int)pres->rs_value.at_val.at_long) == -1) && (errno != 0))
  1301. {
  1302. sprintf(log_buffer, "nice() failed w/errno=%d (%s) in %s\n",
  1303. errno,
  1304. strerror(errno),
  1305. __func__);
  1306. return(error(pname, PBSE_BADATVAL));
  1307. }
  1308. }
  1309. }
  1310. else if (!strcmp(pname, "size"))
  1311. {
  1312. /* ignore */
  1313. /* NO-OP */
  1314. }
  1315. else if (!strcmp(pname, "prologue"))
  1316. {
  1317. }
  1318. else if (!strcmp(pname, "epilogue"))
  1319. {
  1320. }
  1321. else if ((!strcmp(pname, "mppdepth")) ||
  1322. (!strcmp(pname, "mppnodect")) ||
  1323. (!strcmp(pname, "mppwidth")) ||
  1324. (!strcmp(pname, "mppnppn")) ||
  1325. (!strcmp(pname, "mppnodes")) ||
  1326. (!strcmp(pname, "mpplabels")) ||
  1327. (!strcmp(pname, "mpparch")) ||
  1328. (!strcmp(pname, "mpplabel")))
  1329. {
  1330. /* NO-OP */
  1331. }
  1332. else if ((pres->rs_defin->rs_flags & ATR_DFLAG_RMOMIG) == 0)
  1333. {
  1334. /* don't recognize and not marked as ignore by mom */
  1335. sprintf(log_buffer, "do not know how to process resource '%s' in %s\n",
  1336. pname,
  1337. __func__);
  1338. return(error(pname, PBSE_UNKRESC));
  1339. }
  1340. pres = (resource *)GET_NEXT(pres->rs_link);
  1341. }
  1342. if (set_mode == SET_LIMIT_SET)
  1343. {
  1344. /* if either of vmem or pvmem was given, set sys limit to lesser */
  1345. if (vmem_limit != 0)
  1346. {
  1347. /* Don't make (p)vmem < pmem */
  1348. if (mem_limit > vmem_limit)
  1349. {
  1350. vmem_limit = mem_limit;
  1351. }
  1352. reslim.rlim_cur = reslim.rlim_max = vmem_limit;
  1353. if ((ignvmem == 0) && (setrlimit(RLIMIT_AS, &reslim) < 0))
  1354. {
  1355. sprintf(log_buffer, "setrlimit() failed setting AS for vmem_limit mod in %s\n",
  1356. __func__);
  1357. return(error("RLIMIT_AS", PBSE_SYSTEM));
  1358. }
  1359. /* UMU vmem patch sets RLIMIT_AS rather than RLIMIT_DATA and RLIMIT_STACK */
  1360. /*
  1361. reslim.rlim_cur = reslim.rlim_max = mem_limit;
  1362. if (setrlimit(RLIMIT_DATA,&reslim) < 0)
  1363. {
  1364. sprintf(log_buffer,"setrlimit() failed setting data for vmem_limit mod in %s\n",
  1365. id);
  1366. return(error("RLIMIT_DATA",PBSE_SYSTEM));
  1367. }
  1368. if (setrlimit(RLIMIT_STACK,&reslim) < 0)
  1369. {
  1370. sprintf(log_buffer,"setrlimit() failed setting stack for vmem_limit mod in %s\n",
  1371. id);
  1372. return(error("RLIMIT_STACK",PBSE_SYSTEM));
  1373. }
  1374. */
  1375. }
  1376. }
  1377. if (LOGLEVEL >= 5)
  1378. {
  1379. sprintf(log_buffer, "%s(%s,%s) completed",
  1380. __func__,
  1381. (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
  1382. (set_mode == SET_LIMIT_SET) ? "set" : "alter");
  1383. log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
  1384. log_buffer[0] = '\0';
  1385. }
  1386. /* SUCCESS */
  1387. return(PBSE_NONE);
  1388. } /* END mom_set_limits() */
  1389. /*
  1390. * State whether MOM main loop has to poll this job to determine if some
  1391. * limits are being exceeded.
  1392. *
  1393. * Sets flag TRUE if polling is necessary, FALSE otherwise. Actual
  1394. * polling is done using the mom_over_limit machine-dependent function.
  1395. */
  1396. int mom_do_poll(
  1397. job *pjob) /* I */
  1398. {
  1399. const char *pname;
  1400. resource *pres;
  1401. assert(pjob != NULL);
  1402. if (LOGLEVEL >= 4)
  1403. {
  1404. log_record(
  1405. PBSEVENT_JOB,
  1406. PBS_EVENTCLASS_JOB,
  1407. pjob->ji_qs.ji_jobid,
  1408. "evaluating limits for job");
  1409. }
  1410. assert(pjob != NULL);
  1411. assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
  1412. pres = (resource *)GET_NEXT(
  1413. pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
  1414. while (pres != NULL)
  1415. {
  1416. assert(pres->rs_defin != NULL);
  1417. pname = pres->rs_defin->rs_name;
  1418. assert(pname != NULL);
  1419. assert(*pname != '\0');
  1420. if (strcmp(pname, "walltime") == 0 ||
  1421. strcmp(pname, "cput") == 0 ||
  1422. strcmp(pname, "pcput") == 0 ||
  1423. strcmp(pname, "mem") == 0 ||
  1424. strcmp(pname, "pvmem") == 0 ||
  1425. strcmp(pname, "vmem") == 0)
  1426. {
  1427. return(TRUE);
  1428. }
  1429. pres = (resource *)GET_NEXT(pres->rs_link);
  1430. }
  1431. return(FALSE);
  1432. } /* END mom_do_poll() */
  1433. /*
  1434. * Setup for polling.
  1435. *
  1436. * Open kernel device and get namelist info.
  1437. */
  1438. int mom_open_poll(void)
  1439. {
  1440. if (LOGLEVEL >= 6)
  1441. {
  1442. log_record(PBSEVENT_SYSTEM, 0, __func__, "started");
  1443. }
  1444. pagesize = getpagesize();
  1445. proc_array = (proc_stat_t *)calloc(TBL_INC, sizeof(proc_stat_t));
  1446. if (proc_array == NULL)
  1447. {
  1448. log_err(errno, __func__, "calloc");
  1449. return(PBSE_SYSTEM);
  1450. }
  1451. max_proc = TBL_INC;
  1452. return(PBSE_NONE);
  1453. } /* END mom_open_poll() */
  1454. /*
  1455. * Declare start of polling loop.
  1456. *
  1457. * This function caches information about all of processes
  1458. * on the compute node (pbs_mom calls this function). Each process
  1459. * in /proc/ is queried by looking at the 'stat' file. Statistics like
  1460. * CPU usage time, memory consumption, etc. are gathered in the proc_array
  1461. * list. This list is then used throughout the pbs_mom to get information
  1462. * about tasks it is monitoring.
  1463. *
  1464. * This function is called from the main MOM loop once every "check_poll_interval"
  1465. * seconds.
  1466. *
  1467. * @see get_proc_stat() - child
  1468. * @see mom_set_use() - Aggregates data collected here
  1469. *
  1470. * NOTE: populates global 'proc_array[]' variable.
  1471. * NOTE: reallocs proc_array[] as needed to accomodate processes.
  1472. *
  1473. * @see mom_open_poll() - allocs proc_array table.
  1474. * @see mom_close_poll() - frees procs_array.
  1475. * @see setup_program_environment() - parent - called at pbs_mom start
  1476. * @see main_loop() - parent - called once per iteration
  1477. * @see mom_set_use() - populate job structure with usage data for local use or to send to mother superior
  1478. */
  1479. int mom_get_sample(void)
  1480. {
  1481. proc_stat_t *pi;
  1482. proc_stat_t *ps;
  1483. pid_t pid;
  1484. #ifdef PENABLE_LINUX26_CPUSETS
  1485. struct pidl *pids = NULL;
  1486. struct pidl *pp;
  1487. #else
  1488. struct dirent *dent;
  1489. #endif
  1490. if (proc_array == NULL)
  1491. mom_open_poll();
  1492. nproc = 0;
  1493. pi = proc_array;
  1494. if (LOGLEVEL >= 6)
  1495. {
  1496. log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "proc_array load started");
  1497. }
  1498. #ifdef PENABLE_LINUX26_CPUSETS
  1499. /* Instead of collect stats of all processes running on a large SMP system,
  1500. * collect stats of processes running in and below the Torque cpuset, only
  1501. * This relies on reliable process starters for MPI, which bind their tasks
  1502. * to the cpuset of the job. */
  1503. #ifdef USELIBCPUSET
  1504. pids = get_cpuset_pidlist(TTORQUECPUSET_BASE, pids);
  1505. #else
  1506. pids = get_cpuset_pidlist(TTORQUECPUSET_PATH, pids);
  1507. #endif
  1508. pp = pids;
  1509. while (pp != NULL)
  1510. {
  1511. pid = pp->pid;
  1512. pp = pp->next;
  1513. #else
  1514. if (pdir == NULL)
  1515. {
  1516. if ((pdir = opendir(procfs)) == NULL)
  1517. return(PBSE_SYSTEM);
  1518. }
  1519. rewinddir(pdir);
  1520. while ((dent = readdir(pdir)) != NULL)
  1521. {
  1522. if (!isdigit(dent->d_name[0]))
  1523. continue;
  1524. pid = atoi(dent->d_name);
  1525. #endif
  1526. if ((ps = get_proc_stat(pid)) == NULL)
  1527. {
  1528. if (errno != ENOENT)
  1529. {
  1530. sprintf(log_buffer, "%d: get_proc_stat", pid);
  1531. log_err(errno, __func__, log_buffer);
  1532. }
  1533. continue;
  1534. }
  1535. /* nproc++; -- we need to increment AFTER assigning this ps to
  1536. the proc_array--otherwise we could skip it in for loops */
  1537. if ((nproc + 1) >= max_proc)
  1538. {
  1539. proc_stat_t *hold;
  1540. if (LOGLEVEL >= 9)
  1541. {
  1542. log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "alloc more proc_array");
  1543. }
  1544. max_proc *= 2;
  1545. hold = (proc_stat_t *)calloc(1, max_proc * sizeof(proc_stat_t));
  1546. if (hold == NULL)
  1547. {
  1548. log_err(errno, __func__, "unable to realloc space for proc_array sample");
  1549. return(PBSE_SYSTEM);
  1550. }
  1551. memcpy(hold, proc_array, sizeof(proc_stat_t) * max_proc / 2);
  1552. free(proc_array);
  1553. proc_array = hold;
  1554. } /* END if ((nproc+1) == max_proc) */
  1555. pi = &proc_array[nproc++];
  1556. memcpy(pi, ps, sizeof(proc_stat_t));
  1557. } /* END while (...) != NULL) */
  1558. #ifdef PENABLE_LINUX26_CPUSETS
  1559. free_pidlist(pids);
  1560. #endif
  1561. if (LOGLEVEL >= 6)
  1562. {
  1563. sprintf(log_buffer, "proc_array loaded - nproc=%d",
  1564. nproc);
  1565. log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
  1566. }
  1567. return(PBSE_NONE);
  1568. } /* END mom_get_sample() */
  1569. /*
  1570. * Measure job resource usage and compare with its limits.
  1571. *
  1572. * If it has exceeded any well-formed polled limit return the limit that
  1573. * it exceeded.
  1574. * Otherwise, return PBSE_NONE. log_buffer is populated with failure.
  1575. */
  1576. int mom_over_limit(
  1577. job *pjob) /* I */
  1578. {
  1579. const char *pname;
  1580. int retval;
  1581. unsigned long value;
  1582. unsigned long num;
  1583. unsigned long long numll;
  1584. resource *pres;
  1585. assert(pjob != NULL);
  1586. assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
  1587. pres = (resource *)GET_NEXT(
  1588. pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
  1589. for (;pres != NULL;pres = (resource *)GET_NEXT(pres->rs_link))
  1590. {
  1591. assert(pres->rs_defin != NULL);
  1592. pname = pres->rs_defin->rs_name;
  1593. assert(pname != NULL);
  1594. assert(*pname != '\0');
  1595. if ((igncput == FALSE) && (strcmp(pname, "cput") == 0))
  1596. {
  1597. retval = mm_gettime(pres, &value);
  1598. if (retval != PBSE_NONE)
  1599. continue;
  1600. if ((num = cput_sum(pjob)) > value)
  1601. {
  1602. sprintf(log_buffer, "cput %lu exceeded limit %lu",
  1603. num,
  1604. value);
  1605. return(JOB_EXEC_OVERLIMIT_CPUT);
  1606. }
  1607. }
  1608. else if ((igncput == FALSE) && (strcmp(pname, "pcput") == 0))
  1609. {
  1610. retval = mm_gettime(pres, &value);
  1611. if (retval != PBSE_NONE)
  1612. continue;
  1613. if (overcpu_proc(pjob, value))
  1614. {
  1615. sprintf(log_buffer, "pcput exceeded limit %lu",
  1616. value);
  1617. return(JOB_EXEC_OVERLIMIT_CPUT);
  1618. }
  1619. }
  1620. else if (strcmp(pname, "vmem") == 0)
  1621. {
  1622. retval = mm_getsize(pres, &value);
  1623. if (retval != PBSE_NONE)
  1624. continue;
  1625. if ((ignvmem == 0) && ((numll = mem_sum(pjob)) > value))
  1626. {
  1627. sprintf(log_buffer, "vmem %llu exceeded limit %lu",
  1628. numll,
  1629. value);
  1630. return(JOB_EXEC_OVERLIMIT_MEM);
  1631. }
  1632. }
  1633. else if (strcmp(pname, "pvmem") == 0)
  1634. {
  1635. unsigned long long valuell;
  1636. retval = mm_getsize(pres, &value);
  1637. if (retval != PBSE_NONE)
  1638. continue;
  1639. valuell = (unsigned long long)value;
  1640. if ((ignvmem == 0) && (overmem_proc(pjob, valuell)))
  1641. {
  1642. sprintf(log_buffer, "pvmem exceeded limit %llu",
  1643. valuell);
  1644. return(JOB_EXEC_OVERLIMIT_MEM);
  1645. }
  1646. }
  1647. else if (ignwalltime == 0 && strcmp(pname, "walltime") == 0)
  1648. {
  1649. /* no need to check walltime on sisters, MS will get it */
  1650. if (am_i_mother_superior(*pjob) == false)
  1651. continue;
  1652. retval = mm_gettime(pres, &value);
  1653. if (retval != PBSE_NONE)
  1654. continue;
  1655. num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) *
  1656. wallfactor);
  1657. if (num > value)
  1658. {
  1659. sprintf(log_buffer, "walltime %ld exceeded limit %ld",
  1660. num,
  1661. value);
  1662. return(JOB_EXEC_OVERLIMIT_WT);
  1663. }
  1664. }
  1665. } /* END for (pres) */
  1666. #ifdef PENABLE_LINUX26_CPUSETS
  1667. /* Check memory_pressure */
  1668. if (memory_pressure_threshold > 0)
  1669. {
  1670. /*
  1671. * If last recorded memory_pressure is over threshold, increment counter.
  1672. * If duration is enabled, throw over_limit if counter reaches duration.
  1673. */
  1674. if (pjob->ji_mempressure_curr < memory_pressure_threshold)
  1675. {
  1676. pjob->ji_mempressure_cnt = 0; /* reset */
  1677. }
  1678. else
  1679. {
  1680. pjob->ji_mempressure_cnt++; /* count */
  1681. sprintf(log_buffer, "job %s memory_pressure is over %d for %d (%d) cycles",
  1682. pjob->ji_qs.ji_jobid,
  1683. memory_pressure_threshold,
  1684. pjob->ji_mempressure_cnt,
  1685. memory_pressure_duration);
  1686. log_ext(-1, __func__, log_buffer,LOG_ALERT);
  1687. if (memory_pressure_duration && (pjob->ji_mempressure_cnt >= memory_pressure_duration))
  1688. {
  1689. sprintf(log_buffer, "swap rate due to memory oversubscription is too high");
  1690. return(JOB_EXEC_OVERLIMIT_MEM);
  1691. }
  1692. }
  1693. }
  1694. #endif
  1695. return(PBSE_NONE);
  1696. } /* END mom_over_limit() */
  1697. /*
  1698. * job_expected_resc_found: logs an error if an expected resource was not found
  1699. */
  1700. int job_expected_resc_found(
  1701. const resource *pres,
  1702. const resource_def *rd,
  1703. const char *jobid)
  1704. {
  1705. if (!pres)
  1706. {
  1707. char log_buf[2048];
  1708. snprintf(log_buf, sizeof(log_buf), "job %s missing expected resource %s for resource usage calculation",
  1709. jobid, rd->rs_

Large files files are truncated, but you can click here to view the full file