/src/resmom/linux/mom_mach.c
C | 5345 lines | 3417 code | 1458 blank | 470 comment | 820 complexity | 8cf49e16f2eb3c479a19208755979b78 MD5 | raw file
Possible License(s): LGPL-2.1
Large files are truncated, but you can click here to view the full file
- #include "license_pbs.h" /* See here for the software license */
- #include <pbs_config.h> /* the master config generated by configure */
- #include "lib_mom.h" /* header */
- #include <assert.h>
- #include <limits.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <unistd.h>
- #include <dirent.h>
- #include <errno.h>
- #include <strings.h>
- #include <mntent.h>
- #include <asm/types.h>
- #include <time.h>
- #include <sys/quota.h>
- #include <sys/time.h>
- #include <sys/procfs.h>
- #include <sys/param.h>
- #include <sys/stat.h>
- #include <sys/vfs.h>
- #include <sys/sysmacros.h>
- #include <sys/resource.h>
- #include <signal.h>
- #include <syscall.h>
- #include <ctype.h>
- #include <string.h>
- #include <csv.h>
- #include <fcntl.h>
- /* needed for oom_adj */
- #include <linux/limits.h>
- #ifdef Q_6_5_QUOTAON
- /* remap dqblk for SUSE 9.0 */
- #define dqblk if_dqblk
- #endif /* Q_6_5_QUOTAON */
- /*
- #ifndef dqblk
- #include <linux/quotaio_v1.h>
- #define dqblk v1_disk_dqblk
- #endif
- */
- #include "pbs_error.h"
- #include "portability.h"
- #include "list_link.h"
- #include "server_limits.h"
- #include "attribute.h"
- #include "resource.h"
- #include "pbs_job.h"
- #include "log.h"
- #include "mom_mach.h"
- #include "mom_func.h"
- #include "resmon.h"
- #include "utils.h"
- #include "../rm_dep.h"
- #include "pbs_nodes.h"
- #ifdef PENABLE_LINUX26_CPUSETS
- #include "pbs_cpuset.h"
- #endif
- #include "mom_config.h"
- /*
- ** System dependent code to gather information for the resource
- ** monitor for a Linux i386 machine.
- **
- ** Resources known by this code:
- ** cput cpu time for a pid or session
- ** mem memory size for a pid or session in KB
- ** resi resident memory size for a pid or session in KB
- ** sessions list of sessions in the system
- ** pids list of pids in a session
- ** nsessions number of sessions in the system
- ** nusers number of users in the system
- ** totmem total memory size in KB
- ** availmem available memory size in KB
- ** ncpus number of cpus
- ** physmem physical memory size in KB
- ** size size of a file or filesystem
- ** idletime seconds of idle time
- ** walltime wall clock time for a pid
- ** loadave current load average
- ** quota quota information (sizes in kb)
- ** netload number of bytes transferred for all interfaces
- */
- #ifndef MAX_LINE
- #define MAX_LINE 1024
- #endif
- #ifndef TRUE
- #define FALSE 0
- #define TRUE 1
- #endif /* TRUE */
- static char procfs[] = "/proc";
- static DIR *pdir = NULL;
- static int pagesize;
- extern char *ret_string;
- extern time_t time_now;
- #define TBL_INC 200 /* initial proc table */
- #define PMEMBUF_SIZE 2048
- static proc_stat_t *proc_array = NULL;
- static int nproc = 0;
- static int max_proc = 0;
- /*
- ** external functions and data
- */
- extern tlist_head svr_alljobs;
- extern struct config *search(struct config *,char *);
- extern struct rm_attribute *momgetattr(char *);
- extern long system_ncpus;
- #ifdef NUMA_SUPPORT
- extern int num_node_boards;
- extern nodeboard node_boards[];
- extern int numa_index;
- #else
- extern char path_meminfo[MAX_LINE];
- #endif /* NUMA_SUPPORT */
- /*
- ** local functions and data
- */
- static const char *resi (struct rm_attribute *);
- static const char *totmem (struct rm_attribute *);
- static const char *availmem (struct rm_attribute *);
- static const char *physmem (struct rm_attribute *);
- static const char *ncpus (struct rm_attribute *);
- static const char *walltime (struct rm_attribute *);
- static const char *quota (struct rm_attribute *);
- static const char *netload (struct rm_attribute *);
- #ifdef NUMA_SUPPORT
- const char *cpuact (struct rm_attribute *);
- #endif
- #ifdef USELIBMEMACCT
- #ifdef __cplusplus
- extern "C"
- {
- #endif
- long long get_memacct_resi(pid_t pid);
- extern long get_weighted_memory_size(pid_t);
- #ifdef __cplusplus
- }
- #endif
- #endif
- #ifndef mbool_t
- #define mbool_t char
- #endif /* mbool_t */
- mbool_t ProcIsChild(char *,pid_t,char *);
- extern const char *loadave(struct rm_attribute *);
- extern const char *nullproc(struct rm_attribute *);
- time_t wait_time = 10;
#ifdef NUMA_SUPPORT
/*
 * Pair of running totals kept per CPU when NUMA support is compiled in.
 * NOTE(review): presumably accumulated idle/busy time used by cpuact();
 * the code that fills these in is not visible here - confirm.
 */
typedef struct proc_cpu
{
unsigned long long idle_total;
unsigned long long busy_total;
} proc_cpu_t;
/* array of proc_cpu_t entries; starts out unallocated */
static proc_cpu_t *cpu_array = NULL;
#endif
/*
** local resource array
**
** Maps each machine-dependent resource name to the function that reports
** it.  The table is terminated by an entry with a NULL name whose handler
** is nullproc.  When NUMA_SUPPORT is defined, "loadave" is served by
** cpuact() instead of loadave().
*/
struct config dependent_config[] =
{
{ "resi", {resi} },
{ "totmem", {totmem} },
{ "availmem", {availmem} },
{ "physmem", {physmem} },
{ "ncpus", {ncpus} },
#ifdef NUMA_SUPPORT
{ "loadave", {cpuact} },
#else
{ "loadave", {loadave} },
#endif
{ "walltime", {walltime} },
{ "quota", {quota} },
{ "netload", {netload} },
{ "size", {size} },
{ NULL, {nullproc} }  /* sentinel - end of table */
};
/* value read from the "btime" line of /proc/stat; 0 until proc_get_btime() succeeds */
unsigned linux_time = 0;

/*
 * proc_get_btime() - support routine for getting system time.
 *
 * Scans /proc/stat token by token for the "btime" label and stores the
 * unsigned value that follows it in the global linux_time.
 *
 * linux_time is left untouched when /proc/stat cannot be opened, when the
 * label is never found, or when the value after it cannot be parsed.
 */
void proc_get_btime(void)
{
  FILE *fp;
  char  label[256];

  if ((fp = fopen("/proc/stat", "r")) == NULL)
  {
    return;
  }

  while (!feof(fp))
  {
    /* bounded read: at most 255 characters plus the terminating NUL */
    if (fscanf(fp, "%255s", label) != 1)
    {
      break;
    }

    if (strcmp(label, "btime") == 0)
    {
      /* best effort: a parse failure simply leaves linux_time unchanged */
      if (fscanf(fp, "%u", &linux_time) != 1)
      {
      }

      break;
    }

    /* not the line we want - discard the remainder of it.  Every
     * conversion is suppressed, so anything but 0 means EOF/error. */
    if (fscanf(fp, "%*[^\n]%*c") != 0)
    {
      break;
    }
  }  /* END while (!feof(fp)) */

  fclose(fp);

  return;
}  /* END proc_get_btime() */
/*
 * sscanf() template for the part of /proc/<pid>/stat that follows the
 * closing ')' of the command name.  See 'man 5 proc' for the file format.
 *
 * A leading '*' in a conversion means the field is parsed but not stored.
 *
 * FORMAT: <PID> <COMM> <STATE> <PPID> <PGRP> <SESSION> [<TTY_NR>] [<TPGID>]
 *         <FLAGS> [<MINFLT>] [<CMINFLT>] [<MAJFLT>] [<CMAJFLT>] <UTIME>
 *         <STIME> <CUTIME> <CSTIME> [<PRIORITY>] [<NICE>] [<0>]
 *         [<ITREALVALUE>] <STARTTIME> <VSIZE> <RSS> [<RLIM>] [<STARTCODE>] ...
 */
static char stat_str[] =
  " %c %d %d %d %*d %*d %u %*u "
  "%*u %*u %*u %lu %lu %lu %lu %*ld %*ld %*u %*ld %lu %llu %lld %*lu %*lu "
  "%*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu";
/*
 * Convert jiffies to seconds.
 *
 * Expands to an expression dividing x by Hertz, so a variable named Hertz
 * must be in scope at the point of use (get_proc_stat() sets it from
 * sysconf(_SC_CLK_TCK)).
 *
 * The expansion is fully parenthesized and carries no trailing semicolon,
 * so the macro can be used safely inside larger expressions.
 */
#define JTOS(x) ((x) / Hertz)
- /*
- * Linux /proc status routine.
- *
- * Returns a pointer to a static proc_stat_t structure given
- * a process number, or NULL if there is an error. Takes the
- * place of the ioctl call PIOCSTATUS in the irix imp of mom_mach.c
- *
- */
- proc_stat_t *get_proc_stat(
- int pid) /* I */
- {
- static proc_stat_t ps;
- static char path[MAXLINE];
- static char readbuf[MAXLINE << 2];
- static char *lastbracket;
- FILE *fd;
- unsigned long jstarttime; /* number of jiffies since OS start time when process started */
- struct stat sb;
- static int Hertz = 0;
- int Hertz_errored = 0;
- if (Hertz <= 0)
- {
- Hertz = sysconf(_SC_CLK_TCK); /* returns 0 on error */
- if (Hertz <= 0)
- {
- /* FAILURE */
- if (!Hertz_errored)
- log_err(errno, "get_proc_stat", "sysconf(_SC_CLK_TCK) failed, unable to monitor processes");
- Hertz_errored = 1;
- return(NULL);
- }
- }
- Hertz_errored = 0;
- sprintf(path, "/proc/%d/stat",
- pid);
- if ((fd = fopen(path, "r")) == NULL)
- {
- /* FAILURE */
- return(NULL);
- }
- /* use 'man 5 proc' for /proc/pid/stat format */
- if (!fgets(readbuf, sizeof(readbuf), fd))
- {
- fclose(fd);
- return(NULL);
- }
- lastbracket = strrchr(readbuf, ')');
- if (lastbracket == NULL)
- {
- fclose(fd);
- return(NULL);
- }
- *lastbracket = '\0'; /* We basically split the string here, overwriting the ')'. */
- lastbracket++;
- if (sscanf(readbuf,"%d (%[^\n]",&ps.pid,path) != 2)
- {
- /* FAILURE */
- fclose(fd);
- return(NULL);
- }
- /* see stat_str[] value for mapping 'stat' format */
- if (sscanf(lastbracket,stat_str,
- &ps.state, /* state (one of RSDZTW) */
- &ps.ppid, /* ppid */
- &ps.pgrp, /* pgrp */
- &ps.session, /* session id */
- &ps.flags, /* flags - kernel flags of the process, see the PF_* in <linux/sched.h> */
- &ps.utime, /* utime - jiffies that this process has been scheduled in user mode */
- &ps.stime, /* stime - jiffies that this process has been scheduled in kernel mode */
- &ps.cutime, /* cutime - jiffies that this processâs waited-for children have been scheduled in user mode */
- &ps.cstime, /* cstime - jiffies that this processâs waited-for children have been scheduled in kernel mode */
- &jstarttime, /* starttime */
- &ps.vsize, /* vsize */
- &ps.rss) != 12) /* rss */
- {
- /* FAILURE */
- fclose(fd);
- return(NULL);
- }
- if (fstat(fileno(fd), &sb) == -1)
- {
- /* FAILURE */
- fclose(fd);
- return(NULL);
- }
- ps.uid = sb.st_uid;
- ps.start_time = linux_time + JTOS(jstarttime);
- ps.name = path;
- ps.utime = JTOS(ps.utime);
- ps.stime = JTOS(ps.stime);
- ps.cutime = JTOS(ps.cutime);
- ps.cstime = JTOS(ps.cstime);
- /* SUCCESS */
- fclose(fd);
- return(&ps);
- } /* END get_proc_stat() */
#ifdef USELIBMEMACCT

/*
 * Ask memacctd (via get_weighted_memory_size()) for the weighted RSS of
 * the process with the given pid.
 *
 * Returns whatever get_weighted_memory_size() returned: the value in bytes
 * on success, or -1 on failure, in which case the failure is also logged.
 */
long long get_memacct_resi(pid_t pid)
{
  long long weighted = get_weighted_memory_size(pid);

  if (weighted == -1)
  {
    sprintf(log_buffer, "get_weighted_memory_size(%d) failed", pid);

    log_err(errno, __func__, log_buffer);
  }

  return(weighted);
}  /* END get_memacct_resi() */

#endif
- /*
- * get_proc_mem_from_path()
- * @returns a pointer to a struct containing the memory information
- * @pre-cond: path must point to a valid path of a meminfo system file
- */
- proc_mem_t *get_proc_mem_from_path(
- const char *path)
- {
- proc_mem_t *mm;
- FILE *fp;
- char str[32];
- long long bfsz = -1;
- long long casz = -1;
- long long fcasz = -1;
- if ((fp = fopen(path,"r")) == NULL)
- {
- return(NULL);
- }
- mm = (proc_mem_t *)calloc(1, sizeof(proc_mem_t));
- if (fscanf(fp,"%30s",str) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- if (!strncmp(str,"total:",sizeof(str)))
- {
- /* old format */
- if (fscanf(fp,"%*[^\n]%*c") != 0) /* remove text header */
- {
- fclose(fp);
- return(NULL);
- }
- /* umu vmem patch */
- if (fscanf(fp, "%*s %llu %llu %llu %*u %lld %lld",
- &mm->mem_total,
- &mm->mem_used,
- &mm->mem_free,
- &bfsz,
- &casz) != 5)
- {
- fclose(fp);
- return(NULL);
- }
- mm->mem_free += casz + bfsz;
- if (fscanf(fp, "%*s %llu %llu %llu %*[^\n]%*c",
- &mm->swap_total,
- &mm->swap_used,
- &mm->swap_free) != 3)
- {
- fclose(fp);
- return(NULL);
- }
- }
- else
- {
- do
- {
- /* new format (kernel > 2.4) the first 'str' has been read */
- if (!strncmp(str, "MemTotal:", sizeof(str)))
- {
- if (fscanf(fp, "%llu",
- &mm->mem_total) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- mm->mem_total *= 1024; /* the unit is kB */
- }
- else if (!strncmp(str, "MemFree:", sizeof(str)))
- {
- if (fscanf(fp, "%llu",
- &mm->mem_free) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- mm->mem_free *= 1024;
- }
- else if (!strncmp(str, "Buffers:", sizeof(str)))
- {
- if (fscanf(fp, "%lld",
- &bfsz) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- bfsz *= 1024;
- }
- else if (!strncmp(str, "Cached:", sizeof(str)))
- {
- if (fscanf(fp, "%lld",
- &casz) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- casz *= 1024;
- }
- else if (!strncmp(str, "FilePages:", sizeof(str)))
- {
- if (fscanf(fp, "%lld",
- &fcasz) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- fcasz *= 1024;
- }
- else if (!strncmp(str, "SwapTotal:", sizeof(str)))
- {
- if (fscanf(fp, "%llu",
- &mm->swap_total) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- mm->swap_total *= 1024;
- }
- else if (!strncmp(str, "SwapFree:", sizeof(str)))
- {
- if (fscanf(fp, "%llu",
- &mm->swap_free) != 1)
- {
- fclose(fp);
- return(NULL);
- }
- mm->swap_free *= 1024;
- }
- }
- while (fscanf(fp, "%30s", str) == 1);
- } /* END else */
- fclose(fp);
- if (bfsz >= 0 || casz >= 0)
- {
- if (bfsz > 0)
- mm->mem_free += bfsz;
- if (casz > 0)
- mm->mem_free += casz;
- }
- else if (fcasz > 0)
- {
- mm->mem_free += fcasz;
- }
- return(mm);
- } /* END get_proc_mem_from_path() */
- proc_mem_t *get_proc_mem(void)
- {
- static proc_mem_t ret_mm;
- #ifdef NUMA_SUPPORT
- int i;
- #else
- proc_mem_t *mem;
- #endif
- #ifdef NUMA_SUPPORT
- ret_mm.mem_total = 0;
- ret_mm.mem_used = 0;
- ret_mm.mem_free = 0;
- ret_mm.swap_total = 0;
- ret_mm.swap_used = 0;
- ret_mm.swap_free = 0;
- for (i = 0; i < node_boards[numa_index].num_nodes; i++)
- {
- proc_mem_t *node_mem = get_proc_mem_from_path(node_boards[numa_index].path_meminfo[i]);
- if (node_mem == NULL)
- return(NULL);
- ret_mm.mem_total += node_mem->mem_total;
- ret_mm.mem_used += node_mem->mem_used;
- ret_mm.mem_free += node_mem->mem_free;
- ret_mm.swap_total += node_mem->swap_total;
- ret_mm.swap_used += node_mem->swap_used;
- ret_mm.swap_free += node_mem->swap_free;
- free(node_mem);
- }
- #else
- mem = get_proc_mem_from_path(path_meminfo);
-
- if(mem == NULL)
- return (NULL);
- ret_mm.mem_total = mem->mem_total;
- ret_mm.mem_used = mem->mem_used;
- ret_mm.mem_free = mem->mem_free;
- ret_mm.swap_total = mem->swap_total;
- ret_mm.swap_used = mem->swap_used;
- ret_mm.swap_free = mem->swap_free;
- free(mem);
- #endif
- return(&ret_mm);
- } /* END get_proc_mem() */
#ifdef PNOT

/*
 * Alternate get_proc_mem(), compiled only when PNOT is defined.
 *
 * Skips the header line of path_meminfo, reads total/used/free from each
 * of the next two lines and returns their sums in a static proc_mem_t.
 * Returns NULL only when the file cannot be opened; the fscanf() results
 * are not checked.
 */
proc_mem_t *get_proc_mem(void)
{
  static proc_mem_t mm;

  unsigned long  m_tot, m_use, m_free;  /* first data line */
  unsigned long  s_tot, s_use, s_free;  /* second data line */
  FILE          *fp = fopen(path_meminfo, "r");

  if (fp == NULL)
  {
    return(NULL);
  }

  fscanf(fp, "%*[^\n]%*c");  /* remove text header */

  fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c", &m_tot, &m_use, &m_free);

  fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c", &s_tot, &s_use, &s_free);

  mm.total = m_tot + s_tot;
  mm.used  = m_use + s_use;
  mm.free  = m_free + s_free;

  fclose(fp);

  return(&mm);
}  /* END get_proc_mem() */

#endif /* PNOT */
/*
 * sets oom_adj score for current process
 * requires root privileges or CAP_SYS_RESOURCE to succeed
 *
 * @param score  value to write to /proc/<pid>/oom_adj; must be in the
 *               range -17 to 15
 * @return -1 if score is out of range, the path or value cannot be
 *         formatted, or the file cannot be opened; otherwise the result
 *         of write() (number of bytes written, or -1 on write failure)
 */
static int oom_adj(int score)
{
  char oom_adj_path[PATH_MAX] = "";
  char adj_value[128] = "";
  int  len;
  int  fd;
  int  rc;

  /* valid values are -17 to 15 */

  if ((score > 15) || (score < -17))
    return(-1);

  len = snprintf(oom_adj_path, sizeof(oom_adj_path), "/proc/%d/oom_adj", (int)getpid());

  if ((len < 0) || ((size_t)len >= sizeof(oom_adj_path)))
    return(-1);

  /* format the value BEFORE opening the file so that no failure path can
   * return with the descriptor still open */

  len = snprintf(adj_value, sizeof(adj_value), "%d", score);

  if ((len < 0) || ((size_t)len >= sizeof(adj_value)))
    return(-1);

  if ((fd = open(oom_adj_path, O_RDWR)) == -1)
    return(-1);

  rc = (int)write(fd, adj_value, (size_t)len);

  close(fd);

  return(rc);
}  /* END oom_adj() */
- void dep_initialize(void)
- {
- pagesize = getpagesize();
- if ((pdir = opendir(procfs)) == NULL)
- {
- log_err(errno, __func__, "opendir");
-
- return;
- }
- /* NOTE: /proc/<pid>/oom_adj tunable is linux specific */
- /* LKF: make pbs_mom processes immune to oom killer's killing frenzy if requested*/
- if (mom_oom_immunize != 0)
- {
-
- if (oom_adj(-17) < 0)
- {
- log_record(
- PBSEVENT_SYSTEM,
- PBS_EVENTCLASS_SERVER,
- __func__,
- "failed to make pbs_mom oom-killer immune");
- }
- else
- {
- log_record(
- PBSEVENT_SYSTEM,
- PBS_EVENTCLASS_SERVER,
- __func__,
- "mom is now oom-killer safe");
- }
- }
- proc_get_btime();
- return;
- } /* END dep_initialize() */
- void dep_cleanup(void)
- {
- log_record(PBSEVENT_SYSTEM, 0, __func__, "dependent cleanup");
- if (pdir)
- {
- closedir(pdir);
- pdir = NULL;
- }
- return;
- }
/*
 * Called once per cycle of the main loop.
 *
 * There is no periodic machine-dependent work, so this does nothing.
 */
void dep_main_loop_cycle(void)
{
  return;
}
- /*
- * Internal size decoding routine.
- *
- * Accepts a resource pointer and a pointer to the unsigned long integer
- * to receive the decoded value. It returns a PBS error code, and the
- * decoded value in the unsigned long integer.
- *
- * sizeof(word) = sizeof(int)
- */
- static int mm_getsize(
- resource *pres, /* I */
- unsigned long *ret) /* O */
- {
- unsigned long value;
- if (pres->rs_value.at_type != ATR_TYPE_SIZE)
- {
- return(PBSE_ATTRTYPE);
- }
- value = pres->rs_value.at_val.at_size.atsv_num;
- if (pres->rs_value.at_val.at_size.atsv_units == ATR_SV_WORDSZ)
- {
- if (value > ULONG_MAX / sizeof(int))
- {
- return(PBSE_BADATVAL);
- }
- value *= sizeof(int);
- }
- if (value > (ULONG_MAX >> pres->rs_value.at_val.at_size.atsv_shift))
- {
- return(PBSE_BADATVAL);
- }
- *ret = (value << pres->rs_value.at_val.at_size.atsv_shift);
- return(PBSE_NONE);
- } /* END mm_getsize() */
- /*
- * Internal time decoding routine.
- *
- * Accepts a resource pointer and a pointer to the unsigned long integer
- * to receive the decoded value. It returns a PBS error code, and the
- * decoded value of time in seconds in the unsigned long integer.
- */
- static int mm_gettime(
- resource *pres,
- unsigned long *ret)
- {
- if (pres->rs_value.at_type != ATR_TYPE_LONG)
- {
- return(PBSE_ATTRTYPE);
- }
- if (pres->rs_value.at_val.at_long < 0)
- {
- return(PBSE_BADATVAL);
- }
- *ret = pres->rs_value.at_val.at_long;
- return(PBSE_NONE);
- }
- static int injob(
- job *pjob,
- pid_t sid)
- {
- task *ptask;
- pid_t pid;
- #ifdef PENABLE_LINUX26_CPUSETS
- struct pidl *pids = NULL;
- struct pidl *pp;
- #else
- proc_stat_t *ps;
- #endif /* PENABLE_LINUX26_CPUSETS */
- for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
- ptask != NULL;
- ptask = (task *)GET_NEXT(ptask->ti_jobtask))
- {
- if (ptask->ti_qs.ti_sid <= 1)
- continue;
- if (ptask->ti_qs.ti_sid == sid)
- {
- return(TRUE);
- }
- }
- /* processes with a different sessionid are not necessarily not part of the
- job: the job can call setsid; need to check whether one of the parent
- processes has a sessionid that is in the job */
- #ifdef PENABLE_LINUX26_CPUSETS
- /* check whether the sid is in the job's cpuset */
- pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids);
- pp = pids;
- while (pp != NULL)
- {
- pid = pp->pid;
- pp = pp->next;
- if (pid == sid)
- {
- free_pidlist(pids);
- return(TRUE);
- }
- }
- free_pidlist(pids);
- #else
- /* get the parent process id of the sid and check whether it is part of
- the job; iterate */
- pid = sid;
- while (pid > 1)
- {
- if ((ps = get_proc_stat(pid)) == NULL)
- {
- if (errno != ENOENT)
- {
- sprintf(log_buffer, "%d: get_proc_stat", pid);
- log_err(errno, __func__, log_buffer);
- }
- return(FALSE);
- }
- pid = getsid(ps->ppid);
- for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
- ptask != NULL;
- ptask = (task *)GET_NEXT(ptask->ti_jobtask))
- {
- if (ptask->ti_qs.ti_sid <= 1)
- continue;
- if (ptask->ti_qs.ti_sid == pid)
- {
- return(TRUE);
- }
- }
- }
- #endif /* PENABLE_LINUX26_CPUSETS */
- return(FALSE);
- } /* END injob() */
- /*
- * Internal session CPU time decoding routine.
- *
- * Accepts a job pointer. Returns the sum of all cpu time
- * consumed for all tasks executed by the job, in seconds,
- * adjusted by cputfactor.
- */
- static unsigned long cput_sum(
- job *pjob) /* I */
- {
- ulong cputime;
- int nps = 0;
- int i;
- proc_stat_t *ps;
- cputime = 0;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "proc_array loop start - jobid = %s",
- pjob->ji_qs.ji_jobid);
- log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
- }
- for (i = 0;i < nproc;i++)
- {
- ps = &proc_array[i];
- if ((LOGLEVEL >= 6) && (ps == NULL))
- {
- sprintf(log_buffer, "proc_array loop end - nproc=%d, i=%d, ps is null",
- nproc,
- i);
- log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
- }
- if (!injob(pjob, ps->session))
- continue;
- nps++;
- cputime += (ps->utime + ps->stime + ps->cutime + ps->cstime);
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "%s: session=%d pid=%d cputime=%lu (cputfactor=%f)",
- __func__,
- ps->session,
- ps->pid,
- cputime,
- cputfactor);
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- }
- } /* END for (i) */
- if (nps == 0)
- pjob->ji_flags |= MOM_NO_PROC;
- else
- pjob->ji_flags &= ~MOM_NO_PROC;
- return((unsigned long)((double)cputime * cputfactor));
- } /* END cput_sum() */
- /*
- * Return TRUE if any process in the job is over limit for cputime usage.
- */
- static int overcpu_proc(
- job *pjob,
- unsigned long limit) /* I */
- {
- ulong cputime;
- pid_t pid;
- proc_stat_t *ps;
- #ifdef PENABLE_LINUX26_CPUSETS
- struct pidl *pids = NULL;
- struct pidl *pp;
- #else
- struct dirent *dent;
- #endif /* PENABLE_LINUX26_CPUSETS */
- #ifdef PENABLE_LINUX26_CPUSETS
- /* Instead of collect stats of all processes running on a large SMP system,
- * collect stats of processes running in and below the cpuset of the job, only. */
- pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids);
- pp = pids;
- while (pp != NULL)
- {
- pid = pp->pid;
- pp = pp->next;
- #else
- rewinddir(pdir);
- while ((dent = readdir(pdir)) != NULL)
- {
- if (!isdigit(dent->d_name[0]))
- continue;
- pid = atoi(dent->d_name);
- #endif /* PENABLE_LINUX26_CPUSETS */
- if ((ps = get_proc_stat(pid)) == NULL)
- {
- if (errno != ENOENT)
- {
- sprintf(log_buffer, "%d: get_proc_stat", pid);
- log_err(errno, __func__, log_buffer);
- }
- continue;
- }
- #ifndef PENABLE_LINUX26_CPUSETS
- /* if it was in the cpuset, its part of the job, no need to check */
- if (!injob(pjob, ps->session))
- continue;
- #endif /* PENABLE_LINUX26_CPUSETS */
- /* change from ps->cutime to ps->utime, and ps->cstime to ps->stime */
- cputime = (ulong)((double)(ps->utime + ps->stime) * cputfactor);
- if (cputime > limit)
- {
- #ifdef PENABLE_LINUX26_CPUSETS
- free_pidlist(pids);
- #endif
- return(TRUE);
- }
- }
- #ifdef PENABLE_LINUX26_CPUSETS
- free_pidlist(pids);
- #endif
- return(FALSE);
- } /* END overcpu_proc() */
- /*
- * Internal session virtual memory usage function.
- *
- * Returns the total number of bytes of address
- * space consumed by all current processes within the job.
- */
- static unsigned long long mem_sum(
- job *pjob)
- {
- int i;
- unsigned long long segadd;
- proc_stat_t *ps;
- segadd = 0;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "proc_array loop start - jobid = %s",
- pjob->ji_qs.ji_jobid);
- log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
- }
- for (i = 0;i < nproc;i++)
- {
- ps = &proc_array[i];
- if (!injob(pjob, ps->session))
- continue;
- segadd += ps->vsize;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "%s: session=%d pid=%d vsize=%llu sum=%llu",
- __func__,
- ps->session,
- ps->pid,
- ps->vsize,
- segadd);
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- }
- } /* END for (i) */
- return(segadd);
- } /* END mem_sum() */
- /*
- * Internal session memory usage function.
- *
- * Returns the total number of bytes of resident memory
- * consumed by all current processes within the job.
- */
- static unsigned long long resi_sum(
- job *pjob)
- {
- int i;
- unsigned long long resisize;
- proc_stat_t *ps;
- #ifdef USELIBMEMACCT
- long long w_rss;
- #endif
- resisize = 0;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "proc_array loop start - jobid = %s",
- pjob->ji_qs.ji_jobid);
- log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
- }
- for (i = 0;i < nproc;i++)
- {
- ps = &proc_array[i];
- if (!injob(pjob, ps->session))
- continue;
- #ifdef USELIBMEMACCT
- /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */
- w_rss = get_memacct_resi(ps->pid);
- if (w_rss == -1)
- resisize += ps->rss * pagesize;
- else
- resisize += w_rss;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu w_rss=%ld sum=%llu",
- __func__,
- ps->session,
- ps->pid,
- ps->rss * pagesize,
- w_rss,
- resisize);
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- }
- #else
- resisize += ps->rss * pagesize;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu sum=%llu",
- __func__,
- ps->session,
- ps->pid,
- ps->rss * pagesize,
- resisize);
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- }
- #endif
- } /* END for (i) */
- return(resisize);
- } /* END resi_sum() */
- /*
- * Return TRUE if any process in the job is over limit for virtual memory usage.
- */
- static int overmem_proc(
- job *pjob, /* I */
- unsigned long long limit) /* I */
- {
- int i;
- proc_stat_t *ps;
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "proc_array loop start - jobid = %s",
- pjob->ji_qs.ji_jobid);
- log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
- }
- for (i = 0;i < nproc;i++)
- {
- ps = &proc_array[i];
- if (!injob(pjob, ps->session))
- continue;
- if (ps->vsize > limit)
- {
- return(TRUE);
- }
- } /* END for (i) */
- return(FALSE);
- } /* END overmem_proc() */
- extern char *msg_momsetlim;
- /*
- * Internal error routine
- */
- int error(
- const char *string,
- int value)
- {
- char *message;
- assert(string != NULL);
- assert(*string != '\0');
- message = pbse_to_txt(value);
- assert(message != NULL);
- assert(*message != '\0');
- fprintf(stderr, msg_momsetlim, string, message);
- fflush(stderr);
- return(value);
- } /* END error() */
- /*
- * Establish system-enforced limits for the job.
- *
- * Run through the resource list, checking the values for all items
- * we recognize.
- *
- * If set_mode is SET_LIMIT_SET, then also set hard limits for the
- * system enforced limits (not-polled).
- * If anything goes wrong with the process, return a PBS error code
- * and print a message on standard error. A zero-length resource list
- * is not an error.
- *
- * If set_mode is SET_LIMIT_SET the entry conditions are:
- * 1. MOM has already forked, and we are called from the child.
- * 2. The child is still running as root.
- * 3. Standard error is open to the user's file.
- *
- * If set_mode is SET_LIMIT_ALTER, we are being called to modify
- * existing limits. Cannot alter those set by setrlimit (kernel)
- * because we are the wrong process.
- */
- int mom_set_limits(
- job *pjob, /* I */
- int set_mode) /* SET_LIMIT_SET or SET_LIMIT_ALTER */
- {
- const char *pname = NULL;
- int retval;
- unsigned long value; /* place in which to build resource value */
- resource *pres;
- struct rlimit reslim;
- unsigned long vmem_limit = 0;
- unsigned long mem_limit = 0;
- /* NOTE: log_buffer is exported */
- if (LOGLEVEL >= 2)
- {
- sprintf(log_buffer, "%s(%s,%s) entered",
- __func__,
- (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
- (set_mode == SET_LIMIT_SET) ? "set" : "alter");
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- log_buffer[0] = '\0';
- }
- assert(pjob != NULL);
- assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
- pres = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
- /*
- * cycle through all the resource specifications,
- * setting limits appropriately.
- */
- memset(&reslim, 0, sizeof(reslim));
- /* set oom_adj score for the starting job */
- /* if immunize mode is set to on, we have to set child score to 0 */
- if ( (set_mode == SET_LIMIT_SET) && ( job_oom_score_adjust != 0 || mom_oom_immunize != 0 ) )
- {
- retval = oom_adj(job_oom_score_adjust);
- if ( LOGLEVEL >= 2 )
- {
- sprintf(log_buffer, "setting oom_adj '%s'",
- (retval != -1) ? "succeeded" : "failed");
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- }
- };
- while (pres != NULL)
- {
- if (pres->rs_defin != NULL)
- pname = pres->rs_defin->rs_name;
- else
- pname = NULL;
- if (LOGLEVEL >= 2)
- {
- sprintf(log_buffer, "setting limit for attribute '%s'",
- (pname != NULL) ? pname : "NULL");
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- log_buffer[0] = '\0';
- }
- assert(pres->rs_defin != NULL);
- assert(pname != NULL);
- assert(pname[0] != '\0');
- if (!strcmp(pname, "cput"))
- {
- if (igncput == FALSE)
- {
- /* cpu time - check, if less than pcput use it */
- retval = mm_gettime(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "cput mm_gettime failed in %s", __func__);
- return(error(pname, retval));
- }
- }
- }
- else if (!strcmp(pname, "pcput"))
- {
- if (igncput == FALSE)
- {
- if (set_mode == SET_LIMIT_SET)
- {
- /* process cpu time - set */
- retval = mm_gettime(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "pcput mm_gettime failed in %s", __func__);
- return(error(pname, retval));
- }
- reslim.rlim_cur = reslim.rlim_max =
- (unsigned long)((double)value / cputfactor);
- if (LOGLEVEL >= 2)
- {
- sprintf(log_buffer, "setting cpu time limit to %ld for job %s",
- (long int)reslim.rlim_cur,
- pjob->ji_qs.ji_jobid);
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- log_buffer[0] = '\0';
- }
- /* NOTE: some versions of linux have a bug which causes the parent
- process to receive a SIGKILL if the child's cpu limit is exceeded */
- if (setrlimit(RLIMIT_CPU, &reslim) < 0)
- {
- sprintf(log_buffer, "setrlimit for RLIMIT_CPU failed in %s, errno=%d (%s)",
- __func__,
- errno, strerror(errno));
- return(error("RLIMIT_CPU", PBSE_SYSTEM));
- }
- } /* END if (set_mode == SET_LIMIT_SET) */
- }
- }
- else if (!strcmp(pname, "file"))
- {
- /* set */
- if (set_mode == SET_LIMIT_SET)
- {
- retval = mm_getsize(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "mm_getsize() failed for file in %s",
- __func__);
- return(error(pname, retval));
- }
- if (value > ULONG_MAX)
- {
- if (LOGLEVEL >= 0)
- {
- sprintf(log_buffer, "cannot set file limit to %ld for job %s (value too large)",
- (long int)reslim.rlim_cur,
- pjob->ji_qs.ji_jobid);
- log_err(-1, __func__, log_buffer);
- log_buffer[0] = '\0';
- }
- return(error(pname, PBSE_BADATVAL));
- }
- reslim.rlim_cur = reslim.rlim_max = value;
- if (setrlimit(RLIMIT_FSIZE, &reslim) < 0)
- {
- sprintf(log_buffer, "cannot set file limit to %ld for job %s (setrlimit failed - check default user limits)",
- (long int)reslim.rlim_max,
- pjob->ji_qs.ji_jobid);
- log_err(errno, __func__, log_buffer);
- log_buffer[0] = '\0';
- return(error(pname, PBSE_SYSTEM));
- }
- }
- }
- else if (!strcmp(pname, "vmem"))
- {
- if (ignvmem == FALSE)
- {
- /* check */
- retval = mm_getsize(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "mm_getsize() failed for vmem in %s", __func__);
- return(error(pname, retval));
- }
- if ((vmem_limit == 0) || (value < vmem_limit))
- vmem_limit = value;
- }
- }
- else if (!strcmp(pname, "pvmem"))
- {
- if (ignvmem == FALSE)
- {
- /* set */
- if (set_mode == SET_LIMIT_SET)
- {
- retval = mm_getsize(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "mm_getsize() failed for pvmem in %s",
- __func__);
- return(error(pname, retval));
- }
- if (value > ULONG_MAX)
- {
- log_buffer[0] = '\0';
- sprintf(log_buffer, "invalid value returned by mm_getsize() for pvmem in %s",
- __func__);
- return(error(pname, PBSE_BADATVAL));
- }
- if ((vmem_limit == 0) || (value < vmem_limit))
- vmem_limit = value;
- }
- }
- }
- else if ((!strcmp(pname,"mem") && (pjob->ji_numnodes != 1)) ||
- !strcmp(pname,"mppmem"))
- {
- /* ignore. If we ever get rid of support for the UNICOS OS then we can
- remove the ATR_DFLAG_MOM | ATR_DFLAG_ALTRUN flags from mppmem */
- }
- else if ((!strcmp(pname, "mem") && (pjob->ji_numnodes == 1)) ||
- !strcmp(pname, "pmem"))
- {
- if (ignmem == FALSE)
- {
- /* set */
- if (set_mode == SET_LIMIT_SET)
- {
- retval = mm_getsize(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "mm_getsize() failed for mem/pmem in %s",
- __func__);
- return(error(pname, retval));
- }
- reslim.rlim_cur = reslim.rlim_max = value;
- if (setrlimit(RLIMIT_DATA, &reslim) < 0)
- {
- sprintf(log_buffer, "cannot set data limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
- (long int)reslim.rlim_max,
- pjob->ji_qs.ji_jobid,
- errno,
- strerror(errno));
- return(error("RLIMIT_DATA", PBSE_SYSTEM));
- }
- if (setrlimit(RLIMIT_RSS, &reslim) < 0)
- {
- sprintf(log_buffer, "cannot set RSS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
- (long int)reslim.rlim_max,
- pjob->ji_qs.ji_jobid,
- errno,
- strerror(errno));
- return(error("RLIMIT_RSS", PBSE_SYSTEM));
- }
- #ifdef __GATECH
- /* NOTE: best patch may be to change to 'vmem_limit = value;' */
- if (setrlimit(RLIMIT_STACK, &reslim) < 0)
- {
- sprintf(log_buffer, "cannot set stack limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
- (long int)reslim.rlim_max,
- pjob->ji_qs.ji_jobid,
- errno,
- strerror(errno));
- return(error("RLIMIT_STACK", PBSE_SYSTEM));
- }
- /* set address space */
- if (setrlimit(RLIMIT_AS, &reslim) < 0)
- {
- sprintf(log_buffer, "cannot set AS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
- (long int)reslim.rlim_max,
- pjob->ji_qs.ji_jobid,
- errno,
- strerror(errno));
- return(error("RLIMIT_AS", PBSE_SYSTEM));
- }
- #endif /* __GATECH */
- mem_limit = value;
- if (getrlimit(RLIMIT_STACK, &reslim) >= 0)
- {
- /* NOTE: mem_limit no longer used with UMU patch in place */
- mem_limit = value + reslim.rlim_cur;
- }
- }
- }
- } /* END else if (!strcmp(pname,"mem") && ... */
- else if (!strcmp(pname, "walltime"))
- {
- /* check */
- retval = mm_gettime(pres, &value);
- if (retval != PBSE_NONE)
- {
- sprintf(log_buffer, "mm_gettime() failed for walltime in %s\n",
- __func__);
- return(error(pname, retval));
- }
- }
- else if (!strcmp(pname, "nice"))
- {
- /* set nice */
- if (set_mode == SET_LIMIT_SET)
- {
- errno = 0;
- if ((nice((int)pres->rs_value.at_val.at_long) == -1) && (errno != 0))
- {
- sprintf(log_buffer, "nice() failed w/errno=%d (%s) in %s\n",
- errno,
- strerror(errno),
- __func__);
- return(error(pname, PBSE_BADATVAL));
- }
- }
- }
- else if (!strcmp(pname, "size"))
- {
- /* ignore */
- /* NO-OP */
- }
- else if (!strcmp(pname, "prologue"))
- {
- }
- else if (!strcmp(pname, "epilogue"))
- {
- }
- else if ((!strcmp(pname, "mppdepth")) ||
- (!strcmp(pname, "mppnodect")) ||
- (!strcmp(pname, "mppwidth")) ||
- (!strcmp(pname, "mppnppn")) ||
- (!strcmp(pname, "mppnodes")) ||
- (!strcmp(pname, "mpplabels")) ||
- (!strcmp(pname, "mpparch")) ||
- (!strcmp(pname, "mpplabel")))
- {
- /* NO-OP */
- }
- else if ((pres->rs_defin->rs_flags & ATR_DFLAG_RMOMIG) == 0)
- {
- /* don't recognize and not marked as ignore by mom */
- sprintf(log_buffer, "do not know how to process resource '%s' in %s\n",
- pname,
- __func__);
- return(error(pname, PBSE_UNKRESC));
- }
- pres = (resource *)GET_NEXT(pres->rs_link);
- }
- if (set_mode == SET_LIMIT_SET)
- {
- /* if either of vmem or pvmem was given, set sys limit to lesser */
- if (vmem_limit != 0)
- {
- /* Don't make (p)vmem < pmem */
- if (mem_limit > vmem_limit)
- {
- vmem_limit = mem_limit;
- }
- reslim.rlim_cur = reslim.rlim_max = vmem_limit;
- if ((ignvmem == 0) && (setrlimit(RLIMIT_AS, &reslim) < 0))
- {
- sprintf(log_buffer, "setrlimit() failed setting AS for vmem_limit mod in %s\n",
- __func__);
- return(error("RLIMIT_AS", PBSE_SYSTEM));
- }
- /* UMU vmem patch sets RLIMIT_AS rather than RLIMIT_DATA and RLIMIT_STACK */
- /*
- reslim.rlim_cur = reslim.rlim_max = mem_limit;
- if (setrlimit(RLIMIT_DATA,&reslim) < 0)
- {
- sprintf(log_buffer,"setrlimit() failed setting data for vmem_limit mod in %s\n",
- id);
- return(error("RLIMIT_DATA",PBSE_SYSTEM));
- }
- if (setrlimit(RLIMIT_STACK,&reslim) < 0)
- {
- sprintf(log_buffer,"setrlimit() failed setting stack for vmem_limit mod in %s\n",
- id);
- return(error("RLIMIT_STACK",PBSE_SYSTEM));
- }
- */
- }
- }
- if (LOGLEVEL >= 5)
- {
- sprintf(log_buffer, "%s(%s,%s) completed",
- __func__,
- (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
- (set_mode == SET_LIMIT_SET) ? "set" : "alter");
- log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
- log_buffer[0] = '\0';
- }
- /* SUCCESS */
- return(PBSE_NONE);
- } /* END mom_set_limits() */
- /*
- * State whether MOM main loop has to poll this job to determine if some
- * limits are being exceeded.
- *
- * Sets flag TRUE if polling is necessary, FALSE otherwise. Actual
- * polling is done using the mom_over_limit machine-dependent function.
- */
- int mom_do_poll(
- job *pjob) /* I */
- {
- const char *pname;
- resource *pres;
- assert(pjob != NULL);
- if (LOGLEVEL >= 4)
- {
- log_record(
- PBSEVENT_JOB,
- PBS_EVENTCLASS_JOB,
- pjob->ji_qs.ji_jobid,
- "evaluating limits for job");
- }
- assert(pjob != NULL);
- assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
- pres = (resource *)GET_NEXT(
- pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
- while (pres != NULL)
- {
- assert(pres->rs_defin != NULL);
- pname = pres->rs_defin->rs_name;
- assert(pname != NULL);
- assert(*pname != '\0');
- if (strcmp(pname, "walltime") == 0 ||
- strcmp(pname, "cput") == 0 ||
- strcmp(pname, "pcput") == 0 ||
- strcmp(pname, "mem") == 0 ||
- strcmp(pname, "pvmem") == 0 ||
- strcmp(pname, "vmem") == 0)
- {
- return(TRUE);
- }
- pres = (resource *)GET_NEXT(pres->rs_link);
- }
- return(FALSE);
- } /* END mom_do_poll() */
- /*
- * Setup for polling.
- *
- * Open kernel device and get namelist info.
- */
- int mom_open_poll(void)
- {
- if (LOGLEVEL >= 6)
- {
- log_record(PBSEVENT_SYSTEM, 0, __func__, "started");
- }
- pagesize = getpagesize();
- proc_array = (proc_stat_t *)calloc(TBL_INC, sizeof(proc_stat_t));
- if (proc_array == NULL)
- {
- log_err(errno, __func__, "calloc");
- return(PBSE_SYSTEM);
- }
- max_proc = TBL_INC;
- return(PBSE_NONE);
- } /* END mom_open_poll() */
- /*
- * Declare start of polling loop.
- *
- * This function caches information about all of processes
- * on the compute node (pbs_mom calls this function). Each process
- * in /proc/ is queried by looking at the 'stat' file. Statistics like
- * CPU usage time, memory consumption, etc. are gathered in the proc_array
- * list. This list is then used throughout the pbs_mom to get information
- * about tasks it is monitoring.
- *
- * This function is called from the main MOM loop once every "check_poll_interval"
- * seconds.
- *
- * @see get_proc_stat() - child
- * @see mom_set_use() - Aggregates data collected here
- *
- * NOTE: populates global 'proc_array[]' variable.
- * NOTE: reallocs proc_array[] as needed to accomodate processes.
- *
- * @see mom_open_poll() - allocs proc_array table.
- * @see mom_close_poll() - frees procs_array.
- * @see setup_program_environment() - parent - called at pbs_mom start
- * @see main_loop() - parent - called once per iteration
- * @see mom_set_use() - populate job structure with usage data for local use or to send to mother superior
- */
- int mom_get_sample(void)
- {
- proc_stat_t *pi;
- proc_stat_t *ps;
- pid_t pid;
- #ifdef PENABLE_LINUX26_CPUSETS
- struct pidl *pids = NULL;
- struct pidl *pp;
- #else
- struct dirent *dent;
- #endif
- if (proc_array == NULL)
- mom_open_poll();
- nproc = 0;
- pi = proc_array;
- if (LOGLEVEL >= 6)
- {
- log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "proc_array load started");
- }
- #ifdef PENABLE_LINUX26_CPUSETS
- /* Instead of collect stats of all processes running on a large SMP system,
- * collect stats of processes running in and below the Torque cpuset, only
- * This relies on reliable process starters for MPI, which bind their tasks
- * to the cpuset of the job. */
- #ifdef USELIBCPUSET
- pids = get_cpuset_pidlist(TTORQUECPUSET_BASE, pids);
- #else
- pids = get_cpuset_pidlist(TTORQUECPUSET_PATH, pids);
- #endif
- pp = pids;
- while (pp != NULL)
- {
- pid = pp->pid;
- pp = pp->next;
- #else
- if (pdir == NULL)
- {
- if ((pdir = opendir(procfs)) == NULL)
- return(PBSE_SYSTEM);
- }
-
- rewinddir(pdir);
- while ((dent = readdir(pdir)) != NULL)
- {
- if (!isdigit(dent->d_name[0]))
- continue;
- pid = atoi(dent->d_name);
- #endif
- if ((ps = get_proc_stat(pid)) == NULL)
- {
- if (errno != ENOENT)
- {
- sprintf(log_buffer, "%d: get_proc_stat", pid);
- log_err(errno, __func__, log_buffer);
- }
- continue;
- }
- /* nproc++; -- we need to increment AFTER assigning this ps to
- the proc_array--otherwise we could skip it in for loops */
- if ((nproc + 1) >= max_proc)
- {
- proc_stat_t *hold;
- if (LOGLEVEL >= 9)
- {
- log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "alloc more proc_array");
- }
- max_proc *= 2;
- hold = (proc_stat_t *)calloc(1, max_proc * sizeof(proc_stat_t));
- if (hold == NULL)
- {
- log_err(errno, __func__, "unable to realloc space for proc_array sample");
- return(PBSE_SYSTEM);
- }
- memcpy(hold, proc_array, sizeof(proc_stat_t) * max_proc / 2);
- free(proc_array);
- proc_array = hold;
- } /* END if ((nproc+1) == max_proc) */
- pi = &proc_array[nproc++];
- memcpy(pi, ps, sizeof(proc_stat_t));
- } /* END while (...) != NULL) */
- #ifdef PENABLE_LINUX26_CPUSETS
- free_pidlist(pids);
- #endif
- if (LOGLEVEL >= 6)
- {
- sprintf(log_buffer, "proc_array loaded - nproc=%d",
- nproc);
- log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer);
- }
- return(PBSE_NONE);
- } /* END mom_get_sample() */
- /*
- * Measure job resource usage and compare with its limits.
- *
- * If it has exceeded any well-formed polled limit return the limit that
- * it exceeded.
- * Otherwise, return PBSE_NONE. log_buffer is populated with failure.
- */
- int mom_over_limit(
- job *pjob) /* I */
- {
- const char *pname;
- int retval;
- unsigned long value;
- unsigned long num;
- unsigned long long numll;
- resource *pres;
- assert(pjob != NULL);
- assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);
- pres = (resource *)GET_NEXT(
- pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
- for (;pres != NULL;pres = (resource *)GET_NEXT(pres->rs_link))
- {
- assert(pres->rs_defin != NULL);
- pname = pres->rs_defin->rs_name;
- assert(pname != NULL);
- assert(*pname != '\0');
- if ((igncput == FALSE) && (strcmp(pname, "cput") == 0))
- {
- retval = mm_gettime(pres, &value);
- if (retval != PBSE_NONE)
- continue;
- if ((num = cput_sum(pjob)) > value)
- {
- sprintf(log_buffer, "cput %lu exceeded limit %lu",
- num,
- value);
- return(JOB_EXEC_OVERLIMIT_CPUT);
- }
- }
- else if ((igncput == FALSE) && (strcmp(pname, "pcput") == 0))
- {
- retval = mm_gettime(pres, &value);
- if (retval != PBSE_NONE)
- continue;
- if (overcpu_proc(pjob, value))
- {
- sprintf(log_buffer, "pcput exceeded limit %lu",
- value);
- return(JOB_EXEC_OVERLIMIT_CPUT);
- }
- }
- else if (strcmp(pname, "vmem") == 0)
- {
- retval = mm_getsize(pres, &value);
- if (retval != PBSE_NONE)
- continue;
- if ((ignvmem == 0) && ((numll = mem_sum(pjob)) > value))
- {
- sprintf(log_buffer, "vmem %llu exceeded limit %lu",
- numll,
- value);
- return(JOB_EXEC_OVERLIMIT_MEM);
- }
- }
- else if (strcmp(pname, "pvmem") == 0)
- {
- unsigned long long valuell;
- retval = mm_getsize(pres, &value);
- if (retval != PBSE_NONE)
- continue;
- valuell = (unsigned long long)value;
- if ((ignvmem == 0) && (overmem_proc(pjob, valuell)))
- {
- sprintf(log_buffer, "pvmem exceeded limit %llu",
- valuell);
- return(JOB_EXEC_OVERLIMIT_MEM);
- }
- }
- else if (ignwalltime == 0 && strcmp(pname, "walltime") == 0)
- {
- /* no need to check walltime on sisters, MS will get it */
- if (am_i_mother_superior(*pjob) == false)
- continue;
- retval = mm_gettime(pres, &value);
- if (retval != PBSE_NONE)
- continue;
- num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) *
- wallfactor);
- if (num > value)
- {
- sprintf(log_buffer, "walltime %ld exceeded limit %ld",
- num,
- value);
- return(JOB_EXEC_OVERLIMIT_WT);
- }
- }
- } /* END for (pres) */
- #ifdef PENABLE_LINUX26_CPUSETS
- /* Check memory_pressure */
- if (memory_pressure_threshold > 0)
- {
- /*
- * If last recorded memory_pressure is over threshold, increment counter.
- * If duration is enabled, throw over_limit if counter reaches duration.
- */
- if (pjob->ji_mempressure_curr < memory_pressure_threshold)
- {
- pjob->ji_mempressure_cnt = 0; /* reset */
- }
- else
- {
- pjob->ji_mempressure_cnt++; /* count */
- sprintf(log_buffer, "job %s memory_pressure is over %d for %d (%d) cycles",
- pjob->ji_qs.ji_jobid,
- memory_pressure_threshold,
- pjob->ji_mempressure_cnt,
- memory_pressure_duration);
- log_ext(-1, __func__, log_buffer,LOG_ALERT);
- if (memory_pressure_duration && (pjob->ji_mempressure_cnt >= memory_pressure_duration))
- {
- sprintf(log_buffer, "swap rate due to memory oversubscription is too high");
- return(JOB_EXEC_OVERLIMIT_MEM);
- }
- }
- }
- #endif
- return(PBSE_NONE);
- } /* END mom_over_limit() */
- /*
- * job_expected_resc_found: logs an error if an expected resource was not found
- */
- int job_expected_resc_found(
- const resource *pres,
- const resource_def *rd,
- const char *jobid)
- {
- if (!pres)
- {
- char log_buf[2048];
- snprintf(log_buf, sizeof(log_buf), "job %s missing expected resource %s for resource usage calculation",
- jobid, rd->rs_…
Large files files are truncated, but you can click here to view the full file