PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugins/proctrack/linuxproc/kill_tree.c

https://github.com/cfenoy/slurm
C | 406 lines | 312 code | 45 blank | 49 comment | 62 complexity | 15cb72b4eda930851266c61555597951 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. * kill_tree.c - Kill process tree based upon process IDs
  3. * Used primarily for MPICH-GM
  4. *****************************************************************************
  5. * Copyright (C) 2004 The Regents of the University of California.
  6. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7. * Written by Takao Hatazaki <takao.hatazaki@hp.com>
  8. * CODE-OCEC-09-009. All rights reserved.
  9. *
  10. * This file is part of SLURM, a resource management program.
  11. * For details, see <http://www.schedmd.com/slurmdocs/>.
  12. * Please also read the included file: DISCLAIMER.
  13. *
  14. * SLURM is free software; you can redistribute it and/or modify it under
  15. * the terms of the GNU General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License, or (at your option)
  17. * any later version.
  18. *
  19. * In addition, as a special exception, the copyright holders give permission
  20. * to link the code of portions of this program with the OpenSSL library under
  21. * certain conditions as described in each individual source file, and
  22. * distribute linked combinations including the two. You must obey the GNU
  23. * General Public License in all respects for all of the code used other than
  24. * OpenSSL. If you modify file(s) with this exception, you may extend this
  25. * exception to your version of the file(s), but you are not obligated to do
  26. * so. If you do not wish to do so, delete this exception statement from your
  27. * version. If you delete this exception statement from all source files in
  28. * the program, then also delete it here.
  29. *
  30. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  31. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  32. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  33. * details.
  34. *
  35. * You should have received a copy of the GNU General Public License along
  36. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  37. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  38. \*****************************************************************************/
  39. #if HAVE_CONFIG_H
  40. # include "config.h"
  41. #endif
  42. #include <sys/types.h>
  43. #include <dirent.h>
  44. #include <sys/stat.h>
  45. #include <fcntl.h>
  46. #include <stdlib.h>
  47. #include <stdio.h>
  48. #include <signal.h>
  49. #include <strings.h>
  50. #include <unistd.h>
  51. #include <string.h>
  52. #include <limits.h>
  53. #include "slurm/slurm.h"
  54. #include "slurm/slurm_errno.h"
  55. #include "src/common/log.h"
  56. #include "src/common/xmalloc.h"
  57. #include "src/common/xstring.h"
  58. #include "kill_tree.h"
  /* One process record; records are chained into singly linked lists. */
  typedef struct xpid_s xpid_t;
  struct xpid_s {
      pid_t pid;        /* process ID */
      int is_usercmd;   /* non-zero: process may be signalled / reported */
      char *cmd;        /* command name, kept for log messages */
      xpid_t *next;     /* next record in the list, NULL at the end */
  };
  65. typedef struct xppid_s {
  66. pid_t ppid;
  67. xpid_t *list;
  68. struct xppid_s *next;
  69. } xppid_t;
  /* Number of buckets in the parent-PID hash table. */
  #define HASH_LEN 64
  /*
   * Map a (parent) PID to a bucket index in [0, HASH_LEN).
   *
   * The value is reduced through an unsigned type: with a signed operand
   * the C '%' operator yields a negative result for a negative PID, which
   * would index in front of the table.
   */
  #define GET_HASH_IDX(ppid) ((int)((unsigned long)(ppid) % HASH_LEN))
  72. static xpid_t *_alloc_pid(pid_t pid, int is_usercmd, char *cmd, xpid_t *next)
  73. {
  74. xpid_t *new;
  75. new = (xpid_t *)xmalloc(sizeof(*new));
  76. new->pid = pid;
  77. new->is_usercmd = is_usercmd;
  78. new->cmd = xstrdup(cmd);
  79. new->next = next;
  80. return new;
  81. }
  82. static xppid_t *_alloc_ppid(pid_t ppid, pid_t pid, int is_usercmd, char *cmd,
  83. xppid_t *next)
  84. {
  85. xppid_t *new;
  86. new = xmalloc(sizeof(*new));
  87. new->ppid = ppid;
  88. new->list = _alloc_pid(pid, is_usercmd, cmd, NULL);
  89. new->next = next;
  90. return new;
  91. }
  92. static void _push_to_hashtbl(pid_t ppid, pid_t pid,
  93. int is_usercmd, char *cmd, xppid_t **hashtbl)
  94. {
  95. int idx;
  96. xppid_t *ppids, *newppid;
  97. xpid_t *newpid;
  98. idx = GET_HASH_IDX(ppid);
  99. ppids = hashtbl[idx];
  100. while (ppids) {
  101. if (ppids->ppid == ppid) {
  102. newpid = _alloc_pid(pid, is_usercmd, cmd, ppids->list);
  103. ppids->list = newpid;
  104. return;
  105. }
  106. ppids = ppids->next;
  107. }
  108. newppid = _alloc_ppid(ppid, pid, is_usercmd, cmd, hashtbl[idx]);
  109. hashtbl[idx] = newppid;
  110. }
  111. static int get_myname(char *s)
  112. {
  113. char path[PATH_MAX], rbuf[1024];
  114. int fd;
  115. sprintf(path, "/proc/%ld/stat", (long)getpid());
  116. if ((fd = open(path, O_RDONLY)) < 0) {
  117. error("Cannot open /proc/getpid()/stat");
  118. return -1;
  119. }
  120. if (read(fd, rbuf, 1024) <= 0) {
  121. error("Cannot read /proc/getpid()/stat");
  122. close(fd);
  123. return -1;
  124. }
  125. close(fd);
  126. if (sscanf(rbuf, "%*d %s ", s) != 1) {
  127. error("Cannot get the command name from /proc/getpid()/stat");
  128. return -1;
  129. }
  130. return 0;
  131. }
  132. static xppid_t **_build_hashtbl(void)
  133. {
  134. DIR *dir;
  135. struct dirent *de;
  136. char path[PATH_MAX], *endptr, *num, rbuf[1024];
  137. char myname[1024], cmd[1024];
  138. char state;
  139. int fd;
  140. long pid, ppid, ret_l;
  141. xppid_t **hashtbl;
  142. if ((dir = opendir("/proc")) == NULL) {
  143. error("opendir(/proc): %m");
  144. return NULL;
  145. }
  146. if (get_myname(myname) < 0) return NULL;
  147. debug3("Myname in build_hashtbl: %s", myname);
  148. hashtbl = (xppid_t **)xmalloc(HASH_LEN * sizeof(xppid_t *));
  149. slurm_seterrno(0);
  150. while ((de = readdir(dir)) != NULL) {
  151. num = de->d_name;
  152. if ((num[0] < '0') || (num[0] > '9'))
  153. continue;
  154. ret_l = strtol(num, &endptr, 10);
  155. if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX) ||
  156. (errno == ERANGE)) {
  157. error("couldn't do a strtol on str %s(%ld): %m",
  158. num, ret_l);
  159. }
  160. if (endptr == NULL || *endptr != 0)
  161. continue;
  162. sprintf(path, "/proc/%s/stat", num);
  163. if ((fd = open(path, O_RDONLY)) < 0) {
  164. continue;
  165. }
  166. if (read(fd, rbuf, 1024) <= 0) {
  167. close(fd);
  168. continue;
  169. }
  170. if (sscanf(rbuf, "%ld %s %c %ld", &pid, cmd, &state, &ppid)
  171. != 4) {
  172. close(fd);
  173. continue;
  174. }
  175. close(fd);
  176. if (state == 'Z') {
  177. debug3("Defunct process skipped: command=%s state=%c "
  178. "pid=%ld ppid=%ld", cmd, state, pid, ppid);
  179. continue; /* Defunct, don't try to kill */
  180. }
  181. /* Record cmd for debugging purpose */
  182. _push_to_hashtbl((pid_t)ppid, (pid_t)pid,
  183. strcmp(myname, cmd), cmd, hashtbl);
  184. }
  185. closedir(dir);
  186. return hashtbl;
  187. }
  188. static void _destroy_list(xpid_t *list)
  189. {
  190. xpid_t *tmp;
  191. while (list) {
  192. tmp = list->next;
  193. xfree(list->cmd);
  194. xfree(list);
  195. list = tmp;
  196. }
  197. }
  198. static void _destroy_hashtbl(xppid_t **hashtbl)
  199. {
  200. int i;
  201. xppid_t *ppid, *tmp;
  202. for (i=0; i<HASH_LEN; i++) {
  203. ppid = hashtbl[i];
  204. while (ppid) {
  205. _destroy_list(ppid->list);
  206. tmp = ppid->next;
  207. xfree(ppid);
  208. ppid = tmp;
  209. }
  210. }
  211. xfree(hashtbl);
  212. }
  213. static xpid_t *_get_list(int top, xpid_t *list, xppid_t **hashtbl)
  214. {
  215. xppid_t *ppid;
  216. xpid_t *children;
  217. ppid = hashtbl[GET_HASH_IDX(top)];
  218. while (ppid) {
  219. if (ppid->ppid == top) {
  220. children = ppid->list;
  221. while (children) {
  222. list = _alloc_pid(children->pid,
  223. children->is_usercmd,
  224. children->cmd,
  225. list);
  226. children = children->next;
  227. }
  228. children = ppid->list;
  229. while (children) {
  230. list = _get_list(children->pid, list, hashtbl);
  231. children = children->next;
  232. }
  233. break;
  234. }
  235. ppid = ppid->next;
  236. }
  237. return list;
  238. }
  239. static int _kill_proclist(xpid_t *list, int sig)
  240. {
  241. int rc;
  242. rc = 0;
  243. while (list) {
  244. if (list->pid > 1) {
  245. if (! list->is_usercmd) {
  246. debug2("%ld %s is not a user command. "
  247. "Skipped sending signal %d",
  248. (long)list->pid, list->cmd, sig);
  249. } else {
  250. verbose("Sending signal %d to pid %d %s",
  251. sig, list->pid, list->cmd);
  252. if (kill(list->pid, sig))
  253. rc = errno; /* save the last error */
  254. }
  255. }
  256. list = list->next;
  257. }
  258. return rc;
  259. }
  260. /*
  261. * Some of processes may not be in the same process group
  262. * (e.g. GMPI processes). So, find out the process tree,
  263. * then kill all that subtree.
  264. */
  265. extern int kill_proc_tree(pid_t top, int sig)
  266. {
  267. xpid_t *list;
  268. int rc = -1;
  269. xppid_t **hashtbl;
  270. if ((hashtbl = _build_hashtbl()) == NULL)
  271. return -1;
  272. list = _get_list(top, NULL, hashtbl);
  273. rc = _kill_proclist(list, sig);
  274. _destroy_hashtbl(hashtbl);
  275. _destroy_list(list);
  276. return rc;
  277. }
  /*
   * Return the pid of the nearest process in the parent chain of
   * "process" (starting with "process" itself) whose command line
   * contains the string "process_name".
   *
   * The chain is followed through the parent-pid field of
   * /proc/<pid>/stat.  Only the text up to the first NUL byte of
   * /proc/<pid>/cmdline is searched.  A process whose cmdline cannot be
   * opened or is empty is skipped and the walk continues with its parent.
   *
   * Returns 0 if no such ancestor exists: the walk reached a pid <= 1, or
   * a stat file could not be opened, read or parsed.
   */
  extern pid_t find_ancestor(pid_t process, char *process_name)
  {
      /* Large enough for "/proc/<long>/cmdline". */
      char path[64], rbuf[1024];
      ssize_t nread;
      int fd;
      long pid, ppid;

      pid = ppid = (long)process;
      do {
          if (ppid <= 1)
              return 0;

          snprintf(path, sizeof(path), "/proc/%ld/stat", ppid);
          if ((fd = open(path, O_RDONLY)) < 0)
              return 0;
          /* read() does not NUL-terminate; do it before parsing. */
          nread = read(fd, rbuf, sizeof(rbuf) - 1);
          close(fd);
          if (nread <= 0)
              return 0;
          rbuf[nread] = '\0';
          if (sscanf(rbuf, "%ld %*s %*s %ld", &pid, &ppid) != 2)
              return 0;

          /*
           * "continue" in a do/while jumps to the loop condition, so
           * empty the buffer first: otherwise a failed cmdline read
           * would make the condition search the leftover stat text.
           */
          rbuf[0] = '\0';

          snprintf(path, sizeof(path), "/proc/%ld/cmdline", pid);
          if ((fd = open(path, O_RDONLY)) < 0)
              continue;
          nread = read(fd, rbuf, sizeof(rbuf) - 1);
          close(fd);
          /* Terminate at the end of the data, or at 0 on failure. */
          rbuf[(nread > 0) ? nread : 0] = '\0';
      } while (!strstr(rbuf, process_name));

      return pid;
  }
  316. /* The returned "pids" array does NOT include the slurmstepd */
  317. extern int proctrack_linuxproc_get_pids(pid_t top, pid_t **pids, int *npids)
  318. {
  319. xppid_t **hashtbl;
  320. xpid_t *list, *ptr;
  321. pid_t *p;
  322. int i;
  323. int len = 32;
  324. if ((hashtbl = _build_hashtbl()) == NULL)
  325. return SLURM_ERROR;
  326. list = _get_list(top, NULL, hashtbl);
  327. if (list == NULL) {
  328. *pids = NULL;
  329. *npids = 0;
  330. _destroy_hashtbl(hashtbl);
  331. return SLURM_ERROR;
  332. }
  333. p = (pid_t *)xmalloc(sizeof(pid_t) * len);
  334. ptr = list;
  335. i = 0;
  336. while(ptr != NULL) {
  337. if (ptr->is_usercmd) { /* don't include the slurmstepd */
  338. if (i >= len-1) {
  339. len *= 2;
  340. xrealloc(p, (sizeof(pid_t) * len));
  341. }
  342. p[i] = ptr->pid;
  343. i++;
  344. }
  345. ptr = ptr->next;
  346. }
  347. if (i == 0) {
  348. xfree(p);
  349. *pids = NULL;
  350. *npids = 0;
  351. _destroy_hashtbl(hashtbl);
  352. _destroy_list(list);
  353. return SLURM_ERROR;
  354. } else {
  355. *pids = p;
  356. *npids = i;
  357. _destroy_hashtbl(hashtbl);
  358. _destroy_list(list);
  359. return SLURM_SUCCESS;
  360. }
  361. }