| /*****************************************************************************\ |
| * kill_tree.c - Kill process tree based upon process IDs |
| * Used primarily for MPICH-GM |
| ***************************************************************************** |
| * Copyright (C) 2004 The Regents of the University of California. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Takao Hatazaki <takao.hatazaki@hp.com> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include <dirent.h> |
| #include <fcntl.h> |
| #include <limits.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| #include "src/common/log.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "kill_tree.h" |
| |
/*
 * One process found while scanning /proc.  Nodes are chained through
 * "next" into singly linked lists.
 */
typedef struct xpid_s xpid_t;

struct xpid_s {
	pid_t pid;		/* process ID */
	int is_usercmd;		/* non-zero when the command name differs from
				 * the name of the process doing the scan */
	char *cmd;		/* private copy of the command name */
	xpid_t *next;		/* following node, NULL at end of list */
};
| |
| typedef struct xppid_s { |
| pid_t ppid; |
| xpid_t *list; |
| struct xppid_s *next; |
| } xppid_t; |
| |
/* Number of buckets in the parent-pid hash table. */
#define HASH_LEN 64

/*
 * Map a parent pid onto a bucket index in the range [0, HASH_LEN).
 *
 * The pid is converted to unsigned before taking the remainder: in C the
 * result of % carries the sign of the left operand, so a negative pid
 * would otherwise produce a negative index and address memory in front of
 * the table.  Non-negative pids hash exactly as before.
 */
#define GET_HASH_IDX(ppid) ((int)((unsigned int)(ppid) % HASH_LEN))
| |
| static xpid_t *_alloc_pid(pid_t pid, int is_usercmd, char *cmd, xpid_t *next) |
| { |
| xpid_t *new; |
| |
| new = xmalloc(sizeof(*new)); |
| new->pid = pid; |
| new->is_usercmd = is_usercmd; |
| new->cmd = xstrdup(cmd); |
| new->next = next; |
| return new; |
| } |
| |
| static xppid_t *_alloc_ppid(pid_t ppid, pid_t pid, int is_usercmd, char *cmd, |
| xppid_t *next) |
| { |
| xppid_t *new; |
| |
| new = xmalloc(sizeof(*new)); |
| new->ppid = ppid; |
| new->list = _alloc_pid(pid, is_usercmd, cmd, NULL); |
| new->next = next; |
| return new; |
| } |
| |
| static void _push_to_hashtbl(pid_t ppid, pid_t pid, |
| int is_usercmd, char *cmd, xppid_t **hashtbl) |
| { |
| int idx; |
| xppid_t *ppids, *newppid; |
| xpid_t *newpid; |
| |
| idx = GET_HASH_IDX(ppid); |
| ppids = hashtbl[idx]; |
| while (ppids) { |
| if (ppids->ppid == ppid) { |
| newpid = _alloc_pid(pid, is_usercmd, cmd, ppids->list); |
| ppids->list = newpid; |
| return; |
| } |
| ppids = ppids->next; |
| } |
| newppid = _alloc_ppid(ppid, pid, is_usercmd, cmd, hashtbl[idx]); |
| hashtbl[idx] = newppid; |
| } |
| |
| static int _get_myname(char *s) |
| { |
| char path[PATH_MAX], *rbuf; |
| ssize_t buf_used; |
| int fd; |
| |
| snprintf(path, PATH_MAX, "/proc/%ld/stat", (long)getpid()); |
| if ((fd = open(path, O_RDONLY)) < 0) { |
| error("Cannot open /proc/getpid()/stat"); |
| return -1; |
| } |
| rbuf = xmalloc(4096); |
| buf_used = read(fd, rbuf, 4096); |
| if ((buf_used <= 0) || (buf_used >= 4096)) { |
| error("Cannot read /proc/getpid()/stat"); |
| xfree(rbuf); |
| close(fd); |
| return -1; |
| } |
| close(fd); |
| if (sscanf(rbuf, "%*d %s ", s) != 1) { |
| error("Cannot get the command name from /proc/getpid()/stat"); |
| xfree(rbuf); |
| return -1; |
| } |
| xfree(rbuf); |
| return 0; |
| } |
| |
| static xppid_t **_build_hashtbl(void) |
| { |
| DIR *dir; |
| struct dirent *de; |
| char path[PATH_MAX], *endptr, *num, *rbuf; |
| ssize_t buf_used; |
| char myname[1024], cmd[1024]; |
| char state; |
| int fd; |
| long pid, ppid, ret_l; |
| xppid_t **hashtbl; |
| |
| if ((dir = opendir("/proc")) == NULL) { |
| error("opendir(/proc): %m"); |
| return NULL; |
| } |
| if (_get_myname(myname) < 0) |
| return NULL; |
| debug3("Myname in build_hashtbl: %s", myname); |
| |
| hashtbl = xcalloc(HASH_LEN, sizeof(xppid_t *)); |
| |
| errno = 0; |
| rbuf = xmalloc(4096); |
| while ((de = readdir(dir)) != NULL) { |
| num = de->d_name; |
| if ((num[0] < '0') || (num[0] > '9')) |
| continue; |
| ret_l = strtol(num, &endptr, 10); |
| if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX)) { |
| error("couldn't do a strtol on str %s(%ld): %m", |
| num, ret_l); |
| continue; |
| } |
| if (endptr == NULL || *endptr != 0) |
| continue; |
| snprintf(path, PATH_MAX, "/proc/%s/stat", num); |
| if ((fd = open(path, O_RDONLY)) < 0) { |
| continue; |
| } |
| buf_used = read(fd, rbuf, 4096); |
| if ((buf_used <= 0) || (buf_used >= 4096)) { |
| close(fd); |
| continue; |
| } |
| close(fd); |
| if (sscanf(rbuf, "%ld %s %c %ld", &pid, cmd, &state, &ppid) |
| != 4) { |
| continue; |
| } |
| if (state == 'Z') { |
| debug3("Defunct process skipped: command=%s state=%c " |
| "pid=%ld ppid=%ld", cmd, state, pid, ppid); |
| continue; /* Defunct, don't try to kill */ |
| } |
| |
| /* Record cmd for debugging purpose */ |
| _push_to_hashtbl((pid_t)ppid, (pid_t)pid, |
| xstrcmp(myname, cmd), cmd, hashtbl); |
| } |
| xfree(rbuf); |
| closedir(dir); |
| return hashtbl; |
| } |
| |
| static void _destroy_list(xpid_t *list) |
| { |
| xpid_t *tmp; |
| |
| while (list) { |
| tmp = list->next; |
| xfree(list->cmd); |
| xfree(list); |
| list = tmp; |
| } |
| } |
| |
| static void _destroy_hashtbl(xppid_t **hashtbl) |
| { |
| int i; |
| xppid_t *ppid, *tmp; |
| |
| for (i=0; i<HASH_LEN; i++) { |
| ppid = hashtbl[i]; |
| while (ppid) { |
| _destroy_list(ppid->list); |
| tmp = ppid->next; |
| xfree(ppid); |
| ppid = tmp; |
| } |
| } |
| xfree(hashtbl); |
| } |
| |
| |
| static xpid_t *_get_list(int top, xpid_t *list, xppid_t **hashtbl) |
| { |
| xppid_t *ppid; |
| xpid_t *children; |
| |
| ppid = hashtbl[GET_HASH_IDX(top)]; |
| while (ppid) { |
| if (ppid->ppid == top) { |
| children = ppid->list; |
| while (children) { |
| list = _alloc_pid(children->pid, |
| children->is_usercmd, |
| children->cmd, |
| list); |
| children = children->next; |
| } |
| children = ppid->list; |
| while (children) { |
| list = _get_list(children->pid, list, hashtbl); |
| children = children->next; |
| } |
| break; |
| } |
| ppid = ppid->next; |
| } |
| return list; |
| } |
| |
| static int _kill_proclist(xpid_t *list, int sig) |
| { |
| int rc; |
| |
| rc = 0; |
| while (list) { |
| if (list->pid > 1) { |
| if (! list->is_usercmd) { |
| debug2("%ld %s is not a user command. " |
| "Skipped sending signal %d", |
| (long)list->pid, list->cmd, sig); |
| } else { |
| verbose("Sending signal %d to pid %d %s", |
| sig, list->pid, list->cmd); |
| if (kill(list->pid, sig)) |
| rc = errno; /* save the last error */ |
| } |
| } |
| list = list->next; |
| } |
| |
| return rc; |
| } |
| |
| |
| /* |
| * Some of processes may not be in the same process group |
| * (e.g. GMPI processes). So, find out the process tree, |
| * then kill all that subtree. |
| */ |
| extern int kill_proc_tree(pid_t top, int sig) |
| { |
| xpid_t *list; |
| int rc = -1; |
| xppid_t **hashtbl; |
| |
| if ((hashtbl = _build_hashtbl()) == NULL) |
| return -1; |
| |
| list = _get_list(top, NULL, hashtbl); |
| rc = _kill_proclist(list, sig); |
| _destroy_hashtbl(hashtbl); |
| _destroy_list(list); |
| return rc; |
| } |
| |
| |
| /* |
| * Return the pid of the process named "process_name" |
| * which is the ancestor of "process". |
| */ |
| extern pid_t find_ancestor(pid_t process, char *process_name) |
| { |
| char path[PATH_MAX], *rbuf; |
| ssize_t buf_used; |
| int fd, len; |
| long pid, ppid; |
| |
| len = strlen(process_name); |
| rbuf = xmalloc_nz(4097); |
| pid = ppid = (long)process; |
| while (1) { |
| if (ppid <= 1) { |
| pid = 0; |
| break; |
| } |
| |
| snprintf(path, PATH_MAX, "/proc/%ld/stat", ppid); |
| if ((fd = open(path, O_RDONLY)) < 0) { |
| pid = 0; |
| break; |
| } |
| buf_used = read(fd, rbuf, 4096); |
| if (buf_used >= 0) |
| rbuf[buf_used] = '\0'; |
| else |
| rbuf[0] = '\0'; |
| if ((buf_used <= 0) || (buf_used >= 4096)) { |
| close(fd); |
| pid = 0; |
| break; |
| } |
| close(fd); |
| if (sscanf(rbuf, "%ld %*s %*s %ld", &pid, &ppid) != 2) { |
| pid = 0; |
| break; |
| } |
| |
| snprintf(path, PATH_MAX, "/proc/%ld/cmdline", pid); |
| if ((fd = open(path, O_RDONLY)) < 0) { |
| continue; |
| } |
| buf_used = read(fd, rbuf, 4096); |
| if (buf_used >= 0) |
| rbuf[buf_used] = '\0'; |
| else |
| rbuf[0] = '\0'; |
| if ((buf_used <= 0) || (buf_used >= 4096)) { |
| close(fd); |
| continue; |
| } |
| close(fd); |
| if (strncmp(rbuf, process_name, len) == 0) |
| break; |
| } |
| xfree(rbuf); |
| |
| return pid; |
| } |
| |
| /* The returned "pids" array does NOT include the slurmstepd */ |
| extern int proctrack_linuxproc_get_pids(pid_t top, pid_t **pids, int *npids) |
| { |
| xppid_t **hashtbl; |
| xpid_t *list, *ptr; |
| pid_t *p; |
| int i, len = 32, rc; |
| |
| if ((hashtbl = _build_hashtbl()) == NULL) |
| return SLURM_ERROR; |
| |
| list = _get_list(top, NULL, hashtbl); |
| if (list == NULL) { |
| *pids = NULL; |
| *npids = 0; |
| _destroy_hashtbl(hashtbl); |
| return SLURM_ERROR; |
| } |
| |
| p = xcalloc(len, sizeof(pid_t)); |
| ptr = list; |
| i = 0; |
| while (ptr != NULL) { |
| if (ptr->is_usercmd) { /* don't include the slurmstepd */ |
| if (i >= len - 1) { |
| len *= 2; |
| xrealloc(p, (sizeof(pid_t) * len)); |
| } |
| p[i] = ptr->pid; |
| i++; |
| } |
| ptr = ptr->next; |
| } |
| |
| if (i == 0) { |
| xfree(p); |
| *pids = NULL; |
| *npids = 0; |
| rc = SLURM_ERROR; |
| } else { |
| *pids = p; |
| *npids = i; |
| rc = SLURM_SUCCESS; |
| } |
| _destroy_hashtbl(hashtbl); |
| _destroy_list(list); |
| return rc; |
| } |