| /*****************************************************************************\ |
| * proctrack_rms.c - process tracking via QsNet rms kernel module |
| ***************************************************************************** |
| * Copyright (C) 2005 The Regents of the University of California. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * UCRL-CODE-226842. |
| * |
| * This file is part of SLURM, a resource management program. |
| * For details, see <http://www.llnl.gov/linux/slurm/>. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| #if HAVE_CONFIG_H |
| # include "config.h" |
| #endif |
| |
| #if HAVE_STDINT_H |
| # include <stdint.h> |
| #endif |
| #if HAVE_INTTYPES_H |
| # include <inttypes.h> |
| #endif |
| |
| #include <rms/rmscall.h> |
| |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <signal.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <slurm/slurm.h> |
| #include <slurm/slurm_errno.h> |
| #include "src/common/log.h" |
| #include "src/slurmd/common/proctrack.h" |
| |
/*
 * Plugin identification symbols: a human readable description, the
 * "<major>/<minor>" type string and a numeric version.
 * NOTE(review): presumably consumed by the generic plugin loader - confirm.
 */
const char	plugin_name[]	= "Process tracking for QsNet via the rms module";
const char	plugin_type[]	= "proctrack/rms";
const uint32_t	plugin_version	= 1;

/*
 * Helpers (defined at the bottom of this file) that manage the detached
 * process responsible for calling rms_prgdestroy().
 */
static int  _prg_destructor_fork(void);
static void _prg_destructor_send(int fd, int prgid);

/* Number of pid slots handed to rms_prginfo() by slurm_container_signal() */
#define MAX_IDS 512
| |
| extern int init (void) |
| { |
| /* close librmscall's internal fd to /proc/rms/control */ |
| pthread_atfork(NULL, NULL, rmsmod_fini); |
| return SLURM_SUCCESS; |
| } |
| |
| extern int fini (void) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| |
| /* |
| * When proctrack/rms is used in conjunction with switch/elan, |
| * slurm_container_create will not normally create the program description. |
| * It just retrieves the prgid created in switch/elan. |
| * |
| * When the program description cannot be retrieved (switch/elan is not |
| * being used, the job step is a batch script, etc.) then rms_prgcreate() |
| * is called here. |
| */ |
| extern int slurm_container_create (slurmd_job_t *job) |
| { |
| int prgid; |
| /* |
| * Return a handle to an existing prgid or create a new one |
| */ |
| if (rms_getprgid (job->jmgr_pid, &prgid) < 0) { |
| int fd = _prg_destructor_fork(); |
| /* Use slurmd job-step manager's pid as a unique identifier */ |
| prgid = job->jmgr_pid; |
| if ((rms_prgcreate (prgid, job->uid, 1)) < 0) { |
| error ("ptrack/rms: rms_prgcreate: %m"); |
| _prg_destructor_send(fd, -1); |
| return SLURM_ERROR; |
| } |
| _prg_destructor_send(fd, prgid); |
| } |
| debug3("proctrack/rms: prgid = %d", prgid); |
| |
| job->cont_id = (uint32_t)prgid; |
| return SLURM_SUCCESS; |
| } |
| |
| extern int slurm_container_add (slurmd_job_t *job, pid_t pid) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * slurm_container_signal assumes that the slurmd jobstep manager |
| * is always the last process in the rms program description. |
| * No signals are sent to the last process. |
| */ |
| extern int slurm_container_signal (uint32_t id, int signal) |
| { |
| pid_t *pids; |
| int nids = 0; |
| int i; |
| int rc; |
| |
| if (id <= 0) |
| return -1; |
| |
| pids = malloc(MAX_IDS * sizeof(pid_t)); |
| if (!pids) { |
| error("proctrack/rms container signal: malloc failed: %m"); |
| return -1; |
| } |
| if ((rc = rms_prginfo((int)id, MAX_IDS, pids, &nids)) < 0) { |
| error("proctrack/rms rms_prginfo failed %d: %m", rc); |
| free(pids); |
| /* |
| * Ignore errors, program desc has probably already |
| * been cleaned up. |
| */ |
| return -1; |
| } |
| |
| rc = -1; |
| for (i = nids-2; i >= 0 ; i--) { |
| debug2("proctrack/rms(pid %d) Sending signal %d to process %d", |
| getpid(), signal, pids[i]); |
| rc &= kill(pids[i], signal); |
| debug2(" rc = %d", rc); |
| } |
| free(pids); |
| debug3("proctrack/rms signal container returning %d", rc); |
| return rc; |
| } |
| |
| |
| /* |
| * The switch/elan plugin is really responsible for creating and |
| * destroying rms program descriptions. slurm_destroy_container simply |
| * returns SLURM_SUCCESS when the program description contains one and |
| * only one process, assumed to be the slurmd jobstep manager. |
| */ |
| extern int slurm_container_destroy (uint32_t id) |
| { |
| debug2("proctrack/rms: destroying container %u\n", id); |
| if (id == 0) |
| return SLURM_SUCCESS; |
| |
| if (slurm_container_signal(id, 0) == -1) |
| return SLURM_SUCCESS; |
| |
| return SLURM_ERROR; |
| } |
| |
| |
/*
 * Look up the program description that a process belongs to.
 *
 * IN pid - process to look up
 * RET the prgid reported by rms_getprgid(), or 0 if that call fails
 */
extern uint32_t slurm_container_find (pid_t pid)
{
	int prgid = 0;
	int rc = rms_getprgid ((int) pid, &prgid);

	return (rc < 0) ? (uint32_t) 0 : (uint32_t) prgid;
}
| |
/*
 * Test whether a process belongs to a given program description.
 *
 * IN cont_id - program description id to compare against
 * IN pid     - process to look up
 * RET true only if rms_getprgid() succeeds for pid and the prgid it
 *     reports equals cont_id; false otherwise
 */
extern bool slurm_container_has_pid (uint32_t cont_id, pid_t pid)
{
	int owner = 0;
	bool found = (rms_getprgid ((int) pid, &owner) >= 0);

	return (found && ((uint32_t) owner == cont_id));
}
| |
| extern int |
| slurm_container_wait(uint32_t cont_id) |
| { |
| int delay = 1; |
| |
| if (cont_id == 0 || cont_id == 1) { |
| errno = EINVAL; |
| return SLURM_ERROR; |
| } |
| |
| /* Spin until the container is empty */ |
| while (slurm_container_signal(cont_id, 0) != -1) { |
| slurm_container_signal(cont_id, SIGKILL); |
| sleep(delay); |
| if (delay < 120) { |
| delay *= 2; |
| } else { |
| error("Unable to destroy container %u", cont_id); |
| } |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * This module assumes that the slurmstepd (running as root) is always the |
| * last process in the rms program description. We do not include |
| * the slurmstepd in the list of pids that we return. |
| */ |
| extern int |
| slurm_container_get_pids(uint32_t cont_id, pid_t **pids, int *npids) |
| { |
| pid_t *p; |
| int np; |
| int len = 32; |
| |
| p = xmalloc(len * sizeof(pid_t)); |
| while(rms_prginfo((int)cont_id, len, p, &np) == -1) { |
| if (errno == EINVAL) { |
| /* array is too short, double its length */ |
| len *= 2; |
| xrealloc(p, len); |
| } else { |
| xfree(p); |
| *pids = NULL; |
| *npids = 0; |
| return SLURM_ERROR; |
| } |
| } |
| |
| /* Don't include the last pid (slurmstepd) in the list */ |
| if (np > 0) { |
| p[np-1] = 0; |
| np--; |
| } |
| |
| *npids = np; |
| *pids = p; |
| |
| return SLURM_SUCCESS; |
| } |
| |
/*
 * Close every file descriptor of the calling process except one.
 *
 * IN fd - the descriptor to keep open
 *
 * Descriptors 0 .. limit-1 are closed, where limit is
 * sysconf(_SC_OPEN_MAX). Errors from close() (e.g. for descriptors that
 * are not open) are ignored on purpose.
 */
static void
_close_all_fd_except(int fd)
{
	/* Used when the system cannot report a definite limit */
	enum { FALLBACK_OPEN_MAX = 1024 };
	long openmax;
	long i;

	/*
	 * sysconf() returns a long, and -1 when the limit is indeterminate
	 * or the query fails; without a fallback the loop would close
	 * nothing at all in that case.
	 */
	openmax = sysconf(_SC_OPEN_MAX);
	if (openmax < 0)
		openmax = FALLBACK_OPEN_MAX;

	for (i = 0; i < openmax; i++) {
		if (i != fd)
			close((int) i);
	}
}
| |
| |
/*
 * Fork a detached process that waits for a pipe to close, signalling that
 * the calling process has exited, and then calls rms_prgdestroy().
 *
 * The process is created with a double fork so that it is not a child of
 * the caller. It first reads the prgid to destroy from the pipe (see
 * _prg_destructor_send()); a prgid of -1 makes it exit without destroying
 * anything.
 *
 * RET the write end of the pipe on success, or -1 if the pipe or the
 *     destructor process could not be created. Only the original caller
 *     ever returns from this function; both forked processes exit.
 */
static int
_prg_destructor_fork(void)
{
	pid_t pid;
	pid_t wpid;
	int fdpair[2];
	int prgid;
	int i;
	int dummy;
	int status = 0;
	ssize_t n;

	if (pipe(fdpair) < 0) {
		error("_prg_destructor_fork: failed creating pipe");
		return -1;
	}

	pid = fork();
	if (pid < 0) {
		/*
		 * Still in the caller: it must not fall through into the
		 * child-only code below (which forks again and exits).
		 */
		error("_prg_destructor_fork: failed to fork program destructor");
		close(fdpair[0]);
		close(fdpair[1]);
		return -1;
	} else if (pid > 0) {
		/* parent */
		close(fdpair[0]);
		do {
			wpid = waitpid(pid, &status, 0);
		} while ((wpid < 0) && (errno == EINTR));
		/*
		 * A non-zero exit of the intermediate child means that the
		 * destructor was never started; nobody will read the pipe.
		 * If waitpid() itself failed the outcome is unknown and
		 * success is assumed.
		 */
		if ((wpid == pid) &&
		    (!WIFEXITED(status) || (WEXITSTATUS(status) != 0))) {
			error("_prg_destructor_fork: destructor process was not started");
			close(fdpair[1]);
			return -1;
		}
		return fdpair[1];
	}

	/****************************************/
	/* fork again so the destructor process
	 * will not be a child of the slurmd
	 */
	pid = fork();
	if (pid < 0) {
		/*
		 * Report the failure through the exit status. Carrying on
		 * as the destructor here would leave the parent blocked in
		 * waitpid() while this process waits for the parent.
		 */
		error("_prg_destructor_fork: second fork failed");
		exit(1);
	} else if (pid > 0) {
		exit(0);
	}

	/* child */
	close(fdpair[1]);

	/* close librmscall's internal fd to /proc/rms/control */
	rmsmod_fini();

	_close_all_fd_except(fdpair[0]);

	/* Wait for the program description id from the parent */
	do {
		n = read(fdpair[0], &prgid, sizeof(prgid));
	} while ((n < 0) && (errno == EINTR));
	if (n != (ssize_t) sizeof(prgid)) {
		error("_prg_destructor_fork read failed: %m");
		exit(1);
	}

	if (prgid == -1)
		exit(1);

	/*
	 * Wait for the pipe to close, signalling that the parent
	 * has exited. An interrupted read() is retried so that a signal
	 * is not mistaken for the pipe being closed.
	 */
	do {
		n = read(fdpair[0], &dummy, sizeof(dummy));
	} while ((n > 0) || ((n < 0) && (errno == EINTR)));

	/*
	 * Verify that program description is empty. If not, send a SIGKILL.
	 * At most 30 attempts are made, one second apart.
	 * NOTE(review): if rms_prginfo() fails, nids keeps its initial 0 and
	 * the loop ends without signalling; whether more than 8 pids counts
	 * as such a failure is not visible here - confirm.
	 */
	for (i = 0; i < 30; i++) {
		int maxids = 8;
		pid_t pids[8];
		int nids = 0;

		if (rms_prginfo(prgid, maxids, pids, &nids) < 0) {
			error("_prg_destructor_fork: rms_prginfo: %m");
		}
		if (nids == 0)
			break;
		if (rms_prgsignal(prgid, SIGKILL) < 0) {
			error("_prg_destructor_fork: rms_prgsignal: %m");
		}
		sleep(1);
	}

	if (rms_prgdestroy(prgid) < 0) {
		error("rms_prgdestroy");
	}
	exit(0);
}
| |
| |
| |
/*
 * Hand the prgid of the newly created program description to the process
 * forked earlier by _prg_destructor_fork().
 *
 * IN fd    - descriptor returned by _prg_destructor_fork()
 * IN prgid - id to transmit (raw int, written in a single write())
 *
 * A short or failed write is logged and otherwise ignored.
 */
static void
_prg_destructor_send(int fd, int prgid)
{
	ssize_t sent;

	debug3("_prg_destructor_send %d", prgid);

	sent = write (fd, &prgid, sizeof(prgid));
	if (sent != sizeof(prgid))
		error ("_prg_destructor_send failed: %m");

	/*
	 * fd is deliberately NOT closed here. It is closed implicitly when
	 * this process exits, and that is what tells the process on the
	 * other end of the pipe that it is time to call rms_prgdestroy.
	 */
}