| /*****************************************************************************\ |
| * proctrack_cgroup.c - process tracking via linux cgroup containers |
| ***************************************************************************** |
| * Copyright (C) 2009 CEA/DAM/DIF |
| * Written by Matthieu Hautreux <matthieu.hautreux@cea.fr> |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <fcntl.h> |
| #include <inttypes.h> |
| #include <limits.h> |
| #include <signal.h> |
| #include <stdlib.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| #include "src/common/log.h" |
| #include "src/common/xcgroup_read_config.h" |
| #include "src/common/xstring.h" |
| #include "src/slurmd/common/xcpuinfo.h" |
| #include "src/slurmd/common/xcgroup.h" |
| #include "src/slurmd/slurmd/slurmd.h" |
| #include "src/slurmd/slurmstepd/slurmstepd_job.h" |
| |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - a string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - a string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. Slurm uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "jobcomp" for Slurm job completion logging) and <method> |
| * is a description of how this plugin satisfies that application. Slurm will |
| * only load job completion logging plugins if the plugin_type string has a |
| * prefix of "jobcomp/". |
| * |
| * plugin_version - an unsigned 32-bit integer containing the Slurm version |
| * (major.minor.micro combined into a single number). |
| */ |
| const char plugin_name[] = "Process tracking via linux cgroup freezer subsystem"; |
| const char plugin_type[] = "proctrack/cgroup"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
| static slurm_cgroup_conf_t slurm_cgroup_conf; |
| |
| static char user_cgroup_path[PATH_MAX]; |
| static char job_cgroup_path[PATH_MAX]; |
| static char jobstep_cgroup_path[PATH_MAX]; |
| |
| static xcgroup_ns_t freezer_ns; |
| |
| static bool slurm_freezer_init = false; |
| static xcgroup_t freezer_cg; |
| static xcgroup_t slurm_freezer_cg; |
| static xcgroup_t user_freezer_cg; |
| static xcgroup_t job_freezer_cg; |
| static xcgroup_t step_freezer_cg; |
| |
| int _slurm_cgroup_init(void) |
| { |
| /* initialize user/job/jobstep cgroup relative paths |
| * and release agent path */ |
| user_cgroup_path[0]='\0'; |
| job_cgroup_path[0]='\0'; |
| jobstep_cgroup_path[0]='\0'; |
| |
| /* initialize freezer cgroup namespace */ |
| if (xcgroup_ns_create(&slurm_cgroup_conf, &freezer_ns, "", "freezer") |
| != XCGROUP_SUCCESS) { |
| error("unable to create freezer cgroup namespace"); |
| return SLURM_ERROR; |
| } |
| |
| /* initialize the root freezer cg */ |
| if (xcgroup_create(&freezer_ns, &freezer_cg, "", 0, 0) |
| != XCGROUP_SUCCESS) { |
| error("proctrack/cgroup unable to create root freezer xcgroup"); |
| xcgroup_ns_destroy(&freezer_ns); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| int _slurm_cgroup_create(stepd_step_rec_t *job, uint64_t id, uid_t uid, gid_t gid) |
| { |
| /* |
| * we do it here as we do not have access to the conf structure |
| * in libslurm (src/common/xcgroup.c) |
| */ |
| char *pre = (char *)xstrdup(slurm_cgroup_conf.cgroup_prepend); |
| #ifdef MULTIPLE_SLURMD |
| if ( conf->node_name != NULL ) |
| xstrsubstitute(pre,"%n", conf->node_name); |
| else { |
| xfree(pre); |
| pre = (char*) xstrdup("/slurm"); |
| } |
| #endif |
| |
| if (xcgroup_create(&freezer_ns, &slurm_freezer_cg, pre, |
| getuid(), getgid()) != XCGROUP_SUCCESS) { |
| xfree(pre); |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * While creating the cgroup hierarchy of the step, lock the root |
| * cgroup directory. The same lock is hold during removal of the |
| * hierarchies of other jobs/steps. This helps to avoid the race |
| * condition with concurrent creation/removal of the intermediate |
| * shared directories that could result in the failure of the |
| * hierarchy setup |
| */ |
| if (xcgroup_lock(&freezer_cg) != XCGROUP_SUCCESS) { |
| error("%s: xcgroup_lock error", __func__); |
| goto bail; |
| } |
| |
| /* create slurm cgroup in the freezer ns (it could already exist) */ |
| if (xcgroup_instantiate(&slurm_freezer_cg) != XCGROUP_SUCCESS) |
| goto bail; |
| |
| /* build user cgroup relative path if not set (should not be) */ |
| if (*user_cgroup_path == '\0') { |
| if (snprintf(user_cgroup_path, PATH_MAX, |
| "%s/uid_%u", pre, uid) >= PATH_MAX) { |
| error("unable to build uid %u cgroup relative path : %m", |
| uid); |
| goto bail; |
| } |
| } |
| xfree(pre); |
| |
| /* build job cgroup relative path if no set (should not be) */ |
| if (*job_cgroup_path == '\0') { |
| if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u", |
| user_cgroup_path, job->jobid) >= PATH_MAX) { |
| error("unable to build job %u cgroup relative path : %m", |
| job->jobid); |
| goto bail; |
| } |
| } |
| |
| /* build job step cgroup relative path (should not be) */ |
| if (*jobstep_cgroup_path == '\0') { |
| int cc; |
| if (job->stepid == SLURM_BATCH_SCRIPT) { |
| cc = snprintf(jobstep_cgroup_path, PATH_MAX, |
| "%s/step_batch", job_cgroup_path); |
| } else if (job->stepid == SLURM_EXTERN_CONT) { |
| cc = snprintf(jobstep_cgroup_path, PATH_MAX, |
| "%s/step_extern", job_cgroup_path); |
| } else { |
| cc = snprintf(jobstep_cgroup_path, PATH_MAX, |
| "%s/step_%u", |
| job_cgroup_path, job->stepid); |
| } |
| if (cc >= PATH_MAX) { |
| error("proctrack/cgroup unable to build job step %u.%u " |
| "freezer cg relative path: %m", |
| job->jobid, job->stepid); |
| goto bail; |
| } |
| } |
| |
| /* create user cgroup in the freezer ns (it could already exist) */ |
| if (xcgroup_create(&freezer_ns, &user_freezer_cg, |
| user_cgroup_path, |
| getuid(), getgid()) != XCGROUP_SUCCESS) { |
| xcgroup_destroy(&slurm_freezer_cg); |
| goto bail; |
| } |
| |
| /* create job cgroup in the freezer ns (it could already exist) */ |
| if (xcgroup_create(&freezer_ns, &job_freezer_cg, |
| job_cgroup_path, |
| getuid(), getgid()) != XCGROUP_SUCCESS) { |
| xcgroup_destroy(&slurm_freezer_cg); |
| xcgroup_destroy(&user_freezer_cg); |
| goto bail; |
| } |
| |
| /* create step cgroup in the freezer ns (it should not exists) */ |
| if (xcgroup_create(&freezer_ns, &step_freezer_cg, |
| jobstep_cgroup_path, |
| getuid(), getgid()) != XCGROUP_SUCCESS) { |
| xcgroup_destroy(&slurm_freezer_cg); |
| xcgroup_destroy(&user_freezer_cg); |
| xcgroup_destroy(&job_freezer_cg); |
| goto bail; |
| } |
| |
| if ((xcgroup_instantiate(&user_freezer_cg) != XCGROUP_SUCCESS) || |
| (xcgroup_instantiate(&job_freezer_cg) != XCGROUP_SUCCESS) || |
| (xcgroup_instantiate(&step_freezer_cg) != XCGROUP_SUCCESS)) { |
| xcgroup_destroy(&user_freezer_cg); |
| xcgroup_destroy(&job_freezer_cg); |
| xcgroup_destroy(&step_freezer_cg); |
| goto bail; |
| } |
| |
| /* inhibit release agent for the step cgroup thus letting |
| * slurmstepd being able to add new pids to the container |
| * when the job ends (TaskEpilog,...) */ |
| xcgroup_set_param(&step_freezer_cg, "notify_on_release", "0"); |
| slurm_freezer_init = true; |
| |
| xcgroup_unlock(&freezer_cg); |
| return SLURM_SUCCESS; |
| |
| bail: |
| xfree(pre); |
| xcgroup_destroy(&slurm_freezer_cg); |
| xcgroup_unlock(&freezer_cg); |
| xcgroup_destroy(&freezer_cg); |
| return SLURM_ERROR; |
| } |
| |
| static int _move_current_to_root_cgroup(xcgroup_ns_t *ns) |
| { |
| xcgroup_t cg; |
| int rc; |
| |
| if (xcgroup_create(ns, &cg, "", 0, 0) != XCGROUP_SUCCESS) |
| return SLURM_ERROR; |
| |
| rc = xcgroup_move_process(&cg, getpid()); |
| xcgroup_destroy(&cg); |
| |
| return rc; |
| } |
| |
| int _slurm_cgroup_destroy(void) |
| { |
| if (xcgroup_lock(&freezer_cg) != XCGROUP_SUCCESS) { |
| error("%s: xcgroup_lock error", __func__); |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * First move slurmstepd process to the root cgroup, otherwise |
| * the rmdir(2) triggered by the calls below will always fail, |
| * because slurmstepd is still in the cgroup! |
| */ |
| if (_move_current_to_root_cgroup(&freezer_ns) != SLURM_SUCCESS) { |
| error("%s: Unable to move pid %d to root cgroup", |
| __func__, getpid()); |
| xcgroup_unlock(&freezer_cg); |
| return SLURM_ERROR; |
| } |
| |
| xcgroup_wait_pid_moved(&job_freezer_cg, "freezer job"); |
| |
| if (jobstep_cgroup_path[0] != '\0') { |
| if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) { |
| debug("_slurm_cgroup_destroy: problem deleting step cgroup path %s: %m", |
| step_freezer_cg.path); |
| xcgroup_unlock(&freezer_cg); |
| return SLURM_ERROR; |
| } |
| xcgroup_destroy(&step_freezer_cg); |
| } |
| |
| if (job_cgroup_path[0] != '\0') { |
| (void)xcgroup_delete(&job_freezer_cg); |
| xcgroup_destroy(&job_freezer_cg); |
| } |
| |
| if (user_cgroup_path[0] != '\0') { |
| (void)xcgroup_delete(&user_freezer_cg); |
| xcgroup_destroy(&user_freezer_cg); |
| } |
| |
| if (slurm_freezer_init) { |
| xcgroup_destroy(&slurm_freezer_cg); |
| } |
| |
| xcgroup_unlock(&freezer_cg); |
| xcgroup_destroy(&freezer_cg); |
| xcgroup_ns_destroy(&freezer_ns); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| int _slurm_cgroup_add_pids(uint64_t id, pid_t* pids, int npids) |
| { |
| if (*jobstep_cgroup_path == '\0') |
| return SLURM_ERROR; |
| |
| return xcgroup_add_pids(&step_freezer_cg, pids, npids); |
| } |
| |
| int _slurm_cgroup_stick_stepd(uint64_t id, pid_t pid) |
| { |
| if (*job_cgroup_path == '\0') |
| return SLURM_ERROR; |
| |
| return xcgroup_add_pids(&job_freezer_cg, &pid, 1); |
| } |
| |
| int |
| _slurm_cgroup_get_pids(uint64_t id, pid_t **pids, int *npids) |
| { |
| if (*jobstep_cgroup_path == '\0') |
| return SLURM_ERROR; |
| |
| return xcgroup_get_pids(&step_freezer_cg, pids, npids); |
| } |
| |
| int _slurm_cgroup_suspend(uint64_t id) |
| { |
| if (*jobstep_cgroup_path == '\0') |
| return SLURM_ERROR; |
| |
| return xcgroup_set_param(&step_freezer_cg, |
| "freezer.state", "FROZEN"); |
| } |
| |
| int _slurm_cgroup_resume(uint64_t id) |
| { |
| if (*jobstep_cgroup_path == '\0') |
| return SLURM_ERROR; |
| |
| return xcgroup_set_param(&step_freezer_cg, |
| "freezer.state", "THAWED"); |
| } |
| |
| bool |
| _slurm_cgroup_has_pid(pid_t pid) |
| { |
| bool fstatus; |
| xcgroup_t cg; |
| |
| fstatus = xcgroup_ns_find_by_pid(&freezer_ns, &cg, pid); |
| if ( fstatus != XCGROUP_SUCCESS) |
| return false; |
| |
| if (xstrcmp(cg.path, step_freezer_cg.path)) { |
| fstatus = false; |
| } |
| else { |
| fstatus = true; |
| } |
| |
| xcgroup_destroy(&cg); |
| return fstatus; |
| } |
| |
| int |
| _slurm_cgroup_is_pid_a_slurm_task(uint64_t id, pid_t pid) |
| { |
| int fstatus = -1; |
| int fd; |
| pid_t ppid; |
| char file_path[PATH_MAX], buf[2048]; |
| |
| if (snprintf(file_path, PATH_MAX, "/proc/%ld/stat", |
| (long)pid) >= PATH_MAX) { |
| debug2("unable to build pid '%d' stat file: %m ", pid); |
| return fstatus; |
| } |
| |
| if ((fd = open(file_path, O_RDONLY)) < 0) { |
| debug2("unable to open '%s' : %m ", file_path); |
| return fstatus; |
| } |
| if (read(fd, buf, 2048) <= 0) { |
| debug2("unable to read '%s' : %m ", file_path); |
| close(fd); |
| return fstatus; |
| } |
| close(fd); |
| |
| if (sscanf(buf, "%*d %*s %*s %d", &ppid) != 1) { |
| debug2("unable to get ppid of pid '%d', %m", pid); |
| return fstatus; |
| } |
| |
| /* |
| * assume that any child of slurmstepd is a slurm task |
| * they will get all signals, inherited processes will |
| * only get SIGKILL |
| */ |
| if (ppid == (pid_t) id) |
| fstatus = 1; |
| else |
| fstatus = 0; |
| |
| return fstatus; |
| } |
| |
| /* |
| * init() is called when the plugin is loaded, before any other functions |
| * are called. Put global initialization here. |
| */ |
| extern int init (void) |
| { |
| /* read cgroup configuration */ |
| if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) |
| return SLURM_ERROR; |
| |
| /* initialize cpuinfo internal data */ |
| if (xcpuinfo_init() != XCPUINFO_SUCCESS) { |
| free_slurm_cgroup_conf(&slurm_cgroup_conf); |
| return SLURM_ERROR; |
| } |
| |
| /* initialize cgroup internal data */ |
| if (_slurm_cgroup_init() != SLURM_SUCCESS) { |
| xcpuinfo_fini(); |
| free_slurm_cgroup_conf(&slurm_cgroup_conf); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int fini (void) |
| { |
| _slurm_cgroup_destroy(); |
| xcpuinfo_fini(); |
| free_slurm_cgroup_conf(&slurm_cgroup_conf); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Uses slurmd job-step manager's pid as the unique container id. |
| */ |
| extern int proctrack_p_create (stepd_step_rec_t *job) |
| { |
| int fstatus; |
| |
| /* create a new cgroup for that container */ |
| fstatus = _slurm_cgroup_create(job, (uint64_t)job->jmgr_pid, |
| job->uid, job->gid); |
| if (fstatus) |
| return SLURM_ERROR; |
| |
| /* stick slurmstepd pid to the newly created job container |
| * (Note: we do not put it in the step container because this |
| * container could be used to suspend/resume tasks using freezer |
| * properties so we need to let the slurmstepd outside of |
| * this one) |
| */ |
| fstatus = _slurm_cgroup_stick_stepd((uint64_t)job->jmgr_pid, |
| job->jmgr_pid); |
| if (fstatus) { |
| _slurm_cgroup_destroy(); |
| return SLURM_ERROR; |
| } |
| |
| /* we use slurmstepd pid as the identifier of the container |
| * the corresponding cgroup could be found using |
| * _slurm_cgroup_find_by_pid */ |
| job->cont_id = (uint64_t)job->jmgr_pid; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int proctrack_p_add (stepd_step_rec_t *job, pid_t pid) |
| { |
| return _slurm_cgroup_add_pids(job->cont_id, &pid, 1); |
| } |
| |
| extern int proctrack_p_signal (uint64_t id, int signal) |
| { |
| pid_t* pids = NULL; |
| int npids; |
| int i; |
| int slurm_task; |
| |
| /* get all the pids associated with the step */ |
| if (_slurm_cgroup_get_pids(id, &pids, &npids) != |
| SLURM_SUCCESS) { |
| debug3("unable to get pids list for cont_id=%"PRIu64"", id); |
| /* that could mean that all the processes already exit */ |
| /* the container so return success */ |
| return SLURM_SUCCESS; |
| } |
| |
| /* directly manage SIGSTOP using cgroup freezer subsystem */ |
| if (signal == SIGSTOP) { |
| xfree(pids); |
| return _slurm_cgroup_suspend(id); |
| } |
| |
| /* start by resuming in case of SIGKILL */ |
| if (signal == SIGKILL) { |
| _slurm_cgroup_resume(id); |
| } |
| |
| for (i = 0 ; i<npids ; i++) { |
| /* do not kill slurmstepd (it should not be part |
| * of the list, but just to not forget about that ;)) |
| */ |
| if (pids[i] == (pid_t)id) |
| continue; |
| |
| /* only signal slurm tasks unless signal is SIGKILL */ |
| slurm_task = _slurm_cgroup_is_pid_a_slurm_task(id, pids[i]); |
| if (slurm_task == 1 || signal == SIGKILL) { |
| debug2("killing process %d (%s) with signal %d", pids[i], |
| (slurm_task==1)?"slurm_task":"inherited_task", |
| signal); |
| kill(pids[i], signal); |
| } |
| } |
| |
| xfree(pids); |
| |
| /* resume tasks after signaling slurm tasks with SIGCONT to be sure */ |
| /* that SIGTSTP received at suspend time is removed */ |
| if (signal == SIGCONT) { |
| return _slurm_cgroup_resume(id); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int proctrack_p_destroy (uint64_t id) |
| { |
| return _slurm_cgroup_destroy(); |
| } |
| |
| extern uint64_t proctrack_p_find(pid_t pid) |
| { |
| /* not provided for now */ |
| return 0; |
| } |
| |
| extern bool proctrack_p_has_pid(uint64_t cont_id, pid_t pid) |
| { |
| return _slurm_cgroup_has_pid(pid); |
| } |
| |
| extern int proctrack_p_wait(uint64_t cont_id) |
| { |
| int delay = 1; |
| |
| if (cont_id == 0 || cont_id == 1) { |
| errno = EINVAL; |
| return SLURM_ERROR; |
| } |
| |
| /* Spin until the container is successfully destroyed */ |
| /* This indicates that all tasks have exited the container */ |
| while (proctrack_p_destroy(cont_id) != SLURM_SUCCESS) { |
| proctrack_p_signal(cont_id, SIGKILL); |
| sleep(delay); |
| if (delay < 120) { |
| delay *= 2; |
| } else { |
| error("%s: Unable to destroy container %"PRIu64" in cgroup plugin, giving up after %d sec", |
| __func__, cont_id, delay); |
| break; |
| } |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int proctrack_p_get_pids(uint64_t cont_id, |
| pid_t **pids, int *npids) |
| { |
| return _slurm_cgroup_get_pids(cont_id, pids, npids); |
| } |