| /*****************************************************************************\ |
| * cgroup_common.c - Cgroup plugin common functions |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "cgroup_common.h" |
| #include <poll.h> |
| |
| /* Testing read() on cgroup interfaces returns 4092 bytes at most. */ |
| #define CGROUP_READ_COUNT 4092 |
| |
| /* How much to wait for a pid to be removed from one cgroup. */ |
| #define MAX_MOVE_WAIT 1000 /* Milliseconds */ |
| |
| /* These are defined here so when we link with something other than |
| * the slurmctld we will have these symbols defined. They will get |
| * overwritten when linking with the slurmctld. |
| */ |
| #if defined (__APPLE__) |
| extern slurm_conf_t slurm_conf __attribute__((weak_import)); |
| #else |
| slurm_conf_t slurm_conf; |
| #endif |
| |
| /* |
| * Returns the path to the cgroup.procs file over which we have permissions |
| * defined by check_mode. This path is where we'll be able to read or write |
| * pids. If there are no paths available with these permissions, return NULL, |
| * which means the cgroup doesn't exist or we do not have permissions to modify |
| * the cg. |
| */ |
| static char *_cgroup_procs_check(xcgroup_t *cg, int check_mode) |
| { |
| struct stat st; |
| char *path = xstrdup_printf("%s/%s", cg->path, "cgroup.procs"); |
| |
| if (!((stat(path, &st) >= 0) && (st.st_mode & check_mode))) { |
| error("%s: failed on path %s: %m", __func__, path); |
| xfree(path); |
| } |
| |
| return path; |
| } |
| |
| static char *_cgroup_procs_readable_path(xcgroup_t *cg) |
| { |
| return _cgroup_procs_check(cg, S_IRUSR); |
| } |
| |
| static char *_cgroup_procs_writable_path(xcgroup_t *cg) |
| { |
| return _cgroup_procs_check(cg, S_IWUSR); |
| } |
| |
| static int _set_uint32_param(xcgroup_t *cg, char *param, uint32_t value) |
| { |
| int fstatus = SLURM_ERROR; |
| char file_path[PATH_MAX]; |
| char *cpath = cg->path; |
| |
| if (snprintf(file_path, PATH_MAX, "%s/%s", cpath, param) >= PATH_MAX) { |
| log_flag(CGROUP, "unable to build filepath for '%s' and parameter '%s' : %m", |
| cpath, param); |
| return fstatus; |
| } |
| |
| fstatus = common_file_write_uint32s(file_path, &value, 1); |
| if (fstatus != SLURM_SUCCESS) |
| log_flag(CGROUP, "unable to set parameter '%s' to '%u' for '%s'", |
| param, value, cpath); |
| else |
| log_flag(CGROUP, "parameter '%s' set to '%u' for '%s'", |
| param, value, cpath); |
| |
| return fstatus; |
| } |
| |
| static bool _is_empty_dir(const char *dirpath) |
| { |
| DIR *d; |
| struct dirent *dir; |
| bool empty = true; |
| |
| if (!(d = opendir(dirpath))) |
| return empty; |
| |
| while ((dir = readdir(d))) { |
| if (dir->d_type == DT_DIR && |
| (strcmp(dir->d_name, ".") && strcmp(dir->d_name, ".."))) { |
| empty = false; |
| log_flag(CGROUP, "Found at least one child directory: %s/%s", |
| dirpath, dir->d_name); |
| break; |
| } |
| } |
| |
| closedir(d); |
| return empty; |
| } |
| |
| /* |
| * Read a cgroup file interface in chunks of CGROUP_READ_COUNT. If the read is |
| * atomic, we should have a correct snapshot of the data. If multiple read() |
| * have been needed, the file might have been changed in between calls. |
| * |
| * IN: file_path - file path |
| * IN/OUT: out - pointer to file contents |
| * |
| * RET: -1 on error, accumulated number of read bytes otherwise |
| */ |
| static ssize_t _read_cg_file(char *file_path, char **out) |
| { |
| int fd, nr_reads = 0; |
| size_t count = CGROUP_READ_COUNT; |
| ssize_t rc, read_bytes = 0; |
| char *buf; |
| |
| xassert(!*out); |
| |
| /* open file for reading */ |
| fd = open(file_path, O_RDONLY, 0700); |
| if (fd < 0) { |
| error("unable to open '%s' for reading : %m", file_path); |
| return SLURM_ERROR; |
| } |
| |
| /* read file contents */ |
| buf = xmalloc(count); |
| while ((rc = read(fd, buf + read_bytes, count))) { |
| if (rc < 0) { |
| if (errno == EINTR) |
| continue; |
| error("unable to read '%s': %m", file_path); |
| xfree(buf); |
| break; |
| } |
| read_bytes += rc; |
| xrealloc(buf, (read_bytes + count)); |
| nr_reads++; |
| } |
| |
| if (nr_reads > 1) |
| log_flag(CGROUP, "%s: Read %zd bytes after %d read() syscalls. File may have changed between syscalls.", |
| file_path, read_bytes, nr_reads); |
| |
| close(fd); |
| *out = buf; |
| return (rc == -1) ? rc : read_bytes; |
| } |
| |
| extern int common_file_read_uints(char *file_path, void **values, int *nb, |
| int base) |
| { |
| int i; |
| ssize_t fsize; |
| char *buf = NULL, *p; |
| uint32_t *values32 = NULL; |
| uint64_t *values64 = NULL; |
| long long unsigned int ll_tmp; |
| |
| /* check input pointers */ |
| if (values == NULL || nb == NULL) |
| return SLURM_ERROR; |
| |
| if ((fsize = _read_cg_file(file_path, &buf)) < 0) |
| return SLURM_ERROR; |
| |
| /* count values (split by \n) */ |
| i = 0; |
| p = buf; |
| while (xstrchr(p, '\n') != NULL) { |
| i++; |
| p = xstrchr(p, '\n') + 1; |
| } |
| |
| if (base == 32) { |
| /* build uint32_t list */ |
| if (i > 0) { |
| values32 = xcalloc(i, sizeof(uint32_t)); |
| p = buf; |
| i = 0; |
| while (xstrchr(p, '\n') != NULL) { |
| sscanf(p, "%u", (values32 + i)); |
| p = xstrchr(p, '\n') + 1; |
| i++; |
| } |
| } |
| } else if (base == 64) { |
| /* build uint64_t list */ |
| if (i > 0) { |
| values64 = xcalloc(i, sizeof(uint64_t)); |
| p = buf; |
| i = 0; |
| while (xstrchr(p, '\n') != NULL) { |
| sscanf(p, "%llu", &ll_tmp); |
| values64[i++] = ll_tmp; |
| p = xstrchr(p, '\n') + 1; |
| } |
| } |
| } |
| |
| /* free buffer */ |
| xfree(buf); |
| |
| /* set output values */ |
| if (base == 32) |
| *values = values32; |
| else if (base == 64) |
| *values = values64; |
| |
| *nb = i; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int common_file_write_uints(char *file_path, void *values, int nb, |
| int base) |
| { |
| int rc; |
| int fd; |
| char tstr[256]; |
| uint32_t *values32 = NULL; |
| uint64_t *values64 = NULL; |
| |
| /* open file for writing */ |
| if ((fd = open(file_path, O_WRONLY, 0700)) < 0) { |
| error("%s: unable to open '%s' for writing: %m", |
| __func__, file_path); |
| return SLURM_ERROR; |
| } |
| |
| if (base == 32) |
| values32 = (uint32_t *) values; |
| else if (base == 64) |
| values64 = (uint64_t *) values; |
| |
| /* add one value per line */ |
| for (int i = 0; i < nb; i++) { |
| if (base == 32) { |
| uint32_t value = values32[i]; |
| if (snprintf(tstr, sizeof(tstr), "%u", value) < 0) { |
| error("%s: unable to build %u string value: %m", |
| __func__, value); |
| close(fd); |
| return SLURM_ERROR; |
| } |
| } else if (base == 64) { |
| uint64_t value = values64[i]; |
| if (snprintf(tstr, sizeof(tstr), |
| "%"PRIu64"", value) <0) { |
| error("%s: unable to build %"PRIu64" string value: %m", |
| __func__, value); |
| close(fd); |
| return SLURM_ERROR; |
| } |
| } else { |
| error("%s: unexpected base %d. Unable to write to %s", |
| __func__, base, file_path); |
| close(fd); |
| return SLURM_ERROR; |
| } |
| |
| /* write terminating NUL byte */ |
| safe_write(fd, tstr, strlen(tstr) + 1); |
| } |
| |
| /* close file */ |
| close(fd); |
| return SLURM_SUCCESS; |
| rwfail: |
| rc = errno; |
| error("%s: write value '%s' to '%s' failed: %m", |
| __func__, tstr, file_path); |
| close(fd); |
| return rc; |
| } |
| |
| extern int common_file_write_content(char *file_path, char *content, |
| size_t csize) |
| { |
| int fd; |
| |
| /* open file for writing */ |
| if ((fd = open(file_path, O_WRONLY, 0700)) < 0) { |
| error("%s: unable to open '%s' for writing: %m", |
| __func__, file_path); |
| return SLURM_ERROR; |
| } |
| |
| safe_write(fd, content, csize); |
| |
| /* close file */ |
| close(fd); |
| return SLURM_SUCCESS; |
| |
| rwfail: |
| error("%s: unable to write %zu bytes to cgroup %s: %m", |
| __func__, csize, file_path); |
| close(fd); |
| return SLURM_ERROR; |
| } |
| |
| extern int common_file_read_content(char *file_path, char **content, |
| size_t *csize) |
| { |
| ssize_t fsize; |
| char *buf = NULL; |
| |
| /* check input pointers */ |
| if (content == NULL || csize == NULL) |
| return SLURM_ERROR; |
| |
| if ((fsize = _read_cg_file(file_path, &buf)) < 0) |
| return SLURM_ERROR; |
| |
| /* set output values */ |
| *content = buf; |
| *csize = fsize; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int common_cgroup_instantiate(xcgroup_t *cg) |
| { |
| int fstatus = SLURM_ERROR; |
| mode_t cmask; |
| mode_t omask; |
| |
| char *file_path; |
| |
| /* init variables based on input cgroup */ |
| file_path = cg->path; |
| |
| /* save current mask and apply working one */ |
| cmask = S_IWGRP | S_IWOTH; |
| omask = umask(cmask); |
| |
| /* build cgroup */ |
| if (mkdir(file_path, 0755)) { |
| if (errno != EEXIST) { |
| error("%s: unable to create cgroup '%s' : %m", |
| __func__, file_path); |
| umask(omask); |
| return fstatus; |
| } |
| } |
| umask(omask); |
| |
| /* following operations failure might not result in a general |
| * failure so set output status to success */ |
| fstatus = SLURM_SUCCESS; |
| |
| return fstatus; |
| } |
| |
| extern int common_cgroup_create(xcgroup_ns_t *cgns, xcgroup_t *cg, char *uri, |
| uid_t uid, gid_t gid) |
| { |
| int fstatus = SLURM_ERROR; |
| char file_path[PATH_MAX]; |
| |
| /* build cgroup absolute path*/ |
| if (snprintf(file_path, PATH_MAX, "%s%s", cgns->mnt_point, |
| uri) >= PATH_MAX) { |
| log_flag(CGROUP, "unable to build cgroup '%s' absolute path in ns '%s' : %m", |
| uri, cgns->subsystems); |
| return fstatus; |
| } |
| |
| /* fill xcgroup structure */ |
| cg->ns = cgns; |
| cg->name = xstrdup(uri); |
| cg->path = xstrdup(file_path); |
| cg->uid = uid; |
| cg->gid = gid; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int common_cgroup_move_process(xcgroup_t *cg, pid_t pid) |
| { |
| char *path = NULL; |
| |
| /* |
| * First we check permissions to see if we will be able to move the pid. |
| * The path is a path to cgroup.procs and writing there will instruct |
| * the cgroup subsystem to move the process and all its threads there. |
| */ |
| path = _cgroup_procs_writable_path(cg); |
| |
| if (!path) { |
| error("Cannot write to cgroup.procs for %s", cg->path); |
| return SLURM_ERROR; |
| } |
| |
| xfree(path); |
| |
| return _set_uint32_param(cg, "cgroup.procs", pid); |
| } |
| |
| extern int common_cgroup_set_param(xcgroup_t *cg, char *param, char *content) |
| { |
| int fstatus = SLURM_ERROR; |
| char file_path[PATH_MAX]; |
| char *cpath = cg->path; |
| |
| if (!cpath || !param) |
| return fstatus; |
| |
| if (!content) { |
| log_flag(CGROUP, "no content given, nothing to do"); |
| return fstatus; |
| } |
| |
| if (snprintf(file_path, PATH_MAX, "%s/%s", cpath, param) >= PATH_MAX) { |
| log_flag(CGROUP, "unable to build filepath for '%s' and parameter '%s' : %m", |
| cpath, param); |
| return fstatus; |
| } |
| |
| fstatus = common_file_write_content(file_path, content, |
| strlen(content)); |
| if (fstatus != SLURM_SUCCESS) |
| log_flag(CGROUP, "unable to set parameter '%s' to '%s' for '%s'", |
| param, content, cpath); |
| else |
| debug3("%s: parameter '%s' set to '%s' for '%s'", |
| __func__, param, content, cpath); |
| |
| return fstatus; |
| } |
| |
| extern void common_cgroup_ns_destroy(xcgroup_ns_t *cgns) |
| { |
| xfree(cgns->mnt_point); |
| xfree(cgns->mnt_args); |
| xfree(cgns->subsystems); |
| xfree(cgns->init_cg_path); |
| FREE_NULL_BITMAP(cgns->avail_controllers); |
| } |
| |
| extern void common_cgroup_destroy(xcgroup_t *cg) |
| { |
| cg->ns = NULL; |
| xfree(cg->name); |
| xfree(cg->path); |
| cg->uid = -1; |
| cg->gid = -1; |
| } |
| |
| extern int common_cgroup_delete(xcgroup_t *cg) |
| { |
| int retries = 0, npids = -1; |
| pid_t *pids = NULL; |
| |
| if (!cg || !cg->path) { |
| error("invalid control group"); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Do 5 retries and wait 1000 milis on each if we receive an EBUSY and |
| * there are no pids, because we may be trying to remove the directory |
| * when the kernel hasn't yet drained the cgroup internal references |
| * (css_online), even if cgroup.procs is already empty. |
| * |
| * This workaround tries to mitigate a bug on kernels < 3.18 as per |
| * commit 41c25707d21716826e3c1f60967f5550610ec1c9 in the linux kernel. |
| */ |
| while ((rmdir(cg->path) < 0) && (errno != ENOENT)) { |
| if (errno == EBUSY) { |
| /* |
| * Do not rely in ENOTEMPTY since in cgroupfs a |
| * non-empty dir. removal will return EBUSY. |
| */ |
| if (!_is_empty_dir(cg->path)) { |
| log_flag(CGROUP, "Cannot rmdir(%s), cgroup is not empty", |
| cg->path); |
| return SLURM_ERROR; |
| } |
| |
| if (npids == -1) { |
| /* Do not retry on a 'really' busy cgroup */ |
| if ((common_cgroup_get_pids(cg, &pids, &npids) |
| != SLURM_SUCCESS)) |
| return SLURM_ERROR; |
| |
| if (npids > 0) { |
| xfree(pids); |
| debug3("Not removing %s, found %d pids", |
| cg->path, npids); |
| return SLURM_ERROR; |
| } |
| } |
| |
| /* This should happen usually only on kernels < 3.18 */ |
| if (retries < 5) { |
| poll(NULL, 0, 1000); |
| retries++; |
| continue; |
| } |
| |
| log_flag(CGROUP, "Unable to rmdir(%s), did %d retries: %m", |
| cg->path, retries); |
| } else { |
| error("Unable to rmdir(%s), unexpected error: %m", |
| cg->path); |
| } |
| |
| return SLURM_ERROR; |
| } |
| |
| if (retries) |
| log_flag(CGROUP, "rmdir(%s): took %d retries, possible cgroup filesystem slowness", |
| cg->path, retries); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int common_cgroup_add_pids(xcgroup_t *cg, pid_t *pids, int npids) |
| { |
| int rc = SLURM_ERROR; |
| char *path = _cgroup_procs_writable_path(cg); |
| |
| rc = common_file_write_uint32s(path, (uint32_t*)pids, npids); |
| if (rc != SLURM_SUCCESS) |
| error("unable to add pids to '%s'", cg->path); |
| |
| xfree(path); |
| return rc; |
| } |
| |
| extern int common_cgroup_get_pids(xcgroup_t *cg, pid_t **pids, int *npids) |
| { |
| int fstatus = SLURM_ERROR; |
| char *path = NULL; |
| |
| if (pids == NULL || npids == NULL || !cg->path) |
| return SLURM_ERROR; |
| |
| path = _cgroup_procs_readable_path(cg); |
| if (!path) { |
| error("unable to read '%s/cgroup.procs'", cg->path); |
| return SLURM_ERROR; |
| } |
| |
| fstatus = common_file_read_uint32s(path, (uint32_t**)pids, npids); |
| if (fstatus != SLURM_SUCCESS) |
| log_flag(CGROUP, "unable to get pids of '%s', file disappeared?", |
| path); |
| |
| xfree(path); |
| return fstatus; |
| } |
| |
| extern int common_cgroup_get_param(xcgroup_t *cg, char *param, char **content, |
| size_t *csize) |
| { |
| int fstatus = SLURM_ERROR; |
| char file_path[PATH_MAX]; |
| char *cpath = cg->path; |
| |
| if (snprintf(file_path, PATH_MAX, "%s/%s", cpath, param) >= PATH_MAX) { |
| log_flag(CGROUP, "unable to build filepath for '%s' and parameter '%s' : %m", |
| cpath, param); |
| } else { |
| fstatus = common_file_read_content(file_path, content, csize); |
| if (fstatus != SLURM_SUCCESS) |
| log_flag(CGROUP, "unable to get parameter '%s' for '%s'", |
| param, cpath); |
| } |
| return fstatus; |
| } |
| |
| extern int common_cgroup_set_uint64_param(xcgroup_t *cg, char *param, |
| uint64_t value) |
| { |
| int fstatus = SLURM_ERROR; |
| char file_path[PATH_MAX]; |
| char *cpath = cg->path; |
| |
| if (snprintf(file_path, PATH_MAX, "%s/%s", cpath, param) >= PATH_MAX) { |
| log_flag(CGROUP, "unable to build filepath for '%s' and parameter '%s' : %m", |
| cpath, param); |
| return fstatus; |
| } |
| |
| fstatus = common_file_write_uint64s(file_path, &value, 1); |
| if (fstatus != SLURM_SUCCESS) |
| log_flag(CGROUP, "unable to set parameter '%s' to '%"PRIu64"' for '%s'", |
| param, value, cpath); |
| else |
| debug3("%s: parameter '%s' set to '%"PRIu64"' for '%s'", |
| __func__, param, value, cpath); |
| |
| return fstatus; |
| } |
| |
| extern int common_cgroup_lock(xcgroup_t *cg) |
| { |
| int fstatus = SLURM_ERROR; |
| |
| if (cg->path == NULL) |
| return fstatus; |
| |
| if ((cg->fd = open(cg->path, O_RDONLY)) < 0) { |
| error("error from open of cgroup '%s' : %m", cg->path); |
| return fstatus; |
| } |
| |
| if (flock(cg->fd, LOCK_EX) < 0) { |
| error("error locking cgroup '%s' : %m", cg->path); |
| close(cg->fd); |
| } else |
| fstatus = SLURM_SUCCESS; |
| |
| return fstatus; |
| } |
| |
| extern int common_cgroup_unlock(xcgroup_t *cg) |
| { |
| int fstatus = SLURM_ERROR; |
| |
| if (flock(cg->fd, LOCK_UN) < 0) { |
| error("error unlocking cgroup '%s' : %m", cg->path); |
| } else |
| fstatus = SLURM_SUCCESS; |
| |
| close(cg->fd); |
| return fstatus; |
| } |
| |
| extern bool common_cgroup_wait_pid_moved(xcgroup_t *cg, pid_t pid, |
| const char *cg_name) |
| { |
| pid_t *pids = NULL; |
| int npids = 0; |
| int cnt = 0; |
| int i = 0; |
| bool found; |
| |
| /* |
| * There is a delay in the cgroup system when moving the pid from one |
| * cgroup to another. This is usually short, but we need to wait to make |
| * sure the pid is out of the step cgroup or we will occur an error |
| * leaving the cgroup unable to be removed. |
| * |
| * The way it is implemented of checking whether the pid is in the |
| * cgroup or not is not 100% reliable. In slow cgroup subsystems there |
| * is the possibility that the internal kernel references are not |
| * cleaned up even if the pid is not in the cgroup.procs anymore, in |
| * that case we will receive an -EBUSY when trying to delete later the |
| * cgroup. This is explained here: |
| * https://bugs.schedmd.com/show_bug.cgi?id=8911#c18 |
| * |
| * So try to mitigate this issue in a best-effort by waiting |
| * MAX_MOVE_WAIT/10 milis when we find the pid, and retry 10 times. |
| */ |
| do { |
| cnt++; |
| common_cgroup_get_pids(cg, &pids, &npids); |
| found = false; |
| for (i = 0; i < npids; i++) { |
| if (pids[i] == pid) { |
| found = true; |
| poll(NULL, 0, MAX_MOVE_WAIT/10); |
| break; |
| } |
| } |
| xfree(pids); |
| } while (found && (cnt < 10)); |
| |
| if (!found) |
| log_flag(CGROUP, "Took %d checks before pid %d was removed from the %s cgroup.", |
| cnt, pid, cg_name); |
| else { |
| error("Pid %d is still in the %s cgroup after %d tries and %d ms.", |
| pid, cg_name, cnt, MAX_MOVE_WAIT); |
| return false; |
| } |
| return true; |
| } |