| /*****************************************************************************\ |
| * container.c - slurmstepd container handling |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #include "src/common/data.h" |
| #include "src/common/fd.h" |
| #include "src/common/oci_config.h" |
| #include "src/common/read_config.h" |
| #include "src/common/run_command.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/interfaces/serializer.h" |
| |
| #include "src/slurmd/slurmstepd/container.h" |
| #include "src/slurmd/slurmstepd/slurmstepd.h" |
| |
| /* |
| * We need a location inside of the container that is controlled by Slurm to |
| * pass the startup script and I/O handling for batch steps. |
| * |
| * /tmp/slurm was chosen since runc will always mount it private |
| */ |
| #define SLURM_CONTAINER_BATCH_SCRIPT "/tmp/slurm/startup" |
| #define SLURM_CONTAINER_ENV_FILE "environment" |
| #define SLURM_CONTAINER_STDIN "/tmp/slurm/stdin" |
| #define SLURM_CONTAINER_STDOUT "/tmp/slurm/stdout" |
| #define SLURM_CONTAINER_STDERR "/tmp/slurm/stderr" |
| |
| #define SLURM_CONTAINER_BATCH_STEP_PATTERN "oci-job%j-batch" |
| #define SLURM_CONTAINER_INTERACT_STEP_PATTERN "oci-job%j-interactive" |
| #define SLURM_CONTAINER_EXTERN_STEP_PATTERN "oci-job%j-extern" |
| #define SLURM_CONTAINER_STEP_PATTERN "oci-job%j-%s" |
| #define SLURM_CONTAINER_TASK_PATTERN "task-%t" |
| |
| oci_conf_t *oci_conf = NULL; |
| |
| static char *create_argv[] = { |
| "/bin/sh", "-c", "echo 'RunTimeCreate never configured in oci.conf'; exit 1", NULL }; |
| static char *delete_argv[] = { |
| "/bin/sh", "-c", "echo 'RunTimeDelete never configured in oci.conf'; exit 1", NULL }; |
| static char *kill_argv[] = { |
| "/bin/sh", "-c", "echo 'RunTimeKill never configured in oci.conf'; exit 1", NULL }; |
| static char *query_argv[] = { |
| "/bin/sh", "-c", "echo 'RunTimeQuery never configured in oci.conf'; exit 1", NULL }; |
| static char *run_argv[] = { |
| "/bin/sh", "-c", "echo 'RunTimeRun never configured in oci.conf'; exit 1", NULL }; |
| static char *start_argv[] = { |
| "/bin/sh", "-c", "echo 'RunTimeStart never configured in oci.conf'; exit 1", NULL }; |
| |
| static char *_get_config_path(stepd_step_rec_t *step); |
| static char *_generate_spooldir(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task); |
| static void _generate_patterns(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task); |
| |
| static void _dump_command_args(run_command_args_t *args, const char *caller) |
| { |
| if (get_log_level() < LOG_LEVEL_DEBUG3) |
| return; |
| |
| for (int i = 0; args->script_argv[i]; i++) |
| debug3("%s: command argv[%d]=%s", |
| caller, i, args->script_argv[i]); |
| } |
| |
/*
 * Append cmd_args to buffer as a space separated list of single quoted
 * shell words.
 * IN/OUT buffer - string being built
 * IN/OUT offset - append position tracked by xstrfmtcatat()
 * IN cmd_args - NULL terminated argument array (may be NULL)
 */
static void _pattern_argv(char **buffer, char **offset, char **cmd_args)
{
	for (char **arg = cmd_args; arg && *arg; arg++) {
		if (arg != cmd_args)
			xstrfmtcatat(*buffer, offset, " ");

		xstrfmtcatat(*buffer, offset, "'");

		/*
		 * POSIX 1003.1 2.2.2 only bans a single quote in single quotes
		 * for escaping
		 */

		for (char *c = *arg; *c != '\0'; c++) {
			if (*c == '\'') {
				/*
				 * Close the quoted word, emit the quote inside
				 * of double quotes and reopen the quoted word.
				 * The raw quote must not be emitted too or the
				 * resulting word would be unbalanced.
				 */
				xstrfmtcatat(*buffer, offset, "'\"'\"'");
			} else {
				xstrfmtcatat(*buffer, offset, "%c", *c);
			}
		}

		xstrfmtcatat(*buffer, offset, "'");
	}
}
| |
| static char *_generate_pattern(const char *pattern, stepd_step_rec_t *step, |
| int task_id, char **cmd_args) |
| { |
| step_container_t *c = step->container; |
| char *buffer = NULL, *offset = NULL; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| if (!pattern) |
| return NULL; |
| |
| xassert((task_id == -1) || (step->node_tasks >= task_id)); |
| |
| for (const char *b = pattern; *b; b++) { |
| if (*b == '%') { |
| switch (*(++b)) { |
| case '%': |
| xstrfmtcatat(buffer, &offset, "%s", "%"); |
| break; |
| case '@': |
| if (cmd_args) |
| _pattern_argv(&buffer, &offset, |
| cmd_args); |
| else |
| xstrfmtcatat(buffer, &offset, |
| "\"/bin/false\""); |
| break; |
| case 'b': |
| xstrfmtcatat(buffer, &offset, "%s", c->bundle); |
| break; |
| case 'e': |
| xstrfmtcatat(buffer, &offset, "%s/%s", |
| c->task_spool_dir, |
| SLURM_CONTAINER_ENV_FILE); |
| break; |
| case 'j': |
| xstrfmtcatat(buffer, &offset, "%u", |
| step->step_id.job_id); |
| break; |
| case 'm': |
| if (c->spool_dir) |
| xstrfmtcatat(buffer, &offset, "%s", |
| c->spool_dir); |
| else |
| xstrfmtcatat(buffer, &offset, "%s", |
| conf->spooldir); |
| break; |
| case 'n': |
| xstrfmtcatat(buffer, &offset, "%s", |
| step->node_name); |
| break; |
| case 'p': |
| if (task_id >= 0) |
| xstrfmtcatat(buffer, &offset, "%u", |
| step->task[task_id]->pid); |
| else |
| xstrfmtcatat(buffer, &offset, "%u", |
| INFINITE); |
| break; |
| case 'r': |
| xstrfmtcatat(buffer, &offset, "%s", c->rootfs); |
| break; |
| case 's': |
| xstrfmtcatat(buffer, &offset, "%u", |
| step->step_id.step_id); |
| break; |
| case 't': |
| xstrfmtcatat(buffer, &offset, "%d", task_id); |
| break; |
| case 'u': |
| xstrfmtcatat(buffer, &offset, "%s", |
| step->user_name); |
| break; |
| case 'U': |
| xstrfmtcatat(buffer, &offset, "%u", step->uid); |
| break; |
| default: |
| fatal("%s: unexpected replacement character: %c", |
| __func__, *b); |
| } |
| } else { |
| xstrfmtcatat(buffer, &offset, "%c", *b); |
| } |
| } |
| |
| return buffer; |
| } |
| |
| static int _mkdir(const char *pathname, mode_t mode, uid_t uid, gid_t gid) |
| { |
| int rc; |
| |
| if ((rc = mkdir(pathname, mode))) |
| rc = errno; |
| else { |
| /* |
| * Directory was successfully created so it needs user:group set |
| */ |
| if (chown(pathname, uid, gid) < 0) { |
| error("%s: chown(%s): %m", __func__, pathname); |
| return errno; |
| } |
| |
| if (chmod(pathname, mode) < 0) { |
| error("%s: chmod(%s, 750): %m", __func__, pathname); |
| return errno; |
| } |
| |
| debug("%s: created %s for %u:%u mode %o", |
| __func__, pathname, uid, gid, mode); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| if (rc == EEXIST) |
| return SLURM_SUCCESS; |
| |
| error("%s: unable to mkdir(%s): %s", |
| __func__, pathname, slurm_strerror(rc)); |
| |
| return rc; |
| } |
| |
| /* |
| * Create entire directory path while setting uid:gid for every newly created |
| * directory. |
| */ |
| static int _mkpath(const char *pathname, uid_t uid, gid_t gid) |
| { |
| static const mode_t mode = S_IRWXU | S_IRWXG; |
| int rc; |
| char *p, *dst; |
| |
| p = dst = xstrdup(pathname); |
| |
| while ((p = xstrchr(p + 1, '/'))) { |
| *p = '\0'; |
| |
| if ((rc = _mkdir(dst, mode, uid, gid))) |
| goto cleanup; |
| |
| *p = '/'; |
| } |
| |
| /* final directory */ |
| rc = _mkdir(dst, mode, uid, gid); |
| |
| cleanup: |
| xfree(dst); |
| return rc; |
| } |
| |
| static int _load_config(stepd_step_rec_t *step) |
| { |
| step_container_t *c = step->container; |
| int rc; |
| buf_t *buffer = NULL; |
| char *path = _get_config_path(step); |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| xassert(!c->config); |
| xassert(path); |
| |
| errno = SLURM_SUCCESS; |
| if (!(buffer = create_mmap_buf(path))) { |
| rc = errno; |
| error("%s: unable to open: %s", __func__, path); |
| goto cleanup; |
| } |
| |
| if ((rc = serialize_g_string_to_data(&c->config, get_buf_data(buffer), |
| remaining_buf(buffer), |
| MIME_TYPE_JSON))) { |
| error("%s: unable to parse %s: %s", |
| __func__, path, slurm_strerror(rc)); |
| } |
| |
| cleanup: |
| FREE_NULL_BUFFER(buffer); |
| xfree(path); |
| return rc; |
| } |
| |
| static int _write_config(const stepd_step_rec_t *step, const char *jconfig, |
| const char *out) |
| { |
| int outfd = -1; |
| int rc = SLURM_SUCCESS; |
| |
| outfd = open(jconfig, (O_WRONLY | O_CREAT | O_EXCL), 0600); |
| if (outfd < 0) { |
| error("%s: unable to open %s: %m", |
| __func__, jconfig); |
| goto rwfail; |
| } |
| |
| safe_write(outfd, out, strlen(out)); |
| |
| if (fsync_and_close(outfd, jconfig)) { |
| outfd = -1; |
| error("%s: failure sync and close of config: %s", |
| __func__, slurm_strerror(rc)); |
| goto rwfail; |
| } |
| |
| outfd = -1; |
| |
| if (chown(jconfig, (uid_t) -1, (gid_t) step->gid) < 0) { |
| error("%s: chown(%s): %m", __func__, jconfig); |
| goto rwfail; |
| } |
| |
| if (chmod(jconfig, 0750) < 0) { |
| error("%s: chmod(%s, 750): %m", __func__, jconfig); |
| goto rwfail; |
| } |
| |
| return rc; |
| |
| rwfail: |
| rc = errno; |
| |
| if (outfd >= 0) |
| close(outfd); |
| |
| return rc; |
| } |
| |
| static bool _match_env(const data_t *data, void *needle) |
| { |
| bool match; |
| const char *needle_name = needle; |
| char *name = NULL, *value; |
| |
| if (!data_get_string_converted(data, &name)) { |
| xfree(name); |
| return false; |
| } |
| |
| value = xstrstr(name, "="); |
| |
| if (value) |
| *value = '\0'; |
| |
| match = !xstrcmp(name, needle_name); |
| |
| xfree(name); |
| |
| return match; |
| } |
| |
| static int _modify_config(stepd_step_rec_t *step, stepd_step_task_info_t *task) |
| { |
| step_container_t *c = step->container; |
| int rc = SLURM_SUCCESS; |
| data_t *mnts, *env, *args; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| data_set_bool(data_define_dict_path(c->config, "/process/terminal/"), |
| (step->flags & LAUNCH_PTY)); |
| |
| /* point to correct rootfs */ |
| data_set_string(data_define_dict_path(c->config, "/root/path/"), |
| c->rootfs); |
| |
| mnts = data_define_dict_path(c->config, "/mounts/"); |
| if (data_get_type(mnts) != DATA_TYPE_LIST) |
| data_set_list(mnts); |
| |
| if (c->mount_spool_dir) { |
| data_t *mnt = data_set_dict(data_list_append(mnts)); |
| data_t *opt = data_set_list(data_key_set(mnt, "options")); |
| data_set_string(data_key_set(mnt, "destination"), |
| c->mount_spool_dir); |
| data_set_string(data_key_set(mnt, "type"), "none"); |
| data_set_string(data_key_set(mnt, "source"), c->spool_dir); |
| data_set_string(data_list_append(opt), "bind"); |
| } |
| |
| if (step->batch) { |
| data_t *mnt, *opt; |
| |
| /* |
| * /dev/null has very special handling in runc and we must make |
| * sure to not conflict with that: |
| * https://github.com/opencontainers/runc/blob/master/libcontainer/rootfs_linux.go#L610-L613 |
| */ |
| if (xstrcmp(step->task[0]->ifname, "/dev/null")) { |
| data_t *mnt = data_set_dict(data_list_append(mnts)); |
| data_t *opt = data_set_list( |
| data_key_set(mnt, "options")); |
| data_set_string(data_key_set(mnt, "destination"), |
| SLURM_CONTAINER_STDIN); |
| data_set_string(data_key_set(mnt, "type"), "none"); |
| data_set_string(data_key_set(mnt, "source"), |
| step->task[0]->ifname); |
| data_set_string(data_list_append(opt), "bind"); |
| } |
| |
| /* Bind mount stdout */ |
| if (xstrcmp(step->task[0]->ofname, "/dev/null")) { |
| data_t *mnt = data_set_dict(data_list_append(mnts)); |
| data_t *opt = data_set_list( |
| data_key_set(mnt, "options")); |
| |
| data_set_string(data_key_set(mnt, "destination"), |
| SLURM_CONTAINER_STDOUT); |
| data_set_string(data_key_set(mnt, "type"), "none"); |
| data_set_string(data_key_set(mnt, "source"), |
| step->task[0]->ofname); |
| data_set_string(data_list_append(opt), "bind"); |
| } |
| |
| /* Bind mount stderr */ |
| if (xstrcmp(step->task[0]->efname, "/dev/null")) { |
| data_t *mnt = data_set_dict(data_list_append(mnts)); |
| data_t *opt = data_set_list( |
| data_key_set(mnt, "options")); |
| |
| data_set_string(data_key_set(mnt, "destination"), |
| SLURM_CONTAINER_STDERR); |
| data_set_string(data_key_set(mnt, "type"), "none"); |
| data_set_string(data_key_set(mnt, "source"), |
| step->task[0]->efname); |
| data_set_string(data_list_append(opt), "bind"); |
| } |
| |
| /* |
| * Add bind mount of the batch script to allow |
| * the container to execute it directly |
| */ |
| mnt = data_set_dict(data_list_append(mnts)); |
| opt = data_set_list(data_key_set(mnt, "options")); |
| |
| data_set_string(data_key_set(mnt, "destination"), |
| SLURM_CONTAINER_BATCH_SCRIPT); |
| data_set_string(data_key_set(mnt, "type"), "none"); |
| data_set_string_own(data_key_set(mnt, "source"), |
| step->task[0]->argv[0]); |
| step->task[0]->argv[0] = xstrdup(SLURM_CONTAINER_BATCH_SCRIPT); |
| data_set_string(data_list_append(opt), "bind"); |
| data_set_string(data_list_append(opt), "ro"); |
| } |
| |
| if (oci_conf->disable_hooks) { |
| data_t *hooks = data_resolve_dict_path(c->config, "/hooks/"); |
| |
| for (int i = 0; oci_conf->disable_hooks[i]; i++) { |
| data_t *hook = data_key_get(hooks, |
| oci_conf->disable_hooks[i]); |
| |
| if (hook) { |
| int count = 0; |
| |
| if (data_get_type(hook) == DATA_TYPE_LIST) { |
| count = data_get_list_length(hook); |
| } else { |
| error("Invalid type for hook %s", |
| oci_conf->disable_hooks[i]); |
| } |
| |
| debug("%s: hook %s found and disabled %d entries", |
| __func__, oci_conf->disable_hooks[i], |
| count); |
| |
| data_key_unset(hooks, |
| oci_conf->disable_hooks[i]); |
| } else { |
| debug("%s: hook %s not found", |
| __func__, oci_conf->disable_hooks[i]); |
| } |
| } |
| } |
| |
| /* overwrite environ with the final step->env contents */ |
| env = data_set_list(data_define_dict_path(c->config, "/process/env/")); |
| for (char **ptr = step->env; *ptr; ptr++) { |
| data_t *entry; |
| char *name = xstrdup(*ptr); |
| char *value = xstrstr(name, "="); |
| |
| if (value) |
| *value = '\0'; |
| |
| if (!(entry = data_list_find_first(env, _match_env, name))) |
| entry = data_list_append(env); |
| |
| data_set_string(entry, *ptr); |
| xfree(name); |
| } |
| |
| args = data_define_dict_path(c->config, "/process/args/"); |
| data_set_list(args); |
| |
| /* move args to the config.json for runtime to handle */ |
| for (int i = 0; i < task->argc; i++) { |
| data_set_string_own(data_list_append(args), task->argv[i]); |
| task->argv[i] = NULL; |
| } |
| |
| return rc; |
| } |
| |
| static int _generate_container_paths(stepd_step_rec_t *step) |
| { |
| step_container_t *c = step->container; |
| int rc = SLURM_SUCCESS; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| if (c->config) { |
| if ((rc = data_retrieve_dict_path_string(c->config, |
| "/root/path/", |
| &c->rootfs))) { |
| debug("%s: unable to find /root/path/", __func__); |
| return rc; |
| } |
| |
| if (c->rootfs[0] != '/') { |
| /* always provide absolute path */ |
| char *t = NULL; |
| |
| xstrfmtcat(t, "%s/%s", c->bundle, c->rootfs); |
| SWAP(c->rootfs, t); |
| xfree(t); |
| } |
| } else { |
| /* default to bundle path without config.json */ |
| c->rootfs = xstrdup(c->bundle); |
| } |
| |
| /* generate step's spool_dir */ |
| if (oci_conf->mount_spool_dir) { |
| c->mount_spool_dir = |
| _generate_pattern(oci_conf->mount_spool_dir, step, |
| step->task[0]->id, NULL); |
| } else { |
| c->mount_spool_dir = _generate_spooldir(step, NULL); |
| } |
| |
| if (!c->spool_dir) |
| c->spool_dir = _generate_spooldir(step, NULL); |
| |
| if ((rc = _mkpath(c->spool_dir, step->uid, step->gid))) |
| fatal("%s: unable to create spool directory %s: %s", |
| __func__, c->spool_dir, slurm_strerror(rc)); |
| |
| return rc; |
| } |
| |
/*
 * Check if pattern contains the task id replacement (%t).
 * IN pattern - pattern to search
 * RET true if %t is present
 */
static bool _pattern_has_taskid(const char *pattern)
{
	const char *cur = pattern;

	while (*cur && (cur = xstrchr(cur, '%'))) {
		if (cur[1] == 't')
			return true;

		/*
		 * Step over an escaped "%%" as a pair so the second % is not
		 * taken as the start of a replacement.
		 */
		if ((cur[1] == '%') && (cur[2] != '\0'))
			cur += 2;
		else
			cur++;
	}

	return false;
}
| |
| static char *_generate_spooldir_pattern(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task) |
| { |
| const char *step_id = NULL; |
| const char *task_id = NULL; |
| char *parent = NULL, *pattern = NULL; |
| |
| if (oci_conf->container_path) { |
| parent = xstrdup(oci_conf->container_path); |
| } else if (conf->spooldir) { |
| #ifdef MULTIPLE_SLURMD |
| parent = slurm_conf_expand_slurmd_path(conf->spooldir, |
| conf->node_name, |
| conf->hostname); |
| #else /* !MULTIPLE_SLURMD */ |
| parent = xstrdup(conf->spooldir); |
| #endif /* !MULTIPLE_SLURMD */ |
| } else { |
| parent = xstrdup(DEFAULT_SPOOLDIR); |
| } |
| |
| if (step->step_id.step_id == SLURM_BATCH_SCRIPT) { |
| step_id = SLURM_CONTAINER_BATCH_STEP_PATTERN; |
| } else if (step->step_id.step_id == SLURM_INTERACTIVE_STEP) { |
| step_id = SLURM_CONTAINER_INTERACT_STEP_PATTERN; |
| } else if (step->step_id.step_id == SLURM_EXTERN_CONT) { |
| step_id = SLURM_CONTAINER_EXTERN_STEP_PATTERN; |
| } else { |
| step_id = SLURM_CONTAINER_STEP_PATTERN; |
| } |
| |
| if (task) |
| task_id = SLURM_CONTAINER_TASK_PATTERN; |
| else |
| task_id = ""; |
| |
| pattern = xstrdup_printf("%s%s%s%s%s", parent, (step_id[0] ? "/" : ""), |
| step_id, (task_id[0] ? "/" : ""), task_id); |
| xfree(parent); |
| return pattern; |
| } |
| |
| static char *_generate_spooldir(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task) |
| { |
| int id = -1; |
| char **argv = NULL, *path = NULL; |
| char *pattern = _generate_spooldir_pattern(step, task); |
| |
| if (task) { |
| id = task->id; |
| argv = task->argv; |
| } else { |
| char *start, *end, *next; |
| |
| /* trim pattern at first taskid replacement */ |
| |
| if (pattern[0] == '/') |
| next = pattern + 1; |
| else |
| next = pattern; |
| |
| while (next) { |
| char term; |
| |
| start = next; |
| |
| if (!(end = xstrchr(next, '/'))) { |
| end = start + strlen(start); |
| next = NULL; |
| } else { |
| next = end + 1; |
| } |
| |
| term = *end; |
| *end = '\0'; |
| |
| if (_pattern_has_taskid(start)) { |
| /* cut pattern at this directory */ |
| *start = '\0'; |
| *end = term; |
| break; |
| } |
| |
| *end = term; |
| } |
| } |
| |
| xassert((id != -1) || !xstrstr(pattern, "%t")); |
| path = _generate_pattern(pattern, step, id, argv); |
| debug3("%s: task:%d pattern:%s path:%s", __func__, id, pattern, path); |
| xfree(pattern); |
| |
| return path; |
| } |
| |
| extern void container_task_init(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task) |
| { |
| int rc; |
| step_container_t *c = step->container; |
| |
| if (!oci_conf) { |
| debug2("%s: ignoring step container when oci.conf not configured", |
| __func__); |
| return; |
| } |
| |
| if (c->task_spool_dir || !c->spool_dir) |
| fatal_abort("task spool dir already set or spool dir not set"); |
| |
| /* generate the task_spool_dir now we know the task */ |
| c->task_spool_dir = _generate_spooldir(step, task); |
| |
| if ((rc = _mkpath(c->task_spool_dir, step->uid, step->gid))) |
| fatal("%s: unable to create spool directory %s: %s", |
| __func__, c->task_spool_dir, slurm_strerror(rc)); |
| } |
| |
| static char *_get_config_path(stepd_step_rec_t *step) |
| { |
| step_container_t *c = step->container; |
| char *path = NULL; |
| |
| if (!step->container) |
| return NULL; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| /* OCI runtime spec requires config.json to be in root of bundle */ |
| xstrfmtcat(path, "%s/config.json", c->bundle); |
| |
| return path; |
| } |
| |
| static data_for_each_cmd_t _foreach_config_env(const data_t *data, void *arg) |
| { |
| int rc; |
| stepd_step_rec_t *step = arg; |
| char *name = NULL, *value; |
| |
| if (data_get_string_converted(data, &name)) |
| return DATA_FOR_EACH_FAIL; |
| |
| value = xstrstr(name, "="); |
| |
| if (value) { |
| *value = '\0'; |
| value++; |
| } |
| |
| rc = setenvf(&step->env, name, "%s", value); |
| |
| xfree(name); |
| |
| return (rc ? DATA_FOR_EACH_FAIL : DATA_FOR_EACH_CONT); |
| } |
| |
| static int _merge_step_config_env(stepd_step_rec_t *step) |
| { |
| step_container_t *c = step->container; |
| data_t *env = data_resolve_dict_path(c->config, "/process/env/"); |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| if (!env) |
| return SLURM_SUCCESS; |
| |
| xassert(!oci_conf->ignore_config_json); |
| |
| if (data_list_for_each_const(env, _foreach_config_env, step) < 0) |
| return ESLURM_DATA_CONV_FAILED; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int setup_container(stepd_step_rec_t *step) |
| { |
| step_container_t *c = step->container; |
| int rc; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| if ((rc = get_oci_conf(&oci_conf)) && (rc != ENOENT)) { |
| error("%s: error loading oci.conf: %s", |
| __func__, slurm_strerror(rc)); |
| return rc; |
| } |
| |
| if (!oci_conf) { |
| debug("%s: OCI Container not configured. Ignoring %pS requested container: %s", |
| __func__, step, c->bundle); |
| return ESLURM_CONTAINER_NOT_CONFIGURED; |
| } |
| |
| serializer_required(MIME_TYPE_JSON); |
| |
| if (!oci_conf->ignore_config_json) { |
| if ((rc = _load_config(step))) |
| goto error; |
| |
| if ((rc = _merge_step_config_env(step))) |
| goto error; |
| } |
| |
| if ((rc = _generate_container_paths(step))) |
| goto error; |
| |
| error: |
| if (rc) |
| error("%s: container setup failed: %s", |
| __func__, slurm_strerror(rc)); |
| |
| return rc; |
| } |
| |
| static data_t *_get_container_state() |
| { |
| int rc = SLURM_ERROR; |
| data_t *state = NULL; |
| char *out; |
| run_command_args_t run_command_args = { |
| .max_wait = -1, |
| .script_argv = query_argv, |
| .script_path = query_argv[0], |
| .script_type = "RunTimeQuery", |
| .status = &rc, |
| }; |
| |
| /* request container get deleted if known at all any more */ |
| _dump_command_args(&run_command_args, __func__); |
| out = run_command(&run_command_args); |
| debug("%s: RunTimeQuery rc:%u output:%s", __func__, rc, out); |
| |
| if (!out || !out[0] || rc) { |
| error("%s: RunTimeQuery failed rc:%u output:%s", __func__, rc, out); |
| return NULL; |
| } |
| |
| if (serialize_g_string_to_data(&state, out, strlen(out), |
| MIME_TYPE_JSON)) { |
| error("%s: unable to parse container state: %s", |
| __func__, out); |
| log_flag_hex(STEPS, out, strlen(out), |
| "unable to parse container state response"); |
| } |
| |
| xfree(out); |
| |
| return state; |
| } |
| |
| static char *_get_container_status() |
| { |
| char *state = NULL; |
| data_t *dstate = _get_container_state(); |
| |
| if (!dstate) |
| return NULL; |
| |
| if (data_retrieve_dict_path_string(dstate, "/status/", &state)) |
| debug("%s: unable to find /status", __func__); |
| |
| return state; |
| } |
| |
| static void _kill_container() |
| { |
| int stime = 2500; |
| char *status = NULL; |
| run_command_args_t run_command_args = { |
| .max_wait = -1, |
| }; |
| |
| if (!oci_conf->ignore_config_json && |
| !(status = _get_container_status())) { |
| debug("container already dead"); |
| } else if (!xstrcasecmp(status, "running")) { |
| run_command_args.script_argv = kill_argv; |
| run_command_args.script_path = kill_argv[0]; |
| run_command_args.script_type = "RunTimeKill"; |
| |
| for (int t = 0; t < 10; t++) { |
| char *out; |
| int kill_status = SLURM_ERROR; |
| run_command_args.status = &kill_status; |
| |
| xfree(status); |
| status = _get_container_status(); |
| |
| if (!oci_conf->ignore_config_json && |
| (!status || !xstrcasecmp(status, "stopped"))) |
| break; |
| |
| out = run_command(&run_command_args); |
| debug("%s: RunTimeKill rc:%u output:%s", |
| __func__, kill_status, out); |
| xfree(out); |
| |
| if (oci_conf->ignore_config_json) |
| break; |
| |
| /* |
| * use exp backoff up to 1s to wait for the container to |
| * cleanup. |
| * |
| * OCI runtime doesn't provide any way but to poll to see |
| * if the container has been squashed |
| */ |
| debug("%s: sleeping %dusec to query state again", |
| __func__, stime); |
| usleep(stime); |
| |
| if (stime > 1000000) |
| stime = 1000000; |
| else |
| stime *= 2; |
| } |
| } |
| |
| if (status) { |
| int delete_status = SLURM_ERROR; |
| char *out; |
| |
| /* request container get deleted if known at all any more */ |
| run_command_args.script_argv = delete_argv; |
| run_command_args.script_path = delete_argv[0]; |
| run_command_args.script_type = "RunTimeDelete"; |
| run_command_args.status = &delete_status; |
| _dump_command_args(&run_command_args, __func__); |
| out = run_command(&run_command_args); |
| debug("%s: RunTimeDelete rc:%u output:%s", |
| __func__, delete_status, out); |
| xfree(out); |
| xfree(status); |
| } |
| } |
| |
| static void _run(stepd_step_rec_t *step, stepd_step_task_info_t *task) |
| { |
| debug3("%s: executing: %s", __func__, run_argv[2]); |
| if (oci_conf->create_env_file) |
| execve(run_argv[0], run_argv, step->env); |
| else |
| execv(run_argv[0], run_argv); |
| fatal("execv(%s) failed: %m", run_argv[0]); |
| } |
| |
| static void _create_start(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task) |
| { |
| int stime = 250, rc = SLURM_ERROR; |
| char *out; |
| run_command_args_t run_command_args = { |
| .max_wait = -1, |
| .status = &rc, |
| }; |
| |
| if (oci_conf->ignore_config_json) |
| fatal("IgnoreFileConfigJson=true and RunTimeStart are mutually exclusive"); |
| |
| run_command_args.script_argv = create_argv; |
| run_command_args.script_path = create_argv[0]; |
| run_command_args.script_type = "RunTimeCreate"; |
| _dump_command_args(&run_command_args, __func__); |
| out = run_command(&run_command_args); |
| debug("%s: RunTimeCreate rc:%u output:%s", __func__, rc, out); |
| xfree(out); |
| |
| /* have to wait here until state finds the container or fail out */ |
| for (int t = 0; t <= 10; t++) { |
| char *status = _get_container_status(); |
| |
| if (!status) { |
| if (t == 10) |
| fatal("container never started"); |
| |
| /* state called before create done */ |
| if (stime > 1000000) |
| stime = 1000000; |
| else |
| stime *= 2; |
| |
| usleep(stime); |
| continue; |
| } |
| |
| debug("container in %s state", status); |
| |
| if (!xstrcasecmp(status, "creating")) { |
| /* wait for creation to finish */ |
| xfree(status); |
| usleep(250); |
| } else if (!xstrcasecmp(status, "created")) { |
| xfree(status); |
| break; |
| } else { |
| fatal("%s: unexpected container status: %s", |
| __func__, status); |
| } |
| } |
| |
| run_command_args.script_argv = start_argv; |
| run_command_args.script_path = start_argv[0]; |
| run_command_args.script_type = "RunTimeStart"; |
| _dump_command_args(&run_command_args, __func__); |
| out = run_command(&run_command_args); |
| debug("%s: RunTimeStart rc:%u output:%s", __func__, rc, out); |
| xfree(out); |
| |
| /* |
| * the initial PID is now dead but the container could still be running |
| * but it likely is running outside of slurmstepd's process group |
| */ |
| |
| stime = 2500; |
| while (true) { |
| char *status = _get_container_status(); |
| |
| if (!status || xstrcasecmp(status, "running")) { |
| debug("container no longer running: %s", status); |
| xfree(status); |
| break; |
| } |
| |
| xfree(status); |
| |
| /* increase wait times exp */ |
| if (stime > 1000000) |
| stime = 1000000; |
| else |
| stime *= 2; |
| |
| usleep(stime); |
| } |
| |
| /* |
| * since the parent process has exited, kill off the container to kill |
| * any orphan processes |
| */ |
| _kill_container(); |
| |
| _exit(rc); |
| } |
| |
| static void _generate_patterns(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task) |
| { |
| char *gen; |
| int id = -1; |
| char **argv = NULL; |
| |
| debug2("%s: %ps TaskId=%d", |
| __func__, &step->step_id, (task ? task->id : -1)); |
| |
| if (task) { |
| id = task->id; |
| argv = task->argv; |
| } |
| |
| gen = _generate_pattern(oci_conf->runtime_create, step, id, argv); |
| if (gen) { |
| static bool set = false; |
| if (set) |
| xfree(create_argv[2]); |
| create_argv[2] = gen; |
| set = true; |
| } |
| |
| gen = _generate_pattern(oci_conf->runtime_delete, step, id, argv); |
| if (gen) { |
| static bool set = false; |
| if (set) |
| xfree(delete_argv[2]); |
| delete_argv[2] = gen; |
| set = true; |
| } |
| |
| gen = _generate_pattern(oci_conf->runtime_kill, step, id, argv); |
| if (gen) { |
| static bool set = false; |
| if (set) |
| xfree(kill_argv[2]); |
| kill_argv[2] = gen; |
| set = true; |
| } |
| |
| gen = _generate_pattern(oci_conf->runtime_query, step, id, argv); |
| if (gen) { |
| static bool set = false; |
| if (set) |
| xfree(query_argv[2]); |
| query_argv[2] = gen; |
| set = true; |
| } |
| |
| gen = _generate_pattern(oci_conf->runtime_run, step, id, argv); |
| if (gen) { |
| static bool set = false; |
| if (set) |
| xfree(run_argv[2]); |
| run_argv[2] = gen; |
| set = true; |
| } |
| |
| gen = _generate_pattern(oci_conf->runtime_start, step, id, argv); |
| if (gen) { |
| static bool set = false; |
| if (set) |
| xfree(start_argv[2]); |
| start_argv[2] = gen; |
| set = true; |
| } |
| } |
| |
| extern void container_run(stepd_step_rec_t *step, |
| stepd_step_task_info_t *task) |
| { |
| step_container_t *c = step->container; |
| int rc; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| if (!oci_conf) { |
| debug("%s: OCI Container not configured. Ignoring %pS requested container: %s", |
| __func__, step, c->bundle); |
| return; |
| } |
| |
| if (!c->spool_dir || !c->task_spool_dir) |
| fatal_abort("spool directory not populated"); |
| |
| if (oci_conf->env_exclude_set) { |
| char **env = env_array_exclude((const char **) step->env, |
| &oci_conf->env_exclude); |
| #ifdef MEMORY_LEAK_DEBUG |
| env_array_free(step->env); |
| #endif |
| step->env = env; |
| } |
| |
| if (c->config) { |
| int rc; |
| char *out = NULL; |
| char *jconfig = NULL; |
| |
| /* create new config.json in spooldir */ |
| xstrfmtcat(jconfig, "%s/config.json", c->task_spool_dir); |
| |
| if ((rc = _modify_config(step, task))) |
| fatal("%s: configuring container failed: %s", |
| __func__, slurm_strerror(rc)); |
| |
| if ((rc = serialize_g_data_to_string(&out, NULL, c->config, |
| MIME_TYPE_JSON, |
| SER_FLAGS_NONE))) { |
| fatal("%s: serialization of config failed: %s", |
| __func__, slurm_strerror(rc)); |
| } |
| |
| FREE_NULL_DATA(c->config); |
| |
| if ((rc = _write_config(step, jconfig, out))) |
| fatal("%s: unable to write %s: %s", |
| __func__, jconfig, slurm_strerror(rc)); |
| |
| debug("%s: wrote %s", __func__, jconfig); |
| |
| /* |
| * Swap bundle path to spool directory to ensure runtime uses |
| * correct config.json |
| */ |
| xfree(c->bundle); |
| c->bundle = xstrdup(c->task_spool_dir); |
| |
| xfree(out); |
| xfree(jconfig); |
| } |
| |
| if (oci_conf->create_env_file) { |
| char *envfile = NULL; |
| bool nl = (oci_conf->create_env_file == |
| NEWLINE_TERMINATED_ENV_FILE); |
| |
| /* keep _generate_pattern() in sync with this path */ |
| xstrfmtcat(envfile, "%s/%s", c->task_spool_dir, |
| SLURM_CONTAINER_ENV_FILE); |
| |
| if ((rc = env_array_to_file(envfile, (const char **) step->env, |
| nl))) |
| fatal("%s: unable to write %s: %s", |
| __func__, envfile, slurm_strerror(rc)); |
| |
| if (chown(envfile, step->uid, step->gid) < 0) |
| fatal("%s: chown(%s): %m", __func__, envfile); |
| |
| if (!rc && chmod(envfile, 0750) < 0) |
| error("%s: chmod(%s, 750): %m", __func__, envfile); |
| |
| debug("%s: wrote %s", __func__, envfile); |
| |
| xfree(envfile); |
| } |
| |
| if (oci_conf->runtime_env_exclude_set) { |
| extern char **environ; |
| char **env = env_array_exclude((const char **) environ, |
| &oci_conf->runtime_env_exclude); |
| |
| #ifdef MEMORY_LEAK_DEBUG |
| env_unset_environment(); |
| #endif |
| environ = env; |
| } |
| |
| debug4("%s: setting cwd from %s to task spooldir: %s", |
| __func__, step->cwd, c->task_spool_dir); |
| xfree(step->cwd); |
| step->cwd = xstrdup(c->task_spool_dir); |
| |
| _generate_patterns(step, task); |
| |
| if (oci_conf->runtime_run) |
| _run(step, task); |
| else |
| _create_start(step, task); |
| } |
| |
| extern void cleanup_container(stepd_step_rec_t *step) |
| { |
| step_container_t *c = step->container; |
| |
| xassert(c->magic == STEP_CONTAINER_MAGIC); |
| |
| if (!oci_conf) { |
| debug("%s: OCI Container not configured. Ignoring %pS requested container: %s", |
| __func__, step, c->bundle); |
| return; |
| } |
| |
| /* cleanup may be called without ever setting up container */ |
| |
| _generate_patterns(step, NULL); |
| _kill_container(); |
| |
| if (oci_conf->disable_cleanup) |
| goto done; |
| |
| if (!c->spool_dir) |
| c->spool_dir = _generate_spooldir(step, NULL); |
| |
| if (step->node_tasks > 0) { |
| /* clear every config.json and task dir */ |
| for (int i = 0; i < step->node_tasks; i++) { |
| xfree(c->task_spool_dir); |
| c->task_spool_dir = |
| _generate_spooldir(step, step->task[i]); |
| |
| _generate_patterns(step, step->task[i]); |
| |
| if (!oci_conf->ignore_config_json) { |
| char *jconfig = NULL; |
| |
| xstrfmtcat(jconfig, "%s/config.json", |
| c->task_spool_dir); |
| |
| if ((unlink(jconfig) < 0) && (errno != ENOENT)) |
| error("unlink(%s): %m", jconfig); |
| xfree(jconfig); |
| } |
| |
| if (oci_conf->create_env_file) { |
| char *envfile = NULL; |
| |
| xstrfmtcat(envfile, "%s/%s", c->task_spool_dir, |
| SLURM_CONTAINER_ENV_FILE); |
| |
| if (unlink(envfile) && (errno != ENOENT)) |
| error("unlink(%s): %m", envfile); |
| |
| xfree(envfile); |
| } |
| |
| if (rmdir(c->task_spool_dir) && (errno != ENOENT)) |
| error("rmdir(%s): %m", c->task_spool_dir); |
| } |
| |
| xfree(c->task_spool_dir); |
| } |
| |
| if (rmdir(c->spool_dir) && (errno != ENOENT)) |
| error("rmdir(%s): %m", c->spool_dir); |
| |
| done: |
| FREE_NULL_OCI_CONF(oci_conf); |
| } |