| /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
| |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <poll.h> |
| #include <sys/eventfd.h> |
| #include <sys/ioctl.h> |
| #include <sys/mman.h> |
| #include <sys/mount.h> |
| #include <sys/personality.h> |
| #include <sys/prctl.h> |
| #include <sys/shm.h> |
| #include <sys/types.h> |
| #include <sys/un.h> |
| #include <unistd.h> |
| #include <utmpx.h> |
| |
| #if HAVE_PAM |
| #include <security/pam_appl.h> |
| #endif |
| |
| #if HAVE_SELINUX |
| #include <selinux/selinux.h> |
| #endif |
| |
| #if HAVE_SECCOMP |
| #include <seccomp.h> |
| #endif |
| |
| #if HAVE_APPARMOR |
| #include <sys/apparmor.h> |
| #endif |
| |
| #include "sd-messages.h" |
| |
| #include "acl-util.h" |
| #include "af-list.h" |
| #include "alloc-util.h" |
| #if HAVE_APPARMOR |
| #include "apparmor-util.h" |
| #endif |
| #include "async.h" |
| #include "barrier.h" |
| #include "cap-list.h" |
| #include "capability-util.h" |
| #include "cgroup-setup.h" |
| #include "chown-recursive.h" |
| #include "cpu-set-util.h" |
| #include "data-fd-util.h" |
| #include "def.h" |
| #include "env-file.h" |
| #include "env-util.h" |
| #include "errno-list.h" |
| #include "escape.h" |
| #include "execute.h" |
| #include "exit-status.h" |
| #include "fd-util.h" |
| #include "fileio.h" |
| #include "format-util.h" |
| #include "fs-util.h" |
| #include "glob-util.h" |
| #include "hexdecoct.h" |
| #include "io-util.h" |
| #include "ioprio.h" |
| #include "label.h" |
| #include "log.h" |
| #include "macro.h" |
| #include "manager.h" |
| #include "manager-dump.h" |
| #include "memory-util.h" |
| #include "missing_fs.h" |
| #include "mkdir.h" |
| #include "mount-util.h" |
| #include "mountpoint-util.h" |
| #include "namespace.h" |
| #include "parse-util.h" |
| #include "path-util.h" |
| #include "process-util.h" |
| #include "random-util.h" |
| #include "rlimit-util.h" |
| #include "rm-rf.h" |
| #if HAVE_SECCOMP |
| #include "seccomp-util.h" |
| #endif |
| #include "securebits-util.h" |
| #include "selinux-util.h" |
| #include "signal-util.h" |
| #include "smack-util.h" |
| #include "socket-util.h" |
| #include "special.h" |
| #include "stat-util.h" |
| #include "string-table.h" |
| #include "string-util.h" |
| #include "strv.h" |
| #include "syslog-util.h" |
| #include "terminal-util.h" |
| #include "tmpfile-util.h" |
| #include "umask-util.h" |
| #include "unit-serialize.h" |
| #include "user-util.h" |
| #include "utmp-wtmp.h" |
| |
| #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) |
| #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) |
| |
| #define SNDBUF_SIZE (8*1024*1024) |
| |
/* Rearranges the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. so that
 * they are packed contiguously right after stdin/stdout/stderr.
 *
 * The array is modified in place: every entry is replaced by the fd number it was moved to.
 *
 * Returns 0 on success, or a negative errno-style error if duplicating an fd failed. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, got;

                        /* This one already sits where it belongs */
                        if (fds[idx] == want)
                                continue;

                        got = fcntl(fds[idx], F_DUPFD, want);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* F_DUPFD returns the lowest free fd >= want. If we got something else the
                         * target slot is still occupied, so remember the first such index and do
                         * another pass starting there. */
                        if (got != want && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
| |
/* Adjusts the file descriptor flags of the fds we are about to hand to a child process.
 *
 * The array holds n_socket_fds entries followed by n_storage_fds entries. O_NONBLOCK is set to
 * 'nonblock' on the leading socket entries only; FD_CLOEXEC is turned off on all entries.
 *
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK only matters for the socket activation fds at the front of the array */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to survive exec(), hence FD_CLOEXEC always goes away */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
| |
| static const char *exec_context_tty_path(const ExecContext *context) { |
| assert(context); |
| |
| if (context->stdio_as_fds) |
| return NULL; |
| |
| if (context->tty_path) |
| return context->tty_path; |
| |
| return "/dev/console"; |
| } |
| |
| static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) { |
| const char *path; |
| |
| assert(context); |
| |
| path = exec_context_tty_path(context); |
| |
| if (context->tty_vhangup) { |
| if (p && p->stdin_fd >= 0) |
| (void) terminal_vhangup_fd(p->stdin_fd); |
| else if (path) |
| (void) terminal_vhangup(path); |
| } |
| |
| if (context->tty_reset) { |
| if (p && p->stdin_fd >= 0) |
| (void) reset_terminal_fd(p->stdin_fd, true); |
| else if (path) |
| (void) reset_terminal(path); |
| } |
| |
| if (context->tty_vt_disallocate && path) |
| (void) vt_disallocate(path); |
| } |
| |
| static bool is_terminal_input(ExecInput i) { |
| return IN_SET(i, |
| EXEC_INPUT_TTY, |
| EXEC_INPUT_TTY_FORCE, |
| EXEC_INPUT_TTY_FAIL); |
| } |
| |
| static bool is_terminal_output(ExecOutput o) { |
| return IN_SET(o, |
| EXEC_OUTPUT_TTY, |
| EXEC_OUTPUT_KMSG_AND_CONSOLE, |
| EXEC_OUTPUT_JOURNAL_AND_CONSOLE); |
| } |
| |
| static bool is_kmsg_output(ExecOutput o) { |
| return IN_SET(o, |
| EXEC_OUTPUT_KMSG, |
| EXEC_OUTPUT_KMSG_AND_CONSOLE); |
| } |
| |
| static bool exec_context_needs_term(const ExecContext *c) { |
| assert(c); |
| |
| /* Return true if the execution context suggests we should set $TERM to something useful. */ |
| |
| if (is_terminal_input(c->std_input)) |
| return true; |
| |
| if (is_terminal_output(c->std_output)) |
| return true; |
| |
| if (is_terminal_output(c->std_error)) |
| return true; |
| |
| return !!c->tty_path; |
| } |
| |
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and hands the resulting fd to
 * move_fd() so that it ends up as fd number 'nfd'. Returns -errno if the open fails, otherwise
 * whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags | O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
| |
| static int connect_journal_socket( |
| int fd, |
| const char *log_namespace, |
| uid_t uid, |
| gid_t gid) { |
| |
| union sockaddr_union sa; |
| socklen_t sa_len; |
| uid_t olduid = UID_INVALID; |
| gid_t oldgid = GID_INVALID; |
| const char *j; |
| int r; |
| |
| j = log_namespace ? |
| strjoina("/run/systemd/journal.", log_namespace, "/stdout") : |
| "/run/systemd/journal/stdout"; |
| r = sockaddr_un_set_path(&sa.un, j); |
| if (r < 0) |
| return r; |
| sa_len = r; |
| |
| if (gid_is_valid(gid)) { |
| oldgid = getgid(); |
| |
| if (setegid(gid) < 0) |
| return -errno; |
| } |
| |
| if (uid_is_valid(uid)) { |
| olduid = getuid(); |
| |
| if (seteuid(uid) < 0) { |
| r = -errno; |
| goto restore_gid; |
| } |
| } |
| |
| r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0; |
| |
| /* If we fail to restore the uid or gid, things will likely |
| fail later on. This should only happen if an LSM interferes. */ |
| |
| if (uid_is_valid(uid)) |
| (void) seteuid(olduid); |
| |
| restore_gid: |
| if (gid_is_valid(gid)) |
| (void) setegid(oldgid); |
| |
| return r; |
| } |
| |
| static int connect_logger_as( |
| const Unit *unit, |
| const ExecContext *context, |
| const ExecParameters *params, |
| ExecOutput output, |
| const char *ident, |
| int nfd, |
| uid_t uid, |
| gid_t gid) { |
| |
| _cleanup_close_ int fd = -1; |
| int r; |
| |
| assert(context); |
| assert(params); |
| assert(output < _EXEC_OUTPUT_MAX); |
| assert(ident); |
| assert(nfd >= 0); |
| |
| fd = socket(AF_UNIX, SOCK_STREAM, 0); |
| if (fd < 0) |
| return -errno; |
| |
| r = connect_journal_socket(fd, context->log_namespace, uid, gid); |
| if (r < 0) |
| return r; |
| |
| if (shutdown(fd, SHUT_RD) < 0) |
| return -errno; |
| |
| (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); |
| |
| if (dprintf(fd, |
| "%s\n" |
| "%s\n" |
| "%i\n" |
| "%i\n" |
| "%i\n" |
| "%i\n" |
| "%i\n", |
| context->syslog_identifier ?: ident, |
| params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "", |
| context->syslog_priority, |
| !!context->syslog_level_prefix, |
| false, |
| is_kmsg_output(output), |
| is_terminal_output(output)) < 0) |
| return -errno; |
| |
| return move_fd(TAKE_FD(fd), nfd, false); |
| } |
| |
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and passes the fd to move_fd()
 * so that it ends up as fd number 'nfd'. Propagates the error of open_terminal() on failure. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
| |
| static int acquire_path(const char *path, int flags, mode_t mode) { |
| union sockaddr_union sa; |
| socklen_t sa_len; |
| _cleanup_close_ int fd = -1; |
| int r; |
| |
| assert(path); |
| |
| if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR)) |
| flags |= O_CREAT; |
| |
| fd = open(path, flags|O_NOCTTY, mode); |
| if (fd >= 0) |
| return TAKE_FD(fd); |
| |
| if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */ |
| return -errno; |
| |
| /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */ |
| |
| r = sockaddr_un_set_path(&sa.un, path); |
| if (r < 0) |
| return r == -EINVAL ? -ENXIO : r; |
| sa_len = r; |
| |
| fd = socket(AF_UNIX, SOCK_STREAM, 0); |
| if (fd < 0) |
| return -errno; |
| |
| if (connect(fd, &sa.sa, sa_len) < 0) |
| return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have |
| * indication that this wasn't an AF_UNIX socket after all */ |
| |
| if ((flags & O_ACCMODE) == O_RDONLY) |
| r = shutdown(fd, SHUT_WR); |
| else if ((flags & O_ACCMODE) == O_WRONLY) |
| r = shutdown(fd, SHUT_RD); |
| else |
| r = 0; |
| if (r < 0) |
| return -errno; |
| |
| return TAKE_FD(fd); |
| } |
| |
| static int fixup_input( |
| const ExecContext *context, |
| int socket_fd, |
| bool apply_tty_stdin) { |
| |
| ExecInput std_input; |
| |
| assert(context); |
| |
| std_input = context->std_input; |
| |
| if (is_terminal_input(std_input) && !apply_tty_stdin) |
| return EXEC_INPUT_NULL; |
| |
| if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0) |
| return EXEC_INPUT_NULL; |
| |
| if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0) |
| return EXEC_INPUT_NULL; |
| |
| return std_input; |
| } |
| |
| static int fixup_output(ExecOutput output, int socket_fd) { |
| |
| if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0) |
| return EXEC_OUTPUT_INHERIT; |
| |
| return output; |
| } |
| |
| static int setup_input( |
| const ExecContext *context, |
| const ExecParameters *params, |
| int socket_fd, |
| const int named_iofds[static 3]) { |
| |
| ExecInput i; |
| |
| assert(context); |
| assert(params); |
| assert(named_iofds); |
| |
| if (params->stdin_fd >= 0) { |
| if (dup2(params->stdin_fd, STDIN_FILENO) < 0) |
| return -errno; |
| |
| /* Try to make this the controlling tty, if it is a tty, and reset it */ |
| if (isatty(STDIN_FILENO)) { |
| (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE); |
| (void) reset_terminal_fd(STDIN_FILENO, true); |
| } |
| |
| return STDIN_FILENO; |
| } |
| |
| i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); |
| |
| switch (i) { |
| |
| case EXEC_INPUT_NULL: |
| return open_null_as(O_RDONLY, STDIN_FILENO); |
| |
| case EXEC_INPUT_TTY: |
| case EXEC_INPUT_TTY_FORCE: |
| case EXEC_INPUT_TTY_FAIL: { |
| int fd; |
| |
| fd = acquire_terminal(exec_context_tty_path(context), |
| i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY : |
| i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE : |
| ACQUIRE_TERMINAL_WAIT, |
| USEC_INFINITY); |
| if (fd < 0) |
| return fd; |
| |
| return move_fd(fd, STDIN_FILENO, false); |
| } |
| |
| case EXEC_INPUT_SOCKET: |
| assert(socket_fd >= 0); |
| |
| return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; |
| |
| case EXEC_INPUT_NAMED_FD: |
| assert(named_iofds[STDIN_FILENO] >= 0); |
| |
| (void) fd_nonblock(named_iofds[STDIN_FILENO], false); |
| return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; |
| |
| case EXEC_INPUT_DATA: { |
| int fd; |
| |
| fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0); |
| if (fd < 0) |
| return fd; |
| |
| return move_fd(fd, STDIN_FILENO, false); |
| } |
| |
| case EXEC_INPUT_FILE: { |
| bool rw; |
| int fd; |
| |
| assert(context->stdio_file[STDIN_FILENO]); |
| |
| rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) || |
| (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO])); |
| |
| fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask); |
| if (fd < 0) |
| return fd; |
| |
| return move_fd(fd, STDIN_FILENO, false); |
| } |
| |
| default: |
| assert_not_reached("Unknown input type"); |
| } |
| } |
| |
| static bool can_inherit_stderr_from_stdout( |
| const ExecContext *context, |
| ExecOutput o, |
| ExecOutput e) { |
| |
| assert(context); |
| |
| /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the |
| * stderr fd */ |
| |
| if (e == EXEC_OUTPUT_INHERIT) |
| return true; |
| if (e != o) |
| return false; |
| |
| if (e == EXEC_OUTPUT_NAMED_FD) |
| return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]); |
| |
| if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE)) |
| return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]); |
| |
| return true; |
| } |
| |
| static int setup_output( |
| const Unit *unit, |
| const ExecContext *context, |
| const ExecParameters *params, |
| int fileno, |
| int socket_fd, |
| const int named_iofds[static 3], |
| const char *ident, |
| uid_t uid, |
| gid_t gid, |
| dev_t *journal_stream_dev, |
| ino_t *journal_stream_ino) { |
| |
| ExecOutput o; |
| ExecInput i; |
| int r; |
| |
| assert(unit); |
| assert(context); |
| assert(params); |
| assert(ident); |
| assert(journal_stream_dev); |
| assert(journal_stream_ino); |
| |
| if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) { |
| |
| if (dup2(params->stdout_fd, STDOUT_FILENO) < 0) |
| return -errno; |
| |
| return STDOUT_FILENO; |
| } |
| |
| if (fileno == STDERR_FILENO && params->stderr_fd >= 0) { |
| if (dup2(params->stderr_fd, STDERR_FILENO) < 0) |
| return -errno; |
| |
| return STDERR_FILENO; |
| } |
| |
| i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); |
| o = fixup_output(context->std_output, socket_fd); |
| |
| if (fileno == STDERR_FILENO) { |
| ExecOutput e; |
| e = fixup_output(context->std_error, socket_fd); |
| |
| /* This expects the input and output are already set up */ |
| |
| /* Don't change the stderr file descriptor if we inherit all |
| * the way and are not on a tty */ |
| if (e == EXEC_OUTPUT_INHERIT && |
| o == EXEC_OUTPUT_INHERIT && |
| i == EXEC_INPUT_NULL && |
| !is_terminal_input(context->std_input) && |
| getppid() != 1) |
| return fileno; |
| |
| /* Duplicate from stdout if possible */ |
| if (can_inherit_stderr_from_stdout(context, o, e)) |
| return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno; |
| |
| o = e; |
| |
| } else if (o == EXEC_OUTPUT_INHERIT) { |
| /* If input got downgraded, inherit the original value */ |
| if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input)) |
| return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); |
| |
| /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */ |
| if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA)) |
| return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; |
| |
| /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */ |
| if (getppid() != 1) |
| return fileno; |
| |
| /* We need to open /dev/null here anew, to get the right access mode. */ |
| return open_null_as(O_WRONLY, fileno); |
| } |
| |
| switch (o) { |
| |
| case EXEC_OUTPUT_NULL: |
| return open_null_as(O_WRONLY, fileno); |
| |
| case EXEC_OUTPUT_TTY: |
| if (is_terminal_input(i)) |
| return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; |
| |
| /* We don't reset the terminal if this is just about output */ |
| return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno); |
| |
| case EXEC_OUTPUT_KMSG: |
| case EXEC_OUTPUT_KMSG_AND_CONSOLE: |
| case EXEC_OUTPUT_JOURNAL: |
| case EXEC_OUTPUT_JOURNAL_AND_CONSOLE: |
| r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid); |
| if (r < 0) { |
| log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", |
| fileno == STDOUT_FILENO ? "stdout" : "stderr"); |
| r = open_null_as(O_WRONLY, fileno); |
| } else { |
| struct stat st; |
| |
| /* If we connected this fd to the journal via a stream, patch the device/inode into the passed |
| * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits |
| * services to detect whether they are connected to the journal or not. |
| * |
| * If both stdout and stderr are connected to a stream then let's make sure to store the data |
| * about STDERR as that's usually the best way to do logging. */ |
| |
| if (fstat(fileno, &st) >= 0 && |
| (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) { |
| *journal_stream_dev = st.st_dev; |
| *journal_stream_ino = st.st_ino; |
| } |
| } |
| return r; |
| |
| case EXEC_OUTPUT_SOCKET: |
| assert(socket_fd >= 0); |
| |
| return dup2(socket_fd, fileno) < 0 ? -errno : fileno; |
| |
| case EXEC_OUTPUT_NAMED_FD: |
| assert(named_iofds[fileno] >= 0); |
| |
| (void) fd_nonblock(named_iofds[fileno], false); |
| return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno; |
| |
| case EXEC_OUTPUT_FILE: |
| case EXEC_OUTPUT_FILE_APPEND: |
| case EXEC_OUTPUT_FILE_TRUNCATE: { |
| bool rw; |
| int fd, flags; |
| |
| assert(context->stdio_file[fileno]); |
| |
| rw = context->std_input == EXEC_INPUT_FILE && |
| streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]); |
| |
| if (rw) |
| return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; |
| |
| flags = O_WRONLY; |
| if (o == EXEC_OUTPUT_FILE_APPEND) |
| flags |= O_APPEND; |
| else if (o == EXEC_OUTPUT_FILE_TRUNCATE) |
| flags |= O_TRUNC; |
| |
| fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask); |
| if (fd < 0) |
| return fd; |
| |
| return move_fd(fd, fileno, 0); |
| } |
| |
| default: |
| assert_not_reached("Unknown error type"); |
| } |
| } |
| |
| static int chown_terminal(int fd, uid_t uid) { |
| int r; |
| |
| assert(fd >= 0); |
| |
| /* Before we chown/chmod the TTY, let's ensure this is actually a tty */ |
| if (isatty(fd) < 1) { |
| if (IN_SET(errno, EINVAL, ENOTTY)) |
| return 0; /* not a tty */ |
| |
| return -errno; |
| } |
| |
| /* This might fail. What matters are the results. */ |
| r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID); |
| if (r < 0) |
| return r; |
| |
| return 1; |
| } |
| |
| static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) { |
| _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1; |
| int r; |
| |
| assert(_saved_stdin); |
| assert(_saved_stdout); |
| |
| saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3); |
| if (saved_stdin < 0) |
| return -errno; |
| |
| saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3); |
| if (saved_stdout < 0) |
| return -errno; |
| |
| fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC); |
| if (fd < 0) |
| return fd; |
| |
| r = chown_terminal(fd, getuid()); |
| if (r < 0) |
| return r; |
| |
| r = reset_terminal_fd(fd, true); |
| if (r < 0) |
| return r; |
| |
| r = rearrange_stdio(fd, fd, STDERR_FILENO); |
| fd = -1; |
| if (r < 0) |
| return r; |
| |
| *_saved_stdin = saved_stdin; |
| *_saved_stdout = saved_stdout; |
| |
| saved_stdin = saved_stdout = -1; |
| |
| return 0; |
| } |
| |
| static void write_confirm_error_fd(int err, int fd, const Unit *u) { |
| assert(err < 0); |
| |
| if (err == -ETIMEDOUT) |
| dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id); |
| else { |
| errno = -err; |
| dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id); |
| } |
| } |
| |
| static void write_confirm_error(int err, const char *vc, const Unit *u) { |
| _cleanup_close_ int fd = -1; |
| |
| assert(vc); |
| |
| fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC); |
| if (fd < 0) |
| return; |
| |
| write_confirm_error_fd(err, fd, u); |
| } |
| |
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout fds back in
 * place (where set) and closes the saved copies.
 *
 * Both restore steps are always attempted. Returns 0 on success, or the negative errno of the
 * last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
| |
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
| |
| static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) { |
| int saved_stdout = -1, saved_stdin = -1, r; |
| _cleanup_free_ char *e = NULL; |
| char c; |
| |
| /* For any internal errors, assume a positive response. */ |
| r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout); |
| if (r < 0) { |
| write_confirm_error(r, vc, u); |
| return CONFIRM_EXECUTE; |
| } |
| |
| /* confirm_spawn might have been disabled while we were sleeping. */ |
| if (manager_is_confirm_spawn_disabled(u->manager)) { |
| r = 1; |
| goto restore_stdio; |
| } |
| |
| e = ellipsize(cmdline, 60, 100); |
| if (!e) { |
| log_oom(); |
| r = CONFIRM_EXECUTE; |
| goto restore_stdio; |
| } |
| |
| for (;;) { |
| r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e); |
| if (r < 0) { |
| write_confirm_error_fd(r, STDOUT_FILENO, u); |
| r = CONFIRM_EXECUTE; |
| goto restore_stdio; |
| } |
| |
| switch (c) { |
| case 'c': |
| printf("Resuming normal execution.\n"); |
| manager_disable_confirm_spawn(); |
| r = 1; |
| break; |
| case 'D': |
| unit_dump(u, stdout, " "); |
| continue; /* ask again */ |
| case 'f': |
| printf("Failing execution.\n"); |
| r = CONFIRM_PRETEND_FAILURE; |
| break; |
| case 'h': |
| printf(" c - continue, proceed without asking anymore\n" |
| " D - dump, show the state of the unit\n" |
| " f - fail, don't execute the command and pretend it failed\n" |
| " h - help\n" |
| " i - info, show a short summary of the unit\n" |
| " j - jobs, show jobs that are in progress\n" |
| " s - skip, don't execute the command and pretend it succeeded\n" |
| " y - yes, execute the command\n"); |
| continue; /* ask again */ |
| case 'i': |
| printf(" Description: %s\n" |
| " Unit: %s\n" |
| " Command: %s\n", |
| u->id, u->description, cmdline); |
| continue; /* ask again */ |
| case 'j': |
| manager_dump_jobs(u->manager, stdout, " "); |
| continue; /* ask again */ |
| case 'n': |
| /* 'n' was removed in favor of 'f'. */ |
| printf("Didn't understand 'n', did you mean 'f'?\n"); |
| continue; /* ask again */ |
| case 's': |
| printf("Skipping execution.\n"); |
| r = CONFIRM_PRETEND_SUCCESS; |
| break; |
| case 'y': |
| r = CONFIRM_EXECUTE; |
| break; |
| default: |
| assert_not_reached("Unhandled choice"); |
| } |
| break; |
| } |
| |
| restore_stdio: |
| restore_confirm_stdio(&saved_stdin, &saved_stdout); |
| return r; |
| } |
| |
| static int get_fixed_user(const ExecContext *c, const char **user, |
| uid_t *uid, gid_t *gid, |
| const char **home, const char **shell) { |
| int r; |
| const char *name; |
| |
| assert(c); |
| |
| if (!c->user) |
| return 0; |
| |
| /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway |
| * (i.e. are "/" or "/bin/nologin"). */ |
| |
| name = c->user; |
| r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN); |
| if (r < 0) |
| return r; |
| |
| *user = name; |
| return 0; |
| } |
| |
| static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) { |
| int r; |
| const char *name; |
| |
| assert(c); |
| |
| if (!c->group) |
| return 0; |
| |
| name = c->group; |
| r = get_group_creds(&name, gid, 0); |
| if (r < 0) |
| return r; |
| |
| *group = name; |
| return 0; |
| } |
| |
| static int get_supplementary_groups(const ExecContext *c, const char *user, |
| const char *group, gid_t gid, |
| gid_t **supplementary_gids, int *ngids) { |
| char **i; |
| int r, k = 0; |
| int ngroups_max; |
| bool keep_groups = false; |
| gid_t *groups = NULL; |
| _cleanup_free_ gid_t *l_gids = NULL; |
| |
| assert(c); |
| |
| /* |
| * If user is given, then lookup GID and supplementary groups list. |
| * We avoid NSS lookups for gid=0. Also we have to initialize groups |
| * here and as early as possible so we keep the list of supplementary |
| * groups of the caller. |
| */ |
| if (user && gid_is_valid(gid) && gid != 0) { |
| /* First step, initialize groups from /etc/groups */ |
| if (initgroups(user, gid) < 0) |
| return -errno; |
| |
| keep_groups = true; |
| } |
| |
| if (strv_isempty(c->supplementary_groups)) |
| return 0; |
| |
| /* |
| * If SupplementaryGroups= was passed then NGROUPS_MAX has to |
| * be positive, otherwise fail. |
| */ |
| errno = 0; |
| ngroups_max = (int) sysconf(_SC_NGROUPS_MAX); |
| if (ngroups_max <= 0) |
| return errno_or_else(EOPNOTSUPP); |
| |
| l_gids = new(gid_t, ngroups_max); |
| if (!l_gids) |
| return -ENOMEM; |
| |
| if (keep_groups) { |
| /* |
| * Lookup the list of groups that the user belongs to, we |
| * avoid NSS lookups here too for gid=0. |
| */ |
| k = ngroups_max; |
| if (getgrouplist(user, gid, l_gids, &k) < 0) |
| return -EINVAL; |
| } else |
| k = 0; |
| |
| STRV_FOREACH(i, c->supplementary_groups) { |
| const char *g; |
| |
| if (k >= ngroups_max) |
| return -E2BIG; |
| |
| g = *i; |
| r = get_group_creds(&g, l_gids+k, 0); |
| if (r < 0) |
| return r; |
| |
| k++; |
| } |
| |
| /* |
| * Sets ngids to zero to drop all supplementary groups, happens |
| * when we are under root and SupplementaryGroups= is empty. |
| */ |
| if (k == 0) { |
| *ngids = 0; |
| return 0; |
| } |
| |
| /* Otherwise get the final list of supplementary groups */ |
| groups = memdup(l_gids, sizeof(gid_t) * k); |
| if (!groups) |
| return -ENOMEM; |
| |
| *supplementary_gids = groups; |
| *ngids = k; |
| |
| groups = NULL; |
| |
| return 0; |
| } |
| |
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * the list is not empty), then the real, effective and saved GID (only if 'gid' is valid).
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
| |
/* Updates the securebits of the calling process: the bits in 'mask' are cleared from the current
 * value, then the bits in 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is changed then), 1 if
 * they were changed, and a negative errno-style error if querying or setting them failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;

        /* Avoid the (privileged) set operation if it would be a no-op */
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
| |
| static int enforce_user(const ExecContext *context, uid_t uid) { |
| assert(context); |
| int r; |
| |
| if (!uid_is_valid(uid)) |
| return 0; |
| |
| /* Sets (but doesn't look up) the uid and make sure we keep the |
| * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is |
| * required, so we also need keep-caps in this case. |
| */ |
| |
| if (context->capability_ambient_set != 0 || context->secure_bits != 0) { |
| |
| /* First step: If we need to keep capabilities but |
| * drop privileges we need to make sure we keep our |
| * caps, while we drop privileges. */ |
| if (uid != 0) { |
| /* Add KEEP_CAPS to the securebits */ |
| r = set_securebits(1<<SECURE_KEEP_CAPS, 0); |
| if (r < 0) |
| return r; |
| } |
| } |
| |
| /* Second step: actually set the uids */ |
| if (setresuid(uid, uid, uid) < 0) |
| return -errno; |
| |
| /* At this point we should have all necessary capabilities but |
| are otherwise a normal user. However, the caps might got |
| corrupted due to the setresuid() so we need clean them up |
| later. This is done outside of this call. */ |
| |
| return 0; |
| } |
| |
#if HAVE_PAM

/* PAM conversation callback that refuses any conversation: all arguments are ignored and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
| |
| static int setup_pam( |
| const char *name, |
| const char *user, |
| uid_t uid, |
| gid_t gid, |
| const char *tty, |
| char ***env, /* updated on success */ |
| const int fds[], size_t n_fds) { |
| |
| #if HAVE_PAM |
| |
| static const struct pam_conv conv = { |
| .conv = null_conv, |
| .appdata_ptr = NULL |
| }; |
| |
| _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; |
| _cleanup_strv_free_ char **e = NULL; |
| pam_handle_t *handle = NULL; |
| sigset_t old_ss; |
| int pam_code = PAM_SUCCESS, r; |
| char **nv; |
| bool close_session = false; |
| pid_t pam_pid = 0, parent_pid; |
| int flags = 0; |
| |
| assert(name); |
| assert(user); |
| assert(env); |
| |
| /* We set up PAM in the parent process, then fork. The child |
| * will then stay around until killed via PR_GET_PDEATHSIG or |
| * systemd via the cgroup logic. It will then remove the PAM |
| * session again. The parent process will exec() the actual |
| * daemon. We do things this way to ensure that the main PID |
| * of the daemon is the one we initially fork()ed. */ |
| |
| r = barrier_create(&barrier); |
| if (r < 0) |
| goto fail; |
| |
| if (log_get_max_level() < LOG_DEBUG) |
| flags |= PAM_SILENT; |
| |
| pam_code = pam_start(name, user, &conv, &handle); |
| if (pam_code != PAM_SUCCESS) { |
| handle = NULL; |
| goto fail; |
| } |
| |
| if (!tty) { |
| _cleanup_free_ char *q = NULL; |
| |
| /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure |
| * out if that's the case, and read the TTY off it. */ |
| |
| if (getttyname_malloc(STDIN_FILENO, &q) >= 0) |
| tty = strjoina("/dev/", q); |
| } |
| |
| if (tty) { |
| pam_code = pam_set_item(handle, PAM_TTY, tty); |
| if (pam_code != PAM_SUCCESS) |
| goto fail; |
| } |
| |
| STRV_FOREACH(nv, *env) { |
| pam_code = pam_putenv(handle, *nv); |
| if (pam_code != PAM_SUCCESS) |
| goto fail; |
| } |
| |
| pam_code = pam_acct_mgmt(handle, flags); |
| if (pam_code != PAM_SUCCESS) |
| goto fail; |
| |
| pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags); |
| if (pam_code != PAM_SUCCESS) |
| log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code)); |
| |
| pam_code = pam_open_session(handle, flags); |
| if (pam_code != PAM_SUCCESS) |
| goto fail; |
| |
| close_session = true; |
| |
| e = pam_getenvlist(handle); |
| if (!e) { |
| pam_code = PAM_BUF_ERR; |
| goto fail; |
| } |
| |
| /* Block SIGTERM, so that we know that it won't get lost in the child */ |
| |
| assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0); |
| |
| parent_pid = getpid_cached(); |
| |
| r = safe_fork("(sd-pam)", 0, &pam_pid); |
| if (r < 0) |
| goto fail; |
| if (r == 0) { |
| int sig, ret = EXIT_PAM; |
| |
| /* The child's job is to reset the PAM session on termination */ |
| barrier_set_role(&barrier, BARRIER_CHILD); |
| |
| /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only |
| * those fds are open here that have been opened by PAM. */ |
| (void) close_many(fds, n_fds); |
| |
| /* Drop privileges - we don't need any to pam_close_session and this will make |
| * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam |
| * threads to fail to exit normally */ |
| |
| r = maybe_setgroups(0, NULL); |
| if (r < 0) |
| log_warning_errno(r, "Failed to setgroups() in sd-pam: %m"); |
| if (setresgid(gid, gid, gid) < 0) |
| log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); |
| if (setresuid(uid, uid, uid) < 0) |
| log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m"); |
| |
| (void) ignore_signals(SIGPIPE); |
| |
| /* Wait until our parent died. This will only work if the above setresuid() succeeds, |
| * otherwise the kernel will not allow unprivileged parents kill their privileged children |
| * this way. We rely on the control groups kill logic to do the rest for us. */ |
| if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0) |
| goto child_finish; |
| |
| /* Tell the parent that our setup is done. This is especially important regarding dropping |
| * privileges. Otherwise, unit setup might race against our setresuid(2) call. |
| * |
| * If the parent aborted, we'll detect this below, hence ignore return failure here. */ |
| (void) barrier_place(&barrier); |
| |
| /* Check if our parent process might already have died? */ |
| if (getppid() == parent_pid) { |
| sigset_t ss; |
| |
| assert_se(sigemptyset(&ss) >= 0); |
| assert_se(sigaddset(&ss, SIGTERM) >= 0); |
| |
| for (;;) { |
| if (sigwait(&ss, &sig) < 0) { |
| if (errno == EINTR) |
| continue; |
| |
| goto child_finish; |
| } |
| |
| assert(sig == SIGTERM); |
| break; |
| } |
| } |
| |
| pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags); |
| if (pam_code != PAM_SUCCESS) |
| goto child_finish; |
| |
| /* If our parent died we'll end the session */ |
| if (getppid() != parent_pid) { |
| pam_code = pam_close_session(handle, flags); |
| if (pam_code != PAM_SUCCESS) |
| goto child_finish; |
| } |
| |
| ret = 0; |
| |
| child_finish: |
| /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module |
| * know about this. See pam_end(3) */ |
| (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT); |
| _exit(ret); |
| } |
| |
| barrier_set_role(&barrier, BARRIER_PARENT); |
| |
| /* If the child was forked off successfully it will do all the cleanups, so forget about the handle |
| * here. */ |
| handle = NULL; |
| |
| /* Unblock SIGTERM again in the parent */ |
| assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0); |
| |
| /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want |
| * this fd around. */ |
| closelog(); |
| |
| /* Synchronously wait for the child to initialize. We don't care for errors as we cannot |
| * recover. However, warn loudly if it happens. */ |
| if (!barrier_place_and_sync(&barrier)) |
| log_error("PAM initialization failed"); |
| |
| return strv_free_and_replace(*env, e); |
| |
| fail: |
| if (pam_code != PAM_SUCCESS) { |
| log_error("PAM failed: %s", pam_strerror(handle, pam_code)); |
| r = -EPERM; /* PAM errors do not map to errno */ |
| } else |
| log_error_errno(r, "PAM failed: %m"); |
| |
| if (handle) { |
| if (close_session) |
| pam_code = pam_close_session(handle, flags); |
| |
| (void) pam_end(handle, pam_code | flags); |
| } |
| |
| closelog(); |
| return r; |
| #else |
| return 0; |
| #endif |
| } |
| |
/* Renames the current process to the base name of 'path', wrapped in parentheses. Base names longer
 * than eight characters are reduced to their last eight characters; an empty base name results in
 * the name "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + at most 8 characters + ')' + NUL */
        const char *name;
        char *w = buf;
        size_t len;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look
         * pretty in /bin/ps */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail: the end of the process name is usually more interesting, since
                 * the first bit might just be "systemd-" */
                name += len - 8;
                len = 8;
        }

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
| |
| static bool context_has_address_families(const ExecContext *c) { |
| assert(c); |
| |
| return c->address_families_allow_list || |
| !set_isempty(c->address_families); |
| } |
| |
| static bool context_has_syscall_filters(const ExecContext *c) { |
| assert(c); |
| |
| return c->syscall_allow_list || |
| !hashmap_isempty(c->syscall_filter); |
| } |
| |
| static bool context_has_syscall_logs(const ExecContext *c) { |
| assert(c); |
| |
| return c->syscall_log_allow_list || |
| !hashmap_isempty(c->syscall_log); |
| } |
| |
| static bool context_has_no_new_privileges(const ExecContext *c) { |
| assert(c); |
| |
| if (c->no_new_privileges) |
| return true; |
| |
| if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ |
| return false; |
| |
| /* We need NNP if we have any form of seccomp and are unprivileged */ |
| return c->lock_personality || |
| c->memory_deny_write_execute || |
| c->private_devices || |
| c->protect_clock || |
| c->protect_hostname || |
| c->protect_kernel_tunables || |
| c->protect_kernel_modules || |
| c->protect_kernel_logs || |
| context_has_address_families(c) || |
| exec_context_restrict_namespaces_set(c) || |
| c->restrict_realtime || |
| c->restrict_suid_sgid || |
| !set_isempty(c->syscall_archs) || |
| context_has_syscall_filters(c) || |
| context_has_syscall_logs(c); |
| } |
| |
| static bool exec_context_has_credentials(const ExecContext *context) { |
| |
| assert(context); |
| |
| return !hashmap_isempty(context->set_credentials) || |
| context->load_credentials; |
| } |
| |
| #if HAVE_SECCOMP |
| |
| static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { |
| |
| if (is_seccomp_available()) |
| return false; |
| |
| log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); |
| return true; |
| } |
| |
| static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) { |
| uint32_t negative_action, default_action, action; |
| int r; |
| |
| assert(u); |
| assert(c); |
| |
| if (!context_has_syscall_filters(c)) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "SystemCallFilter=")) |
| return 0; |
| |
| negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno); |
| |
| if (c->syscall_allow_list) { |
| default_action = negative_action; |
| action = SCMP_ACT_ALLOW; |
| } else { |
| default_action = SCMP_ACT_ALLOW; |
| action = negative_action; |
| } |
| |
| if (needs_ambient_hack) { |
| r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID); |
| if (r < 0) |
| return r; |
| } |
| |
| return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false); |
| } |
| |
| static int apply_syscall_log(const Unit* u, const ExecContext *c) { |
| #ifdef SCMP_ACT_LOG |
| uint32_t default_action, action; |
| #endif |
| |
| assert(u); |
| assert(c); |
| |
| if (!context_has_syscall_logs(c)) |
| return 0; |
| |
| #ifdef SCMP_ACT_LOG |
| if (skip_seccomp_unavailable(u, "SystemCallLog=")) |
| return 0; |
| |
| if (c->syscall_log_allow_list) { |
| /* Log nothing but the ones listed */ |
| default_action = SCMP_ACT_ALLOW; |
| action = SCMP_ACT_LOG; |
| } else { |
| /* Log everything but the ones listed */ |
| default_action = SCMP_ACT_LOG; |
| action = SCMP_ACT_ALLOW; |
| } |
| |
| return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false); |
| #else |
| /* old libseccomp */ |
| log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog="); |
| return 0; |
| #endif |
| } |
| |
| static int apply_syscall_archs(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (set_isempty(c->syscall_archs)) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "SystemCallArchitectures=")) |
| return 0; |
| |
| return seccomp_restrict_archs(c->syscall_archs); |
| } |
| |
| static int apply_address_families(const Unit* u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!context_has_address_families(c)) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) |
| return 0; |
| |
| return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list); |
| } |
| |
| static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!c->memory_deny_write_execute) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) |
| return 0; |
| |
| return seccomp_memory_deny_write_execute(); |
| } |
| |
| static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!c->restrict_realtime) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "RestrictRealtime=")) |
| return 0; |
| |
| return seccomp_restrict_realtime(); |
| } |
| |
| static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!c->restrict_suid_sgid) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "RestrictSUIDSGID=")) |
| return 0; |
| |
| return seccomp_restrict_suid_sgid(); |
| } |
| |
| static int apply_protect_sysctl(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but |
| * let's protect even those systems where this is left on in the kernel. */ |
| |
| if (!c->protect_kernel_tunables) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) |
| return 0; |
| |
| return seccomp_protect_sysctl(); |
| } |
| |
| static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| /* Turn off module syscalls on ProtectKernelModules=yes */ |
| |
| if (!c->protect_kernel_modules) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) |
| return 0; |
| |
| return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false); |
| } |
| |
| static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!c->protect_kernel_logs) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "ProtectKernelLogs=")) |
| return 0; |
| |
| return seccomp_protect_syslog(); |
| } |
| |
| static int apply_protect_clock(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!c->protect_clock) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "ProtectClock=")) |
| return 0; |
| |
| return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false); |
| } |
| |
| static int apply_private_devices(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ |
| |
| if (!c->private_devices) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "PrivateDevices=")) |
| return 0; |
| |
| return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false); |
| } |
| |
| static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) { |
| assert(u); |
| assert(c); |
| |
| if (!exec_context_restrict_namespaces_set(c)) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "RestrictNamespaces=")) |
| return 0; |
| |
| return seccomp_restrict_namespaces(c->restrict_namespaces); |
| } |
| |
| static int apply_lock_personality(const Unit* u, const ExecContext *c) { |
| unsigned long personality; |
| int r; |
| |
| assert(u); |
| assert(c); |
| |
| if (!c->lock_personality) |
| return 0; |
| |
| if (skip_seccomp_unavailable(u, "LockPersonality=")) |
| return 0; |
| |
| personality = c->personality; |
| |
| /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */ |
| if (personality == PERSONALITY_INVALID) { |
| |
| r = opinionated_personality(&personality); |
| if (r < 0) |
| return r; |
| } |
| |
| return seccomp_lock_personality(personality); |
| } |
| |
| #endif |
| |
| static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) { |
| assert(u); |
| assert(c); |
| |
| if (!c->protect_hostname) |
| return 0; |
| |
| if (ns_type_supported(NAMESPACE_UTS)) { |
| if (unshare(CLONE_NEWUTS) < 0) { |
| if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) { |
| *ret_exit_status = EXIT_NAMESPACE; |
| return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m"); |
| } |
| |
| log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup."); |
| } |
| } else |
| log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup."); |
| |
| #if HAVE_SECCOMP |
| int r; |
| |
| if (skip_seccomp_unavailable(u, "ProtectHostname=")) |
| return 0; |
| |
| r = seccomp_protect_hostname(); |
| if (r < 0) { |
| *ret_exit_status = EXIT_SECCOMP; |
| return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m"); |
| } |
| #endif |
| |
| return 0; |
| } |
| |
| static void do_idle_pipe_dance(int idle_pipe[static 4]) { |
| assert(idle_pipe); |
| |
| idle_pipe[1] = safe_close(idle_pipe[1]); |
| idle_pipe[2] = safe_close(idle_pipe[2]); |
| |
| if (idle_pipe[0] >= 0) { |
| int r; |
| |
| r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC); |
| |
| if (idle_pipe[3] >= 0 && r == 0 /* timeout */) { |
| ssize_t n; |
| |
| /* Signal systemd that we are bored and want to continue. */ |
| n = write(idle_pipe[3], "x", 1); |
| if (n > 0) |
| /* Wait for systemd to react to the signal above. */ |
| (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC); |
| } |
| |
| idle_pipe[0] = safe_close(idle_pipe[0]); |
| |
| } |
| |
| idle_pipe[3] = safe_close(idle_pipe[3]); |
| } |
| |
| static const char *exec_directory_env_name_to_string(ExecDirectoryType t); |
| |
| static int build_environment( |
| const Unit *u, |
| const ExecContext *c, |
| const ExecParameters *p, |
| size_t n_fds, |
| const char *home, |
| const char *username, |
| const char *shell, |
| dev_t journal_stream_dev, |
| ino_t journal_stream_ino, |
| char ***ret) { |
| |
| _cleanup_strv_free_ char **our_env = NULL; |
| size_t n_env = 0; |
| char *x; |
| |
| assert(u); |
| assert(c); |
| assert(p); |
| assert(ret); |
| |
| #define N_ENV_VARS 17 |
| our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); |
| if (!our_env) |
| return -ENOMEM; |
| |
| if (n_fds > 0) { |
| _cleanup_free_ char *joined = NULL; |
| |
| if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| |
| if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| |
| joined = strv_join(p->fd_names, ":"); |
| if (!joined) |
| return -ENOMEM; |
| |
| x = strjoin("LISTEN_FDNAMES=", joined); |
| if (!x) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| } |
| |
| if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) { |
| if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| |
| if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| } |
| |
| /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking |
| * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and |
| * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */ |
| if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) { |
| x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1"); |
| if (!x) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| } |
| |
| if (home) { |
| x = strjoin("HOME=", home); |
| if (!x) |
| return -ENOMEM; |
| |
| path_simplify(x + 5); |
| our_env[n_env++] = x; |
| } |
| |
| if (username) { |
| x = strjoin("LOGNAME=", username); |
| if (!x) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| |
| x = strjoin("USER=", username); |
| if (!x) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| } |
| |
| if (shell) { |
| x = strjoin("SHELL=", shell); |
| if (!x) |
| return -ENOMEM; |
| |
| path_simplify(x + 6); |
| our_env[n_env++] = x; |
| } |
| |
| if (!sd_id128_is_null(u->invocation_id)) { |
| if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0) |
| return -ENOMEM; |
| |
| our_env[n_env++] = x; |
| } |
| |
| if (exec_context_needs_term(c)) { |
| const char *tty_path, *term = NULL; |
| |
| tty_path = exec_context_tty_path(c); |
| |
| /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try |
| * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the |
| * container manager passes to PID 1 ends up all the way in the console login shown. */ |
| |
| if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1) |
| term = getenv("TERM"); |
| |
| if (!term) |
| term = default_term_for_tty(tty_path); |
| |
| x = strjoin("TERM=", term); |
| if (!x) |
| return -ENOMEM; |
| our_env[n_env++] = x; |
| } |
| |
| if (journal_stream_dev != 0 && journal_stream_ino != 0) { |
| if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0) |
| return -ENOMEM; |
| |
| our_env[n_env++] = x; |
| } |
| |
| if (c->log_namespace) { |
| x = strjoin("LOG_NAMESPACE=", c->log_namespace); |
| if (!x) |
| return -ENOMEM; |
| |
| our_env[n_env++] = x; |
| } |
| |
| for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { |
| _cleanup_free_ char *pre = NULL, *joined = NULL; |
| const char *n; |
| |
| if (!p->prefix[t]) |
| continue; |
| |
| if (strv_isempty(c->directories[t].paths)) |
| continue; |
| |
| n = exec_directory_env_name_to_string(t); |
| if (!n) |
| continue; |
| |
| pre = strjoin(p->prefix[t], "/"); |
| if (!pre) |
| return -ENOMEM; |
| |
| joined = strv_join_full(c->directories[t].paths, ":", pre, true); |
| if (!joined) |
| return -ENOMEM; |
| |
| x = strjoin(n, "=", joined); |
| if (!x) |
| return -ENOMEM; |
| |
| our_env[n_env++] = x; |
| } |
| |
| if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) { |
| x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id); |
| if (!x) |
| return -ENOMEM; |
| |
| our_env[n_env++] = x; |
| } |
| |
| if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0) |
| return -ENOMEM; |
| |
| our_env[n_env++] = x; |
| |
| our_env[n_env++] = NULL; |
| assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); |
| #undef N_ENV_VARS |
| |
| *ret = TAKE_PTR(our_env); |
| |
| return 0; |
| } |
| |
| static int build_pass_environment(const ExecContext *c, char ***ret) { |
| _cleanup_strv_free_ char **pass_env = NULL; |
| size_t n_env = 0; |
| char **i; |
| |
| STRV_FOREACH(i, c->pass_environment) { |
| _cleanup_free_ char *x = NULL; |
| char *v; |
| |
| v = getenv(*i); |
| if (!v) |
| continue; |
| x = strjoin(*i, "=", v); |
| if (!x) |
| return -ENOMEM; |
| |
| if (!GREEDY_REALLOC(pass_env, n_env + 2)) |
| return -ENOMEM; |
| |
| pass_env[n_env++] = TAKE_PTR(x); |
| pass_env[n_env] = NULL; |
| } |
| |
| *ret = TAKE_PTR(pass_env); |
| |
| return 0; |
| } |
| |
| bool exec_needs_mount_namespace( |
| const ExecContext *context, |
| const ExecParameters *params, |
| const ExecRuntime *runtime) { |
| |
| assert(context); |
| |
| if (context->root_image) |
| return true; |
| |
| if (!strv_isempty(context->read_write_paths) || |
| !strv_isempty(context->read_only_paths) || |
| !strv_isempty(context->inaccessible_paths) || |
| !strv_isempty(context->exec_paths) || |
| !strv_isempty(context->no_exec_paths)) |
| return true; |
| |
| if (context->n_bind_mounts > 0) |
| return true; |
| |
| if (context->n_temporary_filesystems > 0) |
| return true; |
| |
| if (context->n_mount_images > 0) |
| return true; |
| |
| if (context->n_extension_images > 0) |
| return true; |
| |
| if (!IN_SET(context->mount_flags, 0, MS_SHARED)) |
| return true; |
| |
| if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir)) |
| return true; |
| |
| if (context->private_devices || |
| context->private_mounts || |
| context->protect_system != PROTECT_SYSTEM_NO || |
| context->protect_home != PROTECT_HOME_NO || |
| context->protect_kernel_tunables || |
| context->protect_kernel_modules || |
| context->protect_kernel_logs || |
| context->protect_control_groups || |
| context->protect_proc != PROTECT_PROC_DEFAULT || |
| context->proc_subset != PROC_SUBSET_ALL || |
| context->private_ipc || |
| context->ipc_namespace_path) |
| return true; |
| |
| if (context->root_directory) { |
| if (exec_context_get_effective_mount_apivfs(context)) |
| return true; |
| |
| for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { |
| if (params && !params->prefix[t]) |
| continue; |
| |
| if (!strv_isempty(context->directories[t].paths)) |
| return true; |
| } |
| } |
| |
| if (context->dynamic_user && |
| (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) || |
| !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) || |
| !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))) |
| return true; |
| |
| if (context->log_namespace) |
| return true; |
| |
| return false; |
| } |
| |
| static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { |
| _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; |
| _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 }; |
| _cleanup_close_ int unshare_ready_fd = -1; |
| _cleanup_(sigkill_waitp) pid_t pid = 0; |
| uint64_t c = 1; |
| ssize_t n; |
| int r; |
| |
| /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e. |
| * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to |
| * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which |
| * we however lack after opening the user namespace. To work around this we fork() a temporary child process, |
| * which waits for the parent to create the new user namespace while staying in the original namespace. The |
| * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and |
| * continues execution normally. |
| * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it |
| * does not need CAP_SETUID to write the single line mapping to itself. */ |
| |
| /* Can only set up multiple mappings with CAP_SETUID. */ |
| if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid)) |
| r = asprintf(&uid_map, |
| UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */ |
| UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ |
| ouid, ouid, uid, uid); |
| else |
| r = asprintf(&uid_map, |
| UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */ |
| ouid, ouid); |
| |
| if (r < 0) |
| return -ENOMEM; |
| |
| /* Can only set up multiple mappings with CAP_SETGID. */ |
| if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid)) |
| r = asprintf(&gid_map, |
| GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */ |
| GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */ |
| ogid, ogid, gid, gid); |
| else |
| r = asprintf(&gid_map, |
| GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */ |
| ogid, ogid); |
| |
| if (r < 0) |
| return -ENOMEM; |
| |
| /* Create a communication channel so that the parent can tell the child when it finished creating the user |
| * namespace. */ |
| unshare_ready_fd = eventfd(0, EFD_CLOEXEC); |
| if (unshare_ready_fd < 0) |
| return -errno; |
| |
| /* Create a communication channel so that the child can tell the parent a proper error code in case it |
| * failed. */ |
| if (pipe2(errno_pipe, O_CLOEXEC) < 0) |
| return -errno; |
| |
| r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid); |
| if (r < 0) |
| return r; |
| if (r == 0) { |
| _cleanup_close_ int fd = -1; |
| const char *a; |
| pid_t ppid; |
| |
| /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from |
| * here, after the parent opened its own user namespace. */ |
| |
| ppid = getppid(); |
| errno_pipe[0] = safe_close(errno_pipe[0]); |
| |
| /* Wait until the parent unshared the user namespace */ |
| if (read(unshare_ready_fd, &c, sizeof(c)) < 0) { |
| r = -errno; |
| goto child_fail; |
| } |
| |
| /* Disable the setgroups() system call in the child user namespace, for good. */ |
| a = procfs_file_alloca(ppid, "setgroups"); |
| fd = open(a, O_WRONLY|O_CLOEXEC); |
| if (fd < 0) { |
| if (errno != ENOENT) { |
| r = -errno; |
| goto child_fail; |
| } |
| |
| /* If the file is missing the kernel is too old, let's continue anyway. */ |
| } else { |
| if (write(fd, "deny\n", 5) < 0) { |
| r = -errno; |
| goto child_fail; |
| } |
| |
| fd = safe_close(fd); |
| } |
| |
| /* First write the GID map */ |
| a = procfs_file_alloca(ppid, "gid_map"); |
| fd = open(a, O_WRONLY|O_CLOEXEC); |
| if (fd < 0) { |
| r = -errno; |
| goto child_fail; |
| } |
| if (write(fd, gid_map, strlen(gid_map)) < 0) { |
| r = -errno; |
| goto child_fail; |
| } |
| fd = safe_close(fd); |
| |
| /* The write the UID map */ |
| a = procfs_file_alloca(ppid, "uid_map"); |
| fd = open(a, O_WRONLY|O_CLOEXEC); |
| if (fd < 0) { |
| r = -errno; |
| goto child_fail; |
| } |
| if (write(fd, uid_map, strlen(uid_map)) < 0) { |
| r = -errno; |
| goto child_fail; |
| } |
| |
| _exit(EXIT_SUCCESS); |
| |
| child_fail: |
| (void) write(errno_pipe[1], &r, sizeof(r)); |
| _exit(EXIT_FAILURE); |
| } |
| |
| errno_pipe[1] = safe_close(errno_pipe[1]); |
| |
| if (unshare(CLONE_NEWUSER) < 0) |
| return -errno; |
| |
| /* Let the child know that the namespace is ready now */ |
| if (write(unshare_ready_fd, &c, sizeof(c)) < 0) |
| return -errno; |
| |
| /* Try to read an error code from the child */ |
| n = read(errno_pipe[0], &r, sizeof(r)); |
| if (n < 0) |
| return -errno; |
| if (n == sizeof(r)) { /* an error code was sent to us */ |
| if (r < 0) |
| return r; |
| return -EIO; |
| } |
| if (n != 0) /* on success we should have read 0 bytes */ |
| return -EIO; |
| |
| r = wait_for_terminate_and_check("(sd-userns)", pid, 0); |
| pid = 0; |
| if (r < 0) |
| return r; |
| if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */ |
| return -EIO; |
| |
| return 0; |
| } |
| |
| static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) { |
| if (!context->dynamic_user) |
| return false; |
| |
| if (type == EXEC_DIRECTORY_CONFIGURATION) |
| return false; |
| |
| if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO) |
| return false; |
| |
| return true; |
| } |
| |
| static int setup_exec_directory( |
| const ExecContext *context, |
| const ExecParameters *params, |
| uid_t uid, |
| gid_t gid, |
| ExecDirectoryType type, |
| int *exit_status) { |
| |
| static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = { |
| [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY, |
| [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY, |
| [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY, |
| [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY, |
| [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY, |
| }; |
| char **rt; |
| int r; |
| |
| assert(context); |
| assert(params); |
| assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX); |
| assert(exit_status); |
| |
| if (!params->prefix[type]) |
| return 0; |
| |
| if (params->flags & EXEC_CHOWN_DIRECTORIES) { |
| if (!uid_is_valid(uid)) |
| uid = 0; |
| if (!gid_is_valid(gid)) |
| gid = 0; |
| } |
| |
| STRV_FOREACH(rt, context->directories[type].paths) { |
| _cleanup_free_ char *p = NULL, *pp = NULL; |
| |
| p = path_join(params->prefix[type], *rt); |
| if (!p) { |
| r = -ENOMEM; |
| goto fail; |
| } |
| |
| r = mkdir_parents_label(p, 0755); |
| if (r < 0) |
| goto fail; |
| |
| if (exec_directory_is_private(context, type)) { |
| /* So, here's one extra complication when dealing with DynamicUser=1 units. In that |
| * case we want to avoid leaving a directory around fully accessible that is owned by |
| * a dynamic user whose UID is later on reused. To lock this down we use the same |
| * trick used by container managers to prohibit host users to get access to files of |
| * the same UID in containers: we place everything inside a directory that has an |
| * access mode of 0700 and is owned root:root, so that it acts as security boundary |
| * for unprivileged host code. We then use fs namespacing to make this directory |
| * permeable for the service itself. |
| * |
| * Specifically: for a service which wants a special directory "foo/" we first create |
| * a directory "private/" with access mode 0700 owned by root:root. Then we place |
| * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to |
| * "private/foo". This way, privileged host users can access "foo/" as usual, but |
| * unprivileged host users can't look into it. Inside of the namespace of the unit |
| * "private/" is replaced by a more liberally accessible tmpfs, into which the host's |
| * "private/foo/" is mounted under the same name, thus disabling the access boundary |
| * for the service and making sure it only gets access to the dirs it needs but no |
| * others. Tricky? Yes, absolutely, but it works! |
| * |
| * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not |
| * to be owned by the service itself. |
| * |
| * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used |
| * for sharing files or sockets with other services. */ |
| |
| pp = path_join(params->prefix[type], "private"); |
| if (!pp) { |
| r = -ENOMEM; |
| goto fail; |
| } |
| |
| /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */ |
| r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE); |
| if (r < 0) |
| goto fail; |
| |
| if (!path_extend(&pp, *rt)) { |
| r = -ENOMEM; |
| goto fail; |
| } |
| |
| /* Create all directories between the configured directory and this private root, and mark them 0755 */ |
| r = mkdir_parents_label(pp, 0755); |
| if (r < 0) |
| goto fail; |
| |
| if (is_dir(p, false) > 0 && |
| (laccess(pp, F_OK) < 0 && errno == ENOENT)) { |
| |
| /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move |
| * it over. Most likely the service has been upgraded from one that didn't use |
| * DynamicUser=1, to one that does. */ |
| |
| log_info("Found pre-existing public %s= directory %s, migrating to %s.\n" |
| "Apparently, service previously had DynamicUser= turned off, and has now turned it on.", |
| exec_directory_type_to_string(type), p, pp); |
| |
| if (rename(p, pp) < 0) { |
| r = -errno; |
| goto fail; |
| } |
| } else { |
| /* Otherwise, create the actual directory for the service */ |
| |
| r = mkdir_label(pp, context->directories[type].mode); |
| if (r < 0 && r != -EEXIST) |
| goto fail; |
| } |
| |
| /* And link it up from the original place */ |
| r = symlink_idempotent(pp, p, true); |
| if (r < 0) |
| goto fail; |
| |
| } else { |
| _cleanup_free_ char *target = NULL; |
| |
| if (type != EXEC_DIRECTORY_CONFIGURATION && |
| readlink_and_make_absolute(p, &target) >= 0) { |
| _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL; |
| |
| /* This already exists and is a symlink? Interesting. Maybe it's one created |
| * by DynamicUser=1 (see above)? |
| * |
| * We do this for all directory types except for ConfigurationDirectory=, |
| * since they all support the private/ symlink logic at least in some |
| * configurations, see above. */ |
| |
| r = chase_symlinks(target, NULL, 0, &target_resolved, NULL); |
| if (r < 0) |
| goto fail; |
| |
| q = path_join(params->prefix[type], "private", *rt); |
| if (!q) { |
| r = -ENOMEM; |
| goto fail; |
| } |
| |
| /* /var/lib or friends may be symlinks. So, let's chase them also. */ |
| r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL); |
| if (r < 0) |
| goto fail; |
| |
| if (path_equal(q_resolved, target_resolved)) { |
| |
| /* Hmm, apparently DynamicUser= was once turned on for this service, |
| * but is no longer. Let's move the directory back up. */ |
| |
| log_info("Found pre-existing private %s= directory %s, migrating to %s.\n" |
| "Apparently, service previously had DynamicUser= turned on, and has now turned it off.", |
| exec_directory_type_to_string(type), q, p); |
| |
| if (unlink(p) < 0) { |
| r = -errno; |
| goto fail; |
| } |
| |
| if (rename(q, p) < 0) { |
| r = -errno; |
| goto fail; |
| } |
| } |
| } |
| |
| r = mkdir_label(p, context->directories[type].mode); |
| if (r < 0) { |
| if (r != -EEXIST) |
| goto fail; |
| |
| if (type == EXEC_DIRECTORY_CONFIGURATION) { |
| struct stat st; |
| |
| /* Don't change the owner/access mode of the configuration directory, |
| * as in the common case it is not written to by a service, and shall |
| * not be writable. */ |
| |
| if (stat(p, &st) < 0) { |
| r = -errno; |
| goto fail; |
| } |
| |
| /* Still complain if the access mode doesn't match */ |
| if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0) |
| log_warning("%s \'%s\' already exists but the mode is different. " |
| "(File system: %o %sMode: %o)", |
| exec_directory_type_to_string(type), *rt, |
| st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777); |
| |
| continue; |
| } |
| } |
| } |
| |
| /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't |
| * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the |
| * current UID/GID ownership.) */ |
| r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID); |
| if (r < 0) |
| goto fail; |
| |
| /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we |
| * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID |
| * assignments to exist. */ |
| r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777); |
| if (r < 0) |
| goto fail; |
| } |
| |
| return 0; |
| |
| fail: |
| *exit_status = exit_status_table[type]; |
| return r; |
| } |
| |
| static int write_credential( |
| int dfd, |
| const char *id, |
| const void *data, |
| size_t size, |
| uid_t uid, |
| bool ownership_ok) { |
| |
| _cleanup_(unlink_and_freep) char *tmp = NULL; |
| _cleanup_close_ int fd = -1; |
| int r; |
| |
| r = tempfn_random_child("", "cred", &tmp); |
| if (r < 0) |
| return r; |
| |
| fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600); |
| if (fd < 0) { |
| tmp = mfree(tmp); |
| return -errno; |
| } |
| |
| r = loop_write(fd, data, size, /* do_pool = */ false); |
| if (r < 0) |
| return r; |
| |
| if (fchmod(fd, 0400) < 0) /* Take away "w" bit */ |
| return -errno; |
| |
| if (uid_is_valid(uid) && uid != getuid()) { |
| r = fd_add_uid_acl_permission(fd, uid, ACL_READ); |
| if (r < 0) { |
| if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) |
| return r; |
| |
| if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want |
| * to express: that the user gets read access and nothing |
| * else. But if the backing fs can't support that (e.g. ramfs) |
| * then we can use file ownership instead. But that's only safe if |
| * we can then re-mount the whole thing read-only, so that the |
| * user can no longer chmod() the file to gain write access. */ |
| return r; |
| |
| if (fchown(fd, uid, GID_INVALID) < 0) |
| return -errno; |
| } |
| } |
| |
| if (renameat(dfd, tmp, dfd, id) < 0) |
| return -errno; |
| |
| tmp = mfree(tmp); |
| return 0; |
| } |
| |
| #define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */ |
| |
| static int acquire_credentials( |
| const ExecContext *context, |
| const ExecParameters *params, |
| const char *unit, |
| const char *p, |
| uid_t uid, |
| bool ownership_ok) { |
| |
| uint64_t left = CREDENTIALS_BYTES_MAX; |
| _cleanup_close_ int dfd = -1; |
| ExecSetCredential *sc; |
| char **id, **fn; |
| int r; |
| |
| assert(context); |
| assert(p); |
| |
| dfd = open(p, O_DIRECTORY|O_CLOEXEC); |
| if (dfd < 0) |
| return -errno; |
| |
| /* First we use the literally specified credentials. Note that they might be overridden again below, |
| * and thus act as a "default" if the same credential is specified multiple times */ |
| HASHMAP_FOREACH(sc, context->set_credentials) { |
| size_t add; |
| |
| add = strlen(sc->id) + sc->size; |
| if (add > left) |
| return -E2BIG; |
| |
| r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok); |
| if (r < 0) |
| return r; |
| |
| left -= add; |
| } |
| |
| /* Then, load credential off disk (or acquire via AF_UNIX socket) */ |
| STRV_FOREACH_PAIR(id, fn, context->load_credentials) { |
| ReadFullFileFlags flags = READ_FULL_FILE_SECURE; |
| _cleanup_(erase_and_freep) char *data = NULL; |
| _cleanup_free_ char *j = NULL, *bindname = NULL; |
| bool missing_ok = true; |
| const char *source; |
| size_t size, add; |
| |
| if (path_is_absolute(*fn)) { |
| /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */ |
| source = *fn; |
| flags |= READ_FULL_FILE_CONNECT_SOCKET; |
| |
| /* Pass some minimal info about the unit and the credential name we are looking to acquire |
| * via the source socket address in case we read off an AF_UNIX socket. */ |
| if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0) |
| return -ENOMEM; |
| |
| missing_ok = false; |
| |
| } else if (params->received_credentials) { |
| /* If this is a relative path, take it relative to the credentials we received |
| * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating |
| * on a credential store, i.e. this is guaranteed to be regular files. */ |
| j = path_join(params->received_credentials, *fn); |
| if (!j) |
| return -ENOMEM; |
| |
| source = j; |
| } else |
| source = NULL; |
| |
| if (source) |
| r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size); |
| else |
| r = -ENOENT; |
| if (r == -ENOENT && (missing_ok || faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)) { |
| /* Make a missing inherited credential non-fatal, let's just continue. After all apps |
| * will get clear errors if we don't pass such a missing credential on as they |
| * themselves will get ENOENT when trying to read them, which should not be much |
| * worse than when we handle the error here and make it fatal. |
| * |
| * Also, if the source file doesn't exist, but we already acquired the key otherwise, |
| * then don't fail either. */ |
| log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", *fn); |
| continue; |
| } |
| if (r < 0) |
| return log_debug_errno(r, "Failed to read credential '%s': %m", *fn); |
| |
| add = strlen(*id) + size; |
| if (add > left) |
| return -E2BIG; |
| |
| r = write_credential(dfd, *id, data, size, uid, ownership_ok); |
| if (r < 0) |
| return r; |
| |
| left -= add; |
| } |
| |
| if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */ |
| return -errno; |
| |
| /* After we created all keys with the right perms, also make sure the credential store as a whole is |
| * accessible */ |
| |
| if (uid_is_valid(uid) && uid != getuid()) { |
| r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE); |
| if (r < 0) { |
| if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) |
| return r; |
| |
| if (!ownership_ok) |
| return r; |
| |
| if (fchown(dfd, uid, GID_INVALID) < 0) |
| return -errno; |
| } |
| } |
| |
| return 0; |
| } |
| |
| static int setup_credentials_internal( |
| const ExecContext *context, |
| const ExecParameters *params, |
| const char *unit, |
| const char *final, /* This is where the credential store shall eventually end up at */ |
| const char *workspace, /* This is where we can prepare it before moving it to the final place */ |
| bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */ |
| bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */ |
| uid_t uid) { |
| |
| int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true |
| * if we mounted something; false if we definitely can't mount anything */ |
| bool final_mounted; |
| const char *where; |
| |
| assert(context); |
| assert(final); |
| assert(workspace); |
| |
| if (reuse_workspace) { |
| r = path_is_mount_point(workspace, NULL, 0); |
| if (r < 0) |
| return r; |
| if (r > 0) |
| workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */ |
| else |
| workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */ |
| } else |
| workspace_mounted = -1; /* ditto */ |
| |
| r = path_is_mount_point(final, NULL, 0); |
| if (r < 0) |
| return r; |
| if (r > 0) { |
| /* If the final place already has something mounted, we use that. If the workspace also has |
| * something mounted we assume it's actually the same mount (but with MS_RDONLY |
| * different). */ |
| final_mounted = true; |
| |
| if (workspace_mounted < 0) { |
| /* If the final place is mounted, but the workspace we isn't, then let's bind mount |
| * the final version to the workspace, and make it writable, so that we can make |
| * changes */ |
| |
| r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); |
| if (r < 0) |
| return r; |
| |
| r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL); |
| if (r < 0) |
| return r; |
| |
| workspace_mounted = true; |
| } |
| } else |
| final_mounted = false; |
| |
| if (workspace_mounted < 0) { |
| /* Nothing is mounted on the workspace yet, let's try to mount something now */ |
| for (int try = 0;; try++) { |
| |
| if (try == 0) { |
| /* Try "ramfs" first, since it's not swap backed */ |
| r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700"); |
| if (r >= 0) { |
| workspace_mounted = true; |
| break; |
| } |
| |
| } else if (try == 1) { |
| _cleanup_free_ char *opts = NULL; |
| |
| if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0) |
| return -ENOMEM; |
| |
| /* Fall back to "tmpfs" otherwise */ |
| r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts); |
| if (r >= 0) { |
| workspace_mounted = true; |
| break; |
| } |
| |
| } else { |
| /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */ |
| r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); |
| if (r < 0) { |
| if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */ |
| return r; |
| |
| if (must_mount) /* If we it's not OK to use the plain directory |
| * fallback, propagate all errors too */ |
| return r; |
| |
| /* If we lack privileges to bind mount stuff, then let's gracefully |
| * proceed for compat with container envs, and just use the final dir |
| * as is. */ |
| |
| workspace_mounted = false; |
| break; |
| } |
| |
| /* Make the new bind mount writable (i.e. drop MS_RDONLY) */ |
| r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL); |
| if (r < 0) |
| return r; |
| |
| workspace_mounted = true; |
| break; |
| } |
| } |
| } |
| |
| assert(!must_mount || workspace_mounted > 0); |
| where = workspace_mounted ? workspace : final; |
| |
| r = acquire_credentials(context, params, unit, where, uid, workspace_mounted); |
| if (r < 0) |
| return r; |
| |
| if (workspace_mounted) { |
| /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */ |
| r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL); |
| if (r < 0) |
| return r; |
| |
| /* And mount it to the final place, read-only */ |
| if (final_mounted) |
| r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW); |
| else |
| r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL); |
| if (r < 0) |
| return r; |
| } else { |
| _cleanup_free_ char *parent = NULL; |
| |
| /* If we do not have our own mount put used the plain directory fallback, then we need to |
| * open access to the top-level credential directory and the per-service directory now */ |
| |
| parent = dirname_malloc(final); |
| if (!parent) |
| return -ENOMEM; |
| if (chmod(parent, 0755) < 0) |
| return -errno; |
| } |
| |
| return 0; |
| } |
| |
| static int setup_credentials( |
| const ExecContext *context, |
| const ExecParameters *params, |
| const char *unit, |
| uid_t uid) { |
| |
| _cleanup_free_ char *p = NULL, *q = NULL; |
| const char *i; |
| int r; |
| |
| assert(context); |
| assert(params); |
| |
| if (!exec_context_has_credentials(context)) |
| return 0; |
| |
| if (!params->prefix[EXEC_DIRECTORY_RUNTIME]) |
| return -EINVAL; |
| |
| /* This where we'll place stuff when we are done; this main credentials directory is world-readable, |
| * and the subdir we mount over with a read-only file system readable by the service's user */ |
| q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials"); |
| if (!q) |
| return -ENOMEM; |
| |
| r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */ |
| if (r < 0 && r != -EEXIST) |
| return r; |
| |
| p = path_join(q, unit); |
| if (!p) |
| return -ENOMEM; |
| |
| r = mkdir_label(p, 0700); /* per-unit dir: private to user */ |
| if (r < 0 && r != -EEXIST) |
| return r; |
| |
| r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL); |
| if (r < 0) { |
| _cleanup_free_ char *t = NULL, *u = NULL; |
| |
| /* If this is not a privilege or support issue then propagate the error */ |
| if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)) |
| return r; |
| |
| /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving |
| * it into place, so that users can't access half-initialized credential stores. */ |
| t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials"); |
| if (!t) |
| return -ENOMEM; |
| |
| /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit |
| * directory outside of /run/credentials/ first, and then move it over to /run/credentials/ |
| * after it is fully set up */ |
| u = path_join(t, unit); |
| if (!u) |
| return -ENOMEM; |
| |
| FOREACH_STRING(i, t, u) { |
| r = mkdir_label(i, 0700); |
| if (r < 0 && r != -EEXIST) |
| return r; |
| } |
| |
| r = setup_credentials_internal( |
| context, |
| params, |
| unit, |
| p, /* final mount point */ |
| u, /* temporary workspace to overmount */ |
| true, /* reuse the workspace if it is already a mount */ |
| false, /* it's OK to fall back to a plain directory if we can't mount anything */ |
| uid); |
| |
| (void) rmdir(u); /* remove the workspace again if we can. */ |
| |
| if (r < 0) |
| return r; |
| |
| } else if (r == 0) { |
| |
| /* We managed to set up a mount namespace, and are now in a child. That's great. In this case |
| * we can use the same directory for all cases, after turning off propagation. Question |
| * though is: where do we turn off propagation exactly, and where do we place the workspace |
| * directory? We need some place that is guaranteed to be a mount point in the host, and |
| * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this, |
| * since we ultimately want to move the resulting file system there, i.e. we need propagation |
| * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that |
| * would be visible in the host mount table all the time, which we want to avoid. Hence, what |
| * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that |
| * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off |
| * propagation on the former, and then overmount the latter. |
| * |
| * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist |
| * for this purpose, but there are few other candidates that work equally well for us, and |
| * given that the we do this in a privately namespaced short-lived single-threaded process |
| * that no one else sees this should be OK to do. */ |
| |
| r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */ |
| if (r < 0) |
| goto child_fail; |
| |
| r = setup_credentials_internal( |
| context, |
| params, |
| unit, |
| p, /* final mount point */ |
| "/dev/shm", /* temporary workspace to overmount */ |
| false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */ |
| true, /* insist that something is mounted, do not allow fallback to plain directory */ |
| uid); |
| if (r < 0) |
| goto child_fail; |
| |
| _exit(EXIT_SUCCESS); |
| |
| child_fail: |
| _exit(EXIT_FAILURE); |
| } |
| |
| return 0; |
| } |
| |
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context configures a process label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from 'executable_fd'
 * is applied, or SMACK_DEFAULT_PROCESS_LABEL if none could be read.
 *
 * Returns 0 on success, a negative errno-style error code otherwise. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                /* An explicitly configured label always wins */
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* Failing with ENODATA or EOPNOTSUPP is tolerated here; exec_label then stays NULL and the
                 * build-time default is applied below. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
| |
| static int compile_bind_mounts( |
| const ExecContext *context, |
| const ExecParameters *params, |
| BindMount **ret_bind_mounts, |
| size_t *ret_n_bind_mounts, |
| char ***ret_empty_directories) { |
| |
| _cleanup_strv_free_ char **empty_directories = NULL; |
| BindMount *bind_mounts; |
| size_t n, h = 0; |
| int r; |
| |
| assert(context); |
| assert(params); |
| assert(ret_bind_mounts); |
| assert(ret_n_bind_mounts); |
| assert(ret_empty_directories); |
| |
| n = context->n_bind_mounts; |
| for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { |
| if (!params->prefix[t]) |
| continue; |
| |
| n += strv_length(context->directories[t].paths); |
| } |
| |
| if (n <= 0) { |
| *ret_bind_mounts = NULL; |
| *ret_n_bind_mounts = 0; |
| *ret_empty_directories = NULL; |
| return 0; |
| } |
| |
| |