blob: 37f63a93788363af422f65310feb6e26c21a4599 [file] [log] [blame]
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/shm.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
#include <utmpx.h>
#if HAVE_PAM
#include <security/pam_appl.h>
#endif
#if HAVE_SELINUX
#include <selinux/selinux.h>
#endif
#if HAVE_SECCOMP
#include <seccomp.h>
#endif
#if HAVE_APPARMOR
#include <sys/apparmor.h>
#endif
#include "sd-messages.h"
#include "acl-util.h"
#include "af-list.h"
#include "alloc-util.h"
#if HAVE_APPARMOR
#include "apparmor-util.h"
#endif
#include "async.h"
#include "barrier.h"
#include "cap-list.h"
#include "capability-util.h"
#include "cgroup-setup.h"
#include "chown-recursive.h"
#include "cpu-set-util.h"
#include "data-fd-util.h"
#include "def.h"
#include "env-file.h"
#include "env-util.h"
#include "errno-list.h"
#include "escape.h"
#include "execute.h"
#include "exit-status.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "glob-util.h"
#include "hexdecoct.h"
#include "io-util.h"
#include "ioprio.h"
#include "label.h"
#include "log.h"
#include "macro.h"
#include "manager.h"
#include "manager-dump.h"
#include "memory-util.h"
#include "missing_fs.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "random-util.h"
#include "rlimit-util.h"
#include "rm-rf.h"
#if HAVE_SECCOMP
#include "seccomp-util.h"
#endif
#include "securebits-util.h"
#include "selinux-util.h"
#include "signal-util.h"
#include "smack-util.h"
#include "socket-util.h"
#include "special.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "syslog-util.h"
#include "terminal-util.h"
#include "tmpfile-util.h"
#include "umask-util.h"
#include "unit-serialize.h"
#include "user-util.h"
#include "utmp-wtmp.h"
/* Two idle timeouts. They are not referenced in this part of the file; given the USEC_PER_SEC factor they
 * are presumably 5s and 1s expressed in microseconds — confirm at the use sites. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
/* Send buffer size (8*1024*1024 bytes) passed to fd_inc_sndbuf() for the journal stream socket set up in
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
/* Moves the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the array is laid
 * out contiguously right after stdin/stdout/stderr. The array is updated in place.
 *
 * Returns 0 on success, -errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        /* Modifies the fds array! */
        assert(fds);

        do {
                retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, moved;

                        /* Nothing to do if this one already sits at the right number */
                        if (fds[idx] != wanted) {
                                moved = fcntl(fds[idx], F_DUPFD, wanted);
                                if (moved < 0)
                                        return -errno;

                                safe_close(fds[idx]);
                                fds[idx] = moved;

                                /* F_DUPFD hands out the lowest free fd >= wanted. If the slot we were
                                 * after was still occupied, remember the first such index so that we
                                 * can make another pass from there. */
                                if (retry_at < 0 && moved != wanted)
                                        retry_at = idx;
                        }
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
/* Adjusts the file descriptor flags of the fds we are about to pass to a child: the first n_socket_fds
 * entries get O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is dropped from all
 * n_socket_fds + n_storage_fds entries.
 *
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {
                int cur = fds[idx], r;

                /* O_NONBLOCK is only touched for the socket activation fds, which come first */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(cur, nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always removed: these fds are meant to survive into our children */
                r = fd_cloexec(cur, false);
                if (r < 0)
                        return r;
        }

        return 0;
}
/* Returns the TTY path to use for this execution context: NULL if stdio is passed in as fds, the
 * configured tty_path if there is one, and "/dev/console" otherwise. */
static const char *exec_context_tty_path(const ExecContext *context) {
        assert(context);

        if (context->stdio_as_fds)
                return NULL;

        return context->tty_path ? context->tty_path : "/dev/console";
}
/* Applies the TTY clean-up options of the context (tty_vhangup, tty_reset, tty_vt_disallocate). For
 * vhangup and reset an explicitly passed stdin fd in 'p' takes precedence over the TTY path. All
 * operations are best-effort; errors are ignored. 'p' may be NULL. */
static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
        const char *tty;
        int have_stdin_fd;

        assert(context);

        tty = exec_context_tty_path(context);
        have_stdin_fd = p && p->stdin_fd >= 0;

        if (context->tty_vhangup) {
                if (have_stdin_fd)
                        (void) terminal_vhangup_fd(p->stdin_fd);
                else if (tty)
                        (void) terminal_vhangup(tty);
        }

        if (context->tty_reset) {
                if (have_stdin_fd)
                        (void) reset_terminal_fd(p->stdin_fd, true);
                else if (tty)
                        (void) reset_terminal(tty);
        }

        /* Disallocation always works on the path, never on the fd */
        if (tty && context->tty_vt_disallocate)
                (void) vt_disallocate(tty);
}
/* Returns true if the input mode is one of the TTY variants. */
static bool is_terminal_input(ExecInput i) {
        return i == EXEC_INPUT_TTY ||
               i == EXEC_INPUT_TTY_FORCE ||
               i == EXEC_INPUT_TTY_FAIL;
}
/* Returns true if the output mode writes to a terminal, either exclusively or in addition to kmsg or
 * the journal. */
static bool is_terminal_output(ExecOutput o) {
        return o == EXEC_OUTPUT_TTY ||
               o == EXEC_OUTPUT_KMSG_AND_CONSOLE ||
               o == EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
}
/* Returns true if the output mode writes to kmsg, with or without the console. */
static bool is_kmsg_output(ExecOutput o) {
        return o == EXEC_OUTPUT_KMSG ||
               o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
}
/* Tells whether $TERM should be set to something useful for this context: that is the case if stdin,
 * stdout or stderr is connected to a terminal, or if a TTY path is configured explicitly. */
static bool exec_context_needs_term(const ExecContext *c) {
        assert(c);

        return is_terminal_input(c->std_input) ||
               is_terminal_output(c->std_output) ||
               is_terminal_output(c->std_error) ||
               !!c->tty_path;
}
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting fd to 'nfd'.
 * Returns -errno if the open fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
/* Connects the already created AF_UNIX socket 'fd' to the journal's stdout stream socket.
 *
 * If 'log_namespace' is set, the socket "/run/systemd/journal.<namespace>/stdout" is used, otherwise
 * "/run/systemd/journal/stdout". If 'gid' and/or 'uid' are valid, the effective GID/UID is switched to
 * them for the duration of the connect() call and switched back afterwards.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int connect_journal_socket(
int fd,
const char *log_namespace,
uid_t uid,
gid_t gid) {
union sockaddr_union sa;
socklen_t sa_len;
uid_t olduid = UID_INVALID;
gid_t oldgid = GID_INVALID;
const char *j;
int r;
/* Pick the per-namespace socket if a namespace is configured, the default one otherwise */
j = log_namespace ?
strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
"/run/systemd/journal/stdout";
/* On success the return value is used as the socket address length */
r = sockaddr_un_set_path(&sa.un, j);
if (r < 0)
return r;
sa_len = r;
/* Switch the group first, then the user.
 * NOTE(review): the values remembered here come from getgid()/getuid(), i.e. the real IDs, and are
 * re-applied below as effective IDs. */
if (gid_is_valid(gid)) {
oldgid = getgid();
if (setegid(gid) < 0)
return -errno;
}
if (uid_is_valid(uid)) {
olduid = getuid();
if (seteuid(uid) < 0) {
r = -errno;
/* The UID was not changed, hence only the GID needs to be reverted */
goto restore_gid;
}
}
r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
/* If we fail to restore the uid or gid, things will likely
fail later on. This should only happen if an LSM interferes. */
if (uid_is_valid(uid))
(void) seteuid(olduid);
restore_gid:
if (gid_is_valid(gid))
(void) setegid(oldgid);
return r;
}
/* Creates a stream connection to the journal, writes the stream header (identifier, unit id, priority,
 * level-prefix flag, a constant false, kmsg flag, console flag - one per line) and installs the
 * connection as fd number 'nfd'.
 *
 * 'uid'/'gid' are handed on to connect_journal_socket() and select the credentials used for the
 * connect(). Returns a negative errno-style value on failure, otherwise the result of move_fd(). */
static int connect_logger_as(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                ExecOutput output,
                const char *ident,
                int nfd,
                uid_t uid,
                gid_t gid) {

        _cleanup_close_ int stream_fd = -1;
        const char *identifier, *unit_id;
        int r;

        assert(context);
        assert(params);
        assert(output < _EXEC_OUTPUT_MAX);
        assert(ident);
        assert(nfd >= 0);

        stream_fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (stream_fd < 0)
                return -errno;

        r = connect_journal_socket(stream_fd, context->log_namespace, uid, gid);
        if (r < 0)
                return r;

        /* We only ever write to this connection */
        if (shutdown(stream_fd, SHUT_RD) < 0)
                return -errno;

        /* Best effort, a smaller buffer works too */
        (void) fd_inc_sndbuf(stream_fd, SNDBUF_SIZE);

        /* An explicitly configured identifier wins over the one our caller suggested */
        identifier = context->syslog_identifier ? context->syslog_identifier : ident;
        unit_id = (params->flags & EXEC_PASS_LOG_UNIT) ? unit->id : "";

        r = dprintf(stream_fd,
                    "%s\n"
                    "%s\n"
                    "%i\n"
                    "%i\n"
                    "%i\n"
                    "%i\n"
                    "%i\n",
                    identifier,
                    unit_id,
                    context->syslog_priority,
                    !!context->syslog_level_prefix,
                    false,
                    is_kmsg_output(output),
                    is_terminal_output(output));
        if (r < 0)
                return -errno;

        return move_fd(TAKE_FD(stream_fd), nfd, false);
}
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the fd to 'nfd'. Returns
 * the negative error of open_terminal() on failure, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
static int acquire_path(const char *path, int flags, mode_t mode) {
union sockaddr_union sa;
socklen_t sa_len;
_cleanup_close_ int fd = -1;
int r;
assert(path);
if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
flags |= O_CREAT;
fd = open(path, flags|O_NOCTTY, mode);
if (fd >= 0)
return TAKE_FD(fd);
if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
return -errno;
/* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
r = sockaddr_un_set_path(&sa.un, path);
if (r < 0)
return r == -EINVAL ? -ENXIO : r;
sa_len = r;
fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd < 0)
return -errno;
if (connect(fd, &sa.sa, sa_len) < 0)
return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
* indication that this wasn't an AF_UNIX socket after all */
if ((flags & O_ACCMODE) == O_RDONLY)
r = shutdown(fd, SHUT_WR);
else if ((flags & O_ACCMODE) == O_WRONLY)
r = shutdown(fd, SHUT_RD);
else
r = 0;
if (r < 0)
return -errno;
return TAKE_FD(fd);
}
/* Returns the effective stdin mode for the context. The configured mode is downgraded to
 * EXEC_INPUT_NULL if it cannot be honoured: a TTY mode while TTY stdin shall not be applied, socket
 * input without a socket fd, or data input without any data. */
static int fixup_input(
                const ExecContext *context,
                int socket_fd,
                bool apply_tty_stdin) {

        ExecInput configured;

        assert(context);

        configured = context->std_input;

        if ((is_terminal_input(configured) && !apply_tty_stdin) ||
            (configured == EXEC_INPUT_SOCKET && socket_fd < 0) ||
            (configured == EXEC_INPUT_DATA && context->stdin_data_size == 0))
                return EXEC_INPUT_NULL;

        return configured;
}
/* Returns the effective output mode: socket output without a socket fd falls back to
 * EXEC_OUTPUT_INHERIT, everything else is passed through unchanged. */
static int fixup_output(ExecOutput output, int socket_fd) {
        return (output == EXEC_OUTPUT_SOCKET && socket_fd < 0) ? EXEC_OUTPUT_INHERIT : output;
}
static int setup_input(
const ExecContext *context,
const ExecParameters *params,
int socket_fd,
const int named_iofds[static 3]) {
ExecInput i;
assert(context);
assert(params);
assert(named_iofds);
if (params->stdin_fd >= 0) {
if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
return -errno;
/* Try to make this the controlling tty, if it is a tty, and reset it */
if (isatty(STDIN_FILENO)) {
(void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
(void) reset_terminal_fd(STDIN_FILENO, true);
}
return STDIN_FILENO;
}
i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
switch (i) {
case EXEC_INPUT_NULL:
return open_null_as(O_RDONLY, STDIN_FILENO);
case EXEC_INPUT_TTY:
case EXEC_INPUT_TTY_FORCE:
case EXEC_INPUT_TTY_FAIL: {
int fd;
fd = acquire_terminal(exec_context_tty_path(context),
i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
ACQUIRE_TERMINAL_WAIT,
USEC_INFINITY);
if (fd < 0)
return fd;
return move_fd(fd, STDIN_FILENO, false);
}
case EXEC_INPUT_SOCKET:
assert(socket_fd >= 0);
return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
case EXEC_INPUT_NAMED_FD:
assert(named_iofds[STDIN_FILENO] >= 0);
(void) fd_nonblock(named_iofds[STDIN_FILENO], false);
return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
case EXEC_INPUT_DATA: {
int fd;
fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
if (fd < 0)
return fd;
return move_fd(fd, STDIN_FILENO, false);
}
case EXEC_INPUT_FILE: {
bool rw;
int fd;
assert(context->stdio_file[STDIN_FILENO]);
rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
(context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
if (fd < 0)
return fd;
return move_fd(fd, STDIN_FILENO, false);
}
default:
assert_not_reached("Unknown input type");
}
}
/* Decides whether stderr can simply be a dup() of the stdout fd, given the effective stdout mode 'o'
 * and stderr mode 'e'. That is possible if stderr inherits, or if both use the same mode and - for
 * named fds and files - also refer to the same fd name or file. */
static bool can_inherit_stderr_from_stdout(
                const ExecContext *context,
                ExecOutput o,
                ExecOutput e) {

        assert(context);

        if (e == EXEC_OUTPUT_INHERIT)
                return true;

        if (e != o)
                return false;

        switch (e) {

        case EXEC_OUTPUT_NAMED_FD:
                return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE:
                return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);

        default:
                return true;
        }
}
/* Sets up stdout or stderr (selected by 'fileno', STDOUT_FILENO or STDERR_FILENO) of the calling process.
 *
 * An explicit params->stdout_fd/stderr_fd overrides the configured mode. Otherwise the effective mode is
 * derived from the context via fixup_input()/fixup_output(). stderr is duplicated from stdout whenever
 * can_inherit_stderr_from_stdout() permits it, and stdout in "inherit" mode follows whatever stdin is
 * connected to. This function expects stdin (and, for stderr, stdout) to be set up already.
 *
 * If the fd ends up connected to the journal stream socket, the device and inode numbers of that fd are
 * stored in *journal_stream_dev and *journal_stream_ino (stderr takes precedence over stdout).
 *
 * 'socket_fd' is the activation socket (or < 0), 'named_iofds' the fds for named-fd modes, 'ident' the
 * fallback log identifier, 'uid'/'gid' the credentials used when connecting to the journal.
 *
 * Returns a negative errno-style value on failure; on success 'fileno' or the return value of the helper
 * that installed the fd. */
static int setup_output(
const Unit *unit,
const ExecContext *context,
const ExecParameters *params,
int fileno,
int socket_fd,
const int named_iofds[static 3],
const char *ident,
uid_t uid,
gid_t gid,
dev_t *journal_stream_dev,
ino_t *journal_stream_ino) {
ExecOutput o;
ExecInput i;
int r;
assert(unit);
assert(context);
assert(params);
assert(ident);
assert(journal_stream_dev);
assert(journal_stream_ino);
/* Explicitly passed fds win over everything configured in the context */
if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
return -errno;
return STDOUT_FILENO;
}
if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
return -errno;
return STDERR_FILENO;
}
/* Effective stdin and stdout modes, after downgrading what cannot be honoured */
i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
o = fixup_output(context->std_output, socket_fd);
if (fileno == STDERR_FILENO) {
ExecOutput e;
e = fixup_output(context->std_error, socket_fd);
/* This expects the input and output are already set up */
/* Don't change the stderr file descriptor if we inherit all
 * the way and are not on a tty */
if (e == EXEC_OUTPUT_INHERIT &&
o == EXEC_OUTPUT_INHERIT &&
i == EXEC_INPUT_NULL &&
!is_terminal_input(context->std_input) &&
getppid() != 1)
return fileno;
/* Duplicate from stdout if possible */
if (can_inherit_stderr_from_stdout(context, o, e))
return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
/* From here on 'o' holds the mode of the fd we are actually setting up, i.e. stderr's */
o = e;
} else if (o == EXEC_OUTPUT_INHERIT) {
/* If input got downgraded, inherit the original value */
if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
/* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
/* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
if (getppid() != 1)
return fileno;
/* We need to open /dev/null here anew, to get the right access mode. */
return open_null_as(O_WRONLY, fileno);
}
switch (o) {
case EXEC_OUTPUT_NULL:
return open_null_as(O_WRONLY, fileno);
case EXEC_OUTPUT_TTY:
/* If stdin is a terminal too, share its fd instead of opening the terminal a second time */
if (is_terminal_input(i))
return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
/* We don't reset the terminal if this is just about output */
return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
case EXEC_OUTPUT_KMSG:
case EXEC_OUTPUT_KMSG_AND_CONSOLE:
case EXEC_OUTPUT_JOURNAL:
case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
if (r < 0) {
/* Not being able to log is not fatal: warn and fall back to /dev/null */
log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
fileno == STDOUT_FILENO ? "stdout" : "stderr");
r = open_null_as(O_WRONLY, fileno);
} else {
struct stat st;
/* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 * services to detect whether they are connected to the journal or not.
 *
 * If both stdout and stderr are connected to a stream then let's make sure to store the data
 * about STDERR as that's usually the best way to do logging. */
if (fstat(fileno, &st) >= 0 &&
(*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
*journal_stream_dev = st.st_dev;
*journal_stream_ino = st.st_ino;
}
}
return r;
case EXEC_OUTPUT_SOCKET:
assert(socket_fd >= 0);
return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
case EXEC_OUTPUT_NAMED_FD:
assert(named_iofds[fileno] >= 0);
(void) fd_nonblock(named_iofds[fileno], false);
return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
case EXEC_OUTPUT_FILE:
case EXEC_OUTPUT_FILE_APPEND:
case EXEC_OUTPUT_FILE_TRUNCATE: {
bool rw;
int fd, flags;
assert(context->stdio_file[fileno]);
/* If stdin reads from the very same file, reuse the stdin fd for this output */
rw = context->std_input == EXEC_INPUT_FILE &&
streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
if (rw)
return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
flags = O_WRONLY;
if (o == EXEC_OUTPUT_FILE_APPEND)
flags |= O_APPEND;
else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
flags |= O_TRUNC;
fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
if (fd < 0)
return fd;
return move_fd(fd, fileno, 0);
}
default:
assert_not_reached("Unknown error type");
}
}
/* Hands the terminal referenced by 'fd' over to 'uid' (mode TTY_MODE, group left untouched).
 *
 * Returns 1 if the fd is a TTY and ownership/mode were adjusted, 0 if the fd is not a TTY at all, and
 * a negative errno-style value on failure. */
static int chown_terminal(int fd, uid_t uid) {
        int r;

        assert(fd >= 0);

        /* Only touch the fd if it really is a tty */
        if (isatty(fd) >= 1) {
                /* This might fail. What matters are the results. */
                r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);

                return r < 0 ? r : 1;
        }

        if (errno == EINVAL || errno == ENOTTY)
                return 0; /* not a tty */

        return -errno;
}
static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
_cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
int r;
assert(_saved_stdin);
assert(_saved_stdout);
saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
if (saved_stdin < 0)
return -errno;
saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
if (saved_stdout < 0)
return -errno;
fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
if (fd < 0)
return fd;
r = chown_terminal(fd, getuid());
if (r < 0)
return r;
r = reset_terminal_fd(fd, true);
if (r < 0)
return r;
r = rearrange_stdio(fd, fd, STDERR_FILENO);
fd = -1;
if (r < 0)
return r;
*_saved_stdin = saved_stdin;
*_saved_stdout = saved_stdout;
saved_stdin = saved_stdout = -1;
return 0;
}
/* Writes a message to 'fd' explaining why the confirmation question for unit 'u' could not be asked.
 * 'err' is the negative errno-style error; a timeout gets a dedicated message. */
static void write_confirm_error_fd(int err, int fd, const Unit *u) {
        assert(err < 0);

        if (err != -ETIMEDOUT) {
                /* %m picks up the error text from errno */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
}
static void write_confirm_error(int err, const char *vc, const Unit *u) {
_cleanup_close_ int fd = -1;
assert(vc);
fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
if (fd < 0)
return;
write_confirm_error_fd(err, fd, u);
}
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin and
 * stdout, and closes the saved duplicates.
 *
 * Returns 0 on success or -errno of the last dup2() that failed. The saved fds are closed in either
 * case and the variables updated with the result of safe_close(). */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
/* Interactively asks on the console 'vc' whether the command line 'cmdline' of unit 'u' shall be
 * executed.
 *
 * stdin/stdout are temporarily redirected to the console and restored before returning.
 *
 * Returns CONFIRM_EXECUTE, CONFIRM_PRETEND_SUCCESS or CONFIRM_PRETEND_FAILURE. Any internal error
 * (console setup failure, OOM, failure to read the answer) is reported on the console if possible and
 * treated as a positive response, i.e. CONFIRM_EXECUTE. */
static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
        int saved_stdout = -1, saved_stdin = -1, r;
        _cleanup_free_ char *e = NULL;
        char c;

        /* For any internal errors, assume a positive response. */
        r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
        if (r < 0) {
                write_confirm_error(r, vc, u);
                return CONFIRM_EXECUTE;
        }

        /* confirm_spawn might have been disabled while we were sleeping. */
        if (manager_is_confirm_spawn_disabled(u->manager)) {
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        /* Shorten overly long command lines for the prompt */
        e = ellipsize(cmdline, 60, 100);
        if (!e) {
                log_oom();
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        for (;;) {
                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
                if (r < 0) {
                        write_confirm_error_fd(r, STDOUT_FILENO, u);
                        r = CONFIRM_EXECUTE;
                        goto restore_stdio;
                }

                /* Informational choices 'continue' the loop to ask again, decisions 'break' out of the
                 * switch and then out of the loop. */
                switch (c) {

                case 'c':
                        printf("Resuming normal execution.\n");
                        manager_disable_confirm_spawn();
                        r = CONFIRM_EXECUTE;
                        break;

                case 'D':
                        unit_dump(u, stdout, " ");
                        continue; /* ask again */

                case 'f':
                        printf("Failing execution.\n");
                        r = CONFIRM_PRETEND_FAILURE;
                        break;

                case 'h':
                        printf(" c - continue, proceed without asking anymore\n"
                               " D - dump, show the state of the unit\n"
                               " f - fail, don't execute the command and pretend it failed\n"
                               " h - help\n"
                               " i - info, show a short summary of the unit\n"
                               " j - jobs, show jobs that are in progress\n"
                               " s - skip, don't execute the command and pretend it succeeded\n"
                               " y - yes, execute the command\n");
                        continue; /* ask again */

                case 'i':
                        /* The arguments must follow the order of the labels: description first, then
                         * the unit id. */
                        printf(" Description: %s\n"
                               " Unit: %s\n"
                               " Command: %s\n",
                               u->description, u->id, cmdline);
                        continue; /* ask again */

                case 'j':
                        manager_dump_jobs(u->manager, stdout, " ");
                        continue; /* ask again */

                case 'n':
                        /* 'n' was removed in favor of 'f'. */
                        printf("Didn't understand 'n', did you mean 'f'?\n");
                        continue; /* ask again */

                case 's':
                        printf("Skipping execution.\n");
                        r = CONFIRM_PRETEND_SUCCESS;
                        break;

                case 'y':
                        r = CONFIRM_EXECUTE;
                        break;

                default:
                        assert_not_reached("Unhandled choice");
                }

                break;
        }

restore_stdio:
        restore_confirm_stdio(&saved_stdin, &saved_stdout);
        return r;
}
/* Resolves the user configured in the context, if any, via get_user_creds(). On success the resolved
 * name is stored in *user; uid, gid, home and shell are filled in by get_user_creds().
 *
 * Returns 0 on success or if no user is configured (nothing is written then), and the negative error
 * of get_user_creds() on failure. */
static int get_fixed_user(const ExecContext *c, const char **user,
                          uid_t *uid, gid_t *gid,
                          const char **home, const char **shell) {
        const char *resolved;
        int r;

        assert(c);

        resolved = c->user;
        if (!resolved)
                return 0;

        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
         * (i.e. are "/" or "/bin/nologin"). */
        r = get_user_creds(&resolved, uid, gid, home, shell, USER_CREDS_CLEAN);
        if (r >= 0) {
                *user = resolved;
                return 0;
        }

        return r;
}
/* Resolves the group configured in the context, if any, via get_group_creds(). On success the
 * resolved name is stored in *group and the GID in *gid.
 *
 * Returns 0 on success or if no group is configured (nothing is written then), and the negative error
 * of get_group_creds() on failure. */
static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
        const char *resolved;
        int r;

        assert(c);

        resolved = c->group;
        if (!resolved)
                return 0;

        r = get_group_creds(&resolved, gid, 0);
        if (r >= 0) {
                *group = resolved;
                return 0;
        }

        return r;
}
static int get_supplementary_groups(const ExecContext *c, const char *user,
const char *group, gid_t gid,
gid_t **supplementary_gids, int *ngids) {
char **i;
int r, k = 0;
int ngroups_max;
bool keep_groups = false;
gid_t *groups = NULL;
_cleanup_free_ gid_t *l_gids = NULL;
assert(c);
/*
* If user is given, then lookup GID and supplementary groups list.
* We avoid NSS lookups for gid=0. Also we have to initialize groups
* here and as early as possible so we keep the list of supplementary
* groups of the caller.
*/
if (user && gid_is_valid(gid) && gid != 0) {
/* First step, initialize groups from /etc/groups */
if (initgroups(user, gid) < 0)
return -errno;
keep_groups = true;
}
if (strv_isempty(c->supplementary_groups))
return 0;
/*
* If SupplementaryGroups= was passed then NGROUPS_MAX has to
* be positive, otherwise fail.
*/
errno = 0;
ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
if (ngroups_max <= 0)
return errno_or_else(EOPNOTSUPP);
l_gids = new(gid_t, ngroups_max);
if (!l_gids)
return -ENOMEM;
if (keep_groups) {
/*
* Lookup the list of groups that the user belongs to, we
* avoid NSS lookups here too for gid=0.
*/
k = ngroups_max;
if (getgrouplist(user, gid, l_gids, &k) < 0)
return -EINVAL;
} else
k = 0;
STRV_FOREACH(i, c->supplementary_groups) {
const char *g;
if (k >= ngroups_max)
return -E2BIG;
g = *i;
r = get_group_creds(&g, l_gids+k, 0);
if (r < 0)
return r;
k++;
}
/*
* Sets ngids to zero to drop all supplementary groups, happens
* when we are under root and SupplementaryGroups= is empty.
*/
if (k == 0) {
*ngids = 0;
return 0;
}
/* Otherwise get the final list of supplementary groups */
groups = memdup(l_gids, sizeof(gid_t) * k);
if (!groups)
return -ENOMEM;
*supplementary_gids = groups;
*ngids = k;
groups = NULL;
return 0;
}
/* Applies the supplementary groups (if any) and then the real, effective and saved GID (if 'gid' is
 * valid) to the calling process. Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* SupplementaryGroups= is only applied if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value, 1 if they were changed, and -errno if
 * reading or writing them failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;

        /* Only issue the second prctl() if there is actually something to change */
        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
/* Switches the real, effective and saved UID of the calling process to 'uid'. Does nothing if 'uid' is
 * invalid. The UID is only applied here, not looked up.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int enforce_user(const ExecContext *context, uid_t uid) {
        int r;

        assert(context);

        if (!uid_is_valid(uid))
                return 0;

        /* First step: if ambient capabilities or secure bits are configured and we are about to become
         * a non-root user, add KEEP_CAPS to the securebits so that our capabilities survive the UID
         * change. (Setting secure bits requires CAP_SETPCAP, which is why keep-caps is needed in that
         * case too.) */
        if ((context->capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
                r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
                if (r < 0)
                        return r;
        }

        /* Second step: actually set the uids */
        if (setresuid(uid, uid, uid) < 0)
                return -errno;

        /* We should now hold all necessary capabilities while otherwise being a normal user. The
         * capability sets may have been altered by the setresuid() though; cleaning them up is left to
         * our caller. */
        return 0;
}
#if HAVE_PAM
/* PAM conversation callback that refuses any conversation: all arguments are ignored and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        (void) num_msg;
        (void) msg;
        (void) resp;
        (void) appdata_ptr;

        /* We don't support conversations */
        return PAM_CONV_ERR;
}
#endif
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process "(sd-pam)" that
 * tears the PAM state down again once it receives SIGTERM or its parent has gone away.
 *
 * 'uid'/'gid' are the credentials the helper process drops to. 'tty' is set as PAM_TTY; if it is NULL
 * the TTY name of stdin is used, if one can be determined. The strings in *env are pushed into the PAM
 * environment; on success *env is replaced by the environment list returned by PAM. 'fds'/'n_fds' are
 * the fds that are closed in the helper process.
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, and another
 * negative errno-style value for non-PAM failures. Without PAM support compiled in this is a no-op that
 * returns 0. */
static int setup_pam(
const char *name,
const char *user,
uid_t uid,
gid_t gid,
const char *tty,
char ***env, /* updated on success */
const int fds[], size_t n_fds) {
#if HAVE_PAM
static const struct pam_conv conv = {
.conv = null_conv,
.appdata_ptr = NULL
};
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_strv_free_ char **e = NULL;
pam_handle_t *handle = NULL;
sigset_t old_ss;
int pam_code = PAM_SUCCESS, r;
char **nv;
bool close_session = false;
pid_t pam_pid = 0, parent_pid;
int flags = 0;
assert(name);
assert(user);
assert(env);
/* We set up PAM in the parent process, then fork. The child
 * will then stay around until killed via PR_SET_PDEATHSIG or
 * systemd via the cgroup logic. It will then remove the PAM
 * session again. The parent process will exec() the actual
 * daemon. We do things this way to ensure that the main PID
 * of the daemon is the one we initially fork()ed. */
r = barrier_create(&barrier);
if (r < 0)
goto fail;
/* Keep PAM quiet unless we log at debug level */
if (log_get_max_level() < LOG_DEBUG)
flags |= PAM_SILENT;
pam_code = pam_start(name, user, &conv, &handle);
if (pam_code != PAM_SUCCESS) {
handle = NULL;
goto fail;
}
if (!tty) {
_cleanup_free_ char *q = NULL;
/* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
 * out if that's the case, and read the TTY off it. */
if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
tty = strjoina("/dev/", q);
}
if (tty) {
pam_code = pam_set_item(handle, PAM_TTY, tty);
if (pam_code != PAM_SUCCESS)
goto fail;
}
/* Make our environment available to the PAM modules */
STRV_FOREACH(nv, *env) {
pam_code = pam_putenv(handle, *nv);
if (pam_code != PAM_SUCCESS)
goto fail;
}
pam_code = pam_acct_mgmt(handle, flags);
if (pam_code != PAM_SUCCESS)
goto fail;
/* Failure to establish credentials is logged but not fatal */
pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
if (pam_code != PAM_SUCCESS)
log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
pam_code = pam_open_session(handle, flags);
if (pam_code != PAM_SUCCESS)
goto fail;
/* From here on the failure path has to close the session again */
close_session = true;
e = pam_getenvlist(handle);
if (!e) {
pam_code = PAM_BUF_ERR;
goto fail;
}
/* Block SIGTERM, so that we know that it won't get lost in the child */
assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
/* Remembered so that the child can detect via getppid() whether we are still around */
parent_pid = getpid_cached();
r = safe_fork("(sd-pam)", 0, &pam_pid);
if (r < 0)
goto fail;
if (r == 0) {
int sig, ret = EXIT_PAM;
/* The child's job is to reset the PAM session on termination */
barrier_set_role(&barrier, BARRIER_CHILD);
/* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
 * those fds are open here that have been opened by PAM. */
(void) close_many(fds, n_fds);
/* Drop privileges - we don't need any to pam_close_session and this will make
 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
 * threads to fail to exit normally */
r = maybe_setgroups(0, NULL);
if (r < 0)
log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
if (setresgid(gid, gid, gid) < 0)
log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
if (setresuid(uid, uid, uid) < 0)
log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
(void) ignore_signals(SIGPIPE);
/* Wait until our parent died. This will only work if the above setresuid() succeeds,
 * otherwise the kernel will not allow unprivileged parents kill their privileged children
 * this way. We rely on the control groups kill logic to do the rest for us. */
if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
goto child_finish;
/* Tell the parent that our setup is done. This is especially important regarding dropping
 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
 *
 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
(void) barrier_place(&barrier);
/* Check if our parent process might already have died? */
if (getppid() == parent_pid) {
sigset_t ss;
assert_se(sigemptyset(&ss) >= 0);
assert_se(sigaddset(&ss, SIGTERM) >= 0);
/* Sleep until SIGTERM arrives, retrying if sigwait() reports EINTR */
for (;;) {
if (sigwait(&ss, &sig) < 0) {
if (errno == EINTR)
continue;
goto child_finish;
}
assert(sig == SIGTERM);
break;
}
}
pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
if (pam_code != PAM_SUCCESS)
goto child_finish;
/* If our parent died we'll end the session */
if (getppid() != parent_pid) {
pam_code = pam_close_session(handle, flags);
if (pam_code != PAM_SUCCESS)
goto child_finish;
}
ret = 0;
child_finish:
/* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
 * know about this. See pam_end(3) */
(void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
_exit(ret);
}
barrier_set_role(&barrier, BARRIER_PARENT);
/* If the child was forked off successfully it will do all the cleanups, so forget about the handle
 * here. */
handle = NULL;
/* Unblock SIGTERM again in the parent */
assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
/* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
 * this fd around. */
closelog();
/* Synchronously wait for the child to initialize. We don't care for errors as we cannot
 * recover. However, warn loudly if it happens. */
if (!barrier_place_and_sync(&barrier))
log_error("PAM initialization failed");
/* Hand the environment as modified by PAM back to the caller */
return strv_free_and_replace(*env, e);
fail:
/* Either a PAM call failed (pam_code is set) or something else did (r holds the error) */
if (pam_code != PAM_SUCCESS) {
log_error("PAM failed: %s", pam_strerror(handle, pam_code));
r = -EPERM; /* PAM errors do not map to errno */
} else
log_error_errno(r, "PAM failed: %m");
if (handle) {
if (close_session)
pam_code = pam_close_session(handle, flags);
(void) pam_end(handle, pam_code | flags);
}
closelog();
return r;
#else
return 0;
#endif
}
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of
 * 'path' and limited to its last 8 characters. If there is no last component "(...)" is used. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *tail;
        size_t n;

        /* The result has to fit into 10 characters (the length of "/sbin/init") to look pretty in
         * /bin/ps: two parentheses plus at most 8 characters of the name. */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* Keep the end of the name: it is usually more interesting, since the beginning might
                 * just be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        process_name[0] = '(';
        memcpy(process_name + 1, tail, n);
        process_name[n + 1] = ')';
        process_name[n + 2] = 0;

        rename_process(process_name);
}
/* Returns true if an address family restriction is configured: either the list is marked as an
 * allow list, or it is non-empty. */
static bool context_has_address_families(const ExecContext *c) {
        assert(c);

        if (c->address_families_allow_list)
                return true;

        return !set_isempty(c->address_families);
}
/* Returns true if a system call filter is configured: either the filter is marked as an allow list,
 * or it is non-empty. */
static bool context_has_syscall_filters(const ExecContext *c) {
        assert(c);

        if (c->syscall_allow_list)
                return true;

        return !hashmap_isempty(c->syscall_filter);
}
/* Returns true if system call logging is configured: either the log list is marked as an allow list,
 * or it is non-empty. */
static bool context_has_syscall_logs(const ExecContext *c) {
        assert(c);

        if (c->syscall_log_allow_list)
                return true;

        return !hashmap_isempty(c->syscall_log);
}
/* Returns true if the no-new-privileges flag is in effect for this context: either because it is
 * configured explicitly, or because we lack CAP_SYS_ADMIN and one of the listed options is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (c->lock_personality ||
            c->memory_deny_write_execute ||
            c->private_devices ||
            c->protect_clock ||
            c->protect_hostname ||
            c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->protect_kernel_logs)
                return true;

        if (context_has_address_families(c) ||
            exec_context_restrict_namespaces_set(c))
                return true;

        if (c->restrict_realtime ||
            c->restrict_suid_sgid)
                return true;

        return !set_isempty(c->syscall_archs) ||
               context_has_syscall_filters(c) ||
               context_has_syscall_logs(c);
}
/* Returns true if the context has any credentials configured, i.e. set_credentials is non-empty or
 * load_credentials is set. */
static bool exec_context_has_credentials(const ExecContext *context) {
        assert(context);

        if (!hashmap_isempty(context->set_credentials))
                return true;

        return !!context->load_credentials;
}
#if HAVE_SECCOMP
/* Returns true if seccomp is not available and the setting named by 'msg' hence has to be skipped; a
 * debug message is logged for unit 'u' in that case. Returns false if seccomp is available. */
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
        if (!is_seccomp_available()) {
                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
                return true;
        }

        return false;
}
/* Installs the system call filter configured in the context. Returns 0 if nothing is configured or
 * seccomp is unavailable, a negative value if extending the filter fails, and otherwise whatever
 * seccomp_load_syscall_filter_set_raw() returns. */
static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
        uint32_t deny_action, fallback_action, listed_action;
        int r;

        assert(u);
        assert(c);

        if (!context_has_syscall_filters(c) || skip_seccomp_unavailable(u, "SystemCallFilter="))
                return 0;

        /* What to do with a system call that is not permitted: kill the process or fail with an errno */
        if (c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL)
                deny_action = scmp_act_kill_process();
        else
                deny_action = SCMP_ACT_ERRNO(c->syscall_errno);

        /* With an allow list the listed calls are allowed and everything else is denied; with a deny
         * list it is the other way round. */
        fallback_action = c->syscall_allow_list ? deny_action : SCMP_ACT_ALLOW;
        listed_action = c->syscall_allow_list ? SCMP_ACT_ALLOW : deny_action;

        if (needs_ambient_hack) {
                /* Add the @setuid filter set to the configured filter before loading it */
                r = seccomp_filter_set_add(
                                c->syscall_filter,
                                c->syscall_allow_list,
                                &syscall_filter_sets[SYSCALL_FILTER_SET_SETUID]);
                if (r < 0)
                        return r;
        }

        return seccomp_load_syscall_filter_set_raw(fallback_action, c->syscall_filter, listed_action, false);
}
/* Installs the system call logging filter configured in the context. Returns 0 if nothing is configured,
 * if seccomp is unavailable, or if libseccomp lacks SCMP_ACT_LOG; otherwise returns the result of
 * seccomp_load_syscall_filter_set_raw(). */
static int apply_syscall_log(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!context_has_syscall_logs(c))
                return 0;

#ifdef SCMP_ACT_LOG
        if (skip_seccomp_unavailable(u, "SystemCallLog="))
                return 0;

        /* Allow list: log nothing but the ones listed. Otherwise: log everything but the ones listed. */
        uint32_t fallback_action = c->syscall_log_allow_list ? SCMP_ACT_ALLOW : SCMP_ACT_LOG;
        uint32_t listed_action = c->syscall_log_allow_list ? SCMP_ACT_LOG : SCMP_ACT_ALLOW;

        return seccomp_load_syscall_filter_set_raw(fallback_action, c->syscall_log, listed_action, false);
#else
        /* old libseccomp */
        log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
        return 0;
#endif
}
/* Restricts the permitted system call architectures to the configured set. Returns 0 if no set is
 * configured or seccomp is unavailable, otherwise the result of seccomp_restrict_archs(). */
static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (set_isempty(c->syscall_archs) || skip_seccomp_unavailable(u, "SystemCallArchitectures="))
                return 0;

        return seccomp_restrict_archs(c->syscall_archs);
}
/* Applies the configured address family restriction. Returns 0 if none is configured or seccomp is
 * unavailable, otherwise the result of seccomp_restrict_address_families(). */
static int apply_address_families(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!context_has_address_families(c) || skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
                return 0;

        return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
}
/* Applies MemoryDenyWriteExecute= if enabled. Returns 0 if the setting is off or seccomp is
 * unavailable, otherwise the result of seccomp_memory_deny_write_execute(). */
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->memory_deny_write_execute || skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
                return 0;

        return seccomp_memory_deny_write_execute();
}
/* Applies RestrictRealtime= if enabled. Returns 0 if the setting is off or seccomp is unavailable,
 * otherwise the result of seccomp_restrict_realtime(). */
static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->restrict_realtime || skip_seccomp_unavailable(u, "RestrictRealtime="))
                return 0;

        return seccomp_restrict_realtime();
}
/* Applies RestrictSUIDSGID= if enabled. Returns 0 if the setting is off or seccomp is unavailable,
 * otherwise the result of seccomp_restrict_suid_sgid(). */
static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->restrict_suid_sgid || skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
                return 0;

        return seccomp_restrict_suid_sgid();
}
/* Seccomp part of ProtectKernelTunables=. Returns 0 if the setting is off or seccomp is unavailable,
 * otherwise the result of seccomp_protect_sysctl(). */
static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        /* This turns off the legacy sysctl() system call. Many distributions already disable it when
         * building the kernel, but let's also protect the systems where it was left enabled. */

        if (!c->protect_kernel_tunables || skip_seccomp_unavailable(u, "ProtectKernelTunables="))
                return 0;

        return seccomp_protect_sysctl();
}
/* Seccomp part of ProtectKernelModules=: makes the system calls of the module filter set fail with
 * EPERM. Returns 0 if the setting is off or seccomp is unavailable. */
static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        /* Turn off module syscalls on ProtectKernelModules=yes */
        if (!c->protect_kernel_modules || skip_seccomp_unavailable(u, "ProtectKernelModules="))
                return 0;

        return seccomp_load_syscall_filter_set(
                        SCMP_ACT_ALLOW,
                        &syscall_filter_sets[SYSCALL_FILTER_SET_MODULE],
                        SCMP_ACT_ERRNO(EPERM),
                        false);
}
/* Seccomp part of ProtectKernelLogs=. Returns 0 if the setting is off or seccomp is unavailable,
 * otherwise the result of seccomp_protect_syslog(). */
static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->protect_kernel_logs || skip_seccomp_unavailable(u, "ProtectKernelLogs="))
                return 0;

        return seccomp_protect_syslog();
}
/* Seccomp part of ProtectClock=: makes the system calls of the clock filter set fail with EPERM.
 * Returns 0 if the setting is off or seccomp is unavailable. */
static int apply_protect_clock(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->protect_clock || skip_seccomp_unavailable(u, "ProtectClock="))
                return 0;

        return seccomp_load_syscall_filter_set(
                        SCMP_ACT_ALLOW,
                        &syscall_filter_sets[SYSCALL_FILTER_SET_CLOCK],
                        SCMP_ACT_ERRNO(EPERM),
                        false);
}
/* Seccomp part of PrivateDevices=: makes the system calls of the raw I/O filter set fail with EPERM.
 * Returns 0 if the setting is off or seccomp is unavailable. */
static int apply_private_devices(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
        if (!c->private_devices || skip_seccomp_unavailable(u, "PrivateDevices="))
                return 0;

        return seccomp_load_syscall_filter_set(
                        SCMP_ACT_ALLOW,
                        &syscall_filter_sets[SYSCALL_FILTER_SET_RAW_IO],
                        SCMP_ACT_ERRNO(EPERM),
                        false);
}
/* Applies RestrictNamespaces= if configured. Returns 0 if it is not set or seccomp is unavailable,
 * otherwise the result of seccomp_restrict_namespaces(). */
static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!exec_context_restrict_namespaces_set(c) || skip_seccomp_unavailable(u, "RestrictNamespaces="))
                return 0;

        return seccomp_restrict_namespaces(c->restrict_namespaces);
}
/* Applies LockPersonality= if enabled. Returns 0 if the setting is off or seccomp is unavailable, a
 * negative value if no personality could be determined, and otherwise the result of
 * seccomp_lock_personality(). */
static int apply_lock_personality(const Unit* u, const ExecContext *c) {
        unsigned long locked;
        int r;

        assert(u);
        assert(c);

        if (!c->lock_personality || skip_seccomp_unavailable(u, "LockPersonality="))
                return 0;

        locked = c->personality;
        if (locked == PERSONALITY_INVALID) {
                /* No personality was specified, hence use either PER_LINUX or PER_LINUX32 depending
                 * on what is currently set. */
                r = opinionated_personality(&locked);
                if (r < 0)
                        return r;
        }

        return seccomp_lock_personality(locked);
}
#endif
/* Implements ProtectHostname=: unshares the UTS namespace (where supported and permitted) and, if built
 * with seccomp, applies the hostname seccomp restrictions. On fatal failure the matching exit status is
 * stored in *ret_exit_status and a negative value is returned; otherwise returns 0. */
static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
        assert(u);
        assert(c);

        if (!c->protect_hostname)
                return 0;

        if (!ns_type_supported(NAMESPACE_UTS)) {
                log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
        } else if (unshare(CLONE_NEWUTS) < 0) {
                /* Only "not supported" and "not permitted" are tolerated, everything else is fatal */
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
                        *ret_exit_status = EXIT_NAMESPACE;
                        return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
                }

                log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
        }

#if HAVE_SECCOMP
        int r;

        if (skip_seccomp_unavailable(u, "ProtectHostname="))
                return 0;

        r = seccomp_protect_hostname();
        if (r < 0) {
                *ret_exit_status = EXIT_SECCOMP;
                return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
        }
#endif

        return 0;
}
/* Runs the idle pipe handshake: closes entries 1 and 2, waits for a hang-up on entry 0, and if that wait
 * times out writes one byte to entry 3 and waits once more. All four descriptors are closed on return. */
static void do_idle_pipe_dance(int idle_pipe[static 4]) {
        assert(idle_pipe);

        idle_pipe[1] = safe_close(idle_pipe[1]);
        idle_pipe[2] = safe_close(idle_pipe[2]);

        if (idle_pipe[0] < 0) {
                /* Nothing to wait on, just release the remaining end */
                idle_pipe[3] = safe_close(idle_pipe[3]);
                return;
        }

        int r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
        bool timed_out = r == 0;

        if (timed_out && idle_pipe[3] >= 0) {
                /* Signal systemd that we are bored and want to continue. */
                ssize_t written = write(idle_pipe[3], "x", 1);

                /* Wait for systemd to react to the signal above. */
                if (written > 0)
                        (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
        }

        idle_pipe[0] = safe_close(idle_pipe[0]);
        idle_pipe[3] = safe_close(idle_pipe[3]);
}
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
/* Builds the list of environment variable assignments ("NAME=value" strings) that are derived from the
 * unit, its execution context and the execution parameters.
 *
 * n_fds: number of passed file descriptors; the LISTEN_* variables are only set if it is > 0.
 * home, username, shell: optional (may be NULL); each one that is set yields the matching variable(s).
 * journal_stream_dev, journal_stream_ino: JOURNAL_STREAM is only set if both are non-zero.
 * ret: receives the newly allocated, NULL-terminated string array on success.
 *
 * Returns 0 on success, -ENOMEM if any allocation fails (in which case *ret is left untouched). */
static int build_environment(
const Unit *u,
const ExecContext *c,
const ExecParameters *p,
size_t n_fds,
const char *home,
const char *username,
const char *shell,
dev_t journal_stream_dev,
ino_t journal_stream_ino,
char ***ret) {
_cleanup_strv_free_ char **our_env = NULL;
size_t n_env = 0;
char *x;
assert(u);
assert(c);
assert(p);
assert(ret);
/* N_ENV_VARS is the maximum number of slots used outside of the per-directory-type loop below,
 * including the terminating NULL entry. It has to be adjusted whenever a variable is added here. */
#define N_ENV_VARS 17
our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
if (!our_env)
return -ENOMEM;
/* Socket activation: LISTEN_PID, LISTEN_FDS and LISTEN_FDNAMES */
if (n_fds > 0) {
_cleanup_free_ char *joined = NULL;
if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
return -ENOMEM;
our_env[n_env++] = x;
if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
return -ENOMEM;
our_env[n_env++] = x;
joined = strv_join(p->fd_names, ":");
if (!joined)
return -ENOMEM;
x = strjoin("LISTEN_FDNAMES=", joined);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
/* Watchdog: WATCHDOG_PID and WATCHDOG_USEC, only if requested and a non-zero timeout is set */
if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
return -ENOMEM;
our_env[n_env++] = x;
if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
return -ENOMEM;
our_env[n_env++] = x;
}
/* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
 * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
 * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
if (home) {
x = strjoin("HOME=", home);
if (!x)
return -ENOMEM;
/* Simplify only the path part, i.e. skip over the 5 characters of "HOME=" */
path_simplify(x + 5);
our_env[n_env++] = x;
}
if (username) {
x = strjoin("LOGNAME=", username);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
x = strjoin("USER=", username);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
if (shell) {
x = strjoin("SHELL=", shell);
if (!x)
return -ENOMEM;
/* Simplify only the path part, i.e. skip over the 6 characters of "SHELL=" */
path_simplify(x + 6);
our_env[n_env++] = x;
}
if (!sd_id128_is_null(u->invocation_id)) {
if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
return -ENOMEM;
our_env[n_env++] = x;
}
if (exec_context_needs_term(c)) {
const char *tty_path, *term = NULL;
tty_path = exec_context_tty_path(c);
/* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
 * container manager passes to PID 1 ends up all the way in the console login shown. */
if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
term = getenv("TERM");
/* Otherwise (or if PID 1 has no $TERM) fall back to the default for this TTY */
if (!term)
term = default_term_for_tty(tty_path);
x = strjoin("TERM=", term);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
if (journal_stream_dev != 0 && journal_stream_ino != 0) {
if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
return -ENOMEM;
our_env[n_env++] = x;
}
if (c->log_namespace) {
x = strjoin("LOG_NAMESPACE=", c->log_namespace);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
/* One variable per directory type that has a prefix, at least one configured path and an
 * environment variable name: the value is the ":"-separated list of the prefixed paths. */
for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
_cleanup_free_ char *pre = NULL, *joined = NULL;
const char *n;
if (!p->prefix[t])
continue;
if (strv_isempty(c->directories[t].paths))
continue;
n = exec_directory_env_name_to_string(t);
if (!n)
continue;
pre = strjoin(p->prefix[t], "/");
if (!pre)
return -ENOMEM;
joined = strv_join_full(c->directories[t].paths, ":", pre, true);
if (!joined)
return -ENOMEM;
x = strjoin(n, "=", joined);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
/* SYSTEMD_EXEC_PID is set unconditionally */
if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
return -ENOMEM;
our_env[n_env++] = x;
/* Terminate the array, and verify that the slot budget computed above was sufficient */
our_env[n_env++] = NULL;
assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS
*ret = TAKE_PTR(our_env);
return 0;
}
/* Collects "NAME=value" assignments for every variable named in c->pass_environment that is set in our
 * own environment; unset variables are skipped. On success stores the resulting array (NULL if no
 * variable matched) in *ret and returns 0; returns -ENOMEM on allocation failure. */
static int build_pass_environment(const ExecContext *c, char ***ret) {
        _cleanup_strv_free_ char **collected = NULL;
        size_t n_collected = 0;
        char **name;

        STRV_FOREACH(name, c->pass_environment) {
                _cleanup_free_ char *assignment = NULL;
                const char *value = getenv(*name);

                if (!value)
                        continue;

                assignment = strjoin(*name, "=", value);
                if (!assignment)
                        return -ENOMEM;

                /* Make room for the new entry plus the terminating NULL */
                if (!GREEDY_REALLOC(collected, n_collected + 2))
                        return -ENOMEM;

                collected[n_collected] = TAKE_PTR(assignment);
                n_collected++;
                collected[n_collected] = NULL;
        }

        *ret = TAKE_PTR(collected);
        return 0;
}
/* Returns true if any setting of the execution context requires a mount namespace to be set up.
 * 'params' and 'runtime' are optional: without 'params' all directory types are considered, and without
 * 'runtime' the PrivateTmp= check is skipped. */
bool exec_needs_mount_namespace(
                const ExecContext *context,
                const ExecParameters *params,
                const ExecRuntime *runtime) {

        assert(context);

        if (context->root_image)
                return true;

        /* Any per-path access restriction */
        if (!strv_isempty(context->read_write_paths))
                return true;
        if (!strv_isempty(context->read_only_paths))
                return true;
        if (!strv_isempty(context->inaccessible_paths))
                return true;
        if (!strv_isempty(context->exec_paths))
                return true;
        if (!strv_isempty(context->no_exec_paths))
                return true;

        /* Any explicitly configured mount */
        if (context->n_bind_mounts > 0 ||
            context->n_temporary_filesystems > 0 ||
            context->n_mount_images > 0 ||
            context->n_extension_images > 0)
                return true;

        if (!IN_SET(context->mount_flags, 0, MS_SHARED))
                return true;

        if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
                return true;

        /* Any of the sandboxing switches that are implemented through mounts */
        if (context->private_devices)
                return true;
        if (context->private_mounts)
                return true;
        if (context->protect_system != PROTECT_SYSTEM_NO)
                return true;
        if (context->protect_home != PROTECT_HOME_NO)
                return true;
        if (context->protect_kernel_tunables)
                return true;
        if (context->protect_kernel_modules)
                return true;
        if (context->protect_kernel_logs)
                return true;
        if (context->protect_control_groups)
                return true;
        if (context->protect_proc != PROTECT_PROC_DEFAULT)
                return true;
        if (context->proc_subset != PROC_SUBSET_ALL)
                return true;
        if (context->private_ipc)
                return true;
        if (context->ipc_namespace_path)
                return true;

        if (context->root_directory) {
                if (exec_context_get_effective_mount_apivfs(context))
                        return true;

                /* With a root directory, any configured directory whose prefix is known (or all of
                 * them, if no parameters were passed) needs a mount. */
                for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                        bool prefix_usable = !params || params->prefix[t];

                        if (prefix_usable && !strv_isempty(context->directories[t].paths))
                                return true;
                }
        }

        if (context->dynamic_user) {
                if (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths))
                        return true;
                if (!strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths))
                        return true;
                if (!strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))
                        return true;
        }

        return !!context->log_namespace;
}
/* Moves the calling process into a new user namespace and has a temporary child process write its
 * UID/GID maps.
 *
 * ouid, ogid: the original UID/GID (from before any user or group changes); always mapped to itself.
 * uid, gid: the selected UID/GID; additionally mapped to itself if valid, different from the original
 *           one, and we have CAP_SETUID resp. CAP_SETGID.
 *
 * Returns 0 on success, a negative errno-style value on failure; -EIO is returned if the child reported
 * something unexpected or exited with a non-zero status. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
_cleanup_close_ int unshare_ready_fd = -1;
_cleanup_(sigkill_waitp) pid_t pid = 0;
uint64_t c = 1;
ssize_t n;
int r;
/* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
 * which waits for the parent to create the new user namespace while staying in the original namespace. The
 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
 * continues execution normally.
 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
 * does not need CAP_SETUID to write the single line mapping to itself. */
/* Can only set up multiple mappings with CAP_SETUID. */
if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
r = asprintf(&uid_map,
UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
ouid, ouid, uid, uid);
else
r = asprintf(&uid_map,
UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
ouid, ouid);
if (r < 0)
return -ENOMEM;
/* Can only set up multiple mappings with CAP_SETGID. */
if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
r = asprintf(&gid_map,
GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
ogid, ogid, gid, gid);
else
r = asprintf(&gid_map,
GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
ogid, ogid);
if (r < 0)
return -ENOMEM;
/* Create a communication channel so that the parent can tell the child when it finished creating the user
 * namespace. */
unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
if (unshare_ready_fd < 0)
return -errno;
/* Create a communication channel so that the child can tell the parent a proper error code in case it
 * failed. */
if (pipe2(errno_pipe, O_CLOEXEC) < 0)
return -errno;
r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
if (r < 0)
return r;
if (r == 0) {
_cleanup_close_ int fd = -1;
const char *a;
pid_t ppid;
/* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
 * here, after the parent opened its own user namespace. */
ppid = getppid();
/* The child only writes to the error pipe, hence close the read end */
errno_pipe[0] = safe_close(errno_pipe[0]);
/* Wait until the parent unshared the user namespace */
if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
r = -errno;
goto child_fail;
}
/* Disable the setgroups() system call in the child user namespace, for good. */
a = procfs_file_alloca(ppid, "setgroups");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
if (errno != ENOENT) {
r = -errno;
goto child_fail;
}
/* If the file is missing the kernel is too old, let's continue anyway. */
} else {
if (write(fd, "deny\n", 5) < 0) {
r = -errno;
goto child_fail;
}
fd = safe_close(fd);
}
/* First write the GID map */
a = procfs_file_alloca(ppid, "gid_map");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
r = -errno;
goto child_fail;
}
if (write(fd, gid_map, strlen(gid_map)) < 0) {
r = -errno;
goto child_fail;
}
fd = safe_close(fd);
/* Then write the UID map */
a = procfs_file_alloca(ppid, "uid_map");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
r = -errno;
goto child_fail;
}
if (write(fd, uid_map, strlen(uid_map)) < 0) {
r = -errno;
goto child_fail;
}
_exit(EXIT_SUCCESS);
child_fail:
/* Report the negative errno to the parent through the error pipe, best effort */
(void) write(errno_pipe[1], &r, sizeof(r));
_exit(EXIT_FAILURE);
}
/* Parent process: close our copy of the write end, so that reading from the pipe below returns EOF
 * once the child has closed its copy (or has exited). */
errno_pipe[1] = safe_close(errno_pipe[1]);
if (unshare(CLONE_NEWUSER) < 0)
return -errno;
/* Let the child know that the namespace is ready now */
if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
return -errno;
/* Try to read an error code from the child */
n = read(errno_pipe[0], &r, sizeof(r));
if (n < 0)
return -errno;
if (n == sizeof(r)) { /* an error code was sent to us */
if (r < 0)
return r;
/* A non-negative "error" makes no sense, treat it as a generic failure */
return -EIO;
}
if (n != 0) /* on success we should have read 0 bytes */
return -EIO;
r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
/* The child has been waited for at this point, hence reset the PID so that the cleanup handler does
 * not act on it anymore. */
pid = 0;
if (r < 0)
return r;
if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
return -EIO;
return 0;
}
/* Returns true if directories of the given type shall be placed below the "private/" subdirectory for
 * this context. That is only the case with dynamic_user set, never for configuration directories, and
 * not for runtime directories whose preserve mode is EXEC_PRESERVE_NO. */
static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
        bool volatile_runtime =
                type == EXEC_DIRECTORY_RUNTIME &&
                context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO;

        return context->dynamic_user &&
                type != EXEC_DIRECTORY_CONFIGURATION &&
                !volatile_runtime;
}
/* Creates the directories configured for the given directory type below params->prefix[type], and
 * adjusts their access mode and ownership.
 *
 * uid, gid: the owner to apply to the directory trees; if EXEC_CHOWN_DIRECTORIES is set in
 *           params->flags, invalid values are replaced by 0.
 * type: which of the context's directory lists to process.
 * exit_status: on failure, receives the exit status that corresponds to the directory type.
 *
 * Returns 0 on success (also if no prefix is known for this type, in which case nothing is done), or a
 * negative errno-style value on failure. */
static int setup_exec_directory(
const ExecContext *context,
const ExecParameters *params,
uid_t uid,
gid_t gid,
ExecDirectoryType type,
int *exit_status) {
/* Maps each directory type to the exit status reported when setting it up fails */
static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
[EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
[EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
[EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
[EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
[EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
};
char **rt;
int r;
assert(context);
assert(params);
assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
assert(exit_status);
if (!params->prefix[type])
return 0;
if (params->flags & EXEC_CHOWN_DIRECTORIES) {
if (!uid_is_valid(uid))
uid = 0;
if (!gid_is_valid(gid))
gid = 0;
}
STRV_FOREACH(rt, context->directories[type].paths) {
/* p is the public location of the directory, pp (if used) the one below "private/" */
_cleanup_free_ char *p = NULL, *pp = NULL;
p = path_join(params->prefix[type], *rt);
if (!p) {
r = -ENOMEM;
goto fail;
}
r = mkdir_parents_label(p, 0755);
if (r < 0)
goto fail;
if (exec_directory_is_private(context, type)) {
/* So, here's one extra complication when dealing with DynamicUser=1 units. In that
 * case we want to avoid leaving a directory around fully accessible that is owned by
 * a dynamic user whose UID is later on reused. To lock this down we use the same
 * trick used by container managers to prohibit host users to get access to files of
 * the same UID in containers: we place everything inside a directory that has an
 * access mode of 0700 and is owned root:root, so that it acts as security boundary
 * for unprivileged host code. We then use fs namespacing to make this directory
 * permeable for the service itself.
 *
 * Specifically: for a service which wants a special directory "foo/" we first create
 * a directory "private/" with access mode 0700 owned by root:root. Then we place
 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
 * "private/foo". This way, privileged host users can access "foo/" as usual, but
 * unprivileged host users can't look into it. Inside of the namespace of the unit
 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
 * "private/foo/" is mounted under the same name, thus disabling the access boundary
 * for the service and making sure it only gets access to the dirs it needs but no
 * others. Tricky? Yes, absolutely, but it works!
 *
 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
 * to be owned by the service itself.
 *
 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
 * for sharing files or sockets with other services. */
pp = path_join(params->prefix[type], "private");
if (!pp) {
r = -ENOMEM;
goto fail;
}
/* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
if (r < 0)
goto fail;
/* From here on pp refers to the service's directory below the private root */
if (!path_extend(&pp, *rt)) {
r = -ENOMEM;
goto fail;
}
/* Create all directories between the configured directory and this private root, and mark them 0755 */
r = mkdir_parents_label(pp, 0755);
if (r < 0)
goto fail;
if (is_dir(p, false) > 0 &&
(laccess(pp, F_OK) < 0 && errno == ENOENT)) {
/* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
 * it over. Most likely the service has been upgraded from one that didn't use
 * DynamicUser=1, to one that does. */
log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
"Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
exec_directory_type_to_string(type), p, pp);
if (rename(p, pp) < 0) {
r = -errno;
goto fail;
}
} else {
/* Otherwise, create the actual directory for the service */
r = mkdir_label(pp, context->directories[type].mode);
if (r < 0 && r != -EEXIST)
goto fail;
}
/* And link it up from the original place */
r = symlink_idempotent(pp, p, true);
if (r < 0)
goto fail;
} else {
_cleanup_free_ char *target = NULL;
if (type != EXEC_DIRECTORY_CONFIGURATION &&
readlink_and_make_absolute(p, &target) >= 0) {
_cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
/* This already exists and is a symlink? Interesting. Maybe it's one created
 * by DynamicUser=1 (see above)?
 *
 * We do this for all directory types except for ConfigurationDirectory=,
 * since they all support the private/ symlink logic at least in some
 * configurations, see above. */
r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
if (r < 0)
goto fail;
q = path_join(params->prefix[type], "private", *rt);
if (!q) {
r = -ENOMEM;
goto fail;
}
/* /var/lib or friends may be symlinks. So, let's chase them also. */
r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
if (r < 0)
goto fail;
if (path_equal(q_resolved, target_resolved)) {
/* Hmm, apparently DynamicUser= was once turned on for this service,
 * but is no longer. Let's move the directory back up. */
log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
"Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
exec_directory_type_to_string(type), q, p);
/* Remove the symlink first, then move the private directory into its place */
if (unlink(p) < 0) {
r = -errno;
goto fail;
}
if (rename(q, p) < 0) {
r = -errno;
goto fail;
}
}
}
r = mkdir_label(p, context->directories[type].mode);
if (r < 0) {
if (r != -EEXIST)
goto fail;
if (type == EXEC_DIRECTORY_CONFIGURATION) {
struct stat st;
/* Don't change the owner/access mode of the configuration directory,
 * as in the common case it is not written to by a service, and shall
 * not be writable. */
if (stat(p, &st) < 0) {
r = -errno;
goto fail;
}
/* Still complain if the access mode doesn't match */
if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
log_warning("%s \'%s\' already exists but the mode is different. "
"(File system: %o %sMode: %o)",
exec_directory_type_to_string(type), *rt,
st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
/* Skip the chmod/chown steps below for pre-existing configuration directories */
continue;
}
}
}
/* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
 * current UID/GID ownership.) */
r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
if (r < 0)
goto fail;
/* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
 * assignments to exist. */
r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
if (r < 0)
goto fail;
}
return 0;
fail:
/* Tell the caller which exit status matches the directory type that failed */
*exit_status = exit_status_table[type];
return r;
}
/* Writes one credential into the directory referred to by dfd: the data is written to a temporary file
 * with a random name, made read-only, made accessible to 'uid', and then renamed to 'id'.
 *
 * dfd: directory file descriptor of the credential directory.
 * id: final file name of the credential inside that directory.
 * data, size: the credential payload.
 * uid: the user that shall be able to read the credential; ignored if invalid or equal to our own UID.
 * ownership_ok: whether it is acceptable to fall back to changing file ownership if ACLs cannot be used.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int write_credential(
                int dfd,
                const char *id,
                const void *data,
                size_t size,
                uid_t uid,
                bool ownership_ok) {

        /* NOTE(review): the file is created relative to dfd, but the cleanup handler only gets the
         * relative name — confirm that it removes the temporary file from the right directory. */
        _cleanup_(unlink_and_freep) char *tmp = NULL;
        _cleanup_close_ int fd = -1;
        int r;

        r = tempfn_random_child("", "cred", &tmp);
        if (r < 0)
                return r;

        fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
        if (fd < 0) {
                /* Save the error before releasing the name: free() is not guaranteed to leave errno
                 * untouched on all libc versions. Nothing was created, hence drop the name so that
                 * the cleanup handler has nothing to remove. */
                r = -errno;
                tmp = mfree(tmp);
                return r;
        }

        r = loop_write(fd, data, size, /* do_poll = */ false);
        if (r < 0)
                return r;

        if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
                return -errno;

        if (uid_is_valid(uid) && uid != getuid()) {
                r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
                if (r < 0) {
                        if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
                                return r;

                        if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
                                            * to express: that the user gets read access and nothing
                                            * else. But if the backing fs can't support that (e.g. ramfs)
                                            * then we can use file ownership instead. But that's only safe if
                                            * we can then re-mount the whole thing read-only, so that the
                                            * user can no longer chmod() the file to gain write access. */
                                return r;

                        if (fchown(fd, uid, GID_INVALID) < 0)
                                return -errno;
                }
        }

        /* Move the fully prepared file to its final name */
        if (renameat(dfd, tmp, dfd, id) < 0)
                return -errno;

        /* The temporary name is gone now, hence make sure the cleanup handler doesn't touch it */
        tmp = mfree(tmp);
        return 0;
}
#define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */
/* Populates the credential directory 'p' with all credentials configured in the context, then makes the
 * directory read-only and accessible to 'uid'.
 *
 * context: provides the literally set credentials (set_credentials) and those to load (load_credentials).
 * params: provides received_credentials, the directory that relative load paths are resolved against.
 * unit: unit name; used as part of the source socket address when a credential is read from a socket.
 * p: path of the directory the credentials are written to.
 * uid: the user that shall get access; ignored if invalid or equal to our own UID.
 * ownership_ok: whether falling back to ownership changes is acceptable if ACLs cannot be used.
 *
 * Returns 0 on success, -E2BIG if the credentials exceed CREDENTIALS_BYTES_MAX in total (names plus
 * data), or another negative errno-style value on failure. */
static int acquire_credentials(
const ExecContext *context,
const ExecParameters *params,
const char *unit,
const char *p,
uid_t uid,
bool ownership_ok) {
/* Remaining size budget, shared by all credentials */
uint64_t left = CREDENTIALS_BYTES_MAX;
_cleanup_close_ int dfd = -1;
ExecSetCredential *sc;
char **id, **fn;
int r;
assert(context);
assert(p);
dfd = open(p, O_DIRECTORY|O_CLOEXEC);
if (dfd < 0)
return -errno;
/* First we use the literally specified credentials. Note that they might be overridden again below,
 * and thus act as a "default" if the same credential is specified multiple times */
HASHMAP_FOREACH(sc, context->set_credentials) {
size_t add;
/* Both the name and the payload count against the budget */
add = strlen(sc->id) + sc->size;
if (add > left)
return -E2BIG;
r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
if (r < 0)
return r;
left -= add;
}
/* Then, load credential off disk (or acquire via AF_UNIX socket) */
STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
_cleanup_(erase_and_freep) char *data = NULL;
_cleanup_free_ char *j = NULL, *bindname = NULL;
bool missing_ok = true;
const char *source;
size_t size, add;
if (path_is_absolute(*fn)) {
/* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
source = *fn;
flags |= READ_FULL_FILE_CONNECT_SOCKET;
/* Pass some minimal info about the unit and the credential name we are looking to acquire
 * via the source socket address in case we read off an AF_UNIX socket. */
if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
return -ENOMEM;
/* An explicitly configured absolute source has to exist */
missing_ok = false;
} else if (params->received_credentials) {
/* If this is a relative path, take it relative to the credentials we received
 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
 * on a credential store, i.e. this is guaranteed to be regular files. */
j = path_join(params->received_credentials, *fn);
if (!j)
return -ENOMEM;
source = j;
} else
source = NULL;
/* Without a source (relative path, but no credentials received) treat the credential as missing */
if (source)
r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
else
r = -ENOENT;
if (r == -ENOENT && (missing_ok || faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)) {
/* Make a missing inherited credential non-fatal, let's just continue. After all apps
 * will get clear errors if we don't pass such a missing credential on as they
 * themselves will get ENOENT when trying to read them, which should not be much
 * worse than when we handle the error here and make it fatal.
 *
 * Also, if the source file doesn't exist, but we already acquired the key otherwise,
 * then don't fail either. */
log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", *fn);
continue;
}
if (r < 0)
return log_debug_errno(r, "Failed to read credential '%s': %m", *fn);
add = strlen(*id) + size;
if (add > left)
return -E2BIG;
r = write_credential(dfd, *id, data, size, uid, ownership_ok);
if (r < 0)
return r;
left -= add;
}
if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
return -errno;
/* After we created all keys with the right perms, also make sure the credential store as a whole is
 * accessible */
if (uid_is_valid(uid) && uid != getuid()) {
r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
if (r < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
return r;
/* ACLs cannot be used: fall back to changing the owner, but only if the caller allows it */
if (!ownership_ok)
return r;
if (fchown(dfd, uid, GID_INVALID) < 0)
return -errno;
}
}
return 0;
}
static int setup_credentials_internal(
const ExecContext *context,
const ExecParameters *params,
const char *unit,
const char *final, /* This is where the credential store shall eventually end up at */
const char *workspace, /* This is where we can prepare it before moving it to the final place */
bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
uid_t uid) {
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
bool final_mounted;
const char *where;
assert(context);
assert(final);
assert(workspace);
if (reuse_workspace) {
r = path_is_mount_point(workspace, NULL, 0);
if (r < 0)
return r;
if (r > 0)
workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
else
workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
} else
workspace_mounted = -1; /* ditto */
r = path_is_mount_point(final, NULL, 0);
if (r < 0)
return r;
if (r > 0) {
/* If the final place already has something mounted, we use that. If the workspace also has
* something mounted we assume it's actually the same mount (but with MS_RDONLY
* different). */
final_mounted = true;
if (workspace_mounted < 0) {
/* If the final place is mounted, but the workspace we isn't, then let's bind mount
* the final version to the workspace, and make it writable, so that we can make
* changes */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0)
return r;
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
if (r < 0)
return r;
workspace_mounted = true;
}
} else
final_mounted = false;
if (workspace_mounted < 0) {
/* Nothing is mounted on the workspace yet, let's try to mount something now */
for (int try = 0;; try++) {
if (try == 0) {
/* Try "ramfs" first, since it's not swap backed */
r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
if (r >= 0) {
workspace_mounted = true;
break;
}
} else if (try == 1) {
_cleanup_free_ char *opts = NULL;
if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
return -ENOMEM;
/* Fall back to "tmpfs" otherwise */
r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
if (r >= 0) {
workspace_mounted = true;
break;
}
} else {
/* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0) {
if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
return r;
if (must_mount) /* If we it's not OK to use the plain directory
* fallback, propagate all errors too */
return r;
/* If we lack privileges to bind mount stuff, then let's gracefully
* proceed for compat with container envs, and just use the final dir
* as is. */
workspace_mounted = false;
break;
}
/* Make the new bind mount writable (i.e. drop MS_RDONLY) */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
if (r < 0)
return r;
workspace_mounted = true;
break;
}
}
}
assert(!must_mount || workspace_mounted > 0);
where = workspace_mounted ? workspace : final;
r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
if (r < 0)
return r;
if (workspace_mounted) {
/* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
if (r < 0)
return r;
/* And mount it to the final place, read-only */
if (final_mounted)
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
else
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
if (r < 0)
return r;
} else {
_cleanup_free_ char *parent = NULL;
/* If we do not have our own mount put used the plain directory fallback, then we need to
* open access to the top-level credential directory and the per-service directory now */
parent = dirname_malloc(final);
if (!parent)
return -ENOMEM;
if (chmod(parent, 0755) < 0)
return -errno;
}
return 0;
}
static int setup_credentials(
const ExecContext *context,
const ExecParameters *params,
const char *unit,
uid_t uid) {
_cleanup_free_ char *p = NULL, *q = NULL;
const char *i;
int r;
assert(context);
assert(params);
if (!exec_context_has_credentials(context))
return 0;
if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
return -EINVAL;
/* This where we'll place stuff when we are done; this main credentials directory is world-readable,
* and the subdir we mount over with a read-only file system readable by the service's user */
q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
if (!q)
return -ENOMEM;
r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
if (r < 0 && r != -EEXIST)
return r;
p = path_join(q, unit);
if (!p)
return -ENOMEM;
r = mkdir_label(p, 0700); /* per-unit dir: private to user */
if (r < 0 && r != -EEXIST)
return r;
r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
if (r < 0) {
_cleanup_free_ char *t = NULL, *u = NULL;
/* If this is not a privilege or support issue then propagate the error */
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
return r;
/* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
* it into place, so that users can't access half-initialized credential stores. */
t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
if (!t)
return -ENOMEM;
/* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
* directory outside of /run/credentials/ first, and then move it over to /run/credentials/
* after it is fully set up */
u = path_join(t, unit);
if (!u)
return -ENOMEM;
FOREACH_STRING(i, t, u) {
r = mkdir_label(i, 0700);
if (r < 0 && r != -EEXIST)
return r;
}
r = setup_credentials_internal(
context,
params,
unit,
p, /* final mount point */
u, /* temporary workspace to overmount */
true, /* reuse the workspace if it is already a mount */
false, /* it's OK to fall back to a plain directory if we can't mount anything */
uid);
(void) rmdir(u); /* remove the workspace again if we can. */
if (r < 0)
return r;
} else if (r == 0) {
/* We managed to set up a mount namespace, and are now in a child. That's great. In this case
* we can use the same directory for all cases, after turning off propagation. Question
* though is: where do we turn off propagation exactly, and where do we place the workspace
* directory? We need some place that is guaranteed to be a mount point in the host, and
* which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
* since we ultimately want to move the resulting file system there, i.e. we need propagation
* for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
* would be visible in the host mount table all the time, which we want to avoid. Hence, what
* we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
* /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
* propagation on the former, and then overmount the latter.
*
* Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
* for this purpose, but there are few other candidates that work equally well for us, and
* given that the we do this in a privately namespaced short-lived single-threaded process
* that no one else sees this should be OK to do. */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
if (r < 0)
goto child_fail;
r = setup_credentials_internal(
context,
params,
unit,
p, /* final mount point */
"/dev/shm", /* temporary workspace to overmount */
false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
true, /* insist that something is mounted, do not allow fallback to plain directory */
uid);
if (r < 0)
goto child_fail;
_exit(EXIT_SUCCESS);
child_fail:
_exit(EXIT_FAILURE);
}
return 0;
}
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (pid 0 is passed to mac_smack_apply_pid()).
 *
 * A label configured in the execution context takes precedence. Without one, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from
 * 'executable_fd' is applied, or SMACK_DEFAULT_PROCESS_LABEL if no such label could be read.
 *
 * Returns 0 on success, a negative errno-style error code on failure. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        _cleanup_free_ char *binary_label = NULL;
#endif
        int r;

        assert(context);
        assert(executable_fd >= 0);

        /* An explicitly configured label always wins */
        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        /* A missing attribute (ENODATA) or lack of support (EOPNOTSUPP) is not an error here, we simply
         * fall back to the default label in that case. */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &binary_label);
        if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                return r;

        r = mac_smack_apply_pid(0, binary_label ?: SMACK_DEFAULT_PROCESS_LABEL);
        if (r < 0)
                return r;
#endif

        return 0;
}
#endif
static int compile_bind_mounts(
const ExecContext *context,
const ExecParameters *params,
BindMount **ret_bind_mounts,
size_t *ret_n_bind_mounts,
char ***ret_empty_directories) {
_cleanup_strv_free_ char **empty_directories = NULL;
BindMount *bind_mounts;
size_t n, h = 0;
int r;
assert(context);
assert(params);
assert(ret_bind_mounts);
assert(ret_n_bind_mounts);
assert(ret_empty_directories);
n = context->n_bind_mounts;
for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
if (!params->prefix[t])
continue;
n += strv_length(context->directories[t].paths);
}
if (n <= 0) {
*ret_bind_mounts = NULL;
*ret_n_bind_mounts = 0;
*ret_empty_directories = NULL;
return 0;
}