src/nspawn/nspawn-stub-pid1.c - systemd - Git at Google

 /* SPDX-License-Identifier: LGPL-2.1-or-later */

 #include <sys/ioctl.h>
 #include <sys/reboot.h>
 #include <sys/wait.h>
 #include <sys/prctl.h>
 #include <unistd.h>

 #include "argv-util.h"
 #include "constants.h"
 #include "exit-status.h"
 #include "fd-util.h"
 #include "log.h"
 #include "nspawn-stub-pid1.h"
 #include "process-util.h"
 #include "signal-util.h"
 #include "time-util.h"

 /* Repoints the kernel's notion of this process' environment area at the given buffer.
  *
  * new_environment: first byte of the replacement environment block.
  * length: size of that block in bytes; the end address is new_environment + length.
  *
  * Returns 0 if both boundaries were updated, or a negative errno value taken from the first prctl() call
  * that failed. The start boundary is always updated before the end boundary. */
 static int reset_environ(const char *new_environment, size_t length) {
         const unsigned long env_start = (unsigned long) new_environment;
         const unsigned long env_end = env_start + length;

         /* Lower boundary first … */
         if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0) < 0)
                 return -errno;

         /* … then the upper boundary. */
         if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0) < 0)
                 return -errno;

         return 0;
 }

 /* Forks off the payload process and turns the calling process into a minimal stub init.
  *
  * uuid: formatted into the "container_uuid=" entry of the replacement environment block of the stub.
  *
  * In the forked child this restores the original signal mask, calls setsid() and returns 0, or the result of
  * log_error_errno() if setsid() fails. If fork() itself fails, the original signal mask is restored and the
  * result of log_error_errno() is returned.
  *
  * The parent never returns: it reaps children and waits for shutdown signals until the payload exits or the
  * shutdown timeout elapses, and then leaves via _exit(). */
 int stub_pid1(sd_id128_t uuid) {
         enum {
                 STATE_RUNNING,
                 STATE_REBOOT,
                 STATE_POWEROFF,
         } state = STATE_RUNNING;

         sigset_t fullmask, oldmask, waitmask;
         usec_t quit_usec = USEC_INFINITY;
         pid_t pid;
         int r;

         /* The new environment we set up, on the stack. */
         char new_environment[] =
                 "container=systemd-nspawn\0"
                 "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";

         /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
          * for allowing arbitrary processes run in a container, and still have all zombies reaped. */

         /* Block everything before forking, so that the stub can later pick up the signals it cares about
          * synchronously via sigwaitinfo()/sigtimedwait(). */
         assert_se(sigfillset(&fullmask) >= 0);
         assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);

         pid = fork();
         if (pid < 0) {
                 int saved_errno = errno;

                 /* Don't hand a fully blocked signal mask back to the caller on failure. Keep the fork() error
                  * around, since sigprocmask() may touch errno. */
                 assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);

                 return log_error_errno(saved_errno, "Failed to fork child pid: %m");
         }

         if (pid == 0) {
                 /* Return in the child */
                 assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);

                 if (setsid() < 0)
                         return log_error_errno(errno, "Failed to become session leader in payload process: %m");

                 return 0;
         }

         reset_all_signal_handlers();

         /* Logging is shut down while all file descriptors are closed, and reopened afterwards. */
         log_close();
         (void) close_all_fds(NULL, 0);
         log_open();

         if (ioctl(STDIN_FILENO, TIOCNOTTY) < 0) {
                 if (errno != ENOTTY)
                         log_warning_errno(errno, "Unexpected error from TIOCNOTTY ioctl in init stub process, ignoring: %m");
         } else
                 log_warning("Expected TIOCNOTTY to fail, but it succeeded in init stub process, ignoring.");

         /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also,
          * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ
          * find them set. The UUID string is written over the tail of the buffer, i.e. over the placeholder
          * characters (assumes SD_ID128_STRING_MAX covers the placeholder plus the trailing NUL). */
         sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX);
         r = reset_environ(new_environment, sizeof(new_environment));
         if (r < 0)
                 /* Best effort only, but leave a trace instead of dropping the error silently. */
                 log_debug_errno(r, "Failed to reset environment block of init stub process, ignoring: %m");

         (void) rename_process("(sd-stubinit)");

         assert_se(sigemptyset(&waitmask) >= 0);
         assert_se(sigset_add_many(&waitmask,
                                   SIGCHLD,          /* posix: process died */
                                   SIGINT,           /* sysv: ctrl-alt-del */
                                   SIGRTMIN+3,       /* systemd: halt */
                                   SIGRTMIN+4,       /* systemd: poweroff */
                                   SIGRTMIN+5,       /* systemd: reboot */
                                   SIGRTMIN+6,       /* systemd: kexec */
                                   SIGRTMIN+13,      /* systemd: halt */
                                   SIGRTMIN+14,      /* systemd: poweroff */
                                   SIGRTMIN+15,      /* systemd: reboot */
                                   SIGRTMIN+16,      /* systemd: kexec */
                                   -1) >= 0);

         /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
          * support reexec/reloading in this stub process. */

         for (;;) {
                 siginfo_t si;
                 usec_t current_usec;

                 /* With WNOHANG, si_pid staying 0 tells us that there was nothing to reap. */
                 si.si_pid = 0;
                 r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
                 if (r < 0) {
                         r = log_error_errno(errno, "Failed to reap children: %m");
                         goto finish;
                 }

                 current_usec = now(CLOCK_MONOTONIC);

                 if (si.si_pid == pid || current_usec >= quit_usec) {

                         /* The child we started ourselves died or we reached a timeout. */

                         if (state == STATE_REBOOT) { /* dispatch a queued reboot */
                                 (void) reboot(RB_AUTOBOOT);
                                 r = log_error_errno(errno, "Failed to reboot: %m");
                                 goto finish;

                         } else if (state == STATE_POWEROFF)
                                 (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */

                         if (si.si_pid == pid && si.si_code == CLD_EXITED)
                                 r = si.si_status; /* pass on exit code */
                         else
                                 r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */

                         goto finish;
                 }
                 if (si.si_pid != 0)
                         /* We reaped something. Retry until there's nothing more to reap. */
                         continue;

                 /* Nothing left to reap: sleep until a signal arrives, bounded by the shutdown deadline if one
                  * has been set. */
                 if (quit_usec == USEC_INFINITY)
                         r = sigwaitinfo(&waitmask, &si);
                 else
                         r = sigtimedwait(&waitmask, &si, TIMESPEC_STORE(quit_usec - current_usec));
                 if (r < 0) {
                         if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
                                 continue;
                         if (errno == EAGAIN) /* timeout reached */
                                 continue;

                         r = log_error_errno(errno, "Failed to wait for signal: %m");
                         goto finish;
                 }

                 if (si.si_signo == SIGCHLD)
                         continue; /* Let's reap this */

                 /* A shutdown is already in progress, further shutdown requests are dropped. */
                 if (state != STATE_RUNNING)
                         continue;

                 /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
                  * constant… */

                 if (si.si_signo == SIGRTMIN+3 ||
                     si.si_signo == SIGRTMIN+4 ||
                     si.si_signo == SIGRTMIN+13 ||
                     si.si_signo == SIGRTMIN+14)

                         state = STATE_POWEROFF;

                 else if (si.si_signo == SIGINT ||
                          si.si_signo == SIGRTMIN+5 ||
                          si.si_signo == SIGRTMIN+6 ||
                          si.si_signo == SIGRTMIN+15 ||
                          si.si_signo == SIGRTMIN+16)

                         state = STATE_REBOOT;
                 else
                         assert_not_reached();

                 r = kill_and_sigcont(pid, SIGTERM);

                 /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We
                  * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those
                  * processes which handle both. That's because services tend to bind configuration reload or something
                  * else to SIGHUP. */

                 if (r != -ESRCH)
                         (void) kill(pid, SIGHUP);

                 /* Give the payload a bounded amount of time to go away before we give up waiting. */
                 quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
         }

 finish:
         /* Negative values are internal errors; everything else is passed on as exit status. */
         _exit(r < 0 ? EXIT_FAILURE : r);
 }
	/* SPDX-License-Identifier: LGPL-2.1-or-later */

	#include <sys/ioctl.h>
	#include <sys/reboot.h>
	#include <sys/wait.h>
	#include <sys/prctl.h>
	#include <unistd.h>

	#include "argv-util.h"
	#include "constants.h"
	#include "exit-status.h"
	#include "fd-util.h"
	#include "log.h"
	#include "nspawn-stub-pid1.h"
	#include "process-util.h"
	#include "signal-util.h"
	#include "time-util.h"

	/* Tells the kernel that the environment area of this process is the given buffer.
	 *
	 * new_environment: address of the first byte of the new environment block.
	 * length: number of bytes in the block; the end boundary is set to new_environment + length.
	 *
	 * The start boundary is set first, the end boundary second. Returns 0 when both prctl() calls
	 * succeed, otherwise the negated errno of the first call that failed. */
	static int reset_environ(const char *new_environment, size_t length) {
	        const unsigned long base = (unsigned long) new_environment;
	        const struct {
	                int option;
	                unsigned long address;
	        } updates[] = {
	                { PR_SET_MM_ENV_START, base          },
	                { PR_SET_MM_ENV_END,   base + length },
	        };

	        /* Apply the boundaries in table order, bailing out on the first failure. */
	        for (size_t i = 0; i < sizeof(updates) / sizeof(updates[0]); i++)
	                if (prctl(PR_SET_MM, updates[i].option, updates[i].address, 0, 0) < 0)
	                        return -errno;

	        return 0;
	}

	/* Forks off the payload process and turns the calling process into a minimal stub init.
	 *
	 * uuid: formatted into the "container_uuid=" entry of the replacement environment block of the stub.
	 *
	 * In the forked child this restores the original signal mask, calls setsid() and returns 0, or the result of
	 * log_error_errno() if setsid() fails. If fork() itself fails, the original signal mask is restored and the
	 * result of log_error_errno() is returned.
	 *
	 * The parent never returns: it reaps children and waits for shutdown signals until the payload exits or the
	 * shutdown timeout elapses, and then leaves via _exit(). */
	int stub_pid1(sd_id128_t uuid) {
	        enum {
	                STATE_RUNNING,
	                STATE_REBOOT,
	                STATE_POWEROFF,
	        } state = STATE_RUNNING;

	        sigset_t fullmask, oldmask, waitmask;
	        usec_t quit_usec = USEC_INFINITY;
	        pid_t pid;
	        int r;

	        /* The new environment we set up, on the stack. */
	        char new_environment[] =
	                "container=systemd-nspawn\0"
	                "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";

	        /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
	         * for allowing arbitrary processes run in a container, and still have all zombies reaped. */

	        /* Block everything before forking, so that the stub can later pick up the signals it cares about
	         * synchronously via sigwaitinfo()/sigtimedwait(). */
	        assert_se(sigfillset(&fullmask) >= 0);
	        assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);

	        pid = fork();
	        if (pid < 0) {
	                int saved_errno = errno;

	                /* Don't hand a fully blocked signal mask back to the caller on failure. Keep the fork() error
	                 * around, since sigprocmask() may touch errno. */
	                assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);

	                return log_error_errno(saved_errno, "Failed to fork child pid: %m");
	        }

	        if (pid == 0) {
	                /* Return in the child */
	                assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);

	                if (setsid() < 0)
	                        return log_error_errno(errno, "Failed to become session leader in payload process: %m");

	                return 0;
	        }

	        reset_all_signal_handlers();

	        /* Logging is shut down while all file descriptors are closed, and reopened afterwards. */
	        log_close();
	        (void) close_all_fds(NULL, 0);
	        log_open();

	        if (ioctl(STDIN_FILENO, TIOCNOTTY) < 0) {
	                if (errno != ENOTTY)
	                        log_warning_errno(errno, "Unexpected error from TIOCNOTTY ioctl in init stub process, ignoring: %m");
	        } else
	                log_warning("Expected TIOCNOTTY to fail, but it succeeded in init stub process, ignoring.");

	        /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also,
	         * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ
	         * find them set. The UUID string is written over the tail of the buffer, i.e. over the placeholder
	         * characters (assumes SD_ID128_STRING_MAX covers the placeholder plus the trailing NUL). */
	        sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX);
	        r = reset_environ(new_environment, sizeof(new_environment));
	        if (r < 0)
	                /* Best effort only, but leave a trace instead of dropping the error silently. */
	                log_debug_errno(r, "Failed to reset environment block of init stub process, ignoring: %m");

	        (void) rename_process("(sd-stubinit)");

	        assert_se(sigemptyset(&waitmask) >= 0);
	        assert_se(sigset_add_many(&waitmask,
	                                  SIGCHLD,          /* posix: process died */
	                                  SIGINT,           /* sysv: ctrl-alt-del */
	                                  SIGRTMIN+3,       /* systemd: halt */
	                                  SIGRTMIN+4,       /* systemd: poweroff */
	                                  SIGRTMIN+5,       /* systemd: reboot */
	                                  SIGRTMIN+6,       /* systemd: kexec */
	                                  SIGRTMIN+13,      /* systemd: halt */
	                                  SIGRTMIN+14,      /* systemd: poweroff */
	                                  SIGRTMIN+15,      /* systemd: reboot */
	                                  SIGRTMIN+16,      /* systemd: kexec */
	                                  -1) >= 0);

	        /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
	         * support reexec/reloading in this stub process. */

	        for (;;) {
	                siginfo_t si;
	                usec_t current_usec;

	                /* With WNOHANG, si_pid staying 0 tells us that there was nothing to reap. */
	                si.si_pid = 0;
	                r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
	                if (r < 0) {
	                        r = log_error_errno(errno, "Failed to reap children: %m");
	                        goto finish;
	                }

	                current_usec = now(CLOCK_MONOTONIC);

	                if (si.si_pid == pid || current_usec >= quit_usec) {

	                        /* The child we started ourselves died or we reached a timeout. */

	                        if (state == STATE_REBOOT) { /* dispatch a queued reboot */
	                                (void) reboot(RB_AUTOBOOT);
	                                r = log_error_errno(errno, "Failed to reboot: %m");
	                                goto finish;

	                        } else if (state == STATE_POWEROFF)
	                                (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */

	                        if (si.si_pid == pid && si.si_code == CLD_EXITED)
	                                r = si.si_status; /* pass on exit code */
	                        else
	                                r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */

	                        goto finish;
	                }
	                if (si.si_pid != 0)
	                        /* We reaped something. Retry until there's nothing more to reap. */
	                        continue;

	                /* Nothing left to reap: sleep until a signal arrives, bounded by the shutdown deadline if one
	                 * has been set. */
	                if (quit_usec == USEC_INFINITY)
	                        r = sigwaitinfo(&waitmask, &si);
	                else
	                        r = sigtimedwait(&waitmask, &si, TIMESPEC_STORE(quit_usec - current_usec));
	                if (r < 0) {
	                        if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
	                                continue;
	                        if (errno == EAGAIN) /* timeout reached */
	                                continue;

	                        r = log_error_errno(errno, "Failed to wait for signal: %m");
	                        goto finish;
	                }

	                if (si.si_signo == SIGCHLD)
	                        continue; /* Let's reap this */

	                /* A shutdown is already in progress, further shutdown requests are dropped. */
	                if (state != STATE_RUNNING)
	                        continue;

	                /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
	                 * constant… */

	                if (si.si_signo == SIGRTMIN+3 ||
	                    si.si_signo == SIGRTMIN+4 ||
	                    si.si_signo == SIGRTMIN+13 ||
	                    si.si_signo == SIGRTMIN+14)

	                        state = STATE_POWEROFF;

	                else if (si.si_signo == SIGINT ||
	                         si.si_signo == SIGRTMIN+5 ||
	                         si.si_signo == SIGRTMIN+6 ||
	                         si.si_signo == SIGRTMIN+15 ||
	                         si.si_signo == SIGRTMIN+16)

	                        state = STATE_REBOOT;
	                else
	                        assert_not_reached();

	                r = kill_and_sigcont(pid, SIGTERM);

	                /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We
	                 * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those
	                 * processes which handle both. That's because services tend to bind configuration reload or something
	                 * else to SIGHUP. */

	                if (r != -ESRCH)
	                        (void) kill(pid, SIGHUP);

	                /* Give the payload a bounded amount of time to go away before we give up waiting. */
	                quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
	        }

	finish:
	        /* Negative values are internal errors; everything else is passed on as exit status. */
	        _exit(r < 0 ? EXIT_FAILURE : r);
	}