blob: bafc3c5e5eb8a50cc6102561274562025bb7b3bc [file] [edit]
/* -*- c-set-style: "K&R"; c-basic-offset: 8 -*-
*
* This file is part of PRoot.
*
* Copyright (C) 2014 STMicroelectronics
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301 USA.
*/
#include <sched.h> /* CLONE_*, */
#include <sys/types.h> /* pid_t, */
#include <sys/ptrace.h> /* ptrace(1), PTRACE_*, */
#include <sys/types.h> /* waitpid(2), */
#include <sys/wait.h> /* waitpid(2), */
#include <sys/utsname.h> /* uname(2), */
#include <unistd.h> /* fork(2), chdir(2), getpid(2), */
#include <string.h> /* strcmp(3), */
#include <errno.h> /* errno(3), */
#include <stdbool.h> /* bool, true, false, */
#include <assert.h> /* assert(3), */
#include <stdlib.h> /* atexit(3), getenv(3), */
#include <talloc.h> /* talloc_*, */
#include "tracee/event.h"
#include "cli/note.h"
#include "path/path.h"
#include "path/binding.h"
#include "syscall/syscall.h"
#include "syscall/seccomp.h"
#include "ptrace/wait.h"
#include "extension/extension.h"
#include "execve/elf.h"
#include "attribute.h"
#include "compat.h"
/**
* Start @tracee->exe with the given @argv[]. This function
* returns -errno if an error occurred, otherwise 0.
*/
int launch_process(Tracee *tracee, char *const argv[])
{
char *const default_argv[] = { "-sh", NULL };
long status;
pid_t pid;
/* Warn about open file descriptors. They won't be
* translated until they are closed. */
if (tracee->verbose > 0)
list_open_fd(tracee);
pid = fork();
switch(pid) {
case -1:
note(tracee, ERROR, SYSTEM, "fork()");
return -errno;
case 0: /* child */
/* Declare myself as ptraceable before executing the
* requested program. */
status = ptrace(PTRACE_TRACEME, 0, NULL, NULL);
if (status < 0) {
note(tracee, ERROR, SYSTEM, "ptrace(TRACEME)");
return -errno;
}
/* Synchronize with the tracer's event loop. Without
* this trick the tracer only sees the "return" from
* the next execve(2) so PRoot wouldn't handle the
* interpreter/runner. I also verified that strace
* does the same thing. */
kill(getpid(), SIGSTOP);
/* Improve performance by using seccomp mode 2, unless
* this support is explicitly disabled. */
if (getenv("PROOT_NO_SECCOMP") == NULL)
(void) enable_syscall_filtering(tracee);
/* Now process is ptraced, so the current rootfs is already the
* guest rootfs. Note: Valgrind can't handle execve(2) on
* "foreign" binaries (ENOEXEC) but can handle execvp(3) on such
* binaries. */
execvp(tracee->exe, argv[0] != NULL ? argv : default_argv);
return -errno;
default: /* parent */
/* We know the pid of the first tracee now. */
tracee->pid = pid;
return 0;
}
/* Never reached. */
return -ENOSYS;
}
/* Send the KILL signal to all tracees when PRoot has received a fatal
* signal. */
static void kill_all_tracees2(int signum, siginfo_t *siginfo UNUSED, void *ucontext UNUSED)
{
note(NULL, WARNING, INTERNAL, "signal %d received from process %d",
signum, siginfo->si_pid);
kill_all_tracees();
/* Exit immediately for system signals (segmentation fault,
* illegal instruction, ...), otherwise exit cleanly through
* the event loop. */
if (signum != SIGQUIT)
_exit(EXIT_FAILURE);
}
/**
* Helper for print_talloc_hierarchy().
*/
static void print_talloc_chunk(const void *ptr, int depth, int max_depth UNUSED,
int is_ref, void *data UNUSED)
{
const char *name;
size_t count;
size_t size;
name = talloc_get_name(ptr);
size = talloc_get_size(ptr);
count = talloc_reference_count(ptr);
if (depth == 0)
return;
while (depth-- > 1)
fprintf(stderr, "\t");
fprintf(stderr, "%-16s ", name);
if (is_ref)
fprintf(stderr, "-> %-8p", ptr);
else {
fprintf(stderr, "%-8p %zd bytes %zd ref'", ptr, size, count);
if (name[0] == '$') {
fprintf(stderr, "\t(\"%s\")", (char *)ptr);
}
if (name[0] == '@') {
char **argv;
int i;
fprintf(stderr, "\t(");
for (i = 0, argv = (char **)ptr; argv[i] != NULL; i++)
fprintf(stderr, "\"%s\", ", argv[i]);
fprintf(stderr, ")");
}
else if (strcmp(name, "Tracee") == 0) {
fprintf(stderr, "\t(pid = %d, parent = %p)",
((Tracee *)ptr)->pid, ((Tracee *)ptr)->parent);
}
else if (strcmp(name, "Bindings") == 0) {
Tracee *tracee;
tracee = TRACEE(ptr);
if (ptr == tracee->fs->bindings.pending)
fprintf(stderr, "\t(pending)");
else if (ptr == tracee->fs->bindings.guest)
fprintf(stderr, "\t(guest)");
else if (ptr == tracee->fs->bindings.host)
fprintf(stderr, "\t(host)");
}
else if (strcmp(name, "Binding") == 0) {
Binding *binding = (Binding *)ptr;
fprintf(stderr, "\t(%s:%s)", binding->host.path, binding->guest.path);
}
}
fprintf(stderr, "\n");
}
/* Print on stderr the complete talloc hierarchy. */
static void print_talloc_hierarchy(int signum, siginfo_t *siginfo UNUSED, void *ucontext UNUSED)
{
switch (signum) {
case SIGUSR1:
talloc_report_depth_cb(NULL, 0, 100, print_talloc_chunk, NULL);
break;
case SIGUSR2:
talloc_report_depth_file(NULL, 0, 100, stderr);
break;
default:
break;
}
}
static int last_exit_status = -1;
/**
* Check if this instance of PRoot can *technically* handle @tracee.
*/
static void check_architecture(Tracee *tracee)
{
struct utsname utsname;
ElfHeader elf_header;
char path[PATH_MAX];
int status;
if (tracee->exe == NULL)
return;
status = translate_path(tracee, path, AT_FDCWD, tracee->exe, false);
if (status < 0)
return;
status = open_elf(path, &elf_header);
if (status < 0)
return;
close(status);
if (!IS_CLASS64(elf_header) || sizeof(word_t) == sizeof(uint64_t))
return;
note(tracee, ERROR, USER,
"'%s' is a 64-bit program whereas this version of "
"%s handles 32-bit programs only", path, tracee->tool_name);
status = uname(&utsname);
if (status < 0)
return;
if (strcmp(utsname.machine, "x86_64") != 0)
return;
note(tracee, INFO, USER,
"Get a 64-bit version that supports 32-bit binaries here: "
"http://static.proot.me/proot-x86_64");
}
/**
* Wait then handle any event from any tracee. This function returns
* the exit status of the last terminated program.
*/
int event_loop()
{
struct sigaction signal_action;
long status;
int signum;
/* Kill all tracees when exiting. */
status = atexit(kill_all_tracees);
if (status != 0)
note(NULL, WARNING, INTERNAL, "atexit() failed");
/* All signals are blocked when the signal handler is called.
* SIGINFO is used to know which process has signaled us and
* RESTART is used to restart waitpid(2) seamlessly. */
bzero(&signal_action, sizeof(signal_action));
signal_action.sa_flags = SA_SIGINFO | SA_RESTART;
status = sigfillset(&signal_action.sa_mask);
if (status < 0)
note(NULL, WARNING, SYSTEM, "sigfillset()");
/* Handle all signals. */
for (signum = 0; signum < SIGRTMAX; signum++) {
switch (signum) {
case SIGQUIT:
case SIGILL:
case SIGABRT:
case SIGFPE:
case SIGSEGV:
/* Kill all tracees on abnormal termination
* signals. This ensures no process is left
* untraced. */
signal_action.sa_sigaction = kill_all_tracees2;
break;
case SIGUSR1:
case SIGUSR2:
/* Print on stderr the complete talloc
* hierarchy, useful for debug purpose. */
signal_action.sa_sigaction = print_talloc_hierarchy;
break;
case SIGCHLD:
case SIGCONT:
case SIGSTOP:
case SIGTSTP:
case SIGTTIN:
case SIGTTOU:
/* The default action is OK for these signals,
* they are related to tty and job control. */
continue;
default:
/* Ignore all other signals, including
* terminating ones (^C for instance). */
signal_action.sa_sigaction = (void *)SIG_IGN;
break;
}
status = sigaction(signum, &signal_action, NULL);
if (status < 0 && errno != EINVAL)
note(NULL, WARNING, SYSTEM, "sigaction(%d)", signum);
}
while (1) {
int tracee_status;
Tracee *tracee;
int signal;
pid_t pid;
/* This is the only safe place to free tracees. */
free_terminated_tracees();
/* Wait for the next tracee's stop. */
pid = waitpid(-1, &tracee_status, __WALL);
if (pid < 0) {
if (errno != ECHILD) {
note(NULL, ERROR, SYSTEM, "waitpid()");
return EXIT_FAILURE;
}
break;
}
/* Get information about this tracee. */
tracee = get_tracee(NULL, pid, true);
assert(tracee != NULL);
tracee->running = false;
status = notify_extensions(tracee, NEW_STATUS, tracee_status, 0);
if (status != 0)
continue;
if (tracee->as_ptracee.ptracer != NULL) {
bool keep_stopped = handle_ptracee_event(tracee, tracee_status);
if (keep_stopped)
continue;
}
signal = handle_tracee_event(tracee, tracee_status);
(void) restart_tracee(tracee, signal);
}
return last_exit_status;
}
/**
* Handle the current event (@tracee_status) of the given @tracee.
* This function returns the "computed" signal that should be used to
* restart the given @tracee.
*/
int handle_tracee_event(Tracee *tracee, int tracee_status)
{
static bool seccomp_detected = false;
pid_t pid = tracee->pid;
long status;
int signal;
/* Don't overwrite restart_how if it is explicitly set
* elsewhere, i.e in the ptrace emulation when single
* stepping. */
if (tracee->restart_how == 0) {
/* When seccomp is enabled, all events are restarted in
* non-stop mode, but this default choice could be overwritten
* later if necessary. The check against "sysexit_pending"
* ensures PTRACE_SYSCALL (used to hit the exit stage under
* seccomp) is not cleared due to an event that would happen
* before the exit stage, eg. PTRACE_EVENT_EXEC for the exit
* stage of execve(2). */
if (tracee->seccomp == ENABLED && !tracee->sysexit_pending)
tracee->restart_how = PTRACE_CONT;
else
tracee->restart_how = PTRACE_SYSCALL;
}
/* Not a signal-stop by default. */
signal = 0;
if (WIFEXITED(tracee_status)) {
last_exit_status = WEXITSTATUS(tracee_status);
VERBOSE(tracee, 1, "pid %d: exited with status %d", pid, last_exit_status);
tracee->terminated = true;
}
else if (WIFSIGNALED(tracee_status)) {
check_architecture(tracee);
VERBOSE(tracee, (int) (last_exit_status != -1),
"pid %d: terminated with signal %d", pid, WTERMSIG(tracee_status));
tracee->terminated = true;
}
else if (WIFSTOPPED(tracee_status)) {
/* Don't use WSTOPSIG() to extract the signal
* since it clears the PTRACE_EVENT_* bits. */
signal = (tracee_status & 0xfff00) >> 8;
switch (signal) {
static bool deliver_sigtrap = false;
case SIGTRAP: {
const unsigned long default_ptrace_options = (
PTRACE_O_TRACESYSGOOD |
PTRACE_O_TRACEFORK |
PTRACE_O_TRACEVFORK |
PTRACE_O_TRACEVFORKDONE |
PTRACE_O_TRACEEXEC |
PTRACE_O_TRACECLONE |
PTRACE_O_TRACEEXIT);
/* Distinguish some events from others and
* automatically trace each new process with
* the same options.
*
* Note that only the first bare SIGTRAP is
* related to the tracing loop, others SIGTRAP
* carry tracing information because of
* TRACE*FORK/CLONE/EXEC. */
if (deliver_sigtrap)
break; /* Deliver this signal as-is. */
deliver_sigtrap = true;
/* Try to enable seccomp mode 2... */
status = ptrace(PTRACE_SETOPTIONS, tracee->pid, NULL,
default_ptrace_options | PTRACE_O_TRACESECCOMP);
if (status < 0) {
/* ... otherwise use default options only. */
status = ptrace(PTRACE_SETOPTIONS, tracee->pid, NULL,
default_ptrace_options);
if (status < 0) {
note(tracee, ERROR, SYSTEM, "ptrace(PTRACE_SETOPTIONS)");
exit(EXIT_FAILURE);
}
}
}
/* Fall through. */
case SIGTRAP | 0x80:
signal = 0;
/* This tracee got signaled then freed during the
sysenter stage but the kernel reports the sysexit
stage; just discard this spurious tracee/event. */
if (tracee->exe == NULL) {
tracee->restart_how = PTRACE_CONT;
return 0;
}
switch (tracee->seccomp) {
case ENABLED:
if (IS_IN_SYSENTER(tracee)) {
/* sysenter: ensure the sysexit
* stage will be hit under seccomp. */
tracee->restart_how = PTRACE_SYSCALL;
tracee->sysexit_pending = true;
}
else {
/* sysexit: the next sysenter
* will be notified by seccomp. */
tracee->restart_how = PTRACE_CONT;
tracee->sysexit_pending = false;
}
/* Fall through. */
case DISABLED:
translate_syscall(tracee);
/* This syscall has disabled seccomp. */
if (tracee->seccomp == DISABLING) {
tracee->restart_how = PTRACE_SYSCALL;
tracee->seccomp = DISABLED;
}
break;
case DISABLING:
/* Seccomp was disabled by the
* previous syscall, but its sysenter
* stage was already handled. */
tracee->seccomp = DISABLED;
if (IS_IN_SYSENTER(tracee))
tracee->status = 1;
break;
}
break;
case SIGTRAP | PTRACE_EVENT_SECCOMP2 << 8:
case SIGTRAP | PTRACE_EVENT_SECCOMP << 8: {
unsigned long flags = 0;
signal = 0;
if (!seccomp_detected) {
VERBOSE(tracee, 1, "ptrace acceleration (seccomp mode 2) enabled");
tracee->seccomp = ENABLED;
seccomp_detected = true;
}
/* Use the common ptrace flow if seccomp was
* explicitely disabled for this tracee. */
if (tracee->seccomp != ENABLED)
break;
status = ptrace(PTRACE_GETEVENTMSG, tracee->pid, NULL, &flags);
if (status < 0)
break;
/* Use the common ptrace flow when
* sysexit has to be handled. */
if ((flags & FILTER_SYSEXIT) != 0) {
tracee->restart_how = PTRACE_SYSCALL;
break;
}
/* Otherwise, handle the sysenter
* stage right now. */
tracee->restart_how = PTRACE_CONT;
translate_syscall(tracee);
/* This syscall has disabled seccomp, so move
* the ptrace flow back to the common path to
* ensure its sysexit will be handled. */
if (tracee->seccomp == DISABLING)
tracee->restart_how = PTRACE_SYSCALL;
break;
}
case SIGTRAP | PTRACE_EVENT_VFORK << 8:
signal = 0;
(void) new_child(tracee, CLONE_VFORK);
break;
case SIGTRAP | PTRACE_EVENT_FORK << 8:
case SIGTRAP | PTRACE_EVENT_CLONE << 8:
signal = 0;
(void) new_child(tracee, 0);
break;
case SIGTRAP | PTRACE_EVENT_VFORK_DONE << 8:
case SIGTRAP | PTRACE_EVENT_EXEC << 8:
case SIGTRAP | PTRACE_EVENT_EXIT << 8:
signal = 0;
break;
case SIGSTOP:
/* Stop this tracee until PRoot has received
* the EVENT_*FORK|CLONE notification. */
if (tracee->exe == NULL) {
tracee->sigstop = SIGSTOP_PENDING;
signal = -1;
}
/* For each tracee, the first SIGSTOP
* is only used to notify the tracer. */
if (tracee->sigstop == SIGSTOP_IGNORED) {
tracee->sigstop = SIGSTOP_ALLOWED;
signal = 0;
}
break;
default:
/* Deliver this signal as-is. */
break;
}
}
/* Clear the pending event, if any. */
tracee->as_ptracee.event4.proot.pending = false;
return signal;
}
/**
* Restart the given @tracee with the specified @signal. This
* function returns false if the tracee was not restarted (error or
* put in the "waiting for ptracee" state), otherwise true.
*/
bool restart_tracee(Tracee *tracee, int signal)
{
int status;
/* Put in the "stopped"/"waiting for ptracee" state?. */
if (tracee->as_ptracer.wait_pid != 0 || signal == -1)
return false;
/* Restart the tracee and stop it at the next instruction, or
* at the next entry or exit of a system call. */
status = ptrace(tracee->restart_how, tracee->pid, NULL, signal);
if (status < 0)
return false; /* The process likely died in a syscall. */
tracee->restart_how = 0;
tracee->running = true;
return true;
}