/* -*- c-set-style: "K&R"; c-basic-offset: 8 -*-
 *
 * This file is part of PRoot.
 *
 * Copyright (C) 2014 STMicroelectronics
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301 USA.
 */
| |
| #include "build.h" |
| #include "arch.h" |
| |
| #if defined(HAVE_SECCOMP_FILTER) |
| |
| #include <sys/prctl.h> /* prctl(2), PR_* */ |
| #include <linux/filter.h> /* struct sock_*, */ |
| #include <linux/seccomp.h> /* SECCOMP_MODE_FILTER, */ |
| #include <linux/filter.h> /* struct sock_*, */ |
| #include <linux/audit.h> /* AUDIT_, */ |
| #include <sys/queue.h> /* LIST_FOREACH, */ |
| #include <sys/types.h> /* size_t, */ |
| #include <talloc.h> /* talloc_*, */ |
| #include <errno.h> /* E*, */ |
| #include <string.h> /* memcpy(3), */ |
| #include <stddef.h> /* offsetof(3), */ |
| #include <stdint.h> /* uint*_t, UINT*_MAX, */ |
| #include <assert.h> /* assert(3), */ |
| |
| #include "syscall/seccomp.h" |
| #include "tracee/tracee.h" |
| #include "syscall/syscall.h" |
| #include "syscall/sysnum.h" |
| #include "extension/extension.h" |
| #include "cli/note.h" |
| |
| #include "compat.h" |
| #include "attribute.h" |
| |
/* Debugging aid for the BPF generator.  It expands to nothing, so
 * its arguments are discarded and never evaluated.  To trace the
 * generated filter, make it expand to:
 *
 *     fprintf(stderr, __VA_ARGS__)
 *
 * (this may require including <stdio.h>). */
#define DEBUG_FILTER(...)
| |
| /** |
| * Allocate an empty @program->filter. This function returns -errno |
| * if an error occurred, otherwise 0. |
| */ |
| static int new_program_filter(struct sock_fprog *program) |
| { |
| program->filter = talloc_array(NULL, struct sock_filter, 0); |
| if (program->filter == NULL) |
| return -ENOMEM; |
| |
| program->len = 0; |
| return 0; |
| } |
| |
| /** |
| * Append to @program->filter the given @statements (@nb_statements |
| * items). This function returns -errno if an error occurred, |
| * otherwise 0. |
| */ |
| static int add_statements(struct sock_fprog *program, size_t nb_statements, |
| struct sock_filter *statements) |
| { |
| size_t length; |
| void *tmp; |
| size_t i; |
| |
| length = talloc_array_length(program->filter); |
| tmp = talloc_realloc(NULL, program->filter, struct sock_filter, length + nb_statements); |
| if (tmp == NULL) |
| return -ENOMEM; |
| program->filter = tmp; |
| |
| for (i = 0; i < nb_statements; i++, length++) |
| memcpy(&program->filter[length], &statements[i], sizeof(struct sock_filter)); |
| |
| return 0; |
| } |
| |
| /** |
| * Append to @program->filter the statements required to notify PRoot |
| * about the given @syscall made by a tracee, with the given @flag. |
| * This function returns -errno if an error occurred, otherwise 0. |
| */ |
| static int add_trace_syscall(struct sock_fprog *program, word_t syscall, int flag) |
| { |
| int status; |
| |
| /* Sanity check. */ |
| if (syscall > UINT32_MAX) |
| return -ERANGE; |
| |
| #define LENGTH_TRACE_SYSCALL 2 |
| struct sock_filter statements[LENGTH_TRACE_SYSCALL] = { |
| /* Compare the accumulator with the expected syscall: |
| * skip the next statement if not equal. */ |
| BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, syscall, 0, 1), |
| |
| /* Notify the tracer. */ |
| BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRACE + flag) |
| }; |
| |
| DEBUG_FILTER("FILTER: trace if syscall == %ld\n", syscall); |
| |
| status = add_statements(program, LENGTH_TRACE_SYSCALL, statements); |
| if (status < 0) |
| return status; |
| |
| return 0; |
| } |
| |
| /** |
| * Append to @program->filter the statements that allow anything (if |
| * unfiltered). Note that @nb_traced_syscalls is used to make a |
| * sanity check. This function returns -errno if an error occurred, |
| * otherwise 0. |
| */ |
| static int end_arch_section(struct sock_fprog *program, size_t nb_traced_syscalls) |
| { |
| int status; |
| |
| #define LENGTH_END_SECTION 1 |
| struct sock_filter statements[LENGTH_END_SECTION] = { |
| BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW) |
| }; |
| |
| DEBUG_FILTER("FILTER: allow\n"); |
| |
| status = add_statements(program, LENGTH_END_SECTION, statements); |
| if (status < 0) |
| return status; |
| |
| /* Sanity check, see start_arch_section(). */ |
| if ( talloc_array_length(program->filter) - program->len |
| != LENGTH_END_SECTION + nb_traced_syscalls * LENGTH_TRACE_SYSCALL) |
| return -ERANGE; |
| |
| return 0; |
| } |
| |
| /** |
| * Append to @program->filter the statements that check the current |
| * @architecture. Note that @nb_traced_syscalls is used to make a |
| * sanity check. This function returns -errno if an error occurred, |
| * otherwise 0. |
| */ |
| static int start_arch_section(struct sock_fprog *program, uint32_t arch, size_t nb_traced_syscalls) |
| { |
| const size_t arch_offset = offsetof(struct seccomp_data, arch); |
| const size_t syscall_offset = offsetof(struct seccomp_data, nr); |
| const size_t section_length = LENGTH_END_SECTION + |
| nb_traced_syscalls * LENGTH_TRACE_SYSCALL; |
| int status; |
| |
| /* Sanity checks. */ |
| if ( arch_offset > UINT32_MAX |
| || syscall_offset > UINT32_MAX |
| || section_length > UINT32_MAX - 1) |
| return -ERANGE; |
| |
| #define LENGTH_START_SECTION 4 |
| struct sock_filter statements[LENGTH_START_SECTION] = { |
| /* Load the current architecture into the |
| * accumulator. */ |
| BPF_STMT(BPF_LD + BPF_W + BPF_ABS, arch_offset), |
| |
| /* Compare the accumulator with the expected |
| * architecture: skip the following statement if |
| * equal. */ |
| BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, arch, 1, 0), |
| |
| /* This is not the expected architecture, so jump |
| * unconditionally to the end of this section. */ |
| BPF_STMT(BPF_JMP + BPF_JA + BPF_K, section_length + 1), |
| |
| /* This is the expected architecture, so load the |
| * current syscall into the accumulator. */ |
| BPF_STMT(BPF_LD + BPF_W + BPF_ABS, syscall_offset) |
| }; |
| |
| DEBUG_FILTER("FILTER: if arch == %ld, up to %zdth statement\n", |
| arch, nb_traced_syscalls); |
| |
| status = add_statements(program, LENGTH_START_SECTION, statements); |
| if (status < 0) |
| return status; |
| |
| /* See the sanity check in end_arch_section(). */ |
| program->len = talloc_array_length(program->filter); |
| |
| return 0; |
| } |
| |
| /** |
| * Append to @program->filter the statements that forbid anything (if |
| * unfiltered) and update @program->len. This function returns -errno |
| * if an error occurred, otherwise 0. |
| */ |
| static int finalize_program_filter(struct sock_fprog *program) |
| { |
| int status; |
| |
| #define LENGTH_FINALIZE 1 |
| struct sock_filter statements[LENGTH_FINALIZE] = { |
| BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL) |
| }; |
| |
| DEBUG_FILTER("FILTER: kill\n"); |
| |
| status = add_statements(program, LENGTH_FINALIZE, statements); |
| if (status < 0) |
| return status; |
| |
| program->len = talloc_array_length(program->filter); |
| |
| return 0; |
| } |
| |
| /** |
| * Free @program->filter and set @program->len to 0. |
| */ |
| static void free_program_filter(struct sock_fprog *program) |
| { |
| TALLOC_FREE(program->filter); |
| program->len = 0; |
| } |
| |
| /** |
| * Convert the given @sysnums into BPF filters according to the |
| * following pseudo-code, then enabled them for the given @tracee and |
| * all of its future children: |
| * |
| * for each handled architectures |
| * for each filtered syscall |
| * trace |
| * allow |
| * kill |
| * |
| * This function returns -errno if an error occurred, otherwise 0. |
| */ |
| static int set_seccomp_filters(const FilteredSysnum *sysnums) |
| { |
| SeccompArch seccomp_archs[] = SECCOMP_ARCHS; |
| size_t nb_archs = sizeof(seccomp_archs) / sizeof(SeccompArch); |
| |
| struct sock_fprog program = { .len = 0, .filter = NULL }; |
| size_t nb_traced_syscalls; |
| size_t i, j, k; |
| int status; |
| |
| status = new_program_filter(&program); |
| if (status < 0) |
| goto end; |
| |
| /* For each handled architectures */ |
| for (i = 0; i < nb_archs; i++) { |
| word_t syscall; |
| |
| nb_traced_syscalls = 0; |
| |
| /* Pre-compute the number of traced syscalls for this architecture. */ |
| for (j = 0; j < seccomp_archs[i].nb_abis; j++) { |
| for (k = 0; sysnums[k].value != PR_void; k++) { |
| syscall = detranslate_sysnum(seccomp_archs[i].abis[j], sysnums[k].value); |
| if (syscall != SYSCALL_AVOIDER) |
| nb_traced_syscalls++; |
| } |
| } |
| |
| /* Filter: if handled architecture */ |
| status = start_arch_section(&program, seccomp_archs[i].value, nb_traced_syscalls); |
| if (status < 0) |
| goto end; |
| |
| for (j = 0; j < seccomp_archs[i].nb_abis; j++) { |
| for (k = 0; sysnums[k].value != PR_void; k++) { |
| /* Get the architecture specific syscall number. */ |
| syscall = detranslate_sysnum(seccomp_archs[i].abis[j], sysnums[k].value); |
| if (syscall == SYSCALL_AVOIDER) |
| continue; |
| |
| /* Filter: trace if handled syscall */ |
| status = add_trace_syscall(&program, syscall, sysnums[k].flags); |
| if (status < 0) |
| goto end; |
| } |
| } |
| |
| /* Filter: allow untraced syscalls for this architecture */ |
| status = end_arch_section(&program, nb_traced_syscalls); |
| if (status < 0) |
| goto end; |
| } |
| |
| status = finalize_program_filter(&program); |
| if (status < 0) |
| goto end; |
| |
| status = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); |
| if (status < 0) |
| goto end; |
| |
| /* To output this BPF program for debug purpose: |
| * |
| * write(2, program.filter, program.len * sizeof(struct sock_filter)); |
| */ |
| |
| status = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program); |
| if (status < 0) |
| goto end; |
| |
| status = 0; |
| end: |
| free_program_filter(&program); |
| return status; |
| } |
| |
| /* List of sysnums handled by PRoot. */ |
| static FilteredSysnum proot_sysnums[] = { |
| { PR_accept, FILTER_SYSEXIT }, |
| { PR_accept4, FILTER_SYSEXIT }, |
| { PR_access, 0 }, |
| { PR_acct, 0 }, |
| { PR_bind, 0 }, |
| { PR_brk, FILTER_SYSEXIT }, |
| { PR_chdir, FILTER_SYSEXIT }, |
| { PR_chmod, 0 }, |
| { PR_chown, 0 }, |
| { PR_chown32, 0 }, |
| { PR_chroot, 0 }, |
| { PR_connect, 0 }, |
| { PR_creat, 0 }, |
| { PR_execve, FILTER_SYSEXIT }, |
| { PR_faccessat, 0 }, |
| { PR_fchdir, FILTER_SYSEXIT }, |
| { PR_fchmodat, 0 }, |
| { PR_fchownat, 0 }, |
| { PR_fstatat64, 0 }, |
| { PR_futimesat, 0 }, |
| { PR_getcwd, FILTER_SYSEXIT }, |
| { PR_getpeername, FILTER_SYSEXIT }, |
| { PR_getsockname, FILTER_SYSEXIT }, |
| { PR_getxattr, 0 }, |
| { PR_inotify_add_watch, 0 }, |
| { PR_lchown, 0 }, |
| { PR_lchown32, 0 }, |
| { PR_lgetxattr, 0 }, |
| { PR_link, 0 }, |
| { PR_linkat, 0 }, |
| { PR_listxattr, 0 }, |
| { PR_llistxattr, 0 }, |
| { PR_lremovexattr, 0 }, |
| { PR_lsetxattr, 0 }, |
| { PR_lstat, 0 }, |
| { PR_lstat64, 0 }, |
| { PR_mkdir, 0 }, |
| { PR_mkdirat, 0 }, |
| { PR_mknod, 0 }, |
| { PR_mknodat, 0 }, |
| { PR_mount, 0 }, |
| { PR_name_to_handle_at, 0 }, |
| { PR_newfstatat, 0 }, |
| { PR_oldlstat, 0 }, |
| { PR_oldstat, 0 }, |
| { PR_open, 0 }, |
| { PR_openat, 0 }, |
| { PR_pivot_root, 0 }, |
| { PR_ptrace, FILTER_SYSEXIT }, |
| { PR_readlink, FILTER_SYSEXIT }, |
| { PR_readlinkat, FILTER_SYSEXIT }, |
| { PR_removexattr, 0 }, |
| { PR_rename, FILTER_SYSEXIT }, |
| { PR_renameat, FILTER_SYSEXIT }, |
| { PR_renameat2, FILTER_SYSEXIT }, |
| { PR_rmdir, 0 }, |
| { PR_setxattr, 0 }, |
| { PR_socketcall, FILTER_SYSEXIT }, |
| { PR_stat, 0 }, |
| { PR_stat64, 0 }, |
| { PR_statfs, 0 }, |
| { PR_statfs64, 0 }, |
| { PR_swapoff, 0 }, |
| { PR_swapon, 0 }, |
| { PR_symlink, 0 }, |
| { PR_symlinkat, 0 }, |
| { PR_truncate, 0 }, |
| { PR_truncate64, 0 }, |
| { PR_umount, 0 }, |
| { PR_umount2, 0 }, |
| { PR_uname, FILTER_SYSEXIT }, |
| { PR_unlink, 0 }, |
| { PR_unlinkat, 0 }, |
| { PR_uselib, 0 }, |
| { PR_utime, 0 }, |
| { PR_utimensat, 0 }, |
| { PR_utimes, 0 }, |
| { PR_wait4, FILTER_SYSEXIT }, |
| { PR_waitpid, FILTER_SYSEXIT }, |
| FILTERED_SYSNUM_END, |
| }; |
| |
| /** |
| * Add the @new_sysnums to the list of filtered @sysnums, using the |
| * given Talloc @context. This function returns -errno if an error |
| * occurred, otherwise 0. |
| */ |
| static int merge_filtered_sysnums(TALLOC_CTX *context, FilteredSysnum **sysnums, |
| const FilteredSysnum *new_sysnums) |
| { |
| size_t i, j; |
| |
| assert(sysnums != NULL); |
| |
| if (*sysnums == NULL) { |
| /* Start with no sysnums but the terminator. */ |
| *sysnums = talloc_array(context, FilteredSysnum, 1); |
| if (*sysnums == NULL) |
| return -ENOMEM; |
| |
| (*sysnums)[0].value = PR_void; |
| } |
| |
| for (i = 0; new_sysnums[i].value != PR_void; i++) { |
| /* Search for the given sysnum. */ |
| for (j = 0; (*sysnums)[j].value != PR_void |
| && (*sysnums)[j].value != new_sysnums[i].value; j++) |
| ; |
| |
| if ((*sysnums)[j].value == PR_void) { |
| /* No such sysnum, allocate a new entry. */ |
| (*sysnums) = talloc_realloc(context, (*sysnums), FilteredSysnum, j + 2); |
| if ((*sysnums) == NULL) |
| return -ENOMEM; |
| |
| (*sysnums)[j] = new_sysnums[i]; |
| |
| /* The last item is the terminator. */ |
| (*sysnums)[j + 1].value = PR_void; |
| } |
| else |
| /* The sysnum is already filtered, merge the |
| * flags. */ |
| (*sysnums)[j].flags |= new_sysnums[i].flags; |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * Tell the kernel to trace only syscalls handled by PRoot and its |
| * extensions. This filter will be enabled for the given @tracee and |
| * all of its future children. This function returns -errno if an |
| * error occurred, otherwise 0. |
| */ |
| int enable_syscall_filtering(const Tracee *tracee) |
| { |
| FilteredSysnum *filtered_sysnums = NULL; |
| Extension *extension; |
| int status; |
| |
| assert(tracee != NULL && tracee->ctx != NULL); |
| |
| /* Add the sysnums required by PRoot to the list of filtered |
| * sysnums. TODO: only if path translation is required. */ |
| status = merge_filtered_sysnums(tracee->ctx, &filtered_sysnums, proot_sysnums); |
| if (status < 0) |
| return status; |
| |
| /* Merge the sysnums required by the extensions to the list |
| * of filtered sysnums. */ |
| if (tracee->extensions != NULL) { |
| LIST_FOREACH(extension, tracee->extensions, link) { |
| if (extension->filtered_sysnums == NULL) |
| continue; |
| |
| status = merge_filtered_sysnums(tracee->ctx, &filtered_sysnums, |
| extension->filtered_sysnums); |
| if (status < 0) |
| return status; |
| } |
| } |
| |
| status = set_seccomp_filters(filtered_sysnums); |
| if (status < 0) |
| return status; |
| |
| return 0; |
| } |
| |
| #else |
| |
| #include "tracee/tracee.h" |
| #include "attribute.h" |
| |
| int enable_syscall_filtering(const Tracee *tracee UNUSED) |
| { |
| return 0; |
| } |
| |
| #endif /* defined(HAVE_SECCOMP_FILTER) */ |