| /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
| |
| #include <fcntl.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #include "alloc-util.h" |
| #include "bpf-program.h" |
| #include "escape.h" |
| #include "fd-util.h" |
| #include "memory-util.h" |
| #include "missing_syscall.h" |
| #include "path-util.h" |
| #include "serialize.h" |
| #include "string-table.h" |
| |
| static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = { |
| [BPF_CGROUP_INET_INGRESS] = "ingress", |
| [BPF_CGROUP_INET_EGRESS] = "egress", |
| [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", |
| [BPF_CGROUP_SOCK_OPS] = "sock_ops", |
| [BPF_CGROUP_DEVICE] = "device", |
| [BPF_CGROUP_INET4_BIND] = "bind4", |
| [BPF_CGROUP_INET6_BIND] = "bind6", |
| [BPF_CGROUP_INET4_CONNECT] = "connect4", |
| [BPF_CGROUP_INET6_CONNECT] = "connect6", |
| [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", |
| [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", |
| [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", |
| [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", |
| [BPF_CGROUP_SYSCTL] = "sysctl", |
| [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", |
| [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", |
| [BPF_CGROUP_GETSOCKOPT] = "getsockopt", |
| [BPF_CGROUP_SETSOCKOPT] = "setsockopt", |
| }; |
| |
| DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int); |
| |
| DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_unref); |
| |
| /* The caller's struct bpf_prog_info must be zero-initialized, since it is used both as input and as |
| * output of the BPF_OBJ_GET_INFO_BY_FD syscall. */ |
| static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) { |
| union bpf_attr attr; |
| |
| /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when |
| * structured initialization is used. |
| * Refer to https://github.com/systemd/systemd/issues/18164 |
| */ |
| zero(attr); |
| attr.info.bpf_fd = prog_fd; |
| attr.info.info_len = info_len; |
| attr.info.info = PTR_TO_UINT64(info); |
| |
| if (bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)) < 0) |
| return -errno; |
| |
| return 0; |
| } |
| |
| int bpf_program_new(uint32_t prog_type, BPFProgram **ret) { |
| _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; |
| |
| assert(ret); |
| |
| p = new(BPFProgram, 1); |
| if (!p) |
| return -ENOMEM; |
| |
| *p = (BPFProgram) { |
| .n_ref = 1, |
| .prog_type = prog_type, |
| .kernel_fd = -1, |
| }; |
| |
| *ret = TAKE_PTR(p); |
| |
| return 0; |
| } |
| |
| int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) { |
| _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; |
| struct bpf_prog_info info = {}; |
| int r; |
| |
| assert(path); |
| assert(ret); |
| |
| p = new(BPFProgram, 1); |
| if (!p) |
| return -ENOMEM; |
| |
| *p = (BPFProgram) { |
| .prog_type = BPF_PROG_TYPE_UNSPEC, |
| .n_ref = 1, |
| .kernel_fd = -1, |
| }; |
| |
| r = bpf_program_load_from_bpf_fs(p, path); |
| if (r < 0) |
| return r; |
| |
| r = bpf_program_get_info_by_fd(p->kernel_fd, &info, sizeof(info)); |
| if (r < 0) |
| return r; |
| |
| p->prog_type = info.type; |
| *ret = TAKE_PTR(p); |
| |
| return 0; |
| } |
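| |
| /* Usage sketch, illustrative only; the pin path below is a hypothetical example, not one systemd |
| * creates itself: |
| * |
| *     _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; |
| *     int r; |
| * |
| *     r = bpf_program_new_from_bpffs_path("/sys/fs/bpf/my-prog", &p); |
| *     if (r < 0) |
| *             return log_error_errno(r, "Failed to load pinned BPF program: %m"); |
| * |
| * The program type is then available in p->prog_type, as queried from the kernel above. */ |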
| |
| static BPFProgram *bpf_program_free(BPFProgram *p) { |
| assert(p); |
| |
| /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their cgroups when the last |
| * fd to the BPF program is closed. This has nasty side-effects since it means that abnormally terminated |
| * programs that attached one of their BPF programs to a cgroup will leave these programs pinned for good with |
| * zero chance of recovery, until the cgroup is removed. This is particularly problematic if the cgroup in |
| * question is the root cgroup (or any other cgroup belonging to a service that cannot be restarted during |
| * operation, such as dbus), as the memory for the BPF program can only be reclaimed through a reboot. To |
| * counter this, we closely track which cgroup a program was attached to, and detach it on our own |
| * whenever we close the BPF fd. */ |
| (void) bpf_program_cgroup_detach(p); |
| |
| safe_close(p->kernel_fd); |
| free(p->instructions); |
| free(p->attached_path); |
| |
| return mfree(p); |
| } |
| |
| DEFINE_TRIVIAL_REF_UNREF_FUNC(BPFProgram, bpf_program, bpf_program_free); |
| |
| int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) { |
| |
| assert(p); |
| |
| if (p->kernel_fd >= 0) /* don't allow modification after we uploaded things to the kernel */ |
| return -EBUSY; |
| |
| if (!GREEDY_REALLOC(p->instructions, p->n_instructions + count)) |
| return -ENOMEM; |
| |
| memcpy(p->instructions + p->n_instructions, instructions, sizeof(struct bpf_insn) * count); |
| p->n_instructions += count; |
| |
| return 0; |
| } |
| |
| int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) { |
| union bpf_attr attr; |
| |
| assert(p); |
| |
| if (p->kernel_fd >= 0) { /* make this idempotent */ |
| memzero(log_buf, log_size); |
| return 0; |
| } |
| |
| // FIXME: Clang doesn't 0-pad with structured initialization, causing |
| // the kernel to reject the bpf_attr as invalid. See: |
| // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65 |
| // Ideally it should behave like GCC, so that we can remove these workarounds. |
| zero(attr); |
| attr.prog_type = p->prog_type; |
| attr.insns = PTR_TO_UINT64(p->instructions); |
| attr.insn_cnt = p->n_instructions; |
| attr.license = PTR_TO_UINT64("GPL"); |
| attr.log_buf = PTR_TO_UINT64(log_buf); |
| attr.log_level = !!log_buf; |
| attr.log_size = log_size; |
| |
| p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); |
| if (p->kernel_fd < 0) |
| return -errno; |
| |
| return 0; |
| } |
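| |
| /* Usage sketch for the assemble-and-load flow above, illustrative only: the instruction sequence is a |
| * hypothetical example (a trivial cgroup-skb program that just returns 1, i.e. "allow"), assuming the |
| * usual BPF_MOV64_IMM()/BPF_EXIT_INSN() instruction macros are available to the caller: |
| * |
| *     _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; |
| *     const struct bpf_insn insn[] = { |
| *             BPF_MOV64_IMM(BPF_REG_0, 1), |
| *             BPF_EXIT_INSN(), |
| *     }; |
| *     int r; |
| * |
| *     r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p); |
| *     if (r < 0) |
| *             return r; |
| * |
| *     r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); |
| *     if (r < 0) |
| *             return r; |
| * |
| *     r = bpf_program_load_kernel(p, NULL, 0); |
| *     if (r < 0) |
| *             return r; |
| */ |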
| |
| int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) { |
| union bpf_attr attr; |
| |
| assert(p); |
| |
| if (p->kernel_fd >= 0) /* don't overwrite an assembled or loaded program */ |
| return -EBUSY; |
| |
| zero(attr); |
| attr.pathname = PTR_TO_UINT64(path); |
| |
| p->kernel_fd = bpf(BPF_OBJ_GET, &attr, sizeof(attr)); |
| if (p->kernel_fd < 0) |
| return -errno; |
| |
| return 0; |
| } |
| |
| int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) { |
| _cleanup_free_ char *copy = NULL; |
| _cleanup_close_ int fd = -1; |
| union bpf_attr attr; |
| int r; |
| |
| assert(p); |
| assert(type >= 0); |
| assert(path); |
| |
| if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI)) |
| return -EINVAL; |
| |
| /* We need to track which cgroup the program is attached to, and we can only track one attachment, hence let's |
| * refuse this early. */ |
| if (p->attached_path) { |
| if (!path_equal(p->attached_path, path)) |
| return -EBUSY; |
| if (p->attached_type != type) |
| return -EBUSY; |
| if (p->attached_flags != flags) |
| return -EBUSY; |
| |
| /* Here's a shortcut: if we previously attached this program already, then we don't have to do so |
| * again. Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have |
| * replaced our program since the last time, hence let's reattach it, just to be safe. In flags == 0 |
| * mode this is not an issue since nobody else can replace our program in that case, and in flags == |
| * BPF_F_ALLOW_MULTI mode anybody else's program would be installed in addition to ours, hence ours |
| * would remain in effect. */ |
| if (flags != BPF_F_ALLOW_OVERRIDE) |
| return 0; |
| } |
| |
| /* Ensure we have a kernel object for this. */ |
| r = bpf_program_load_kernel(p, NULL, 0); |
| if (r < 0) |
| return r; |
| |
| copy = strdup(path); |
| if (!copy) |
| return -ENOMEM; |
| |
| fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); |
| if (fd < 0) |
| return -errno; |
| |
| zero(attr); |
| attr.attach_type = type; |
| attr.target_fd = fd; |
| attr.attach_bpf_fd = p->kernel_fd; |
| attr.attach_flags = flags; |
| |
| if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) |
| return -errno; |
| |
| free_and_replace(p->attached_path, copy); |
| p->attached_type = type; |
| p->attached_flags = flags; |
| |
| return 0; |
| } |
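| |
| /* Illustrative sketch; the cgroup path is a made-up example. Attach the program loaded above to a |
| * cgroup, allowing other programs to be attached alongside it: |
| * |
| *     r = bpf_program_cgroup_attach(p, BPF_CGROUP_INET_EGRESS, "/sys/fs/cgroup/system.slice/foo.service", BPF_F_ALLOW_MULTI); |
| *     if (r < 0) |
| *             return log_error_errno(r, "Failed to attach BPF program to cgroup: %m"); |
| * |
| * The detach happens either explicitly via bpf_program_cgroup_detach() below, or implicitly when the |
| * last reference to the BPFProgram is dropped. */ |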
| |
| int bpf_program_cgroup_detach(BPFProgram *p) { |
| _cleanup_close_ int fd = -1; |
| |
| assert(p); |
| |
| if (!p->attached_path) |
| return -EUNATCH; |
| |
| fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); |
| if (fd < 0) { |
| if (errno != ENOENT) |
| return -errno; |
| |
| /* If the cgroup does not exist anymore, then we don't have to explicitly detach: the program was |
| * detached implicitly when the cgroup was removed, hence don't complain. */ |
| |
| } else { |
| union bpf_attr attr; |
| |
| zero(attr); |
| attr.attach_type = p->attached_type; |
| attr.target_fd = fd; |
| attr.attach_bpf_fd = p->kernel_fd; |
| |
| if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) |
| return -errno; |
| } |
| |
| p->attached_path = mfree(p->attached_path); |
| |
| return 0; |
| } |
| |
| int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) { |
| union bpf_attr attr; |
| int fd; |
| |
| zero(attr); |
| attr.map_type = type; |
| attr.key_size = key_size; |
| attr.value_size = value_size; |
| attr.max_entries = max_entries; |
| attr.map_flags = flags; |
| |
| fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); |
| if (fd < 0) |
| return -errno; |
| |
| return fd; |
| } |
| |
| int bpf_map_update_element(int fd, const void *key, void *value) { |
| union bpf_attr attr; |
| |
| zero(attr); |
| attr.map_fd = fd; |
| attr.key = PTR_TO_UINT64(key); |
| attr.value = PTR_TO_UINT64(value); |
| |
| if (bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0) |
| return -errno; |
| |
| return 0; |
| } |
| |
| int bpf_map_lookup_element(int fd, const void *key, void *value) { |
| union bpf_attr attr; |
| |
| zero(attr); |
| attr.map_fd = fd; |
| attr.key = PTR_TO_UINT64(key); |
| attr.value = PTR_TO_UINT64(value); |
| |
| if (bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) < 0) |
| return -errno; |
| |
| return 0; |
| } |
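| |
| /* Illustrative sketch only; the map type and the 32-bit key / 64-bit value layout are made-up |
| * examples, not something this file defines. Create a small hash map, store one element, read it back: |
| * |
| *     uint32_t key = 7; |
| *     uint64_t value = 42, read_back; |
| *     int map_fd, r; |
| * |
| *     map_fd = bpf_map_new(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 64, 0); |
| *     if (map_fd < 0) |
| *             return map_fd; |
| * |
| *     r = bpf_map_update_element(map_fd, &key, &value); |
| *     if (r < 0) |
| *             return r; |
| * |
| *     r = bpf_map_lookup_element(map_fd, &key, &read_back); |
| *     if (r < 0) |
| *             return r; |
| * |
| * The returned map_fd is a plain file descriptor owned by the caller and must be closed eventually. */ |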
| |
| int bpf_program_pin(int prog_fd, const char *bpffs_path) { |
| union bpf_attr attr; |
| |
| zero(attr); |
| attr.pathname = PTR_TO_UINT64((void *) bpffs_path); |
| attr.bpf_fd = prog_fd; |
| |
| if (bpf(BPF_OBJ_PIN, &attr, sizeof(attr)) < 0) |
| return -errno; |
| |
| return 0; |
| } |
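| |
| /* Illustrative sketch (the bpffs pin path is hypothetical): pin a loaded program so that it can be |
| * re-opened later, e.g. via bpf_program_new_from_bpffs_path() above: |
| * |
| *     r = bpf_program_pin(p->kernel_fd, "/sys/fs/bpf/my-prog"); |
| *     if (r < 0) |
| *             return r; |
| */ |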
| |
| int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) { |
| struct bpf_prog_info info = {}; |
| int r; |
| |
| assert(ret_id); |
| |
| r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info)); |
| if (r < 0) |
| return r; |
| |
| *ret_id = info.id; |
| |
| return 0; |
| } |
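| |
| /* The attachment serialization format, as written by bpf_program_serialize_attachment() below and |
| * parsed again by bpf_program_deserialize_attachment(), is "<fd-index> <attach-type> <escaped-cgroup-path>" |
| * stored under the caller-supplied key, where <fd-index> refers to a file descriptor stored in the |
| * passed FDSet. A hypothetical example line (key, fd number and path invented for illustration): |
| * |
| *     my-bpf-program=4 egress /sys/fs/cgroup/system.slice/foo.service |
| */ |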
| |
| int bpf_program_serialize_attachment( |
| FILE *f, |
| FDSet *fds, |
| const char *key, |
| BPFProgram *p) { |
| |
| _cleanup_free_ char *escaped = NULL; |
| int copy, r; |
| |
| if (!p || !p->attached_path) |
| return 0; |
| |
| assert(p->kernel_fd >= 0); |
| |
| escaped = cescape(p->attached_path); |
| if (!escaped) |
| return -ENOMEM; |
| |
| copy = fdset_put_dup(fds, p->kernel_fd); |
| if (copy < 0) |
| return log_error_errno(copy, "Failed to add BPF kernel fd to serialize: %m"); |
| |
| r = serialize_item_format( |
| f, |
| key, |
| "%i %s %s", |
| copy, |
| bpf_cgroup_attach_type_to_string(p->attached_type), |
| escaped); |
| if (r < 0) |
| return r; |
| |
| /* After serialization, let's forget the fact that this program is attached. The attachment is now |
| * 'owned' by the serialization, if you will, and not by us anymore. Why does that matter? Because of |
| * BPF's less-than-ideal lifecycle handling: to detach a program from a cgroup we have to do so |
| * explicitly; it's not done implicitly on close(). Now, since we are serializing here we don't want |
| * the program to be detached while freeing things, so that the attachment can be retained after |
| * deserializing again. bpf_program_free() implicitly detaches things if attached_path is non-NULL, |
| * hence we set it to NULL here. */ |
| |
| p->attached_path = mfree(p->attached_path); |
| return 0; |
| } |
| |
| int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) { |
| BPFProgram *p; |
| int r; |
| |
| SET_FOREACH(p, set) { |
| r = bpf_program_serialize_attachment(f, fds, key, p); |
| if (r < 0) |
| return r; |
| } |
| |
| return 0; |
| } |
| |
| int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) { |
| _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL; |
| _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; |
| _cleanup_close_ int fd = -1; |
| int ifd, at, r; |
| |
| assert(v); |
| assert(bpfp); |
| |
| /* Extract first word: the fd number */ |
| r = extract_first_word(&v, &sfd, NULL, 0); |
| if (r < 0) |
| return r; |
| if (r == 0) |
| return -EINVAL; |
| |
| r = safe_atoi(sfd, &ifd); |
| if (r < 0) |
| return r; |
| if (ifd < 0) |
| return -EBADF; |
| |
| /* Extract second word: the attach type */ |
| r = extract_first_word(&v, &sat, NULL, 0); |
| if (r < 0) |
| return r; |
| if (r == 0) |
| return -EINVAL; |
| |
| at = bpf_cgroup_attach_type_from_string(sat); |
| if (at < 0) |
| return at; |
| |
| /* The rest is the path */ |
| r = cunescape(v, 0, &unescaped); |
| if (r < 0) |
| return r; |
| |
| fd = fdset_remove(fds, ifd); |
| if (fd < 0) |
| return fd; |
| |
| p = new(BPFProgram, 1); |
| if (!p) |
| return -ENOMEM; |
| |
| *p = (BPFProgram) { |
| .n_ref = 1, |
| .kernel_fd = TAKE_FD(fd), |
| .prog_type = BPF_PROG_TYPE_UNSPEC, |
| .attached_path = TAKE_PTR(unescaped), |
| .attached_type = at, |
| }; |
| |
| if (*bpfp) |
| bpf_program_unref(*bpfp); |
| |
| *bpfp = TAKE_PTR(p); |
| return 0; |
| } |
| |
| int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) { |
| BPFProgram *p = NULL; |
| int r; |
| |
| assert(v); |
| assert(bpfsetp); |
| |
| r = bpf_program_deserialize_attachment(v, fds, &p); |
| if (r < 0) |
| return r; |
| |
| r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, p); |
| if (r < 0) |
| return r; |
| |
| return 0; |
| } |