src/plugins/cgroup/v2/ebpf.c - SchedMD/slurm - Git at Google

 /*****************************************************************************\
  *  ebpf.c - library to handle BPF cgroup device constraints
  *****************************************************************************
  *  Copyright (C) SchedMD LLC.
  *
  *  This file is part of Slurm, a resource management program.
  *  For details, see <https://slurm.schedmd.com/>.
  *  Please also read the included file: DISCLAIMER.
  *
  *  Slurm is free software; you can redistribute it and/or modify it under
  *  the terms of the GNU General Public License as published by the Free
  *  Software Foundation; either version 2 of the License, or (at your option)
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission
  *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and
  *  distribute linked combinations including the two. You must obey the GNU
  *  General Public License in all respects for all of the code used other than
  *  OpenSSL. If you modify file(s) with this exception, you may extend this
  *  exception to your version of the file(s), but you are not obligated to do
  *  so. If you do not wish to do so, delete this exception statement from your
  *  version.  If you delete this exception statement from all source files in
  *  the program, then also delete it here.
  *
  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  *  details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 \*****************************************************************************/

 #define _GNU_SOURCE
 #include "ebpf.h"

 /*
  * Thin wrapper around the raw bpf system call, with the result narrowed to
  * int. The expansion is a single parenthesized expression with no trailing
  * semicolon, so it can be used in any expression context
  * (e.g. "if (bpf(...) < 0)") as well as in a plain assignment statement.
  */
 #define bpf(cmd, attr, size) \
 	((int) syscall(__NR_bpf, (cmd), (attr), (size)))

 /*
  * Builders for individual eBPF instructions (macros inspired from libcrun).
  * Each one expands to a struct bpf_insn compound literal, so it can be used
  * as an array initializer element or as the right-hand side of an
  * assignment. Every argument is expanded exactly once.
  */

 /* ALU-class operation OP applied to register DST with the immediate IMM. */
 #define BPF_ALU32_IMM(OP, DST, IMM)					\
 	((struct bpf_insn) {						\
 		.code = BPF_ALU | BPF_OP(OP) | BPF_K,			\
 		.dst_reg = (DST),					\
 		.src_reg = 0,						\
 		.off = 0,						\
 		.imm = (IMM) })

 /* Load of width SIZE into DST from the address held in SRC, plus OFF. */
 #define BPF_LDX_MEM(SIZE, DST, SRC, OFF)				\
 	((struct bpf_insn) {						\
 		.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,		\
 		.dst_reg = (DST),					\
 		.src_reg = (SRC),					\
 		.off = (OFF),						\
 		.imm = 0 })

 /*
  * Conditional jump: DST is compared against the immediate IMM using OP and,
  * when the condition holds, the next OFF instructions are skipped.
  */
 #define BPF_JMP_IMM(OP, DST, IMM, OFF)					\
 	((struct bpf_insn) {						\
 		.code = BPF_JMP | BPF_OP(OP) | BPF_K,			\
 		.dst_reg = (DST),					\
 		.src_reg = 0,						\
 		.off = (OFF),						\
 		.imm = (IMM) })

 /* Store the immediate IMM into register DST. */
 #define BPF_MOV64_IMM(DST, IMM)						\
 	((struct bpf_insn) {						\
 		.code = BPF_ALU64 | BPF_MOV | BPF_K,			\
 		.dst_reg = (DST),					\
 		.src_reg = 0,						\
 		.off = 0,						\
 		.imm = (IMM) })

 /* Leave the program; callers in this file place the result in R0 first. */
 #define BPF_EXIT_INSN()							\
 	((struct bpf_insn) {						\
 		.code = BPF_JMP | BPF_EXIT,				\
 		.dst_reg = 0,						\
 		.src_reg = 0,						\
 		.off = 0,						\
 		.imm = 0 })

 /*
  * init_ebpf_prog - start a new device-filter program.
  *
  * Allocates program->program with room for the fixed prologue (INIT_INST
  * instructions) plus the epilogue written later by close_ebpf_prog()
  * (CLOSE_INST instructions), copies the prologue into it and sets
  * program->n_inst to INIT_INST. The buffer size in bytes is kept in
  * program->prog_size so that later reallocations can grow it.
  *
  * IN/OUT program - program descriptor to initialize.
  */
 extern void init_ebpf_prog(bpf_program_t *program)
 {
 	/*
 	 * When the program is called, R1 points at the request, i.e. the
 	 * device whose access is being checked:
 	 *
 	 * struct request {
 	 *     int access_type; // type in the low 16 bits, access in the high
 	 *     int major;
 	 *     int minor;
 	 * }
 	 *
 	 * The prologue unpacks it once so that every device rule can compare
 	 * plain registers:
 	 *
 	 *     R2 = access_type & 0x0000FFFF   (type)
 	 *     R3 = access_type >> 16          (access)
 	 *     R4 = major
 	 *     R5 = minor
 	 *
 	 * The number of entries must match INIT_INST from the header file.
 	 */
 	const struct bpf_insn prologue[] = {
 		/* R2 = type */
 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
 		BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),

 		/* R3 = access */
 		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
 		BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),

 		/* R4 = major */
 		BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),

 		/* R5 = minor */
 		BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8),
 	};

 	/* Number of instructions currently in the program (used at load). */
 	program->n_inst = INIT_INST;

 	/* Room for the prologue plus the closing "return default" block. */
 	program->prog_size = (INIT_INST + CLOSE_INST) * sizeof(struct bpf_insn);
 	program->program = xmalloc(program->prog_size);

 	memcpy(program->program, prologue, sizeof(prologue));
 }

 /*
  * add_device_ebpf_prog - append one device rule to the program.
  *
  * IN/OUT program - program set up by init_ebpf_prog(); its buffer is grown
  *	and n_inst is advanced past the new instructions.
  * IN dev_type - BPF_DEVCG_DEV_BLOCK or BPF_DEVCG_DEV_CHAR to match only that
  *	type; any other value is treated as a wildcard.
  * IN major - major number to match, or NO_VAL as a wildcard.
  * IN minor - minor number to match, or NO_VAL as a wildcard.
  * IN accept - value the program returns (in R0) when the request matches.
  * RET SLURM_SUCCESS, or SLURM_ERROR if type, major and minor are all
  *	wildcards (in which case nothing is appended).
  *
  * The access type (R3) is deliberately not checked: a rule applies to every
  * kind of access. The emitted bytecode corresponds to:
  *
  *	if (<type set>  && request.type  != dev_type) goto next_device;
  *	if (<major set> && request.major != major)    goto next_device;
  *	if (<minor set> && request.minor != minor)    goto next_device;
  *	return accept;
  * next_device:
  *	(the next rule, or the block written by close_ebpf_prog())
  */
 extern int add_device_ebpf_prog(bpf_program_t *program, uint32_t dev_type,
 				uint32_t major, uint32_t minor, bool accept)
 {
 	bool match_type = ((dev_type == BPF_DEVCG_DEV_BLOCK) ||
 			   (dev_type == BPF_DEVCG_DEV_CHAR));
 	bool match_major = (major != NO_VAL);
 	bool match_minor = (minor != NO_VAL);
 	int n_checks = match_type + match_major + match_minor;
 	struct bpf_insn *insn;
 	int skip;

 	if (n_checks == 0) {
 		error("%s: At least one parameter needs to not be a wildcard",
 		      __func__);
 		return SLURM_ERROR;
 	}

 	/* One comparison per constraint plus the 2-instruction return. */
 	program->prog_size += (n_checks + 2) * sizeof(struct bpf_insn);
 	xrealloc(program->program, program->prog_size);

 	/* Take the write position only after the buffer may have moved. */
 	insn = &program->program[program->n_inst];

 	/*
 	 * Jump offsets are relative to the instruction following the jump.
 	 * The first comparison has to skip the remaining (n_checks - 1)
 	 * comparisons plus the return block; each later comparison has one
 	 * instruction less to skip, down to 2 for the last one.
 	 */
 	skip = n_checks + 1;

 	/* R2 = request.type */
 	if (match_type) {
 		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dev_type, skip);
 		skip--;
 	}

 	/* R4 = request.major */
 	if (match_major) {
 		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, skip);
 		skip--;
 	}

 	/* R5 = request.minor */
 	if (match_minor) {
 		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, skip);
 		skip--;
 	}

 	/* "return accept;" - the return value travels in R0. */
 	*insn++ = BPF_MOV64_IMM(BPF_REG_0, accept);
 	*insn = BPF_EXIT_INSN();

 	program->n_inst += n_checks + 2;

 	return SLURM_SUCCESS;
 }

 /*
  * close_ebpf_prog - terminate the program with its default verdict.
  *
  * Appends the same "return <value>" pair used at the end of each device
  * rule, so a request that matched no rule returns def_action. The two slots
  * written here are the ones reserved by init_ebpf_prog().
  *
  * IN/OUT program - program to terminate; n_inst grows by two.
  * IN def_action - value returned (in R0) when no rule matched.
  */
 extern void close_ebpf_prog(bpf_program_t *program, bool def_action)
 {
 	struct bpf_insn *tail = &program->program[program->n_inst];

 	tail[0] = BPF_MOV64_IMM(BPF_REG_0, def_action);
 	tail[1] = BPF_EXIT_INSN();
 	program->n_inst += 2;
 }

 /*
  * load_ebpf_prog - load the program into the kernel and attach it to a cgroup.
  *
  * IN program - finished program (see close_ebpf_prog()); program->program
  *	and program->n_inst are handed to the kernel.
  * IN cgroup_path - path of the cgroup directory to attach to.
  * IN override_flag - if true attach with BPF_F_ALLOW_OVERRIDE, which lets
  *	descendant cgroups override the program inherited from this cgroup
  *	(used for all "non-leaf" cgroups).
  * RET SLURM_SUCCESS, or SLURM_ERROR if the cgroup cannot be opened or the
  *	load or attach call fails.
  *
  * Both descriptors opened here (cgroup directory and loaded program) are
  * closed again on every path, so nothing leaks into the caller.
  */
 extern int load_ebpf_prog(bpf_program_t *program, const char cgroup_path[],
 			  bool override_flag)
 {
 	int rc = SLURM_ERROR;
 	int cgroup_fd, ret, prog_fd = -1;
 	union bpf_attr attr;

 	/*
 	 * Open the cgroup directory to get the fd for later use in the cgroup
 	 * attach syscall.
 	 */
 	cgroup_fd = open(cgroup_path, O_DIRECTORY);
 	if (cgroup_fd < 0) {
 		error("%s: cannot open cgroup (%s): %m", __func__, cgroup_path);
 		return SLURM_ERROR;
 	}

 	/*
 	 * Prepare all the attributes to verify and load the bpf program.
 	 * With the fd of the loaded program we can then associate it with
 	 * the cgroup.
 	 */
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = BPF_PROG_TYPE_CGROUP_DEVICE;
 	attr.insns = (size_t) program->program;
 	attr.insn_cnt = program->n_inst;
 	/* We set the license to GPL to use helper functions marked gpl_only. */
 	attr.license = (size_t) "GPL";
 	strlcpy(attr.prog_name, "Slurm_Cgroup_v2", BPF_OBJ_NAME_LEN);
 	/* Explicitly disable logging */
 	attr.log_level = 0;
 	attr.log_buf = (size_t) NULL;
 	attr.log_size = 0;

 	/* Call the load syscall */
 	prog_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 	if (prog_fd < 0) {
 		/* Log before any close() so that %m still sees this errno. */
 		error("%s: BPF load error (%m). Please check your system limits (MEMLOCK).",
 		      __func__);
 		goto cleanup;
 	}

 	/*
 	 * Erase the whole attr union so that it can be reused to attach the
 	 * bpf program to the cgroup.
 	 */
 	memset(&attr, 0, sizeof(attr));
 	attr.attach_type = BPF_CGROUP_DEVICE;
 	attr.target_fd = cgroup_fd;
 	attr.attach_bpf_fd = prog_fd;
 	if (override_flag)
 		attr.attach_flags = BPF_F_ALLOW_OVERRIDE;

 	/* Call the attach syscall */
 	ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
 	if (ret < 0) {
 		error("%s: BPF attach: %d: %m", __func__, ret);
 		goto cleanup;
 	}

 	rc = SLURM_SUCCESS;

 cleanup:
 	/*
 	 * The program descriptor is not needed past this point: once
 	 * attached, the cgroup keeps the program alive, and on failure
 	 * closing it simply drops the unused program.
 	 */
 	if (prog_fd >= 0)
 		close(prog_fd);
 	close(cgroup_fd);

 	return rc;
 }

 /*
  * free_ebpf_prog - release the instruction buffer of a program.
  *
  * Frees program->program, the buffer allocated by init_ebpf_prog() and grown
  * by add_device_ebpf_prog(). The descriptor itself is not freed. As with
  * free(), passing NULL is allowed and does nothing.
  *
  * IN/OUT program - program whose buffer is released, may be NULL.
  */
 extern void free_ebpf_prog(bpf_program_t *program)
 {
 	if (!program)
 		return;

 	xfree(program->program);
 }
	/*****************************************************************************\
	* ebpf.c - library to handle BPF cgroup device constraints
	*****************************************************************************
	* Copyright (C) SchedMD LLC.
	*
	* This file is part of Slurm, a resource management program.
	* For details, see <https://slurm.schedmd.com/>.
	* Please also read the included file: DISCLAIMER.
	*
	* Slurm is free software; you can redistribute it and/or modify it under
	* the terms of the GNU General Public License as published by the Free
	* Software Foundation; either version 2 of the License, or (at your option)
	* any later version.
	*
	* In addition, as a special exception, the copyright holders give permission
	* to link the code of portions of this program with the OpenSSL library under
	* certain conditions as described in each individual source file, and
	* distribute linked combinations including the two. You must obey the GNU
	* General Public License in all respects for all of the code used other than
	* OpenSSL. If you modify file(s) with this exception, you may extend this
	* exception to your version of the file(s), but you are not obligated to do
	* so. If you do not wish to do so, delete this exception statement from your
	* version. If you delete this exception statement from all source files in
	* the program, then also delete it here.
	*
	* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
	* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
	* details.
	*
	* You should have received a copy of the GNU General Public License along
	* with Slurm; if not, write to the Free Software Foundation, Inc.,
	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	\*****************************************************************************/

	#define _GNU_SOURCE
	#include "ebpf.h"

	/*
	 * Thin wrapper around the raw bpf system call, with the result narrowed
	 * to int. The expansion is a single parenthesized expression with no
	 * trailing semicolon, so it can be used in any expression context
	 * (e.g. "if (bpf(...) < 0)") as well as in a plain assignment statement.
	 */
	#define bpf(cmd, attr, size) \
		((int) syscall(__NR_bpf, (cmd), (attr), (size)))

	/*
	 * Builders for individual eBPF instructions (macros inspired from
	 * libcrun). Each one expands to a struct bpf_insn compound literal, so it
	 * can be used as an array initializer element or as the right-hand side
	 * of an assignment. Every argument is expanded exactly once.
	 */

	/* ALU-class operation OP applied to register DST with the immediate IMM. */
	#define BPF_ALU32_IMM(OP, DST, IMM)				\
		((struct bpf_insn) {					\
			.code = BPF_ALU | BPF_OP(OP) | BPF_K,		\
			.dst_reg = (DST),				\
			.src_reg = 0,					\
			.off = 0,					\
			.imm = (IMM) })

	/* Load of width SIZE into DST from the address held in SRC, plus OFF. */
	#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
		((struct bpf_insn) {					\
			.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
			.dst_reg = (DST),				\
			.src_reg = (SRC),				\
			.off = (OFF),					\
			.imm = 0 })

	/*
	 * Conditional jump: DST is compared against the immediate IMM using OP
	 * and, when the condition holds, the next OFF instructions are skipped.
	 */
	#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
		((struct bpf_insn) {					\
			.code = BPF_JMP | BPF_OP(OP) | BPF_K,		\
			.dst_reg = (DST),				\
			.src_reg = 0,					\
			.off = (OFF),					\
			.imm = (IMM) })

	/* Store the immediate IMM into register DST. */
	#define BPF_MOV64_IMM(DST, IMM)					\
		((struct bpf_insn) {					\
			.code = BPF_ALU64 | BPF_MOV | BPF_K,		\
			.dst_reg = (DST),				\
			.src_reg = 0,					\
			.off = 0,					\
			.imm = (IMM) })

	/* Leave the program; callers in this file place the result in R0 first. */
	#define BPF_EXIT_INSN()						\
		((struct bpf_insn) {					\
			.code = BPF_JMP | BPF_EXIT,			\
			.dst_reg = 0,					\
			.src_reg = 0,					\
			.off = 0,					\
			.imm = 0 })

	/*
	 * init_ebpf_prog - start a new device-filter program.
	 *
	 * Allocates program->program with room for the fixed prologue (INIT_INST
	 * instructions) plus the epilogue written later by close_ebpf_prog()
	 * (CLOSE_INST instructions), copies the prologue into it and sets
	 * program->n_inst to INIT_INST. The buffer size in bytes is kept in
	 * program->prog_size so that later reallocations can grow it.
	 *
	 * IN/OUT program - program descriptor to initialize.
	 */
	extern void init_ebpf_prog(bpf_program_t *program)
	{
		/*
		 * When the program is called, R1 points at the request, i.e. the
		 * device whose access is being checked:
		 *
		 * struct request {
		 *     int access_type; // type in the low 16 bits, access in the high
		 *     int major;
		 *     int minor;
		 * }
		 *
		 * The prologue unpacks it once so that every device rule can
		 * compare plain registers:
		 *
		 *     R2 = access_type & 0x0000FFFF   (type)
		 *     R3 = access_type >> 16          (access)
		 *     R4 = major
		 *     R5 = minor
		 *
		 * The number of entries must match INIT_INST from the header file.
		 */
		const struct bpf_insn prologue[] = {
			/* R2 = type */
			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
			BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),

			/* R3 = access */
			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
			BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),

			/* R4 = major */
			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),

			/* R5 = minor */
			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8),
		};

		/* Number of instructions currently in the program (used at load). */
		program->n_inst = INIT_INST;

		/* Room for the prologue plus the closing "return default" block. */
		program->prog_size = (INIT_INST + CLOSE_INST) * sizeof(struct bpf_insn);
		program->program = xmalloc(program->prog_size);

		memcpy(program->program, prologue, sizeof(prologue));
	}

	/*
	 * add_device_ebpf_prog - append one device rule to the program.
	 *
	 * IN/OUT program - program set up by init_ebpf_prog(); its buffer is
	 *	grown and n_inst is advanced past the new instructions.
	 * IN dev_type - BPF_DEVCG_DEV_BLOCK or BPF_DEVCG_DEV_CHAR to match only
	 *	that type; any other value is treated as a wildcard.
	 * IN major - major number to match, or NO_VAL as a wildcard.
	 * IN minor - minor number to match, or NO_VAL as a wildcard.
	 * IN accept - value the program returns (in R0) when the request matches.
	 * RET SLURM_SUCCESS, or SLURM_ERROR if type, major and minor are all
	 *	wildcards (in which case nothing is appended).
	 *
	 * The access type (R3) is deliberately not checked: a rule applies to
	 * every kind of access. The emitted bytecode corresponds to:
	 *
	 *	if (<type set>  && request.type  != dev_type) goto next_device;
	 *	if (<major set> && request.major != major)    goto next_device;
	 *	if (<minor set> && request.minor != minor)    goto next_device;
	 *	return accept;
	 * next_device:
	 *	(the next rule, or the block written by close_ebpf_prog())
	 */
	extern int add_device_ebpf_prog(bpf_program_t *program, uint32_t dev_type,
					uint32_t major, uint32_t minor, bool accept)
	{
		bool match_type = ((dev_type == BPF_DEVCG_DEV_BLOCK) ||
				   (dev_type == BPF_DEVCG_DEV_CHAR));
		bool match_major = (major != NO_VAL);
		bool match_minor = (minor != NO_VAL);
		int n_checks = match_type + match_major + match_minor;
		struct bpf_insn *insn;
		int skip;

		if (n_checks == 0) {
			error("%s: At least one parameter needs to not be a wildcard",
			      __func__);
			return SLURM_ERROR;
		}

		/* One comparison per constraint plus the 2-instruction return. */
		program->prog_size += (n_checks + 2) * sizeof(struct bpf_insn);
		xrealloc(program->program, program->prog_size);

		/* Take the write position only after the buffer may have moved. */
		insn = &program->program[program->n_inst];

		/*
		 * Jump offsets are relative to the instruction following the
		 * jump. The first comparison has to skip the remaining
		 * (n_checks - 1) comparisons plus the return block; each later
		 * comparison has one instruction less to skip, down to 2 for the
		 * last one.
		 */
		skip = n_checks + 1;

		/* R2 = request.type */
		if (match_type) {
			*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dev_type, skip);
			skip--;
		}

		/* R4 = request.major */
		if (match_major) {
			*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, skip);
			skip--;
		}

		/* R5 = request.minor */
		if (match_minor) {
			*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, skip);
			skip--;
		}

		/* "return accept;" - the return value travels in R0. */
		*insn++ = BPF_MOV64_IMM(BPF_REG_0, accept);
		*insn = BPF_EXIT_INSN();

		program->n_inst += n_checks + 2;

		return SLURM_SUCCESS;
	}

	/*
	 * close_ebpf_prog - terminate the program with its default verdict.
	 *
	 * Appends the same "return <value>" pair used at the end of each device
	 * rule, so a request that matched no rule returns def_action. The two
	 * slots written here are the ones reserved by init_ebpf_prog().
	 *
	 * IN/OUT program - program to terminate; n_inst grows by two.
	 * IN def_action - value returned (in R0) when no rule matched.
	 */
	extern void close_ebpf_prog(bpf_program_t *program, bool def_action)
	{
		struct bpf_insn *tail = &program->program[program->n_inst];

		tail[0] = BPF_MOV64_IMM(BPF_REG_0, def_action);
		tail[1] = BPF_EXIT_INSN();
		program->n_inst += 2;
	}

	/*
	 * load_ebpf_prog - load the program into the kernel and attach it to a
	 * cgroup.
	 *
	 * IN program - finished program (see close_ebpf_prog()); program->program
	 *	and program->n_inst are handed to the kernel.
	 * IN cgroup_path - path of the cgroup directory to attach to.
	 * IN override_flag - if true attach with BPF_F_ALLOW_OVERRIDE, which lets
	 *	descendant cgroups override the program inherited from this
	 *	cgroup (used for all "non-leaf" cgroups).
	 * RET SLURM_SUCCESS, or SLURM_ERROR if the cgroup cannot be opened or the
	 *	load or attach call fails.
	 *
	 * Both descriptors opened here (cgroup directory and loaded program) are
	 * closed again on every path, so nothing leaks into the caller.
	 */
	extern int load_ebpf_prog(bpf_program_t *program, const char cgroup_path[],
				  bool override_flag)
	{
		int rc = SLURM_ERROR;
		int cgroup_fd, ret, prog_fd = -1;
		union bpf_attr attr;

		/*
		 * Open the cgroup directory to get the fd for later use in the
		 * cgroup attach syscall.
		 */
		cgroup_fd = open(cgroup_path, O_DIRECTORY);
		if (cgroup_fd < 0) {
			error("%s: cannot open cgroup (%s): %m", __func__, cgroup_path);
			return SLURM_ERROR;
		}

		/*
		 * Prepare all the attributes to verify and load the bpf program.
		 * With the fd of the loaded program we can then associate it
		 * with the cgroup.
		 */
		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_CGROUP_DEVICE;
		attr.insns = (size_t) program->program;
		attr.insn_cnt = program->n_inst;
		/* We set the license to GPL to use helper functions marked gpl_only. */
		attr.license = (size_t) "GPL";
		strlcpy(attr.prog_name, "Slurm_Cgroup_v2", BPF_OBJ_NAME_LEN);
		/* Explicitly disable logging */
		attr.log_level = 0;
		attr.log_buf = (size_t) NULL;
		attr.log_size = 0;

		/* Call the load syscall */
		prog_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
		if (prog_fd < 0) {
			/* Log before any close() so that %m still sees this errno. */
			error("%s: BPF load error (%m). Please check your system limits (MEMLOCK).",
			      __func__);
			goto cleanup;
		}

		/*
		 * Erase the whole attr union so that it can be reused to attach
		 * the bpf program to the cgroup.
		 */
		memset(&attr, 0, sizeof(attr));
		attr.attach_type = BPF_CGROUP_DEVICE;
		attr.target_fd = cgroup_fd;
		attr.attach_bpf_fd = prog_fd;
		if (override_flag)
			attr.attach_flags = BPF_F_ALLOW_OVERRIDE;

		/* Call the attach syscall */
		ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
		if (ret < 0) {
			error("%s: BPF attach: %d: %m", __func__, ret);
			goto cleanup;
		}

		rc = SLURM_SUCCESS;

	cleanup:
		/*
		 * The program descriptor is not needed past this point: once
		 * attached, the cgroup keeps the program alive, and on failure
		 * closing it simply drops the unused program.
		 */
		if (prog_fd >= 0)
			close(prog_fd);
		close(cgroup_fd);

		return rc;
	}

	/*
	 * free_ebpf_prog - release the instruction buffer of a program.
	 *
	 * Frees program->program, the buffer allocated by init_ebpf_prog() and
	 * grown by add_device_ebpf_prog(). The descriptor itself is not freed.
	 * As with free(), passing NULL is allowed and does nothing.
	 *
	 * IN/OUT program - program whose buffer is released, may be NULL.
	 */
	extern void free_ebpf_prog(bpf_program_t *program)
	{
		if (!program)
			return;

		xfree(program->program);
	}