| /*****************************************************************************\ |
| * gres_mps.c - Support MPS as a generic resources. |
| * MPS or CUDA Multi-Process Services is a mechanism to share GPUs. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * Written by Morris Jette |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include <ctype.h> |
| #include <inttypes.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/slurm_xlator.h" |
| #include "src/common/bitstring.h" |
| #include "src/common/env.h" |
| #include "src/interfaces/gres.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/list.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| |
| #include "../common/gres_common.h" |
| #include "../common/gres_c_s.h" |
| |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - A string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - A string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. Slurm uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "auth" for Slurm authentication) and <method> is a |
| * description of how this plugin satisfies that application. Slurm will |
| * only load authentication plugins if the plugin_type string has a prefix |
| * of "auth/". |
| * |
| * plugin_version - an unsigned 32-bit integer containing the Slurm version |
| * (major.minor.micro combined into a single number). |
| */ |
const char plugin_name[] = "Gres MPS plugin";
const char plugin_type[] = "gres/mps";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;

/*
 * List of gres_device_t records for the GPUs shared through MPS.
 * Populated by gres_p_node_config_load() (slurmd) or gres_p_recv_stepd()
 * (slurmstepd) and released in fini().
 */
static list_t *gres_devices = NULL;
| |
| extern int init(void) |
| { |
| debug("loaded"); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern void fini(void) |
| { |
| debug("unloading"); |
| FREE_NULL_LIST(gres_devices); |
| gres_c_s_fini(); |
| } |
| |
| /* |
| * We could load gres state or validate it using various mechanisms here. |
| * This only validates that the configuration was specified in gres.conf. |
| * In the general case, no code would need to be changed. |
| */ |
| extern int gres_p_node_config_load(list_t *gres_conf_list, |
| node_config_load_t *config) |
| { |
| return gres_c_s_init_share_devices( |
| gres_conf_list, &gres_devices, config, "gpu"); |
| } |
| |
| /* Given a global device ID, return its gres/mps count */ |
| static uint64_t _get_dev_count(int global_id) |
| { |
| list_itr_t *itr; |
| shared_dev_info_t *mps_ptr; |
| uint64_t count = NO_VAL64; |
| |
| if (!shared_info) { |
| error("shared_info is NULL"); |
| return 100; |
| } |
| itr = list_iterator_create(shared_info); |
| while ((mps_ptr = list_next(itr))) { |
| if (mps_ptr->id == global_id) { |
| count = mps_ptr->count; |
| break; |
| } |
| } |
| list_iterator_destroy(itr); |
| if (count == NO_VAL64) { |
| error("Could not find gres/mps count for device ID %d", |
| global_id); |
| return 100; |
| } |
| |
| return count; |
| } |
| |
| static void _set_env(common_gres_env_t *gres_env) |
| { |
| char perc_str[64]; |
| uint64_t count_on_dev, percentage; |
| |
| gres_env->global_id = -1; |
| gres_env->gres_conf_flags = GRES_CONF_ENV_NVML; |
| gres_env->gres_devices = gres_devices; |
| gres_env->prefix = ""; |
| |
| gres_common_gpu_set_env(gres_env); |
| |
| /* |
| * Set environment variables if GRES is found. Otherwise, unset |
| * environment variables, since this means GRES is not allocated. |
| * This is useful for jobs and steps that request --gres=none within an |
| * existing job allocation with GRES. |
| */ |
| if (gres_env->gres_cnt && shared_info) { |
| count_on_dev = _get_dev_count(gres_env->global_id); |
| if (count_on_dev > 0) { |
| percentage = (gres_env->gres_cnt * 100) / count_on_dev; |
| percentage = MAX(percentage, 1); |
| } else |
| percentage = 0; |
| snprintf(perc_str, sizeof(perc_str), "%"PRIu64, percentage); |
| env_array_overwrite(gres_env->env_ptr, |
| "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE", |
| perc_str); |
| } else if (gres_env->gres_cnt) { |
| error("shared_info list is NULL"); |
| snprintf(perc_str, sizeof(perc_str), "%"PRIu64, |
| gres_env->gres_cnt); |
| env_array_overwrite(gres_env->env_ptr, |
| "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE", |
| perc_str); |
| } else { |
| unsetenvp(*gres_env->env_ptr, |
| "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"); |
| } |
| } |
| |
| /* |
| * Set environment variables as appropriate for a job (i.e. all tasks) based |
| * upon the job's GRES state. |
| */ |
| extern void gres_p_job_set_env(char ***job_env_ptr, |
| bitstr_t *gres_bit_alloc, |
| uint64_t gres_per_node, |
| gres_internal_flags_t flags) |
| { |
| common_gres_env_t gres_env = { |
| .bit_alloc = gres_bit_alloc, |
| .env_ptr = job_env_ptr, |
| .flags = flags, |
| .gres_cnt = gres_per_node, |
| .is_job = true, |
| }; |
| |
| _set_env(&gres_env); |
| } |
| |
| /* |
| * Set environment variables as appropriate for a step (i.e. all tasks) based |
| * upon the job step's GRES state. |
| */ |
| extern void gres_p_step_set_env(char ***step_env_ptr, |
| bitstr_t *gres_bit_alloc, |
| uint64_t gres_per_node, |
| gres_internal_flags_t flags) |
| { |
| common_gres_env_t gres_env = { |
| .bit_alloc = gres_bit_alloc, |
| .env_ptr = step_env_ptr, |
| .flags = flags, |
| .gres_cnt = gres_per_node, |
| }; |
| |
| _set_env(&gres_env); |
| } |
| |
| /* |
| * Reset environment variables as appropriate for a job (i.e. this one task) |
| * based upon the job step's GRES state and assigned CPUs. |
| */ |
| extern void gres_p_task_set_env(char ***task_env_ptr, |
| bitstr_t *gres_bit_alloc, |
| uint64_t gres_cnt, |
| bitstr_t *usable_gres, |
| gres_internal_flags_t flags) |
| { |
| common_gres_env_t gres_env = { |
| .bit_alloc = gres_bit_alloc, |
| .env_ptr = task_env_ptr, |
| .flags = flags, |
| .gres_cnt = gres_cnt, |
| .is_task = true, |
| .usable_gres = usable_gres, |
| }; |
| |
| _set_env(&gres_env); |
| } |
| |
| /* Send GRES information to slurmstepd on the specified file descriptor */ |
| extern void gres_p_send_stepd(buf_t *buffer) |
| { |
| gres_send_stepd(buffer, gres_devices); |
| |
| gres_c_s_send_stepd(buffer); |
| |
| return; |
| } |
| |
| /* Receive GRES information from slurmd on the specified file descriptor */ |
| extern void gres_p_recv_stepd(buf_t *buffer) |
| { |
| gres_recv_stepd(buffer, &gres_devices); |
| |
| gres_c_s_recv_stepd(buffer); |
| |
| return; |
| } |
| |
| /* |
| * Return a list of devices of this type. The list elements are of type |
| * "gres_device_t" and the list should be freed using FREE_NULL_LIST(). |
| */ |
| extern list_t *gres_p_get_devices(void) |
| { |
| return gres_devices; |
| } |
| |
| extern void gres_p_step_hardware_init(bitstr_t *usable_gres, char *settings) |
| { |
| return; |
| } |
| |
/* No per-step hardware teardown is needed for gres/mps */
extern void gres_p_step_hardware_fini(void)
{
}
| |
| /* |
| * Build record used to set environment variables as appropriate for a job's |
| * prolog or epilog based GRES allocated to the job. |
| */ |
| extern gres_prep_t *gres_p_prep_build_env( |
| gres_job_state_t *gres_js) |
| { |
| int i; |
| gres_prep_t *gres_prep; |
| |
| gres_prep = xmalloc(sizeof(gres_prep_t)); |
| gres_prep->node_cnt = gres_js->node_cnt; |
| gres_prep->gres_bit_alloc = xcalloc(gres_prep->node_cnt, |
| sizeof(bitstr_t *)); |
| gres_prep->gres_cnt_node_alloc = xcalloc(gres_prep->node_cnt, |
| sizeof(uint64_t)); |
| for (i = 0; i < gres_prep->node_cnt; i++) { |
| if (gres_js->gres_bit_alloc && |
| gres_js->gres_bit_alloc[i]) { |
| gres_prep->gres_bit_alloc[i] = |
| bit_copy(gres_js->gres_bit_alloc[i]); |
| } |
| if (gres_js->gres_bit_alloc && |
| gres_js->gres_bit_alloc[i]) { |
| gres_prep->gres_cnt_node_alloc[i] = |
| gres_js->gres_cnt_node_alloc[i]; |
| } |
| } |
| |
| return gres_prep; |
| } |
| |
| /* |
| * Set environment variables as appropriate for a job's prolog or epilog based |
| * GRES allocated to the job. |
| */ |
| extern void gres_p_prep_set_env(char ***prep_env_ptr, |
| gres_prep_t *gres_prep, int node_inx) |
| { |
| int dev_inx = -1, global_id = -1, i; |
| uint64_t count_on_dev, gres_per_node = 0, percentage; |
| gres_device_t *gres_device; |
| list_itr_t *iter; |
| |
| if (gres_common_prep_set_env(prep_env_ptr, |
| gres_prep, node_inx, |
| GRES_CONF_ENV_NVML, gres_devices)) |
| return; |
| |
| if (gres_prep->gres_bit_alloc && |
| gres_prep->gres_bit_alloc[node_inx]) |
| dev_inx = bit_ffs(gres_prep->gres_bit_alloc[node_inx]); |
| if (dev_inx >= 0) { |
| /* Translate bit to device number, may differ */ |
| i = -1; |
| iter = list_iterator_create(gres_devices); |
| while ((gres_device = list_next(iter))) { |
| i++; |
| if (i == dev_inx) { |
| global_id = gres_device->dev_num; |
| break; |
| } |
| } |
| list_iterator_destroy(iter); |
| } |
| if ((global_id >= 0) && |
| gres_prep->gres_cnt_node_alloc && |
| gres_prep->gres_cnt_node_alloc[node_inx]) { |
| gres_per_node = gres_prep->gres_cnt_node_alloc[node_inx]; |
| count_on_dev = _get_dev_count(global_id); |
| if (count_on_dev > 0) { |
| percentage = (gres_per_node * 100) / count_on_dev; |
| percentage = MAX(percentage, 1); |
| } else |
| percentage = 0; |
| |
| xassert(*prep_env_ptr); |
| env_array_overwrite_fmt(prep_env_ptr, |
| "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE", |
| "%"PRIu64, percentage); |
| } |
| |
| return; |
| } |