blob: 9292e1fe4ab586971611dc8e0cb7bf5c403a91e7 [file] [log] [blame]
/*****************************************************************************\
* switch_hpe_slingshot.h - Library for managing HPE Slingshot networks
*****************************************************************************
* Copyright 2021-2023 Hewlett Packard Enterprise Development LP
* Written by David Gloe <david.gloe@hpe.com>
* Written by Jim Nordby <james.nordby@hpe.com>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _SWITCH_HPE_SLINGSHOT_H_
#define _SWITCH_HPE_SLINGSHOT_H_
#include <stdbool.h>
#include <stdint.h>
#include "src/common/read_config.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
/*
 * State file format version numbers.
 * SLINGSHOT_STATE_VERSION is the version of the state file;
 * SLINGSHOT_STATE_VERSION_VER1 names format version 1.
 */
#define SLINGSHOT_STATE_VERSION_VER1	1
#define SLINGSHOT_STATE_VERSION		2

/* Name of the state file */
#define SLINGSHOT_STATE_FILE		"slingshot_state"

/*
 * libcxi selection:
 *  - SLINGSHOT_CXI_LIB_VERSION_ENV: environment variable for the libcxi
 *    library name (for dlopen())
 *  - SLINGSHOT_CXI_LIB_VERSION: suffix of versioned CXI library functions
 *    used when that environment variable is not set
 */
#define SLINGSHOT_CXI_LIB_VERSION_ENV	"SLURM_SLINGSHOT_CXI_VERSION"
#define SLINGSHOT_CXI_LIB_VERSION	""

/* Smallest and largest VNI values */
#define SLINGSHOT_VNI_MIN		0
#define SLINGSHOT_VNI_MAX		65535

/* Maximum number of VNIs per service */
#define SLINGSHOT_VNIS			4

/* VNI range written into the state when nothing is configured */
#define SLINGSHOT_VNI_MIN_DEF		1024
#define SLINGSHOT_VNI_MAX_DEF		65535

/* How many times destroying a CXI service is retried */
#define SLINGSHOT_CXI_DESTROY_RETRIES	5
/*
 * printf-style sysfs path formats.
 *
 * SLINGSHOT_RDZV_GET_EN_FMT
 *	Path to a device's rdzv_get_en setting; takes one %d argument
 *	that completes the "cxi%d" path component.
 * SLINGSHOT_RDZV_GET_EN_DEFAULT_FMT
 *	Path to the default rdzv_get_en setting; takes one %s argument
 *	that completes the "cxi_%s" module name.
 * SLINGSHOT_RGIDS_AVAIL_FMT
 *	RGID sharing: path to a device's rgids_avail property; the %s
 *	argument is the device name.
 *	NOTE(review): earlier documentation described this as where the
 *	max_lnis_per_rgid value is found, while the path itself names
 *	rgids_avail - confirm which is meant.
 */
#define SLINGSHOT_RDZV_GET_EN_FMT \
	"/sys/class/cxi/cxi%d/device/properties/rdzv_get_en"
#define SLINGSHOT_RDZV_GET_EN_DEFAULT_FMT \
	"/sys/module/cxi_%s/parameters/rdzv_get_en_default"
#define SLINGSHOT_RGIDS_AVAIL_FMT \
	"/sys/class/cxi/%s/device/properties/rgids_avail"
/*
 * Number of free VNIs.
 * This is an extern declaration only; the variable is not defined in this
 * header.
 */
extern int free_vnis; /* Number of free VNIs */
/* Authentication types that are valid for REST */
typedef enum {
	/* No authentication */
	SLINGSHOT_AUTH_NONE = 0,
	/* User name and password */
	SLINGSHOT_AUTH_BASIC = 1,
	/* OAuth2 client credentials grant */
	SLINGSHOT_AUTH_OAUTH = 2
} slingshot_rest_auth_t;
/*
 * One per-job shared VNI: ties a job ID to the VNI shared under that
 * job ID.
 */
typedef struct job_vni {
	/* Job ID */
	uint32_t job_id;
	/* Per-Job-ID shared VNI */
	uint16_t vni;
} job_vni_t;
/*
 * Plugin state, in the format of the state file created by
 * switch_p_libstate_save
 */
typedef struct slingshot_state {
	/* Version of this file format */
	uint32_t version;

	/* VNI allocation range and position */
	uint16_t vni_min;	/* Minimum VNI to allocate */
	uint16_t vni_max;	/* Maximum VNI to allocate */
	uint16_t vni_last;	/* Last allocated VNI */
	bitstr_t *vni_table;	/* Bitmap of allocated VNIs */

	/* Per-job shared VNI reservations: job_vnis[] and its entry count */
	uint32_t num_job_vnis;
	job_vni_t *job_vnis;

	/* Job IDs using collectives: job_hwcoll[] and its entry count */
	uint32_t num_job_hwcoll;
	uint32_t *job_hwcoll;
} slingshot_state_t;
/*
 * NIC resource constants, per application.
 *
 * *_MAX: maximum of the resource per application
 * *_DEF: default per-thread amount of the resource
 */

/* Transmit command queues */
#define SLINGSHOT_TXQ_MAX	1024
#define SLINGSHOT_TXQ_DEF	2

/* Target command queues */
#define SLINGSHOT_TGQ_MAX	512
#define SLINGSHOT_TGQ_DEF	1

/* Event queues */
#define SLINGSHOT_EQ_MAX	2047
#define SLINGSHOT_EQ_DEF	2

/* Counters */
#define SLINGSHOT_CT_MAX	2047
#define SLINGSHOT_CT_DEF	1

/* Trigger list entries */
#define SLINGSHOT_TLE_MAX	2048
#define SLINGSHOT_TLE_DEF	1

/* Portal table entries */
#define SLINGSHOT_PTE_MAX	2048
#define SLINGSHOT_PTE_DEF	6

/* List entries */
#define SLINGSHOT_LE_MAX	16384
#define SLINGSHOT_LE_DEF	16

/* Addressing contexts */
#define SLINGSHOT_AC_MAX	1022
#define SLINGSHOT_AC_DEF	2
/* Limits for a single kind of NIC resource */
typedef struct slingshot_limits {
	/* Max of this resource the application can use */
	uint16_t max;
	/* Resources reserved for only this application */
	uint16_t res;
	/* Per-thread resources to reserve */
	uint16_t def;
} slingshot_limits_t;
/* Complete set of NIC resource limits: one slingshot_limits_t per resource */
typedef struct slingshot_limits_set {
	/* Transmit command queue limits */
	slingshot_limits_t txqs;
	/* Target command queue limits */
	slingshot_limits_t tgqs;
	/* Event queue limits */
	slingshot_limits_t eqs;
	/* Counter limits */
	slingshot_limits_t cts;
	/* Trigger list entry limits */
	slingshot_limits_t tles;
	/* Portal table entry limits */
	slingshot_limits_t ptes;
	/* List entry limits */
	slingshot_limits_t les;
	/* Addressing context limits */
	slingshot_limits_t acs;
} slingshot_limits_set_t;
/*
 * Global configuration state of the Slingshot switch plugin. It is based on
 * defaults and on the 'SwitchParameters' slurm.conf variable.
 */
typedef struct slingshot_config {
	/* Retry count for destroying services */
	uint32_t destroy_retries;
	/* Allocate VNIs for single-node apps (a SLINGSHOT_SN_VNI_* value) */
	uint8_t single_node_vni;
	/* Allocate extra VNI per-job (a SLINGSHOT_JOB_VNI_* value) */
	uint8_t job_vni;
	/* Bitmap of default traffic classes */
	uint32_t tcs;
	/* Bitmap of configuration flags */
	uint32_t flags;
	/* Set of NIC resource limits */
	slingshot_limits_set_t limits;

	/* jackaloped REST interface */
	char *jlope_url;			/* URL */
	slingshot_rest_auth_t jlope_auth;	/* authentication type */
	char *jlope_authdir;			/* auth file directory */

	/* Hardware collectives */
	uint32_t hwcoll_addrs_per_job;	/* #Hardware collectives per job */
	uint32_t hwcoll_num_nodes;	/* Minimum job nodes for HW coll */

	/* Fabric manager REST interface */
	char *fm_url;			/* REST interface URL */
	slingshot_rest_auth_t fm_auth;	/* authentication type */
	char *fm_authdir;		/* auth file directory */
	char *fm_mtls_ca;		/* certificate bundle path */
	char *fm_mtls_cert;		/* client public certificate path */
	char *fm_mtls_key;		/* client private key path */
	char *fm_mtls_url;		/* REST interface URL for mtls */
} slingshot_config_t;
/*
 * Settings for slingshot_config_t.single_node_vni:
 *   NONE - no VNIs allocated for single-node apps
 *   ALL  - all single-node apps get a VNI
 *   USER - srun --network=single_node_vni
 */
#define SLINGSHOT_SN_VNI_NONE	0
#define SLINGSHOT_SN_VNI_ALL	1
#define SLINGSHOT_SN_VNI_USER	2

/*
 * Settings for slingshot_config_t.job_vni:
 *   NONE - no job VNIs allocated
 *   ALL  - all jobs get a job VNI
 *   USER - job VNIs using srun --network=job_vni
 */
#define SLINGSHOT_JOB_VNI_NONE	0
#define SLINGSHOT_JOB_VNI_ALL	1
#define SLINGSHOT_JOB_VNI_USER	2
/* Communication profile of a NIC (compute-node specific) */
typedef struct slingshot_comm_profile {
	/* Slingshot service ID */
	uint32_t svc_id;
	/* VNIs for this service */
	uint16_t vnis[SLINGSHOT_VNIS];
	/* Number of valid VNIs in vnis[] */
	uint16_t vnis_used;
	/* Bitmap of allowed traffic classes */
	uint32_t tcs;
	/* NIC device name (e.g. "cxi0") */
	char device_name[16];
} slingshot_comm_profile_t;
/*
 * Address types for a Slingshot HSN NIC
 */
typedef enum {
	SLINGSHOT_ADDR_IPV4 = 0,
	SLINGSHOT_ADDR_IPV6 = 1,
	SLINGSHOT_ADDR_MAC = 2
} slingshot_addr_type_t;
/*
 * Information about one Slingshot HSN NIC
 */
typedef struct {
	/* Node index this NIC belongs to */
	uint32_t nodeidx;
	/* Address type for this NIC */
	slingshot_addr_type_t address_type;
	/* Address of this NIC */
	char address[64];
	/* NUMA node it is in */
	uint16_t numa_node;
	/* Device name */
	char device_name[16];
} slingshot_hsn_nic_t;
/*
 * Data needed for Slingshot HSN hardware collectives and for talking to
 * the Slingshot Fabric Manager (FM)
 */
typedef struct {
	/* job id */
	uint32_t job_id;
	/* step id */
	uint32_t step_id;
	/* Session token returned from FM */
	char *mcast_token;
	/* FM URL for creating multicast trees */
	char *fm_url;
	/* Collectives multicast addrs per job */
	uint32_t addrs_per_job;
	/* Minimum #nodes to get multicast addrs */
	uint32_t num_nodes;
} slingshot_hwcoll_t;
/* Value that denotes packing a null stepinfo structure */
#define SLINGSHOT_JOBINFO_NULL_VERSION	0xDEAFDEAF

/* Job-level information: the VNIs allocated for a job */
typedef struct slingshot_jobinfo {
	/* Number of VNIs */
	uint32_t num_vnis;
	/* List of VNIs allocated for this job */
	uint16_t *vnis;
	/* storage for mid-release extras */
	char *extra;
} slingshot_jobinfo_t;
/* Stepinfo structure passed from slurmctld to slurmd */
typedef struct slingshot_stepinfo {
	/* Version of this structure */
	uint32_t version;

	/* VNIs allocated for this application: vnis[] and its entry count */
	uint32_t num_vnis;
	uint16_t *vnis;

	/* Bitmap of allowed traffic classes */
	uint32_t tcs;
	/* Set of NIC resource limits */
	slingshot_limits_set_t limits;
	/* Threads-per-task for limit calculation */
	uint32_t depth;

	/* Communication profiles: profiles[] and its entry count */
	uint32_t num_profiles;
	slingshot_comm_profile_t *profiles;

	/* Configuration flags */
	uint32_t flags;

	/* HSN NIC information for instant on: nics[] and its entry count */
	uint32_t num_nics;
	slingshot_hsn_nic_t *nics;

	/* HSN HW collectives info */
	slingshot_hwcoll_t *hwcoll;
} slingshot_stepinfo_t;
/*
 * Slingshot traffic classes. Each class is one bit so that several classes
 * can be combined into a bitmap.
 */
#define SLINGSHOT_TC_DEDICATED_ACCESS	0x1
#define SLINGSHOT_TC_LOW_LATENCY	0x2
#define SLINGSHOT_TC_BULK_DATA		0x4
#define SLINGSHOT_TC_BEST_EFFORT	0x8

/* Default traffic class bitmap: low latency plus best effort */
#define SLINGSHOT_TC_DEFAULT \
	(SLINGSHOT_TC_LOW_LATENCY | SLINGSHOT_TC_BEST_EFFORT)
/*
 * Bits for slingshot_stepinfo_t.flags
 *
 * SLINGSHOT_FLAGS_ADJUST_LIMITS (set by default):
 *	slurmd will adjust resource limit reservations by subtracting
 *	system service reserved/used resources
 *
 * SLINGSHOT_FLAGS_ENABLE_MTLS:
 *	Slurm daemons will use mTLS authentication with the fabric manager
 *	for the duration of the application
 *
 * SLINGSHOT_FLAGS_DISABLE_RDZV_GET:
 *	slurmd will disable rendezvous gets in the Cassini NIC for the
 *	duration of the application
 */
#define SLINGSHOT_FLAGS_ADJUST_LIMITS		0x1
#define SLINGSHOT_FLAGS_ENABLE_MTLS		0x2
#define SLINGSHOT_FLAGS_DISABLE_RDZV_GET	0x4

/* Flags in effect by default */
#define SLINGSHOT_FLAGS_DEFAULT			SLINGSHOT_FLAGS_ADJUST_LIMITS
/*
 * Names of the environment variables set for applications
 */
#define SLINGSHOT_SVC_IDS_ENV	"SLINGSHOT_SVC_IDS"
#define SLINGSHOT_VNIS_ENV	"SLINGSHOT_VNIS"
#define SLINGSHOT_DEVICES_ENV	"SLINGSHOT_DEVICES"
#define SLINGSHOT_TCS_ENV	"SLINGSHOT_TCS"

/*
 * Names of the Slingshot collectives environment variables set for
 * applications
 */
#define SLINGSHOT_FI_CXI_COLL_JOB_ID_ENV \
	"FI_CXI_COLL_JOB_ID"
#define SLINGSHOT_FI_CXI_COLL_JOB_STEP_ID_ENV \
	"FI_CXI_COLL_JOB_STEP_ID"
#define SLINGSHOT_FI_CXI_COLL_MCAST_TOKEN_ENV \
	"FI_CXI_COLL_MCAST_TOKEN"
#define SLINGSHOT_FI_CXI_COLL_FABRIC_MGR_URL_ENV \
	"FI_CXI_COLL_FABRIC_MGR_URL"
#define SLINGSHOT_FI_CXI_HWCOLL_ADDRS_PER_JOB_ENV \
	"FI_CXI_HWCOLL_ADDRS_PER_JOB"
#define SLINGSHOT_FI_CXI_HWCOLL_MIN_NODES_ENV \
	"FI_CXI_HWCOLL_MIN_NODES"
/*
 * Global variables.
 * These are extern declarations only; none of them is defined in this header.
 */
/* Plugin state, laid out as described by slingshot_state_t */
extern slingshot_state_t slingshot_state;
/* Plugin configuration, laid out as described by slingshot_config_t */
extern slingshot_config_t slingshot_config;
/* NOTE(review): meaning not visible here; the name suggests "running outside slurmctld" - confirm */
extern bool active_outside_ctld;
/*
 * Global functions
 */

/*
 * apinfo.c
 *
 * Both functions take the step record read-only (const pointer).
 * NOTE(review): create_slingshot_apinfo() presumably returns true on
 * success - confirm against its definition.
 */
extern bool create_slingshot_apinfo(
	const stepd_step_rec_t *step);
extern void remove_slingshot_apinfo(
	const stepd_step_rec_t *step);
/*
 * collectives.c
 *
 * Parameters named "job" are of type slingshot_stepinfo_t.
 * NOTE(review): the bool-returning functions presumably return true on
 * success - confirm against their definitions.
 */
extern bool slingshot_init_collectives(void);
extern void slingshot_fini_collectives(void);
extern bool slingshot_setup_collectives(
	slingshot_stepinfo_t *job,
	uint32_t node_cnt,
	uint32_t job_id,
	uint32_t step_id);
extern void slingshot_collectives_env(
	slingshot_stepinfo_t *job,
	char ***env);
extern void slingshot_release_collectives_job_step(
	slingshot_stepinfo_t *job);
extern void slingshot_release_collectives_job(
	uint32_t job_id);
/*
 * config.c
 *
 * Parameters named "job" are of type slingshot_stepinfo_t, except in
 * slingshot_free_job_vni_pool() where "job" is a slingshot_jobinfo_t.
 * NOTE(review): the meaning of the bool/int return values is not visible
 * in this header - confirm against the definitions.
 */
extern void slingshot_free_config(void);
extern bool slingshot_stepd_init(const char *switch_params);
extern bool slingshot_setup_config(const char *switch_params);
extern int slingshot_update_vni_table(void);
extern bool slingshot_setup_job_vni_pool(job_record_t *job_ptr);
extern bool slingshot_setup_job_step_vni(slingshot_stepinfo_t *job,
					 int node_cnt, uint32_t job_id,
					 const char *network_params,
					 const char *job_network_params);
extern void slingshot_free_job_step_vni(slingshot_stepinfo_t *job);
extern void slingshot_free_job_vni(uint32_t job_id);
extern void slingshot_free_job_vni_pool(slingshot_jobinfo_t *job);
extern void slingshot_free_jobinfo(slingshot_jobinfo_t *jobinfo);
/*
 * setup_nic.c
 *
 * Parameters named "job" are of type slingshot_stepinfo_t.
 * NOTE(review): the meaning of the bool/int return values is not visible
 * in this header - confirm against the definitions.
 */
extern bool slingshot_open_cxi_lib(
	slingshot_stepinfo_t *job);
extern bool slingshot_create_services(
	slingshot_stepinfo_t *job,
	uint32_t uid,
	uint16_t step_cpus,
	uint32_t job_id);
extern bool slingshot_destroy_services(
	slingshot_stepinfo_t *job,
	uint32_t job_id);
extern void slingshot_free_services(void);
extern int slingshot_update_config(
	slingshot_jobinfo_t *jobinfo);
#endif