|  | /*****************************************************************************\ | 
|  | *  switch_hpe_slingshot.h - Library for managing HPE Slingshot networks | 
|  | ***************************************************************************** | 
|  | *  Copyright 2021-2023 Hewlett Packard Enterprise Development LP | 
|  | *  Written by David Gloe <david.gloe@hpe.com> | 
|  | *  Written by Jim Nordby <james.nordby@hpe.com> | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #ifndef _SWITCH_HPE_SLINGSHOT_H_ | 
|  | #define _SWITCH_HPE_SLINGSHOT_H_ | 
|  |  | 
|  | #include <stdbool.h> | 
|  | #include <stdint.h> | 
|  |  | 
|  | #include "src/common/read_config.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/common/xstring.h" | 
|  | #include "src/slurmd/slurmstepd/slurmstepd_job.h" | 
|  |  | 
|  | /* Version of the state file */ | 
|  | #define SLINGSHOT_STATE_VERSION 2 | 
|  | #define SLINGSHOT_STATE_VERSION_VER1 1 | 
|  |  | 
|  | /* State file name */ | 
|  | #define SLINGSHOT_STATE_FILE "slingshot_state" | 
|  |  | 
|  | /* Environment variable for libcxi library name (for dlopen()) */ | 
|  | #define SLINGSHOT_CXI_LIB_VERSION_ENV "SLURM_SLINGSHOT_CXI_VERSION" | 
|  |  | 
|  | /* Suffix of versioned CXI library functions if environment variable not set */ | 
|  | #define SLINGSHOT_CXI_LIB_VERSION "" | 
|  |  | 
|  | /* Min/max VNI values */ | 
|  | #define SLINGSHOT_VNI_MIN 0 | 
|  | #define SLINGSHOT_VNI_MAX 65535 | 
|  | #define SLINGSHOT_VNIS    4     /* Max VNIs/service */ | 
|  |  | 
|  | /* Default state values if not configured */ | 
|  | #define SLINGSHOT_VNI_MIN_DEF 1024 | 
|  | #define SLINGSHOT_VNI_MAX_DEF 65535 | 
|  |  | 
|  | /* Number of retries for destroying CXI services */ | 
|  | #define SLINGSHOT_CXI_DESTROY_RETRIES 5 | 
|  |  | 
|  | /* File path format to rdzv_get_en setting */ | 
|  | #define SLINGSHOT_RDZV_GET_EN_FMT \ | 
|  | "/sys/class/cxi/cxi%d/device/properties/rdzv_get_en" | 
|  |  | 
|  | /* File path format to the default rdzv_get_en setting */ | 
|  | #define SLINGSHOT_RDZV_GET_EN_DEFAULT_FMT \ | 
|  | "/sys/module/cxi_%s/parameters/rdzv_get_en_default" | 
|  |  | 
|  | /* RGID sharing: max_lnis_per_rgid value found here - insert device name */ | 
|  | #define SLINGSHOT_RGIDS_AVAIL_FMT \ | 
|  | "/sys/class/cxi/%s/device/properties/rgids_avail" | 
|  |  | 
|  | extern int free_vnis; /* Number of free VNIs */ | 
|  |  | 
|  | /* Set of valid auth types used for REST */ | 
|  | typedef enum { | 
|  | SLINGSHOT_AUTH_NONE = 0, /* No authentication */ | 
|  | SLINGSHOT_AUTH_BASIC,    /* User name and password */ | 
|  | SLINGSHOT_AUTH_OAUTH     /* OAuth2 client credentials grant */ | 
|  | } slingshot_rest_auth_t; | 
|  |  | 
|  | /* Per-job shared VNI structure */ | 
|  | typedef struct job_vni { | 
|  | uint32_t job_id;        /* Job ID */ | 
|  | uint16_t vni;           /* Per-Job-ID shared VNI */ | 
|  | } job_vni_t; | 
|  |  | 
|  | /* Format for state file created by switch_p_libstate_save */ | 
|  | typedef struct slingshot_state { | 
|  | uint32_t version;       /* Version of this file format */ | 
|  | uint16_t vni_min;       /* Minimum VNI to allocate */ | 
|  | uint16_t vni_max;       /* Maximum VNI to allocate */ | 
|  | uint16_t vni_last;      /* Last allocated VNI */ | 
|  | bitstr_t *vni_table;    /* Bitmap of allocated VNIs */ | 
|  | uint32_t num_job_vnis;  /* Number of per-job shared VNIs */ | 
|  | job_vni_t *job_vnis;    /* Per-job shared VNI reservations */ | 
|  | uint32_t num_job_hwcoll; /* Number of per-job shared VNIs */ | 
|  | uint32_t *job_hwcoll;	/* Array of job IDs using collectives */ | 
|  | } slingshot_state_t; | 
|  |  | 
|  | /* Max NIC resources per application */ | 
|  | #define SLINGSHOT_TXQ_MAX     1024    /* Max transmit command queues */ | 
|  | #define SLINGSHOT_TGQ_MAX     512     /* Max target command queues */ | 
|  | #define SLINGSHOT_EQ_MAX      2047    /* Max event queues */ | 
|  | #define SLINGSHOT_CT_MAX      2047    /* Max counters */ | 
|  | #define SLINGSHOT_TLE_MAX     2048    /* Max trigger list entries */ | 
|  | #define SLINGSHOT_PTE_MAX     2048    /* Max portal table entries */ | 
|  | #define SLINGSHOT_LE_MAX      16384   /* Max list entries */ | 
|  | #define SLINGSHOT_AC_MAX      1022    /* Max addressing contexts */ | 
|  |  | 
|  | /* Default per-thread NIC resources per application */ | 
|  | #define SLINGSHOT_TXQ_DEF     2       /* Per-thread transmit command queues */ | 
|  | #define SLINGSHOT_TGQ_DEF     1       /* Per-thread target command queues */ | 
|  | #define SLINGSHOT_EQ_DEF      2       /* Per-thread event queues */ | 
|  | #define SLINGSHOT_CT_DEF      1       /* Per-thread counters */ | 
|  | #define SLINGSHOT_TLE_DEF     1       /* Per-thread trigger list entries */ | 
|  | #define SLINGSHOT_PTE_DEF     6       /* Per-thread portal table entries */ | 
|  | #define SLINGSHOT_LE_DEF      16      /* Per-thread list entries */ | 
|  | #define SLINGSHOT_AC_DEF      2       /* Per-thread addressing contexts */ | 
|  |  | 
|  | /* NIC resource limit structure */ | 
|  | typedef struct slingshot_limits { | 
|  | uint16_t max;  /* Max of this resource the application can use */ | 
|  | uint16_t res;  /* Resources reserved for only this application */ | 
|  | uint16_t def;  /* Per-thread resources to reserve */ | 
|  | } slingshot_limits_t; | 
|  |  | 
|  | /* Full set of NIC resource limits */ | 
|  | typedef struct slingshot_limits_set { | 
|  | slingshot_limits_t txqs;  /* Transmit command queue limits */ | 
|  | slingshot_limits_t tgqs;  /* Target command queue limits */ | 
|  | slingshot_limits_t eqs;   /* Event queue limits */ | 
|  | slingshot_limits_t cts;   /* Counter limits */ | 
|  | slingshot_limits_t tles;  /* Trigger list entry limits */ | 
|  | slingshot_limits_t ptes;  /* Portal table entry limits */ | 
|  | slingshot_limits_t les;   /* List entry limits */ | 
|  | slingshot_limits_t acs;   /* Addressing context limits */ | 
|  | } slingshot_limits_set_t; | 
|  |  | 
|  | /* | 
|  | * Slingshot switch plugin global configuration state, based on defaults and | 
|  | * 'SwitchParameters' slurm.conf variable | 
|  | */ | 
|  | typedef struct slingshot_config { | 
|  | uint32_t destroy_retries; /* retry count for destroying services */ | 
|  | uint8_t single_node_vni;        /* Allocate VNIs for single-node apps */ | 
|  | uint8_t job_vni;                /* Allocate extra VNI per-job */ | 
|  | uint32_t tcs;                   /* Bitmap of default traffic classes */ | 
|  | uint32_t flags;                 /* Bitmap of configuration flags */ | 
|  | slingshot_limits_set_t limits;  /* Set of NIC resource limits */ | 
|  | char *jlope_url;                /* URL of jackaloped REST interface */ | 
|  | slingshot_rest_auth_t jlope_auth; /* jackaloped authentication type */ | 
|  | char *jlope_authdir;            /* jackaloped auth file directory */ | 
|  | uint32_t hwcoll_addrs_per_job;  /* #Hardware collectives per job */ | 
|  | uint32_t hwcoll_num_nodes;      /* Minimum job nodes for HW coll */ | 
|  | char *fm_url;                   /* fabric manager REST interface URL */ | 
|  | slingshot_rest_auth_t fm_auth;  /* fabric manager authentication type */ | 
|  | char *fm_authdir;               /* fabric manager auth file directory */ | 
|  | char *fm_mtls_ca; /* fabric manager certificate bundle path */ | 
|  | char *fm_mtls_cert; /* fabric manager client public certificate path */ | 
|  | char *fm_mtls_key; /* fabric manager client private key path */ | 
|  | char *fm_mtls_url; /* fabric manager REST interface URL for mtls */ | 
|  | } slingshot_config_t; | 
|  |  | 
|  | /* Values for slingshot_config_t.single_node_vni */ | 
|  | #define SLINGSHOT_SN_VNI_NONE   0  /* No VNIs allocated for single-node apps */ | 
|  | #define SLINGSHOT_SN_VNI_ALL    1  /* All single-node apps get a VNI */ | 
|  | #define SLINGSHOT_SN_VNI_USER   2  /* srun --network=single_node_vni */ | 
|  |  | 
|  | /* Values for slingshot_config_t.job_vni */ | 
|  | #define SLINGSHOT_JOB_VNI_NONE   0  /* No job VNIs allocated */ | 
|  | #define SLINGSHOT_JOB_VNI_ALL    1  /* All jobs get a job VNI */ | 
|  | #define SLINGSHOT_JOB_VNI_USER   2  /* Job VNIs using srun --network=job_vni */ | 
|  |  | 
|  | /* NIC communication profile structure (compute-node specific) */ | 
|  | typedef struct slingshot_comm_profile { | 
|  | uint32_t svc_id;        /* Slingshot service ID */ | 
|  | uint16_t vnis[SLINGSHOT_VNIS]; /* VNIs for this service */ | 
|  | uint16_t vnis_used;     /* Number of valid VNIs in vnis[] */ | 
|  | uint32_t tcs;           /* Bitmap of allowed traffic classes */ | 
|  | char device_name[16];   /* NIC device name (e.g. "cxi0") */ | 
|  | } slingshot_comm_profile_t; | 
|  |  | 
|  | /* | 
|  | * Slingshot HSN NIC information structure | 
|  | */ | 
|  | typedef enum { | 
|  | SLINGSHOT_ADDR_IPV4, | 
|  | SLINGSHOT_ADDR_IPV6, | 
|  | SLINGSHOT_ADDR_MAC | 
|  | } slingshot_addr_type_t; | 
|  | typedef struct { | 
|  | uint32_t nodeidx;       /* Node index this NIC belongs to */ | 
|  | slingshot_addr_type_t address_type; /* Address type for this NIC */ | 
|  | char address[64];      /* Address of this NIC */ | 
|  | uint16_t numa_node;    /* NUMA node it is in */ | 
|  | char device_name[16];  /* Device name */ | 
|  | } slingshot_hsn_nic_t; | 
|  |  | 
|  | /* | 
|  | * Information to support Slingshot HSN hardware collectives | 
|  | * and communication with Slingshot Fabric Manager (FM) | 
|  | */ | 
|  | typedef struct { | 
|  | uint32_t job_id; 	 /* job id */ | 
|  | uint32_t step_id; 	 /* step id */ | 
|  | char *mcast_token;       /* Session token returned from FM */ | 
|  | char *fm_url;            /* FM URL for creating multicast trees */ | 
|  | uint32_t addrs_per_job;  /* Collectives multicast addrs per job */ | 
|  | uint32_t num_nodes;      /* Minimum #nodes to get multicast addrs */ | 
|  | } slingshot_hwcoll_t; | 
|  |  | 
|  | /* Denotes packing a null stepinfo structure */ | 
|  | #define SLINGSHOT_JOBINFO_NULL_VERSION 0xDEAFDEAF | 
|  |  | 
|  | typedef struct slingshot_jobinfo { | 
|  | uint32_t num_vnis; /* Number of VNIs */ | 
|  | uint16_t *vnis; /* List of VNIs allocated for this job */ | 
|  | char *extra; /* storage for mid-release extras */ | 
|  | } slingshot_jobinfo_t; | 
|  |  | 
|  | /* Jobinfo structure passed from slurmctld to slurmd */ | 
|  | typedef struct slingshot_stepinfo { | 
|  | uint32_t version;      /* Version of this structure */ | 
|  | uint32_t num_vnis;     /* Number of VNIs */ | 
|  | uint16_t *vnis;        /* List of VNIs allocated for this application */ | 
|  | uint32_t tcs;          /* Bitmap of allowed traffic classes */ | 
|  | slingshot_limits_set_t limits; /* Set of NIC resource limits */ | 
|  | uint32_t depth;        /* Threads-per-task for limit calculation */ | 
|  | uint32_t num_profiles; /* Number of communication profiles */ | 
|  | slingshot_comm_profile_t *profiles; /* List of communication profiles */ | 
|  | uint32_t flags;        /* Configuration flags */ | 
|  | uint32_t num_nics;     /* Number of entries in 'nics' array */ | 
|  | slingshot_hsn_nic_t *nics; /* HSN NIC information for instant on */ | 
|  | slingshot_hwcoll_t *hwcoll; /* HSN HW collectives info */ | 
|  | } slingshot_stepinfo_t; | 
|  |  | 
|  | /* Slingshot traffic classes (bitmap) */ | 
|  | #define SLINGSHOT_TC_DEDICATED_ACCESS 0x1 | 
|  | #define SLINGSHOT_TC_LOW_LATENCY      0x2 | 
|  | #define SLINGSHOT_TC_BULK_DATA        0x4 | 
|  | #define SLINGSHOT_TC_BEST_EFFORT      0x8 | 
|  | #define SLINGSHOT_TC_DEFAULT          (SLINGSHOT_TC_LOW_LATENCY | \ | 
|  | SLINGSHOT_TC_BEST_EFFORT) | 
|  |  | 
|  | /* Values for slingshot_stepinfo_t.flags */ | 
|  | /* | 
|  | * If SLINGSHOT_FLAGS_ADJUST_LIMITS is set (default), slurmd will adjust | 
|  | * resource limit reservations by subtracting system service reserved/used | 
|  | * resources | 
|  | * | 
|  | * If SLINGSHOT_FLAGS_ENABLE_MTLS is set, Slurm daemons will use mTLS | 
|  | * authentication with the fabric manager for the duration of the application | 
|  | * | 
|  | * If SLINGSHOT_FLAGS_DISABLE_RDZV_GET is set, slurmd will disable rendezvous | 
|  | * gets in the Cassini NIC for the duration of the application | 
|  | */ | 
|  | #define SLINGSHOT_FLAGS_ADJUST_LIMITS 0x1 | 
|  | #define SLINGSHOT_FLAGS_ENABLE_MTLS 0x2 | 
|  | #define SLINGSHOT_FLAGS_DISABLE_RDZV_GET 0x4 | 
|  | #define SLINGSHOT_FLAGS_DEFAULT SLINGSHOT_FLAGS_ADJUST_LIMITS | 
|  |  | 
|  | /* Environment variables set for applications */ | 
|  | #define SLINGSHOT_SVC_IDS_ENV         "SLINGSHOT_SVC_IDS" | 
|  | #define SLINGSHOT_VNIS_ENV            "SLINGSHOT_VNIS" | 
|  | #define SLINGSHOT_DEVICES_ENV         "SLINGSHOT_DEVICES" | 
|  | #define SLINGSHOT_TCS_ENV             "SLINGSHOT_TCS" | 
|  | /* Slingshot collectives environment variables set for applications */ | 
|  | #define SLINGSHOT_FI_CXI_COLL_JOB_ID_ENV          "FI_CXI_COLL_JOB_ID" | 
|  | #define SLINGSHOT_FI_CXI_COLL_JOB_STEP_ID_ENV     "FI_CXI_COLL_JOB_STEP_ID" | 
|  | #define SLINGSHOT_FI_CXI_COLL_MCAST_TOKEN_ENV     "FI_CXI_COLL_MCAST_TOKEN" | 
|  | #define SLINGSHOT_FI_CXI_COLL_FABRIC_MGR_URL_ENV  "FI_CXI_COLL_FABRIC_MGR_URL" | 
|  | #define SLINGSHOT_FI_CXI_HWCOLL_ADDRS_PER_JOB_ENV "FI_CXI_HWCOLL_ADDRS_PER_JOB" | 
|  | #define SLINGSHOT_FI_CXI_HWCOLL_MIN_NODES_ENV     "FI_CXI_HWCOLL_MIN_NODES" | 
|  |  | 
|  | /* Global variables */ | 
|  | extern slingshot_state_t slingshot_state; | 
|  | extern slingshot_config_t slingshot_config; | 
|  | extern bool active_outside_ctld; | 
|  |  | 
|  | /* Global functions */ | 
|  | /* apinfo.c */ | 
|  | extern bool create_slingshot_apinfo(const stepd_step_rec_t *step); | 
|  | extern void remove_slingshot_apinfo(const stepd_step_rec_t *step); | 
|  | /* collectives.c */ | 
|  | extern bool slingshot_init_collectives(void); | 
|  | extern void slingshot_fini_collectives(void); | 
|  | extern bool slingshot_setup_collectives(slingshot_stepinfo_t *job, | 
|  | uint32_t node_cnt, uint32_t job_id, | 
|  | uint32_t step_id); | 
|  | extern void slingshot_collectives_env(slingshot_stepinfo_t *job, char ***env); | 
|  | extern void slingshot_release_collectives_job_step(slingshot_stepinfo_t *job); | 
|  | extern void slingshot_release_collectives_job(uint32_t job_id); | 
|  | /* config.c */ | 
|  | extern void slingshot_free_config(void); | 
|  | extern bool slingshot_stepd_init(const char *switch_params); | 
|  | extern bool slingshot_setup_config(const char *switch_params); | 
|  | extern int slingshot_update_vni_table(void); | 
|  | extern bool slingshot_setup_job_vni_pool(job_record_t *job_ptr); | 
|  | extern bool slingshot_setup_job_step_vni( | 
|  | slingshot_stepinfo_t *job, int node_cnt, | 
|  | uint32_t job_id, const char *network_params, | 
|  | const char *job_network_params); | 
|  | extern void slingshot_free_job_step_vni(slingshot_stepinfo_t *job); | 
|  | extern void slingshot_free_job_vni(uint32_t job_id); | 
|  | extern void slingshot_free_job_vni_pool(slingshot_jobinfo_t *job); | 
|  | extern void slingshot_free_jobinfo(slingshot_jobinfo_t *jobinfo); | 
|  | /* setup_nic.c */ | 
|  | extern bool slingshot_open_cxi_lib(slingshot_stepinfo_t *job); | 
|  | extern bool slingshot_create_services(slingshot_stepinfo_t *job, uint32_t uid, | 
|  | uint16_t step_cpus, uint32_t job_id); | 
|  | extern bool slingshot_destroy_services(slingshot_stepinfo_t *job, | 
|  | uint32_t job_id); | 
|  | extern void slingshot_free_services(void); | 
|  | extern int slingshot_update_config(slingshot_jobinfo_t *jobinfo); | 
|  |  | 
|  | #endif |