/*****************************************************************************\
* collectives.c - Library for managing HPE Slingshot networks
*****************************************************************************
* Copyright 2023 Hewlett Packard Enterprise Development LP
* Written by Jim Nordby <james.nordby@hpe.com>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include "src/common/slurm_xlator.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"
#include "switch_hpe_slingshot.h"
#include "rest.h"
#define CLEANUP_THREAD_PERIOD 30	/* Seconds between cleanup passes */
static slingshot_rest_conn_t fm_conn; /* Connection to fabric manager */
static bool collectives_enabled = false; /* True once fm_conn is set up */
pthread_t cleanup_thread_id = 0;	/* Collectives cleanup thread */
pthread_cond_t cleanup_thread_cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t cleanup_thread_lock = PTHREAD_MUTEX_INITIALIZER;
bool cleanup_thread_shutdown = false;	/* Tells the cleanup thread to exit */
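/*
 * Cleanup thread run in slurmctld: every CLEANUP_THREAD_PERIOD seconds,
 * GET the list of job objects registered with the fabric manager and
 * release any that belong to this cluster but are no longer known to,
 * running in, or suspended in slurmctld.
 *
 * The response is parsed as if shaped roughly like the following
 * (illustrative only; inferred from the parsing code below, not from
 * fabric manager API documentation):
 *   { "documentLinks":
 *       [ "/fabric/collectives/jobs/<cluster_name>-<job_id>", ... ] }
 */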
static void *_cleanup_thread(void *data)
{
struct timespec ts = {0, 0};
json_object *respjson = NULL, *jobsjson = NULL, *jobjson = NULL;
long status = 0;
uint32_t job_id, arraylen;
size_t path_len, cluster_name_len;
job_record_t *job_ptr;
slurmctld_lock_t job_read_lock = { .job = READ_LOCK };
char *url = "/fabric/collectives/jobs/";
path_len = strlen(url);
cluster_name_len = strlen(slurm_conf.cluster_name);
while (!cleanup_thread_shutdown) {
slurm_mutex_lock(&cleanup_thread_lock);
if (!cleanup_thread_shutdown) {
ts.tv_sec = time(NULL) + CLEANUP_THREAD_PERIOD;
slurm_cond_timedwait(&cleanup_thread_cond,
&cleanup_thread_lock, &ts);
}
slurm_mutex_unlock(&cleanup_thread_lock);
json_object_put(respjson);
respjson = NULL;
if (!(respjson = slingshot_rest_get(&fm_conn, url, &status))) {
error("GET %s to fabric manager for job failed: %ld",
url, status);
continue; /* Try again next time around */
} else {
log_flag(SWITCH, "GET %s resp='%s'", url,
json_object_to_json_string(respjson));
}
json_object_object_get_ex(respjson, "documentLinks", &jobsjson);
arraylen = json_object_array_length(jobsjson);
for (int i = 0; i < arraylen; i++) {
bool release = false;
const char *jobstr;
char *endptr = NULL;
jobjson = json_object_array_get_idx(jobsjson, i);
jobstr = json_object_get_string(jobjson) + path_len;
if (xstrncmp(jobstr, slurm_conf.cluster_name,
cluster_name_len)) {
log_flag(SWITCH, "Skipping fabric manager job '%s' because the cluster name doesn't match %s",
jobstr, slurm_conf.cluster_name);
continue;
}
/* Add 1 to skip the '-' after the cluster name */
job_id = strtol(jobstr + cluster_name_len + 1, &endptr,
10);
if (endptr && (*endptr != '\0')) {
log_flag(SWITCH, "Skipping fabric manager job '%s'",
jobstr);
continue;
}
lock_slurmctld(job_read_lock);
job_ptr = find_job_record(job_id);
if (!job_ptr) {
error("job %u isn't in slurmctld, removing from fabric manager",
job_id);
release = true;
} else if (!IS_JOB_RUNNING(job_ptr) &&
!IS_JOB_SUSPENDED(job_ptr)) {
error("job %u isn't currently allocated resources, removing from fabric manager",
job_id);
release = true;
}
unlock_slurmctld(job_read_lock);
if (release)
slingshot_release_collectives_job(job_id);
}
}
debug("shutting down collectives cleanup thread");
json_object_put(respjson);
return NULL;
}
/*
* Read any authentication files and connect to the fabric manager,
* which implements a REST interface supporting Slingshot collectives
*/
extern bool slingshot_init_collectives(void)
{
/* Enable Hardware Collectives only if fm_url is configured */
if (!slingshot_config.fm_url)
return true;
if (running_in_slurmctld() &&
!xstrcasestr(slurm_conf.slurmctld_params, "enable_stepmgr")) {
error("Hardware collectives enabled by setting SwitchParameters=fm_url but SlurmctldParameters=enable_stepmgr is not set.");
return false;
}
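	/*
	 * Illustrative slurm.conf settings for this path (the URL is a
	 * made-up example; exact values are site-specific):
	 *   SwitchParameters=fm_url=https://fabric-manager.example.com
	 *   SlurmctldParameters=enable_stepmgr
	 */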
if (!slingshot_rest_connection(&fm_conn, slingshot_config.fm_url,
slingshot_config.fm_auth,
slingshot_config.fm_authdir,
SLINGSHOT_FM_AUTH_BASIC_USER,
SLINGSHOT_FM_AUTH_BASIC_PWD_FILE,
(slingshot_config.flags &
SLINGSHOT_FLAGS_ENABLE_MTLS),
slingshot_config.fm_mtls_ca,
slingshot_config.fm_mtls_cert,
slingshot_config.fm_mtls_key,
slingshot_config.fm_mtls_url,
SLINGSHOT_FM_TIMEOUT,
SLINGSHOT_FM_CONNECT_TIMEOUT,
"Slingshot Fabric Manager"))
goto err;
if (running_in_slurmctld()) {
slurm_mutex_lock(&cleanup_thread_lock);
slurm_thread_create(&cleanup_thread_id, _cleanup_thread, NULL);
slurm_mutex_unlock(&cleanup_thread_lock);
}
collectives_enabled = true;
return true;
err:
info("Slingshot collectives support disabled due to errors");
slingshot_rest_destroy_connection(&fm_conn);
collectives_enabled = false;
return false;
}
/*
* Close connection to fabric manager REST interface, free memory
*/
extern void slingshot_fini_collectives(void)
{
if (running_in_slurmctld() && cleanup_thread_id) {
cleanup_thread_shutdown = true;
slurm_mutex_lock(&cleanup_thread_lock);
slurm_cond_signal(&cleanup_thread_cond);
slurm_mutex_unlock(&cleanup_thread_lock);
slurm_thread_join(cleanup_thread_id);
}
slingshot_rest_destroy_connection(&fm_conn);
}
/*
* Save jobID in slingshot_state.job_hwcoll[] array to indicate use of
* hardware collectives (for cleanup time). Return if jobID is already there.
*/
static void _save_hwcoll(uint32_t job_id)
{
int freeslot = -1;
for (int i = 0; i < slingshot_state.num_job_hwcoll; i++) {
if (slingshot_state.job_hwcoll[i] == job_id) {
goto done;
} else if (slingshot_state.job_hwcoll[i] == 0 && freeslot < 0) {
freeslot = i;
}
}
/* If no free slot, allocate a new slot in the job_hwcoll table */
if (freeslot < 0) {
freeslot = slingshot_state.num_job_hwcoll;
slingshot_state.num_job_hwcoll++;
xrecalloc(slingshot_state.job_hwcoll,
slingshot_state.num_job_hwcoll, sizeof(uint32_t));
}
slingshot_state.job_hwcoll[freeslot] = job_id;
done:
log_flag(SWITCH, "job_hwcoll[%d] %u num_job_hwcoll=%d",
freeslot, job_id, slingshot_state.num_job_hwcoll);
return;
}
/*
* Zero out entry if job_id is found in slingshot_state.job_hwcoll[];
* return true if job_id is in the table, false otherwise.
*/
static bool _clear_hwcoll(uint32_t job_id)
{
if (slingshot_state.num_job_hwcoll == 0)
return false;
for (int i = 0; i < slingshot_state.num_job_hwcoll; i++) {
if (slingshot_state.job_hwcoll[i] == job_id) {
slingshot_state.job_hwcoll[i] = 0;
return true;
}
}
return false;
}
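/*
 * Register a job with the fabric manager by POSTing to
 * /fabric/collectives/jobs. The request payload built below looks like
 * this (values illustrative):
 *   { "jobID": "<cluster_name>-<job_id>",
 *     "mcastLimit": <hwcoll_addrs_per_job> }
 * Returns the parsed response on success (caller must free it with
 * json_object_put()), or NULL on failure.
 */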
static json_object *_post_job_to_fabric_manager(uint32_t job_id)
{
long status = 0;
json_object *reqjson = NULL;
json_object *jobid_json = NULL;
json_object *mcasts_json = NULL;
json_object *respjson = NULL;
char *jobid_str = NULL;
/* Put job ID and number of multicast addresses to reserve in payload */
jobid_str = xstrdup_printf("%s-%u", slurm_conf.cluster_name, job_id);
if (!(reqjson = json_object_new_object()) ||
!(jobid_json = json_object_new_string(jobid_str)) ||
json_object_object_add(reqjson, "jobID", jobid_json) ||
!(mcasts_json = json_object_new_int(
slingshot_config.hwcoll_addrs_per_job)) ||
json_object_object_add(reqjson, "mcastLimit", mcasts_json)) {
error("Couldn't create collectives request json");
json_object_put(jobid_json);
json_object_put(mcasts_json);
goto out;
}
log_flag(SWITCH, "reqjson='%s'", json_object_to_json_string(reqjson));
if (!(respjson = slingshot_rest_post(&fm_conn,
"/fabric/collectives/jobs",
reqjson, &status))) {
error("POST to fabric manager for collectives failed: %ld",
status);
goto out;
}
log_flag(SWITCH, "respjson='%s'", json_object_to_json_string(respjson));
out:
xfree(jobid_str);
json_object_put(reqjson);
return respjson;
}
/*
* If Slingshot hardware collectives are configured, and the job has
* enough nodes, reserve the configured per-job number of multicast addresses
* by registering the job with the fabric manager
*/
extern bool slingshot_setup_collectives(slingshot_stepinfo_t *job,
uint32_t node_cnt, uint32_t job_id,
uint32_t step_id)
{
long status = 0;
json_object *respjson = NULL;
char *jobid_str = NULL, *url;
const char *token = NULL;
bool rc = false;
/*
* Only reserve multicast addresses if configured and job has
* enough nodes
*/
if (!slingshot_config.fm_url || !collectives_enabled ||
(slingshot_config.hwcoll_num_nodes == 0) ||
(node_cnt < slingshot_config.hwcoll_num_nodes))
return true;
/* GET the job object to see whether it already exists */
url = xstrdup_printf("/fabric/collectives/jobs/%s-%u",
slurm_conf.cluster_name, job_id);
if (!(respjson = slingshot_rest_get(&fm_conn, url, &status))) {
error("GET %s to fabric manager for job failed: %ld",
url, status);
} else {
log_flag(SWITCH, "GET %s resp='%s'",
url, json_object_to_json_string(respjson));
}
xfree(url);
if (status == HTTP_NOT_FOUND) {
/* If the job object doesn't exist, create it */
respjson = _post_job_to_fabric_manager(job_id);
}
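	/*
	 * Expected response shape (illustrative; only the "sessionToken"
	 * field is consumed here):
	 *   { "jobID": "<cluster_name>-<job_id>",
	 *     "sessionToken": "<token>", ... }
	 */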
/* Get per-job session token out of response */
if (!(token = json_object_get_string(
json_object_object_get(respjson, "sessionToken")))) {
error("Couldn't extract sessionToken from fabric manager response");
goto out;
}
/* Put info in job struct to send to slurmd */
job->hwcoll = xmalloc(sizeof(slingshot_hwcoll_t));
job->hwcoll->job_id = job_id;
job->hwcoll->step_id = step_id;
job->hwcoll->mcast_token = xstrdup(token);
job->hwcoll->fm_url = xstrdup(slingshot_config.fm_url);
job->hwcoll->addrs_per_job = slingshot_config.hwcoll_addrs_per_job;
job->hwcoll->num_nodes = slingshot_config.hwcoll_num_nodes;
/*
* Save jobID in slingshot_state.job_hwcoll[] array to indicate
* use of hardware collectives (for cleanup time)
*/
_save_hwcoll(job_id);
rc = true;
out:
xfree(jobid_str);
json_object_put(respjson);
return rc;
}
/*
* Set up collectives-related environment variables for job step:
* if job->hwcoll is set, add the string-ized value of every
* field in job->hwcoll to this job step's environment
*/
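/*
 * For illustration only (the literal variable names come from the
 * SLINGSHOT_FI_CXI_*_ENV macros in switch_hpe_slingshot.h and are not
 * repeated here): job 1234, step 0, on cluster "mycluster" with
 * fm_url=https://fm.example.com would yield the values
 *   "mycluster-1234", "0", "<mcast_token>",
 *   "https://fm.example.com/fabric/collectives/multicasts",
 *   "<hwcoll_addrs_per_job>", and "<hwcoll_num_nodes>"
 * for the six variables set below, in that order.
 */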
extern void slingshot_collectives_env(slingshot_stepinfo_t *job, char ***env)
{
slingshot_hwcoll_t *hwcoll = job->hwcoll;
char *job_id = NULL, *step_id = NULL;
char *addrs_per_job = NULL, *num_nodes = NULL;
char *fm_full_url = NULL;
if (!hwcoll)
return;
xstrfmtcat(job_id, "%s-%u", slurm_conf.cluster_name, hwcoll->job_id);
xstrfmtcat(step_id, "%u", hwcoll->step_id);
xstrfmtcat(addrs_per_job, "%u", hwcoll->addrs_per_job);
xstrfmtcat(num_nodes, "%u", hwcoll->num_nodes);
xstrfmtcat(fm_full_url, "%s/fabric/collectives/multicasts",
hwcoll->fm_url);
log_flag(SWITCH, "%s=%s %s=%s %s=%s",
SLINGSHOT_FI_CXI_COLL_JOB_ID_ENV, job_id,
SLINGSHOT_FI_CXI_COLL_JOB_STEP_ID_ENV, step_id,
SLINGSHOT_FI_CXI_COLL_MCAST_TOKEN_ENV, hwcoll->mcast_token);
log_flag(SWITCH, "%s=%s %s=%s %s=%s",
SLINGSHOT_FI_CXI_COLL_FABRIC_MGR_URL_ENV, fm_full_url,
SLINGSHOT_FI_CXI_HWCOLL_ADDRS_PER_JOB_ENV, addrs_per_job,
SLINGSHOT_FI_CXI_HWCOLL_MIN_NODES_ENV, num_nodes);
env_array_overwrite(env, SLINGSHOT_FI_CXI_COLL_JOB_ID_ENV, job_id);
env_array_overwrite(env, SLINGSHOT_FI_CXI_COLL_JOB_STEP_ID_ENV,
step_id);
env_array_overwrite(env, SLINGSHOT_FI_CXI_COLL_MCAST_TOKEN_ENV,
hwcoll->mcast_token);
env_array_overwrite(env, SLINGSHOT_FI_CXI_COLL_FABRIC_MGR_URL_ENV,
fm_full_url);
env_array_overwrite(env, SLINGSHOT_FI_CXI_HWCOLL_ADDRS_PER_JOB_ENV,
addrs_per_job);
env_array_overwrite(env, SLINGSHOT_FI_CXI_HWCOLL_MIN_NODES_ENV,
num_nodes);
xfree(job_id);
xfree(step_id);
xfree(addrs_per_job);
xfree(num_nodes);
xfree(fm_full_url);
return;
}
/*
* If this job step is using Slingshot hardware collectives, release any
* multicast addresses associated with this job step, by PATCHing the job
* object. The job object has a "jobSteps" field:
* "jobSteps": { "<job step ID>": [ <mcast_address1>, ... ] }
* To release the multicast addresses associated with the job step,
* PATCH the "jobSteps" object with a NULL value under the job step ID key.
*/
extern void slingshot_release_collectives_job_step(slingshot_stepinfo_t *job)
{
slingshot_hwcoll_t *hwcoll = job->hwcoll;
long status = 0;
char *stepid_str = NULL;
json_object *reqjson = NULL;
json_object *jobsteps_json = NULL;
json_object *respjson = NULL;
const char *url = NULL;
/* Just return if we're not using collectives */
if (!slingshot_config.fm_url || !collectives_enabled || !hwcoll)
return;
/* Payload is '{ "jobSteps": { "<step_id>": null } }' */
stepid_str = xstrdup_printf("%u", hwcoll->step_id);
if (!(reqjson = json_object_new_object()) ||
!(jobsteps_json = json_object_new_object()) ||
json_object_object_add(jobsteps_json, stepid_str, NULL) ||
json_object_object_add(reqjson, "jobSteps", jobsteps_json)) {
error("Slingshot hardware collectives release failed (JSON creation failed)");
json_object_put(jobsteps_json);
goto out;
}
log_flag(SWITCH, "reqjson='%s'", json_object_to_json_string(reqjson));
/*
 * PATCH the "jobSteps" map in this job's object.
 * NOTE: the job may have already completed (and its object been deleted
 * from the fabric manager) by the time this runs, so don't fail on
 * 404 (Not Found).
 */
url = xstrdup_printf("/fabric/collectives/jobs/%s-%u",
slurm_conf.cluster_name, hwcoll->job_id);
if (!(respjson = slingshot_rest_patch(&fm_conn, url, reqjson,
&status))) {
if (status != HTTP_NOT_FOUND) {
error("Slingshot hardware collectives release failed (PATCH %s fabric manager failed: %ld)",
url, status);
goto out;
}
}
log_flag(SWITCH, "respjson='%s'", json_object_to_json_string(respjson));
/* If the SWITCH debug flag is set, GET the PATCHed job object and log it */
if ((slurm_conf.debug_flags & DEBUG_FLAG_SWITCH) &&
(status != HTTP_NOT_FOUND)) {
json_object_put(respjson);
if (!(respjson = slingshot_rest_get(&fm_conn, url, &status))) {
error("GET %s to fabric manager for job failed: %ld",
url, status);
} else {
log_flag(SWITCH, "GET %s resp='%s'",
url, json_object_to_json_string(respjson));
}
}
out:
json_object_put(reqjson);
json_object_put(respjson);
xfree(stepid_str);
xfree(url);
return;
}
/*
* If this job is using Slingshot hardware collectives, release any
* multicast addresses associated with this job, by deleting the job
* object from the fabric manager.
*/
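/*
 * e.g. DELETE /fabric/collectives/jobs/<cluster_name>-<job_id>
 * (path format taken from the xstrdup_printf() below)
 */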
extern void slingshot_release_collectives_job(uint32_t job_id)
{
long status = 0;
const char *url = NULL;
/* Just return if we're not using collectives */
if (!slingshot_config.fm_url || !collectives_enabled)
return;
_clear_hwcoll(job_id);
/* Do a DELETE on the job object in the fabric manager */
url = xstrdup_printf("/fabric/collectives/jobs/%s-%u",
slurm_conf.cluster_name, job_id);
if (!slingshot_rest_delete(&fm_conn, url, &status)) {
error("DELETE %s from fabric manager for collectives failed: %ld",
url, status);
}
xfree(url);
return;
}