src/common/stepd_api.c - SchedMD/slurm - Git at Google

 /*****************************************************************************\
  *  src/common/stepd_api.c - slurmstepd message API
  *****************************************************************************
  *  Copyright (C) 2005-2007 The Regents of the University of California.
  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
  *  Portions Copyright (C) 2008 Vijay Ramasubramanian
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Christopher Morrone <morrone2@llnl.gov>
  *  CODE-OCEC-09-009. All rights reserved.
  *
  *  This file is part of Slurm, a resource management program.
  *  For details, see <https://slurm.schedmd.com/>.
  *  Please also read the included file: DISCLAIMER.
  *
  *  Slurm is free software; you can redistribute it and/or modify it under
  *  the terms of the GNU General Public License as published by the Free
  *  Software Foundation; either version 2 of the License, or (at your option)
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission
  *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and
  *  distribute linked combinations including the two. You must obey the GNU
  *  General Public License in all respects for all of the code used other than
  *  OpenSSL. If you modify file(s) with this exception, you may extend this
  *  exception to your version of the file(s), but you are not obligated to do
  *  so. If you do not wish to do so, delete this exception statement from your
  *  version.  If you delete this exception statement from all source files in
  *  the program, then also delete it here.
  *
  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  *  details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 \*****************************************************************************/

 #define _GNU_SOURCE

 #include <dirent.h>
 #include <grp.h>
 #include <inttypes.h>
 #include <netdb.h>
 #include <regex.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <unistd.h>

 #include "src/common/fd.h"
 #include "src/common/list.h"
 #include "src/common/macros.h"
 #include "src/common/pack.h"
 #include "src/common/read_config.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/stepd_api.h"
 #include "src/common/strlcpy.h"
 #include "src/common/xmalloc.h"
 #include "src/common/xregex.h"
 #include "src/common/xstring.h"

 #include "src/interfaces/auth.h"
 #include "src/interfaces/conn.h"
 #include "src/interfaces/cred.h"
 #include "src/interfaces/jobacct_gather.h"

 /*
  * Pair each public helper of this file with a "slurm_"-prefixed name.
  * NOTE(review): strong_alias() is defined outside this file (presumably in
  * src/common/macros.h); confirm its exact linkage semantics there.
  */
 strong_alias(stepd_available, slurm_stepd_available);
 strong_alias(stepd_connect, slurm_stepd_connect);
 strong_alias(stepd_get_uid, slurm_stepd_get_uid);
 strong_alias(stepd_add_extern_pid, slurm_stepd_add_extern_pid);
 strong_alias(stepd_get_x11_display, slurm_stepd_get_x11_display);
 strong_alias(stepd_getpw, slurm_stepd_getpw);
 strong_alias(xfree_struct_passwd, slurm_xfree_struct_passwd);
 strong_alias(stepd_getgr, slurm_stepd_getgr);
 strong_alias(xfree_struct_group_array, slurm_xfree_struct_group_array);
 strong_alias(stepd_gethostbyname, slurm_stepd_gethostbyname);
 strong_alias(xfree_struct_hostent, slurm_xfree_struct_hostent);
 strong_alias(stepd_get_namespace_fd, slurm_stepd_get_namespace_fd);

 /*
  * Clean up a unix domain socket that nobody is listening on.
  *
  * Meant to be called after connect() on "socket_name" failed with
  * ECONNREFUSED. The socket file is unlinked only when all of these hold:
  *  - the caller runs as root or as the SlurmdUser,
  *  - the socket file is owned by the caller's uid,
  *  - the file was last modified more than 10 minutes (600 s) ago.
  */
 static void
 _handle_stray_socket(const char *socket_name)
 {
 	struct stat sock_stat;
 	uid_t my_uid = getuid();

 	/* Neither root nor SlurmdUser: leave the socket alone. */
 	if ((my_uid != 0) && (my_uid != slurm_conf.slurmd_user_id))
 		return;

 	if (stat(socket_name, &sock_stat) == -1) {
 		debug3("_handle_stray_socket: unable to stat %s: %m",
 			socket_name);
 		return;
 	}

 	if (sock_stat.st_uid != my_uid) {
 		debug3("_handle_stray_socket: socket %s is not owned by uid %u",
 		       socket_name, my_uid);
 		return;
 	}

 	/* Too recent to be considered stale. */
 	if ((time(NULL) - sock_stat.st_mtime) <= 600)
 		return;

 	if (unlink(socket_name) != -1) {
 		debug("Cleaned up stray socket %s", socket_name);
 		return;
 	}

 	/* A socket that already vanished is not worth reporting. */
 	if (errno != ENOENT) {
 		error("_handle_stray_socket: unable to clean up"
 		      " stray socket %s: %m", socket_name);
 	}
 }

 /*
  * Remove the leftover batch script of a job, "<directory>/jobNNNNN/slurm_script",
  * and then try to remove its (hopefully now empty) job directory.
  * Failures of unlink()/rmdir() are deliberately ignored.
  */
 static void _handle_stray_script(const char *directory, uint32_t job_id)
 {
 	char *job_dir = NULL;
 	char *script = NULL;

 	xstrfmtcat(job_dir, "%s/job%05u", directory, job_id);
 	xstrfmtcat(script, "%s/slurm_script", job_dir);

 	info("%s: Purging vestigial job script %s", __func__, script);

 	/* The script has to go first so that rmdir() has a chance to succeed */
 	(void) unlink(script);
 	(void) rmdir(job_dir);

 	xfree(script);
 	xfree(job_dir);
 }

 /*
  * Open the unix domain socket of one slurmstepd.
  *
  * The socket path is "<directory>/<nodename>_<job_id>.<step_id>", with
  * ".<step_het_comp>" appended when step_het_comp is not NO_VAL.
  *
  * When the attempt fails with ECONNREFUSED while running inside slurmd, the
  * stale socket (and, for the batch step, the leftover job script) is
  * cleaned up.
  *
  * Returns the connected file descriptor, or -1 on failure.
  */
 static int
 _step_connect(const char *directory, const char *nodename,
 	      slurm_step_id_t *step_id)
 {
 	/*
 	 * Start from an invalid descriptor: if slurm_open_unix_stream() fails
 	 * before storing anything in "fd", the failure path below must not
 	 * hand an indeterminate value to close().
 	 */
 	int fd = -1;
 	int rc;
 	char *name = NULL, *pos = NULL;

 	xstrfmtcatat(name, &pos, "%s/%s_%u.%u",
 		     directory, nodename, step_id->job_id, step_id->step_id);
 	if (step_id->step_het_comp != NO_VAL)
 		xstrfmtcatat(name, &pos, ".%u", step_id->step_het_comp);

 	if ((rc = slurm_open_unix_stream(name, 0, &fd))) {
 		/* Can indicate race condition at step termination */
 		debug("%s: failed for %s: %s",
 		      __func__, name, slurm_strerror(rc));
 		if (errno == ECONNREFUSED && running_in_slurmd()) {
 			_handle_stray_socket(name);

 			if (step_id->step_id == SLURM_BATCH_SCRIPT)
 				_handle_stray_script(directory,
 						     step_id->job_id);
 		}

 		xfree(name);
 		/* Only close what was actually opened */
 		if (fd >= 0)
 			close(fd);
 		return -1;
 	}

 	xfree(name);
 	return fd;
 }


 /*
  * Work out the NodeName of the local node.
  *
  * Sources are tried in this order, the first one yielding a name wins:
  *  1. the SLURMD_NODENAME environment variable (set inside a step),
  *  2. the configured NodeName matching the short hostname,
  *  3. the configured aliased NodeName,
  *  4. the configured NodeName for "localhost",
  *  5. the short hostname itself (helpful for dynamic nodes).
  *
  * Returns the name as a string the caller must xfree() (the getenv() and
  * hostname results are xstrdup()'d here; the slurm_conf_* results are
  * returned as-is), or NULL if the hostname cannot be determined.
  */
 static char *
 _guess_nodename(void)
 {
 	char hostname[HOST_NAME_MAX];
 	char *name = getenv("SLURMD_NODENAME");

 	if (name)
 		return xstrdup(name);

 	if (gethostname_short(hostname, sizeof(hostname)) != 0)
 		return NULL;

 	if ((name = slurm_conf_get_nodename(hostname)))
 		return name;

 	if ((name = slurm_conf_get_aliased_nodename()))
 		return name;

 	if ((name = slurm_conf_get_nodename("localhost")))
 		return name;

 	/* Nothing in the configuration matched: fall back to the hostname */
 	return xstrdup(hostname);
 }

 /*
  * Connect to a slurmstepd process through its unix domain socket and
  * perform the initial version handshake.
  *
  * "directory" and "nodename" may each be NULL, in which case they are
  * derived locally (nodename via _guess_nodename(), directory from the
  * configured slurmd spool directory). With several slurmd on one node
  * (unusual outside of development environments) the guessed NodeName is
  * more or less arbitrary.
  *
  * *protocol_version is reset to 0 on entry and set to the value answered
  * by the stepd when that value is positive.
  *
  * Returns the connected file descriptor on success, or -1 on error.
  */
 extern int stepd_connect(const char *directory, const char *nodename,
 			 slurm_step_id_t *step_id,
 			 uint16_t *protocol_version)
 {
 	int my_version = SLURM_PROTOCOL_VERSION;
 	int stepd_version;
 	int sock = -1;
 	char *guessed_name = NULL;

 	*protocol_version = 0;

 	if (!nodename) {
 		guessed_name = _guess_nodename();
 		if (!guessed_name)
 			return -1;
 		nodename = guessed_name;
 	}

 	if (!directory) {
 		slurm_conf_t *cf = slurm_conf_lock();
 		directory = slurm_conf_expand_slurmd_path(cf->slurmd_spooldir,
 							  nodename, NULL);
 		slurm_conf_unlock();
 	}

 	/* Connect to the step */
 	sock = _step_connect(directory, nodename, step_id);
 	if (sock == -1)
 		goto fail1;

 	/* Handshake: send our version, read back the stepd's answer */
 	safe_write(sock, &my_version, sizeof(int));
 	safe_read(sock, &stepd_version, sizeof(int));

 	/* A negative answer is a refusal */
 	if (stepd_version < 0)
 		goto rwfail;
 	if (stepd_version)
 		*protocol_version = stepd_version;

 	xfree(guessed_name);
 	return sock;

 rwfail:
 	fd_close(&sock);
 fail1:
 	xfree(guessed_name);
 	return sock;
 }


 /*
  * Retrieve a job step's current state.
  *
  * Returns the state read from the stepd, or SLURMSTEPD_NOT_RUNNING when the
  * request could not be written or no answer could be read.
  */
 slurmstepd_state_t
 stepd_state(int fd, uint16_t protocol_version)
 {
 	slurmstepd_state_t state = SLURMSTEPD_NOT_RUNNING;
 	int request = REQUEST_STATE;

 	safe_write(fd, &request, sizeof(int));
 	safe_read(fd, &state, sizeof(slurmstepd_state_t));

 	/* Success and failure share the same exit on purpose */
 rwfail:
 	return state;
 }

 /*
  * Send job notification message to a batch job.
  *
  * The message is sent length-prefixed, terminating NUL included; a NULL
  * "message" is sent as length 0 with no payload.
  *
  * Returns the code answered by the stepd, or -1 on a read/write failure.
  */
 int
 stepd_notify_job(int fd, uint16_t protocol_version, char *message)
 {
 	int request = REQUEST_JOB_NOTIFY;
 	int msg_len = message ? (strlen(message) + 1) : 0;
 	int status;

 	safe_write(fd, &request, sizeof(int));
 	safe_write(fd, &msg_len, sizeof(int));
 	if (message) {
 		safe_write(fd, message, msg_len);
 	}

 	/* Receive the return code */
 	safe_read(fd, &status, sizeof(int));
 	return status;

 rwfail:
 	return -1;
 }

 /*
  * Send a signal to the proctrack container of a job step.
  *
  * After the request code, the signal number, flags, length-prefixed
  * "details" (length 0 when NULL) and the requesting uid are sent.
  *
  * Returns the code answered by the stepd with errno set to the errno value
  * it reported, or -1 on an unsupported protocol version or read/write
  * failure.
  */
 int stepd_signal_container(int fd, uint16_t protocol_version, int signal,
 			   int flags, char *details, uid_t req_uid)
 {
 	int request = REQUEST_SIGNAL_CONTAINER;
 	int details_len = 0;
 	int status;
 	int remote_errno = 0;

 	safe_write(fd, &request, sizeof(int));

 	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
 		error("%s: invalid protocol_version %u",
 		      __func__, protocol_version);
 		goto rwfail;
 	}

 	safe_write(fd, &signal, sizeof(int));
 	safe_write(fd, &flags, sizeof(int));
 	if (details)
 		details_len = strlen(details);
 	safe_write(fd, &details_len, sizeof(int));
 	safe_write(fd, details, details_len);
 	safe_write(fd, &req_uid, sizeof(uid_t));

 	/* Receive the return code and errno */
 	safe_read(fd, &status, sizeof(int));
 	safe_read(fd, &remote_errno, sizeof(int));

 	errno = remote_errno;
 	return status;

 rwfail:
 	return -1;
 }

 /*
  * Request the file descriptor of a job's namespace from the stepd.
  *
  * The stepd first answers with an integer. When it is positive, a file
  * descriptor follows over the socket and is what gets returned; the integer
  * itself is the descriptor number on the sender's side and is not usable
  * here. A non-positive answer is returned unchanged.
  *
  * Returns -1 on a read/write failure.
  */
 extern int stepd_get_namespace_fd(int fd, uint16_t protocol_version)
 {
 	int request = REQUEST_GET_NS_FD;
 	int remote_fd = 0;

 	debug("entering %s", __func__);

 	safe_write(fd, &request, sizeof(int));
 	safe_read(fd, &remote_fd, sizeof(remote_fd));

 	/* No valid descriptor is coming */
 	if (remote_fd <= 0)
 		return remote_fd;

 	return receive_fd_over_socket(fd);

 rwfail:
 	return -1;
 }

 /*
  * Attach a client to a running job step.
  *
  * Sends REQUEST_ATTACH followed by the I/O and response addresses, the
  * length-prefixed I/O key (terminating NUL included), the requesting uid
  * and the protocol version. For protocol_version >=
  * SLURM_25_05_PROTOCOL_VERSION a length-prefixed certificate (length 0
  * when "cert" is NULL) is sent right after the request code.
  *
  * On success returns SLURM_SUCCESS and fills in resp->local_pids,
  * resp->gtids, resp->ntasks, and resp->executable_names. Otherwise returns
  * the non-zero code answered by the stepd, or SLURM_ERROR when the protocol
  * version is unsupported or a read/write on "fd" fails.
  *
  * NOTE(review): arrays already allocated in "resp" are not released here
  * when a later read fails; confirm that callers free "resp" on error.
  */
 extern int stepd_attach(int fd, uint16_t protocol_version, slurm_addr_t *ioaddr,
 			slurm_addr_t *respaddr, char *cert, char *io_key,
 			uid_t uid, reattach_tasks_response_msg_t *resp)
 {
 	int req = REQUEST_ATTACH;
 	/* io_key is dereferenced unconditionally, so it must not be NULL */
 	uint32_t io_key_len = strlen(io_key) + 1;
 	uint32_t cert_len;
 	int rc = SLURM_SUCCESS;

 	if (protocol_version >= SLURM_25_05_PROTOCOL_VERSION) {
 		safe_write(fd, &req, sizeof(int)); /* needs to be first */

 		/* Certificate, length-prefixed; length 0 means "none" */
 		if (cert) {
 			cert_len = strlen(cert) + 1;
 			safe_write(fd, &cert_len, sizeof(uint32_t));
 			safe_write(fd, cert, cert_len);
 		} else {
 			cert_len = 0;
 			safe_write(fd, &cert_len, sizeof(uint32_t));
 		}

 		safe_write(fd, ioaddr, sizeof(slurm_addr_t));
 		safe_write(fd, respaddr, sizeof(slurm_addr_t));
 		safe_write(fd, &io_key_len, sizeof(uint32_t));
 		safe_write(fd, io_key, io_key_len);
 		safe_write(fd, &uid, sizeof(uid_t));
 		safe_write(fd, &protocol_version, sizeof(uint16_t));
 	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
 		/* Older layout: same fields, but no certificate */
 		safe_write(fd, &req, sizeof(int));
 		safe_write(fd, ioaddr, sizeof(slurm_addr_t));
 		safe_write(fd, respaddr, sizeof(slurm_addr_t));
 		safe_write(fd, &io_key_len, sizeof(uint32_t));
 		safe_write(fd, io_key, io_key_len);
 		safe_write(fd, &uid, sizeof(uid_t));
 		safe_write(fd, &protocol_version, sizeof(uint16_t));
 	} else
 		goto rwfail;

 	/* Receive the return code */
 	safe_read(fd, &rc, sizeof(int));

 	if (rc == SLURM_SUCCESS) {
 		/* Receive response info */
 		uint32_t ntasks;
 		int len, i;

 		safe_read(fd, &ntasks, sizeof(uint32_t));
 		resp->ntasks = ntasks;
 		/* Byte size shared by the local_pids and gtids arrays */
 		len = ntasks * sizeof(uint32_t);

 		resp->local_pids = xcalloc(ntasks, sizeof(uint32_t));
 		safe_read(fd, resp->local_pids, len);

 		resp->gtids = xcalloc(ntasks, sizeof(uint32_t));
 		safe_read(fd, resp->gtids, len);

 		/* One length-prefixed executable name per task */
 		resp->executable_names = xcalloc(ntasks, sizeof(char *));
 		for (i = 0; i < ntasks; i++) {
 			safe_read(fd, &len, sizeof(int));
 			resp->executable_names[i] = xmalloc(len);
 			safe_read(fd, resp->executable_names[i], len);
 		}
 	}

 	return rc;
 rwfail:
 	return SLURM_ERROR;
 }

 /*
  * List destructor for step_loc_t entries: releases the strings owned by
  * the entry, then the entry itself.
  */
 static void
 _free_step_loc_t(step_loc_t *loc)
 {
 	/* xfree() copes with NULL members, no need to test them first */
 	xfree(loc->nodename);
 	xfree(loc->directory);
 	xfree(loc);
 }

 /*
  * Compile into "re" the extended regex matching the socket names of one
  * node: "<nodename>_<digits>.<digits>" optionally followed by ".<digits>".
  * The three digit groups are captured (job id, step id, het component).
  *
  * Returns 0 on success. Returns -1 if regcomp() fails, in which case "re"
  * does not hold a compiled expression and must not be used.
  */
 static int
 _sockname_regex_init(regex_t *re, const char *nodename)
 {
 	char *pattern = NULL;
 	int rc;

 	xstrcat(pattern, "^");
 	xstrcat(pattern, nodename);
 	xstrcat(pattern,
 		"_([[:digit:]]*)\\.([[:digit:]]*)\\.{0,1}([[:digit:]]*)$");

 	if ((rc = regcomp(re, pattern, REG_EXTENDED))) {
 		dump_regex_error(rc, re,
 				 "sockname regex \"%s\" compilation failed",
 				 pattern);
 		/* The pattern is only needed for compilation: never leak it */
 		xfree(pattern);
 		return -1;
 	}

 	xfree(pattern);

 	return 0;
 }

 /*
  * Match "filename" against the compiled socket-name regex "re" and, on a
  * match, fill "step_id" from the captured digit groups: group 1 is the job
  * id, group 2 the step id and group 3 the het component. An empty group 3
  * yields step_het_comp = NO_VAL.
  *
  * Returns 0 on a match, -1 otherwise (regexec() errors other than
  * REG_NOMATCH are logged).
  */
 static int
 _sockname_regex(regex_t *re, const char *filename, slurm_step_id_t *step_id)
 {
 	regmatch_t groups[5];
 	size_t ngroups = 5;
 	size_t het_len;
 	char *digits;
 	int rc;

 	xassert(step_id);

 	memset(groups, 0, sizeof(groups));
 	rc = regexec(re, filename, ngroups, groups, 0);
 	if (rc) {
 		if (rc != REG_NOMATCH)
 			dump_regex_error(rc, re, "regexc(%s)", filename);
 		return -1;
 	}

 	/* Job id */
 	digits = xstrndup(filename + groups[1].rm_so,
 			  (size_t)(groups[1].rm_eo - groups[1].rm_so));
 	step_id->job_id = slurm_atoul(digits);
 	xfree(digits);

 	/* Step id */
 	digits = xstrndup(filename + groups[2].rm_so,
 			  (size_t)(groups[2].rm_eo - groups[2].rm_so));
 	step_id->step_id = slurm_atoul(digits);
 	xfree(digits);

 	/* Het component, present only when the third group is not empty */
 	het_len = groups[3].rm_eo - groups[3].rm_so;
 	if (!het_len) {
 		step_id->step_het_comp = NO_VAL;
 		return 0;
 	}

 	digits = xstrndup(filename + groups[3].rm_so, het_len);
 	step_id->step_het_comp = slurm_atoul(digits);
 	xfree(digits);

 	return 0;
 }

 /*
  * Scan for available running slurm step daemons by checking
  * "directory" for unix domain sockets with names beginning in "nodename".
  *
  * Both "directory" and "nodename" may be null, in which case stepd_available
  * will attempt to determine them on its own.  If you are using multiple
  * slurmd on one node (unusual outside of development environments), you
  * will get one of the local NodeNames more-or-less at random.
  *
  * Returns a list of pointers to step_loc_t structures (possibly empty when
  * the regex or the directory cannot be used), or NULL if no nodename could
  * be determined.
  */
 extern list_t *stepd_available(const char *directory, const char *nodename)
 {
 	list_t *l = NULL;
 	DIR *dp;
 	struct dirent *ent;
 	regex_t re;
 	struct stat stat_buf;
 	char *local_nodename = NULL;

 	if (nodename == NULL) {
 		if (!(local_nodename = _guess_nodename())) {
 			error("%s: Couldn't find nodename", __func__);
 			return NULL;
 		}
 		nodename = local_nodename;
 	}
 	if (directory == NULL) {
 		slurm_conf_t *cf = slurm_conf_lock();
 		directory = slurm_conf_expand_slurmd_path(
 			cf->slurmd_spooldir, nodename, NULL);
 		slurm_conf_unlock();
 	}

 	l = list_create((ListDelF) _free_step_loc_t);

 	/*
 	 * On failure "re" holds no compiled expression, so the regfree()
 	 * done for every other exit path has to be skipped.
 	 */
 	if (_sockname_regex_init(&re, nodename) == -1)
 		goto done_no_regex;

 	/*
 	 * Make sure that "directory" exists and is a directory.
 	 */
 	if (stat(directory, &stat_buf) < 0) {
 		error("Domain socket directory %s: %m", directory);
 		goto done;
 	} else if (!S_ISDIR(stat_buf.st_mode)) {
 		error("%s is not a directory", directory);
 		goto done;
 	}

 	if ((dp = opendir(directory)) == NULL) {
 		error("Unable to open directory: %m");
 		goto done;
 	}

 	/* Every entry whose name parses as a step socket becomes a step_loc_t */
 	while ((ent = readdir(dp)) != NULL) {
 		step_loc_t *loc;
 		slurm_step_id_t step_id;

 		if (!_sockname_regex(&re, ent->d_name, &step_id)) {
 			debug4("found %ps", &step_id);
 			loc = xmalloc(sizeof(step_loc_t));
 			loc->directory = xstrdup(directory);
 			loc->nodename = xstrdup(nodename);
 			memcpy(&loc->step_id, &step_id, sizeof(loc->step_id));
 			list_append(l, (void *)loc);
 		}
 	}

 	closedir(dp);
 done:
 	regfree(&re);
 done_no_regex:
 	xfree(local_nodename);
 	return l;
 }

 /*
  * Send the termination signal to all of the unix domain socket files
  * for a given directory and nodename, and then unlink the files.
  *
  * Returns SLURM_ERROR if the socket-name regex could not be compiled or if
  * any sockets could not be unlinked, SLURM_SUCCESS otherwise (including
  * when "directory" cannot be examined or opened, which is only logged).
  */
 int
 stepd_cleanup_sockets(const char *directory, const char *nodename)
 {
 	DIR *dp;
 	struct dirent *ent;
 	regex_t re;
 	struct stat stat_buf;
 	int rc = SLURM_SUCCESS;

 	/*
 	 * Without a compiled regex nothing below can work, and neither
 	 * regexec() nor regfree() may be applied to "re".
 	 */
 	if (_sockname_regex_init(&re, nodename) == -1)
 		return SLURM_ERROR;

 	/*
 	 * Make sure that "directory" exists and is a directory.
 	 */
 	if (stat(directory, &stat_buf) < 0) {
 		error("Domain socket directory %s: %m", directory);
 		goto done;
 	} else if (!S_ISDIR(stat_buf.st_mode)) {
 		error("%s is not a directory", directory);
 		goto done;
 	}

 	if ((dp = opendir(directory)) == NULL) {
 		error("Unable to open directory: %m");
 		goto done;
 	}

 	while ((ent = readdir(dp)) != NULL) {
 		slurm_step_id_t step_id;
 		if (!_sockname_regex(&re, ent->d_name, &step_id)) {
 			char *path;
 			int fd;
 			uint16_t protocol_version;

 			path = NULL;
 			xstrfmtcat(path, "%s/%s", directory, ent->d_name);

 			verbose("Cleaning up stray %ps", &step_id);

 			/* signal the slurmstepd to terminate its step */
 			fd = stepd_connect(directory, nodename, &step_id,
 					   &protocol_version);
 			if (fd == -1) {
 				debug("Unable to connect to socket %s", path);
 			} else {
 				if (stepd_signal_container(
 					    fd, protocol_version, SIGKILL, 0,
 					    NULL, getuid())
 				    == -1) {
 					debug("Error sending SIGKILL to %ps",
 					      &step_id);
 				}
 				close(fd);
 			}

 			/* make sure that the socket has been removed */
 			if (unlink(path) == -1 && errno != ENOENT) {
 				error("Unable to clean up stray socket %s: %m",
 				      path);
 				rc = SLURM_ERROR;
 			}
 			xfree(path);
 		}
 	}

 	closedir(dp);
 done:
 	regfree(&re);
 	return rc;
 }

 /*
  * Ask the slurmstepd behind "fd" whether the process with process ID "pid"
  * is part of its proctrack container.
  *
  * Returns the stepd's answer, or false on a read/write failure.
  */
 bool
 stepd_pid_in_container(int fd, uint16_t protocol_version, pid_t pid)
 {
 	int request = REQUEST_PID_IN_CONTAINER;
 	bool in_container;

 	safe_write(fd, &request, sizeof(int));
 	safe_write(fd, &pid, sizeof(pid_t));

 	/* Receive the return code */
 	safe_read(fd, &in_container, sizeof(bool));

 	debug("Leaving stepd_pid_in_container");
 	return in_container;

 rwfail:
 	return false;
 }

 /*
  * Add a pid to the "extern" step of a job, meaning add it to the
  * jobacct_gather and proctrack plugins.
  *
  * Returns the code answered by the stepd, or SLURM_ERROR on a read/write
  * failure.
  */
 extern int stepd_add_extern_pid(int fd, uint16_t protocol_version, pid_t pid)
 {
 	int request = REQUEST_ADD_EXTERN_PID;
 	int status;

 	safe_write(fd, &request, sizeof(int));
 	safe_write(fd, &pid, sizeof(pid_t));

 	/* Receive the return code */
 	safe_read(fd, &status, sizeof(int));

 	debug("Leaving stepd_add_extern_pid");
 	return status;

 rwfail:
 	return SLURM_ERROR;
 }

 /*
  * Ask the stepd for the X11 display of the step.
  *
  * *xauthority is reset to NULL on entry. When the stepd sends a non-empty
  * xauthority value, *xauthority is set to an xmalloc()'d buffer holding it,
  * which the caller must xfree().
  *
  * Returns the display number, or zero if x11 forwarding is not setup or a
  * read/write failure occurred (in which case *xauthority is NULL).
  */
 extern int stepd_get_x11_display(int fd, uint16_t protocol_version,
 				 char **xauthority)
 {
 	int req = REQUEST_X11_DISPLAY;
 	int display = 0, len = 0;

 	*xauthority = NULL;

 	safe_write(fd, &req, sizeof(int));

 	/*
 	 * Receive the display number,
 	 * or zero if x11 forwarding is not setup
 	 */
 	safe_read(fd, &display, sizeof(int));

 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
 		safe_read(fd, &len, sizeof(int));
 		if (len) {
 			*xauthority = xmalloc(len);
 			safe_read(fd, *xauthority, len);
 		}
 	}

 	debug("Leaving stepd_get_x11_display");
 	return display;

 rwfail:
 	/*
 	 * The buffer may have been allocated before the failing read: do not
 	 * hand a partially filled value (or a leak) to the caller.
 	 */
 	xfree(*xauthority);
 	return 0;
 }

 /*
  * Ask the slurmstepd for a passwd entry.
  *
  * Sends REQUEST_GETPW with "mode", "uid" and the length-prefixed "name"
  * (length 0 and no payload when name is NULL), then reads back the entry
  * if the stepd reports one.
  *
  * Returns an xmalloc()'d struct passwd to be released with
  * xfree_struct_passwd(), or NULL if nothing was found or a read/write
  * failure occurred.
  */
 extern struct passwd *stepd_getpw(int fd, uint16_t protocol_version,
 				  int mode, uid_t uid, const char *name)
 {
 	int request = REQUEST_GETPW;
 	int have_entry = 0;
 	int field_len = 0;
 	struct passwd *entry = xmalloc(sizeof(struct passwd));

 	/* Request: code, lookup mode, uid, then the optional name */
 	safe_write(fd, &request, sizeof(int));
 	safe_write(fd, &mode, sizeof(int));
 	safe_write(fd, &uid, sizeof(uid_t));

 	if (name)
 		field_len = strlen(name);
 	safe_write(fd, &field_len, sizeof(int));
 	if (name) {
 		safe_write(fd, name, field_len);
 	}

 	safe_read(fd, &have_entry, sizeof(int));
 	if (!have_entry) {
 		xfree(entry);
 		return NULL;
 	}

 	/*
 	 * Each string arrives as a length followed by that many bytes; one
 	 * extra byte is allocated past the received data.
 	 */
 	safe_read(fd, &field_len, sizeof(int));
 	entry->pw_name = xmalloc(field_len + 1);
 	safe_read(fd, entry->pw_name, field_len);

 	safe_read(fd, &field_len, sizeof(int));
 	entry->pw_passwd = xmalloc(field_len + 1);
 	safe_read(fd, entry->pw_passwd, field_len);

 	safe_read(fd, &entry->pw_uid, sizeof(uid_t));
 	safe_read(fd, &entry->pw_gid, sizeof(gid_t));

 	safe_read(fd, &field_len, sizeof(int));
 	entry->pw_gecos = xmalloc(field_len + 1);
 	safe_read(fd, entry->pw_gecos, field_len);

 	safe_read(fd, &field_len, sizeof(int));
 	entry->pw_dir = xmalloc(field_len + 1);
 	safe_read(fd, entry->pw_dir, field_len);

 	safe_read(fd, &field_len, sizeof(int));
 	entry->pw_shell = xmalloc(field_len + 1);
 	safe_read(fd, entry->pw_shell, field_len);

 	debug("Leaving %s", __func__);
 	return entry;

 rwfail:
 	xfree_struct_passwd(entry);
 	return NULL;
 }

 /*
  * Release a struct passwd built by stepd_getpw(): every string member,
  * then the structure itself. A NULL "pwd" is accepted and ignored.
  */
 extern void xfree_struct_passwd(struct passwd *pwd)
 {
 	if (pwd) {
 		xfree(pwd->pw_shell);
 		xfree(pwd->pw_dir);
 		xfree(pwd->pw_gecos);
 		xfree(pwd->pw_passwd);
 		xfree(pwd->pw_name);
 		xfree(pwd);
 	}
 }

 /*
  * Ask the slurmstepd for group entries.
  *
  * Sends REQUEST_GETGR with "mode", "gid" and the length-prefixed "name"
  * (length 0 and no payload when name is NULL). The stepd answers with the
  * number of entries found, followed by the entries themselves.
  *
  * Returns a NULL-terminated, xmalloc()'d array of struct group pointers to
  * be released with xfree_struct_group_array(), or NULL if nothing was found
  * or a read/write failure occurred.
  */
 extern struct group **stepd_getgr(int fd, uint16_t protocol_version,
 				  int mode, gid_t gid, const char *name)
 {
 	int request = REQUEST_GETGR;
 	int entry_cnt = 0;
 	int field_len = 0;
 	struct group **entries = NULL;

 	/* Request: code, lookup mode, gid, then the optional name */
 	safe_write(fd, &request, sizeof(int));
 	safe_write(fd, &mode, sizeof(int));
 	safe_write(fd, &gid, sizeof(gid_t));

 	if (name)
 		field_len = strlen(name);
 	safe_write(fd, &field_len, sizeof(int));
 	if (name) {
 		safe_write(fd, name, field_len);
 	}

 	safe_read(fd, &entry_cnt, sizeof(int));
 	if (!entry_cnt)
 		return NULL;

 	/* Add space for NULL termination of the array */
 	entries = xcalloc(entry_cnt + 1, sizeof(struct group *));

 	for (int i = 0; i < entry_cnt; i++) {
 		struct group *grp = xmalloc(sizeof(struct group));

 		/* Store it right away so that rwfail can release it */
 		entries[i] = grp;

 		safe_read(fd, &field_len, sizeof(int));
 		grp->gr_name = xmalloc(field_len + 1);
 		safe_read(fd, grp->gr_name, field_len);

 		safe_read(fd, &field_len, sizeof(int));
 		grp->gr_passwd = xmalloc(field_len + 1);
 		safe_read(fd, grp->gr_passwd, field_len);

 		safe_read(fd, &grp->gr_gid, sizeof(gid_t));

 		/*
 		 * In the current implementation, we define each group to
 		 * only have a single member - that of the user running the
 		 * job. (Since gr_mem is a NULL terminated array, allocate
 		 * space for two elements.)
 		 */
 		grp->gr_mem = xcalloc(2, sizeof(char *));
 		safe_read(fd, &field_len, sizeof(int));
 		grp->gr_mem[0] = xmalloc(field_len + 1);
 		safe_read(fd, grp->gr_mem[0], field_len);
 	}

 	debug("Leaving %s", __func__);
 	return entries;

 rwfail:
 	xfree_struct_group_array(entries);
 	return NULL;
 }

 /*
  * Release a NULL-terminated array of struct group built by stepd_getgr(),
  * including every entry and the strings it owns. A NULL "grps" is accepted.
  *
  * Entries may be only partially filled (stepd_getgr() calls this from its
  * failure path), so gr_mem is checked before its single member is touched.
  */
 extern void xfree_struct_group_array(struct group **grps)
 {
 	for (int i = 0; grps && grps[i]; i++) {
 		xfree(grps[i]->gr_name);
 		xfree(grps[i]->gr_passwd);
 		if (grps[i]->gr_mem) {
 			xfree(grps[i]->gr_mem[0]);
 			xfree(grps[i]->gr_mem);
 		}
 		xfree(grps[i]);
 	}
 	xfree(grps);
 }

 /*
  * Ask the slurmstepd for a host entry.
  *
  * Sends REQUEST_GETHOST with "mode" and the length-prefixed "nodename"
  * (length 0 and no payload when nodename is NULL), then reads back the
  * entry if the stepd reports one.
  *
  * Returns an xmalloc()'d struct hostent to be released with
  * xfree_struct_hostent(), or NULL if nothing was found or a read/write
  * failure occurred.
  */
 extern struct hostent *stepd_gethostbyname(int fd, uint16_t protocol_version,
 					   int mode, const char *nodename)
 {
 	int request = REQUEST_GETHOST;
 	int have_entry = 0;
 	int field_len = 0;
 	int alias_cnt = 0;
 	struct hostent *entry = NULL;

 	/* Request: code, lookup mode, then the optional node name */
 	safe_write(fd, &request, sizeof(int));
 	safe_write(fd, &mode, sizeof(int));

 	if (nodename)
 		field_len = strlen(nodename);
 	safe_write(fd, &field_len, sizeof(int));
 	if (nodename) {
 		safe_write(fd, nodename, field_len);
 	}

 	safe_read(fd, &have_entry, sizeof(int));
 	if (!have_entry)
 		return NULL;

 	entry = xmalloc(sizeof(struct hostent));

 	safe_read(fd, &field_len, sizeof(int));
 	entry->h_name = xmalloc(field_len + 1);
 	safe_read(fd, entry->h_name, field_len);

 	/* Aliases: a count, then one length-prefixed string per alias */
 	safe_read(fd, &alias_cnt, sizeof(int));
 	entry->h_aliases = xcalloc(alias_cnt + 1, sizeof(char *));
 	for (int i = 0; i < alias_cnt; i++) {
 		safe_read(fd, &field_len, sizeof(int));
 		entry->h_aliases[i] = xmalloc(field_len + 1);
 		safe_read(fd, entry->h_aliases[i], field_len);
 	}

 	safe_read(fd, &entry->h_addrtype, sizeof(int));
 	safe_read(fd, &field_len, sizeof(int));
 	entry->h_length = field_len;

 	/*
 	 * In the current implementation, we define each host to
 	 * only have a single address.
 	 * (Since h_addr_list is a NULL terminated array, allocate
 	 * space for two elements.)
 	 */
 	entry->h_addr_list = xcalloc(2, sizeof(char *));
 	entry->h_addr_list[0] = xmalloc(field_len);
 	safe_read(fd, entry->h_addr_list[0], field_len);

 	debug("Leaving %s", __func__);
 	return entry;

 rwfail:
 	xfree_struct_hostent(entry);
 	return NULL;
 }

 /*
  * Release a struct hostent built by stepd_gethostbyname(): name, every
  * alias, the single address, the two arrays and the structure itself.
  * A NULL "host" is accepted, and so is a partially filled entry.
  */
 extern void xfree_struct_hostent(struct hostent *host)
 {
 	if (!host)
 		return;

 	xfree(host->h_name);

 	if (host->h_aliases) {
 		/* The alias array is NULL terminated */
 		for (int i = 0; host->h_aliases[i]; i++)
 			xfree(host->h_aliases[i]);
 	}
 	xfree(host->h_aliases);

 	if (host->h_addr_list) {
 		xfree(host->h_addr_list[0]);
 		xfree(host->h_addr_list);
 	}

 	xfree(host);
 }

 /*
  * Return the process ID of the slurmstepd, or (pid_t) -1 on a read/write
  * failure.
  */
 pid_t
 stepd_daemon_pid(int fd, uint16_t protocol_version)
 {
 	int request = REQUEST_DAEMON_PID;
 	pid_t stepd_pid;

 	safe_write(fd, &request, sizeof(int));
 	safe_read(fd, &stepd_pid, sizeof(pid_t));
 	return stepd_pid;

 rwfail:
 	return (pid_t)-1;
 }

 /*
  * Suspend execution of the job step.  Only root or SlurmUser is
  * authorized to use this call. Since this activity includes a 'sleep 1'
  * in the slurmstepd, the "suspend" is done in two phases so that it can be
  * initiated in parallel: phase 0 only sends the request, any other phase
  * only collects the answer.
  *
  * Returns 0 for phase 0 and for unsupported protocol versions. Otherwise
  * returns the code answered by the stepd with errno set to the errno value
  * it reported, or -1 on a read/write failure.
  */
 extern int
 stepd_suspend(int fd, uint16_t protocol_version,
 	      suspend_int_msg_t *susp_req, int phase)
 {
 	int request = REQUEST_STEP_SUSPEND;
 	int status = 0;
 	int remote_errno = 0;

 	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
 		return status;

 	if (phase == 0) {
 		safe_write(fd, &request, sizeof(int));
 		return status;
 	}

 	/* Receive the return code and errno */
 	safe_read(fd, &status, sizeof(int));
 	safe_read(fd, &remote_errno, sizeof(int));
 	errno = remote_errno;
 	return status;

 rwfail:
 	return -1;
 }

 /*
  * Resume execution of the job step that has been suspended by a
  * call to stepd_suspend().  Only root or SlurmUser is
  * authorized to use this call. Like stepd_suspend() it works in two phases:
  * phase 0 only sends the request, any other phase only collects the answer.
  *
  * Returns 0 for phase 0 and for unsupported protocol versions. Otherwise
  * returns the code answered by the stepd with errno set to the errno value
  * it reported, or -1 on a read/write failure.
  */
 extern int
 stepd_resume(int fd, uint16_t protocol_version,
 	     suspend_int_msg_t *susp_req, int phase)
 {
 	int request = REQUEST_STEP_RESUME;
 	int status = 0;
 	int remote_errno = 0;

 	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
 		return status;

 	if (phase == 0) {
 		safe_write(fd, &request, sizeof(int));
 		return status;
 	}

 	/* Receive the return code and errno */
 	safe_read(fd, &status, sizeof(int));
 	safe_read(fd, &remote_errno, sizeof(int));
 	errno = remote_errno;
 	return status;

 rwfail:
 	return -1;
 }

 /*
  * Send a reconfigure request to the stepd.
  *
  * For supported protocol versions the request code is followed by the
  * length-prefixed contents of "reconf" (up to its current offset), or by
  * length 0 alone when "reconf" is NULL.
  *
  * Returns the code answered by the stepd with errno set to the errno value
  * it reported, or -1 on a read/write failure.
  */
 extern int stepd_reconfig(int fd, uint16_t protocol_version, buf_t *reconf)
 {
 	int request = REQUEST_STEP_RECONFIGURE;
 	int status;
 	int remote_errno = 0;

 	safe_write(fd, &request, sizeof(int));

 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
 		int payload_len = 0;

 		if (reconf)
 			payload_len = get_buf_offset(reconf);
 		safe_write(fd, &payload_len, sizeof(int));
 		if (reconf) {
 			safe_write(fd, get_buf_data(reconf), payload_len);
 		}
 	}

 	/* Receive the return code and errno */
 	safe_read(fd, &status, sizeof(int));
 	safe_read(fd, &remote_errno, sizeof(int));

 	errno = remote_errno;
 	return status;

 rwfail:
 	return -1;
 }

 /*
  * Terminate the job step.
  *
  * Returns the code answered by the stepd with errno set to the errno value
  * it reported, or -1 on a read/write failure.
  */
 int
 stepd_terminate(int fd, uint16_t protocol_version)
 {
 	int request = REQUEST_STEP_TERMINATE;
 	int status;
 	int remote_errno = 0;

 	safe_write(fd, &request, sizeof(int));

 	/* Receive the return code and errno */
 	safe_read(fd, &status, sizeof(int));
 	safe_read(fd, &remote_errno, sizeof(int));

 	errno = remote_errno;
 	return status;

 rwfail:
 	return -1;
 }

 /*
  * Report a step completion to the stepd.
  *
  * Sends the completed node range, the step return code, the step id, the
  * send_to_stepmgr flag and the packed accounting data of "sent".
  *
  * Returns the code answered by the stepd with errno set to the errno value
  * it reported. Returns SLURM_ERROR for an unsupported protocol version and
  * -1 on a read/write failure.
  */
 int
 stepd_completion(int fd, uint16_t protocol_version, step_complete_msg_t *sent)
 {
 	int req = REQUEST_STEP_COMPLETION;
 	int rc;
 	int errnum = 0;
 	buf_t *buffer;
 	int len = 0;

 	buffer = init_buf(0);

 	debug("Entering stepd_completion for %ps, range_first = %d, range_last = %d",
 	      &sent->step_id, sent->range_first, sent->range_last);

 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
 		safe_write(fd, &req, sizeof(int));
 		safe_write(fd, &sent->range_first, sizeof(int));
 		safe_write(fd, &sent->range_last, sizeof(int));
 		safe_write(fd, &sent->step_rc, sizeof(int));
 		safe_write(fd, &sent->step_id.step_id, sizeof(uint32_t));
 		safe_write(fd, &sent->send_to_stepmgr, sizeof(bool));

 		/*
 		 * We must not use setinfo over a pipe with slurmstepd here
 		 * Indeed, slurmd does a large use of getinfo over a pipe
 		 * with slurmstepd and doing the reverse can result in
 		 * a deadlock scenario with slurmstepd :
 		 * slurmd(lockforread,write)/slurmstepd(write,lockforread)
 		 * Do pack/unpack instead to be sure of independances of
 		 * slurmd and slurmstepd
 		 */
 		jobacctinfo_pack(sent->jobacct, protocol_version,
 				 PROTOCOL_TYPE_SLURM, buffer);
 		len = get_buf_offset(buffer);
 		safe_write(fd, &len, sizeof(int));
 		safe_write(fd, get_buf_data(buffer), len);
 		FREE_NULL_BUFFER(buffer);

 		/* Receive the return code and errno */
 		safe_read(fd, &rc, sizeof(int));
 		safe_read(fd, &errnum, sizeof(int));
 	} else {
 		error("%s: bad protocol version %hu",
 		      __func__, protocol_version);
 		/* The buffer was never used on this path: do not leak it */
 		FREE_NULL_BUFFER(buffer);
 		rc = SLURM_ERROR;
 	}

 	errno = errnum;
 	return rc;

 rwfail:
 	FREE_NULL_BUFFER(buffer);
 	return -1;
 }

 /*
  * Gather the accounting data and task count of a step.
  *
  * On success resp->jobacct holds a structure created with
  * jobacctinfo_create(), which must be destroyed by the caller, and
  * resp->num_tasks holds the task count read from the stepd. When
  * jobacctinfo_create() returns NULL (accounting disabled) nothing is
  * requested and SLURM_SUCCESS is returned with resp->jobacct == NULL.
  *
  * On a read/write failure or a timeout waiting for the answer,
  * resp->jobacct is destroyed and reset to NULL.
  *
  * NOTE(review): the failure path returns the current value of the status
  * variable, which is still SLURM_SUCCESS if the failure happens before
  * jobacctinfo_getinfo() is called; confirm that callers expect this.
  */
 int
 stepd_stat_jobacct(int fd, uint16_t protocol_version,
 		   slurm_step_id_t *sent, job_step_stat_t *resp)
 {
 	int request = REQUEST_STEP_STAT;
 	int status = SLURM_SUCCESS;
 	int task_cnt = 0;

 	/* NULL return indicates that accounting is disabled */
 	resp->jobacct = jobacctinfo_create(NULL);
 	if (!resp->jobacct)
 		return status;

 	debug("Entering %s for %ps", __func__, sent);

 	safe_write(fd, &request, sizeof(int));

 	/*
 	 * Do not attempt reading data until there is something to read.
 	 * Avoid locking the jobacct_gather plugin early and creating
 	 * possible deadlock.
 	 */
 	if (wait_fd_readable(fd, 300))
 		goto rwfail;

 	/* Fill in the jobacct struct and return */
 	status = jobacctinfo_getinfo(resp->jobacct, JOBACCT_DATA_PIPE, &fd,
 				     protocol_version);

 	safe_read(fd, &task_cnt, sizeof(int));
 	resp->num_tasks = task_cnt;

 	return status;

 rwfail:
 	error("gathering job accounting: %d", status);
 	jobacctinfo_destroy(resp->jobacct);
 	resp->jobacct = NULL;
 	return status;
 }

 /*
  * List all of task process IDs and their local and global Slurm IDs.
  *
  * On success *task_info points to an xcalloc()'d array of *task_info_count
  * entries that the caller must xfree(); with zero tasks *task_info is NULL.
  *
  * Returns SLURM_SUCCESS on success.  On a read/write failure returns
  * SLURM_ERROR with *task_info set to NULL and *task_info_count set to 0.
  */
 int
 stepd_task_info(int fd, uint16_t protocol_version,
 		slurmstepd_task_info_t **task_info,
 		uint32_t *task_info_count)
 {
 	int req = REQUEST_STEP_TASK_INFO;
 	slurmstepd_task_info_t *task = NULL;
 	uint32_t ntasks;

 	safe_write(fd, &req, sizeof(int));

 	safe_read(fd, &ntasks, sizeof(uint32_t));
 	task = xcalloc(ntasks, sizeof(slurmstepd_task_info_t));
 	/* The index has the type of the count it is compared against */
 	for (uint32_t i = 0; i < ntasks; i++) {
 		safe_read(fd, &(task[i].id), sizeof(int));
 		safe_read(fd, &(task[i].gtid), sizeof(uint32_t));
 		safe_read(fd, &(task[i].pid), sizeof(pid_t));
 		safe_read(fd, &(task[i].exited), sizeof(bool));
 		safe_read(fd, &(task[i].estatus), sizeof(int));
 	}

 	if (ntasks == 0) {
 		xfree(task);
 		*task_info_count = 0;
 		*task_info = NULL;
 	} else {
 		*task_info_count = ntasks;
 		*task_info = task;
 	}

 	return SLURM_SUCCESS;
 rwfail:
 	xfree(task);
 	*task_info_count = 0;
 	*task_info = NULL;
 	return SLURM_ERROR;
 }

 /*
  * List all of process IDs in the proctrack container.
  *
  * On success *pids_array points to an xcalloc()'d array of *pids_count
  * entries that the caller must xfree().
  *
  * Returns SLURM_SUCCESS if successful.  On a read/write failure returns
  * SLURM_ERROR with *pids_array set to NULL and *pids_count set to 0.
  */
 int
 stepd_list_pids(int fd, uint16_t protocol_version,
 		uint32_t **pids_array, uint32_t *pids_count)
 {
 	int req = REQUEST_STEP_LIST_PIDS;
 	uint32_t npids;
 	uint32_t *pids = NULL;

 	safe_write(fd, &req, sizeof(int));

 	/* read the pid list */
 	safe_read(fd, &npids, sizeof(uint32_t));
 	pids = xcalloc(npids, sizeof(uint32_t));
 	/* The index has the type of the count it is compared against */
 	for (uint32_t i = 0; i < npids; i++) {
 		safe_read(fd, &pids[i], sizeof(uint32_t));
 	}

 	/* An empty list is handed back as whatever xfree() leaves in "pids" */
 	if (npids == 0)
 		xfree(pids);

 	*pids_count = npids;
 	*pids_array = pids;
 	return SLURM_SUCCESS;

 rwfail:
 	xfree(pids);
 	*pids_count = 0;
 	*pids_array = NULL;
 	return SLURM_ERROR;
 }

 /*
  * Get the memory limits of the step.
  *
  * "stepd_mem_info" is zeroed first; for supported protocol versions its
  * job_mem_limit and step_mem_limit members are then read from the stepd.
  *
  * Returns SLURM_SUCCESS if successful (also when the protocol version is
  * unsupported and nothing was requested), SLURM_ERROR on a read/write
  * failure.
  */
 extern int stepd_get_mem_limits(int fd, uint16_t protocol_version,
 				slurmstepd_mem_info_t *stepd_mem_info)
 {
 	int request = REQUEST_STEP_MEM_LIMITS;

 	xassert(stepd_mem_info);
 	memset(stepd_mem_info, 0, sizeof(slurmstepd_mem_info_t));

 	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
 		return SLURM_SUCCESS;

 	safe_write(fd, &request, sizeof(int));

 	safe_read(fd, &stepd_mem_info->job_mem_limit, sizeof(uint32_t));
 	safe_read(fd, &stepd_mem_info->step_mem_limit, sizeof(uint32_t));

 	return SLURM_SUCCESS;

 rwfail:
 	return SLURM_ERROR;
 }

 /*
  * Get the uid of the step
  * Returns uid of the running step if successful.  On error (read/write
  * failure or unsupported protocol version) returns -1.
  *
  * FIXME: BUG: On Linux, uid_t is uint32_t but this can return -1.
  */
 extern uid_t stepd_get_uid(int fd, uint16_t protocol_version)
 {
 	int request = REQUEST_STEP_UID;
 	uid_t step_uid = -1;

 	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION)
 		return step_uid;

 	safe_write(fd, &request, sizeof(int));
 	safe_read(fd, &step_uid, sizeof(uid_t));

 	return step_uid;

 rwfail:
 	return -1;
 }

 /*
  * Get the nodeid of the stepd
  * Returns nodeid of the running stepd if successful.  On error (read/write
  * failure or unsupported protocol version) returns NO_VAL.
  */
 extern uint32_t stepd_get_nodeid(int fd, uint16_t protocol_version)
 {
 	int req = REQUEST_STEP_NODEID;
 	uint32_t nodeid = NO_VAL;

 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
 		safe_write(fd, &req, sizeof(int));

 		/*
 		 * Size the read by the destination object: never more bytes
 		 * than "nodeid" can hold, whatever the width of other types.
 		 */
 		safe_read(fd, &nodeid, sizeof(nodeid));
 	}

 	return nodeid;
 rwfail:
 	return NO_VAL;
 }

 /*
  * Relay an already received message to the stepd.
  *
  * Sends, in this order: the message type, the protocol version of the
  * message, the file descriptor of the message's connection (passed over
  * the socket), the size of the message body and the body itself, i.e. the
  * part of msg->buffer between msg->body_offset and the current offset.
  *
  * Returns 0 on success, -1 on a write failure.
  */
 extern int stepd_relay_msg(int fd, slurm_msg_t *msg, uint16_t protocol_version)
 {
 	int msg_type = msg->msg_type;
 	uint32_t body_len = get_buf_offset(msg->buffer) - msg->body_offset;

 	safe_write(fd, &msg_type, sizeof(int));
 	safe_write(fd, &msg->protocol_version, sizeof(uint16_t));

 	send_fd_over_socket(fd, conn_g_get_fd(msg->tls_conn));

 	safe_write(fd, &body_len, sizeof(uint32_t));
 	safe_write(fd, &msg->buffer->head[msg->body_offset], body_len);

 	return 0;

 rwfail:
 	return -1;
 }