blob: d51eda3a9b68662fb42de788a1b29ad1b8719f4a [file] [log] [blame]
/*****************************************************************************\
* src/common/stepd_api.h - slurmstepd message API
* $Id$
*****************************************************************************
* Copyright (C) 2005 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Christopher Morrone <morrone2@llnl.gov>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _STEPD_API_H
#define _STEPD_API_H
#include <inttypes.h>
#include "slurm/slurm.h"
#include "src/common/list.h"
#include "src/common/slurm_protocol_defs.h"
typedef struct step_location {
uint32_t jobid;
uint32_t stepid;
char *nodename;
char *directory;
} step_loc_t;
typedef enum {
REQUEST_CONNECT = 0,
REQUEST_SIGNAL_PROCESS_GROUP,
REQUEST_SIGNAL_TASK_LOCAL,
REQUEST_SIGNAL_TASK_GLOBAL,
REQUEST_SIGNAL_CONTAINER,
REQUEST_STATE,
REQUEST_INFO,
REQUEST_ATTACH,
REQUEST_PID_IN_CONTAINER,
REQUEST_DAEMON_PID,
REQUEST_STEP_SUSPEND,
REQUEST_STEP_RESUME,
REQUEST_STEP_TERMINATE,
REQUEST_STEP_COMPLETION,
REQUEST_STEP_TASK_INFO,
REQUEST_STEP_LIST_PIDS
} step_msg_t;
typedef enum {
SLURMSTEPD_NOT_RUNNING = 0,
SLURMSTEPD_STEP_STARTING,
SLURMSTEPD_STEP_RUNNING,
SLURMSTEPD_STEP_ENDING
} slurmstepd_state_t;
typedef struct {
uid_t uid;
uint32_t jobid;
uint32_t stepid;
uint32_t nodeid;
} slurmstepd_info_t;
typedef struct {
int id; /* local task id */
uint32_t gtid; /* global task id */
pid_t pid; /* task pid */
bool exited; /* true if task has exited */
int estatus; /* exit status if exited is true*/
} slurmstepd_task_info_t;
/*
* Cleanup stale stepd domain sockets.
*/
int stepd_cleanup_sockets(const char *directory, const char *nodename);
int stepd_terminate(int fd);
/*
* Connect to a slurmstepd proccess by way of its unix domain socket.
*
* Both "directory" and "nodename" may be null, in which case stepd_connect
* will attempt to determine them on its own. If you are using multiple
* slurmd on one node (unusual outside of development environments), you
* will get one of the local NodeNames more-or-less at random.
*
* Returns a socket descriptor for the opened socket on success,
* and -1 on error.
*/
int stepd_connect(const char *directory, const char *nodename,
uint32_t jobid, uint32_t stepid);
/*
* Retrieve a job step's current state.
*/
slurmstepd_state_t stepd_state(int fd);
/*
* Retrieve slurmstepd_info_t structure for a job step.
*
* Must be xfree'd by the caller.
*/
slurmstepd_info_t *stepd_get_info(int fd);
/*
* Send a signal to the process group of a job step.
*/
int stepd_signal(int fd, int signal);
/*
* Send a signal to a single task in a job step.
*/
int stepd_signal_task_local(int fd, int signal, int ltaskid);
/*
* Send a signal to a single task in a job step.
*/
int stepd_signal_task_global(int fd, int signal, int gtaskid);
/*
* Send a signal to the proctrack container of a job step.
*/
int stepd_signal_container(int fd, int signal);
/*
* Attach a client to a running job step.
*
* On success returns SLURM_SUCCESS and fills in resp->local_pids,
* resp->gtids, resp->ntasks, and resp->executable.
*
* FIXME - The pid/gtid info returned in the "resp" parameter should
* probably be moved into a more generic stepd_api call so that
* this header does not need to include slurm_protocol_defs.h.
*/
int stepd_attach(int fd, slurm_addr *ioaddr, slurm_addr *respaddr,
void *job_cred_sig, reattach_tasks_response_msg_t *resp);
/*
* Scan for available running slurm step daemons by checking
* "directory" for unix domain sockets with names beginning in "nodename".
*
* Both "directory" and "nodename" may be null, in which case stepd_available
* will attempt to determine them on its own. If you are using multiple
* slurmd on one node (unusual outside of development environments), you
* will get one of the local NodeNames more-or-less at random.
*
* Returns a List of pointers to step_loc_t structures.
*/
List stepd_available(const char *directory, const char *nodename);
/*
* Return true if the process with process ID "pid" is found in
* the proctrack container of the slurmstepd "step".
*/
bool stepd_pid_in_container(int fd, pid_t pid);
/*
* Return the process ID of the slurmstepd.
*/
pid_t stepd_daemon_pid(int fd);
/*
* Suspend execution of the job step. Only root or SlurmUser is
* authorized to use this call.
*
* Returns SLURM_SUCCESS is successful. On error returns SLURM_ERROR
* and sets errno.
*/
int stepd_suspend(int fd);
/*
* Resume execution of the job step that has been suspended by a
* call to stepd_suspend(). Only root or SlurmUser is
* authorized to use this call.
*
* Returns SLURM_SUCCESS is successful. On error returns SLURM_ERROR
* and sets errno.
*/
int stepd_resume(int fd);
/*
*
* Returns SLURM_SUCCESS is successful. On error returns SLURM_ERROR
* and sets errno.
*/
int stepd_completion(int fd, step_complete_msg_t *sent);
/*
*
* Returns SLURM_SUCCESS on success or SLURM_ERROR on error.
* resp recieves a jobacctinfo_t which must be freed if SUCCESS.
*/
int stepd_stat_jobacct(int fd, stat_jobacct_msg_t *sent,
stat_jobacct_msg_t *resp);
int stepd_task_info(int fd, slurmstepd_task_info_t **task_info,
uint32_t *task_info_count);
int stepd_list_pids(int fd, pid_t **pids_array, int *pids_count);
#endif /* _STEPD_API_H */