blob: 25421533224ec266d4f1fbc3ab136b127c30388b [file] [log] [blame]
/*****************************************************************************\
* src/slurmd/slurmstepd/slurmstepd_job.h stepd_step_rec_t definition
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) 2013 Intel, Inc.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <mgrondona@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _SLURMSTEPD_JOB_H
#define _SLURMSTEPD_JOB_H
#define _GNU_SOURCE
#include <pthread.h>
#include <pwd.h>
#include "src/common/data.h"
#include "src/common/macros.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/list.h"
#include "src/common/eio.h"
#include "src/common/env.h"
#include "src/common/stepd_api.h"
#include "src/common/xsched.h"
/* required for rusage */
#include <sys/resource.h>
/*
 * Magic number associated with step_container_t.
 * NOTE(review): presumably stored in step_container_t.magic and checked to
 * validate the structure -- confirm against the implementation.
 * Where int is 32 bits this constant exceeds INT_MAX, so its type is
 * unsigned int.
 */
#define STEP_CONTAINER_MAGIC 0xa0b9b2ba
/*
 * Information about one srun associated with a step.
 * stepd_step_rec_t.sruns is a list of pointers to these records; they are
 * created with srun_info_create() and released with srun_info_destroy().
 */
typedef struct {
char *key; /* srun key for IO verification */
char *tls_cert; /* srun public certificate if tls in use */
slurm_addr_t resp_addr; /* response addr for task exit msg */
slurm_addr_t ioaddr; /* Address to connect on for normal I/O.
Spawn IO uses messages to the normal
resp_addr. */
uid_t uid; /* user id for job */
uint16_t protocol_version; /* protocol_version of the srun */
} srun_info_t;
/*
 * State of a single task; stored in stepd_step_task_info_t.state.
 * NOTE(review): the enumerator names suggest the progression
 * INIT -> STARTING -> RUNNING -> COMPLETE; the code performing the
 * transitions is not part of this header -- confirm there.
 */
typedef enum {
STEPD_STEP_TASK_INIT,
STEPD_STEP_TASK_STARTING,
STEPD_STEP_TASK_RUNNING,
STEPD_STEP_TASK_COMPLETE
} stepd_step_task_state_t;
/*
 * Per-task record. stepd_step_rec_t.task is an array of pointers to these,
 * and job_task_info_by_pid() searches that array by the pid field.
 * Records are produced by task_info_create().
 */
typedef struct {
pthread_mutex_t mutex; /* mutex to protect task state */
stepd_step_task_state_t state; /* task state */
cpu_set_t *cpu_set; /* NOTE(review): presumably the CPU set the task is
 * bound to -- confirm against the users of this field */
int id; /* local task id */
uint32_t gtid; /* global task id */
pid_t pid; /* task pid */
char *ifname; /* standard input file name */
char *ofname; /* standard output file name */
char *efname; /* standard error file name */
int stdin_fd; /* standard input file descriptor */
int stdout_fd; /* standard output file descriptor */
int stderr_fd; /* standard error file descriptor */
int to_stdin; /* write file descriptor for task stdin */
int from_stdout;/* read file descriptor from task stdout*/
int from_stderr;/* read file descriptor from task stderr*/
eio_obj_t *in; /* standard input event IO object */
eio_obj_t *out; /* standard output event IO object */
eio_obj_t *err; /* standard error event IO object */
bool killed_by_cmd; /* true if task killed by our signal */
bool aborted; /* true if task called abort */
bool esent; /* true if exit status has been sent */
bool exited; /* true if task has exited */
int estatus; /* this task's exit status */
struct rusage rusage; /* resource usage (struct rusage from
 * <sys/resource.h>); NOTE(review): presumably
 * recorded for this task when it exits -- confirm */
uint32_t argc; /* NOTE(review): presumably the number of entries in
 * argv below -- confirm */
char **argv; /* NOTE(review): presumably this task's argument vector;
 * relationship to stepd_step_rec_t.argv is not
 * visible in this header -- confirm */
} stepd_step_task_info_t;
/*
 * OCI container description for a step. Referenced through
 * stepd_step_rec_t.container, which is populated only if the step is a
 * container.
 */
typedef struct {
int magic; /* NOTE(review): presumably holds STEP_CONTAINER_MAGIC --
 * confirm. That constant (0xa0b9b2ba) is not
 * representable in a 32-bit signed int, so storing
 * it here relies on an implementation-defined
 * conversion. */
char *bundle; /* OCI Container Bundle path */
data_t *config; /* OCI Container config.json contents */
char *mount_spool_dir; /* target path to mount container spool dir */
char *rootfs; /* path to container rootfs */
char *spool_dir; /* path to slurmd's spool dir for container */
char *task_spool_dir; /* path to slurmd's spool dir for container task */
} step_container_t;
/*
 * Step record used by slurmstepd. It gathers the step's identity, node and
 * task layout, hetjob layout, user credentials, I/O plumbing, resource
 * binding and X11 forwarding settings in one structure.
 * Records are produced by stepd_step_rec_create() or
 * batch_stepd_step_rec_create() and released with stepd_step_rec_destroy().
 */
typedef struct {
char *alias_list; /* node name to address aliases */
step_container_t *container; /* populated if step is a container */
slurmstepd_state_t state; /* Job state */
pthread_cond_t state_cond; /* Job state conditional */
pthread_mutex_t state_mutex; /* Job state mutex */
slurm_step_id_t step_id; /* Current step id (or NO_VAL) */
uint32_t array_job_id; /* job array master job ID */
uint32_t array_task_id; /* job array ID */
uint32_t nnodes; /* number of nodes in current job */
uint32_t ntasks; /* total number of tasks in current job */
uint32_t nodeid; /* relative position of this node in job */
uint32_t node_tasks; /* number of tasks on *this* node */
slurm_addr_t *node_addrs; /* allocated node addrs -- from cred */
char *node_list; /* allocated nodes -- from cred */
uint32_t het_job_id; /* Hetjob ID or NO_VAL */
uint32_t het_job_nnodes; /* total node count for entire hetjob */
char *het_job_node_list; /* Hetjob step node list */
uint32_t het_job_node_offset;/* Hetjob node offset or NO_VAL */
uint32_t het_job_ntasks; /* total task count for entire hetjob */
uint32_t het_job_offset; /* Hetjob offset or NO_VAL */
uint32_t het_job_step_cnt; /* number of steps for entire hetjob */
uint32_t *het_job_step_task_cnts; /* ntasks on each comp of hetjob */
uint32_t het_job_task_offset;/* Hetjob task offset or NO_VAL */
uint16_t *het_job_task_cnts; /* Number of tasks on each node in hetjob */
uint32_t **het_job_tids; /* Task IDs on each node of hetjob */
uint32_t *het_job_tid_offsets;/* map of tasks (by id) to originating hetjob*/
uint16_t *task_cnts; /* Number of tasks on each node in job */
uint32_t cpus_per_task; /* number of cpus desired per task */
uint32_t debug; /* debug level for job slurmd */
uint64_t job_mem; /* MB of memory reserved for the job */
uint64_t step_mem; /* MB of memory reserved for the step */
uint16_t cpus; /* number of cpus to use for this job */
uint32_t argc; /* number of commandline arguments */
char **env; /* job environment */
char **argv; /* job argument vector */
char *cwd; /* path to current working directory */
task_dist_states_t task_dist;/* -m distribution */
char *node_name; /* node name of node running job
* needed for front-end systems */
cpu_bind_type_t cpu_bind_type; /* --cpu-bind= */
char *cpu_bind; /* binding map for map/mask_cpu */
mem_bind_type_t mem_bind_type; /* --mem-bind= */
char *mem_bind; /* binding map for tasks to memory */
uint16_t accel_bind_type; /* --accel_bind= */
uint32_t cpu_freq_min; /* Minimum cpu frequency */
uint32_t cpu_freq_max; /* Maximum cpu frequency */
uint32_t cpu_freq_gov; /* cpu frequency governor */
dynamic_plugin_data_t *switch_step; /* switch-specific job information */
uid_t uid; /* user id for job */
char *user_name; /* NOTE(review): presumably the user name matching uid
 * -- confirm */
/* fields from the launch cred used to support nss_slurm */
char *pw_gecos;
char *pw_dir;
char *pw_shell;
gid_t gid; /* group ID for job */
int ngids; /* length of the following gids array */
char **gr_names; /* NOTE(review): presumably group names corresponding
 * to the gids array below -- confirm */
gid_t *gids; /* array of gids for user specified in uid */
bool aborted; /* true if already aborted */
bool batch; /* true if this is a batch job */
bool run_prolog; /* true if need to run prolog */
time_t timelimit; /* time at which job must stop */
uint32_t profile; /* Level of acct_gather_profile */
char *task_prolog; /* per-task prolog */
char *task_epilog; /* per-task epilog */
stepd_step_task_info_t **task; /* array of task information pointers*/
eio_handle_t *eio; /* NOTE(review): eio handle, presumably for the step's
 * task I/O (the message thread has its own handle,
 * msg_handle, below) -- confirm */
list_t *sruns; /* list of srun_info_t pointers */
list_t *clients; /* list of struct client_io_info pointers */
list_t *stdout_eio_objs;/* list of objs that gather stdout from tasks */
list_t *stderr_eio_objs;/* list of objs that gather stderr from tasks */
list_t *free_incoming; /* list of free struct io_buf * for incoming
* traffic. "incoming" means traffic from srun
* to the tasks.
*/
list_t *free_outgoing; /* list of free struct io_buf * for outgoing
* traffic "outgoing" means traffic from the
* tasks to srun.
*/
int incoming_count; /* Count of total incoming message buffers
* including free_incoming buffers and
* buffers in use.
*/
int outgoing_count; /* Count of total outgoing message buffers
* including free_outgoing buffers and
* buffers in use.
*/
list_t *outgoing_cache; /* cache of outgoing stdio messages
* used when a new client attaches
*/
bool io_running; /* I/O thread running */
pthread_cond_t io_cond; /* I/O thread state conditional */
pthread_mutex_t io_mutex; /* I/O thread state mutex */
pthread_t msgid; /* pthread id of message thread */
eio_handle_t *msg_handle; /* eio handle for the message thread */
pid_t jmgr_pid; /* job manager pid */
pid_t pgid; /* process group id for tasks */
uint32_t flags; /* See LAUNCH_* flags defined in slurm_protocol_defs.h */
env_t *envtp; /* NOTE(review): purpose not documented in this header
 * -- confirm against the users of this field */
uint64_t cont_id; /* NOTE(review): presumably a process-tracking
 * container id -- confirm */
char *batchdir; /* NOTE(review): presumably a directory used for batch
 * steps -- confirm */
jobacctinfo_t *jobacct; /* NOTE(review): presumably accounting data gathered
 * for this step -- confirm */
uint8_t open_mode; /* stdout/err append or truncate */
list_t *options; /* NOTE(review): element type and purpose are not
 * visible in this header -- confirm */
uint16_t restart_cnt; /* batch job restart count */
char *job_alloc_cores; /* needed by the SPANK cpuset plugin */
char *step_alloc_cores;/* needed by the SPANK cpuset plugin */
list_t *job_gres_list; /* Needed by GRES plugin */
list_t *step_gres_list; /* Needed by GRES plugin */
char *tres_bind; /* TRES binding */
char *tres_freq; /* TRES frequency */
time_t job_end_time; /* job end time */
char *job_licenses; /* Licenses allocated to job */
time_t job_start_time; /* job start time */
launch_tasks_request_msg_t *msg; /* When a non-batch step this
* is the message sent. DO
* NOT FREE, IT IS JUST A
* POINTER. */
uint16_t job_core_spec; /* count of specialized cores */
bool oom_error; /* step out of memory error */
uint16_t x11; /* only set for extern step */
int x11_display; /* display number if x11 forwarding setup */
char *x11_alloc_host; /* remote host to proxy through */
uint16_t x11_alloc_port; /* remote port to proxy through */
char *x11_magic_cookie; /* xauth magic cookie value */
char *x11_target; /* remote target. unix socket if port == 0 */
uint16_t x11_target_port; /* remote x11 port to connect back to */
char *x11_xauthority; /* temporary XAUTHORITY location, or NULL */
char *selinux_context; /* NOTE(review): presumably the SELinux context to
 * apply to the step -- confirm */
char *stepmgr; /* NOTE(review): purpose not documented in this header
 * -- confirm */
bool oom_kill_step; /* NOTE(review): presumably requests that the step be
 * killed on an out-of-memory event (compare
 * oom_error above) -- confirm */
} stepd_step_rec_t;
/*
 * Step record lifecycle. The implementations are not part of this header.
 *
 * stepd_step_rec_create: takes a launch_tasks_request_msg_t and the protocol
 *	version, returns a stepd_step_rec_t pointer.
 * batch_stepd_step_rec_create: takes a batch_job_launch_msg_t, returns a
 *	stepd_step_rec_t pointer.
 * stepd_step_rec_destroy: takes a stepd_step_rec_t pointer, returns nothing.
 *
 * NOTE(review): failure behaviour (e.g. a NULL return) and whether destroy
 * accepts NULL cannot be determined from the declarations -- confirm in the
 * implementation.
 */
stepd_step_rec_t * stepd_step_rec_create(launch_tasks_request_msg_t *msg,
uint16_t protocol_version);
stepd_step_rec_t * batch_stepd_step_rec_create(batch_job_launch_msg_t *msg);
void stepd_step_rec_destroy(stepd_step_rec_t *step);
/*
 * srun_info_t lifecycle. The implementations are not part of this header.
 *
 * srun_info_create: takes a credential, a TLS certificate string, the
 *	response and I/O addresses, a uid and a protocol version, and returns
 *	an srun_info_t pointer.
 *	NOTE(review): presumably these arguments populate the key, tls_cert,
 *	resp_addr, ioaddr, uid and protocol_version fields of srun_info_t, and
 *	the addresses may or may not be copied -- confirm in the
 *	implementation.
 * srun_info_destroy: takes an srun_info_t pointer, returns nothing.
 */
srun_info_t *srun_info_create(slurm_cred_t *cred, char *alloc_tls_cert,
slurm_addr_t *respaddr, slurm_addr_t *ioaddr,
uid_t uid, uint16_t protocol_version);
void srun_info_destroy(srun_info_t *srun);
/*
 * Returns a stepd_step_task_info_t pointer given a task id, a global task id
 * and the stdin/stdout/stderr file names. The implementation is not part of
 * this header.
 * NOTE(review): presumably taskid/gtaskid become the id/gtid fields and the
 * three names become ifname/ofname/efname; whether the strings are copied or
 * adopted is not visible here -- confirm in the implementation.
 */
stepd_step_task_info_t * task_info_create(int taskid, int gtaskid,
char *ifname, char *ofname,
char *efname);
/*
 * Find the task record on this node whose process id equals pid.
 *
 * Walks step->task[0 .. step->node_tasks - 1] in index order and returns the
 * first entry whose pid field matches. Returns NULL when step is NULL or when
 * no entry matches.
 *
 * Defined static inline in this header so that it can be included from
 * src/common/plugstack.c without undefined symbol warnings.
 */
static inline stepd_step_task_info_t *
job_task_info_by_pid (stepd_step_rec_t *step, pid_t pid)
{
	stepd_step_task_info_t *match = NULL;
	uint32_t idx = 0;

	if (step) {
		/* Stop at the first hit so the lowest matching index wins. */
		while (!match && (idx < step->node_tasks)) {
			stepd_step_task_info_t *candidate = step->task[idx];

			if (candidate->pid == pid)
				match = candidate;
			idx++;
		}
	}

	return match;
}
#endif /* !_SLURMSTEPD_JOB_H */