/*****************************************************************************\
* src/srun/srun_job.h - specification of an srun "job"
*****************************************************************************
* Copyright (C) 2002 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <mgrondona@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _HAVE_JOB_H
#define _HAVE_JOB_H

#include <netinet/in.h>
#include <pthread.h>

#include "slurm/slurm.h"

#include "src/common/eio.h"
#include "src/common/cbuf.h"
#include "src/common/macros.h"
#include "src/common/slurm_protocol_defs.h"

#include "src/api/step_io.h"
#include "src/srun/opt.h"
#include "src/srun/step_ctx.h"

typedef enum {
	SRUN_JOB_INIT = 0,	/* Job's initial state */
	SRUN_JOB_LAUNCHING,	/* Launch thread is running */
	SRUN_JOB_STARTING,	/* Launch thread is complete */
	SRUN_JOB_RUNNING,	/* Launch complete, tasks running */
	SRUN_JOB_CANCELLED,	/* CTRL-C cancelled */
} srun_job_state_t;
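
/*
 * Typical state progression (an illustrative sketch, not enforced by this
 * header):
 *
 *   SRUN_JOB_INIT -> SRUN_JOB_LAUNCHING -> SRUN_JOB_STARTING
 *                 -> SRUN_JOB_RUNNING (or SRUN_JOB_CANCELLED on Ctrl-C)
 *
 * State changes are presumably made through update_job_state() below so that
 * threads waiting on state_cond are woken.
 */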

enum io_t {
	IO_ALL = 0,	/* multiplex output from all/bcast stdin to all */
	IO_ONE = 1,	/* output from only one task/stdin to one task */
	IO_PER_TASK = 2,	/* separate output/input file per task */
	IO_NONE = 3,	/* close output/close stdin */
};

#define format_io_t(t) (t == IO_ONE) ? "one" : (t == IO_ALL) ? \
			"all" : "per task"

typedef struct fname {
	char *name;
	enum io_t type;
	int taskid;	/* taskid for IO if IO_ONE */
} fname_t;
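
/*
 * A minimal sketch of how an fname_t might be populated (field values are
 * assumptions for illustration; the real values come from parsing the
 * --input/--output/--error options, see job_update_io_fnames() below):
 *
 *   fname_t out = {
 *           .name   = "job.out",     // output file name
 *           .type   = IO_ALL,        // multiplex every task into one stream
 *           .taskid = -1,            // only meaningful when type == IO_ONE
 *   };
 */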

typedef struct srun_job {
	slurm_step_id_t step_id;	/* assigned step id */

	uint32_t het_job_node_offset;	/* Hetjob node offset or NO_VAL */
	uint32_t het_job_id;		/* Hetjob leader or NO_VAL */
	char *het_job_node_list;	/* node list for combined hetjob */
	uint32_t het_job_nnodes;	/* total node count for entire hetjob */
	uint32_t het_job_ntasks;	/* total task count for entire hetjob */
	uint32_t het_job_offset;	/* Hetjob offset or NO_VAL */
	uint32_t *het_job_step_task_cnts; /* ntasks on each component of hetjob */
	uint32_t het_job_task_offset;	/* Hetjob task offset or NO_VAL */
	uint16_t *het_job_task_cnts;	/* tasks invoked on each node of hetjob */
	uint32_t **het_job_tids;	/* Task IDs on each node of hetjob */
	uint32_t *het_job_tid_offsets;	/* map of tasks (by id) to originating
					 * hetjob */

	char *container;	/* OCI container bundle path */
	uint32_t cpu_count;	/* allocated CPUs */
	uint32_t nhosts;	/* node count */
	uint32_t ntasks;	/* task count */
	uint16_t ntasks_per_board;	/* number of tasks to invoke on each board */
	uint16_t ntasks_per_core;	/* number of tasks to invoke on each core */
	uint16_t ntasks_per_tres;	/* number of tasks that can access each GPU */
	uint16_t ntasks_per_socket;	/* number of tasks to invoke on
					 * each socket */

	srun_job_state_t state;		/* job state */
	pthread_mutex_t state_mutex;	/* guards state */
	pthread_cond_t state_cond;	/* signalled on state changes */
	int rc;			/* srun return code */

	char **env;		/* hetjob specific environment */
	char *nodelist;		/* nodelist in string form */
	char *partition;	/* name of partition running job */

	fname_t *ifname;	/* stdin file name/spec */
	fname_t *ofname;	/* stdout file name/spec */
	fname_t *efname;	/* stderr file name/spec */

	/* Pseudo terminal support */
	int input_fd;		/* stdin file descriptor */
	int pty_fd;		/* file descriptor used to communicate window
				 * size changes */
	uint16_t pty_port;	/* port used to communicate window size changes */
	uint16_t ws_col;	/* window size, column count */
	uint16_t ws_row;	/* window size, row count */

	slurm_step_ctx_t *step_ctx;

	char *account;		/* account of this job */
	char *qos;		/* job's qos */
	char *resv_name;	/* reservation the job is using */

	uid_t uid;		/* resolved user id of job */
	char *user_name;	/* resolved user name of job */
	gid_t gid;		/* resolved group id of job */
	char *group_name;	/* resolved group name of job */
} srun_job_t;
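
/*
 * Illustrative sketch (an assumed pairing of state_mutex/state_cond, not part
 * of this API): waiting for the job to leave its initial state.
 *
 *   slurm_mutex_lock(&job->state_mutex);
 *   while (job->state == SRUN_JOB_INIT)
 *           slurm_cond_wait(&job->state_cond, &job->state_mutex);
 *   slurm_mutex_unlock(&job->state_mutex);
 *
 * slurm_mutex_lock()/slurm_cond_wait() are the wrappers from
 * "src/common/macros.h" included above.
 */
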
void update_job_state(srun_job_t *job, srun_job_state_t newstate);
void job_force_termination(srun_job_t *job);
srun_job_state_t job_state(srun_job_t *job);

/* Create an srun job structure without an allocation (e.g. srun --no-allocate) */
extern srun_job_t * job_create_noalloc(void);

/*
* Create an srun job structure for a step w/out an allocation response msg.
* (i.e. inside an allocation)
*/
extern srun_job_t *job_step_create_allocation(
resource_allocation_response_msg_t *resp,
slurm_opt_t *opt_local);
/*
* Create an srun job structure from a resource allocation response msg
*/
extern srun_job_t *job_create_allocation(
resource_allocation_response_msg_t *resp,
slurm_opt_t *opt_local);
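
/*
 * Sketch of the two creation paths (an assumption about typical usage; see
 * srun's own source for the authoritative flow): a fresh allocation obtained
 * with slurm_allocate_resources_blocking() is turned into a job via
 * job_create_allocation(), while a step launched inside an existing
 * allocation (e.g. under salloc) goes through job_step_create_allocation().
 */
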
extern void init_srun(int argc, char **argv, log_options_t *logopt,
bool handle_signals);
extern void create_srun_job(void **p_job, bool *got_alloc);
extern void pre_launch_srun_job(srun_job_t *job, slurm_opt_t *opt_local);
extern void fini_srun(srun_job_t *job, bool got_alloc, uint32_t *global_rc);
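
/*
 * Rough lifecycle as srun would drive it (a sketch under the assumption that
 * the prototypes above are called in this order; srun.c is authoritative):
 *
 *   srun_job_t *job = NULL;
 *   bool got_alloc = false;
 *   uint32_t global_rc = 0;
 *
 *   init_srun(argc, argv, &logopts, true);
 *   create_srun_job((void **) &job, &got_alloc);
 *   pre_launch_srun_job(job, opt_local);
 *   ... launch the step and wait for it to finish ...
 *   fini_srun(job, got_alloc, &global_rc);
 *
 * "logopts" and "opt_local" are placeholders for a log_options_t and the
 * per-component slurm_opt_t, respectively.
 */
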
/*
* Update job filenames and modes for stderr, stdout, and stdin.
*/
extern void job_update_io_fnames(srun_job_t *job, slurm_opt_t *opt_local);
/* Set up port to handle messages from slurmctld */
int slurmctld_msg_init(void);
#endif /* !_HAVE_JOB_H */