| /*****************************************************************************\ |
| * src/srun/srun_job.h - specification of an srun "job" |
| ***************************************************************************** |
| * Copyright (C) 2002 The Regents of the University of California. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Mark Grondona <mgrondona@llnl.gov>. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| #ifndef _HAVE_JOB_H |
| #define _HAVE_JOB_H |
| |
| #include <netinet/in.h> |
| #include <pthread.h> |
| |
| #include "slurm/slurm.h" |
| |
| #include "src/common/eio.h" |
| #include "src/common/cbuf.h" |
| #include "src/common/macros.h" |
| #include "src/common/slurm_protocol_defs.h" |
| |
| #include "src/api/step_io.h" |
| #include "src/srun/opt.h" |
| #include "src/srun/step_ctx.h" |
| |
/*
 * States an srun job can be in; stored in srun_job_t.state.
 * Enumerators are numbered consecutively starting at 0.
 *
 * NOTE(review): SRUN_JOB_STARTING and SRUN_JOB_RUNNING carry almost the
 * same description below; the precise difference is not visible from this
 * header and should be confirmed against the code that sets them.
 */
typedef enum {
	SRUN_JOB_INIT = 0,	/* Job's initial state */
	SRUN_JOB_LAUNCHING,	/* Launch thread is running */
	SRUN_JOB_STARTING,	/* Launch thread is complete */
	SRUN_JOB_RUNNING,	/* Launch thread complete */
	SRUN_JOB_CANCELLED,	/* CTRL-C cancelled */
} srun_job_state_t;
| |
/*
 * Routing mode of a job's standard I/O stream (used as fname_t.type).
 */
enum io_t {
	IO_ALL = 0,	/* multiplex output from all/bcast stdin to all */
	IO_ONE = 1,	/* output from only one task/stdin to one task */
	IO_PER_TASK = 2,	/* separate output/input file per task */
	IO_NONE = 3,	/* close output/close stdin */
};

/*
 * Map an io_t value to a printable string literal:
 *   IO_ONE -> "one", IO_ALL -> "all", anything else -> "per task".
 *
 * The argument and the whole expansion are parenthesized so the macro can
 * be used safely with compound arguments (e.g. a ?: expression) and inside
 * larger expressions (e.g. indexing or pointer arithmetic on the result).
 *
 * The argument is evaluated up to two times; do not pass expressions with
 * side effects.
 *
 * NOTE(review): IO_NONE has no string of its own and is reported as
 * "per task", exactly as before; confirm whether that is intended.
 */
#define format_io_t(t) \
	(((t) == IO_ONE) ? "one" : (((t) == IO_ALL) ? "all" : "per task"))
| |
| typedef struct fname { |
| char *name; |
| enum io_t type; |
| int taskid; /* taskid for IO if IO_ONE */ |
| } fname_t; |
| |
| typedef struct srun_job { |
| slurm_step_id_t step_id; /* assigned step id */ |
| uint32_t het_job_node_offset; /* Hetjob node offset or NO_VAL */ |
| uint32_t het_job_id; /* Hetjob leader or NO_VAL */ |
| char *het_job_node_list; /* node list for combined hetjob */ |
| uint32_t het_job_nnodes; /* total node count for entire hetjob */ |
| uint32_t het_job_ntasks; /* total task count for entire hetjob */ |
| uint32_t het_job_offset; /* Hetjob offset or NO_VAL */ |
| uint32_t *het_job_step_task_cnts; /* ntasks on each comp. of hetjob */ |
| uint32_t het_job_task_offset; /* Hetjob task offset or NO_VAL */ |
| uint16_t *het_job_task_cnts; /* tasks invoked on each node of hetjob */ |
| uint32_t **het_job_tids; /* Task IDs on each node of hetjob */ |
| uint32_t *het_job_tid_offsets;/* map of tasks (by id) to originating |
| * hetjob */ |
| |
| char *container; /* OCI container bundle path */ |
| uint32_t cpu_count; /* allocated CPUs */ |
| uint32_t nhosts; /* node count */ |
| uint32_t ntasks; /* task count */ |
| uint16_t ntasks_per_board;/* number of tasks to invoke on each board */ |
| uint16_t ntasks_per_core; /* number of tasks to invoke on each core */ |
| uint16_t ntasks_per_tres; /* number of tasks that can access each gpu */ |
| uint16_t ntasks_per_socket;/* number of tasks to invoke on |
| * each socket */ |
| |
| srun_job_state_t state; /* job state */ |
| pthread_mutex_t state_mutex; |
| pthread_cond_t state_cond; |
| |
| int rc; /* srun return code */ |
| |
| char **env; /* hetjob specific environment */ |
| char *nodelist; /* nodelist in string form */ |
| char *partition; /* name of partition running job */ |
| |
| fname_t *ifname; |
| fname_t *ofname; |
| fname_t *efname; |
| |
| /* Pseudo terminal support */ |
| int input_fd; |
| int pty_fd; /* file to communicate window size changes */ |
| uint16_t pty_port; /* used to communicate window size changes */ |
| uint16_t ws_col; /* window size, columns */ |
| uint16_t ws_row; /* window size, row count */ |
| slurm_step_ctx_t *step_ctx; |
| char *account; /* account of this job */ |
| char *qos; /* job's qos */ |
| char *resv_name; /* reservation the job is using */ |
| uid_t uid; /* resolved user id of job */ |
| char *user_name; /* resolved user name of job */ |
| gid_t gid; /* resolved group id of job */ |
| char *group_name; /* resolved group name of job */ |
| } srun_job_t; |
| |
| void update_job_state(srun_job_t *job, srun_job_state_t newstate); |
| void job_force_termination(srun_job_t *job); |
| |
| srun_job_state_t job_state(srun_job_t *job); |
| |
| extern srun_job_t * job_create_noalloc(void); |
| |
| /* |
| * Create an srun job structure for a step w/out an allocation response msg. |
| * (i.e. inside an allocation) |
| */ |
| extern srun_job_t *job_step_create_allocation( |
| resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local); |
| |
| /* |
| * Create an srun job structure from a resource allocation response msg |
| */ |
| extern srun_job_t *job_create_allocation( |
| resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local); |
| |
| extern void init_srun(int argc, char **argv, log_options_t *logopt, |
| bool handle_signals); |
| |
| extern void create_srun_job(void **p_job, bool *got_alloc); |
| |
| extern void pre_launch_srun_job(srun_job_t *job, slurm_opt_t *opt_local); |
| |
| extern void fini_srun(srun_job_t *job, bool got_alloc, uint32_t *global_rc); |
| |
| /* |
| * Update job filenames and modes for stderr, stdout, and stdin. |
| */ |
| extern void job_update_io_fnames(srun_job_t *job, slurm_opt_t *opt_local); |
| |
| /* Set up port to handle messages from slurmctld */ |
| int slurmctld_msg_init(void); |
| |
| #endif /* !_HAVE_JOB_H */ |