blob: 8e37429500b5fbbcb9a98f6ab28387f636da2728 [file] [log] [blame] [edit]
/*****************************************************************************\
* src/srun/srun_job.h - specification of an srun "job"
* $Id$
*****************************************************************************
* Copyright (C) 2002 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <mgrondona@llnl.gov>.
* UCRL-CODE-217948.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _HAVE_JOB_H
#define _HAVE_JOB_H
#ifdef WITH_PTHREADS
# include <pthread.h>
#endif
#include <netinet/in.h>
#include <slurm/slurm.h>
#include "src/common/eio.h"
#include "src/common/cbuf.h"
#include "src/common/macros.h"
#include "src/common/node_select.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/dist_tasks.h"
//#include "src/common/global_srun.h"
#include "src/srun/signals.h"
#include "src/srun/fname.h"
/*
 * Tags for the kinds of updates srun distinguishes (job/task/host state,
 * task exit code, signal notice, MPIR debugger data, step layout, node
 * failure).  The numeric values are written out explicitly; they are the
 * same values the compiler would assign implicitly.
 * NOTE(review): presumably used as the message type on the forked message
 * pipes (see forked_msg_pipe_t) -- confirm against the reader/writer code.
 */
typedef enum {
	PIPE_NONE                  = 0,
	PIPE_JOB_STATE             = 1,
	PIPE_TASK_STATE            = 2,
	PIPE_TASK_EXITCODE         = 3,
	PIPE_HOST_STATE            = 4,
	PIPE_SIGNALED              = 5,
	PIPE_MPIR_DEBUG_STATE      = 6,
	PIPE_UPDATE_MPIR_PROCTABLE = 7,
	PIPE_UPDATE_STEP_LAYOUT    = 8,
	PIPE_NODE_FAIL             = 9
} pipe_enum_t;
/*
 * Life-cycle states of an srun job.  Values are listed explicitly and are
 * identical to the implicit numbering (0..11 in declaration order).
 */
typedef enum {
	SRUN_JOB_INIT          = 0,  /* initial state of a job                  */
	SRUN_JOB_LAUNCHING     = 1,  /* launch thread is running                */
	SRUN_JOB_STARTING      = 2,  /* launch thread is complete               */
	SRUN_JOB_RUNNING       = 3,  /* launch thread complete                  */
	SRUN_JOB_TERMINATING   = 4,  /* entered once the first task terminates  */
	SRUN_JOB_TERMINATED    = 5,  /* all tasks terminated (may still have IO)*/
	SRUN_JOB_WAITING_ON_IO = 6,  /* all tasks terminated; waiting for IO    */
	SRUN_JOB_DONE          = 7,  /* tasks and IO are both complete          */
	SRUN_JOB_DETACHED      = 8,  /* IO detached from job (not used now)     */
	SRUN_JOB_FAILED        = 9,  /* job failed for some reason              */
	SRUN_JOB_CANCELLED     = 10, /* cancelled by CTRL-C                     */
	SRUN_JOB_FORCETERM     = 11  /* forced termination of the IO thread     */
} srun_job_state_t;
/*
 * Per-host state as tracked by srun.  Values are spelled out explicitly
 * and match the implicit numbering of the declaration order.
 * NOTE(review): per-state meanings below are read off the names only --
 * confirm against the code that sets them.
 */
typedef enum {
	SRUN_HOST_INIT        = 0,  /* initial state                         */
	SRUN_HOST_CONTACTED   = 1,  /* presumably: request sent to the host  */
	SRUN_HOST_UNREACHABLE = 2,  /* presumably: host could not be reached */
	SRUN_HOST_REPLIED     = 3   /* presumably: host answered             */
} srun_host_state_t;
/*
 * Per-task state as tracked by srun.  Values are spelled out explicitly
 * and match the implicit numbering of the declaration order.
 * NOTE(review): apart from SRUN_TASK_IO_WAIT, the meanings are read off
 * the names only -- confirm against the code that sets them.
 */
typedef enum {
	SRUN_TASK_INIT          = 0,
	SRUN_TASK_RUNNING       = 1,
	SRUN_TASK_FAILED        = 2,
	SRUN_TASK_IO_WAIT       = 3,  /* deprecated with the new eio stdio engine */
	SRUN_TASK_EXITED        = 4,
	SRUN_TASK_ABNORMAL_EXIT = 5
} srun_task_state_t;
/*
 * For Message thread: a pair of descriptors together with a process id.
 */
typedef struct forked_msg_pipe {
int msg_pipe[2]; /* two descriptors; presumably a pipe(2) pair -- confirm */
int pid; /* process id; presumably of the forked peer -- confirm */
} forked_msg_pipe_t;
/*
 * Groups two forked_msg_pipe_t channels with a pointer to a job state.
 * NOTE(review): the names suggest par_msg is the parent -> message-thread
 * direction and msg_par the reverse -- confirm against the users.
 */
typedef struct forked_message {
forked_msg_pipe_t * par_msg;
forked_msg_pipe_t * msg_par;
enum job_states * job_state; /* note: enum job_states (declared outside
* this header), not srun_job_state_t */
} forked_msg_t;
/*
 * srun's view of one job / job step: identity, node and task counts,
 * overall state, thread ids, stdio (eio) plumbing, per-host and per-task
 * state arrays and I/O file names.
 *
 * NOTE(review): pthread_mutex_t, pthread_cond_t and pthread_t are used
 * here unconditionally, while this header includes <pthread.h> only when
 * WITH_PTHREADS is defined -- presumably another included header provides
 * the types; confirm.
 */
typedef struct srun_job {
slurm_step_layout_t *step_layout; /* holds info about how the tasks are
* laid out */
uint32_t jobid; /* assigned job id */
uint32_t stepid; /* assigned step id */
bool old_job; /* run job step under previous allocation */
bool removed; /* job has been removed from SLURM */
uint32_t nhosts; /* node count */
uint32_t ntasks; /* task count */
srun_job_state_t state; /* job state */
pthread_mutex_t state_mutex; /* presumably guards `state` -- confirm */
pthread_cond_t state_cond; /* presumably signalled on `state` changes
* -- confirm */
bool signaled; /* True if user generated signal to job */
int rc; /* srun return code */
slurm_cred_t cred; /* Slurm job credential */
char *nodelist; /* nodelist in string form */
slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */
pthread_t sigid; /* signals thread tid */
pthread_t jtid; /* job control thread id */
slurm_fd *jfd; /* job control info fd */
pthread_t ioid; /* stdio thread id */
int *listensock; /* Array of stdio listen sockets */
eio_handle_t *eio; /* Event IO handle */
int ioservers_ready; /* Number of servers that established contact */
eio_obj_t **ioserver; /* Array of nhosts pointers to eio_obj_t */
eio_obj_t *stdin_obj; /* stdin eio_obj_t */
eio_obj_t *stdout_obj; /* stdout eio_obj_t */
eio_obj_t *stderr_obj; /* stderr eio_obj_t */
List free_incoming; /* List of free struct io_buf * for incoming
* traffic. "incoming" means traffic from srun
* to the tasks.
*/
List free_outgoing; /* List of free struct io_buf * for outgoing
* traffic. "outgoing" means traffic from the
* tasks to srun.
*/
int incoming_count; /* Count of total incoming message buffers
* including free_incoming buffers and
* buffers in use.
*/
int outgoing_count; /* Count of total outgoing message buffers
* including free_outgoing buffers and
* buffers in use.
*/
pthread_t lid; /* launch thread id */
time_t ltimeout; /* Time by which all tasks must be running */
time_t etimeout; /* exit timeout (see opt.max_wait) */
srun_host_state_t *host_state; /* nhosts host states */
int *tstatus; /* ntasks exit statuses */
srun_task_state_t *task_state; /* ntasks task states */
switch_jobinfo_t switch_job; /* switch job info (type declared outside
* this header) */
io_filename_t *ifname; /* presumably stdin file name -- confirm */
io_filename_t *ofname; /* presumably stdout file name -- confirm */
io_filename_t *efname; /* presumably stderr file name -- confirm */
forked_msg_t *forked_msg; /* message-thread pipes, see forked_msg_t */
char *task_epilog; /* task-epilog */
char *task_prolog; /* task-prolog */
pthread_mutex_t task_mutex; /* NOTE(review): what this protects is not
* visible in this header */
int njfds; /* number of job control info fds */
slurm_addr *jaddr; /* job control info ports */
int num_listen; /* Number of stdio listen sockets */
int *listenport; /* Array of stdio listen ports */
int thr_count; /* count of threads in job launch */
/* (historical heading here read "Output streams and stdin fileno";
* no such fields follow) */
select_jobinfo_t select_jobinfo; /* node-select job info (type declared
* outside this header) */
} srun_job_t;
/*
 * Global defined outside this header.
 * NOTE(review): presumably a flag related to the message thread -- its
 * exact meaning is not visible here; confirm at the definition.
 */
extern int message_thread;
/*
 * Set the state of `job` to `newstate`.
 * NOTE(review): presumably serialized via job->state_mutex -- confirm.
 */
void update_job_state(srun_job_t *job, srun_job_state_t newstate);
/*
 * Force termination of `job`.
 * NOTE(review): presumably moves the job to SRUN_JOB_FORCETERM -- confirm.
 */
void job_force_termination(srun_job_t *job);
/*
 * Return the current srun_job_state_t of `job`.
 */
srun_job_state_t job_state(srun_job_t *job);
/*
 * Constructors returning a srun_job_t pointer.
 *
 * job_create_noalloc() takes no arguments; the other three build the job
 * from a resource_allocation_response_msg_t.
 * NOTE(review): ownership of the returned pointer and the failure value
 * (presumably NULL) are not visible in this header -- confirm at the
 * definitions.
 */
extern srun_job_t * job_create_noalloc(void);
extern srun_job_t *job_step_create_allocation(
resource_allocation_response_msg_t *resp);
extern srun_job_t * job_create_allocation(
resource_allocation_response_msg_t *resp);
extern srun_job_t * job_create_structure(
resource_allocation_response_msg_t *resp);
/*
 * Update job filenames and modes for stderr, stdout, and stdin.
 * j IN/OUT - job whose I/O file names are updated
 */
void job_update_io_fnames(srun_job_t *j);
/*
 * Issue a fatal error message and terminate the running job.
 * job IN - job to terminate
 * msg IN - text of the error message
 */
void job_fatal(srun_job_t *job, const char *msg);
/*
 * Deallocate the job and/or job step via the slurm API.
 * job IN - job to deallocate
 * error IN - NOTE(review): meaning not visible here; presumably an error
 *            code describing why the job is being destroyed -- confirm
 */
void srun_job_destroy(srun_job_t *job, int error);
/*
 * Send SIGKILL to the running job via the slurm controller.
 */
void srun_job_kill(srun_job_t *job);
/*
 * Report current task status.
 */
void report_task_status(srun_job_t *job);
/*
 * Report current node status.
 * NOTE(review): the function is named report_job_status while the
 * original comment says "node status" -- confirm which one it reports.
 */
void report_job_status(srun_job_t *job);
/*
 * Return the job's return code (used as the srun exit status).
 */
int job_rc(srun_job_t *job);
/*
 * To run a job step on an existing allocation, modify the
 * existing_allocation() response to remove nodes as needed
 * for the job step request (for --excluded nodes or a reduced
 * --nodes count). This is a temporary fix for slurm 0.2.
 *
 * resp IN/OUT - existing_allocation() response message
 * RET - zero or fatal error code
 */
int job_resp_hack_for_step(resource_allocation_response_msg_t *resp);
#endif /* !_HAVE_JOB_H */