blob: 5ffc466c5255407430708a883d7b01db8395989b [file] [log] [blame]
/*****************************************************************************\
* step_launch.h - launch a parallel job step
*****************************************************************************
* Copyright (C) 2006-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Christopher J. Morrone <morrone2@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _STEP_LAUNCH_H
#define _STEP_LAUNCH_H
#include <pthread.h>
#include <stdint.h>
#include <unistd.h>
#include "slurm/slurm.h"
#include "src/common/bitstring.h"
#include "src/common/eio.h"
#include "src/interfaces/mpi.h"
#include "src/common/slurm_step_layout.h"
#include "src/api/step_io.h"
/*
 * Launch-time state for a job step: task start/exit tracking, I/O
 * connection testing, the message-handling thread, MPI plugin state and
 * the user-registered callbacks.
 */
struct step_launch_state {
/* This lock protects tasks_started, tasks_exited, node_io_error,
 * io_deadline, abort, and abort_action_taken. The main thread
 * blocks on cond, waking when a task starts or exits, or the abort
 * flag is set. */
pthread_mutex_t lock;
pthread_cond_t cond;
/* NOTE(review): presumably the number of tasks in the step, i.e. the
 * size of the task bitstrings below -- confirm in step_launch.c */
int tasks_requested;
bitstr_t *tasks_started; /* or attempted to start, but failed */
bitstr_t *tasks_exited; /* or never started correctly */
bitstr_t *node_io_error; /* set after write or read error */
/* I/O test thread. NOTE(review): io_timeout_thread_created presumably
 * records whether io_timeout_thread holds a valid thread id -- confirm */
pthread_t io_timeout_thread;
bool io_timeout_thread_created;
time_t *io_deadline; /* Holds the time by which a "connection okay"
                      * message must be received. Each entry holds
                      * NO_VAL unless the node is suspected to be
                      * down and is being tested.
                      * NOTE(review): presumably one entry per node
                      * -- confirm */
int io_timeout; /* num seconds between I/O tests */
bool halt_io_test; /* set to true when I/O test thread should
                    * shut down. */
/* Abort flag; setting it wakes the main thread blocked on cond (see the
 * lock comment above). NOTE(review): abort_action_taken presumably
 * records that the abort has already been acted upon -- confirm */
bool abort;
bool abort_action_taken;
uint32_t job_id;
/* message thread variables */
eio_handle_t *msg_handle;
pthread_t msg_thread;
/* set to -1 if step launch message handler should not attempt
 * to handle */
int slurmctld_socket_fd;
/* NOTE(review): num_resp_port is presumably the number of entries in
 * resp_port -- confirm */
uint16_t num_resp_port;
uint16_t *resp_port; /* array of message response ports */
/* io variables */
client_io_t *io;
slurm_step_layout_t *layout; /* a pointer into the ctx
                              * step_resp, do not free */
/* MPI plugin data; mpi_step is a one-element array embedded in this
 * structure, mpi_state is held by pointer */
mpi_step_info_t mpi_step[1];
mpi_plugin_client_state_t *mpi_state;
/* NOTE(review): ret_code semantics are not visible in this header --
 * confirm against step_launch.c */
int ret_code;
/* user registered callbacks */
slurm_step_launch_callbacks_t callback;
};
/* Short name for struct step_launch_state. */
typedef struct step_launch_state step_launch_state_t;
/*
 * step_launch_state_create - create a launch state structure for a
 *	specified step context.
 * IN ctx - the step context the launch state is created for
 * RET the new launch state structure
 * NOTE(review): presumably released with step_launch_state_destroy();
 *	confirm against step_launch.c
 */
extern struct step_launch_state *step_launch_state_create(
	slurm_step_ctx_t *ctx);
/*
 * step_launch_state_alter - if a step's size has changed, update the
 *	launch state structure for a specified step context.
 * IN ctx - the step context whose launch state is updated
 */
extern void step_launch_state_alter(slurm_step_ctx_t *ctx);
/*
 * step_launch_state_destroy - free the memory associated with a launch
 *	state structure.
 * IN sls - the launch state structure to free
 */
extern void step_launch_state_destroy(struct step_launch_state *sls);
/*
 * step_launch_notify_io_failure - notify the step_launch_state that an
 *	I/O connection went bad. If the node is suspected to be down, the
 *	job is aborted.
 * IN sls - the launch state being notified
 * IN node_id - identifies the node whose I/O connection failed
 * RET NOTE(review): return value is not documented in this header;
 *	confirm against step_launch.c
 */
extern int step_launch_notify_io_failure(step_launch_state_t *sls,
					 int node_id);
/*
 * step_launch_clear_questionable_state - clear a node's questionable
 *	flag when the node makes its initial connection, just in case the
 *	node was marked questionable very early in the job step setup.
 * IN sls - the launch state holding the flag
 * IN node_id - identifies the node that has connected
 * RET NOTE(review): return value is not documented in this header;
 *	confirm against step_launch.c
 */
extern int step_launch_clear_questionable_state(step_launch_state_t *sls,
						int node_id);
#endif /* _STEP_LAUNCH_H */