blob: 1f9d5d1aa87902ed24f51f6e6f0b564f87e2b70b [file] [log] [blame]
/*****************************************************************************\
* src/api/step_io.h - job-step client-side I/O routines
*****************************************************************************
* Copyright (C) 2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Christopher J. Morrone <morrone2@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _HAVE_STEP_IO_H
#define _HAVE_STEP_IO_H
#include <pthread.h>
#include <stdint.h>
#include "slurm/slurm.h"
#include "src/common/bitstring.h"
#include "src/common/eio.h"
#include "src/common/list.h"
#include "src/common/slurm_step_layout.h"
/* Forward declaration: only a pointer to this type is stored in client_io_t. */
struct step_launch_state;
/*
 * Client-side stdio handling state for one job step.
 *
 * The first group of fields mirrors the arguments of
 * client_io_handler_create(); the remaining fields are internal bookkeeping
 * for the I/O thread, the listening sockets, the per-node I/O server objects
 * and the pools of message buffers.
 */
typedef struct {
/* input parameters - set (indirectly) by user */
int num_tasks;
int num_nodes;
bool label;
int taskid_width; /* characters needed for task_id label */
uint32_t het_job_offset; /* offset within a hetjob or NO_VAL */
uint32_t het_job_task_offset; /* task offset within a hetjob or
* NO_VAL */
char *io_key; /* NOTE(review): presumably the key used to authenticate
* stdio connections - confirm against step_io.c */
/* internal variables */
bool io_running; /* I/O thread running */
pthread_cond_t io_cond; /* I/O thread state conditional */
pthread_mutex_t io_mutex; /* I/O thread state mutex */
int num_listen; /* Number of stdio listen sockets */
int *listensock; /* Array of stdio listen sockets */
uint16_t *listenport; /* Array of stdio listen port numbers */
eio_handle_t *eio; /* Event IO handle for stdio traffic */
pthread_mutex_t ioservers_lock; /* This lock protects
* ioservers_ready_bits, ioservers_ready,
* the pointers in ioserver, all the
* msg_queues in each ioserver's
* server_io_info, and the free_incoming
* list. The queues are used both for
* normal writes and for writes that
* verify a connection to a remote host. */
bitstr_t *ioservers_ready_bits; /* length "num_nodes" */
int ioservers_ready; /* Number of servers that established contact */
eio_obj_t **ioserver; /* Array of pointers to eio_obj_t, one per host
* (presumably num_nodes entries - confirm) */
eio_obj_t *stdin_obj;
eio_obj_t *stdout_obj;
eio_obj_t *stderr_obj;
list_t *free_incoming; /* List of free struct io_buf * for incoming
* traffic. "incoming" means traffic from the
* client to the tasks.
*/
list_t *free_outgoing; /* List of free struct io_buf * for outgoing
* traffic. "outgoing" means traffic from the
* tasks to the client.
*/
int incoming_count; /* Count of total incoming message buffers
* including free_incoming buffers and
* buffers in use.
*/
int outgoing_count; /* Count of total outgoing message buffers
* including free_outgoing buffers and
* buffers in use.
*/
struct step_launch_state *sls; /* Used to notify the main thread of an
* I/O problem. */
} client_io_t;
/*
 * Create a client IO handler for a job step.
 *
 * IN fds - stdio file descriptor description supplied by the caller
 * IN num_tasks - number of tasks in the step
 * IN num_nodes - number of nodes in the step
 * IN io_key - stored as the handle's io_key (NOTE(review): whether the
 *             string is copied or merely referenced is not visible in this
 *             header - confirm)
 * IN label - stored as the handle's "label" flag (presumably enables
 *            task-id labels on output; see taskid_width - confirm)
 * IN het_job_offset - offset within a hetjob or NO_VAL
 * IN het_job_task_offset - task offset within a hetjob or NO_VAL
 * RET pointer to the new client_io_t handle (presumably released with
 *     client_io_handler_destroy() - confirm)
 */
client_io_t *client_io_handler_create(slurm_step_io_fds_t fds, int num_tasks,
int num_nodes, char *io_key,
bool label, uint32_t het_job_offset,
uint32_t het_job_task_offset);
/*
 * Start the client IO handler.
 *
 * NOTE(review): presumably launches the I/O thread whose state is tracked by
 * cio->io_running / io_cond / io_mutex - confirm against step_io.c.
 *
 * IN cio - the client_io_t handle
 */
extern void client_io_handler_start(client_io_t *cio);
/*
 * Tell the client IO handler that a set of remote nodes is now considered
 * "down", and that no further communication from those nodes should be
 * expected. This will prevent the IO handler from blocking indefinitely
 * while it waits for a node to phone home.
 *
 * IN cio - the client_io_t handle
 * IN node_ids - an array of integers, each the ID of a node
 *               within the job step.
 * IN num_node_ids - the length of the node_ids array
 */
void client_io_handler_downnodes(client_io_t *cio,
const int *node_ids, int num_node_ids);
/*
 * Tell the client IO handler to test the communication path to a
 * node suspected to be down by sending a message, which will be
 * ignored by the slurmstepd. If the write fails, the step_launch_state
 * will be notified.
 *
 * IN cio - the client_io_t handle
 * IN node_id - ID of the node within the job step to test
 * OUT sent_message - NOTE(review): presumably set to true when a test
 *                    message was actually queued for the node - confirm
 * RET NOTE(review): presumably SLURM_SUCCESS or an error code - confirm
 */
int client_io_handler_send_test_message(client_io_t *cio, int node_id,
bool *sent_message);
/*
 * Tell the client IO handler that the step has been aborted, and that
 * any slurmstepds which have not yet established IO connections should
 * not be expected to ever make a connection.
 *
 * Calling this when an error occurs will prevent client_io_handler_finish()
 * from blocking indefinitely.
 *
 * WARNING: This WILL abandon live IO connections.
 *
 * IN cio - the client_io_t handle
 */
void client_io_handler_abort(client_io_t *cio);
/*
 * Finish client IO handling for the step.
 *
 * This call may block; client_io_handler_abort() and
 * client_io_handler_downnodes() exist to keep it from blocking indefinitely
 * on nodes that will never connect.
 *
 * IN cio - the client_io_t handle
 */
extern void client_io_handler_finish(client_io_t *cio);
/*
 * Destroy a client IO handler.
 *
 * NOTE(review): presumably releases all resources owned by the handle and
 * must only be called after client_io_handler_finish() - confirm.
 *
 * IN cio - the client_io_t handle
 */
void client_io_handler_destroy(client_io_t *cio);
#endif /* !_HAVE_STEP_IO_H */