blob: 103813293af22848b0d4910e6dfbb54950ac5355 [file] [log] [blame]
/*****************************************************************************\
* stepmgr.h - manage the job step information of slurm
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _SLURM_STEP_MGR_H
#define _SLURM_STEP_MGR_H
#include "src/common/id_util.h"
#include "src/common/job_record.h"
#include "src/common/node_conf.h"
#include "src/common/slurm_step_layout.h"
/*
* stepmgr_ops_t - table of data pointers and callbacks handed to the step
* manager. A pointer to one of these is accepted by stepmgr_init() and one is
* exported as the global stepmgr_ops.
*
* NOTE(review): the per-field descriptions below are inferred from the field
* names and types only; confirm them against the code that fills in and
* consumes this structure.
*/
typedef struct {
/* Opaque handle; presumably the accounting database connection */
void *acct_db_conn;
/* Job list (job_step_signal() documents it as the global job list) */
list_t *job_list;
/* Pointer to the time of the last job table update */
time_t *last_job_update;
/* Presumably a bitmap of the nodes that are currently up - TODO confirm */
bitstr_t *up_node_bitmap;
/* Callback taking a job record; exact purpose not visible in this header */
void (*job_config_fini)(job_record_t *job_ptr);
/* Callback returning the job record for a job id */
job_record_t *(*find_job_record)(uint32_t job_id);
/* Callback returning the job record for an array job id / task id pair */
job_record_t *(*find_job_array_rec)(uint32_t array_job_id,
uint32_t array_task_id);
/* Callback handing an agent request off to be queued */
void (*agent_queue_request)(agent_arg_t *agent_arg_ptr);
} stepmgr_ops_t;
/*
* Global pointer to the step manager operations table; defined outside this
* header.
* NOTE(review): presumably assigned by stepmgr_init() - confirm.
*/
extern stepmgr_ops_t *stepmgr_ops;
/*
* stepmgr_init - initialize the step manager
* IN ops - table of data pointers and callbacks for the step manager to use
* NOTE(review): whether ops is copied or only referenced is not visible from
* this header - confirm before freeing or reusing the caller's structure.
*/
extern void stepmgr_init(stepmgr_ops_t *ops);
/*
* delete_step_records - delete the step records of the specified job
* IN job_ptr - pointer to the job table entry whose step records are to be
*	removed
*/
extern void delete_step_records(job_record_t *job_ptr);
/*
* job_step_signal - signal the specified job step
* IN step_id - filled in slurm_step_id_t
* IN signal - signal to send to the job step
* IN flags - RPC flags
* IN uid - user id of user issuing the RPC
* RET 0 on success, otherwise ESLURM error code
* global: job_list - pointer global job list
* last_job_update - time of last job table update
*/
extern int job_step_signal(slurm_step_id_t *step_id,
uint16_t signal, uint16_t flags, uid_t uid);
/*
* step_create - creates a step_record in the given job and sets it up
*	according to the step_specs.
* IN job_ptr - job_ptr to create step in
* IN step_specs - job step specifications
* OUT new_step_record - pointer to the new step_record (NULL on error)
* IN protocol_version - slurm protocol version of client
* OUT err_msg - Custom error message to the user, caller to xfree results
* RET - 0 or error code
* NOTE: don't free the returned step_record because that is managed through
* the job.
*/
extern int step_create(job_record_t *job_ptr,
job_step_create_request_msg_t *step_specs,
step_record_t **new_step_record,
uint16_t protocol_version, char **err_msg);
/*
* step_layout_create - creates a step_layout according to the inputs.
* IN step_ptr - step having tasks laid out
* IN step_node_list - node list of hosts in step
* IN node_count - count of nodes in step allocation
* IN num_tasks - number of tasks in step
* IN cpus_per_task - number of cpus per task
* IN task_dist - type of task distribution
* IN plane_size - size of plane (only needed for the plane distribution)
* RET - NULL or slurm_step_layout_t *
* NOTE: the caller must free the returned step_layout, usually at the time
* the step is freed.
*/
extern slurm_step_layout_t *step_layout_create(step_record_t *step_ptr,
char *step_node_list,
uint32_t node_count,
uint32_t num_tasks,
uint16_t cpus_per_task,
uint32_t task_dist,
uint16_t plane_size);
/*
* kill_step_on_node - determine if the specified job has any job steps
*	allocated to the specified node and kill them unless no_kill flag
*	is set on the step
* IN job_ptr - pointer to an active job record
* IN node_ptr - pointer to a node record
* IN node_fail - true if the removed node has failed
*/
extern void kill_step_on_node(job_record_t *job_ptr, node_record_t *node_ptr,
bool node_fail);
/*
* step_partial_comp - Note the completion of a job step on at least
*	some of its nodes
* IN req - step_completion_msg RPC from slurmstepd
* IN uid - UID issuing the request
* IN finish - If true, and there is no error and rem is 0, finish the step.
* OUT rem - count of nodes for which responses are still pending
* OUT max_rc - highest return code for any step thus far
* RET 0 on success, otherwise ESLURM error code
*/
extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, bool finish,
int *rem, uint32_t *max_rc);
/*
* step_set_alloc_tres - set the tres up when allocating the step.
* Only set when job is running.
* IN step_ptr - step whose allocated tres is being set
* IN node_count - node count to use for the step
* IN assoc_mgr_locked - NOTE(review): presumably true when the caller already
*	holds the assoc_mgr locks - confirm
* IN make_formatted - NOTE(review): presumably requests that the formatted
*	(string) form of the tres also be built - confirm
* NOTE: job write lock must be locked before calling this
*/
extern void step_set_alloc_tres(step_record_t *step_ptr, uint32_t node_count,
bool assoc_mgr_locked, bool make_formatted);
/*
* build_batch_step - Create the batch step and add it to the job.
* IN job_ptr_in - job to add the batch step to
* RET pointer to the batch step record
* NOTE(review): failure behavior (e.g. NULL return) is not visible from this
* header - confirm before relying on it.
*/
extern step_record_t *build_batch_step(job_record_t *job_ptr_in);
/*
* suspend_job_step - Update time stamps for job step suspend
* IN job_ptr - job whose steps are being suspended
*/
extern void suspend_job_step(job_record_t *job_ptr);
/*
* resume_job_step - Update time stamps for job step resume
* IN job_ptr - job whose steps are being resumed
*/
extern void resume_job_step(job_record_t *job_ptr);
/*
* update_step - Process job step update request from specified user
* IN req - the step update request
* IN uid - user id of the user issuing the request
* RET - 0 or error code
*/
extern int update_step(step_update_request_msg_t *req, uid_t uid);
/*
* rebuild_step_bitmaps - Rebuild a job step's core_bitmap_job after a job has
*	just changed size
* IN job_ptr - job that was just re-sized
* IN orig_job_node_bitmap - The job's original node bitmap
*/
extern void rebuild_step_bitmaps(job_record_t *job_ptr,
bitstr_t *orig_job_node_bitmap);
/*
* build_extern_step - Create the extern step and add it to the job.
* IN job_ptr - job to add the extern step to
* RET pointer to the extern step record
* NOTE(review): failure behavior (e.g. NULL return) is not visible from this
* header - confirm before relying on it.
*/
extern step_record_t *build_extern_step(job_record_t *job_ptr);
/*
* build_alias_addrs - build alias_addrs for step_layout
* IN job_ptr - job to build the alias addresses for
* RET pointer to a slurm_node_alias_addrs_t
* NOTE(review): ownership of the returned structure and whether NULL can be
* returned are not visible from this header - confirm.
*/
extern slurm_node_alias_addrs_t *build_alias_addrs(job_record_t *job_ptr);
/*
* job_get_node_inx - Given a full system bitmap, return which set bit (the
*	nth bit set) corresponds to node_name
* IN - node_name - name of node
* IN - node_bitmap - full system bitmap
* RET - index of the node among the set bits
* NOTE(review): the value returned when node_name is unknown or its bit is
* not set is not visible from this header - confirm.
*
* Used when you have a job/step specific array and you want to find the index
* where that node is represented in that array.
*/
extern int job_get_node_inx(char *node_name, bitstr_t *node_bitmap);
/*
* step_create_from_msg - create a job step from a received message
* NOTE(review): this prototype was undocumented; everything below is inferred
* from the parameter names and types - confirm against the implementation.
* IN msg - message carrying the step create request
* IN slurmd_fd - file descriptor; presumably a connection to slurmd
* IN lock_func - callback taking a bool "lock" argument; presumably called
*	with true to acquire and false to release the needed locks
* IN fail_lock_func - callback with the same signature; presumably used on
*	the failure path
* RET - int status; presumably 0 or error code
*/
extern int step_create_from_msg(slurm_msg_t *msg, int slurmd_fd,
void (*lock_func)(bool lock),
void (*fail_lock_func)(bool lock));
/*
* pack_job_step_info_response_msg - packs job step info
* IN/OUT args - pack_step_args_t describing the request and where to pack
*	the result.
* NOTE(review): the inputs listed below are not parameters of this function;
* presumably they are carried as fields of args - confirm against the
* definition of pack_step_args_t:
*	step_id - specific id or NO_VAL/NO_VAL for all
*	uid - user issuing request
*	show_flags - job step filtering options
*	buffer - location to store data, pointers automatically advanced
*	protocol_version - slurm protocol version of client
* RET - 0 or error code
* NOTE: MUST free_buf buffer
*/
extern int pack_job_step_info_response_msg(pack_step_args_t *args);
/*
* stepmgr_get_step_layouts - Return combined layouts of all job_ptr steps
* IN job_ptr - job whose step layouts are requested
* IN step_id - step id for the request
*	NOTE(review): how step_id restricts the set of steps is not visible
*	from this header - confirm
* OUT out_step_layout - set to the resulting step layout
*	NOTE(review): presumably the caller frees it - confirm
* RET - int status; presumably 0 or error code - confirm
*/
extern int stepmgr_get_step_layouts(job_record_t *job_ptr,
slurm_step_id_t *step_id,
slurm_step_layout_t **out_step_layout);
/*
* stepmgr_get_job_sbcast_cred_msg - Create a job_sbcast_cred_msg_t with a
*	sbcast_cred to send back to client
* IN job_ptr - job the credential is created for
* IN step_id - step id for the request
* IN protocol_version - slurm protocol version of client
* OUT out_sbcast_cred_msg - set to the newly created message
*	NOTE(review): presumably the caller frees it - confirm
* RET - int status; presumably 0 or error code - confirm
*/
extern int stepmgr_get_job_sbcast_cred_msg(
job_record_t *job_ptr,
slurm_step_id_t *step_id,
uint16_t protocol_version,
job_sbcast_cred_msg_t **out_sbcast_cred_msg);
/*
* build_job_info_resp - build a resource_allocation_response_msg_t for a job
* IN job_ptr - job to describe in the response
* RET pointer to the response message
* NOTE(review): ownership of the returned message (presumably the caller
* frees it) is not visible from this header - confirm.
*/
extern resource_allocation_response_msg_t *build_job_info_resp(
job_record_t *job_ptr);
#endif /* _SLURM_STEP_MGR_H */