blob: 4abe457edd2d92e555f1f09bba6c85231ca165eb [file] [log] [blame]
/*****************************************************************************\
* signal.c - Send a signal to a slurm job or job step
* $Id$
*****************************************************************************
* Copyright (C) 2005 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Christopher J. Morrone <morrone2@llnl.gov>.
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>
#include "src/common/xmalloc.h"
#include "src/common/hostlist.h"
#include "src/common/read_config.h"
#include "src/common/macros.h"
#include "src/common/slurm_protocol_api.h"
/*
 * Forward declarations of the file-local helpers defined further down.
 */
/* Send one RPC of the given type to the nodes in nodelist and fold the
 * per-node return codes into a single result. */
static int _local_send_recv_rc_msgs(const char *nodelist,
slurm_msg_type_t type,
void *data);
/* Send REQUEST_SIGNAL_TASKS for one job step to the allocation's node list. */
static int _signal_job_step(
const job_step_info_t *step,
const resource_allocation_response_msg_t *allocation,
uint16_t signal);
/* Send REQUEST_SIGNAL_TASKS for the batch script step to the first host
 * of the allocation's node list. */
static int _signal_batch_script_step(
const resource_allocation_response_msg_t *allocation, uint16_t signal);
/* Send REQUEST_TERMINATE_TASKS for one job step to the allocation's
 * node list. */
static int _terminate_job_step(const job_step_info_t *step,
const resource_allocation_response_msg_t *allocation);
/* Send REQUEST_TERMINATE_TASKS for the batch script step to the first host
 * of the allocation's node list. */
static int _terminate_batch_script_step(
const resource_allocation_response_msg_t *allocation);
/*
 * slurm_signal_job - deliver a signal to every step of an existing job
 * IN job_id - id of the job to signal
 * IN signal - number of the signal to deliver
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
extern int
slurm_signal_job (uint32_t job_id, uint16_t signal)
{
	resource_allocation_response_msg_t *job_alloc = NULL;
	signal_job_msg_t sig_req;
	int status;

	if (slurm_allocation_lookup_lite(job_id, &job_alloc) != 0) {
		/* lookup failed: report the error it left behind */
		status = slurm_get_errno();
	} else {
		/* the very same request goes to each node of the job */
		sig_req.job_id = job_id;
		sig_req.signal = (uint32_t)signal;
		status = _local_send_recv_rc_msgs(job_alloc->node_list,
						  REQUEST_SIGNAL_JOB,
						  &sig_req);
		slurm_free_resource_allocation_response_msg(job_alloc);
	}

	if (!status)
		return SLURM_SUCCESS;

	/* slurm_seterrno_ret() is relied upon to set errno and return */
	slurm_seterrno_ret(status);
}
/*
 * slurm_signal_job_step - send the specified signal to an existing job step
 * IN job_id  - the job's id
 * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id
 *              to send a signal to a job's batch script
 * IN signal  - signal number
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 *
 * NOTE: if the step list obtained from slurm_get_job_steps() contains no
 * entry matching job_id/step_id, nothing is sent and 0 is returned.
 */
extern int
slurm_signal_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
{
	resource_allocation_response_msg_t *alloc_info = NULL;
	job_step_info_response_msg_t *step_info = NULL;
	int rc;
	int i;
	int save_errno = 0;

	if (slurm_allocation_lookup_lite(job_id, &alloc_info)) {
		return -1;
	}

	/*
	 * The controller won't give us info about the batch script job step,
	 * so we need to handle that separately.
	 */
	if (step_id == SLURM_BATCH_SCRIPT) {
		rc = _signal_batch_script_step(alloc_info, signal);
		slurm_free_resource_allocation_response_msg(alloc_info);
		errno = rc;
		return rc ? -1 : 0;
	}

	/*
	 * Otherwise, look through the list of job step info and find
	 * the one matching step_id.  Signal that step.
	 */
	rc = slurm_get_job_steps((time_t)0, job_id, step_id,
				 &step_info, SHOW_ALL);
	if (rc != 0) {
		save_errno = errno;
		goto fail;
	}
	for (i = 0; i < step_info->job_step_count; i++) {
		if (step_info->job_steps[i].job_id == job_id
		    && step_info->job_steps[i].step_id == step_id) {
			rc = _signal_job_step(&step_info->job_steps[i],
					      alloc_info, signal);
			/*
			 * _signal_job_step() hands back the error code as
			 * its return value and does not set errno itself,
			 * so the code to report is rc (same convention as
			 * the batch script path above).
			 */
			save_errno = rc;
			break;
		}
	}
	slurm_free_job_step_info_response_msg(step_info);
fail:
	slurm_free_resource_allocation_response_msg(alloc_info);
	/* restore the error after the free calls, which may clobber errno */
	errno = save_errno;
	return rc ? -1 : 0;
}
/*
 * Send one RPC to the nodes named in nodelist and collapse the replies
 * into a single return code.
 *
 * IN nodelist - node list expression handed to slurm_send_recv_msgs()
 * IN type     - message type of the request
 * IN data     - request payload; owned by the caller (callers in this file
 *               pass the address of a local variable)
 * RET 0 if every reply carried a zero return code, otherwise the last
 *     non-zero return code seen, or SLURM_ERROR if no reply list came back
 */
static int
_local_send_recv_rc_msgs(const char *nodelist, slurm_msg_type_t type,
			 void *data)
{
	List ret_list = NULL;
	ListIterator itr;
	int temp_rc = 0, rc = 0;
	ret_data_info_t *ret_data_info = NULL;
	slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t));

	slurm_msg_t_init(msg);
	msg->msg_type = type;
	msg->data = data;

	if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0))) {
		/*
		 * Walk the replies in place instead of popping them:
		 * popped entries were never freed and neither was the list.
		 * Leaving the entries on the list lets list_destroy()
		 * release them (assuming the list was created with an
		 * element destructor) together with the list itself.
		 */
		itr = list_iterator_create(ret_list);
		while ((ret_data_info = list_next(itr))) {
			temp_rc = slurm_get_return_code(ret_data_info->type,
							ret_data_info->data);
			if (temp_rc)
				rc = temp_rc;
		}
		list_iterator_destroy(itr);
		list_destroy(ret_list);
	} else {
		error("slurm_signal_job: no list was returned");
		rc = SLURM_ERROR;
	}

	slurm_free_msg(msg);
	return rc;
}
/*
 * Send a REQUEST_SIGNAL_TASKS rpc for one job step to the nodes of the
 * allocation.
 *
 * IN step       - job step to signal (supplies the job id and step id)
 * IN allocation - allocation whose node_list receives the request
 * IN signal     - signal number to deliver
 * RET the return code produced by _local_send_recv_rc_msgs()
 */
static int
_signal_job_step(const job_step_info_t *step,
		 const resource_allocation_response_msg_t *allocation,
		 uint16_t signal)
{
	kill_tasks_msg_t kill_req;

	/* one identical request is delivered to each node */
	kill_req.job_id      = step->job_id;
	kill_req.job_step_id = step->step_id;
	kill_req.signal      = (uint32_t)signal;

	return _local_send_recv_rc_msgs(allocation->node_list,
					REQUEST_SIGNAL_TASKS, &kill_req);
}
/*
 * Send a REQUEST_SIGNAL_TASKS rpc for the batch script step
 * (SLURM_BATCH_SCRIPT) to the first host of the allocation's node list.
 *
 * IN allocation - allocation supplying the job id and node list
 * IN signal     - signal number to deliver
 * RET the return code filled in by slurm_send_recv_rc_msg_only_one(),
 *     or -1 if the host name or address could not be determined or the
 *     message exchange failed
 */
static int _signal_batch_script_step(
	const resource_allocation_response_msg_t *allocation, uint16_t signal)
{
	slurm_msg_t req;
	kill_tasks_msg_t kill_req;
	int node_rc = SLURM_SUCCESS;
	int addr_rc;
	char *first_host = nodelist_nth_host(allocation->node_list, 0);

	if (first_host == NULL) {
		error("_signal_batch_script_step: "
		      "can't get the first name out of %s",
		      allocation->node_list);
		return -1;
	}

	kill_req.job_id      = allocation->job_id;
	kill_req.job_step_id = SLURM_BATCH_SCRIPT;
	kill_req.signal      = (uint32_t)signal;

	slurm_msg_t_init(&req);
	req.msg_type = REQUEST_SIGNAL_TASKS;
	req.data     = &kill_req;

	/* the host name is only needed to resolve (and report on) the address */
	addr_rc = slurm_conf_get_addr(first_host, &req.address);
	if (addr_rc == SLURM_ERROR) {
		error("_signal_batch_script_step: "
		      "can't get address for "
		      "host %s", first_host);
	}
	free(first_host);
	if (addr_rc == SLURM_ERROR)
		return -1;

	if (slurm_send_recv_rc_msg_only_one(&req, &node_rc, 0) >= 0)
		return node_rc;

	error("_signal_batch_script_step: %m");
	return -1;
}
/*
 * slurm_terminate_job - terminates all steps of an existing job by sending
 *	a REQUEST_TERMINATE_JOB rpc to all slurmd in the job allocation,
 *	and then calls slurm_complete_job().
 * IN job_id - the job's id
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 *
 * NOTE: slurm_complete_job() is only called when the allocation lookup
 * succeeded, and its return value is not examined.
 */
extern int
slurm_terminate_job (uint32_t job_id)
{
	int rc = SLURM_SUCCESS;
	resource_allocation_response_msg_t *alloc_info = NULL;
	signal_job_msg_t rpc;

	if (slurm_allocation_lookup_lite(job_id, &alloc_info)) {
		rc = slurm_get_errno();
		goto fail1;
	}

	/* same remote procedure call for each node */
	rpc.job_id = job_id;
	rpc.signal = (uint32_t)-1; /* not used by slurmd */
	rc = _local_send_recv_rc_msgs(alloc_info->node_list,
				      REQUEST_TERMINATE_JOB, &rpc);

	slurm_free_resource_allocation_response_msg(alloc_info);

	slurm_complete_job(job_id, 0);
fail1:
	if (rc) {
		/*
		 * slurm_seterrno_ret() sets errno and returns from this
		 * function itself (slurm_signal_job relies on the same
		 * behaviour), so no further return statement is needed.
		 */
		slurm_seterrno_ret(rc);
	} else {
		return SLURM_SUCCESS;
	}
}
/*
 * slurm_terminate_job_step - terminates a job step by sending a
 * 	REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step.
 * IN job_id  - the job's id
 * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id
 *              to terminate a job's batch script
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 *
 * NOTE: if the step list obtained from slurm_get_job_steps() contains no
 * entry matching job_id/step_id, nothing is sent and 0 is returned.
 */
extern int
slurm_terminate_job_step (uint32_t job_id, uint32_t step_id)
{
	resource_allocation_response_msg_t *alloc_info = NULL;
	job_step_info_response_msg_t *step_info = NULL;
	int rc = 0;
	int i;
	int save_errno = 0;

	if (slurm_allocation_lookup_lite(job_id, &alloc_info)) {
		return -1;
	}

	/*
	 * The controller won't give us info about the batch script job step,
	 * so we need to handle that separately.
	 */
	if (step_id == SLURM_BATCH_SCRIPT) {
		rc = _terminate_batch_script_step(alloc_info);
		slurm_free_resource_allocation_response_msg(alloc_info);
		errno = rc;
		return rc ? -1 : 0;
	}

	/*
	 * Otherwise, look through the list of job step info and find
	 * the one matching step_id.  Terminate that step.
	 */
	rc = slurm_get_job_steps((time_t)0, job_id, step_id,
				 &step_info, SHOW_ALL);
	if (rc != 0) {
		save_errno = errno;
		goto fail;
	}
	for (i = 0; i < step_info->job_step_count; i++) {
		if (step_info->job_steps[i].job_id == job_id
		    && step_info->job_steps[i].step_id == step_id) {
			rc = _terminate_job_step(&step_info->job_steps[i],
						 alloc_info);
			/*
			 * The helper reports the error code through its
			 * return value rather than through errno, so rc is
			 * what must be published (same convention as the
			 * batch script path above).
			 */
			save_errno = rc;
			break;
		}
	}
	slurm_free_job_step_info_response_msg(step_info);
fail:
	slurm_free_resource_allocation_response_msg(alloc_info);
	/* restore the error after the free calls, which may clobber errno */
	errno = save_errno;
	return rc ? -1 : 0;
}
/*
 * Send a REQUEST_TERMINATE_TASKS rpc to all nodes in a job step.
 *
 * IN step       - job step to terminate (supplies the job id and step id)
 * IN allocation - allocation whose node_list receives the request
 * RET 0 if the request succeeded or the result was ESLURM_ALREADY_DONE;
 *     otherwise the non-zero return code produced by
 *     _local_send_recv_rc_msgs().
 */
static int
_terminate_job_step(const job_step_info_t *step,
		    const resource_allocation_response_msg_t *allocation)
{
	kill_tasks_msg_t rpc;
	int rc = SLURM_SUCCESS;

	/*
	 *  Send REQUEST_TERMINATE_TASKS to all nodes of the step
	 */
	rpc.job_id      = step->job_id;
	rpc.job_step_id = step->step_id;
	rpc.signal      = (uint32_t)-1; /* not used by slurmd */
	rc = _local_send_recv_rc_msgs(allocation->node_list,
				      REQUEST_TERMINATE_TASKS, &rpc);

	/*
	 * A step that is already finished is not a failure.
	 * _local_send_recv_rc_msgs() returns the error code itself, so
	 * "already done" normally shows up as rc; the older -1/errno form
	 * is still accepted for compatibility.
	 */
	if ((rc == ESLURM_ALREADY_DONE)
	    || (rc == -1 && errno == ESLURM_ALREADY_DONE)) {
		rc = 0;
		errno = 0;
	}

	return rc;
}
static int _terminate_batch_script_step(
const resource_allocation_response_msg_t *allocation)
{
slurm_msg_t msg;
kill_tasks_msg_t rpc;
int rc = SLURM_SUCCESS;
int i;
char *name = nodelist_nth_host(allocation->node_list, 0);
if(!name) {
error("_signal_batch_script_step: "
"can't get the first name out of %s",
allocation->node_list);
return -1;
}
rpc.job_id = allocation->job_id;
rpc.job_step_id = SLURM_BATCH_SCRIPT;
rpc.signal = (uint32_t)-1; /* not used by slurmd */
slurm_msg_t_init(&msg);
msg.msg_type = REQUEST_TERMINATE_TASKS;
msg.data = &rpc;
if(slurm_conf_get_addr(name, &msg.address) == SLURM_ERROR) {
error("_signal_batch_script_step: "
"can't get address for "
"host %s", name);
free(name);
return -1;
}
free(name);
i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 0);
if (i != 0)
rc = i;
return rc;
}