blob: 805770f82de8f16c1732d8af199b5f059d9fa9ce [file] [log] [blame]
/*****************************************************************************\
* suspend.c - job step suspend and resume functions.
*****************************************************************************
* Copyright (C) 2005-2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov> et. al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "slurm/slurm.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
/*
 * _suspend_op - send a REQUEST_SUSPEND message for a job identified by its
 *	numeric ID and collect the resulting return code.
 * IN op - operation to perform (callers in this file pass SUSPEND_JOB or
 *	RESUME_JOB)
 * IN job_id - numeric ID of the job on which to perform the operation
 * RET SLURM_ERROR if slurm_send_recv_controller_rc_msg() fails, otherwise
 *	the return code it delivered (0 on success); that code is also
 *	copied into errno
 */
static int _suspend_op(uint16_t op, uint32_t job_id)
{
	slurm_msg_t msg;
	suspend_msg_t request;
	int status = SLURM_SUCCESS;

	/* The job is named by number only; the string form stays unset. */
	memset(&request, 0, sizeof(request));
	request.job_id_str = NULL;
	request.job_id = job_id;
	request.op = op;

	slurm_msg_t_init(&msg);
	msg.data = &request;
	msg.msg_type = REQUEST_SUSPEND;

	if (slurm_send_recv_controller_rc_msg(&msg, &status,
					      working_cluster_rec) < 0)
		return SLURM_ERROR;

	/* errno mirrors the returned code, including 0 on success. */
	errno = status;
	return status;
}
/*
 * slurm_suspend - suspend execution of a job.
 * IN job_id - numeric ID of the job to suspend
 * RET 0 or a slurm error code (the result of _suspend_op() called with
 *	SUSPEND_JOB)
 */
extern int slurm_suspend(uint32_t job_id)
{
	int rc;

	rc = _suspend_op(SUSPEND_JOB, job_id);
	return rc;
}
/*
 * slurm_resume - resume execution of a previously suspended job.
 * IN job_id - numeric ID of the job to resume
 * RET 0 or a slurm error code (the result of _suspend_op() called with
 *	RESUME_JOB)
 */
extern int slurm_resume(uint32_t job_id)
{
	int rc;

	rc = _suspend_op(RESUME_JOB, job_id);
	return rc;
}
/*
 * _suspend_op2 - perform a suspend/resume operation for a job identified by
 *	a job ID string.
 * IN op - operation to perform
 * IN job_id_str - job on which to perform operation, in string format
 * OUT resp - slurm error codes by job array task ID; written only when the
 *	reply is of type RESPONSE_JOB_ARRAY_ERRORS, otherwise left untouched
 * RET 0 or a slurm error code. A failed message exchange returns the
 *	(negative) result of slurm_send_recv_controller_msg() and leaves errno
 *	as that call left it; a reply of an unexpected type returns
 *	SLURM_ERROR with errno set to SLURM_UNEXPECTED_MSG_ERROR.
 */
static int _suspend_op2(uint16_t op, char *job_id_str,
			job_array_resp_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	suspend_msg_t sus_req;
	slurm_msg_t req_msg, resp_msg;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	memset(&sus_req, 0, sizeof(sus_req));
	sus_req.op = op;
	sus_req.job_id = NO_VAL;	/* job is identified by job_id_str */
	sus_req.job_id_str = job_id_str;
	req_msg.msg_type = REQUEST_SUSPEND;
	req_msg.data = &sus_req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	/*
	 * A failed exchange yields no reply to inspect. Bail out here, as
	 * _suspend_op() does, instead of falling into the switch below and
	 * replacing errno with SLURM_UNEXPECTED_MSG_ERROR.
	 */
	if (rc < 0)
		return rc;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_ARRAY_ERRORS:
		/* The payload is handed to the caller, who must free it. */
		*resp = (job_array_resp_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		/* Only the code is kept; release the reply payload. */
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			errno = rc;
		break;
	default:
		/* Never report success for a reply we cannot interpret. */
		errno = SLURM_UNEXPECTED_MSG_ERROR;
		rc = SLURM_ERROR;
	}

	return rc;
}
/*
 * slurm_suspend2 - suspend execution of a job.
 * IN job_id - job on which to perform operation, in string form
 * OUT resp - per task response to the request,
 *	free using slurm_free_job_array_resp()
 * RET 0 or a slurm error code (the result of _suspend_op2() called with
 *	SUSPEND_JOB)
 */
extern int slurm_suspend2(char *job_id, job_array_resp_msg_t **resp)
{
	int rc;

	rc = _suspend_op2(SUSPEND_JOB, job_id, resp);
	return rc;
}
/*
 * slurm_resume2 - resume execution of a previously suspended job.
 * IN job_id - job on which to perform operation, in string form
 * OUT resp - per task response to the request,
 *	free using slurm_free_job_array_resp()
 * RET 0 or a slurm error code (the result of _suspend_op2() called with
 *	RESUME_JOB)
 */
extern int slurm_resume2(char *job_id, job_array_resp_msg_t **resp)
{
	int rc;

	rc = _suspend_op2(RESUME_JOB, job_id, resp);
	return rc;
}
/*
 * slurm_requeue - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id - numeric ID of the job on which to perform the operation
 * IN flags - JOB_SPECIAL_EXIT - job should be placed special exit state and
 *		  held.
 *	      JOB_REQUEUE_HOLD - job should be placed JOB_PENDING state and
 *		  held.
 *	      JOB_RECONFIG_FAIL - Node configuration for job failed
 *	      JOB_RUNNING - Operate only on jobs in a state of
 *		  CONFIGURING, RUNNING, STOPPED or SUSPENDED.
 * RET SLURM_ERROR if slurm_send_recv_controller_rc_msg() fails, otherwise
 *	the return code it delivered (0 on success); that code is also
 *	copied into errno
 */
extern int slurm_requeue(uint32_t job_id, uint32_t flags)
{
	slurm_msg_t msg;
	requeue_msg_t request;
	int status = SLURM_SUCCESS;

	/* The job is named by number only; the string form stays unset. */
	memset(&request, 0, sizeof(request));
	request.flags = flags;
	request.job_id_str = NULL;
	request.job_id = job_id;

	slurm_msg_t_init(&msg);
	msg.data = &request;
	msg.msg_type = REQUEST_JOB_REQUEUE;

	if (slurm_send_recv_controller_rc_msg(&msg, &status,
					      working_cluster_rec) < 0)
		return SLURM_ERROR;

	/* errno mirrors the returned code, including 0 on success. */
	errno = status;
	return status;
}
/*
 * slurm_requeue2 - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id_str - job on which to perform operation, in string form; may be
 *	a job array specification (e.g. "123_1-20,44")
 * IN flags - JOB_SPECIAL_EXIT - job should be placed special exit state and
 *		  held.
 *	      JOB_REQUEUE_HOLD - job should be placed JOB_PENDING state and
 *		  held.
 *	      JOB_RECONFIG_FAIL - Node configuration for job failed
 *	      JOB_RUNNING - Operate only on jobs in a state of
 *		  CONFIGURING, RUNNING, STOPPED or SUSPENDED.
 * OUT resp - per task response to the request,
 *	free using slurm_free_job_array_resp(); written only when the reply
 *	is of type RESPONSE_JOB_ARRAY_ERRORS, otherwise left untouched
 * RET 0 or a slurm error code. A failed message exchange returns the
 *	(negative) result of slurm_send_recv_controller_msg() and leaves errno
 *	as that call left it; a reply of an unexpected type returns
 *	SLURM_ERROR with errno set to SLURM_UNEXPECTED_MSG_ERROR.
 */
extern int slurm_requeue2(char *job_id_str, uint32_t flags,
			  job_array_resp_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	requeue_msg_t requeue_req;
	slurm_msg_t req_msg, resp_msg;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	memset(&requeue_req, 0, sizeof(requeue_req));
	requeue_req.job_id = NO_VAL;	/* job is identified by job_id_str */
	requeue_req.job_id_str = job_id_str;
	requeue_req.flags = flags;
	req_msg.msg_type = REQUEST_JOB_REQUEUE;
	req_msg.data = &requeue_req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	/*
	 * A failed exchange yields no reply to inspect. Bail out here, as
	 * slurm_requeue() does, instead of falling into the switch below and
	 * replacing errno with SLURM_UNEXPECTED_MSG_ERROR.
	 */
	if (rc < 0)
		return rc;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_ARRAY_ERRORS:
		/* The payload is handed to the caller, who must free it. */
		*resp = (job_array_resp_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		/* Only the code is kept; release the reply payload. */
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			errno = rc;
		break;
	default:
		/* Never report success for a reply we cannot interpret. */
		errno = SLURM_UNEXPECTED_MSG_ERROR;
		rc = SLURM_ERROR;
	}

	return rc;
}