blob: b13b8816690142906535483a4099b7e6807831d6 [file] [log] [blame]
/*****************************************************************************\
* scancel - cancel specified job(s) and/or job step(s)
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#if HAVE_CONFIG_H
# include "config.h"
#endif
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#if HAVE_INTTYPES_H
# include <inttypes.h>
#else /* !HAVE_INTTYPES_H */
# if HAVE_STDINT_H
# include <stdint.h>
# endif
#endif /* HAVE_INTTYPES_H */
#include <slurm/slurm.h>
#include "src/common/log.h"
#include "src/common/xstring.h"
#include "src/common/xmalloc.h"
#include "src/scancel/scancel.h"
/*
 * Upper bound on the number of attempts made by _cancel_job_id() and
 * _cancel_step_id(); they loop again only while the request fails with
 * ESLURM_TRANSITION_STATE_NO_UPDATE or ESLURM_JOB_PENDING.
 */
#define MAX_CANCEL_RETRY 10
/* Forward declarations of the file-local functions defined below */
static void _cancel_jobs (void);
static void _cancel_job_id (uint32_t job_id, uint16_t sig);
static void _cancel_step_id (uint32_t job_id, uint32_t step_id,
uint16_t sig);
static int _confirmation (int i, uint32_t step_id);
static void _filter_job_records (void);
static void _load_job_records (void);
/*
 * Job records filled in by _load_job_records(); stays NULL until that
 * function has run. _filter_job_records() marks excluded records by
 * setting their job_id to 0.
 */
static job_info_msg_t * job_buffer_ptr = NULL;
/*
 * main - scancel entry point
 * IN argc, argv - command line, handed to initialize_and_process_args()
 *
 * Sets up stderr logging, processes the arguments, loads and filters the
 * job records when any option needs them, then issues the cancel requests.
 * Always terminates the process through exit(0).
 */
int
main (int argc, char *argv[])
{
	log_options_t logging = LOG_OPTS_STDERR_ONLY;
	int need_job_records;

	log_init(xbasename(argv[0]), logging, SYSLOG_FACILITY_DAEMON, NULL);
	initialize_and_process_args(argc, argv);

	/* Shift the stderr log level by the verbosity found in the options */
	if (opt.verbose != 0) {
		logging.stderr_level += opt.verbose;
		log_alter(logging, SYSLOG_FACILITY_DAEMON, NULL);
	}

	/*
	 * Job records are only needed for interactive confirmation or when
	 * at least one of the name/partition/state/user filters is set.
	 */
	need_job_records = 0;
	if (opt.interactive)
		need_job_records = 1;
	else if (opt.job_name != NULL)
		need_job_records = 1;
	else if (opt.partition != NULL)
		need_job_records = 1;
	else if (opt.state != JOB_END)
		need_job_records = 1;
	else if (opt.user_name != NULL)
		need_job_records = 1;

	if (need_job_records) {
		_load_job_records();
		_filter_job_records();
	}

	_cancel_jobs();
	exit(0);
}
/* _load_job_records - load all job information for filtering and verification */
static void
_load_job_records (void)
{
int error_code;
error_code = slurm_load_jobs ((time_t) NULL, &job_buffer_ptr, 1);
if (error_code) {
slurm_perror ("slurm_load_jobs error");
exit (1);
}
}
/*
 * _filter_job_records - filter the loaded job records per user specification
 *
 * Walks every record in job_buffer_ptr and sets job_id to 0 on each record
 * that must NOT be cancelled. A record survives only if all of these hold:
 *   - its state is JOB_PENDING, JOB_RUNNING or JOB_SUSPENDED
 *   - its name equals opt.job_name        (when opt.job_name is set)
 *   - its partition equals opt.partition  (when opt.partition is set)
 *   - its state equals opt.state          (when opt.state != JOB_END)
 *   - its user_id equals opt.user_id      (when opt.user_name is set)
 *   - its job_id appears in opt.job_id[]  (when opt.job_cnt is non-zero)
 * A record without a name or partition string never matches a name or
 * partition filter.
 */
static void
_filter_job_records (void)
{
	int i, j;
	job_info_t *job_ptr = job_buffer_ptr->job_array;

	for (i = 0; i < job_buffer_ptr->record_count; i++) {
		job_info_t *job = &job_ptr[i];

		/* Already excluded */
		if (job->job_id == 0)
			continue;

		/* Only pending, running or suspended jobs are kept */
		if ((job->job_state != JOB_PENDING) &&
		    (job->job_state != JOB_RUNNING) &&
		    (job->job_state != JOB_SUSPENDED)) {
			job->job_id = 0;
			continue;
		}

		/* Guard against a NULL name before calling strcmp() */
		if ((opt.job_name != NULL) &&
		    ((job->name == NULL) ||
		     (strcmp(job->name, opt.job_name) != 0))) {
			job->job_id = 0;
			continue;
		}

		/* Guard against a NULL partition before calling strcmp() */
		if ((opt.partition != NULL) &&
		    ((job->partition == NULL) ||
		     (strcmp(job->partition, opt.partition) != 0))) {
			job->job_id = 0;
			continue;
		}

		/* JOB_END in opt.state means "no state filter" */
		if ((opt.state != JOB_END) &&
		    (job->job_state != opt.state)) {
			job->job_id = 0;
			continue;
		}

		if ((opt.user_name != NULL) &&
		    (job->user_id != opt.user_id)) {
			job->job_id = 0;
			continue;
		}

		/* No explicit job id list: every remaining record is kept */
		if (opt.job_cnt == 0)
			continue;

		for (j = 0; j < opt.job_cnt; j++) {
			if (job->job_id == opt.job_id[j])
				break;
		}
		if (j >= opt.job_cnt)	/* not in the requested list */
			job->job_id = 0;
	}
}
/* _cancel_jobs - filter then cancel jobs or job steps per request */
static void
_cancel_jobs (void)
{
int i, j;
job_info_t *job_ptr = NULL;
if (opt.job_cnt && opt.interactive) { /* confirm cancel */
job_ptr = job_buffer_ptr->job_array ;
for (j = 0; j < opt.job_cnt; j++ ) {
for (i = 0; i < job_buffer_ptr->record_count; i++) {
if (job_ptr[i].job_id != opt.job_id[j])
continue;
if (opt.interactive &&
(_confirmation(i, opt.step_id[j]) == 0))
break;
if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
_cancel_job_id (opt.job_id[j],
opt.signal);
else
_cancel_step_id (opt.job_id[j],
opt.step_id[j],
opt.signal);
break;
}
if (i >= job_buffer_ptr->record_count)
fprintf (stderr, "Job %u not found\n",
opt.job_id[j]);
}
} else if (opt.job_cnt) { /* delete specific jobs */
for (j = 0; j < opt.job_cnt; j++ ) {
if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
_cancel_job_id (opt.job_id[j],
opt.signal);
else
_cancel_step_id (opt.job_id[j],
opt.step_id[j],
opt.signal);
}
} else { /* delete all jobs per filtering */
job_ptr = job_buffer_ptr->job_array ;
for (i = 0; i < job_buffer_ptr->record_count; i++) {
if (job_ptr[i].job_id == 0)
continue;
if (opt.interactive &&
(_confirmation(i, SLURM_BATCH_SCRIPT) == 0))
continue;
_cancel_job_id (job_ptr[i].job_id, opt.signal);
}
}
}
/*
 * _cancel_job_id - signal or terminate an entire job
 * IN job_id - id of the job to act on
 * IN sig    - signal to deliver; (uint16_t)-1 selects SIGKILL
 *
 * Uses slurm_kill_job() when the signal is SIGKILL or opt.ctld is set,
 * otherwise slurm_signal_job_step() on SLURM_BATCH_SCRIPT when opt.batch
 * is set, otherwise slurm_signal_job().
 *
 * Up to MAX_CANCEL_RETRY attempts are made. Another attempt happens only
 * while the call fails with ESLURM_TRANSITION_STATE_NO_UPDATE or
 * ESLURM_JOB_PENDING, after sleeping 5 + attempt-index seconds. A final
 * failure is logged with error(), except that ESLURM_ALREADY_DONE and
 * ESLURM_INVALID_JOB_ID are suppressed when opt.verbose is negative.
 */
static void
_cancel_job_id (uint32_t job_id, uint16_t sig)
{
	int error_code = SLURM_SUCCESS;
	int saved_errno = 0;
	int i;

	if (sig == (uint16_t) -1)
		sig = SIGKILL;

	for (i = 0; i < MAX_CANCEL_RETRY; i++) {
		if (sig == SIGKILL)
			verbose("Terminating job %u", job_id);
		else
			verbose("Signal %u to job %u", sig, job_id);

		if ((sig == SIGKILL) || opt.ctld) {
			error_code = slurm_kill_job(job_id, sig,
						    (uint16_t) opt.batch);
		} else if (opt.batch) {
			error_code = slurm_signal_job_step(job_id,
							   SLURM_BATCH_SCRIPT,
							   sig);
		} else {
			error_code = slurm_signal_job(job_id, sig);
		}
		if (error_code == 0)
			break;

		/*
		 * Capture the error code right away: the logging and
		 * sleep() calls below may overwrite errno before the
		 * failure is reported.
		 */
		saved_errno = slurm_get_errno();
		if ((saved_errno != ESLURM_TRANSITION_STATE_NO_UPDATE) &&
		    (saved_errno != ESLURM_JOB_PENDING))
			break;

		/* Out of attempts: do not announce or wait for a retry */
		if ((i + 1) >= MAX_CANCEL_RETRY)
			break;

		verbose("Job is in transistional state, retrying");
		sleep(5 + i);
	}

	if (error_code) {
		if ((opt.verbose >= 0) ||
		    ((saved_errno != ESLURM_ALREADY_DONE) &&
		     (saved_errno != ESLURM_INVALID_JOB_ID)))
			error("Kill job error on job id %u: %s",
			      job_id, slurm_strerror(saved_errno));
	}
}
/*
 * _cancel_step_id - signal or terminate a single job step
 * IN job_id  - id of the job owning the step
 * IN step_id - id of the step to act on
 * IN sig     - signal to deliver; (uint16_t)-1 selects SIGKILL
 *
 * Uses slurm_kill_job_step() when opt.ctld is set, otherwise
 * slurm_terminate_job_step() for SIGKILL and slurm_signal_job_step() for
 * any other signal.
 *
 * Up to MAX_CANCEL_RETRY attempts are made. Another attempt happens only
 * while the call fails with ESLURM_TRANSITION_STATE_NO_UPDATE or
 * ESLURM_JOB_PENDING, after sleeping 5 + attempt-index seconds. A final
 * failure is logged with error(), except that ESLURM_ALREADY_DONE is
 * suppressed when opt.verbose is negative.
 */
static void
_cancel_step_id (uint32_t job_id, uint32_t step_id, uint16_t sig)
{
	int error_code = SLURM_SUCCESS;
	int saved_errno = 0;
	int i;

	if (sig == (uint16_t) -1)
		sig = SIGKILL;

	for (i = 0; i < MAX_CANCEL_RETRY; i++) {
		if (sig == SIGKILL) {
			verbose("Terminating step %u.%u", job_id, step_id);
		} else {
			verbose("Signal %u to step %u.%u",
				sig, job_id, step_id);
		}

		if (opt.ctld)
			error_code = slurm_kill_job_step(job_id, step_id, sig);
		else if (sig == SIGKILL)
			error_code = slurm_terminate_job_step(job_id, step_id);
		else
			error_code = slurm_signal_job_step(job_id, step_id,
							   sig);
		if (error_code == 0)
			break;

		/*
		 * Capture the error code right away: the logging and
		 * sleep() calls below may overwrite errno before the
		 * failure is reported.
		 */
		saved_errno = slurm_get_errno();
		if ((saved_errno != ESLURM_TRANSITION_STATE_NO_UPDATE) &&
		    (saved_errno != ESLURM_JOB_PENDING))
			break;

		/* Out of attempts: do not announce or wait for a retry */
		if ((i + 1) >= MAX_CANCEL_RETRY)
			break;

		verbose("Job is in transistional state, retrying");
		sleep(5 + i);
	}

	if (error_code) {
		if ((opt.verbose >= 0) ||
		    (saved_errno != ESLURM_ALREADY_DONE))
			error("Kill job error on job step id %u.%u: %s",
			      job_id, step_id,
			      slurm_strerror(saved_errno));
	}
}
/*
 * _confirmation - Confirm job cancel request interactively
 * IN i       - index of the job's record in job_buffer_ptr->job_array
 * IN step_id - SLURM_BATCH_SCRIPT to prompt for the job, any other value
 *              to prompt for that step of the job
 * RET 1 if the reply starts with 'y' or 'Y'; 0 if it starts with 'n' or
 *     'N', or if standard input hits end-of-file or a read error.
 *     Any other reply repeats the prompt.
 */
static int
_confirmation (int i, uint32_t step_id)
{
	char in_line[128];
	job_info_t *job_ptr = job_buffer_ptr->job_array;

	while (1) {
		if (step_id == SLURM_BATCH_SCRIPT) {
			printf ("Cancel job_id=%u name=%s partition=%s [y/n]? ",
				job_ptr[i].job_id, job_ptr[i].name,
				job_ptr[i].partition);
		} else {
			printf ("Cancel step_id=%u.%u name=%s partition=%s [y/n]? ",
				job_ptr[i].job_id, step_id, job_ptr[i].name,
				job_ptr[i].partition);
		}
		/* The prompt has no newline; push it out before blocking */
		fflush(stdout);

		/*
		 * No answer can ever arrive after EOF or a read error, and
		 * in_line would be left unset: treat it as "do not cancel"
		 * instead of prompting forever.
		 */
		if (fgets(in_line, sizeof(in_line), stdin) == NULL)
			return 0;

		if ((in_line[0] == 'y') || (in_line[0] == 'Y'))
			return 1;
		if ((in_line[0] == 'n') || (in_line[0] == 'N'))
			return 0;
	}
}