blob: 281c27b1a590ce21459d8bae783a86af26182082 [file] [log] [blame]
/*****************************************************************************\
* update_job.c - update job functions for scontrol.
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "scontrol.h"
/*
 * scontrol_checkpoint - perform some checkpoint/resume operation
 * IN op - checkpoint operation; matched case-insensitively on its leading
 *	characters: able, complete, disable, enable, create, vacate,
 *	restart or error
 * IN job_step_id_str - either a job id (for all steps of the given job) or
 *	a step id: "<jid>.<step_id>"
 * RET the return code of the underlying slurm_checkpoint_*() call
 *	(SLURM_SUCCESS when it worked). A parsing error prints an error
 *	message and returns 0
 */
extern int
scontrol_checkpoint(char *op, char *job_step_id_str)
{
	int rc = SLURM_SUCCESS;
	uint32_t job_id = 0, step_id = 0;
	char *next_str;
	char *step_str;
	uint32_t ckpt_errno;
	char *ckpt_strerror = NULL;

	if (job_step_id_str == NULL) {
		fprintf(stderr, "Invalid job step name\n");
		return 0;
	}

	job_id = (uint32_t) strtol(job_step_id_str, &next_str, 10);
	if (next_str == job_step_id_str) {
		/* strtol() consumed nothing: no job id was given at all */
		fprintf(stderr, "Invalid job step name\n");
		return 0;
	}
	if (next_str[0] == '.') {
		step_str = &next_str[1];
		step_id = (uint32_t) strtol(step_str, &next_str, 10);
		if (next_str == step_str) {
			/* "<jid>." with no step number after the dot */
			fprintf(stderr, "Invalid job step name\n");
			return 0;
		}
	} else {
		/* no step given, the operation applies to the whole job */
		step_id = NO_VAL;
	}
	if (next_str[0] != '\0') {
		/* trailing garbage after the numeric id(s) */
		fprintf(stderr, "Invalid job step name\n");
		return 0;
	}

	if (strncasecmp(op, "able", 2) == 0) {
		time_t start_time;
		rc = slurm_checkpoint_able(job_id, step_id, &start_time);
		if (rc == SLURM_SUCCESS) {
			if (start_time) {
				char time_str[32];
				slurm_make_time_str(&start_time, time_str,
						    sizeof(time_str));
				/* literal format string: never hand
				 * generated text to printf() as a format */
				printf("Began at %s\n", time_str);
			} else
				printf("Yes\n");
		} else if (slurm_get_errno() == ESLURM_DISABLED) {
			printf("No\n");
			rc = SLURM_SUCCESS;	/* not real error */
		}
	}
	else if (strncasecmp(op, "complete", 3) == 0) {
		/* Undocumented option used for testing purposes */
		static uint32_t error_code = 1;
		char error_msg[64];
		snprintf(error_msg, sizeof(error_msg),
			 "test error message %u", error_code);
		rc = slurm_checkpoint_complete(job_id, step_id, (time_t) 0,
					       error_code++, error_msg);
	}
	else if (strncasecmp(op, "disable", 3) == 0)
		rc = slurm_checkpoint_disable(job_id, step_id);
	else if (strncasecmp(op, "enable", 2) == 0)
		rc = slurm_checkpoint_enable(job_id, step_id);
	else if (strncasecmp(op, "create", 2) == 0)
		rc = slurm_checkpoint_create(job_id, step_id, CKPT_WAIT);
	else if (strncasecmp(op, "vacate", 2) == 0)
		rc = slurm_checkpoint_vacate(job_id, step_id, CKPT_WAIT);
	else if (strncasecmp(op, "restart", 2) == 0)
		rc = slurm_checkpoint_restart(job_id, step_id);
	else if (strncasecmp(op, "error", 2) == 0) {
		rc = slurm_checkpoint_error(job_id, step_id,
					    &ckpt_errno, &ckpt_strerror);
		if (rc == SLURM_SUCCESS) {
			printf("error(%u): %s\n", ckpt_errno, ckpt_strerror);
			/* the message string is released here with free() */
			free(ckpt_strerror);
		}
	}
	else {
		fprintf(stderr, "Invalid checkpoint operation: %s\n", op);
		return 0;
	}

	return rc;
}
/*
 * scontrol_suspend - perform some suspend/resume operation
 * IN op - suspend/resume operation; anything whose first three characters
 *	match "sus" (case-insensitive) suspends, everything else resumes
 * IN job_id_str - a job id (non-negative decimal number)
 * RET the return code of slurm_suspend() or slurm_resume(). A parsing
 *	error prints an error message, sets exit_code to 1 and returns 0
 */
extern int
scontrol_suspend(char *op, char *job_id_str)
{
	int rc = SLURM_SUCCESS;
	long id = 0;
	char *next_str = NULL;

	if (job_id_str)
		id = strtol(job_id_str, &next_str, 10);

	/*
	 * Reject a missing string, a string with no digits (strtol() leaves
	 * next_str at the start), trailing garbage, and negative numbers
	 * that would otherwise wrap into a huge unsigned job id.
	 */
	if ((job_id_str == NULL) || (next_str == job_id_str) ||
	    (next_str[0] != '\0') || (id < 0)) {
		fprintf(stderr, "Invalid job id specified\n");
		exit_code = 1;
		return 0;
	}

	if (strncasecmp(op, "suspend", 3) == 0)
		rc = slurm_suspend((uint32_t) id);
	else
		rc = slurm_resume((uint32_t) id);

	return rc;
}
/*
 * scontrol_requeue - requeue a pending or running batch job
 * IN job_id_str - a job id (non-negative decimal number)
 * RET the return code of slurm_requeue(). A parsing error prints an
 *	error message, sets exit_code to 1 and returns 0
 */
extern int
scontrol_requeue(char *job_id_str)
{
	long id = 0;
	char *next_str = NULL;

	if (job_id_str)
		id = strtol(job_id_str, &next_str, 10);

	/*
	 * Reject a missing string, a string with no digits (strtol() leaves
	 * next_str at the start), trailing garbage, and negative numbers
	 * that would otherwise wrap into a huge unsigned job id.
	 */
	if ((job_id_str == NULL) || (next_str == job_id_str) ||
	    (next_str[0] != '\0') || (id < 0)) {
		fprintf(stderr, "Invalid job id specified\n");
		exit_code = 1;
		return 0;
	}

	return slurm_requeue((uint32_t) id);
}
/*
* scontrol_update_job - update the slurm job configuration per the supplied arguments
* IN argc - count of arguments
* IN argv - list of arguments
* RET 0 if no slurm error, errno otherwise. parsing error prints
* error message and returns 0
*/
extern int
scontrol_update_job (int argc, char *argv[])
{
int i, update_cnt = 0;
job_desc_msg_t job_msg;
slurm_init_job_desc_msg (&job_msg);
for (i=0; i<argc; i++) {
if (strncasecmp(argv[i], "JobId=", 6) == 0)
job_msg.job_id =
(uint32_t) strtol(&argv[i][6],
(char **) NULL, 10);
else if (strncasecmp(argv[i], "Comment=", 8) == 0) {
job_msg.comment = &argv[i][8];
update_cnt++;
}
else if (strncasecmp(argv[i], "TimeLimit=", 10) == 0) {
if ((strcasecmp(&argv[i][10], "UNLIMITED") == 0) ||
(strcasecmp(&argv[i][10], "INFINITE") == 0))
job_msg.time_limit = INFINITE;
else
job_msg.time_limit =
(uint32_t) strtol(&argv[i][10],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "Priority=", 9) == 0) {
job_msg.priority =
(uint32_t) strtoll(&argv[i][9],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "Nice=", 5) == 0) {
int nice;
nice = strtoll(&argv[i][5], (char **) NULL, 10);
if (abs(nice) > NICE_OFFSET) {
error("Invalid nice value, must be between "
"-%d and %d", NICE_OFFSET, NICE_OFFSET);
exit_code = 1;
return 0;
}
job_msg.nice = NICE_OFFSET + nice;
update_cnt++;
}
else if (strncasecmp(argv[i], "Nice", 4) == 0) {
job_msg.nice = NICE_OFFSET + 100;
update_cnt++;
}
else if (strncasecmp(argv[i], "ReqProcs=", 9) == 0) {
job_msg.num_procs =
(uint32_t) strtol(&argv[i][9],
(char **) NULL, 10);
update_cnt++;
}
else if ((strncasecmp(argv[i], "MinNodes=", 9) == 0) ||
(strncasecmp(argv[i], "ReqNodes=", 9) == 0)) {
char *tmp;
job_msg.min_nodes =
(uint32_t) strtol(&argv[i][9],
&tmp, 10);
if (tmp[0] == '-') {
job_msg.max_nodes = (uint32_t)
strtol(&tmp[1], (char **) NULL, 10);
if (job_msg.max_nodes < job_msg.min_nodes) {
error("Maximum node count less than "
"minimum value (%u < %u)",
job_msg.max_nodes,
job_msg.min_nodes);
}
}
update_cnt++;
}
else if (strncasecmp(argv[i], "ReqSockets=", 11) == 0) {
job_msg.min_sockets =
(uint16_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "ReqCores=", 9) == 0) {
job_msg.min_cores =
(uint16_t) strtol(&argv[i][9],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "TasksPerNode=", 13) == 0) {
job_msg.ntasks_per_node =
(uint16_t) strtol(&argv[i][13],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "ReqThreads=", 11) == 0) {
job_msg.min_threads =
(uint16_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "MinProcs=", 9) == 0) {
job_msg.job_min_procs =
(uint32_t) strtol(&argv[i][9],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "MinSockets=", 11) == 0) {
job_msg.job_min_sockets =
(uint16_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "MinCores=", 9) == 0) {
job_msg.job_min_cores =
(uint16_t) strtol(&argv[i][9],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "MinThreads=", 11) == 0) {
job_msg.job_min_threads =
(uint16_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "MinMemory=", 10) == 0) {
job_msg.job_min_memory =
(uint32_t) strtol(&argv[i][10],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "MinTmpDisk=", 11) == 0) {
job_msg.job_min_tmp_disk =
(uint32_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "Partition=", 10) == 0) {
job_msg.partition = &argv[i][10];
update_cnt++;
}
else if (strncasecmp(argv[i], "Name=", 5) == 0) {
job_msg.name = &argv[i][5];
update_cnt++;
}
else if (strncasecmp(argv[i], "Shared=", 7) == 0) {
if (strcasecmp(&argv[i][7], "YES") == 0)
job_msg.shared = 1;
else if (strcasecmp(&argv[i][7], "NO") == 0)
job_msg.shared = 0;
else
job_msg.shared =
(uint16_t) strtol(&argv[i][7],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "Contiguous=", 11) == 0) {
if (strcasecmp(&argv[i][11], "YES") == 0)
job_msg.contiguous = 1;
else if (strcasecmp(&argv[i][11], "NO") == 0)
job_msg.contiguous = 0;
else
job_msg.contiguous =
(uint16_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
else if (strncasecmp(argv[i], "ExcNodeList=", 12) == 0) {
job_msg.exc_nodes = &argv[i][12];
update_cnt++;
}
else if (strncasecmp(argv[i], "ReqNodeList=", 12) == 0) {
job_msg.req_nodes = &argv[i][12];
update_cnt++;
}
else if (strncasecmp(argv[i], "Features=", 9) == 0) {
job_msg.features = &argv[i][9];
update_cnt++;
}
else if (strncasecmp(argv[i], "Account=", 8) == 0) {
job_msg.account = &argv[i][8];
update_cnt++;
}
else if (strncasecmp(argv[i], "Dependency=", 11) == 0) {
job_msg.dependency =
(uint32_t) strtol(&argv[i][11],
(char **) NULL, 10);
update_cnt++;
}
#ifdef HAVE_BG
else if (strncasecmp(argv[i], "Geometry=", 9) == 0) {
char* token, *delimiter = ",x", *next_ptr;
int j, rc = 0;
uint16_t geo[SYSTEM_DIMENSIONS];
char* geometry_tmp = xstrdup(&argv[i][9]);
char* original_ptr = geometry_tmp;
token = strtok_r(geometry_tmp, delimiter, &next_ptr);
for (j=0; j<SYSTEM_DIMENSIONS; j++) {
if (token == NULL) {
error("insufficient dimensions in "
"Geometry");
rc = -1;
break;
}
geo[j] = (uint16_t) atoi(token);
if (geo[j] <= 0) {
error("invalid --geometry argument");
rc = -1;
break;
}
geometry_tmp = next_ptr;
token = strtok_r(geometry_tmp, delimiter,
&next_ptr);
}
if (token != NULL) {
error("too many dimensions in Geometry");
rc = -1;
}
if (original_ptr)
xfree(original_ptr);
if (rc != 0)
exit_code = 1;
else {
for (j=0; j<SYSTEM_DIMENSIONS; j++)
job_msg.geometry[j] = geo[j];
update_cnt++;
}
}
else if (strncasecmp(argv[i], "Rotate=", 7) == 0) {
uint16_t rotate;
if (strcasecmp(&argv[i][7], "yes") == 0)
rotate = 1;
else if (strcasecmp(&argv[i][7], "no") == 0)
rotate = 0;
else
rotate = (uint16_t) strtol(&argv[i][7],
(char **) NULL, 10);
job_msg.rotate = rotate;
update_cnt++;
}
#endif
else if (strncasecmp(argv[i], "StartTime=", 10) == 0) {
job_msg.begin_time = parse_time(&argv[i][10]);
update_cnt++;
}
else {
exit_code = 1;
fprintf (stderr, "Invalid input: %s\n", argv[i]);
fprintf (stderr, "Request aborted\n");
return 0;
}
}
if (update_cnt == 0) {
exit_code = 1;
fprintf (stderr, "No changes specified\n");
return 0;
}
if (slurm_update_job(&job_msg))
return slurm_get_errno ();
else
return 0;
}