blob: 032792d110959b2c2a77df572610747184e72762 [file] [log] [blame]
/*****************************************************************************\
* runaway_jobs_functions.c - functions dealing with runaway/orphan jobs
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "src/sacctmgr/sacctmgr.h"
#include "src/common/assoc_mgr.h"
/* Max runaway jobs per single sql statement. */
#define RUNAWAY_JOBS_PER_PASS 1000
static uint32_t _parse_state(char *state_str)
{
uint32_t state = job_state_num(state_str);
if ((state != JOB_COMPLETE) && (state != JOB_FAILED)) {
fatal("Unknown or unsupported state specified (%s). Only Completed or Failed are accepted.", state_str);
}
return state;
}
static int _set_cond(int *start, int argc, char **argv,
slurmdb_job_cond_t *job_cond,
list_t *format_list)
{
int i, end = 0;
int set = 0;
int command_len = 0;
for (i = (*start); i < argc; i++) {
int op_type;
end = parse_option_end(argv[i], &op_type, &command_len);
if (!common_verify_option_syntax(argv[i], op_type, false))
continue;
if (!end ||
!xstrncasecmp(argv[i], "Cluster", MAX(command_len, 1))) {
if (!job_cond->cluster_list)
job_cond->cluster_list = list_create(xfree_ptr);
if (slurm_addto_char_list(job_cond->cluster_list,
argv[i]+end))
set = 1;
} else if (!xstrncasecmp(argv[i], "Format",
MAX(command_len, 1))) {
if (format_list)
slurm_addto_char_list(format_list, argv[i]+end);
} else {
exit_code=1;
fprintf(stderr, " Unknown condition: %s\n", argv[i]);
}
}
(*start) = i;
return set;
}
static int _set_rec(int *start, int argc, char **argv,
slurmdb_job_rec_t *job_rec)
{
int i;
int set = 0;
int end = 0;
int command_len = 0;
int option = 0;
bool allow_option = false;
for (i = (*start); i < argc; i++) {
end = parse_option_end(argv[i], &option, &command_len);
if (!xstrncasecmp(argv[i], "Where", MAX(command_len, 5))) {
i--;
break;
} else if (!end &&
!xstrncasecmp(argv[i], "set", MAX(command_len, 3))) {
continue;
} else if (!end) {
exit_code = 1;
error("Bad format on %s: End your option with an '=' sign\n",
argv[i]);
} else if (!xstrncasecmp(argv[i], "EndState",
MAX(command_len, 1))) {
job_rec->state = _parse_state(argv[i] + end);
set = 1;
} else {
exit_code = 1;
error("Unknown option: %s\n", argv[i]);
}
common_verify_option_syntax(argv[i], option, allow_option);
}
(*start) = i;
return set;
}
static void _print_runaway_jobs(list_t *format_list, list_t *jobs)
{
char outbuf[FORMAT_STRING_SIZE];
slurmdb_job_rec_t *job = NULL;
list_itr_t *itr = NULL;
list_itr_t *field_itr = NULL;
list_t *print_fields_list; /* types are of print_field_t */
print_field_t *field = NULL;
int field_count;
printf("NOTE: Runaway jobs are jobs that don't exist in the "
"controller but have a start time and no end time "
"in the database\n");
print_fields_list = sacctmgr_process_format_list(format_list);
print_fields_header(print_fields_list);
field_count = list_count(print_fields_list);
list_sort(jobs, slurmdb_job_sort_by_submit_time);
itr = list_iterator_create(jobs);
field_itr = list_iterator_create(print_fields_list);
while ((job = list_next(itr))) {
int curr_inx = 1;
while ((field = list_next(field_itr))) {
switch(field->type) {
case PRINT_ID:
field->print_routine(
field,
&job->jobid,
(curr_inx == field_count));
break;
case PRINT_NAME:
field->print_routine(
field,
job->jobname,
(curr_inx == field_count));
break;
case PRINT_PART:
field->print_routine(
field,
job->partition,
(curr_inx == field_count));
break;
case PRINT_CLUSTER:
field->print_routine(
field,
job->cluster,
(curr_inx == field_count));
break;
case PRINT_STATE:
snprintf(outbuf, FORMAT_STRING_SIZE, "%s",
job_state_string(job->state));
field->print_routine(
field,
outbuf,
(curr_inx == field_count));
break;
case PRINT_TIMESTART:
field->print_routine(
field,
&job->start,
(curr_inx == field_count));
break;
case PRINT_TIMEEND:
field->print_routine(
field,
&job->end,
(curr_inx == field_count));
break;
case PRINT_TIMESUBMIT:
field->print_routine(
field,
&job->submit,
(curr_inx == field_count));
break;
case PRINT_TIMEELIGIBLE:
field->print_routine(
field,
&job->eligible,
(curr_inx == field_count));
break;
default:
break;
}
curr_inx++;
}
list_iterator_reset(field_itr);
printf("\n");
}
list_iterator_destroy(field_itr);
list_iterator_destroy(itr);
}
/*
* Remove job from run away list if it is a currently known job to slurmctld.
*/
static int _purge_known_jobs(void *x, void *key)
{
job_info_msg_t *clus_jobs = (job_info_msg_t *) key;
slurmdb_job_rec_t *db_job = (slurmdb_job_rec_t *) x;
if (clus_jobs->record_count > 0) {
job_info_t *clus_job = clus_jobs->job_array;
for (int i = 0; i < clus_jobs->record_count; i++, clus_job++) {
if ((db_job->jobid == clus_job->job_id) &&
((db_job->submit == clus_job->submit_time) ||
(db_job->submit == clus_job->resize_time))) {
debug5("%s: matched known JobId=%u SubmitTime=%"PRIu64,
__func__, db_job->jobid,
(uint64_t)db_job->submit);
return true;
}
}
}
debug5("%s: runaway job found JobId=%u SubmitTime=%"PRIu64,
__func__, db_job->jobid, (uint64_t)db_job->submit);
return false;
}
static list_t *_get_runaway_jobs(slurmdb_job_cond_t *job_cond)
{
list_t *db_jobs_list = NULL;
job_info_msg_t *clus_jobs = NULL;
slurmdb_cluster_cond_t cluster_cond;
list_t *cluster_list = NULL;
job_cond->db_flags = SLURMDB_JOB_FLAG_NOTSET;
job_cond->flags |= JOBCOND_FLAG_RUNAWAY | JOBCOND_FLAG_NO_TRUNC;
if (!job_cond->cluster_list || !list_count(job_cond->cluster_list)) {
if (!job_cond->cluster_list)
job_cond->cluster_list = list_create(xfree_ptr);
slurm_addto_char_list(job_cond->cluster_list,
slurm_conf.cluster_name);
}
if (list_count(job_cond->cluster_list) != 1) {
error("You can only fix runaway jobs on "
"one cluster at a time.");
return NULL;
}
db_jobs_list = slurmdb_jobs_get(db_conn, job_cond);
if (!db_jobs_list) {
if (errno != ESLURM_ACCESS_DENIED)
error("No job list returned");
goto cleanup;
} else if (!list_count(db_jobs_list))
return db_jobs_list; /* Just return now since we don't
* have any run away jobs, no
* reason to check the cluster.
*/
slurmdb_init_cluster_cond(&cluster_cond, 0);
cluster_cond.cluster_list = job_cond->cluster_list;
cluster_list = slurmdb_clusters_get(db_conn,
&cluster_cond);
if (!cluster_list) {
error("No cluster list returned.");
goto cleanup;
} else if (!list_count(cluster_list)) {
error("Cluster %s is unknown",
(char *)list_peek(job_cond->cluster_list));
goto cleanup;
} else if (list_count(cluster_list) != 1) {
error("slurmdb_clusters_get didn't return exactly one cluster (%d)! This should never happen.",
list_count(cluster_list));
goto cleanup;
}
working_cluster_rec = list_peek(cluster_list);
if (!working_cluster_rec->control_host ||
working_cluster_rec->control_host[0] == '\0' ||
!working_cluster_rec->control_port) {
error("Slurmctld running on cluster %s is not up, can't check running jobs",
working_cluster_rec->name);
goto cleanup;
}
if (slurm_load_jobs((time_t)NULL, &clus_jobs, SHOW_ALL)) {
error("Failed to get jobs from requested clusters: %m");
goto cleanup;
}
list_delete_all(db_jobs_list, _purge_known_jobs, clus_jobs);
return db_jobs_list;
cleanup:
FREE_NULL_LIST(db_jobs_list);
FREE_NULL_LIST(cluster_list);
return NULL;
}
/*
* List and ask user if they wish to fix the runaway jobs
*/
extern int sacctmgr_list_runaway_jobs(int argc, char **argv)
{
list_t *runaway_jobs = NULL;
list_t *process_jobs = list_create(slurmdb_destroy_job_rec);
int rc = SLURM_SUCCESS;
int altered = 0;
int i=0;
char *cluster_str;
list_t *format_list = list_create(xfree_ptr);
slurmdb_job_rec_t job_rec_update = { 0 };
slurmdb_job_cond_t *job_cond = xmalloc(sizeof(slurmdb_job_cond_t));
char *ask_msg = "\nWould you like to fix these runaway jobs?\n"
"(This sets the end time for each job to the latest of "
"the job's start, eligible, and submit times, and sets "
"the state to completed.\n"
"Once corrected, this triggers the SlurmDBD to "
"recalculate the usage from before the earliest submit "
"time of all the runaway jobs. "
"Warning: This could take a long time and sreport may "
"not return data until the recalculation is completed.)\n\n";
for (i=0; i<argc; i++) {
int command_len = strlen(argv[i]);
if (!xstrncasecmp(argv[i], "Where", MAX(command_len, 5))) {
i++;
_set_cond(&i, argc, argv, job_cond, format_list);
} else if (!xstrncasecmp(argv[i], "Set", MAX(command_len, 3))) {
i++;
altered += _set_rec(&i, argc, argv, &job_rec_update);
} else {
_set_cond(&i, argc, argv, job_cond, format_list);
}
}
runaway_jobs = _get_runaway_jobs(job_cond);
cluster_str = xstrdup(list_peek(job_cond->cluster_list));
slurmdb_destroy_job_cond(job_cond);
if (!runaway_jobs) {
rc = SLURM_ERROR;
goto end_it;
}
if (!list_count(runaway_jobs)) {
printf("Runaway Jobs: No runaway jobs found on cluster %s\n",
cluster_str);
rc = SLURM_SUCCESS;
goto end_it;
}
if (!list_count(format_list))
slurm_addto_char_list(
format_list,
"ID%-12,Name,Part,Cluster,State%10,Submit,Start,End");
_print_runaway_jobs(format_list, runaway_jobs);
while (!rc && list_transfer_max(process_jobs, runaway_jobs,
RUNAWAY_JOBS_PER_PASS)) {
/*
* To tell which end state we want for the jobs,
* we mark the first job of the list for slurmdbd.
*/
if (altered) {
slurmdb_job_rec_t *job_rec = list_peek(process_jobs);
if (job_rec_update.state)
job_rec->state = job_rec_update.state;
job_rec->flags = SLURMDB_JOB_FLAG_ALTERED;
}
rc = slurmdb_jobs_fix_runaway(db_conn, process_jobs);
list_flush(process_jobs);
}
if (rc == SLURM_SUCCESS) {
if (commit_check(ask_msg)) {
rc = slurmdb_connection_commit(db_conn, 1);
if (rc != SLURM_SUCCESS)
fprintf(stderr, " Error committing changes: %s\n",
slurm_strerror(rc));
} else {
printf("Changes Discarded\n");
rc = slurmdb_connection_commit(db_conn, 0);
if (rc != SLURM_SUCCESS)
fprintf(stderr, " Error rolling back changes: %s\n",
slurm_strerror(rc));
}
} else
error("Failed to fix runaway job: %s\n",
slurm_strerror(rc));
end_it:
xfree(cluster_str);
FREE_NULL_LIST(runaway_jobs);
FREE_NULL_LIST(process_jobs);
FREE_NULL_LIST(format_list);
return rc;
}