blob: e2e9d51ef1e5a43b78db781def40940966870920 [file] [log] [blame]
/*****************************************************************************\
* prep_slurmctld.c - slurmctld-specific aspects of the PrEpPlugin interface
* (for PrologSlurmctld / EpilogSlurmctld scripts)
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <signal.h>
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/power_save.h"
#include "src/slurmctld/slurmctld.h"
#include "src/stepmgr/srun_comm.h"
/*
 * Callback run when an asynchronous PrologSlurmctld script finishes.
 *
 * IN rc - raw wait(2)-style status of the script process (inspected with
 *	WIFSIGNALED()/WIFEXITED() below)
 * IN job_id - ID of the job the prolog ran for
 * IN timed_out - true if the script was killed for exceeding its time limit
 *
 * Called without locks held; acquires the job write lock itself. When the
 * last outstanding prolog for the job completes, either continues normal
 * scheduling or (on prolog failure) requeues the job, falling back to
 * killing it if the requeue is rejected.
 */
extern void prep_prolog_slurmctld_callback(int rc, uint32_t job_id,
					   bool timed_out)
{
	/*
	 * Need job+node write locks for requeue/signal paths; fed read lock
	 * because requeue/kill may touch federation state.
	 */
	slurmctld_lock_t job_write_lock =
		{ .job = WRITE_LOCK, .node = WRITE_LOCK, .fed = READ_LOCK };
	job_record_t *job_ptr;

	lock_slurmctld(job_write_lock);

	/* The job may have been purged while the script was running */
	if (!(job_ptr = find_job_record(job_id))) {
		error("%s: missing JobId=%u", __func__, job_id);
		unlock_slurmctld(job_write_lock);
		return;
	}

	if (WIFSIGNALED(rc) && timed_out) {
		/*
		 * If the script was signaled due to the job being cancelled or
		 * slurmctld shutting down, we don't consider that a failure.
		 * However, if the script timed out, then it is considered a
		 * failure. In both of these cases, the script was signaled with
		 * SIGKILL, so we use the timed_out to distinguish between them.
		 */
		error("prolog_slurmctld JobId=%u failed due to timing out",
		      job_id);
		job_ptr->prep_prolog_failed = true;
	} else if (WIFEXITED(rc) && WEXITSTATUS(rc)) {
		/* Script ran to completion but reported a non-zero status */
		error("prolog_slurmctld JobId=%u prolog exit status %u:%u",
		      job_id, WEXITSTATUS(rc), WTERMSIG(rc));
		job_ptr->prep_prolog_failed = true;
	}

	/* prevent underflow */
	if (job_ptr->prep_prolog_cnt)
		job_ptr->prep_prolog_cnt--;

	/* Other async prologs still outstanding; wait for their callbacks */
	if (job_ptr->prep_prolog_cnt) {
		debug2("%s: still %u async prologs left to complete",
		       __func__, job_ptr->prep_prolog_cnt);
		unlock_slurmctld(job_write_lock);
		return;
	}

	/* all async prologs have completed, continue on now */
	if (job_ptr->prep_prolog_failed) {
		uint32_t jid = job_id;

		/* Reset so a later requeued run starts with a clean slate */
		job_ptr->prep_prolog_failed = false;

		/* requeue het leader if het job */
		if (job_ptr->het_job_id)
			jid = job_ptr->het_job_id;

		/*
		 * ESLURM_JOB_PENDING means the job is already back in a
		 * pending state, which is the outcome we want anyway.
		 * Any other failure to requeue means we must kill the job.
		 */
		if ((rc = job_requeue(0, jid, NULL, false, 0)) &&
		    (rc != ESLURM_JOB_PENDING)) {
			info("unable to requeue JobId=%u: %s", jid,
			     slurm_strerror(rc));
			srun_user_message(job_ptr,
					  "PrologSlurmctld failed, job killed");

			if (job_ptr->het_job_id) {
				/* Kill the whole het job via its leader */
				job_record_t *het_leader = job_ptr;

				if (!het_leader->het_job_list) {
					het_leader = find_job_record(
						job_ptr->het_job_id);
				}

				/*
				 * Don't do anything if there isn't a het_leader
				 * (which there should be).
				 */
				if (het_leader) {
					(void) het_job_signal(het_leader,
							      SIGKILL,
							      0, 0, false);
				} else {
					error("No het_leader found for %pJ",
					      job_ptr);
				}
			} else {
				job_signal(job_ptr, SIGKILL, 0, 0, false);
			}
		}
	} else
		debug2("prolog_slurmctld JobId=%u prolog completed", job_id);

	/* Must run under the job write lock before releasing it */
	prolog_running_decr(job_ptr);
	unlock_slurmctld(job_write_lock);
}
/*
 * Callback run when an asynchronous EpilogSlurmctld script finishes.
 *
 * IN rc - raw wait(2)-style status of the script process (currently unused;
 *	epilog failure does not change job cleanup)
 * IN job_id - ID of the job the epilog ran for
 * IN timed_out - true if the script was killed for exceeding its time limit
 *
 * Called without locks held; acquires the job write lock itself. Once the
 * last outstanding epilog completes, the job is allowed to finish the
 * COMPLETING phase.
 */
extern void prep_epilog_slurmctld_callback(int rc, uint32_t job_id,
					   bool timed_out)
{
	slurmctld_lock_t job_write_lock = {
		.job = WRITE_LOCK, .node = WRITE_LOCK};
	job_record_t *job_ptr = NULL;

	lock_slurmctld(job_write_lock);

	job_ptr = find_job_record(job_id);
	if (!job_ptr) {
		/* The job may have been purged while the script ran */
		error("%s: missing JobId=%u", __func__, job_id);
		goto done;
	}

	/* A timeout is logged but does not block job cleanup */
	if (timed_out)
		error("epilog_slurmctld JobId=%u timed out", job_id);

	/* Guard against underflow before decrementing */
	if (job_ptr->prep_epilog_cnt > 0)
		job_ptr->prep_epilog_cnt--;

	if (job_ptr->prep_epilog_cnt > 0) {
		/* More async epilogs outstanding; wait for their callbacks */
		debug2("%s: still %u async epilogs left to complete",
		       __func__, job_ptr->prep_epilog_cnt);
		goto done;
	}

	/* All async epilogs have completed; let the job finish cleanup */
	job_ptr->epilog_running = false;

	/*
	 * Clear the JOB_COMPLETING flag only if the node count is 0
	 * meaning the slurmd epilogs have already completed.
	 */
	if (IS_JOB_COMPLETING(job_ptr))
		cleanup_completing(job_ptr, true);

done:
	unlock_slurmctld(job_write_lock);
}