| /*****************************************************************************\ |
| * prep_slurmctld.c - slurmctld-specific aspects of the PrEpPlugin interface |
| * (for PrologSlurmctld / EpilogSlurmctld scripts) |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include <signal.h> |
| |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/power_save.h" |
| #include "src/slurmctld/slurmctld.h" |
| |
| #include "src/stepmgr/srun_comm.h" |
| |
| extern void prep_prolog_slurmctld_callback(int rc, uint32_t job_id, |
| bool timed_out) |
| { |
| slurmctld_lock_t job_write_lock = |
| { .job = WRITE_LOCK, .node = WRITE_LOCK, .fed = READ_LOCK }; |
| job_record_t *job_ptr; |
| |
| lock_slurmctld(job_write_lock); |
| if (!(job_ptr = find_job_record(job_id))) { |
| error("%s: missing JobId=%u", __func__, job_id); |
| unlock_slurmctld(job_write_lock); |
| return; |
| } |
| if (WIFSIGNALED(rc) && timed_out) { |
| /* |
| * If the script was signaled due to the job being cancelled or |
| * slurmctld shutting down, we don't consider that a failure. |
| * However, if the script timed out, then it is considered a |
| * failure. In both of these cases, the script was signaled with |
| * SIGKILL, so we use the timed_out to distinguish between them. |
| */ |
| error("prolog_slurmctld JobId=%u failed due to timing out", |
| job_id); |
| job_ptr->prep_prolog_failed = true; |
| } else if (WIFEXITED(rc) && WEXITSTATUS(rc)) { |
| error("prolog_slurmctld JobId=%u prolog exit status %u:%u", |
| job_id, WEXITSTATUS(rc), WTERMSIG(rc)); |
| job_ptr->prep_prolog_failed = true; |
| } |
| |
| /* prevent underflow */ |
| if (job_ptr->prep_prolog_cnt) |
| job_ptr->prep_prolog_cnt--; |
| |
| if (job_ptr->prep_prolog_cnt) { |
| debug2("%s: still %u async prologs left to complete", |
| __func__, job_ptr->prep_prolog_cnt); |
| unlock_slurmctld(job_write_lock); |
| return; |
| } |
| |
| /* all async prologs have completed, continue on now */ |
| if (job_ptr->prep_prolog_failed) { |
| uint32_t jid = job_id; |
| |
| job_ptr->prep_prolog_failed = false; |
| |
| /* requeue het leader if het job */ |
| if (job_ptr->het_job_id) |
| jid = job_ptr->het_job_id; |
| |
| if ((rc = job_requeue(0, jid, NULL, false, 0)) && |
| (rc != ESLURM_JOB_PENDING)) { |
| info("unable to requeue JobId=%u: %s", jid, |
| slurm_strerror(rc)); |
| |
| srun_user_message(job_ptr, |
| "PrologSlurmctld failed, job killed"); |
| |
| if (job_ptr->het_job_id) { |
| job_record_t *het_leader = job_ptr; |
| |
| if (!het_leader->het_job_list) { |
| het_leader = find_job_record( |
| job_ptr->het_job_id); |
| } |
| |
| /* |
| * Don't do anything if there isn't a het_leader |
| * (which there should be). |
| */ |
| if (het_leader) { |
| (void) het_job_signal(het_leader, |
| SIGKILL, |
| 0, 0, false); |
| } else { |
| error("No het_leader found for %pJ", |
| job_ptr); |
| } |
| } else { |
| job_signal(job_ptr, SIGKILL, 0, 0, false); |
| } |
| } |
| } else |
| debug2("prolog_slurmctld JobId=%u prolog completed", job_id); |
| |
| prolog_running_decr(job_ptr); |
| |
| unlock_slurmctld(job_write_lock); |
| } |
| |
| extern void prep_epilog_slurmctld_callback(int rc, uint32_t job_id, |
| bool timed_out) |
| { |
| slurmctld_lock_t job_write_lock = { |
| .job = WRITE_LOCK, .node = WRITE_LOCK}; |
| job_record_t *job_ptr; |
| |
| lock_slurmctld(job_write_lock); |
| if (!(job_ptr = find_job_record(job_id))) { |
| error("%s: missing JobId=%u", __func__, job_id); |
| unlock_slurmctld(job_write_lock); |
| return; |
| } |
| if (timed_out) { |
| /* Log an error but still continue cleaning up the job */ |
| error("epilog_slurmctld JobId=%u timed out", job_id); |
| } |
| |
| /* prevent underflow */ |
| if (job_ptr->prep_epilog_cnt) |
| job_ptr->prep_epilog_cnt--; |
| |
| if (job_ptr->prep_epilog_cnt) { |
| debug2("%s: still %u async epilogs left to complete", |
| __func__, job_ptr->prep_epilog_cnt); |
| unlock_slurmctld(job_write_lock); |
| return; |
| } |
| |
| /* all async prologs have completed, continue on now */ |
| job_ptr->epilog_running = false; |
| |
| /* |
| * Clear the JOB_COMPLETING flag only if the node count is 0 |
| * meaning the slurmd epilogs have already completed. |
| */ |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| cleanup_completing(job_ptr, true); |
| } |
| |
| unlock_slurmctld(job_write_lock); |
| } |