| /*****************************************************************************\ |
| * job_mgr.c - manage the job information of slurm |
| * Note: there is a global job list (job_list), time stamp |
| * (last_job_update), and hash table (job_hash) |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Copyright (C) SchedMD LLC. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| #define _GNU_SOURCE |
| |
| #include <ctype.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <libgen.h> |
| #include <limits.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/bitstring.h" |
| #include "src/common/cpu_frequency.h" |
| #include "src/common/cron.h" |
| #include "src/common/fd.h" |
| #include "src/common/forward.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/id_util.h" |
| #include "src/common/node_features.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/port_mgr.h" |
| #include "src/common/slurm_protocol_pack.h" |
| #include "src/common/state_save.h" |
| #include "src/common/timers.h" |
| #include "src/common/track_script.h" |
| #include "src/common/tres_bind.h" |
| #include "src/common/tres_frequency.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/interfaces/accounting_storage.h" |
| #include "src/interfaces/acct_gather.h" |
| #include "src/interfaces/auth.h" |
| #include "src/interfaces/burst_buffer.h" |
| #include "src/interfaces/cred.h" |
| #include "src/interfaces/gres.h" |
| #include "src/interfaces/hash.h" |
| #include "src/interfaces/job_submit.h" |
| #include "src/interfaces/jobcomp.h" |
| #include "src/interfaces/mcs.h" |
| #include "src/interfaces/node_features.h" |
| #include "src/interfaces/preempt.h" |
| #include "src/interfaces/priority.h" |
| #include "src/interfaces/sched_plugin.h" |
| #include "src/interfaces/select.h" |
| #include "src/interfaces/switch.h" |
| #include "src/interfaces/topology.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/fed_mgr.h" |
| #include "src/slurmctld/gang.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/locks.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/power_save.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmctld/slurmscriptd.h" |
| #include "src/slurmctld/state_save.h" |
| #include "src/slurmctld/trigger_mgr.h" |
| |
| #include "src/stepmgr/gres_stepmgr.h" |
| #include "src/stepmgr/srun_comm.h" |
| #include "src/stepmgr/stepmgr.h" |
| |
#define ARRAY_ID_BUF_SIZE 32
#define MAX_EXIT_VAL 255 /* Maximum value returned by WEXITSTATUS() */
#define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
#define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */
#define PURGE_OLD_JOB_IN_SEC 2592000 /* 30 days in seconds */

/*
 * Map a job ID (or an array job ID plus task ID) to a hash bucket index.
 * Every argument is parenthesized so that an expression argument such as
 * "a + b" is reduced as a whole instead of being split by operator
 * precedence. Both macros read the file-scope hash_table_size, which must
 * be non-zero when they are evaluated.
 */
#define JOB_HASH_INX(_job_id) ((_job_id) % hash_table_size)
#define JOB_ARRAY_HASH_INX(_job_id, _task_id) \
	(((_job_id) + (_task_id)) % hash_table_size)

/* No need to change we always pack SLURM_PROTOCOL_VERSION */
#define JOB_STATE_VERSION "PROTOCOL_VERSION"
| |
/*
 * Selects which job hash an operation applies to; it is the "type"
 * argument of _remove_job_hash().
 * NOTE(review): the three values presumably correspond to the file-scope
 * tables job_hash, job_array_hash_j and job_array_hash_t -- confirm against
 * _remove_job_hash().
 */
typedef enum {
	JOB_HASH_JOB,
	JOB_HASH_ARRAY_JOB,
	JOB_HASH_ARRAY_TASK,
} job_hash_type_t;
| |
/*
 * Accumulates results of a job array request, one record per distinct
 * return code. Built by _resp_array_add(), released by _resp_array_free()
 * and converted into a reply message by _resp_array_xlate().
 * The three arrays are parallel; records 0 .. resp_array_cnt - 1 are in use.
 */
typedef struct {
	int resp_array_cnt;		/* number of records in use */
	int resp_array_size;		/* number of records allocated */
	uint32_t *resp_array_rc;	/* return code of each record */
	bitstr_t **resp_array_task_id;	/* task IDs sharing that return code */
	char **err_msg;			/* message stored with each record */
} resp_array_struct_t;
| |
/*
 * NOTE(review): the code that fills and consumes the structures below is
 * outside this part of the file. Judging by their names they bundle the
 * arguments of list-iteration callbacks -- confirm against their users
 * before relying on the meaning of any field.
 */

/* Presumably the state shared while packing job info records. */
typedef struct {
	buf_t *buffer;
	uint32_t filter_uid;
	bool has_qos_lock;
	job_record_t *het_leader;
	uint32_t jobs_packed;
	uint16_t protocol_version;
	uint16_t show_flags;
	uid_t uid;
	slurmdb_user_rec_t user_rec;
	bool privileged;
	part_record_t **visible_parts;
} _foreach_pack_job_info_t;

/* Presumably the inputs and result of a job overlap test. */
typedef struct {
	bitstr_t *node_map;
	list_t *license_list;
	int rc;
} job_overlap_args_t;

/* Entry type of signal_jobs_args_t.pending_array_task_list. */
typedef struct {
	slurm_selected_step_t *filter_id;
	bool free_array_bitmap;
	job_record_t *job_ptr;
} array_task_filter_t;

/* Presumably the working state of a "signal/kill jobs" request. */
typedef struct {
	list_t *array_leader_list; /* list of job_record_t */
	list_t *pending_array_task_list; /* list of array_task_filter_t */
	uid_t auth_uid;
	bool filter_specific_job_ids;
	job_record_t *het_leader;
	kill_jobs_msg_t *kill_msg;
	time_t now;
	list_t *other_job_list; /* list of job_record_t */
	list_t *responses; /* list of kill_jobs_resp_job_t */
} signal_jobs_args_t;

/* Presumably used while moving collected responses into a reply message. */
typedef struct {
	int curr_count;
	kill_jobs_resp_msg_t *resp_msg;
} xfer_signal_jobs_responses_args_t;
| |
/* Expected value of for_each_by_job_id_args_t.magic */
#define MAGIC_FOREACH_BY_JOBID_ARGS 0x1a0beebe
/*
 * Argument bundle for iterating over jobs selected by job ID.
 * NOTE(review): the iterator using it is outside this part of the file;
 * only the field comments already present are authoritative.
 */
typedef struct {
	int magic; /* MAGIC_FOREACH_BY_JOBID_ARGS */
	foreach_job_by_id_control_t control;
	uint32_t count;
	JobForEachFunc callback;
	JobNullForEachFunc null_callback; /* If not set, then do nothing when
					   * the job id is not found. */
	JobROForEachFunc ro_callback;
	void *callback_arg;
	job_record_t *job_ptr;
	const slurm_selected_step_t *filter;
} for_each_by_job_id_args_t;
| |
/*
 * NOTE(review): the structures below are argument bundles whose users are
 * outside this part of the file. The one-line descriptions are inferred
 * from the type names only -- confirm against the callbacks before relying
 * on them.
 */

/* Presumably inputs/result of a per-partition QOS limit check. */
typedef struct {
	uint32_t error_code;
	uint32_t max_nodes;
	uint32_t min_nodes;
	part_record_t *part_ptr;
	uid_t submit_uid;
	uint32_t time_limit;
} qos_part_check_t;

/* Presumably state for the top-priority test (see _top_priority()). */
typedef struct {
	uint32_t het_job_offset;
	job_record_t *job_ptr;
	uint16_t min_part_prio_tier;
	time_t now;
	bitstr_t *part_nodes;
	bool use_none_resv_nodes;
} top_prio_args_t;

/* Presumably used while visiting het job components. */
typedef struct {
	job_record_t *job_ptr;
	hostset_t *hs;
} foreach_hetcomp_args_t;

/* Presumably used while killing a step in each het job component. */
typedef struct {
	job_step_kill_msg_t *job_step_kill_msg;
	int rc;
	uint32_t uid;
} foreach_kill_hetjob_step_t;

/* Presumably search state for finding a reservation overlap. */
typedef struct {
	slurmctld_resv_t *cur_resv;
	bool found;
	job_record_t *job_ptr2;
} findfirst_resv_overlap_t;

/* Presumably state for purging jobs missing from a node. */
typedef struct {
	time_t batch_startup_time;
	job_record_t *job_ptr;
	time_t node_boot_time;
	node_record_t *node_ptr;
	int node_inx;
	time_t now;
	bool power_save_on;
} foreach_purge_missing_jobs_t;

/* Presumably state for killing jobs by node or partition. */
typedef struct {
	int kill_job_cnt;
	node_record_t *node_ptr;
	time_t now;
	part_record_t *part_ptr;
	bool requeue_on_resume_failure;
} foreach_kill_job_by_t;

/* Presumably state for signalling every component of a het job. */
typedef struct {
	uint16_t flags;
	job_record_t *het_job_leader;
	bool preempt;
	int rc;
	uint16_t signal;
	uid_t uid;
} foreach_kill_hetjob_t;

/* Presumably state for completing every component of a het job. */
typedef struct {
	job_record_t *het_job_leader;
	uint32_t job_return_code;
	bool node_fail;
	bool requeue;
	int rc;
	uid_t uid;
} foreach_complete_hetjob_t;
| |
/*
 * NOTE(review): the structures below are argument bundles whose users are
 * outside this part of the file. The one-line descriptions are inferred
 * from the type names only -- confirm against the callbacks before relying
 * on them.
 */

/* Presumably state for rebuilding a partition name string. */
typedef struct {
	char *names;
	char *names_pos;
	part_record_t *part_ptr;
} foreach_rebuild_names_t;

/* Presumably state for validating a job against each partition. */
typedef struct {
	bool any_check;
	slurmdb_assoc_rec_t *assoc_ptr;
	job_desc_msg_t *job_desc;
	uint32_t max_nodes_orig;
	uint32_t max_time;
	uint32_t min_nodes_orig;
	slurmdb_qos_rec_t *qos_ptr;
	list_t *qos_ptr_list;
	int rc;
	bitstr_t *req_bitmap;
	uid_t submit_uid;
} foreach_valid_part_t;

/* Presumably state for the per-node minimum memory validation. */
typedef struct {
	uint16_t cpus_per_task;
	job_desc_msg_t *job_desc;
	uint32_t max_cpus;
	uint32_t min_cpus;
	uint32_t pn_min_cpus;
	uint64_t pn_min_memory;
	int rc;
} foreach_valid_pn_min_mem_t;

/* Presumably state for updating every component of a het job. */
typedef struct {
	char *err_msg;
	job_record_t *het_leader;
	job_desc_msg_t *job_desc;
	int rc;
	uid_t uid;
} foreach_update_hetjob_t;

/* Presumably state for suspending/resuming every het job component. */
typedef struct {
	job_record_t *het_leader;
	bool indf_susp;
	uint16_t op;
	int rc;
} foreach_sus_hetjob_t;

/* Presumably state for requeueing every component of a het job. */
typedef struct {
	uint32_t flags;
	job_record_t *het_leader;
	bool preempt;
	int rc;
	uid_t uid;
} foreach_requeue_hetjob_t;

/* Presumably an ID to match plus a count of matches for a hold operation. */
typedef struct {
	uint32_t id;
	int cnt;
} foreach_hold_by_id_t;
| |
/* Global variables */
list_t *job_list = NULL;	/* job_record list */
time_t last_job_update;	/* time of last update to job records */

list_t *purge_jobs_list = NULL;	/* job_record_t entries to free */

/* Local variables */
static int bf_min_age_reserve = 0;
static uint32_t delay_boot = 0;
static uint32_t highest_prio = 0;
static uint32_t lowest_prio = TOP_PRIORITY;
static int hash_table_size = 0;	/* modulus used by JOB_HASH_INX() and
				 * JOB_ARRAY_HASH_INX() */
static int job_count = 0;	/* number of jobs in the system; increased
				 * by _add_job_record() */
static uint32_t job_id_sequence = 0;	/* first job_id to assign new job */
static struct job_record **job_hash = NULL;
static struct job_record **job_array_hash_j = NULL;
static struct job_record **job_array_hash_t = NULL;
static bool kill_invalid_dep;
static time_t last_file_write_time = (time_t) 0;
static uint32_t max_array_size = NO_VAL;	/* NO_VAL until first use in
						 * _resp_array_add(), then
						 * slurm_conf.max_array_sz */
static bitstr_t *requeue_exit = NULL;
static bitstr_t *requeue_exit_hold = NULL;
static bool validate_cfgd_licenses = true;
| |
/*
 * Local functions
 *
 * Forward declarations of file-local (static) helpers, so that they can be
 * called before their definitions appear.
 */
static void _signal_pending_job_array_tasks(job_record_t *job_ptr, bitstr_t
					    **array_bitmap, uint16_t signal,
					    uid_t uid, int32_t i_last,
					    time_t now, int *rc);
static void _add_job_hash(job_record_t *job_ptr);
static void _add_job_array_hash(job_record_t *job_ptr);
static void _handle_requeue_limit(job_record_t *job_ptr, const char *caller);
static int _copy_job_desc_to_file(job_desc_msg_t * job_desc,
				  uint32_t job_id);
static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
					job_record_t **job_ptr,
					bitstr_t ** exc_bitmap,
					bitstr_t ** req_bitmap);
static char *_copy_nodelist_no_dup(char *node_list);
static job_record_t *_create_job_record(uint32_t num_jobs, bool list_add);
static slurmdb_qos_rec_t *_determine_and_validate_qos(
	char *resv_name, slurmdb_assoc_rec_t *assoc_ptr, bool privileged,
	slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked,
	log_level_t log_lvl);
static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src);
static uint64_t _get_def_mem(part_record_t *part_ptr, uint64_t *tres_req_cnt);
static bool _get_whole_hetjob(void);
static bool _higher_precedence(job_record_t *job_ptr, job_record_t *job_ptr2);
static void _job_array_comp(job_record_t *job_ptr, bool was_running,
			    bool requeue);
static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run,
		       bool cron, job_record_t **job_rec_ptr, uid_t submit_uid,
		       char **err_msg, uint16_t protocol_version);
static void _job_timed_out(job_record_t *job_ptr, bool preempted);
static void _kill_dependent(job_record_t *job_ptr);
static int _list_find_job_old(void *job_entry, void *key);
static bitstr_t *_make_requeue_array(char *conf_buf);
static uint32_t _max_switch_wait(uint32_t input_wait);
static void _move_to_purge_jobs_list(void *job_entry);
static time_t _get_last_job_state_write_time(void);
static void _pack_default_job_details(job_record_t *job_ptr, buf_t *buffer,
				      uint16_t protocol_version);
static void _pack_pending_job_details(job_details_t *detail_ptr, buf_t *buffer,
				      uint16_t protocol_version);
static void _purge_missing_jobs(int node_inx, time_t now);
static int _read_data_array_from_file(int fd, char *file_name, char ***data,
				      uint32_t *size, job_record_t *job_ptr);
static void _remove_job_hash(job_record_t *job_ptr, job_hash_type_t type);
static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
			    uint32_t rc, char *err_msg);
static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
			       uint32_t task_id, uint32_t rc);
static void _resp_array_free(resp_array_struct_t *resp);
static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
					       uint32_t job_id);
static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp);
static void _send_job_kill(job_record_t *job_ptr);
static int _set_job_id(job_record_t *job_ptr);
static void _set_job_requeue_exit_value(job_record_t *job_ptr);
static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
			      uint16_t flags);
static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags);
static void _suspend_job(job_record_t *job_ptr, uint16_t op);
static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp);
static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset);
static int _update_job_nodes_str(job_record_t *job_ptr);
static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
			   bitstr_t *req_bitmap, part_record_t *part_ptr,
			   list_t *part_ptr_list,
			   slurmdb_assoc_rec_t *assoc_ptr,
			   slurmdb_qos_rec_t *qos_ptr,
			   list_t *qos_ptr_list);
static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
			      bool cron, uid_t submit_uid,
			      part_record_t *part_ptr, list_t *part_list);
static void _validate_job_files(void);
static int _clear_state_dir_flag(void *x, void *arg);
static int _test_state_dir_flag(void *x, void *arg);

static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
					part_record_t *part_ptr,
					list_t *part_list);
static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg,
			      part_record_t *part_ptr);
static int _write_data_array_to_file(char *file_name, char **data,
				     uint32_t size);
| |
| static char *_get_mail_user(const char *user_name, job_record_t *job_ptr) |
| { |
| char *mail_user = NULL; |
| if (!user_name || (user_name[0] == '\0')) { |
| mail_user = user_from_job(job_ptr); |
| /* unqualified sender, append MailDomain if set */ |
| if (slurm_conf.mail_domain) |
| xstrfmtcat(mail_user, "@%s", slurm_conf.mail_domain); |
| } else { |
| mail_user = xstrdup(user_name); |
| } |
| |
| return mail_user; |
| } |
| |
| static int _job_fail_account(job_record_t *job_ptr, const char *func_name, |
| bool assoc_locked) |
| { |
| int rc = 0; // Return number of pending jobs held |
| |
| if (IS_JOB_FINISHED(job_ptr)) { |
| /* |
| * The acct_policy has already be cleared for this job. Just |
| * reset the pointer. |
| */ |
| job_ptr->assoc_ptr = NULL; |
| job_ptr->assoc_id = 0; |
| return rc; |
| } |
| |
| if (IS_JOB_PENDING(job_ptr)) { |
| info("%s: %pJ ineligible due to invalid association", |
| func_name, job_ptr); |
| |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_ACCOUNT; |
| |
| if (job_ptr->details) { |
| /* reset the job */ |
| job_ptr->details->accrue_time = 0; |
| job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; |
| job_ptr->details->begin_time = 0; |
| /* Update job with new begin_time. */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| rc = 1; |
| } |
| |
| /* This job is no longer eligible, so make it so. */ |
| if (job_ptr->assoc_ptr) { |
| part_record_t *tmp_part = job_ptr->part_ptr; |
| list_t *tmp_part_list = job_ptr->part_ptr_list; |
| slurmdb_qos_rec_t *tmp_qos = job_ptr->qos_ptr; |
| |
| /* |
| * Force a start so the association doesn't get lost. Since |
| * there could be some delay in the start of the job when |
| * running with the slurmdbd. |
| */ |
| if (!IS_JOB_IN_DB(job_ptr)) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| /* |
| * Don't call acct_policy_remove_accrue_time() here, the cnt on |
| * parent associations will be handled correctly by the removal |
| * of the association. |
| */ |
| |
| /* |
| * Clear ptrs so that only association usage is removed. |
| * Otherwise qos and partition limits will be double accounted |
| * for when this job finishes. Don't do this for acrrual time, |
| * it has be on both because the job is ineligible and can't |
| * accrue time. |
| */ |
| job_ptr->part_ptr = NULL; |
| job_ptr->part_ptr_list = NULL; |
| job_ptr->qos_ptr = NULL; |
| |
| acct_policy_remove_job_submit(job_ptr, assoc_locked); |
| |
| job_ptr->part_ptr = tmp_part; |
| job_ptr->part_ptr_list = tmp_part_list; |
| job_ptr->qos_ptr = tmp_qos; |
| |
| job_ptr->assoc_ptr = NULL; |
| /* Don't clear assoc_id, since that is what the job requests */ |
| } |
| |
| job_ptr->assoc_id = 0; |
| |
| return rc; |
| } |
| |
| extern int job_fail_qos(job_record_t *job_ptr, const char *func_name, |
| bool assoc_locked) |
| { |
| int rc = 0; // Return number of pending jobs held |
| |
| if (IS_JOB_FINISHED(job_ptr)) { |
| /* |
| * The acct_policy has already be cleared for this job. Just |
| * reset the pointer. |
| */ |
| job_ptr->qos_ptr = NULL; |
| job_ptr->qos_id = 0; |
| return rc; |
| } |
| |
| if (IS_JOB_PENDING(job_ptr)) { |
| info("%s: %pJ ineligible due to invalid qos", |
| func_name, job_ptr); |
| |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_QOS; |
| |
| if (job_ptr->details) { |
| /* reset the job */ |
| acct_policy_remove_accrue_time(job_ptr, assoc_locked); |
| job_ptr->details->begin_time = 0; |
| /* Update job with new begin_time. */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| rc = 1; |
| } |
| |
| /* This job is no longer eligible, so make it so. */ |
| if (job_ptr->qos_ptr) { |
| slurmdb_assoc_rec_t *tmp_assoc = job_ptr->assoc_ptr; |
| |
| /* |
| * Force a start so the qos doesn't get lost. Since |
| * there could be some delay in the start of the job when |
| * running with the slurmdbd. |
| */ |
| if (!IS_JOB_IN_DB(job_ptr)) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| /* |
| * Clear ptrs so that only qos usage is removed. Otherwise |
| * association limits will be double accounted for when this |
| * job finishes. Don't do this for acrrual time, it has be on |
| * both because the job is ineligible and can't accrue time. |
| */ |
| job_ptr->assoc_ptr = NULL; |
| |
| acct_policy_remove_job_submit(job_ptr, assoc_locked); |
| |
| job_ptr->assoc_ptr = tmp_assoc; |
| |
| job_ptr->qos_ptr = NULL; |
| FREE_NULL_LIST(job_ptr->qos_list); |
| /* |
| * Don't clear qos_id or details->qos_req, since that is what |
| * the job requests |
| */ |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * Functions used to manage job array responses with a separate return code |
| * possible for each task ID |
| */ |
| /* Add job record to resp_array_struct_t, free with _resp_array_free() */ |
| static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr, |
| uint32_t rc, char *err_msg) |
| { |
| resp_array_struct_t *loc_resp; |
| int array_size; |
| int i; |
| |
| if ((job_ptr->array_task_id == NO_VAL) && |
| (job_ptr->array_recs == NULL)) { |
| error("%s: called for non-job array %pJ", |
| __func__, job_ptr); |
| return; |
| } |
| |
| if (max_array_size == NO_VAL) |
| max_array_size = slurm_conf.max_array_sz; |
| |
| xassert(resp); |
| if (*resp == NULL) { |
| /* Initialize the data structure */ |
| loc_resp = xmalloc(sizeof(resp_array_struct_t)); |
| loc_resp->resp_array_cnt = 0; |
| loc_resp->resp_array_size = 10; |
| xrecalloc(loc_resp->resp_array_rc, loc_resp->resp_array_size, |
| sizeof(uint32_t)); |
| xrecalloc(loc_resp->resp_array_task_id, |
| loc_resp->resp_array_size, |
| sizeof(bitstr_t *)); |
| xrecalloc(loc_resp->err_msg, loc_resp->resp_array_size, |
| sizeof(char *)); |
| *resp = loc_resp; |
| } else { |
| loc_resp = *resp; |
| } |
| |
| for (i = 0; i < loc_resp->resp_array_cnt; i++) { |
| if (loc_resp->resp_array_rc[i] != rc) |
| continue; |
| /* Add to existing error code record */ |
| if (job_ptr->array_task_id != NO_VAL) { |
| if (job_ptr->array_task_id < |
| bit_size(loc_resp->resp_array_task_id[i])) { |
| bit_set(loc_resp->resp_array_task_id[i], |
| job_ptr->array_task_id); |
| } else { |
| error("%s: found invalid task id %pJ", |
| __func__, job_ptr); |
| } |
| } else if (job_ptr->array_recs && |
| job_ptr->array_recs->task_id_bitmap) { |
| array_size = bit_size(job_ptr->array_recs-> |
| task_id_bitmap); |
| if (bit_size(loc_resp->resp_array_task_id[i]) != |
| array_size) { |
| bit_realloc(loc_resp->resp_array_task_id[i], |
| array_size); |
| } |
| bit_or(loc_resp->resp_array_task_id[i], |
| job_ptr->array_recs->task_id_bitmap); |
| } else { |
| error("%s: found job %pJ without task ID or bitmap", |
| __func__, job_ptr); |
| } |
| return; |
| } |
| |
| /* Need to add a new record for this error code */ |
| if (loc_resp->resp_array_cnt >= loc_resp->resp_array_size) { |
| /* Need to grow the table size */ |
| loc_resp->resp_array_size += 10; |
| xrecalloc(loc_resp->resp_array_rc, loc_resp->resp_array_size, |
| sizeof(uint32_t)); |
| xrecalloc(loc_resp->resp_array_task_id, |
| loc_resp->resp_array_size, |
| sizeof(bitstr_t *)); |
| xrecalloc(loc_resp->err_msg, loc_resp->resp_array_size, |
| sizeof(bitstr_t *)); |
| } |
| |
| loc_resp->resp_array_rc[loc_resp->resp_array_cnt] = rc; |
| loc_resp->err_msg[loc_resp->resp_array_cnt] = xstrdup(err_msg); |
| if (job_ptr->array_task_id != NO_VAL) { |
| loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] = |
| bit_alloc(max_array_size); |
| if (job_ptr->array_task_id < |
| bit_size(loc_resp->resp_array_task_id |
| [loc_resp->resp_array_cnt])) { |
| bit_set(loc_resp->resp_array_task_id |
| [loc_resp->resp_array_cnt], |
| job_ptr->array_task_id); |
| } |
| } else if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { |
| loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] = |
| bit_copy(job_ptr->array_recs->task_id_bitmap); |
| } else { |
| error("%s: found %pJ without task ID or bitmap", |
| __func__, job_ptr); |
| loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] = |
| bit_alloc(max_array_size); |
| } |
| loc_resp->resp_array_cnt++; |
| } |
| |
| /* Add record to resp_array_struct_t, free with _resp_array_free(). |
| * This is a variant of _resp_array_add for the case where a job/task ID |
| * is not found, so we use a dummy job record based upon the input IDs. */ |
| static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id, |
| uint32_t task_id, uint32_t rc) |
| { |
| job_record_t job_ptr; |
| |
| job_ptr.job_id = job_id; |
| job_ptr.array_job_id = job_id; |
| job_ptr.array_task_id = task_id; |
| job_ptr.array_recs = NULL; |
| _resp_array_add(resp, &job_ptr, rc, NULL); |
| } |
| |
| /* Free resp_array_struct_t built by _resp_array_add() */ |
| static void _resp_array_free(resp_array_struct_t *resp) |
| { |
| int i; |
| |
| if (resp) { |
| for (i = 0; i < resp->resp_array_cnt; i++) { |
| FREE_NULL_BITMAP(resp->resp_array_task_id[i]); |
| xfree(resp->err_msg[i]); |
| } |
| xfree(resp->err_msg); |
| xfree(resp->resp_array_task_id); |
| xfree(resp->resp_array_rc); |
| xfree(resp); |
| } |
| } |
| |
| /* Translate internal job array data structure into a response message */ |
| static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp, |
| uint32_t job_id) |
| { |
| job_array_resp_msg_t *msg; |
| char task_str[ARRAY_ID_BUF_SIZE]; |
| int *ffs = NULL; |
| int i, j, low; |
| |
| ffs = xcalloc(resp->resp_array_cnt, sizeof(int)); |
| for (i = 0; i < resp->resp_array_cnt; i++) { |
| ffs[i] = bit_ffs(resp->resp_array_task_id[i]); |
| } |
| |
| msg = xmalloc(sizeof(job_array_resp_msg_t)); |
| msg->job_array_count = resp->resp_array_cnt; |
| msg->job_array_id = xcalloc(resp->resp_array_cnt, sizeof(char *)); |
| msg->error_code = xcalloc(resp->resp_array_cnt, sizeof(uint32_t)); |
| msg->err_msg = xcalloc(resp->resp_array_cnt, sizeof(char *)); |
| for (i = 0; i < resp->resp_array_cnt; i++) { |
| low = -1; |
| for (j = 0; j < resp->resp_array_cnt; j++) { |
| if ((ffs[j] != -1) && |
| ((low == -1) || (ffs[j] < ffs[low]))) |
| low = j; |
| } |
| if (low == -1) |
| break; |
| ffs[low] = -1; |
| |
| msg->error_code[i] = resp->resp_array_rc[low]; |
| msg->err_msg[i] = xstrdup(resp->err_msg[low]); |
| bit_fmt(task_str, ARRAY_ID_BUF_SIZE, |
| resp->resp_array_task_id[low]); |
| if (strlen(task_str) >= ARRAY_ID_BUF_SIZE - 2) { |
| /* Append "..." to the buffer on overflow */ |
| task_str[ARRAY_ID_BUF_SIZE - 4] = '.'; |
| task_str[ARRAY_ID_BUF_SIZE - 3] = '.'; |
| task_str[ARRAY_ID_BUF_SIZE - 2] = '.'; |
| task_str[ARRAY_ID_BUF_SIZE - 1] = '\0'; |
| } |
| xstrfmtcat(msg->job_array_id[i], "%u_%s", job_id, task_str); |
| } |
| |
| xfree(ffs); |
| return msg; |
| } |
| |
| static int _add_job_record(job_record_t *job_ptr, int num_jobs) |
| { |
| if ((job_count + num_jobs) > slurm_conf.max_job_cnt) { |
| error("%s: MaxJobCount limit from slurm.conf reached (%u)", |
| __func__, slurm_conf.max_job_cnt); |
| return SLURM_ERROR; |
| } |
| job_count += num_jobs; |
| last_job_update = time(NULL); |
| list_append(job_list, job_ptr); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _create_job_record - create an empty job_record including job_details. |
| * load its values with defaults (zeros, nulls, and magic cookie) |
| * IN num_jobs - number of jobs this record should represent |
| * = 0 - split out a job array record to its own job record |
| * = 1 - simple job OR job array with one task |
| * > 1 - job array create with the task count as num_jobs |
| * IN list_add - add to the joblist or not. |
| * RET pointer to the record or NULL if error |
| * NOTE: allocates memory that should be xfreed with job_record_delete |
| */ |
| static job_record_t *_create_job_record(uint32_t num_jobs, bool list_add) |
| { |
| job_record_t *job_ptr = job_record_create(); |
| |
| if (list_add) { |
| _add_job_record(job_ptr, num_jobs); |
| } |
| |
| return job_ptr; |
| } |
| |
| /* |
| * delete_job_desc_files - delete job descriptor related files |
| * |
| * Note that this will be called on all individual job array tasks, |
| * even though (as of 17.11) individual directories are no longer created. |
| */ |
| extern void delete_job_desc_files(uint32_t job_id) |
| { |
| char *dir_name = NULL, *file_name = NULL; |
| int hash = job_id % 10; |
| DIR *f_dir; |
| struct dirent *dir_ent; |
| |
| dir_name = xstrdup_printf("%s/hash.%d/job.%u", |
| slurm_conf.state_save_location, |
| hash, job_id); |
| |
| f_dir = opendir(dir_name); |
| if (f_dir) { |
| while ((dir_ent = readdir(f_dir))) { |
| if (!xstrcmp(dir_ent->d_name, ".") || |
| !xstrcmp(dir_ent->d_name, "..")) |
| continue; |
| xstrfmtcat(file_name, "%s/%s", dir_name, |
| dir_ent->d_name); |
| (void) unlink(file_name); |
| xfree(file_name); |
| } |
| closedir(f_dir); |
| } else if (errno == ENOENT) { |
| xfree(dir_name); |
| return; |
| } else { |
| error("opendir(%s): %m", dir_name); |
| } |
| |
| (void) rmdir(dir_name); |
| xfree(dir_name); |
| } |
| |
| static uint32_t _max_switch_wait(uint32_t input_wait) |
| { |
| static time_t sched_update = 0; |
| static uint32_t max_wait = 300; /* default max_switch_wait, seconds */ |
| int i; |
| |
| if (sched_update != slurm_conf.last_update) { |
| char *tmp_ptr; |
| sched_update = slurm_conf.last_update; |
| if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, |
| "max_switch_wait="))) { |
| /* 0123456789012345 */ |
| i = atoi(tmp_ptr + 16); |
| if (i < 0) { |
| error("ignoring SchedulerParameters: " |
| "max_switch_wait of %d", i); |
| } else { |
| max_wait = i; |
| } |
| } |
| } |
| |
| if (max_wait > input_wait) |
| return input_wait; |
| return max_wait; |
| } |
| |
| static slurmdb_qos_rec_t *_determine_and_validate_qos( |
| char *resv_name, slurmdb_assoc_rec_t *assoc_ptr, bool privileged, |
| slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked, |
| log_level_t log_lvl) |
| { |
| slurmdb_qos_rec_t *qos_ptr = NULL; |
| |
| /* If enforcing associations make sure this is a valid qos |
| with the association. If not just fill in the qos and |
| continue. */ |
| |
| xassert(qos_rec); |
| |
| assoc_mgr_get_default_qos_info(assoc_ptr, qos_rec); |
| if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce, |
| &qos_ptr, locked) != SLURM_SUCCESS) { |
| log_var(log_lvl, "Invalid qos (%s)", qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| |
| if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS) && assoc_ptr && |
| !privileged && |
| (!assoc_ptr->usage->valid_qos || |
| !bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) { |
| log_var(log_lvl, "This association %d(account='%s', user='%s', partition='%s') does not have access to qos %s", |
| assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user, |
| assoc_ptr->partition, qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| |
| if (qos_ptr) { |
| if ((qos_ptr->flags & QOS_FLAG_RELATIVE) && |
| (qos_ptr->flags & QOS_FLAG_PART_QOS)) { |
| log_var(log_lvl, "QOS %s is relative and used as a Partition QOS. This prohibits it from being used as a job's QOS", |
| qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| |
| if ((qos_ptr->flags & QOS_FLAG_REQ_RESV) && |
| (!resv_name || resv_name[0] == '\0')) { |
| log_var(log_lvl, "qos %s can only be used in a reservation", |
| qos_rec->name); |
| *error_code = ESLURM_INVALID_QOS; |
| return NULL; |
| } |
| } |
| |
| *error_code = SLURM_SUCCESS; |
| return qos_ptr; |
| } |
| |
| static list_t *_get_qos_ptr_list(char *qos_req, char *resv_name, |
| slurmdb_assoc_rec_t *assoc_ptr, |
| bool privileged, int *error_code, bool locked, |
| log_level_t log_lvl) |
| { |
| list_t *qos_ptr_list = NULL; |
| char *token, *last = NULL, *tmp_qos_req; |
| |
| xassert(error_code); |
| |
| if (!xstrchr(qos_req, ',')) |
| return qos_ptr_list; |
| |
| tmp_qos_req = xstrdup(qos_req); |
| token = strtok_r(tmp_qos_req, ",", &last); |
| while (token) { |
| slurmdb_qos_rec_t qos_rec = { |
| .name = token, |
| }; |
| slurmdb_qos_rec_t *qos_ptr = |
| _determine_and_validate_qos(resv_name, assoc_ptr, |
| privileged, &qos_rec, |
| error_code, locked, |
| log_lvl); |
| |
| if (*error_code != SLURM_SUCCESS) |
| break; |
| |
| /* |
| * This should not happen as the error_code check should catch |
| * issues before we get here. |
| */ |
| if (!qos_ptr) { |
| *error_code = ESLURM_INVALID_QOS; |
| break; |
| } |
| |
| if (!qos_ptr_list) |
| qos_ptr_list = list_create(NULL); |
| |
| if (!list_find_first_ro(qos_ptr_list, |
| slurm_find_ptr_in_list, |
| qos_ptr)) { |
| list_append(qos_ptr_list, qos_ptr); |
| } |
| token = strtok_r(NULL, ",", &last); |
| } |
| xfree(tmp_qos_req); |
| |
| /* If we have a trailing comma error out */ |
| if (qos_ptr_list && (list_count(qos_ptr_list) == 1)) { |
| error("%s: Invalid qos (%s), it appears there is a trailing comma", |
| __func__, qos_req); |
| *error_code = ESLURM_INVALID_QOS; |
| } |
| |
| if (*error_code != SLURM_SUCCESS) |
| FREE_NULL_LIST(qos_ptr_list); |
| |
| if (qos_ptr_list) |
| list_sort(qos_ptr_list, priority_sort_qos_desc); |
| |
| return qos_ptr_list; |
| } |
| |
| static int _get_qos_info(char *qos_req, uint32_t qos_id, list_t **qos_plist, |
| slurmdb_qos_rec_t **qos_pptr, char *resv_name, |
| slurmdb_assoc_rec_t *assoc_ptr, bool privileged, |
| bool locked, log_level_t log_lvl) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| xassert(qos_plist); |
| xassert(qos_pptr); |
| xassert(!*qos_plist); |
| |
| *qos_plist = _get_qos_ptr_list(qos_req, resv_name, assoc_ptr, |
| privileged, &rc, locked, log_lvl); |
| |
| if (!*qos_plist) { |
| slurmdb_qos_rec_t qos_rec = { |
| .name = qos_req, |
| .id = qos_id, |
| }; |
| |
| *qos_pptr = _determine_and_validate_qos(resv_name, assoc_ptr, |
| privileged, &qos_rec, |
| &rc, locked, log_lvl); |
| } else { |
| *qos_pptr = list_peek(*qos_plist); |
| } |
| |
| return rc; |
| } |
| /* |
| * dump_all_job_state - save the state of all jobs to file for checkpoint |
| * Changes here should be reflected in load_last_job_id() and |
| * load_all_job_state(). |
| * RET 0 or error code |
| */ |
| int dump_all_job_state(void) |
| { |
| /* Save high-water mark to avoid buffer growth with copies */ |
| static uint32_t high_buffer_size = (1024 * 1024); |
| int error_code = SLURM_SUCCESS; |
| char *reg_file; |
| struct stat stat_buf; |
| /* Locks: Read config and job */ |
| slurmctld_lock_t job_read_lock = |
| { READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; |
| buf_t *buffer = init_buf(high_buffer_size); |
| time_t now = time(NULL); |
| time_t last_state_file_time; |
| static time_t last_job_state_size_check = 0; |
| uint32_t jobs_start, jobs_end, jobs_count; |
| DEF_TIMERS; |
| |
| START_TIMER; |
| /* |
| * Check that last state file was written at expected time. |
| * This is a check for two slurmctld daemons running at the same |
| * time in primary mode (a split-brain problem). |
| */ |
| last_state_file_time = _get_last_job_state_write_time(); |
| if (last_file_write_time && last_state_file_time && |
| (last_file_write_time != last_state_file_time)) { |
| error("Bad job state save file time. We wrote it at time %u, " |
| "but the file contains a time stamp of %u.", |
| (uint32_t) last_file_write_time, |
| (uint32_t) last_state_file_time); |
| if (!slurmctld_primary) { |
| fatal("Two slurmctld daemons are running as primary. " |
| "Shutting down this daemon to avoid inconsistent " |
| "state due to split brain."); |
| } |
| } |
| |
| /* write header: version, time */ |
| packstr(JOB_STATE_VERSION, buffer); |
| pack16(SLURM_PROTOCOL_VERSION, buffer); |
| pack_time(now, buffer); |
| |
| /* |
| * write header: job id |
| * This is needed so that the job id remains persistent even after |
| * slurmctld is restarted. |
| */ |
| pack32( job_id_sequence, buffer); |
| |
| debug3("Writing job id %u to header record of job_state file", |
| job_id_sequence); |
| |
| /* write individual job records */ |
| lock_slurmctld(job_read_lock); |
| |
| pack_time(slurmctld_diag_stats.bf_when_last_cycle, buffer); |
| |
| jobs_start = get_buf_offset(buffer); |
| list_for_each_ro(job_list, job_mgr_dump_job_state, buffer); |
| jobs_end = get_buf_offset(buffer); |
| if ((difftime(now, last_job_state_size_check) > 60) && |
| (jobs_count = list_count(job_list))) { |
| uint64_t ave_job_size = jobs_end - jobs_start; |
| uint64_t estimated_job_state_size = ave_job_size * |
| slurm_conf.max_job_cnt; |
| last_job_state_size_check = time(NULL); |
| /* |
| * We assume all jobs were written to buffer, which may not |
| * be true, but in that case we'd already flood the log with |
| * errors. |
| */ |
| estimated_job_state_size /= jobs_count; |
| estimated_job_state_size += jobs_start; |
| ave_job_size /= jobs_count; |
| if (estimated_job_state_size > MAX_BUF_SIZE) |
| error("Configured MaxJobCount may lead to job_state being larger then maximum buffer size and not saved, based on the average job state size(%.2f KiB) we can save state of %"PRIu64" jobs.", |
| (float)ave_job_size / 1024, |
| ((uint64_t)(MAX_BUF_SIZE - jobs_start)) / |
| ave_job_size); |
| } |
| |
| unlock_slurmctld(job_read_lock); |
| |
| reg_file = xstrdup_printf("%s/job_state", |
| slurm_conf.state_save_location); |
| |
| if (stat(reg_file, &stat_buf) == 0) { |
| static time_t last_mtime = (time_t) 0; |
| int delta_t = difftime(stat_buf.st_mtime, last_mtime); |
| if (delta_t < -10) { |
| error("The modification time of %s moved backwards " |
| "by %d seconds", |
| reg_file, (0-delta_t)); |
| error("The clock of the file system and this computer " |
| "appear to not be synchronized"); |
| /* It could be safest to exit here. We likely mounted |
| * a different file system with the state save files */ |
| } |
| last_mtime = time(NULL); |
| } |
| |
| error_code = save_buf_to_state("job_state", buffer, &high_buffer_size); |
| if (!error_code) |
| last_file_write_time = now; |
| |
| xfree(reg_file); |
| FREE_NULL_BUFFER(buffer); |
| END_TIMER2(__func__); |
| return error_code; |
| } |
| |
| static int _find_job_part(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| |
| if ((job_ptr->part_ptr == arg) && !IS_JOB_FINISHED(job_ptr)) |
| return 1; /* match */ |
| return 0; |
| } |
| |
| static int _find_resv_part(void *x, void *key) |
| { |
| slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x; |
| |
| if (resv_ptr->part_ptr != (part_record_t *) key) |
| return 0; |
| else |
| return 1; /* match */ |
| } |
| |
| static int _find_part_assoc(void *x, void *key) |
| { |
| part_record_t *part_ptr = (part_record_t *)x; |
| slurmdb_assoc_rec_t *assoc_ptr = (slurmdb_assoc_rec_t *) key; |
| slurmdb_assoc_rec_t assoc_rec; |
| |
| memset(&assoc_rec, 0, sizeof(assoc_rec)); |
| assoc_rec.acct = assoc_ptr->acct; |
| assoc_rec.partition = part_ptr->name; |
| assoc_rec.uid = assoc_ptr->uid; |
| |
| (void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, NULL, true); |
| |
| if (assoc_rec.id != assoc_ptr->id) { |
| info("%s: can't check multiple partitions with partition based associations", |
| __func__); |
| return 1; |
| } |
| return 0; |
| } |
| |
| static int _check_for_part_assocs(list_t *part_ptr_list, |
| slurmdb_assoc_rec_t *assoc_ptr) |
| { |
| if (assoc_ptr && part_ptr_list && |
| list_find_first(part_ptr_list, _find_part_assoc, assoc_ptr)) { |
| return ESLURM_PARTITION_ASSOC; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr) |
| { |
| if (!job_ptr->assoc_ptr && (job_ptr->state_reason == FAIL_ACCOUNT)) { |
| slurmdb_assoc_rec_t assoc_rec; |
| memset(&assoc_rec, 0, sizeof(assoc_rec)); |
| /* |
| * For speed and accuracy we will first see if we once had an |
| * association record. If not look for it by |
| * account,partition, user_id. |
| */ |
| if (job_ptr->assoc_id) |
| assoc_rec.id = job_ptr->assoc_id; |
| else { |
| assoc_rec.acct = job_ptr->account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| } |
| |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| &job_ptr->assoc_ptr, false) == |
| SLURM_SUCCESS) { |
| job_ptr->assoc_id = assoc_rec.id; |
| debug("%s: Filling in assoc for %pJ Assoc=%u", |
| __func__, job_ptr, job_ptr->assoc_id); |
| |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| last_job_update = time(NULL); |
| } |
| } |
| |
| /* |
| * This shouldn't matter if there is a qos_list as that will get |
| * handled after this is called. |
| */ |
| if (!job_ptr->qos_ptr && (job_ptr->state_reason == FAIL_QOS)) { |
| int qos_error = SLURM_SUCCESS; |
| slurmdb_qos_rec_t qos_rec; |
| memset(&qos_rec, 0, sizeof(qos_rec)); |
| qos_rec.id = job_ptr->qos_id; |
| job_ptr->qos_ptr = _determine_and_validate_qos( |
| job_ptr->resv_name, job_ptr->assoc_ptr, |
| job_ptr->limit_set.qos, &qos_rec, |
| &qos_error, false, LOG_LEVEL_DEBUG2); |
| |
| if ((qos_error == SLURM_SUCCESS) && job_ptr->qos_ptr) { |
| /* job_ptr->qos_id should never start at 0 */ |
| if (job_ptr->qos_id != qos_rec.id) { |
| error("%s: Changing job_ptr->qos_id from %u to %u; this should never happen", |
| __func__, job_ptr->qos_id, qos_rec.id); |
| job_ptr->qos_id = qos_rec.id; |
| } |
| debug("%s: Filling in QOS for %pJ QOS=%s(%u)", |
| __func__, job_ptr, qos_rec.name, job_ptr->qos_id); |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| last_job_update = time(NULL); |
| } |
| } |
| } |
| |
| extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked) |
| { |
| assoc_mgr_lock_t locks = { .tres = READ_LOCK }; |
| xassert(job_ptr); |
| |
| if (!assoc_mgr_locked) |
| assoc_mgr_lock(&locks); |
| |
| xfree(job_ptr->tres_req_str); |
| job_ptr->tres_req_str = assoc_mgr_make_tres_str_from_array( |
| job_ptr->tres_req_cnt, TRES_STR_FLAG_SIMPLE, true); |
| |
| xfree(job_ptr->tres_fmt_req_str); |
| job_ptr->tres_fmt_req_str = assoc_mgr_make_tres_str_from_array( |
| job_ptr->tres_req_cnt, TRES_STR_CONVERT_UNITS, true); |
| |
| if (!assoc_mgr_locked) |
| assoc_mgr_unlock(&locks); |
| } |
| |
| /* Note that the backup slurmctld has assumed primary control. |
| * This function can be called multiple times. */ |
| extern void backup_slurmctld_restart(void) |
| { |
| last_file_write_time = (time_t) 0; |
| } |
| |
| /* Return the time stamp in the current job state save file, 0 is returned on |
| * error */ |
| static time_t _get_last_job_state_write_time(void) |
| { |
| int error_code = SLURM_SUCCESS; |
| char *state_file = NULL; |
| buf_t *buffer; |
| time_t buf_time = (time_t) 0; |
| char *ver_str = NULL; |
| uint16_t protocol_version = NO_VAL16; |
| |
| /* read the file */ |
| if (!(buffer = state_save_open("job_state", &state_file))) { |
| info("No job state file (%s) found", state_file); |
| error_code = ENOENT; |
| } |
| xfree(state_file); |
| if (error_code) |
| return buf_time; |
| |
| safe_unpackstr(&ver_str, buffer); |
| if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION)) |
| safe_unpack16(&protocol_version, buffer); |
| safe_unpack_time(&buf_time, buffer); |
| |
| unpack_error: |
| xfree(ver_str); |
| FREE_NULL_BUFFER(buffer); |
| return buf_time; |
| } |
| |
| /* |
| * load_all_job_state - load the job state from file, recover from last |
| * checkpoint. Execute this after loading the configuration file data. |
| * Changes here should be reflected in load_last_job_id(). |
| * RET 0 or error code |
| */ |
| extern int load_all_job_state(void) |
| { |
| int error_code = SLURM_SUCCESS; |
| int job_cnt = 0; |
| char *state_file = NULL; |
| buf_t *buffer; |
| time_t buf_time; |
| uint32_t saved_job_id; |
| char *ver_str = NULL; |
| uint16_t protocol_version = NO_VAL16; |
| |
| /* read the file */ |
| if (!(buffer = state_save_open("job_state", &state_file))) { |
| if ((clustername_existed == 1) && (!ignore_state_errors)) |
| fatal("No job state file (%s) to recover", state_file); |
| info("No job state file (%s) to recover", state_file); |
| xfree(state_file); |
| return ENOENT; |
| } |
| xfree(state_file); |
| |
| job_id_sequence = MAX(job_id_sequence, slurm_conf.first_job_id); |
| |
| safe_unpackstr(&ver_str, buffer); |
| debug3("Version string in job_state header is %s", ver_str); |
| if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION)) |
| safe_unpack16(&protocol_version, buffer); |
| xfree(ver_str); |
| |
| if (protocol_version == NO_VAL16) { |
| if (!ignore_state_errors) |
| fatal("Can not recover job state, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); |
| error("***********************************************"); |
| error("Can not recover job state, incompatible version"); |
| error("***********************************************"); |
| FREE_NULL_BUFFER(buffer); |
| return EFAULT; |
| } |
| |
| safe_unpack_time(&buf_time, buffer); |
| safe_unpack32(&saved_job_id, buffer); |
| if (saved_job_id <= slurm_conf.max_job_id) |
| job_id_sequence = MAX(saved_job_id, job_id_sequence); |
| debug3("Job id in job_state header is %u", saved_job_id); |
| |
| safe_unpack_time(&buf_time, buffer); /* bf_when_last_cycle */ |
| if (!slurmctld_diag_stats.bf_when_last_cycle) |
| slurmctld_diag_stats.bf_when_last_cycle = buf_time; |
| |
| /* |
| * Previously we locked the tres read lock before this loop. It turned |
| * out that created a double lock when steps were being loaded during |
| * the calls to jobacctinfo_create() which also locks the read lock. |
| * It ended up being much easier to move the locks for the assoc_mgr |
| * into the job_mgr_load_job_state function than any other option. |
| */ |
| while (remaining_buf(buffer) > 0) { |
| error_code = job_mgr_load_job_state(buffer, protocol_version); |
| if (error_code != SLURM_SUCCESS) |
| goto unpack_error; |
| job_cnt++; |
| } |
| debug3("Set job_id_sequence to %u", job_id_sequence); |
| |
| FREE_NULL_BUFFER(buffer); |
| info("Recovered information about %d jobs", job_cnt); |
| return error_code; |
| |
| unpack_error: |
| if (!ignore_state_errors) |
| fatal("Incomplete job state save file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); |
| error("Incomplete job state save file"); |
| info("Recovered information about %d jobs", job_cnt); |
| FREE_NULL_BUFFER(buffer); |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * load_last_job_id - load only the last job ID from state save file. |
| * Changes here should be reflected in load_all_job_state(). |
| * RET 0 or error code |
| */ |
| extern int load_last_job_id( void ) |
| { |
| char *state_file = NULL; |
| buf_t *buffer; |
| time_t buf_time; |
| char *ver_str = NULL; |
| uint16_t protocol_version = NO_VAL16; |
| |
| /* read the file */ |
| if (!(buffer = state_save_open("job_state", &state_file))) { |
| debug("No job state file (%s) to recover", state_file); |
| xfree(state_file); |
| return ENOENT; |
| } |
| xfree(state_file); |
| |
| safe_unpackstr(&ver_str, buffer); |
| debug3("Version string in job_state header is %s", ver_str); |
| if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION)) |
| safe_unpack16(&protocol_version, buffer); |
| xfree(ver_str); |
| |
| if (protocol_version == NO_VAL16) { |
| if (!ignore_state_errors) |
| fatal("Can not recover last job ID, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); |
| debug("*************************************************"); |
| debug("Can not recover last job ID, incompatible version"); |
| debug("*************************************************"); |
| FREE_NULL_BUFFER(buffer); |
| return EFAULT; |
| } |
| |
| safe_unpack_time(&buf_time, buffer); |
| safe_unpack32( &job_id_sequence, buffer); |
| debug3("Job ID in job_state header is %u", job_id_sequence); |
| |
| /* Ignore the state for individual jobs stored here */ |
| |
| xfree(ver_str); |
| FREE_NULL_BUFFER(buffer); |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| if (!ignore_state_errors) |
| fatal("Invalid job data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); |
| error("Invalid job data checkpoint file"); |
| xfree(ver_str); |
| FREE_NULL_BUFFER(buffer); |
| return SLURM_ERROR; |
| } |
| |
| extern int job_mgr_dump_job_state(void *object, void *arg) |
| { |
| job_record_t *dump_job_ptr = object; |
| buf_t *buffer = arg; |
| |
| xassert(dump_job_ptr->magic == JOB_MAGIC); |
| |
| /* Don't pack "unlinked" job. */ |
| if (dump_job_ptr->job_id == NO_VAL) |
| return 0; |
| |
| if (dump_job_ptr->array_recs) |
| build_array_str(dump_job_ptr); |
| _update_job_nodes_str(dump_job_ptr); |
| |
| job_record_pack(dump_job_ptr, slurmctld_tres_cnt, buffer, |
| SLURM_PROTOCOL_VERSION); |
| return 0; |
| } |
| |
| extern int job_mgr_load_job_state(buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| char *err_part = NULL; |
| time_t now = time(NULL); |
| job_record_t *job_ptr = NULL; |
| int rc; |
| slurmdb_assoc_rec_t assoc_rec; |
| bool job_finished = false; |
| assoc_mgr_lock_t locks = { |
| .assoc = WRITE_LOCK, |
| .qos = WRITE_LOCK, |
| .tres = READ_LOCK, |
| .user = READ_LOCK, |
| }; |
| |
| if (job_record_unpack(&job_ptr, slurmctld_tres_cnt, buffer, |
| protocol_version)) { |
| error("failed to load job from state"); |
| goto unpack_error; |
| } |
| |
| if (find_job_record(job_ptr->job_id)) { |
| error("duplicate job state record found for %pJ", job_ptr); |
| goto unpack_error; |
| } else if (_add_job_record(job_ptr, 1)) { |
| rc = SLURM_SUCCESS; |
| job_record_delete(job_ptr); |
| job_ptr = NULL; |
| goto free_it; |
| } |
| |
| /* "Don't load "unlinked" job. */ |
| if (job_ptr->job_id == NO_VAL) { |
| debug("skipping unlinked job"); |
| rc = SLURM_SUCCESS; |
| goto free_it; |
| } |
| |
| if ((job_ptr->job_state & JOB_STATE_BASE) >= JOB_END) { |
| error("Invalid data for JobId=%u: job_state=%u", |
| job_ptr->job_id, job_ptr->job_state); |
| goto unpack_error; |
| } |
| if (job_ptr->kill_on_node_fail > 1) { |
| error("Invalid data for JobId=%u: kill_on_node_fail=%u", |
| job_ptr->job_id, job_ptr->kill_on_node_fail); |
| goto unpack_error; |
| } |
| |
| if ((job_ptr->priority > 1) && (job_ptr->direct_set_prio == 0)) { |
| highest_prio = MAX(highest_prio, job_ptr->priority); |
| lowest_prio = MIN(lowest_prio, job_ptr->priority); |
| } |
| |
| get_part_list(job_ptr->partition, &job_ptr->part_ptr_list, |
| &job_ptr->part_ptr, &err_part); |
| if (job_ptr->part_ptr == NULL) { |
| verbose("Invalid partition (%s) for JobId=%u", |
| err_part, job_ptr->job_id); |
| xfree(err_part); |
| /* not fatal error, partition could have been |
| * removed, reset_job_bitmaps() will clean-up |
| * this job */ |
| } |
| |
| #if 0 |
| /* |
| * This is not necessary since the job_id_sequence is checkpointed and |
| * the jobid will be checked if it's in use in get_next_job_id(). |
| */ |
| |
| /* Base job_id_sequence off of local job id but only if the job |
| * originated from this cluster -- so that the local job id of a |
| * different cluster isn't restored here. */ |
| if (!job_fed_details || |
| !xstrcmp(job_fed_details->origin_str, slurm_conf.cluster_name)) |
| local_job_id = fed_mgr_get_local_id(job_id); |
| if (job_id_sequence <= local_job_id) |
| job_id_sequence = local_job_id + 1; |
| #endif |
| |
| if (job_ptr->array_recs && (job_ptr->array_recs->task_cnt > 1)) |
| job_count += (job_ptr->array_recs->task_cnt - 1); |
| |
| xstrtolower(job_ptr->account); |
| job_state_set(job_ptr, job_ptr->job_state); |
| job_ptr->time_last_active = now; |
| |
| if (IS_JOB_PENDING(job_ptr)) |
| job_ptr->node_cnt_wag = job_ptr->total_nodes; |
| |
| /* |
| * This needs to always to initialized to "true". The select |
| * plugin will deal with it every time it goes through the |
| * logic if req_switch or wait4switch are set. |
| */ |
| job_ptr->best_switch = true; |
| |
| /* If start_protocol_ver is too old, reset to current version. */ |
| if (job_ptr->start_protocol_ver < SLURM_MIN_PROTOCOL_VERSION) |
| job_ptr->start_protocol_ver = SLURM_PROTOCOL_VERSION; |
| |
| /* Handle this after user_id and other identity has been filled in */ |
| if (!job_ptr->mail_user) { |
| job_ptr->mail_user = _get_mail_user(NULL, job_ptr); |
| } |
| |
| _add_job_hash(job_ptr); |
| _add_job_array_hash(job_ptr); |
| |
| memset(&assoc_rec, 0, sizeof(assoc_rec)); |
| |
| /* |
| * For speed and accuracy we will first see if we once had an |
| * association record. If not look for it by |
| * account,partition, user_id. |
| */ |
| if (job_ptr->assoc_id) |
| assoc_rec.id = job_ptr->assoc_id; |
| else { |
| assoc_rec.acct = job_ptr->account; |
| if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| assoc_rec.uid = job_ptr->user_id; |
| } |
| |
| assoc_mgr_lock(&locks); |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| &job_ptr->assoc_ptr, true) && |
| (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) { |
| _job_fail_account(job_ptr, __func__, true); |
| } else { |
| job_ptr->assoc_id = assoc_rec.id; |
| info("Recovered %pJ Assoc=%u", job_ptr, job_ptr->assoc_id); |
| |
| if (job_ptr->state_reason == FAIL_ACCOUNT) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| |
| /* make sure we have started this job in accounting */ |
| if (!IS_JOB_IN_DB(job_ptr)) { |
| debug("starting %pJ in accounting", job_ptr); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| if (slurmctld_init_db |
| && IS_JOB_SUSPENDED(job_ptr)) { |
| jobacct_storage_g_job_suspend(acct_db_conn, |
| job_ptr); |
| } |
| } |
| /* make sure we have this job completed in the database */ |
| if (IS_JOB_FINISHED(job_ptr)) { |
| if (slurmctld_init_db && |
| !(job_ptr->bit_flags & TRES_STR_CALC) && |
| job_ptr->tres_alloc_cnt && |
| (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64)) |
| assoc_mgr_set_job_tres_alloc_str(job_ptr, |
| false); |
| jobacct_storage_g_job_complete( |
| acct_db_conn, job_ptr); |
| job_finished = 1; |
| } |
| } |
| |
| if (!job_finished && (job_ptr->qos_id || job_ptr->details->qos_req) && |
| (job_ptr->state_reason != FAIL_ACCOUNT)) { |
| int qos_error = _get_qos_info(job_ptr->details->qos_req, |
| job_ptr->qos_id, |
| &job_ptr->qos_list, |
| &job_ptr->qos_ptr, |
| job_ptr->resv_name, |
| job_ptr->assoc_ptr, |
| job_ptr->limit_set.qos, |
| true, LOG_LEVEL_ERROR); |
| |
| if ((qos_error != SLURM_SUCCESS) && |
| !job_ptr->limit_set.qos) { |
| job_fail_qos(job_ptr, __func__, true); |
| } else if (job_ptr->qos_ptr) { |
| job_ptr->qos_id = job_ptr->qos_ptr->id; |
| if (job_ptr->state_reason == FAIL_QOS) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| } |
| } |
| |
| /* |
| * do this after the format string just in case for some |
| * reason the tres_alloc_str is NULL but not the fmt_str |
| */ |
| if (job_ptr->tres_alloc_str) |
| assoc_mgr_set_tres_cnt_array( |
| &job_ptr->tres_alloc_cnt, job_ptr->tres_alloc_str, |
| 0, true, false, NULL); |
| else |
| job_set_alloc_tres(job_ptr, true); |
| |
| if (job_ptr->tres_req_str) |
| assoc_mgr_set_tres_cnt_array( |
| &job_ptr->tres_req_cnt, job_ptr->tres_req_str, 0, true, |
| false, NULL); |
| else |
| job_set_req_tres(job_ptr, true); |
| assoc_mgr_unlock(&locks); |
| |
| build_node_details(job_ptr, false); /* set node_addr */ |
| gres_stepmgr_job_build_details( |
| job_ptr->gres_list_alloc, job_ptr->nodes, |
| &job_ptr->gres_detail_cnt, |
| &job_ptr->gres_detail_str, |
| &job_ptr->gres_used); |
| |
| on_job_state_change(job_ptr, job_ptr->job_state); |
| last_job_update = now; |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("Incomplete job record"); |
| rc = SLURM_ERROR; |
| |
| free_it: |
| if (job_ptr) { |
| if (job_ptr->job_id == 0) |
| job_ptr->job_id = NO_VAL; |
| purge_job_record(job_ptr->job_id); |
| } |
| |
| return rc; |
| } |
| |
| /* _add_job_hash - add a job hash entry for given job record, job_id must |
| * already be set |
| * IN job_ptr - pointer to job record |
| * Globals: hash table updated |
| */ |
| static void _add_job_hash(job_record_t *job_ptr) |
| { |
| int inx; |
| |
| inx = JOB_HASH_INX(job_ptr->job_id); |
| job_ptr->job_next = job_hash[inx]; |
| job_hash[inx] = job_ptr; |
| } |
| |
| /* _remove_job_hash - remove a job hash entry for given job record, job_id must |
| * already be set |
| * IN job_ptr - pointer to job record |
| * IN type - which hash to work with |
| * Globals: hash table updated |
| */ |
| static void _remove_job_hash(job_record_t *job_entry, job_hash_type_t type) |
| { |
| job_record_t *job_ptr, **job_pptr; |
| |
| xassert(job_entry); |
| |
| on_job_state_change(job_entry, NO_VAL); |
| |
| switch (type) { |
| case JOB_HASH_JOB: |
| job_pptr = &job_hash[JOB_HASH_INX(job_entry->job_id)]; |
| break; |
| case JOB_HASH_ARRAY_JOB: |
| job_pptr = &job_array_hash_j[ |
| JOB_HASH_INX(job_entry->array_job_id)]; |
| break; |
| case JOB_HASH_ARRAY_TASK: |
| job_pptr = &job_array_hash_t[ |
| JOB_ARRAY_HASH_INX(job_entry->array_job_id, |
| job_entry->array_task_id)]; |
| break; |
| default: |
| fatal("%s: unknown job_hash_type_t %d", __func__, type); |
| return; |
| } |
| |
| while ((job_pptr != NULL) && (*job_pptr != NULL) && |
| ((job_ptr = *job_pptr) != job_entry)) { |
| xassert(job_ptr->magic == JOB_MAGIC); |
| switch (type) { |
| case JOB_HASH_JOB: |
| job_pptr = &job_ptr->job_next; |
| break; |
| case JOB_HASH_ARRAY_JOB: |
| job_pptr = &job_ptr->job_array_next_j; |
| break; |
| case JOB_HASH_ARRAY_TASK: |
| job_pptr = &job_ptr->job_array_next_t; |
| break; |
| } |
| } |
| |
| if (job_pptr == NULL || *job_pptr == NULL) { |
| if (job_entry->job_id == NO_VAL) |
| return; |
| |
| switch (type) { |
| case JOB_HASH_JOB: |
| error("%s: Could not find hash entry for JobId=%u", |
| __func__, job_entry->job_id); |
| break; |
| case JOB_HASH_ARRAY_JOB: |
| error("%s: job array hash error %u", __func__, |
| job_entry->array_job_id); |
| break; |
| case JOB_HASH_ARRAY_TASK: |
| error("%s: job array, task ID hash error %u_%u", |
| __func__, |
| job_entry->array_job_id, |
| job_entry->array_task_id); |
| break; |
| } |
| return; |
| } |
| |
| switch (type) { |
| case JOB_HASH_JOB: |
| *job_pptr = job_entry->job_next; |
| job_entry->job_next = NULL; |
| break; |
| case JOB_HASH_ARRAY_JOB: |
| *job_pptr = job_entry->job_array_next_j; |
| job_entry->job_array_next_j = NULL; |
| break; |
| case JOB_HASH_ARRAY_TASK: |
| *job_pptr = job_entry->job_array_next_t; |
| job_entry->job_array_next_t = NULL; |
| break; |
| } |
| } |
| |
| /* _add_job_array_hash - add a job hash entry for given job record, |
| * array_job_id and array_task_id must already be set |
| * IN job_ptr - pointer to job record |
| * Globals: hash table updated |
| */ |
| void _add_job_array_hash(job_record_t *job_ptr) |
| { |
| int inx; |
| |
| if (job_ptr->array_task_id == NO_VAL) |
| return; /* Not a job array */ |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| inx = JOB_HASH_INX(job_ptr->array_job_id); |
| job_ptr->job_array_next_j = job_array_hash_j[inx]; |
| job_array_hash_j[inx] = job_ptr; |
| |
| inx = JOB_ARRAY_HASH_INX(job_ptr->array_job_id,job_ptr->array_task_id); |
| job_ptr->job_array_next_t = job_array_hash_t[inx]; |
| job_array_hash_t[inx] = job_ptr; |
| } |
| |
| /* For the job array data structure, build the string representation of the |
| * bitmap. |
| * NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */ |
| extern void build_array_str(job_record_t *job_ptr) |
| { |
| job_array_struct_t *array_recs = job_ptr->array_recs; |
| |
| if (!array_recs || array_recs->task_id_str || |
| !array_recs->task_id_bitmap || |
| (job_ptr->array_task_id != NO_VAL) || |
| (bit_ffs(job_ptr->array_recs->task_id_bitmap) == -1)) |
| return; |
| |
| array_recs->task_id_str = bit_fmt_hexmask(array_recs->task_id_bitmap); |
| |
| /* Update the job in the database. */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| |
| /* Return true if ALL tasks of specific array job ID are complete */ |
| extern bool test_job_array_complete(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr; |
| int inx; |
| |
| job_ptr = find_job_record(array_job_id); |
| if (job_ptr) { |
| if (!IS_JOB_COMPLETE(job_ptr)) |
| return false; |
| if (job_ptr->array_recs && job_ptr->array_recs->max_exit_code) |
| return false; |
| } |
| |
| /* Need to test individual job array records */ |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) { |
| if (!IS_JOB_COMPLETE(job_ptr)) |
| return false; |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| return true; |
| } |
| |
| /* Return true if ALL tasks of specific array job ID are completed */ |
| extern bool test_job_array_completed(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr; |
| int inx; |
| |
| job_ptr = find_job_record(array_job_id); |
| if (job_ptr) { |
| if (!IS_JOB_COMPLETED(job_ptr)) |
| return false; |
| } |
| |
| /* Need to test individual job array records */ |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) { |
| if (!IS_JOB_COMPLETED(job_ptr)) |
| return false; |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| return true; |
| } |
| |
| /* |
| * Return true if ALL tasks of specific array job ID are completed AND |
| * all except for the head job have been purged. |
| */ |
| static bool _test_job_array_purged(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr, *head_job_ptr; |
| int inx; |
| |
| head_job_ptr = find_job_record(array_job_id); |
| if (head_job_ptr) { |
| if (!IS_JOB_COMPLETED(head_job_ptr)) |
| return false; |
| } |
| |
| /* Need to test individual job array records */ |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if ((job_ptr->array_job_id == array_job_id) && |
| (job_ptr != head_job_ptr)) { |
| return false; |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| return true; |
| } |
| |
| /* Return true if ALL tasks of specific array job ID are finished */ |
| extern bool test_job_array_finished(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr; |
| int inx; |
| |
| job_ptr = find_job_record(array_job_id); |
| if (job_ptr) { |
| if (!IS_JOB_FINISHED(job_ptr)) |
| return false; |
| } |
| |
| /* Need to test individual job array records */ |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) { |
| if (!IS_JOB_FINISHED(job_ptr)) |
| return false; |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| |
| return true; |
| } |
| |
| /* Return true if ANY tasks of specific array job ID are pending */ |
| extern bool test_job_array_pending(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr; |
| int inx; |
| |
| job_ptr = find_job_record(array_job_id); |
| if (job_ptr) { |
| if (IS_JOB_PENDING(job_ptr) || IS_JOB_CONFIGURING(job_ptr)) |
| return true; |
| if (job_ptr->array_recs && job_ptr->array_recs->task_cnt) |
| return true; |
| } |
| |
| /* Need to test individual job array records */ |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) { |
| if (IS_JOB_PENDING(job_ptr) || |
| IS_JOB_CONFIGURING(job_ptr)) |
| return true; |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| return false; |
| } |
| |
| /* For a given job ID return the number of PENDING tasks which have their |
| * own separate job_record (do not count tasks in pending META job record) */ |
| extern int num_pending_job_array_tasks(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr; |
| int count = 0, inx; |
| |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if ((job_ptr->array_job_id == array_job_id) && |
| IS_JOB_PENDING(job_ptr)) |
| count++; |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| |
| return count; |
| } |
| |
| static void _foreach_by_job_callback(job_record_t *job_ptr, |
| for_each_by_job_id_args_t *args) |
| { |
| xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS); |
| |
| if (!job_ptr || !job_ptr->job_id) |
| return; |
| |
| xassert(!!args->ro_callback != !!args->callback); /* xor */ |
| xassert(args->control == FOR_EACH_JOB_BY_ID_EACH_CONT); |
| |
| if (args->ro_callback) |
| args->control = args->ro_callback(job_ptr, args->filter, |
| args->callback_arg); |
| else |
| args->control = args->callback(job_ptr, args->filter, |
| args->callback_arg); |
| |
| xassert(args->control > FOR_EACH_JOB_BY_ID_EACH_INVALID); |
| xassert(args->control < FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX); |
| } |
| |
| static int _foreach_job_by_id_single(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| for_each_by_job_id_args_t *args = arg; |
| |
| xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS); |
| |
| _foreach_by_job_callback(job_ptr, args); |
| |
| switch (args->control) |
| { |
| case FOR_EACH_JOB_BY_ID_EACH_CONT: |
| return SLURM_SUCCESS; |
| case FOR_EACH_JOB_BY_ID_EACH_STOP: |
| case FOR_EACH_JOB_BY_ID_EACH_FAIL: |
| /* must return error as only way to stop list foreach */ |
| return SLURM_ERROR; |
| case FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX: |
| case FOR_EACH_JOB_BY_ID_EACH_INVALID: |
| fatal_abort("should never happen"); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_by_het_job(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| for_each_by_job_id_args_t *args = arg; |
| |
| xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS); |
| |
| /* Filter to only this HetJob */ |
| |
| if (job_ptr->het_job_id != args->job_ptr->het_job_id) |
| return SLURM_SUCCESS; |
| |
| if ((args->filter->het_job_offset != NO_VAL) && |
| (job_ptr->het_job_offset != args->filter->het_job_offset)) |
| return SLURM_SUCCESS; |
| |
| return _foreach_job_by_id_single(job_ptr, args); |
| } |
| |
| static job_record_t *_find_first_job_array_rec(uint32_t array_job_id) |
| { |
| job_record_t *job_ptr; |
| int inx; |
| |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) |
| return job_ptr; |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| |
| return NULL; |
| } |
| |
| static void _foreach_job_by_id_array(for_each_by_job_id_args_t *args) |
| { |
| job_record_t *meta, *start; |
| bool dumped_meta = false, dumped_linked = false; |
| const uint32_t array_job_id = args->job_ptr->array_job_id; |
| |
| xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS); |
| |
| start = _find_first_job_array_rec(array_job_id); |
| |
| for (job_record_t *j = start; j; j = j->job_array_next_j) { |
| if (j->array_job_id != array_job_id) |
| continue; |
| |
| if (j->array_recs) |
| dumped_meta = true; |
| |
| if ((args->filter->array_task_id != NO_VAL) && |
| (j->array_task_id != args->filter->array_task_id)) |
| continue; |
| |
| debug3("%pJ->array_recs=%"PRIxPTR" linked to %pJ->array_recs=%"PRIxPTR, |
| start, (uintptr_t) (start ? start->array_recs : NULL), j, |
| (uintptr_t) j->array_recs); |
| |
| _foreach_by_job_callback(j, args); |
| |
| if (args->control != FOR_EACH_JOB_BY_ID_EACH_CONT) |
| return; |
| |
| dumped_linked = true; |
| } |
| |
| if (dumped_meta) |
| return; |
| |
| meta = find_job_record(args->job_ptr->array_job_id); |
| |
| if (!meta) |
| return; |
| |
| if (!meta->array_recs) { |
| debug3("%pJ->array_recs = NULL", meta); |
| return; |
| } else if (!meta->array_recs->task_id_bitmap) { |
| debug3("%pJ->array_recs->task_id_bitmap = NULL", meta); |
| return; |
| } |
| |
| xassert(meta->array_task_id == NO_VAL); |
| xassert(meta->array_job_id == meta->job_id); |
| |
| _foreach_by_job_callback(meta, args); |
| |
| if (args->control != FOR_EACH_JOB_BY_ID_EACH_CONT) |
| return; |
| |
| if (dumped_linked) |
| return; |
| |
| for (int i = 0; i < bit_size(meta->array_recs->task_id_bitmap); i++) { |
| if (!bit_test(meta->array_recs->task_id_bitmap, i)) { |
| job_record_t *job_ptr = |
| find_job_array_rec(meta->array_job_id, i); |
| |
| if (!job_ptr) |
| continue; |
| |
| if ((args->filter->array_task_id != NO_VAL) && |
| (job_ptr->array_task_id != |
| args->filter->array_task_id)) |
| continue; |
| |
| debug3("%pJ resolving bit:%d=%c to %pJ", |
| meta, i, |
| (bit_test(meta->array_recs->task_id_bitmap, i) ? |
| '1' : '0'), job_ptr); |
| |
| _foreach_by_job_callback(job_ptr, args); |
| |
| if (args->control != FOR_EACH_JOB_BY_ID_EACH_CONT) |
| return; |
| } |
| } |
| } |
| |
| static void _find_array_expression_jobs(const slurm_selected_step_t *filter, |
| for_each_by_job_id_args_t *args, |
| list_t *match_job_list, |
| slurm_selected_step_t *not_found_tasks) |
| { |
| int32_t i_first, i_last; |
| uint32_t job_id = filter->step_id.job_id; |
| bitstr_t *array_bitmap = filter->array_bitmap; |
| job_record_t *job_ptr; |
| job_record_t *meta_job = NULL; |
| |
| i_first = bit_ffs(array_bitmap); |
| if (i_first >= 0) |
| i_last = bit_fls(array_bitmap); |
| else |
| i_last = -2; |
| for (int i = i_first; i <= i_last; i++) { |
| if (!bit_test(array_bitmap, i)) |
| continue; |
| job_ptr = find_job_array_rec(job_id, i); |
| /* If !job_ptr, the array task does not exist. */ |
| if (!job_ptr && !not_found_tasks) |
| continue; |
| if (!job_ptr && not_found_tasks) { |
| bit_set(not_found_tasks->array_bitmap, i); |
| continue; |
| } |
| if (IS_JOB_PENDING(job_ptr) && job_ptr->array_recs) { |
| /* Found the meta job, or a task in the meta job */ |
| meta_job = job_ptr; |
| continue; |
| } |
| /* |
| * Found an array task that has been split from the meta record, |
| * or the meta record is not pending and all tasks have already |
| * been split out. |
| */ |
| list_append(match_job_list, job_ptr); |
| } |
| if (meta_job) |
| list_append(match_job_list, meta_job); |
| } |
| |
| static void _foreach_array_bitmap(const slurm_selected_step_t *filter, |
| for_each_by_job_id_args_t *args) |
| { |
| list_t *match_job_list = list_create(NULL); /* list of job_record_t */ |
| slurm_selected_step_t *not_found_tasks = NULL; |
| foreach_job_by_id_control_t tmp_control = |
| FOR_EACH_JOB_BY_ID_EACH_INVALID; |
| |
| /* |
| * Call the callback once per record that has been split off. |
| * Then call it once for the meta record. |
| */ |
| if (args->null_callback) { |
| not_found_tasks = xmalloc(sizeof(*not_found_tasks)); |
| memcpy(not_found_tasks, filter, sizeof(*not_found_tasks)); |
| not_found_tasks->array_bitmap = |
| bit_alloc(bit_size(filter->array_bitmap)); |
| } |
| _find_array_expression_jobs(filter, args, match_job_list, |
| not_found_tasks); |
| |
| /* |
| * Because this is a single filter, call both callbacks (no-match and |
| * match). Then, set args->control to the max of each callback return |
| * value. |
| */ |
| if (not_found_tasks) { |
| if (bit_ffs(not_found_tasks->array_bitmap) != -1) |
| tmp_control = args->null_callback(not_found_tasks, |
| args->callback_arg); |
| FREE_NULL_BITMAP(not_found_tasks->array_bitmap); |
| xfree(not_found_tasks); |
| } |
| |
| if (list_count(match_job_list)) |
| (void) list_for_each(match_job_list, _foreach_job_by_id_single, |
| args); |
| |
| FREE_NULL_LIST(match_job_list); |
| if (tmp_control != FOR_EACH_JOB_BY_ID_EACH_INVALID) |
| args->control = MAX(args->control, tmp_control); |
| } |
| |
| static int _walk_jobs_by_selected_step(const slurm_selected_step_t *filter, |
| for_each_by_job_id_args_t *args) |
| { |
| xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS); |
| |
| if (!filter->step_id.job_id) { |
| /* 0 is never a valid job so just return now */ |
| goto done; |
| } else if (filter->step_id.job_id == NO_VAL) { |
| /* walk all jobs */ |
| (void) list_for_each_ro(job_list, _foreach_job_by_id_single, |
| args); |
| goto done; |
| } |
| |
| xassert(!((filter->array_task_id != NO_VAL) && |
| (filter->het_job_offset != NO_VAL))); |
| |
| if (filter->array_bitmap) { |
| _foreach_array_bitmap(filter, args); |
| goto done; |
| } |
| |
| if (filter->array_task_id != NO_VAL) |
| args->job_ptr = find_job_array_rec(filter->step_id.job_id, |
| filter->array_task_id); |
| else if (filter->het_job_offset != NO_VAL) |
| args->job_ptr = find_job_record(filter->step_id.job_id + |
| filter->het_job_offset); |
| else /* not array task or het component */ |
| args->job_ptr = find_job_record(filter->step_id.job_id); |
| |
| if (!args->job_ptr) { |
| if (!args->null_callback) { |
| args->control = FOR_EACH_JOB_BY_ID_EACH_CONT; |
| } else { |
| args->control = args->null_callback(filter, |
| args->callback_arg); |
| } |
| goto done; |
| } |
| |
| if (args->job_ptr->het_job_list) { |
| xassert(args->job_ptr->het_job_id > 0); |
| (void) list_for_each(args->job_ptr->het_job_list, |
| _foreach_by_het_job, args); |
| } else if (args->job_ptr->array_job_id != args->job_ptr->job_id) { |
| /* Pack regular (not array/het) job */ |
| _foreach_by_job_callback(args->job_ptr, args); |
| } else { |
| /* array job */ |
| _foreach_job_by_id_array(args); |
| } |
| |
| done: |
| switch (args->control) |
| { |
| case FOR_EACH_JOB_BY_ID_EACH_STOP: |
| case FOR_EACH_JOB_BY_ID_EACH_CONT: |
| return args->count; |
| case FOR_EACH_JOB_BY_ID_EACH_FAIL: |
| return args->count * -1; |
| case FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX: |
| case FOR_EACH_JOB_BY_ID_EACH_INVALID: |
| fatal_abort("should never happen"); |
| } |
| |
| fatal_abort("should never happen"); |
| } |
| |
| extern int foreach_job_by_id(const slurm_selected_step_t *filter, |
| JobForEachFunc callback, |
| JobNullForEachFunc null_callback, void *arg) |
| { |
| for_each_by_job_id_args_t args = { |
| .magic = MAGIC_FOREACH_BY_JOBID_ARGS, |
| .control = FOR_EACH_JOB_BY_ID_EACH_CONT, |
| .count = 0, |
| .callback = callback, |
| .callback_arg = arg, |
| .null_callback = null_callback, |
| .filter = filter, |
| }; |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| return _walk_jobs_by_selected_step(filter, &args); |
| } |
| |
| extern int foreach_job_by_id_ro(const slurm_selected_step_t *filter, |
| JobROForEachFunc callback, |
| JobNullForEachFunc null_callback, void *arg) |
| { |
| for_each_by_job_id_args_t args = { |
| .magic = MAGIC_FOREACH_BY_JOBID_ARGS, |
| .control = FOR_EACH_JOB_BY_ID_EACH_CONT, |
| .count = 0, |
| .ro_callback = callback, |
| .callback_arg = arg, |
| .null_callback = null_callback, |
| .filter = filter, |
| }; |
| |
| xassert(verify_lock(JOB_LOCK, READ_LOCK)); |
| |
| return _walk_jobs_by_selected_step(filter, &args); |
| } |
| |
| /* |
| * find_job_array_rec - return a pointer to the job record with the given |
| * array_job_id/array_task_id |
| * IN job_id - requested job's id |
| * IN array_task_id - requested job's task id, |
| * NO_VAL if none specified (i.e. not a job array) |
| * INFINITE return any task for specified job id |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern job_record_t *find_job_array_rec(uint32_t array_job_id, |
| uint32_t array_task_id) |
| { |
| job_record_t *job_ptr, *match_job_ptr = NULL; |
| int inx; |
| |
| if (array_task_id == NO_VAL) |
| return find_job_record(array_job_id); |
| |
| if (array_task_id == INFINITE) { /* find by job ID */ |
| /* Look for job record with all of the pending tasks */ |
| job_ptr = find_job_record(array_job_id); |
| if (job_ptr && job_ptr->array_recs && |
| (job_ptr->array_job_id == array_job_id)) |
| return job_ptr; |
| |
| inx = JOB_HASH_INX(array_job_id); |
| job_ptr = job_array_hash_j[inx]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) { |
| match_job_ptr = job_ptr; |
| if (!IS_JOB_FINISHED(job_ptr)) { |
| return job_ptr; |
| } |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| return match_job_ptr; |
| } else { /* Find specific task ID */ |
| inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id); |
| job_ptr = job_array_hash_t[inx]; |
| while (job_ptr) { |
| if ((job_ptr->array_job_id == array_job_id) && |
| (job_ptr->array_task_id == array_task_id)) { |
| return job_ptr; |
| } |
| job_ptr = job_ptr->job_array_next_t; |
| } |
| /* Look for job record with all of the pending tasks */ |
| job_ptr = find_job_record(array_job_id); |
| if (job_ptr && job_ptr->array_recs && |
| job_ptr->array_recs->task_id_bitmap) { |
| inx = bit_size(job_ptr->array_recs->task_id_bitmap); |
| if ((array_task_id < inx) && |
| bit_test(job_ptr->array_recs->task_id_bitmap, |
| array_task_id)) { |
| return job_ptr; |
| } |
| } |
| return NULL; /* None found */ |
| } |
| } |
| |
| static int _find_het_job(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| job_record_t *search_job_ptr = arg; |
| |
| if (search_job_ptr->het_job_id != het_job->het_job_id) { |
| error("%s: Bad het_job_list for %u", __func__, |
| search_job_ptr->job_id); |
| return 0; |
| } |
| |
| if (het_job->het_job_offset == search_job_ptr->het_job_offset) |
| return 1; |
| |
| return 0; |
| } |
| |
| /* |
| * find_het_job_record - return a pointer to the job record with the given ID |
| * IN job_id - requested job's ID |
| * IN het_job_offset - hetjob component offset |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern job_record_t *find_het_job_record(uint32_t job_id, |
| uint32_t het_job_offset) |
| { |
| job_record_t *het_job_leader = find_job_record(job_id); |
| job_record_t search_job_rec = { 0 }; |
| |
| if (!het_job_leader) |
| return NULL; |
| if (het_job_leader->het_job_offset == het_job_offset) |
| return het_job_leader; |
| |
| if (!het_job_leader->het_job_list) |
| return NULL; |
| |
| search_job_rec.job_id = het_job_leader->job_id; |
| search_job_rec.het_job_id = het_job_leader->het_job_id; |
| search_job_rec.het_job_offset = het_job_offset; |
| |
| return list_find_first(het_job_leader->het_job_list, _find_het_job, |
| &search_job_rec); |
| } |
| |
| /* |
| * find_job_record - return a pointer to the job record with the given job_id |
| * IN job_id - requested job's id |
| * RET pointer to the job's record, NULL on error |
| */ |
| extern job_record_t *find_job_record(uint32_t job_id) |
| { |
| job_record_t *job_ptr; |
| xassert(verify_lock(JOB_LOCK, READ_LOCK)); |
| |
| job_ptr = job_hash[JOB_HASH_INX(job_id)]; |
| while (job_ptr) { |
| if (job_ptr->job_id == job_id) |
| return job_ptr; |
| job_ptr = job_ptr->job_next; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * Set a requeued job to PENDING and COMPLETING if all the nodes are completed |
| * and the EpilogSlurmctld is not running |
| */ |
| static void _set_requeued_job_pending_completing(job_record_t *job_ptr) |
| { |
| /* do this after the epilog complete, setting it here is too early */ |
| //job_record_set_sluid(job_ptr); |
| //job_ptr->details->submit_time = now; |
| |
| if (job_ptr->node_cnt || job_ptr->epilog_running) |
| job_state_set(job_ptr, (JOB_PENDING | JOB_COMPLETING)); |
| else |
| job_state_set(job_ptr, JOB_PENDING); |
| } |
| |
| /* |
| * Kill job or job step |
| * |
| * IN job_step_kill_msg - msg with specs on which job/step to cancel. |
| * IN job_ptr - pointer to job_record_t to cancel. |
| * IN uid - uid of user requesting job/step cancel. |
| */ |
| static int _kill_job_step(job_step_kill_msg_t *job_step_kill_msg, |
| job_record_t *job_ptr, uint32_t uid) |
| { |
| DEF_TIMERS; |
| int error_code = SLURM_SUCCESS; |
| xassert(job_ptr); |
| xassert(job_ptr->job_id == job_step_kill_msg->step_id.job_id); |
| |
| START_TIMER; |
| |
| log_flag(TRACE_JOBS, "%s: enter %pJ", __func__, job_ptr); |
| |
| /* do RPC call */ |
| if (job_step_kill_msg->step_id.step_id == NO_VAL) { |
| /* NO_VAL means the whole job, not individual steps */ |
| error_code = job_signal(job_ptr, |
| job_step_kill_msg->signal, |
| job_step_kill_msg->flags, uid, |
| false); |
| END_TIMER2(__func__); |
| |
| /* return result */ |
| if (error_code) { |
| log_flag(STEPS, "Signal %u %pJ by UID=%u: %s", |
| job_step_kill_msg->signal, job_ptr, uid, |
| slurm_strerror(error_code)); |
| } else { |
| if (job_step_kill_msg->signal == SIGKILL) { |
| log_flag(STEPS, "%s: Cancel of %pJ by UID=%u, %s", |
| __func__, job_ptr, uid, TIME_STR); |
| slurmctld_diag_stats.jobs_canceled++; |
| } else |
| log_flag(STEPS, "%s: Signal %u of %pJ by UID=%u, %s", |
| __func__, job_step_kill_msg->signal, |
| job_ptr, uid, TIME_STR); |
| |
| /* Below function provides its own locking */ |
| schedule_job_save(); |
| } |
| } else { |
| error_code = job_step_signal(&job_step_kill_msg->step_id, |
| job_step_kill_msg->signal, |
| job_step_kill_msg->flags, |
| uid); |
| END_TIMER2(__func__); |
| |
| /* return result */ |
| if (error_code) { |
| log_flag(STEPS, "Signal %u of JobId=%u StepId=%u by UID=%u: %s", |
| job_step_kill_msg->signal, |
| job_step_kill_msg->step_id.job_id, |
| job_step_kill_msg->step_id.step_id, uid, |
| slurm_strerror(error_code)); |
| } else { |
| if (job_step_kill_msg->signal == SIGKILL) |
| log_flag(STEPS, "%s: Cancel of JobId=%u StepId=%u by UID=%u %s", |
| __func__, |
| job_step_kill_msg->step_id.job_id, |
| job_step_kill_msg->step_id.step_id, |
| uid, |
| TIME_STR); |
| else |
| log_flag(STEPS, "%s: Signal %u of JobId=%u StepId=%u by UID=%u %s", |
| __func__, job_step_kill_msg->signal, |
| job_step_kill_msg->step_id.job_id, |
| job_step_kill_msg->step_id.step_id, |
| uid, |
| TIME_STR); |
| |
| /* Below function provides its own locking */ |
| schedule_job_save(); |
| } |
| } |
| |
| log_flag(TRACE_JOBS, "%s: return %pJ", __func__, job_ptr); |
| return error_code; |
| } |
| |
| static int _foreach_kill_hetjob_step(void *x, void *arg) |
| { |
| job_record_t *het_job_ptr = x; |
| foreach_kill_hetjob_step_t *foreach_kill_hetjob_step = arg; |
| job_step_kill_msg_t *job_step_kill_msg = |
| foreach_kill_hetjob_step->job_step_kill_msg; |
| int rc; |
| |
| job_step_kill_msg->step_id.job_id = het_job_ptr->job_id; |
| rc = _kill_job_step(job_step_kill_msg, het_job_ptr, |
| foreach_kill_hetjob_step->uid); |
| |
| if (rc != SLURM_SUCCESS) |
| foreach_kill_hetjob_step->rc = rc; |
| |
| return 0; |
| } |
| |
| /* |
| * Kill job or job step |
| * |
| * IN job_step_kill_msg - msg with specs on which job/step to cancel. |
| * IN uid - uid of user requesting job/step cancel. |
| */ |
| extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid) |
| { |
| /* Locks: Read config, write job, write node, read fed */ |
| slurmctld_lock_t job_write_lock = { |
| .conf = READ_LOCK, |
| .job = WRITE_LOCK, |
| .node = WRITE_LOCK, |
| .fed = READ_LOCK, |
| }; |
| |
| job_record_t *job_ptr; |
| int error_code = SLURM_SUCCESS; |
| |
| lock_slurmctld(job_write_lock); |
| job_ptr = find_job_record(job_step_kill_msg->step_id.job_id); |
| |
| if (!job_ptr) { |
| info("%s: invalid JobId=%u", |
| __func__, job_step_kill_msg->step_id.job_id); |
| error_code = ESLURM_INVALID_JOB_ID; |
| goto endit; |
| } |
| |
| if ((job_ptr->user_id != uid) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, false)) { |
| error("Security violation, JOB_CANCEL RPC for %pJ from uid %u", |
| job_ptr, uid); |
| error_code = ESLURM_ACCESS_DENIED; |
| goto endit; |
| } |
| |
| if (job_ptr->het_job_list && |
| (job_step_kill_msg->signal == SIGKILL) && |
| (job_step_kill_msg->step_id.step_id != NO_VAL)) { |
| foreach_kill_hetjob_step_t foreach_kill_hetjob_step = { |
| .job_step_kill_msg = job_step_kill_msg, |
| .rc = SLURM_SUCCESS, |
| .uid = uid, |
| }; |
| (void) list_for_each(job_ptr->het_job_list, |
| _foreach_kill_hetjob_step, |
| &foreach_kill_hetjob_step); |
| if (foreach_kill_hetjob_step.rc != SLURM_SUCCESS) |
| error_code = foreach_kill_hetjob_step.rc; |
| } else { |
| error_code = _kill_job_step(job_step_kill_msg, job_ptr, uid); |
| } |
| |
| endit: |
| unlock_slurmctld(job_write_lock); |
| |
| return error_code; |
| } |
| |
| static int _foreach_kill_job_by_part_name(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| foreach_kill_job_by_t *foreach_kill_job_by = arg; |
| part_record_t *part_ptr = foreach_kill_job_by->part_ptr; |
| time_t now = foreach_kill_job_by->now; |
| bool pending = false, suspended = false; |
| |
| pending = IS_JOB_PENDING(job_ptr); |
| if (job_ptr->part_ptr_list) { |
| /* Remove partition if candidate for a job */ |
| int rebuild_name_list = |
| list_delete_first(job_ptr->part_ptr_list, |
| slurm_find_ptr_in_list, |
| part_ptr); |
| |
| if (rebuild_name_list == -1) { |
| error("%s: Processing part_ptr_list, this should never happen.", |
| __func__); |
| } else if (rebuild_name_list) { |
| if (list_count(job_ptr->part_ptr_list) > 0) { |
| rebuild_job_part_list(job_ptr); |
| job_ptr->part_ptr = |
| list_peek(job_ptr->part_ptr_list); |
| } else { |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| } |
| } |
| } |
| |
| if (job_ptr->part_ptr != part_ptr) |
| return 0; |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| uint32_t suspend_job_state = job_ptr->job_state; |
| /* we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_state_set(job_ptr, JOB_CANCELLED); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_state_set(job_ptr, suspend_job_state); |
| suspended = true; |
| } |
| if (IS_JOB_RUNNING(job_ptr) || suspended) { |
| foreach_kill_job_by->kill_job_cnt++; |
| info("Killing %pJ on defunct partition %s", |
| job_ptr, part_ptr->name); |
| job_state_set(job_ptr, (JOB_NODE_FAIL | JOB_COMPLETING)); |
| build_cg_bitmap(job_ptr); |
| job_ptr->state_reason = FAIL_DOWN_PARTITION; |
| xfree(job_ptr->state_desc); |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| job_ptr->exit_code = 1; |
| job_completion_logger(job_ptr, false); |
| if (!pending) |
| deallocate_nodes(job_ptr, false, suspended, false); |
| } else if (pending) { |
| foreach_kill_job_by->kill_job_cnt++; |
| info("Killing %pJ on defunct partition %s", |
| job_ptr, part_ptr->name); |
| job_state_set(job_ptr, JOB_CANCELLED); |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| job_ptr->exit_code = 1; |
| job_completion_logger(job_ptr, false); |
| fed_mgr_job_complete(job_ptr, 0, now); |
| } |
| job_ptr->part_ptr = NULL; |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| |
| return 0; |
| } |
| |
| /* |
| * kill_job_by_part_name - Given a partition name, deallocate resource for |
| * its jobs and kill them. All jobs associated with this partition |
| * will have their partition pointer cleared. |
| * IN part_name - name of a partition |
| * RET number of jobs associated with this partition |
| */ |
| extern int kill_job_by_part_name(char *part_name) |
| { |
| foreach_kill_job_by_t foreach_kill_job_by = { |
| .now = time(NULL), |
| .part_ptr = find_part_record(part_name), |
| }; |
| |
| if (!foreach_kill_job_by.part_ptr) /* No such partition */ |
| return 0; |
| |
| (void) list_for_each(job_list, _foreach_kill_job_by_part_name, |
| &foreach_kill_job_by); |
| |
| if (foreach_kill_job_by.kill_job_cnt) |
| last_job_update = foreach_kill_job_by.now; |
| return foreach_kill_job_by.kill_job_cnt; |
| } |
| |
| /* |
| * partition_in_use - determine whether a partition is in use by a RUNNING |
| * PENDING or SUSPENDED job or reservations |
| * IN part_name - name of a partition |
| * RET true if the partition is in use, else false |
| */ |
| extern bool partition_in_use(char *part_name) |
| { |
| part_record_t *part_ptr; |
| |
| part_ptr = find_part_record (part_name); |
| if (part_ptr == NULL) /* No such partition */ |
| return false; |
| |
| /* check jobs */ |
| if (list_find_first(job_list, _find_job_part, part_ptr)) |
| return true; |
| |
| /* check reservations */ |
| if (list_find_first(resv_list, _find_resv_part, part_ptr)) |
| return true; |
| |
| return false; |
| } |
| |
| static bool _job_node_test(job_record_t *job_ptr, int node_inx) |
| { |
| if (job_ptr->node_bitmap && |
| bit_test(job_ptr->node_bitmap, node_inx)) |
| return true; |
| return false; |
| } |
| |
| static int _find_het_job_on_node(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| int node_inx = *(int *)arg; |
| |
| if (_job_node_test(het_job, node_inx)) |
| return 1; |
| /* |
| * After a DOWN node is removed from another job component, |
| * we have no way to identify other hetjob components with |
| * the same node, so assume if one component is in NODE_FAILED |
| * state, they all should be. |
| */ |
| if (IS_JOB_NODE_FAILED(het_job)) |
| return 1; |
| |
| return 0; |
| } |
| |
| static bool _het_job_on_node(job_record_t *job_ptr, int node_inx) |
| { |
| job_record_t *het_job_leader; |
| |
| if (!job_ptr->het_job_id) |
| return _job_node_test(job_ptr, node_inx); |
| |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (!het_job_leader) { |
| error("%s: Hetjob leader %pJ not found", |
| __func__, job_ptr); |
| return _job_node_test(job_ptr, node_inx); |
| } |
| if (!het_job_leader->het_job_list) { |
| error("%s: Hetjob leader %pJ job list is NULL", |
| __func__, job_ptr); |
| return _job_node_test(job_ptr, node_inx); |
| } |
| |
| return list_find_first(het_job_leader->het_job_list, |
| _find_het_job_on_node, &node_inx); |
| } |
| |
| static int _foreach_kill_running_job_by_node(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| foreach_kill_job_by_t *foreach_kill_job_by = arg; |
| node_record_t *node_ptr = foreach_kill_job_by->node_ptr; |
| bool suspended = false; |
| job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; |
| |
| xassert(node_ptr); |
| |
| if (!_het_job_on_node(job_ptr, node_ptr->index)) |
| return 0; /* job not on this node */ |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| uint32_t suspend_job_state = job_ptr->job_state; |
| /* |
| * we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_state_set(job_ptr, JOB_CANCELLED); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_state_set(job_ptr, suspend_job_state); |
| suspended = true; |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| if (!bit_test(job_ptr->node_bitmap_cg, node_ptr->index)) |
| return 0; |
| foreach_kill_job_by->kill_job_cnt++; |
| bit_clear(job_ptr->node_bitmap_cg, node_ptr->index); |
| job_update_tres_cnt(job_ptr, node_ptr->index); |
| if (job_ptr->node_cnt) |
| (job_ptr->node_cnt)--; |
| else { |
| error("node_cnt underflow on %pJ", job_ptr); |
| } |
| cleanup_completing(job_ptr, true); |
| |
| if (node_ptr->comp_job_cnt) |
| node_ptr->comp_job_cnt--; |
| else { |
| error("Node %s comp_job_cnt underflow, %pJ", |
| node_ptr->name, job_ptr); |
| } |
| } else if (IS_JOB_RUNNING(job_ptr) || suspended) { |
| foreach_kill_job_by->kill_job_cnt++; |
| if ((job_ptr->details) && |
| (job_ptr->kill_on_node_fail == 0) && |
| (job_ptr->node_cnt > 1) && |
| !IS_JOB_CONFIGURING(job_ptr)) { |
| bitstr_t *orig_job_node_bitmap; |
| |
| /* keep job running on remaining nodes */ |
| srun_node_fail(job_ptr, node_ptr->name); |
| error("Removing failed node %s from %pJ", |
| node_ptr->name, job_ptr); |
| job_pre_resize_acctg(job_ptr); |
| kill_step_on_node(job_ptr, node_ptr, true); |
| orig_job_node_bitmap = |
| bit_copy(job_resrcs_ptr->node_bitmap); |
| excise_node_from_job(job_ptr, node_ptr); |
| /* Resize the bitmaps of the job's steps */ |
| rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap); |
| FREE_NULL_BITMAP(orig_job_node_bitmap); |
| (void) gs_job_start(job_ptr); |
| gres_stepmgr_job_build_details( |
| job_ptr->gres_list_alloc, |
| job_ptr->nodes, |
| &job_ptr->gres_detail_cnt, |
| &job_ptr->gres_detail_str, |
| &job_ptr->gres_used); |
| job_post_resize_acctg(job_ptr); |
| } else if (job_ptr->batch_flag && |
| ((job_ptr->details && |
| job_ptr->details->requeue) || |
| (foreach_kill_job_by->requeue_on_resume_failure && |
| IS_NODE_POWERED_DOWN(node_ptr) && |
| IS_JOB_CONFIGURING(job_ptr)))) { |
| srun_node_fail(job_ptr, node_ptr->name); |
| info("requeue job %pJ due to failure of node %s", |
| job_ptr, node_ptr->name); |
| job_ptr->time_last_active = foreach_kill_job_by->now; |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(foreach_kill_job_by->now, |
| job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = foreach_kill_job_by->now; |
| |
| /* |
| * We want this job to look like it |
| * was terminated in the accounting logs. |
| * Set a new submit time so the restarted |
| * job looks like a new job. |
| */ |
| job_state_set(job_ptr, JOB_NODE_FAIL); |
| job_ptr->failed_node = xstrdup(node_ptr->name); |
| build_cg_bitmap(job_ptr); |
| job_ptr->exit_code = 1; |
| job_completion_logger(job_ptr, true); |
| deallocate_nodes(job_ptr, false, suspended, false); |
| |
| _set_requeued_job_pending_completing(job_ptr); |
| |
| job_ptr->restart_cnt++; |
| |
| /* clear signal sent flag on requeue */ |
| job_ptr->warn_flags &= ~WARN_SENT; |
| |
| job_ptr->exit_code = 0; |
| |
| /* |
| * Since the job completion logger |
| * removes the submit we need to add it |
| * again. |
| */ |
| acct_policy_add_job_submit(job_ptr, false); |
| |
| if (!job_ptr->node_bitmap_cg || |
| bit_ffs(job_ptr->node_bitmap_cg) == -1) |
| batch_requeue_fini(job_ptr); |
| } else { |
| info("Killing %pJ on failed node %s", |
| job_ptr, node_ptr->name); |
| srun_node_fail(job_ptr, node_ptr->name); |
| job_state_set(job_ptr, |
| (JOB_NODE_FAIL | JOB_COMPLETING)); |
| job_ptr->failed_node = xstrdup(node_ptr->name); |
| build_cg_bitmap(job_ptr); |
| job_ptr->state_reason = FAIL_DOWN_NODE; |
| xfree(job_ptr->state_desc); |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(foreach_kill_job_by->now, |
| job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = foreach_kill_job_by->now; |
| job_ptr->exit_code = 1; |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, suspended, false); |
| } |
| } |
| return 0; |
| } |
| |
| extern int kill_running_job_by_node_ptr(node_record_t *node_ptr) |
| { |
| static time_t sched_update = 0; |
| static bool requeue_on_resume_failure = false; |
| foreach_kill_job_by_t foreach_kill_job_by = { |
| .node_ptr = node_ptr, |
| .now = time(NULL), |
| }; |
| |
| if (sched_update != slurm_conf.last_update) { |
| requeue_on_resume_failure = |
| xstrcasestr(slurm_conf.sched_params, |
| "requeue_on_resume_failure"); |
| sched_update = slurm_conf.last_update; |
| } |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); |
| |
| if (!foreach_kill_job_by.node_ptr) /* No such node */ |
| return 0; |
| |
| foreach_kill_job_by.requeue_on_resume_failure = |
| requeue_on_resume_failure; |
| |
| list_for_each(job_list, _foreach_kill_running_job_by_node, |
| &foreach_kill_job_by); |
| |
| if (foreach_kill_job_by.kill_job_cnt) |
| last_job_update = foreach_kill_job_by.now; |
| |
| return foreach_kill_job_by.kill_job_cnt; |
| } |
| |
| /* Remove one node from a job's allocation */ |
| extern void excise_node_from_job(job_record_t *job_ptr, |
| node_record_t *node_ptr) |
| { |
| make_node_idle(node_ptr, job_ptr); /* updates bitmap */ |
| xfree(job_ptr->nodes); |
| job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap); |
| |
| job_ptr->total_nodes = job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap); |
| |
| (void) select_g_job_resized(job_ptr, node_ptr); |
| } |
| |
| /* |
| * dump_job_desc - dump the incoming job submit request message |
| * IN job_desc - job specification from RPC |
| */ |
| void dump_job_desc(job_desc_msg_t *job_desc) |
| { |
| long pn_min_cpus, pn_min_tmp_disk, min_cpus; |
| uint64_t pn_min_memory; |
| long time_limit, priority, contiguous, nice, time_min; |
| long kill_on_node_fail, shared, immediate, wait_all_nodes; |
| long cpus_per_task, requeue, num_tasks, overcommit; |
| long ntasks_per_node, ntasks_per_socket, ntasks_per_core; |
| long ntasks_per_tres; |
| int spec_count; |
| char *mem_type, buf[256], *signal_flags, *spec_type, *job_id; |
| |
| if (get_log_level() < LOG_LEVEL_DEBUG3) |
| return; |
| |
| if (job_desc == NULL) |
| return; |
| |
| if (job_desc->job_id_str) |
| job_id = job_desc->job_id_str; |
| else if (job_desc->job_id == NO_VAL) |
| job_id = "N/A"; |
| else { |
| snprintf(buf, sizeof(buf), "%u", job_desc->job_id); |
| job_id = buf; |
| } |
| debug3("JobDesc: user_id=%u JobId=%s partition=%s name=%s", |
| job_desc->user_id, job_id, |
| job_desc->partition, job_desc->name); |
| |
| min_cpus = (job_desc->min_cpus != NO_VAL) ? |
| (long) job_desc->min_cpus : -1L; |
| pn_min_cpus = (job_desc->pn_min_cpus != NO_VAL16) ? |
| (long) job_desc->pn_min_cpus : -1L; |
| if (job_desc->core_spec == NO_VAL16) { |
| spec_type = "core"; |
| spec_count = -1; |
| } else if (job_desc->core_spec & CORE_SPEC_THREAD) { |
| spec_type = "thread"; |
| spec_count = job_desc->core_spec & (~CORE_SPEC_THREAD); |
| } else { |
| spec_type = "core"; |
| spec_count = job_desc->core_spec; |
| } |
| debug3(" cpus=%ld-%u pn_min_cpus=%ld %s_spec=%d", |
| min_cpus, job_desc->max_cpus, pn_min_cpus, |
| spec_type, spec_count); |
| |
| debug3(" Nodes=%u-[%u] Sock/Node=%u Core/Sock=%u Thread/Core=%u", |
| job_desc->min_nodes, job_desc->max_nodes, |
| job_desc->sockets_per_node, job_desc->cores_per_socket, |
| job_desc->threads_per_core); |
| |
| if (job_desc->pn_min_memory == NO_VAL64) { |
| pn_min_memory = -1L; |
| mem_type = "job"; |
| } else if (job_desc->pn_min_memory & MEM_PER_CPU) { |
| pn_min_memory = job_desc->pn_min_memory & (~MEM_PER_CPU); |
| mem_type = "cpu"; |
| } else { |
| pn_min_memory = job_desc->pn_min_memory; |
| mem_type = "job"; |
| } |
| pn_min_tmp_disk = (job_desc->pn_min_tmp_disk != NO_VAL) ? |
| (long) job_desc->pn_min_tmp_disk : -1L; |
| debug3(" pn_min_memory_%s=%"PRIu64" pn_min_tmp_disk=%ld", |
| mem_type, pn_min_memory, pn_min_tmp_disk); |
| immediate = (job_desc->immediate == 0) ? 0L : 1L; |
| debug3(" immediate=%ld reservation=%s", |
| immediate, job_desc->reservation); |
| debug3(" features=%s batch_features=%s cluster_features=%s prefer=%s", |
| job_desc->features, job_desc->batch_features, |
| job_desc->cluster_features, job_desc->prefer); |
| |
| debug3(" req_nodes=%s exc_nodes=%s", |
| job_desc->req_nodes, job_desc->exc_nodes); |
| |
| time_limit = (job_desc->time_limit != NO_VAL) ? |
| (long) job_desc->time_limit : -1L; |
| time_min = (job_desc->time_min != NO_VAL) ? |
| (long) job_desc->time_min : time_limit; |
| priority = (job_desc->priority != NO_VAL) ? |
| (long) job_desc->priority : -1L; |
| contiguous = (job_desc->contiguous != NO_VAL16) ? |
| (long) job_desc->contiguous : -1L; |
| shared = (job_desc->shared != NO_VAL16) ? |
| (long) job_desc->shared : -1L; |
| debug3(" time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld", |
| time_min, time_limit, priority, contiguous, shared); |
| |
| kill_on_node_fail = (job_desc->kill_on_node_fail != |
| NO_VAL16) ? |
| (long) job_desc->kill_on_node_fail : -1L; |
| if (job_desc->script) /* log has problem with string len & null */ |
| debug3(" kill_on_node_fail=%ld script=%.40s...", |
| kill_on_node_fail, job_desc->script); |
| else |
| debug3(" kill_on_node_fail=%ld script=(null)", |
| kill_on_node_fail); |
| |
| if (job_desc->argc == 1) |
| debug3(" argv=\"%s\"", |
| job_desc->argv[0]); |
| else if (job_desc->argc == 2) |
| debug3(" argv=%s,%s", |
| job_desc->argv[0], |
| job_desc->argv[1]); |
| else if (job_desc->argc > 2) |
| debug3(" argv=%s,%s,%s,...", |
| job_desc->argv[0], |
| job_desc->argv[1], |
| job_desc->argv[2]); |
| |
| if (job_desc->env_size == 1) |
| debug3(" environment=\"%s\"", |
| job_desc->environment[0]); |
| else if (job_desc->env_size == 2) |
| debug3(" environment=%s,%s", |
| job_desc->environment[0], |
| job_desc->environment[1]); |
| else if (job_desc->env_size > 2) |
| debug3(" environment=%s,%s,%s,...", |
| job_desc->environment[0], |
| job_desc->environment[1], |
| job_desc->environment[2]); |
| |
| if (job_desc->spank_job_env_size == 1) |
| debug3(" spank_job_env=\"%s\"", |
| job_desc->spank_job_env[0]); |
| else if (job_desc->spank_job_env_size == 2) |
| debug3(" spank_job_env=%s,%s", |
| job_desc->spank_job_env[0], |
| job_desc->spank_job_env[1]); |
| else if (job_desc->spank_job_env_size > 2) |
| debug3(" spank_job_env=%s,%s,%s,...", |
| job_desc->spank_job_env[0], |
| job_desc->spank_job_env[1], |
| job_desc->spank_job_env[2]); |
| |
| debug3(" stdin=%s stdout=%s stderr=%s", |
| job_desc->std_in, job_desc->std_out, job_desc->std_err); |
| |
| debug3(" work_dir=%s alloc_node:sid=%s:%u", |
| job_desc->work_dir, |
| job_desc->alloc_node, job_desc->alloc_sid); |
| |
| debug3(" resp_host=%s alloc_resp_port=%u other_port=%u", |
| job_desc->resp_host, |
| job_desc->alloc_resp_port, job_desc->other_port); |
| debug3(" dependency=%s account=%s qos=%s comment=%s", |
| job_desc->dependency, job_desc->account, |
| job_desc->qos, job_desc->comment); |
| |
| num_tasks = (job_desc->num_tasks != NO_VAL) ? |
| (long) job_desc->num_tasks : -1L; |
| overcommit = (job_desc->overcommit != NO_VAL8) ? |
| (long) job_desc->overcommit : -1L; |
| nice = (job_desc->nice != NO_VAL) ? |
| ((int64_t)job_desc->nice - NICE_OFFSET) : 0; |
| debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld " |
| "open_mode=%u overcommit=%ld acctg_freq=%s", |
| job_desc->mail_type, job_desc->mail_user, nice, num_tasks, |
| job_desc->open_mode, overcommit, job_desc->acctg_freq); |
| |
| slurm_make_time_str(&job_desc->begin_time, buf, sizeof(buf)); |
| cpus_per_task = (job_desc->cpus_per_task != NO_VAL16) ? |
| (long) job_desc->cpus_per_task : -1L; |
| requeue = (job_desc->requeue != NO_VAL16) ? |
| (long) job_desc->requeue : -1L; |
| debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld " |
| "licenses=%s", |
| job_desc->network, buf, cpus_per_task, requeue, |
| job_desc->licenses); |
| |
| slurm_make_time_str(&job_desc->end_time, buf, sizeof(buf)); |
| wait_all_nodes = (job_desc->wait_all_nodes != NO_VAL16) ? |
| (long) job_desc->wait_all_nodes : -1L; |
| if (job_desc->warn_flags & KILL_JOB_BATCH) |
| signal_flags = "B:"; |
| else |
| signal_flags = ""; |
| cpu_freq_debug(NULL, NULL, buf, sizeof(buf), job_desc->cpu_freq_gov, |
| job_desc->cpu_freq_min, job_desc->cpu_freq_max, |
| NO_VAL); |
| debug3(" end_time=%s signal=%s%u@%u wait_all_nodes=%ld cpu_freq=%s", |
| buf, signal_flags, job_desc->warn_signal, job_desc->warn_time, |
| wait_all_nodes, buf); |
| |
| ntasks_per_node = (job_desc->ntasks_per_node != NO_VAL16) ? |
| (long) job_desc->ntasks_per_node : -1L; |
| ntasks_per_socket = (job_desc->ntasks_per_socket != |
| NO_VAL16) ? |
| (long) job_desc->ntasks_per_socket : -1L; |
| ntasks_per_core = (job_desc->ntasks_per_core != NO_VAL16) ? |
| (long) job_desc->ntasks_per_core : -1L; |
| ntasks_per_tres = (job_desc->ntasks_per_tres != NO_VAL16) ? |
| (long) job_desc->ntasks_per_tres : -1L; |
| debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld ntasks_per_core=%ld ntasks_per_tres=%ld", |
| ntasks_per_node, ntasks_per_socket, ntasks_per_core, |
| ntasks_per_tres); |
| |
| debug3(" mem_bind=%u:%s plane_size:%u", |
| job_desc->mem_bind_type, job_desc->mem_bind, |
| job_desc->plane_size); |
| debug3(" array_inx=%s", job_desc->array_inx); |
| debug3(" burst_buffer=%s", job_desc->burst_buffer); |
| debug3(" mcs_label=%s", job_desc->mcs_label); |
| slurm_make_time_str(&job_desc->deadline, buf, sizeof(buf)); |
| debug3(" deadline=%s", buf); |
| debug3(" bitflags=0x%"PRIx64" delay_boot=%u", |
| job_desc->bitflags, job_desc->delay_boot); |
| |
| if (job_desc->cpus_per_tres) |
| debug3(" CPUs_per_TRES=%s", job_desc->cpus_per_tres); |
| if (job_desc->mem_per_tres) |
| debug3(" Mem_per_TRES=%s", job_desc->mem_per_tres); |
| if (job_desc->tres_bind) |
| debug3(" TRES_bind=%s", job_desc->tres_bind); |
| if (job_desc->tres_freq) |
| debug3(" TRES_freq=%s", job_desc->tres_freq); |
| if (job_desc->tres_per_job) |
| debug3(" TRES_per_job=%s", job_desc->tres_per_job); |
| if (job_desc->tres_per_node) |
| debug3(" TRES_per_node=%s", job_desc->tres_per_node); |
| if (job_desc->tres_per_socket) |
| debug3(" TRES_per_socket=%s", job_desc->tres_per_socket); |
| if (job_desc->tres_per_task) |
| debug3(" TRES_per_task=%s", job_desc->tres_per_task); |
| |
| if (job_desc->container || job_desc->container_id) |
| debug3(" container=%s container-id=%s", |
| job_desc->container, job_desc->container_id); |
| } |
| |
| /* |
| * init_job_conf - initialize the job configuration tables and values. |
| * this should be called after creating node information, but |
| * before creating any job entries. Pre-existing job entries are |
| * left unchanged. |
| * NOTE: The job hash table size does not change after initial creation. |
| * global: last_job_update - time of last job table update |
| * job_list - pointer to global job list |
| * purge_jobs_list - pointer to purge_jobs_list |
| */ |
| void init_job_conf(void) |
| { |
| if (job_list == NULL) { |
| job_count = 0; |
| job_list = list_create(_move_to_purge_jobs_list); |
| } |
| |
| last_job_update = time(NULL); |
| |
| if (!purge_files_list) { |
| purge_files_list = list_create(xfree_ptr); |
| } |
| |
| if (!purge_jobs_list) |
| purge_jobs_list = list_create(job_record_delete); |
| } |
| |
| /* |
| * rehash_jobs - Create or rebuild the job hash table. |
| */ |
| extern void rehash_jobs(void) |
| { |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| if (job_hash == NULL) { |
| hash_table_size = slurm_conf.max_job_cnt; |
| job_hash = xcalloc(hash_table_size, sizeof(job_record_t *)); |
| job_array_hash_j = xcalloc(hash_table_size, |
| sizeof(job_record_t *)); |
| job_array_hash_t = xcalloc(hash_table_size, |
| sizeof(job_record_t *)); |
| if (xstrcasestr(slurm_conf.sched_params, |
| "enable_job_state_cache")) |
| setup_job_state_hash(hash_table_size); |
| } else if (hash_table_size < (slurm_conf.max_job_cnt / 2)) { |
| /* If the MaxJobCount grows by too much, the hash table will |
| * be ineffective without rebuilding. We don't presently bother |
| * to rebuild the hash table, but cut MaxJobCount back as |
| * needed. */ |
| error ("MaxJobCount reset too high, restart slurmctld"); |
| slurm_conf.max_job_cnt = hash_table_size; |
| } |
| } |
| |
| /* Create an exact copy of an existing job record for a job array. |
| * IN job_ptr - META job record for a job array, which is to become an |
| * individual task of the job array. |
| * Set the job's array_task_id to the task to be split out. |
| * RET - The new job record, which is the new META job record. */ |
| extern job_record_t *job_array_split(job_record_t *job_ptr, bool list_add) |
| { |
| job_record_t *job_ptr_pend = NULL; |
| job_details_t *job_details, *details_new, *save_details; |
| uint32_t save_job_id, save_db_flags = job_ptr->db_flags; |
| uint64_t save_db_index = job_ptr->db_index; |
| priority_factors_t *save_prio_factors; |
| list_t *save_step_list = NULL; |
| int i; |
| |
| job_ptr_pend = _create_job_record(0, list_add); |
| |
| _remove_job_hash(job_ptr, JOB_HASH_JOB); |
| job_ptr_pend->job_id = job_ptr->job_id; |
| if (_set_job_id(job_ptr) != SLURM_SUCCESS) |
| fatal("%s: _set_job_id error", __func__); |
| if (!job_ptr->array_recs) { |
| fatal_abort("%s: %pJ record lacks array structure", |
| __func__, job_ptr); |
| } |
| |
| /* |
| * Copy most of original job data. |
| * This could be done in parallel, but performance was worse. |
| */ |
| save_job_id = job_ptr_pend->job_id; |
| save_details = job_ptr_pend->details; |
| save_prio_factors = job_ptr_pend->prio_factors; |
| save_step_list = job_ptr_pend->step_list; |
| memcpy(job_ptr_pend, job_ptr, sizeof(job_record_t)); |
| |
| job_ptr_pend->job_id = save_job_id; |
| job_ptr_pend->details = save_details; |
| job_ptr_pend->db_flags = save_db_flags; |
| job_ptr_pend->step_list = save_step_list; |
| job_ptr_pend->db_index = save_db_index; |
| |
| job_ptr_pend->prio_factors = save_prio_factors; |
| slurm_copy_priority_factors(job_ptr_pend->prio_factors, |
| job_ptr->prio_factors); |
| |
| job_ptr_pend->account = xstrdup(job_ptr->account); |
| job_ptr_pend->admin_comment = xstrdup(job_ptr->admin_comment); |
| job_ptr_pend->alias_list = NULL; |
| job_ptr_pend->alloc_node = xstrdup(job_ptr->alloc_node); |
| job_ptr_pend->node_addrs = NULL; |
| |
| job_ptr_pend->array_recs = job_ptr->array_recs; |
| job_ptr->array_recs = NULL; |
| |
| if (job_ptr_pend->array_recs && |
| job_ptr_pend->array_recs->task_id_bitmap) { |
| bit_clear(job_ptr_pend->array_recs->task_id_bitmap, |
| job_ptr_pend->array_task_id); |
| } |
| xfree(job_ptr_pend->array_recs->task_id_str); |
| if (job_ptr_pend->array_recs->task_cnt) { |
| job_ptr_pend->array_recs->task_cnt--; |
| if (job_ptr_pend->array_recs->task_cnt <= 1) { |
| /* |
| * This is the last task of the job array, so we need to |
| * set array_task_id to a specific task id. We also |
| * need to call job_array_post_sched() to do cleanup |
| * on the array, specifically how job_array_post_sched() |
| * handles adding the job to the array_hash, otherwise |
| * we'll get errors. |
| */ |
| i = bit_ffs(job_ptr_pend->array_recs->task_id_bitmap); |
| if (i < 0) { |
| error("%s: No tasks in task_id_bitmap for %pJ", |
| __func__, job_ptr_pend); |
| job_ptr_pend->array_task_id = NO_VAL; |
| } else { |
| job_ptr_pend->array_task_id = i; |
| job_array_post_sched(job_ptr_pend, true); |
| } |
| } else { |
| /* Still have tasks left to split off in the array */ |
| job_ptr_pend->array_task_id = NO_VAL; |
| } |
| } else { |
| error("%pJ array_recs->task_cnt underflow", |
| job_ptr); |
| job_ptr_pend->array_task_id = NO_VAL; |
| } |
| |
| job_ptr_pend->batch_features = xstrdup(job_ptr->batch_features); |
| job_ptr_pend->batch_host = NULL; |
| job_ptr_pend->burst_buffer = xstrdup(job_ptr->burst_buffer); |
| job_ptr_pend->burst_buffer_state = xstrdup(job_ptr->burst_buffer_state); |
| job_ptr_pend->clusters = xstrdup(job_ptr->clusters); |
| job_ptr_pend->comment = xstrdup(job_ptr->comment); |
| job_ptr_pend->container = xstrdup(job_ptr->container); |
| job_ptr_pend->container_id = xstrdup(job_ptr->container_id); |
| job_ptr_pend->extra = xstrdup(job_ptr->extra); |
| if ((extra_constraints_parse(job_ptr_pend->extra, |
| &job_ptr_pend->extra_constraints)) != |
| SLURM_SUCCESS) |
| error("%s: %pJ Invalid extra_constraints %s", |
| __func__, job_ptr, job_ptr_pend->extra); |
| |
| |
| job_ptr_pend->fed_details = _dup_job_fed_details(job_ptr->fed_details); |
| |
| /* job_details_t *details; *** NOTE: Copied below */ |
| |
| job_ptr_pend->limit_set.tres = xcalloc(slurmctld_tres_cnt, |
| sizeof(uint16_t)); |
| memcpy(job_ptr_pend->limit_set.tres, job_ptr->limit_set.tres, |
| sizeof(uint16_t) * slurmctld_tres_cnt); |
| |
| _add_job_hash(job_ptr); /* Sets job_next */ |
| _add_job_hash(job_ptr_pend); /* Sets job_next */ |
| _add_job_array_hash(job_ptr); |
| job_ptr_pend->job_resrcs = NULL; |
| |
| job_ptr_pend->id = copy_identity(job_ptr->id); |
| job_ptr_pend->licenses = xstrdup(job_ptr->licenses); |
| job_ptr_pend->licenses_allocated = NULL; |
| job_ptr_pend->license_list = license_copy(job_ptr->license_list); |
| job_ptr_pend->licenses_to_preempt = NULL; |
| job_ptr_pend->lic_req = xstrdup(job_ptr->lic_req); |
| job_ptr_pend->mail_user = xstrdup(job_ptr->mail_user); |
| job_ptr_pend->mcs_label = xstrdup(job_ptr->mcs_label); |
| job_ptr_pend->name = xstrdup(job_ptr->name); |
| job_ptr_pend->network = xstrdup(job_ptr->network); |
| job_ptr_pend->node_bitmap = NULL; |
| job_ptr_pend->node_bitmap_cg = NULL; |
| job_ptr_pend->node_bitmap_pr = NULL; |
| job_ptr_pend->node_bitmap_preempt = NULL; |
| job_ptr_pend->nodes = NULL; |
| job_ptr_pend->nodes_completing = NULL; |
| job_ptr_pend->nodes_pr = NULL; |
| job_ptr_pend->origin_cluster = xstrdup(job_ptr->origin_cluster); |
| job_ptr_pend->partition = xstrdup(job_ptr->partition); |
| job_ptr_pend->part_ptr_list = part_list_copy(job_ptr->part_ptr_list); |
| /* On jobs that are held the priority_array isn't set up yet, |
| * so check to see if it exists before copying. */ |
| if ((job_ptr->part_ptr_list || job_ptr->qos_list) && |
| job_ptr->prio_mult) { |
| job_ptr_pend->prio_mult = |
| xmalloc(sizeof(*job_ptr_pend->prio_mult)); |
| |
| if (job_ptr->prio_mult->priority_array) { |
| i = xsize(job_ptr->prio_mult->priority_array); |
| job_ptr_pend->prio_mult->priority_array = xmalloc(i); |
| memcpy(job_ptr_pend->prio_mult->priority_array, |
| job_ptr->prio_mult->priority_array, i); |
| } |
| |
| job_ptr_pend->prio_mult->priority_array_names = |
| xstrdup(job_ptr->prio_mult->priority_array_names); |
| } else if (job_ptr->prio_mult) { |
| /* this should never happen */ |
| error("%s: prio_mult is set without part_ptr_list or qos_list, setting prio_mult to NULL.", |
| __func__); |
| job_ptr_pend->prio_mult = NULL; |
| } |
| if (job_ptr->qos_list) |
| job_ptr_pend->qos_list = list_shallow_copy(job_ptr->qos_list); |
| job_ptr_pend->resv_name = xstrdup(job_ptr->resv_name); |
| if (job_ptr->resv_list) |
| job_ptr_pend->resv_list = list_shallow_copy(job_ptr->resv_list); |
| job_ptr_pend->resv_ports = NULL; |
| job_ptr_pend->resv_port_array = NULL; |
| job_ptr_pend->resp_host = xstrdup(job_ptr->resp_host); |
| job_ptr_pend->selinux_context = xstrdup(job_ptr->selinux_context); |
| job_ptr_pend->sched_nodes = NULL; |
| if (job_ptr->spank_job_env_size) { |
| job_ptr_pend->spank_job_env = |
| xcalloc((job_ptr->spank_job_env_size + 1), |
| sizeof(char *)); |
| for (i = 0; i < job_ptr->spank_job_env_size; i++) { |
| job_ptr_pend->spank_job_env[i] = |
| xstrdup(job_ptr->spank_job_env[i]); |
| } |
| } |
| job_ptr_pend->state_desc = xstrdup(job_ptr->state_desc); |
| |
| job_ptr_pend->system_comment = xstrdup(job_ptr->system_comment); |
| |
| i = sizeof(uint64_t) * slurmctld_tres_cnt; |
| job_ptr_pend->tres_req_cnt = xmalloc(i); |
| memcpy(job_ptr_pend->tres_req_cnt, job_ptr->tres_req_cnt, i); |
| job_ptr_pend->tres_req_str = xstrdup(job_ptr->tres_req_str); |
| job_ptr_pend->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_req_str); |
| job_ptr_pend->tres_alloc_str = NULL; |
| job_ptr_pend->tres_fmt_alloc_str = NULL; |
| job_ptr_pend->tres_alloc_cnt = NULL; |
| |
| job_ptr_pend->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres); |
| job_ptr_pend->mem_per_tres = xstrdup(job_ptr->mem_per_tres); |
| job_ptr_pend->tres_bind = xstrdup(job_ptr->tres_bind); |
| job_ptr_pend->tres_freq = xstrdup(job_ptr->tres_freq); |
| job_ptr_pend->tres_per_job = xstrdup(job_ptr->tres_per_job); |
| job_ptr_pend->tres_per_node = xstrdup(job_ptr->tres_per_node); |
| job_ptr_pend->tres_per_socket = xstrdup(job_ptr->tres_per_socket); |
| job_ptr_pend->tres_per_task = xstrdup(job_ptr->tres_per_task); |
| |
| job_ptr_pend->user_name = xstrdup(job_ptr->user_name); |
| job_ptr_pend->wckey = xstrdup(job_ptr->wckey); |
| job_ptr_pend->deadline = job_ptr->deadline; |
| |
| job_details = job_ptr->details; |
| details_new = job_ptr_pend->details; |
| memcpy(details_new, job_details, sizeof(job_details_t)); |
| |
| /* |
| * Reset the preempt_start_time or high priority array jobs will hang |
| * for a period before preempting more jobs. |
| */ |
| details_new->preempt_start_time = 0; |
| |
| details_new->acctg_freq = xstrdup(job_details->acctg_freq); |
| if (job_details->argc) { |
| details_new->argv = |
| xcalloc((job_details->argc + 1), sizeof(char *)); |
| for (i = 0; i < job_details->argc; i++) { |
| details_new->argv[i] = xstrdup(job_details->argv[i]); |
| } |
| } |
| details_new->cpu_bind = xstrdup(job_details->cpu_bind); |
| details_new->cpu_bind_type = job_details->cpu_bind_type; |
| details_new->cpu_freq_min = job_details->cpu_freq_min; |
| details_new->cpu_freq_max = job_details->cpu_freq_max; |
| details_new->cpu_freq_gov = job_details->cpu_freq_gov; |
| details_new->depend_list = depended_list_copy(job_details->depend_list); |
| details_new->dependency = xstrdup(job_details->dependency); |
| details_new->orig_dependency = xstrdup(job_details->orig_dependency); |
| if (job_details->env_cnt) { |
| details_new->env_sup = |
| xcalloc((job_details->env_cnt + 1), sizeof(char *)); |
| for (i = 0; i < job_details->env_cnt; i++) { |
| details_new->env_sup[i] = |
| xstrdup(job_details->env_sup[i]); |
| } |
| } |
| if (job_details->exc_node_bitmap) { |
| details_new->exc_node_bitmap = |
| bit_copy(job_details->exc_node_bitmap); |
| } |
| details_new->exc_nodes = xstrdup(job_details->exc_nodes); |
| details_new->feature_list = |
| feature_list_copy(job_details->feature_list); |
| details_new->features = xstrdup(job_details->features); |
| details_new->cluster_features = xstrdup(job_details->cluster_features); |
| if (job_details->job_size_bitmap) { |
| details_new->job_size_bitmap = |
| bit_copy(job_details->job_size_bitmap); |
| } |
| details_new->prefer = xstrdup(job_details->prefer); |
| details_new->prefer_list = |
| feature_list_copy(job_details->prefer_list); |
| set_job_features_use(details_new); |
| if (job_details->mc_ptr) { |
| i = sizeof(multi_core_data_t); |
| details_new->mc_ptr = xmalloc(i); |
| memcpy(details_new->mc_ptr, job_details->mc_ptr, i); |
| } |
| details_new->mem_bind = xstrdup(job_details->mem_bind); |
| details_new->mem_bind_type = job_details->mem_bind_type; |
| details_new->qos_req = xstrdup(job_details->qos_req); |
| details_new->resv_req = xstrdup(job_details->resv_req); |
| if (job_details->req_node_bitmap) { |
| details_new->req_node_bitmap = |
| bit_copy(job_details->req_node_bitmap); |
| } |
| details_new->req_context = xstrdup(job_details->req_context); |
| details_new->req_nodes = xstrdup(job_details->req_nodes); |
| details_new->std_err = xstrdup(job_details->std_err); |
| details_new->std_in = xstrdup(job_details->std_in); |
| details_new->std_out = xstrdup(job_details->std_out); |
| details_new->submit_line = xstrdup(job_details->submit_line); |
| details_new->work_dir = xstrdup(job_details->work_dir); |
| details_new->x11_magic_cookie = xstrdup(job_details->x11_magic_cookie); |
| details_new->env_hash = xstrdup(job_details->env_hash); |
| details_new->script_hash = xstrdup(job_details->script_hash); |
| |
| if (job_ptr->gres_list_req) { |
| if (details_new->whole_node & WHOLE_NODE_REQUIRED) { |
| multi_core_data_t *mc_ptr = details_new->mc_ptr; |
| gres_job_state_validate_t gres_js_val = { |
| .cpus_per_tres = job_ptr_pend->cpus_per_tres, |
| .mem_per_tres = job_ptr_pend->mem_per_tres, |
| .tres_freq = job_ptr_pend->tres_freq, |
| .tres_per_job = job_ptr_pend->tres_per_job, |
| .tres_per_node = job_ptr_pend->tres_per_node, |
| .tres_per_socket = job_ptr->tres_per_socket, |
| .tres_per_task = job_ptr->tres_per_task, |
| |
| .cpus_per_task = |
| &details_new->orig_cpus_per_task, |
| .max_nodes = &details_new->max_nodes, |
| .min_cpus = &details_new->min_cpus, |
| .min_nodes = &details_new->min_nodes, |
| .ntasks_per_node = |
| &details_new->ntasks_per_node, |
| .ntasks_per_socket = &mc_ptr->ntasks_per_socket, |
| .ntasks_per_tres = |
| &details_new->ntasks_per_tres, |
| .num_tasks = &details_new->num_tasks, |
| .sockets_per_node = &mc_ptr->sockets_per_node, |
| |
| .gres_list = &job_ptr_pend->gres_list_req, |
| }; |
| |
| /* |
| * We need to reset the gres_list to what was requested |
| * instead of what was given exclusively. |
| */ |
| job_ptr_pend->gres_list_req = NULL; |
| (void)gres_job_state_validate(&gres_js_val); |
| } else |
| job_ptr_pend->gres_list_req = |
| gres_job_state_list_dup(job_ptr->gres_list_req); |
| } |
| job_ptr_pend->gres_list_req_accum = NULL; |
| job_ptr_pend->gres_list_alloc = NULL; |
| job_ptr_pend->gres_detail_cnt = 0; |
| job_ptr_pend->gres_detail_str = NULL; |
| job_ptr_pend->gres_used = NULL; |
| |
| if (job_ptr->fed_details) { |
| add_fed_job_info(job_ptr); |
| /* |
| * The new (split) job needs its remote dependencies tested |
| * separately from just the meta job, so send remote |
| * dependencies to siblings if needed. |
| */ |
| if (job_ptr->details->dependency && |
| job_ptr->details->depend_list) |
| fed_mgr_submit_remote_dependencies(job_ptr, false, |
| false); |
| } |
| |
| on_job_state_change(job_ptr, job_ptr->job_state); |
| on_job_state_change(job_ptr_pend, job_ptr_pend->job_state); |
| |
| return job_ptr_pend; |
| } |
| |
| /* Add job array data structure to the job record */ |
| static void _create_job_array(job_record_t *job_ptr, job_desc_msg_t *job_desc) |
| { |
| job_details_t *details; |
| char *sep = NULL; |
| int max_run_tasks, min_task_id, max_task_id, step_task_id = 1, task_cnt; |
| |
| if (!job_desc->array_bitmap) |
| return; |
| |
| if ((min_task_id = bit_ffs(job_desc->array_bitmap)) == -1) { |
| info("%s: %pJ array_bitmap is empty", __func__, job_ptr); |
| return; |
| } |
| |
| job_ptr->array_job_id = job_ptr->job_id; |
| job_ptr->array_recs = xmalloc(sizeof(job_array_struct_t)); |
| max_task_id = bit_fls(job_desc->array_bitmap); |
| task_cnt = bit_set_count(job_desc->array_bitmap); |
| bit_realloc(job_desc->array_bitmap, max_task_id + 1); |
| job_ptr->array_recs->task_id_bitmap = job_desc->array_bitmap; |
| job_desc->array_bitmap = NULL; |
| job_ptr->array_recs->task_cnt = |
| bit_set_count(job_ptr->array_recs->task_id_bitmap); |
| if (job_ptr->array_recs->task_cnt > 1) |
| job_count += (job_ptr->array_recs->task_cnt - 1); |
| |
| if (job_desc->array_inx) |
| sep = strchr(job_desc->array_inx, '%'); |
| if (sep) { |
| max_run_tasks = atoi(sep + 1); |
| if (max_run_tasks > 0) |
| job_ptr->array_recs->max_run_tasks = max_run_tasks; |
| } |
| |
| details = job_ptr->details; |
| if (details) { |
| if (job_desc->array_inx) { |
| sep = strchr(job_desc->array_inx, ':'); |
| if (sep) |
| step_task_id = atoi(sep + 1); |
| } |
| xrecalloc(details->env_sup, |
| MAX(job_ptr->details->env_cnt, 1) + 4, |
| sizeof(char *)); |
| xstrfmtcat(details->env_sup[details->env_cnt++], |
| "SLURM_ARRAY_TASK_COUNT=%d", task_cnt); |
| xstrfmtcat(details->env_sup[details->env_cnt++], |
| "SLURM_ARRAY_TASK_MIN=%d", min_task_id); |
| xstrfmtcat(details->env_sup[details->env_cnt++], |
| "SLURM_ARRAY_TASK_MAX=%d", max_task_id); |
| xstrfmtcat(details->env_sup[details->env_cnt++], |
| "SLURM_ARRAY_TASK_STEP=%d", step_task_id); |
| } |
| |
| on_job_state_change(job_ptr, job_ptr->job_state); |
| } |
| |
| static int _select_nodes_base(job_node_select_t *job_node_select) |
| { |
| job_node_select->rc_part_limits = |
| job_limits_check(&job_node_select->job_ptr, false); |
| |
| if ((job_node_select->rc_part_limits != WAIT_NO_REASON) && |
| (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ANY)) |
| return SLURM_ERROR; |
| |
| if ((job_node_select->rc_part_limits != WAIT_NO_REASON) && |
| (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL)) { |
| if (job_node_select->rc_part_limits != WAIT_PART_DOWN) { |
| job_node_select->rc_best = |
| ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| return SLURM_SUCCESS; |
| } else { |
| job_node_select->rc_best = ESLURM_PARTITION_DOWN; |
| } |
| } |
| |
| if (job_node_select->rc_part_limits == WAIT_NO_REASON) { |
| job_node_select->rc = select_nodes(job_node_select, |
| job_node_select->test_only, |
| true, |
| SLURMDB_JOB_FLAG_SUBMIT); |
| } else if (job_node_select->rc_part_limits != WAIT_PART_CONFIG) { |
| job_node_select->rc = select_nodes(job_node_select, |
| true, |
| true, |
| SLURMDB_JOB_FLAG_SUBMIT); |
| if ((job_node_select->rc == SLURM_SUCCESS) && |
| (job_node_select->rc_part_limits == WAIT_PART_DOWN)) |
| job_node_select->rc = ESLURM_PARTITION_DOWN; |
| } |
| if ((job_node_select->rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && |
| (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL)) { |
| /* Job can not run */ |
| job_node_select->rc_best = job_node_select->rc; |
| return SLURM_SUCCESS; |
| } |
| if ((job_node_select->rc != ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && |
| (job_node_select->rc != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && |
| (job_node_select->rc != ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE) && |
| (job_node_select->rc != ESLURM_RESERVATION_BUSY) && |
| (job_node_select->rc != ESLURM_NODES_BUSY)) { |
| /* Job can run now */ |
| job_node_select->rc_best = job_node_select->rc; |
| if ((slurm_conf.enforce_part_limits == |
| PARTITION_ENFORCE_ANY) || |
| (slurm_conf.enforce_part_limits == |
| PARTITION_ENFORCE_NONE) || |
| (!job_node_select->test_only && |
| (job_node_select->rc_part_limits == WAIT_NO_REASON))) |
| return SLURM_SUCCESS; |
| } |
| if (((job_node_select->rc == ESLURM_NODES_BUSY) || |
| (job_node_select->rc == ESLURM_RESERVATION_BUSY) || |
| (job_node_select->rc == ESLURM_PORTS_BUSY)) && |
| (job_node_select->rc_best == -1)) { |
| if (job_node_select->test_only) |
| return SLURM_SUCCESS; |
| |
| /* Keep looking for partition where job can start now */ |
| job_node_select->rc_best = job_node_select->rc; |
| } |
| if ((job_node_select->job_ptr->preempt_in_progress) && |
| (job_node_select->rc != ESLURM_NODES_BUSY)) { |
| /* Already started preempting jobs, don't |
| * consider starting this job in another |
| * partition as we iterator over others. */ |
| job_node_select->test_only = true; |
| } |
| |
| return SLURM_ERROR; |
| } |
| |
| static int _foreach_select_nodes_resvs(void *object, void *args) |
| { |
| slurmctld_resv_t *resv_ptr = object; |
| job_node_select_t *job_node_select = args; |
| job_record_t *job_ptr = job_node_select->job_ptr; |
| |
| job_ptr->resv_ptr = resv_ptr; |
| job_ptr->resv_id = resv_ptr->resv_id; |
| |
| if ((job_ptr->bit_flags & JOB_PART_ASSIGNED) && resv_ptr->part_ptr) |
| job_ptr->part_ptr = resv_ptr->part_ptr; |
| |
| debug2("Try %pJ on next reservation %s", job_ptr, resv_ptr->name); |
| |
| if ((job_node_select->rc_resv = |
| _select_nodes_base(job_node_select)) == SLURM_SUCCESS) { |
| /* break if success */ |
| if ((job_node_select->rc != ESLURM_RESERVATION_NOT_USABLE) && |
| (job_node_select->rc != ESLURM_RESERVATION_BUSY)) { |
| return -1; |
| } |
| } |
| |
| return 0; |
| } |
| |
| static int _select_nodes_resvs(job_node_select_t *job_node_select) |
| { |
| job_record_t *job_ptr = job_node_select->job_ptr; |
| |
| if (!job_ptr->resv_list) |
| return _select_nodes_base(job_node_select); |
| |
| job_node_select->rc_resv = SLURM_ERROR; |
| (void) list_for_each(job_ptr->resv_list, |
| _foreach_select_nodes_resvs, |
| job_node_select); |
| |
| return job_node_select->rc_resv; |
| } |
| |
| static int _foreach_select_nodes_qos(void *object, void *args) |
| { |
| slurmdb_qos_rec_t *qos_ptr = object; |
| job_node_select_t *job_node_select = args; |
| job_record_t *job_ptr = job_node_select->job_ptr; |
| |
| job_ptr->qos_ptr = qos_ptr; |
| |
| debug2("Try %pJ on next QOS %s", job_ptr, qos_ptr->name); |
| |
| /* break if success */ |
| if ((job_node_select->rc_qos = |
| _select_nodes_resvs(job_node_select)) == SLURM_SUCCESS) |
| return -1; |
| |
| return 0; |
| } |
| |
| static int _select_nodes_qos(job_node_select_t *job_node_select) |
| { |
| job_record_t *job_ptr = job_node_select->job_ptr; |
| |
| if (!job_ptr->qos_list) |
| return _select_nodes_resvs(job_node_select); |
| |
| job_node_select->rc_qos = SLURM_ERROR; |
| (void) list_for_each(job_ptr->qos_list, |
| _foreach_select_nodes_qos, |
| job_node_select); |
| |
| return job_node_select->rc_qos; |
| } |
| |
| static int _foreach_select_nodes_part_list(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| job_node_select_t *job_node_select = arg; |
| job_record_t *job_ptr = job_node_select->job_ptr; |
| |
| job_ptr->part_ptr = part_ptr; |
| debug2("Try %pJ on next partition %s", job_ptr, part_ptr->name); |
| |
| if (_select_nodes_qos(job_node_select) == SLURM_SUCCESS) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * Wrapper for select_nodes() function that will test all valid partitions |
| * for a new job |
| * IN job_ptr - pointer to the job record |
| * IN test_only - if set do not allocate nodes, just confirm they |
| * could be allocated now |
| * OUT err_msg - error message for job, caller must xfree |
| */ |
| static int _select_nodes_parts(job_record_t *job_ptr, bool test_only, |
| char **err_msg) |
| { |
| job_node_select_t job_node_select = { |
| .err_msg = err_msg, |
| .job_ptr = job_ptr, |
| .rc_part_limits = WAIT_NO_REASON, |
| .rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE, |
| .rc_best = -1, |
| .test_only = test_only, |
| }; |
| int rc, best_rc, part_limits_rc; |
| |
| if (job_ptr->part_ptr_list) { |
| /* part_ptr_list is already sorted */ |
| (void) list_find_first(job_ptr->part_ptr_list, |
| _foreach_select_nodes_part_list, |
| &job_node_select); |
| } else { |
| /* |
| * We don't need to check the return code of this as the rc we |
| * are sending in is the rc we care about. |
| */ |
| (void)_select_nodes_qos(&job_node_select); |
| } |
| |
| rc = job_node_select.rc; |
| best_rc = job_node_select.rc_best; |
| part_limits_rc = job_node_select.rc_part_limits; |
| if (best_rc != -1) |
| rc = best_rc; |
| else if (part_limits_rc == WAIT_PART_DOWN) |
| rc = ESLURM_PARTITION_DOWN; |
| if (rc == ESLURM_NODES_BUSY) |
| job_ptr->state_reason = WAIT_RESOURCES; |
| else if ((rc == ESLURM_RESERVATION_BUSY) || |
| (rc == ESLURM_RESERVATION_NOT_USABLE)) |
| job_ptr->state_reason = WAIT_RESERVATION; |
| else if (rc == ESLURM_JOB_HELD) |
| /* Do not reset the state_reason field here. select_nodes() |
| * already set the state_reason field, and this error code |
| * does not distinguish between user and admin holds. */ |
| ; |
| else if (rc == ESLURM_NODE_NOT_AVAIL) |
| job_ptr->state_reason = WAIT_NODE_NOT_AVAIL; |
| else if (rc == ESLURM_QOS_THRES) |
| job_ptr->state_reason = WAIT_QOS_THRES; |
| else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) |
| job_ptr->state_reason = WAIT_PART_CONFIG; |
| else if (rc == ESLURM_BURST_BUFFER_WAIT) |
| job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE; |
| else if (rc == ESLURM_PARTITION_DOWN) |
| job_ptr->state_reason = WAIT_PART_DOWN; |
| else if (rc == ESLURM_INVALID_QOS) |
| job_ptr->state_reason = FAIL_QOS; |
| else if (rc == ESLURM_INVALID_ACCOUNT) |
| job_ptr->state_reason = FAIL_ACCOUNT; |
| |
| return rc; |
| } |
| |
| static inline bool _has_deadline(job_record_t *job_ptr) |
| { |
| if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) { |
| queue_job_scheduler(); |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * job_allocate - create job_records for the supplied job specification and |
| * allocate nodes for it. |
| * IN job_desc - job specifications |
| * IN immediate - if set then either initiate the job immediately or fail |
| * IN will_run - don't initiate the job if set, just test if it could run |
| * now or later |
| * OUT resp - will run response (includes start location, time, etc.) |
| * IN allocate - resource allocation request only if set, batch job if zero |
| * IN submit_uid -uid of user issuing the request |
| * OUT job_pptr - set to pointer to job record |
| * OUT err_msg - Custom error message to the user, caller to xfree results |
| * IN protocol_version - version of the code the caller is using |
| * RET 0 or an error code. If the job would only be able to execute with |
| * some change in partition configuration then |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned |
| * globals: job_list - pointer to global job list |
| * list_part - global list of partition info |
| * default_part_loc - pointer to default partition |
| */ |
| extern int job_allocate(job_desc_msg_t *job_desc, int immediate, |
| int will_run, will_run_response_msg_t **resp, |
| int allocate, uid_t submit_uid, bool cron, |
| job_record_t **job_pptr, char **err_msg, |
| uint16_t protocol_version) |
| { |
| static time_t sched_update = 0; |
| static bool defer_batch = false, defer_sched = false; |
| static bool ignore_prefer_val = false, ignore_constraint_val = false; |
| int error_code, i; |
| bool no_alloc, top_prio, test_only, too_fragmented, independent; |
| job_record_t *job_ptr; |
| time_t now = time(NULL); |
| bool held_user = false; |
| bool defer_this = false; |
| |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(PART_LOCK, READ_LOCK)); |
| |
| if (sched_update != slurm_conf.last_update) { |
| char *tmp_ptr; |
| sched_update = slurm_conf.last_update; |
| defer_batch = defer_sched = false; |
| if (xstrcasestr(slurm_conf.sched_params, "defer_batch")) |
| defer_batch = true; |
| else if (xstrcasestr(slurm_conf.sched_params, "defer")) |
| defer_sched = true; |
| if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, |
| "delay_boot="))) { |
| char *tmp_comma; |
| if ((tmp_comma = xstrstr(tmp_ptr, ","))) |
| *tmp_comma = '\0'; |
| i = time_str2secs(tmp_ptr + 11); |
| if (i != NO_VAL) |
| delay_boot = i; |
| if (tmp_comma) |
| *tmp_comma = ','; |
| } |
| bf_min_age_reserve = 0; |
| if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, |
| "bf_min_age_reserve="))) { |
| int min_age = atoi(tmp_ptr + 19); |
| if (min_age > 0) |
| bf_min_age_reserve = min_age; |
| } |
| |
| if (xstrcasestr(slurm_conf.sched_params, "allow_zero_lic")) |
| validate_cfgd_licenses = false; |
| |
| if (xstrcasestr(slurm_conf.sched_params, |
| "ignore_prefer_validation")) |
| ignore_prefer_val = true; |
| else |
| ignore_prefer_val = false; |
| if (xstrcasestr(slurm_conf.sched_params, |
| "ignore_constraint_validation")) |
| ignore_constraint_val = true; |
| else |
| ignore_constraint_val = false; |
| } |
| |
| if (job_desc->array_bitmap) |
| i = bit_set_count(job_desc->array_bitmap); |
| else |
| i = 1; |
| |
| if ((job_count + i) > slurm_conf.max_job_cnt) { |
| error("%s: MaxJobCount limit from slurm.conf reached (%u)", |
| __func__, slurm_conf.max_job_cnt); |
| return EAGAIN; |
| } |
| |
| error_code = _job_create(job_desc, allocate, will_run, cron, |
| &job_ptr, submit_uid, err_msg, |
| protocol_version); |
| *job_pptr = job_ptr; |
| if (error_code) { |
| if (job_ptr && (immediate || will_run)) { |
| /* this should never really happen here */ |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| error("%s: setting %pJ to \"%s\"", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason)); |
| } |
| return error_code; |
| } |
| xassert(job_ptr); |
| if (job_desc->array_bitmap) |
| independent = false; |
| else |
| independent = job_independent(job_ptr); |
| /* |
| * priority needs to be calculated after this since we set a |
| * begin time in job_independent and that lets us know if the |
| * job is eligible. |
| */ |
| if (job_ptr->priority == NO_VAL) |
| set_job_prio(job_ptr); |
| |
| if (job_ptr->state_reason == WAIT_HELD_USER) |
| held_user = true; |
| |
| /* Avoid resource fragmentation if important */ |
| if ((submit_uid || (job_desc->req_nodes == NULL)) && |
| independent && job_is_completing(NULL)) |
| too_fragmented = true; /* Don't pick nodes for job now */ |
| /* |
| * FIXME: Ideally we only want to refuse the request if the |
| * required node list is insufficient to satisfy the job's |
| * processor or node count requirements, but the overhead is |
| * rather high to do that right here. We let requests from |
| * user root proceed if a node list is specified, for |
| * meta-schedulers (e.g. Maui, Moab, etc.). |
| */ |
| else |
| too_fragmented = false; |
| |
| defer_this = defer_sched || (defer_batch && job_ptr->batch_flag); |
| |
| if (independent && (!too_fragmented) && !defer_this) |
| top_prio = _top_priority(job_ptr, job_desc->het_job_offset); |
| else |
| top_prio = true; /* don't bother testing, |
| * it is not runable anyway */ |
| |
| if (immediate && |
| (too_fragmented || (!top_prio) || (!independent) || defer_this)) { |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| if (!independent) { |
| debug2("%s: setting %pJ to \"%s\" due to dependency (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(ESLURM_DEPENDENCY)); |
| return ESLURM_DEPENDENCY; |
| } |
| else if (too_fragmented) { |
| debug2("%s: setting %pJ to \"%s\" due to fragmentation (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(ESLURM_FRAGMENTATION)); |
| return ESLURM_FRAGMENTATION; |
| } |
| else if (!top_prio) { |
| debug2("%s: setting %pJ to \"%s\" because it's not top priority (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(ESLURM_NOT_TOP_PRIORITY)); |
| return ESLURM_NOT_TOP_PRIORITY; |
| } else { |
| job_ptr->state_reason = FAIL_DEFER; |
| debug2("%s: setting %pJ to \"%s\" due to SchedulerParameters=defer (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(ESLURM_DEFER)); |
| return ESLURM_DEFER; |
| } |
| } |
| |
| if (will_run && resp) { |
| int rc; |
| rc = job_start_data(job_ptr, resp); |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->start_time = job_ptr->end_time = now; |
| purge_job_record(job_ptr->job_id); |
| return rc; |
| } |
| |
| /* |
| * We should have a job_ptr->details here if not something is really |
| * wrong. |
| */ |
| xassert(job_ptr->details); |
| |
| /* |
| * fed jobs need to go to the siblings first so don't attempt to |
| * schedule the job now. |
| */ |
| test_only = will_run || job_ptr->deadline || (allocate == 0) || |
| job_ptr->fed_details; |
| |
| no_alloc = test_only || too_fragmented || _has_deadline(job_ptr) || |
| (!top_prio) || (!independent) || |
| (job_desc->het_job_offset != NO_VAL) || defer_this || |
| (job_ptr->details->prefer && ignore_prefer_val) || |
| (job_ptr->details->features && ignore_constraint_val); |
| |
| no_alloc = no_alloc || (bb_g_job_test_stage_in(job_ptr, no_alloc) != 1); |
| |
| no_alloc = no_alloc || (!job_ptr->resv_name && |
| get_magnetic_resv_count()); |
| |
| /* |
| * If we have a prefer feature list check that, if not check the |
| * normal features. |
| */ |
| if (job_ptr->details->prefer && !ignore_prefer_val) { |
| job_ptr->details->features_use = job_ptr->details->prefer; |
| job_ptr->details->feature_list_use = |
| job_ptr->details->prefer_list; |
| } else if (!ignore_constraint_val) { |
| job_ptr->details->features_use = job_ptr->details->features; |
| job_ptr->details->feature_list_use = |
| job_ptr->details->feature_list; |
| } else { |
| /* |
| * Set features_use to "" because ignore_constraint_val is set. |
| * We also set no_alloc to true to avoid actually allocating |
| * with this setup. |
| * We are using an empty string rather than NULL because |
| * valid_feature_counts() will use features rather than |
| * features_use if it is NULL. |
| */ |
| job_ptr->details->features_use = ""; |
| job_ptr->details->feature_list_use = NULL; |
| } |
| |
| error_code = _select_nodes_parts(job_ptr, no_alloc, err_msg); |
| |
| set_job_features_use(job_ptr->details); |
| |
| if (!test_only) { |
| last_job_update = now; |
| } |
| |
| if (held_user) |
| job_ptr->state_reason = WAIT_HELD_USER; |
| /* |
| * Moved this (_create_job_array) here to handle when a job |
| * array is submitted since we |
| * want to know the array task count when we check the job against |
| * QOS/Assoc limits |
| */ |
| _create_job_array(job_ptr, job_desc); |
| |
| slurmctld_diag_stats.jobs_submitted += |
| (job_ptr->array_recs && job_ptr->array_recs->task_cnt) ? |
| job_ptr->array_recs->task_cnt : 1; |
| |
| acct_policy_add_job_submit(job_ptr, false); |
| |
| /* |
| * This only needs to happen if the job didn't schedule immediately. |
| * select_nodes() can start it if there are nodes available, but if |
| * that didn't happened send the start record now. |
| */ |
| if (!IS_JOB_IN_DB(job_ptr)) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| if ((error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && |
| (slurm_conf.enforce_part_limits != PARTITION_ENFORCE_NONE)) |
| ; /* Reject job submission */ |
| else if ((error_code == ESLURM_NODES_BUSY) || |
| (error_code == ESLURM_RESERVATION_BUSY) || |
| (error_code == ESLURM_JOB_HELD) || |
| (error_code == ESLURM_NODE_NOT_AVAIL) || |
| (error_code == ESLURM_QOS_THRES) || |
| (error_code == ESLURM_ACCOUNTING_POLICY) || |
| (error_code == ESLURM_RESERVATION_NOT_USABLE) || |
| (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) || |
| (error_code == ESLURM_BURST_BUFFER_WAIT) || |
| (error_code == ESLURM_PARTITION_DOWN) || |
| (error_code == ESLURM_LICENSES_UNAVAILABLE) || |
| (error_code == ESLURM_PORTS_BUSY) || |
| ((error_code == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && |
| (job_ptr->state_reason == FAIL_CONSTRAINTS))) { |
| /* |
| * Non-fatal error, but job can't be scheduled right now. |
| * |
| * Note: Keep list in sync with nonfatal_errors[] in |
| * openapi/slurmctld. |
| */ |
| if (immediate) { |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| debug2("%s: setting %pJ to \"%s\" because it cannot be immediately allocated (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(error_code)); |
| } else { /* job remains queued */ |
| if ((error_code == ESLURM_NODES_BUSY) || |
| (error_code == ESLURM_BURST_BUFFER_WAIT) || |
| (error_code == ESLURM_RESERVATION_BUSY) || |
| (error_code == ESLURM_ACCOUNTING_POLICY) || |
| (error_code == ESLURM_PORTS_BUSY) || |
| ((error_code == ESLURM_PARTITION_DOWN) && |
| (job_ptr->batch_flag))) { |
| job_ptr->details->features_use = NULL; |
| job_ptr->details->feature_list_use = NULL; |
| error_code = SLURM_SUCCESS; |
| } |
| } |
| return error_code; |
| } |
| |
| if (error_code) { /* fundamental flaw in job request */ |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| debug2("%s: setting %pJ to \"%s\" due to a flaw in the job request (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(error_code)); |
| return error_code; |
| } |
| |
| if (will_run) { /* job would run, flag job destruction */ |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->start_time = job_ptr->end_time = now; |
| purge_job_record(job_ptr->job_id); |
| } |
| |
| if (!will_run) { |
| sched_debug2("%pJ allocated resources: NodeList=%s", |
| job_ptr, job_ptr->nodes); |
| rebuild_job_part_list(job_ptr); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * job_fail - terminate a job due to initiation failure |
| * IN job_ptr - Pointer to job to be killed |
| * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.) |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| static int _job_fail(job_record_t *job_ptr, uint32_t job_state) |
| { |
| time_t now = time(NULL); |
| bool suspended = false; |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| uint32_t suspend_job_state = job_ptr->job_state; |
| /* |
| * we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_state_set(job_ptr, JOB_CANCELLED); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_state_set(job_ptr, suspend_job_state); |
| suspended = true; |
| } |
| |
| if (IS_JOB_CONFIGURING(job_ptr) || IS_JOB_RUNNING(job_ptr) || |
| suspended) { |
| /* No need to signal steps, deallocate kills them */ |
| job_ptr->time_last_active = now; |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| last_job_update = now; |
| job_state_set(job_ptr, (job_state | JOB_COMPLETING)); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_LAUNCH; |
| xfree(job_ptr->state_desc); |
| job_completion_logger(job_ptr, false); |
| if (job_ptr->node_bitmap) { |
| build_cg_bitmap(job_ptr); |
| deallocate_nodes(job_ptr, false, suspended, false); |
| } |
| return SLURM_SUCCESS; |
| } |
| /* All other states */ |
| verbose("job_fail: %pJ can't be killed from state=%s", |
| job_ptr, job_state_string(job_ptr->job_state)); |
| |
| return ESLURM_TRANSITION_STATE_NO_UPDATE; |
| |
| } |
| |
| /* |
| * IN signal_args - Append the response to signal_args->responses. |
| * IN cluster_id - If set, then this identifies the sibling cluster that the |
| * job is running on or originated from. |
| * IN eror_code - Error code to use in the response. |
| * IN err_msg - If set, use this as the response error message. |
| * IN id - Identifier for the job. Job id is different than the actual job id |
| * if the job is an array task or a het job component that is not the |
| * het job leader. |
| * IN real_job_id - The real job id or NO_VAL |
| */ |
| static void _add_signal_job_resp(signal_jobs_args_t *signal_args, |
| char *sibling_name, int error_code, |
| char *err_msg, slurm_selected_step_t *id, |
| uint32_t real_job_id) |
| { |
| kill_jobs_resp_job_t *job_resp = xmalloc(sizeof(*job_resp)); |
| |
| job_resp->error_code = error_code; |
| if (err_msg) |
| job_resp->error_msg = err_msg; |
| else if (error_code != SLURM_SUCCESS) |
| job_resp->error_msg = xstrdup(slurm_strerror(error_code)); |
| job_resp->id = xmalloc(sizeof(*job_resp->id)); |
| memcpy(job_resp->id, id, sizeof(*id)); |
| /* Full copy job_resp->id->array_bitmap */ |
| if (id->array_bitmap) |
| job_resp->id->array_bitmap = bit_copy(id->array_bitmap); |
| |
| job_resp->real_job_id = real_job_id; |
| job_resp->sibling_name = sibling_name; |
| |
| list_append(signal_args->responses, job_resp); |
| } |
| |
| static int _match_part_name(void *x, void *key) |
| { |
| part_record_t *part_ptr = x; |
| char *part_name = key; |
| |
| if (!xstrcmp(part_ptr->name, part_name)) |
| return 1; |
| return 0; |
| } |
| |
| static int _match_resv_name(void *x, void *key) |
| { |
| slurmctld_resv_t *resv_ptr = x; |
| char *resv_name = key; |
| |
| if (!xstrcmp(resv_ptr->name, resv_name)) |
| return 1; |
| return 0; |
| } |
| |
| static void _slurm_selected_step_init(job_record_t *job_ptr, |
| slurm_selected_step_t *id) |
| { |
| xassert(job_ptr); |
| |
| id->array_bitmap = NULL; |
| id->array_task_id = job_ptr->array_task_id; |
| if (job_ptr->array_task_id != NO_VAL) |
| id->step_id.job_id = job_ptr->array_job_id; |
| else if (job_ptr->het_job_offset) |
| id->step_id.job_id = job_ptr->het_job_id; |
| else |
| id->step_id.job_id = job_ptr->job_id; |
| |
| if (job_ptr->het_job_offset) |
| id->het_job_offset = job_ptr->het_job_offset; |
| else |
| id->het_job_offset = NO_VAL; |
| |
| id->step_id.step_het_comp = NO_VAL; |
| id->step_id.step_id = NO_VAL; |
| } |
| |
| static void _handle_signal_filter_mismatch(job_record_t *job_ptr, |
| signal_jobs_args_t *signal_args, |
| uint32_t error_code, |
| char *filter_err_msg) |
| { |
| slurm_selected_step_t id; |
| char *err_msg = NULL; |
| |
| /* |
| * If the job is revoked on this cluster and started on a sibling, the |
| * revoked job's state, reservation, and partition will not necessarily |
| * match the other cluster, and the other cluster has the cluster lock |
| * for this job. For example, this job's state is 0+REVOKED and the job |
| * state on the other cluster could be suspended, running, etc. |
| * In that case, always send a response back to the client that we |
| * could not signal the job. |
| */ |
| if (fed_mgr_fed_rec && fed_mgr_job_started_on_sib(job_ptr)) { |
| char *sib_name; |
| |
| sib_name = fed_mgr_get_cluster_name( |
| job_ptr->fed_details->cluster_lock); |
| err_msg = xstrdup_printf("Job started on sibling cluster %s: %s", |
| sib_name, slurm_strerror(error_code)); |
| _slurm_selected_step_init(job_ptr, &id); |
| _add_signal_job_resp(signal_args, sib_name, error_code, |
| err_msg, &id, job_ptr->job_id); |
| /* sib_name is added to the job_resp, do not free */ |
| return; |
| } |
| |
| if (!signal_args->filter_specific_job_ids) |
| return; |
| |
| if (filter_err_msg) |
| err_msg = xstrdup_printf("%s: %s", |
| filter_err_msg, |
| slurm_strerror(error_code)); |
| else |
| err_msg = xstrdup_printf("%s", slurm_strerror(error_code)); |
| |
| _slurm_selected_step_init(job_ptr, &id); |
| _add_signal_job_resp(signal_args, NULL, error_code, |
| err_msg, &id, job_ptr->job_id); |
| } |
| |
| static bool _signal_job_matches_filter(job_record_t *job_ptr, |
| signal_jobs_args_t *signal_args) |
| { |
| bool matches_filter = true; |
| int error_code = ESLURM_JOB_SIGNAL_FAILED; |
| uint32_t job_base_state = job_ptr->job_state & JOB_STATE_BASE; |
| char *filter_err_msg = NULL; |
| kill_jobs_msg_t *kill_msg = signal_args->kill_msg; |
| |
| if (IS_JOB_FINISHED(job_ptr)) { |
| error_code = ESLURM_ALREADY_DONE; |
| matches_filter = false; |
| goto fini; |
| } |
| |
| if (kill_msg->account && xstrcmp(job_ptr->account, kill_msg->account)) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = xstrdup_printf("Job account %s != filter account %s", |
| job_ptr->account, |
| kill_msg->account); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| |
| if (kill_msg->job_name && xstrcmp(job_ptr->name, kill_msg->job_name)) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = xstrdup_printf("Job name %s != filter name %s", |
| job_ptr->name, |
| kill_msg->job_name); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| |
| /* |
| * If the job is submitted to multiple partitions, then its partition |
| * string is all the partitions. We need to find if the requested |
| * partition matches any of the partitions that the job was submitted |
| * to if the job is still pending. If the job is running, only check |
| * the partition the job is running in. |
| */ |
| if (kill_msg->partition) { |
| if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr_list) { |
| if (!list_find_first(job_ptr->part_ptr_list, |
| _match_part_name, |
| kill_msg->partition)) |
| matches_filter = false; |
| } else if (job_ptr->part_ptr) { |
| if (xstrcmp(job_ptr->part_ptr->name, |
| kill_msg->partition)) |
| matches_filter = false; |
| } else { |
| if (xstrcmp(job_ptr->partition, kill_msg->partition)) |
| matches_filter = false; |
| } |
| |
| if (!matches_filter) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = |
| xstrdup_printf("Job partition %s does not include filter partition %s", |
| job_ptr->partition, |
| kill_msg->partition); |
| } |
| goto fini; |
| } |
| } |
| |
| if (kill_msg->qos) { |
| char *qos_name = "NULL"; |
| |
| if (!job_ptr->qos_ptr) |
| matches_filter = false; |
| else if (xstrcmp(job_ptr->qos_ptr->name, kill_msg->qos)) { |
| matches_filter = false; |
| qos_name = job_ptr->qos_ptr->name; |
| } |
| |
| if (!matches_filter) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = xstrdup_printf("Job qos %s != filter qos %s", |
| qos_name, |
| kill_msg->qos); |
| } |
| goto fini; |
| } |
| } |
| |
| if (kill_msg->reservation) { |
| if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { |
| slurmctld_resv_t *resv_ptr = |
| find_resv_name(kill_msg->reservation); |
| |
| if (!(resv_ptr && |
| (resv_ptr->resv_id == job_ptr->resv_id))) |
| matches_filter = false; |
| } else if (job_ptr->resv_list) { |
| if (!list_find_first(job_ptr->resv_list, |
| _match_resv_name, |
| kill_msg->reservation)) |
| matches_filter = false; |
| } else if (job_ptr->resv_ptr) { |
| if (xstrcmp(job_ptr->resv_ptr->name, |
| kill_msg->reservation)) |
| matches_filter = false; |
| } else { |
| if (xstrcmp(job_ptr->resv_name, kill_msg->reservation)) |
| matches_filter = false; |
| } |
| |
| if (!matches_filter) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = |
| xstrdup_printf("Job reservation %s does not include filter reservation %s", |
| job_ptr->resv_name, |
| kill_msg->reservation); |
| } |
| goto fini; |
| } |
| } |
| |
| if ((kill_msg->state != JOB_END) && |
| (job_base_state != kill_msg->state)) { |
| if (signal_args->filter_specific_job_ids) { |
| char *msg_state_str = job_state_string(kill_msg->state); |
| char *job_state_str = job_state_string(job_base_state); |
| |
| filter_err_msg = xstrdup_printf("Job state %s != filter state %s", |
| job_state_str, |
| msg_state_str); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| |
| if (kill_msg->user_name && (job_ptr->user_id != kill_msg->user_id)) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = xstrdup_printf("Job user id %u != filter user id %u", |
| job_ptr->user_id, |
| kill_msg->user_id); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| |
| if (kill_msg->nodelist) { |
| hostset_t *hs; |
| bool intersects; |
| |
| if (!job_ptr->nodes) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = |
| xstrdup_printf("Job does not have nodes but filter has nodes %s", |
| kill_msg->nodelist); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| |
| hs = hostset_create(job_ptr->nodes); |
| intersects = hostset_intersects(hs, kill_msg->nodelist); |
| hostset_destroy(hs); |
| if (!intersects) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = |
| xstrdup_printf("Job nodes %s does not intersect with filter nodes %s", |
| job_ptr->nodes, |
| kill_msg->nodelist); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| } |
| |
| if (kill_msg->wckey) { |
| char *job_key = job_ptr->wckey; |
| |
| /* |
| * A wckey that begins with '*' indicates that the wckey |
| * was applied by default. When the --wckey option does |
| * not begin with a '*', act on all wckeys with the same |
| * name, default or not. |
| */ |
| if ((kill_msg->wckey[0] != '*') && job_key && |
| (job_key[0] == '*')) |
| job_key++; |
| |
| if (xstrcmp(job_key, kill_msg->wckey)) { |
| if (signal_args->filter_specific_job_ids) { |
| filter_err_msg = |
| xstrdup_printf("Job wckey %s != filter wckey %s", |
| job_ptr->wckey, |
| kill_msg->wckey); |
| } |
| matches_filter = false; |
| goto fini; |
| } |
| } |
| |
| if (job_ptr->het_job_offset) { |
| if (signal_args->het_leader && |
| signal_args->het_leader->job_id && |
| (job_ptr->het_job_id == |
| signal_args->het_leader->het_job_id)) { |
| /* |
| * Filter out HetJob non-leader component as its leader |
| * should have already been evaluated and hasn't been |
| * filtered out. |
| * |
| * The leader RPC signal handler will affect all the |
| * components, so this avoids extra unneeded RPCs, races |
| * and issues interpreting multiple error codes. |
| * |
| * This can be done assuming the walking of the loaded |
| * jobs is guaranteed to evaluate in an order such that |
| * HetJob leaders are evaluated before their matching |
| * non-leaders and the whole HetJob is evaluated |
| * contiguously. The slurmctld job_list is ordered by |
| * job creation time (always leader first) and HetJobs |
| * are created in a row. |
| */ |
| return false; |
| } |
| |
| /* |
| * Het job components may not be signalled individually if they |
| * are pending or if whole_hetjob is set. |
| */ |
| if (IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_NOT_WHOLE_HET_JOB; |
| if (signal_args->filter_specific_job_ids) |
| filter_err_msg = xstrdup("Het job component cannot be signalled while pending"); |
| goto fini; |
| } |
| if (_get_whole_hetjob()) { |
| error_code = ESLURM_NOT_WHOLE_HET_JOB; |
| if (signal_args->filter_specific_job_ids) |
| filter_err_msg = xstrdup("slurm.conf whole_hetjob is set"); |
| goto fini; |
| } |
| } |
| |
| fini: |
| if (!matches_filter) |
| _handle_signal_filter_mismatch(job_ptr, signal_args, |
| error_code, filter_err_msg); |
| else { |
| /* Track most recent het leader. */ |
| if (job_ptr->het_job_id && !job_ptr->het_job_offset) |
| signal_args->het_leader = job_ptr; |
| } |
| |
| xfree(filter_err_msg); |
| |
| return matches_filter; |
| } |
| |
| /* |
| * Figure out if the job (job_ptr) matches the specified filters: |
| * - filter_id describes a job or set of jobs if it is an array expression. |
| * - signal_args->kill_msg has filters requested by the client. |
| * |
| * If the job does not match the specified filters in signal_args, then |
| * _signal_job_matches_filter() adds a response message for the job and we |
| * return. |
| * |
| * If the job matches the specified filters, but the user is not authorized to |
| * signal the job, add a response message and return. |
| * |
| * If the job matches the specified filters and the user is authorized to signal |
| * the job, place the job into the appropriate list of jobs which will later be |
| * signaled. The lists are in signal_args. |
| * - pending_array_task_list: A meta record with pending array tasks that are |
| * requested to be signaled, or a single pending array task that has not yet |
| * been split from the meta record. |
| * - array_leader_list - A meta record for an array where that entire array has |
| * been requested to be signaled. |
| * - other_job_list - All other jobs to be signaled. |
| */ |
| static void _apply_signal_jobs_filter(job_record_t *job_ptr, |
| slurm_selected_step_t *filter_id, |
| signal_jobs_args_t *signal_args) |
| { |
| bool is_pending_meta_record_with_tasks; |
| uid_t auth_uid = signal_args->auth_uid; |
| |
| if (!_signal_job_matches_filter(job_ptr, signal_args)) |
| return; |
| |
| /* Verify that the user can kill the requested job */ |
| if ((job_ptr->user_id != auth_uid) && |
| !validate_operator_locked(auth_uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, auth_uid, |
| job_ptr->account, true)) { |
| slurm_selected_step_t *use_id; |
| slurm_selected_step_t id; |
| |
| if (filter_id) |
| use_id = filter_id; |
| else { |
| _slurm_selected_step_init(job_ptr, &id); |
| use_id = &id; |
| } |
| _add_signal_job_resp(signal_args, NULL, ESLURM_ACCESS_DENIED, |
| NULL, use_id, job_ptr->job_id); |
| return; |
| } |
| |
| is_pending_meta_record_with_tasks = (IS_JOB_PENDING(job_ptr) && |
| job_ptr->array_recs && |
| job_ptr->array_recs->task_cnt); |
| |
| if (filter_id && !filter_id->array_bitmap && |
| (filter_id->array_task_id != NO_VAL) && |
| is_pending_meta_record_with_tasks) { |
| /* |
| * A pending job array task that has not been split from the |
| * meta array record. |
| */ |
| array_task_filter_t *atf = xmalloc(sizeof(*atf)); |
| |
| /* Copy filter_id, but use a new array_bitmap */ |
| atf->filter_id = xmalloc(sizeof(*atf->filter_id)); |
| memcpy(atf->filter_id, filter_id, sizeof(*filter_id)); |
| |
| atf->filter_id->array_bitmap = bit_alloc(max_array_size); |
| bit_set(atf->filter_id->array_bitmap, filter_id->array_task_id); |
| atf->free_array_bitmap = true; |
| atf->job_ptr = job_ptr; |
| |
| list_append(signal_args->pending_array_task_list, atf); |
| } else if (filter_id && filter_id->array_bitmap && |
| is_pending_meta_record_with_tasks) { |
| /* A job array expression with pending array tasks */ |
| array_task_filter_t *atf = xmalloc(sizeof(*atf)); |
| |
| atf->filter_id = xmalloc(sizeof(*atf->filter_id)); |
| memcpy(atf->filter_id, filter_id, sizeof(*filter_id)); |
| atf->job_ptr = job_ptr; |
| |
| list_append(signal_args->pending_array_task_list, atf); |
| } else if (job_ptr->array_recs) |
| list_append(signal_args->array_leader_list, job_ptr); |
| else |
| list_append(signal_args->other_job_list, job_ptr); |
| } |
| |
| static int _foreach_filter_job_list(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| signal_jobs_args_t *signal_args = arg; |
| |
| _apply_signal_jobs_filter(job_ptr, NULL, signal_args); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_signal_job(void *x, void *arg) |
| { |
| int error_code; |
| job_record_t *job_ptr = x; |
| signal_jobs_args_t *signal_args = arg; |
| kill_jobs_msg_t *kill_msg = signal_args->kill_msg; |
| |
| if (job_ptr->het_job_list) |
| error_code = het_job_signal(job_ptr, kill_msg->signal, |
| kill_msg->flags, |
| signal_args->auth_uid, 0); |
| else |
| error_code = job_signal(job_ptr, kill_msg->signal, |
| kill_msg->flags, |
| signal_args->auth_uid, 0); |
| |
| if (error_code || (kill_msg->flags & KILL_JOBS_VERBOSE)) { |
| slurm_selected_step_t id; |
| |
| _slurm_selected_step_init(job_ptr, &id); |
| _add_signal_job_resp(signal_args, NULL, error_code, NULL, &id, |
| job_ptr->job_id); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_signal_job_array_tasks(void *x, void *arg) |
| { |
| array_task_filter_t *atf = x; |
| signal_jobs_args_t *signal_args = arg; |
| kill_jobs_msg_t *kill_msg = signal_args->kill_msg; |
| int32_t i_last; |
| int error_code = SLURM_SUCCESS; |
| |
| /* |
| * Signal the pending array tasks in the array job. The tasks that |
| * have already been split out are not part of the meta job's array |
| * bitmap and are handled elsewhere. |
| * |
| * _signal_pending_job_array_tasks() removes the pending tasks from |
| * array_bitmap. For the response to the client, we want to the pending |
| * tasks that were signalled. To get that, operate on a copy of |
| * array_bitmap which will be returned with the running tasks. Then |
| * remove the running tasks from the original bitmap (bit_and_not). |
| */ |
| i_last = bit_fls(atf->filter_id->array_bitmap); |
| if (i_last >= 0) { |
| bitstr_t *array_bitmap_running = |
| bit_copy(atf->filter_id->array_bitmap); |
| |
| _signal_pending_job_array_tasks(atf->job_ptr, |
| &array_bitmap_running, |
| kill_msg->signal, |
| signal_args->auth_uid, |
| i_last, signal_args->now, |
| &error_code); |
| bit_and_not(atf->filter_id->array_bitmap, array_bitmap_running); |
| FREE_NULL_BITMAP(array_bitmap_running); |
| } |
| |
| if (error_code || (kill_msg->flags & KILL_JOBS_VERBOSE)) |
| _add_signal_job_resp(signal_args, NULL, error_code, NULL, |
| atf->filter_id, atf->job_ptr->job_id); |
| |
| return 0; |
| } |
| |
| static foreach_job_by_id_control_t _job_not_found(const slurm_selected_step_t |
| *id, |
| void *arg) |
| { |
| signal_jobs_args_t *signal_args = arg; |
| uint32_t job_id = id->step_id.job_id; |
| |
| if (fed_mgr_fed_rec && !fed_mgr_is_origin_job_id(job_id)) { |
| int error_code = ESLURM_JOB_SIGNAL_FAILED; |
| char *err_msg = NULL; |
| |
| err_msg = xstrdup_printf("Job id not in federation: %s", |
| slurm_strerror(error_code)); |
| _add_signal_job_resp(signal_args, NULL, error_code, |
| err_msg, (slurm_selected_step_t *) id, |
| NO_VAL); |
| } else { |
| _add_signal_job_resp(signal_args, NULL, ESLURM_INVALID_JOB_ID, |
| NULL, (slurm_selected_step_t *) id, |
| NO_VAL); |
| } |
| return FOR_EACH_JOB_BY_ID_EACH_CONT; |
| } |
| |
| static foreach_job_by_id_control_t _filter_job(job_record_t *job_ptr, |
| const slurm_selected_step_t *id, |
| void *arg) |
| { |
| _apply_signal_jobs_filter(job_ptr, (slurm_selected_step_t *) id, arg); |
| |
| return FOR_EACH_JOB_BY_ID_EACH_CONT; |
| } |
| |
| static void _filter_jobs_ids(slurm_selected_step_t **job_ids, uint32_t cnt, |
| signal_jobs_args_t *signal_args) |
| { |
| signal_args->filter_specific_job_ids = true; |
| for (int i = 0; i < cnt; i++) { |
| slurm_selected_step_t *filter = job_ids[i]; |
| uint32_t job_id = filter->step_id.job_id; |
| int rc; |
| |
| if (fed_mgr_cluster_rec && !fed_mgr_is_job_id_in_fed(job_id)) { |
| rc = ESLURM_JOB_NOT_FEDERATED; |
| _add_signal_job_resp(signal_args, NULL, rc, NULL, |
| filter, NO_VAL); |
| continue; |
| } |
| |
| (void) foreach_job_by_id(filter, _filter_job, _job_not_found, |
| signal_args); |
| } |
| } |
| |
| static int _foreach_xfer_responses(void *x, void *arg) |
| { |
| kill_jobs_resp_job_t *job_resp = x; |
| xfer_signal_jobs_responses_args_t *args = arg; |
| |
| memcpy(&args->resp_msg->job_responses[args->curr_count], job_resp, |
| sizeof(*job_resp)); |
| |
| /* |
| * Pointers in job_resp were transferred and will be free'd with |
| * job_responses |
| */ |
| xfree(job_resp); |
| args->curr_count++; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _build_kill_jobs_resp_msg(signal_jobs_args_t *signal_args, |
| kill_jobs_resp_msg_t **resp_msg_p) |
| { |
| kill_jobs_resp_msg_t *resp_msg = xmalloc(sizeof(*resp_msg)); |
| xfer_signal_jobs_responses_args_t foreach_args = { |
| .resp_msg = resp_msg, |
| }; |
| |
| *resp_msg_p = resp_msg; |
| resp_msg->jobs_cnt = list_count(signal_args->responses); |
| |
| if (!resp_msg->jobs_cnt) |
| return; |
| |
| resp_msg->job_responses = xcalloc(resp_msg->jobs_cnt, |
| sizeof(*resp_msg->job_responses)); |
| list_for_each(signal_args->responses, _foreach_xfer_responses, |
| &foreach_args); |
| } |
| |
| /* |
| * Signal a job based upon job pointer. |
| * Authentication and authorization checks must be performed before calling. |
| */ |
| extern int job_signal(job_record_t *job_ptr, uint16_t signal, |
| uint16_t flags, uid_t uid, bool preempt) |
| { |
| uint16_t job_term_state; |
| time_t now = time(NULL); |
| |
| log_flag(TRACE_JOBS, "%s: enter %pJ", __func__, job_ptr); |
| |
| if (IS_JOB_STAGE_OUT(job_ptr) && (flags & KILL_HURRY)) { |
| job_ptr->bit_flags |= JOB_KILL_HURRY; |
| return bb_g_job_cancel(job_ptr); |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| |
| /* |
| * If is origin job then cancel siblings -- if they exist. |
| * origin job = because it knows where the siblings are |
| * If the job is running locally then just do the normal signaling |
| */ |
| if (!(flags & KILL_NO_SIBS) && !IS_JOB_RUNNING(job_ptr) && |
| job_ptr->fed_details && fed_mgr_fed_rec) { |
| uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id); |
| slurmdb_cluster_rec_t *origin = |
| fed_mgr_get_cluster_by_id(origin_id); |
| |
| if (origin && (origin == fed_mgr_cluster_rec) && |
| fed_mgr_job_started_on_sib(job_ptr)) { |
| /* |
| * If the job is running on a remote cluster then wait |
| * for the job to report back that it's completed, |
| * otherwise just signal the pending siblings and itself |
| * (by not returning). |
| */ |
| return fed_mgr_job_cancel(job_ptr, signal, flags, uid, |
| false); |
| } else if (origin && (origin == fed_mgr_cluster_rec)) { |
| /* cancel origin job and revoke sibling jobs */ |
| fed_mgr_job_revoke_sibs(job_ptr); |
| fed_mgr_remove_remote_dependencies(job_ptr); |
| } else if (!origin || |
| !origin->fed.send || |
| !((persist_conn_t *) origin->fed.send)->tls_conn) { |
| /* |
| * The origin is down just signal all of the viable |
| * sibling jobs |
| */ |
| fed_mgr_job_cancel(job_ptr, signal, flags, uid, true); |
| } |
| } |
| |
| last_job_update = now; |
| |
| /* |
| * Handle jobs submitted through scrontab. |
| */ |
| if (job_ptr->bit_flags & CRON_JOB) { |
| cron_entry_t *entry = |
| (cron_entry_t *) job_ptr->details->crontab_entry; |
| /* |
| * The KILL_CRON flag being set here is indicating that the |
| * user has specifically requested killing scrontab jobs. To |
| * avoid interfering with other possible ways of killing jobs, |
| * the KILL_CRON flag being set must mean that killing cron |
| * jobs is permitted. |
| */ |
| if (xstrcasestr(slurm_conf.scron_params, "explicit_scancel") && |
| !(flags & KILL_CRON)) |
| return ESLURM_CANNOT_CANCEL_CRON_JOB; |
| job_ptr->bit_flags &= ~CRON_JOB; |
| error("cancelling cron job, lines %u %u", |
| entry->line_start, entry->line_end); |
| crontab_add_disabled_lines(job_ptr->user_id, entry->line_start, |
| entry->line_end); |
| } |
| |
| /* save user ID of the one who requested the job be cancelled */ |
| if (signal == SIGKILL) |
| job_ptr->requid = uid; |
| if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) && |
| (signal == SIGKILL)) { |
| /* Prevent job requeue, otherwise preserve state */ |
| job_state_set(job_ptr, (JOB_CANCELLED | JOB_COMPLETING)); |
| |
| /* build_cg_bitmap() not needed, job already completing */ |
| verbose("%s: %u of requeuing %pJ successful", |
| __func__, signal, job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| if (flags & KILL_HURRY) |
| job_ptr->bit_flags |= JOB_KILL_HURRY; |
| |
| if (IS_JOB_CONFIGURING(job_ptr) && (signal == SIGKILL)) { |
| last_job_update = now; |
| job_ptr->end_time = now; |
| job_state_set(job_ptr, (JOB_CANCELLED | JOB_COMPLETING)); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| slurmscriptd_flush_job(job_ptr->job_id); |
| track_script_flush_job(job_ptr->job_id); |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, false, false); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_unset_flag(job_ptr, JOB_REQUEUE); |
| |
| verbose("%s: %u of configuring %pJ successful", |
| __func__, signal, job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) { |
| job_state_set(job_ptr, JOB_CANCELLED); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| srun_allocate_abort(job_ptr); |
| slurmscriptd_flush_job(job_ptr->job_id); |
| track_script_flush_job(job_ptr->job_id); |
| job_completion_logger(job_ptr, false); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_unset_flag(job_ptr, JOB_REQUEUE); |
| |
| /* |
| * Send back a response to the origin cluster, in other cases |
| * where the job is running the job will send back a response |
| * after the job is is completed. This can happen when the |
| * pending origin job is put into a hold state and the siblings |
| * are removed or when the job is canceled from the origin. |
| */ |
| fed_mgr_job_complete(job_ptr, 0, now); |
| verbose("%s: %u of pending %pJ successful", |
| __func__, signal, job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| if (preempt) |
| job_term_state = JOB_PREEMPTED; |
| else if (flags & KILL_FAIL_JOB) |
| job_term_state = JOB_FAILED; |
| else |
| job_term_state = JOB_CANCELLED; |
| if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) { |
| last_job_update = now; |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); |
| job_state_set(job_ptr, (job_term_state | JOB_COMPLETING)); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| build_cg_bitmap(job_ptr); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_completion_logger(job_ptr, false); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_unset_flag(job_ptr, JOB_REQUEUE); |
| deallocate_nodes(job_ptr, false, true, preempt); |
| verbose("%s: %u of suspended %pJ successful", |
| __func__, signal, job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| if (IS_JOB_RUNNING(job_ptr)) { |
| |
| if ((signal == SIGSTOP) || (signal == SIGCONT)) { |
| if (IS_JOB_SIGNALING(job_ptr)) { |
| verbose("%s: %u not send to %pJ 0x%x", |
| __func__, signal, job_ptr, |
| job_ptr->job_state); |
| return ESLURM_TRANSITION_STATE_NO_UPDATE; |
| } |
| job_state_set_flag(job_ptr, JOB_SIGNALING); |
| } |
| |
| if ((signal == SIGKILL) |
| && !(flags & KILL_STEPS_ONLY) |
| && !(flags & KILL_JOB_BATCH)) { |
| /* No need to signal steps, deallocate kills them |
| */ |
| job_ptr->time_last_active = now; |
| job_ptr->end_time = now; |
| last_job_update = now; |
| job_state_set(job_ptr, (job_term_state | |
| JOB_COMPLETING)); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, false, false, preempt); |
| if (flags & KILL_FED_REQUEUE) |
| job_state_unset_flag(job_ptr, JOB_REQUEUE); |
| } else if (job_ptr->batch_flag && (flags & KILL_JOB_BATCH)) { |
| _signal_batch_job(job_ptr, signal, flags); |
| } else if ((flags & KILL_JOB_BATCH) && !job_ptr->batch_flag) { |
| if ((signal == SIGSTOP) || (signal == SIGCONT)) |
| job_state_unset_flag(job_ptr, JOB_SIGNALING); |
| return ESLURM_JOB_SCRIPT_MISSING; |
| } else { |
| _signal_job(job_ptr, signal, flags); |
| } |
| verbose("%s: %u of running %pJ successful 0x%x", |
| __func__, signal, job_ptr, job_ptr->job_state); |
| return SLURM_SUCCESS; |
| } |
| |
| verbose("%s: %pJ can't be sent signal %u from state=%s", |
| __func__, job_ptr, signal, |
| job_state_string(job_ptr->job_state)); |
| |
| log_flag(TRACE_JOBS, "%s: return %pJ", __func__, job_ptr); |
| |
| return ESLURM_TRANSITION_STATE_NO_UPDATE; |
| } |
| |
| static int foreach_het_job_signal(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| foreach_kill_hetjob_t *foreach_kill_hetjob = arg; |
| |
| if (foreach_kill_hetjob->het_job_leader->het_job_id != |
| het_job->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", |
| __func__, foreach_kill_hetjob->het_job_leader); |
| } else { |
| int rc1 = job_signal(het_job, |
| foreach_kill_hetjob->signal, |
| foreach_kill_hetjob->flags, |
| foreach_kill_hetjob->uid, |
| foreach_kill_hetjob->preempt); |
| if (rc1 != SLURM_SUCCESS) |
| foreach_kill_hetjob->rc = rc1; |
| } |
| |
| return 0; |
| } |
| |
| /* Signal all components of a hetjob */ |
| extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal, |
| uint16_t flags, uid_t uid, bool preempt) |
| { |
| foreach_kill_hetjob_t foreach_kill_hetjob = { |
| .flags = flags, |
| .het_job_leader = het_job_leader, |
| .preempt = preempt, |
| .rc = SLURM_SUCCESS, |
| .signal = signal, |
| .uid = uid, |
| }; |
| |
| if (!het_job_leader->het_job_id) |
| return ESLURM_NOT_HET_JOB; |
| else if (!het_job_leader->het_job_list) |
| return ESLURM_NOT_HET_JOB_LEADER; |
| |
| (void) list_for_each(het_job_leader->het_job_list, |
| foreach_het_job_signal, |
| &foreach_kill_hetjob); |
| |
| return foreach_kill_hetjob.rc; |
| } |
| |
| /* |
| * Returns average pn_min_memory, considering DefMemPer{CPU,Node,GPU} from both |
| * the partition and cluster configuration |
| * WARNING: assumes memory is evenly distributed across all nodes in job, |
| * may return an inaccurate value if this is not the case |
| */ |
| static uint64_t _get_def_mem(part_record_t *part_ptr, uint64_t *tres_req_cnt) |
| { |
| if (part_ptr && part_ptr->def_mem_per_cpu && |
| (part_ptr->def_mem_per_cpu != MEM_PER_CPU) && |
| (part_ptr->def_mem_per_cpu != NO_VAL64)) |
| return part_ptr->def_mem_per_cpu; |
| else if (tres_req_cnt && tres_req_cnt[TRES_ARRAY_MEM] && |
| (tres_req_cnt[TRES_ARRAY_MEM] != NO_VAL64)) { |
| xassert(tres_req_cnt[TRES_ARRAY_NODE]); |
| return tres_req_cnt[TRES_ARRAY_MEM] / |
| tres_req_cnt[TRES_ARRAY_NODE]; |
| } else |
| return slurm_conf.def_mem_per_cpu; |
| } |
| |
| static bool _get_whole_hetjob(void) |
| { |
| static time_t sched_update = 0; |
| static bool whole_hetjob = false; |
| |
| if (sched_update != slurm_conf.last_update) { |
| sched_update = slurm_conf.last_update; |
| if (xstrcasestr(slurm_conf.sched_params, "whole_hetjob") || |
| xstrcasestr(slurm_conf.sched_params, "whole_pack")) |
| whole_hetjob = true; |
| else |
| whole_hetjob = false; |
| } |
| |
| return whole_hetjob; |
| } |
| |
| static job_record_t *_find_meta_job_record(uint32_t job_id) |
| { |
| job_record_t *job_ptr; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == job_id) |
| break; |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| } |
| if ((job_ptr == NULL) || |
| ((job_ptr->array_task_id == NO_VAL) && |
| (job_ptr->array_recs == NULL))) |
| return NULL; |
| |
| return job_ptr; |
| } |
| |
| static void _signal_pending_job_array_tasks(job_record_t *job_ptr, |
| bitstr_t **array_bitmap, |
| uint16_t signal, |
| uid_t uid, |
| int32_t i_last, |
| time_t now, |
| int *rc) |
| { |
| int len; |
| |
| xassert(job_ptr); |
| |
| if (!(IS_JOB_PENDING(job_ptr) && job_ptr->array_recs && |
| job_ptr->array_recs->task_id_bitmap)) |
| return; /* No tasks to signal */ |
| |
| /* Ensure bitmap sizes match for AND operations */ |
| len = bit_size(job_ptr->array_recs->task_id_bitmap); |
| i_last++; |
| if (i_last < len) { |
| bit_realloc(*array_bitmap, len); |
| } else { |
| bit_realloc(*array_bitmap, i_last); |
| bit_realloc(job_ptr->array_recs->task_id_bitmap, i_last); |
| } |
| if (signal == SIGKILL) { |
| uint32_t orig_task_cnt, new_task_count; |
| /* task_id_bitmap changes, so we need a copy of it */ |
| bitstr_t *task_id_bitmap_orig = |
| bit_copy(job_ptr->array_recs->task_id_bitmap); |
| |
| bit_and_not(job_ptr->array_recs->task_id_bitmap, |
| *array_bitmap); |
| xfree(job_ptr->array_recs->task_id_str); |
| orig_task_cnt = job_ptr->array_recs->task_cnt; |
| new_task_count = bit_set_count(job_ptr->array_recs-> |
| task_id_bitmap); |
| if (!new_task_count) { |
| last_job_update = now; |
| job_state_set(job_ptr, JOB_CANCELLED); |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| job_ptr->requid = uid; |
| srun_allocate_abort(job_ptr); |
| job_completion_logger(job_ptr, false); |
| /* |
| * Master job record, even without tasks, |
| * counts as one job record |
| */ |
| job_count -= (orig_task_cnt - 1); |
| } else { |
| _job_array_comp(job_ptr, false, false); |
| job_count -= (orig_task_cnt - new_task_count); |
| /* |
| * Since we are altering the job array's |
| * task_cnt we must go alter this count in the |
| * acct_policy code as if they are finishing |
| * (accrue_cnt/job_submit etc...). |
| */ |
| if (job_ptr->array_recs->task_cnt > |
| new_task_count) { |
| uint32_t tmp_state = job_ptr->job_state; |
| job_state_set(job_ptr, JOB_CANCELLED); |
| |
| job_ptr->array_recs->task_cnt -= |
| new_task_count; |
| acct_policy_remove_job_submit(job_ptr, |
| false); |
| job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; |
| job_state_set(job_ptr, tmp_state); |
| } |
| } |
| |
| /* |
| * Set the task_cnt here since |
| * job_completion_logger needs the total |
| * pending count to handle the acct_policy |
| * limit for submitted jobs correctly. |
| */ |
| job_ptr->array_recs->task_cnt = new_task_count; |
| bit_and_not(*array_bitmap, task_id_bitmap_orig); |
| FREE_NULL_BITMAP(task_id_bitmap_orig); |
| } else { |
| bit_and_not(*array_bitmap, |
| job_ptr->array_recs->task_id_bitmap); |
| *rc = ESLURM_TRANSITION_STATE_NO_UPDATE; |
| } |
| } |
| |
| /* |
| * job_str_signal - signal the specified job |
| * IN job_id_str - id of the job to be signaled, valid formats include "#" |
| * "#_#" and "#_[expr]" |
| * IN signal - signal to send, SIGKILL == cancel the job |
| * IN flags - see KILL_JOB_* flags in slurm.h |
| * IN uid - uid of requesting user |
| * IN preempt - true if job being preempted |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags, |
| uid_t uid, bool preempt) |
| { |
| job_record_t *job_ptr; |
| uint32_t job_id; |
| time_t now = time(NULL); |
| char *end_ptr = NULL; |
| long int long_id; |
| bitstr_t *array_bitmap = NULL; |
| int32_t i, i_first, i_last; |
| int rc = SLURM_SUCCESS, rc2; |
| |
| if (max_array_size == NO_VAL) { |
| max_array_size = slurm_conf.max_array_sz; |
| } |
| |
| long_id = strtol(job_id_str, &end_ptr, 10); |
| if ((long_id <= 0) || (long_id == LONG_MAX) || |
| ((end_ptr[0] != '\0') && (end_ptr[0] != '_') && |
| (end_ptr[0] != '+'))) { |
| info("%s(1): invalid JobId=%s", __func__, job_id_str); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| if ((end_ptr[0] == '_') && (end_ptr[1] == '*')) |
| end_ptr += 2; /* Defaults to full job array */ |
| |
| if (end_ptr[0] == '+') { /* Signal hetjob element */ |
| job_id = (uint32_t) long_id; |
| long_id = strtol(end_ptr + 1, &end_ptr, 10); |
| if ((long_id < 0) || (long_id == LONG_MAX) || |
| (end_ptr[0] != '\0')) { |
| info("%s(2): invalid JobId=%s", __func__, job_id_str); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| job_ptr = find_het_job_record(job_id, (uint32_t) long_id); |
| if (!job_ptr) |
| return ESLURM_INVALID_JOB_ID; |
| if ((job_ptr->user_id != uid) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, false)) { |
| error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u", |
| job_ptr, uid); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if (!job_ptr->het_job_id) |
| return ESLURM_NOT_HET_JOB; |
| |
| if (!job_ptr->het_job_offset) |
| /* |
| * HetJob leader. Attempt to signal all components no |
| * matter what. If we cared about state or whole_hetjob |
| * for the leader, we would be being inconsistent with |
| * direct format '#' below. But even if we made an |
| * exception here for leader R and no whole_hetjob, |
| * job_complete() would end all the components anyways. |
| */ |
| return het_job_signal(job_ptr, signal, flags, uid, |
| preempt); |
| |
| /* HetJob non-leader component. */ |
| if (_get_whole_hetjob()) { |
| /* Attempt to signal all components no matter state. */ |
| job_record_t *het_leader = NULL; |
| if (!(het_leader = find_het_job_record(job_id, 0))) { |
| /* Leader not found. Attempt individual. */ |
| error("%s: can't find HetJob leader for HetJob component %pJ", |
| __func__, job_ptr); |
| return job_signal(job_ptr, signal, |
| flags, uid, preempt); |
| } else { |
| /* Got the leader, signal all. */ |
| return het_job_signal(het_leader, |
| signal, flags, |
| uid, preempt); |
| } |
| } |
| |
| if (IS_JOB_PENDING(job_ptr)) |
| return ESLURM_NOT_WHOLE_HET_JOB; |
| else |
| return job_signal(job_ptr, signal, flags, uid, preempt); |
| } |
| |
| last_job_update = now; |
| job_id = (uint32_t) long_id; |
| if (end_ptr[0] == '\0') { /* Single job (or full job array) */ |
| int jobs_done = 0, jobs_signaled = 0; |
| job_record_t *job_ptr_done = NULL; |
| job_ptr = find_job_record(job_id); |
| if (job_ptr && (job_ptr->user_id != uid) && |
| !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, false)) { |
| error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u", |
| job_ptr, uid); |
| return ESLURM_ACCESS_DENIED; |
| } |
| if (job_ptr && job_ptr->het_job_list) { /* Hetjob leader */ |
| return het_job_signal(job_ptr, signal, flags, uid, |
| preempt); |
| } |
| if (job_ptr && job_ptr->het_job_id && _get_whole_hetjob()) { |
| job_record_t *het_job_leader; |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (het_job_leader && het_job_leader->het_job_list) { |
| return het_job_signal(het_job_leader, signal, |
| flags, uid, preempt); |
| } |
| error("%s: Hetjob leader %pJ not found", |
| __func__, job_ptr); |
| } |
| if (job_ptr && job_ptr->het_job_id && IS_JOB_PENDING(job_ptr)) |
| return ESLURM_NOT_WHOLE_HET_JOB;/* Hetjob child */ |
| |
| if (job_ptr && |
| (((job_ptr->array_task_id == NO_VAL) && |
| (job_ptr->array_recs == NULL)) || |
| ((job_ptr->array_task_id != NO_VAL) && |
| ((job_ptr->array_job_id != job_id) || |
| (flags & KILL_ARRAY_TASK))))) { |
| /* |
| * This is a regular job or a single task of a job |
| * array. KILL_ARRAY_TASK indicates that the meta job |
| * should be treated as a single task. |
| */ |
| return job_signal(job_ptr, signal, flags, uid, preempt); |
| } |
| |
| /* |
| * This will kill the meta record that holds all |
| * pending jobs. We want to kill this first so we |
| * don't start jobs just to kill them as we are |
| * killing other elements of the array. |
| */ |
| if (job_ptr && job_ptr->array_recs) { |
| /* This is a job array */ |
| job_ptr_done = job_ptr; |
| rc = job_signal(job_ptr, signal, flags, uid, preempt); |
| if (rc == ESLURM_ACCESS_DENIED) |
| return rc; |
| jobs_signaled++; |
| if (rc == ESLURM_ALREADY_DONE) { |
| jobs_done++; |
| rc = SLURM_SUCCESS; |
| } |
| } |
| |
| /* Signal all tasks of this job array */ |
| job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; |
| if (!job_ptr && !job_ptr_done) { |
| info("%s(3): invalid JobId=%u", __func__, job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| while (job_ptr) { |
| if (job_ptr->array_job_id == job_id) |
| break; |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| while (job_ptr) { |
| if ((job_ptr->array_job_id == job_id) && |
| (job_ptr != job_ptr_done)) { |
| rc2 = job_signal(job_ptr, signal, flags, uid, |
| preempt); |
| jobs_signaled++; |
| if (rc2 == ESLURM_ALREADY_DONE) { |
| jobs_done++; |
| } else { |
| rc = MAX(rc, rc2); |
| } |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| if ((rc == SLURM_SUCCESS) && (jobs_done == jobs_signaled)) |
| return ESLURM_ALREADY_DONE; |
| return rc; |
| |
| } |
| |
| array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size, |
| &i_last); |
| if (!array_bitmap) { |
| info("%s(4): invalid JobId=%s", __func__, job_id_str); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto endit; |
| } |
| |
| /* Find some job record and validate the user signaling the job */ |
| if (!(job_ptr = _find_meta_job_record(job_id))) { |
| info("%s(5): invalid JobId=%s", __func__, job_id_str); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto endit; |
| } |
| |
| if ((job_ptr->user_id != uid) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, false)) { |
| error("%s: Security violation JOB_CANCEL RPC for %pJ from uid %u", |
| __func__, job_ptr, uid); |
| rc = ESLURM_ACCESS_DENIED; |
| goto endit; |
| } |
| |
| _signal_pending_job_array_tasks(job_ptr, &array_bitmap, signal, uid, |
| i_last, now, &rc); |
| |
| i_first = bit_ffs(array_bitmap); |
| if (i_first >= 0) |
| i_last = bit_fls(array_bitmap); |
| else |
| i_last = -2; |
| for (i = i_first; i <= i_last; i++) { |
| if (!bit_test(array_bitmap, i)) |
| continue; |
| job_ptr = find_job_array_rec(job_id, i); |
| if (job_ptr == NULL) { |
| info("%s(6): invalid JobId=%u_%d", |
| __func__, job_id, i); |
| rc = ESLURM_INVALID_JOB_ID; |
| continue; |
| } |
| |
| rc2 = job_signal(job_ptr, signal, flags, uid, preempt); |
| rc = MAX(rc, rc2); |
| } |
| endit: |
| FREE_NULL_BITMAP(array_bitmap); |
| |
| return rc; |
| } |
| |
| static void _free_selected_step_array(slurm_selected_step_t ***jobs_p, |
| uint32_t cnt) |
| { |
| slurm_selected_step_t **jobs = *jobs_p; |
| |
| for (int i = 0; i < cnt; i++) |
| slurm_destroy_selected_step(jobs[i]); |
| xfree(jobs); |
| *jobs_p = NULL; |
| } |
| |
| static void _free_array_task_filter(void *x) |
| { |
| array_task_filter_t *rec = x; |
| |
| if (!rec) |
| return; |
| |
| /* |
| * Do not use slurm_destroy_selected_step() as that will |
| * unconditionally free the bitmap. |
| */ |
| if (rec->free_array_bitmap) |
| FREE_NULL_BITMAP(rec->filter_id->array_bitmap); |
| xfree(rec->filter_id); |
| /* Do not free rec->job_ptr */ |
| xfree(rec); |
| } |
| |
| static int _parse_jobs_array(char **jobs_array, uint32_t jobs_cnt, |
| slurm_selected_step_t ***jobs_p) |
| { |
| slurm_selected_step_t **jobs = NULL; |
| |
| if (!jobs_array) |
| return SLURM_SUCCESS; |
| if (max_array_size == NO_VAL) |
| max_array_size = slurm_conf.max_array_sz; |
| |
| jobs = xcalloc(jobs_cnt, sizeof(*jobs)); |
| for (int i = 0; i < jobs_cnt; i++) { |
| int rc; |
| |
| jobs[i] = xmalloc(sizeof(*jobs[i])); |
| rc = unfmt_job_id_string(jobs_array[i], jobs[i], |
| max_array_size); |
| if (rc != SLURM_SUCCESS) { |
| _free_selected_step_array(&jobs, i + 1); |
| return rc; |
| } |
| } |
| |
| *jobs_p = jobs; |
| return SLURM_SUCCESS; |
| } |
| |
| static bool _verify_kill_jobs_msg(kill_jobs_msg_t *kill_msg) |
| { |
| /* At least one job id or filter must be specified */ |
| if (!kill_msg->account && !kill_msg->job_name && |
| !kill_msg->jobs_cnt && !kill_msg->partition && !kill_msg->qos && |
| !kill_msg->reservation && |
| ((kill_msg->state & JOB_STATE_BASE) == JOB_END) && |
| !kill_msg->user_name && !kill_msg->wckey && !kill_msg->nodelist) |
| return false; |
| |
| return true; |
| } |
| |
| extern int job_mgr_signal_jobs(kill_jobs_msg_t *kill_msg, uid_t auth_uid, |
| kill_jobs_resp_msg_t **resp_msg_p) |
| { |
| int rc = 0; |
| signal_jobs_args_t signal_args = { |
| .auth_uid = auth_uid, |
| .kill_msg = kill_msg, |
| }; |
| slurm_selected_step_t **jobs = NULL; |
| assoc_mgr_lock_t assoc_lock = { |
| .user = READ_LOCK, |
| }; |
| |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(FED_LOCK, READ_LOCK)); |
| |
| if (!_verify_kill_jobs_msg(kill_msg)) |
| return ESLURM_SIGNAL_JOBS_INVALID; |
| |
| /* |
| * Items in the signal_args.responses list are free'd in |
| * _foreach_xfer_responses |
| */ |
| signal_args.responses = list_create(NULL); |
| signal_args.array_leader_list = list_create(NULL); |
| signal_args.other_job_list = list_create(NULL); |
| |
| if (kill_msg->jobs_cnt) { |
| rc = _parse_jobs_array(kill_msg->jobs_array, |
| kill_msg->jobs_cnt, &jobs); |
| if (rc != SLURM_SUCCESS) |
| return rc; |
| signal_args.pending_array_task_list = |
| list_create(_free_array_task_filter); |
| } |
| |
| if (max_array_size == NO_VAL) |
| max_array_size = slurm_conf.max_array_sz; |
| |
| /* |
| * Get a list of jobs to signal first, then signal the jobs outside of |
| * the job_list lock. Array job leaders need to be signalled before |
| * the tasks in their array. Try to signal each job; add each failure |
| * to signal_args.responses. |
| * |
| * We check if the auth_uid is able to signal the job on every possible |
| * job that matches the filter. Lock the assoc lock once here rather |
| * than every time we check. |
| */ |
| assoc_mgr_lock(&assoc_lock); |
| if (jobs) |
| _filter_jobs_ids(jobs, kill_msg->jobs_cnt, &signal_args); |
| else |
| list_for_each_ro(job_list, _foreach_filter_job_list, |
| &signal_args); |
| /* |
| * het_leader is only used during filtering; explicitly NULL it out |
| * so it cannot accidentally be used later. |
| */ |
| signal_args.het_leader = NULL; |
| assoc_mgr_unlock(&assoc_lock); |
| |
| list_for_each(signal_args.array_leader_list, _foreach_signal_job, |
| &signal_args); |
| if (signal_args.pending_array_task_list) { |
| signal_args.now = time(NULL); |
| list_for_each(signal_args.pending_array_task_list, |
| _foreach_signal_job_array_tasks, &signal_args); |
| } |
| list_for_each(signal_args.other_job_list, _foreach_signal_job, |
| &signal_args); |
| |
| _build_kill_jobs_resp_msg(&signal_args, resp_msg_p); |
| |
| /* Cleanup */ |
| _free_selected_step_array(&jobs, kill_msg->jobs_cnt); |
| FREE_NULL_LIST(signal_args.array_leader_list); |
| FREE_NULL_LIST(signal_args.pending_array_task_list); |
| FREE_NULL_LIST(signal_args.other_job_list); |
| FREE_NULL_LIST(signal_args.responses); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal, |
| uint16_t flags) |
| { |
| bitoff_t i; |
| signal_tasks_msg_t *signal_tasks_msg = NULL; |
| agent_arg_t *agent_args = NULL; |
| node_record_t *node_ptr; |
| |
| xassert(job_ptr); |
| xassert(job_ptr->batch_host); |
| i = bit_ffs(job_ptr->node_bitmap); |
| if (i < 0) { |
| error("%s: %pJ lacks assigned nodes", __func__, job_ptr); |
| return; |
| } |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_SIGNAL_TASKS; |
| agent_args->retry = 1; |
| agent_args->node_count = 1; |
| if ((node_ptr = find_node_record(job_ptr->batch_host))) |
| agent_args->protocol_version = node_ptr->protocol_version; |
| agent_args->hostlist = hostlist_create(job_ptr->batch_host); |
| signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t)); |
| signal_tasks_msg->step_id.job_id = job_ptr->job_id; |
| signal_tasks_msg->step_id.step_id = SLURM_BATCH_SCRIPT; |
| signal_tasks_msg->step_id.step_het_comp = NO_VAL; |
| |
| signal_tasks_msg->flags = flags; |
| signal_tasks_msg->signal = signal; |
| |
| agent_args->msg_args = signal_tasks_msg; |
| set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_args); |
| } |
| |
| /* |
| * prolog_complete - note the normal termination of the prolog |
| * IN job_id - id of the job which completed |
| * IN prolog_return_code - prolog's return code, |
| * if set then set job state to FAILED |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int prolog_complete(uint32_t job_id, uint32_t prolog_return_code, |
| char *node_name) |
| { |
| job_record_t *job_ptr; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| info("prolog_complete: invalid JobId=%u", job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr)) |
| return SLURM_SUCCESS; |
| |
| if (prolog_return_code) { |
| error("Prolog launch failure, %pJ", job_ptr); |
| job_ptr->exit_code = prolog_return_code; |
| } |
| /* |
| * job_ptr->node_bitmap_pr is always NULL for front end systems |
| */ |
| if (job_ptr->node_bitmap_pr) { |
| node_record_t *node_ptr = NULL; |
| |
| if (node_name) |
| node_ptr = find_node_record(node_name); |
| |
| if (node_ptr) { |
| bit_clear(job_ptr->node_bitmap_pr, node_ptr->index); |
| } else { |
| if (node_name) |
| error("%s: can't find node:%s", |
| __func__, node_name); |
| bit_clear_all(job_ptr->node_bitmap_pr); |
| } |
| } |
| if (!job_ptr->node_bitmap_pr || |
| (bit_ffs(job_ptr->node_bitmap_pr) == -1)) |
| { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| agent_trigger(999, false, true); |
| } |
| last_job_update = time(NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _handle_requeue_limit(job_record_t *job_ptr, const char *caller) |
| { |
| if (job_ptr->batch_flag <= slurm_conf.max_batch_requeue) |
| return; |
| |
| debug("%s: Holding %pJ, repeated requeue failures", |
| caller, job_ptr); |
| |
| job_state_set_flag(job_ptr, JOB_REQUEUE_HOLD); |
| job_ptr->state_reason = WAIT_MAX_REQUEUE; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_desc = |
| xstrdup("launch failure limit exceeded requeued held"); |
| job_ptr->batch_flag = 1; |
| job_ptr->priority = 0; |
| } |
| |
| static int _job_complete(job_record_t *job_ptr, uid_t uid, bool requeue, |
| bool node_fail, uint32_t job_return_code) |
| { |
| node_record_t *node_ptr; |
| time_t now = time(NULL); |
| uint32_t job_comp_flag = 0; |
| bool suspended = false; |
| int i; |
| int use_cloud = false; |
| uint16_t over_time_limit; |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(FED_LOCK, READ_LOCK)); |
| |
| if (IS_JOB_FINISHED(job_ptr)) { |
| if (job_ptr->exit_code == 0) |
| job_ptr->exit_code = job_return_code; |
| return ESLURM_ALREADY_DONE; |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr)) |
| return SLURM_SUCCESS; /* avoid replay */ |
| |
| if ((job_return_code & 0xff) == SIG_OOM) { |
| info("%s: %pJ OOM failure", __func__, job_ptr); |
| } else if (WIFSIGNALED(job_return_code)) { |
| info("%s: %pJ WTERMSIG %d", |
| __func__, job_ptr, WTERMSIG(job_return_code)); |
| } else if (WIFEXITED(job_return_code)) { |
| info("%s: %pJ WEXITSTATUS %d", |
| __func__, job_ptr, WEXITSTATUS(job_return_code)); |
| } |
| |
| if (IS_JOB_RUNNING(job_ptr)) |
| job_comp_flag = JOB_COMPLETING; |
| else if (IS_JOB_PENDING(job_ptr)) { |
| job_return_code = NO_VAL; |
| fed_mgr_job_revoke_sibs(job_ptr); |
| } |
| |
| if ((job_return_code == NO_VAL) && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) { |
| if (node_fail) { |
| info("%s: %pJ cancelled by node failure", |
| __func__, job_ptr); |
| } else { |
| info("%s: %pJ cancelled by interactive user", |
| __func__, job_ptr); |
| } |
| } |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| uint32_t suspend_job_state = job_ptr->job_state; |
| /* |
| * we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_state_set(job_ptr, JOB_CANCELLED); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_state_set(job_ptr, suspend_job_state); |
| job_comp_flag = JOB_COMPLETING; |
| suspended = true; |
| } |
| |
| if (job_comp_flag && (job_ptr->node_cnt == 0)) { |
| /* |
| * Job has no resources left (used to expand another job). |
| * Avoid duplicate run of epilog and underflow in CPU count. |
| */ |
| job_comp_flag = 0; |
| } |
| |
| if (requeue && job_ptr->details && job_ptr->batch_flag) { |
| /* |
| * We want this job to look like it was terminated in the |
| * accounting logs. Set a new submit time so the restarted |
| * job looks like a new job. |
| */ |
| job_ptr->end_time = now; |
| if (job_ptr->bit_flags & GRACE_PREEMPT) { |
| job_state_set(job_ptr, (JOB_PREEMPTED | job_comp_flag)); |
| |
| /* clear signal sent on GracePeriod start */ |
| job_ptr->bit_flags &= (~GRACE_PREEMPT); |
| } else { |
| job_state_set(job_ptr, JOB_NODE_FAIL); |
| job_ptr->exit_code = job_return_code; |
| } |
| |
| job_completion_logger(job_ptr, true); |
| /* |
| * Do this after the epilog complete. |
| * Setting it here is too early. |
| */ |
| //job_record_set_sluid(job_ptr); |
| //job_ptr->details->submit_time = now + 1; |
| if (job_ptr->node_bitmap) { |
| i = bit_ffs(job_ptr->node_bitmap); |
| if (i >= 0) { |
| node_ptr = node_record_table_ptr[i]; |
| if (IS_NODE_CLOUD(node_ptr)) |
| use_cloud = true; |
| } |
| } |
| if (!use_cloud) |
| job_ptr->batch_flag++; /* only one retry */ |
| job_ptr->restart_cnt++; |
| |
| /* clear signal sent flag on requeue */ |
| job_ptr->warn_flags &= ~WARN_SENT; |
| |
| |
| job_state_set(job_ptr, (JOB_PENDING | job_comp_flag)); |
| job_ptr->exit_code = 0; |
| /* |
| * Since the job completion logger removes the job submit |
| * information, we need to add it again. |
| */ |
| acct_policy_add_job_submit(job_ptr, false); |
| if (node_fail) { |
| info("%s: requeue %pJ due to node failure", |
| __func__, job_ptr); |
| } else { |
| info("%s: requeue %pJ per user/system request", |
| __func__, job_ptr); |
| } |
| /* hold job if over requeue limit */ |
| _handle_requeue_limit(job_ptr, __func__); |
| } else if (IS_JOB_PENDING(job_ptr) && job_ptr->details && |
| job_ptr->batch_flag) { |
| /* |
| * Possible failure mode with DOWN node and job requeue. |
| * The DOWN node might actually respond to the cancel and |
| * take us here. Don't run job_completion_logger here since |
| * this is here to catch duplicate cancels from slowly |
| * responding slurmds |
| */ |
| return SLURM_SUCCESS; |
| } else { |
| if (job_ptr->part_ptr && |
| (job_ptr->part_ptr->over_time_limit != NO_VAL16)) { |
| over_time_limit = job_ptr->part_ptr->over_time_limit; |
| } else { |
| over_time_limit = slurm_conf.over_time_limit; |
| } |
| |
| if (node_fail) { |
| job_state_set(job_ptr, (JOB_NODE_FAIL | job_comp_flag)); |
| job_ptr->exit_code = job_return_code; |
| job_ptr->requid = uid; |
| } else if (job_ptr->bit_flags & GRACE_PREEMPT) { |
| job_state_set(job_ptr, (JOB_PREEMPTED | job_comp_flag)); |
| } else if (job_return_code == NO_VAL) { |
| job_state_set(job_ptr, (JOB_CANCELLED | job_comp_flag)); |
| job_ptr->requid = uid; |
| } else if ((job_return_code & 0xff) == SIG_OOM) { |
| job_state_set(job_ptr, (JOB_OOM | job_comp_flag)); |
| job_ptr->exit_code = job_return_code; |
| job_ptr->state_reason = FAIL_OOM; |
| xfree(job_ptr->state_desc); |
| } else if (WIFEXITED(job_return_code) && |
| WEXITSTATUS(job_return_code)) { |
| job_state_set(job_ptr, (JOB_FAILED | job_comp_flag)); |
| job_ptr->exit_code = job_return_code; |
| job_ptr->state_reason = FAIL_EXIT_CODE; |
| xfree(job_ptr->state_desc); |
| } else if (WIFSIGNALED(job_return_code)) { |
| job_state_set(job_ptr, (JOB_FAILED | job_comp_flag)); |
| job_ptr->exit_code = job_return_code; |
| job_ptr->state_reason = FAIL_SIGNAL; |
| xfree(job_ptr->state_desc); |
| xstrfmtcat(job_ptr->state_desc, |
| "RaisedSignal:%d(%s)", |
| WTERMSIG(job_return_code), |
| strsignal(WTERMSIG(job_return_code))); |
| } else if (job_comp_flag |
| && ((job_ptr->end_time |
| + over_time_limit * 60) < now)) { |
| /* |
| * Test if the job has finished before its allowed |
| * over time has expired. |
| */ |
| job_state_set(job_ptr, (JOB_TIMEOUT | job_comp_flag)); |
| job_ptr->state_reason = FAIL_TIMEOUT; |
| xfree(job_ptr->state_desc); |
| } else { |
| job_state_set(job_ptr, (JOB_COMPLETE | job_comp_flag)); |
| job_ptr->exit_code = job_return_code; |
| } |
| |
| if (suspended) { |
| job_ptr->end_time = job_ptr->suspend_time; |
| job_ptr->tot_sus_time += |
| difftime(now, job_ptr->suspend_time); |
| } else |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| } |
| |
| last_job_update = now; |
| job_ptr->time_last_active = now; /* Timer for resending kill RPC */ |
| if (job_comp_flag) { /* job was running */ |
| build_cg_bitmap(job_ptr); |
| deallocate_nodes(job_ptr, false, suspended, false); |
| } |
| |
| /* Check for and cleanup stuck scripts */ |
| if (IS_JOB_PENDING(job_ptr) || IS_JOB_CONFIGURING(job_ptr) || |
| (job_ptr->details && job_ptr->details->prolog_running)) { |
| slurmscriptd_flush_job(job_ptr->job_id); |
| track_script_flush_job(job_ptr->job_id); |
| } |
| |
| info("%s: %pJ done", __func__, job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_het_job_complete(void *x, void *arg) |
| { |
| job_record_t *het_job_ptr = x; |
| foreach_complete_hetjob_t *foreach_complete_hetjob = arg; |
| job_record_t *het_job_leader = foreach_complete_hetjob->het_job_leader; |
| int rc; |
| |
| if (het_job_leader->het_job_id != het_job_ptr->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", |
| __func__, het_job_leader); |
| return 0; |
| } |
| rc = _job_complete(het_job_ptr, |
| foreach_complete_hetjob->uid, |
| foreach_complete_hetjob->requeue, |
| foreach_complete_hetjob->node_fail, |
| foreach_complete_hetjob->job_return_code); |
| if (rc != SLURM_SUCCESS) |
| foreach_complete_hetjob->rc = rc; |
| |
| return 0; |
| } |
| |
| /* |
| * job_complete - note the normal termination the specified job |
| * IN job_id - id of the job which completed |
| * IN uid - user id of user issuing the RPC |
| * IN requeue - job should be run again if possible |
| * IN node_fail - true if job terminated due to node failure |
| * IN job_return_code - job's return code, if set then set state to FAILED |
| * RET - 0 on success, otherwise ESLURM error code |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, |
| bool node_fail, uint32_t job_return_code) |
| { |
| job_record_t *job_ptr; |
| int rc; |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(FED_LOCK, READ_LOCK)); |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) { |
| info("%s: invalid JobId=%u", __func__, job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| |
| if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) { |
| error("%s: Security violation, JOB_COMPLETE RPC for %pJ from uid %u", |
| __func__, job_ptr, uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| if (job_ptr->het_job_list) { |
| foreach_complete_hetjob_t foreach_complete_hetjob = { |
| .het_job_leader = job_ptr, |
| .job_return_code = job_return_code, |
| .node_fail = node_fail, |
| .requeue = requeue, |
| .rc = SLURM_SUCCESS, |
| .uid = uid, |
| }; |
| (void) list_for_each(job_ptr->het_job_list, |
| _foreach_het_job_complete, |
| &foreach_complete_hetjob); |
| |
| rc = foreach_complete_hetjob.rc; |
| } else { |
| rc = _job_complete(job_ptr, uid, requeue, node_fail, |
| job_return_code); |
| } |
| |
| return rc; |
| } |
| |
| static int _alt_part_test(part_record_t *part_ptr, part_record_t **part_ptr_new) |
| { |
| part_record_t *alt_part_ptr = NULL; |
| char *alt_name; |
| |
| *part_ptr_new = NULL; |
| if ((part_ptr->state_up & PARTITION_SUBMIT) == 0) { |
| info("_alt_part_test: original partition is not available " |
| "(drain or inactive): %s", part_ptr->name); |
| alt_name = part_ptr->alternate; |
| while (alt_name) { |
| alt_part_ptr = find_part_record(alt_name); |
| if (alt_part_ptr == NULL) { |
| info("_alt_part_test: invalid alternate " |
| "partition name specified: %s", alt_name); |
| return ESLURM_INVALID_PARTITION_NAME; |
| } |
| if (alt_part_ptr == part_ptr) { |
| info("_alt_part_test: no valid alternate " |
| "partition is available"); |
| return ESLURM_PARTITION_NOT_AVAIL; |
| } |
| if (alt_part_ptr->state_up & PARTITION_SUBMIT) |
| break; |
| /* Try next alternate in the sequence */ |
| alt_name = alt_part_ptr->alternate; |
| } |
| if (alt_name == NULL) { |
| info("_alt_part_test: no valid alternate partition is " |
| "available"); |
| return ESLURM_PARTITION_NOT_AVAIL; |
| } |
| *part_ptr_new = alt_part_ptr; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| static int _qos_part_check(void *object, void *arg) |
| { |
| slurmdb_qos_rec_t *qos_ptr = object; |
| qos_part_check_t *qos_part_check = arg; |
| part_record_t *part_ptr = qos_part_check->part_ptr; |
| |
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (qos_part_check->min_nodes != NO_VAL) && |
| (qos_part_check->min_nodes < part_ptr->min_nodes) && |
| (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_MIN_NODE))) { |
| debug2("%s: Job requested for nodes (%u) smaller than partition %s(%u) min nodes", |
| __func__, qos_part_check->min_nodes, |
| part_ptr->name, part_ptr->min_nodes); |
| qos_part_check->error_code = ESLURM_INVALID_NODE_COUNT; |
| return -1; |
| } |
| |
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (qos_part_check->max_nodes != NO_VAL) && |
| (qos_part_check->max_nodes > part_ptr->max_nodes) && |
| (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_MAX_NODE))) { |
| debug2("%s: Job requested for nodes (%u) greater than partition %s(%u) max nodes", |
| __func__, qos_part_check->max_nodes, |
| part_ptr->name, part_ptr->max_nodes); |
| qos_part_check->error_code = ESLURM_INVALID_NODE_COUNT; |
| return -1; |
| } |
| |
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (qos_part_check->time_limit != NO_VAL) && |
| (qos_part_check->time_limit > part_ptr->max_time) && |
| (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT))) { |
| debug2("%s: Job time limit (%u) exceeds limit of partition %s(%u)", |
| __func__, qos_part_check->time_limit, |
| part_ptr->name, part_ptr->max_time); |
| qos_part_check->error_code = ESLURM_INVALID_TIME_LIMIT; |
| return -1; |
| } |
| |
| if (slurm_conf.enforce_part_limits) { |
| if ((qos_part_check->error_code = |
| part_policy_valid_qos(part_ptr, qos_ptr, |
| qos_part_check->submit_uid, |
| NULL)) != SLURM_SUCCESS) |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Test if this job can use this partition |
| * |
| * NOTE: This function is also called with a dummy job_desc_msg_t from |
| * job_limits_check() if there is any new check added here you may also have to |
| * add that parameter to the job_desc_msg_t in that function. |
| */ |
| static int _part_access_check(part_record_t *part_ptr, job_desc_msg_t *job_desc, |
| bitstr_t *req_bitmap, uid_t submit_uid, |
| slurmdb_qos_rec_t *qos_ptr, |
| list_t *qos_ptr_list, char *acct) |
| { |
| uint32_t total_nodes; |
| qos_part_check_t qos_part_check = { |
| .error_code = SLURM_SUCCESS, |
| .max_nodes = job_desc->max_nodes, |
| .min_nodes = job_desc->min_nodes, |
| .part_ptr = part_ptr, |
| .submit_uid = submit_uid, |
| .time_limit = job_desc->time_limit, |
| }; |
| int rc = SLURM_SUCCESS; |
| |
| xassert(verify_assoc_lock(ASSOC_LOCK, READ_LOCK)); |
| xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK)); |
| |
| if ((part_ptr->flags & PART_FLAG_REQ_RESV) && |
| (!job_desc->reservation || job_desc->reservation[0] == '\0')) { |
| debug2("%s: uid %u access to partition %s " |
| "denied, requires reservation", __func__, |
| (unsigned int) submit_uid, part_ptr->name); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0) && |
| (submit_uid != slurm_conf.slurm_user_id)) { |
| debug2("%s: uid %u access to partition %s " |
| "denied, not root", __func__, |
| (unsigned int) submit_uid, part_ptr->name); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if ((job_desc->user_id == 0) && (part_ptr->flags & PART_FLAG_NO_ROOT)) { |
| error("%s: Security violation, SUBMIT_JOB for " |
| "user root disabled", __func__); |
| return ESLURM_USER_ID_MISSING; |
| } |
| |
| if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) { |
| debug2("%s: uid %u access to partition %s " |
| "denied, bad allocating node: %s", __func__, |
| (unsigned int) job_desc->user_id, part_ptr->name, |
| job_desc->alloc_node); |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (job_desc->min_cpus != NO_VAL)) { |
| if (job_desc->min_cpus > part_ptr->total_cpus) { |
| debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)", |
| __func__, job_desc->min_cpus, part_ptr->name, |
| part_ptr->total_cpus); |
| return ESLURM_TOO_MANY_REQUESTED_CPUS; |
| } else if (job_desc->min_cpus > |
| (part_ptr->max_cpus_per_node * |
| part_ptr->total_nodes)) { |
| debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)", |
| __func__, job_desc->min_cpus, part_ptr->name, |
| (part_ptr->max_cpus_per_node * |
| part_ptr->total_nodes)); |
| return ESLURM_TOO_MANY_REQUESTED_CPUS; |
| } |
| } |
| |
| /* Check against total nodes on the partition */ |
| total_nodes = part_ptr->total_nodes; |
| if ((part_ptr->state_up & PARTITION_SCHED) && |
| (job_desc->min_nodes != NO_VAL) && |
| (job_desc->min_nodes > total_nodes)) { |
| debug2("%s: Job requested too many nodes (%u) " |
| "of partition %s(%u)", __func__, |
| job_desc->min_nodes, part_ptr->name, total_nodes); |
| return ESLURM_INVALID_NODE_COUNT; |
| } |
| |
| if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) { |
| debug2("%s: requested nodes %s not in partition %s", __func__, |
| job_desc->req_nodes, part_ptr->name); |
| return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION; |
| } |
| |
| /* Check against min/max node limits in the partition */ |
| if (qos_ptr_list) |
| (void) list_for_each(qos_ptr_list, |
| _qos_part_check, |
| &qos_part_check); |
| else |
| (void) _qos_part_check(qos_ptr, &qos_part_check); |
| if (qos_part_check.error_code != SLURM_SUCCESS) |
| return qos_part_check.error_code; |
| |
| if (slurm_conf.enforce_part_limits) { |
| if (!validate_group(part_ptr, job_desc->user_id)) { |
| debug2("%s: uid %u not in group permitted to use this partition (%s). groups allowed: %s", |
| __func__, job_desc->user_id, part_ptr->name, |
| part_ptr->allow_groups); |
| rc = ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP; |
| goto fini; |
| } |
| |
| if ((rc = part_policy_valid_acct(part_ptr, acct, NULL)) |
| != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| fini: |
| return rc; |
| } |
| |
| static int _foreach_rebuild_part_names(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| foreach_rebuild_names_t *foreach_rebuild_names = arg; |
| |
| if (!foreach_rebuild_names->names) |
| foreach_rebuild_names->part_ptr = part_ptr; |
| else |
| xstrcatat(foreach_rebuild_names->names, |
| &foreach_rebuild_names->names_pos, |
| ","); |
| xstrcatat(foreach_rebuild_names->names, |
| &foreach_rebuild_names->names_pos, |
| part_ptr->name); |
| |
| return 0; |
| } |
| |
| static int _get_job_parts(job_desc_msg_t *job_desc, part_record_t **part_pptr, |
| list_t **part_pptr_list, char **err_msg) |
| { |
| part_record_t *part_ptr = NULL, *part_ptr_new = NULL; |
| list_t *part_ptr_list = NULL; |
| int rc = SLURM_SUCCESS; |
| |
| /* Identify partition(s) and set pointer(s) to their struct */ |
| if (job_desc->partition) { |
| char *err_part = NULL; |
| get_part_list(job_desc->partition, &part_ptr_list, &part_ptr, |
| &err_part); |
| if (part_ptr == NULL) { |
| info("%s: invalid partition specified: %s", |
| __func__, job_desc->partition); |
| if (err_msg) { |
| xfree(*err_msg); |
| xstrfmtcat(*err_msg, |
| "invalid partition specified: %s", |
| err_part); |
| xfree(err_part); |
| } |
| FREE_NULL_LIST(part_ptr_list); |
| return ESLURM_INVALID_PARTITION_NAME; |
| } |
| } else if (job_desc->reservation && job_desc->reservation[0] != '\0' ) { |
| slurmctld_resv_t *resv_ptr = NULL; |
| resv_ptr = find_resv_name(job_desc->reservation); |
| if (resv_ptr) |
| part_ptr = resv_ptr->part_ptr; |
| if (part_ptr) |
| job_desc->partition = xstrdup(part_ptr->name); |
| } |
| |
| if (!part_ptr) { |
| if (default_part_loc == NULL) { |
| error("%s: default partition not set", __func__); |
| return ESLURM_DEFAULT_PARTITION_NOT_SET; |
| } |
| part_ptr = default_part_loc; |
| job_desc->partition = xstrdup(part_ptr->name); |
| job_desc->bitflags |= JOB_PART_ASSIGNED; |
| } |
| |
| /* Change partition pointer(s) to alternates as needed */ |
| if (part_ptr_list) { |
| int fail_rc = SLURM_SUCCESS; |
| part_record_t *part_ptr_tmp; |
| bool rebuild_name_list = false; |
| list_itr_t *iter = list_iterator_create(part_ptr_list); |
| |
| /* |
| * Skipping this for now since we are replacing items in the |
| * list. This is the only place in the code we use |
| * list_insert(). There is probably other ways of doing this, |
| * saving for future generations. |
| */ |
| while ((part_ptr_tmp = list_next(iter))) { |
| rc = _alt_part_test(part_ptr_tmp, &part_ptr_new); |
| if (rc != SLURM_SUCCESS) { |
| fail_rc = rc; |
| list_remove(iter); |
| rebuild_name_list = true; |
| continue; |
| } |
| if (part_ptr_new) { |
| list_insert(iter, part_ptr_new); |
| list_remove(iter); |
| rebuild_name_list = true; |
| } |
| } |
| list_iterator_destroy(iter); |
| if (list_is_empty(part_ptr_list)) { |
| if (fail_rc != SLURM_SUCCESS) |
| rc = fail_rc; |
| else |
| rc = ESLURM_PARTITION_NOT_AVAIL; |
| goto fini; |
| } |
| rc = SLURM_SUCCESS; /* At least some partition usable */ |
| if (rebuild_name_list) { |
| foreach_rebuild_names_t foreach_rebuild_names = { 0 }; |
| (void) list_for_each(part_ptr_list, |
| _foreach_rebuild_part_names, |
| &foreach_rebuild_names); |
| part_ptr = foreach_rebuild_names.part_ptr; |
| xfree(job_desc->partition); |
| job_desc->partition = foreach_rebuild_names.names; |
| foreach_rebuild_names.names = NULL; |
| |
| if (!part_ptr) { |
| rc = ESLURM_PARTITION_NOT_AVAIL; |
| goto fini; |
| } |
| } |
| } else { |
| rc = _alt_part_test(part_ptr, &part_ptr_new); |
| if (rc != SLURM_SUCCESS) |
| goto fini; |
| if (part_ptr_new) { |
| part_ptr = part_ptr_new; |
| xfree(job_desc->partition); |
| job_desc->partition = xstrdup(part_ptr->name); |
| } |
| } |
| |
| *part_pptr = part_ptr; |
| if (part_pptr_list) { |
| *part_pptr_list = part_ptr_list; |
| part_ptr_list = NULL; |
| } else |
| FREE_NULL_LIST(part_ptr_list); |
| |
| fini: |
| return rc; |
| } |
| |
| static int _foreach_valid_part(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| foreach_valid_part_t *foreach_valid_part = arg; |
| int rc; |
| |
| /* |
| * Associations should have already be checked before |
| * this. It is not allowed to have a multiple partition |
| * request with partition based associations. |
| */ |
| rc = _part_access_check(part_ptr, |
| foreach_valid_part->job_desc, |
| foreach_valid_part->req_bitmap, |
| foreach_valid_part->submit_uid, |
| foreach_valid_part->qos_ptr, |
| foreach_valid_part->qos_ptr_list, |
| foreach_valid_part->assoc_ptr ? |
| foreach_valid_part->assoc_ptr->acct : NULL); |
| |
| if ((rc != SLURM_SUCCESS) && |
| ((rc == ESLURM_ACCESS_DENIED) || |
| (rc == ESLURM_USER_ID_MISSING) || |
| (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL))) { |
| foreach_valid_part->rc = rc; |
| return -1; |
| } else if (rc != SLURM_SUCCESS) { |
| foreach_valid_part->rc = rc; |
| } else { |
| foreach_valid_part->any_check = true; |
| } |
| |
| /* Set to success since we found a usable partition */ |
| if (foreach_valid_part->any_check && |
| (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ANY)) |
| foreach_valid_part->rc = SLURM_SUCCESS; |
| |
| foreach_valid_part->min_nodes_orig = |
| MIN(foreach_valid_part->min_nodes_orig, |
| part_ptr->min_nodes_orig); |
| foreach_valid_part->max_nodes_orig = |
| MAX(foreach_valid_part->max_nodes_orig, |
| part_ptr->max_nodes_orig); |
| foreach_valid_part->max_time = |
| MAX(foreach_valid_part->max_time, part_ptr->max_time); |
| |
| return 0; |
| } |
| |
| static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid, |
| bitstr_t *req_bitmap, part_record_t *part_ptr, |
| list_t *part_ptr_list, |
| slurmdb_assoc_rec_t *assoc_ptr, |
| slurmdb_qos_rec_t *qos_ptr, |
| list_t *qos_ptr_list) |
| { |
| int rc = SLURM_SUCCESS; |
| uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1; |
| uint32_t max_time = 0; |
| bool any_check = false; |
| |
| xassert(verify_assoc_lock(ASSOC_LOCK, READ_LOCK)); |
| xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK)); |
| |
| /* Change partition pointer(s) to alternates as needed */ |
| if (part_ptr_list) { |
| foreach_valid_part_t foreach_valid_part = { |
| .any_check = any_check, |
| .assoc_ptr = assoc_ptr, |
| .job_desc = job_desc, |
| .max_nodes_orig = 1, |
| .max_time = 0, |
| .min_nodes_orig = INFINITE, |
| .qos_ptr = qos_ptr, |
| .qos_ptr_list = qos_ptr_list, |
| .req_bitmap = req_bitmap, |
| .submit_uid = submit_uid, |
| }; |
| (void) list_for_each(part_ptr_list, _foreach_valid_part, |
| &foreach_valid_part); |
| |
| if (list_is_empty(part_ptr_list) || |
| (slurm_conf.enforce_part_limits && |
| (foreach_valid_part.rc != SLURM_SUCCESS))) { |
| if (slurm_conf.enforce_part_limits == |
| PARTITION_ENFORCE_ALL) |
| rc = foreach_valid_part.rc; |
| else if (slurm_conf.enforce_part_limits == |
| PARTITION_ENFORCE_ANY && !any_check) |
| rc = foreach_valid_part.rc; |
| else { |
| rc = ESLURM_PARTITION_NOT_AVAIL; |
| } |
| goto fini; |
| } |
| any_check = foreach_valid_part.any_check; |
| min_nodes_orig = foreach_valid_part.min_nodes_orig; |
| max_nodes_orig = foreach_valid_part.max_nodes_orig; |
| max_time = foreach_valid_part.max_time; |
| rc = SLURM_SUCCESS; /* At least some partition usable */ |
| } else { |
| min_nodes_orig = part_ptr->min_nodes_orig; |
| max_nodes_orig = part_ptr->max_nodes_orig; |
| max_time = part_ptr->max_time; |
| rc = _part_access_check(part_ptr, job_desc, req_bitmap, |
| submit_uid, qos_ptr, qos_ptr_list, |
| assoc_ptr ? assoc_ptr->acct : NULL); |
| if ((rc != SLURM_SUCCESS) && |
| ((rc == ESLURM_ACCESS_DENIED) || |
| (rc == ESLURM_USER_ID_MISSING) || |
| slurm_conf.enforce_part_limits)) |
| goto fini; |
| /* Enforce Part Limit = no */ |
| rc = SLURM_SUCCESS; |
| } |
| |
| /* Validate job limits against partition limits */ |
| |
| /* Check Partition with the highest limits when there are multiple */ |
| if (job_desc->min_nodes == NO_VAL) { |
| /* Avoid setting the job request to 0 nodes unless requested */ |
| if (!min_nodes_orig) |
| job_desc->min_nodes = 1; |
| else |
| job_desc->min_nodes = min_nodes_orig; |
| } else if ((job_desc->min_nodes > max_nodes_orig) && |
| slurm_conf.enforce_part_limits && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_MAX_NODE)))) { |
| info("%s: job's min nodes greater than " |
| "partition's max nodes (%u > %u)", |
| __func__, job_desc->min_nodes, max_nodes_orig); |
| rc = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } else if ((job_desc->min_nodes < min_nodes_orig) && |
| ((job_desc->max_nodes == NO_VAL) || |
| (job_desc->max_nodes >= min_nodes_orig))) { |
| job_desc->min_nodes = min_nodes_orig; |
| } |
| |
| if ((job_desc->max_nodes != NO_VAL) && |
| slurm_conf.enforce_part_limits && |
| (job_desc->max_nodes < min_nodes_orig) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags |
| & QOS_FLAG_PART_MIN_NODE)))) { |
| info("%s: job's max nodes less than partition's " |
| "min nodes (%u < %u)", |
| __func__, job_desc->max_nodes, min_nodes_orig); |
| rc = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } |
| /* Zero node count OK for persistent burst buffer create or destroy */ |
| if ((job_desc->min_nodes == 0) && |
| (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) || |
| (!job_desc->burst_buffer && !job_desc->script))) { |
| info("%s: min_nodes is zero", __func__); |
| rc = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } |
| |
| if ((job_desc->time_limit == NO_VAL) && |
| (part_ptr->default_time == 0)) { |
| info("%s: job's default time is 0", __func__); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| |
| if ((job_desc->time_limit == NO_VAL) && |
| (part_ptr->default_time != NO_VAL)) |
| job_desc->time_limit = part_ptr->default_time; |
| |
| if ((job_desc->time_min != NO_VAL) && |
| (job_desc->time_min > max_time) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("%s: job's min time greater than " |
| "partition's (%u > %u)", |
| __func__, job_desc->time_min, max_time); |
| rc = ESLURM_INVALID_TIME_MIN_LIMIT; |
| goto fini; |
| } |
| if ((job_desc->time_limit != NO_VAL) && |
| (job_desc->time_limit > max_time) && |
| (job_desc->time_min == NO_VAL) && |
| slurm_conf.enforce_part_limits && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("%s: job's time limit greater than " |
| "partition's (%u > %u)", |
| __func__, job_desc->time_limit, max_time); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| if ((job_desc->time_min != NO_VAL) && |
| (job_desc->time_min > job_desc->time_limit) && |
| (!qos_ptr || (qos_ptr && !(qos_ptr->flags & |
| QOS_FLAG_PART_TIME_LIMIT)))) { |
| info("%s: job's min_time greater time limit " |
| "(%u > %u)", |
| __func__, job_desc->time_min, job_desc->time_limit); |
| rc = ESLURM_INVALID_TIME_MIN_LIMIT; |
| goto fini; |
| } |
| if ((job_desc->deadline) && (job_desc->deadline != NO_VAL)) { |
| char time_str_earliest[256]; |
| char time_str_deadline[256]; |
| time_t now = time(NULL); |
| time_t begin_time = job_desc->begin_time; |
| time_t earliest_start = MAX(begin_time, now); |
| time_t limit_in_sec = job_desc->time_limit * 60; |
| time_t min_in_sec = job_desc->time_min * 60; |
| |
| slurm_make_time_str(&job_desc->deadline, time_str_deadline, |
| sizeof(time_str_deadline)); |
| slurm_make_time_str(&earliest_start, time_str_earliest, |
| sizeof(time_str_earliest)); |
| |
| if (job_desc->deadline < earliest_start) { |
| info("%s: job's deadline is before its earliest start time (%s < %s)", |
| __func__, time_str_deadline, time_str_earliest); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| if ((job_desc->time_min) && (job_desc->time_min != NO_VAL) && |
| (job_desc->deadline < (earliest_start + min_in_sec))) { |
| info("%s: job's min_time exceeds the deadline (%s + %lu > %s)", |
| __func__, time_str_earliest, min_in_sec, |
| time_str_deadline); |
| rc = ESLURM_INVALID_TIME_MIN_LIMIT; |
| goto fini; |
| } |
| if ((!job_desc->time_min || job_desc->time_min == NO_VAL) && |
| (job_desc->time_limit) && |
| (job_desc->time_limit != NO_VAL) && |
| (job_desc->deadline < (earliest_start + limit_in_sec))) { |
| info("%s: job's time_limit exceeds the deadline (%s + %lu > %s)", |
| __func__, time_str_earliest, limit_in_sec, |
| time_str_deadline); |
| rc = ESLURM_INVALID_TIME_LIMIT; |
| goto fini; |
| } |
| } |
| |
| fini: |
| return rc; |
| } |
| |
| /* |
| * job_limits_check - check the limits specified for the job. |
| * IN job_ptr - pointer to job table entry. |
| * IN check_min_time - if true test job's minimum time limit, |
| * otherwise test maximum time limit |
| * RET WAIT_NO_REASON on success, fail status otherwise. |
| */ |
| extern int job_limits_check(job_record_t **job_pptr, bool check_min_time) |
| { |
| job_details_t *detail_ptr; |
| enum job_state_reason fail_reason; |
| part_record_t *part_ptr = NULL; |
| job_record_t *job_ptr = NULL; |
| slurmdb_qos_rec_t *qos_ptr; |
| slurmdb_assoc_rec_t *assoc_ptr; |
| job_desc_msg_t job_desc; |
| int rc; |
| |
| assoc_mgr_lock_t assoc_mgr_read_lock = { |
| .assoc = READ_LOCK, |
| .qos = READ_LOCK, |
| .user = READ_LOCK, |
| }; |
| |
| assoc_mgr_lock(&assoc_mgr_read_lock); |
| |
| job_ptr = *job_pptr; |
| detail_ptr = job_ptr->details; |
| part_ptr = job_ptr->part_ptr; |
| qos_ptr = job_ptr->qos_ptr; |
| assoc_ptr = job_ptr->assoc_ptr; |
| if (!detail_ptr || !part_ptr) { |
| fatal_abort("%pJ has NULL details_ptr and/or part_ptr", |
| job_ptr); |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| return WAIT_NO_REASON; /* To prevent CLANG error */ |
| } |
| |
| fail_reason = WAIT_NO_REASON; |
| |
| /* |
| * Here we need to pretend we are just submitting the job so we can |
| * utilize the already existing function _part_access_check. If any |
| * additional fields in that function are ever checked, the fields set |
| * below will need to be modified. |
| */ |
| slurm_init_job_desc_msg(&job_desc); |
| job_desc.reservation = job_ptr->resv_name; |
| job_desc.user_id = job_ptr->user_id; |
| job_desc.alloc_node = job_ptr->alloc_node; |
| job_desc.min_cpus = detail_ptr->orig_min_cpus; |
| job_desc.min_nodes = detail_ptr->min_nodes; |
| /* _part_access_check looks for NO_VAL instead of 0 */ |
| job_desc.max_nodes = detail_ptr->max_nodes ? |
| detail_ptr->max_nodes : NO_VAL;; |
| if (check_min_time && job_ptr->time_min) |
| job_desc.time_limit = job_ptr->time_min; |
| else |
| job_desc.time_limit = job_ptr->time_limit; |
| |
| /* For qos_ptr_list we are checking that now, so send in NULL */ |
| if ((rc = _part_access_check(part_ptr, &job_desc, NULL, |
| job_ptr->user_id, qos_ptr, |
| NULL, |
| job_ptr->account))) { |
| debug2("%pJ can't run in partition %s: %s", |
| job_ptr, part_ptr->name, slurm_strerror(rc)); |
| switch (rc) { |
| case ESLURM_INVALID_TIME_LIMIT: |
| case ESLURM_INVALID_TIME_MIN_LIMIT: |
| if (job_ptr->limit_set.time != ADMIN_SET_LIMIT) |
| fail_reason = WAIT_PART_TIME_LIMIT; |
| break; |
| case ESLURM_INVALID_NODE_COUNT: |
| fail_reason = WAIT_PART_NODE_LIMIT; |
| break; |
| /* FIXME */ |
| /* case ESLURM_TOO_MANY_REQUESTED_CPUS: */ |
| /* failt_reason = NON_EXISTANT_WAIT_PART_CPU_LIMIT; */ |
| /* break; */ |
| default: |
| fail_reason = WAIT_PART_CONFIG; |
| break; |
| } |
| } else if (part_ptr->state_up == PARTITION_DOWN) { |
| debug2("%pJ requested down partition %s", |
| job_ptr, part_ptr->name); |
| fail_reason = WAIT_PART_DOWN; |
| } else if (part_ptr->state_up == PARTITION_INACTIVE) { |
| debug2("%pJ requested inactive partition %s", |
| job_ptr, part_ptr->name); |
| fail_reason = WAIT_PART_INACTIVE; |
| } else if (qos_ptr && assoc_ptr && |
| (qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) && |
| (!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) { |
| if (!job_ptr->prio_factors) { |
| job_ptr->prio_factors = |
| xmalloc(sizeof(priority_factors_t)); |
| } |
| if (!job_ptr->prio_factors->priority_fs) { |
| if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL)) |
| priority_g_set_assoc_usage(assoc_ptr); |
| job_ptr->prio_factors->priority_fs = |
| priority_g_calc_fs_factor( |
| assoc_ptr->usage->usage_efctv, |
| (long double)assoc_ptr->usage-> |
| shares_norm); |
| } |
| if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres){ |
| debug2("%pJ exceeds usage threshold", job_ptr); |
| fail_reason = WAIT_QOS_THRES; |
| } |
| } else if (fail_reason == WAIT_NO_REASON) { |
| /* |
| * Here we need to pretend we are just submitting the job so we |
| * can utilize the already existing function _valid_pn_min_mem. |
| * If anything else is ever checked in that function this will |
| * most likely have to be updated. Some of the needed members |
| * were already initialized above to call _part_access_check, as |
| * well as the memset for job_desc. |
| */ |
| if (job_ptr->bit_flags & JOB_MEM_SET) |
| job_desc.pn_min_memory = detail_ptr->orig_pn_min_memory; |
| else { |
| /* |
| * Don't consider DefMemPerGPU here when coming up with |
| * a pn_min_memory, we don't know how many nodes the |
| * gpus may be split over yet so _get_def_mem may |
| * overestimate. |
| */ |
| job_desc.pn_min_memory = _get_def_mem(part_ptr, NULL); |
| } |
| if (detail_ptr->orig_cpus_per_task == NO_VAL16) |
| job_desc.cpus_per_task = 1; |
| else |
| job_desc.cpus_per_task = detail_ptr->orig_cpus_per_task; |
| /* |
| * Passing the value directly since detail_ptr->num_tasks |
| * already set correctly. If it is zero _valid_pn_min_mem() |
| * already handles it. |
| */ |
| job_desc.num_tasks = detail_ptr->num_tasks; |
| //job_desc.min_cpus = detail_ptr->min_cpus; /* init'ed above */ |
| job_desc.max_cpus = detail_ptr->orig_max_cpus; |
| job_desc.shared = (uint16_t)detail_ptr->share_res; |
| /* |
| * At this point detail_ptr->ntasks_per_node is expected to |
| * hold 0 (not set) or a regular value, but never NO_VAL16. |
| * _valid_pn_min_mem will check for job_desc.ntasks_per_node |
| * being different than NO_VAL16, which is its initial value. |
| */ |
| if (detail_ptr->ntasks_per_node) |
| job_desc.ntasks_per_node = detail_ptr->ntasks_per_node; |
| job_desc.ntasks_per_tres = detail_ptr->ntasks_per_tres; |
| job_desc.pn_min_cpus = detail_ptr->orig_pn_min_cpus; |
| job_desc.job_id = job_ptr->job_id; |
| job_desc.bitflags = job_ptr->bit_flags; |
| job_desc.tres_per_task = xstrdup(job_ptr->tres_per_task); |
| if (!_valid_pn_min_mem(&job_desc, part_ptr)) { |
| /* debug2 message already logged inside the function. */ |
| fail_reason = WAIT_PN_MEM_LIMIT; |
| } else { |
| /* Copy back to job_record adjusted members */ |
| detail_ptr->pn_min_memory = job_desc.pn_min_memory; |
| detail_ptr->cpus_per_task = job_desc.cpus_per_task; |
| detail_ptr->min_cpus = job_desc.min_cpus; |
| detail_ptr->max_cpus = job_desc.max_cpus; |
| detail_ptr->pn_min_cpus = job_desc.pn_min_cpus; |
| SWAP(job_ptr->tres_per_task, job_desc.tres_per_task); |
| } |
| |
| xfree(job_desc.tres_per_task); |
| } |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| |
| return (fail_reason); |
| } |
| |
| static void _set_tot_license_req(job_desc_msg_t *job_desc, |
| job_record_t *job_ptr) |
| { |
| char *lic_req = NULL, *lic_req_pos = NULL; |
| uint32_t num_tasks = job_desc->num_tasks; |
| char *tres_per_task = job_desc->tres_per_task; |
| |
| /* |
| * If !tres_per_task we check to see if num_tasks has changed. |
| * If it has then use the current tres. |
| */ |
| if (job_ptr && !tres_per_task && (job_desc->bitflags & TASKS_CHANGED)) { |
| tres_per_task = job_ptr->tres_per_task; |
| } |
| |
| /* |
| * Here we are seeing we we are setting something explicit. If we are |
| * set it. If we are changing tasks we need what was already on the job. |
| */ |
| if (job_desc->licenses && (job_desc->licenses[0] || |
| (job_desc->bitflags & RESET_LIC_JOB))) |
| xstrfmtcatat(lic_req, &lic_req_pos, "%s", job_desc->licenses); |
| else if (tres_per_task && |
| !(job_desc->bitflags & RESET_LIC_JOB) && |
| job_ptr && |
| job_ptr->lic_req) |
| xstrfmtcatat(lic_req, &lic_req_pos, "%s", job_ptr->lic_req); |
| |
| if (job_desc->bitflags & RESET_LIC_TASK) { |
| /* removed tres */ |
| if (!lic_req) |
| lic_req = xstrdup(""); |
| } else if (tres_per_task) { |
| char *lic_tmp = slurm_get_tres_sub_string( |
| tres_per_task, "license", num_tasks, false, false); |
| if (lic_tmp) { |
| if (lic_req) { |
| xstrfmtcatat(lic_req, &lic_req_pos, |
| ",%s", lic_tmp); |
| xfree(lic_tmp); |
| } else { |
| lic_req = lic_tmp; |
| lic_tmp = NULL; |
| } |
| } |
| } |
| |
| xfree(job_desc->licenses_tot); |
| job_desc->licenses_tot = lic_req; |
| lic_req = NULL; |
| } |
| |
| static void _enable_stepmgr(job_record_t *job_ptr, job_desc_msg_t *job_desc) |
| { |
| static bool first_time = true; |
| static bool stepmgr_enabled = false; |
| |
| if (first_time) { |
| first_time = false; |
| stepmgr_enabled = xstrstr(slurm_conf.slurmctld_params, |
| "enable_stepmgr"); |
| } |
| |
| if ((stepmgr_enabled || (job_desc->bitflags & STEPMGR_ENABLED)) && |
| (job_desc->het_job_offset == NO_VAL)) { |
| job_ptr->bit_flags |= STEPMGR_ENABLED; |
| } else { |
| job_ptr->bit_flags &= ~STEPMGR_ENABLED; |
| } |
| |
| if ((job_ptr->bit_flags & STEPMGR_ENABLED) && |
| !(slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN)) { |
| error("STEP_MGR not supported without PrologFlags=contain"); |
| job_ptr->bit_flags &= ~STEPMGR_ENABLED; |
| } |
| } |
| |
| /* |
| * _job_create - create a job table record for the supplied specifications. |
| * This performs only basic tests for request validity (access to |
| * partition, nodes count in partition, and sufficient processors in |
| * partition). |
| * IN job_desc - job specifications |
| * IN allocate - resource allocation request if set rather than job submit |
| * IN will_run - job is not to be created, test of validity only |
| * OUT job_pptr - pointer to the job (NULL on error) |
| * OUT err_msg - Error message for user |
| * RET 0 on success, otherwise ESLURM error code. If the job would only be |
| * able to execute with some change in partition configuration then |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned |
| */ |
| |
| static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run, |
| bool cron, job_record_t **job_pptr, uid_t submit_uid, |
| char **err_msg, uint16_t protocol_version) |
| { |
| int error_code = SLURM_SUCCESS; |
| part_record_t *part_ptr = NULL; |
| list_t *part_ptr_list = NULL, *qos_ptr_list = NULL; |
| bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL; |
| job_record_t *job_ptr = NULL; |
| slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL; |
| list_t *license_list = NULL, *gres_list = NULL; |
| bool valid; |
| slurmdb_qos_rec_t *qos_ptr; |
| uint32_t user_submit_priority, acct_reason = 0; |
| uint32_t qos_id = 0; |
| acct_policy_limit_set_t acct_policy_limit_set; |
| assoc_mgr_lock_t assoc_mgr_read_lock = { |
| .assoc = READ_LOCK, |
| .qos = READ_LOCK, |
| .user = READ_LOCK, |
| }; |
| gres_job_state_validate_t gres_js_val = { |
| .cpus_per_task = &job_desc->cpus_per_task, |
| .max_nodes = &job_desc->max_nodes, |
| .min_cpus = &job_desc->min_cpus, |
| .min_nodes = &job_desc->min_nodes, |
| .ntasks_per_node = &job_desc->ntasks_per_node, |
| .ntasks_per_socket = &job_desc->ntasks_per_socket, |
| .ntasks_per_tres = &job_desc->ntasks_per_tres, |
| .num_tasks = &job_desc->num_tasks, |
| .sockets_per_node = &job_desc->sockets_per_node, |
| |
| .gres_list = &gres_list, |
| }; |
| |
| memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set)); |
| acct_policy_limit_set.tres = xcalloc(slurmctld_tres_cnt, |
| sizeof(uint16_t)); |
| |
| *job_pptr = NULL; |
| |
| user_submit_priority = job_desc->priority; |
| |
| /* ensure that selected nodes are in this partition */ |
| if (job_desc->req_nodes) { |
| error_code = node_name2bitmap(job_desc->req_nodes, false, |
| &req_bitmap, NULL); |
| if (error_code) { |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto cleanup_fail; |
| } |
| if ((job_desc->contiguous != NO_VAL16) && |
| (job_desc->contiguous)) |
| bit_fill_gaps(req_bitmap); |
| if (bit_set_count(req_bitmap) > job_desc->min_nodes) { |
| /* |
| * If a nodelist has been provided with more nodes than |
| * are required for the job, translate this into an |
| * exclusion of all nodes except those requested. |
| */ |
| exc_bitmap = bit_alloc(node_record_count); |
| bit_or_not(exc_bitmap, req_bitmap); |
| FREE_NULL_BITMAP(req_bitmap); |
| } |
| } |
| |
| /* Zero node count OK for persistent burst buffer create or destroy */ |
| if ((job_desc->max_nodes == 0) && |
| (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) || |
| (!job_desc->burst_buffer && !job_desc->script))) { |
| info("%s: max_nodes is zero", __func__); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto cleanup_fail; |
| } |
| |
| error_code = _get_job_parts(job_desc, &part_ptr, &part_ptr_list, |
| err_msg); |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| |
| memset(&assoc_rec, 0, sizeof(assoc_rec)); |
| assoc_rec.acct = job_desc->account; |
| assoc_rec.partition = part_ptr->name; |
| assoc_rec.uid = job_desc->user_id; |
| /* |
| * Checks are done later to validate assoc_ptr, so we don't |
| * need to lock outside of fill_in_assoc. |
| */ |
| assoc_mgr_lock(&assoc_mgr_read_lock); |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, &assoc_ptr, true)) { |
| info("%s: invalid account or partition for user %u, " |
| "account '%s', and partition '%s'", __func__, |
| job_desc->user_id, assoc_rec.acct, assoc_rec.partition); |
| error_code = ESLURM_INVALID_ACCOUNT; |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| goto cleanup_fail; |
| } else if (slurm_with_slurmdbd() && |
| !assoc_ptr && |
| !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) { |
| /* |
| * If not enforcing associations we want to look for the |
| * default account and use it to avoid getting trash in the |
| * accounting records. |
| */ |
| assoc_rec.acct = NULL; |
| (void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, &assoc_ptr, |
| true); |
| if (assoc_ptr) { |
| info("%s: account '%s' has no association for user %u " |
| "using default account '%s'", |
| __func__, job_desc->account, job_desc->user_id, |
| assoc_rec.acct); |
| xfree(job_desc->account); |
| } |
| } |
| |
| if ((error_code = _check_for_part_assocs( |
| part_ptr_list, assoc_ptr)) != SLURM_SUCCESS) { |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| goto cleanup_fail; |
| } |
| |
| if (job_desc->account == NULL) |
| job_desc->account = xstrdup(assoc_rec.acct); |
| |
| /* This must be done after we have the assoc_ptr set */ |
| error_code = _get_qos_info(job_desc->qos, 0, |
| &qos_ptr_list, |
| &qos_ptr, |
| job_desc->reservation, |
| assoc_ptr, |
| false, true, LOG_LEVEL_ERROR); |
| if (error_code != SLURM_SUCCESS) { |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| goto cleanup_fail; |
| } |
| |
| error_code = _valid_job_part(job_desc, submit_uid, req_bitmap, |
| part_ptr, part_ptr_list, |
| assoc_ptr, qos_ptr, qos_ptr_list); |
| if (qos_ptr) |
| qos_id = qos_ptr->id; |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| |
| if ((error_code = _validate_job_desc(job_desc, allocate, cron, |
| submit_uid, part_ptr, |
| part_ptr_list))) { |
| goto cleanup_fail; |
| } |
| |
| job_desc->tres_req_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t)); |
| |
| _set_tot_license_req(job_desc, NULL); |
| |
| license_list = |
| license_validate(job_desc->licenses_tot, validate_cfgd_licenses, |
| true, false, job_desc->tres_req_cnt, &valid); |
| |
| if (!valid) { |
| info("Job's requested licenses are invalid: %s", |
| job_desc->licenses_tot); |
| error_code = ESLURM_INVALID_LICENSES; |
| goto cleanup_fail; |
| } |
| |
| if ((job_desc->bitflags & GRES_ONE_TASK_PER_SHARING) && |
| (!(slurm_conf.select_type_param & |
| SELECT_MULTIPLE_SHARING_GRES_PJ))) { |
| info("%s: one-task-per-sharing requires MULTIPLE_SHARING_GRES_PJ", |
| __func__); |
| error_code = ESLURM_INVALID_GRES; |
| goto cleanup_fail; |
| } |
| |
| gres_js_val.cpus_per_tres = job_desc->cpus_per_tres; |
| gres_js_val.mem_per_tres = job_desc->mem_per_tres; |
| gres_js_val.tres_freq = job_desc->tres_freq; |
| gres_js_val.tres_per_job = job_desc->tres_per_job; |
| gres_js_val.tres_per_node = job_desc->tres_per_node; |
| gres_js_val.tres_per_socket = job_desc->tres_per_socket; |
| gres_js_val.tres_per_task = job_desc->tres_per_task; |
| if ((error_code = gres_job_state_validate(&gres_js_val))) |
| goto cleanup_fail; |
| |
| if (!assoc_mgr_valid_tres_cnt(job_desc->cpus_per_tres, 0) || |
| !assoc_mgr_valid_tres_cnt(job_desc->mem_per_tres, 0) || |
| tres_bind_verify_cmdline(job_desc->tres_bind) || |
| tres_freq_verify_cmdline(job_desc->tres_freq) || |
| !assoc_mgr_valid_tres_cnt(job_desc->mem_per_tres, 0) || |
| !assoc_mgr_valid_tres_cnt(job_desc->tres_per_job, 0) || |
| !assoc_mgr_valid_tres_cnt(job_desc->tres_per_node, 0) || |
| !assoc_mgr_valid_tres_cnt(job_desc->tres_per_socket, 0) || |
| !assoc_mgr_valid_tres_cnt(job_desc->tres_per_task, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto cleanup_fail; |
| } |
| |
| gres_stepmgr_set_job_tres_cnt( |
| gres_list, |
| job_desc->min_nodes, |
| job_desc->tres_req_cnt, |
| false); |
| |
| /* gres_job_state_validate() can update min_nodes and min_cpus. */ |
| job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes; |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus; |
| |
| /* Get GRES before mem so we can pass gres_list to job_get_tres_mem() */ |
| job_desc->tres_req_cnt[TRES_ARRAY_MEM] = |
| job_get_tres_mem(NULL, |
| job_desc->pn_min_memory, |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU], |
| job_desc->min_nodes, part_ptr, |
| gres_list, |
| job_desc->bitflags & JOB_MEM_SET, |
| job_desc->sockets_per_node, |
| job_desc->num_tasks); |
| |
| /* |
| * Do this last,after other TRES' have been set as it uses the other |
| * values to calculate the billing value. |
| */ |
| job_desc->tres_req_cnt[TRES_ARRAY_BILLING] = |
| assoc_mgr_tres_weighted(job_desc->tres_req_cnt, |
| part_ptr->billing_weights, |
| slurm_conf.priority_flags, false); |
| |
| if ((error_code = bb_g_job_validate(job_desc, submit_uid, err_msg)) |
| != SLURM_SUCCESS) |
| goto cleanup_fail; |
| |
| if (job_desc->deadline && (job_desc->time_limit == NO_VAL) && |
| (job_desc->time_min == NO_VAL)) |
| job_desc->time_min = 1; |
| if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) && |
| (!acct_policy_validate(job_desc, part_ptr, part_ptr_list, |
| assoc_ptr, qos_ptr, &acct_reason, |
| &acct_policy_limit_set, 0))) { |
| if (err_msg) { |
| xfree(*err_msg); |
| *err_msg = |
| xstrdup(job_state_reason_string(acct_reason)); |
| } |
| info("%s: exceeded association/QOS limit for user %u: %s", |
| __func__, job_desc->user_id, |
| err_msg ? *err_msg : job_state_reason_string(acct_reason)); |
| error_code = ESLURM_ACCOUNTING_POLICY; |
| goto cleanup_fail; |
| } |
| |
| if (job_desc->exc_nodes) { |
| bitstr_t *old_exc_bitmap = exc_bitmap; |
| |
| error_code = node_name2bitmap(job_desc->exc_nodes, false, |
| &exc_bitmap, NULL); |
| if (error_code) { |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto cleanup_fail; |
| } |
| |
| if (old_exc_bitmap) |
| bit_or(exc_bitmap, old_exc_bitmap); |
| FREE_NULL_BITMAP(old_exc_bitmap); |
| } |
| if (exc_bitmap && req_bitmap) { |
| bitstr_t *tmp_bitmap = NULL; |
| bitoff_t first_set; |
| tmp_bitmap = bit_copy(exc_bitmap); |
| bit_and(tmp_bitmap, req_bitmap); |
| first_set = bit_ffs(tmp_bitmap); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| if (first_set != -1) { |
| info("Job's required and excluded node lists overlap"); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto cleanup_fail; |
| } |
| } |
| |
| if (job_desc->min_nodes == NO_VAL) |
| job_desc->min_nodes = 1; |
| |
| if (job_desc->max_nodes == NO_VAL) |
| job_desc->max_nodes = 0; |
| |
| if (job_desc->max_nodes && |
| (job_desc->max_nodes < job_desc->min_nodes)) { |
| info("%s: Job's max_nodes(%u) < min_nodes(%u)", |
| __func__, job_desc->max_nodes, job_desc->min_nodes); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto cleanup_fail; |
| } |
| |
| if ((error_code = _copy_job_desc_to_job_record(job_desc, |
| job_pptr, |
| &req_bitmap, |
| &exc_bitmap))) { |
| if (error_code == SLURM_ERROR) |
| error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY; |
| job_ptr = *job_pptr; |
| goto cleanup_fail; |
| } |
| |
| job_ptr = *job_pptr; |
| job_ptr->start_protocol_ver = protocol_version; |
| job_ptr->part_ptr = part_ptr; |
| job_ptr->part_ptr_list = part_ptr_list; |
| job_ptr->qos_list = qos_ptr_list; |
| job_ptr->bit_flags |= JOB_DEPENDENT; |
| job_ptr->last_sched_eval = time(NULL); |
| |
| part_ptr_list = NULL; |
| qos_ptr_list = NULL; |
| |
| memcpy(&job_ptr->limit_set, &acct_policy_limit_set, |
| sizeof(acct_policy_limit_set_t)); |
| acct_policy_limit_set.tres = NULL; |
| |
| job_ptr->assoc_id = assoc_rec.id; |
| job_ptr->assoc_ptr = (void *) assoc_ptr; |
| job_ptr->qos_ptr = (void *) qos_ptr; |
| job_ptr->qos_id = qos_id; |
| |
| if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label) != 0 ) { |
| if (job_desc->mcs_label == NULL) { |
| error("Failed to create job: No valid mcs_label found"); |
| } else { |
| error("Failed to create job: Invalid mcs-label: %s", |
| job_desc->mcs_label); |
| } |
| error_code = ESLURM_INVALID_MCS_LABEL; |
| goto cleanup_fail; |
| } |
| |
| /* |
| * Permission for altering priority was confirmed above. The job_submit |
| * plugin may have set the priority directly or put the job on hold. If |
| * the priority is not given, we will figure it out later after we see |
| * if the job is eligible or not. So we want NO_VAL if not set. |
| */ |
| job_ptr->priority = job_desc->priority; |
| if (job_ptr->priority == 0) { |
| if (user_submit_priority == 0) |
| job_ptr->state_reason = WAIT_HELD_USER; |
| else |
| job_ptr->state_reason = WAIT_HELD; |
| } else if ((job_ptr->priority != NO_VAL) && |
| (job_ptr->priority != INFINITE)) { |
| job_ptr->direct_set_prio = 1; |
| } else if ((job_ptr->priority == INFINITE) && |
| (user_submit_priority == INFINITE)) { |
| /* This happens when "hold": false is specified to slurmrestd */ |
| job_ptr->priority = NO_VAL; |
| } |
| |
| /* |
| * The job submit plugin sets site_factor to NO_VAL so that it can |
| * only be set the by the job submit plugin at submission. |
| */ |
| if (job_desc->site_factor != NO_VAL) |
| job_ptr->site_factor = job_desc->site_factor; |
| |
| error_code = update_job_dependency(job_ptr, job_desc->dependency); |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| job_ptr->details->orig_dependency = xstrdup(job_ptr->details-> |
| dependency); |
| |
| if ((error_code = build_feature_list(job_ptr, false, false))) |
| goto cleanup_fail; |
| |
| if ((error_code = build_feature_list(job_ptr, true, false))) |
| goto cleanup_fail; |
| |
| error_code = extra_constraints_parse(job_ptr->extra, |
| &job_ptr->extra_constraints); |
| if (error_code != SLURM_SUCCESS) |
| goto cleanup_fail; |
| |
| /* |
| * NOTE: If this job is being used to expand another job, this job's |
| * gres_list has already been filled in with a copy of gres_list job |
| * to be expanded by update_job_dependency() |
| */ |
| if (!job_ptr->details->expanding_jobid) { |
| job_ptr->gres_list_req = gres_list; |
| gres_list = NULL; |
| } |
| |
| job_ptr->gres_detail_cnt = 0; |
| job_ptr->gres_detail_str = NULL; |
| gres_job_state_log(job_ptr->gres_list_req, job_ptr->job_id); |
| |
| if ((error_code = validate_job_resv(job_ptr))) |
| goto cleanup_fail; |
| |
| if (job_desc->script |
| && (!will_run)) { /* don't bother with copy if just a test */ |
| char *tmp; |
| if ((error_code = _copy_job_desc_to_file(job_desc, |
| job_ptr->job_id))) { |
| error_code = ESLURM_WRITING_TO_FILE; |
| goto cleanup_fail; |
| } |
| job_ptr->batch_flag = 1; |
| |
| if (slurm_conf.conf_flags & CONF_FLAG_SJE) { |
| tmp = xstring_bytes2hex(job_desc->env_hash.hash, |
| sizeof(job_desc->env_hash.hash), |
| NULL); |
| job_ptr->details->env_hash = |
| xstrdup_printf("%d:%s", |
| job_desc->env_hash.type, |
| tmp); |
| xfree(tmp); |
| } |
| |
| if (slurm_conf.conf_flags & CONF_FLAG_SJS) { |
| tmp = xstring_bytes2hex( |
| job_desc->script_hash.hash, |
| sizeof(job_desc->script_hash.hash), NULL); |
| |
| job_ptr->details->script_hash = |
| xstrdup_printf("%d:%s", |
| job_desc->script_hash.type, |
| tmp); |
| xfree(tmp); |
| } |
| } else |
| job_ptr->batch_flag = 0; |
| if (!will_run && |
| (error_code = bb_g_job_validate2(job_ptr, err_msg))) |
| goto cleanup_fail; |
| |
| job_ptr->license_list = license_list; |
| license_list = NULL; |
| |
| if (job_desc->req_switch != NO_VAL) { /* Max # of switches */ |
| job_ptr->req_switch = job_desc->req_switch; |
| if (job_desc->wait4switch != NO_VAL) { |
| job_ptr->wait4switch = |
| _max_switch_wait(job_desc->wait4switch); |
| } else |
| job_ptr->wait4switch = _max_switch_wait(INFINITE); |
| } |
| job_ptr->best_switch = true; |
| |
| _enable_stepmgr(job_ptr, job_desc); |
| |
| FREE_NULL_LIST(license_list); |
| FREE_NULL_LIST(gres_list); |
| FREE_NULL_BITMAP(req_bitmap); |
| FREE_NULL_BITMAP(exc_bitmap); |
| return error_code; |
| |
| cleanup_fail: |
| if (job_ptr) { |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = time(NULL); |
| purge_job_record(job_ptr->job_id); |
| *job_pptr = NULL; |
| } |
| FREE_NULL_LIST(license_list); |
| xfree(acct_policy_limit_set.tres); |
| FREE_NULL_LIST(gres_list); |
| FREE_NULL_LIST(part_ptr_list); |
| FREE_NULL_LIST(qos_ptr_list); |
| FREE_NULL_BITMAP(req_bitmap); |
| FREE_NULL_BITMAP(exc_bitmap); |
| return error_code; |
| } |
| |
| static int _test_strlen(char *test_str, char *str_name, int max_str_len) |
| { |
| int i = 0; |
| |
| if (test_str) |
| i = strlen(test_str); |
| if (i > max_str_len) { |
| info("job_create_request: strlen(%s) too big (%d > %d)", |
| str_name, i, max_str_len); |
| return ESLURM_PATHNAME_TOO_LONG; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| /* Translate a job array expression into the equivalent bitmap */ |
| static bool _valid_array_inx(job_desc_msg_t *job_desc) |
| { |
| static time_t sched_update = 0; |
| static uint32_t max_task_cnt = NO_VAL; |
| uint32_t task_cnt; |
| bool valid = true; |
| char *tmp, *tok, *last = NULL; |
| |
| FREE_NULL_BITMAP(job_desc->array_bitmap); |
| if (!job_desc->array_inx || !job_desc->array_inx[0]) |
| return true; |
| if (!job_desc->script || !job_desc->script[0]) |
| return false; |
| |
| if (max_array_size == NO_VAL) { |
| max_array_size = slurm_conf.max_array_sz; |
| } |
| if (max_array_size == 0) { |
| verbose("Job arrays disabled, MaxArraySize=0"); |
| return false; |
| } |
| |
| if (sched_update != slurm_conf.last_update) { |
| char *key; |
| max_task_cnt = max_array_size; |
| sched_update = slurm_conf.last_update; |
| if ((key = xstrcasestr(slurm_conf.sched_params, |
| "max_array_tasks="))) { |
| key += 16; |
| max_task_cnt = atoi(key); |
| } |
| } |
| |
| /* We have a job array request */ |
| job_desc->immediate = 0; /* Disable immediate option */ |
| job_desc->array_bitmap = bit_alloc(max_array_size); |
| |
| tmp = xstrdup(job_desc->array_inx); |
| tok = strtok_r(tmp, ",", &last); |
| while (tok && valid) { |
| valid = slurm_parse_array_tok(tok, job_desc->array_bitmap, |
| max_array_size); |
| tok = strtok_r(NULL, ",", &last); |
| } |
| xfree(tmp); |
| |
| if (valid && (max_task_cnt < max_array_size)) { |
| task_cnt = bit_set_count(job_desc->array_bitmap); |
| if (task_cnt > max_task_cnt) { |
| debug("max_array_tasks exceeded (%u > %u)", |
| task_cnt, max_task_cnt); |
| valid = false; |
| } |
| } |
| |
| return valid; |
| } |
| |
| /* Make sure a job descriptor's strings are not huge, which could result in |
| * a denial of service attack due to memory demands by the slurmctld */ |
| static int _test_job_desc_fields(job_desc_msg_t * job_desc) |
| { |
| static time_t sched_update = 0; |
| static int max_script = DEFAULT_BATCH_SCRIPT_LIMIT; |
| static int max_submit_line = DEFAULT_MAX_SUBMIT_LINE_SIZE; |
| |
| if (sched_update != slurm_conf.last_update) { |
| char *tmp_ptr; |
| sched_update = slurm_conf.last_update; |
| |
| if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, |
| "max_script_size="))) { |
| max_script = atoi(tmp_ptr + 16); |
| } else { |
| max_script = DEFAULT_BATCH_SCRIPT_LIMIT; |
| } |
| |
| if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params, |
| "max_submit_line_size="))) { |
| max_submit_line = atoi(tmp_ptr + 21); |
| } else { |
| max_submit_line = DEFAULT_MAX_SUBMIT_LINE_SIZE; |
| } |
| } |
| |
| if (_test_strlen(job_desc->account, "account", 1024) || |
| _test_strlen(job_desc->alloc_node, "alloc_node", 1024) || |
| _test_strlen(job_desc->array_inx, "array_inx", 1024 * 4) || |
| _test_strlen(job_desc->burst_buffer, "burst_buffer",1024*8) || |
| _test_strlen(job_desc->comment, "comment", 1024) || |
| _test_strlen(job_desc->cpu_bind, "cpu-bind", 1024 * 128) || |
| _test_strlen(job_desc->cpus_per_tres, "cpus_per_tres", 1024)|| |
| _test_strlen(job_desc->dependency, "dependency", 1024*128) || |
| _test_strlen(job_desc->extra, "extra", 1024) || |
| _test_strlen(job_desc->features, "features", 1024) || |
| _test_strlen( |
| job_desc->cluster_features, "cluster_features", 1024) || |
| _test_strlen(job_desc->licenses_tot, "licenses", 1024) || |
| _test_strlen(job_desc->mail_user, "mail_user", 1024) || |
| _test_strlen(job_desc->mcs_label, "mcs_label", 1024) || |
| _test_strlen(job_desc->mem_bind, "mem-bind", 1024 * 128) || |
| _test_strlen(job_desc->mem_per_tres, "mem_per_tres", 1024) || |
| _test_strlen(job_desc->name, "name", 1024) || |
| _test_strlen(job_desc->network, "network", 1024) || |
| _test_strlen(job_desc->partition, "partition", 1024) || |
| _test_strlen(job_desc->prefer, "prefer", 1024) || |
| _test_strlen(job_desc->qos, "qos", 1024) || |
| _test_strlen(job_desc->reservation, "reservation", 1024) || |
| _test_strlen(job_desc->script, "script", max_script) || |
| _test_strlen(job_desc->std_err, "std_err", PATH_MAX) || |
| _test_strlen(job_desc->std_in, "std_in", PATH_MAX) || |
| _test_strlen(job_desc->std_out, "std_out", PATH_MAX) || |
| _test_strlen(job_desc->submit_line, "submit_line", |
| max_submit_line) || |
| _test_strlen(job_desc->tres_bind, "tres_bind", 1024) || |
| _test_strlen(job_desc->tres_freq, "tres_freq", 1024) || |
| _test_strlen(job_desc->tres_per_job, "tres_per_job", 1024) || |
| _test_strlen(job_desc->tres_per_node, "tres_per_node", 1024)|| |
| _test_strlen(job_desc->tres_per_socket, "tres_per_socket", 1024) || |
| _test_strlen(job_desc->tres_per_task, "tres_per_task", 1024)|| |
| _test_strlen(job_desc->wckey, "wckey", 1024) || |
| _test_strlen(job_desc->work_dir, "work_dir", PATH_MAX)) |
| return ESLURM_PATHNAME_TOO_LONG; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _figure_out_num_tasks( |
| job_desc_msg_t *job_desc, job_record_t *job_ptr) |
| { |
| uint32_t num_tasks = job_desc->num_tasks; |
| uint32_t min_nodes = job_desc->min_nodes; |
| uint32_t max_nodes = job_desc->max_nodes; |
| uint16_t ntasks_per_node = job_desc->ntasks_per_node; |
| uint16_t ntasks_per_tres = job_desc->ntasks_per_tres; |
| |
| /* |
| * Don't figure out num tasks / bitflags if updating the job and none |
| * of the relevant influencing fields in job_desc are set. |
| */ |
| if (job_ptr && |
| (job_desc->num_tasks == NO_VAL && job_desc->min_nodes == NO_VAL && |
| job_desc->ntasks_per_node == NO_VAL16 && |
| job_desc->ntasks_per_tres == NO_VAL16)) |
| return; |
| |
| if (num_tasks != NO_VAL) { |
| job_desc->bitflags |= JOB_NTASKS_SET; |
| } |
| |
| if (job_ptr) { |
| if (min_nodes == NO_VAL) |
| min_nodes = job_ptr->details->min_nodes; |
| if (max_nodes == NO_VAL) |
| max_nodes = job_ptr->details->max_nodes; |
| if (max_nodes == 0) |
| max_nodes = min_nodes; |
| |
| if ((ntasks_per_node == NO_VAL16) && |
| job_ptr->details->ntasks_per_node) |
| ntasks_per_node = job_ptr->details->ntasks_per_node; |
| else if ((ntasks_per_tres == NO_VAL16) && |
| job_ptr->details->ntasks_per_tres) |
| ntasks_per_tres = job_ptr->details->ntasks_per_tres; |
| |
| } else if (job_desc->min_nodes == NO_VAL) { |
| min_nodes = job_desc->min_nodes = 1; |
| } |
| |
| /* If we are creating the job we want the tasks to be set every time. */ |
| if ((num_tasks == NO_VAL) && |
| (min_nodes != NO_VAL) && |
| (!job_ptr || (job_ptr && (min_nodes == max_nodes)))) { |
| /* Implicitly set task count */ |
| if (ntasks_per_tres != NO_VAL16) |
| num_tasks = min_nodes * ntasks_per_tres; |
| else if (ntasks_per_node != NO_VAL16) |
| num_tasks = min_nodes * ntasks_per_node; |
| } |
| |
| if (job_ptr) { |
| if ((num_tasks != NO_VAL) && |
| (num_tasks != job_ptr->details->num_tasks)) { |
| job_desc->num_tasks = num_tasks; |
| job_desc->bitflags |= TASKS_CHANGED; |
| } |
| } else if (num_tasks != job_desc->num_tasks) { |
| job_desc->num_tasks = num_tasks; |
| job_desc->bitflags |= TASKS_CHANGED; |
| } |
| } |
| |
| /* Perform some size checks on strings we store to prevent |
| * malicious user filling slurmctld's memory |
| * IN job_desc - user job submit request |
| * IN submit_uid - UID making job submit request |
| * OUT err_msg - custom error message to return |
| * RET 0 or error code */ |
| extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid, |
| char **err_msg) |
| { |
| job_record_t *job_ptr = NULL; |
| int rc; |
| |
| /* |
| * Check user permission for negative 'nice' and non-0 priority values |
| * (restricted to root, SlurmUser, or SLURMDB_ADMIN_OPERATOR) _before_ |
| * running the job_submit plugin. |
| */ |
| if (!validate_operator(submit_uid)) { |
| if (job_desc->priority != 0) |
| job_desc->priority = NO_VAL; |
| if (job_desc->nice < NICE_OFFSET) |
| return ESLURM_INVALID_NICE; |
| } |
| |
| if (!validate_super_user(submit_uid)) { |
| /* AdminComment can only be set by an Admin. */ |
| if (job_desc->admin_comment) |
| return ESLURM_ACCESS_DENIED; |
| |
| if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) { |
| *err_msg = xstrdup("rebooting of nodes is only allowed for admins"); |
| return ESLURM_ACCESS_DENIED; |
| } |
| } |
| |
| rc = job_submit_g_submit(job_desc, submit_uid, err_msg); |
| if (rc != SLURM_SUCCESS) |
| return rc; |
| |
| /* Reject jobs requesting arbitrary distribution without a task count */ |
| if (((job_desc->task_dist & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_ARBITRARY) && (job_desc->num_tasks == NO_VAL)) { |
| *err_msg = xstrdup("task count required for arbitrary distribution"); |
| return ESLURM_BAD_TASK_COUNT; |
| } |
| |
| /* Add a temporary job_ptr for node_features_g_job_valid */ |
| job_ptr = xmalloc(sizeof(job_record_t)); |
| job_ptr->details = xmalloc(sizeof(job_details_t)); |
| /* Point, don't dup, so don't free */ |
| job_ptr->details->features = job_desc->features; |
| job_ptr->details->prefer = job_desc->prefer; |
| /* job_ptr->job_id = 0; */ |
| job_ptr->user_id = job_desc->user_id; |
| if ((rc = build_feature_list(job_ptr, false, false)) != SLURM_SUCCESS) |
| goto fini; |
| rc = node_features_g_job_valid(job_desc->features, |
| job_ptr->details->feature_list); |
| if (rc != SLURM_SUCCESS) |
| goto fini; |
| |
| if (build_feature_list(job_ptr, true, false) != SLURM_SUCCESS) { |
| rc = ESLURM_INVALID_PREFER; |
| goto fini; |
| } |
| rc = node_features_g_job_valid(job_desc->prefer, |
| job_ptr->details->prefer_list); |
| if (rc == ESLURM_INVALID_FEATURE) |
| rc = ESLURM_INVALID_PREFER; |
| if (rc != SLURM_SUCCESS) { |
| goto fini; |
| } |
| |
| rc = _test_job_desc_fields(job_desc); |
| if (rc != SLURM_SUCCESS) |
| goto fini; |
| |
| if (!_valid_array_inx(job_desc)) { |
| rc = ESLURM_INVALID_ARRAY; |
| goto fini; |
| } |
| |
| if (job_desc->x11 && !(slurm_conf.prolog_flags & PROLOG_FLAG_X11)) { |
| rc = ESLURM_X11_NOT_AVAIL; |
| goto fini; |
| } |
| |
| /* Make sure anything that may be put in the database will be |
| * lower case */ |
| xstrtolower(job_desc->account); |
| xstrtolower(job_desc->wckey); |
| |
| if (job_desc->wckey && (job_desc->wckey[0] == '*')) { |
| rc = ESLURM_INVALID_WCKEY; |
| goto fini; |
| } |
| |
| /* Basic validation of some parameters */ |
| if (job_desc->req_nodes && (job_desc->min_nodes == NO_VAL)) { |
| bitstr_t *node_bitmap = NULL; |
| if (node_name2bitmap(job_desc->req_nodes, false, |
| &node_bitmap, NULL)) { |
| /* likely a badly formatted hostlist */ |
| error("validate_job_create_req: bad hostlist"); |
| rc = ESLURM_INVALID_NODE_NAME; |
| goto fini; |
| } |
| job_desc->min_nodes = bit_set_count(node_bitmap); |
| FREE_NULL_BITMAP(node_bitmap); |
| } |
| |
| _figure_out_num_tasks(job_desc, NULL); |
| |
| /* Only set min and max cpus if overcommit isn't set */ |
| if ((job_desc->overcommit == NO_VAL8) && |
| ((job_desc->min_cpus == NO_VAL) || |
| ((job_desc->min_cpus != NO_VAL) && |
| (job_desc->num_tasks != NO_VAL) && |
| (job_desc->num_tasks > job_desc->min_cpus)))) { |
| if (job_desc->num_tasks != NO_VAL) |
| job_desc->min_cpus = job_desc->num_tasks; |
| else if (job_desc->min_nodes != NO_VAL) |
| job_desc->min_cpus = job_desc->min_nodes; |
| else |
| job_desc->min_cpus = 1; |
| |
| if (job_desc->cpus_per_task != NO_VAL16) |
| job_desc->min_cpus *= job_desc->cpus_per_task; |
| /* This is just a sanity check as we wouldn't ever have a |
| * max_cpus if we didn't have a min_cpus. |
| */ |
| if ((job_desc->max_cpus != NO_VAL) && |
| (job_desc->max_cpus < job_desc->min_cpus)) |
| job_desc->max_cpus = job_desc->min_cpus; |
| } |
| |
| if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) |
| job_desc->shared = 0; |
| |
| fini: |
| on_job_state_change(job_ptr, NO_VAL); |
| FREE_NULL_LIST(job_ptr->details->feature_list); |
| FREE_NULL_LIST(job_ptr->details->prefer_list); |
| xfree(job_ptr->details); |
| xfree(job_ptr); |
| |
| return rc; |
| } |
| |
| /* _copy_job_desc_to_file - copy the job script and environment from the RPC |
| * structure into a file */ |
| static int |
| _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) |
| { |
| int error_code = 0, hash; |
| char *dir_name, *file_name; |
| DEF_TIMERS; |
| |
| START_TIMER; |
| |
| if (!job_desc->container && |
| (!job_desc->environment || job_desc->env_size == 0)) { |
| error("%s: batch job cannot run without an environment", |
| __func__); |
| return ESLURM_ENVIRONMENT_MISSING; |
| } |
| |
| /* Create directory based upon job ID due to limitations on the number |
| * of files possible in a directory on some file system types (e.g. |
| * up to 64k files on a FAT32 file system). */ |
| hash = job_id % 10; |
| dir_name = xstrdup_printf("%s/hash.%d", |
| slurm_conf.state_save_location, hash); |
| (void) mkdir(dir_name, 0700); |
| |
| /* Create job_id specific directory */ |
| xstrfmtcat(dir_name, "/job.%u", job_id); |
| if (mkdir(dir_name, 0700)) { |
| if (!slurmctld_primary && (errno == EEXIST)) { |
| error("Apparent duplicate JobId=%u. Two primary slurmctld daemons might currently be active", |
| job_id); |
| } |
| error("mkdir(%s) error %m", dir_name); |
| xfree(dir_name); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| /* Create environment file, and write data to it */ |
| file_name = xstrdup_printf("%s/environment", dir_name); |
| error_code = _write_data_array_to_file(file_name, |
| job_desc->environment, |
| job_desc->env_size); |
| xfree(file_name); |
| |
| if (error_code == 0) { |
| /* Create script file */ |
| file_name = xstrdup_printf("%s/script", dir_name); |
| error_code = write_data_to_file(file_name, job_desc->script); |
| xfree(file_name); |
| } |
| |
| xfree(dir_name); |
| END_TIMER2(__func__); |
| return error_code; |
| } |
| |
| /* Return true of the specified job ID already has a batch directory so |
| * that a different job ID can be created. This is to help limit damage from |
| * split-brain, where two slurmctld daemons are running as primary. */ |
| static bool _dup_job_file_test(uint32_t job_id) |
| { |
| char *dir_name_src; |
| struct stat buf; |
| int rc, hash = job_id % 10; |
| |
| dir_name_src = xstrdup_printf("%s/hash.%d/job.%u", |
| slurm_conf.state_save_location, |
| hash, job_id); |
| rc = stat(dir_name_src, &buf); |
| xfree(dir_name_src); |
| if (rc == 0) { |
| error("Vestigial state files for JobId=%u, but no job record. This may be the result of two slurmctld running in primary mode", |
| job_id); |
| return true; |
| } |
| errno = 0; /* don't care about errno */ |
| return false; |
| } |
| |
| /* |
| * Create file with specified name and write the supplied data array to it |
| * IN file_name - file to create and write to |
| * IN data - array of pointers to strings (e.g. env) |
| * IN size - number of elements in data |
| */ |
| static int |
| _write_data_array_to_file(char *file_name, char **data, uint32_t size) |
| { |
| int fd, i, pos, nwrite, amount; |
| |
| fd = creat(file_name, 0600); |
| if (fd < 0) { |
| error("Error creating file %s, %m", file_name); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| amount = write(fd, &size, sizeof(uint32_t)); |
| if (amount < sizeof(uint32_t)) { |
| error("Error writing file %s, %m", file_name); |
| close(fd); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| if (data == NULL) { |
| close(fd); |
| return SLURM_SUCCESS; |
| } |
| |
| for (i = 0; i < size; i++) { |
| nwrite = strlen(data[i]) + 1; |
| pos = 0; |
| while (nwrite > 0) { |
| amount = write(fd, &data[i][pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", |
| file_name); |
| close(fd); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| nwrite -= amount; |
| pos += amount; |
| } |
| } |
| |
| close(fd); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Create file with specified name and write the supplied data array to it |
| * IN file_name - file to create and write to |
| * IN data - pointer to string |
| */ |
| extern int write_data_to_file(char *file_name, char *data) |
| { |
| int fd, pos, nwrite, amount; |
| |
| if (data == NULL) { |
| (void) unlink(file_name); |
| return SLURM_SUCCESS; |
| } |
| |
| fd = creat(file_name, 0700); |
| if (fd < 0) { |
| error("Error creating file %s, %m", file_name); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| |
| nwrite = strlen(data) + 1; |
| pos = 0; |
| while (nwrite > 0) { |
| amount = write(fd, &data[pos], nwrite); |
| if ((amount < 0) && (errno != EINTR)) { |
| error("Error writing file %s, %m", file_name); |
| close(fd); |
| return ESLURM_WRITING_TO_FILE; |
| } |
| nwrite -= amount; |
| pos += amount; |
| } |
| close(fd); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * get_job_env - return the environment variables and their count for a |
| * given job |
| * IN job_ptr - pointer to job for which data is required |
| * OUT env_size - number of elements to read |
| * RET point to array of string pointers containing environment variables |
| */ |
| char **get_job_env(job_record_t *job_ptr, uint32_t *env_size) |
| { |
| char *file_name = NULL, **environment = NULL; |
| int cc, fd = -1, hash; |
| uint32_t use_id; |
| |
| use_id = (job_ptr->array_task_id != NO_VAL) ? |
| job_ptr->array_job_id : job_ptr->job_id; |
| hash = use_id % 10; |
| file_name = xstrdup_printf("%s/hash.%d/job.%u/environment", |
| slurm_conf.state_save_location, |
| hash, use_id); |
| fd = open(file_name, 0); |
| |
| if (fd >= 0) { |
| cc = _read_data_array_from_file(fd, file_name, &environment, |
| env_size, job_ptr); |
| if (cc < 0) |
| environment = NULL; |
| close(fd); |
| } else { |
| error("Could not open environment file for %pJ", job_ptr); |
| } |
| |
| xfree(file_name); |
| return environment; |
| } |
| |
| /* |
| * get_job_script - return the script for a given job |
| * IN job_ptr - pointer to job for which data is required |
| * RET buf_t *containing job script |
| */ |
| buf_t *get_job_script(const job_record_t *job_ptr) |
| { |
| char *file_name = NULL; |
| int hash; |
| uint32_t use_id; |
| buf_t *buf; |
| |
| if (!job_ptr->batch_flag) |
| return NULL; |
| |
| use_id = (job_ptr->array_task_id != NO_VAL) ? |
| job_ptr->array_job_id : job_ptr->job_id; |
| hash = use_id % 10; |
| file_name = xstrdup_printf("%s/hash.%d/job.%u/script", |
| slurm_conf.state_save_location, |
| hash, use_id); |
| |
| if (!(buf = create_mmap_buf(file_name))) |
| error("Could not open script file for %pJ", job_ptr); |
| xfree(file_name); |
| |
| return buf; |
| } |
| |
| extern uint16_t job_get_sockets_per_node(job_record_t *job_ptr) |
| { |
| xassert(job_ptr); |
| |
| if (job_ptr->details && job_ptr->details->mc_ptr && |
| job_ptr->details->mc_ptr->sockets_per_node && |
| (job_ptr->details->mc_ptr->sockets_per_node != NO_VAL16)) |
| return job_ptr->details->mc_ptr->sockets_per_node; |
| return 1; |
| } |
| |
| /* |
| * Read a collection of strings from a file |
| * IN fd - file descriptor |
| * IN file_name - file to read from |
| * OUT data - pointer to array of pointers to strings (e.g. env), |
| * must be xfreed when no longer needed |
| * OUT size - number of elements in data |
| * IN job_ptr - job |
| * RET 0 on success, -1 on error |
| * NOTE: The output format of this must be identical with _xduparray2() |
| */ |
| static int _read_data_array_from_file(int fd, char *file_name, char ***data, |
| uint32_t *size, job_record_t *job_ptr) |
| { |
| int pos, buf_size, amount, i, j; |
| char *buffer, **array_ptr; |
| uint32_t rec_cnt; |
| |
| xassert(file_name); |
| xassert(data); |
| xassert(size); |
| *data = NULL; |
| *size = 0; |
| |
| amount = read(fd, &rec_cnt, sizeof(uint32_t)); |
| if (amount < sizeof(uint32_t)) { |
| if (amount != 0) /* incomplete write */ |
| error("Error reading file %s, %m", file_name); |
| else |
| verbose("File %s has zero size", file_name); |
| return -1; |
| } |
| |
| if (rec_cnt >= INT_MAX) { |
| error("%s: unreasonable record counter %d in file %s", |
| __func__, rec_cnt, file_name); |
| return -1; |
| } |
| |
| if (rec_cnt == 0) { |
| *data = NULL; |
| *size = 0; |
| return 0; |
| } |
| |
| pos = 0; |
| buf_size = BUF_SIZE; |
| buffer = xmalloc(buf_size + 1); |
| while (1) { |
| amount = read(fd, &buffer[pos], BUF_SIZE); |
| if (amount < 0) { |
| error("Error reading file %s, %m", file_name); |
| xfree(buffer); |
| return -1; |
| } |
| buffer[pos + amount] = '\0'; |
| pos += amount; |
| if (amount < BUF_SIZE) /* end of file */ |
| break; |
| buf_size += amount; |
| xrealloc(buffer, buf_size + 1); |
| } |
| |
| /* Allocate extra space for supplemental environment variables */ |
| if (job_ptr->details->env_cnt) { |
| for (j = 0; j < job_ptr->details->env_cnt; j++) |
| pos += (strlen(job_ptr->details->env_sup[j]) + 1); |
| xrealloc(buffer, pos); |
| } |
| |
| /* We have all the data, now let's compute the pointers */ |
| array_ptr = xcalloc((rec_cnt + job_ptr->details->env_cnt) + 1, |
| sizeof(char *)); |
| for (i = 0, pos = 0; i < rec_cnt; i++) { |
| array_ptr[i] = &buffer[pos]; |
| pos += strlen(&buffer[pos]) + 1; |
| if ((pos > buf_size) && ((i + 1) < rec_cnt)) { |
| error("Bad environment file %s", file_name); |
| rec_cnt = i; |
| break; |
| } |
| } |
| |
| /* Add supplemental environment variables */ |
| if (job_ptr->details->env_cnt) { |
| char *tmp_chr; |
| int env_len, name_len; |
| for (j = 0; j < job_ptr->details->env_cnt; j++) { |
| tmp_chr = strchr(job_ptr->details->env_sup[j], '='); |
| if (tmp_chr == NULL) { |
| error("Invalid supplemental environment " |
| "variable: %s", |
| job_ptr->details->env_sup[j]); |
| continue; |
| } |
| env_len = strlen(job_ptr->details->env_sup[j]) + 1; |
| name_len = tmp_chr - job_ptr->details->env_sup[j] + 1; |
| /* search for duplicate */ |
| for (i = 0; i < rec_cnt; i++) { |
| if (xstrncmp(array_ptr[i], |
| job_ptr->details->env_sup[j], |
| name_len)) { |
| continue; |
| } |
| |
| /* |
| * If we are are the front we can not overwrite |
| * that spot, we can clear it an then add to the |
| * end of the array. |
| */ |
| if (i == 0) { |
| array_ptr[0][0] = '\0'; |
| i = rec_cnt; |
| break; |
| } |
| /* over-write duplicate */ |
| memcpy(&buffer[pos], |
| job_ptr->details->env_sup[j], env_len); |
| array_ptr[i] = &buffer[pos]; |
| pos += env_len; |
| break; |
| } |
| if (i >= rec_cnt) { /* add env to array end */ |
| memcpy(&buffer[pos], |
| job_ptr->details->env_sup[j], env_len); |
| array_ptr[rec_cnt++] = &buffer[pos]; |
| pos += env_len; |
| } |
| } |
| } |
| |
| *size = rec_cnt; |
| *data = array_ptr; |
| return 0; |
| } |
| |
| /* Given a job request, return a multi_core_data struct. |
| * Returns NULL if no values set in the job/step request */ |
| static multi_core_data_t * |
| _set_multi_core_data(job_desc_msg_t * job_desc) |
| { |
| multi_core_data_t * mc_ptr; |
| |
| if ((job_desc->sockets_per_node == NO_VAL16) && |
| (job_desc->cores_per_socket == NO_VAL16) && |
| (job_desc->threads_per_core == NO_VAL16) && |
| (job_desc->ntasks_per_socket == NO_VAL16) && |
| (job_desc->ntasks_per_core == NO_VAL16) && |
| (job_desc->plane_size == NO_VAL16)) |
| return NULL; |
| |
| mc_ptr = xmalloc(sizeof(multi_core_data_t)); |
| mc_ptr->sockets_per_node = job_desc->sockets_per_node; |
| mc_ptr->cores_per_socket = job_desc->cores_per_socket; |
| mc_ptr->threads_per_core = job_desc->threads_per_core; |
| if (job_desc->ntasks_per_socket != NO_VAL16) |
| mc_ptr->ntasks_per_socket = job_desc->ntasks_per_socket; |
| else |
| mc_ptr->ntasks_per_socket = INFINITE16; |
| if (job_desc->ntasks_per_core != NO_VAL16) |
| mc_ptr->ntasks_per_core = job_desc->ntasks_per_core; |
| else if (slurm_conf.select_type_param & SELECT_ONE_TASK_PER_CORE) |
| mc_ptr->ntasks_per_core = 1; |
| else |
| mc_ptr->ntasks_per_core = INFINITE16; |
| if (job_desc->plane_size != NO_VAL16) |
| mc_ptr->plane_size = job_desc->plane_size; |
| else |
| mc_ptr->plane_size = 0; |
| |
| return mc_ptr; |
| } |
| |
| /* Return default "wait_all_nodes" option for a new job */ |
| static uint16_t _default_wait_all_nodes(job_desc_msg_t *job_desc) |
| { |
| static uint16_t default_batch_wait = NO_VAL16; |
| static time_t sched_update = 0; |
| |
| if (!job_desc->script) |
| return 0; |
| |
| if ((default_batch_wait != NO_VAL16) && |
| (sched_update == slurm_conf.last_update)) |
| return default_batch_wait; |
| |
| if (xstrcasestr(slurm_conf.sched_params, "sbatch_wait_nodes")) |
| default_batch_wait = 1; |
| else |
| default_batch_wait = 0; |
| sched_update = slurm_conf.last_update; |
| |
| return default_batch_wait; |
| } |
| |
| static int _unroll_min_max_node(job_record_t *job_ptr) |
| { |
| static int max_unroll = -1; |
| static time_t topo_update = 0; |
| job_details_t *detail_ptr = job_ptr->details; |
| int i; |
| |
| if (topo_update != slurm_conf.last_update) { |
| char *tmp_ptr; |
| topo_update = slurm_conf.last_update; |
| char *unroll_opt_str = "TopoMaxSizeUnroll="; |
| |
| if ((topology_get_plugin_id() == TOPOLOGY_PLUGIN_BLOCK) && |
| (tmp_ptr = xstrcasestr(slurm_conf.topology_param, |
| unroll_opt_str))) { |
| i = atoi(tmp_ptr + strlen(unroll_opt_str)); |
| if (i < 0) { |
| error("ignoring TopologyParam: TopoMaxSizeUnroll %d", |
| i); |
| } else { |
| max_unroll = i; |
| } |
| } |
| } |
| |
| if (max_unroll < 0) |
| return SLURM_SUCCESS; |
| |
| if (detail_ptr->job_size_bitmap) |
| return SLURM_SUCCESS; |
| |
| if (!detail_ptr->max_nodes || |
| (detail_ptr->max_nodes == detail_ptr->min_nodes)) |
| return SLURM_SUCCESS; |
| |
| if ((detail_ptr->max_nodes < MAX_JOB_SIZE_BITMAP) && |
| ((detail_ptr->max_nodes - detail_ptr->min_nodes) < max_unroll)) { |
| bitstr_t *size_bitmap; |
| size_bitmap = bit_alloc(detail_ptr->max_nodes + 1); |
| bit_nset(size_bitmap, detail_ptr->min_nodes, |
| detail_ptr->max_nodes); |
| detail_ptr->job_size_bitmap = size_bitmap; |
| } else { |
| return ESLURM_INVALID_NODE_COUNT; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* _copy_job_desc_to_job_record - copy the job descriptor from the RPC |
| * structure into the actual slurmctld job record |
| * |
| * A new job record is created here. Strings are duplicated with xstrdup(), |
| * while tres_req_cnt, id, spank_job_env and argv are moved: the record takes |
| * the pointer and the field in job_desc is cleared so it is not freed twice. |
| * |
| * IN/OUT job_desc - job submit request; wckey may be filled in with the |
| * user's default and the moved fields listed above are cleared |
| * OUT job_rec_ptr - set to the new job record as soon as it is created, so |
| * it is also set when a later step returns an error; it is not |
| * written if the wckey checks fail first |
| * IN/OUT req_bitmap - required node bitmap; moved into the job details and |
| * set to NULL when job_desc->req_nodes is set |
| * IN/OUT exc_bitmap - excluded node bitmap; moved into the job details and |
| * set to NULL when job_desc->exc_nodes or job_desc->req_nodes is set |
| * RET SLURM_SUCCESS, ESLURM_INVALID_WCKEY, ESLURM_INVALID_NODE_COUNT, or the |
| * error code returned by _set_job_id(), |
| * resv_port_check_job_request_cnt(), _unroll_min_max_node() or |
| * job_record_calc_arbitrary_tpn() */ |
| static int _copy_job_desc_to_job_record(job_desc_msg_t *job_desc, |
| job_record_t **job_rec_ptr, |
| bitstr_t **req_bitmap, |
| bitstr_t **exc_bitmap) |
| { |
| int error_code; |
| job_details_t *detail_ptr; |
| job_record_t *job_ptr; |
| |
| if (slurm_conf.conf_flags & CONF_FLAG_WCKEY) { |
| if (!job_desc->wckey) { |
| /* get the default wckey for this user since none was |
| * given; a default taken this way is stored with a |
| * leading '*' */ |
| slurmdb_user_rec_t user_rec; |
| memset(&user_rec, 0, sizeof(user_rec)); |
| user_rec.uid = job_desc->user_id; |
| assoc_mgr_fill_in_user(acct_db_conn, &user_rec, |
| accounting_enforce, NULL, false); |
| if (user_rec.default_wckey) |
| job_desc->wckey = xstrdup_printf( |
| "*%s", user_rec.default_wckey); |
| else if (!(accounting_enforce & |
| ACCOUNTING_ENFORCE_WCKEYS)) |
| job_desc->wckey = xstrdup("*"); |
| else { |
| error("Job didn't specify wckey and user " |
| "%d has no default.", job_desc->user_id); |
| return ESLURM_INVALID_WCKEY; |
| } |
| } else if (job_desc->wckey) { |
| slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL; |
| |
| memset(&wckey_rec, 0, sizeof(wckey_rec)); |
| wckey_rec.uid = job_desc->user_id; |
| wckey_rec.name = job_desc->wckey; |
| |
| if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, |
| accounting_enforce, |
| &wckey_ptr, false)) { |
| if (accounting_enforce & |
| ACCOUNTING_ENFORCE_WCKEYS) { |
| error("%s: invalid wckey '%s' for " |
| "user %u.", |
| __func__, wckey_rec.name, |
| job_desc->user_id); |
| return ESLURM_INVALID_WCKEY; |
| } |
| } |
| } else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) { |
| /* This should never happen */ |
| info("%s: no wckey was given for job submit", __func__); |
| return ESLURM_INVALID_WCKEY; |
| } |
| } |
| /* Create the record; from here on *job_rec_ptr is set even on error */ |
| job_ptr = _create_job_record(1, true); |
| |
| *job_rec_ptr = job_ptr; |
| job_ptr->partition = xstrdup(job_desc->partition); |
| if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET) |
| job_ptr->profile = job_desc->profile; |
| |
| if (job_desc->job_id != NO_VAL) { /* already confirmed unique */ |
| job_ptr->job_id = job_desc->job_id; |
| } else { |
| error_code = _set_job_id(job_ptr); |
| if (error_code) |
| return error_code; |
| } |
| |
| job_ptr->name = xstrdup(job_desc->name); |
| job_ptr->wckey = xstrdup(job_desc->wckey); |
| |
| /* Since this is only used in the slurmctld, copy it now. */ |
| job_ptr->tres_req_cnt = job_desc->tres_req_cnt; |
| job_desc->tres_req_cnt = NULL; |
| set_job_tres_req_str(job_ptr, false); |
| _add_job_hash(job_ptr); |
| |
| job_ptr->user_id = (uid_t) job_desc->user_id; |
| job_ptr->group_id = (gid_t) job_desc->group_id; |
| /* skip copy, just take ownership */ |
| job_ptr->id = job_desc->id; |
| job_desc->id = NULL; |
| |
| job_state_set(job_ptr, JOB_PENDING); |
| job_ptr->time_limit = job_desc->time_limit; |
| job_ptr->deadline = job_desc->deadline; |
| if (job_desc->delay_boot == NO_VAL) |
| job_ptr->delay_boot = delay_boot; |
| else |
| job_ptr->delay_boot = job_desc->delay_boot; |
| if (job_desc->time_min != NO_VAL) |
| job_ptr->time_min = job_desc->time_min; |
| job_ptr->alloc_sid = job_desc->alloc_sid; |
| job_ptr->alloc_node = xstrdup(job_desc->alloc_node); |
| job_ptr->account = xstrdup(job_desc->account); |
| job_ptr->batch_features = xstrdup(job_desc->batch_features); |
| job_ptr->burst_buffer = xstrdup(job_desc->burst_buffer); |
| job_ptr->network = xstrdup(job_desc->network); |
| job_ptr->resv_name = xstrdup(job_desc->reservation); |
| job_ptr->restart_cnt = job_desc->restart_cnt; |
| job_ptr->comment = xstrdup(job_desc->comment); |
| job_ptr->extra = xstrdup(job_desc->extra); |
| job_ptr->container = xstrdup(job_desc->container); |
| job_ptr->container_id = xstrdup(job_desc->container_id); |
| job_ptr->admin_comment = xstrdup(job_desc->admin_comment); |
| |
| if (job_desc->kill_on_node_fail != NO_VAL16) |
| job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail; |
| |
| job_ptr->resp_host = xstrdup(job_desc->resp_host); |
| job_ptr->alloc_resp_port = job_desc->alloc_resp_port; |
| job_ptr->alloc_tls_cert = xstrdup(job_desc->alloc_tls_cert); |
| job_ptr->other_port = job_desc->other_port; |
| job_ptr->time_last_active = time(NULL); |
| job_ptr->derived_ec = 0; |
| |
| job_ptr->licenses = xstrdup(job_desc->licenses_tot); |
| job_ptr->lic_req = xstrdup(job_desc->licenses); |
| job_ptr->mail_user = _get_mail_user(job_desc->mail_user, |
| job_ptr); |
| if (job_desc->mail_type && |
| (job_desc->mail_type != NO_VAL16)) { |
| job_ptr->mail_type = job_desc->mail_type; |
| } |
| /* Copy the request flags, then drop the ones not kept in the record */ |
| job_ptr->bit_flags = job_desc->bitflags; |
| job_ptr->bit_flags &= ~TASKS_CHANGED; |
| job_ptr->bit_flags &= ~BACKFILL_TEST; |
| job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST; |
| |
| job_ptr->resv_port_cnt = job_desc->resv_port_cnt; |
| if (job_desc->resv_port_cnt != NO_VAL16) { |
| error_code = resv_port_check_job_request_cnt(job_ptr); |
| if (error_code) |
| return error_code; |
| } |
| |
| job_ptr->spank_job_env = job_desc->spank_job_env; |
| job_ptr->spank_job_env_size = job_desc->spank_job_env_size; |
| job_desc->spank_job_env = (char **) NULL; /* nothing left to free */ |
| job_desc->spank_job_env_size = 0; /* nothing left to free */ |
| job_ptr->mcs_label = xstrdup(job_desc->mcs_label); |
| job_ptr->origin_cluster = xstrdup(job_desc->origin_cluster); |
| |
| job_ptr->cpus_per_tres = xstrdup(job_desc->cpus_per_tres); |
| job_ptr->mem_per_tres = xstrdup(job_desc->mem_per_tres); |
| job_ptr->tres_bind = xstrdup(job_desc->tres_bind); |
| job_ptr->tres_freq = xstrdup(job_desc->tres_freq); |
| job_ptr->tres_per_job = xstrdup(job_desc->tres_per_job); |
| job_ptr->tres_per_node = xstrdup(job_desc->tres_per_node); |
| job_ptr->tres_per_socket = xstrdup(job_desc->tres_per_socket); |
| job_ptr->tres_per_task = xstrdup(job_desc->tres_per_task); |
| |
| if (job_desc->wait_all_nodes == NO_VAL16) |
| job_ptr->wait_all_nodes = _default_wait_all_nodes(job_desc); |
| else |
| job_ptr->wait_all_nodes = job_desc->wait_all_nodes; |
| job_ptr->warn_flags = job_desc->warn_flags; |
| job_ptr->warn_signal = job_desc->warn_signal; |
| job_ptr->warn_time = job_desc->warn_time; |
| /* The remaining fields are stored in the job's details structure */ |
| detail_ptr = job_ptr->details; |
| detail_ptr->argc = job_desc->argc; |
| detail_ptr->argv = job_desc->argv; |
| job_desc->argv = (char **) NULL; /* nothing left to free */ |
| job_desc->argc = 0; /* nothing left to free */ |
| detail_ptr->acctg_freq = xstrdup(job_desc->acctg_freq); |
| detail_ptr->cpu_bind_type = job_desc->cpu_bind_type; |
| detail_ptr->cpu_bind = xstrdup(job_desc->cpu_bind); |
| detail_ptr->cpu_freq_gov = job_desc->cpu_freq_gov; |
| detail_ptr->cpu_freq_max = job_desc->cpu_freq_max; |
| detail_ptr->cpu_freq_min = job_desc->cpu_freq_min; |
| detail_ptr->nice = job_desc->nice; |
| detail_ptr->open_mode = job_desc->open_mode; |
| detail_ptr->min_cpus = job_desc->min_cpus; |
| detail_ptr->orig_min_cpus = job_desc->min_cpus; |
| detail_ptr->max_cpus = job_desc->max_cpus; |
| detail_ptr->orig_max_cpus = job_desc->max_cpus; |
| detail_ptr->min_nodes = job_desc->min_nodes; |
| detail_ptr->max_nodes = job_desc->max_nodes; |
| detail_ptr->qos_req = xstrdup(job_desc->qos); |
| if (job_desc->job_size_str && detail_ptr->max_nodes) { |
| if (detail_ptr->max_nodes >= MAX_JOB_SIZE_BITMAP) |
| return ESLURM_INVALID_NODE_COUNT; |
| detail_ptr->job_size_bitmap = |
| bit_alloc(detail_ptr->max_nodes + 1); |
| if (bit_unfmt(detail_ptr->job_size_bitmap, |
| job_desc->job_size_str)) |
| FREE_NULL_BITMAP(detail_ptr->job_size_bitmap); |
| } else { |
| error_code = _unroll_min_max_node(job_ptr); |
| if (error_code) |
| return error_code; |
| } |
| detail_ptr->req_context = xstrdup(job_desc->req_context); |
| detail_ptr->resv_req = xstrdup(job_desc->reservation); |
| detail_ptr->x11 = job_desc->x11; |
| detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie); |
| detail_ptr->x11_target = xstrdup(job_desc->x11_target); |
| detail_ptr->x11_target_port = job_desc->x11_target_port; |
| if (job_desc->req_nodes) { |
| if ((job_desc->task_dist & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_ARBITRARY) { |
| detail_ptr->req_nodes = xstrdup(job_desc->req_nodes); |
| if ((error_code = |
| job_record_calc_arbitrary_tpn(job_ptr))) |
| return error_code; |
| } else { |
| detail_ptr->req_nodes = |
| _copy_nodelist_no_dup(job_desc->req_nodes); |
| } |
| detail_ptr->req_node_bitmap = *req_bitmap; |
| *req_bitmap = NULL; /* Reused nothing left to free */ |
| detail_ptr->exc_node_bitmap = *exc_bitmap; /* also taken when only req_nodes is set */ |
| } |
| if (job_desc->exc_nodes) { |
| detail_ptr->exc_nodes = |
| _copy_nodelist_no_dup(job_desc->exc_nodes); |
| detail_ptr->exc_node_bitmap = *exc_bitmap; |
| } |
| if (job_desc->exc_nodes || job_desc->req_nodes) |
| *exc_bitmap = NULL; /* Reused nothing left to free */ |
| detail_ptr->features = xstrdup(job_desc->features); |
| detail_ptr->cluster_features = xstrdup(job_desc->cluster_features); |
| detail_ptr->prefer = xstrdup(job_desc->prefer); |
| if (job_desc->fed_siblings_viable) { |
| job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t)); |
| job_ptr->fed_details->siblings_viable = |
| job_desc->fed_siblings_viable; |
| update_job_fed_details(job_ptr); |
| } |
| if (job_desc->shared == JOB_SHARED_NONE) { |
| detail_ptr->share_res = 0; |
| detail_ptr->whole_node = WHOLE_NODE_REQUIRED; |
| } else if (job_desc->shared == JOB_SHARED_OK) { |
| detail_ptr->share_res = 1; |
| detail_ptr->whole_node = 0; |
| } else if (job_desc->shared == JOB_SHARED_USER) { |
| detail_ptr->share_res = NO_VAL8; |
| detail_ptr->whole_node = WHOLE_NODE_USER; |
| } else if (job_desc->shared == JOB_SHARED_MCS) { |
| detail_ptr->share_res = NO_VAL8; |
| detail_ptr->whole_node = WHOLE_NODE_MCS; |
| } else if (job_desc->shared == JOB_SHARED_TOPO) { |
| detail_ptr->share_res = NO_VAL8; |
| detail_ptr->whole_node = WHOLE_TOPO; |
| } else { |
| detail_ptr->share_res = NO_VAL8; |
| detail_ptr->whole_node = 0; |
| } |
| if (job_desc->contiguous != NO_VAL16) |
| detail_ptr->contiguous = job_desc->contiguous; |
| if (slurm_conf.conf_flags & CONF_FLAG_ASRU) |
| detail_ptr->core_spec = job_desc->core_spec; |
| else |
| detail_ptr->core_spec = NO_VAL16; |
| if (detail_ptr->core_spec != NO_VAL16) |
| detail_ptr->whole_node = WHOLE_NODE_REQUIRED; /* replaces the value chosen from "shared" above */ |
| if (job_desc->task_dist != NO_VAL) |
| detail_ptr->task_dist = job_desc->task_dist; |
| if (job_desc->cpus_per_task == NO_VAL16) { |
| detail_ptr->cpus_per_task = 1; |
| detail_ptr->orig_cpus_per_task = NO_VAL16; |
| } else { |
| detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1); |
| detail_ptr->orig_cpus_per_task = detail_ptr->cpus_per_task; |
| } |
| if (job_desc->pn_min_cpus != NO_VAL16) |
| detail_ptr->pn_min_cpus = job_desc->pn_min_cpus; |
| if (job_desc->overcommit != NO_VAL8) |
| detail_ptr->overcommit = job_desc->overcommit; |
| if (job_desc->num_tasks != NO_VAL) |
| detail_ptr->num_tasks = job_desc->num_tasks; |
| if (job_desc->ntasks_per_node != NO_VAL16) { |
| detail_ptr->ntasks_per_node = job_desc->ntasks_per_node; |
| if ((detail_ptr->overcommit == 0) && |
| (detail_ptr->num_tasks > 1)) { |
| detail_ptr->pn_min_cpus = |
| MAX(detail_ptr->pn_min_cpus, |
| (detail_ptr->cpus_per_task * |
| detail_ptr->ntasks_per_node)); |
| } |
| } |
| if (job_desc->ntasks_per_tres != NO_VAL16) |
| detail_ptr->ntasks_per_tres = job_desc->ntasks_per_tres; |
| detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus, |
| detail_ptr->cpus_per_task); |
| detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus; |
| if (job_desc->reboot != NO_VAL16) |
| job_ptr->reboot = MIN(job_desc->reboot, 1); |
| else |
| job_ptr->reboot = 0; |
| if (job_desc->requeue != NO_VAL16) |
| detail_ptr->requeue = MIN(job_desc->requeue, 1); |
| else |
| detail_ptr->requeue = slurm_conf.job_requeue; |
| if (job_desc->pn_min_memory != NO_VAL64) |
| detail_ptr->pn_min_memory = job_desc->pn_min_memory; |
| detail_ptr->orig_pn_min_memory = detail_ptr->pn_min_memory; |
| if (job_desc->pn_min_tmp_disk != NO_VAL) |
| detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk; |
| |
| detail_ptr->oom_kill_step = job_desc->oom_kill_step; |
| |
| detail_ptr->segment_size = job_desc->segment_size; |
| detail_ptr->std_err = xstrdup(job_desc->std_err); |
| detail_ptr->std_in = xstrdup(job_desc->std_in); |
| detail_ptr->std_out = xstrdup(job_desc->std_out); |
| detail_ptr->submit_line = xstrdup(job_desc->submit_line); |
| detail_ptr->work_dir = xstrdup(job_desc->work_dir); |
| if (job_desc->begin_time > time(NULL)) /* only begin times still in the future are kept */ |
| detail_ptr->begin_time = job_desc->begin_time; |
| |
| job_ptr->clusters = xstrdup(job_desc->clusters); |
| |
| /* |
| * The priority needs to be set after this since we don't have |
| * an association rec yet |
| */ |
| detail_ptr->mc_ptr = _set_multi_core_data(job_desc); |
| /* SPREAD_JOB with tasks but no max_nodes: min_nodes becomes at least 1 |
| * and max_nodes the smaller of active_node_record_count and num_tasks */ |
| if ((job_ptr->bit_flags & SPREAD_JOB) && (detail_ptr->max_nodes == 0) && |
| (detail_ptr->num_tasks != 0)) { |
| if (detail_ptr->min_nodes == 0) |
| detail_ptr->min_nodes = 1; |
| detail_ptr->max_nodes = MIN(active_node_record_count, |
| detail_ptr->num_tasks); |
| } |
| |
| job_ptr->selinux_context = xstrdup(job_desc->selinux_context); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _copy_nodelist_no_dup - Take a node_list string and convert it to an |
| * expression without duplicate names. For example, we want to convert |
| * a users request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]" |
| * node_list IN - string describing a list of nodes |
| * RET a compact node expression, must be xfreed by the user |
| */ |
| static char *_copy_nodelist_no_dup(char *node_list) |
| { |
| char *buf; |
| |
| hostlist_t *hl = hostlist_create(node_list); |
| if (hl == NULL) |
| return NULL; |
| hostlist_uniq(hl); |
| buf = hostlist_ranged_string_xmalloc(hl); |
| hostlist_destroy(hl); |
| |
| return buf; |
| } |
| |
| /* Return memory on the first node in the identified partition */ |
| static uint64_t _mem_per_node_part(part_record_t *part_ptr) |
| { |
| int node_inx = -1; |
| node_record_t *node_ptr; |
| |
| if (!part_ptr) |
| return 0; |
| |
| if (part_ptr->node_bitmap) |
| node_inx = bit_ffs(part_ptr->node_bitmap); |
| if (node_inx >= 0) { |
| node_ptr = node_record_table_ptr[node_inx]; |
| return (node_ptr->config_ptr->real_memory - |
| node_ptr->mem_spec_limit); |
| } |
| return 0; |
| } |
| |
| /* |
| * Test if this job exceeds any of MaxMemPer[CPU|Node] limits and potentially |
| * adjust mem / cpu ratios. |
| * |
| * NOTE: This function is also called with a dummy job_desc_msg_t from |
| * job_limits_check(), if there is any new check added here you may also have to |
| * add that parameter to the job_desc_msg_t in that function. |
| */ |
| static bool _valid_pn_min_mem(job_desc_msg_t *job_desc_msg, |
| part_record_t *part_ptr) |
| { |
| uint64_t job_mem_limit = job_desc_msg->pn_min_memory; |
| uint64_t sys_mem_limit; |
| uint16_t cpus_per_node; |
| |
| if (part_ptr && part_ptr->max_mem_per_cpu) |
| sys_mem_limit = part_ptr->max_mem_per_cpu; |
| else |
| sys_mem_limit = slurm_conf.max_mem_per_cpu; |
| |
| if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU)) |
| return true; |
| |
| if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) { |
| uint64_t mem_ratio; |
| job_mem_limit &= (~MEM_PER_CPU); |
| sys_mem_limit &= (~MEM_PER_CPU); |
| if (job_mem_limit <= sys_mem_limit) |
| return true; |
| mem_ratio = ROUNDUP(job_mem_limit, sys_mem_limit); |
| debug("JobId=%u: increasing cpus_per_task and decreasing mem_per_cpu by factor of %"PRIu64" based upon mem_per_cpu limits", |
| job_desc_msg->job_id, mem_ratio); |
| if (job_desc_msg->cpus_per_task == NO_VAL16) |
| job_desc_msg->cpus_per_task = mem_ratio; |
| else |
| job_desc_msg->cpus_per_task *= mem_ratio; |
| |
| /* Update tres_per_task, but not if it wasn't set before */ |
| if (job_desc_msg->bitflags & JOB_CPUS_SET) |
| slurm_option_update_tres_per_task( |
| job_desc_msg->cpus_per_task, "cpu", |
| &job_desc_msg->tres_per_task); |
| |
| job_desc_msg->pn_min_memory = |
| ROUNDUP(job_mem_limit, mem_ratio) | MEM_PER_CPU; |
| if ((job_desc_msg->num_tasks != NO_VAL) && |
| (job_desc_msg->num_tasks != 0) && |
| (job_desc_msg->min_cpus != NO_VAL)) { |
| job_desc_msg->min_cpus = |
| job_desc_msg->num_tasks * |
| job_desc_msg->cpus_per_task; |
| |
| if ((job_desc_msg->max_cpus != NO_VAL) && |
| (job_desc_msg->max_cpus < job_desc_msg->min_cpus)) { |
| job_desc_msg->max_cpus = job_desc_msg->min_cpus; |
| } |
| } else { |
| job_desc_msg->pn_min_cpus = job_desc_msg->cpus_per_task; |
| } |
| return true; |
| } |
| |
| if (job_mem_limit == 0) |
| job_mem_limit = _mem_per_node_part(part_ptr); |
| |
| if (((job_mem_limit & MEM_PER_CPU) == 0) && |
| ((sys_mem_limit & MEM_PER_CPU) == 0)) { |
| if (job_mem_limit <= sys_mem_limit) |
| return true; |
| debug2("JobId=%u mem=%"PRIu64"M > MaxMemPerNode=%"PRIu64"M in partition %s", |
| job_desc_msg->job_id, job_mem_limit, sys_mem_limit, |
| (part_ptr && part_ptr->name) ? part_ptr->name : "N/A"); |
| return false; |
| } |
| |
| /* Job and system have different memory limit forms (i.e. one is a |
| * per-job and the other is per-node). Convert them both to per-node |
| * values for comparison. */ |
| if (part_ptr && (!part_ptr->max_share || !job_desc_msg->shared)) { |
| /* Whole node allocation */ |
| cpus_per_node = part_ptr->max_cpu_cnt; |
| } else { |
| if ((job_desc_msg->ntasks_per_node != NO_VAL16) && |
| (job_desc_msg->ntasks_per_node != 0)) |
| cpus_per_node = job_desc_msg->ntasks_per_node; |
| else |
| cpus_per_node = 1; |
| |
| if ((job_desc_msg->num_tasks != NO_VAL) && |
| (job_desc_msg->num_tasks != 0) && |
| (job_desc_msg->max_nodes != NO_VAL) && |
| (job_desc_msg->max_nodes != 0)) { |
| cpus_per_node = MAX(cpus_per_node, |
| ROUNDUP(job_desc_msg->num_tasks, |
| job_desc_msg->max_nodes)); |
| } |
| |
| if ((job_desc_msg->cpus_per_task != NO_VAL16) && |
| (job_desc_msg->cpus_per_task != 0)) |
| cpus_per_node *= job_desc_msg->cpus_per_task; |
| |
| if ((job_desc_msg->pn_min_cpus != NO_VAL16) && |
| (job_desc_msg->pn_min_cpus > cpus_per_node)) |
| cpus_per_node = job_desc_msg->pn_min_cpus; |
| } |
| |
| if (job_mem_limit & MEM_PER_CPU) { |
| /* Job has per-CPU memory limit, system has per-node limit */ |
| job_mem_limit &= (~MEM_PER_CPU); |
| job_mem_limit *= cpus_per_node; |
| } else { |
| /* Job has per-node memory limit, system has per-CPU limit */ |
| uint32_t min_cpus; |
| sys_mem_limit &= (~MEM_PER_CPU); |
| min_cpus = ROUNDUP(job_mem_limit, sys_mem_limit); |
| |
| if ((job_desc_msg->pn_min_cpus == NO_VAL16) || |
| (job_desc_msg->pn_min_cpus < min_cpus)) { |
| job_desc_msg->pn_min_cpus = min_cpus; |
| if (min_cpus > job_desc_msg->min_cpus) { |
| job_desc_msg->min_cpus = min_cpus; |
| job_desc_msg->max_cpus = |
| MAX(min_cpus, job_desc_msg->max_cpus); |
| } |
| cpus_per_node = MAX(cpus_per_node, min_cpus); |
| if (job_desc_msg->ntasks_per_node != NO_VAL16) { |
| job_desc_msg->cpus_per_task = |
| ROUNDUP(job_desc_msg->pn_min_cpus, |
| job_desc_msg->ntasks_per_node); |
| job_desc_msg->pn_min_cpus = |
| MAX(job_desc_msg->cpus_per_task * |
| job_desc_msg->ntasks_per_node, |
| job_desc_msg->pn_min_cpus); |
| } else if (job_desc_msg->num_tasks && |
| (job_desc_msg->num_tasks != NO_VAL) && |
| job_desc_msg->min_nodes && |
| (job_desc_msg->min_nodes != NO_VAL)) { |
| /* |
| * Calculate a new value of cpus/task given the |
| * current nodes and tasks values: |
| * CPUs/Task = (min_cpus_per_node * min_nodes) / num_tasks |
| */ |
| uint32_t cpus = |
| min_cpus * job_desc_msg->min_nodes; |
| job_desc_msg->cpus_per_task = |
| ROUNDUP(cpus, job_desc_msg->num_tasks); |
| /* |
| * Recalculate pn_min_cpus based on the new |
| * CPUs/task. This formula aims to get |
| * an allocation with the least amount of |
| * CPUs combining all the nodes from the job. |
| */ |
| min_cpus = (job_desc_msg->cpus_per_task * |
| job_desc_msg->num_tasks) / |
| job_desc_msg->min_nodes; |
| job_desc_msg->pn_min_cpus = min_cpus; |
| job_desc_msg->min_cpus = |
| MAX(min_cpus, |
| job_desc_msg->pn_min_cpus); |
| } else if (!job_desc_msg->num_tasks) { |
| /* |
| * The job did not request any amount of tasks |
| * explicitly. Assuming 1 per node. |
| */ |
| job_desc_msg->cpus_per_task = |
| MAX(job_desc_msg->pn_min_cpus, |
| job_desc_msg->cpus_per_task); |
| } |
| debug("JobId=%u: Setting job's pn_min_cpus to %u due to memory limit", |
| job_desc_msg->job_id, |
| job_desc_msg->pn_min_cpus); |
| } |
| sys_mem_limit *= cpus_per_node; |
| } |
| |
| if (job_mem_limit <= sys_mem_limit) |
| return true; |
| |
| debug2("JobId=%u mem=%"PRIu64"M > MaxMemPer%s=%"PRIu64"M in partition:%s", |
| job_desc_msg->job_id, job_mem_limit, |
| (job_mem_limit & MEM_PER_CPU) ? "CPU" : "Node", sys_mem_limit, |
| (part_ptr && part_ptr->name) ? part_ptr->name : "N/A"); |
| |
| return false; |
| } |
| |
| /* |
| * Increment time limit of one job record for node configuration. |
| */ |
| static void _job_time_limit_incr(job_record_t *job_ptr, uint32_t boot_job_id) |
| { |
| time_t delta_t, now = time(NULL); |
| |
| delta_t = difftime(now, job_ptr->start_time); |
| if ((job_ptr->job_id != boot_job_id) && !IS_JOB_CONFIGURING(job_ptr)) |
| job_ptr->tot_sus_time = delta_t; |
| |
| if ((job_ptr->time_limit != INFINITE) && |
| ((job_ptr->job_id == boot_job_id) || (delta_t != 0))) { |
| if (delta_t && !IS_JOB_CONFIGURING(job_ptr)) { |
| verbose("Extending %pJ time limit by %u secs for configuration", |
| job_ptr, (uint32_t) delta_t); |
| } |
| job_ptr->end_time = now + (job_ptr->time_limit * 60); |
| job_ptr->end_time_exp = job_ptr->end_time; |
| } |
| } |
| |
| static int _foreach_het_job_time_limit_incr(void *x, void *arg) |
| { |
| _job_time_limit_incr(x, *(uint32_t *)arg); |
| |
| return 0; |
| } |
| |
| /* |
| * Increment time limit for all components of a hetjob for node configuration. |
| * job_ptr IN - pointer to job record for which configuration is complete |
| * boot_job_id - job ID of record with newly powered up node or 0 |
| */ |
| static void _het_job_time_limit_incr(job_record_t *job_ptr, |
| uint32_t boot_job_id) |
| { |
| job_record_t *het_job_leader; |
| |
| if (!job_ptr->het_job_id) { |
| _job_time_limit_incr(job_ptr, boot_job_id); |
| return; |
| } |
| |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (!het_job_leader) { |
| error("%s: Hetjob leader %pJ not found", |
| __func__, job_ptr); |
| _job_time_limit_incr(job_ptr, boot_job_id); |
| return; |
| } |
| if (!het_job_leader->het_job_list) { |
| error("%s: Hetjob leader %pJ job list is NULL", |
| __func__, job_ptr); |
| _job_time_limit_incr(job_ptr, boot_job_id); |
| return; |
| } |
| |
| (void) list_for_each(het_job_leader->het_job_list, |
| _foreach_het_job_time_limit_incr, |
| &boot_job_id); |
| } |
| |
| /* Clear job's CONFIGURING flag and advance end time as needed */ |
| extern void job_config_fini(job_record_t *job_ptr) |
| { |
| time_t now = time(NULL); |
| |
| last_job_update = now; |
| job_state_unset_flag(job_ptr, JOB_CONFIGURING); |
| if (IS_JOB_POWER_UP_NODE(job_ptr)) { |
| info("Resetting %pJ start time for node power up", job_ptr); |
| job_state_unset_flag(job_ptr, JOB_POWER_UP_NODE); |
| job_ptr->start_time = now; |
| _het_job_time_limit_incr(job_ptr, job_ptr->job_id); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } else { |
| _het_job_time_limit_incr(job_ptr, 0); |
| } |
| |
| if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD")) |
| set_job_alias_list(job_ptr); |
| |
| /* |
| * Request asynchronous launch of a prolog for a non-batch job. |
| * PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC. |
| */ |
| if (slurm_conf.prolog_flags & PROLOG_FLAG_ALLOC) |
| launch_prolog(job_ptr); |
| } |
| |
| /* |
| * Determine of the nodes are ready to run a job |
| * RET true if ready |
| */ |
| extern bool test_job_nodes_ready(job_record_t *job_ptr) |
| { |
| if (IS_JOB_PENDING(job_ptr)) |
| return false; |
| if (!job_ptr->node_bitmap) /* Revoked allocation */ |
| return true; |
| if (bit_overlap_any(job_ptr->node_bitmap, power_down_node_bitmap)) |
| return false; |
| |
| if (!job_ptr->batch_flag || |
| job_ptr->batch_features || |
| job_ptr->wait_all_nodes || job_ptr->burst_buffer) { |
| /* Make sure all nodes ready to start job */ |
| if ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0) |
| return false; |
| } else if (job_ptr->batch_flag) { |
| /* Make sure first node is ready to start batch job */ |
| node_record_t *node_ptr = |
| find_node_record(job_ptr->batch_host); |
| if (!node_ptr || |
| IS_NODE_POWERED_DOWN(node_ptr) || |
| IS_NODE_POWERING_UP(node_ptr)) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| static int _foreach_het_job_configuring_test(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| |
| if (IS_JOB_CONFIGURING(het_job)) |
| return 1; |
| return 0; |
| } |
| |
| /* |
| * For non-hetjob, return true if this job is configuring. |
| * For hetjob, return true if any component of the job is configuring. |
| */ |
| static bool _het_job_configuring_test(job_record_t *job_ptr) |
| { |
| job_record_t *het_job_leader; |
| |
| if (IS_JOB_CONFIGURING(job_ptr)) |
| return true; |
| if (!job_ptr->het_job_id) |
| return false; |
| |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (!het_job_leader) { |
| error("%s: Hetjob leader %pJ not found", __func__, job_ptr); |
| return false; |
| } |
| if (!het_job_leader->het_job_list) { |
| error("%s: Hetjob leader %pJ job list is NULL", |
| __func__, job_ptr); |
| return false; |
| } |
| |
| return list_find_first(het_job_leader->het_job_list, |
| _foreach_het_job_configuring_test, |
| NULL); |
| } |
| |
| /* |
| * job_time_limit - terminate jobs which have exceeded their time limit |
| * global: job_list - pointer global job list |
| * last_job_update - time of last job table update |
| */ |
| void job_time_limit(void) |
| { |
| list_itr_t *job_iterator; |
| job_record_t *job_ptr; |
| time_t now = time(NULL); |
| time_t old = now - ((slurm_conf.inactive_limit * 4 / 3) + |
| slurm_conf.msg_timeout + 1); |
| time_t over_run; |
| uint16_t over_time_limit; |
| uint8_t prolog; |
| int job_test_count = 0; |
| uint32_t resv_over_run = slurm_conf.resv_over_run; |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| if (resv_over_run == INFINITE16) |
| resv_over_run = YEAR_SECONDS; |
| else |
| resv_over_run *= 60; |
| |
| /* |
| * locks same as in _slurmctld_background() (The only current place this |
| * is called). |
| */ |
| slurmctld_lock_t job_write_lock = { |
| READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; |
| DEF_TIMERS; |
| |
| |
| /* |
| * Not making this list_next loop a list_for_each. This loop unlocks the |
| * job_write lock if held too long, but that would not unlock the lists |
| * write lock in a list_for_each. Unless this can be handled this must |
| * remain a list_next loop. |
| */ |
| job_iterator = list_iterator_create(job_list); |
| START_TIMER; |
| while ((job_ptr = list_next(job_iterator))) { |
| xassert (job_ptr->magic == JOB_MAGIC); |
| job_test_count++; |
| |
| if (job_ptr->details) |
| prolog = job_ptr->details->prolog_running; |
| else |
| prolog = 0; |
| if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) && |
| test_job_nodes_ready(job_ptr)) { |
| info("%s: Configuration for %pJ complete", |
| __func__, job_ptr); |
| job_config_fini(job_ptr); |
| if (job_ptr->batch_flag) |
| launch_job(job_ptr); |
| } |
| |
| /* |
| * Features have been changed on some node, make job eligiable |
| * to run and test to see if it can run now |
| */ |
| if (node_features_updated && |
| (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) && |
| IS_JOB_PENDING(job_ptr) && (job_ptr->priority == 0)) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| set_job_prio(job_ptr); |
| last_job_update = now; |
| } |
| |
| /* Don't enforce time limits for configuring hetjobs */ |
| if (_het_job_configuring_test(job_ptr)) |
| continue; |
| |
| /* |
| * Only running jobs can be killed due to timeout. Do not kill |
| * suspended jobs due to timeout. |
| */ |
| if (!IS_JOB_RUNNING(job_ptr)) |
| continue; |
| |
| /* |
| * everything above here is considered "quick", and skips the |
| * timeout at the bottom of the loop by using a continue. |
| * everything below is considered "slow", and needs to jump to |
| * time_check before the next job is tested |
| */ |
| if (job_ptr->preempt_time) { |
| (void)slurm_job_preempt(job_ptr, NULL, |
| slurm_job_preempt_mode(job_ptr), |
| false); |
| goto time_check; |
| } |
| |
| if (slurm_conf.inactive_limit && (job_ptr->batch_flag == 0) && |
| (job_ptr->time_last_active <= old) && |
| (job_ptr->other_port) && |
| (job_ptr->part_ptr) && |
| (!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) { |
| /* job inactive, kill it */ |
| info("%s: inactivity time limit reached for %pJ", |
| __func__, job_ptr); |
| _job_timed_out(job_ptr, false); |
| job_ptr->state_reason = FAIL_INACTIVE_LIMIT; |
| xfree(job_ptr->state_desc); |
| goto time_check; |
| } |
| if (job_ptr->time_limit != INFINITE) { |
| send_job_warn_signal(job_ptr, false); |
| if ((job_ptr->mail_type & MAIL_JOB_TIME100) && |
| (now >= job_ptr->end_time)) { |
| job_ptr->mail_type &= (~MAIL_JOB_TIME100); |
| mail_job_info(job_ptr, MAIL_JOB_TIME100); |
| } |
| if ((job_ptr->mail_type & MAIL_JOB_TIME90) && |
| (now + (job_ptr->time_limit * 60 * 0.1) >= |
| job_ptr->end_time)) { |
| job_ptr->mail_type &= (~MAIL_JOB_TIME90); |
| mail_job_info(job_ptr, MAIL_JOB_TIME90); |
| } |
| if ((job_ptr->mail_type & MAIL_JOB_TIME80) && |
| (now + (job_ptr->time_limit * 60 * 0.2) >= |
| job_ptr->end_time)) { |
| job_ptr->mail_type &= (~MAIL_JOB_TIME80); |
| mail_job_info(job_ptr, MAIL_JOB_TIME80); |
| } |
| if ((job_ptr->mail_type & MAIL_JOB_TIME50) && |
| (now + (job_ptr->time_limit * 60 * 0.5) >= |
| job_ptr->end_time)) { |
| job_ptr->mail_type &= (~MAIL_JOB_TIME50); |
| mail_job_info(job_ptr, MAIL_JOB_TIME50); |
| } |
| |
| if (job_ptr->part_ptr && |
| (job_ptr->part_ptr->over_time_limit != NO_VAL16)) { |
| over_time_limit = |
| job_ptr->part_ptr->over_time_limit; |
| } else { |
| over_time_limit = slurm_conf.over_time_limit; |
| } |
| if (over_time_limit == INFINITE16) |
| over_run = now - YEAR_SECONDS; |
| else |
| over_run = now - (over_time_limit * 60); |
| if (job_ptr->end_time <= over_run) { |
| last_job_update = now; |
| info("Time limit exhausted for %pJ", job_ptr); |
| _job_timed_out(job_ptr, false); |
| job_ptr->state_reason = FAIL_TIMEOUT; |
| xfree(job_ptr->state_desc); |
| goto time_check; |
| } |
| } |
| |
| if (job_ptr->resv_ptr && |
| !(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) && |
| (job_ptr->resv_ptr->end_time + resv_over_run) < time(NULL)){ |
| last_job_update = now; |
| info("Reservation ended for %pJ", job_ptr); |
| xfree(job_ptr->state_desc); |
| xstrfmtcat(job_ptr->state_desc, "Reservation %s, which this job was running under, has ended", |
| job_ptr->resv_ptr->name); |
| _job_timed_out(job_ptr, false); |
| job_ptr->state_reason = FAIL_TIMEOUT; |
| xfree(job_ptr->state_desc); |
| goto time_check; |
| } |
| |
| /* |
| * check if any individual job steps have exceeded |
| * their time limit |
| */ |
| list_for_each(job_ptr->step_list, check_job_step_time_limit, |
| &now); |
| |
| acct_policy_job_time_out(job_ptr); |
| |
| if (job_ptr->state_reason == FAIL_TIMEOUT) { |
| last_job_update = now; |
| _job_timed_out(job_ptr, false); |
| xfree(job_ptr->state_desc); |
| goto time_check; |
| } |
| |
| /* Give srun command warning message about pending timeout */ |
| if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2)) |
| srun_timeout (job_ptr); |
| |
| /* |
| * _job_timed_out() and other calls can take a long time on |
| * some platforms. This loop is holding the job_write lock; |
| * if a lot of jobs need to be timed out within the same cycle |
| * this stalls other threads from running and causes |
| * communication issues within the cluster. |
| * |
| * This test happens last, as job_ptr may be pointing to a job |
| * that would be deleted by a separate thread when the job_write |
| * lock is released. However, list_next itself is thread safe, |
| * and can be used again once the locks are reacquired. |
| * list_peek_next is used in the unlikely event the timer has |
| * expired just as the end of the job_list is reached. |
| */ |
| time_check: |
| /* Use a hard-coded 3 second timeout, with a 1 second sleep. */ |
| if (slurm_delta_tv(&tv1) >= 3000000 && |
| list_peek_next(job_iterator)) { |
| END_TIMER; |
| debug("%s: yielding locks after testing %d jobs, %s", |
| __func__, job_test_count, TIME_STR); |
| unlock_slurmctld(job_write_lock); |
| usleep(1000000); |
| lock_slurmctld(job_write_lock); |
| START_TIMER; |
| job_test_count = 0; |
| } |
| } |
| list_iterator_destroy(job_iterator); |
| node_features_updated = false; |
| } |
| |
| extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked) |
| { |
| uint32_t cpu_cnt = 0, node_cnt = 0; |
| uint64_t mem_cnt = 0; |
| uint16_t sockets_per_node; |
| uint32_t num_tasks = 1; /* Default to 1 if it's not set */ |
| assoc_mgr_lock_t locks = { .tres = READ_LOCK }; |
| |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| xfree(job_ptr->tres_req_str); |
| xfree(job_ptr->tres_fmt_req_str); |
| xfree(job_ptr->tres_req_cnt); |
| |
| if (!assoc_mgr_locked) |
| assoc_mgr_lock(&locks); |
| |
| job_ptr->tres_req_cnt = xcalloc(g_tres_count, sizeof(uint64_t)); |
| |
| if (job_ptr->details) { |
| node_cnt = job_ptr->details->min_nodes; |
| cpu_cnt = job_ptr->details->min_cpus; |
| if (job_ptr->details->pn_min_memory) |
| mem_cnt = job_ptr->details->pn_min_memory; |
| num_tasks = job_ptr->details->num_tasks; |
| } |
| |
| /* if this is set just override */ |
| if (job_ptr->total_cpus) |
| cpu_cnt = job_ptr->total_cpus; |
| |
| if (job_ptr->node_cnt) |
| node_cnt = job_ptr->node_cnt; |
| |
| job_ptr->tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)node_cnt; |
| job_ptr->tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)cpu_cnt; |
| sockets_per_node = job_get_sockets_per_node(job_ptr); |
| job_ptr->tres_req_cnt[TRES_ARRAY_MEM] = |
| job_get_tres_mem(job_ptr->job_resrcs, |
| mem_cnt, cpu_cnt, |
| node_cnt, |
| job_ptr->part_ptr, |
| job_ptr->gres_list_req, |
| (job_ptr->bit_flags & JOB_MEM_SET), |
| sockets_per_node, |
| num_tasks); |
| |
| license_set_job_tres_cnt(job_ptr->license_list, |
| job_ptr->tres_req_cnt, |
| true); |
| |
| /* FIXME: this assumes that all nodes have equal TRES */ |
| gres_stepmgr_set_job_tres_cnt( |
| job_ptr->gres_list_req, |
| node_cnt, |
| job_ptr->tres_req_cnt, |
| true); |
| |
| bb_g_job_set_tres_cnt(job_ptr, |
| job_ptr->tres_req_cnt, |
| true); |
| |
| /* |
| * Do this last as it calculates off of everything else. |
| * Don't use calc_job_billable_tres() as it relies on allocated tres |
| * If the partition was destroyed the part_ptr will be NULL. As this |
| * could be run on already finished jobs running in the assoc mgr |
| * cache. |
| */ |
| if (job_ptr->part_ptr) |
| job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] = |
| assoc_mgr_tres_weighted( |
| job_ptr->tres_req_cnt, |
| job_ptr->part_ptr->billing_weights, |
| slurm_conf.priority_flags, true); |
| |
| /* now that the array is filled lets make the string from it */ |
| set_job_tres_req_str(job_ptr, true); |
| |
| if (!assoc_mgr_locked) |
| assoc_mgr_unlock(&locks); |
| } |
| |
| extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked) |
| { |
| uint32_t alloc_nodes = 0; |
| assoc_mgr_lock_t locks = { .tres = READ_LOCK }; |
| |
| xfree(job_ptr->tres_alloc_str); |
| xfree(job_ptr->tres_alloc_cnt); |
| xfree(job_ptr->tres_fmt_alloc_str); |
| |
| /* |
| * We only need to do this on non-pending jobs. |
| * Requeued jobs are marked as PENDING|COMPLETING until the epilog is |
| * finished so we still need the alloc tres until then. |
| */ |
| if (IS_JOB_PENDING(job_ptr) && !IS_JOB_COMPLETING(job_ptr)) |
| return; |
| |
| if (!assoc_mgr_locked) |
| assoc_mgr_lock(&locks); |
| |
| job_ptr->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t)); |
| |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU] = (uint64_t)job_ptr->total_cpus; |
| |
| alloc_nodes = job_ptr->node_cnt; |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE] = (uint64_t)alloc_nodes; |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM] = |
| job_get_tres_mem(job_ptr->job_resrcs, |
| job_ptr->details->pn_min_memory, |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU], |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE], |
| job_ptr->part_ptr, |
| job_ptr->gres_list_req, |
| job_ptr->bit_flags & JOB_MEM_SET, |
| job_get_sockets_per_node(job_ptr), |
| job_ptr->details->num_tasks); |
| |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = NO_VAL64; |
| |
| license_set_job_tres_cnt(job_ptr->license_list, |
| job_ptr->tres_alloc_cnt, |
| true); |
| gres_stepmgr_set_job_tres_cnt( |
| job_ptr->gres_list_alloc, |
| alloc_nodes, |
| job_ptr->tres_alloc_cnt, |
| true); |
| |
| bb_g_job_set_tres_cnt(job_ptr, |
| job_ptr->tres_alloc_cnt, |
| true); |
| |
| /* Do this last as it calculates off of everything else. */ |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_BILLING] = |
| calc_job_billable_tres(job_ptr, job_ptr->start_time, true); |
| |
| /* now that the array is filled lets make the string from it */ |
| assoc_mgr_set_job_tres_alloc_str(job_ptr, true); |
| |
| if (!assoc_mgr_locked) |
| assoc_mgr_unlock(&locks); |
| } |
| |
| /* |
| * job_update_tres_cnt - when job is completing remove allocated tres |
| * from count. |
| * IN/OUT job_ptr - job structure to be updated |
| * IN node_inx - node bit that is finished with job. |
| * RET SLURM_SUCCES on success SLURM_ERROR on cpu_cnt underflow |
| */ |
| extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx) |
| { |
| int cpu_cnt, offset = -1, rc = SLURM_SUCCESS; |
| |
| xassert(job_ptr); |
| |
| if (job_ptr->details->whole_node & WHOLE_NODE_REQUIRED) { |
| /* |
| * Since we are allocating whole nodes don't rely on |
| * the job_resrcs since it could be less because the |
| * node could of only used 1 thread per core. |
| */ |
| node_record_t *node_ptr = |
| node_record_table_ptr[node_inx]; |
| cpu_cnt = node_ptr->config_ptr->cpus; |
| } else { |
| if ((offset = job_resources_node_inx_to_cpu_inx( |
| job_ptr->job_resrcs, node_inx)) < 0) { |
| error("%s: problem getting offset of %pJ", |
| __func__, job_ptr); |
| job_ptr->cpu_cnt = 0; |
| return SLURM_ERROR; |
| } |
| |
| cpu_cnt = job_ptr->job_resrcs->cpus[offset]; |
| } |
| if (cpu_cnt > job_ptr->cpu_cnt) { |
| error("%s: cpu_cnt underflow (%d > %u) on %pJ", __func__, |
| cpu_cnt, job_ptr->cpu_cnt, job_ptr); |
| job_ptr->cpu_cnt = 0; |
| rc = SLURM_ERROR; |
| } else |
| job_ptr->cpu_cnt -= cpu_cnt; |
| |
| if (IS_JOB_RESIZING(job_ptr)) { |
| if (cpu_cnt > job_ptr->total_cpus) { |
| error("%s: total_cpus underflow on %pJ", |
| __func__, job_ptr); |
| job_ptr->total_cpus = 0; |
| rc = SLURM_ERROR; |
| } else |
| job_ptr->total_cpus -= cpu_cnt; |
| |
| job_set_alloc_tres(job_ptr, false); |
| } |
| return rc; |
| } |
| |
| /* Terminate a job that has exhausted its time limit */ |
| static void _job_timed_out(job_record_t *job_ptr, bool preempted) |
| { |
| xassert(job_ptr); |
| |
| srun_timeout(job_ptr); |
| if (job_ptr->details) { |
| time_t now = time(NULL); |
| job_ptr->end_time = now; |
| job_ptr->time_last_active = now; |
| if (!job_ptr->preempt_time) |
| job_state_set(job_ptr, (JOB_TIMEOUT | JOB_COMPLETING)); |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, false); |
| deallocate_nodes(job_ptr, !preempted, false, preempted); |
| } else |
| job_signal(job_ptr, SIGKILL, 0, 0, false); |
| } |
| |
| /* _validate_job_desc - validate that a job descriptor for job submit or |
| * allocate has valid data, set values to defaults as required |
| * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed |
| * IN allocate - if clear job to be queued, if set allocate for user now |
| * IN submit_uid - who request originated |
| */ |
| static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate, |
| bool cron, uid_t submit_uid, |
| part_record_t *part_ptr, list_t *part_list) |
| { |
| if ((job_desc_msg->min_cpus == NO_VAL) && |
| (job_desc_msg->min_nodes == NO_VAL) && |
| (job_desc_msg->req_nodes == NULL)) { |
| info("%s: job specified no min_cpus, min_nodes or req_nodes", |
| __func__); |
| return ESLURM_JOB_MISSING_SIZE_SPECIFICATION; |
| } |
| if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) && |
| (job_desc_msg->script == NULL) && |
| !(job_desc_msg->bitflags & EXTERNAL_JOB)) { |
| info("%s: job failed to specify Script", __func__); |
| return ESLURM_JOB_SCRIPT_MISSING; |
| } |
| if (job_desc_msg->script && job_desc_msg->x11) { |
| info("%s: batch job cannot use X11 forwarding", __func__); |
| return ESLURM_X11_NOT_AVAIL; |
| } |
| if (job_desc_msg->user_id == NO_VAL) { |
| info("%s: job failed to specify User", __func__); |
| return ESLURM_USER_ID_MISSING; |
| } |
| if ( job_desc_msg->group_id == NO_VAL ) { |
| debug("%s: job failed to specify group", __func__); |
| return ESLURM_GROUP_ID_MISSING; |
| } |
| if (!job_desc_msg->container_id && !job_desc_msg->container && |
| (!job_desc_msg->work_dir || !job_desc_msg->work_dir[0])) { |
| debug("%s: job working directory has to be set", __func__); |
| return ESLURM_MISSING_WORK_DIR; |
| } |
| if ((job_desc_msg->warn_flags & KILL_JOB_RESV) && |
| (slurm_conf.preempt_mode == PREEMPT_MODE_OFF)) { |
| debug("%s: job specified \"R:\" option of --signal, which is incompatible with PreemptMode=OFF", |
| __func__); |
| return ESLURM_PREEMPTION_REQUIRED; |
| } |
| if (job_desc_msg->contiguous == NO_VAL16) |
| job_desc_msg->contiguous = 0; |
| |
| if (job_desc_msg->task_dist == NO_VAL) { |
| /* not typically set by salloc or sbatch */ |
| job_desc_msg->task_dist = SLURM_DIST_CYCLIC; |
| } |
| if (job_desc_msg->plane_size == NO_VAL16) |
| job_desc_msg->plane_size = 0; |
| |
| if (job_desc_msg->segment_size == NO_VAL16) |
| job_desc_msg->segment_size = 0; |
| |
| if (job_desc_msg->kill_on_node_fail == NO_VAL16) |
| job_desc_msg->kill_on_node_fail = 1; |
| |
| if (job_desc_msg->job_id != NO_VAL) { |
| job_record_t *dup_job_ptr; |
| if (!fed_mgr_fed_rec && |
| (submit_uid != 0) && |
| (submit_uid != slurm_conf.slurm_user_id)) { |
| info("attempt by uid %u to set JobId=%u", |
| submit_uid, job_desc_msg->job_id); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| if (job_desc_msg->job_id == 0) { |
| info("attempt by uid %u to set JobId=0", |
| submit_uid); |
| return ESLURM_INVALID_JOB_ID; |
| } |
| dup_job_ptr = find_job_record(job_desc_msg->job_id); |
| if (dup_job_ptr) { |
| info("attempt to reuse active %pJ", dup_job_ptr); |
| return ESLURM_DUPLICATE_JOB_ID; |
| } |
| } |
| |
| if (job_desc_msg->nice == NO_VAL) |
| job_desc_msg->nice = NICE_OFFSET; |
| |
| if (job_desc_msg->pn_min_memory == NO_VAL64) |
| job_desc_msg->pn_min_memory = _get_def_mem(part_ptr, NULL); |
| else if (!_validate_min_mem_partition(job_desc_msg, part_ptr, |
| part_list)) { |
| return ESLURM_INVALID_TASK_MEMORY; |
| } else { |
| /* Memory limit explicitly set by user */ |
| job_desc_msg->bitflags |= JOB_MEM_SET; |
| } |
| |
| job_desc_msg->bitflags &= ~BACKFILL_TEST; |
| job_desc_msg->bitflags &= ~BF_WHOLE_NODE_TEST; |
| job_desc_msg->bitflags &= ~JOB_ACCRUE_OVER; |
| job_desc_msg->bitflags &= ~JOB_KILL_HURRY; |
| job_desc_msg->bitflags &= ~SIB_JOB_FLUSH; |
| job_desc_msg->bitflags &= ~TRES_STR_CALC; |
| job_desc_msg->bitflags &= ~JOB_WAS_RUNNING; |
| if (!cron) |
| job_desc_msg->bitflags &= ~CRON_JOB; |
| |
| if (job_desc_msg->pn_min_memory == MEM_PER_CPU) { |
| /* Map --mem-per-cpu=0 to --mem=0 for simpler logic */ |
| job_desc_msg->pn_min_memory = 0; |
| } |
| |
| /* Validate a job's accounting frequency, if specified */ |
| if (acct_gather_check_acct_freq_task( |
| job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq)) |
| return ESLURMD_INVALID_ACCT_FREQ; |
| |
| if (job_desc_msg->min_nodes == NO_VAL) |
| job_desc_msg->min_nodes = 1; /* default node count of 1 */ |
| if (job_desc_msg->min_cpus == NO_VAL) |
| job_desc_msg->min_cpus = job_desc_msg->min_nodes; |
| |
| if ((job_desc_msg->pn_min_cpus == NO_VAL16) || |
| (job_desc_msg->pn_min_cpus == 0)) |
| job_desc_msg->pn_min_cpus = 1; /* default 1 cpu per node */ |
| if (job_desc_msg->pn_min_tmp_disk == NO_VAL) |
| job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */ |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_valid_pn_min_mem(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| foreach_valid_pn_min_mem_t *foreach_valid_pn_min_mem = arg; |
| job_desc_msg_t *job_desc_msg = foreach_valid_pn_min_mem->job_desc; |
| |
| foreach_valid_pn_min_mem->rc = |
| _valid_pn_min_mem(job_desc_msg, part_ptr); |
| |
| /* for ALL we have to test them all */ |
| if (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL) { |
| if (!foreach_valid_pn_min_mem->rc) |
| return -1; |
| } else if (foreach_valid_pn_min_mem->rc) /* break, we found one! */ |
| return -1; |
| else if (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ANY) { |
| debug("%s: Job requested for (%"PRIu64")MB is invalid for partition %s", |
| __func__, job_desc_msg->pn_min_memory, |
| part_ptr->name); |
| } |
| |
| job_desc_msg->pn_min_memory = foreach_valid_pn_min_mem->pn_min_memory; |
| job_desc_msg->cpus_per_task = foreach_valid_pn_min_mem->cpus_per_task; |
| job_desc_msg->min_cpus = foreach_valid_pn_min_mem->min_cpus; |
| job_desc_msg->max_cpus = foreach_valid_pn_min_mem->max_cpus; |
| job_desc_msg->pn_min_cpus = foreach_valid_pn_min_mem->pn_min_cpus; |
| |
| return 0; |
| } |
| |
| /* |
| * Traverse the list of partitions and invoke the |
| * function validating the job memory specification. |
| */ |
| static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg, |
| part_record_t *part_ptr, |
| list_t *part_list) |
| { |
| uint64_t tmp_pn_min_memory; |
| uint16_t tmp_cpus_per_task; |
| uint32_t tmp_min_cpus; |
| uint32_t tmp_max_cpus; |
| uint32_t tmp_pn_min_cpus; |
| bool cc = false; |
| |
| /* no reason to check them here as we aren't enforcing them */ |
| if (!slurm_conf.enforce_part_limits) |
| return true; |
| |
| tmp_pn_min_memory = job_desc_msg->pn_min_memory; |
| tmp_cpus_per_task = job_desc_msg->cpus_per_task; |
| tmp_min_cpus = job_desc_msg->min_cpus; |
| tmp_max_cpus = job_desc_msg->max_cpus; |
| tmp_pn_min_cpus = job_desc_msg->pn_min_cpus; |
| |
| if (part_list == NULL) { |
| cc = _valid_pn_min_mem(job_desc_msg, part_ptr); |
| } else { |
| foreach_valid_pn_min_mem_t foreach_valid_pn_min_mem = { |
| .cpus_per_task = tmp_cpus_per_task, |
| .job_desc = job_desc_msg, |
| .max_cpus = tmp_max_cpus, |
| .min_cpus = tmp_min_cpus, |
| .pn_min_cpus = tmp_pn_min_cpus, |
| .pn_min_memory = tmp_pn_min_memory, |
| }; |
| |
| (void) list_for_each(part_list, _foreach_valid_pn_min_mem, |
| &foreach_valid_pn_min_mem); |
| cc = foreach_valid_pn_min_mem.rc; |
| } |
| |
| /* |
| * Restoring original values, if it is necessary, |
| * these will be modified in job_limits_check() |
| */ |
| job_desc_msg->pn_min_memory = tmp_pn_min_memory; |
| job_desc_msg->cpus_per_task = tmp_cpus_per_task; |
| job_desc_msg->min_cpus = tmp_min_cpus; |
| job_desc_msg->max_cpus = tmp_max_cpus; |
| job_desc_msg->pn_min_cpus = tmp_pn_min_cpus; |
| |
| return cc; |
| } |
| |
| static void _delete_job_common(job_record_t *job_ptr) |
| { |
| if (!job_ptr->job_id) |
| return; |
| |
| /* Remove record from fed_job_list */ |
| fed_mgr_remove_fed_job_info(job_ptr->job_id); |
| |
| /* Remove the record from job hash table */ |
| _remove_job_hash(job_ptr, JOB_HASH_JOB); |
| |
| /* Remove the record from job array hash tables, if applicable */ |
| if (job_ptr->array_task_id != NO_VAL) { |
| _remove_job_hash(job_ptr, JOB_HASH_ARRAY_JOB); |
| _remove_job_hash(job_ptr, JOB_HASH_ARRAY_TASK); |
| } |
| } |
| |
| /* |
| * Remove the job record from hash tables and append to purge_jobs_list. |
| */ |
| static void _move_to_purge_jobs_list(void *job_entry) |
| { |
| job_record_t *job_ptr = job_entry; |
| int job_array_size; |
| |
| if (!job_entry) |
| return; |
| |
| xassert(job_ptr->magic == JOB_MAGIC); |
| |
| _delete_job_common(job_ptr); |
| |
| if (job_ptr->array_recs) { |
| job_array_size = MAX(1, job_ptr->array_recs->task_cnt); |
| } else if (!job_ptr->job_id) { /* reservation */ |
| job_array_size = 0; |
| } else { |
| job_array_size = 1; |
| } |
| |
| if (job_array_size > job_count) { |
| error("job_count underflow"); |
| job_count = 0; |
| } else { |
| job_count -= job_array_size; |
| } |
| |
| list_append(purge_jobs_list, job_ptr); |
| } |
| |
| /* |
| * find specific job_id entry in the job list, key is job_id_ptr |
| */ |
| static int _list_find_job_id(void *job_entry, void *key) |
| { |
| job_record_t *job_ptr = (job_record_t *) job_entry; |
| uint32_t *job_id_ptr = (uint32_t *) key; |
| |
| if (job_ptr->job_id == *job_id_ptr) |
| return 1; |
| |
| return 0; |
| } |
| |
| /* |
| * _list_find_job_old - find old entries in the job list, |
| * see common/list.h for documentation, key is ignored |
| * job_entry IN - job pointer |
| * key IN - if not NULL, then skip hetjobs |
| */ |
| static int _list_find_job_old(void *job_entry, void *key) |
| { |
| time_t kill_age, min_age, now = time(NULL); |
| job_record_t *job_ptr = (job_record_t *) job_entry; |
| |
| if ((job_ptr->job_id == NO_VAL) && IS_JOB_REVOKED(job_ptr)) |
| return 1; |
| |
| if (job_ptr->het_job_id && (job_ptr->bit_flags & HETJOB_PURGE)) |
| return 1; |
| |
| if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) { |
| kill_age = now - (slurm_conf.kill_wait + |
| 2 * slurm_conf.msg_timeout); |
| if (job_ptr->time_last_active < kill_age) { |
| job_ptr->time_last_active = now; |
| re_kill_job(job_ptr); |
| } |
| return 0; /* Job still completing */ |
| } |
| |
| if (job_ptr->epilog_running) |
| return 0; /* EpilogSlurmctld still running */ |
| |
| if (slurm_conf.min_job_age == 0) |
| return 0; /* No job record purging */ |
| |
| if (fed_mgr_fed_rec && job_ptr->fed_details && |
| !fed_mgr_is_origin_job(job_ptr)) { |
| uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id); |
| slurmdb_cluster_rec_t *origin = |
| fed_mgr_get_cluster_by_id(origin_id); |
| |
| /* keep job around until origin comes back and is synced */ |
| if (origin && |
| (!origin->fed.send || |
| !((persist_conn_t *) origin->fed.send)->tls_conn || |
| !origin->fed.sync_sent)) |
| return 0; |
| } |
| |
| min_age = now - slurm_conf.min_job_age; |
| if (job_ptr->end_time > min_age) |
| return 0; /* Too new to purge */ |
| |
| if (!(IS_JOB_COMPLETED(job_ptr))) |
| return 0; /* Job still active */ |
| |
| if (job_ptr->step_list && list_count(job_ptr->step_list)) { |
| debug("%pJ still has %d active steps", |
| job_ptr, list_count(job_ptr->step_list)); |
| /* |
| * If the job has been around more than 30 days the steps are |
| * bogus. Blow the job away. This was witnessed <= 16.05 but |
| * hasn't be seen since. This is here just to clear them out if |
| * this ever shows up again. |
| */ |
| min_age = now - PURGE_OLD_JOB_IN_SEC; |
| if (job_ptr->end_time <= min_age) { |
| info("Force purge of %pJ. It ended over 30 days ago, the slurmctld thinks there are still steps running but they are most likely bogus. In any case you might want to check nodes %s to make sure nothing remains of the job.", |
| job_ptr, job_ptr->nodes); |
| goto end_it; |
| } else |
| return 0; /* steps are still active */ |
| } |
| |
| if (job_ptr->array_recs) { |
| if (job_ptr->array_recs->tot_run_tasks || |
| !_test_job_array_purged(job_ptr->array_job_id)) { |
| /* Some tasks from this job array still active */ |
| return 0; |
| } |
| } |
| |
| if (bb_g_job_test_stage_out(job_ptr) != 1) |
| return 0; /* Stage out in progress */ |
| |
| end_it: |
| |
| return 1; /* Purge the job */ |
| } |
| |
| static int _foreach_is_part_visible(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| part_record_t **visible_parts = arg; |
| |
| for (int i = 0; visible_parts[i]; i++) { |
| if (visible_parts[i] == part_ptr) { |
| return -1; |
| } |
| } |
| return 0; |
| } |
| |
| /* Determine if ALL partitions associated with a job are hidden */ |
| static bool _all_parts_hidden(job_record_t *job_ptr, |
| part_record_t **visible_parts) |
| { |
| if (job_ptr->part_ptr_list) { |
| if (list_find_first(part_list, _foreach_is_part_visible, |
| visible_parts)) |
| return false; |
| return true; |
| } |
| |
| if (job_ptr->part_ptr) { |
| if (_foreach_is_part_visible(job_ptr->part_ptr, visible_parts)) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* Determine if a given job should be seen by a specific user */ |
| static bool _hide_job_user_rec(job_record_t *job_ptr, slurmdb_user_rec_t *user, |
| uint16_t show_flags) |
| { |
| if (!job_ptr) |
| return true; |
| |
| if ((slurm_conf.private_data & PRIVATE_DATA_JOBS) && |
| (job_ptr->user_id != user->uid) && |
| (((slurm_mcs_get_privatedata() == 0) && |
| !assoc_mgr_is_user_acct_coord_user_rec(user, job_ptr->account)) || |
| ((slurm_mcs_get_privatedata() == 1) && |
| (mcs_g_check_mcs_label(user->uid, job_ptr->mcs_label, |
| true) != 0)))) |
| return true; |
| return false; |
| } |
| |
| static int _pack_job(void *object, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *)object; |
| _foreach_pack_job_info_t *pack_info = (_foreach_pack_job_info_t *)arg; |
| |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| if ((pack_info->filter_uid != NO_VAL) && |
| (pack_info->filter_uid != job_ptr->user_id)) |
| return SLURM_SUCCESS; |
| |
| if (!(pack_info->show_flags & SHOW_ALL) && IS_JOB_REVOKED(job_ptr)) |
| return SLURM_SUCCESS; |
| |
| if (!pack_info->privileged) { |
| if (((pack_info->show_flags & SHOW_ALL) == 0) && |
| _all_parts_hidden(job_ptr, pack_info->visible_parts)) |
| return SLURM_SUCCESS; |
| |
| if (_hide_job_user_rec(job_ptr, &pack_info->user_rec, |
| pack_info->show_flags)) |
| return SLURM_SUCCESS; |
| } |
| |
| pack_job(job_ptr, pack_info->show_flags, pack_info->buffer, |
| pack_info->protocol_version, pack_info->uid, |
| pack_info->has_qos_lock); |
| |
| pack_info->jobs_packed++; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_pack_het_job(void *x, void *arg) |
| { |
| job_record_t *het_job_ptr = x; |
| _foreach_pack_job_info_t *pack_info = arg; |
| |
| xassert(pack_info->het_leader); |
| xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK)); |
| if (het_job_ptr->het_job_id != pack_info->het_leader->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", __func__, |
| pack_info->het_leader); |
| return 0; |
| } |
| |
| pack_job(het_job_ptr, pack_info->show_flags, pack_info->buffer, |
| pack_info->protocol_version, pack_info->uid, |
| pack_info->has_qos_lock); |
| |
| pack_info->jobs_packed++; |
| |
| return 0; |
| } |
| |
| static int _foreach_pack_jobid(void *object, void *arg) |
| { |
| job_record_t *job_ptr; |
| uint32_t job_id = *(uint32_t *)object; |
| _foreach_pack_job_info_t *info = (_foreach_pack_job_info_t *)arg; |
| |
| if (!(job_ptr = find_job_record(job_id))) |
| return SLURM_SUCCESS; |
| |
| return _pack_job(job_ptr, info); |
| } |
| |
| /* |
| * _pack_init_job_info - create buffer with header packed for a job_info_msg_t |
| * |
| * NOTE: change _unpack_job_info_msg() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| static buf_t *_pack_init_job_info(uint16_t protocol_version) |
| { |
| buf_t *buffer = init_buf(BUF_SIZE); |
| |
| /* write message body header : size and time */ |
| /* put in a place holder job record count of 0 for now */ |
| if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| pack32(0, buffer); |
| pack_time(time(NULL), buffer); |
| pack_time(slurmctld_diag_stats.bf_when_last_cycle, buffer); |
| } |
| |
| return buffer; |
| } |
| |
| /* |
| * pack_all_jobs - dump all job information for all jobs in |
| * machine independent form (for network transmission) |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * IN filter_uid - pack only jobs belonging to this user if not NO_VAL |
| * OUT buffer |
| * global: job_list - global list of job records |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| */ |
| extern buf_t *pack_all_jobs(uint16_t show_flags, uid_t uid, uint32_t filter_uid, |
| uint16_t protocol_version) |
| { |
| uint32_t tmp_offset; |
| _foreach_pack_job_info_t pack_info = { |
| .buffer = _pack_init_job_info(protocol_version), |
| .filter_uid = filter_uid, |
| .jobs_packed = 0, |
| .protocol_version = protocol_version, |
| .show_flags = show_flags, |
| .uid = uid, |
| .has_qos_lock = true, |
| .user_rec.uid = uid, |
| }; |
| assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .user = READ_LOCK, |
| .qos = READ_LOCK }; |
| |
| assoc_mgr_lock(&locks); |
| assoc_mgr_fill_in_user(acct_db_conn, &pack_info.user_rec, |
| accounting_enforce, NULL, true); |
| pack_info.privileged = validate_operator_user_rec(&pack_info.user_rec); |
| pack_info.visible_parts = build_visible_parts( |
| uid, (pack_info.privileged || (show_flags & SHOW_ALL))); |
| list_for_each_ro(job_list, _pack_job, &pack_info); |
| assoc_mgr_unlock(&locks); |
| |
| /* put the real record count in the message body header */ |
| tmp_offset = get_buf_offset(pack_info.buffer); |
| set_buf_offset(pack_info.buffer, 0); |
| pack32(pack_info.jobs_packed, pack_info.buffer); |
| set_buf_offset(pack_info.buffer, tmp_offset); |
| |
| xfree(pack_info.visible_parts); |
| |
| return pack_info.buffer; |
| } |
| |
| /* |
| * pack_spec_jobs - dump job information for specified jobs in |
| * machine independent form (for network transmission) |
| * IN show_flags - job filtering options |
| * IN job_ids - list of job_ids to pack |
| * IN uid - uid of user making request (for partition filtering) |
| * IN filter_uid - pack only jobs belonging to this user if not NO_VAL |
| * OUT buffer |
| * global: job_list - global list of job records |
| * NOTE: the buffer at *buffer_ptr must be xfreed by the caller |
| */ |
| extern buf_t *pack_spec_jobs(list_t *job_ids, uint16_t show_flags, uid_t uid, |
| uint32_t filter_uid, uint16_t protocol_version) |
| { |
| uint32_t tmp_offset; |
| _foreach_pack_job_info_t pack_info = { |
| .buffer = _pack_init_job_info(protocol_version), |
| .filter_uid = filter_uid, |
| .jobs_packed = 0, |
| .protocol_version = protocol_version, |
| .show_flags = show_flags, |
| .uid = uid, |
| .has_qos_lock = true, |
| .user_rec.uid = uid, |
| }; |
| assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .user = READ_LOCK, |
| .qos = READ_LOCK }; |
| |
| xassert(job_ids); |
| |
| assoc_mgr_lock(&locks); |
| assoc_mgr_fill_in_user(acct_db_conn, &pack_info.user_rec, |
| accounting_enforce, NULL, true); |
| pack_info.privileged = validate_operator_user_rec(&pack_info.user_rec); |
| pack_info.visible_parts = build_visible_parts( |
| uid, (pack_info.privileged || (show_flags & SHOW_ALL))); |
| list_for_each_ro(job_ids, _foreach_pack_jobid, &pack_info); |
| assoc_mgr_unlock(&locks); |
| |
| /* put the real record count in the message body header */ |
| tmp_offset = get_buf_offset(pack_info.buffer); |
| set_buf_offset(pack_info.buffer, 0); |
| pack32(pack_info.jobs_packed, pack_info.buffer); |
| set_buf_offset(pack_info.buffer, tmp_offset); |
| |
| xfree(pack_info.visible_parts); |
| |
| return pack_info.buffer; |
| } |
| |
| /* |
| * pack_one_job - dump information for one jobs in |
| * machine independent form (for network transmission) |
| * IN job_id - ID of job that we want info for |
| * IN show_flags - job filtering options |
| * IN uid - uid of user making request (for partition filtering) |
| * OUT buffer |
| */ |
| extern buf_t *pack_one_job(uint32_t job_id, uint16_t show_flags, uid_t uid, |
| uint16_t protocol_version) |
| { |
| job_record_t *job_ptr; |
| uint32_t jobs_packed = 0, tmp_offset; |
| buf_t *buffer; |
| assoc_mgr_lock_t locks = { .qos = READ_LOCK, .user = READ_LOCK }; |
| slurmdb_user_rec_t user_rec = { 0 }; |
| bool hide_job = false; |
| bool valid_operator; |
| |
| buffer = _pack_init_job_info(protocol_version); |
| |
| assoc_mgr_lock(&locks); |
| user_rec.uid = uid; |
| assoc_mgr_fill_in_user(acct_db_conn, &user_rec, |
| accounting_enforce, NULL, true); |
| |
| job_ptr = find_job_record(job_id); |
| |
| if (!(valid_operator = validate_operator_user_rec(&user_rec))) |
| hide_job = _hide_job_user_rec(job_ptr, &user_rec, show_flags); |
| |
| if (!(show_flags & SHOW_ALL) && job_ptr && IS_JOB_REVOKED(job_ptr)) |
| hide_job = true; |
| |
| if (job_ptr && job_ptr->het_job_list) { |
| /* Pack heterogeneous job components */ |
| if (!hide_job) { |
| _foreach_pack_job_info_t pack_info = { |
| .buffer = buffer, |
| .het_leader = job_ptr, |
| .jobs_packed = 0, |
| .protocol_version = protocol_version, |
| .show_flags = show_flags, |
| .uid = uid, |
| .has_qos_lock = true, |
| }; |
| (void) list_for_each(job_ptr->het_job_list, |
| _foreach_pack_het_job, |
| &pack_info); |
| |
| jobs_packed = pack_info.jobs_packed; |
| buffer = pack_info.buffer; |
| } |
| } else if (job_ptr && (job_ptr->array_task_id == NO_VAL) && |
| !job_ptr->array_recs) { |
| /* Pack regular (not array) job */ |
| if (!hide_job) { |
| pack_job(job_ptr, show_flags, buffer, protocol_version, |
| uid, true); |
| jobs_packed++; |
| } |
| } else { |
| bool packed_head = false; |
| |
| /* Either the job is not found or it is a job array */ |
| if (job_ptr) { |
| packed_head = true; |
| if (!hide_job) { |
| pack_job(job_ptr, show_flags, buffer, |
| protocol_version, uid, true); |
| jobs_packed++; |
| } |
| } |
| |
| job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; |
| while (job_ptr) { |
| if ((job_ptr->job_id == job_id) && packed_head) { |
| ; /* Already packed */ |
| } else if (!(show_flags & SHOW_ALL) && |
| IS_JOB_REVOKED(job_ptr)) { |
| /* |
| * Array jobs can't be federated but to be |
| * consistent and future proof, don't pack |
| * revoked array jobs. |
| */ |
| } else if (job_ptr->array_job_id == job_id) { |
| if (valid_operator || |
| !_hide_job_user_rec(job_ptr, &user_rec, |
| show_flags)) { |
| pack_job(job_ptr, show_flags, buffer, |
| protocol_version, uid, true); |
| jobs_packed++; |
| } |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| } |
| |
| assoc_mgr_unlock(&locks); |
| |
| if (jobs_packed == 0) { |
| FREE_NULL_BUFFER(buffer); |
| return NULL; |
| } |
| |
| /* put the real record count in the message body header */ |
| tmp_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, 0); |
| pack32(jobs_packed, buffer); |
| set_buf_offset(buffer, tmp_offset); |
| |
| return buffer; |
| } |
| |
| static void _pack_job_gres(job_record_t *dump_job_ptr, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| if (!IS_JOB_STARTED(dump_job_ptr) || IS_JOB_FINISHED(dump_job_ptr) || |
| (dump_job_ptr->gres_list_req == NULL)) { |
| packstr_array(NULL, 0, buffer); |
| return; |
| } |
| |
| packstr_array(dump_job_ptr->gres_detail_str, |
| dump_job_ptr->gres_detail_cnt, buffer); |
| } |
| |
| /* |
| * pack_job - dump all configuration information about a specific job in |
| * machine independent form (for network transmission) |
| * IN dump_job_ptr - pointer to job for which information is requested |
| * IN show_flags - job filtering options |
| * IN/OUT buffer - buffer in which data is placed, pointers automatically |
| * updated |
| * IN uid - user requesting the data |
| * NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c |
| * whenever the data format changes |
| */ |
| void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags, buf_t *buffer, |
| uint16_t protocol_version, uid_t uid, bool has_qos_lock) |
| { |
| job_details_t *detail_ptr; |
| time_t accrue_time = 0, begin_time = 0, start_time = 0, end_time = 0; |
| uint32_t time_limit; |
| char *nodelist = NULL; |
| assoc_mgr_lock_t locks = { .qos = READ_LOCK }; |
| xassert(!has_qos_lock || verify_assoc_lock(QOS_LOCK, READ_LOCK)); |
| |
| /* |
| * NOTE: There are nested pack blocks in |
| * job_record_pack_details_common() and |
| * job_record_pack_details_common(). Bump this protocol block when |
| * bumping the blocks in these functions to help keep symmetry between |
| * pack and unpacks. |
| */ |
| if (protocol_version >= SLURM_25_05_PROTOCOL_VERSION) { |
| job_record_pack_common(dump_job_ptr, false, buffer, |
| protocol_version); |
| |
| if (dump_job_ptr->array_recs) { |
| build_array_str(dump_job_ptr); |
| packstr(dump_job_ptr->array_recs->task_id_str, buffer); |
| pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); |
| } else { |
| job_record_t *array_head = NULL; |
| packnull(buffer); |
| if (dump_job_ptr->array_job_id) { |
| array_head = find_job_record( |
| dump_job_ptr->array_job_id); |
| } |
| if (array_head && array_head->array_recs) { |
| pack32(array_head->array_recs->max_run_tasks, |
| buffer); |
| } else { |
| pack32(0, buffer); |
| } |
| } |
| if ((dump_job_ptr->time_limit == NO_VAL) && |
| dump_job_ptr->part_ptr) |
| time_limit = dump_job_ptr->part_ptr->max_time; |
| else |
| time_limit = dump_job_ptr->time_limit; |
| |
| pack32(time_limit, buffer); |
| |
| if (IS_JOB_STARTED(dump_job_ptr)) { |
| /* Report actual start time, in past */ |
| start_time = dump_job_ptr->start_time; |
| end_time = dump_job_ptr->end_time; |
| } else if (dump_job_ptr->start_time != 0) { |
| /* |
| * Report expected start time, |
| * making sure that time is not in the past |
| */ |
| start_time = MAX(dump_job_ptr->start_time, time(NULL)); |
| if (time_limit != NO_VAL) { |
| end_time = MAX(dump_job_ptr->end_time, |
| (start_time + time_limit * 60)); |
| } |
| } else if (dump_job_ptr->details->begin_time > time(NULL)) { |
| /* earliest start time in the future */ |
| start_time = dump_job_ptr->details->begin_time; |
| if (time_limit != NO_VAL) { |
| end_time = MAX(dump_job_ptr->end_time, |
| (start_time + time_limit * 60)); |
| } |
| } |
| pack_time(start_time, buffer); |
| pack_time(end_time, buffer); |
| |
| if (dump_job_ptr->prio_mult) { |
| pack32_array(dump_job_ptr->prio_mult->priority_array, |
| (dump_job_ptr->prio_mult->priority_array) ? |
| list_count(dump_job_ptr->part_ptr_list) : |
| 0, buffer); |
| packstr(dump_job_ptr->prio_mult->priority_array_names, |
| buffer); |
| } else { |
| packnull(buffer); |
| packnull(buffer); |
| } |
| |
| packstr(slurm_conf.cluster_name, buffer); |
| |
| /* |
| * Only send the allocated nodelist since we are only sending |
| * the number of cpus and nodes that are currently allocated. |
| */ |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| packstr(dump_job_ptr->nodes, buffer); |
| else { |
| nodelist = bitmap2node_name( |
| dump_job_ptr->node_bitmap_cg); |
| packstr(nodelist, buffer); |
| xfree(nodelist); |
| } |
| packstr(dump_job_ptr->sched_nodes, buffer); |
| |
| if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) |
| packstr(dump_job_ptr->part_ptr->name, buffer); |
| else |
| packstr(dump_job_ptr->partition, buffer); |
| |
| if (IS_JOB_PENDING(dump_job_ptr) && |
| dump_job_ptr->details->qos_req) |
| packstr(dump_job_ptr->details->qos_req, buffer); |
| else { |
| if (!has_qos_lock) |
| assoc_mgr_lock(&locks); |
| if (dump_job_ptr->qos_ptr) |
| packstr(dump_job_ptr->qos_ptr->name, buffer); |
| else { |
| if (assoc_mgr_qos_list) { |
| packstr(slurmdb_qos_str( |
| assoc_mgr_qos_list, |
| dump_job_ptr->qos_id), |
| buffer); |
| } else |
| packnull(buffer); |
| } |
| } |
| |
| if (IS_JOB_STARTED(dump_job_ptr) && |
| (slurm_conf.preempt_mode != PREEMPT_MODE_OFF) && |
| (slurm_job_preempt_mode(dump_job_ptr) != |
| PREEMPT_MODE_OFF)) { |
| time_t preemptable = acct_policy_get_preemptable_time( |
| dump_job_ptr); |
| pack_time(preemptable, buffer); |
| } else { |
| pack_time(0, buffer); |
| } |
| if (!has_qos_lock) |
| assoc_mgr_unlock(&locks); |
| |
| if (show_flags & SHOW_DETAIL) { |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| protocol_version); |
| _pack_job_gres(dump_job_ptr, buffer, protocol_version); |
| } else { |
| pack32(NO_VAL, buffer); |
| pack32((uint32_t)0, buffer); |
| } |
| |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer); |
| else |
| pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer); |
| |
| /* A few details are always dumped here */ |
| _pack_default_job_details(dump_job_ptr, buffer, |
| protocol_version); |
| |
| /* |
| * other job details are only dumped until the job starts |
| * running (at which time they become meaningless) |
| */ |
| _pack_pending_job_details(dump_job_ptr->details, |
| buffer, protocol_version); |
| } else if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) { |
| job_record_pack_common(dump_job_ptr, false, buffer, |
| protocol_version); |
| |
| if (dump_job_ptr->array_recs) { |
| build_array_str(dump_job_ptr); |
| packstr(dump_job_ptr->array_recs->task_id_str, buffer); |
| pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); |
| } else { |
| job_record_t *array_head = NULL; |
| packnull(buffer); |
| if (dump_job_ptr->array_job_id) { |
| array_head = find_job_record( |
| dump_job_ptr->array_job_id); |
| } |
| if (array_head && array_head->array_recs) { |
| pack32(array_head->array_recs->max_run_tasks, |
| buffer); |
| } else { |
| pack32(0, buffer); |
| } |
| } |
| if ((dump_job_ptr->time_limit == NO_VAL) && |
| dump_job_ptr->part_ptr) |
| time_limit = dump_job_ptr->part_ptr->max_time; |
| else |
| time_limit = dump_job_ptr->time_limit; |
| |
| pack32(time_limit, buffer); |
| |
| if (IS_JOB_STARTED(dump_job_ptr)) { |
| /* Report actual start time, in past */ |
| start_time = dump_job_ptr->start_time; |
| end_time = dump_job_ptr->end_time; |
| } else if (dump_job_ptr->start_time != 0) { |
| /* |
| * Report expected start time, |
| * making sure that time is not in the past |
| */ |
| start_time = MAX(dump_job_ptr->start_time, time(NULL)); |
| if (time_limit != NO_VAL) { |
| end_time = MAX(dump_job_ptr->end_time, |
| (start_time + time_limit * 60)); |
| } |
| } else if (dump_job_ptr->details->begin_time > time(NULL)) { |
| /* earliest start time in the future */ |
| start_time = dump_job_ptr->details->begin_time; |
| if (time_limit != NO_VAL) { |
| end_time = MAX(dump_job_ptr->end_time, |
| (start_time + time_limit * 60)); |
| } |
| } |
| pack_time(start_time, buffer); |
| pack_time(end_time, buffer); |
| |
| if (dump_job_ptr->prio_mult) { |
| pack32_array(dump_job_ptr->prio_mult->priority_array, |
| (dump_job_ptr->prio_mult->priority_array) ? |
| list_count(dump_job_ptr->part_ptr_list) : |
| 0, buffer); |
| packstr(dump_job_ptr->prio_mult->priority_array_names, |
| buffer); |
| } else { |
| packnull(buffer); |
| packnull(buffer); |
| } |
| |
| packstr(slurm_conf.cluster_name, buffer); |
| |
| /* |
| * Only send the allocated nodelist since we are only sending |
| * the number of cpus and nodes that are currently allocated. |
| */ |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| packstr(dump_job_ptr->nodes, buffer); |
| else { |
| nodelist = bitmap2node_name( |
| dump_job_ptr->node_bitmap_cg); |
| packstr(nodelist, buffer); |
| xfree(nodelist); |
| } |
| packstr(dump_job_ptr->sched_nodes, buffer); |
| |
| if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) |
| packstr(dump_job_ptr->part_ptr->name, buffer); |
| else |
| packstr(dump_job_ptr->partition, buffer); |
| |
| if (IS_JOB_PENDING(dump_job_ptr) && |
| dump_job_ptr->details->qos_req) |
| packstr(dump_job_ptr->details->qos_req, buffer); |
| else { |
| if (!has_qos_lock) |
| assoc_mgr_lock(&locks); |
| if (dump_job_ptr->qos_ptr) |
| packstr(dump_job_ptr->qos_ptr->name, buffer); |
| else { |
| if (assoc_mgr_qos_list) { |
| packstr(slurmdb_qos_str( |
| assoc_mgr_qos_list, |
| dump_job_ptr->qos_id), |
| buffer); |
| } else |
| packnull(buffer); |
| } |
| } |
| |
| if (IS_JOB_STARTED(dump_job_ptr) && |
| (slurm_conf.preempt_mode != PREEMPT_MODE_OFF) && |
| (slurm_job_preempt_mode(dump_job_ptr) != |
| PREEMPT_MODE_OFF)) { |
| time_t preemptable = acct_policy_get_preemptable_time( |
| dump_job_ptr); |
| pack_time(preemptable, buffer); |
| } else { |
| pack_time(0, buffer); |
| } |
| if (!has_qos_lock) |
| assoc_mgr_unlock(&locks); |
| |
| if (show_flags & SHOW_DETAIL) { |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| protocol_version); |
| _pack_job_gres(dump_job_ptr, buffer, protocol_version); |
| } else { |
| pack32(NO_VAL, buffer); |
| pack32((uint32_t)0, buffer); |
| } |
| |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer); |
| else |
| pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer); |
| |
| /* A few details are always dumped here */ |
| _pack_default_job_details(dump_job_ptr, buffer, |
| protocol_version); |
| |
| /* |
| * other job details are only dumped until the job starts |
| * running (at which time they become meaningless) |
| */ |
| _pack_pending_job_details(dump_job_ptr->details, |
| buffer, protocol_version); |
| } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| detail_ptr = dump_job_ptr->details; |
| pack32(dump_job_ptr->array_job_id, buffer); |
| pack32(dump_job_ptr->array_task_id, buffer); |
| if (dump_job_ptr->array_recs) { |
| build_array_str(dump_job_ptr); |
| packstr(dump_job_ptr->array_recs->task_id_str, buffer); |
| pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); |
| } else { |
| job_record_t *array_head = NULL; |
| packnull(buffer); |
| if (dump_job_ptr->array_job_id) { |
| array_head = find_job_record( |
| dump_job_ptr->array_job_id); |
| } |
| if (array_head && array_head->array_recs) { |
| pack32(array_head->array_recs->max_run_tasks, |
| buffer); |
| } else { |
| pack32(0, buffer); |
| } |
| } |
| |
| pack32(dump_job_ptr->assoc_id, buffer); |
| packstr(dump_job_ptr->container, buffer); |
| packstr(dump_job_ptr->container_id, buffer); |
| pack32(dump_job_ptr->delay_boot, buffer); |
| packstr(dump_job_ptr->failed_node, buffer); |
| pack32(dump_job_ptr->job_id, buffer); |
| pack32(dump_job_ptr->user_id, buffer); |
| pack32(dump_job_ptr->group_id, buffer); |
| pack32(dump_job_ptr->het_job_id, buffer); |
| packstr(dump_job_ptr->het_job_id_set, buffer); |
| pack32(dump_job_ptr->het_job_offset, buffer); |
| pack32(dump_job_ptr->profile, buffer); |
| |
| pack32(dump_job_ptr->job_state, buffer); |
| pack16(dump_job_ptr->batch_flag, buffer); |
| pack32(dump_job_ptr->state_reason, buffer); |
| pack8(0, buffer); /* was power_flags */ |
| pack8(dump_job_ptr->reboot, buffer); |
| pack16(dump_job_ptr->restart_cnt, buffer); |
| pack16(show_flags, buffer); |
| pack_time(dump_job_ptr->deadline, buffer); |
| |
| pack32(dump_job_ptr->alloc_sid, buffer); |
| if ((dump_job_ptr->time_limit == NO_VAL) && |
| dump_job_ptr->part_ptr) |
| time_limit = dump_job_ptr->part_ptr->max_time; |
| else |
| time_limit = dump_job_ptr->time_limit; |
| |
| pack32(time_limit, buffer); |
| pack32(dump_job_ptr->time_min, buffer); |
| |
| if (dump_job_ptr->details) { |
| pack32(dump_job_ptr->details->nice, buffer); |
| pack_time(dump_job_ptr->details->submit_time, buffer); |
| /* Earliest possible begin time */ |
| begin_time = dump_job_ptr->details->begin_time; |
| /* When we started accruing time for priority */ |
| accrue_time = dump_job_ptr->details->accrue_time; |
| } else { /* Some job details may be purged after completion */ |
| pack32(NICE_OFFSET, buffer); /* Best guess */ |
| pack_time((time_t)0, buffer); |
| } |
| |
| pack_time(begin_time, buffer); |
| pack_time(accrue_time, buffer); |
| |
| if (IS_JOB_STARTED(dump_job_ptr)) { |
| /* Report actual start time, in past */ |
| start_time = dump_job_ptr->start_time; |
| end_time = dump_job_ptr->end_time; |
| } else if (dump_job_ptr->start_time != 0) { |
| /* |
| * Report expected start time, |
| * making sure that time is not in the past |
| */ |
| start_time = MAX(dump_job_ptr->start_time, time(NULL)); |
| if (time_limit != NO_VAL) { |
| end_time = MAX(dump_job_ptr->end_time, |
| (start_time + time_limit * 60)); |
| } |
| } else if (begin_time > time(NULL)) { |
| /* earliest start time in the future */ |
| start_time = begin_time; |
| if (time_limit != NO_VAL) { |
| end_time = MAX(dump_job_ptr->end_time, |
| (start_time + time_limit * 60)); |
| } |
| } |
| pack_time(start_time, buffer); |
| pack_time(end_time, buffer); |
| |
| pack_time(dump_job_ptr->suspend_time, buffer); |
| pack_time(dump_job_ptr->pre_sus_time, buffer); |
| pack_time(dump_job_ptr->resize_time, buffer); |
| pack_time(dump_job_ptr->last_sched_eval, buffer); |
| pack_time(dump_job_ptr->preempt_time, buffer); |
| pack32(dump_job_ptr->priority, buffer); |
| if (dump_job_ptr->prio_mult) { |
| pack32_array(dump_job_ptr->prio_mult->priority_array, |
| (dump_job_ptr->prio_mult->priority_array) ? |
| list_count(dump_job_ptr->part_ptr_list) : |
| 0, buffer); |
| packstr(dump_job_ptr->prio_mult->priority_array_names, |
| buffer); |
| } else { |
| packnull(buffer); |
| packnull(buffer); |
| } |
| packdouble(dump_job_ptr->billable_tres, buffer); |
| |
| packstr(slurm_conf.cluster_name, buffer); |
| /* |
| * Only send the allocated nodelist since we are only sending |
| * the number of cpus and nodes that are currently allocated. |
| */ |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| packstr(dump_job_ptr->nodes, buffer); |
| else { |
| nodelist = bitmap2node_name( |
| dump_job_ptr->node_bitmap_cg); |
| packstr(nodelist, buffer); |
| xfree(nodelist); |
| } |
| |
| packstr(dump_job_ptr->sched_nodes, buffer); |
| |
| if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) |
| packstr(dump_job_ptr->part_ptr->name, buffer); |
| else |
| packstr(dump_job_ptr->partition, buffer); |
| packstr(dump_job_ptr->account, buffer); |
| packstr(dump_job_ptr->admin_comment, buffer); |
| pack32(dump_job_ptr->site_factor, buffer); |
| packstr(dump_job_ptr->network, buffer); |
| packstr(dump_job_ptr->comment, buffer); |
| packstr(dump_job_ptr->extra, buffer); |
| packstr(dump_job_ptr->container, buffer); |
| packstr(dump_job_ptr->batch_features, buffer); |
| packstr(dump_job_ptr->batch_host, buffer); |
| packstr(dump_job_ptr->burst_buffer, buffer); |
| packstr(dump_job_ptr->burst_buffer_state, buffer); |
| packstr(dump_job_ptr->system_comment, buffer); |
| |
| if (!has_qos_lock) |
| assoc_mgr_lock(&locks); |
| if (dump_job_ptr->qos_ptr) |
| packstr(dump_job_ptr->qos_ptr->name, buffer); |
| else { |
| if (assoc_mgr_qos_list) { |
| packstr(slurmdb_qos_str(assoc_mgr_qos_list, |
| dump_job_ptr->qos_id), |
| buffer); |
| } else |
| packnull(buffer); |
| } |
| |
| if (IS_JOB_STARTED(dump_job_ptr) && |
| (slurm_conf.preempt_mode != PREEMPT_MODE_OFF) && |
| (slurm_job_preempt_mode(dump_job_ptr) != |
| PREEMPT_MODE_OFF)) { |
| time_t preemptable = acct_policy_get_preemptable_time( |
| dump_job_ptr); |
| pack_time(preemptable, buffer); |
| } else { |
| pack_time(0, buffer); |
| } |
| if (!has_qos_lock) |
| assoc_mgr_unlock(&locks); |
| |
| packstr(dump_job_ptr->licenses, buffer); |
| packstr(dump_job_ptr->state_desc, buffer); |
| packstr(dump_job_ptr->resv_name, buffer); |
| packstr(dump_job_ptr->resv_ports, buffer); |
| packstr(dump_job_ptr->mcs_label, buffer); |
| |
| pack32(dump_job_ptr->exit_code, buffer); |
| pack32(dump_job_ptr->derived_ec, buffer); |
| |
| packstr(dump_job_ptr->gres_used, buffer); |
| if (show_flags & SHOW_DETAIL) { |
| pack_job_resources(dump_job_ptr->job_resrcs, buffer, |
| protocol_version); |
| _pack_job_gres(dump_job_ptr, buffer, protocol_version); |
| } else { |
| pack32(NO_VAL, buffer); |
| pack32((uint32_t)0, buffer); |
| } |
| |
| packstr(dump_job_ptr->name, buffer); |
| packstr(dump_job_ptr->user_name, buffer); |
| packstr(dump_job_ptr->wckey, buffer); |
| pack32(dump_job_ptr->req_switch, buffer); |
| pack32(dump_job_ptr->wait4switch, buffer); |
| |
| packstr(dump_job_ptr->alloc_node, buffer); |
| if (!IS_JOB_COMPLETING(dump_job_ptr)) |
| pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer); |
| else |
| pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer); |
| |
| /* A few details are always dumped here */ |
| _pack_default_job_details(dump_job_ptr, buffer, |
| protocol_version); |
| |
| /* |
| * other job details are only dumped until the job starts |
| * running (at which time they become meaningless) |
| */ |
| if (detail_ptr) |
| _pack_pending_job_details(detail_ptr, buffer, |
| protocol_version); |
| else |
| _pack_pending_job_details(NULL, buffer, |
| protocol_version); |
| pack64(dump_job_ptr->bit_flags, buffer); |
| packstr(dump_job_ptr->tres_fmt_alloc_str, buffer); |
| packstr(dump_job_ptr->tres_fmt_req_str, buffer); |
| pack16(dump_job_ptr->start_protocol_ver, buffer); |
| |
| if (dump_job_ptr->fed_details) { |
| packstr(dump_job_ptr->fed_details->origin_str, buffer); |
| pack64(dump_job_ptr->fed_details->siblings_active, |
| buffer); |
| packstr(dump_job_ptr->fed_details->siblings_active_str, |
| buffer); |
| pack64(dump_job_ptr->fed_details->siblings_viable, |
| buffer); |
| packstr(dump_job_ptr->fed_details->siblings_viable_str, |
| buffer); |
| } else { |
| packnull(buffer); |
| pack64((uint64_t)0, buffer); |
| packnull(buffer); |
| pack64((uint64_t)0, buffer); |
| packnull(buffer); |
| } |
| |
| packstr(dump_job_ptr->cpus_per_tres, buffer); |
| packstr(dump_job_ptr->mem_per_tres, buffer); |
| packstr(dump_job_ptr->tres_bind, buffer); |
| packstr(dump_job_ptr->tres_freq, buffer); |
| packstr(dump_job_ptr->tres_per_job, buffer); |
| packstr(dump_job_ptr->tres_per_node, buffer); |
| packstr(dump_job_ptr->tres_per_socket, buffer); |
| packstr(dump_job_ptr->tres_per_task, buffer); |
| |
| pack16(dump_job_ptr->mail_type, buffer); |
| packstr(dump_job_ptr->mail_user, buffer); |
| |
| packstr(dump_job_ptr->selinux_context, buffer); |
| } else { |
| error("pack_job: protocol_version " |
| "%hu not supported", protocol_version); |
| } |
| } |
| |
| static void _find_node_config(int *cpu_cnt_ptr, int *core_cnt_ptr) |
| { |
| static int max_cpu_cnt = -1, max_core_cnt = -1; |
| static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; |
| int i; |
| node_record_t *node_ptr; |
| |
| slurm_mutex_lock(&lock); |
| if (max_cpu_cnt == -1) { |
| for (i = 0; (node_ptr = next_node(&i)); i++) { |
| /* Only data from config_record used for scheduling */ |
| max_cpu_cnt = MAX(max_cpu_cnt, |
| node_ptr->config_ptr->cpus); |
| max_core_cnt = MAX(max_core_cnt, |
| node_ptr->config_ptr->cores); |
| } |
| } |
| slurm_mutex_unlock(&lock); |
| |
| *cpu_cnt_ptr = max_cpu_cnt; |
| *core_cnt_ptr = max_core_cnt; |
| } |
| |
| /* pack default job details for "get_job_info" RPC */ |
| static void _pack_default_job_details(job_record_t *job_ptr, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| int max_cpu_cnt = -1, max_core_cnt = -1; |
| job_details_t *detail_ptr = job_ptr->details; |
| uint16_t shared = 0; |
| |
| shared = get_job_share_value(job_ptr); |
| |
| if (job_ptr->part_ptr && job_ptr->part_ptr->max_cpu_cnt) { |
| max_cpu_cnt = job_ptr->part_ptr->max_cpu_cnt; |
| max_core_cnt = job_ptr->part_ptr->max_core_cnt; |
| } else |
| _find_node_config(&max_cpu_cnt, &max_core_cnt); |
| |
| if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) { |
| if (!detail_ptr) { |
| packbool(false, buffer); |
| |
| if (job_ptr->total_cpus) |
| pack32(job_ptr->total_cpus, buffer); |
| else |
| pack32(job_ptr->cpu_cnt, buffer); |
| |
| pack32(job_ptr->node_cnt, buffer); |
| pack32(NICE_OFFSET, buffer); /* Best guess */ |
| return; |
| } |
| packbool(true, buffer); |
| job_record_pack_details_common(detail_ptr, buffer, |
| protocol_version); |
| |
| if (!IS_JOB_PENDING(job_ptr)) { |
| packstr(detail_ptr->features_use, buffer); |
| packnull(buffer); |
| } else { |
| packstr(detail_ptr->features, buffer); |
| packstr(detail_ptr->prefer, buffer); |
| } |
| |
| if (detail_ptr->argv) |
| packstr(detail_ptr->argv[0], buffer); |
| else |
| packnull(buffer); |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) { |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_cpus && |
| !IS_JOB_PENDING(job_ptr)) { |
| /* If job is PENDING ignore total_cpus, |
| * which may have been set by previous run |
| * followed by job requeue. */ |
| pack32(job_ptr->total_cpus, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else { |
| pack32(detail_ptr->min_cpus, buffer); |
| if (detail_ptr->max_cpus != NO_VAL) |
| pack32(detail_ptr->max_cpus, buffer); |
| else |
| pack32((uint32_t) 0, buffer); |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) { |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_nodes) { |
| pack32(job_ptr->total_nodes, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->node_cnt_wag) { |
| /* This should catch everything else, but |
| * just in case this is 0 (startup or |
| * whatever) we will keep the rest of |
| * this if statement around. |
| */ |
| pack32(job_ptr->node_cnt_wag, buffer); |
| pack32((uint32_t) detail_ptr->max_nodes, |
| buffer); |
| } else if (detail_ptr->ntasks_per_node) { |
| /* min_nodes based upon task count and ntasks |
| * per node */ |
| uint32_t min_nodes; |
| min_nodes = detail_ptr->num_tasks / |
| detail_ptr->ntasks_per_node; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else if (detail_ptr->cpus_per_task > 1) { |
| /* min_nodes based upon task count and cpus |
| * per task */ |
| uint32_t ntasks_per_node, min_nodes; |
| ntasks_per_node = max_cpu_cnt / |
| detail_ptr->cpus_per_task; |
| ntasks_per_node = MAX(ntasks_per_node, 1); |
| min_nodes = detail_ptr->num_tasks / |
| ntasks_per_node; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else if (detail_ptr->mc_ptr && |
| detail_ptr->mc_ptr->ntasks_per_core && |
| (detail_ptr->mc_ptr->ntasks_per_core |
| != INFINITE16)) { |
| /* min_nodes based upon task count and ntasks |
| * per core */ |
| uint32_t min_cores, min_nodes; |
| min_cores = ROUNDUP(detail_ptr->num_tasks, |
| detail_ptr->mc_ptr-> |
| ntasks_per_core); |
| min_nodes = ROUNDUP(min_cores, max_core_cnt); |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else { |
| /* min_nodes based upon task count only */ |
| uint32_t min_nodes; |
| uint32_t max_nodes; |
| |
| min_nodes = ROUNDUP(detail_ptr->num_tasks, |
| max_cpu_cnt); |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| max_nodes = MAX(min_nodes, |
| detail_ptr->max_nodes); |
| pack32(min_nodes, buffer); |
| pack32(max_nodes, buffer); |
| } |
| if (detail_ptr->num_tasks) |
| pack32(detail_ptr->num_tasks, buffer); |
| else if (IS_JOB_PENDING(job_ptr)) |
| pack32(detail_ptr->min_nodes, buffer); |
| else if (job_ptr->tres_alloc_cnt) |
| pack32((uint32_t) |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE], |
| buffer); |
| else |
| pack32(NO_VAL, buffer); |
| |
| pack16(shared, buffer); |
| |
| if (detail_ptr->crontab_entry) |
| packstr(detail_ptr->crontab_entry->cronspec, |
| buffer); |
| else |
| packnull(buffer); |
| } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| packstr(detail_ptr->features_use, buffer); |
| packnull(buffer); |
| } else { |
| packstr(detail_ptr->features, buffer); |
| packstr(detail_ptr->prefer, buffer); |
| } |
| packstr(detail_ptr->cluster_features, buffer); |
| packstr(detail_ptr->work_dir, buffer); |
| packstr(detail_ptr->dependency, buffer); |
| |
| if (detail_ptr->argv) |
| packstr(detail_ptr->argv[0], buffer); |
| else |
| packnull(buffer); |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) { |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_cpus && |
| !IS_JOB_PENDING(job_ptr)) { |
| /* If job is PENDING ignore total_cpus, |
| * which may have been set by previous run |
| * followed by job requeue. */ |
| pack32(job_ptr->total_cpus, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else { |
| pack32(detail_ptr->min_cpus, buffer); |
| if (detail_ptr->max_cpus != NO_VAL) |
| pack32(detail_ptr->max_cpus, buffer); |
| else |
| pack32((uint32_t) 0, buffer); |
| } |
| |
| if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) { |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->total_nodes) { |
| pack32(job_ptr->total_nodes, buffer); |
| pack32((uint32_t) 0, buffer); |
| } else if (job_ptr->node_cnt_wag) { |
| /* This should catch everything else, but |
| * just in case this is 0 (startup or |
| * whatever) we will keep the rest of |
| * this if statement around. |
| */ |
| pack32(job_ptr->node_cnt_wag, buffer); |
| pack32((uint32_t) detail_ptr->max_nodes, |
| buffer); |
| } else if (detail_ptr->ntasks_per_node) { |
| /* min_nodes based upon task count and ntasks |
| * per node */ |
| uint32_t min_nodes; |
| min_nodes = detail_ptr->num_tasks / |
| detail_ptr->ntasks_per_node; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else if (detail_ptr->cpus_per_task > 1) { |
| /* min_nodes based upon task count and cpus |
| * per task */ |
| uint32_t ntasks_per_node, min_nodes; |
| ntasks_per_node = max_cpu_cnt / |
| detail_ptr->cpus_per_task; |
| ntasks_per_node = MAX(ntasks_per_node, 1); |
| min_nodes = detail_ptr->num_tasks / |
| ntasks_per_node; |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else if (detail_ptr->mc_ptr && |
| detail_ptr->mc_ptr->ntasks_per_core && |
| (detail_ptr->mc_ptr->ntasks_per_core |
| != INFINITE16)) { |
| /* min_nodes based upon task count and ntasks |
| * per core */ |
| uint32_t min_cores, min_nodes; |
| min_cores = ROUNDUP(detail_ptr->num_tasks, |
| detail_ptr->mc_ptr-> |
| ntasks_per_core); |
| min_nodes = ROUNDUP(min_cores, max_core_cnt); |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| pack32(min_nodes, buffer); |
| pack32(detail_ptr->max_nodes, buffer); |
| } else { |
| /* min_nodes based upon task count only */ |
| uint32_t min_nodes; |
| uint32_t max_nodes; |
| |
| min_nodes = ROUNDUP(detail_ptr->num_tasks, |
| max_cpu_cnt); |
| min_nodes = MAX(min_nodes, |
| detail_ptr->min_nodes); |
| max_nodes = MAX(min_nodes, |
| detail_ptr->max_nodes); |
| pack32(min_nodes, buffer); |
| pack32(max_nodes, buffer); |
| } |
| pack_bit_str_hex(detail_ptr->job_size_bitmap, buffer); |
| |
| pack16(detail_ptr->requeue, buffer); |
| pack16(detail_ptr->ntasks_per_node, buffer); |
| pack16(detail_ptr->ntasks_per_tres, buffer); |
| if (detail_ptr->num_tasks) |
| pack32(detail_ptr->num_tasks, buffer); |
| else if (IS_JOB_PENDING(job_ptr)) |
| pack32(detail_ptr->min_nodes, buffer); |
| else if (job_ptr->tres_alloc_cnt) |
| pack32((uint32_t) |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE], |
| buffer); |
| else |
| pack32(NO_VAL, buffer); |
| |
| pack16(shared, buffer); |
| pack32(detail_ptr->cpu_freq_min, buffer); |
| pack32(detail_ptr->cpu_freq_max, buffer); |
| pack32(detail_ptr->cpu_freq_gov, buffer); |
| |
| if (detail_ptr->crontab_entry) |
| packstr(detail_ptr->crontab_entry->cronspec, |
| buffer); |
| else |
| packnull(buffer); |
| } else { |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| if (job_ptr->total_cpus) |
| pack32(job_ptr->total_cpus, buffer); |
| else |
| pack32(job_ptr->cpu_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| pack32(job_ptr->node_cnt, buffer); |
| pack32((uint32_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| packnull(buffer); |
| } |
| } else { |
| error("_pack_default_job_details: protocol_version " |
| "%hu not supported", protocol_version); |
| } |
| } |
| |
| /* pack pending job details for "get_job_info" RPC */ |
| static void _pack_pending_job_details(job_details_t *detail_ptr, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| pack16(detail_ptr->contiguous, buffer); |
| pack16(detail_ptr->core_spec, buffer); |
| pack16(detail_ptr->cpus_per_task, buffer); |
| pack16(detail_ptr->pn_min_cpus, buffer); |
| |
| pack64(detail_ptr->pn_min_memory, buffer); |
| pack32(detail_ptr->pn_min_tmp_disk, buffer); |
| |
| pack16(detail_ptr->oom_kill_step, buffer); |
| |
| packstr(detail_ptr->req_nodes, buffer); |
| pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer); |
| packstr(detail_ptr->exc_nodes, buffer); |
| pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer); |
| |
| packstr(detail_ptr->std_err, buffer); |
| packstr(detail_ptr->std_in, buffer); |
| packstr(detail_ptr->std_out, buffer); |
| |
| pack_multi_core_data(detail_ptr->mc_ptr, buffer, |
| protocol_version); |
| } else { |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| |
| pack64((uint64_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| pack16((uint16_t) 0, buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| pack_multi_core_data(NULL, buffer, protocol_version); |
| } |
| } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| if (detail_ptr) { |
| pack16(detail_ptr->contiguous, buffer); |
| pack16(detail_ptr->core_spec, buffer); |
| pack16(detail_ptr->cpus_per_task, buffer); |
| pack16(detail_ptr->pn_min_cpus, buffer); |
| |
| pack64(detail_ptr->pn_min_memory, buffer); |
| pack32(detail_ptr->pn_min_tmp_disk, buffer); |
| |
| packstr(detail_ptr->req_nodes, buffer); |
| pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer); |
| packstr(detail_ptr->exc_nodes, buffer); |
| pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer); |
| |
| packstr(detail_ptr->std_err, buffer); |
| packstr(detail_ptr->std_in, buffer); |
| packstr(detail_ptr->std_out, buffer); |
| |
| pack_multi_core_data(detail_ptr->mc_ptr, buffer, |
| protocol_version); |
| } else { |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| pack16((uint16_t) 0, buffer); |
| |
| pack64((uint64_t) 0, buffer); |
| pack32((uint32_t) 0, buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| packnull(buffer); |
| packnull(buffer); |
| packnull(buffer); |
| |
| pack_multi_core_data(NULL, buffer, protocol_version); |
| } |
| } else { |
| error("%s: protocol_version %hu not supported", __func__, |
| protocol_version); |
| } |
| } |
| |
| static int _foreach_set_het_job_for_purge(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| job_record_t *het_leader = arg; |
| |
| if (het_leader->het_job_id != het_job->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", __func__, het_leader); |
| return 0; |
| } |
| |
| het_job->bit_flags |= HETJOB_PURGE; |
| |
| return 0; |
| } |
| |
| static int _foreach_check_old_het_job(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| job_record_t *het_leader = arg; |
| |
| if (het_leader->het_job_id != het_job->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", __func__, het_leader); |
| return 0; |
| } |
| |
| if (!_list_find_job_old(het_job, NULL)) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* If this is a hetjob leader and all components are complete, |
| * then purge all job of its hetjob records |
| * RET true if this record purged */ |
| static inline bool _purge_complete_het_job(job_record_t *het_job_leader) |
| { |
| if (!het_job_leader->het_job_list) |
| return false; /* Not hetjob leader */ |
| if (!IS_JOB_FINISHED(het_job_leader)) |
| return false; /* Hetjob leader incomplete */ |
| |
| if (list_find_first(het_job_leader->het_job_list, |
| _foreach_check_old_het_job, |
| het_job_leader)) |
| return false; |
| |
| (void) list_for_each(het_job_leader->het_job_list, |
| _foreach_set_het_job_for_purge, |
| het_job_leader); |
| |
| return true; |
| } |
| |
| static int _foreach_pre_purge_old_job(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| |
| if (_purge_complete_het_job(job_ptr)) |
| return 0; |
| if (!IS_JOB_PENDING(job_ptr)) |
| return 0; |
| |
| if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL) && |
| !deadline_ok(job_ptr, __func__)) |
| return 0; |
| |
| /* |
| * If the dependency is already invalid there's no reason to |
| * keep checking it. |
| */ |
| if (job_ptr->state_reason == WAIT_DEP_INVALID) |
| return 0; |
| if (test_job_dependency(job_ptr, NULL) == FAIL_DEPEND) { |
| /* Check what are the job disposition |
| * to deal with invalid dependencies |
| */ |
| handle_invalid_dependency(job_ptr); |
| } |
| return 0; |
| } |
| |
| /* |
| * If the job or slurm.conf requests to not kill on invalid dependency, |
| * then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the |
| * job. |
| */ |
| void handle_invalid_dependency(job_record_t *job_ptr) |
| { |
| job_ptr->state_reason = WAIT_DEP_INVALID; |
| xfree(job_ptr->state_desc); |
| |
| if (job_ptr->mail_type & MAIL_INVALID_DEPEND) |
| mail_job_info(job_ptr, MAIL_INVALID_DEPEND); |
| |
| if (job_ptr->bit_flags & KILL_INV_DEP) { |
| _kill_dependent(job_ptr); |
| } else if (job_ptr->bit_flags & NO_KILL_INV_DEP) { |
| debug("%s: %pJ job dependency never satisfied", |
| __func__, job_ptr); |
| } else if (kill_invalid_dep) { |
| _kill_dependent(job_ptr); |
| } else { |
| debug("%s: %pJ job dependency never satisfied", |
| __func__, job_ptr); |
| job_ptr->state_reason = WAIT_DEP_INVALID; |
| } |
| fed_mgr_remove_remote_dependencies(job_ptr); |
| } |
| |
| /* |
| * purge_old_job - purge old job records. |
| * The jobs must have completed at least MIN_JOB_AGE minutes ago. |
| * Test job dependencies, handle after_ok, after_not_ok before |
| * purging any jobs. |
| */ |
| void purge_old_job(void) |
| { |
| int i, purge_job_count; |
| |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(FED_LOCK, READ_LOCK)); |
| |
| if ((purge_job_count = list_count(purge_files_list))) |
| debug("%s: job file deletion is falling behind, " |
| "%d left to remove", __func__, purge_job_count); |
| |
| (void) list_for_each(job_list, _foreach_pre_purge_old_job, NULL); |
| |
| fed_mgr_test_remote_dependencies(); |
| |
| i = list_delete_all(job_list, &_list_find_job_old, ""); |
| if (i) { |
| debug2("purge_old_job: purged %d old job records", i); |
| last_job_update = time(NULL); |
| slurm_mutex_lock(&purge_thread_lock); |
| slurm_cond_signal(&purge_thread_cond); |
| slurm_mutex_unlock(&purge_thread_lock); |
| } |
| } |
| |
| extern void free_old_jobs(void) |
| { |
| job_record_t *job_ptr; |
| /* |
| * Delete records one-by-one to avoid blocking purge_job_record(). |
| */ |
| while ((job_ptr = list_pop(purge_jobs_list))) |
| job_record_delete(job_ptr); |
| } |
| |
| /* |
| * purge_job_record - purge specific job record. No testing is performed to |
| * ensure the job records has no active references. Use only for job |
| * records that were never fully operational (e.g. WILL_RUN test, failed |
| * job load, failed job create, etc.). |
| * IN job_id - job_id of job record to be purged |
| * RET int - count of job's purged |
| * global: job_list - global job table |
| */ |
| extern int purge_job_record(uint32_t job_id) |
| { |
| int count = 0; |
| count = list_delete_all(job_list, _list_find_job_id, (void *)&job_id); |
| if (count) { |
| last_job_update = time(NULL); |
| slurm_mutex_lock(&purge_thread_lock); |
| slurm_cond_signal(&purge_thread_cond); |
| slurm_mutex_unlock(&purge_thread_lock); |
| } |
| |
| return count; |
| } |
| |
| extern void unlink_job_record(job_record_t *job_ptr) |
| { |
| uint32_t *job_id; |
| |
| xassert(job_ptr->magic == JOB_MAGIC); |
| |
| _delete_job_common(job_ptr); |
| |
| job_id = xmalloc(sizeof(uint32_t)); |
| *job_id = job_ptr->job_id; |
| list_enqueue(purge_files_list, job_id); |
| |
| job_ptr->job_id = NO_VAL; |
| |
| last_job_update = time(NULL); |
| slurm_mutex_lock(&purge_thread_lock); |
| slurm_cond_signal(&purge_thread_cond); |
| slurm_mutex_unlock(&purge_thread_lock); |
| } |
| |
| /* update first assigned job id as needed on reconfigure */ |
| void reset_first_job_id(void) |
| { |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| job_id_sequence = MAX(job_id_sequence, slurm_conf.first_job_id); |
| } |
| |
| /* |
| * Return the next available job_id to be used. |
| * |
| * IN test_only - if true, doesn't advance the job_id sequence, just returns |
| * what the next job id will be. |
| * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted. |
| */ |
| extern uint32_t get_next_job_id(bool test_only) |
| { |
| int i; |
| uint32_t new_id, max_jobs, tmp_id_sequence; |
| |
| xassert(verify_lock(JOB_LOCK, READ_LOCK)); |
| xassert(test_only || verify_lock(JOB_LOCK, WRITE_LOCK)); |
| xassert(verify_lock(FED_LOCK, READ_LOCK)); |
| |
| max_jobs = slurm_conf.max_job_id - slurm_conf.first_job_id; |
| tmp_id_sequence = MAX(job_id_sequence, slurm_conf.first_job_id); |
| |
| /* Ensure no conflict in job id if we roll over 32 bits */ |
| for (i = 0; i < max_jobs; i++) { |
| if (tmp_id_sequence >= slurm_conf.max_job_id) |
| tmp_id_sequence = slurm_conf.first_job_id; |
| |
| new_id = fed_mgr_get_job_id(tmp_id_sequence); |
| |
| if (find_job_record(new_id)) { |
| tmp_id_sequence++; |
| continue; |
| } |
| if (_dup_job_file_test(new_id)) { |
| tmp_id_sequence++; |
| continue; |
| } |
| |
| if (!test_only) |
| job_id_sequence = tmp_id_sequence + 1; |
| |
| return new_id; |
| } |
| |
| error("We have exhausted our supply of valid job id values. FirstJobId=%u MaxJobId=%u", |
| slurm_conf.first_job_id, slurm_conf.max_job_id); |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * _set_job_id - set a default job_id, ensure that it is unique |
| * IN job_ptr - pointer to the job_record |
| */ |
| static int _set_job_id(job_record_t *job_ptr) |
| { |
| uint32_t new_id; |
| |
| xassert(job_ptr); |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| if ((new_id = get_next_job_id(false)) != SLURM_ERROR) { |
| job_ptr->job_id = new_id; |
| /* When we get a new job id might as well make sure |
| * the db_index is set since there is no way it will be |
| * correct otherwise :). */ |
| job_record_set_sluid(job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| job_ptr->job_id = NO_VAL; |
| return EAGAIN; |
| } |
| |
| |
| /* |
| * set_job_prio - set a default job priority |
| * IN job_ptr - pointer to the job_record |
| */ |
| extern void set_job_prio(job_record_t *job_ptr) |
| { |
| uint32_t relative_prio; |
| |
| xassert(job_ptr); |
| xassert (job_ptr->magic == JOB_MAGIC); |
| |
| if (IS_JOB_FINISHED(job_ptr)) |
| return; |
| job_ptr->priority = priority_g_set(lowest_prio, job_ptr); |
| if ((job_ptr->priority == 0) || (job_ptr->direct_set_prio)) |
| return; |
| |
| relative_prio = job_ptr->priority; |
| if (job_ptr->details && (job_ptr->details->nice != NICE_OFFSET)) { |
| int64_t offset = job_ptr->details->nice; |
| offset -= NICE_OFFSET; |
| relative_prio += offset; |
| } |
| lowest_prio = MIN(relative_prio, lowest_prio); |
| } |
| |
| /* After recovering job state, if using priority/basic then we increment the |
| * priorities of all jobs to avoid decrementing the base down to zero */ |
| extern void sync_job_priorities(void) |
| { |
| uint32_t prio_boost = 0; |
| |
| if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY)) |
| prio_boost = TOP_PRIORITY - highest_prio; |
| |
| prio_boost = priority_g_recover(prio_boost); |
| lowest_prio += prio_boost; |
| } |
| |
| /* |
| * _higher_precedence - determine if job_ptr should be considered before |
| * job_ptr2 when scheduling jobs at submission time. |
| * This compares priority, submit time, and job id (in this order). |
| * |
| * IN job_ptr - pointer to first job |
| * IN job_ptr2 - pointer to second job |
| * RET true if job_ptr has higher scheduling precedence over job_ptr2 |
| */ |
| static bool _higher_precedence(job_record_t *job_ptr, job_record_t *job_ptr2) |
| { |
| xassert(job_ptr); |
| xassert(job_ptr2); |
| |
| /* Compare priority */ |
| if (job_ptr->priority > job_ptr2->priority) |
| return true; |
| if (job_ptr2->priority > job_ptr->priority) |
| return false; |
| |
| /* Compare submit time */ |
| if (job_ptr->details->submit_time && job_ptr2->details->submit_time) { |
| if (job_ptr->details->submit_time < |
| job_ptr2->details->submit_time) |
| return true; |
| if (job_ptr2->details->submit_time < |
| job_ptr->details->submit_time) |
| return false; |
| } |
| |
| /* Compare job id */ |
| return job_ptr->job_id < job_ptr2->job_id; |
| } |
| |
| static int _is_flex_or_any_nodes(void *x, void *none) |
| { |
| slurmctld_resv_t *resv_ptr = x; |
| xassert(resv_ptr); |
| if (resv_ptr->flags & (RESERVE_FLAG_FLEX | RESERVE_FLAG_ANY_NODES)) |
| return true; |
| return false; |
| } |
| |
| static bool _use_none_resv_nodes(job_record_t *job_ptr) |
| { |
| if (!job_ptr->resv_name) |
| return true; /* no reservation is used */ |
| if (!job_ptr->resv_list) |
| return _is_flex_or_any_nodes(job_ptr->resv_ptr, NULL); |
| return list_find_first(job_ptr->resv_list, _is_flex_or_any_nodes, NULL); |
| } |
| |
| static int _match_resv_id(void *x, void *key) |
| { |
| slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x; |
| uint32_t *resv_id = (uint32_t *) key; |
| |
| xassert(resv_ptr); |
| |
| if (resv_ptr->resv_id != *resv_id) |
| return 0; |
| else |
| return 1; /* match */ |
| } |
| |
| static int _will_resv_allow_warn_time(void *x, void *arg) |
| { |
| slurmctld_resv_t *resv_ptr = x; |
| uint16_t *warn_time = arg; |
| |
| xassert(resv_ptr); |
| xassert(warn_time); |
| |
| if (resv_ptr->max_start_delay && |
| (*warn_time <= resv_ptr->max_start_delay)) |
| return true; |
| |
| return false; |
| } |
| |
| static int _findfirst_resv_overlap_internal(void *x, void *arg) |
| { |
| slurmctld_resv_t *cur_resv_in = x; |
| findfirst_resv_overlap_t *findfirst_resv_overlap = arg; |
| slurmctld_resv_t *cur_resv_check = findfirst_resv_overlap->cur_resv; |
| |
| if (cur_resv_check->resv_id == cur_resv_in->resv_id) { |
| findfirst_resv_overlap->found = true; |
| return -1; |
| } else if (cur_resv_check->resv_id < cur_resv_in->resv_id) { |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| static int _findfirst_resv_overlap(void *x, void *arg) |
| { |
| findfirst_resv_overlap_t *findfirst_resv_overlap = arg; |
| job_record_t *job_ptr2 = findfirst_resv_overlap->job_ptr2; |
| |
| findfirst_resv_overlap->cur_resv = x; |
| |
| /* |
| * Continue if the cur_resv is less than any of the resv in the |
| * second job's list. Otherwise return |
| * findfirst_resv_overlap.found. |
| */ |
| if (!list_find_first(job_ptr2->resv_list, |
| _findfirst_resv_overlap_internal, |
| findfirst_resv_overlap) || |
| findfirst_resv_overlap->found) |
| return -1; |
| |
| return 0; |
| } |
| |
| static bool _can_resv_overlap(top_prio_args_t *job_args, job_record_t *job_ptr2) |
| { |
| job_record_t *job_ptr1 = job_args->job_ptr; |
| findfirst_resv_overlap_t findfirst_resv_overlap = { |
| .found = false, |
| .job_ptr2 = job_ptr2, |
| }; |
| |
| if (job_args->use_none_resv_nodes && _use_none_resv_nodes(job_ptr2)) |
| return true; |
| |
| /* |
| * If job_ptr1 does not have a resv but uses --signal=R, check if any of |
| * job_ptr2's resv will allow overlap. |
| */ |
| if (!job_ptr1->resv_ptr && job_ptr2->resv_ptr && |
| (job_ptr1->warn_flags & KILL_JOB_RESV)) { |
| if (!job_ptr2->resv_list) |
| return _will_resv_allow_warn_time(job_ptr2->resv_ptr, |
| &job_ptr1->warn_time); |
| return list_find_first(job_ptr2->resv_list, |
| _will_resv_allow_warn_time, |
| &job_ptr1->warn_time); |
| } |
| |
| /* If 0-1 resv is used per job see if they match */ |
| if (!job_ptr1->resv_list && !job_ptr2->resv_list) |
| return !xstrcmp(job_ptr1->resv_name, job_ptr2->resv_name); |
| |
| /* If one doesn't use resv at this point they can't overlap */ |
| if (!job_ptr1->resv_ptr || !job_ptr2->resv_ptr) |
| return false; |
| |
| /* If one has a list of resv and the other has one resv */ |
| if (job_ptr1->resv_list && !job_ptr2->resv_list) |
| return list_find_first(job_ptr1->resv_list, _match_resv_id, |
| &job_ptr2->resv_ptr->resv_id); |
| if (job_ptr2->resv_list && !job_ptr1->resv_list) |
| return list_find_first(job_ptr2->resv_list, _match_resv_id, |
| &job_ptr1->resv_ptr->resv_id); |
| |
| /* Both jobs have resv lists - Note resv_list is sorted by id */ |
| (void) list_find_first(job_ptr1->resv_list, _findfirst_resv_overlap, |
| &findfirst_resv_overlap); |
| |
| return findfirst_resv_overlap.found; |
| } |
| |
| static int _union_part_nodes(void *x, void *arg) |
| { |
| part_record_t *part_ptr = x; |
| bitstr_t *node_bitmap = arg; |
| |
| xassert(part_ptr); |
| xassert(node_bitmap); |
| |
| bit_or(node_bitmap, part_ptr->node_bitmap); |
| return SLURM_SUCCESS; |
| } |
| |
| static bitstr_t *_get_all_part_nodes(job_record_t *job_ptr) |
| { |
| bitstr_t *node_bitmap = NULL; |
| |
| if (!job_ptr->part_ptr_list) |
| return bit_copy(job_ptr->part_ptr->node_bitmap); |
| |
| node_bitmap = bit_alloc(bit_size(job_ptr->part_ptr->node_bitmap)); |
| list_for_each(job_ptr->part_ptr_list, _union_part_nodes, node_bitmap); |
| return node_bitmap; |
| } |
| |
| /* Return 1 if higher, 0 if the same, and -1 if lower */ |
| static int _cmp_part_prio_tier(top_prio_args_t *job_args, |
| job_record_t *job_ptr2) |
| { |
| uint16_t max_prio_tier2 = job_ptr2->part_ptr->priority_tier; |
| if (job_ptr2->part_ptr_list) { |
| /* part_ptr_list is sorted by priority tier */ |
| part_record_t *part_ptr = list_peek(job_ptr2->part_ptr_list); |
| max_prio_tier2 = part_ptr->priority_tier; |
| } |
| |
| /* |
| * Comparing the min partition priority tier of job_ptr1 |
| * (the job in job_args) to the max of job_ptr2 is an optimization. It |
| * will prevent job_ptr1 from being considered top priority if it is |
| * possible for it to start in a lower priority tier partition than what |
| * job_ptr2 could start in, even if job_ptr1 could also potentially |
| * start in a higher priority tier partition. |
| */ |
| if (job_args->min_part_prio_tier > max_prio_tier2) |
| return 1; |
| if (job_args->min_part_prio_tier == max_prio_tier2) |
| return 0; |
| return -1; |
| } |
| |
| static int _set_min_prio_tier(void *x, void *arg) |
| { |
| part_record_t * part_ptr = x; |
| uint16_t *min_prio_tier = arg; |
| |
| xassert(part_ptr); |
| xassert(min_prio_tier); |
| |
| if (part_ptr->priority_tier < *min_prio_tier) |
| *min_prio_tier = part_ptr->priority_tier; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _destroy_top_prio_args(top_prio_args_t *args) |
| { |
| if (!args || !args->job_ptr) |
| return; |
| |
| /* Intentionally not freeing the job_ptr */ |
| FREE_NULL_BITMAP(args->part_nodes); |
| } |
| |
| static int _foreach_top_priority(void *x, void *arg) |
| { |
| job_record_t *job_ptr2 = x; |
| top_prio_args_t *job_args = arg; |
| job_record_t *job_ptr = job_args->job_ptr; |
| bool overlap_with_resv = false; |
| bool parts_overlap = false; |
| int part_prio_cmp; |
| bitstr_t *node_bitmap2 = NULL; |
| |
| xassert(job_args->job_ptr); |
| |
| if (job_ptr2 == job_ptr) |
| return 0; |
| if ((job_args->het_job_offset != NO_VAL) && |
| (job_ptr->job_id == (job_ptr2->job_id + job_args->het_job_offset))) |
| return 0; |
| if (!IS_JOB_PENDING(job_ptr2)) |
| return 0; |
| if (IS_JOB_COMPLETING(job_ptr2)) { |
| /* Job is hung in pending & completing state, |
| * indicative of job requeue */ |
| return 0; |
| } |
| |
| if (bf_min_age_reserve) { |
| int pend_time; |
| if (!job_ptr2->details->begin_time) |
| return 0; |
| pend_time = difftime(job_args->now, |
| job_ptr2->details->begin_time); |
| if (pend_time < bf_min_age_reserve) |
| return 0; |
| } |
| if (job_state_reason_check(job_ptr2->state_reason, |
| JSR_QOS_ASSOC | JSR_MISC | JSR_PART) || |
| !job_independent(job_ptr2)) |
| return 0; |
| |
| if (job_ptr->resv_name && !job_ptr2->resv_name) |
| return 0; /* job's with resv have priority */ |
| if (!_can_resv_overlap(job_args, job_ptr2)) |
| return 0; /* job can't overlap nodes */ |
| if (!job_ptr->resv_name && job_ptr2->resv_name) |
| overlap_with_resv = true; |
| |
| if (bb_g_job_test_stage_in(job_ptr2, true) != 1) |
| return 0; /* Waiting for buffer */ |
| |
| /* |
| * Priority tiers doesn't matter if job_ptr2 uses a resv |
| * and job_ptr does not since resv take precedence |
| */ |
| part_prio_cmp = overlap_with_resv ? |
| -1 : _cmp_part_prio_tier(job_args, job_ptr2); |
| if ((part_prio_cmp == 1) || |
| ((part_prio_cmp == 0) && _higher_precedence(job_ptr, job_ptr2))) |
| return 0; |
| |
| /* |
| * Here job_ptr2 is either in a higher priority tier |
| * partition or is using a resv while job_ptr is not. |
| * If partitions overlap job_ptr is not top priority. |
| */ |
| if (!job_args->part_nodes) |
| job_args->part_nodes = _get_all_part_nodes(job_ptr); |
| |
| node_bitmap2 = _get_all_part_nodes(job_ptr2); |
| parts_overlap = bit_overlap_any(job_args->part_nodes, node_bitmap2); |
| FREE_NULL_BITMAP(node_bitmap2); |
| |
| if (!parts_overlap) |
| return 0; /* no nodes overlap in partitions */ |
| |
| return -1; |
| } |
| |
| |
| /* |
| * _top_priority - determine if any other job has a higher priority than the |
| * specified job |
| * IN job_ptr - pointer to selected job |
| * RET true if selected job has highest priority |
| */ |
| static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset) |
| { |
| job_details_t *detail_ptr = job_ptr->details; |
| bool top; |
| |
| if (job_ptr->priority == 0) /* user held */ |
| top = false; |
| else { |
| top_prio_args_t job_args = { |
| .het_job_offset = het_job_offset, |
| .job_ptr = job_ptr, |
| .min_part_prio_tier = job_ptr->part_ptr->priority_tier, |
| .now = time(NULL), |
| .use_none_resv_nodes = _use_none_resv_nodes(job_ptr), |
| }; |
| |
| if (job_ptr->part_ptr_list) |
| list_for_each(job_ptr->part_ptr_list, |
| _set_min_prio_tier, |
| &job_args.min_part_prio_tier); |
| |
| |
| top = true; /* assume top priority until found otherwise */ |
| if (list_find_first(job_list, _foreach_top_priority, &job_args)) |
| top = false; |
| |
| _destroy_top_prio_args(&job_args); |
| } |
| |
| if ((!top) && detail_ptr) { /* not top prio */ |
| if (job_ptr->priority == 0) { /* user/admin hold */ |
| if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS |
| && (job_ptr->state_reason != WAIT_RESV_DELETED) |
| && (job_ptr->state_reason != FAIL_BURST_BUFFER_OP) |
| && (job_ptr->state_reason != FAIL_ACCOUNT) |
| && (job_ptr->state_reason != FAIL_QOS) |
| && (job_ptr->state_reason != WAIT_HELD) |
| && (job_ptr->state_reason != WAIT_HELD_USER) |
| && job_ptr->state_reason != WAIT_MAX_REQUEUE) { |
| job_ptr->state_reason = WAIT_HELD; |
| xfree(job_ptr->state_desc); |
| } |
| } else if (job_ptr->state_reason == WAIT_NO_REASON && |
| het_job_offset == NO_VAL) { |
| job_ptr->state_reason = WAIT_PRIORITY; |
| xfree(job_ptr->state_desc); |
| } |
| } |
| return top; |
| } |
| |
| static void _merge_job_licenses(job_record_t *shrink_job_ptr, |
| job_record_t *expand_job_ptr) |
| { |
| xassert(shrink_job_ptr); |
| xassert(expand_job_ptr); |
| |
| /* FIXME: do we really need to update accounting here? It |
| * might already happen */ |
| |
| if (!shrink_job_ptr->licenses) /* No licenses to add */ |
| return; |
| |
| if (!expand_job_ptr->licenses) { /* Just transfer licenses */ |
| expand_job_ptr->licenses = shrink_job_ptr->licenses; |
| shrink_job_ptr->licenses = NULL; |
| FREE_NULL_LIST(expand_job_ptr->license_list); |
| expand_job_ptr->license_list = shrink_job_ptr->license_list; |
| shrink_job_ptr->license_list = NULL; |
| return; |
| } |
| |
| /* Merge the license information into expanding job */ |
| xstrcat(expand_job_ptr->licenses, ","); |
| xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses); |
| xfree(shrink_job_ptr->licenses); |
| FREE_NULL_LIST(expand_job_ptr->license_list); |
| FREE_NULL_LIST(shrink_job_ptr->license_list); |
| license_job_merge(expand_job_ptr); |
| } |
| |
| static void _hold_job_rec(job_record_t *job_ptr, uid_t uid) |
| { |
| int i, j; |
| time_t now = time(NULL); |
| |
| job_ptr->direct_set_prio = 1; |
| job_ptr->priority = 0; |
| |
| if (job_ptr->details && (job_ptr->details->begin_time < now)) |
| job_ptr->details->begin_time = 0; |
| |
| /* Update job with new begin_time. */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| if (IS_JOB_PENDING(job_ptr)) |
| acct_policy_remove_accrue_time(job_ptr, false); |
| |
| if (job_ptr->part_ptr_list && |
| job_ptr->prio_mult && |
| job_ptr->prio_mult->priority_array) { |
| j = list_count(job_ptr->part_ptr_list); |
| for (i = 0; i < j; i++) { |
| job_ptr->prio_mult->priority_array[i] = 0; |
| } |
| } |
| sched_info("%s: hold on %pJ by uid %u", __func__, job_ptr, uid); |
| } |
| |
/*
 * list_for_each() callback: hold one het job component.
 * IN x - job record of the component
 * IN arg - pointer to the uid_t of the requesting user
 * RET 0 always
 */
static int _foreach_hold_het_comp(void *x, void *arg)
{
	uid_t *uid_ptr = arg;

	_hold_job_rec(x, *uid_ptr);

	return 0;
}
| |
| static void _hold_job(job_record_t *job_ptr, uid_t uid) |
| { |
| job_record_t *het_job_leader = NULL; |
| |
| if (job_ptr->het_job_id && _get_whole_hetjob()) |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (het_job_leader && het_job_leader->het_job_list) |
| (void) list_for_each(het_job_leader->het_job_list, |
| _foreach_hold_het_comp, &uid); |
| else |
| _hold_job_rec(job_ptr, uid); |
| } |
| |
| static void _release_job_rec(job_record_t *job_ptr, uid_t uid) |
| { |
| time_t now = time(NULL); |
| if (job_ptr->details && (job_ptr->details->begin_time < now)) |
| job_ptr->details->begin_time = 0; |
| job_ptr->direct_set_prio = 0; |
| set_job_prio(job_ptr); |
| job_ptr->state_reason = WAIT_NO_REASON; |
| job_state_unset_flag(job_ptr, JOB_SPECIAL_EXIT); |
| xfree(job_ptr->state_desc); |
| job_ptr->exit_code = 0; |
| fed_mgr_job_requeue(job_ptr); /* submit sibling jobs */ |
| sched_info("%s: release hold on %pJ by uid %u", |
| __func__, job_ptr, uid); |
| } |
| |
/*
 * list_for_each() callback: release the hold on one het job component.
 * IN x - job record of the component
 * IN arg - pointer to the uid_t of the requesting user
 * RET 0 always
 */
static int _foreach_release_het_comp(void *x, void *arg)
{
	uid_t *uid_ptr = arg;

	_release_job_rec(x, *uid_ptr);

	return 0;
}
| |
| static void _release_job(job_record_t *job_ptr, uid_t uid) |
| { |
| job_record_t *het_job_leader = NULL; |
| |
| if (job_ptr->het_job_id && _get_whole_hetjob()) |
| het_job_leader = find_job_record(job_ptr->het_job_id); |
| if (het_job_leader && het_job_leader->het_job_list) |
| (void) list_for_each(het_job_leader->het_job_list, |
| _foreach_release_het_comp, &uid); |
| else |
| _release_job_rec(job_ptr, uid); |
| } |
| |
| /* |
| * Gets a new association giving priority to the given parameters in job_desc, |
| * and if not possible using the job_ptr ones. |
| * IN job_desc: The new job description to use for getting the assoc_ptr. |
| * IN job_ptr: The original job_ptr to use when parameters are not in job_desc. |
| * RET assoc_rec, the new association combining the most updated information |
| * from job_desc. |
| */ |
| static slurmdb_assoc_rec_t *_retrieve_new_assoc(job_desc_msg_t *job_desc, |
| job_record_t *job_ptr) |
| { |
| slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL; |
| |
| memset(&assoc_rec, 0, sizeof(assoc_rec)); |
| |
| if (job_desc->partition) { |
| part_record_t *part_ptr = NULL; |
| int error_code = |
| _get_job_parts(job_desc, &part_ptr, NULL, NULL); |
| /* We don't need this we only care about part_ptr */ |
| if (error_code != SLURM_SUCCESS) { |
| errno = error_code; |
| return NULL; |
| } else if (!(part_ptr->state_up & PARTITION_SUBMIT)) { |
| errno = ESLURM_PARTITION_NOT_AVAIL; |
| return NULL; |
| } |
| |
| assoc_rec.partition = part_ptr->name; |
| } else if (job_ptr->part_ptr) |
| assoc_rec.partition = job_ptr->part_ptr->name; |
| |
| if (job_desc->account) |
| assoc_rec.acct = job_desc->account; |
| else |
| assoc_rec.acct = job_ptr->account; |
| |
| assoc_rec.uid = job_ptr->user_id; |
| |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| &assoc_ptr, false)) { |
| info("%s: invalid account %s for %pJ", |
| __func__, assoc_rec.acct, job_ptr); |
| errno = ESLURM_INVALID_ACCOUNT; |
| return NULL; |
| } else if (slurm_with_slurmdbd() && |
| !assoc_ptr && |
| !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) && |
| assoc_rec.acct) { |
| /* if not enforcing associations we want to look for |
| * the default account and use it to avoid getting |
| * trash in the accounting records. |
| */ |
| assoc_rec.acct = NULL; |
| (void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| &assoc_ptr, false); |
| } |
| |
| return assoc_ptr; |
| } |
| |
| /* Allocate nodes to new job. Old job info will be cleared at epilog complete */ |
| static void _realloc_nodes(job_record_t *job_ptr, bitstr_t *orig_node_bitmap) |
| { |
| bitstr_t *node_bitmap; |
| node_record_t *node_ptr; |
| |
| xassert(job_ptr); |
| xassert(orig_node_bitmap); |
| if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) |
| return; |
| |
| node_bitmap = job_ptr->job_resrcs->node_bitmap; |
| for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) { |
| if (bit_test(orig_node_bitmap, i)) |
| continue; |
| make_node_alloc(node_ptr, job_ptr); |
| } |
| node_mgr_make_node_blocked(job_ptr, true); |
| } |
| |
| extern bool permit_job_expansion(void) |
| { |
| static time_t sched_update = 0; |
| static bool permit_job_expansion = false; |
| |
| if (sched_update != slurm_conf.last_update) { |
| sched_update = slurm_conf.last_update; |
| if (xstrcasestr(slurm_conf.sched_params, |
| "permit_job_expansion")) |
| permit_job_expansion = true; |
| else |
| permit_job_expansion = false; |
| } |
| |
| return permit_job_expansion; |
| } |
| |
| extern bool permit_job_shrink(void) |
| { |
| static time_t sched_update = 0; |
| static bool permit_job_shrink = false; |
| |
| if (sched_update != slurm_conf.last_update) { |
| sched_update = slurm_conf.last_update; |
| if (xstrcasestr(slurm_conf.sched_params, "disable_job_shrink")) |
| permit_job_shrink = false; |
| else |
| permit_job_shrink = true; |
| } |
| |
| return permit_job_shrink; |
| } |
| |
| /* |
| * Job expansion is not allowed for jobs that requested OR licenses. |
| */ |
| static bool _valid_license_job_expansion(job_record_t *job_ptr1, |
| job_record_t *job_ptr2) |
| { |
| if (xstrchr(job_ptr1->licenses, '|') || |
| xstrchr(job_ptr2->licenses, '|')) |
| return false; |
| |
| return true; |
| } |
| |
| static int _update_job(job_record_t *job_ptr, job_desc_msg_t *job_desc, |
| uid_t uid, char **err_msg) |
| { |
| int error_code = SLURM_SUCCESS; |
| enum job_state_reason fail_reason; |
| bool privileged = false; |
| bool is_coord_oldacc = false, is_coord_newacc = false; |
| uint32_t save_min_nodes = 0, save_max_nodes = 0; |
| uint32_t save_min_cpus = 0, save_max_cpus = 0; |
| job_details_t *detail_ptr; |
| part_record_t *new_part_ptr = NULL, *use_part_ptr = NULL; |
| bitstr_t *exc_bitmap = NULL, *new_req_bitmap = NULL; |
| bitstr_t *orig_job_node_bitmap = NULL; |
| time_t now = time(NULL); |
| multi_core_data_t *mc_ptr = NULL; |
| bool update_accounting = false, new_req_bitmap_given = false; |
| acct_policy_limit_set_t acct_policy_limit_set; |
| uint16_t tres[slurmctld_tres_cnt]; |
| bool acct_limit_already_exceeded; |
| bool tres_changed = false; |
| int tres_pos; |
| uint64_t tres_req_cnt[slurmctld_tres_cnt]; |
| bool tres_req_cnt_set = false, valid_licenses = false; |
| list_t *gres_list = NULL, *license_list = NULL; |
| list_t *part_ptr_list = NULL; |
| uint32_t orig_time_limit; |
| bool gres_update = false; |
| slurmdb_assoc_rec_t *new_assoc_ptr = NULL, *use_assoc_ptr = NULL; |
| slurmdb_qos_rec_t *new_qos_ptr = NULL, *use_qos_ptr = NULL; |
| slurmctld_resv_t *new_resv_ptr = NULL; |
| list_t *new_resv_list = NULL; |
| list_t *new_qos_list = NULL; |
| uint32_t user_site_factor; |
| uint32_t new_qos_id = 0; |
| uint64_t mem_req; |
| |
| assoc_mgr_lock_t locks = { .tres = READ_LOCK }; |
| assoc_mgr_lock_t assoc_mgr_read_lock = { |
| .assoc = READ_LOCK, |
| .qos = READ_LOCK, |
| .user = READ_LOCK, |
| }; |
| |
| /* |
| * Block scontrol updates of scrontab jobs. |
| */ |
| if (job_ptr->bit_flags & CRON_JOB) |
| return ESLURM_CANNOT_MODIFY_CRON_JOB; |
| |
| privileged = validate_operator(uid); |
| |
| /* Check authorization for modifying this job */ |
| is_coord_oldacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, |
| false); |
| is_coord_newacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_desc->account, |
| false); |
| if ((job_ptr->user_id != uid) && !privileged) { |
| /* |
| * Fail if we are not coordinators of the current account or |
| * if we are changing an account and we are not coordinators |
| * of both src and dest accounts. |
| */ |
| if (!is_coord_oldacc || |
| (!is_coord_newacc && job_desc->account)) { |
| error("Security violation, JOB_UPDATE RPC from uid %u", |
| uid); |
| return ESLURM_USER_ID_MISSING; |
| } |
| } |
| |
| if (job_desc->burst_buffer) { |
| /* |
| * burst_buffer contents are validated at job submit time and |
| * data is possibly being staged at later times. It can not |
| * be changed except to clear the value on a completed job and |
| * purge the record in order to recover from a failure mode |
| */ |
| if (IS_JOB_COMPLETED(job_ptr) && privileged && |
| (job_desc->burst_buffer[0] == '\0')) { |
| xfree(job_ptr->burst_buffer); |
| last_job_update = now; |
| } else { |
| error_code = ESLURM_NOT_SUPPORTED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->array_inx && job_ptr->array_recs) { |
| int throttle; |
| throttle = strtoll(job_desc->array_inx, (char **) NULL, 10); |
| if (throttle >= 0) { |
| info("%s: set max_run_tasks to %d for job array %pJ", |
| __func__, throttle, job_ptr); |
| job_ptr->array_recs->max_run_tasks = throttle; |
| } else { |
| info("%s: invalid max_run_tasks of %d for job array %pJ, ignored", |
| __func__, throttle, job_ptr); |
| error_code = ESLURM_BAD_TASK_COUNT; |
| } |
| /* |
| * Even if the job is complete, permit changing |
| * ArrayTaskThrottle for other elements of the task array |
| */ |
| if (IS_JOB_FINISHED(job_ptr)) |
| goto fini; |
| } |
| |
| if (IS_JOB_FINISHED(job_ptr)) { |
| error_code = ESLURM_JOB_FINISHED; |
| goto fini; |
| } |
| |
| /* |
| * Validate before job_submit_g_modify() so that the job_submit |
| * plugin can make changes to the field without triggering an auth |
| * issue. |
| */ |
| if (job_desc->admin_comment && !validate_super_user(uid)) { |
| error("Attempt to change admin_comment for %pJ", job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| goto fini; |
| } |
| |
| /* Save before submit plugin potentially modifies it. */ |
| user_site_factor = job_desc->site_factor; |
| |
| if (job_desc->user_id == SLURM_AUTH_NOBODY) { |
| /* |
| * Used by job_submit/lua to find default partition and |
| * access control logic below to validate partition change |
| */ |
| job_desc->user_id = job_ptr->user_id; |
| } |
| error_code = job_submit_g_modify(job_desc, job_ptr, uid, err_msg); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| |
| error_code = _test_job_desc_fields(job_desc); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| |
| /* Do not update MCS label unless explicitly provided */ |
| if (job_desc->mcs_label) { |
| /* Only pending jobs can be updated */ |
| if (!IS_JOB_PENDING(job_ptr)) |
| return ESLURM_JOB_NOT_PENDING; |
| /* This is an attempt to explicitly reset the value */ |
| if (job_desc->mcs_label[0] == '\0') |
| xfree(job_desc->mcs_label); |
| |
| if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label)) { |
| if (!job_desc->mcs_label) |
| error("Failed to update job: No valid mcs_label found"); |
| else |
| error("Failed to update job: Invalid mcs-label: %s", |
| job_desc->mcs_label); |
| return ESLURM_INVALID_MCS_LABEL; |
| } |
| } |
| |
| memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set)); |
| acct_policy_limit_set.tres = tres; |
| |
| if (privileged) { |
| /* set up the acct_policy if we are at least an operator */ |
| for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) |
| acct_policy_limit_set.tres[tres_pos] = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.time = ADMIN_SET_LIMIT; |
| acct_policy_limit_set.qos = ADMIN_SET_LIMIT; |
| } else |
| memset(tres, 0, sizeof(tres)); |
| |
| detail_ptr = job_ptr->details; |
| if (detail_ptr) |
| mc_ptr = detail_ptr->mc_ptr; |
| last_job_update = now; |
| |
| /* |
| * Check to see if the new requested job_desc exceeds any |
| * existing limit. If it passes, cool, we will check the new |
| * association/qos/part later in the code and fail if it is wrong. |
| * |
| * If it doesn't pass, this means some limit was exceeded before the |
| * update request so let's keep the user continue screwing up herself |
| * with the limit if it is what she wants. We do this by not exiting |
| * on the later call to acct_policy_validate() if it fails. |
| * |
| * We will also prevent the update to return an error code that is |
| * confusing since many things could successfully update and we are now |
| * just already violating a limit. The job won't be allowed to run, |
| * but it will allow the update to happen which is most likely what |
| * was desired. |
| * |
| * Changes in between this check and the next acct_policy_validate() |
| * will not be constrained to accounting enforce limits. |
| */ |
| orig_time_limit = job_desc->time_limit; |
| |
| |
| /* |
| * We need to figure out if we changed task cnt. |
| */ |
| _figure_out_num_tasks(job_desc, job_ptr); |
| |
| memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt)); |
| job_desc->tres_req_cnt = tres_req_cnt; |
| tres_req_cnt_set = true; |
| |
| acct_limit_already_exceeded = false; |
| |
| if (!privileged && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) { |
| if (!acct_policy_validate(job_desc, job_ptr->part_ptr, |
| job_ptr->part_ptr_list, |
| job_ptr->assoc_ptr, job_ptr->qos_ptr, |
| NULL, &acct_policy_limit_set, |
| true)) { |
| debug("%s: already exceeded association's cpu, node, " |
| "memory or time limit for user %u", |
| __func__, job_desc->user_id); |
| acct_limit_already_exceeded = true; |
| } |
| job_desc->time_limit = orig_time_limit; |
| } |
| |
| /* |
| * The partition, assoc, qos, reservation, and req_node_bitmap all have |
| * to be set before checking later. So here we set them into temporary |
| * variables set in the job way later. |
| */ |
| if (job_desc->partition && |
| !xstrcmp(job_desc->partition, job_ptr->partition)) { |
| sched_debug("%s: new partition identical to old partition %pJ", |
| __func__, job_ptr); |
| } else if (job_desc->partition) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } |
| |
| error_code = _get_job_parts(job_desc, |
| &new_part_ptr, |
| &part_ptr_list, NULL); |
| |
| if (error_code != SLURM_SUCCESS) |
| ; |
| else if ((new_part_ptr->state_up & PARTITION_SUBMIT) == 0) |
| error_code = ESLURM_PARTITION_NOT_AVAIL; |
| else if (!part_ptr_list && |
| !xstrcmp(new_part_ptr->name, job_ptr->partition)) { |
| sched_debug("%s: 2 new partition identical to old partition %pJ", |
| __func__, job_ptr); |
| new_part_ptr = NULL; |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| use_part_ptr = new_part_ptr ? new_part_ptr : job_ptr->part_ptr; |
| |
| /* Check the account and the partition as both affect the association */ |
| if (job_desc->account || new_part_ptr) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else { |
| new_assoc_ptr = _retrieve_new_assoc(job_desc, job_ptr); |
| |
| if (!new_assoc_ptr) |
| error_code = errno; |
| else if (new_assoc_ptr == job_ptr->assoc_ptr) { |
| new_assoc_ptr = NULL; |
| sched_debug("%s: new association identical to old association %u", |
| __func__, job_ptr->job_id); |
| } |
| |
| /* |
| * Clear errno that may have been set by |
| * _retrieve_new_assoc. |
| */ |
| errno = 0; |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| use_assoc_ptr = new_assoc_ptr ? new_assoc_ptr : job_ptr->assoc_ptr; |
| |
| if (job_desc->qos) { |
| char *resv_name; |
| assoc_mgr_lock_t qos_read_lock = { |
| .qos = READ_LOCK, |
| }; |
| |
| if (job_desc->reservation |
| && job_desc->reservation[0] != '\0') |
| resv_name = job_desc->reservation; |
| else |
| resv_name = job_ptr->resv_name; |
| |
| assoc_mgr_lock(&qos_read_lock); |
| |
| error_code = |
| _get_qos_info(job_desc->qos, 0, &new_qos_list, |
| &new_qos_ptr, resv_name, use_assoc_ptr, |
| privileged, true, LOG_LEVEL_ERROR); |
| if ((error_code == SLURM_SUCCESS) && new_qos_ptr) { |
| if (!new_qos_list && |
| (job_ptr->qos_ptr == new_qos_ptr)) { |
| sched_debug("%s: new QOS identical to old QOS %pJ", |
| __func__, job_ptr); |
| new_qos_ptr = NULL; |
| } else if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| new_qos_ptr = NULL; |
| } |
| } |
| |
| if (new_qos_ptr) |
| new_qos_id = new_qos_ptr->id; |
| |
| assoc_mgr_unlock(&qos_read_lock); |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| use_qos_ptr = new_qos_ptr ? new_qos_ptr : job_ptr->qos_ptr; |
| |
| if (job_desc->bitflags & RESET_ACCRUE_TIME) { |
| if (!IS_JOB_PENDING(job_ptr) || !detail_ptr) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } else if (detail_ptr->accrue_time) { |
| uint64_t bit_flags = job_ptr->bit_flags; |
| acct_policy_remove_accrue_time(job_ptr, false); |
| /* |
| * Set the accrue_time to 'now' since we are not |
| * removing this job, but resetting the time |
| * instead. Since acct_policy_remove_accrue_time() |
| * will set this to 0 which will cause the next time |
| * through acct_policy_handle_accrue_time() to set |
| * things back to the original time thus making it as if |
| * nothing happened here. |
| * |
| * We also reset the bit_flags to be the same as it was |
| * before so we don't lose JOB_ACCRUE_OVER if set |
| * beforehand. |
| */ |
| job_ptr->bit_flags = bit_flags; |
| detail_ptr->accrue_time = now; |
| } |
| } |
| |
| /* |
| * Before any action over excluded or required nodes, we are going to |
| * reset them to their original values. |
| * |
| * We will decide later if those values need update, or even if we need |
| * to merge the negated required list into the excluded one (when |
| * -N < size required list). |
| */ |
| FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); |
| if (detail_ptr->exc_nodes) { |
| /* This error should never happen */ |
| if (node_name2bitmap(detail_ptr->exc_nodes, |
| false, &exc_bitmap, NULL)) { |
| sched_info("%s: Invalid excluded nodes list in job records: %s", |
| __func__, detail_ptr->exc_nodes); |
| FREE_NULL_BITMAP(exc_bitmap); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto fini; |
| } |
| detail_ptr->exc_node_bitmap = exc_bitmap; |
| exc_bitmap = NULL; |
| } |
| FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); |
| if (detail_ptr->req_nodes) { |
| /* This error should never happen */ |
| if (node_name2bitmap(detail_ptr->req_nodes, |
| false, &new_req_bitmap, NULL)) { |
| sched_info("%s: Invalid required nodes list in job records: %s", |
| __func__, detail_ptr->req_nodes); |
| FREE_NULL_BITMAP(new_req_bitmap); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto fini; |
| } |
| detail_ptr->req_node_bitmap = new_req_bitmap; |
| new_req_bitmap = NULL; |
| } |
| |
| if (job_desc->exc_nodes && detail_ptr && |
| !xstrcmp(job_desc->exc_nodes, detail_ptr->exc_nodes)) { |
| sched_debug("%s: new exc_nodes identical to old exc_nodes %s", |
| __func__, job_desc->exc_nodes); |
| } else if (job_desc->exc_nodes) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->exc_nodes[0] == '\0') { |
| xfree(detail_ptr->exc_nodes); |
| FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); |
| } else { |
| if (node_name2bitmap(job_desc->exc_nodes, false, |
| &exc_bitmap, NULL)) { |
| sched_error("%s: Invalid node list for update of %pJ: %s", |
| __func__, job_ptr, |
| job_desc->exc_nodes); |
| FREE_NULL_BITMAP(exc_bitmap); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| } |
| if (exc_bitmap) { |
| xfree(detail_ptr->exc_nodes); |
| detail_ptr->exc_nodes = |
| xstrdup(job_desc->exc_nodes); |
| FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); |
| detail_ptr->exc_node_bitmap = exc_bitmap; |
| sched_info("%s: setting exc_nodes to %s for %pJ", |
| __func__, job_desc->exc_nodes, job_ptr); |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* |
| * Must check req_nodes to set the job_ptr->details->req_node_bitmap |
| * before we validate it later. |
| */ |
| if (job_desc->req_nodes && detail_ptr && |
| !xstrcmp(job_desc->req_nodes, detail_ptr->req_nodes)) { |
| sched_debug("%s: new req_nodes identical to old req_nodes %s", |
| __func__, job_desc->req_nodes); |
| } else if (job_desc->req_nodes && detail_ptr && |
| (detail_ptr->task_dist & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_ARBITRARY) { |
| sched_info("%s: Cannot update node list of %pJ. Not compatible with arbitrary distribution", |
| __func__, job_ptr); |
| error_code = ESLURM_NOT_SUPPORTED; |
| goto fini; |
| } else if (job_desc->req_nodes && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| /* |
| * Use req_nodes to change the nodes associated with a running |
| * for lack of other field in the job request to use |
| */ |
| if (!permit_job_shrink()) { |
| error("%s: request to shrink %pJ denied by configuration", |
| __func__, job_ptr); |
| error_code = ESLURM_NOT_SUPPORTED; |
| goto fini; |
| } |
| |
| if ((job_desc->req_nodes[0] == '\0') || |
| node_name2bitmap(job_desc->req_nodes, false, |
| &new_req_bitmap, NULL) || |
| !bit_super_set(new_req_bitmap, job_ptr->node_bitmap) || |
| (job_ptr->details && job_ptr->details->expanding_jobid)) { |
| sched_info("%s: Invalid node list (%s) for %pJ update", |
| __func__, job_desc->req_nodes, job_ptr); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto fini; |
| } |
| |
| if (new_req_bitmap) { |
| node_record_t *node_ptr; |
| bitstr_t *rem_nodes; |
| |
| /* |
| * They requested a new list of nodes for the job. If |
| * the batch host isn't in this list, then deny this |
| * request. |
| */ |
| if (job_ptr->batch_flag) { |
| int batch_inx = node_name_get_inx( |
| job_ptr->batch_host); |
| |
| if (batch_inx == -1) |
| error("%s: Invalid batch host %s for %pJ; this should never happen", |
| __func__, job_ptr->batch_host, |
| job_ptr); |
| else if (!bit_test(new_req_bitmap, batch_inx)) { |
| error("%s: Batch host %s for %pJ is not in the requested node list %s. You cannot remove the batch host from a job when resizing.", |
| __func__, job_ptr->batch_host, |
| job_ptr, job_desc->req_nodes); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| goto fini; |
| } |
| } |
| |
| sched_info("%s: setting nodes to %s for %pJ", |
| __func__, job_desc->req_nodes, job_ptr); |
| job_pre_resize_acctg(job_ptr); |
| rem_nodes = bit_copy(job_ptr->node_bitmap); |
| bit_and_not(rem_nodes, new_req_bitmap); |
| abort_job_on_nodes(job_ptr, rem_nodes); |
| orig_job_node_bitmap = |
| bit_copy(job_ptr->job_resrcs->node_bitmap); |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(rem_nodes, &i)); |
| i++) { |
| kill_step_on_node(job_ptr, node_ptr, false); |
| excise_node_from_job(job_ptr, node_ptr); |
| } |
| /* Resize the core bitmaps of the job's steps */ |
| rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap); |
| |
| FREE_NULL_BITMAP(orig_job_node_bitmap); |
| FREE_NULL_BITMAP(rem_nodes); |
| (void) gs_job_start(job_ptr); |
| gres_stepmgr_job_build_details( |
| job_ptr->gres_list_alloc, |
| job_ptr->nodes, |
| &job_ptr->gres_detail_cnt, |
| &job_ptr->gres_detail_str, |
| &job_ptr->gres_used); |
| job_post_resize_acctg(job_ptr); |
| /* |
| * Since job_post_resize_acctg will restart |
| * things, don't do it again. |
| */ |
| update_accounting = false; |
| } else { |
| update_accounting = true; |
| } |
| FREE_NULL_BITMAP(new_req_bitmap); |
| } else if (job_desc->req_nodes) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->req_nodes[0] == '\0') |
| new_req_bitmap_given = true; |
| else { |
| if (node_name2bitmap(job_desc->req_nodes, false, |
| &new_req_bitmap, NULL)) { |
| sched_info("%s: Invalid node list for job_update: %s", |
| __func__, job_desc->req_nodes); |
| FREE_NULL_BITMAP(new_req_bitmap); |
| error_code = ESLURM_INVALID_NODE_NAME; |
| } else |
| new_req_bitmap_given = true; |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (new_req_bitmap_given) { |
| xfree(detail_ptr->req_nodes); |
| if (job_desc->req_nodes[0] != '\0') |
| detail_ptr->req_nodes = xstrdup(job_desc->req_nodes); |
| FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); |
| detail_ptr->req_node_bitmap = new_req_bitmap; |
| new_req_bitmap = NULL; |
| sched_info("%s: setting req_nodes to %s for %pJ", |
| __func__, job_desc->req_nodes, job_ptr); |
| } |
| |
| /* this needs to be after partition and QOS checks */ |
| if (job_desc->reservation |
| && (!xstrcmp(job_desc->reservation, job_ptr->resv_name) || |
| (!job_ptr->resv_name && job_desc->reservation[0] == '\0'))) { |
| sched_debug("%s: new reservation identical to old reservation %pJ", |
| __func__, job_ptr); |
| } else if (job_desc->reservation) { |
| if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) { |
| error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING; |
| } else { |
| job_record_t tmp_job_rec; |
| |
| memcpy(&tmp_job_rec, job_ptr, sizeof(job_record_t)); |
| tmp_job_rec.resv_name = xstrdup(job_desc->reservation); |
| tmp_job_rec.resv_ptr = NULL; |
| tmp_job_rec.resv_list = NULL; |
| tmp_job_rec.part_ptr = use_part_ptr; |
| tmp_job_rec.qos_ptr = use_qos_ptr; |
| tmp_job_rec.assoc_ptr = use_assoc_ptr; |
| |
| error_code = validate_job_resv(&tmp_job_rec); |
| |
| /* |
| * It doesn't matter what this is, just set it as |
| * failure will be NULL. |
| */ |
| new_resv_ptr = tmp_job_rec.resv_ptr; |
| new_resv_list = tmp_job_rec.resv_list; |
| |
| /* |
| * Make sure this job isn't using a partition or QOS |
| * that requires it to be in a reservation. |
| */ |
| if ((error_code == SLURM_SUCCESS) && !new_resv_ptr) { |
| if (use_part_ptr |
| && use_part_ptr->flags & PART_FLAG_REQ_RESV) |
| error_code = ESLURM_ACCESS_DENIED; |
| |
| if (use_qos_ptr |
| && use_qos_ptr->flags & QOS_FLAG_REQ_RESV) |
| error_code = ESLURM_INVALID_QOS; |
| } |
| |
| if (job_ptr->state_reason == WAIT_RESV_INVALID) |
| _release_job(job_ptr, uid); |
| |
| xfree(tmp_job_rec.resv_name); |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| } |
| |
| if (job_desc->cpus_per_tres || job_desc->tres_per_job || |
| job_desc->tres_per_node || job_desc->tres_per_socket || |
| job_desc->tres_per_task || job_desc->mem_per_tres || |
| (job_desc->bitflags & TASKS_CHANGED)) |
| gres_update = true; |
| if (gres_update) { |
| uint16_t orig_ntasks_per_socket = NO_VAL16; |
| gres_job_state_validate_t gres_js_val = { |
| .cpus_per_task = &job_desc->cpus_per_task, |
| .max_nodes = &job_desc->max_nodes, |
| .min_cpus = &job_desc->min_cpus, |
| .min_nodes = &job_desc->min_nodes, |
| .ntasks_per_node = &job_desc->ntasks_per_node, |
| .ntasks_per_socket = &job_desc->ntasks_per_socket, |
| .ntasks_per_tres = &job_desc->ntasks_per_tres, |
| .num_tasks = &job_desc->num_tasks, |
| .sockets_per_node = &job_desc->sockets_per_node, |
| |
| .gres_list = &gres_list, |
| }; |
| |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) || |
| (detail_ptr->expanding_jobid != 0)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } |
| if (!job_desc->cpus_per_tres) |
| job_desc->cpus_per_tres = |
| xstrdup(job_ptr->cpus_per_tres); |
| if (!job_desc->tres_freq) |
| job_desc->tres_freq = xstrdup(job_ptr->tres_freq); |
| if (!job_desc->tres_per_job) |
| job_desc->tres_per_job = xstrdup(job_ptr->tres_per_job); |
| if (!job_desc->tres_per_node) |
| job_desc->tres_per_node = |
| xstrdup(job_ptr->tres_per_node); |
| if (!job_desc->tres_per_socket) |
| job_desc->tres_per_socket = |
| xstrdup(job_ptr->tres_per_socket); |
| if (!job_desc->tres_per_task) |
| job_desc->tres_per_task = |
| xstrdup(job_ptr->tres_per_task); |
| if (!job_desc->mem_per_tres) |
| job_desc->mem_per_tres = xstrdup(job_ptr->mem_per_tres); |
| if (job_desc->num_tasks == NO_VAL) |
| job_desc->num_tasks = detail_ptr->num_tasks; |
| if (job_desc->min_cpus == NO_VAL) |
| job_desc->min_cpus = 0; /* min_cpus could decrease */ |
| if (job_desc->min_nodes == NO_VAL) |
| job_desc->min_nodes = detail_ptr->min_nodes; |
| if (job_desc->max_nodes == NO_VAL) |
| job_desc->max_nodes = detail_ptr->max_nodes; |
| if (job_desc->ntasks_per_node == NO_VAL16) |
| job_desc->ntasks_per_node = detail_ptr->ntasks_per_node; |
| if ((job_desc->ntasks_per_socket == NO_VAL16) && |
| (detail_ptr->mc_ptr) && |
| (detail_ptr->mc_ptr->ntasks_per_socket != INFINITE16)) { |
| job_desc->ntasks_per_socket = |
| mc_ptr->ntasks_per_socket; |
| orig_ntasks_per_socket = job_desc->ntasks_per_socket; |
| } |
| if (job_desc->sockets_per_node == NO_VAL16) |
| job_desc->sockets_per_node = |
| detail_ptr->mc_ptr->sockets_per_node; |
| if (job_desc->cpus_per_task == NO_VAL16) |
| job_desc->cpus_per_task = |
| detail_ptr->orig_cpus_per_task; |
| if (!job_desc->ntasks_per_tres) |
| job_desc->ntasks_per_tres = detail_ptr->ntasks_per_tres; |
| |
| gres_js_val.cpus_per_tres = job_desc->cpus_per_tres; |
| gres_js_val.mem_per_tres = job_desc->mem_per_tres; |
| gres_js_val.tres_freq = job_desc->tres_freq; |
| gres_js_val.tres_per_job = job_desc->tres_per_job; |
| gres_js_val.tres_per_node = job_desc->tres_per_node; |
| gres_js_val.tres_per_socket = job_desc->tres_per_socket; |
| gres_js_val.tres_per_task = job_desc->tres_per_task; |
| |
| if ((error_code = gres_job_state_validate(&gres_js_val))) { |
| sched_info("%s: invalid GRES for %pJ", |
| __func__, job_ptr); |
| goto fini; |
| } |
| if (job_desc->num_tasks == detail_ptr->num_tasks) |
| job_desc->num_tasks = NO_VAL; /* Unchanged */ |
| if ((job_desc->min_cpus == detail_ptr->min_cpus) || |
| (job_desc->min_cpus == 0)) /* Unchanged */ |
| job_desc->min_cpus = NO_VAL; |
| if (job_desc->min_nodes == detail_ptr->min_nodes) |
| job_desc->min_nodes = NO_VAL; /* Unchanged */ |
| if (job_desc->max_nodes == detail_ptr->max_nodes) |
| job_desc->max_nodes = NO_VAL; /* Unchanged */ |
| if (job_desc->ntasks_per_node == detail_ptr->ntasks_per_node) |
| job_desc->ntasks_per_node = NO_VAL16; /* Unchanged */ |
| if (job_desc->ntasks_per_socket == orig_ntasks_per_socket) |
| job_desc->ntasks_per_socket = NO_VAL16; /* Unchanged */ |
| if (job_desc->sockets_per_node == |
| detail_ptr->mc_ptr->sockets_per_node) |
| job_desc->sockets_per_node = NO_VAL16; |
| if (job_desc->cpus_per_task == detail_ptr->cpus_per_task) |
| job_desc->cpus_per_task = NO_VAL16; /* Unchanged */ |
| if (job_desc->ntasks_per_tres == detail_ptr->ntasks_per_tres) |
| job_desc->ntasks_per_tres = 0; |
| if (!xstrcmp(job_desc->cpus_per_tres, job_ptr->cpus_per_tres)) |
| xfree(job_desc->cpus_per_tres); |
| if (!xstrcmp(job_desc->tres_freq, job_ptr->tres_freq)) |
| xfree(job_desc->tres_freq); |
| if (!xstrcmp(job_desc->tres_per_job, job_ptr->tres_per_job)) |
| xfree(job_desc->tres_per_job); |
| if (!xstrcmp(job_desc->tres_per_node, job_ptr->tres_per_node)) |
| xfree(job_desc->tres_per_node); |
| if (!xstrcmp(job_desc->tres_per_socket, |
| job_ptr->tres_per_socket)) |
| xfree(job_desc->tres_per_socket); |
| if (!xstrcmp(job_desc->tres_per_task, job_ptr->tres_per_task)) |
| xfree(job_desc->tres_per_task); |
| if (!xstrcmp(job_desc->mem_per_tres, job_ptr->mem_per_tres)) |
| xfree(job_desc->mem_per_tres); |
| |
| } |
| |
| if ((job_desc->min_nodes != NO_VAL) && |
| (job_desc->min_nodes != INFINITE)) { |
| uint32_t min_cpus = (job_desc->pn_min_cpus != NO_VAL16 ? |
| job_desc->pn_min_cpus : detail_ptr->pn_min_cpus) * |
| job_desc->min_nodes; |
| uint32_t num_cpus = job_desc->min_cpus != NO_VAL ? |
| job_desc->min_cpus : |
| IS_JOB_PENDING(job_ptr) ? |
| job_ptr->tres_req_cnt[TRES_ARRAY_CPU] : |
| job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU]; |
| uint32_t num_tasks = job_desc->num_tasks != NO_VAL ? |
| job_desc->num_tasks : detail_ptr->num_tasks; |
| |
| if (!num_tasks) { |
| num_tasks = job_desc->min_nodes; |
| } else if (num_tasks < job_desc->min_nodes) { |
| info("%s: adjusting num_tasks (prev: %u) to be at least min_nodes: %u", |
| __func__, num_tasks, job_desc->min_nodes); |
| num_tasks = job_desc->min_nodes; |
| if (IS_JOB_PENDING(job_ptr)) |
| job_desc->num_tasks = num_tasks; |
| } |
| |
| num_tasks *= job_desc->cpus_per_task != NO_VAL16 ? |
| job_desc->cpus_per_task : detail_ptr->cpus_per_task; |
| num_tasks = MAX(num_tasks, min_cpus); |
| if (num_tasks > num_cpus) { |
| info("%s: adjusting min_cpus (prev: %u) to be at least : %u", |
| __func__, num_cpus, num_tasks); |
| job_desc->min_cpus = num_tasks; |
| |
| job_desc->pn_min_memory = |
| job_desc->pn_min_memory != NO_VAL64 ? |
| job_desc->pn_min_memory : |
| detail_ptr->pn_min_memory; |
| } |
| |
| assoc_mgr_lock(&locks); |
| |
| if (!job_desc->licenses) { |
| license_set_job_tres_cnt(job_ptr->license_list, |
| job_desc->tres_req_cnt, |
| true); |
| } |
| assoc_mgr_unlock(&locks); |
| |
| |
| job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes; |
| } |
| |
| if (job_desc->min_cpus != NO_VAL) |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus; |
| else if ((job_desc->pn_min_cpus != NO_VAL16) && |
| (job_desc->pn_min_cpus != 0)) { |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU] = |
| job_desc->pn_min_cpus * |
| (job_desc->min_nodes != NO_VAL ? |
| job_desc->min_nodes : |
| detail_ptr ? detail_ptr->min_nodes : 1); |
| job_desc->min_cpus = job_desc->tres_req_cnt[TRES_ARRAY_CPU]; |
| } else if (job_desc->bitflags & TASKS_CHANGED) { |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus = |
| job_desc->num_tasks; |
| } |
| |
| mem_req = |
| job_get_tres_mem(NULL, |
| job_desc->pn_min_memory, |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU] ? |
| job_desc->tres_req_cnt[TRES_ARRAY_CPU] : |
| job_ptr->tres_req_cnt[TRES_ARRAY_CPU], |
| job_desc->min_nodes != NO_VAL ? |
| job_desc->min_nodes : |
| detail_ptr ? detail_ptr->min_nodes : 1, |
| use_part_ptr, |
| gres_list ? gres_list : job_ptr->gres_list_req, |
| (job_desc->pn_min_memory != NO_VAL64), |
| job_desc->sockets_per_node, |
| job_desc->num_tasks); |
| if (mem_req) |
| job_desc->tres_req_cnt[TRES_ARRAY_MEM] = mem_req; |
| |
| if (gres_update) { |
| gres_stepmgr_set_job_tres_cnt( |
| gres_list, |
| job_desc->tres_req_cnt[TRES_ARRAY_NODE], |
| job_desc->tres_req_cnt, false); |
| } |
| |
| /* Check if we are clearing licenses */ |
| if (job_desc->licenses && !job_desc->licenses[0]) |
| job_desc->bitflags |= RESET_LIC_JOB; |
| if (job_desc->tres_per_task && |
| !xstrcasestr(job_desc->tres_per_task, "license/")) |
| job_desc->bitflags |= RESET_LIC_TASK; |
| |
| _set_tot_license_req(job_desc, job_ptr); |
| |
| if (job_desc->licenses_tot && !xstrcmp(job_desc->licenses_tot, |
| job_ptr->licenses)) { |
| sched_debug("%s: new licenses identical to old licenses \"%s\"", |
| __func__, job_ptr->licenses); |
| } else if (job_desc->licenses_tot) { |
| bool pending = IS_JOB_PENDING(job_ptr); |
| license_list = |
| license_validate(job_desc->licenses_tot, true, true, |
| false, |
| pending ? job_desc->tres_req_cnt : |
| NULL, |
| &valid_licenses); |
| |
| if (!valid_licenses) { |
| sched_info("%s: invalid licenses: %s", |
| __func__, job_desc->licenses_tot); |
| error_code = ESLURM_INVALID_LICENSES; |
| } else if (!license_list) |
| xfree(job_desc->licenses_tot); |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| |
| if (job_desc->min_nodes == INFINITE) { |
| /* Used by scontrol just to get current configuration info */ |
| job_desc->min_nodes = NO_VAL; |
| } |
| if ((job_desc->min_nodes != NO_VAL) && |
| (job_desc->min_nodes > job_ptr->node_cnt) && |
| !permit_job_expansion() && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| info("%s: Change of size for %pJ not supported", __func__, |
| job_ptr); |
| error_code = ESLURM_NOT_SUPPORTED; |
| goto fini; |
| } |
| |
| if (job_desc->req_switch != NO_VAL) { |
| job_ptr->req_switch = job_desc->req_switch; |
| info("%s: Change of switches to %u %pJ", |
| __func__, job_desc->req_switch, job_ptr); |
| } |
| if (job_desc->wait4switch != NO_VAL) { |
| job_ptr->wait4switch = _max_switch_wait(job_desc->wait4switch); |
| info("%s: Change of switch wait to %u secs %pJ", |
| __func__, job_ptr->wait4switch, job_ptr); |
| } |
| |
| if (job_desc->admin_comment) { |
| if (!validate_super_user(uid)) { |
| error("%s: Attempt to change admin_comment for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } else { |
| xfree(job_ptr->admin_comment); |
| job_ptr->admin_comment = |
| xstrdup(job_desc->admin_comment); |
| info("%s: setting admin_comment to %s for %pJ", |
| __func__, job_ptr->admin_comment, job_ptr); |
| } |
| } |
| |
| if (job_desc->comment) { |
| xfree(job_ptr->comment); |
| job_ptr->comment = xstrdup(job_desc->comment); |
| info("%s: setting comment to %s for %pJ", |
| __func__, job_ptr->comment, job_ptr); |
| } |
| |
| if (job_desc->extra) { |
| elem_t *head = NULL; |
| |
| error_code = extra_constraints_parse(job_desc->extra, &head); |
| if (error_code != SLURM_SUCCESS) { |
| error("%s: Invalid extra constraints", __func__); |
| } else { |
| xfree(job_ptr->extra); |
| job_ptr->extra = xstrdup(job_desc->extra); |
| FREE_NULL_EXTRA_CONSTRAINTS(job_ptr->extra_constraints); |
| job_ptr->extra_constraints = head; |
| info("%s: setting extra to %s for %pJ", |
| __func__, job_ptr->extra, job_ptr); |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* |
| * Now that we know what the new part, qos, and association are going |
| * to be lets check the limits. |
| * If a limit was already exceeded before this update |
| * request, let's assume it is expected and allow the change to happen. |
| */ |
| if (new_qos_ptr || new_assoc_ptr || new_part_ptr) { |
| list_t *use_part_list = new_part_ptr ? |
| part_ptr_list : job_ptr->part_ptr_list; |
| assoc_mgr_lock(&assoc_mgr_read_lock); |
| if ((error_code = _check_for_part_assocs( |
| use_part_list, use_assoc_ptr)) != SLURM_SUCCESS) { |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| goto fini; |
| } |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| |
| if (!privileged && |
| (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) { |
| uint32_t acct_reason = 0; |
| char *resv_orig = NULL; |
| bool resv_reset = false, min_reset = false, |
| max_reset = false, |
| time_min_reset = false; |
| if (!acct_policy_validate(job_desc, use_part_ptr, |
| use_part_list, |
| use_assoc_ptr, use_qos_ptr, |
| &acct_reason, |
| &acct_policy_limit_set, |
| true) |
| && !acct_limit_already_exceeded) { |
| info("%s: exceeded association/QOS limit for user %u: %s", |
| __func__, job_desc->user_id, |
| job_state_reason_string(acct_reason)); |
| error_code = ESLURM_ACCOUNTING_POLICY; |
| goto fini; |
| } |
| /* |
| * We need to set the various parts of job_desc below |
| * to something since _valid_job_part() will validate |
| * them. Note the reservation part is validated in the |
| * sub call to _part_access_check(). |
| */ |
| if (job_desc->min_nodes == NO_VAL) { |
| job_desc->min_nodes = detail_ptr->min_nodes; |
| min_reset = true; |
| } |
| if ((job_desc->max_nodes == NO_VAL) && |
| (detail_ptr->max_nodes != 0)) { |
| job_desc->max_nodes = detail_ptr->max_nodes; |
| max_reset = true; |
| } |
| |
| if ((job_desc->time_min == NO_VAL) && |
| (job_ptr->time_min != 0)) { |
| job_desc->time_min = job_ptr->time_min; |
| time_min_reset = true; |
| } |
| |
| /* |
| * This always gets reset, so don't worry about tracking |
| * it. |
| */ |
| if (job_desc->time_limit == NO_VAL) |
| job_desc->time_limit = job_ptr->time_limit; |
| |
| if (!job_desc->reservation |
| || job_desc->reservation[0] == '\0') { |
| resv_reset = true; |
| resv_orig = job_desc->reservation; |
| job_desc->reservation = job_ptr->resv_name; |
| } |
| |
| assoc_mgr_lock(&assoc_mgr_read_lock); |
| if ((error_code = _valid_job_part( |
| job_desc, uid, |
| new_req_bitmap_given ? |
| new_req_bitmap : |
| job_ptr->details->req_node_bitmap, |
| use_part_ptr, |
| new_part_ptr ? |
| part_ptr_list : job_ptr->part_ptr_list, |
| use_assoc_ptr, use_qos_ptr, NULL))) { |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| goto fini; |
| } |
| assoc_mgr_unlock(&assoc_mgr_read_lock); |
| |
| if (min_reset) |
| job_desc->min_nodes = NO_VAL; |
| if (max_reset) |
| job_desc->max_nodes = NO_VAL; |
| if (time_min_reset) |
| job_desc->time_min = NO_VAL; |
| if (resv_reset) |
| job_desc->reservation = resv_orig; |
| |
| job_desc->time_limit = orig_time_limit; |
| } |
| |
| /* |
| * Since we are successful to this point remove the job from the |
| * old qos/assoc's |
| */ |
| acct_policy_remove_job_submit(job_ptr, false); |
| acct_policy_remove_accrue_time(job_ptr, false); |
| } |
| |
| if (new_qos_ptr) { |
| /* Change QOS */ |
| job_ptr->qos_id = new_qos_id; |
| job_ptr->qos_ptr = new_qos_ptr; |
| FREE_NULL_LIST(job_ptr->qos_list); |
| job_ptr->qos_list = new_qos_list; |
| new_qos_list = NULL; |
| xfree(detail_ptr->qos_req); |
| detail_ptr->qos_req = job_desc->qos; |
| job_desc->qos = NULL; |
| |
| job_ptr->limit_set.qos = acct_policy_limit_set.qos; |
| |
| if (job_ptr->state_reason == FAIL_QOS) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| |
| info("%s: setting QOS to %s for %pJ", |
| __func__, detail_ptr->qos_req, job_ptr); |
| } |
| |
| if (new_assoc_ptr) { |
| /* Change account/association */ |
| xfree(job_ptr->account); |
| job_ptr->account = xstrdup(new_assoc_ptr->acct); |
| job_ptr->assoc_id = new_assoc_ptr->id; |
| job_ptr->assoc_ptr = new_assoc_ptr; |
| |
| if (job_ptr->state_reason == FAIL_ACCOUNT) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| |
| info("%s: setting account to %s for %pJ", |
| __func__, job_ptr->account, job_ptr); |
| } |
| |
| if (new_part_ptr) { |
| /* Change partition */ |
| job_ptr->part_ptr = new_part_ptr; |
| job_ptr->bit_flags &= ~JOB_PART_ASSIGNED; |
| |
| FREE_NULL_LIST(job_ptr->part_ptr_list); |
| job_ptr->part_ptr_list = part_ptr_list; |
| part_ptr_list = NULL; /* nothing to free */ |
| |
| rebuild_job_part_list(job_ptr); |
| |
| /* Rebuilt in priority/multifactor plugin */ |
| if (job_ptr->prio_mult) |
| xfree(job_ptr->prio_mult->priority_array); |
| |
| info("%s: setting partition to %s for %pJ", |
| __func__, job_desc->partition, job_ptr); |
| } |
| |
| /* Now add the job to the new qos/assoc's */ |
| if (new_qos_ptr || new_assoc_ptr || new_part_ptr) { |
| update_accounting = true; |
| acct_policy_add_job_submit(job_ptr, false); |
| } |
| |
| if (new_resv_ptr) { |
| FREE_NULL_LIST(job_ptr->resv_list); |
| xfree(job_ptr->resv_name); |
| job_ptr->resv_name = xstrdup(job_desc->reservation); |
| xfree(job_ptr->details->resv_req); |
| job_ptr->details->resv_req = xstrdup(job_desc->reservation); |
| job_ptr->resv_list = new_resv_list; |
| job_ptr->resv_id = new_resv_ptr->resv_id; |
| job_ptr->resv_ptr = new_resv_ptr; |
| |
| sched_info("%s: setting reservation to %s for %pJ", __func__, |
| job_ptr->resv_name, job_ptr); |
| update_accounting = true; |
| } else if (job_desc->reservation && |
| job_desc->reservation[0] == '\0' && |
| job_ptr->resv_name) { |
| FREE_NULL_LIST(job_ptr->resv_list); |
| xfree(job_ptr->resv_name); |
| job_ptr->resv_id = 0; |
| job_ptr->resv_ptr = NULL; |
| sched_info("%s: setting reservation to '' for %pJ", |
| __func__, job_ptr); |
| update_accounting = true; |
| } |
| |
| /* Reset min and max cpu counts as needed, ensure consistency */ |
| if (job_desc->min_cpus != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->min_cpus < 1) |
| error_code = ESLURM_INVALID_CPU_COUNT; |
| else { |
| save_min_cpus = detail_ptr->min_cpus; |
| detail_ptr->min_cpus = job_desc->min_cpus; |
| } |
| } |
| if (job_desc->max_cpus != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else { |
| save_max_cpus = detail_ptr->max_cpus; |
| detail_ptr->max_cpus = job_desc->max_cpus; |
| } |
| } |
| if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus && |
| (detail_ptr->max_cpus < detail_ptr->min_cpus)) { |
| error_code = ESLURM_INVALID_CPU_COUNT; |
| if (save_min_cpus) { |
| detail_ptr->min_cpus = save_min_cpus; |
| save_min_cpus = 0; |
| } |
| if (save_max_cpus) { |
| detail_ptr->max_cpus = save_max_cpus; |
| save_max_cpus = 0; |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) { |
| info("%s: setting min_cpus from %u to %u for %pJ", |
| __func__, save_min_cpus, detail_ptr->min_cpus, job_ptr); |
| job_ptr->limit_set.tres[TRES_ARRAY_CPU] = |
| acct_policy_limit_set.tres[TRES_ARRAY_CPU]; |
| detail_ptr->orig_min_cpus = job_desc->min_cpus; |
| update_accounting = true; |
| } |
| if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) { |
| info("%s: setting max_cpus from %u to %u for %pJ", |
| __func__, save_max_cpus, detail_ptr->max_cpus, job_ptr); |
| /* |
| * Always use the acct_policy_limit_set.* since if set by a |
| * super user it be set correctly |
| */ |
| job_ptr->limit_set.tres[TRES_ARRAY_CPU] = |
| acct_policy_limit_set.tres[TRES_ARRAY_CPU]; |
| detail_ptr->orig_max_cpus = job_desc->max_cpus; |
| update_accounting = true; |
| } |
| |
| if ((job_desc->pn_min_cpus != NO_VAL16) && |
| (job_desc->pn_min_cpus != 0)) { |
| |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| } else { |
| detail_ptr->pn_min_cpus = job_desc->pn_min_cpus; |
| detail_ptr->orig_pn_min_cpus = job_desc->pn_min_cpus; |
| info("%s: setting pn_min_cpus to %u for %pJ", |
| __func__, job_desc->pn_min_cpus, job_ptr); |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->cpus_per_task != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| } else if (job_desc->cpus_per_task == 0) { |
| error("%s: trying to set cpus_per_task to an erroneous value: %u", |
| __func__, job_desc->cpus_per_task); |
| error_code = ESLURM_INVALID_CPU_COUNT; |
| } else if (detail_ptr->cpus_per_task != |
| job_desc->cpus_per_task) { |
| info("%s: setting cpus_per_task from %u to %u for %pJ", |
| __func__, detail_ptr->cpus_per_task, |
| job_desc->cpus_per_task, job_ptr); |
| detail_ptr->cpus_per_task = job_desc->cpus_per_task; |
| detail_ptr->orig_cpus_per_task = |
| job_desc->cpus_per_task; |
| } |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* Reset min and max node counts as needed, ensure consistency */ |
| if (job_desc->min_nodes != NO_VAL) { |
| if (job_ptr->details && |
| (job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_ARBITRARY) { |
| info("%s: Cannot update node count of %pJ. Not compatible with arbitrary distribution", |
| __func__, job_ptr); |
| error_code = ESLURM_NOT_SUPPORTED; |
| } else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) |
| ; /* shrink running job, processed later */ |
| else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->min_nodes < 1) { |
| info("%s: min_nodes < 1 for %pJ", __func__, job_ptr); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| } else { |
| /* Resize of pending job */ |
| save_min_nodes = detail_ptr->min_nodes; |
| detail_ptr->min_nodes = job_desc->min_nodes; |
| } |
| } |
| if (job_desc->max_nodes != NO_VAL) { |
| if ((IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) && |
| (job_desc->max_nodes == job_desc->min_nodes)) |
| ; /* shrink running job, processed later */ |
| else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else { |
| save_max_nodes = detail_ptr->max_nodes; |
| detail_ptr->max_nodes = job_desc->max_nodes; |
| } |
| } |
| if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes && |
| (detail_ptr->max_nodes < detail_ptr->min_nodes)) { |
| info("%s: max_nodes < min_nodes (%u < %u) for %pJ", __func__, |
| detail_ptr->max_nodes, detail_ptr->min_nodes, |
| job_ptr); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| if (save_min_nodes) { |
| detail_ptr->min_nodes = save_min_nodes; |
| save_min_nodes = 0; |
| } |
| if (save_max_nodes) { |
| detail_ptr->max_nodes = save_max_nodes; |
| save_max_nodes = 0; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (save_min_nodes && (save_min_nodes!= detail_ptr->min_nodes)) { |
| info("%s: setting min_nodes from %u to %u for %pJ", __func__, |
| save_min_nodes, detail_ptr->min_nodes, job_ptr); |
| job_ptr->limit_set.tres[TRES_ARRAY_NODE] = |
| acct_policy_limit_set.tres[TRES_ARRAY_NODE]; |
| update_accounting = true; |
| FREE_NULL_BITMAP(detail_ptr->job_size_bitmap); |
| } |
| if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) { |
| info("%s: setting max_nodes from %u to %u for %pJ", __func__, |
| save_max_nodes, detail_ptr->max_nodes, job_ptr); |
| /* |
| * Always use the acct_policy_limit_set.* since if set by a |
| * super user it be set correctly |
| */ |
| job_ptr->limit_set.tres[TRES_ARRAY_NODE] = |
| acct_policy_limit_set.tres[TRES_ARRAY_NODE]; |
| update_accounting = true; |
| FREE_NULL_BITMAP(detail_ptr->job_size_bitmap); |
| } |
| if (job_desc->job_size_str) { |
| if ((!IS_JOB_PENDING(job_ptr)) || !detail_ptr) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (detail_ptr->min_nodes && detail_ptr->max_nodes && |
| (detail_ptr->max_nodes != NO_VAL) && |
| (detail_ptr->max_nodes < MAX_JOB_SIZE_BITMAP)) { |
| bitstr_t *new_size_bitmap; |
| new_size_bitmap = bit_alloc(detail_ptr->max_nodes + 1); |
| if (bit_unfmt(new_size_bitmap, |
| job_desc->job_size_str)) { |
| FREE_NULL_BITMAP(new_size_bitmap); |
| info("%s: %pJ: invalid job_size_str:%s", |
| __func__, job_ptr, job_desc->job_size_str); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| } else { |
| FREE_NULL_BITMAP(detail_ptr->job_size_bitmap); |
| detail_ptr->job_size_bitmap = new_size_bitmap; |
| } |
| } else { |
| info("%s: %pJ: invalid job_size_str:%s", __func__, |
| job_ptr, job_desc->job_size_str); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| } |
| |
| } else { |
| error_code = _unroll_min_max_node(job_ptr); |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((job_desc->num_tasks != NO_VAL) && |
| (job_desc->bitflags & TASKS_CHANGED)) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->num_tasks < 1) |
| error_code = ESLURM_BAD_TASK_COUNT; |
| else { |
| detail_ptr->num_tasks = job_desc->num_tasks; |
| /* |
| * Once you actually requested ntasks you will get |
| * SLURM_NTASKS in your environment. There is no way to |
| * remove that. |
| */ |
| if (job_desc->bitflags & JOB_NTASKS_SET) |
| job_ptr->bit_flags |= JOB_NTASKS_SET; |
| info("%s: setting num_tasks to %u for %pJ", |
| __func__, job_desc->num_tasks, job_ptr); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* |
| * If the job records now holds a required nodelist with more nodes than |
| * are required, translate this list into an exclusion of all nodes |
| * except those requested. |
| * |
| * Merge the resulting negated version into the excluded nodelist of the |
| * job. |
| */ |
| if (detail_ptr->req_node_bitmap && |
| (bit_set_count(detail_ptr->req_node_bitmap) > |
| detail_ptr->min_nodes)) { |
| if (!detail_ptr->exc_node_bitmap) |
| detail_ptr->exc_node_bitmap = |
| bit_alloc(node_record_count); |
| bit_or_not(detail_ptr->exc_node_bitmap, |
| detail_ptr->req_node_bitmap); |
| FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); |
| } |
| |
| if (job_desc->time_limit != NO_VAL) { |
| if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time) |
| error_code = ESLURM_JOB_FINISHED; |
| else if (job_ptr->time_limit == job_desc->time_limit) { |
| sched_debug("%s: new time limit identical to old time limit %pJ", |
| __func__, job_ptr); |
| } else if (privileged || |
| (job_ptr->time_limit > job_desc->time_limit)) { |
| time_t old_time = job_ptr->time_limit; |
| uint32_t use_time_min = job_desc->time_min != NO_VAL ? |
| job_desc->time_min : job_ptr->time_min; |
| if (old_time == INFINITE) /* one year in mins */ |
| old_time = (365 * 24 * 60); |
| if (job_desc->time_limit < use_time_min) { |
| sched_info("%s: attempt to set time_limit < time_min (%u < %u)", |
| __func__, |
| job_desc->time_limit, |
| use_time_min); |
| error_code = ESLURM_INVALID_TIME_MIN_LIMIT; |
| goto fini; |
| } |
| acct_policy_alter_job(job_ptr, job_desc->time_limit); |
| job_ptr->time_limit = job_desc->time_limit; |
| if (IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr)) { |
| if (job_ptr->preempt_time) { |
| ; /* Preemption in progress */ |
| } else if (job_ptr->time_limit == INFINITE) { |
| /* Set end time in one year */ |
| job_ptr->end_time = now + |
| (365 * 24 * 60 * 60); |
| } else { |
| /* |
| * Update end_time based upon change |
| * to preserve suspend time info |
| */ |
| job_ptr->end_time = job_ptr->end_time + |
| ((job_ptr->time_limit - |
| old_time) * 60); |
| } |
| if (job_ptr->end_time < now) |
| job_ptr->end_time = now; |
| job_ptr->end_time_exp = job_ptr->end_time; |
| } |
| sched_info("%s: setting time_limit to %u for %pJ", |
| __func__, job_desc->time_limit, job_ptr); |
| /* |
| * Always use the acct_policy_limit_set.* |
| * since if set by a super user it be set correctly |
| */ |
| job_ptr->limit_set.time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr && |
| (job_ptr->part_ptr->max_time >= |
| job_desc->time_limit)) { |
| job_ptr->time_limit = job_desc->time_limit; |
| sched_info("%s: setting time_limit to %u for %pJ", |
| __func__, job_desc->time_limit, job_ptr); |
| /* |
| * Always use the acct_policy_limit_set.* |
| * since if set by a super user it be set correctly |
| */ |
| job_ptr->limit_set.time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else { |
| sched_info("%s: Attempt to increase time limit for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((job_desc->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) { |
| if (job_desc->time_min > job_ptr->time_limit) { |
| info("%s: attempt to set TimeMin > TimeLimit (%u > %u)", |
| __func__, job_desc->time_min, job_ptr->time_limit); |
| error_code = ESLURM_INVALID_TIME_MIN_LIMIT; |
| } else if (job_ptr->time_min != job_desc->time_min) { |
| job_ptr->time_min = job_desc->time_min; |
| info("%s: setting TimeMin to %u for %pJ", |
| __func__, job_desc->time_min, job_ptr); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->end_time) { |
| if (!IS_JOB_RUNNING(job_ptr) || job_ptr->preempt_time) { |
| /* |
| * We may want to use this for deadline scheduling |
| * at some point in the future. For now only reset |
| * the time limit of running jobs. |
| */ |
| error_code = ESLURM_JOB_NOT_RUNNING; |
| } else if (job_desc->end_time < now) { |
| error_code = ESLURM_INVALID_TIME_VALUE; |
| } else if (privileged || |
| (job_ptr->end_time > job_desc->end_time)) { |
| int delta_t = job_desc->end_time - job_ptr->end_time; |
| job_ptr->end_time = job_desc->end_time; |
| job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */ |
| sched_info("%s: setting time_limit to %u for %pJ", |
| __func__, job_ptr->time_limit, job_ptr); |
| /* Always use the acct_policy_limit_set.* |
| * since if set by a super user it be set correctly */ |
| job_ptr->limit_set.time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else { |
| sched_info("%s: Attempt to extend end time for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| |
| if ((job_desc->deadline) && (!IS_JOB_RUNNING(job_ptr))) { |
| char time_str[256]; |
| slurm_make_time_str(&job_ptr->deadline, time_str, |
| sizeof(time_str)); |
| if (job_desc->deadline < now) { |
| error_code = ESLURM_INVALID_TIME_VALUE; |
| } else if (privileged) { |
| /* update deadline */ |
| job_ptr->deadline = job_desc->deadline; |
| sched_info("%s: setting deadline to %s for %pJ", |
| __func__, time_str, job_ptr); |
| /* |
| * Always use the acct_policy_limit_set.* |
| * since if set by a super user it be set correctly |
| */ |
| job_ptr->limit_set.time = acct_policy_limit_set.time; |
| update_accounting = true; |
| } else { |
| sched_info("%s: Attempt to extend end time for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->delay_boot != NO_VAL) { |
| job_ptr->delay_boot = job_desc->delay_boot; |
| sched_info("%s: setting delay_boot to %u for %pJ", |
| __func__, job_desc->delay_boot, job_ptr); |
| } |
| |
| if ((job_desc->requeue != NO_VAL16) && detail_ptr) { |
| detail_ptr->requeue = MIN(job_desc->requeue, 1); |
| sched_info("%s: setting requeue to %u for %pJ", |
| __func__, job_desc->requeue, job_ptr); |
| } |
| |
| if (job_desc->priority != NO_VAL) { |
| /* |
| * If we are doing time slicing we could update the |
| * priority of the job while running to give better |
| * position (larger time slices) than competing jobs |
| */ |
| if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_FINISHED; |
| else if (job_ptr->priority == job_desc->priority) { |
| debug("%s: setting priority to current value",__func__); |
| if ((job_ptr->priority == 0) && privileged) { |
| /* |
| * Authorized user can change from user hold |
| * to admin hold or admin hold to user hold |
| */ |
| if (job_desc->alloc_sid == ALLOC_SID_USER_HOLD) |
| job_ptr->state_reason = WAIT_HELD_USER; |
| else |
| job_ptr->state_reason = WAIT_HELD; |
| } |
| } else if ((job_ptr->priority == 0) && |
| (job_desc->priority == INFINITE) && |
| (privileged || |
| (job_ptr->state_reason == WAIT_RESV_DELETED) || |
| (job_ptr->state_reason == WAIT_HELD_USER))) { |
| _release_job(job_ptr, uid); |
| } else if ((job_ptr->priority == 0) && |
| (job_desc->priority != INFINITE)) { |
| info("%s: ignore priority reset request on held %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_JOB_HELD; |
| } else if (privileged || |
| (job_ptr->priority > job_desc->priority)) { |
| if (job_desc->priority != 0) |
| job_ptr->details->nice = NICE_OFFSET; |
| if (job_desc->priority == INFINITE) { |
| job_ptr->direct_set_prio = 0; |
| set_job_prio(job_ptr); |
| } else if (job_desc->priority == 0) { |
| _hold_job(job_ptr, uid); |
| } else { |
| if (privileged) { |
| /* |
| * Only administrator can make |
| * persistent change to a job's |
| * priority, except holding a job |
| */ |
| job_ptr->direct_set_prio = 1; |
| } else |
| error_code = ESLURM_PRIO_RESET_FAIL; |
| job_ptr->priority = job_desc->priority; |
| if (job_ptr->part_ptr_list && |
| job_ptr->prio_mult && |
| job_ptr->prio_mult->priority_array) { |
| int i, j = list_count( |
| job_ptr->part_ptr_list); |
| for (i = 0; i < j; i++) { |
| job_ptr->prio_mult-> |
| priority_array[i] = |
| job_desc->priority; |
| } |
| } |
| } |
| sched_info("%s: set priority to %u for %pJ", |
| __func__, job_ptr->priority, job_ptr); |
| update_accounting = true; |
| if (job_ptr->priority == 0) { |
| if (!privileged || (job_desc->alloc_sid == |
| ALLOC_SID_USER_HOLD)) { |
| job_ptr->state_reason = WAIT_HELD_USER; |
| } else |
| job_ptr->state_reason = WAIT_HELD; |
| xfree(job_ptr->state_desc); |
| |
| /* remove pending remote sibling jobs */ |
| if (IS_JOB_PENDING(job_ptr) && |
| !IS_JOB_REVOKED(job_ptr)) { |
| fed_mgr_job_revoke_sibs(job_ptr); |
| } |
| } |
| } else if ((job_ptr->priority != 0) && |
| (job_desc->priority == INFINITE)) { |
| /* |
| * If the job was already released, ignore another |
| * release request. |
| */ |
| debug("%s: %pJ already released, ignoring request", |
| __func__, job_ptr); |
| } else { |
| sched_error("Attempt to modify priority for %pJ", |
| job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } else if (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) { |
| /* |
| * We need to check if the state is BadConstraints here since we |
| * are altering the job the bad constraint might have gone |
| * away. If it did the priority (0) wouldn't get reset so the |
| * job would just go into JobAdminHeld otherwise. |
| */ |
| job_ptr->direct_set_prio = 0; |
| set_job_prio(job_ptr); |
| sched_debug("%s: job request changed somehow, removing the bad constraints to reevaluate %pJ uid %u", |
| __func__, job_ptr, uid); |
| job_ptr->state_reason = WAIT_NO_REASON; |
| } |
| |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->nice != NO_VAL) { |
| if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) |
| error_code = ESLURM_JOB_FINISHED; |
| else if (job_ptr->details && |
| (job_ptr->details->nice == job_desc->nice)) |
| sched_debug("%s: new nice identical to old nice %pJ", |
| __func__, job_ptr); |
| else if (job_ptr->direct_set_prio && job_ptr->priority != 0) |
| info("%s: ignore nice set request on %pJ", |
| __func__, job_ptr); |
| else if (privileged || (job_desc->nice >= NICE_OFFSET)) { |
| if (!xstrcmp(slurm_conf.priority_type, |
| "priority/basic")) { |
| int64_t new_prio = job_ptr->priority; |
| new_prio += job_ptr->details->nice; |
| new_prio -= job_desc->nice; |
| job_ptr->priority = MAX(new_prio, 2); |
| sched_info("%s: nice changed from %u to %u, setting priority to %u for %pJ", |
| __func__, job_ptr->details->nice, |
| job_desc->nice, |
| job_ptr->priority, job_ptr); |
| } |
| job_ptr->details->nice = job_desc->nice; |
| update_accounting = true; |
| } else { |
| sched_error("%s: Attempt to modify nice for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->pn_min_memory != NO_VAL64) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| } else if (job_desc->pn_min_memory |
| == detail_ptr->pn_min_memory) { |
| sched_debug("%s: new memory limit identical to old limit for %pJ", |
| __func__, job_ptr); |
| } else { |
| char *entity; |
| if (job_desc->pn_min_memory == MEM_PER_CPU) { |
| /* Map --mem-per-cpu=0 to --mem=0 */ |
| job_desc->pn_min_memory = 0; |
| } |
| if (job_desc->pn_min_memory & MEM_PER_CPU) |
| entity = "cpu"; |
| else |
| entity = "job"; |
| |
| detail_ptr->pn_min_memory = job_desc->pn_min_memory; |
| detail_ptr->orig_pn_min_memory = |
| job_desc->pn_min_memory; |
| job_ptr->bit_flags |= JOB_MEM_SET; |
| sched_info("%s: setting min_memory_%s to %"PRIu64" for %pJ", |
| __func__, entity, |
| (job_desc->pn_min_memory & (~MEM_PER_CPU)), |
| job_ptr); |
| /* |
| * Always use the acct_policy_limit_set.* |
| * since if set by a super user it will be set correctly |
| */ |
| job_ptr->limit_set.tres[TRES_ARRAY_MEM] = |
| acct_policy_limit_set.tres[TRES_ARRAY_MEM]; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->pn_min_tmp_disk != NO_VAL) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| } else { |
| detail_ptr->pn_min_tmp_disk = |
| job_desc->pn_min_tmp_disk; |
| |
| sched_info("%s: setting job_min_tmp_disk to %u for %pJ", |
| __func__, job_desc->pn_min_tmp_disk, |
| job_ptr); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->sockets_per_node != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } else { |
| mc_ptr->sockets_per_node = job_desc->sockets_per_node; |
| sched_info("%s: setting sockets_per_node to %u for %pJ", |
| __func__, job_desc->sockets_per_node, |
| job_ptr); |
| } |
| } |
| |
| if (job_desc->cores_per_socket != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } else { |
| mc_ptr->cores_per_socket = job_desc->cores_per_socket; |
| sched_info("%s: setting cores_per_socket to %u for %pJ", |
| __func__, job_desc->cores_per_socket, |
| job_ptr); |
| } |
| } |
| |
| if ((job_desc->threads_per_core != NO_VAL16)) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } else { |
| mc_ptr->threads_per_core = job_desc->threads_per_core; |
| sched_info("%s: setting threads_per_core to %u for %pJ", |
| __func__, job_desc->threads_per_core, |
| job_ptr); |
| } |
| } |
| |
| if (job_desc->shared != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| } else if (!privileged) { |
| sched_error("%s: Attempt to change sharing for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } else { |
| if (job_desc->shared) { |
| detail_ptr->share_res = 1; |
| detail_ptr->whole_node = 0; |
| } else { |
| detail_ptr->share_res = 0; |
| } |
| sched_info("%s: setting shared to %u for %pJ", |
| __func__, job_desc->shared, job_ptr); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->contiguous != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (privileged || |
| (detail_ptr->contiguous > job_desc->contiguous)) { |
| detail_ptr->contiguous = job_desc->contiguous; |
| sched_info("%s: setting contiguous to %u for %pJ", |
| __func__, job_desc->contiguous, job_ptr); |
| } else { |
| sched_error("%s: Attempt to add contiguous for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->core_spec != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (privileged && |
| (slurm_conf.conf_flags & CONF_FLAG_ASRU)) { |
| if (job_desc->core_spec == INFINITE16) |
| detail_ptr->core_spec = NO_VAL16; |
| else |
| detail_ptr->core_spec = job_desc->core_spec; |
| sched_info("%s: setting core_spec to %u for %pJ", |
| __func__, detail_ptr->core_spec, job_ptr); |
| if (detail_ptr->core_spec != NO_VAL16) |
| detail_ptr->whole_node |= WHOLE_NODE_REQUIRED; |
| } else { |
| sched_error("%s Attempt to modify core_spec for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->features && detail_ptr && |
| !xstrcmp(job_desc->features, detail_ptr->features)) { |
| sched_debug("%s: new features identical to old features %s", |
| __func__, job_desc->features); |
| } else if (job_desc->features) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->features[0] != '\0') { |
| char *old_features = detail_ptr->features; |
| list_t *old_list = detail_ptr->feature_list; |
| detail_ptr->features = xstrdup(job_desc->features); |
| detail_ptr->feature_list = NULL; |
| if (build_feature_list(job_ptr, false, false)) { |
| sched_info("%s: invalid features(%s) for %pJ", |
| __func__, job_desc->features, |
| job_ptr); |
| FREE_NULL_LIST(detail_ptr->feature_list); |
| xfree(detail_ptr->features); |
| detail_ptr->features = old_features; |
| detail_ptr->feature_list = old_list; |
| error_code = ESLURM_INVALID_FEATURE; |
| } else if (node_features_g_job_valid( |
| detail_ptr->features, |
| detail_ptr->feature_list) != |
| SLURM_SUCCESS) { |
| FREE_NULL_LIST(detail_ptr->feature_list); |
| xfree(detail_ptr->features); |
| detail_ptr->features = old_features; |
| detail_ptr->feature_list = old_list; |
| error_code = ESLURM_INVALID_FEATURE; |
| } else { |
| sched_info("%s: setting features to %s for %pJ", |
| __func__, job_desc->features, |
| job_ptr); |
| xfree(old_features); |
| FREE_NULL_LIST(old_list); |
| detail_ptr->features_use = detail_ptr->features; |
| detail_ptr->feature_list_use = |
| detail_ptr->feature_list; |
| } |
| } else { |
| sched_info("%s: cleared features for %pJ", __func__, |
| job_ptr); |
| xfree(detail_ptr->features); |
| FREE_NULL_LIST(detail_ptr->feature_list); |
| detail_ptr->features_use = NULL; |
| detail_ptr->feature_list_use = NULL; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| /* |
| * Update the job's preferred features from job_desc->prefer. |
| * - A string identical to the current one is only logged. |
| * - Otherwise the job must be pending and have a details record, |
| * else ESLURM_JOB_NOT_PENDING is returned. |
| * - A non-empty string replaces the old value only if it passes both |
| * build_feature_list() and node_features_g_job_valid(); on failure |
| * the previous prefer string and list are restored and |
| * ESLURM_INVALID_PREFER is returned. |
| * - An empty string clears the prefer string and list. |
| */ |
| if (job_desc->prefer && detail_ptr && |
| !xstrcmp(job_desc->prefer, detail_ptr->prefer)) { |
| sched_debug("%s: new prefer identical to old prefer %s", |
| __func__, job_desc->prefer); |
| } else if (job_desc->prefer) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (job_desc->prefer[0] != '\0') { |
| /* Keep the old values so they can be restored on failure */ |
| char *old_prefer = detail_ptr->prefer; |
| list_t *old_list = detail_ptr->prefer_list; |
| detail_ptr->prefer = xstrdup(job_desc->prefer); |
| detail_ptr->prefer_list = NULL; |
| if (build_feature_list(job_ptr, true, false)) { |
| sched_info("%s: invalid prefer(%s) for %pJ", |
| __func__, job_desc->prefer, |
| job_ptr); |
| FREE_NULL_LIST(detail_ptr->prefer_list); |
| xfree(detail_ptr->prefer); |
| detail_ptr->prefer = old_prefer; |
| detail_ptr->prefer_list = old_list; |
| error_code = ESLURM_INVALID_PREFER; |
| } else if (node_features_g_job_valid( |
| detail_ptr->prefer, |
| detail_ptr->prefer_list) != |
| SLURM_SUCCESS) { |
| FREE_NULL_LIST(detail_ptr->prefer_list); |
| xfree(detail_ptr->prefer); |
| /* |
| * Roll back into the prefer fields. Restoring into |
| * features/feature_list would overwrite the job's |
| * feature constraint and leave prefer unset. |
| */ |
| detail_ptr->prefer = old_prefer; |
| detail_ptr->prefer_list = old_list; |
| error_code = ESLURM_INVALID_PREFER; |
| } else { |
| sched_info("%s: setting prefer to %s for %pJ", |
| __func__, job_desc->prefer, |
| job_ptr); |
| /* New value accepted: release the saved old values */ |
| xfree(old_prefer); |
| FREE_NULL_LIST(old_list); |
| detail_ptr->features_use = detail_ptr->prefer; |
| detail_ptr->feature_list_use = |
| detail_ptr->prefer_list; |
| } |
| } else { |
| /* Empty string: drop the prefer setting entirely */ |
| sched_info("%s: cleared prefer for %pJ", __func__, |
| job_ptr); |
| xfree(detail_ptr->prefer); |
| FREE_NULL_LIST(detail_ptr->prefer_list); |
| detail_ptr->features_use = NULL; |
| detail_ptr->feature_list_use = NULL; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->cluster_features && |
| (error_code = fed_mgr_update_job_cluster_features( |
| job_ptr, job_desc->cluster_features))) |
| goto fini; |
| |
| if (job_desc->clusters && |
| (error_code = fed_mgr_update_job_clusters(job_ptr, |
| job_desc->clusters))) |
| goto fini; |
| |
| if (gres_list) { |
| char *tmp = NULL; |
| if (job_desc->cpus_per_tres) { |
| xstrfmtcat(tmp, "cpus_per_tres:%s ", |
| job_desc->cpus_per_tres); |
| xfree(job_ptr->cpus_per_tres); |
| job_ptr->cpus_per_tres = job_desc->cpus_per_tres; |
| job_desc->cpus_per_tres = NULL; |
| } |
| if (job_desc->tres_per_job) { |
| xstrfmtcat(tmp, "tres_per_job:%s ", |
| job_desc->tres_per_job); |
| xfree(job_ptr->tres_per_job); |
| job_ptr->tres_per_job = job_desc->tres_per_job; |
| job_desc->tres_per_job = NULL; |
| } |
| if (job_desc->tres_per_node) { |
| xstrfmtcat(tmp, "tres_per_node:%s ", |
| job_desc->tres_per_node); |
| xfree(job_ptr->tres_per_node); |
| job_ptr->tres_per_node = job_desc->tres_per_node; |
| job_desc->tres_per_node = NULL; |
| } |
| if (job_desc->tres_per_socket) { |
| xstrfmtcat(tmp, "tres_per_socket:%s ", |
| job_desc->tres_per_socket); |
| xfree(job_ptr->tres_per_socket); |
| job_ptr->tres_per_socket = job_desc->tres_per_socket; |
| job_desc->tres_per_socket = NULL; |
| } |
| if (job_desc->tres_per_task) { |
| xstrfmtcat(tmp, "tres_per_task:%s ", |
| job_desc->tres_per_task); |
| xfree(job_ptr->tres_per_task); |
| job_ptr->tres_per_task = job_desc->tres_per_task; |
| job_desc->tres_per_task = NULL; |
| } |
| if (job_desc->mem_per_tres) { |
| xstrfmtcat(tmp, "mem_per_tres:%s ", |
| job_desc->mem_per_tres); |
| xfree(job_ptr->mem_per_tres); |
| job_ptr->mem_per_tres = job_desc->mem_per_tres; |
| job_desc->mem_per_tres = NULL; |
| } |
| if (tmp) { |
| sched_info("%s: setting %sfor %pJ", |
| __func__, tmp, job_ptr); |
| xfree(tmp); |
| } |
| FREE_NULL_LIST(job_ptr->gres_list_req); |
| job_ptr->gres_list_req = gres_list; |
| |
| gres_list = NULL; |
| } |
| |
| if (job_desc->name) { |
| if (IS_JOB_FINISHED(job_ptr)) { |
| error_code = ESLURM_JOB_FINISHED; |
| goto fini; |
| } else if (!xstrcmp(job_desc->name, job_ptr->name)) { |
| sched_debug("%s: new name identical to old name %pJ", |
| __func__, job_ptr); |
| } else { |
| xfree(job_ptr->name); |
| job_ptr->name = xstrdup(job_desc->name); |
| |
| sched_info("%s: setting name to %s for %pJ", |
| __func__, job_ptr->name, job_ptr); |
| update_accounting = true; |
| } |
| } |
| |
| if (job_desc->work_dir && detail_ptr && |
| !xstrcmp(job_desc->work_dir, detail_ptr->work_dir)) { |
| sched_debug("%s: new work_dir identical to old work_dir %s", |
| __func__, job_desc->work_dir); |
| } else if (job_desc->work_dir) { |
| if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } else if (detail_ptr) { |
| xfree(detail_ptr->work_dir); |
| detail_ptr->work_dir = xstrdup(job_desc->work_dir); |
| sched_info("%s: setting work_dir to %s for %pJ", |
| __func__, detail_ptr->work_dir, job_ptr); |
| update_accounting = true; |
| } |
| } |
| |
| if (job_desc->std_err && detail_ptr && |
| !xstrcmp(job_desc->std_err, detail_ptr->std_err)) { |
| sched_debug("%s: new std_err identical to old std_err %s", |
| __func__, job_desc->std_err); |
| } else if (job_desc->std_err) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (detail_ptr && job_desc->std_err[0] == '\0') |
| xfree(detail_ptr->std_err); |
| else if (detail_ptr) { |
| xfree(detail_ptr->std_err); |
| detail_ptr->std_err = xstrdup(job_desc->std_err); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->std_in && detail_ptr && |
| !xstrcmp(job_desc->std_in, detail_ptr->std_in)) { |
| sched_debug("%s: new std_in identical to old std_in %s", |
| __func__, job_desc->std_in); |
| } else if (job_desc->std_in) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (detail_ptr && job_desc->std_in[0] == '\0') |
| xfree(detail_ptr->std_in); |
| else if (detail_ptr) { |
| xfree(detail_ptr->std_in); |
| detail_ptr->std_in = xstrdup(job_desc->std_in); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->std_out && detail_ptr && |
| !xstrcmp(job_desc->std_out, detail_ptr->std_out)) { |
| sched_debug("%s: new std_out identical to old std_out %s", |
| __func__, job_desc->std_out); |
| } else if (job_desc->std_out) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (detail_ptr && job_desc->std_out[0] == '\0') |
| xfree(detail_ptr->std_out); |
| else if (detail_ptr) { |
| xfree(detail_ptr->std_out); |
| detail_ptr->std_out = xstrdup(job_desc->std_out); |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->wckey |
| && !xstrcmp(job_desc->wckey, job_ptr->wckey)) { |
| sched_debug("%s: new wckey identical to old wckey %pJ", |
| __func__, job_ptr); |
| } else if (job_desc->wckey) { |
| if (!IS_JOB_PENDING(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else { |
| int rc = update_job_wckey((char *) __func__, |
| job_ptr, job_desc->wckey); |
| if (rc != SLURM_SUCCESS) |
| error_code = rc; |
| else |
| update_accounting = true; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if ((job_desc->min_nodes != NO_VAL) && |
| (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { |
| uint32_t new_min_task_cnt; |
| /* |
| * Use req_nodes to change the nodes associated with a running job |
| * for lack of other field in the job request to use |
| */ |
| if ((job_desc->min_nodes == 0) && (job_ptr->node_cnt > 0) && |
| job_ptr->details && job_ptr->details->expanding_jobid) { |
| job_record_t *expand_job_ptr; |
| bitstr_t *orig_job_node_bitmap, *orig_jobx_node_bitmap; |
| |
| expand_job_ptr = find_job_record(job_ptr->details-> |
| expanding_jobid); |
| if (expand_job_ptr == NULL) { |
| info("%s: Invalid node count (%u) for %pJ update, JobId=%u to expand not found", |
| __func__, job_desc->min_nodes, job_ptr, |
| job_ptr->details->expanding_jobid); |
| error_code = ESLURM_INVALID_JOB_ID; |
| goto fini; |
| } |
| if (IS_JOB_SUSPENDED(job_ptr) || |
| IS_JOB_SUSPENDED(expand_job_ptr)) { |
| info("%s: Can not expand %pJ from %pJ, job is suspended", |
| __func__, expand_job_ptr, job_ptr); |
| error_code = ESLURM_JOB_SUSPENDED; |
| goto fini; |
| } |
| if ((job_ptr->step_list != NULL) && |
| (list_count(job_ptr->step_list) != 0)) { |
| info("%s: Attempt to merge %pJ with active steps into %pJ", |
| __func__, job_ptr, expand_job_ptr); |
| error_code = ESLURMD_STEP_EXISTS; |
| goto fini; |
| } |
| if (!_valid_license_job_expansion(job_ptr, |
| expand_job_ptr)) { |
| info("%s: Cannot merge %pJ with %pJ - cannot mix AND and OR licenses (%s vs %s)", |
| __func__, job_ptr, expand_job_ptr, |
| job_ptr->licenses, |
| expand_job_ptr->licenses); |
| error_code = ESLURM_INVALID_LICENSES; |
| goto fini; |
| } |
| |
| sched_info("%s: killing %pJ and moving all resources to %pJ", |
| __func__, job_ptr, expand_job_ptr); |
| job_pre_resize_acctg(job_ptr); |
| job_pre_resize_acctg(expand_job_ptr); |
| _send_job_kill(job_ptr); |
| |
| xassert(job_ptr->job_resrcs); |
| xassert(job_ptr->job_resrcs->node_bitmap); |
| xassert(expand_job_ptr->job_resrcs->node_bitmap); |
| orig_job_node_bitmap = bit_copy(job_ptr->node_bitmap); |
| orig_jobx_node_bitmap = bit_copy(expand_job_ptr-> |
| job_resrcs-> |
| node_bitmap); |
| error_code = select_g_job_expand(job_ptr, |
| expand_job_ptr); |
| if (error_code == SLURM_SUCCESS) { |
| _merge_job_licenses(job_ptr, expand_job_ptr); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| job_ptr->node_bitmap = orig_job_node_bitmap; |
| orig_job_node_bitmap = NULL; |
| deallocate_nodes(job_ptr, false, false, false); |
| bit_clear_all(job_ptr->node_bitmap); |
| job_state_set(job_ptr, (JOB_COMPLETE | |
| (job_ptr->job_state & |
| JOB_STATE_FLAGS))); |
| _realloc_nodes(expand_job_ptr, |
| orig_jobx_node_bitmap); |
| rebuild_step_bitmaps(expand_job_ptr, |
| orig_jobx_node_bitmap); |
| (void) gs_job_fini(job_ptr); |
| (void) gs_job_start(expand_job_ptr); |
| } |
| FREE_NULL_BITMAP(orig_job_node_bitmap); |
| FREE_NULL_BITMAP(orig_jobx_node_bitmap); |
| job_post_resize_acctg(job_ptr); |
| job_post_resize_acctg(expand_job_ptr); |
| /* |
| * Since job_post_resize_acctg will restart things, |
| * don't do it again. |
| */ |
| update_accounting = false; |
| if (error_code) |
| goto fini; |
| } else if ((job_desc->min_nodes == 0) || |
| (job_desc->min_nodes > job_ptr->node_cnt) || |
| job_ptr->details->expanding_jobid) { |
| sched_info("%s: Invalid node count (%u) for %pJ update", |
| __func__, job_desc->min_nodes, job_ptr); |
| error_code = ESLURM_INVALID_NODE_COUNT; |
| goto fini; |
| } else if (job_desc->min_nodes == job_ptr->node_cnt) { |
| debug2("%s: No change in node count update for %pJ", |
| __func__, job_ptr); |
| } else if (!permit_job_shrink()) { |
| error("%s: request to shrink %pJ denied by configuration", |
| __func__, job_ptr); |
| error_code = ESLURM_NOT_SUPPORTED; |
| goto fini; |
| } else { |
| int total = 0; |
| node_record_t *node_ptr; |
| bitstr_t *rem_nodes, *tmp_nodes; |
| sched_info("%s: set node count to %u for %pJ", __func__, |
| job_desc->min_nodes, job_ptr); |
| job_pre_resize_acctg(job_ptr); |
| |
| /* |
| * Don't remove the batch host from the job. The batch |
| * host isn't guaranteed to be the first bit set in |
| * job_ptr->node_bitmap because the batch host can be |
| * selected with the --batch and --constraint sbatch |
| * flags. |
| */ |
| tmp_nodes = bit_copy(job_ptr->node_bitmap); |
| if (job_ptr->batch_host) { |
| bitstr_t *batch_host_bitmap; |
| if (node_name2bitmap(job_ptr->batch_host, false, |
| &batch_host_bitmap, NULL)) |
| error("%s: Invalid batch host %s for %pJ; this should never happen", |
| __func__, job_ptr->batch_host, |
| job_ptr); |
| else { |
| bit_and_not(tmp_nodes, |
| batch_host_bitmap); |
| FREE_NULL_BITMAP(batch_host_bitmap); |
| /* |
| * Set total to 1 since we're |
| * guaranteeing that we won't remove the |
| * batch host. |
| */ |
| total = 1; |
| } |
| } |
| |
| rem_nodes = bit_alloc(bit_size(tmp_nodes)); |
| for (int i = 0; next_node_bitmap(tmp_nodes, &i); i++) { |
| if (++total <= job_desc->min_nodes) |
| continue; |
| bit_set(rem_nodes, i); |
| } |
| abort_job_on_nodes(job_ptr, rem_nodes); |
| orig_job_node_bitmap = |
| bit_copy(job_ptr->job_resrcs->node_bitmap); |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(rem_nodes, &i)); |
| i++) { |
| kill_step_on_node(job_ptr, node_ptr, false); |
| excise_node_from_job(job_ptr, node_ptr); |
| } |
| /* Resize the core bitmaps of the job's steps */ |
| rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap); |
| |
| FREE_NULL_BITMAP(orig_job_node_bitmap); |
| FREE_NULL_BITMAP(rem_nodes); |
| FREE_NULL_BITMAP(tmp_nodes); |
| (void) gs_job_start(job_ptr); |
| job_post_resize_acctg(job_ptr); |
| sched_info("%s: set nodes to %s for %pJ", |
| __func__, job_ptr->nodes, job_ptr); |
| /* |
| * Since job_post_resize_acctg() will restart |
| * things don't do it again. |
| */ |
| update_accounting = false; |
| } |
| gres_stepmgr_job_build_details( |
| job_ptr->gres_list_alloc, |
| job_ptr->nodes, |
| &job_ptr->gres_detail_cnt, |
| &job_ptr->gres_detail_str, |
| &job_ptr->gres_used); |
| |
| /* |
| * Ensure that the num_tasks is less than |
| * the number of cpus now that tasks can be changed |
| * for a running job. |
| */ |
| new_min_task_cnt = job_ptr->cpu_cnt / detail_ptr->cpus_per_task; |
| if (detail_ptr->num_tasks > new_min_task_cnt) |
| detail_ptr->num_tasks = new_min_task_cnt; |
| |
| tres_req_cnt_set = false; |
| } |
| |
| if (job_desc->ntasks_per_node != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (privileged) { |
| detail_ptr->ntasks_per_node = |
| job_desc->ntasks_per_node; |
| if (detail_ptr->pn_min_cpus < |
| detail_ptr->ntasks_per_node) { |
| detail_ptr->pn_min_cpus = |
| detail_ptr->orig_pn_min_cpus = |
| job_desc->ntasks_per_node; |
| } |
| sched_info("%s: setting ntasks_per_node to %u for %pJ", |
| __func__, job_desc->ntasks_per_node, job_ptr); |
| } else { |
| sched_error("%s: Not super user: ignore ntasks_per_node change for job %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->ntasks_per_socket != NO_VAL16) { |
| if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) || |
| (detail_ptr->mc_ptr == NULL)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| } else if (privileged) { |
| detail_ptr->mc_ptr->ntasks_per_socket = |
| job_desc->ntasks_per_socket; |
| sched_info("%s: setting ntasks_per_socket to %u for %pJ", |
| __func__, job_desc->ntasks_per_socket, |
| job_ptr); |
| } else { |
| sched_error("%s: Not super user: ignore ntasks_per_socket change for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->dependency) { |
| /* Can't update dependency of revoked job */ |
| if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL) || |
| IS_JOB_REVOKED(job_ptr)) |
| error_code = ESLURM_JOB_NOT_PENDING; |
| else if (!fed_mgr_is_origin_job(job_ptr)) { |
| /* |
| * If the job became independent because of a dependency |
| * update, that job gets requeued on siblings and then |
| * the dependency update gets sent to siblings. So we |
| * silently ignore this update on the sibling. |
| */ |
| } else { |
| int rc; |
| rc = update_job_dependency(job_ptr, |
| job_desc->dependency); |
| if (rc != SLURM_SUCCESS) |
| error_code = rc; |
| /* |
| * Because dependencies updated and we don't know where |
| * they used to be, send dependencies to all siblings |
| * so the siblings can update their dependency list. |
| */ |
| else { |
| rc = fed_mgr_submit_remote_dependencies(job_ptr, |
| true, |
| false); |
| if (rc) { |
| error("%s: %pJ Failed to send remote dependencies to some or all siblings.", |
| __func__, job_ptr); |
| error_code = rc; |
| } |
| /* |
| * Even if we fail to send remote dependencies, |
| * we already succeeded in updating the job's |
| * dependency locally, so we still need to |
| * do these things. |
| */ |
| xfree(job_ptr->details->orig_dependency); |
| job_ptr->details->orig_dependency = |
| xstrdup(job_ptr->details->dependency); |
| sched_info("%s: setting dependency to %s for %pJ", |
| __func__, |
| job_ptr->details->dependency, |
| job_ptr); |
| /* |
| * If the job isn't independent, remove pending |
| * remote sibling jobs |
| */ |
| if (!job_independent(job_ptr)) |
| fed_mgr_job_revoke_sibs(job_ptr); |
| } |
| } |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| if (job_desc->begin_time) { |
| if (IS_JOB_PENDING(job_ptr) && detail_ptr) { |
| char time_str[256]; |
| /* |
| * Make sure this time is current, it does no good for |
| * accounting to say this job could have started before |
| * now |
| */ |
| if (job_desc->begin_time < now) |
| job_desc->begin_time = now; |
| |
| if (detail_ptr->begin_time != job_desc->begin_time) { |
| detail_ptr->begin_time = job_desc->begin_time; |
| update_accounting = true; |
| slurm_make_time_str(&detail_ptr->begin_time, |
| time_str, sizeof(time_str)); |
| sched_info("%s: setting begin to %s for %pJ", |
| __func__, time_str, job_ptr); |
| acct_policy_remove_accrue_time(job_ptr, false); |
| } else |
| sched_debug("%s: new begin time identical to old begin time %pJ", |
| __func__, job_ptr); |
| } else { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } |
| } |
| |
| if (valid_licenses) { |
| if (IS_JOB_PENDING(job_ptr)) { |
| FREE_NULL_LIST(job_ptr->license_list); |
| job_ptr->license_list = license_list; |
| license_list = NULL; |
| sched_info("%s: changing licenses from '%s' to '%s' for pending %pJ", |
| __func__, job_ptr->licenses, |
| job_desc->licenses_tot, job_ptr); |
| xfree(job_ptr->licenses); |
| job_ptr->licenses = xstrdup(job_desc->licenses_tot); |
| if (job_desc->bitflags & RESET_LIC_JOB) |
| xfree(job_ptr->lic_req); |
| else if (job_desc->licenses) { |
| xfree(job_ptr->lic_req); |
| job_ptr->lic_req = xstrdup(job_desc->licenses); |
| } |
| } else if (IS_JOB_RUNNING(job_ptr)) { |
| /* |
| * Operators can modify license counts on running jobs, |
| * regular users can only completely remove license |
| * counts on running jobs. |
| */ |
| if (!privileged && license_list) { |
| sched_error("%s: Not operator user: ignore licenses change for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| goto fini; |
| } |
| |
| /* |
| * NOTE: This can result in oversubscription of |
| * licenses |
| */ |
| license_job_return(job_ptr); |
| FREE_NULL_LIST(job_ptr->license_list); |
| job_ptr->license_list = license_list; |
| license_list = NULL; |
| sched_info("%s: changing licenses from '%s' to '%s' for running %pJ", |
| __func__, job_ptr->licenses, |
| job_desc->licenses, job_ptr); |
| xfree(job_ptr->licenses); |
| job_ptr->licenses = xstrdup(job_desc->licenses); |
| license_job_get(job_ptr, false); |
| } else { |
| /* |
| * licenses are valid, but job state or user not |
| * allowed to make changes |
| */ |
| sched_info("%s: could not change licenses for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING; |
| FREE_NULL_LIST(license_list); |
| } |
| |
| update_accounting = true; |
| } |
| if (error_code != SLURM_SUCCESS) |
| goto fini; |
| |
| fail_reason = job_limits_check(&job_ptr, false); |
| if (fail_reason != WAIT_NO_REASON) { |
| if (fail_reason == WAIT_QOS_THRES) |
| error_code = ESLURM_QOS_THRES; |
| else if ((fail_reason == WAIT_PART_TIME_LIMIT) || |
| (fail_reason == WAIT_PART_NODE_LIMIT) || |
| (fail_reason == WAIT_PART_DOWN) || |
| (fail_reason == WAIT_HELD)) |
| error_code = SLURM_SUCCESS; |
| else |
| error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| |
| if (error_code != SLURM_SUCCESS) { |
| if ((job_ptr->state_reason != WAIT_HELD) && |
| (job_ptr->state_reason != WAIT_HELD_USER) && |
| (job_ptr->state_reason != WAIT_RESV_DELETED)) { |
| job_ptr->state_reason = fail_reason; |
| xfree(job_ptr->state_desc); |
| } |
| goto fini; |
| } |
| } else if ((job_ptr->state_reason != WAIT_HELD) |
| && (job_ptr->state_reason != WAIT_HELD_USER) |
| && (job_ptr->state_reason != WAIT_RESV_DELETED) |
| /* |
| * A job update can come while the prolog is running. |
| * Don't change state_reason if the prolog is running. |
| * _is_prolog_finished() relies on state_reason==WAIT_PROLOG |
| * to know if the prolog is running. If we change it here, |
| * then slurmctld will think that the prolog isn't running |
| * anymore and _slurm_rpc_job_ready will tell srun that the |
| * prolog is done even if it isn't. Then srun can launch a |
| * job step before the prolog is done, which breaks the |
| * behavior of PrologFlags=alloc and means that the job step |
| * could launch before the extern step sets up x11. |
| */ |
| && (job_ptr->state_reason != WAIT_PROLOG) |
| && (job_ptr->state_reason != WAIT_MAX_REQUEUE)) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| |
| if (job_desc->reboot != NO_VAL16) { |
| if (!validate_super_user(uid)) { |
| error("%s: Attempt to change reboot for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| } else if (!IS_JOB_PENDING(job_ptr)) { |
| error_code = ESLURM_JOB_NOT_PENDING; |
| goto fini; |
| } else { |
| sched_info("%s: setting reboot to %u for %pJ", |
| __func__, job_desc->reboot, job_ptr); |
| if (job_desc->reboot == 0) |
| job_ptr->reboot = 0; |
| else |
| job_ptr->reboot = MAX(1, job_desc->reboot); |
| } |
| } |
| |
| if (job_desc->network && !xstrcmp(job_desc->network, |
| job_ptr->network)) { |
| sched_debug("%s: new network identical to old network %s", |
| __func__, job_ptr->network); |
| } else if (job_desc->network) { |
| xfree(job_ptr->network); |
| if (!strlen(job_desc->network) |
| || !xstrcmp(job_desc->network, "none")) { |
| sched_info("%s: clearing Network option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->network = xstrdup(job_desc->network); |
| sched_info("%s: setting Network to %s for %pJ", |
| __func__, job_ptr->network, job_ptr); |
| } |
| } |
| |
| if (job_desc->fed_siblings_viable) { |
| if (!job_ptr->fed_details) { |
| error_code = ESLURM_JOB_NOT_FEDERATED; |
| goto fini; |
| } |
| |
| info("%s: setting fed_siblings from %"PRIu64" to %"PRIu64" for %pJ", |
| __func__, job_ptr->fed_details->siblings_viable, |
| job_desc->fed_siblings_viable, job_ptr); |
| |
| job_ptr->fed_details->siblings_viable = |
| job_desc->fed_siblings_viable; |
| update_job_fed_details(job_ptr); |
| } |
| |
| if (job_desc->cpus_per_tres) { |
| if (!assoc_mgr_valid_tres_cnt(job_desc->cpus_per_tres, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->cpus_per_tres); |
| if (!strlen(job_desc->cpus_per_tres)) { |
| sched_info("%s: clearing CpusPerTres option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->cpus_per_tres = |
| xstrdup(job_desc->cpus_per_tres); |
| sched_info("%s: setting CpusPerTres to %s for %pJ", |
| __func__, job_ptr->cpus_per_tres, job_ptr); |
| } |
| } |
| |
| if (job_desc->mem_per_tres) { |
| if (!assoc_mgr_valid_tres_cnt(job_desc->mem_per_tres, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->mem_per_tres); |
| if (!strlen(job_desc->mem_per_tres)) { |
| sched_info("%s: clearing MemPerTres option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->mem_per_tres = |
| xstrdup(job_desc->mem_per_tres); |
| sched_info("%s: setting MemPerTres to %s for %pJ", |
| __func__, job_ptr->mem_per_tres, job_ptr); |
| } |
| } |
| |
| if (job_desc->tres_bind) { |
| if (tres_bind_verify_cmdline(job_desc->tres_bind)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->tres_bind); |
| if (!strlen(job_desc->tres_bind)) { |
| sched_info("%s: clearing TresBind option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->tres_bind = xstrdup(job_desc->tres_bind); |
| sched_info("%s: setting TresBind to %s for %pJ", |
| __func__, job_ptr->tres_bind, job_ptr); |
| } |
| } |
| |
| if (job_desc->tres_freq) { |
| if (tres_freq_verify_cmdline(job_desc->tres_freq)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->tres_freq); |
| if (!strlen(job_desc->tres_freq)) { |
| sched_info("%s: clearing TresFreq option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->tres_freq = xstrdup(job_desc->tres_freq); |
| sched_info("%s: setting TresFreq to %s for %pJ", |
| __func__, job_ptr->tres_freq, job_ptr); |
| } |
| } |
| |
| if (job_desc->tres_per_job) { |
| if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_job, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->tres_per_job); |
| if (!strlen(job_desc->tres_per_job)) { |
| sched_info("%s: clearing TresPerJob option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->tres_per_job = |
| xstrdup(job_desc->tres_per_job); |
| sched_info("%s: setting TresPerJob to %s for %pJ", |
| __func__, job_ptr->tres_per_job, job_ptr); |
| } |
| } |
| if (job_desc->tres_per_node) { |
| if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_node, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->tres_per_node); |
| if (!strlen(job_desc->tres_per_node)) { |
| sched_info("%s: clearing TresPerNode option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->tres_per_node = |
| xstrdup(job_desc->tres_per_node); |
| sched_info("%s: setting TresPerNode to %s for %pJ", |
| __func__, job_ptr->tres_per_node, job_ptr); |
| } |
| } |
| |
| if (job_desc->tres_per_socket) { |
| if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_socket, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->tres_per_socket); |
| if (!strlen(job_desc->tres_per_socket)) { |
| sched_info("%s: clearing TresPerSocket option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->tres_per_socket = |
| xstrdup(job_desc->tres_per_socket); |
| sched_info("%s: setting TresPerSocket to %s for %pJ", |
| __func__, job_ptr->tres_per_socket, job_ptr); |
| } |
| } |
| |
| if (job_desc->tres_per_task) { |
| if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_task, 0)) { |
| error_code = ESLURM_INVALID_TRES; |
| goto fini; |
| } |
| xfree(job_ptr->tres_per_task); |
| if (!strlen(job_desc->tres_per_task)) { |
| sched_info("%s: clearing TresPerTask option for %pJ", |
| __func__, job_ptr); |
| } else { |
| job_ptr->tres_per_task = |
| xstrdup(job_desc->tres_per_task); |
| sched_info("%s: setting TresPerTask to %s for %pJ", |
| __func__, job_ptr->tres_per_task, job_ptr); |
| } |
| } |
| |
| if (job_desc->mail_type != NO_VAL16) { |
| job_ptr->mail_type = job_desc->mail_type; |
| sched_info("%s: setting mail_type to %u for %pJ", |
| __func__, job_ptr->mail_type, job_ptr); |
| } |
| |
| if (job_desc->mail_user) { |
| xfree(job_ptr->mail_user); |
| job_ptr->mail_user = _get_mail_user(job_desc->mail_user, |
| job_ptr); |
| sched_info("%s: setting mail_user to %s for %pJ", |
| __func__, job_ptr->mail_user, job_ptr); |
| } |
| |
| /* |
| * The job submit plugin sets site_factor to NO_VAL before calling |
| * the plugin to prevent the user from specifying it. |
| */ |
| if (user_site_factor != NO_VAL) { |
| if (!privileged) { |
| error("%s: Attempt to change SiteFactor for %pJ", |
| __func__, job_ptr); |
| error_code = ESLURM_ACCESS_DENIED; |
| job_desc->site_factor = NO_VAL; |
| } else |
| job_desc->site_factor = user_site_factor; |
| } |
| if (job_desc->site_factor != NO_VAL) { |
| sched_info("%s: setting AdinPrioFactor to %u for %pJ", |
| __func__, job_desc->site_factor, job_ptr); |
| job_ptr->site_factor = job_desc->site_factor; |
| } |
| |
| fini: |
| FREE_NULL_BITMAP(new_req_bitmap); |
| FREE_NULL_LIST(part_ptr_list); |
| |
| if ((error_code == SLURM_SUCCESS) && tres_req_cnt_set) { |
| for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) { |
| if (tres_req_cnt[tres_pos] == |
| job_ptr->tres_req_cnt[tres_pos]) |
| continue; |
| |
| job_ptr->tres_req_cnt[tres_pos] = |
| tres_req_cnt[tres_pos]; |
| tres_changed = true; |
| } |
| if (tres_changed) { |
| job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] = |
| assoc_mgr_tres_weighted( |
| job_ptr->tres_req_cnt, |
| job_ptr->part_ptr->billing_weights, |
| slurm_conf.priority_flags, false); |
| set_job_tres_req_str(job_ptr, false); |
| update_accounting = true; |
| job_ptr->node_cnt_wag = 0; |
| } |
| } |
| |
| /* This was a local variable, so set it back to NULL */ |
| job_desc->tres_req_cnt = NULL; |
| |
| if (!list_count(job_ptr->gres_list_req)) |
| FREE_NULL_LIST(job_ptr->gres_list_req); |
| |
| FREE_NULL_LIST(gres_list); |
| FREE_NULL_LIST(license_list); |
| if (update_accounting) { |
| info("%s: updating accounting", __func__); |
| /* Update job record in accounting to reflect changes */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } |
| |
| /* |
| * If job isn't held recalculate the priority when not using |
| * priority/basic. Since many factors of an update may affect priority |
| * considerations. Do this whether or not the update was successful or |
| * not. |
| */ |
| if ((job_ptr->priority != 0) && |
| xstrcmp(slurm_conf.priority_type, "priority/basic")) |
| set_job_prio(job_ptr); |
| |
| if ((error_code == SLURM_SUCCESS) && |
| fed_mgr_fed_rec && |
| job_ptr->fed_details && fed_mgr_is_origin_job(job_ptr)) { |
| /* Send updates to sibling jobs */ |
| /* Add the siblings_active to be updated. They could have been |
| * updated if the job's ClusterFeatures were updated. */ |
| job_desc->fed_siblings_viable = |
| job_ptr->fed_details->siblings_viable; |
| fed_mgr_update_job(job_ptr->job_id, job_desc, |
| job_ptr->fed_details->siblings_active, uid); |
| } |
| |
| return error_code; |
| } |
| |
| static int _foreach_update_hetjob(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| foreach_update_hetjob_t *update_hetjob = arg; |
| |
| if (update_hetjob->het_leader->het_job_id != het_job->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", |
| __func__, update_hetjob->het_leader); |
| return 0; |
| } |
| if (update_hetjob->job_desc->array_inx) { |
| update_hetjob->err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId"); |
| update_hetjob->rc = ESLURM_NOT_SUPPORTED; |
| return -1; |
| } else { |
| update_hetjob->rc = _update_job(het_job, |
| update_hetjob->job_desc, |
| update_hetjob->uid, |
| &update_hetjob->err_msg); |
| } |
| return 0; |
| } |
| |
| /* |
| * update_job - update a job's parameters per the supplied specifications |
| * IN msg - RPC to update job, including change specification |
| * IN uid - uid of user issuing RPC |
| * IN send_msg - whether to send msg back or not |
| * RET returns an error code from slurm_errno.h |
| * global: job_list - global list of job entries |
| * last_job_update - time of last job table update |
| */ |
| extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg) |
| { |
| job_desc_msg_t *job_desc = msg->data; |
| job_record_t *job_ptr; |
| char *hostname = auth_g_get_host(msg); |
| char *err_msg = NULL; |
| int rc; |
| |
| xfree(job_desc->job_id_str); |
| xstrfmtcat(job_desc->job_id_str, "%u", job_desc->job_id); |
| |
| if (hostname) { |
| xfree(job_desc->alloc_node); |
| job_desc->alloc_node = hostname; |
| } |
| |
| job_ptr = find_job_record(job_desc->job_id); |
| if (job_ptr == NULL) { |
| info("%s: JobId=%u does not exist", |
| __func__, job_desc->job_id); |
| rc = ESLURM_INVALID_JOB_ID; |
| } else { |
| if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) |
| job_desc->array_bitmap = |
| bit_copy(job_ptr->array_recs->task_id_bitmap); |
| |
| rc = _update_job(job_ptr, job_desc, uid, &err_msg); |
| } |
| if (send_msg) |
| slurm_send_rc_err_msg(msg, rc, err_msg); |
| xfree(job_desc->job_id_str); |
| |
| return rc; |
| } |
| |
| /* |
| * IN msg - RPC to update job, including change specification |
| * IN job_desc - a job's specification |
| * IN uid - uid of user issuing RPC |
| * RET returns an error code from slurm_errno.h |
| * global: job_list - global list of job entries |
| * last_job_update - time of last job table update |
| */ |
| extern int update_job_str(slurm_msg_t *msg, uid_t uid) |
| { |
| job_desc_msg_t *job_desc = msg->data; |
| job_record_t *job_ptr, *new_job_ptr; |
| char *hostname = auth_g_get_host(msg); |
| long int long_id; |
| uint32_t job_id = 0, het_job_offset; |
| bitstr_t *array_bitmap = NULL, *tmp_bitmap; |
| int32_t i, i_first, i_last; |
| int len, rc = SLURM_SUCCESS, rc2; |
| char *end_ptr, *tmp = NULL; |
| char *job_id_str; |
| char *err_msg = NULL; |
| resp_array_struct_t *resp_array = NULL; |
| |
| job_id_str = job_desc->job_id_str; |
| |
| if (hostname) { |
| xfree(job_desc->alloc_node); |
| job_desc->alloc_node = hostname; |
| |
| } |
| |
| if (max_array_size == NO_VAL) |
| max_array_size = slurm_conf.max_array_sz; |
| |
| long_id = strtol(job_id_str, &end_ptr, 10); |
| if ((long_id <= 0) || (long_id == LONG_MAX) || |
| ((end_ptr[0] != '\0') && (end_ptr[0] != '_') && |
| (end_ptr[0] != '+'))) { |
| info("%s: invalid JobId=%s", __func__, job_id_str); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| job_id = (uint32_t) long_id; |
| if (end_ptr[0] == '\0') { /* Single job (or full job array) */ |
| job_record_t *job_ptr_done = NULL; |
| job_ptr = find_job_record(job_id); |
| if (job_ptr && job_ptr->het_job_list) { |
| foreach_update_hetjob_t update_hetjob = { |
| .het_leader = job_ptr, |
| .job_desc = job_desc, |
| .rc = SLURM_SUCCESS, |
| .uid = uid, |
| }; |
| (void) list_for_each(job_ptr->het_job_list, |
| _foreach_update_hetjob, |
| &update_hetjob); |
| rc = update_hetjob.rc; |
| err_msg = update_hetjob.err_msg; |
| update_hetjob.err_msg = NULL; |
| goto reply; |
| } |
| if (job_ptr && |
| (((job_ptr->array_task_id == NO_VAL) && |
| (job_ptr->array_recs == NULL)) || |
| ((job_ptr->array_task_id != NO_VAL) && |
| (job_ptr->array_job_id != job_id)))) { |
| /* This is a regular job or single task of job array */ |
| if (job_desc->array_inx) { |
| err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId"); |
| rc = ESLURM_NOT_SUPPORTED; |
| } else |
| rc = _update_job(job_ptr, job_desc, uid, |
| &err_msg); |
| goto reply; |
| } |
| |
| if (job_ptr && job_ptr->array_recs) { |
| /* This is a job array */ |
| job_ptr_done = job_ptr; |
| if (job_ptr->array_recs->task_id_bitmap) |
| job_desc->array_bitmap = bit_copy( |
| job_ptr->array_recs->task_id_bitmap); |
| rc2 = _update_job(job_ptr, job_desc, uid, &err_msg); |
| _resp_array_add(&resp_array, job_ptr, rc2, err_msg); |
| xfree(err_msg); |
| } |
| |
| /* Update all tasks of this job array */ |
| job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; |
| if (!job_ptr && !job_ptr_done) { |
| info("%s: invalid JobId=%u", __func__, job_id); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| while (job_ptr) { |
| if ((job_ptr->array_job_id == job_id) && |
| (job_ptr != job_ptr_done)) { |
| rc2 = _update_job(job_ptr, job_desc, uid, |
| &err_msg); |
| _resp_array_add(&resp_array, job_ptr, rc2, |
| err_msg); |
| xfree(err_msg); |
| } |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| goto reply; |
| } else if (end_ptr[0] == '+') { /* Hetjob element */ |
| long_id = strtol(end_ptr+1, &tmp, 10); |
| if ((long_id < 0) || (long_id == LONG_MAX) || |
| (tmp[0] != '\0')) { |
| info("%s: invalid JobId=%s", __func__, job_id_str); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| het_job_offset = (uint32_t) long_id; |
| job_ptr = find_het_job_record(job_id, het_job_offset); |
| if (!job_ptr) { |
| info("%s: invalid JobId=%u", __func__, job_id); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| if (job_desc->array_inx) { |
| err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId"); |
| rc = ESLURM_NOT_SUPPORTED; |
| } else { |
| rc = _update_job(job_ptr, job_desc, uid, &err_msg); |
| } |
| goto reply; |
| } |
| |
| array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size, |
| &i_last); |
| if (!array_bitmap) { |
| info("%s: invalid JobId=%s", __func__, job_id_str); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr && IS_JOB_PENDING(job_ptr) && |
| job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { |
| /* Ensure bitmap sizes match for AND operations */ |
| len = bit_size(job_ptr->array_recs->task_id_bitmap); |
| i_last++; |
| if (i_last < len) { |
| bit_realloc(array_bitmap, len); |
| } else { |
| bit_realloc(array_bitmap, i_last); |
| bit_realloc(job_ptr->array_recs->task_id_bitmap, |
| i_last); |
| } |
| if (!bit_overlap_any(job_ptr->array_recs->task_id_bitmap, |
| array_bitmap)) { |
| /* Nothing to do with this job record */ |
| } else if (bit_super_set(job_ptr->array_recs->task_id_bitmap, |
| array_bitmap)) { |
| /* Update the record with all pending tasks */ |
| job_desc->array_bitmap = |
| bit_copy(job_ptr->array_recs->task_id_bitmap); |
| if (job_desc->array_inx) { |
| err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId"); |
| rc2 = ESLURM_NOT_SUPPORTED; |
| } else |
| rc2 = _update_job(job_ptr, job_desc, uid, |
| &err_msg); |
| _resp_array_add(&resp_array, job_ptr, rc2, err_msg); |
| xfree(err_msg); |
| bit_and_not(array_bitmap, job_desc->array_bitmap); |
| } else { |
| /* Need to split out tasks to separate job records */ |
| tmp_bitmap = bit_copy(job_ptr->array_recs-> |
| task_id_bitmap); |
| bit_and(tmp_bitmap, array_bitmap); |
| i_first = bit_ffs(tmp_bitmap); |
| if (i_first >= 0) |
| i_last = bit_fls(tmp_bitmap); |
| else |
| i_last = -2; |
| for (i = i_first; i <= i_last; i++) { |
| if (!bit_test(tmp_bitmap, i)) |
| continue; |
| job_ptr->array_task_id = i; |
| new_job_ptr = job_array_split(job_ptr, true); |
| |
| /* |
| * The array_recs structure is moved to the |
| * new job record copy. |
| */ |
| bb_g_job_validate2(job_ptr, NULL); |
| job_ptr = new_job_ptr; |
| } |
| FREE_NULL_BITMAP(tmp_bitmap); |
| } |
| } |
| |
| i_first = bit_ffs(array_bitmap); |
| if (i_first >= 0) |
| i_last = bit_fls(array_bitmap); |
| else |
| i_last = -2; |
| for (i = i_first; i <= i_last; i++) { |
| if (!bit_test(array_bitmap, i)) |
| continue; |
| job_ptr = find_job_array_rec(job_id, i); |
| if (job_ptr == NULL) { |
| info("%s: invalid JobId=%u_%d", __func__, job_id, i); |
| _resp_array_add_id(&resp_array, job_id, i, |
| ESLURM_INVALID_JOB_ID); |
| continue; |
| } |
| |
| if (job_desc->array_inx) { |
| err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId"); |
| rc2 = ESLURM_NOT_SUPPORTED; |
| } else |
| rc2 = _update_job(job_ptr, job_desc, uid, &err_msg); |
| _resp_array_add(&resp_array, job_ptr, rc2, err_msg); |
| xfree(err_msg); |
| } |
| |
| reply: |
| if (msg->tls_conn) { |
| if (resp_array) { |
| job_array_resp_msg_t *resp_array_msg = |
| _resp_array_xlate(resp_array, job_id); |
| (void) send_msg_response(msg, RESPONSE_JOB_ARRAY_ERRORS, |
| resp_array_msg); |
| slurm_free_job_array_resp(resp_array_msg); |
| } else { |
| slurm_send_rc_err_msg(msg, rc, err_msg); |
| } |
| } |
| xfree(err_msg); |
| _resp_array_free(resp_array); |
| |
| FREE_NULL_BITMAP(array_bitmap); |
| |
| return rc; |
| } |
| |
| extern kill_job_msg_t *create_kill_job_msg(job_record_t *job_ptr, |
| uint16_t protocol_version) |
| { |
| slurm_cred_arg_t cred_arg; |
| kill_job_msg_t *msg = xmalloc(sizeof(*msg)); |
| |
| xassert(job_ptr); |
| xassert(job_ptr->details); |
| |
| setup_cred_arg(&cred_arg, job_ptr); |
| |
| cred_arg.step_id.job_id = job_ptr->job_id; |
| cred_arg.step_id.step_het_comp = NO_VAL; |
| cred_arg.step_id.step_id = NO_VAL; |
| |
| msg->cred = slurm_cred_create(&cred_arg, false, protocol_version); |
| |
| msg->derived_ec = job_ptr->derived_ec; |
| msg->details = xstrdup(job_ptr->state_desc); |
| msg->exit_code = job_ptr->exit_code; |
| msg->het_job_id = job_ptr->het_job_id; |
| msg->job_gres_prep = gres_g_prep_build_env(job_ptr->gres_list_alloc, |
| job_ptr->nodes); |
| msg->job_state = job_ptr->job_state; |
| msg->job_uid = job_ptr->user_id; |
| msg->job_gid = job_ptr->group_id; |
| msg->start_time = job_ptr->start_time; |
| msg->step_id.job_id = job_ptr->job_id; |
| msg->step_id.step_het_comp = NO_VAL; |
| msg->step_id.step_id = NO_VAL; |
| msg->spank_job_env = xduparray(job_ptr->spank_job_env_size, |
| job_ptr->spank_job_env); |
| msg->spank_job_env_size = job_ptr->spank_job_env_size; |
| msg->time = time(NULL); |
| msg->work_dir = xstrdup(job_ptr->details->work_dir); |
| |
| return msg; |
| } |
| |
| static void _send_job_kill(job_record_t *job_ptr) |
| { |
| agent_arg_t *agent_args = NULL; |
| node_record_t *node_ptr; |
| kill_job_msg_t *kill_job; |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_TERMINATE_JOB; |
| agent_args->retry = 0; /* re_kill_job() resends as needed */ |
| agent_args->hostlist = hostlist_create(NULL); |
| |
| last_node_update = time(NULL); |
| |
| if (!job_ptr->node_bitmap_cg) |
| build_cg_bitmap(job_ptr); |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i)); i++) { |
| if (agent_args->protocol_version > node_ptr->protocol_version) |
| agent_args->protocol_version = |
| node_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, node_ptr->name); |
| agent_args->node_count++; |
| if (PACK_FANOUT_ADDRS(node_ptr)) |
| agent_args->msg_flags |= SLURM_PACK_ADDRS; |
| } |
| if (agent_args->node_count == 0) { |
| if (job_ptr->details->expanding_jobid == 0) { |
| error("%s: %pJ allocated no nodes to be killed on", |
| __func__, job_ptr); |
| } |
| hostlist_destroy(agent_args->hostlist); |
| xfree(agent_args); |
| return; |
| } |
| |
| kill_job = create_kill_job_msg(job_ptr, agent_args->protocol_version); |
| kill_job->nodes = xstrdup(job_ptr->nodes); |
| |
| agent_args->msg_args = kill_job; |
| set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_args); |
| } |
| |
| /* Record accounting information for a job immediately before changing size */ |
| extern void job_pre_resize_acctg(job_record_t *job_ptr) |
| { |
| job_state_set_flag(job_ptr, JOB_RESIZING); |
| job_ptr->resize_time = time(NULL); |
| /* NOTE: job_completion_logger() calls |
| * acct_policy_remove_job_submit() */ |
| job_completion_logger(job_ptr, false); |
| |
| /* This doesn't happen in job_completion_logger, but gets |
| * added back in with job_post_resize_acctg so remove it here. */ |
| acct_policy_job_fini(job_ptr, false); |
| |
| /* NOTE: The RESIZING FLAG needed to be cleared with |
| job_post_resize_acctg */ |
| } |
| |
| /* Record accounting information for a job immediately after changing size */ |
| extern void job_post_resize_acctg(job_record_t *job_ptr) |
| { |
| /* |
| * NOTE: The RESIZING FLAG needed to be set with job_pre_resize_acctg() |
| * the assert is here to make sure we code it that way. |
| */ |
| xassert(IS_JOB_RESIZING(job_ptr)); |
| acct_policy_add_job_submit(job_ptr, false); |
| /* job_set_alloc_tres() must be called before acct_policy_job_begin() */ |
| job_set_alloc_tres(job_ptr, false); |
| |
| /* |
| * Clear out the old request and replace it with the new alloc. |
| * This probably isn't totally perfect in all situations, but it will |
| * make it tres_req_* correct enough to the user. The tres_req_* isn't |
| * used to make any decisions. It is stored in the database, but only |
| * as a reference for non-pending jobs, which in this case will always |
| * be the case. |
| */ |
| memcpy(job_ptr->tres_req_cnt, job_ptr->tres_alloc_cnt, |
| slurmctld_tres_cnt * sizeof(uint64_t)); |
| xfree(job_ptr->tres_req_str); |
| job_ptr->tres_req_str = xstrdup(job_ptr->tres_alloc_str); |
| xfree(job_ptr->tres_fmt_req_str); |
| job_ptr->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_alloc_str); |
| |
| acct_policy_job_begin(job_ptr, false); |
| resv_replace_update(job_ptr); |
| |
| /* |
| * Get new sluid now that we are basically a new job. |
| */ |
| job_record_set_sluid(job_ptr); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| job_state_unset_flag(job_ptr, JOB_RESIZING); |
| |
| /* |
| * Reset the end_time_exp that was probably set to NO_VAL when |
| * ending the job on the resize. If using the |
| * priority/multifactor plugin if the end_time_exp is NO_VAL |
| * it will not run again for the job. |
| */ |
| job_ptr->end_time_exp = job_ptr->end_time; |
| } |
| |
| /* |
| * validate_jobs_on_node - validate that any jobs that should be on the node |
| * are actually running, if not clean up the job records and/or node |
| * records. |
| * IN slurm_msg - contains the node registration message |
| */ |
| extern void validate_jobs_on_node(slurm_msg_t *slurm_msg) |
| { |
| int i, jobs_on_node; |
| node_record_t *node_ptr; |
| job_record_t *job_ptr; |
| step_record_t *step_ptr; |
| time_t now = time(NULL); |
| |
| slurm_node_registration_status_msg_t *reg_msg = slurm_msg->data; |
| |
| node_ptr = find_node_record(reg_msg->node_name); |
| if (node_ptr == NULL) { |
| error("slurmd registered on unknown node %s", |
| reg_msg->node_name); |
| return; |
| } |
| |
| /* |
| * Set protocol_version now because abort_job_on_node() needs to know |
| * the node's correct version. validate_node_specs() sets it but that's |
| * too late. |
| */ |
| node_ptr->protocol_version = slurm_msg->protocol_version; |
| |
| if (reg_msg->energy) |
| memcpy(node_ptr->energy, reg_msg->energy, |
| sizeof(acct_gather_energy_t)); |
| |
| if (node_ptr->up_time > reg_msg->up_time) { |
| verbose("Node %s rebooted %u secs ago", |
| reg_msg->node_name, reg_msg->up_time); |
| } |
| |
| if (reg_msg->up_time <= now) { |
| node_ptr->up_time = reg_msg->up_time; |
| node_ptr->boot_time = now - reg_msg->up_time; |
| node_ptr->slurmd_start_time = reg_msg->slurmd_start_time; |
| } else { |
| error("Node up_time is invalid: %u>%u", reg_msg->up_time, |
| (uint32_t) now); |
| } |
| |
| if (waiting_for_node_boot(node_ptr) || |
| waiting_for_node_power_down(node_ptr)) |
| return; |
| |
| /* Check that jobs running are really supposed to be there */ |
| for (i = 0; i < reg_msg->job_count; i++) { |
| if ( (reg_msg->step_id[i].job_id >= MIN_NOALLOC_JOBID) && |
| (reg_msg->step_id[i].job_id <= MAX_NOALLOC_JOBID) ) { |
| info("NoAllocate %ps reported on node %s", |
| ®_msg->step_id[i], reg_msg->node_name); |
| continue; |
| } |
| |
| job_ptr = find_job_record(reg_msg->step_id[i].job_id); |
| if (job_ptr == NULL) { |
| error("Orphan %ps reported on node %s", |
| ®_msg->step_id[i], |
| reg_msg->node_name); |
| abort_job_on_node(reg_msg->step_id[i].job_id, |
| job_ptr, node_ptr->name); |
| } |
| |
| else if (IS_JOB_RUNNING(job_ptr) || |
| IS_JOB_SUSPENDED(job_ptr)) { |
| if (bit_test(job_ptr->node_bitmap, node_ptr->index)) { |
| if ((job_ptr->batch_flag) && |
| (node_ptr->index == bit_ffs( |
| job_ptr->node_bitmap))) { |
| /* NOTE: Used for purging defunct |
| * batch jobs */ |
| job_ptr->time_last_active = now; |
| } |
| step_ptr = find_step_record(job_ptr, |
| ®_msg-> |
| step_id[i]); |
| if (step_ptr) |
| step_ptr->time_last_active = now; |
| debug3("Registered %pS on node %s", |
| step_ptr, reg_msg->node_name); |
| } else { |
| /* Typically indicates a job requeue and |
| * restart on another nodes. A node from the |
| * original allocation just responded here. */ |
| error("Registered %pJ %ps on wrong node %s", |
| job_ptr, |
| ®_msg->step_id[i], |
| reg_msg->node_name); |
| info("%s: job nodes %s count %d inx %d", |
| __func__, job_ptr->nodes, |
| job_ptr->node_cnt, node_ptr->index); |
| abort_job_on_node(reg_msg->step_id[i].job_id, |
| job_ptr, |
| node_ptr->name); |
| } |
| } |
| |
| else if (IS_JOB_COMPLETING(job_ptr)) { |
| /* |
| * Re-send kill request as needed, |
| * not necessarily an error |
| */ |
| kill_job_on_node(job_ptr, node_ptr); |
| } |
| |
| |
| else if (IS_JOB_PENDING(job_ptr)) { |
| /* Typically indicates a job requeue and the hung |
| * slurmd that went DOWN is now responding */ |
| error("Registered PENDING %pJ %ps on node %s", |
| job_ptr, |
| ®_msg->step_id[i], |
| reg_msg->node_name); |
| abort_job_on_node(reg_msg->step_id[i].job_id, |
| job_ptr, node_ptr->name); |
| } else if (difftime(now, job_ptr->end_time) < |
| slurm_conf.msg_timeout) { |
| /* Race condition */ |
| debug("Registered newly completed %pJ %ps on %s", |
| job_ptr, |
| ®_msg->step_id[i], |
| node_ptr->name); |
| } |
| |
| else { /* else job is supposed to be done */ |
| error("Registered %pJ %ps in state %s on node %s", |
| job_ptr, |
| ®_msg->step_id[i], |
| job_state_string(job_ptr->job_state), |
| reg_msg->node_name); |
| kill_job_on_node(job_ptr, node_ptr); |
| } |
| } |
| |
| jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; |
| if (jobs_on_node) |
| _purge_missing_jobs(node_ptr->index, now); |
| |
| if (jobs_on_node != reg_msg->job_count) { |
| /* slurmd will not know of a job unless the job has |
| * steps active at registration time, so this is not |
| * an error condition, slurmd is also reporting steps |
| * rather than jobs */ |
| debug3("resetting job_count on node %s from %u to %d", |
| reg_msg->node_name, reg_msg->job_count, jobs_on_node); |
| reg_msg->job_count = jobs_on_node; |
| } |
| } |
| |
| static int _foreach_notify_srun_missing_step(void *x, void *arg) |
| { |
| step_record_t *step_ptr = x; |
| foreach_purge_missing_jobs_t *foreach_purge_missing_jobs = arg; |
| job_record_t *job_ptr = foreach_purge_missing_jobs->job_ptr; |
| time_t node_boot_time = foreach_purge_missing_jobs->node_boot_time; |
| int node_inx = foreach_purge_missing_jobs->node_inx; |
| time_t now = foreach_purge_missing_jobs->now; |
| char *node_name = node_record_table_ptr[node_inx]->name; |
| |
| if ((step_ptr->step_id.step_id == SLURM_EXTERN_CONT) || |
| (step_ptr->step_id.step_id == SLURM_BATCH_SCRIPT) || |
| (step_ptr->state != JOB_RUNNING)) |
| return 0; |
| if (!bit_test(step_ptr->step_node_bitmap, node_inx)) |
| return 0; |
| if (step_ptr->time_last_active >= now) { |
| /* Back up timer in case more than one node |
| * registration happens at this same time. |
| * We don't want this node's registration |
| * to count toward a different node's |
| * registration message. */ |
| step_ptr->time_last_active = now - 1; |
| } else if (step_ptr->host && step_ptr->port) { |
| /* srun may be able to verify step exists on |
| * this node using I/O sockets and kill the |
| * job as needed */ |
| srun_step_missing(step_ptr, node_name); |
| } else if ((step_ptr->start_time < node_boot_time) && |
| !(step_ptr->flags & SSF_NO_KILL)) { |
| /* There is a risk that the job step's tasks completed |
| * on this node before its reboot, but that should be |
| * very rare and there is no srun to work with (POE) */ |
| info("Node %s rebooted, killing missing step %u.%u", |
| node_name, job_ptr->job_id, step_ptr->step_id.step_id); |
| signal_step_tasks_on_node(node_name, step_ptr, SIGKILL, |
| REQUEST_TERMINATE_TASKS); |
| } |
| |
| return 0; |
| } |
| |
| static int _foreach_purge_missing_jobs(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| foreach_purge_missing_jobs_t *foreach_purge_missing_jobs = arg; |
| time_t startup_time = foreach_purge_missing_jobs->batch_startup_time; |
| |
| if ((IS_JOB_CONFIGURING(job_ptr) || |
| (!IS_JOB_RUNNING(job_ptr) && |
| !IS_JOB_SUSPENDED(job_ptr))) || |
| (!bit_test(job_ptr->node_bitmap, |
| foreach_purge_missing_jobs->node_inx))) |
| return 0; |
| |
| if (job_ptr->batch_flag && |
| foreach_purge_missing_jobs->power_save_on && |
| (job_ptr->start_time < |
| foreach_purge_missing_jobs->node_boot_time)) |
| startup_time -= slurm_conf.resume_timeout; |
| |
| if (job_ptr->batch_flag && |
| !job_ptr->het_job_offset && |
| (job_ptr->time_last_active < startup_time) && |
| (job_ptr->start_time < startup_time) && |
| (foreach_purge_missing_jobs->node_ptr == |
| find_node_record(job_ptr->batch_host))) { |
| bool requeue = false; |
| char *requeue_msg = ""; |
| if (job_ptr->details && job_ptr->details->requeue) { |
| requeue = true; |
| requeue_msg = ", Requeuing job"; |
| } |
| info("Batch %pJ missing from batch node %s (not found BatchStartTime after startup)%s", |
| job_ptr, job_ptr->batch_host, requeue_msg); |
| xfree(job_ptr->failed_node); |
| job_ptr->failed_node = xstrdup(job_ptr->batch_host); |
| job_complete(job_ptr->job_id, slurm_conf.slurm_user_id, |
| requeue, true, 1); |
| } else { |
| foreach_purge_missing_jobs->job_ptr = job_ptr; |
| |
| (void) list_for_each(job_ptr->step_list, |
| _foreach_notify_srun_missing_step, |
| foreach_purge_missing_jobs); |
| } |
| return 0; |
| } |
| |
| /* Purge any batch job that should have its script running on node |
| * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds |
| * for startup. |
| * |
| * Purge all job steps that were started before the node was last booted. |
| * |
| * Also notify srun if any job steps should be active on this node |
| * but are not found. */ |
| static void _purge_missing_jobs(int node_inx, time_t now) |
| { |
| static bool power_save_on = false; |
| static time_t sched_update = 0; |
| foreach_purge_missing_jobs_t foreach_purge_missing_jobs = { |
| .node_inx = node_inx, |
| .node_ptr = node_record_table_ptr[node_inx], |
| .now = now, |
| }; |
| |
| if (sched_update != slurm_conf.last_update) { |
| power_save_on = power_save_test(); |
| sched_update = slurm_conf.last_update; |
| } |
| |
| foreach_purge_missing_jobs.power_save_on = power_save_on; |
| |
| if (foreach_purge_missing_jobs.node_ptr->boot_time > |
| (slurm_conf.msg_timeout + 5)) { |
| /* allow for message timeout and other delays */ |
| foreach_purge_missing_jobs.node_boot_time = |
| foreach_purge_missing_jobs.node_ptr->boot_time - |
| (slurm_conf.msg_timeout + 5); |
| } |
| |
| foreach_purge_missing_jobs.batch_startup_time = |
| now - slurm_conf.batch_start_timeout - |
| MIN(DEFAULT_MSG_TIMEOUT, slurm_conf.msg_timeout); |
| |
| (void) list_for_each(job_list, _foreach_purge_missing_jobs, |
| &foreach_purge_missing_jobs); |
| } |
| |
| /* |
| * abort_job_on_node - Kill the specific job_id on a specific node, |
| * the request is not processed immediately, but queued. |
| * This is to prevent a flood of pthreads if slurmctld restarts |
| * without saved state and slurmd daemons register with a |
| * multitude of running jobs. Slurmctld will not recognize |
| * these jobs and use this function to kill them - one |
| * agent request per node as they register. |
| * IN job_id - id of the job to be killed |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. job reported |
| * by slurmd on some node, but job records already purged from |
| * slurmctld) |
| * IN node_name - name of the node on which the job resides |
| */ |
| extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr, |
| char *node_name) |
| { |
| agent_arg_t *agent_info; |
| kill_job_msg_t *kill_req; |
| |
| agent_info = xmalloc(sizeof(agent_arg_t)); |
| agent_info->node_count = 1; |
| agent_info->retry = 0; |
| agent_info->hostlist = hostlist_create(node_name); |
| node_record_t *node_ptr; |
| if ((node_ptr = find_node_record(node_name))) |
| agent_info->protocol_version = node_ptr->protocol_version; |
| if (job_ptr) |
| debug("Aborting %pJ on node %s", job_ptr, node_name); |
| else |
| debug("Aborting JobId=%u on node %s", job_id, node_name); |
| |
| if (job_ptr) { /* NULL if unknown */ |
| kill_req = create_kill_job_msg(job_ptr, |
| agent_info->protocol_version); |
| } else { |
| kill_req = xmalloc(sizeof(*kill_req)); |
| kill_req->step_id.job_id = job_id; |
| kill_req->step_id.step_id = NO_VAL; |
| kill_req->step_id.step_het_comp = NO_VAL; |
| kill_req->time = time(NULL); |
| /* kill_req->start_time = 0; Default value */ |
| } |
| |
| kill_req->nodes = xstrdup(node_name); |
| |
| agent_info->msg_type = REQUEST_ABORT_JOB; |
| agent_info->msg_args = kill_req; |
| |
| set_agent_arg_r_uid(agent_info, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_info); |
| } |
| |
| /* |
| * abort_job_on_nodes - Kill the specific job_on the specific nodes, |
| * the request is not processed immediately, but queued. |
| * This is to prevent a flood of pthreads if slurmctld restarts |
| * without saved state and slurmd daemons register with a |
| * multitude of running jobs. Slurmctld will not recognize |
| * these jobs and use this function to kill them - one |
| * agent request per node as they register. |
| * IN job_ptr - pointer to terminating job |
| * IN node_name - name of the node on which the job resides |
| */ |
| extern void abort_job_on_nodes(job_record_t *job_ptr, |
| bitstr_t *node_bitmap) |
| { |
| bitstr_t *full_node_bitmap, *tmp_node_bitmap; |
| node_record_t *node_ptr; |
| int zero = 0; |
| agent_arg_t *agent_info; |
| kill_job_msg_t *kill_req; |
| uint16_t protocol_version; |
| |
| xassert(node_bitmap); |
| /* Send a separate message for nodes at different protocol_versions */ |
| full_node_bitmap = bit_copy(node_bitmap); |
| while ((node_ptr = next_node_bitmap(full_node_bitmap, &zero))) { |
| protocol_version = node_ptr->protocol_version; |
| tmp_node_bitmap = bit_alloc(bit_size(node_bitmap)); |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(full_node_bitmap, &i)); i++) { |
| if (node_ptr->protocol_version != protocol_version) |
| continue; |
| bit_clear(full_node_bitmap, i); |
| bit_set(tmp_node_bitmap, i); |
| } |
| kill_req = create_kill_job_msg(job_ptr, protocol_version); |
| kill_req->nodes = bitmap2node_name_sortable(tmp_node_bitmap, |
| false); |
| agent_info = xmalloc(sizeof(agent_arg_t)); |
| agent_info->node_count = bit_set_count(tmp_node_bitmap); |
| agent_info->retry = 1; |
| agent_info->hostlist = hostlist_create(kill_req->nodes); |
| debug("Aborting %pJ on nodes %s", job_ptr, kill_req->nodes); |
| agent_info->msg_type = REQUEST_ABORT_JOB; |
| agent_info->msg_args = kill_req; |
| agent_info->protocol_version = protocol_version; |
| set_agent_arg_r_uid(agent_info, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_info); |
| FREE_NULL_BITMAP(tmp_node_bitmap); |
| } |
| FREE_NULL_BITMAP(full_node_bitmap); |
| } |
| |
| /* |
| * kill_job_on_node - Kill the specific job on a specific node. |
| * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) |
| * IN node_ptr - pointer to the node on which the job resides |
| */ |
| extern void kill_job_on_node(job_record_t *job_ptr, |
| node_record_t *node_ptr) |
| { |
| agent_arg_t *agent_info; |
| kill_job_msg_t *kill_req; |
| |
| agent_info = xmalloc(sizeof(agent_arg_t)); |
| agent_info->node_count = 1; |
| agent_info->retry = 0; |
| agent_info->protocol_version = node_ptr->protocol_version; |
| agent_info->hostlist = hostlist_create(node_ptr->name); |
| debug("Killing %pJ on node %s", job_ptr, node_ptr->name); |
| |
| kill_req = create_kill_job_msg(job_ptr, agent_info->protocol_version); |
| kill_req->nodes = xstrdup(node_ptr->name); |
| |
| agent_info->msg_type = REQUEST_TERMINATE_JOB; |
| agent_info->msg_args = kill_req; |
| |
| set_agent_arg_r_uid(agent_info, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_info); |
| } |
| |
| static int _foreach_job_all_finished(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| |
| if (!IS_JOB_FINISHED(het_job)) |
| return -1; |
| return 0; |
| } |
| |
| /* |
| * Return true if this job is complete (including all elements of a hetjob) |
| */ |
| static bool _job_all_finished(job_record_t *job_ptr) |
| { |
| if (!IS_JOB_FINISHED(job_ptr)) |
| return false; |
| |
| if (job_ptr->het_job_list && |
| list_find_first(job_ptr->het_job_list, |
| _foreach_job_all_finished, |
| NULL)) |
| return false; |
| |
| return true; |
| } |
| |
| /* |
| * job_alloc_info_ptr - get details about an existing job allocation |
| * IN uid - job issuing the code |
| * IN job_ptr - pointer to job record |
| * NOTE: See job_alloc_info() if job pointer not known |
| */ |
| extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr) |
| { |
| uint8_t prolog = 0; |
| |
| if ((slurm_conf.private_data & PRIVATE_DATA_JOBS) && |
| (job_ptr->user_id != uid) && !validate_operator(uid) && |
| (((slurm_mcs_get_privatedata() == 0) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, false)) || |
| ((slurm_mcs_get_privatedata() == 1) && |
| (mcs_g_check_mcs_label(uid, job_ptr->mcs_label, false) != 0)))) |
| return ESLURM_ACCESS_DENIED; |
| if (IS_JOB_PENDING(job_ptr)) |
| return ESLURM_JOB_PENDING; |
| if (_job_all_finished(job_ptr)) |
| return ESLURM_ALREADY_DONE; |
| if (job_ptr->details) |
| prolog = job_ptr->details->prolog_running; |
| |
| if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") && |
| (prolog == 0) && job_ptr->node_bitmap && |
| (bit_overlap_any(power_down_node_bitmap, |
| job_ptr->node_bitmap) == 0)) { |
| last_job_update = time(NULL); |
| set_job_alias_list(job_ptr); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * job_alloc_info - get details about an existing job allocation |
| * IN uid - job issuing the code |
| * IN job_id - ID of job for which info is requested |
| * OUT job_pptr - set to pointer to job record |
| * NOTE: See job_alloc_info_ptr() if job pointer is known |
| */ |
| extern int job_alloc_info(uint32_t uid, uint32_t job_id, |
| job_record_t **job_pptr) |
| { |
| job_record_t *job_ptr; |
| |
| job_ptr = find_job_record(job_id); |
| if (job_ptr == NULL) |
| return ESLURM_INVALID_JOB_ID; |
| if (job_pptr) |
| *job_pptr = job_ptr; |
| return job_alloc_info_ptr(uid, job_ptr); |
| } |
| |
| /* |
| * If we can't find the job_id we remove the defunct file. If we do find it we |
| * set HAS_STATE_DIR. |
| */ |
| static void _sync_job_with_batch_dir(uint32_t job_id) |
| { |
| job_record_t *job_ptr = find_job_record(job_id); |
| |
| if (job_ptr) { |
| job_ptr->bit_flags |= HAS_STATE_DIR; |
| |
| if (job_ptr->array_recs) { /* Update all tasks */ |
| uint32_t array_job_id = job_ptr->array_job_id; |
| job_ptr = job_array_hash_j[JOB_HASH_INX(array_job_id)]; |
| while (job_ptr) { |
| if (job_ptr->array_job_id == array_job_id) |
| job_ptr->bit_flags |= HAS_STATE_DIR; |
| job_ptr = job_ptr->job_array_next_j; |
| } |
| } |
| } else { |
| info("Purged files for defunct batch JobId=%u", job_id); |
| delete_job_desc_files(job_id); |
| } |
| } |
| |
| /* |
| * Synchronize the batch job in the system with their files. |
| * All pending batch jobs must have script and environment files |
| * No other jobs should have such files |
| */ |
| int sync_job_files(void) |
| { |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| if (!slurmctld_primary) /* Don't purge files from backup slurmctld */ |
| return SLURM_SUCCESS; |
| |
| list_for_each(job_list, _clear_state_dir_flag, NULL); |
| |
| _validate_job_files(); |
| |
| list_for_each(job_list, _test_state_dir_flag, NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _validate_job_files(void) |
| { |
| DIR *f_dir, *h_dir; |
| struct dirent *dir_ent, *hash_ent; |
| uint32_t job_id; |
| char *endptr; |
| |
| xassert(verify_lock(CONF_LOCK, READ_LOCK)); |
| xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); |
| |
| xassert(slurm_conf.state_save_location); |
| f_dir = opendir(slurm_conf.state_save_location); |
| if (!f_dir) { |
| error("opendir(%s): %m", slurm_conf.state_save_location); |
| return; |
| } |
| |
| while ((dir_ent = readdir(f_dir))) { |
| if (!xstrncmp("hash.#", dir_ent->d_name, 5)) { |
| char *h_path = NULL; |
| xstrfmtcat(h_path, "%s/%s", |
| slurm_conf.state_save_location, |
| dir_ent->d_name); |
| h_dir = opendir(h_path); |
| xfree(h_path); |
| if (!h_dir) |
| continue; |
| while ((hash_ent = readdir(h_dir))) { |
| if (xstrncmp("job.#", hash_ent->d_name, 4)) |
| continue; |
| job_id = strtoul(&hash_ent->d_name[4], |
| &endptr, 10); |
| if ((job_id == 0) || (endptr[0] != '\0')) |
| continue; |
| debug3("Found batch directory for JobId=%u", |
| job_id); |
| _sync_job_with_batch_dir(job_id); |
| } |
| closedir(h_dir); |
| } |
| } |
| |
| closedir(f_dir); |
| } |
| |
| static int _clear_state_dir_flag(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| job_ptr->bit_flags &= ~HAS_STATE_DIR; |
| return 0; |
| } |
| |
| static int _test_state_dir_flag(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| |
| if (job_ptr->bit_flags & HAS_STATE_DIR) { |
| job_ptr->bit_flags &= ~HAS_STATE_DIR; |
| return 0; |
| } |
| |
| if (!job_ptr->batch_flag || !IS_JOB_PENDING(job_ptr) || |
| (job_ptr->het_job_offset > 0)) |
| return 0; /* No files expected */ |
| |
| error("Script for %pJ lost, state set to FAILED", job_ptr); |
| job_state_set(job_ptr, JOB_FAILED); |
| job_ptr->exit_code = 1; |
| job_ptr->state_reason = FAIL_SYSTEM; |
| xfree(job_ptr->state_desc); |
| job_ptr->start_time = job_ptr->end_time = time(NULL); |
| job_completion_logger(job_ptr, false); |
| return 0; |
| } |
| |
| /* Get requested gres but only if mem_per_gres was set for that gres */ |
| static int _get_req_gres(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js_out = arg; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| |
| /* |
| * This assumes that only one gres name has mem_per_gres in the job. |
| * This won't work if two different gres names (for example, "gpu" and |
| * "license") both have mem_per_gres. Right now we only allow |
| * mem_per_gres for GPU so this works. |
| */ |
| if (!gres_js->mem_per_gres) |
| return SLURM_SUCCESS; |
| |
| /* |
| * In theory MAX(mem_per_gres) shouldn't matter because we should only |
| * allow one gres name to have mem_per_gres and it should be the same |
| * for all types (e.g., gpu:k80 vs gpu:tesla) of that same gres (gpu). |
| */ |
| gres_js_out->mem_per_gres = MAX(gres_js_out->mem_per_gres, |
| gres_js->mem_per_gres); |
| |
| gres_js_out->gres_per_job += gres_js->gres_per_job; |
| gres_js_out->gres_per_node += gres_js->gres_per_node; |
| gres_js_out->gres_per_socket += gres_js->gres_per_socket; |
| gres_js_out->gres_per_task += gres_js->gres_per_task; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern uint64_t job_get_tres_mem(struct job_resources *job_res, |
| uint64_t pn_min_memory, uint32_t cpu_cnt, |
| uint32_t node_cnt, part_record_t *part_ptr, |
| list_t *gres_list, bool user_set_mem, |
| uint16_t min_sockets_per_node, |
| uint32_t num_tasks) |
| { |
| uint64_t mem_total = 0; |
| int i; |
| |
| if (job_res) { |
| for (i = 0; i < job_res->nhosts; i++) { |
| mem_total += job_res->memory_allocated[i]; |
| } |
| return mem_total; |
| } |
| |
| if (pn_min_memory == NO_VAL64) |
| return mem_total; |
| |
| if (!user_set_mem && gres_list && running_cons_tres()) { |
| /* mem_per_[cpu|node] not set, check if mem_per_gres was set */ |
| gres_job_state_t gres_js; |
| gres_state_t *gres_state_job; |
| uint32_t gpu_plugin_id; |
| memset(&gres_js, 0, sizeof(gres_js)); |
| list_for_each(gres_list, _get_req_gres, &gres_js); |
| if (gres_js.mem_per_gres) { |
| /* Requested node_cnt == 1 if not given */ |
| if (node_cnt == NO_VAL) |
| node_cnt = 1; |
| |
| /* Estimate requested gres per job */ |
| if (gres_js.gres_per_job) |
| return gres_js.mem_per_gres * |
| gres_js.gres_per_job; |
| if (gres_js.gres_per_node) |
| return gres_js.mem_per_gres * |
| gres_js.gres_per_node * node_cnt; |
| if (gres_js.gres_per_socket) { |
| if (min_sockets_per_node && |
| (min_sockets_per_node != NO_VAL16)) |
| return gres_js.mem_per_gres * |
| gres_js.gres_per_socket * |
| node_cnt * min_sockets_per_node; |
| else |
| return gres_js.mem_per_gres * |
| gres_js.gres_per_socket * |
| node_cnt; |
| } |
| if (gres_js.gres_per_task) { |
| if (num_tasks && (num_tasks != NO_VAL)) |
| return gres_js.mem_per_gres * |
| gres_js.gres_per_task * |
| num_tasks; |
| else |
| return gres_js.mem_per_gres * |
| gres_js.gres_per_task; |
| } |
| /* |
| * mem_per_gres set but no gres requested. |
| * We shouldn't get here. |
| */ |
| return 0; |
| } |
| /* |
| * If no mem_per_gres was explicitly set |
| * Set mem_per_gres with DefMemPerGPU |
| */ |
| gpu_plugin_id = gres_get_gpu_plugin_id(); |
| gres_state_job = list_find_first( |
| gres_list, gres_find_id, &gpu_plugin_id); |
| if (gres_state_job) { |
| gres_job_state_t *gres_js_gpu = |
| gres_state_job->gres_data; |
| mem_total = NO_VAL64; |
| if (part_ptr && part_ptr->job_defaults_list) { |
| mem_total = slurm_get_def_mem_per_gpu( |
| part_ptr->job_defaults_list); |
| } |
| if ((mem_total == NO_VAL64) && |
| slurm_conf.job_defaults_list) { |
| mem_total = slurm_get_def_mem_per_gpu( |
| slurm_conf.job_defaults_list); |
| } |
| if (mem_total != NO_VAL64) { |
| mem_total = mem_total * gres_js_gpu->total_gres; |
| return mem_total; |
| } |
| } |
| } |
| |
| if (pn_min_memory == 0) |
| pn_min_memory = _mem_per_node_part(part_ptr); |
| |
| if (pn_min_memory & MEM_PER_CPU) { |
| if (cpu_cnt != NO_VAL) { |
| mem_total = pn_min_memory & (~MEM_PER_CPU); |
| mem_total *= cpu_cnt; |
| } |
| } else if (node_cnt != NO_VAL) |
| mem_total = pn_min_memory * node_cnt; |
| |
| return mem_total; |
| } |
| |
| /* |
| * job_epilog_complete - Note the completion of the epilog script for a |
| * given job |
| * IN job_id - id of the job for which the epilog was executed |
| * IN node_name - name of the node on which the epilog was executed |
| * IN return_code - return code from epilog script |
| * RET true if job is COMPLETED, otherwise false |
| */ |
| extern bool job_epilog_complete(uint32_t job_id, char *node_name, |
| uint32_t return_code) |
| { |
| job_record_t *job_ptr = find_job_record(job_id); |
| node_record_t *node_ptr; |
| |
| if (job_ptr == NULL) { |
| debug("%s: unable to find JobId=%u for node=%s with return_code=%u.", |
| __func__, job_id, node_name, return_code); |
| return true; |
| } |
| |
| log_flag(TRACE_JOBS, "%s: enter %pJ", __func__, job_ptr); |
| |
| /* |
| * There is a potential race condition this handles. |
| * If slurmctld cold-starts while slurmd keeps running, slurmd could |
| * notify slurmctld of a job epilog completion before getting synced |
| * up with slurmctld state. If a new job arrives and the job_id is |
| * reused, we could try to note the termination of a job that hasn't |
| * really started. Very rare obviously. |
| */ |
| if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) || |
| ((!job_ptr->node_bitmap_cg) && (!IS_JOB_COMPLETING(job_ptr))) || |
| (job_ptr->node_bitmap == NULL)) { |
| uint32_t base_state = NODE_STATE_UNKNOWN; |
| node_ptr = find_node_record(node_name); |
| if (node_ptr) |
| base_state = node_ptr->node_state & NODE_STATE_BASE; |
| if (base_state == NODE_STATE_DOWN) { |
| debug("%s: %pJ complete response from DOWN node %s", |
| __func__, job_ptr, node_name); |
| } else if (job_ptr->restart_cnt) { |
| /* |
| * Duplicate epilog complete can be due to race |
| */ |
| debug("%s: %pJ duplicate epilog complete response", |
| __func__, job_ptr); |
| } else { |
| error("%s: %pJ is non-running slurmctld and slurmd out of sync", |
| __func__, job_ptr); |
| } |
| return false; |
| } |
| |
| if (return_code) { |
| error("%s: %pJ epilog error on %s, draining the node", |
| __func__, job_ptr, node_name); |
| drain_nodes(node_name, "Epilog error", |
| slurm_conf.slurm_user_id); |
| } |
| /* Change job from completing to completed */ |
| node_ptr = find_node_record(node_name); |
| if (node_ptr) |
| make_node_idle(node_ptr, job_ptr); |
| |
| /* nodes_completing is out of date, rebuild when next saved */ |
| xfree(job_ptr->nodes_completing); |
| if (!IS_JOB_COMPLETING(job_ptr)) { /* COMPLETED */ |
| batch_requeue_fini(job_ptr); |
| return true; |
| } else |
| return false; |
| } |
| |
| /* Complete a batch job requeue logic after all steps complete so that |
| * subsequent jobs appear in a separate accounting record. */ |
| void batch_requeue_fini(job_record_t *job_ptr) |
| { |
| if (IS_JOB_COMPLETING(job_ptr) || |
| !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag) |
| return; |
| |
| info("Requeuing %pJ", job_ptr); |
| |
| /* Clear everything so this appears to be a new job and then restart |
| * it in accounting. */ |
| job_ptr->start_time = 0; |
| job_ptr->end_time_exp = job_ptr->end_time = 0; |
| job_ptr->total_cpus = 0; |
| job_ptr->pre_sus_time = 0; |
| job_ptr->preempt_time = 0; |
| job_ptr->suspend_time = 0; |
| job_ptr->tot_sus_time = 0; |
| job_ptr->next_step_id = 0; |
| job_ptr->state_reason_prev_db = 0; |
| |
| job_ptr->node_cnt = 0; |
| job_ptr->total_nodes = 0; |
| xfree(job_ptr->alias_list); |
| xfree(job_ptr->batch_host); |
| free_job_resources(&job_ptr->job_resrcs); |
| FREE_NULL_LIST(job_ptr->license_list); |
| xfree(job_ptr->licenses_allocated); |
| xfree(job_ptr->nodes); |
| xfree(job_ptr->node_addrs); |
| xfree(job_ptr->nodes_completing); |
| xfree(job_ptr->failed_node); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| FREE_NULL_LIST(job_ptr->gres_list_alloc); |
| |
| job_resv_clear_magnetic_flag(job_ptr); |
| |
| if (job_ptr->details) { |
| time_t now = time(NULL); |
| /* The time stamp on the new batch launch credential must be |
| * larger than the time stamp on the revoke request. Also the |
| * I/O must be all cleared out, the named socket purged and |
| * the job credential purged by slurmd. */ |
| if (job_ptr->details->begin_time <= now) { |
| int cred_lifetime = DEFAULT_EXPIRATION_WINDOW; |
| time_t begin_time; |
| cred_lifetime = cred_expiration(); |
| begin_time = now + cred_lifetime + 1; |
| if ((job_ptr->bit_flags & CRON_JOB) && |
| job_ptr->details->crontab_entry) { |
| begin_time = calc_next_cron_start( |
| job_ptr->details->crontab_entry, |
| begin_time); |
| } else if (job_ptr->bit_flags & CRON_JOB) { |
| /* |
| * Skip requeuing this instead of crashing. |
| */ |
| error("Missing cron details for %pJ. This should never happen. Clearing CRON_JOB flag and skipping requeue.", |
| job_ptr); |
| job_ptr->bit_flags &= ~CRON_JOB; |
| } |
| job_ptr->details->begin_time = begin_time; |
| } |
| |
| /* Since this could happen on a launch we need to make sure the |
| * submit isn't the same as the last submit so put now + 1 so |
| * we get different records in the database */ |
| if (now == job_ptr->details->submit_time) |
| now++; |
| job_ptr->details->submit_time = now; |
| |
| /* clear the accrue flag */ |
| job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; |
| job_ptr->details->accrue_time = 0; |
| |
| if ((job_ptr->details->whole_node & WHOLE_NODE_REQUIRED) && |
| job_ptr->gres_list_req) { |
| job_details_t *detail_ptr = job_ptr->details; |
| multi_core_data_t *mc_ptr = detail_ptr->mc_ptr; |
| gres_job_state_validate_t gres_js_val = { |
| .cpus_per_tres = job_ptr->cpus_per_tres, |
| .mem_per_tres = job_ptr->mem_per_tres, |
| .tres_freq = job_ptr->tres_freq, |
| .tres_per_job = job_ptr->tres_per_job, |
| .tres_per_node = job_ptr->tres_per_node, |
| .tres_per_socket = job_ptr->tres_per_socket, |
| .tres_per_task = job_ptr->tres_per_task, |
| |
| .cpus_per_task = |
| &detail_ptr->orig_cpus_per_task, |
| .max_nodes = &detail_ptr->max_nodes, |
| .min_cpus = &detail_ptr->min_cpus, |
| .min_nodes = &detail_ptr->min_nodes, |
| .ntasks_per_node = &detail_ptr->ntasks_per_node, |
| .ntasks_per_socket = &mc_ptr->ntasks_per_socket, |
| .ntasks_per_tres = &detail_ptr->ntasks_per_tres, |
| .num_tasks = &detail_ptr->num_tasks, |
| .sockets_per_node = &mc_ptr->sockets_per_node, |
| |
| .gres_list = &job_ptr->gres_list_req, |
| }; |
| |
| /* |
| * We need to reset the gres_list to what was requested |
| * instead of what was given exclusively. |
| */ |
| FREE_NULL_LIST(job_ptr->gres_list_req); |
| (void)gres_job_state_validate(&gres_js_val); |
| } |
| } |
| |
| /* Reset the priority (begin and accrue times were reset) */ |
| if (job_ptr->priority != 0) |
| set_job_prio(job_ptr); |
| |
| /* |
| * If a reservation ended and was a repeated (e.g., daily, weekly) |
| * reservation, its ID will be different; make sure |
| * job->resv_id matches the reservation id. |
| */ |
| if (job_ptr->resv_ptr) |
| job_ptr->resv_id = job_ptr->resv_ptr->resv_id; |
| |
| /* Reset this after the batch step has finished or the batch step |
| * information will be attributed to the next run of the job. */ |
| job_record_set_sluid(job_ptr); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| /* Submit new sibling jobs for fed jobs */ |
| if (fed_mgr_is_origin_job(job_ptr)) { |
| if (fed_mgr_job_requeue(job_ptr)) { |
| error("failed to submit requeued sibling jobs for fed %pJ", |
| job_ptr); |
| } |
| } |
| } |
| |
| |
| /* job_fini - free all memory associated with job records */ |
| void job_fini (void) |
| { |
| FREE_NULL_LIST(job_list); |
| xfree(job_hash); |
| xfree(job_array_hash_j); |
| xfree(job_array_hash_t); |
| FREE_NULL_LIST(purge_jobs_list); |
| FREE_NULL_LIST(purge_files_list); |
| FREE_NULL_BITMAP(requeue_exit); |
| FREE_NULL_BITMAP(requeue_exit_hold); |
| } |
| |
| /* Record the start of one job array task */ |
| extern void job_array_start(job_record_t *job_ptr) |
| { |
| job_record_t *base_job_ptr; |
| |
| if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { |
| base_job_ptr = find_job_record(job_ptr->array_job_id); |
| if (base_job_ptr && base_job_ptr->array_recs) { |
| base_job_ptr->array_recs->tot_run_tasks++; |
| } |
| } |
| } |
| |
| /* Return true if a job array task can be started */ |
| extern bool job_array_start_test(job_record_t *job_ptr) |
| { |
| job_record_t *base_job_ptr; |
| time_t now = time(NULL); |
| |
| if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { |
| base_job_ptr = find_job_record(job_ptr->array_job_id); |
| if (base_job_ptr && base_job_ptr->array_recs && |
| (base_job_ptr->array_recs->max_run_tasks != 0) && |
| (base_job_ptr->array_recs->tot_run_tasks >= |
| base_job_ptr->array_recs->max_run_tasks)) { |
| if (job_ptr->details && |
| (job_ptr->details->begin_time <= now)) |
| job_ptr->details->begin_time = (time_t) 0; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_ARRAY_TASK_LIMIT; |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| static void _job_array_comp(job_record_t *job_ptr, bool was_running, |
| bool requeue) |
| { |
| job_record_t *base_job_ptr; |
| uint32_t status; |
| |
| if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { |
| status = job_ptr->exit_code; |
| if ((status == 0) && !IS_JOB_COMPLETE(job_ptr)) { |
| /* Avoid max_exit_code == 0 if task did not run to |
| * successful completion (e.g. Cancelled, NodeFail) */ |
| status = 9; |
| } |
| base_job_ptr = find_job_record(job_ptr->array_job_id); |
| if (base_job_ptr && base_job_ptr->array_recs) { |
| if (requeue) { |
| base_job_ptr->array_recs->array_flags |= |
| ARRAY_TASK_REQUEUED; |
| } else if (!base_job_ptr->array_recs->tot_comp_tasks) { |
| base_job_ptr->array_recs->min_exit_code = |
| status; |
| base_job_ptr->array_recs->max_exit_code = |
| status; |
| } else { |
| base_job_ptr->array_recs->min_exit_code = |
| MIN(status, base_job_ptr-> |
| array_recs->min_exit_code); |
| base_job_ptr->array_recs->max_exit_code = |
| MAX(status, base_job_ptr-> |
| array_recs->max_exit_code); |
| } |
| if (was_running && |
| base_job_ptr->array_recs->tot_run_tasks) |
| base_job_ptr->array_recs->tot_run_tasks--; |
| base_job_ptr->array_recs->tot_comp_tasks++; |
| } |
| } |
| } |
| |
| /* log the completion of the specified job */ |
| extern void job_completion_logger(job_record_t *job_ptr, bool requeue) |
| { |
| int base_state; |
| bool arr_finished = false, task_failed = false, task_requeued = false; |
| bool was_running = false; |
| job_record_t *master_job = NULL; |
| uint32_t max_exit_code = 0; |
| |
| xassert(job_ptr); |
| |
| if (job_ptr->resv_ports) |
| resv_port_job_free(job_ptr); |
| |
| acct_policy_remove_job_submit(job_ptr, false); |
| if (job_ptr->nodes && ((job_ptr->bit_flags & JOB_KILL_HURRY) == 0) |
| && !IS_JOB_RESIZING(job_ptr)) { |
| (void) bb_g_job_start_stage_out(job_ptr); |
| } else if (job_ptr->nodes && IS_JOB_RESIZING(job_ptr)){ |
| debug("%s: %pJ resizing, skipping bb stage_out", |
| __func__, job_ptr); |
| } else { |
| /* |
| * Never allocated compute nodes. |
| * Unless job ran, there is no data to stage-out |
| */ |
| (void) bb_g_job_cancel(job_ptr); |
| } |
| if (job_ptr->bit_flags & JOB_WAS_RUNNING) { |
| job_ptr->bit_flags &= ~JOB_WAS_RUNNING; |
| was_running = true; |
| } |
| |
| _job_array_comp(job_ptr, was_running, requeue); |
| |
| if (!IS_JOB_RESIZING(job_ptr) && |
| (!IS_JOB_PENDING(job_ptr) || requeue) && |
| !IS_JOB_REVOKED(job_ptr) && |
| ((job_ptr->array_task_id == NO_VAL) || |
| (job_ptr->mail_type & MAIL_ARRAY_TASKS) || |
| (arr_finished = test_job_array_finished(job_ptr->array_job_id)))) { |
| /* Remove configuring state just to make sure it isn't there |
| * since it will throw off displays of the job. */ |
| job_state_unset_flag(job_ptr, JOB_CONFIGURING); |
| |
| /* make sure all parts of the job are notified |
| * Fed Jobs: only signal the srun from where the job is running |
| * or from the origin if the job wasn't running. */ |
| if (!job_ptr->fed_details || |
| fed_mgr_job_is_self_owned(job_ptr) || |
| (fed_mgr_is_origin_job(job_ptr) && |
| !fed_mgr_job_is_locked(job_ptr))) |
| srun_job_complete(job_ptr); |
| |
| /* mail out notifications of completion */ |
| if (arr_finished) { |
| /* We need to summarize different tasks states. */ |
| master_job = find_job_record(job_ptr->array_job_id); |
| if (master_job && master_job->array_recs) { |
| task_requeued = |
| (master_job->array_recs->array_flags & |
| ARRAY_TASK_REQUEUED); |
| if (task_requeued && |
| (job_ptr->mail_type & MAIL_JOB_REQUEUE)) { |
| /* |
| * At least 1 task requeued and job |
| * req. to be notified on requeues. |
| */ |
| mail_job_info(master_job, |
| MAIL_JOB_REQUEUE); |
| } |
| |
| max_exit_code = |
| master_job->array_recs->max_exit_code; |
| task_failed = (WIFEXITED(max_exit_code) && |
| WEXITSTATUS(max_exit_code)); |
| if (task_failed && |
| (job_ptr->mail_type & MAIL_JOB_FAIL)) { |
| /* |
| * At least 1 task failed and job |
| * req. to be notified on failures. |
| */ |
| mail_job_info(master_job, |
| MAIL_JOB_FAIL); |
| } else if (job_ptr->mail_type & MAIL_JOB_END) { |
| /* |
| * Job req. to be notified on END. |
| */ |
| mail_job_info(job_ptr, MAIL_JOB_END); |
| } |
| } |
| } else { |
| base_state = job_ptr->job_state & JOB_STATE_BASE; |
| if ((job_ptr->mail_type & MAIL_JOB_FAIL) && |
| (base_state >= JOB_FAILED) && |
| ((base_state != JOB_PREEMPTED) || !requeue)) |
| mail_job_info(job_ptr, MAIL_JOB_FAIL); |
| else if ((job_ptr->mail_type & MAIL_JOB_END) && |
| (base_state >= JOB_COMPLETE)) |
| mail_job_info(job_ptr, MAIL_JOB_END); |
| |
| if (requeue && |
| (job_ptr->mail_type & MAIL_JOB_REQUEUE)) |
| mail_job_info(job_ptr, |
| MAIL_JOB_REQUEUE); |
| |
| } |
| } |
| |
| if (!(job_ptr->bit_flags & TRES_STR_CALC) && |
| job_ptr->tres_alloc_cnt && |
| (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64)) |
| assoc_mgr_set_job_tres_alloc_str(job_ptr, false); |
| |
| jobcomp_g_record_job_end(job_ptr); |
| |
| jobacct_storage_g_job_complete(acct_db_conn, job_ptr); |
| } |
| |
| /* |
| * job_independent - determine if this job has a dependent job pending |
| * or if the job's scheduled begin time is in the future |
| * IN job_ptr - pointer to job being tested |
| * RET - true if job no longer must be deferred for another job |
| */ |
| extern bool job_independent(job_record_t *job_ptr) |
| { |
| job_details_t *detail_ptr = job_ptr->details; |
| time_t now = time(NULL); |
| int depend_rc; |
| |
| if ((job_ptr->state_reason == FAIL_BURST_BUFFER_OP) || |
| (job_ptr->state_reason == FAIL_ACCOUNT) || |
| (job_ptr->state_reason == FAIL_QOS) || |
| (job_ptr->state_reason == WAIT_HELD) || |
| (job_ptr->state_reason == WAIT_HELD_USER) || |
| (job_ptr->state_reason == WAIT_MAX_REQUEUE) || |
| (job_ptr->state_reason == WAIT_RESV_DELETED) || |
| (job_ptr->state_reason == WAIT_RESV_INVALID) || |
| (job_ptr->state_reason == WAIT_DEP_INVALID)) |
| return false; |
| |
| /* Test dependencies first so we can cancel jobs before dependent |
| * job records get purged (e.g. afterok, afternotok) */ |
| depend_rc = test_job_dependency(job_ptr, NULL); |
| if ((depend_rc == LOCAL_DEPEND) || (depend_rc == REMOTE_DEPEND)) { |
| /* start_time has passed but still has dependency which |
| * makes it ineligible */ |
| if (detail_ptr->begin_time < now) |
| detail_ptr->begin_time = 0; |
| job_ptr->state_reason = WAIT_DEPENDENCY; |
| xfree(job_ptr->state_desc); |
| return false; |
| } else if (depend_rc == FAIL_DEPEND) { |
| handle_invalid_dependency(job_ptr); |
| return false; |
| } |
| /* Job is eligible to start now */ |
| if (job_ptr->state_reason == WAIT_DEPENDENCY) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| /* Submit the job to its siblings. */ |
| if (job_ptr->details) { |
| fed_mgr_job_requeue(job_ptr); |
| } |
| } |
| |
| /* Check for maximum number of running tasks in a job array */ |
| if (!job_array_start_test(job_ptr)) |
| return false; |
| |
| if (detail_ptr && (detail_ptr->begin_time > now)) { |
| job_ptr->state_reason = WAIT_TIME; |
| xfree(job_ptr->state_desc); |
| return false; /* not yet time */ |
| } |
| |
| if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) { |
| job_ptr->state_reason = WAIT_RESERVATION; |
| xfree(job_ptr->state_desc); |
| return false; /* not yet time */ |
| } |
| |
| if ((detail_ptr && (detail_ptr->begin_time == 0) && |
| (job_ptr->priority != 0))) { |
| detail_ptr->begin_time = now; |
| /* |
| * Send begin time to the database if it is already there, or it |
| * won't get there until the job starts. |
| */ |
| if (IS_JOB_IN_DB(job_ptr)) |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| } else if (job_ptr->state_reason == WAIT_TIME) { |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| } |
| return true; |
| } |
| |
| /* |
|  * job_node_ready - report which launch prerequisites a job currently meets |
|  * IN job_id - job to test |
|  * OUT ready - bitmask built from READY_NODE_STATE, READY_PROLOG_STATE and |
|  *             READY_JOB_STATE; left at zero whenever an error is returned |
|  * RET SLURM_SUCCESS, ESLURM_INVALID_JOB_ID if the job is unknown, |
|  *     ESLURM_INVALID_PARTITION_NAME if the select plugin reports |
|  *     READY_JOB_FATAL, or EAGAIN if the job is still configuring or the |
|  *     select plugin reports READY_JOB_ERROR |
|  */ |
| extern int job_node_ready(uint32_t job_id, int *ready) |
| { |
|     job_record_t *job_ptr; |
|     int select_rc; |
|     int state = 0; |
|     xassert(ready); |
| |
|     *ready = 0; |
|     if (!(job_ptr = find_job_record(job_id))) |
|         return ESLURM_INVALID_JOB_ID; |
| |
|     /* |
|      * A configuring job may still be waiting on a booting node or on a |
|      * script such as PrologSlurmctld, so ask the caller to retry later. |
|      */ |
|     if (IS_JOB_CONFIGURING(job_ptr)) |
|         return EAGAIN; |
| |
|     /* |
|      * select_g_job_ready() is called unconditionally so that the select |
|      * plugin gets the chance to test and update its own state. |
|      */ |
|     select_rc = select_g_job_ready(job_ptr); |
|     if (select_rc == READY_JOB_FATAL) |
|         return ESLURM_INVALID_PARTITION_NAME; |
|     if (select_rc == READY_JOB_ERROR) |
|         return EAGAIN; |
|     if (select_rc != 0) |
|         state = READY_NODE_STATE; |
| |
|     if (job_ptr->details && !job_ptr->details->prolog_running) |
|         state |= READY_PROLOG_STATE; |
|     if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) |
|         state |= READY_JOB_STATE; |
| |
|     /* |
|      * With every prerequisite met, resolve a placeholder "TBD" alias list, |
|      * but only when none of the job's nodes is in power_down_node_bitmap. |
|      */ |
|     if ((state == |
|          (READY_NODE_STATE | READY_JOB_STATE | READY_PROLOG_STATE)) && |
|         job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") && |
|         job_ptr->node_bitmap && |
|         !bit_overlap_any(power_down_node_bitmap, job_ptr->node_bitmap)) { |
|         last_job_update = time(NULL); |
|         set_job_alias_list(job_ptr); |
|     } |
| |
|     *ready = state; |
|     return SLURM_SUCCESS; |
| } |
| |
| /* |
|  * _signal_job - queue a REQUEST_SIGNAL_TASKS RPC for every node in a job's |
|  *     node_bitmap so that the job's steps receive the signal |
|  * IN job_ptr - job to signal |
|  * IN signal - signal number to deliver |
|  * IN flags - KILL_* flags passed on in the message; when none of |
|  *     KILL_FULL_JOB, KILL_JOB_BATCH or KILL_STEPS_ONLY is set, |
|  *     KILL_STEPS_ONLY is sent instead |
|  * |
|  * On the normal path the request is handed to agent_queue_request() and is |
|  * not freed here. If the job has no nodes, everything allocated here is |
|  * released and nothing is queued. |
|  */ |
| static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags) |
| { |
|     node_record_t *node_ptr; |
|     agent_arg_t *agent_args = NULL; |
|     signal_tasks_msg_t *signal_job_msg = NULL; |
| |
|     agent_args = xmalloc(sizeof(agent_arg_t)); |
|     agent_args->msg_type = REQUEST_SIGNAL_TASKS; |
|     agent_args->retry = 1; |
|     agent_args->hostlist = hostlist_create(NULL); |
|     signal_job_msg = xmalloc(sizeof(signal_tasks_msg_t)); |
|     signal_job_msg->step_id.job_id = job_ptr->job_id; |
| |
|     /* |
|      * We don't ever want to kill a step with this message. The flags below |
|      * will make sure that does not happen. Just in case though, set the |
|      * step_id to an impossible number. |
|      */ |
|     signal_job_msg->step_id.step_id = slurm_conf.max_step_cnt + 1; |
|     signal_job_msg->step_id.step_het_comp = NO_VAL; |
| |
|     /* |
|      * Encode the flags for slurm stepd to know what steps get signaled. |
|      * Here if we aren't signaling the full job we always only want to |
|      * signal all other steps. |
|      */ |
|     if ((flags & KILL_FULL_JOB) || |
|         (flags & KILL_JOB_BATCH) || |
|         (flags & KILL_STEPS_ONLY)) |
|         signal_job_msg->flags = flags; |
|     else |
|         signal_job_msg->flags = KILL_STEPS_ONLY; |
| |
|     signal_job_msg->signal = signal; |
| |
|     /* Use the lowest protocol version found among the job's nodes */ |
|     agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
|     for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
|          i++) { |
|         if (agent_args->protocol_version > node_ptr->protocol_version) |
|             agent_args->protocol_version = |
|                 node_ptr->protocol_version; |
|         hostlist_push_host(agent_args->hostlist, node_ptr->name); |
|         agent_args->node_count++; |
|         if (PACK_FANOUT_ADDRS(node_ptr)) |
|             agent_args->msg_flags |= SLURM_PACK_ADDRS; |
|     } |
| |
|     if (agent_args->node_count == 0) { |
|         /* Nothing to send: release the hostlist too, not just the args */ |
|         xfree(signal_job_msg); |
|         hostlist_destroy(agent_args->hostlist); |
|         xfree(agent_args); |
|         return; |
|     } |
| |
|     agent_args->msg_args = signal_job_msg; |
|     set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
|     agent_queue_request(agent_args); |
| } |
| |
| /* |
|  * _suspend_job - queue a REQUEST_SUSPEND_INT RPC for every node in a job's |
|  *     node_bitmap, i.e. send the suspend/resume request to the slurmd of |
|  *     all nodes associated with the job |
|  * IN job_ptr - job to be suspended or resumed |
|  * IN op - SUSPEND_JOB or RESUME_JOB |
|  * |
|  * On the normal path the request is handed to agent_queue_request() and is |
|  * not freed here. If the job has no nodes, everything allocated here is |
|  * released and nothing is queued. |
|  */ |
| static void _suspend_job(job_record_t *job_ptr, uint16_t op) |
| { |
|     node_record_t *node_ptr; |
|     agent_arg_t *agent_args; |
|     suspend_int_msg_t *sus_ptr; |
| |
|     agent_args = xmalloc(sizeof(agent_arg_t)); |
|     agent_args->msg_type = REQUEST_SUSPEND_INT; |
|     agent_args->retry = 0;    /* don't resend, gang scheduler can |
|                                * quickly induce huge backlog |
|                                * of agent.c RPCs */ |
|     agent_args->hostlist = hostlist_create(NULL); |
|     sus_ptr = xmalloc(sizeof(suspend_int_msg_t)); |
|     sus_ptr->job_id = job_ptr->job_id; |
|     sus_ptr->op = op; |
| |
|     /* Use the lowest protocol version found among the job's nodes */ |
|     agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
|     for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
|          i++) { |
|         if (agent_args->protocol_version > node_ptr->protocol_version) |
|             agent_args->protocol_version = |
|                 node_ptr->protocol_version; |
|         hostlist_push_host(agent_args->hostlist, node_ptr->name); |
|         agent_args->node_count++; |
|         if (PACK_FANOUT_ADDRS(node_ptr)) |
|             agent_args->msg_flags |= SLURM_PACK_ADDRS; |
|     } |
| |
|     if (agent_args->node_count == 0) { |
|         /* Nothing to send: release the hostlist too, not just the args */ |
|         slurm_free_suspend_int_msg(sus_ptr); |
|         hostlist_destroy(agent_args->hostlist); |
|         xfree(agent_args); |
|         return; |
|     } |
| |
|     agent_args->msg_args = sus_ptr; |
|     set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
|     agent_queue_request(agent_args); |
| } |
| |
| /* |
|  * _suspend_job_nodes - update node bookkeeping for a job being suspended |
|  * IN job_ptr - job to be suspended |
|  * IN indf_susp - set if job is being suspended indefinitely by user |
|  *     or admin, otherwise suspended for gang scheduling |
|  * RET SLURM_SUCCESS, or the error returned by select_g_job_suspend() (in |
|  *     which case no node record is touched) |
|  */ |
| static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp) |
| { |
|     node_record_t *node_ptr; |
|     time_t now = time(NULL); |
|     int rc = select_g_job_suspend(job_ptr, indf_susp); |
| |
|     if (rc != SLURM_SUCCESS) |
|         return rc; |
| |
|     for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
|          i++) { |
|         uint32_t node_flags; |
| |
|         /* Move this job from the running to the suspended count */ |
|         node_ptr->sus_job_cnt++; |
|         if (!node_ptr->run_job_cnt) { |
|             error("%s: %pJ node %s run_job_cnt underflow", |
|                   __func__, job_ptr, node_ptr->name); |
|         } else { |
|             node_ptr->run_job_cnt--; |
|         } |
| |
|         /* A job with share_res == 0 also held a no-share slot */ |
|         if (job_ptr->details && (job_ptr->details->share_res == 0)) { |
|             if (!node_ptr->no_share_job_cnt) { |
|                 error("%s: %pJ node %s no_share_job_cnt underflow", |
|                       __func__, job_ptr, node_ptr->name); |
|             } else { |
|                 node_ptr->no_share_job_cnt--; |
|             } |
|             if (!node_ptr->no_share_job_cnt) |
|                 bit_set(share_node_bitmap, i); |
|         } |
| |
|         node_flags = node_ptr->node_state & NODE_STATE_FLAGS; |
|         if (!node_ptr->run_job_cnt && !node_ptr->comp_job_cnt) |
|             bit_set(idle_node_bitmap, i); |
| |
|         /* A DOWN node keeps its state; others become ALLOCATED or IDLE */ |
|         if (IS_NODE_DOWN(node_ptr)) { |
|             debug3("%s: %pJ node %s left DOWN", |
|                    __func__, job_ptr, node_ptr->name); |
|         } else if (node_ptr->run_job_cnt) { |
|             node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; |
|         } else { |
|             node_ptr->node_state = NODE_STATE_IDLE | node_flags; |
|             node_ptr->last_busy = now; |
|         } |
|     } |
| |
|     last_node_update = now; |
|     last_job_update = now; |
|     return rc; |
| } |
| |
| /* |
|  * _resume_job_nodes - update node bookkeeping for a job being resumed |
|  * IN job_ptr - job to be resumed |
|  * IN indf_susp - set if job is being resumed from indefinite suspend by user |
|  *     or admin, otherwise resume from gang scheduling |
|  * RET SLURM_SUCCESS, the error returned by select_g_job_resume(), or |
|  *     SLURM_ERROR if any of the job's nodes is DOWN |
|  */ |
| static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp) |
| { |
|     node_record_t *node_ptr; |
|     int rc = select_g_job_resume(job_ptr, indf_susp); |
| |
|     if (rc != SLURM_SUCCESS) |
|         return rc; |
| |
|     /* First pass: give up before changing any node if one is DOWN */ |
|     for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
|          i++) { |
|         if (IS_NODE_DOWN(node_ptr)) |
|             return SLURM_ERROR; |
|     } |
| |
|     /* Second pass: move the job from suspended back to running */ |
|     for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
|          i++) { |
|         if (!node_ptr->sus_job_cnt) { |
|             error("Node %s sus_job_cnt underflow", |
|                   node_ptr->name); |
|         } else { |
|             node_ptr->sus_job_cnt--; |
|         } |
|         node_ptr->run_job_cnt++; |
| |
|         if (job_ptr->details && (job_ptr->details->share_res == 0)) { |
|             node_ptr->no_share_job_cnt++; |
|             if (node_ptr->no_share_job_cnt) |
|                 bit_clear(share_node_bitmap, i); |
|         } |
| |
|         if (slurm_mcs_get_select(job_ptr) == 1) { |
|             xfree(node_ptr->mcs_label); |
|             node_ptr->mcs_label = xstrdup(job_ptr->mcs_label); |
|         } |
| |
|         bit_clear(idle_node_bitmap, i); |
|         node_ptr->node_state = NODE_STATE_ALLOCATED | |
|             (node_ptr->node_state & NODE_STATE_FLAGS); |
|     } |
| |
|     last_job_update = last_node_update = time(NULL); |
|     return rc; |
| } |
| |
| /* |
|  * list_find_first() callback for _job_resume_test() |
|  * IN x - job record from the list being searched |
|  * IN arg - job record that is about to be resumed |
|  * RET -1 (match) if x is a running job with core specialization set whose |
|  *     nodes overlap those of arg, otherwise 0 |
|  */ |
| static int _foreach_job_resume_test(void *x, void *arg) |
| { |
|     job_record_t *candidate = x; |
|     job_record_t *resuming = arg; |
| |
|     if (!candidate->details || |
|         (candidate->details->core_spec == NO_VAL16)) |
|         return 0; |
|     if (!IS_JOB_RUNNING(candidate) || !candidate->node_bitmap) |
|         return 0; |
|     if (!bit_overlap_any(candidate->node_bitmap, resuming->node_bitmap)) |
|         return 0; |
| |
|     /* FIXME: Also test for ESLURM_INTERCONNECT_BUSY */ |
|     return -1; |
| } |
| |
| /* |
|  * _job_resume_test - decide whether a job may be resumed right now |
|  * Only jobs with core specialization and a node bitmap are checked: such a |
|  * job cannot resume while job_list holds a match for |
|  * _foreach_job_resume_test(). |
|  * IN job_ptr - job to be resumed |
|  * RET SLURM_SUCCESS if the job may resume, otherwise ESLURM_NODES_BUSY |
|  */ |
| static int _job_resume_test(job_record_t *job_ptr) |
| { |
|     if (!job_ptr->details || |
|         (job_ptr->details->core_spec == NO_VAL16) || |
|         !job_ptr->node_bitmap) |
|         return SLURM_SUCCESS; |
| |
|     return list_find_first(job_list, _foreach_job_resume_test, job_ptr) ? |
|         ESLURM_NODES_BUSY : SLURM_SUCCESS; |
| } |
| |
| /* |
|  * _job_suspend_op - suspend or resume a single job record |
|  * IN job_ptr - job to operate upon |
|  * IN op - SUSPEND_JOB or RESUME_JOB |
|  * IN indf_susp - set if job is being suspended indefinitely by user or admin |
|  *     and we should clear its priority, otherwise suspended |
|  *     temporarily for gang scheduling |
|  * RET 0 on success, otherwise ESLURM error code |
|  */ |
| static int _job_suspend_op(job_record_t *job_ptr, uint16_t op, bool indf_susp) |
| { |
|     time_t now = time(NULL); |
|     int rc = SLURM_SUCCESS; |
| |
|     if (IS_JOB_PENDING(job_ptr)) |
|         return ESLURM_JOB_PENDING; |
|     if (IS_JOB_FINISHED(job_ptr)) |
|         return ESLURM_ALREADY_DONE; |
| |
|     if (op == RESUME_JOB) { |
|         rc = _job_resume_test(job_ptr); |
|         if (rc) |
|             return rc; |
|     } |
| |
|     if (op == SUSPEND_JOB) { |
|         time_t run_start; |
| |
|         /* Already suspended: an indefinite suspend only adds the hold */ |
|         if (indf_susp && IS_JOB_SUSPENDED(job_ptr)) { |
|             debug("%s: Holding %pJ, re-suspend operation", |
|                   __func__, job_ptr); |
|             job_ptr->priority = 0;    /* Prevent gang sched resume */ |
|             return SLURM_SUCCESS; |
|         } |
|         if (!IS_JOB_RUNNING(job_ptr)) |
|             return ESLURM_JOB_NOT_RUNNING; |
| |
|         if ((rc = _suspend_job_nodes(job_ptr, indf_susp)) != |
|             SLURM_SUCCESS) |
|             return rc; |
|         _suspend_job(job_ptr, op); |
|         job_state_set(job_ptr, JOB_SUSPENDED); |
| |
|         if (indf_susp) { |
|             /* Job being manually suspended, not gang */ |
|             debug("%s: Holding %pJ, suspend operation", |
|                   __func__, job_ptr); |
|             job_ptr->priority = 0; |
|             (void) gs_job_fini(job_ptr); |
|         } |
| |
|         /* |
|          * Add the run time since the last suspend/resume event, or |
|          * since job start when suspend_time was never set. |
|          */ |
|         run_start = job_ptr->suspend_time ? |
|             job_ptr->suspend_time : job_ptr->start_time; |
|         job_ptr->pre_sus_time += difftime(now, run_start); |
|         suspend_job_step(job_ptr); |
|     } else if (op == RESUME_JOB) { |
|         if (!IS_JOB_SUSPENDED(job_ptr)) |
|             return ESLURM_JOB_NOT_SUSPENDED; |
| |
|         if ((rc = _resume_job_nodes(job_ptr, indf_susp)) != |
|             SLURM_SUCCESS) |
|             return rc; |
|         _suspend_job(job_ptr, op); |
| |
|         if (!job_ptr->priority) { |
|             /* Job was manually suspended, not gang */ |
|             set_job_prio(job_ptr); |
|             (void) gs_job_start(job_ptr); |
|         } |
|         job_state_set(job_ptr, JOB_RUNNING); |
|         job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); |
| |
|         /* Push the end time out by the time already run before suspend */ |
|         if (!job_ptr->preempt_time && |
|             (job_ptr->time_limit != INFINITE)) { |
|             debug3("%pJ resumed, updating end_time", job_ptr); |
|             job_ptr->end_time = now + (job_ptr->time_limit * 60) |
|                 - job_ptr->pre_sus_time; |
|             job_ptr->end_time_exp = job_ptr->end_time; |
|         } |
|         resume_job_step(job_ptr); |
|     } |
| |
|     /* suspend_time records the moment of the latest suspend OR resume */ |
|     job_ptr->time_last_active = now; |
|     job_ptr->suspend_time = now; |
|     jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| |
|     return rc; |
| } |
| |
| /* |
|  * list_for_each() callback: apply a suspend/resume operation to one hetjob |
|  * component |
|  * IN x - hetjob component job record |
|  * IN/OUT arg - foreach_sus_hetjob_t; its rc field is overwritten with the |
|  *     error code whenever a component fails |
|  * RET always 0 so that the remaining components are still visited |
|  */ |
| static int _foreach_hetjob_suspend(void *x, void *arg) |
| { |
|     job_record_t *component = x; |
|     foreach_sus_hetjob_t *ctx = arg; |
| |
|     if (ctx->het_leader->het_job_id == component->het_job_id) { |
|         int rc = _job_suspend_op(component, ctx->op, ctx->indf_susp); |
| |
|         if (rc != SLURM_SUCCESS) |
|             ctx->rc = rc; |
|     } else { |
|         error("%s: Bad het_job_list for %pJ", |
|               __func__, ctx->het_leader); |
|     } |
| |
|     return 0; |
| } |
| |
| /* |
|  * _job_suspend - suspend or resume a job; when the record is a hetjob |
|  *     leader the operation is applied to every component of the hetjob |
|  * IN job_ptr - job to operate upon |
|  * IN op - operation: suspend/resume |
|  * IN indf_susp - set if job is being suspended indefinitely by user or admin |
|  *     and we should clear its priority, otherwise suspended |
|  *     temporarily for gang scheduling |
|  * RET 0 on success, otherwise ESLURM error code (ESLURM_NOT_WHOLE_HET_JOB if |
|  *     the record is a hetjob component without a het_job_list) |
|  */ |
| static int _job_suspend(job_record_t *job_ptr, uint16_t op, bool indf_susp) |
| { |
|     if (job_ptr->het_job_id && !job_ptr->het_job_list) |
|         return ESLURM_NOT_WHOLE_HET_JOB; |
| |
|     /* Notify salloc/srun of suspend/resume */ |
|     srun_job_suspend(job_ptr, op); |
| |
|     if (!job_ptr->het_job_list) |
|         return _job_suspend_op(job_ptr, op, indf_susp); |
| |
|     /* Hetjob leader: walk every component, reporting a failure if any */ |
|     foreach_sus_hetjob_t sus_hetjob = { |
|         .het_leader = job_ptr, |
|         .indf_susp = indf_susp, |
|         .op = op, |
|         .rc = SLURM_SUCCESS, |
|     }; |
|     (void) list_for_each(job_ptr->het_job_list, _foreach_hetjob_suspend, |
|                          &sus_hetjob); |
| |
|     return sus_hetjob.rc; |
| } |
| |
| /* |
|  * job_suspend - perform some suspend/resume operation |
|  * NOTE: job_suspend - Uses the job_id field and ignores job_id_str; any |
|  *     job_id_str supplied in sus_ptr is freed and left NULL on return |
|  * |
|  * IN msg - original msg; when NULL no reply is sent |
|  * IN sus_ptr - suspend/resume request message |
|  * IN uid - user id of the user issuing the RPC |
|  * IN indf_susp - set if job is being suspended indefinitely by user or admin |
|  *     and we should clear its priority, otherwise suspended |
|  *     temporarily for gang scheduling |
|  * IN protocol_version - slurm protocol version of client (not used here) |
|  * RET 0 on success, otherwise ESLURM error code |
|  */ |
| extern int job_suspend(slurm_msg_t *msg, suspend_msg_t *sus_ptr, uid_t uid, |
|                        bool indf_susp, uint16_t protocol_version) |
| { |
|     job_record_t *job_ptr; |
|     int rc; |
| |
|     xfree(sus_ptr->job_id_str); |
|     xstrfmtcat(sus_ptr->job_id_str, "%u", sus_ptr->job_id); |
| |
|     job_ptr = find_job_record(sus_ptr->job_id); |
|     if (!job_ptr) { |
|         rc = ESLURM_INVALID_JOB_ID; |
|     } else if (!validate_operator(uid) && |
|                !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
|                                              job_ptr->account, false)) { |
|         /* Caller passed neither the operator nor the coordinator check */ |
|         error("SECURITY VIOLATION: Attempt to suspend job from user %u", |
|               uid); |
|         rc = ESLURM_ACCESS_DENIED; |
|     } else { |
|         rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp); |
|     } |
| |
|     /* The string built above is no longer needed; do not leak it */ |
|     xfree(sus_ptr->job_id_str); |
| |
|     if (msg) |
|         slurm_send_rc_msg(msg, rc); |
| |
|     return rc; |
| } |
| |
| /* |
|  * job_suspend2 - perform some suspend/resume operation |
|  * NB job_suspend2 - Ignores the job_id field and uses job_id_str |
|  * |
|  * job_id_str may name a single job ("123"), a whole job array ("123"), or |
|  * selected array tasks ("123_4", "123_[1-3]"). A hetjob component ("123+1") |
|  * is rejected with ESLURM_NOT_WHOLE_HET_JOB. |
|  * |
|  * IN msg - original msg; when NULL no reply is sent |
|  * IN sus_ptr - suspend/resume request message |
|  * IN uid - user id of the user issuing the RPC |
|  * IN indf_susp - set if job is being suspended indefinitely by user or admin |
|  *     and we should clear its priority, otherwise suspended |
|  *     temporarily for gang scheduling |
|  * IN protocol_version - slurm protocol version of client (not used here) |
|  * RET 0 on success, otherwise ESLURM error code. For job arrays the per-task |
|  *     results are only reported through the RESPONSE_JOB_ARRAY_ERRORS |
|  *     reply, not through the return value. |
|  */ |
| extern int job_suspend2(slurm_msg_t *msg, suspend_msg_t *sus_ptr, uid_t uid, |
|                         bool indf_susp, uint16_t protocol_version) |
| { |
|     int rc = SLURM_SUCCESS, rc2; |
|     job_record_t *job_ptr = NULL; |
|     long int long_id; |
|     uint32_t job_id = 0; |
|     char *end_ptr = NULL; |
|     bitstr_t *array_bitmap = NULL; |
|     resp_array_struct_t *resp_array = NULL; |
| |
|     if (max_array_size == NO_VAL) { |
|         max_array_size = slurm_conf.max_array_sz; |
|     } |
| |
|     long_id = strtol(sus_ptr->job_id_str, &end_ptr, 10); |
|     if (end_ptr[0] == '+') |
|         rc = ESLURM_NOT_WHOLE_HET_JOB; |
|     else if ((long_id <= 0) || (long_id == LONG_MAX) || |
|              (long_id >= NO_VAL) || |
|              ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) |
|         /* |
|          * IDs at or above NO_VAL are never job IDs. Without this check a |
|          * value wider than 32 bits would be silently truncated by the |
|          * cast below and could select an unrelated job. |
|          */ |
|         rc = ESLURM_INVALID_JOB_ID; |
|     else { |
|         job_id = (uint32_t) long_id; |
|         job_ptr = find_job_record(job_id); |
|         if (job_ptr == NULL) |
|             rc = ESLURM_INVALID_JOB_ID; |
|     } |
|     if (rc != SLURM_SUCCESS) { |
|         info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str); |
|         goto reply; |
|     } |
| |
|     /* validate the request */ |
|     if (!validate_operator(uid) && |
|         !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
|                                       job_ptr->account, false)) { |
|         error("SECURITY VIOLATION: Attempt to suspend job from user %u", |
|               uid); |
|         rc = ESLURM_ACCESS_DENIED; |
|         goto reply; |
|     } |
| |
|     if (end_ptr[0] == '\0') {    /* Single job (or full job array) */ |
|         job_record_t *job_ptr_done = NULL; |
|         if (job_ptr && |
|             (((job_ptr->array_task_id == NO_VAL) && |
|               (job_ptr->array_recs == NULL)) || |
|              ((job_ptr->array_task_id != NO_VAL) && |
|               (job_ptr->array_job_id != job_id)))) { |
|             /* This is a regular job or single task of job array */ |
|             rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp); |
|             goto reply; |
|         } |
| |
|         if (job_ptr && job_ptr->array_recs) { |
|             /* This is a job array */ |
|             job_ptr_done = job_ptr; |
|             rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp); |
|             _resp_array_add(&resp_array, job_ptr, rc2, NULL); |
|         } |
| |
|         /* Suspend all tasks of this job array */ |
|         job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; |
|         if (!job_ptr && !job_ptr_done) { |
|             rc = ESLURM_INVALID_JOB_ID; |
|             goto reply; |
|         } |
|         while (job_ptr) { |
|             /* Skip the record already handled above */ |
|             if ((job_ptr->array_job_id == job_id) && |
|                 (job_ptr != job_ptr_done)) { |
|                 rc2 = _job_suspend(job_ptr, sus_ptr->op, |
|                                    indf_susp); |
|                 _resp_array_add(&resp_array, job_ptr, rc2, |
|                                 NULL); |
|             } |
|             job_ptr = job_ptr->job_array_next_j; |
|         } |
|         goto reply; |
|     } |
| |
|     /* Explicit task list after the '_' separator */ |
|     array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size, |
|                                           NULL); |
|     if (!array_bitmap) { |
|         info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str); |
|         rc = ESLURM_INVALID_JOB_ID; |
|         goto reply; |
|     } |
| |
|     for (int i = 0; (i = bit_ffs_from_bit(array_bitmap, i)) >= 0; i++) { |
|         job_ptr = find_job_array_rec(job_id, i); |
|         if (job_ptr == NULL) { |
|             info("%s: invalid JobId=%u_%d", __func__, job_id, i); |
|             _resp_array_add_id(&resp_array, job_id, i, |
|                                ESLURM_INVALID_JOB_ID); |
|             continue; |
|         } |
|         rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp); |
|         _resp_array_add(&resp_array, job_ptr, rc2, NULL); |
|     } |
| |
| reply: |
|     /* Only reply when the caller supplied a message to answer */ |
|     if (msg) { |
|         if (resp_array) { |
|             job_array_resp_msg_t *resp_array_msg = |
|                 _resp_array_xlate(resp_array, job_id); |
|             (void) send_msg_response(msg, RESPONSE_JOB_ARRAY_ERRORS, |
|                                      resp_array_msg); |
|             slurm_free_job_array_resp(resp_array_msg); |
|         } else { |
|             slurm_send_rc_msg(msg, rc); |
|         } |
|     } |
| |
|     _resp_array_free(resp_array); |
| |
|     FREE_NULL_BITMAP(array_bitmap); |
| |
|     return rc; |
| } |
| |
| /* |
| * _job_requeue_op - Requeue a running or pending batch job |
| * IN uid - user id of user issuing the RPC |
| * IN job_ptr - job to be requeued |
| * IN preempt - true if job being preempted |
| * IN flags - requeue modifiers; the bits tested here are the JOB_STATE_BASE |
| * bits (compared against JOB_RUNNING), JOB_RECONFIG_FAIL, |
| * JOB_LAUNCH_FAILED, JOB_GETENV_FAILED, JOB_SPECIAL_EXIT and |
| * JOB_REQUEUE_HOLD |
| * RET 0 on success, otherwise ESLURM error code: ESLURM_ACCESS_DENIED, |
| * ESLURM_DISABLED, ESLURM_BATCH_ONLY, ESLURM_JOB_PENDING, or whatever |
| * fed_mgr_job_requeue_test() returns for a federated job |
| * |
| * NOTE: the "nohold_on_prolog_fail" setting is cached in a function-static |
| * variable and re-read whenever slurm_conf.last_update changes. |
| */ |
| static int _job_requeue_op(uid_t uid, job_record_t *job_ptr, bool preempt, |
| uint32_t flags) |
| { |
| static time_t config_update = 0; /* slurm_conf.last_update at last refresh */ |
| static bool requeue_nohold_prolog = true; /* cached: sched_params has nohold_on_prolog_fail */ |
| bool is_running = false, is_suspended = false, is_completed = false; |
| bool is_completing = false; |
| bool force_requeue = false; |
| time_t now = time(NULL); |
| uint32_t completing_flags = 0; /* JOB_STATE_FLAGS bits saved from a completing job */ |
| |
| if (config_update != slurm_conf.last_update) { |
| requeue_nohold_prolog = (xstrcasestr(slurm_conf.sched_params, |
| "nohold_on_prolog_fail")); |
| config_update = slurm_conf.last_update; |
| } |
| |
| /* validate the request: the caller must be the job's owner, or pass |
| * validate_operator() or the account coordinator check */ |
| if ((uid != job_ptr->user_id) && !validate_operator(uid) && |
| !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, |
| job_ptr->account, false)) { |
| return ESLURM_ACCESS_DENIED; |
| } |
| |
| if (((flags & JOB_STATE_BASE) == JOB_RUNNING) && |
| !IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) { |
| return SLURM_SUCCESS; /* only a running/suspended job was to be requeued */ |
| } |
| |
| if (flags & JOB_RECONFIG_FAIL) |
| node_features_g_get_node(job_ptr->nodes); |
| |
| /* |
| * If the partition was removed don't allow the job to be |
| * requeued. If it doesn't have details then something is very |
| * wrong and if the job doesn't want to be requeued don't unless |
| * it's being forced to do so after a launch failure. |
| */ |
| if ((flags & JOB_LAUNCH_FAILED) && |
| (slurm_conf.prolog_flags & PROLOG_FLAG_FORCE_REQUEUE_ON_FAIL)) |
| force_requeue = true; |
| if (!job_ptr->part_ptr || !job_ptr->details |
| || (!job_ptr->details->requeue && !force_requeue)) { |
| if (flags & JOB_RECONFIG_FAIL) |
| (void) _job_fail(job_ptr, JOB_BOOT_FAIL); |
| return ESLURM_DISABLED; |
| } |
| |
| if (job_ptr->batch_flag == 0) { |
| debug("Job-requeue can only be done for batch jobs"); |
| if (flags & JOB_RECONFIG_FAIL) |
| (void) _job_fail(job_ptr, JOB_BOOT_FAIL); |
| return ESLURM_BATCH_ONLY; |
| } |
| |
| /* |
| * If the job is already pending, just return an error. |
| * A federated origin job can be pending and revoked with a sibling job |
| * on another cluster. |
| */ |
| if (IS_JOB_PENDING(job_ptr) && |
| (!job_ptr->fed_details || !job_ptr->fed_details->cluster_lock)) |
| return ESLURM_JOB_PENDING; |
| |
| if ((flags & JOB_RECONFIG_FAIL) && IS_JOB_CANCELLED(job_ptr)) { |
| /* |
| * Job was cancelled (likely by the user) while node |
| * reconfiguration was in progress, so don't requeue it |
| * if the node reconfiguration failed. |
| */ |
| return ESLURM_DISABLED; |
| } |
| |
| if (job_ptr->fed_details) { |
| int rc; |
| if ((rc = fed_mgr_job_requeue_test(job_ptr, flags))) |
| return rc; |
| |
| /* Sent requeue request to origin cluster */ |
| if (job_ptr->job_state & JOB_REQUEUE_FED) |
| return SLURM_SUCCESS; |
| } |
| |
| last_job_update = now; |
| |
| /* |
| * If the job is in the process of completing |
| * return SLURM_SUCCESS and set the status |
| * to JOB_PENDING since we support requeue |
| * of done/exit/exiting jobs. |
| */ |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| completing_flags = job_ptr->job_state & JOB_STATE_FLAGS; |
| is_completing = true; |
| } |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) { |
| uint32_t suspend_job_state = job_ptr->job_state; |
| /* |
| * we can't have it as suspended when we call the |
| * accounting stuff. |
| */ |
| job_state_set(job_ptr, JOB_REQUEUE); |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| job_state_set(job_ptr, suspend_job_state); |
| is_suspended = true; |
| } |
| |
| job_ptr->time_last_active = now; |
| if (is_suspended) |
| job_ptr->end_time = job_ptr->suspend_time; |
| else if (!is_completing) |
| job_ptr->end_time = now; |
| |
| /* |
| * Save the state of the job so that |
| * we deallocate the nodes if is in |
| * running state. |
| */ |
| if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) |
| is_running = true; |
| else if (IS_JOB_COMPLETED(job_ptr)) |
| is_completed = true; |
| |
| /* Only change state to requeue for local jobs */ |
| if (fed_mgr_is_origin_job(job_ptr) && |
| !fed_mgr_is_tracker_only_job(job_ptr)) { |
| /* |
| * We want this job to have the requeued/preempted state in the |
| * accounting logs. Set a new submit time so the restarted |
| * job looks like a new job. |
| */ |
| if (preempt) { |
| job_state_set(job_ptr, JOB_PREEMPTED); |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, true); |
| job_state_set(job_ptr, JOB_REQUEUE); |
| } else { |
| job_state_set(job_ptr, JOB_REQUEUE); |
| build_cg_bitmap(job_ptr); |
| job_completion_logger(job_ptr, true); |
| } |
| } |
| |
| /* |
| * Increment restart counter before completing reply so that completing |
| * jobs get counted and so that fed jobs get counted before submitting |
| * new siblings in batch_requeue_fini() |
| */ |
| job_ptr->restart_cnt++; |
| |
| if (is_completing) { |
| job_state_set(job_ptr, (JOB_PENDING | completing_flags)); |
| goto reply; /* skips node deallocation and the state changes below */ |
| } |
| |
| /* |
| * Deallocate resources only if the job has some. |
| * JOB_COMPLETING is needed to properly clean up steps. |
| */ |
| if (is_running) { |
| job_state_set_flag(job_ptr, JOB_COMPLETING); |
| deallocate_nodes(job_ptr, false, is_suspended, preempt); |
| if (!IS_JOB_COMPLETING(job_ptr) && !job_ptr->fed_details) |
| is_completed = true; |
| else |
| job_state_unset_flag(job_ptr, JOB_COMPLETING); |
| } |
| |
| _set_requeued_job_pending_completing(job_ptr); |
| |
| /* |
| * Mark the origin job as requeuing. Will finish requeuing fed job |
| * after job has completed. |
| * If it's completed, batch_requeue_fini is called below and will call |
| * fed_mgr_job_requeue() to submit new siblings. |
| * If it's not completed, batch_requeue_fini will either be called when |
| * the running origin job finishes or the running remote sibling job |
| * reports that the job is finished. |
| */ |
| if (job_ptr->fed_details && !is_completed) { |
| job_state_set_flag(job_ptr, (JOB_COMPLETING | JOB_REQUEUE_FED)); |
| } |
| |
| /* |
| * If we set the time limit it means the user didn't so reset |
| * it here or we could bust some limit when we try again |
| */ |
| if (job_ptr->limit_set.time == 1) { |
| job_ptr->time_limit = NO_VAL; |
| job_ptr->limit_set.time = 0; |
| } |
| |
| reply: |
| job_ptr->pre_sus_time = (time_t) 0; |
| job_ptr->suspend_time = (time_t) 0; |
| job_ptr->tot_sus_time = (time_t) 0; |
| |
| job_ptr->db_flags = 0; |
| |
| /* clear signal sent flag on requeue */ |
| job_ptr->warn_flags &= ~WARN_SENT; |
| |
| /* |
| * Since the job completion logger removes the submit we need |
| * to add it again. |
| */ |
| acct_policy_add_job_submit(job_ptr, false); |
| |
| acct_policy_update_pending_job(job_ptr); |
| |
| if (flags & JOB_SPECIAL_EXIT) { |
| job_state_set_flag(job_ptr, JOB_SPECIAL_EXIT); |
| job_ptr->state_reason = WAIT_HELD_USER; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_desc = |
| xstrdup("job requeued in special exit state"); |
| debug("%s: Holding %pJ, special exit", __func__, job_ptr); |
| job_ptr->priority = 0; |
| } |
| if (flags & JOB_REQUEUE_HOLD) { |
| job_ptr->state_reason = WAIT_HELD_USER; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_desc = xstrdup("job requeued in held state"); |
| debug("%s: Holding %pJ, requeue-hold exit", __func__, job_ptr); |
| job_ptr->priority = 0; |
| } |
| if (flags & JOB_LAUNCH_FAILED) { |
| job_ptr->batch_flag++; |
| _handle_requeue_limit(job_ptr, __func__); |
| |
| /* If job not already held, make it so if needed: a launch failure |
| * holds the job unless nohold_on_prolog_fail is configured, and a |
| * JOB_GETENV_FAILED failure holds it regardless. */ |
| if (!(job_ptr->job_state & JOB_REQUEUE_HOLD) && |
| ((!requeue_nohold_prolog || (flags & JOB_GETENV_FAILED)))) { |
| job_ptr->state_reason = WAIT_HELD_USER; |
| xfree(job_ptr->state_desc); |
| if (flags & JOB_GETENV_FAILED) { |
| job_ptr->state_desc = |
| xstrdup("user env retrieval failed requeued held"); |
| debug("%s: Holding %pJ due to user environment retrieval failure or timeout", |
| __func__, job_ptr); |
| } else { |
| job_ptr->state_desc = |
| xstrdup("launch failed requeued held"); |
| debug("%s: Holding %pJ due to prolog failure", |
| __func__, job_ptr); |
| } |
| job_ptr->priority = 0; |
| } |
| } |
| |
| /* |
| * When jobs are requeued while running/completing batch_requeue_fini is |
| * called after the job is completely finished. If the job is already |
| * finished it needs to be called to clear out states (especially the |
| * db_index or we will just write over the last job in the database). |
| * Call batch_requeue_fini after setting priority to 0 for requeue_hold |
| * and special_exit so federation doesn't submit siblings for held job. |
| */ |
| if (is_completed) |
| batch_requeue_fini(job_ptr); |
| |
| debug("%s: %pJ state 0x%x reason %u priority %d", |
| __func__, job_ptr, job_ptr->job_state, |
| job_ptr->state_reason, job_ptr->priority); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
|  * list_for_each() callback: requeue one hetjob component |
|  * IN x - hetjob component job record |
|  * IN/OUT arg - foreach_requeue_hetjob_t; its rc field is overwritten with |
|  *     the error code whenever a component fails |
|  * RET always 0 so that the remaining components are still visited |
|  */ |
| static int _foreach_hetjob_requeue(void *x, void *arg) |
| { |
|     job_record_t *component = x; |
|     foreach_requeue_hetjob_t *ctx = arg; |
| |
|     if (ctx->het_leader->het_job_id == component->het_job_id) { |
|         int rc = _job_requeue_op(ctx->uid, component, ctx->preempt, |
|                                  ctx->flags); |
| |
|         if (rc != SLURM_SUCCESS) |
|             ctx->rc = rc; |
|     } else { |
|         error("%s: Bad het_job_list for %pJ", |
|               __func__, ctx->het_leader); |
|     } |
| |
|     return 0; |
| } |
| |
| /* |
|  * _job_requeue - Requeue a running or pending batch job; when the record is |
|  *     a hetjob leader the operation is applied to every component |
|  * IN uid - user id of user issuing the RPC |
|  * IN job_ptr - job to be requeued |
|  * IN preempt - true if job being preempted |
|  * IN flags - requeue flags, passed through to _job_requeue_op() |
|  * RET 0 on success, otherwise ESLURM error code (ESLURM_NOT_HET_JOB_LEADER |
|  *     if the record is a hetjob component without a het_job_list) |
|  */ |
| static int _job_requeue(uid_t uid, job_record_t *job_ptr, bool preempt, |
|                         uint32_t flags) |
| { |
|     if (!job_ptr->het_job_list) { |
|         if (job_ptr->het_job_id) |
|             return ESLURM_NOT_HET_JOB_LEADER; |
|         return _job_requeue_op(uid, job_ptr, preempt, flags); |
|     } |
| |
|     /* Hetjob leader: walk every component, reporting a failure if any */ |
|     foreach_requeue_hetjob_t requeue_hetjob = { |
|         .flags = flags, |
|         .het_leader = job_ptr, |
|         .preempt = preempt, |
|         .rc = SLURM_SUCCESS, |
|         .uid = uid, |
|     }; |
|     (void) list_for_each(job_ptr->het_job_list, _foreach_hetjob_requeue, |
|                          &requeue_hetjob); |
| |
|     return requeue_hetjob.rc; |
| } |
| |
| /* |
|  * job_requeue - Requeue a running or pending batch job |
|  * IN uid - user id of user issuing the RPC |
|  * IN job_id - id of the job to be requeued |
|  * IN msg - slurm_msg to send response back on; when NULL no reply is sent |
|  * IN preempt - true if job being preempted |
|  * IN flags - JobExitRequeue | Hold | JobFailed | etc. |
|  * RET 0 on success, otherwise ESLURM error code |
|  */ |
| extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg, |
|                        bool preempt, uint32_t flags) |
| { |
|     job_record_t *job_ptr = find_job_record(job_id); |
|     int rc; |
| |
|     /* _job_requeue already handles het jobs */ |
|     if (job_ptr) |
|         rc = _job_requeue(uid, job_ptr, preempt, flags); |
|     else |
|         rc = ESLURM_INVALID_JOB_ID; |
| |
|     if (msg) |
|         slurm_send_rc_msg(msg, rc); |
| |
|     return rc; |
| } |
| |
| /* |
|  * job_requeue2 - Requeue a running or pending batch job |
|  * |
|  * req_ptr->job_id_str may name a single job ("123"), a whole job array |
|  * ("123" or "123_*"), or selected array tasks ("123_4", "123_[1-3]"). |
|  * |
|  * IN uid - user id of user issuing the RPC |
|  * IN req_ptr - request including ID of the job to be requeued and the |
|  *     requeue flags |
|  * IN msg - slurm_msg to send response back on; when NULL no reply is sent |
|  * IN preempt - true if job being preempted |
|  * RET 0 on success, otherwise ESLURM error code. For job arrays the per-task |
|  *     results are only reported through the RESPONSE_JOB_ARRAY_ERRORS |
|  *     reply, not through the return value. |
|  */ |
| extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg, |
|                         bool preempt) |
| { |
|     int rc = SLURM_SUCCESS, rc2; |
|     job_record_t *job_ptr = NULL; |
|     long int long_id; |
|     uint32_t job_id = 0; |
|     char *end_ptr = NULL; |
|     bitstr_t *array_bitmap = NULL; |
|     uint32_t flags = req_ptr->flags; |
|     char *job_id_str = req_ptr->job_id_str; |
|     resp_array_struct_t *resp_array = NULL; |
|     job_array_resp_msg_t *resp_array_msg = NULL; |
| |
|     if (max_array_size == NO_VAL) { |
|         max_array_size = slurm_conf.max_array_sz; |
|     } |
| |
|     /* |
|      * IDs at or above NO_VAL are never job IDs. Without that check a value |
|      * wider than 32 bits would be silently truncated by the cast below and |
|      * could select an unrelated job. |
|      */ |
|     long_id = strtol(job_id_str, &end_ptr, 10); |
|     if ((long_id <= 0) || (long_id == LONG_MAX) || |
|         (long_id >= NO_VAL) || |
|         ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) { |
|         info("%s: invalid JobId=%s", __func__, job_id_str); |
|         rc = ESLURM_INVALID_JOB_ID; |
|         goto reply; |
|     } |
|     if ((end_ptr[0] == '_') && (end_ptr[1] == '*')) |
|         end_ptr += 2;    /* Defaults to full job array */ |
| |
|     job_id = (uint32_t) long_id; |
|     if (end_ptr[0] == '\0') {    /* Single job (or full job array) */ |
|         job_record_t *job_ptr_done = NULL; |
|         job_ptr = find_job_record(job_id); |
|         if (job_ptr && |
|             (((job_ptr->array_task_id == NO_VAL) && |
|               (job_ptr->array_recs == NULL)) || |
|              ((job_ptr->array_task_id != NO_VAL) && |
|               (job_ptr->array_job_id != job_id)))) { |
|             /* This is a regular job or single task of job array */ |
|             rc = _job_requeue(uid, job_ptr, preempt, flags); |
|             goto reply; |
|         } |
| |
|         if (job_ptr && job_ptr->array_recs) { |
|             /* This is a job array */ |
|             job_ptr_done = job_ptr; |
|             rc2 = _job_requeue(uid, job_ptr, preempt, flags); |
|             _resp_array_add(&resp_array, job_ptr, rc2, NULL); |
|         } |
| |
|         /* Requeue all tasks of this job array */ |
|         job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; |
|         if (!job_ptr && !job_ptr_done) { |
|             rc = ESLURM_INVALID_JOB_ID; |
|             goto reply; |
|         } |
|         while (job_ptr) { |
|             /* Skip the record already handled above */ |
|             if ((job_ptr->array_job_id == job_id) && |
|                 (job_ptr != job_ptr_done)) { |
|                 rc2 = _job_requeue(uid, job_ptr, preempt, flags); |
|                 _resp_array_add(&resp_array, job_ptr, rc2, |
|                                 NULL); |
|             } |
|             job_ptr = job_ptr->job_array_next_j; |
|         } |
|         goto reply; |
|     } |
| |
|     /* Explicit task list after the '_' separator */ |
|     array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size, |
|                                           NULL); |
|     if (!array_bitmap) { |
|         info("%s: invalid JobId=%s", __func__, job_id_str); |
|         rc = ESLURM_INVALID_JOB_ID; |
|         goto reply; |
|     } |
| |
|     for (int i = 0; (i = bit_ffs_from_bit(array_bitmap, i)) >= 0; i++) { |
|         job_ptr = find_job_array_rec(job_id, i); |
|         if (job_ptr == NULL) { |
|             info("%s: invalid JobId=%u_%d", __func__, job_id, i); |
|             _resp_array_add_id(&resp_array, job_id, i, |
|                                ESLURM_INVALID_JOB_ID); |
|             continue; |
|         } |
| |
|         rc2 = _job_requeue(uid, job_ptr, preempt, flags); |
|         _resp_array_add(&resp_array, job_ptr, rc2, NULL); |
|     } |
| |
| reply: |
|     if (msg) { |
|         if (resp_array) { |
|             resp_array_msg = _resp_array_xlate(resp_array, job_id); |
|             (void) send_msg_response(msg, RESPONSE_JOB_ARRAY_ERRORS, |
|                                      resp_array_msg); |
|             slurm_free_job_array_resp(resp_array_msg); |
|         } else { |
|             slurm_send_rc_msg(msg, rc); |
|         } |
|     } |
|     _resp_array_free(resp_array); |
| |
|     FREE_NULL_BITMAP(array_bitmap); |
| |
|     return rc; |
| } |
| |
| static int _top_job_flag_clear(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *) x; |
| job_ptr->bit_flags &= (~TOP_PRIO_TMP); |
| return 0; |
| } |
| |
/*
 * List sort comparator ordering priorities from highest to lowest.
 * IN x, y - each the address of a pointer to a uint32_t priority
 * RET 1 if x's priority is lower than y's, -1 if it is higher, 0 if equal
 */
static int _top_job_prio_sort(void *x, void *y)
{
	uint32_t left = **(uint32_t **) x;
	uint32_t right = **(uint32_t **) y;

	/* Comparisons (not a subtraction) so large values can not wrap */
	return (left < right) - (left > right);
}
| |
| static int _set_top(list_t *top_job_list, uid_t uid) |
| { |
| list_t *prio_list, *other_job_list; |
| list_itr_t *iter; |
| job_record_t *job_ptr, *first_job_ptr = NULL; |
| int rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS; |
| uint32_t last_prio = NO_VAL, next_prio; |
| int64_t delta_prio, delta_nice, total_delta = 0; |
| int other_job_cnt = 0; |
| uint32_t *prio_elem; |
| |
| xassert(job_list); |
| xassert(top_job_list); |
| prio_list = list_create(xfree_ptr); |
| (void) list_for_each(job_list, _top_job_flag_clear, NULL); |
| |
| /* |
| * Skipping all these list iterators. |
| * We want to rewrite how job_set_top works. |
| */ |
| |
| /* Validate the jobs in our "top" list */ |
| iter = list_iterator_create(top_job_list); |
| while ((job_ptr = list_next(iter))) { |
| if ((job_ptr->user_id != uid) && (uid != 0)) { |
| error("Security violation: REQUEST_TOP_JOB for %pJ from uid=%u", |
| job_ptr, uid); |
| rc = ESLURM_ACCESS_DENIED; |
| break; |
| } |
| if (!IS_JOB_PENDING(job_ptr) || (job_ptr->details == NULL)) { |
| debug("%s: %pJ not pending", __func__, job_ptr); |
| list_remove(iter); |
| rc2 = ESLURM_JOB_NOT_PENDING; |
| continue; |
| } |
| if (job_ptr->part_ptr_list) { |
| debug("%s: %pJ in partition list", __func__, job_ptr); |
| list_remove(iter); |
| rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| break; |
| } |
| if (job_ptr->priority == 0) { |
| debug("%s: %pJ is held", __func__, job_ptr); |
| list_remove(iter); |
| rc2 = ESLURM_JOB_HELD; |
| continue; |
| } |
| if (job_ptr->bit_flags & TOP_PRIO_TMP) { |
| /* Duplicate job ID */ |
| list_remove(iter); |
| continue; |
| } |
| if (!first_job_ptr) |
| first_job_ptr = job_ptr; |
| job_ptr->bit_flags |= TOP_PRIO_TMP; |
| prio_elem = xmalloc(sizeof(uint32_t)); |
| *prio_elem = job_ptr->priority; |
| list_append(prio_list, prio_elem); |
| } |
| list_iterator_destroy(iter); |
| if (rc != SLURM_SUCCESS) { |
| FREE_NULL_LIST(prio_list); |
| return rc; |
| } |
| if (!first_job_ptr) { |
| FREE_NULL_LIST(prio_list); |
| return rc2; |
| } |
| |
| /* Identify other jobs which we can adjust the nice value of */ |
| other_job_list = list_create(NULL); |
| iter = list_iterator_create(job_list); |
| while ((job_ptr = list_next(iter))) { |
| /* |
| * Do not select jobs with priority 0 (held), or |
| * priority 1 (would be held if we lowered the priority). |
| */ |
| if ((job_ptr->bit_flags & TOP_PRIO_TMP) || |
| (job_ptr->details == NULL) || |
| (job_ptr->part_ptr_list) || |
| (job_ptr->priority <= 1) || |
| (job_ptr->assoc_ptr != first_job_ptr->assoc_ptr) || |
| (job_ptr->part_ptr != first_job_ptr->part_ptr) || |
| (job_ptr->qos_ptr != first_job_ptr->qos_ptr) || |
| (job_ptr->user_id != first_job_ptr->user_id) || |
| (!IS_JOB_PENDING(job_ptr))) |
| continue; |
| other_job_cnt++; |
| job_ptr->bit_flags |= TOP_PRIO_TMP; |
| prio_elem = xmalloc(sizeof(uint32_t)); |
| *prio_elem = job_ptr->priority; |
| list_append(prio_list, prio_elem); |
| list_append(other_job_list, job_ptr); |
| } |
| list_iterator_destroy(iter); |
| |
| /* Now adjust nice values and priorities of the listed "top" jobs */ |
| list_sort(prio_list, _top_job_prio_sort); |
| iter = list_iterator_create(top_job_list); |
| while ((job_ptr = list_next(iter))) { |
| prio_elem = list_pop(prio_list); |
| next_prio = *prio_elem; |
| xfree(prio_elem); |
| if ((last_prio != NO_VAL) && (next_prio == last_prio) && |
| (last_prio > 2)) |
| /* |
| * We don't want to set job priority lower than 1, so |
| * last_prio cannot be smaller than 2, since we will |
| * later use last_prio - 1 for the new job priority. |
| */ |
| next_prio = last_prio - 1; |
| last_prio = next_prio; |
| delta_prio = (int64_t) next_prio - job_ptr->priority; |
| delta_nice = MIN(job_ptr->details->nice, delta_prio); |
| total_delta += delta_nice; |
| job_ptr->priority = next_prio; |
| job_ptr->details->nice -= delta_nice; |
| job_ptr->bit_flags &= (~TOP_PRIO_TMP); |
| } |
| list_iterator_destroy(iter); |
| FREE_NULL_LIST(prio_list); |
| |
| /* Now adjust nice values and priorities of remaining effected jobs */ |
| if (other_job_cnt) { |
| iter = list_iterator_create(other_job_list); |
| while ((job_ptr = list_next(iter))) { |
| delta_prio = total_delta / other_job_cnt; |
| next_prio = job_ptr->priority - delta_prio; |
| if (next_prio >= last_prio) { |
| next_prio = last_prio - 1; |
| delta_prio = job_ptr->priority - next_prio; |
| } |
| delta_nice = delta_prio; |
| job_ptr->priority = next_prio; |
| job_ptr->details->nice += delta_nice; |
| job_ptr->bit_flags &= (~TOP_PRIO_TMP); |
| total_delta -= delta_nice; |
| if (--other_job_cnt == 0) |
| break; /* Count will match list size anyway */ |
| } |
| list_iterator_destroy(iter); |
| } |
| FREE_NULL_LIST(other_job_list); |
| |
| last_job_update = time(NULL); |
| |
| return rc; |
| } |
| |
| /* |
| * job_set_top - Move the specified jobs to the top of the queue (at least |
| * for that user ID, partition, account, and QOS). |
| * |
| * IN msg - original request msg |
| * IN top_ptr - user request |
| * IN uid - user id of the user issuing the RPC |
| * IN protocol_version - slurm protocol version of client |
| * RET 0 on success, otherwise ESLURM error code |
| */ |
| extern int job_set_top(slurm_msg_t *msg, top_job_msg_t *top_ptr, uid_t uid, |
| uint16_t protocol_version) |
| { |
| int rc = SLURM_SUCCESS; |
| list_t *top_job_list = NULL; |
| char *job_str_tmp = NULL, *tok, *save_ptr = NULL, *end_ptr = NULL; |
| job_record_t *job_ptr = NULL; |
| long int long_id; |
| uint32_t job_id = 0, task_id = 0; |
| uid_t job_uid = uid; |
| |
| if (validate_operator(uid)) { |
| job_uid = 0; |
| } else { |
| bool disable_user_top = true; |
| if (xstrcasestr(slurm_conf.sched_params, "enable_user_top")) |
| disable_user_top = false; |
| if (disable_user_top) { |
| rc = ESLURM_ACCESS_DENIED; |
| goto reply; |
| } |
| } |
| |
| top_job_list = list_create(NULL); |
| job_str_tmp = xstrdup(top_ptr->job_id_str); |
| tok = strtok_r(job_str_tmp, ",", &save_ptr); |
| while (tok) { |
| long_id = strtol(tok, &end_ptr, 10); |
| if ((long_id <= 0) || (long_id == LONG_MAX) || |
| ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) { |
| info("%s: invalid job id %s", __func__, tok); |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| job_id = (uint32_t) long_id; |
| if ((end_ptr[0] == '\0') || /* Single job (or full job array) */ |
| ((end_ptr[0] == '_') && (end_ptr[1] == '*') && |
| (end_ptr[2] == '\0'))) { |
| job_ptr = find_job_record(job_id); |
| if (!job_ptr) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| list_append(top_job_list, job_ptr); |
| } else if (end_ptr[0] != '_') { /* Invalid job ID spec */ |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } else { /* Single task of a job array */ |
| task_id = strtol(end_ptr + 1, &end_ptr, 10); |
| if (end_ptr[0] != '\0') { /* Invalid job ID spec */ |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| job_ptr = find_job_array_rec(job_id, task_id); |
| if (!job_ptr) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| list_append(top_job_list, job_ptr); |
| } |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| |
| if (list_count(top_job_list) == 0) { |
| rc = ESLURM_INVALID_JOB_ID; |
| goto reply; |
| } |
| rc = _set_top(top_job_list, job_uid); |
| |
| reply: FREE_NULL_LIST(top_job_list); |
| xfree(job_str_tmp); |
| slurm_send_rc_msg(msg, rc); |
| return rc; |
| } |
| |
| /* |
| * job_end_time - Process JOB_END_TIME |
| * IN time_req_msg - job end time request |
| * OUT timeout_msg - job timeout response to be sent |
| * RET SLURM_SUCCESS or an error code |
| */ |
| extern int job_end_time(job_alloc_info_msg_t *time_req_msg, |
| srun_timeout_msg_t *timeout_msg) |
| { |
| job_record_t *job_ptr; |
| xassert(timeout_msg); |
| |
| job_ptr = find_job_record(time_req_msg->job_id); |
| if (!job_ptr) |
| return ESLURM_INVALID_JOB_ID; |
| |
| memset(timeout_msg, 0, sizeof(srun_timeout_msg_t)); |
| timeout_msg->step_id.job_id = time_req_msg->job_id; |
| timeout_msg->step_id.step_id = NO_VAL; |
| timeout_msg->step_id.step_het_comp = NO_VAL; |
| timeout_msg->timeout = job_ptr->end_time; |
| return SLURM_SUCCESS; |
| } |
| |
| static int _update_job_nodes_str(job_record_t *job_ptr) |
| { |
| xfree(job_ptr->nodes_completing); |
| xfree(job_ptr->nodes_pr); |
| |
| if (!job_ptr->node_bitmap) |
| return 0; |
| |
| if (IS_JOB_COMPLETING(job_ptr)) { |
| if (job_ptr->node_bitmap_cg) { |
| job_ptr->nodes_completing = |
| bitmap2node_name(job_ptr->node_bitmap_cg); |
| } else { |
| job_ptr->nodes_completing = |
| bitmap2node_name(job_ptr->node_bitmap); |
| } |
| } |
| if (job_ptr->state_reason == WAIT_PROLOG) { |
| if (job_ptr->node_bitmap_pr) { |
| job_ptr->nodes_pr = |
| bitmap2node_name(job_ptr->node_bitmap_pr); |
| } else { |
| job_ptr->nodes_pr = |
| bitmap2node_name(job_ptr->node_bitmap); |
| } |
| } |
| |
| return 0; |
| } |
| |
| static int _foreach_hold_by_assoc(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| foreach_hold_by_id_t *hold_by_id = arg; |
| |
| if (job_ptr->assoc_id == hold_by_id->id) |
| hold_by_id->cnt += _job_fail_account(job_ptr, __func__, false); |
| |
| return 0; |
| } |
| |
| /* |
| * job_hold_by_assoc_id - Hold all pending jobs with a given |
| * association ID. This happens when an association is deleted (e.g. when |
| * a user is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_assoc_id(uint32_t assoc_id) |
| { |
| /* Write lock on jobs */ |
| slurmctld_lock_t job_write_lock = |
| { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; |
| foreach_hold_by_id_t hold_by_id = { |
| .id = assoc_id, |
| .cnt = 0, |
| }; |
| |
| if (!job_list) |
| return 0; |
| |
| lock_slurmctld(job_write_lock); |
| (void) list_for_each(job_list, _foreach_hold_by_assoc, &hold_by_id); |
| unlock_slurmctld(job_write_lock); |
| |
| return hold_by_id.cnt; |
| } |
| |
| static int _foreach_hold_by_qos(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| foreach_hold_by_id_t *hold_by_id = arg; |
| |
| if (job_ptr->qos_blocking_ptr && |
| (job_ptr->qos_blocking_ptr->id == hold_by_id->id)) |
| job_ptr->qos_blocking_ptr = NULL; |
| if (job_ptr->qos_list) { |
| if (!list_find_first(job_ptr->qos_list, |
| slurmdb_find_qos_in_list, |
| &hold_by_id->id)) |
| return 0; |
| } else if (job_ptr->qos_id != hold_by_id->id) |
| return 0; |
| |
| hold_by_id->cnt += job_fail_qos(job_ptr, __func__, false); |
| |
| return 0; |
| } |
| |
| /* |
| * job_hold_by_qos_id - Hold all pending jobs with a given |
| * QOS ID. This happens when a QOS is deleted (e.g. when |
| * a QOS is removed from the association database). |
| * RET count of held jobs |
| */ |
| extern int job_hold_by_qos_id(uint32_t qos_id) |
| { |
| /* Write lock on jobs */ |
| slurmctld_lock_t job_write_lock = |
| { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; |
| foreach_hold_by_id_t hold_by_id = { |
| .id = qos_id, |
| .cnt = 0, |
| }; |
| |
| if (!job_list) |
| return 0; |
| |
| lock_slurmctld(job_write_lock); |
| (void) list_for_each(job_list, _foreach_hold_by_qos, &hold_by_id); |
| unlock_slurmctld(job_write_lock); |
| return hold_by_id.cnt; |
| } |
| |
| /* |
| * Modify the account associated with a pending job |
| * IN module - where this is called from |
| * IN job_ptr - pointer to job which should be modified |
| * IN new_wckey - desired wckey name |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int update_job_wckey(char *module, job_record_t *job_ptr, |
| char *new_wckey) |
| { |
| slurmdb_wckey_rec_t wckey_rec, *wckey_ptr; |
| |
| if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) { |
| info("%s: attempt to modify account for non-pending %pJ", |
| module, job_ptr); |
| return ESLURM_JOB_NOT_PENDING; |
| } |
| |
| memset(&wckey_rec, 0, sizeof(wckey_rec)); |
| wckey_rec.uid = job_ptr->user_id; |
| wckey_rec.name = new_wckey; |
| if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, |
| accounting_enforce, &wckey_ptr, false)) { |
| info("%s: invalid wckey %s for %pJ", |
| module, new_wckey, job_ptr); |
| return ESLURM_INVALID_WCKEY; |
| } else if (slurm_with_slurmdbd() && |
| !wckey_ptr && |
| !(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) { |
| /* if not enforcing associations we want to look for |
| the default account and use it to avoid getting |
| trash in the accounting records. |
| */ |
| wckey_rec.name = NULL; |
| assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, |
| accounting_enforce, &wckey_ptr, false); |
| if (!wckey_ptr) { |
| debug("%s: we didn't have a wckey record for wckey " |
| "'%s' and user '%u', and we can't seem to find " |
| "a default one either. Setting it anyway. " |
| "This will produce trash in accounting. " |
| "If this is not what you desire please put " |
| "AccountStorageEnforce=wckeys in your slurm.conf " |
| "file.", module, new_wckey, |
| job_ptr->user_id); |
| wckey_rec.name = new_wckey; |
| } |
| } |
| |
| xfree(job_ptr->wckey); |
| if (wckey_rec.name && wckey_rec.name[0] != '\0') { |
| job_ptr->wckey = xstrdup(wckey_rec.name); |
| info("%s: setting wckey to %s for %pJ", |
| module, wckey_rec.name, job_ptr); |
| } else { |
| info("%s: cleared wckey for %pJ", module, job_ptr); |
| } |
| |
| last_job_update = time(NULL); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_send_jobs_to_accounting(void *x, void *arg) |
| { |
| job_record_t *job_ptr = x; |
| |
| if (!job_ptr->assoc_id) { |
| slurmdb_assoc_rec_t assoc_rec = { |
| .acct = job_ptr->account, |
| .partition = job_ptr->part_ptr ? |
| job_ptr->part_ptr->name : NULL, |
| .uid = job_ptr->user_id, |
| }; |
| |
| if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, |
| accounting_enforce, |
| &job_ptr->assoc_ptr, false) && |
| (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) { |
| _job_fail_account(job_ptr, __func__, false); |
| return 0; |
| } else |
| job_ptr->assoc_id = assoc_rec.id; |
| } |
| |
| /* we only want active, un accounted for jobs */ |
| if (IS_JOB_IN_DB(job_ptr) || IS_JOB_FINISHED(job_ptr)) |
| return 0; |
| |
| debug("first reg: starting %pJ in accounting", job_ptr); |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| if (IS_JOB_SUSPENDED(job_ptr)) |
| jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); |
| return 0; |
| } |
| |
| /* |
| * Currently only sends active and suspsended jobs not already in the database. |
| * |
| * On node changes, we opt not to send updated node_inx's due to the heavy cost |
| * of doing so. If we were to update the job's node_inx's, this could be done by |
| * resizing the job which will create a new db record for the job with the |
| * changed node_inx's -- like how reservations are done. |
| * e.g. |
| * job_pre_resize_acctg(job_ptr); |
| * job_post_resize_acctg(job_ptr); |
| */ |
| extern int send_jobs_to_accounting(void) |
| { |
| slurmctld_lock_t job_write_lock = { |
| NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; |
| |
| /* send jobs in pending or running state */ |
| lock_slurmctld(job_write_lock); |
| (void) list_for_each(job_list, _foreach_send_jobs_to_accounting, NULL); |
| unlock_slurmctld(job_write_lock); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * copy_job_record_to_job_desc - construct a job_desc_msg_t for a job. |
| * IN job_ptr - the job record |
| * RET the job_desc_msg_t, NULL on error |
| */ |
| extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr) |
| { |
| job_desc_msg_t *job_desc; |
| job_details_t *details = job_ptr->details; |
| multi_core_data_t *mc_ptr = details->mc_ptr; |
| int i; |
| |
| /* construct a job_desc_msg_t from job */ |
| job_desc = xmalloc(sizeof(job_desc_msg_t)); |
| |
| job_desc->account = xstrdup(job_ptr->account); |
| job_desc->acctg_freq = xstrdup(details->acctg_freq); |
| job_desc->alloc_node = xstrdup(job_ptr->alloc_node); |
| /* Since the allocating salloc or srun is not expected to exist |
| * when this checkpointed job is restarted, do not save these: |
| * |
| * job_desc->alloc_resp_port = job_ptr->alloc_resp_port; |
| * job_desc->alloc_sid = job_ptr->alloc_sid; |
| */ |
| job_desc->argc = details->argc; |
| job_desc->argv = xcalloc(job_desc->argc, sizeof(char *)); |
| for (i = 0; i < job_desc->argc; i ++) |
| job_desc->argv[i] = xstrdup(details->argv[i]); |
| job_desc->begin_time = details->begin_time; |
| job_desc->bitflags = job_ptr->bit_flags; |
| job_desc->clusters = xstrdup(job_ptr->clusters); |
| job_desc->comment = xstrdup(job_ptr->comment); |
| job_desc->container = xstrdup(job_ptr->container); |
| job_desc->container_id = xstrdup(job_ptr->container_id); |
| job_desc->contiguous = details->contiguous; |
| job_desc->core_spec = details->core_spec; |
| job_desc->cpu_bind = xstrdup(details->cpu_bind); |
| job_desc->cpu_bind_type = details->cpu_bind_type; |
| job_desc->cpu_freq_min = details->cpu_freq_min; |
| job_desc->cpu_freq_max = details->cpu_freq_max; |
| job_desc->cpu_freq_gov = details->cpu_freq_gov; |
| job_desc->deadline = job_ptr->deadline; |
| job_desc->dependency = xstrdup(details->dependency); |
| job_desc->end_time = 0; /* Unused today */ |
| job_desc->environment = get_job_env(job_ptr, |
| &job_desc->env_size); |
| job_desc->exc_nodes = xstrdup(details->exc_nodes); |
| job_desc->extra = xstrdup(job_ptr->extra); |
| job_desc->features = xstrdup(details->features); |
| job_desc->cluster_features = xstrdup(details->cluster_features); |
| job_desc->group_id = job_ptr->group_id; |
| job_desc->immediate = 0; /* nowhere to get this value */ |
| job_desc->job_id = job_ptr->job_id; |
| job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail; |
| job_desc->licenses = xstrdup(job_ptr->lic_req); |
| job_desc->mail_type = job_ptr->mail_type; |
| job_desc->mail_user = xstrdup(job_ptr->mail_user); |
| job_desc->mcs_label = xstrdup(job_ptr->mcs_label); |
| job_desc->mem_bind = xstrdup(details->mem_bind); |
| job_desc->mem_bind_type = details->mem_bind_type; |
| job_desc->name = xstrdup(job_ptr->name); |
| job_desc->network = xstrdup(job_ptr->network); |
| job_desc->nice = details->nice; |
| job_desc->num_tasks = details->num_tasks; |
| job_desc->open_mode = details->open_mode; |
| job_desc->origin_cluster = xstrdup(job_ptr->origin_cluster); |
| job_desc->other_port = job_ptr->other_port; |
| job_desc->overcommit = details->overcommit; |
| job_desc->partition = xstrdup(job_ptr->partition); |
| job_desc->plane_size = mc_ptr->plane_size; |
| job_desc->prefer = xstrdup(details->prefer); |
| job_desc->priority = job_ptr->priority; |
| if (job_ptr->qos_ptr) |
| job_desc->qos = xstrdup(job_ptr->qos_ptr->name); |
| job_desc->resp_host = xstrdup(job_ptr->resp_host); |
| job_desc->req_nodes = xstrdup(details->req_nodes); |
| job_desc->requeue = details->requeue; |
| job_desc->reservation = xstrdup(job_ptr->resv_name); |
| job_desc->restart_cnt = job_ptr->restart_cnt; |
| job_desc->segment_size = details->segment_size; |
| job_desc->script_buf = get_job_script(job_ptr); |
| if (details->share_res == 1) |
| job_desc->shared = JOB_SHARED_OK; |
| else if (details->whole_node & WHOLE_NODE_REQUIRED) |
| job_desc->shared = JOB_SHARED_NONE; |
| else if (details->whole_node & WHOLE_NODE_USER) |
| job_desc->shared = JOB_SHARED_USER; |
| else if (details->whole_node & WHOLE_NODE_MCS) |
| job_desc->shared = JOB_SHARED_MCS; |
| else |
| job_desc->shared = NO_VAL16; |
| job_desc->spank_job_env_size = job_ptr->spank_job_env_size; |
| job_desc->spank_job_env = xcalloc(job_desc->spank_job_env_size, |
| sizeof(char *)); |
| for (i = 0; i < job_desc->spank_job_env_size; i ++) |
| job_desc->spank_job_env[i]= xstrdup(job_ptr->spank_job_env[i]); |
| job_desc->std_err = xstrdup(details->std_err); |
| job_desc->std_in = xstrdup(details->std_in); |
| job_desc->std_out = xstrdup(details->std_out); |
| job_desc->submit_line = xstrdup(details->submit_line); |
| job_desc->task_dist = details->task_dist; |
| job_desc->time_limit = job_ptr->time_limit; |
| job_desc->time_min = job_ptr->time_min; |
| job_desc->user_id = job_ptr->user_id; |
| job_desc->wait_all_nodes = job_ptr->wait_all_nodes; |
| job_desc->warn_flags = job_ptr->warn_flags; |
| job_desc->warn_signal = job_ptr->warn_signal; |
| job_desc->warn_time = job_ptr->warn_time; |
| job_desc->wckey = xstrdup(job_ptr->wckey); |
| job_desc->work_dir = xstrdup(details->work_dir); |
| job_desc->pn_min_cpus = details->pn_min_cpus; |
| job_desc->pn_min_memory = details->pn_min_memory; |
| job_desc->oom_kill_step = details->oom_kill_step; |
| job_desc->pn_min_tmp_disk = details->pn_min_tmp_disk; |
| job_desc->min_cpus = details->min_cpus; |
| job_desc->max_cpus = details->max_cpus; |
| job_desc->min_nodes = details->min_nodes; |
| job_desc->max_nodes = details->max_nodes; |
| if (job_desc->max_nodes == 0) /* set 0 in _job_create() */ |
| job_desc->max_nodes = NO_VAL; |
| job_desc->sockets_per_node = mc_ptr->sockets_per_node; |
| job_desc->cores_per_socket = mc_ptr->cores_per_socket; |
| job_desc->threads_per_core = mc_ptr->threads_per_core; |
| job_desc->cpus_per_task = details->cpus_per_task; |
| job_desc->ntasks_per_node = details->ntasks_per_node; |
| job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket; |
| job_desc->ntasks_per_core = mc_ptr->ntasks_per_core; |
| |
| job_desc->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres); |
| job_desc->mem_per_tres = xstrdup(job_ptr->mem_per_tres); |
| job_desc->tres_bind = xstrdup(job_ptr->tres_bind); |
| job_desc->tres_freq = xstrdup(job_ptr->tres_freq); |
| job_desc->tres_per_job = xstrdup(job_ptr->tres_per_job); |
| job_desc->tres_per_node = xstrdup(job_ptr->tres_per_node); |
| job_desc->tres_per_socket = xstrdup(job_ptr->tres_per_socket); |
| job_desc->tres_per_task = xstrdup(job_ptr->tres_per_task); |
| |
| if (job_ptr->fed_details) { |
| job_desc->fed_siblings_active = |
| job_ptr->fed_details->siblings_active; |
| job_desc->fed_siblings_viable = |
| job_ptr->fed_details->siblings_viable; |
| } |
| |
| return job_desc; |
| } |
| |
| /* Build a bitmap of nodes completing this job */ |
| extern void build_cg_bitmap(job_record_t *job_ptr) |
| { |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); |
| if (job_ptr->node_bitmap) { |
| job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap); |
| if (bit_ffs(job_ptr->node_bitmap_cg) == -1) |
| job_state_unset_flag(job_ptr, JOB_COMPLETING); |
| } else { |
| error("build_cg_bitmap: node_bitmap is NULL"); |
| job_ptr->node_bitmap_cg = bit_alloc(node_record_count); |
| job_state_unset_flag(job_ptr, JOB_COMPLETING); |
| } |
| } |
| |
| /* job_hold_requeue() |
| * |
| * Requeue the job based upon its current state. |
| * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state. |
| * If JOB_REQUEUE_HOLD then requeue and hold. |
| * If JOB_REQUEUE then requeue and let it run again. |
| * The requeue can happen directly from job_requeue() or from |
| * job_epilog_complete() after the last component has finished. |
| * |
| * RET returns true if the job was requeued |
| */ |
| extern bool job_hold_requeue(job_record_t *job_ptr) |
| { |
| uint32_t state; |
| uint32_t flags; |
| job_record_t *base_job_ptr = NULL; |
| |
| xassert(job_ptr); |
| |
| /* If the job is already pending it was |
| * eventually requeued somewhere else. |
| */ |
| if (IS_JOB_PENDING(job_ptr) && !IS_JOB_REVOKED(job_ptr)) |
| return false; |
| |
| /* If the job is not on the origin cluster, then don't worry about |
| * requeuing the job here. The exit code will be sent the origin |
| * cluster and the origin cluster will decide if the job should be |
| * requeued or not. */ |
| if (!fed_mgr_is_origin_job(job_ptr)) |
| return false; |
| |
| /* |
| * A job may be canceled during its epilog in which case we need to |
| * check that the job (or base job in the case of an array) was not |
| * canceled before attempting to requeue. |
| */ |
| if (IS_JOB_CANCELLED(job_ptr) || |
| (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) && |
| (base_job_ptr = find_job_record(job_ptr->array_job_id)) && |
| base_job_ptr->array_recs && IS_JOB_CANCELLED(base_job_ptr))) |
| return false; |
| |
| /* Check if the job exit with one of the |
| * configured requeue values. */ |
| _set_job_requeue_exit_value(job_ptr); |
| |
| /* handle crontab jobs */ |
| if ((job_ptr->bit_flags & CRON_JOB) && |
| job_ptr->details->crontab_entry) { |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| job_ptr->details->begin_time = |
| calc_next_cron_start(job_ptr->details->crontab_entry, |
| 0); |
| } else if (job_ptr->bit_flags & CRON_JOB) { |
| /* |
| * Skip requeuing this instead of crashing. |
| */ |
| error("Missing cron details for %pJ. This should never happen. Clearing CRON_JOB flag and skipping requeue.", |
| job_ptr); |
| job_ptr->bit_flags &= ~CRON_JOB; |
| } |
| |
| state = job_ptr->job_state; |
| |
| if (! (state & JOB_REQUEUE)) |
| return false; |
| |
| /* Sent event requeue to the database. */ |
| if (!(job_ptr->bit_flags & TRES_STR_CALC) && |
| job_ptr->tres_alloc_cnt && |
| (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64)) |
| assoc_mgr_set_job_tres_alloc_str(job_ptr, false); |
| jobacct_storage_g_job_complete(acct_db_conn, job_ptr); |
| |
| debug("%s: %pJ state 0x%x", __func__, job_ptr, state); |
| |
| /* Set the job pending */ |
| flags = job_ptr->job_state & JOB_STATE_FLAGS; |
| job_state_set(job_ptr, (JOB_PENDING | flags)); |
| |
| job_ptr->restart_cnt++; |
| |
| /* clear signal sent flag on requeue */ |
| job_ptr->warn_flags &= ~WARN_SENT; |
| |
| /* |
| * Test if user wants to requeue the job |
| * in hold or with a special exit value. |
| */ |
| if (state & JOB_SPECIAL_EXIT) { |
| /* |
| * JOB_SPECIAL_EXIT means requeue the job, |
| * put it on hold and display state as JOB_SPECIAL_EXIT. |
| */ |
| job_state_set_flag(job_ptr, JOB_SPECIAL_EXIT); |
| job_ptr->state_reason = WAIT_HELD_USER; |
| debug("%s: Holding %pJ, special exit", __func__, job_ptr); |
| job_ptr->priority = 0; |
| } |
| |
| job_state_unset_flag(job_ptr, JOB_REQUEUE); |
| |
| /* |
| * Mark array as requeued. Exit codes have already been handled in |
| * _job_array_comp() |
| */ |
| if (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) && |
| (base_job_ptr = find_job_record(job_ptr->array_job_id)) && |
| base_job_ptr->array_recs) { |
| base_job_ptr->array_recs->array_flags |= ARRAY_TASK_REQUEUED; |
| } |
| |
| debug("%s: %pJ state 0x%x reason %u priority %d", |
| __func__, job_ptr, job_ptr->job_state, |
| job_ptr->state_reason, job_ptr->priority); |
| |
| return true; |
| } |
| |
| static void _parse_max_depend_depth(char *str) |
| { |
| int i = atoi(str); |
| if (i < 0) |
| error("ignoring max_depend_depth value of %d", i); |
| else |
| max_depend_depth = i; |
| } |
| |
| extern void init_depend_policy(void) |
| { |
| char *tmp_ptr; |
| |
| disable_remote_singleton = |
| (xstrcasestr(slurm_conf.dependency_params, |
| "disable_remote_singleton")) ? |
| true : false; |
| |
| kill_invalid_dep = |
| (xstrcasestr(slurm_conf.dependency_params, |
| "kill_invalid_depend")) ? |
| true : false; |
| |
| /* 01234567890123456 */ |
| if ((tmp_ptr = xstrcasestr(slurm_conf.dependency_params, |
| "max_depend_depth="))) |
| _parse_max_depend_depth(tmp_ptr + 17); |
| else |
| max_depend_depth = 10; |
| |
| log_flag(DEPENDENCY, "%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d", |
| __func__, kill_invalid_dep, disable_remote_singleton, |
| max_depend_depth); |
| } |
| |
| /* init_requeue_policy() |
| * Initialize the requeue exit/hold bitmaps. |
| */ |
| extern void init_requeue_policy(void) |
| { |
| /* clean first as we can be reconfiguring */ |
| FREE_NULL_BITMAP(requeue_exit); |
| FREE_NULL_BITMAP(requeue_exit_hold); |
| |
| requeue_exit = _make_requeue_array(slurm_conf.requeue_exit); |
| requeue_exit_hold = _make_requeue_array(slurm_conf.requeue_exit_hold); |
| } |
| |
| /* _make_requeue_array() |
| * |
| * Process the RequeueExit|RequeueExitHold configuration |
| * parameters creating two bitmaps holding the exit values |
| * of jobs for which they have to be requeued. |
| */ |
| static bitstr_t *_make_requeue_array(char *conf_buf) |
| { |
| hostset_t *hs; |
| bitstr_t *bs = NULL; |
| char *tok = NULL, *end_ptr = NULL; |
| long val; |
| |
| if (conf_buf == NULL) |
| return bs; |
| |
| xstrfmtcat(tok, "[%s]", conf_buf); |
| hs = hostset_create(tok); |
| xfree(tok); |
| if (!hs) { |
| error("%s: exit values: %s", __func__, conf_buf); |
| return bs; |
| } |
| |
| debug("%s: exit values: %s", __func__, conf_buf); |
| |
| bs = bit_alloc(MAX_EXIT_VAL + 1); |
| while ((tok = hostset_shift(hs))) { |
| val = strtol(tok, &end_ptr, 10); |
| if ((end_ptr[0] == '\0') && |
| (val >= 0) && (val <= MAX_EXIT_VAL)) { |
| bit_set(bs, val); |
| } else { |
| error("%s: exit values: %s (%s)", |
| __func__, conf_buf, tok); |
| } |
| free(tok); |
| } |
| hostset_destroy(hs); |
| |
| return bs; |
| } |
| |
| /* _set_job_requeue_exit_value() |
| * |
| * Compared the job exit values with the configured |
| * RequeueExit and RequeueHoldExit and a match is |
| * found, set the appropriate state for job_hold_requeue() |
| */ |
| static void _set_job_requeue_exit_value(job_record_t *job_ptr) |
| { |
| int exit_code; |
| |
| /* --no-requeue option supersedes config for RequeueExit & |
| * RequeueExitHold |
| */ |
| if (job_ptr->details && !job_ptr->details->requeue) |
| return; |
| |
| exit_code = WEXITSTATUS(job_ptr->exit_code); |
| |
| if (requeue_exit && bit_test(requeue_exit, exit_code)) { |
| debug2("%s: %pJ exit code %d state JOB_REQUEUE", |
| __func__, job_ptr, exit_code); |
| job_state_set_flag(job_ptr, JOB_REQUEUE); |
| return; |
| } |
| |
| if (requeue_exit_hold && bit_test(requeue_exit_hold, exit_code)) { |
| /* Not sure if want to set special exit state in this case */ |
| debug2("%s: %pJ exit code %d state JOB_SPECIAL_EXIT", |
| __func__, job_ptr, exit_code); |
| job_state_set_flag(job_ptr, (JOB_REQUEUE | JOB_SPECIAL_EXIT)); |
| return; |
| } |
| } |
| |
| /* |
| * Reset a job's end_time based upon it's start_time and time_limit. |
| * NOTE: Do not reset the end_time if already being preempted |
| */ |
| extern void job_end_time_reset(job_record_t *job_ptr) |
| { |
| if (job_ptr->preempt_time) |
| return; /* Preemption in progress */ |
| if (job_ptr->time_limit == INFINITE) { |
| job_ptr->end_time = job_ptr->start_time + |
| (365 * 24 * 60 * 60); /* secs in year */ |
| } else { |
| job_ptr->end_time = job_ptr->start_time + |
| (job_ptr->time_limit * 60); /* secs */ |
| } |
| job_ptr->end_time_exp = job_ptr->end_time; |
| } |
| |
| /* If this is a job array meta-job, prepare it for being scheduled */ |
| extern void job_array_pre_sched(job_record_t *job_ptr) |
| { |
| int32_t i; |
| |
| if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap) |
| return; |
| |
| i = bit_ffs(job_ptr->array_recs->task_id_bitmap); |
| if (i < 0) { |
| /* This happens if the final task in a meta-job is requeued */ |
| if (job_ptr->restart_cnt == 0) { |
| error("%pJ has empty task_id_bitmap", job_ptr); |
| } |
| FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); |
| return; |
| } |
| |
| job_ptr->array_job_id = job_ptr->job_id; |
| job_ptr->array_task_id = i; |
| } |
| |
| /* If this is a job array meta-job, clean up after scheduling attempt */ |
| extern job_record_t *job_array_post_sched(job_record_t *job_ptr, bool list_add) |
| { |
| job_record_t *new_job_ptr = NULL; |
| |
| if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap) |
| return job_ptr; |
| |
| if (job_ptr->array_recs->task_cnt <= 1) { |
| /* Preserve array_recs for min/max exit codes for job array */ |
| if (job_ptr->array_recs->task_cnt) { |
| job_ptr->array_recs->task_cnt--; |
| } else if (job_ptr->restart_cnt) { |
| /* Last task of a job array has been requeued */ |
| } else { |
| error("job %pJ array_recs task count underflow", |
| job_ptr); |
| } |
| xfree(job_ptr->array_recs->task_id_str); |
| if (job_ptr->array_recs->task_cnt == 0) |
| FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); |
| |
| |
| /* Update the job in the database. */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| /* If job is requeued, it will already be in the hash table */ |
| if (!find_job_array_rec(job_ptr->array_job_id, |
| job_ptr->array_task_id)) { |
| _add_job_array_hash(job_ptr); |
| } |
| new_job_ptr = job_ptr; |
| } else { |
| new_job_ptr = job_array_split(job_ptr, list_add); |
| job_state_set(new_job_ptr, JOB_PENDING); |
| new_job_ptr->start_time = (time_t) 0; |
| } |
| |
| return new_job_ptr; |
| } |
| |
| /* _kill_dependent() |
| * |
| * Exterminate the job that has invalid dependency |
| * condition. |
| */ |
| static void _kill_dependent(job_record_t *job_ptr) |
| { |
| time_t now = time(NULL); |
| |
| info("%s: Job dependency can't be satisfied, cancelling %pJ", |
| __func__, job_ptr); |
| job_state_set(job_ptr, JOB_CANCELLED); |
| job_ptr->start_time = now; |
| job_ptr->end_time = now; |
| job_completion_logger(job_ptr, false); |
| last_job_update = now; |
| srun_allocate_abort(job_ptr); |
| } |
| |
| static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src) |
| { |
| job_fed_details_t *dst = NULL; |
| |
| if (!src) |
| return NULL; |
| |
| dst = xmalloc(sizeof(job_fed_details_t)); |
| memcpy(dst, src, sizeof(job_fed_details_t)); |
| dst->origin_str = xstrdup(src->origin_str); |
| dst->siblings_active_str = xstrdup(src->siblings_active_str); |
| dst->siblings_viable_str = xstrdup(src->siblings_viable_str); |
| |
| return dst; |
| } |
| |
| /* Set federated job's sibling strings. */ |
| extern void update_job_fed_details(job_record_t *job_ptr) |
| { |
| xassert(job_ptr); |
| xassert(job_ptr->fed_details); |
| |
| xfree(job_ptr->fed_details->siblings_active_str); |
| xfree(job_ptr->fed_details->siblings_viable_str); |
| |
| job_ptr->fed_details->siblings_active_str = |
| fed_mgr_cluster_ids_to_names( |
| job_ptr->fed_details->siblings_active); |
| job_ptr->fed_details->siblings_viable_str = |
| fed_mgr_cluster_ids_to_names( |
| job_ptr->fed_details->siblings_viable); |
| |
| /* only set once */ |
| if (!job_ptr->fed_details->origin_str) |
| job_ptr->fed_details->origin_str = |
| fed_mgr_get_cluster_name( |
| fed_mgr_get_cluster_id(job_ptr->job_id)); |
| } |
| |
| /* |
| * Set the allocation response with the current cluster's information and the |
| * job's allocated node's addr's if the allocation is being filled by a cluster |
| * other than the cluster that submitted the job |
| * |
| * Note: make sure that the resp's working_cluster_rec is NULL'ed out before the |
| * resp is free'd since it points to global memory. |
| * |
| * IN resp - allocation response being sent back to client. |
| * IN job_ptr - allocated job |
| * IN req_cluster - the cluster requesting the allocation info. |
| */ |
| extern void set_remote_working_response( |
| resource_allocation_response_msg_t *resp, |
| job_record_t *job_ptr, const char *req_cluster) |
| { |
| xassert(resp); |
| xassert(job_ptr); |
| |
| if (job_ptr->node_cnt && req_cluster && |
| xstrcmp(slurm_conf.cluster_name, req_cluster)) { |
| if (job_ptr->fed_details && |
| fed_mgr_cluster_rec) { |
| resp->working_cluster_rec = fed_mgr_cluster_rec; |
| } else { |
| resp->working_cluster_rec = response_cluster_rec; |
| } |
| |
| if (!job_ptr->node_addrs) { |
| /* |
| * The job may be owned by the local cluster but a |
| * remote srun might be trying to launch a job in the |
| * allocation. |
| */ |
| set_job_node_addrs(job_ptr, req_cluster); |
| } |
| } |
| } |
| |
| /* |
| * Calculate billable TRES based on partition's defined BillingWeights. If none |
| * is defined, return total_cpus. This is cached on job_ptr->billable_tres and |
| * is updated if the job was resized since the last iteration. |
| * |
| * IN job_ptr - job to calc billable tres on |
| * IN start_time - time the has started or been resized |
| * IN assoc_mgr_locked - whether the tres assoc lock is set or not |
| */ |
| extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time, |
| bool assoc_mgr_locked) |
| { |
| xassert(job_ptr); |
| |
| part_record_t *part_ptr = job_ptr->part_ptr; |
| |
| /* We don't have any resources allocated, just return 0. */ |
| if (!job_ptr->tres_alloc_cnt) |
| return 0; |
| |
| /* Don't recalculate unless the job is new or resized */ |
| if ((!fuzzy_equal(job_ptr->billable_tres, NO_VAL)) && |
| difftime(job_ptr->resize_time, start_time) < 0.0) |
| return job_ptr->billable_tres; |
| |
| log_flag(PRIO, "BillingWeight: %pJ is either new or it was resized", |
| job_ptr); |
| |
| /* No billing weights defined. Return CPU count */ |
| if (!part_ptr || !part_ptr->billing_weights) { |
| job_ptr->billable_tres = job_ptr->total_cpus; |
| return job_ptr->billable_tres; |
| } |
| |
| log_flag(PRIO, "BillingWeight: %pJ using \"%s\" from partition %s", |
| job_ptr, part_ptr->billing_weights_str, |
| job_ptr->part_ptr->name); |
| |
| job_ptr->billable_tres = |
| assoc_mgr_tres_weighted(job_ptr->tres_alloc_cnt, |
| part_ptr->billing_weights, |
| slurm_conf.priority_flags, |
| assoc_mgr_locked); |
| |
| log_flag(PRIO, "BillingWeight: %pJ %s = %f", |
| job_ptr, |
| (slurm_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES) ? |
| "MAX(node TRES) + SUM(Global TRES)" : |
| (slurm_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES_GRES) ? |
| "MAX(node TRES) + node GRES + SUM(Global TRES)" : "SUM(TRES)", |
| job_ptr->billable_tres); |
| |
| return job_ptr->billable_tres; |
| } |
| |
| /* |
| * Send warning signal to job before end time. |
| * |
| * IN job_ptr - job to send warn signal to. |
| * IN ignore_time - If set, ignore the warn time and just send it. |
| */ |
| extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time) |
| { |
| if (job_ptr->warn_signal && |
| !(job_ptr->warn_flags & WARN_SENT) && |
| (ignore_time || |
| (job_ptr->warn_time && |
| ((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >= |
| job_ptr->end_time)))) { |
| /* |
| * If --signal B option was not specified, |
| * signal only the steps but not the batch step. |
| */ |
| if (!(job_ptr->warn_flags & KILL_JOB_BATCH)) |
| job_ptr->warn_flags |= KILL_STEPS_ONLY; |
| |
| /* send SIGCONT first */ |
| job_signal(job_ptr, SIGCONT, job_ptr->warn_flags, 0, false); |
| |
| debug("%s: warning signal %u to %pJ", |
| __func__, job_ptr->warn_signal, job_ptr); |
| |
| job_signal(job_ptr, job_ptr->warn_signal, |
| job_ptr->warn_flags, 0, false); |
| |
| /* mark job as signaled */ |
| job_ptr->warn_flags |= WARN_SENT; |
| } |
| } |
| |
| static int _overlap_and_running_internal(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *)x; |
| job_overlap_args_t *overlap_args = (job_overlap_args_t *)arg; |
| |
| /* We always break if we find something not running */ |
| if ((!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) { |
| overlap_args->rc = 0; |
| return -1; |
| } |
| |
| /* |
| * We are just looking for something overlapping. On a hetjob we need |
| * to check everything. |
| */ |
| if (license_list_overlap(overlap_args->license_list, |
| job_ptr->license_list) || |
| (job_ptr->node_bitmap && |
| bit_overlap_any(overlap_args->node_map, job_ptr->node_bitmap))) |
| overlap_args->rc = 1; |
| |
| return 0; |
| } |
| |
| extern bool job_overlap_and_running(bitstr_t *node_map, list_t *license_list, |
| job_record_t *job_ptr) |
| { |
| job_overlap_args_t overlap_args = { |
| .node_map = node_map, |
| .license_list = license_list, |
| }; |
| |
| if (!job_ptr->het_job_list) |
| (void)_overlap_and_running_internal(job_ptr, &overlap_args); |
| else |
| (void)list_for_each(job_ptr->het_job_list, |
| _overlap_and_running_internal, |
| &overlap_args); |
| |
| return overlap_args.rc; |
| } |
| |
| static int _add_hetcomp_hostset(void *x, void *arg) |
| { |
| job_record_t *het_job = x; |
| foreach_hetcomp_args_t *args = arg; |
| |
| if (args->job_ptr->het_job_id != het_job->het_job_id) { |
| error("%s: Bad het_job_list for %pJ", __func__, args->job_ptr); |
| return 0; |
| } |
| |
| if (!het_job->nodes) { |
| debug("%s: %pJ het_job->nodes == NULL. Usually this means the job was canceled while it was starting and shouldn't be a real issue.", |
| __func__, args->job_ptr); |
| return 0; |
| } |
| |
| if (args->hs) |
| (void) hostset_insert(args->hs, het_job->nodes); |
| else |
| args->hs = hostset_create(het_job->nodes); |
| |
| return 0; |
| } |
| |
| extern char **job_common_env_vars(job_record_t *job_ptr, bool is_complete) |
| { |
| char **my_env, *name, *eq, buf[32]; |
| int exit_code, i, signal; |
| |
| my_env = xmalloc(sizeof(char *)); |
| my_env[0] = NULL; |
| |
| /* Set SPANK env vars first so that we can overwrite as needed |
| * below. Prevent user hacking from setting SLURM_JOB_ID etc. */ |
| if (job_ptr->spank_job_env_size) { |
| env_array_merge(&my_env, |
| (const char **) job_ptr->spank_job_env); |
| valid_spank_job_env(my_env, job_ptr->spank_job_env_size, |
| job_ptr->user_id); |
| } |
| |
| setenvf(&my_env, "SLURM_JOB_ACCOUNT", "%s", job_ptr->account); |
| |
| if (is_complete) { |
| exit_code = signal = 0; |
| if (WIFEXITED(job_ptr->exit_code)) { |
| exit_code = WEXITSTATUS(job_ptr->exit_code); |
| } |
| if (WIFSIGNALED(job_ptr->exit_code)) { |
| signal = WTERMSIG(job_ptr->exit_code); |
| } |
| snprintf(buf, sizeof(buf), "%d:%d", exit_code, signal); |
| setenvf(&my_env, "SLURM_JOB_DERIVED_EC", "%u", |
| job_ptr->derived_ec); |
| setenvf(&my_env, "SLURM_JOB_EXIT_CODE2", "%s", buf); |
| setenvf(&my_env, "SLURM_JOB_EXIT_CODE", "%u", |
| job_ptr->exit_code); |
| } |
| |
| if (job_ptr->array_task_id != NO_VAL) { |
| setenvf(&my_env, "SLURM_ARRAY_JOB_ID", "%u", |
| job_ptr->array_job_id); |
| setenvf(&my_env, "SLURM_ARRAY_TASK_ID", "%u", |
| job_ptr->array_task_id); |
| if (job_ptr->details && job_ptr->details->env_sup && |
| job_ptr->details->env_cnt) { |
| for (i = 0; i < job_ptr->details->env_cnt; i++) { |
| if (xstrncmp(job_ptr->details->env_sup[i], |
| "SLURM_ARRAY_TASK", 16)) |
| continue; |
| eq = strchr(job_ptr->details->env_sup[i], '='); |
| if (!eq) |
| continue; |
| eq[0] = '\0'; |
| setenvf(&my_env, |
| job_ptr->details->env_sup[i], |
| "%s", eq + 1); |
| eq[0] = '='; |
| } |
| } |
| } |
| |
| if (slurm_conf.cluster_name) { |
| setenvf(&my_env, "SLURM_CLUSTER_NAME", "%s", |
| slurm_conf.cluster_name); |
| } |
| |
| if (job_ptr->comment) |
| setenvf(&my_env, "SLURM_JOB_COMMENT", "%s", job_ptr->comment); |
| |
| setenvf(&my_env, "SLURM_JOB_END_TIME", "%lu", job_ptr->end_time); |
| |
| if (job_ptr->extra) |
| setenvf(&my_env, "SLURM_JOB_EXTRA", "%s", job_ptr->extra); |
| |
| if (job_ptr->het_job_id) { |
| /* Continue support for old hetjob terminology. */ |
| setenvf(&my_env, "SLURM_PACK_JOB_ID", "%u", |
| job_ptr->het_job_id); |
| setenvf(&my_env, "SLURM_PACK_JOB_OFFSET", "%u", |
| job_ptr->het_job_offset); |
| setenvf(&my_env, "SLURM_HET_JOB_ID", "%u", |
| job_ptr->het_job_id); |
| setenvf(&my_env, "SLURM_HET_JOB_OFFSET", "%u", |
| job_ptr->het_job_offset); |
| if ((job_ptr->het_job_offset == 0) && job_ptr->het_job_list) { |
| foreach_hetcomp_args_t args = { |
| .job_ptr = job_ptr, |
| }; |
| list_for_each(job_ptr->het_job_list, |
| _add_hetcomp_hostset, &args); |
| if (args.hs) { |
| char *buf = hostset_ranged_string_xmalloc( |
| args.hs); |
| /* Support for old hetjob terminology. */ |
| setenvf(&my_env, "SLURM_PACK_JOB_NODELIST", |
| "%s", buf); |
| setenvf(&my_env, "SLURM_HET_JOB_NODELIST", |
| "%s", buf); |
| xfree(buf); |
| hostset_destroy(args.hs); |
| } |
| } |
| } |
| setenvf(&my_env, "SLURM_JOB_GID", "%u", job_ptr->group_id); |
| name = group_from_job(job_ptr); |
| setenvf(&my_env, "SLURM_JOB_GROUP", "%s", name); |
| xfree(name); |
| setenvf(&my_env, "SLURM_JOBID", "%u", job_ptr->job_id); |
| setenvf(&my_env, "SLURM_JOB_ID", "%u", job_ptr->job_id); |
| if (job_ptr->licenses) |
| setenvf(&my_env, "SLURM_JOB_LICENSES", "%s", job_ptr->licenses); |
| setenvf(&my_env, "SLURM_JOB_NAME", "%s", job_ptr->name); |
| setenvf(&my_env, "SLURM_JOB_NODELIST", "%s", job_ptr->nodes); |
| if (job_ptr->job_resrcs) { |
| char *tmp; |
| |
| tmp = uint32_compressed_to_str( |
| job_ptr->job_resrcs->cpu_array_cnt, |
| job_ptr->job_resrcs->cpu_array_value, |
| job_ptr->job_resrcs->cpu_array_reps); |
| setenvf(&my_env, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp); |
| xfree(tmp); |
| |
| setenvf(&my_env, "SLURM_JOB_NUM_NODES", "%u", |
| job_ptr->job_resrcs->nhosts); |
| } |
| if (job_ptr->part_ptr) { |
| setenvf(&my_env, "SLURM_JOB_PARTITION", "%s", |
| job_ptr->part_ptr->name); |
| } else { |
| setenvf(&my_env, "SLURM_JOB_PARTITION", "%s", |
| job_ptr->partition); |
| } |
| |
| if (job_ptr->resv_ptr) |
| setenvf(&my_env, "SLURM_JOB_RESERVATION", "%s", |
| job_ptr->resv_ptr->name); |
| |
| setenvf(&my_env, "SLURM_JOB_RESTART_COUNT", "%d", job_ptr->restart_cnt); |
| |
| setenvf(&my_env, "SLURM_JOB_START_TIME", "%lu", job_ptr->start_time); |
| |
| setenvf(&my_env, "SLURM_JOB_UID", "%u", job_ptr->user_id); |
| name = user_from_job(job_ptr); |
| setenvf(&my_env, "SLURM_JOB_USER", "%s", name); |
| xfree(name); |
| if (job_ptr->wckey) { |
| setenvf(&my_env, "SLURM_WCKEY", "%s", job_ptr->wckey); |
| } |
| |
| if (job_ptr->details) { |
| if (job_ptr->details->features_use) |
| setenvf(&my_env, "SLURM_JOB_CONSTRAINTS", "%s", |
| job_ptr->details->features_use); |
| |
| setenvf(&my_env, "SLURM_JOB_OVERSUBSCRIBE", "%s", |
| job_share_string(get_job_share_value(job_ptr))); |
| |
| if (job_ptr->details->std_err) |
| setenvf(&my_env, "SLURM_JOB_STDERR", "%s", |
| job_ptr->details->std_err); |
| if (job_ptr->details->std_in) |
| setenvf(&my_env, "SLURM_JOB_STDIN", "%s", |
| job_ptr->details->std_in); |
| if (job_ptr->details->std_out) |
| setenvf(&my_env, "SLURM_JOB_STDOUT", "%s", |
| job_ptr->details->std_out); |
| if (job_ptr->details->work_dir) |
| setenvf(&my_env, "SLURM_JOB_WORK_DIR", "%s", |
| job_ptr->details->work_dir); |
| } |
| |
| return my_env; |
| } |
| |
| extern job_record_t *job_mgr_copy_resv_desc_to_job_record( |
| resv_desc_msg_t *resv_desc_ptr) |
| { |
| job_record_t *job_ptr; |
| job_details_t *detail_ptr; |
| part_record_t *part_ptr = NULL; |
| |
| job_ptr = _create_job_record(1, false); |
| detail_ptr = job_ptr->details; |
| |
| job_ptr->partition = xstrdup(resv_desc_ptr->partition); |
| |
| if (job_ptr->partition) |
| part_ptr = find_part_record(job_ptr->partition); |
| detail_ptr->pn_min_memory = |
| _get_def_mem(part_ptr, job_ptr->tres_req_cnt); |
| |
| job_ptr->time_limit = resv_desc_ptr->duration; |
| |
| detail_ptr->begin_time = resv_desc_ptr->start_time; |
| if (resv_desc_ptr->node_cnt != NO_VAL) { |
| detail_ptr->max_nodes = detail_ptr->min_nodes = |
| resv_desc_ptr->node_cnt; |
| } else { |
| detail_ptr->min_nodes = 1; |
| /* 500000 comes from job_scheduler.c job_start_data() */ |
| detail_ptr->max_nodes = 500000; |
| } |
| |
| if (resv_desc_ptr->node_list) { |
| hostlist_t *hl = hostlist_create(resv_desc_ptr->node_list); |
| hostlist_uniq(hl); |
| detail_ptr->req_nodes = hostlist_ranged_string_xmalloc(hl); |
| detail_ptr->max_nodes = detail_ptr->min_nodes = |
| hostlist_count(hl); |
| hostlist_destroy(hl); |
| |
| (void) node_name2bitmap(detail_ptr->req_nodes, true, |
| &detail_ptr->req_node_bitmap, NULL); |
| } |
| |
| if (resv_desc_ptr->tres_str || resv_desc_ptr->core_cnt != NO_VAL) { |
| detail_ptr->mc_ptr = job_record_create_mc(); |
| |
| /* |
| * Since reservations are core based we need to request it that |
| * way with one thread per core and one task per core. |
| */ |
| detail_ptr->mc_ptr->ntasks_per_core = 1; |
| detail_ptr->mc_ptr->threads_per_core = 1; |
| |
| detail_ptr->num_tasks = detail_ptr->min_cpus = |
| resv_desc_ptr->core_cnt; |
| if (detail_ptr->min_cpus == NO_VAL) |
| detail_ptr->min_cpus = detail_ptr->min_nodes; |
| } else { |
| detail_ptr->num_tasks = detail_ptr->min_cpus = |
| detail_ptr->min_nodes; |
| detail_ptr->whole_node = WHOLE_NODE_REQUIRED; |
| } |
| detail_ptr->core_spec = NO_VAL16; |
| detail_ptr->cpus_per_task = 1; |
| detail_ptr->orig_min_cpus = detail_ptr->min_cpus; |
| detail_ptr->orig_max_cpus = detail_ptr->max_cpus = NO_VAL; |
| if ((resv_desc_ptr->flags & RESERVE_TRES_PER_NODE) && |
| (resv_desc_ptr->core_cnt != NO_VAL) && |
| (resv_desc_ptr->node_cnt != NO_VAL)) { |
| detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus = |
| resv_desc_ptr->core_cnt / resv_desc_ptr->node_cnt; |
| } else |
| detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus = 1; |
| detail_ptr->features = xstrdup(resv_desc_ptr->features); |
| |
| if (build_feature_list(job_ptr, false, true)) { |
| error("%s: invalid features(%s) for reservation given", |
| __func__, detail_ptr->features); |
| } |
| |
| detail_ptr->task_dist = SLURM_DIST_BLOCK; |
| job_ptr->best_switch = true; |
| |
| if (resv_desc_ptr->tres_str) { |
| gres_job_state_validate_t gres_js_val = { |
| .cpus_per_tres = NULL, |
| .mem_per_tres = NULL, |
| .tres_freq = NULL, |
| .tres_per_socket = NULL, |
| .tres_per_task = NULL, |
| |
| .cpus_per_task = &detail_ptr->orig_cpus_per_task, |
| .max_nodes = &detail_ptr->max_nodes, |
| .min_cpus = &detail_ptr->min_cpus, |
| .min_nodes = &detail_ptr->min_nodes, |
| .ntasks_per_node = &detail_ptr->ntasks_per_node, |
| .ntasks_per_socket = |
| &detail_ptr->mc_ptr->ntasks_per_socket, |
| .ntasks_per_tres = &detail_ptr->ntasks_per_tres, |
| .num_tasks = &detail_ptr->num_tasks, |
| .sockets_per_node = |
| &detail_ptr->mc_ptr->sockets_per_node, |
| |
| .gres_list = &job_ptr->gres_list_req, |
| }; |
| |
| detail_ptr->mc_ptr->ntasks_per_socket = NO_VAL16; |
| detail_ptr->mc_ptr->sockets_per_node = NO_VAL16; |
| detail_ptr->orig_cpus_per_task = NO_VAL16; |
| detail_ptr->ntasks_per_tres = NO_VAL16; |
| |
| job_ptr->tres_req_str = xstrdup(resv_desc_ptr->tres_str); |
| |
| if (resv_desc_ptr->flags & RESERVE_TRES_PER_NODE) |
| job_ptr->tres_per_node = xstrdup(job_ptr->tres_req_str); |
| else |
| job_ptr->tres_per_job = xstrdup(job_ptr->tres_req_str); |
| |
| gres_js_val.tres_per_job = job_ptr->tres_per_job; |
| gres_js_val.tres_per_node = job_ptr->tres_per_node; |
| |
| (void)gres_job_state_validate(&gres_js_val); |
| |
| if (detail_ptr->num_tasks == NO_VAL) |
| detail_ptr->num_tasks = 0; |
| if (detail_ptr->min_cpus == NO_VAL) |
| detail_ptr->min_cpus = 1; |
| |
| if (resv_desc_ptr->flags & RESERVE_TRES_PER_NODE) |
| detail_ptr->ntasks_per_node = detail_ptr->pn_min_cpus; |
| else if (detail_ptr->ntasks_per_node == NO_VAL16) |
| detail_ptr->ntasks_per_node = 0; |
| |
| if (detail_ptr->mc_ptr->ntasks_per_socket == NO_VAL16) |
| detail_ptr->mc_ptr->ntasks_per_socket = INFINITE16; |
| if (job_ptr->gres_list_req) |
| job_ptr->bit_flags |= GRES_ENFORCE_BIND; |
| gres_job_state_log(job_ptr->gres_list_req, job_ptr->job_id); |
| } |
| return job_ptr; |
| } |
| |
| extern uint16_t job_mgr_determine_cpus_per_core( |
| job_details_t *details, int node_inx) |
| { |
| uint16_t ncpus_per_core = INFINITE16; /* Usable CPUs per core */ |
| uint16_t threads_per_core = node_record_table_ptr[node_inx]->tpc; |
| |
| if ((slurm_conf.select_type_param & SELECT_ONE_TASK_PER_CORE) && |
| (details->min_gres_cpu > 0)) { |
| /* May override default of 1 CPU per core */ |
| return node_record_table_ptr[node_inx]->tpc; |
| } |
| |
| if (details && details->mc_ptr) { |
| multi_core_data_t *mc_ptr = details->mc_ptr; |
| if ((mc_ptr->ntasks_per_core != INFINITE16) && |
| (mc_ptr->ntasks_per_core)) { |
| ncpus_per_core = MIN(threads_per_core, |
| (mc_ptr->ntasks_per_core * |
| details->cpus_per_task)); |
| } |
| if ((mc_ptr->threads_per_core != NO_VAL16) && |
| (mc_ptr->threads_per_core < ncpus_per_core)) { |
| ncpus_per_core = mc_ptr->threads_per_core; |
| } |
| } |
| |
| threads_per_core = MIN(threads_per_core, ncpus_per_core); |
| |
| return threads_per_core; |
| } |
| |
| static int _sort_part_lists(void *x, void *none) |
| { |
| job_record_t *job_ptr = x; |
| if (job_ptr && job_ptr->part_ptr_list) |
| list_sort(job_ptr->part_ptr_list, priority_sort_part_tier); |
| return SLURM_SUCCESS; |
| } |
| |
| extern void sort_all_jobs_partition_lists() |
| { |
| list_for_each(job_list, _sort_part_lists, NULL); |
| } |
| |
| extern void job_mgr_handle_cred_failure(job_record_t *job_ptr) |
| { |
| job_ptr->priority = 0; /* Hold job */ |
| xfree(job_ptr->system_comment); |
| job_ptr->system_comment = |
| xstrdup("slurm_cred_create failure, holding job."); |
| job_complete(job_ptr->job_id, slurm_conf.slurm_user_id, true, false, 0); |
| } |