| /*****************************************************************************\ |
| * node_scheduler.c - select and allocate nodes for jobs |
| * Note: there is a global node table (node_record_table_ptr) |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Copyright (C) SchedMD LLC. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <errno.h> |
| #include <pthread.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <syslog.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/group_cache.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/id_util.h" |
| #include "src/common/job_features.h" |
| #include "src/common/list.h" |
| #include "src/common/node_features.h" |
| #include "src/common/port_mgr.h" |
| #include "src/common/slurm_protocol_pack.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/interfaces/accounting_storage.h" |
| #include "src/interfaces/burst_buffer.h" |
| #include "src/interfaces/gres.h" |
| #include "src/interfaces/jobcomp.h" |
| #include "src/interfaces/mcs.h" |
| #include "src/interfaces/node_features.h" |
| #include "src/interfaces/preempt.h" |
| #include "src/interfaces/priority.h" |
| #include "src/interfaces/select.h" |
| #include "src/interfaces/switch.h" |
| #include "src/interfaces/topology.h" |
| |
| #include "src/slurmctld/acct_policy.h" |
| #include "src/slurmctld/agent.h" |
| #include "src/slurmctld/gang.h" |
| #include "src/slurmctld/job_scheduler.h" |
| #include "src/slurmctld/licenses.h" |
| #include "src/slurmctld/node_scheduler.h" |
| #include "src/slurmctld/power_save.h" |
| #include "src/slurmctld/proc_req.h" |
| #include "src/slurmctld/reservation.h" |
| #include "src/slurmctld/slurmctld.h" |
| |
| #include "src/stepmgr/gres_stepmgr.h" |
| #include "src/stepmgr/stepmgr.h" |
| |
| #define _DEBUG 0 |
| #define MAX_FEATURES 64 /* max mutually-exclusive features; e.g. "[fs1|fs2]" counts as 2 */ |
| |
| struct node_set { /* set of nodes with same configuration */ |
| uint16_t cpus_per_node; /* NOTE: This is the minimum count */ |
| char *features; /* Node features */ |
| bitstr_t *feature_bits; /* MORed features' positions */ |
| uint32_t flags; /* See NODE_SET_* below */ |
| bitstr_t *my_bitmap; /* Node bitmap */ |
| uint32_t node_cnt; /* Node count */ |
| uint32_t node_weight; /* Node weight */ |
| uint64_t real_memory; /* Real memory on node */ |
| uint64_t sched_weight; /* Scheduling weight, based upon |
| * node_weight and flags */ |
| }; |
| |
| #define NODE_SET_NOFLAG SLURM_BIT(0) |
| #define NODE_SET_REBOOT SLURM_BIT(1) |
| #define NODE_SET_OUTSIDE_FLEX SLURM_BIT(2) |
| #define NODE_SET_POWER_DN SLURM_BIT(3) |
| #define NODE_SET_POWERING_UP SLURM_BIT(4) |
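| |
| /* |
| * Note (sketch of intent): a node set's sched_weight is derived from |
| * its node_weight with the NODE_SET_* flags folded in, so that sets |
| * needing a reboot or power-up sort after otherwise equally weighted |
| * sets that are ready for immediate use; see _set_sched_weight() for |
| * the exact packing. |
| */ |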
| |
| enum { |
| IN_FL, /* Inside flex reservation */ |
| OUT_FL, /* Outside flex reservation */ |
| IN_FL_RE, /* Inside flex reservation + need reboot */ |
| OUT_FL_NO_RE, /* Outside flex reservation + no reboot needed */ |
| OUT_FL_RE, /* Outside flex reservation + need reboot */ |
| REBOOT, /* Needs reboot */ |
| NM_TYPES /* Number of node types */ |
| }; |
| |
| static int _build_node_list(job_record_t *job_ptr, |
| struct node_set **node_set_pptr, |
| int *node_set_size, char **err_msg, |
| bool test_only, bool can_reboot); |
| static bitstr_t *_find_grp_node_bitmap(job_record_t *job_ptr); |
| static bool _first_array_task(job_record_t *job_ptr); |
| static void _log_node_set(job_record_t *job_ptr, |
| struct node_set *node_set_ptr, |
| int node_set_size); |
| static int _match_feature(list_t *feature_list, bitstr_t **inactive_bitmap); |
| static int _nodes_in_sets(bitstr_t *req_bitmap, |
| struct node_set * node_set_ptr, |
| int node_set_size); |
| static int _pick_best_nodes(struct node_set *node_set_ptr, |
| int node_set_size, bitstr_t ** select_bitmap, |
| job_record_t *job_ptr, part_record_t *part_ptr, |
| uint32_t min_nodes, uint32_t max_nodes, |
| uint32_t req_nodes, bool test_only, |
| list_t *preemptee_candidates, |
| list_t **preemptee_job_list, bool has_xand, |
| resv_exc_t *resv_exc_ptr, bool resv_overlap); |
| static void _set_err_msg(bool cpus_ok, bool mem_ok, bool disk_ok, |
| bool job_mc_ok, char **err_msg); |
| static void _set_sched_weight(struct node_set *node_set_ptr); |
| static int _sort_node_set(const void *x, const void *y); |
| static bitstr_t *_valid_features(job_record_t *job_ptr, |
| config_record_t *config_ptr, |
| bool can_reboot, bitstr_t *reboot_bitmap); |
| |
| /* |
| * _get_ntasks_per_core - Retrieve the value of ntasks_per_core from |
| * the given job_details record. If it wasn't set, return INFINITE16. |
| * Intended for use with the adjust_cpus_nppcu function. |
| */ |
| static uint16_t _get_ntasks_per_core(job_details_t *details) |
| { |
| if (details->mc_ptr) |
| return details->mc_ptr->ntasks_per_core; |
| else |
| return INFINITE16; |
| } |
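| |
| /* |
| * Illustrative call pattern (a sketch; argument names beyond the |
| * first are assumptions, not the verified signature): |
| * |
| * avail_cpus = adjust_cpus_nppcu(_get_ntasks_per_core(details), |
| * cpus_per_task, tot_cores, tot_cpus); |
| * |
| * Returning INFINITE16 when ntasks_per_core is unset tells |
| * adjust_cpus_nppcu() that no per-core task limit applies, so no |
| * CPUs are discounted. |
| */ |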
| |
| /* |
| * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED; |
| * also claim required licenses and resources reserved by accounting |
| * policy association |
| * IN job_ptr - job being allocated resources |
| */ |
| extern void allocate_nodes(job_record_t *job_ptr) |
| { |
| node_record_t *node_ptr; |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
| i++) { |
| make_node_alloc(node_ptr, job_ptr); |
| } |
| |
| node_mgr_make_node_blocked(job_ptr, true); |
| |
| last_node_update = time(NULL); |
| license_job_get(job_ptr, false); |
| set_initial_job_alias_list(job_ptr); |
| } |
| |
| extern void set_initial_job_alias_list(job_record_t *job_ptr) |
| { |
| node_record_t *node_ptr; |
| bool has_cloud = false, has_cloud_power_save = false; |
| bool has_dynamic_norm = false; |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
| i++) { |
| if (IS_NODE_DYNAMIC_FUTURE(node_ptr)) |
| has_cloud = true; |
| |
| if (IS_NODE_DYNAMIC_NORM(node_ptr)) { |
| /* Must set alias list as nodes won't exist in conf */ |
| has_cloud = true; |
| has_dynamic_norm = true; |
| } |
| |
| if (IS_NODE_CLOUD(node_ptr)) { |
| has_cloud = true; |
| if (IS_NODE_POWERED_DOWN(node_ptr) || |
| IS_NODE_POWERING_UP(node_ptr)) |
| has_cloud_power_save = true; |
| } |
| } |
| |
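| /* |
| * Decide how the launching srun learns node addresses: a "TBD" |
| * alias_list defers address resolution until powering-up nodes are |
| * ready (and forces wait_all_nodes), while a concrete alias_list |
| * can be built immediately from the node records. |
| */ |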
| if (has_cloud) { |
| if (has_cloud_power_save && |
| job_ptr->origin_cluster && |
| xstrcmp(slurm_conf.cluster_name, job_ptr->origin_cluster)) { |
| /* Set TBD so the remote srun will update node_addrs */ |
| job_ptr->alias_list = xstrdup("TBD"); |
| job_ptr->wait_all_nodes = 1; |
| } else if (cloud_dns && !has_dynamic_norm) { |
| job_ptr->wait_all_nodes = 1; |
| } else if (has_cloud_power_save) { |
| job_ptr->alias_list = xstrdup("TBD"); |
| job_ptr->wait_all_nodes = 1; |
| } else |
| set_job_alias_list(job_ptr); |
| } else { |
| /* set addrs if the job is coming from a different cluster */ |
| set_job_node_addrs(job_ptr, job_ptr->origin_cluster); |
| } |
| } |
| |
| /* |
| * Set addrs if: |
| * 1. There is an alias_list (cloud/dynamic nodes) and it isn't "TBD" |
| * (which marks nodes that are still powering up). |
| * 2. No alias_list but job/request is from a different cluster. |
| */ |
| extern void set_job_node_addrs(job_record_t *job_ptr, |
| const char *origin_cluster) |
| { |
| if (!job_ptr->node_addrs && |
| job_ptr->node_bitmap && |
| bit_set_count(job_ptr->node_bitmap) && |
| ((!job_ptr->alias_list && /* remote job */ |
| origin_cluster && |
| xstrcmp(origin_cluster, slurm_conf.cluster_name)) || |
| (job_ptr->alias_list && xstrcmp(job_ptr->alias_list, "TBD")))) { |
| node_record_t *node_ptr; |
| |
| job_ptr->node_addrs = |
| xcalloc(bit_set_count(job_ptr->node_bitmap), |
| sizeof(slurm_addr_t)); |
| for (int i = 0, addr_index = 0; |
| (node_ptr = next_node_bitmap(job_ptr->node_bitmap, |
| &i)); |
| i++) { |
| slurm_conf_get_addr(node_ptr->name, |
| &job_ptr->node_addrs[addr_index++], |
| 0); |
| } |
| } |
| } |
| |
| /* Set a job's alias_list string */ |
| extern void set_job_alias_list(job_record_t *job_ptr) |
| { |
| node_record_t *node_ptr; |
| |
| xfree(job_ptr->alias_list); |
| |
| if (cloud_dns && bit_super_set(job_ptr->node_bitmap, cloud_node_bitmap)) |
| return; |
| |
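| /* |
| * Each alias_list entry is "<name>:[<comm_name>]:<hostname>", comma |
| * separated; e.g. (illustrative names and addresses): |
| * "cloud1:[10.1.0.5]:chost1,cloud2:[10.1.0.6]:chost2" |
| */ |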
| for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
| i++) { |
| if (IS_NODE_DYNAMIC_FUTURE(node_ptr) || |
| IS_NODE_DYNAMIC_NORM(node_ptr) || |
| (!cloud_dns && IS_NODE_CLOUD(node_ptr))) { |
| if (job_ptr->alias_list) |
| xstrcat(job_ptr->alias_list, ","); |
| |
| xstrfmtcat(job_ptr->alias_list, "%s:[%s]:%s", |
| node_ptr->name, node_ptr->comm_name, |
| node_ptr->node_hostname); |
| } |
| } |
| |
| set_job_node_addrs(job_ptr, job_ptr->origin_cluster); |
| } |
| |
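| /* |
| * Select which feature list drives scheduling: --prefer takes |
| * precedence over --constraint when present. Example (illustrative): |
| * with --prefer=fast --constraint=gpu, features_use points at the |
| * "fast" list here; if no preferred nodes can be found, the caller |
| * is expected to retry with the hard constraint list. |
| */ |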
| extern void set_job_features_use(job_details_t *details_ptr) |
| { |
| if (!details_ptr) |
| return; |
| |
| if (details_ptr->prefer) { |
| details_ptr->features_use = details_ptr->prefer; |
| details_ptr->feature_list_use = details_ptr->prefer_list; |
| } else { |
| details_ptr->features_use = details_ptr->features; |
| details_ptr->feature_list_use = details_ptr->feature_list; |
| } |
| } |
| |
| /* |
| * deallocate_nodes - for a given job, deallocate its nodes and make |
| * their state NODE_STATE_COMPLETING; also release the job's licenses |
| * and resources reserved by accounting policy association |
| * IN job_ptr - pointer to terminating job (already in some COMPLETING state) |
| * IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT |
| * RPC instead of REQUEST_TERMINATE_JOB |
| * IN suspended - true if job was already suspended (node's run_job_cnt |
| * already decremented); |
| * IN preempted - true if job is being preempted |
| */ |
| extern void deallocate_nodes(job_record_t *job_ptr, bool timeout, |
| bool suspended, bool preempted) |
| { |
| kill_job_msg_t *kill_job = NULL; |
| agent_arg_t *agent_args = NULL; |
| node_record_t *node_ptr; |
| hostlist_t *hostlist = NULL; |
| uint16_t use_protocol_version = 0; |
| uint16_t msg_flags = 0; |
| |
| xassert(job_ptr); |
| xassert(job_ptr->details); |
| |
| log_flag(TRACE_JOBS, "%s: %pJ", __func__, job_ptr); |
| |
| acct_policy_job_fini(job_ptr, false); |
| |
| node_mgr_make_node_blocked(job_ptr, false); |
| |
| if (select_g_job_fini(job_ptr) != SLURM_SUCCESS) |
| error("select_g_job_fini(%pJ): %m", job_ptr); |
| |
| /* Release any job-related switch data */ |
| switch_g_job_complete(job_ptr); |
| |
| epilog_slurmctld(job_ptr); |
| |
| if (!job_ptr->details->prolog_running) |
| hostlist = hostlist_create(NULL); |
| |
| if (!job_ptr->node_bitmap_cg) |
| build_cg_bitmap(job_ptr); |
| use_protocol_version = SLURM_PROTOCOL_VERSION; |
| |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i)); i++) { |
| /* Sync up conditionals with make_node_comp() */ |
| if (IS_NODE_DOWN(node_ptr) || |
| IS_NODE_POWERED_DOWN(node_ptr) || |
| IS_NODE_POWERING_UP(node_ptr)) { |
| /* Issue the KILL RPC, but don't verify response */ |
| bit_clear(job_ptr->node_bitmap_cg, i); |
| job_update_tres_cnt(job_ptr, i); |
| /* |
| * node_cnt indicates how many nodes we are waiting |
| * on for epilog complete messages, so exclude DOWN |
| * and powered-down nodes from it. NOTE: The job's |
| * node_cnt will not match the number of entries in |
| * the node string during its completion. |
| */ |
| job_ptr->node_cnt--; |
| } |
| make_node_comp(node_ptr, job_ptr, suspended); |
| |
| if (hostlist && |
| !IS_NODE_POWERED_DOWN(node_ptr) && |
| !IS_NODE_POWERING_UP(node_ptr)) { |
| hostlist_push_host(hostlist, node_ptr->name); |
| if (use_protocol_version > node_ptr->protocol_version) { |
| use_protocol_version = |
| node_ptr->protocol_version; |
| debug3("%s: protocol version downgraded to %u from node %s", |
| __func__, use_protocol_version, |
| node_ptr->name); |
| } |
| if (PACK_FANOUT_ADDRS(node_ptr)) |
| msg_flags |= SLURM_PACK_ADDRS; |
| } |
| } |
| |
| if (job_ptr->details->prolog_running) { |
| /* |
| * Job was configuring when it was cancelled and epilog wasn't |
| * run on the nodes, so cleanup the nodes now. Final cleanup |
| * will happen after EpilogSlurmctld is done. |
| */ |
| if (job_ptr->node_bitmap_cg) { |
| /* |
| * Call cleanup_completing before job_epilog_complete or |
| * we will end up requeuing there before this is called. |
| */ |
| cleanup_completing(job_ptr, false); |
| |
| /* |
| * job_epilog_complete() can free |
| * job_ptr->node_bitmap_cg |
| */ |
| for (int i = 0; |
| job_ptr->node_bitmap_cg && |
| (node_ptr = next_node_bitmap( |
| job_ptr->node_bitmap_cg, &i)); |
| i++) { |
| job_epilog_complete(job_ptr->job_id, |
| node_ptr->name, 0); |
| } |
| } |
| |
| return; |
| } |
| |
| /* Cannot wait for epilog completion to release licenses and |
| * update the gang scheduling table */ |
| cleanup_completing(job_ptr, false); |
| |
| resv_replace_update(job_ptr); |
| |
| if (!hostlist || !hostlist_count(hostlist)) { |
| hostlist_destroy(hostlist); |
| return; |
| } |
| |
| if (job_ptr->bit_flags & EXTERNAL_JOB) { |
| debug("%s: %pJ is external, no need to wait to complete", |
| __func__, job_ptr); |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i)); |
| i++) { |
| make_node_idle(node_ptr, job_ptr); |
| } |
| hostlist_destroy(hostlist); |
| return; |
| } |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| if (timeout) |
| agent_args->msg_type = REQUEST_KILL_TIMELIMIT; |
| else if (preempted) |
| agent_args->msg_type = REQUEST_KILL_PREEMPTED; |
| else |
| agent_args->msg_type = REQUEST_TERMINATE_JOB; |
| agent_args->retry = 0; /* re_kill_job() resends as needed */ |
| agent_args->protocol_version = use_protocol_version; |
| agent_args->hostlist = hostlist; |
| agent_args->node_count = hostlist_count(hostlist); |
| agent_args->msg_flags = msg_flags; |
| |
| last_node_update = time(NULL); |
| kill_job = create_kill_job_msg(job_ptr, use_protocol_version); |
| kill_job->nodes = xstrdup(job_ptr->nodes); |
| |
| agent_args->msg_args = kill_job; |
| set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_args); |
| } |
| |
| static void _log_feature_nodes(job_feature_t *job_feat_ptr) |
| { |
| char *tmp1, *tmp2, *tmp3, *tmp4 = NULL; |
| |
| if (!(slurm_conf.debug_flags & DEBUG_FLAG_NODE_FEATURES)) |
| return; |
| |
| if (job_feat_ptr->op_code == FEATURE_OP_OR) |
| tmp3 = "OR"; |
| else if (job_feat_ptr->op_code == FEATURE_OP_AND) |
| tmp3 = "AND"; |
| else if (job_feat_ptr->op_code == FEATURE_OP_MOR) |
| tmp3 = "MOR"; |
| else if (job_feat_ptr->op_code == FEATURE_OP_XAND) |
| tmp3 = "XAND"; |
| else if (job_feat_ptr->op_code == FEATURE_OP_END) |
| tmp3 = "END"; |
| else { |
| xstrfmtcat(tmp4, "UNKNOWN:%u", job_feat_ptr->op_code); |
| tmp3 = tmp4; |
| } |
| tmp1 = bitmap2node_name(job_feat_ptr->node_bitmap_active); |
| tmp2 = bitmap2node_name(job_feat_ptr->node_bitmap_avail); |
| log_flag(NODE_FEATURES, "%s: FEAT:%s COUNT:%u BRACKET:%u PAREN:%d OP:%s ACTIVE:%s AVAIL:%s", |
| __func__, job_feat_ptr->name, job_feat_ptr->count, |
| job_feat_ptr->bracket, job_feat_ptr->paren, tmp3, tmp1, tmp2); |
| xfree(tmp1); |
| xfree(tmp2); |
| xfree(tmp4); |
| } |
| |
| /* |
| * For every element in the feature_list, identify the nodes with that feature |
| * either active or available and set the feature_list's node_bitmap_active and |
| * node_bitmap_avail fields accordingly. |
| */ |
| extern void find_feature_nodes(list_t *feature_list, bool can_reboot) |
| { |
| list_itr_t *feat_iter; |
| job_feature_t *job_feat_ptr; |
| node_feature_t *node_feat_ptr; |
| |
| if (!feature_list) |
| return; |
| feat_iter = list_iterator_create(feature_list); |
| while ((job_feat_ptr = list_next(feat_iter))) { |
| FREE_NULL_BITMAP(job_feat_ptr->node_bitmap_active); |
| FREE_NULL_BITMAP(job_feat_ptr->node_bitmap_avail); |
| node_feat_ptr = list_find_first(active_feature_list, |
| list_find_feature, |
| job_feat_ptr->name); |
| if (node_feat_ptr && node_feat_ptr->node_bitmap) { |
| job_feat_ptr->node_bitmap_active = |
| bit_copy(node_feat_ptr->node_bitmap); |
| } else { /* This feature not active */ |
| job_feat_ptr->node_bitmap_active = |
| bit_alloc(node_record_count); |
| } |
| if (can_reboot && job_feat_ptr->changeable) { |
| node_feat_ptr = list_find_first(avail_feature_list, |
| list_find_feature, |
| job_feat_ptr->name); |
| if (node_feat_ptr && node_feat_ptr->node_bitmap) { |
| job_feat_ptr->node_bitmap_avail = |
| bit_copy(node_feat_ptr->node_bitmap); |
| } else { /* This feature not available */ |
| job_feat_ptr->node_bitmap_avail = |
| bit_alloc(node_record_count); |
| } |
| } else if (job_feat_ptr->node_bitmap_active) { |
| job_feat_ptr->node_bitmap_avail = |
| bit_copy(job_feat_ptr->node_bitmap_active); |
| } |
| |
| _log_feature_nodes(job_feat_ptr); |
| } |
| list_iterator_destroy(feat_iter); |
| } |
| |
| /* |
| * _match_feature - determine which of the job features are now inactive |
| * IN feature_list - Job's feature request list |
| * OUT inactive_bitmap - nodes on which some required feature is inactive |
| * RET 1 if any such nodes exist, 0 if no required feature is inactive |
| * NOTE: Currently fully supports only AND/OR of features, not XAND/MOR |
| */ |
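| /* |
| * Worked example (illustrative): for the request "a&(b|c)", the loop |
| * below starts feature_bitmap with every node set, ANDs in the nodes |
| * where "a" is active, evaluates "(b|c)" into paren_bitmap, then ANDs |
| * that back into feature_bitmap. Any node left clear is missing a |
| * required active feature; the inverted bitmap of those nodes is |
| * returned via inactive_bitmap. |
| */ |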
| static int _match_feature(list_t *feature_list, bitstr_t **inactive_bitmap) |
| { |
| list_itr_t *job_feat_iter; |
| job_feature_t *job_feat_ptr; |
| int last_op = FEATURE_OP_AND, last_paren_op = FEATURE_OP_AND; |
| int i, last_paren_cnt = 0; |
| bitstr_t *feature_bitmap, *paren_bitmap = NULL, *work_bitmap; |
| |
| xassert(inactive_bitmap); |
| |
| if (!feature_list || /* nothing to look for */ |
| (node_features_g_count() == 0)) /* No inactive features */ |
| return 0; |
| |
| feature_bitmap = node_conf_get_active_bitmap(); |
| work_bitmap = feature_bitmap; |
| job_feat_iter = list_iterator_create(feature_list); |
| while ((job_feat_ptr = list_next(job_feat_iter))) { |
| if (last_paren_cnt < job_feat_ptr->paren) { |
| /* Start of expression in parenthesis */ |
| last_paren_op = last_op; |
| last_op = FEATURE_OP_AND; |
| FREE_NULL_BITMAP(paren_bitmap); |
| paren_bitmap = node_conf_get_active_bitmap(); |
| work_bitmap = paren_bitmap; |
| } |
| |
| if (job_feat_ptr->node_bitmap_avail) { |
| if (last_op == FEATURE_OP_AND) { |
| bit_and(work_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } else if (last_op == FEATURE_OP_OR) { |
| bit_or(work_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } else { /* FEATURE_OP_MOR or FEATURE_OP_XAND */ |
| bit_and(work_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } |
| } else { /* feature not found */ |
| if (last_op == FEATURE_OP_AND) { |
| bit_clear_all(work_bitmap); |
| } |
| } |
| |
| if (last_paren_cnt > job_feat_ptr->paren) { |
| /* End of expression in parenthesis */ |
| if (last_paren_op == FEATURE_OP_AND) { |
| bit_and(feature_bitmap, work_bitmap); |
| } else if (last_paren_op == FEATURE_OP_OR) { |
| bit_or(feature_bitmap, work_bitmap); |
| } else { /* FEATURE_OP_MOR or FEATURE_OP_XAND */ |
| bit_and(feature_bitmap, work_bitmap); |
| } |
| work_bitmap = feature_bitmap; |
| } |
| |
| last_op = job_feat_ptr->op_code; |
| last_paren_cnt = job_feat_ptr->paren; |
| } |
| list_iterator_destroy(job_feat_iter); |
| #if 0 |
| { |
| char tmp[32]; |
| bit_fmt(tmp, sizeof(tmp), work_bitmap); |
| info("%s: NODE_BITMAP:%s", __func__, tmp); |
| } |
| #endif |
| FREE_NULL_BITMAP(paren_bitmap); |
| i = bit_ffc(feature_bitmap); |
| if (i == -1) { /* No required node features inactive */ |
| FREE_NULL_BITMAP(feature_bitmap); |
| return 0; |
| } |
| bit_not(feature_bitmap); |
| *inactive_bitmap = feature_bitmap; |
| return 1; |
| } |
| |
| /* |
| * For a given job, if the available nodes differ from those with currently |
| * active features, return a bitmap of nodes with the job's required |
| * features currently active |
| * IN job_ptr - job requesting resource allocation |
| * IN avail_bitmap - nodes currently available for this job |
| * OUT active_bitmap - nodes with job's features currently active, NULL if |
| * identical to avail_bitmap |
| * NOTE: Currently fully supports only AND/OR of features, not XAND/MOR |
| */ |
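| /* |
| * Example (illustrative): a job requiring "knl&flat" is offered nodes |
| * n[1-4], but only n[1-2] have "flat" currently active; *active_bitmap |
| * is then set to n[1-2], indicating n[3-4] would need a reboot first. |
| */ |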
| extern void build_active_feature_bitmap(job_record_t *job_ptr, |
| bitstr_t *avail_bitmap, |
| bitstr_t **active_bitmap) |
| { |
| job_details_t *details_ptr = job_ptr->details; |
| bitstr_t *tmp_bitmap = NULL; |
| bool can_reboot; |
| |
| *active_bitmap = NULL; |
| if (!details_ptr->feature_list_use || /* nothing to look for */ |
| (node_features_g_count() == 0)) /* No inactive features */ |
| return; |
| |
| can_reboot = node_features_g_user_update(job_ptr->user_id); |
| find_feature_nodes(details_ptr->feature_list_use, can_reboot); |
| if (_match_feature(details_ptr->feature_list_use, &tmp_bitmap) == 0) |
| return; /* No inactive features */ |
| |
| bit_not(tmp_bitmap); |
| if (bit_super_set(avail_bitmap, tmp_bitmap)) { |
| FREE_NULL_BITMAP(tmp_bitmap); |
| return; |
| } |
| bit_and(tmp_bitmap, avail_bitmap); |
| *active_bitmap = tmp_bitmap; |
| } |
| |
| /* Return bitmap of nodes with all specified features currently active */ |
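| /* |
| * Example (illustrative): reboot_features="knl,quad" yields the nodes |
| * with both "knl" and "quad" active; an empty string yields all nodes; |
| * a feature active nowhere yields an empty bitmap (reboot everything). |
| */ |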
| extern bitstr_t *build_active_feature_bitmap2(char *reboot_features) |
| { |
| const char *delim = ","; |
| char *tmp, *tok, *save_ptr = NULL; |
| bitstr_t *active_node_bitmap = NULL; |
| node_feature_t *node_feat_ptr; |
| |
| if (!reboot_features || (reboot_features[0] == '\0')) { |
| active_node_bitmap = node_conf_get_active_bitmap(); |
| return active_node_bitmap; |
| } |
| |
| tmp = xstrdup(reboot_features); |
| tok = strtok_r(tmp, delim, &save_ptr); |
| |
| while (tok) { |
| node_feat_ptr = list_find_first(active_feature_list, |
| list_find_feature, tok); |
| if (node_feat_ptr && node_feat_ptr->node_bitmap) { |
| /* |
| * Found feature, add nodes with this feature and |
| * remove nodes without this feature (bit_and) |
| */ |
| if (!active_node_bitmap) |
| active_node_bitmap = |
| bit_copy(node_feat_ptr->node_bitmap); |
| else |
| bit_and(active_node_bitmap, |
| node_feat_ptr->node_bitmap); |
| } else { |
| /* |
| * Feature not found in any nodes, so we definitely |
| * need to reboot all of the nodes |
| */ |
| if (!active_node_bitmap) |
| active_node_bitmap = |
| bit_alloc(node_record_count); |
| else |
| bit_clear_all(active_node_bitmap); |
| break; |
| } |
| |
| tok = strtok_r(NULL, delim, &save_ptr); |
| } |
| |
| xfree(tmp); |
| |
| return active_node_bitmap; |
| } |
| |
| /* |
| * Decide if a job can share nodes with other jobs based on two inputs: |
| * |
| * IN job_ptr - job being scheduled; details->share_res and |
| * details->whole_node carry the user's share/exclusive request |
| * IN part_max_share - current partition's node sharing policy |
| * |
| * The following table details the node SHARED state for the various |
| * scenarios: |
| * |
| * part= part= part= part= |
| * cons_tres user_request EXCLUS NO YES FORCE |
| * -------- ------------ ------ ----- ----- ----- |
| * no default whole whole whole whole/O |
| * no exclusive whole whole whole whole/O |
| * no share=yes whole whole whole/O whole/O |
| * yes default whole share share share/O |
| * yes exclusive whole whole whole whole/O |
| * yes share=yes whole share share/O share/O |
| * |
| * whole = entire node is allocated to the job |
| * share = less than entire node may be allocated to the job |
| * -/O = resources can be over-committed (e.g. gang scheduled) |
| * |
| * part->max_share: |
| * &SHARED_FORCE = FORCE |
| * 0 = EXCLUSIVE |
| * 1 = NO |
| * > 1 = YES |
| * |
| * job_ptr->details->share_res: |
| * 0 = default or share=no |
| * 1 = share=yes |
| * |
| * job_ptr->details->whole_node: |
| * 0 = default |
| * WHOLE_NODE_REQUIRED = 1 = exclusive |
| * WHOLE_NODE_USER = 2 = user |
| * WHOLE_NODE_MCS = 3 = mcs |
| * |
| * Return values: |
| * 0 = requires idle nodes |
| * 1 = can use non-idle nodes |
| */ |
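| /* |
| * Worked example (illustrative): with cons_tres running, partition |
| * OverSubscribe=FORCE:4 (max_share = 4 | SHARED_FORCE) and no |
| * exclusive request or reboot, the FORCE test below sets |
| * share_res = 1 and returns 1, so non-idle nodes may be used. |
| */ |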
| static int _resolve_shared_status(job_record_t *job_ptr, |
| uint16_t part_max_share) |
| { |
| if (job_ptr->reboot) |
| return 0; |
| |
| /* no sharing if partition OverSubscribe=EXCLUSIVE */ |
| if (part_max_share == 0) { |
| job_ptr->details->whole_node |= WHOLE_NODE_REQUIRED; |
| job_ptr->details->share_res = 0; |
| return 0; |
| } |
| |
| /* sharing if partition OverSubscribe=FORCE with count > 1 */ |
| if ((part_max_share & SHARED_FORCE) && |
| ((part_max_share & (~SHARED_FORCE)) > 1)) { |
| job_ptr->details->share_res = 1; |
| return 1; |
| } |
| |
| if (running_cons_tres()) { |
| if ((job_ptr->details->share_res == 0) || |
| (job_ptr->details->whole_node & WHOLE_NODE_REQUIRED)) { |
| job_ptr->details->share_res = 0; |
| return 0; |
| } |
| return 1; |
| } else { |
| job_ptr->details->whole_node |= WHOLE_NODE_REQUIRED; |
| if (part_max_share == 1) { /* partition is OverSubscribe=NO */ |
| job_ptr->details->share_res = 0; |
| return 0; |
| } |
| /* share if the user requested it */ |
| if (job_ptr->details->share_res == 1) |
| return 1; |
| job_ptr->details->share_res = 0; |
| return 0; |
| } |
| } |
| |
| typedef struct { |
| job_record_t *job_ptr; |
| bitstr_t *usable_node_mask; |
| } foreach_filter_by_node_t; |
| |
| static int _foreach_filter_by_node_owner(void *x, void *arg) |
| { |
| job_record_t *job_ptr2 = x; |
| foreach_filter_by_node_t *argstruct = arg; |
| job_record_t *job_ptr = argstruct->job_ptr; |
| bitstr_t *usable_node_mask = argstruct->usable_node_mask; |
| |
| if (IS_JOB_PENDING(job_ptr2) || IS_JOB_COMPLETED(job_ptr2) || |
| (job_ptr->user_id == job_ptr2->user_id) || !job_ptr2->node_bitmap) |
| return 0; |
| |
| bit_and_not(usable_node_mask, job_ptr2->node_bitmap); |
| |
| return 0; |
| } |
| |
| /* |
| * Remove nodes from consideration for allocation based upon "ownership" by |
| * other users |
| * job_ptr IN - Job to be scheduled |
| * usable_node_mask IN/OUT - Nodes available for use by this job's user |
| */ |
| extern void filter_by_node_owner(job_record_t *job_ptr, |
| bitstr_t *usable_node_mask) |
| { |
| node_record_t *node_ptr; |
| int i; |
| foreach_filter_by_node_t argstruct = { .job_ptr = job_ptr, |
| .usable_node_mask = |
| usable_node_mask }; |
| |
| if ((job_ptr->details->whole_node & WHOLE_NODE_USER) || |
| (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)) { |
| /* Need to remove all nodes allocated to any active job from |
| * any other user */ |
| list_for_each(job_list, _foreach_filter_by_node_owner, |
| &argstruct); |
| return; |
| } |
| |
| /* Need to filter out any nodes exclusively allocated to other users */ |
| for (i = 0; (node_ptr = next_node(&i)); i++) { |
| if ((node_ptr->owner != NO_VAL) && |
| (node_ptr->owner != job_ptr->user_id)) |
| bit_clear(usable_node_mask, node_ptr->index); |
| } |
| } |
| |
| /* |
| * Remove nodes from consideration for allocation based upon "mcs" by |
| * other users |
| * job_ptr IN - Job to be scheduled |
| * usable_node_mask IN/OUT - Nodes available for use by this job's mcs |
| */ |
| extern void filter_by_node_mcs(job_record_t *job_ptr, int mcs_select, |
| bitstr_t *usable_node_mask) |
| { |
| node_record_t *node_ptr; |
| int i; |
| |
| /* Need to filter out any nodes allocated with other mcs */ |
| if (job_ptr->mcs_label && (mcs_select == 1)) { |
| for (i = 0; (node_ptr = next_node(&i)); i++) { |
| /* if there is a mcs_label -> OK if it's the same */ |
| if ((node_ptr->mcs_label != NULL) && |
| xstrcmp(node_ptr->mcs_label,job_ptr->mcs_label)) { |
| bit_clear(usable_node_mask, node_ptr->index); |
| } |
| /* if no mcs_label -> OK if no jobs running */ |
| if ((node_ptr->mcs_label == NULL) && |
| (node_ptr->run_job_cnt != 0)) { |
| bit_clear(usable_node_mask, node_ptr->index); |
| } |
| } |
| } else { |
| for (i = 0; (node_ptr = next_node(&i)); i++) { |
| if (node_ptr->mcs_label != NULL) { |
| bit_clear(usable_node_mask, node_ptr->index); |
| } |
| } |
| } |
| } |
| |
| /* |
| * Remove nodes from the "avail_node_bitmap" which need to be rebooted in order |
| * to be used if the job's "delay_boot" time has not yet been reached. |
| */ |
| static void _filter_by_node_feature(job_record_t *job_ptr, |
| struct node_set *node_set_ptr, |
| int node_set_size) |
| { |
| int i; |
| |
| if ((job_ptr->details == NULL) || |
| ((job_ptr->details->begin_time != 0) && |
| ((job_ptr->details->begin_time + job_ptr->delay_boot) <= |
| time(NULL)))) |
| return; |
| |
| for (i = 0; i < node_set_size; i++) { |
| if (node_set_ptr[i].flags & NODE_SET_REBOOT) { |
| bit_and_not(avail_node_bitmap, |
| node_set_ptr[i].my_bitmap); |
| } |
| } |
| } |
| |
| static void _find_qos_grp_node_bitmap(job_record_t *job_ptr, |
| slurmdb_qos_rec_t *qos_ptr, |
| bitstr_t **grp_node_bitmap, |
| bool *per_grp_limit, |
| bool *per_user_limit, |
| bool *per_acct_limit) |
| { |
| slurmdb_used_limits_t *used_limits = NULL; |
| |
| if (!qos_ptr || !qos_ptr->usage) |
| return; |
| |
| if (!*per_grp_limit && |
| qos_ptr->usage->grp_node_bitmap && |
| (qos_ptr->grp_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)) { |
| *per_grp_limit = true; |
| *grp_node_bitmap = bit_copy(qos_ptr->usage->grp_node_bitmap); |
| } |
| |
| if (!*per_user_limit && |
| (qos_ptr->max_tres_pu_ctld[TRES_ARRAY_NODE] != INFINITE64)) { |
| *per_user_limit = true; |
| used_limits = acct_policy_get_user_used_limits( |
| &qos_ptr->usage->user_limit_list, |
| job_ptr->user_id); |
| if (used_limits && used_limits->node_bitmap) { |
| if (*grp_node_bitmap) |
| bit_or(*grp_node_bitmap, |
| used_limits->node_bitmap); |
| else |
| *grp_node_bitmap = |
| bit_copy(used_limits->node_bitmap); |
| } |
| } |
| |
| if (!*per_acct_limit && |
| job_ptr->assoc_ptr && |
| (qos_ptr->max_tres_pa_ctld[TRES_ARRAY_NODE] != INFINITE64)) { |
| *per_acct_limit = true; |
| used_limits = acct_policy_get_acct_used_limits( |
| &qos_ptr->usage->acct_limit_list, |
| job_ptr->assoc_ptr->acct); |
| if (used_limits && used_limits->node_bitmap) { |
| if (*grp_node_bitmap) |
| bit_or(*grp_node_bitmap, |
| used_limits->node_bitmap); |
| else |
| *grp_node_bitmap = |
| bit_copy(used_limits->node_bitmap); |
| } |
| } |
| } |
| |
| /* |
| * For a given job, return a bitmap of nodes to be preferred in its allocation |
| */ |
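| /* |
| * Example (illustrative): if the job's QOS enforces a group node |
| * limit (e.g. GrpTRES=node=...) and jobs under that QOS already |
| * occupy n[1-8], this returns n[1-8] so the scheduler can pack new |
| * work onto nodes already charged against the limit before |
| * consuming additional ones. |
| */ |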
| static bitstr_t *_find_grp_node_bitmap(job_record_t *job_ptr) |
| { |
| bitstr_t *grp_node_bitmap = NULL; |
| slurmdb_qos_rec_t *qos_ptr1 = NULL, *qos_ptr2 = NULL; |
| bool per_acct_limit = false, per_user_limit = false, |
| per_grp_limit = false; |
| assoc_mgr_lock_t qos_read_locks = |
| { .assoc = READ_LOCK, .qos = READ_LOCK }; |
| slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr; |
| |
| /* check to see if we are enforcing associations */ |
| if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) |
| return NULL; |
| |
| assoc_mgr_lock(&qos_read_locks); |
| |
| acct_policy_set_qos_order(job_ptr, &qos_ptr1, &qos_ptr2); |
| |
| _find_qos_grp_node_bitmap(job_ptr, qos_ptr1, &grp_node_bitmap, |
| &per_grp_limit, |
| &per_user_limit, |
| &per_acct_limit); |
| |
| _find_qos_grp_node_bitmap(job_ptr, qos_ptr2, &grp_node_bitmap, |
| &per_grp_limit, |
| &per_user_limit, |
| &per_acct_limit); |
| |
| while (assoc_ptr && assoc_ptr->usage && !per_grp_limit) { |
| if (assoc_ptr->usage->grp_node_bitmap && |
| (assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE] != INFINITE64)) { |
| per_grp_limit = true; |
| if (grp_node_bitmap) |
| bit_or(grp_node_bitmap, |
| assoc_ptr->usage->grp_node_bitmap); |
| else |
| grp_node_bitmap = bit_copy(assoc_ptr->usage-> |
| grp_node_bitmap); |
| break; |
| } |
| assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; |
| } |
| |
| assoc_mgr_unlock(&qos_read_locks); |
| |
| return grp_node_bitmap; |
| } |
| |
| /* |
| * If the job has required feature counts, then accumulate those |
| * required resources using multiple calls to _pick_best_nodes() |
| * and adding those selected nodes to the job's required node list. |
| * Upon completion, return job's requirements to match the values |
| * which were in effect upon calling this function. |
| * Input and output are the same as _pick_best_nodes(). |
| */ |
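| /* |
| * Example (illustrative): --constraint="[rack1*2&rack2*4]" makes each |
| * counted term its own _pick_best_nodes() pass (2 nodes with rack1, |
| * then 4 with rack2), accumulating the chosen nodes into the job's |
| * required node list before the final full selection below. |
| */ |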
| static int _get_req_features(struct node_set *node_set_ptr, int node_set_size, |
| bitstr_t **select_bitmap, job_record_t *job_ptr, |
| part_record_t *part_ptr, uint32_t min_nodes, |
| uint32_t max_nodes, uint32_t req_nodes, |
| bool test_only, list_t **preemptee_job_list, |
| bool can_reboot, bool submission) |
| { |
| uint32_t saved_min_nodes, saved_job_min_nodes, saved_job_num_tasks; |
| bitstr_t *saved_req_node_bitmap = NULL; |
| bitstr_t *inactive_bitmap = NULL; |
| uint32_t saved_min_cpus, saved_req_nodes; |
| int resv_rc = SLURM_SUCCESS, tmp_node_set_size; |
| int mcs_select = 0; |
| struct node_set *tmp_node_set_ptr, *prev_node_set_ptr; |
| int error_code = SLURM_SUCCESS, i; |
| bitstr_t *feature_bitmap, *accumulate_bitmap = NULL; |
| bitstr_t *save_avail_node_bitmap = NULL, *resv_bitmap = NULL; |
| bitstr_t *save_share_node_bitmap = NULL; |
| list_t *preemptee_candidates = NULL; |
| bool old_feat_change = false; |
| bool has_xand = false; |
| bool resv_overlap = false; |
| resv_exc_t resv_exc = { 0 }; |
| /* |
| * Mark nodes reserved for other jobs as off-limits for this job. |
| * If the job has a reservation, we've already limited the contents |
| * of select_bitmap to those nodes. Assume node reboot required |
| * since we have not selected the compute nodes yet. |
| */ |
| if (job_ptr->resv_name == NULL) { |
| time_t start_res = time(NULL); |
| resv_rc = job_test_resv(job_ptr, &start_res, false, |
| &resv_bitmap, &resv_exc, |
| &resv_overlap, true); |
| if ((resv_rc == ESLURM_NODES_BUSY) || |
| (resv_rc == ESLURM_RESERVATION_MAINT)) { |
| save_avail_node_bitmap = avail_node_bitmap; |
| avail_node_bitmap = bit_alloc(node_record_count); |
| FREE_NULL_BITMAP(resv_bitmap); |
| /* |
| * Continue executing through _pick_best_nodes() below |
| * in order to reject the job if it can never run |
| */ |
| } else if (resv_rc != SLURM_SUCCESS) { |
| FREE_NULL_BITMAP(resv_bitmap); |
| reservation_delete_resv_exc_parts(&resv_exc); |
| return ESLURM_NODES_BUSY; /* reserved */ |
| } else if (resv_bitmap && |
| (!bit_equal(resv_bitmap, avail_node_bitmap))) { |
| bit_and(resv_bitmap, avail_node_bitmap); |
| save_avail_node_bitmap = avail_node_bitmap; |
| if (slurm_conf.debug_flags & DEBUG_FLAG_RESERVATION && |
| !bit_equal(avail_node_bitmap, resv_bitmap)) { |
| bitstr_t *removed_nodes = |
| bit_copy(save_avail_node_bitmap); |
| bit_and_not(removed_nodes, resv_bitmap); |
| log_flag(RESERVATION, "Advanced reservation removed nodes:%s from consideration for %pJ", |
| bitmap2node_name(removed_nodes), |
| job_ptr); |
| FREE_NULL_BITMAP(removed_nodes); |
| } |
| avail_node_bitmap = resv_bitmap; |
| resv_bitmap = NULL; |
| } else { |
| FREE_NULL_BITMAP(resv_bitmap); |
| } |
| } else { |
| time_t start_res = time(NULL); |
| /* |
| * We do not care about return value. |
| * We are just interested in resv_exc being filled in |
| */ |
| (void) job_test_resv(job_ptr, &start_res, false, &resv_bitmap, |
| &resv_exc, &resv_overlap, true); |
| FREE_NULL_BITMAP(resv_bitmap); |
| } |
| |
| if (submission) |
| resv_overlap = false; |
| |
| if (!save_avail_node_bitmap) |
| save_avail_node_bitmap = bit_copy(avail_node_bitmap); |
| save_share_node_bitmap = bit_copy(share_node_bitmap); |
| filter_by_node_owner(job_ptr, share_node_bitmap); |
| |
| if (can_reboot && !test_only) |
| _filter_by_node_feature(job_ptr, node_set_ptr, node_set_size); |
| |
| if (!test_only) { |
| mcs_select = slurm_mcs_get_select(job_ptr); |
| filter_by_node_mcs(job_ptr, mcs_select, share_node_bitmap); |
| } |
| |
| if (!test_only) { |
| hres_filter(job_ptr, avail_node_bitmap); |
| } |
| |
| /* save job and request state */ |
| saved_min_nodes = min_nodes; |
| saved_req_nodes = req_nodes; |
| saved_job_min_nodes = job_ptr->details->min_nodes; |
| if (job_ptr->details->req_node_bitmap) { |
| accumulate_bitmap = job_ptr->details->req_node_bitmap; |
| saved_req_node_bitmap = bit_copy(accumulate_bitmap); |
| job_ptr->details->req_node_bitmap = NULL; |
| } |
| saved_min_cpus = job_ptr->details->min_cpus; |
| /* |
| * Don't mess with max_cpus here since it is only set to be a limit |
| * and not user configurable. |
| */ |
| job_ptr->details->min_cpus = 1; |
| tmp_node_set_ptr = xcalloc((node_set_size * 2), sizeof(struct node_set)); |
| |
| /* Accumulate nodes with required feature counts. */ |
| preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); |
| if (job_ptr->details->feature_list_use) { |
| list_itr_t *feat_iter; |
| job_feature_t *feat_ptr; |
| int last_paren_cnt = 0, last_paren_opt = FEATURE_OP_AND; |
| bitstr_t *paren_bitmap = NULL, *work_bitmap; |
| uint64_t smallest_min_mem = INFINITE64; |
| uint64_t orig_req_mem = job_ptr->details->pn_min_memory; |
| bool feat_change = false; |
| |
| feat_iter = list_iterator_create( |
| job_ptr->details->feature_list_use); |
| while ((feat_ptr = list_next(feat_iter))) { |
| bool sort_again = false; |
| if (last_paren_cnt < feat_ptr->paren) { |
| /* Start of expression in parenthesis */ |
| if (paren_bitmap) { |
| error("%s@%d: %pJ has bad feature expression: %s", |
| __func__, __LINE__, job_ptr, |
| job_ptr->details->features_use); |
| FREE_NULL_BITMAP(paren_bitmap); |
| } |
| feat_change |= feat_ptr->changeable; |
| paren_bitmap = |
| bit_copy(feat_ptr->node_bitmap_avail); |
| last_paren_opt = feat_ptr->op_code; |
| last_paren_cnt = feat_ptr->paren; |
| continue; |
| } else if (last_paren_cnt > 0) { |
| feat_change |= feat_ptr->changeable; |
| if (last_paren_opt == FEATURE_OP_AND) { |
| bit_and(paren_bitmap, |
| feat_ptr->node_bitmap_avail); |
| } else { |
| bit_or(paren_bitmap, |
| feat_ptr->node_bitmap_avail); |
| } |
| last_paren_opt = feat_ptr->op_code; |
| last_paren_cnt = feat_ptr->paren; |
| if (last_paren_cnt) |
| continue; |
| work_bitmap = paren_bitmap; |
| } else { |
| /* Outside of parenthesis */ |
| feat_change = feat_ptr->changeable; |
| work_bitmap = feat_ptr->node_bitmap_avail; |
| } |
| if (feat_ptr->count == 0) { |
| FREE_NULL_BITMAP(paren_bitmap); |
| continue; |
| } |
| tmp_node_set_size = 0; |
| /* |
| * _pick_best_nodes() is destructive of the node_set |
| * data structure, so we need to make a copy and then |
| * purge it |
| */ |
| for (i = 0; i < node_set_size; i++) { |
| if (!bit_overlap_any(node_set_ptr[i].my_bitmap, |
| work_bitmap)) |
| continue; |
| tmp_node_set_ptr[tmp_node_set_size]. |
| cpus_per_node = |
| node_set_ptr[i].cpus_per_node; |
| tmp_node_set_ptr[tmp_node_set_size]. |
| real_memory = |
| node_set_ptr[i].real_memory; |
| tmp_node_set_ptr[tmp_node_set_size].node_weight = |
| node_set_ptr[i].node_weight; |
| tmp_node_set_ptr[tmp_node_set_size].sched_weight = |
| node_set_ptr[i].sched_weight; |
| tmp_node_set_ptr[tmp_node_set_size].flags = |
| node_set_ptr[i].flags; |
| tmp_node_set_ptr[tmp_node_set_size].features = |
| xstrdup(node_set_ptr[i].features); |
| tmp_node_set_ptr[tmp_node_set_size]. |
| feature_bits = |
| bit_copy(node_set_ptr[i].feature_bits); |
| tmp_node_set_ptr[tmp_node_set_size].my_bitmap = |
| bit_copy(node_set_ptr[i].my_bitmap); |
| bit_and(tmp_node_set_ptr[tmp_node_set_size]. |
| my_bitmap, work_bitmap); |
| if (accumulate_bitmap && has_xand) { |
| bit_and_not(tmp_node_set_ptr[ |
| tmp_node_set_size].my_bitmap, |
| accumulate_bitmap); |
| } |
| tmp_node_set_ptr[tmp_node_set_size].node_cnt = |
| bit_set_count(tmp_node_set_ptr |
| [tmp_node_set_size].my_bitmap); |
| prev_node_set_ptr = tmp_node_set_ptr + |
| tmp_node_set_size; |
| tmp_node_set_size++; |
| |
| if (test_only || !can_reboot || |
| (prev_node_set_ptr->flags & |
| NODE_SET_REBOOT)) |
| continue; |
| inactive_bitmap = |
| bit_copy(node_set_ptr[i].my_bitmap); |
| bit_and_not(inactive_bitmap, |
| feat_ptr->node_bitmap_active); |
| if (bit_ffs(inactive_bitmap) == -1) { |
| /* No inactive nodes (require reboot) */ |
| FREE_NULL_BITMAP(inactive_bitmap); |
| continue; |
| } |
| sort_again = true; |
| if (bit_equal(prev_node_set_ptr->my_bitmap, |
| inactive_bitmap)) { |
| prev_node_set_ptr->flags |= |
| NODE_SET_REBOOT; |
| FREE_NULL_BITMAP(inactive_bitmap); |
| continue; |
| } |
| tmp_node_set_ptr[tmp_node_set_size]. |
| cpus_per_node = |
| node_set_ptr[i].cpus_per_node; |
| tmp_node_set_ptr[tmp_node_set_size]. |
| real_memory = |
| node_set_ptr[i].real_memory; |
| tmp_node_set_ptr[tmp_node_set_size].flags |= |
| NODE_SET_REBOOT; |
| tmp_node_set_ptr[tmp_node_set_size].features = |
| xstrdup(node_set_ptr[i].features); |
| tmp_node_set_ptr[tmp_node_set_size]. |
| feature_bits = |
| bit_copy(node_set_ptr[i].feature_bits); |
| tmp_node_set_ptr[tmp_node_set_size].my_bitmap = |
| bit_copy(tmp_node_set_ptr |
| [tmp_node_set_size-1].my_bitmap); |
| bit_and(tmp_node_set_ptr[tmp_node_set_size]. |
| my_bitmap, inactive_bitmap); |
| tmp_node_set_ptr[tmp_node_set_size].node_cnt = |
| bit_set_count(tmp_node_set_ptr |
| [tmp_node_set_size].my_bitmap); |
| bit_and_not(tmp_node_set_ptr[tmp_node_set_size-1]. |
| my_bitmap, inactive_bitmap); |
| tmp_node_set_ptr[tmp_node_set_size-1].node_cnt = |
| bit_set_count(tmp_node_set_ptr |
| [tmp_node_set_size-1].my_bitmap); |
| tmp_node_set_size++; |
| FREE_NULL_BITMAP(inactive_bitmap); |
| } |
| FREE_NULL_BITMAP(paren_bitmap); |
| feature_bitmap = NULL; |
| min_nodes = feat_ptr->count; |
| req_nodes = feat_ptr->count; |
| saved_job_num_tasks = job_ptr->details->num_tasks; |
| job_ptr->details->min_nodes = feat_ptr->count; |
| job_ptr->details->min_cpus = feat_ptr->count; |
| /* |
| * Ensure that num_tasks is accurate if ntasks_per_node |
| * is set |
| */ |
| if (job_ptr->details->ntasks_per_node) |
| job_ptr->details->num_tasks = min_nodes * |
| job_ptr->details->ntasks_per_node; |
| FREE_NULL_LIST(*preemptee_job_list); |
| job_ptr->details->pn_min_memory = orig_req_mem; |
| if (sort_again) { |
| for (i = 0; i < tmp_node_set_size; i++) |
| _set_sched_weight(tmp_node_set_ptr + i); |
| qsort(tmp_node_set_ptr, tmp_node_set_size, |
| sizeof(struct node_set), _sort_node_set); |
| } |
| error_code = _pick_best_nodes(tmp_node_set_ptr, |
| tmp_node_set_size, &feature_bitmap, |
| job_ptr, part_ptr, min_nodes, |
| max_nodes, req_nodes, test_only, |
| preemptee_candidates, |
| preemptee_job_list, false, |
| &resv_exc, resv_overlap); |
| job_ptr->details->num_tasks = saved_job_num_tasks; |
| if (job_ptr->details->pn_min_memory) { |
| if (job_ptr->details->pn_min_memory < |
| smallest_min_mem) |
| smallest_min_mem = |
| job_ptr->details->pn_min_memory; |
| else |
| job_ptr->details->pn_min_memory = |
| smallest_min_mem; |
| } |
| #if _DEBUG |
| { |
| char *tmp_str = bitmap2node_name(feature_bitmap); |
| info("%pJ needs %u nodes with feature %s, using %s, error_code=%d", |
| job_ptr, feat_ptr->count, feat_ptr->name, |
| tmp_str, error_code); |
| xfree(tmp_str); |
| } |
| #endif |
| for (i = 0; i < tmp_node_set_size; i++) { |
| xfree(tmp_node_set_ptr[i].features); |
| FREE_NULL_BITMAP(tmp_node_set_ptr[i]. |
| feature_bits); |
| FREE_NULL_BITMAP(tmp_node_set_ptr[i]. |
| my_bitmap); |
| } |
| if (error_code != SLURM_SUCCESS) { |
| FREE_NULL_BITMAP(feature_bitmap); |
| break; |
| } |
| if (feature_bitmap) { |
| if (feat_ptr->op_code == FEATURE_OP_XAND) |
| has_xand = true; |
| if (has_xand) { |
| if (old_feat_change && feat_change) { |
| error_code = |
| ESLURM_MULTI_KNL_CONSTRAINT; |
| break; |
| } |
| old_feat_change |= feat_change; |
| /* |
| * Don't make nodes required since we |
| * check value on each call to |
| * _pick_best_nodes() |
| */ |
| } else if (job_ptr->details->req_node_bitmap) { |
| bit_or(job_ptr->details-> |
| req_node_bitmap, |
| feature_bitmap); |
| } else { |
| job_ptr->details->req_node_bitmap = |
| bit_copy(feature_bitmap); |
| } |
| if (accumulate_bitmap) { |
| bit_or(accumulate_bitmap, |
| feature_bitmap); |
| FREE_NULL_BITMAP(feature_bitmap); |
| } else |
| accumulate_bitmap = feature_bitmap; |
| } |
| } |
| list_iterator_destroy(feat_iter); |
| if (paren_bitmap) { |
| error("%s@%d: %pJ has bad feature expression: %s", |
| __func__, __LINE__, job_ptr, |
| job_ptr->details->features_use); |
| FREE_NULL_BITMAP(paren_bitmap); |
| } |
| } |
| |
| /* restore most of job state and accumulate remaining resources */ |
| if (saved_req_node_bitmap) { |
| FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); |
| job_ptr->details->req_node_bitmap = |
| bit_copy(saved_req_node_bitmap); |
| } |
| if (accumulate_bitmap) { |
| uint32_t node_cnt; |
| if (job_ptr->details->req_node_bitmap) { |
| bit_or(job_ptr->details->req_node_bitmap, |
| accumulate_bitmap); |
| FREE_NULL_BITMAP(accumulate_bitmap); |
| } else |
| job_ptr->details->req_node_bitmap = accumulate_bitmap; |
| node_cnt = bit_set_count(job_ptr->details->req_node_bitmap); |
| job_ptr->details->min_cpus = MAX(saved_min_cpus, node_cnt); |
| min_nodes = MAX(saved_min_nodes, node_cnt); |
| job_ptr->details->min_nodes = min_nodes; |
| req_nodes = MAX(min_nodes, req_nodes); |
| if (req_nodes > max_nodes) |
| error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } else { |
| min_nodes = saved_min_nodes; |
| req_nodes = saved_req_nodes; |
| job_ptr->details->min_cpus = saved_min_cpus; |
| job_ptr->details->min_nodes = saved_job_min_nodes; |
| } |
| |
| #if _DEBUG |
| { |
| char *tmp_str = bitmap2node_name(job_ptr->details->req_node_bitmap); |
| info("%pJ requires %d:%d:%d req_nodes:%s err:%u", |
| job_ptr, min_nodes, req_nodes, max_nodes, tmp_str, error_code); |
| xfree(tmp_str); |
| } |
| #endif |
| xfree(tmp_node_set_ptr); |
| if (error_code == SLURM_SUCCESS) { |
| FREE_NULL_LIST(*preemptee_job_list); |
| error_code = _pick_best_nodes(node_set_ptr, node_set_size, |
| select_bitmap, job_ptr, part_ptr, min_nodes, |
| max_nodes, req_nodes, test_only, |
| preemptee_candidates, preemptee_job_list, |
| has_xand, &resv_exc, resv_overlap); |
| } |
| |
| if ((resv_rc == ESLURM_RESERVATION_MAINT) && |
| (error_code == ESLURM_NODE_NOT_AVAIL)) |
| error_code = ESLURM_RESERVATION_MAINT; |
| #if _DEBUG |
| { |
| char *tmp_str = bitmap2node_name(*select_bitmap); |
| info("%pJ allocated nodes:%s err:%u", job_ptr, tmp_str, error_code); |
| xfree(tmp_str); |
| } |
| #endif |
| |
| FREE_NULL_LIST(preemptee_candidates); |
| |
| /* restore job's initial required node bitmap */ |
| FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); |
| job_ptr->details->req_node_bitmap = saved_req_node_bitmap; |
| job_ptr->details->min_cpus = saved_min_cpus; |
| job_ptr->details->min_nodes = saved_job_min_nodes; |
| |
| /* Restore available node bitmap, ignoring reservations */ |
| if (save_avail_node_bitmap) { |
| FREE_NULL_BITMAP(avail_node_bitmap); |
| avail_node_bitmap = save_avail_node_bitmap; |
| } |
| if (save_share_node_bitmap) { |
| FREE_NULL_BITMAP(share_node_bitmap); |
| share_node_bitmap = save_share_node_bitmap; |
| } |
| reservation_delete_resv_exc_parts(&resv_exc); |
| |
| return error_code; |
| } |
| |
| static void _sync_node_weight(struct node_set *node_set_ptr, int node_set_size) |
| { |
| node_record_t *node_ptr; |
| |
| for (int s = 0; s < node_set_size; s++) { |
| if (!node_set_ptr[s].my_bitmap) |
| continue; /* No nodes in this set */ |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(node_set_ptr[s].my_bitmap, |
| &i)); |
| i++) { |
| node_ptr->sched_weight = node_set_ptr[s].sched_weight; |
| } |
| } |
| } |
| |
| static int _bit_or_cond_internal(void *x, void *arg) |
| { |
| job_record_t *job_ptr = (job_record_t *)x; |
| bitstr_t *bitmap = (bitstr_t *)arg; |
| |
| if (!IS_JOB_RUNNING(job_ptr) || job_ptr->details->share_res || |
| !job_ptr->job_resrcs) |
| return 0; |
| |
| bit_or(bitmap, job_ptr->job_resrcs->node_bitmap); |
| |
| return 0; |
| } |
| |
| static void _bit_or_cond(job_record_t *job_ptr, bitstr_t *bitmap) |
| { |
| if (!job_ptr->het_job_list) |
| _bit_or_cond_internal(job_ptr, bitmap); |
| else |
| list_for_each_nobreak(job_ptr->het_job_list, |
| _bit_or_cond_internal, bitmap); |
| } |
| |
| /* |
| * _pick_best_nodes - from a weight order list of all nodes satisfying a |
| * job's specifications, select the "best" for use |
| * IN node_set_ptr - pointer to node specification information |
| * IN node_set_size - number of entries in records pointed to by node_set_ptr |
| * OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP |
| * IN job_ptr - pointer to job being scheduled |
| * IN part_ptr - pointer to the partition in which the job is being scheduled |
| * IN min_nodes - minimum count of nodes required by the job |
| * IN max_nodes - maximum count of nodes required by the job (0==no limit) |
| * IN req_nodes - requested (or desired) count of nodes |
| * IN test_only - do not actually allocate resources |
| * IN/OUT preemptee_job_list - list of pointers to jobs to be preempted |
| * IN resv_exc_ptr - Various TRES which cannot be used; |
| * NULL on first entry |
| * IN has_xand - set if the constraint list includes XAND operators *and* |
| * we have already satisfied them all |
| * IN resv_overlap - designated reservation overlaps another reservation |
| * RET SLURM_SUCCESS on success, |
| * ESLURM_NODES_BUSY if request can not be satisfied now, |
| * ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if request can never |
| * be satisfied, |
| * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be |
| * initiated until the partition's configuration changes or |
| * ESLURM_NODE_NOT_AVAIL if required nodes are DOWN or DRAINED |
| * ESLURM_RESERVATION_BUSY if requested reservation overlaps another |
| * NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap |
| * Notes: The algorithm is |
| * 1) If required node list is specified, determine implicitly required |
| * processor and node count |
| * 2) Determine how many disjoint required "features" are represented |
| * (e.g. "FS1|FS2|FS3") |
| * 3) For each feature: find matching node table entries, identify nodes |
| * that are up and available (idle or shared) and add them to a bit |
| * map |
| * 4) select_g_job_test() to select the "best" of those based upon |
| * topology and/or workload |
| * 5) If request can't be satisfied now, execute select_g_job_test() |
| * against the list of nodes that exist in any state (perhaps DOWN |
| * DRAINED or ALLOCATED) to determine if the request can |
| * ever be satisfied. |
| */ |
| static int _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, |
| bitstr_t **select_bitmap, job_record_t *job_ptr, |
| part_record_t *part_ptr, uint32_t min_nodes, |
| uint32_t max_nodes, uint32_t req_nodes, |
| bool test_only, list_t *preemptee_candidates, |
| list_t **preemptee_job_list, bool has_xand, |
| resv_exc_t *resv_exc_ptr, bool resv_overlap) |
| { |
| int error_code = SLURM_SUCCESS, i, j, pick_code = SLURM_SUCCESS; |
| int total_nodes = 0, avail_nodes = 0; |
| bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL; |
| bitstr_t *backup_bitmap = NULL; |
| bitstr_t *possible_bitmap = NULL; |
| bitstr_t *node_set_map; |
| int max_feature, min_feature; |
| bool runable_ever = false; /* Job can ever run */ |
| bool runable_avail = false; /* Job can run with available nodes */ |
| bool tried_sched = false; /* Tried to schedule with avail nodes */ |
| bool preempt_flag = false; |
| bool nodes_busy = false; |
| bool licenses_unavailable = false; |
| int shared = 0, select_mode; |
| list_t *preemptee_cand = NULL; |
| |
| /* |
| * Since the job could have multiple features and might not request |
| * memory, we need to track the minimum memory across the selected |
| * features (cf. commit 700e7b1d4e9). |
| * If no memory is requested but we are running with SELECT_*_MEMORY |
| * and the request spans nodes of different memory sizes, we must |
| * reset pn_min_memory because select_g_job_test() can alter it. |
| * Otherwise the order of constraints would matter: the first pass |
| * here would set pn_min_memory from the first constraint, and if |
| * that value were not smaller than all the other requests, they |
| * would fail. We also must track the memory for accounting; the |
| * next two variables do this for us. |
| */ |
| uint64_t smallest_min_mem = INFINITE64; |
| uint64_t orig_req_mem = job_ptr->details->pn_min_memory; |
| |
| if (test_only) |
| select_mode = SELECT_MODE_TEST_ONLY; |
| else |
| select_mode = SELECT_MODE_RUN_NOW; |
| |
| if ((job_ptr->details->min_nodes == 0) && |
| (job_ptr->details->max_nodes == 0)) { |
| /* Zero compute node job (burst buffer use only) */ |
| avail_bitmap = bit_alloc(node_record_count); |
| pick_code = select_g_job_test(job_ptr, |
| avail_bitmap, |
| 0, 0, 0, |
| select_mode, |
| preemptee_candidates, |
| preemptee_job_list, |
| resv_exc_ptr, |
| NULL); |
| |
| if (pick_code == SLURM_SUCCESS) { |
| *select_bitmap = avail_bitmap; |
| return SLURM_SUCCESS; |
| } else { |
| FREE_NULL_BITMAP(avail_bitmap); |
| if (pick_code == ESLURM_LICENSES_UNAVAILABLE) |
| return ESLURM_LICENSES_UNAVAILABLE; |
| else |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } |
| } else if (node_set_size == 0) { |
| info("%s: empty node set for selection", __func__); |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } |
| |
| shared = _resolve_shared_status(job_ptr, part_ptr->max_share); |
| |
| /* |
| * If job preemption is enabled, then do NOT limit the set of available |
| * nodes by their current 'sharable' or 'idle' setting |
| */ |
| preempt_flag = slurm_preemption_enabled(); |
| |
| if (job_ptr->details->req_node_bitmap) { /* specific nodes required */ |
| /* |
| * We have already confirmed that all of these nodes have a |
| * usable configuration and are in the proper partition. |
| * Check that these nodes can be used by this job. |
| */ |
| if (min_nodes != 0) { |
| total_nodes = bit_set_count( |
| job_ptr->details->req_node_bitmap); |
| } |
| if (total_nodes > max_nodes) { /* exceeds node limit */ |
| return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| } |
| if ((job_ptr->details->core_spec != NO_VAL16) && |
| ((job_ptr->details->core_spec & CORE_SPEC_THREAD) == 0)) { |
| i = bit_ffs(job_ptr->details->req_node_bitmap); |
| if (i >= 0) { |
| j = node_record_table_ptr[i]->tot_cores; |
| } |
| if ((i >= 0) && (job_ptr->details->core_spec >= j)) { |
| if (part_ptr->name) { |
| info("%s: %pJ never runnable in partition %s", |
| __func__, job_ptr, |
| part_ptr->name); |
| } else { |
| info("%s: %pJ never runnable", |
| __func__, job_ptr); |
| } |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } |
| } |
| |
| /* |
| * Check the availability of these nodes. |
| * Should we check memory availability on these nodes? |
| */ |
| if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_node_bitmap)) { |
| return ESLURM_NODE_NOT_AVAIL; |
| } |
| |
| /* |
| * Still must go through select_g_job_test() to determine the |
| * validity of request and/or perform set-up before job launch |
| */ |
| total_nodes = 0; /* reinitialize */ |
| } |
| |
| /* identify the min and max feature values for possible exclusive OR */ |
| max_feature = -1; |
| min_feature = MAX_FEATURES; |
| for (i = 0; i < node_set_size; i++) { |
| j = bit_ffs(node_set_ptr[i].feature_bits); |
| if ((j >= 0) && (j < min_feature)) |
| min_feature = j; |
| j = bit_fls(node_set_ptr[i].feature_bits); |
| if ((j >= 0) && (j > max_feature)) |
| max_feature = j; |
| } |
| |
| debug3("%s: %pJ idle_nodes %u share_nodes %u", |
| __func__, job_ptr, bit_set_count(idle_node_bitmap), |
| bit_set_count(share_node_bitmap)); |
| |
| if (running_cons_tres()) |
| _sync_node_weight(node_set_ptr, node_set_size); |
| /* |
| * Accumulate resources for this job based upon its required |
| * features (possibly with node counts). |
| */ |
| for (j = min_feature; j <= max_feature; j++) { |
| if (job_ptr->details->req_node_bitmap) { |
| bool missing_required_nodes = false; |
| bool feature_found = false; |
| for (i = 0; i < node_set_size; i++) { |
| if (!bit_test(node_set_ptr[i].feature_bits, j)) |
| continue; |
| feature_found = true; |
| node_set_map = |
| bit_copy(node_set_ptr[i].my_bitmap); |
| |
| if ((node_set_ptr[i].flags & NODE_SET_REBOOT)) { |
| /* Node reboot required */ |
| bit_and(node_set_map, |
| idle_node_bitmap); |
| /* |
| * Powered up cloud nodes can't be |
| * rebooted to get new features. Must be |
| * powered down first. |
| */ |
| bit_and_not(node_set_map, |
| cloud_node_bitmap); |
| } |
| |
| if (avail_bitmap) { |
| bit_or(avail_bitmap, node_set_map); |
| FREE_NULL_BITMAP(node_set_map); |
| } else { |
| avail_bitmap = node_set_map; |
| } |
| |
| } |
| if (!feature_found) |
| continue; |
| if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_bitmap)) |
| missing_required_nodes = true; |
| |
| if (missing_required_nodes) |
| continue; |
| FREE_NULL_BITMAP(avail_bitmap); |
| avail_bitmap = bit_copy(job_ptr->details-> |
| req_node_bitmap); |
| bit_and_not(avail_bitmap, rs_node_bitmap); |
| } |
| for (i = 0; i < node_set_size; i++) { |
| int count1 = 0, count2 = 0; |
| if (!has_xand && |
| !bit_test(node_set_ptr[i].feature_bits, j)) { |
| if ((i+1) < node_set_size || !avail_bitmap) |
| continue; |
| else |
| goto try_sched; |
| } |
| |
| if (total_bitmap) { |
| bit_or(total_bitmap, |
| node_set_ptr[i].my_bitmap); |
| } else { |
| total_bitmap = bit_copy( |
| node_set_ptr[i].my_bitmap); |
| } |
| |
| if ((node_set_ptr[i].flags & NODE_SET_REBOOT)) { |
| /* Node reboot required */ |
| count1 = bit_set_count(node_set_ptr[i]. |
| my_bitmap); |
| bit_and(node_set_ptr[i].my_bitmap, |
| idle_node_bitmap); |
| /* |
| * Powered up cloud nodes can't be rebooted to |
| * get new features. Must be powered down first. |
| */ |
| bit_and_not(node_set_ptr[i].my_bitmap, |
| cloud_node_bitmap); |
| count2 = bit_set_count(node_set_ptr[i]. |
| my_bitmap); |
| if (count1 != count2) |
| nodes_busy = true; |
| } |
| |
| bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap); |
| if (!nodes_busy) { |
| count1 = bit_set_count(node_set_ptr[i]. |
| my_bitmap); |
| } |
| if (!preempt_flag) { |
| if (shared) { |
| bit_and(node_set_ptr[i].my_bitmap, |
| share_node_bitmap); |
| bit_and_not(node_set_ptr[i].my_bitmap, |
| cg_node_bitmap); |
| } else { |
| bit_and(node_set_ptr[i].my_bitmap, |
| idle_node_bitmap); |
| /* IDLE nodes are not COMPLETING */ |
| } |
| } else { |
| bit_and_not(node_set_ptr[i].my_bitmap, |
| cg_node_bitmap); |
| } |
| |
| /* |
| * We must skip the node *only* in the case it is |
| * rebooted with ASAP flag. |
| */ |
| bit_and_not(node_set_ptr[i].my_bitmap, |
| asap_node_bitmap); |
| |
| if (!nodes_busy) { |
| count2 = bit_set_count(node_set_ptr[i]. |
| my_bitmap); |
| if (count1 != count2) |
| nodes_busy = true; |
| } |
| if (avail_bitmap) { |
| bit_or(avail_bitmap, |
| node_set_ptr[i].my_bitmap); |
| } else { |
| avail_bitmap = bit_copy(node_set_ptr[i]. |
| my_bitmap); |
| } |
| |
| tried_sched = false; /* need to test these nodes */ |
| |
| if (running_cons_tres() && ((i + 1) < node_set_size)) { |
| /* |
| * Execute select_g_job_test() _once_ using |
| * sched_weight in node_record_t as set |
| * by _sync_node_weight() |
| */ |
| continue; |
| } |
| |
| try_sched: |
| /* NOTE: select_g_job_test() is destructive of |
| * avail_bitmap, so save a backup copy */ |
| backup_bitmap = bit_copy(avail_bitmap); |
| FREE_NULL_LIST(*preemptee_job_list); |
| if (job_ptr->details->req_node_bitmap == NULL) |
| bit_and(avail_bitmap, avail_node_bitmap); |
| |
| bit_and(avail_bitmap, share_node_bitmap); |
| |
| avail_nodes = bit_set_count(avail_bitmap); |
| if (((avail_nodes < min_nodes) || |
| ((avail_nodes >= min_nodes) && |
| (avail_nodes < req_nodes))) && |
| ((i+1) < node_set_size)) { |
| FREE_NULL_BITMAP(avail_bitmap); |
| avail_bitmap = backup_bitmap; |
| continue; /* Keep accumulating nodes */ |
| } |
| |
| /* Only preempt jobs when all possible nodes are being |
| * considered for use, otherwise we would preempt jobs |
| * to use the lowest weight nodes. */ |
| if ((i+1) < node_set_size || !preemptee_candidates) |
| preemptee_cand = NULL; |
| else if (preempt_flag) { |
| job_record_t *tmp_job_ptr = NULL; |
| list_itr_t *job_iterator; |
| job_iterator = list_iterator_create(preemptee_candidates); |
| while ((tmp_job_ptr = list_next(job_iterator))) |
| _bit_or_cond(tmp_job_ptr, avail_bitmap); |
| list_iterator_destroy(job_iterator); |
| bit_and(avail_bitmap, avail_node_bitmap); |
| bit_and(avail_bitmap, total_bitmap); |
| preemptee_cand = preemptee_candidates; |
| } else |
| preemptee_cand = preemptee_candidates; |
| |
| job_ptr->details->pn_min_memory = orig_req_mem; |
| pick_code = select_g_job_test(job_ptr, |
| avail_bitmap, |
| min_nodes, |
| max_nodes, |
| req_nodes, |
| select_mode, |
| preemptee_cand, |
| preemptee_job_list, |
| resv_exc_ptr, |
| NULL); |
| if (job_ptr->details->pn_min_memory) { |
| if (job_ptr->details->pn_min_memory < |
| smallest_min_mem) |
| smallest_min_mem = |
| job_ptr->details->pn_min_memory; |
| else |
| job_ptr->details->pn_min_memory = |
| smallest_min_mem; |
| } |
| |
| #if _DEBUG |
| { |
| char *tmp_str1 = bitmap2node_name(avail_bitmap); |
| char *tmp_str2 = bitmap2node_name(backup_bitmap); |
| info("%s: %pJ err:%d nodes:%u:%u:%u mode:%u select %s from %s", |
| __func__, job_ptr, pick_code, min_nodes, req_nodes, |
| max_nodes, select_mode, tmp_str1, tmp_str2); |
| xfree(tmp_str1); |
| xfree(tmp_str2); |
| } |
| #endif |
| |
| if (pick_code == SLURM_SUCCESS) { |
| FREE_NULL_BITMAP(backup_bitmap); |
| if (bit_set_count(avail_bitmap) > max_nodes) { |
| /* end of tests for this feature */ |
| avail_nodes = 0; |
| break; |
| } |
| FREE_NULL_BITMAP(total_bitmap); |
| FREE_NULL_BITMAP(possible_bitmap); |
| *select_bitmap = avail_bitmap; |
| return SLURM_SUCCESS; |
| } else { |
| tried_sched = true; /* test failed */ |
| FREE_NULL_BITMAP(avail_bitmap); |
| avail_bitmap = backup_bitmap; |
| } |
| } /* for (i = 0; i < node_set_size; i++) */ |
| |
| /* try to get req_nodes now for this feature */ |
| if (avail_bitmap && (!tried_sched) && |
| (avail_nodes >= min_nodes) && |
| ((job_ptr->details->req_node_bitmap == NULL) || |
| bit_super_set(job_ptr->details->req_node_bitmap, |
| avail_bitmap))) { |
| FREE_NULL_LIST(*preemptee_job_list); |
| job_ptr->details->pn_min_memory = orig_req_mem; |
| pick_code = select_g_job_test(job_ptr, avail_bitmap, |
| min_nodes, max_nodes, |
| req_nodes, |
| select_mode, |
| preemptee_candidates, |
| preemptee_job_list, |
| resv_exc_ptr, |
| NULL); |
| |
| if (job_ptr->details->pn_min_memory) { |
| if (job_ptr->details->pn_min_memory < |
| smallest_min_mem) |
| smallest_min_mem = |
| job_ptr->details->pn_min_memory; |
| else |
| job_ptr->details->pn_min_memory = |
| smallest_min_mem; |
| } |
| |
| if ((pick_code == SLURM_SUCCESS) && |
| (bit_set_count(avail_bitmap) <= max_nodes)) { |
| FREE_NULL_BITMAP(total_bitmap); |
| FREE_NULL_BITMAP(possible_bitmap); |
| *select_bitmap = avail_bitmap; |
| return SLURM_SUCCESS; |
| } |
| } |
| |
| if (pick_code == ESLURM_LICENSES_UNAVAILABLE) |
| licenses_unavailable = true; |
| |
| /* determine if job could possibly run (if all configured |
| * nodes available) */ |
| if (total_bitmap) |
| total_nodes = bit_set_count(total_bitmap); |
| if (total_bitmap && |
| (!runable_ever || !runable_avail) && |
| (total_nodes >= min_nodes) && |
| ((job_ptr->details->req_node_bitmap == NULL) || |
| (bit_super_set(job_ptr->details->req_node_bitmap, |
| total_bitmap)))) { |
| avail_nodes = bit_set_count(avail_bitmap); |
| if (!runable_avail && (avail_nodes >= min_nodes)) { |
| FREE_NULL_BITMAP(avail_bitmap); |
| avail_bitmap = bit_copy(total_bitmap); |
| bit_and(avail_bitmap, avail_node_bitmap); |
| job_ptr->details->pn_min_memory = orig_req_mem; |
| pick_code = select_g_job_test(job_ptr, |
| avail_bitmap, |
| min_nodes, |
| max_nodes, |
| req_nodes, |
| SELECT_MODE_TEST_ONLY, |
| preemptee_candidates, NULL, |
| resv_exc_ptr, |
| NULL); |
| |
| if (job_ptr->details->pn_min_memory) { |
| if (job_ptr->details->pn_min_memory < |
| smallest_min_mem) |
| smallest_min_mem = |
| job_ptr->details-> |
| pn_min_memory; |
| else |
| job_ptr->details-> |
| pn_min_memory = |
| smallest_min_mem; |
| } |
| |
| if (pick_code == SLURM_SUCCESS) { |
| runable_ever = true; |
| if (bit_set_count(avail_bitmap) <= |
| max_nodes) |
| runable_avail = true; |
| FREE_NULL_BITMAP(possible_bitmap); |
| possible_bitmap = avail_bitmap; |
| avail_bitmap = NULL; |
| } |
| } |
| if (!runable_ever) { |
| job_ptr->details->pn_min_memory = orig_req_mem; |
| pick_code = select_g_job_test(job_ptr, |
| total_bitmap, |
| min_nodes, |
| max_nodes, |
| req_nodes, |
| SELECT_MODE_TEST_ONLY, |
| preemptee_candidates, NULL, |
| resv_exc_ptr, |
| NULL); |
| |
| if (job_ptr->details->pn_min_memory) { |
| if (job_ptr->details->pn_min_memory < |
| smallest_min_mem) |
| smallest_min_mem = |
| job_ptr->details-> |
| pn_min_memory; |
| else |
| job_ptr->details-> |
| pn_min_memory = |
| smallest_min_mem; |
| } |
| |
| if (pick_code == SLURM_SUCCESS) { |
| FREE_NULL_BITMAP(possible_bitmap); |
| possible_bitmap = total_bitmap; |
| total_bitmap = NULL; |
| runable_ever = true; |
| } |
| } |
| } |
| FREE_NULL_BITMAP(avail_bitmap); |
| FREE_NULL_BITMAP(total_bitmap); |
| if (error_code != SLURM_SUCCESS) |
| break; |
| } |
| FREE_NULL_BITMAP(avail_bitmap); |
| FREE_NULL_BITMAP(total_bitmap); |
| |
/* The job is not able to start right now; return a
 * value indicating why */
| if (!runable_ever && resv_overlap && |
| (pick_code != ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE)) { |
| error_code = ESLURM_RESERVATION_BUSY; |
| return error_code; |
| } |
| if (licenses_unavailable) { |
| error_code = ESLURM_LICENSES_UNAVAILABLE; |
| } else if (!runable_ever) { |
| char *tmp; |
| |
| /* |
| * If a job requested extra_constraints, then assume |
| * that the job might be runnable at some point in the |
| * future. FIXME: This is a kludge and this assumption |
| * may be wrong. |
| */ |
| tmp = job_ptr->extra_constraints ? |
| "currently not runnable" : "never runnable"; |
| if (part_ptr->name) { |
| info("%s: %pJ %s in partition %s", |
| __func__, job_ptr, tmp, part_ptr->name); |
| } else { |
| info("%s: job %pJ %s", |
| __func__, job_ptr, tmp); |
| } |
| |
| if (pick_code == ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE) |
| error_code = pick_code; |
| else |
| error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| |
| } else if (!runable_avail && !nodes_busy) { |
| error_code = ESLURM_NODE_NOT_AVAIL; |
| } else if (job_ptr->details->req_node_bitmap && |
| bit_overlap_any(job_ptr->details->req_node_bitmap, |
| rs_node_bitmap)) { |
| error_code = ESLURM_NODES_BUSY; |
| } else if (!preempt_flag && job_ptr->details->req_node_bitmap) { |
| /* specific nodes required */ |
| if (shared) { |
| if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| share_node_bitmap)) { |
| error_code = ESLURM_NODES_BUSY; |
| } |
| if (bit_overlap_any(job_ptr->details->req_node_bitmap, |
| cg_node_bitmap)) { |
| error_code = ESLURM_NODES_BUSY; |
| } |
| } else if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| idle_node_bitmap)) { |
| error_code = ESLURM_NODES_BUSY; |
| /* Note: IDLE nodes are not COMPLETING */ |
| } |
| } else if (job_ptr->details->req_node_bitmap && |
| bit_overlap_any(job_ptr->details->req_node_bitmap, |
| cg_node_bitmap)) { |
| error_code = ESLURM_NODES_BUSY; |
| } |
| |
| if (error_code == SLURM_SUCCESS) { |
| error_code = ESLURM_NODES_BUSY; |
| } |
| |
| if (possible_bitmap && runable_ever) { |
| *select_bitmap = possible_bitmap; |
| } else { |
| FREE_NULL_BITMAP(possible_bitmap); |
| } |
| return error_code; |
| } |
| |
| static void _preempt_jobs(list_t *preemptee_job_list, bool kill_pending, |
| int *error_code, job_record_t *preemptor_ptr) |
| { |
| list_itr_t *iter; |
| job_record_t *job_ptr; |
| uint16_t mode; |
| int job_cnt = 0; |
| static time_t sched_update = 0; |
| |
| if (sched_update != slurm_conf.last_update) { |
| preempt_send_user_signal = false; |
| if (xstrcasestr(slurm_conf.preempt_params, |
| "send_user_signal") || |
| xstrcasestr(slurm_conf.slurmctld_params, |
| "preempt_send_user_signal")) |
| preempt_send_user_signal = true; |
| |
| sched_update = slurm_conf.last_update; |
| } |
| |
| iter = list_iterator_create(preemptee_job_list); |
| while ((job_ptr = list_next(iter))) { |
| mode = slurm_job_preempt_mode(job_ptr); |
| |
| if (mode == PREEMPT_MODE_OFF) { |
| error("%s: Invalid preempt_mode %u for %pJ", |
| __func__, mode, job_ptr); |
| continue; |
| } |
| |
| if ((mode == PREEMPT_MODE_SUSPEND) && |
| (slurm_conf.preempt_mode & PREEMPT_MODE_GANG)) { |
| debug("preempted %pJ suspended by gang scheduler to reclaim resources for %pJ", |
| job_ptr, preemptor_ptr); |
| job_ptr->preempt_time = time(NULL); |
| continue; |
| } |
| |
| job_cnt++; |
| if (!kill_pending) |
| continue; |
| |
| if (slurm_job_preempt(job_ptr, preemptor_ptr, mode, true) != |
| SLURM_SUCCESS) |
| continue; |
| } |
| list_iterator_destroy(iter); |
| |
| if (job_cnt > 0) |
| *error_code = ESLURM_NODES_BUSY; |
| } |
| |
| /* Return true if this job record is |
| * 1) not a job array OR |
| * 2) the first task of a job array to begin execution */ |
| static bool _first_array_task(job_record_t *job_ptr) |
| { |
| job_record_t *meta_job_ptr; |
| |
| if (job_ptr->array_task_id == NO_VAL) |
| return true; |
| |
| meta_job_ptr = find_job_record(job_ptr->array_job_id); |
| if (!meta_job_ptr || !meta_job_ptr->array_recs) { |
| error("%s: Could not find meta job record for %pJ", |
| __func__, job_ptr); |
| return true; |
| } |
| if ((meta_job_ptr->array_recs->tot_run_tasks == 1) && /* This task */ |
| (meta_job_ptr->array_recs->tot_comp_tasks == 0)) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * This job has zero node count. It is only designed to create or destroy |
| * persistent burst buffer resources. Terminate it now. |
| */ |
| static void _end_null_job(job_record_t *job_ptr) |
| { |
| time_t now = time(NULL); |
| |
| job_ptr->exit_code = 0; |
| gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req); |
| gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req_accum); |
| FREE_NULL_LIST(job_ptr->gres_list_alloc); |
| job_state_set(job_ptr, JOB_RUNNING); |
| job_ptr->bit_flags |= JOB_WAS_RUNNING; |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| xfree(job_ptr->nodes); |
| xfree(job_ptr->sched_nodes); |
| job_ptr->start_time = now; |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| job_ptr->time_last_active = now; |
| if (!job_ptr->step_list) |
| job_ptr->step_list = list_create(free_step_record); |
| |
| (void) job_array_post_sched(job_ptr, true); |
| (void) bb_g_job_begin(job_ptr); |
| job_array_start(job_ptr); |
| rebuild_job_part_list(job_ptr); |
| if ((job_ptr->mail_type & MAIL_JOB_BEGIN) && |
| ((job_ptr->mail_type & MAIL_ARRAY_TASKS) || |
| _first_array_task(job_ptr))) |
| mail_job_info(job_ptr, MAIL_JOB_BEGIN); |
| slurmctld_diag_stats.jobs_started++; |
| /* Call job_set_alloc_tres() before acct_policy_job_begin() */ |
| job_set_alloc_tres(job_ptr, false); |
| acct_policy_job_begin(job_ptr, false); |
| /* |
| * If run with slurmdbd, this is handled out of band in the job if |
| * happening right away. If the job has already become eligible and |
| * registered in the db then the start message. |
| */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| jobcomp_g_record_job_start(job_ptr); |
| prolog_slurmctld(job_ptr); |
| |
| job_ptr->end_time = now; |
| job_state_set(job_ptr, JOB_COMPLETE); |
| job_completion_logger(job_ptr, false); |
| acct_policy_job_fini(job_ptr, false); |
| if (select_g_job_fini(job_ptr) != SLURM_SUCCESS) |
| error("select_g_job_fini(%pJ): %m", job_ptr); |
| epilog_slurmctld(job_ptr); |
| } |
| |
| static void _handle_explicit_req(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| list_t **ret_gres_list = arg; |
| |
| /* Copy over the explicit gres, skip others */ |
| if (!(gres_state_job->config_flags & GRES_CONF_EXPLICIT) && |
| !gres_id_shared(gres_state_job->config_flags)) |
| return; |
| |
| if (!*ret_gres_list) |
| *ret_gres_list = list_create(gres_job_list_delete); |
| |
| list_append(*ret_gres_list, |
| gres_create_state( |
| gres_state_job, |
| GRES_STATE_SRC_STATE_PTR, |
| GRES_STATE_TYPE_JOB, |
| gres_job_state_dup(gres_state_job->gres_data))); |
| } |
| |
| static void _gres_select_explicit( |
| list_t *req_gres_list, list_t **ret_gres_list) |
| { |
| if (!req_gres_list) |
| return; |
| |
| (void) list_for_each(req_gres_list, |
| (ListForF) _handle_explicit_req, |
| ret_gres_list); |
| } |
| |
| static list_t *_handle_exclusive_gres(job_record_t *job_ptr, |
| bitstr_t *select_bitmap, bool test_only) |
| { |
| list_t *post_list = NULL; |
| node_record_t *node_ptr; |
| |
| if (test_only || !gres_get_gres_cnt()) |
| return NULL; |
| |
| xassert(job_ptr); |
| xassert(select_bitmap); |
| |
| if (!job_ptr->details || |
| !(job_ptr->details->whole_node & WHOLE_NODE_REQUIRED)) |
| return NULL; |
| |
| if (job_ptr->gres_list_req) |
| _gres_select_explicit(job_ptr->gres_list_req, &post_list); |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(select_bitmap, &i)); i++) { |
| gres_stepmgr_job_select_whole_node( |
| &post_list, |
| node_ptr->gres_list, |
| job_ptr->job_id, |
| node_ptr->name); |
| } |
| |
| return post_list; |
| } |
| |
| typedef struct { |
| uint64_t gpu_cnt; |
| int node_inx; |
| } foreach_node_gpu_args_t; |
| |
| static int _get_node_gpu_sum(void *x, void *arg) |
| { |
| foreach_node_gpu_args_t *args = arg; |
| gres_state_t *gres_job_state = x; |
| gres_job_state_t *gres_js; |
| |
| if (gres_job_state->plugin_id != gres_get_gpu_plugin_id()) |
| return SLURM_SUCCESS; |
| gres_js = gres_job_state->gres_data; |
| args->gpu_cnt += gres_js->gres_cnt_node_select[args->node_inx]; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static uint64_t _get_max_node_gpu_cnt(bitstr_t *node_bitmap, list_t* gres_list) |
| { |
| foreach_node_gpu_args_t args; |
| uint64_t max_node_gpu_cnt = 0; |
| |
| xassert(node_bitmap); |
| xassert(gres_list); |
| |
| for (int i = 0; (i = bit_ffs_from_bit(node_bitmap, i)) >= 0; i++) { |
| args.gpu_cnt = 0; |
| args.node_inx = i; |
| /* Get the sum of all gpu types on the node */ |
| list_for_each(gres_list, _get_node_gpu_sum, &args); |
| max_node_gpu_cnt = MAX(max_node_gpu_cnt, args.gpu_cnt); |
| } |
| |
| return max_node_gpu_cnt; |
| } |
| |
| static int _get_resv_mpi_ports(job_record_t *job_ptr, |
| uint16_t *orig_resv_port_cnt, |
| uint32_t node_cnt, |
| time_t now) |
| { |
| int error_code = SLURM_SUCCESS; |
| bool resv_ports_present = false; |
| |
| if (!(job_ptr->bit_flags & STEPMGR_ENABLED)) |
| return SLURM_SUCCESS; |
| |
| if (slurm_conf.mpi_params && xstrstr(slurm_conf.mpi_params, "ports=")) |
| resv_ports_present = true; |
| |
| if (resv_ports_present && |
| (job_ptr->resv_port_cnt == NO_VAL16)) { |
| if (!job_ptr->job_resrcs) { |
| error("Select plugin failed to set job resources"); |
| /* |
| * Do not attempt to allocate the select_bitmap nodes |
| * since select plugin failed to set job resources |
| */ |
| error_code = ESLURM_NODES_BUSY; |
| job_ptr->start_time = 0; |
| job_ptr->time_last_active = 0; |
| job_ptr->end_time = 0; |
| job_ptr->state_reason = WAIT_RESOURCES; |
| last_job_update = now; |
| xfree(job_ptr->state_desc); |
| return error_code; |
| } |
| |
| *orig_resv_port_cnt = job_ptr->resv_port_cnt; |
| job_ptr->resv_port_cnt = 0; |
| |
| /* |
| * reserved port count set to maximum task count on |
| * any node plus one, or if the job is exclusive give all |
| * resvered ports. |
| */ |
| if ((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) || |
| (job_ptr->details->whole_node & WHOLE_NODE_REQUIRED)) { |
| job_ptr->resv_port_cnt = |
| resv_port_get_resv_port_cnt() - 1; |
| } else if (!job_ptr->details->overcommit && |
| (job_ptr->details->num_tasks || |
| job_ptr->details->ntasks_per_node || |
| job_ptr->details->ntasks_per_tres)) { |
| for (int i = 0; i < node_cnt; i++) { |
| uint16_t tasks = |
| job_ptr->job_resrcs->tasks_per_node[i]; |
| job_ptr->resv_port_cnt = |
| MAX(job_ptr->resv_port_cnt, tasks * 2); |
| } |
| } else if (!job_ptr->details->overcommit) { |
| uint16_t max_node_cpus = 0; |
| for (int i = 0; i < node_cnt; i++) { |
| max_node_cpus = |
| MAX(max_node_cpus, |
| job_ptr->job_resrcs->cpus[i] * 2); |
| } |
| job_ptr->resv_port_cnt = max_node_cpus; |
| } else if (job_ptr->details->ntasks_per_node) { |
| job_ptr->resv_port_cnt = |
| job_ptr->details->ntasks_per_node; |
| } else if (job_ptr->details->ntasks_per_tres && |
| job_ptr->gres_list_req ) { |
| uint64_t max_gpu_per_node = |
| _get_max_node_gpu_cnt( |
| job_ptr->node_bitmap, |
| job_ptr->gres_list_req); |
| |
| if (max_gpu_per_node > slurm_conf.max_tasks_per_node) |
| max_gpu_per_node = |
| slurm_conf.max_tasks_per_node; |
| job_ptr->resv_port_cnt = |
| (uint16_t) max_gpu_per_node * |
| job_ptr->details->ntasks_per_tres; |
| } else if (job_ptr->details->num_tasks) { |
| job_ptr->resv_port_cnt = ROUNDUP( |
| job_ptr->details->num_tasks, node_cnt); |
| } else { |
| job_ptr->resv_port_cnt = ROUNDUP( |
| job_ptr->job_resrcs->ncpus, node_cnt); |
| } |
| |
| job_ptr->resv_port_cnt++; |
| } |
| if ((job_ptr->resv_port_cnt != NO_VAL16) && |
| (job_ptr->resv_port_cnt != 0)) { |
| error_code = resv_port_job_alloc(job_ptr); |
| if (error_code) { |
| job_ptr->start_time = 0; |
| job_ptr->time_last_active = 0; |
| job_ptr->end_time = 0; |
| job_ptr->state_reason = WAIT_MPI_PORTS_BUSY; |
| last_job_update = now; |
| xfree(job_ptr->state_desc); |
| } |
| } |
| |
| return error_code; |
| } |
| |
| /* |
 * select_nodes - select and allocate nodes to a specific job
 * IN job_node_select - container holding at least a pointer to the job record
 * IN test_only - if set, do not allocate nodes, just confirm they
 *	could be allocated now
 * IN submission - if set, ignore reservations
 * IN scheduler_type - which scheduler is calling this
 *	(e.g. SLURMDB_JOB_FLAG_BACKFILL, SLURMDB_JOB_FLAG_SCHED, etc.)
 * RET 0 on success, ESLURM code from slurm_errno.h otherwise
| * globals: list_part - global list of partition info |
| * default_part_loc - pointer to default partition |
| * config_list - global list of node configuration info |
| * Notes: The algorithm is |
| * 1) Build a table (node_set_ptr) of nodes with the requisite |
 *	   configuration. Each table entry includes its weight,
| * node_list, features, etc. |
| * 2) Call _pick_best_nodes() to select those nodes best satisfying |
| * the request, (e.g. best-fit or other criterion) |
| * 3) Call allocate_nodes() to perform the actual allocation |
| */ |
| extern int select_nodes(job_node_select_t *job_node_select, |
| bool test_only, bool submission, |
| uint32_t scheduler_type) |
| { |
| int bb, error_code = SLURM_SUCCESS, i, node_set_size = 0; |
| bitstr_t *select_bitmap = NULL; |
| struct node_set *node_set_ptr = NULL; |
| part_record_t *part_ptr = NULL; |
| uint8_t orig_whole_node, orig_share_res; |
| uint16_t orig_resv_port_cnt = 0; |
| uint32_t min_nodes = 0, max_nodes = 0, req_nodes = 0; |
| time_t now = time(NULL); |
| bool configuring = false; |
| list_t *preemptee_job_list = NULL; |
| uint32_t selected_node_cnt = NO_VAL; |
| uint64_t tres_req_cnt[slurmctld_tres_cnt]; |
| bool can_reboot; |
| uint32_t qos_flags = 0; |
| assoc_mgr_lock_t qos_read_lock = |
| { .assoc = READ_LOCK, .qos = READ_LOCK }; |
| assoc_mgr_lock_t job_read_locks = |
| { .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK }; |
| list_t *gres_list_pre = NULL; |
| bool gres_list_pre_set = false; |
| job_record_t *tmp_job, *job_ptr = job_node_select->job_ptr; |
| |
| xassert(job_ptr); |
| xassert(job_ptr->magic == JOB_MAGIC); |
| |
| /* |
| * The call path from _get_req_features() (called later in this |
| * function) can eventually call _resolve_shared_status(). This latter |
| * function can alter the job_ptr->details->{whole_node,share_res}. |
| * |
| * Saving the original values here and restoring them at cleanup time |
| * at the bottom of this function if needed. |
| */ |
| orig_whole_node = job_ptr->details->whole_node; |
| orig_share_res = job_ptr->details->share_res; |
| |
| if (!acct_policy_job_runnable_pre_select(job_ptr, false)) |
| return ESLURM_ACCOUNTING_POLICY; |
| |
| part_ptr = job_ptr->part_ptr; |
| |
| /* identify partition */ |
| if (part_ptr == NULL) { |
| part_ptr = find_part_record(job_ptr->partition); |
| xassert(part_ptr); |
| job_ptr->part_ptr = part_ptr; |
| error("partition pointer reset for %pJ, part %s", |
| job_ptr, job_ptr->partition); |
| } |
| |
| /* Quick check to see if this QOS is allowed on this partition. */ |
| assoc_mgr_lock(&qos_read_lock); |
| if (job_ptr->qos_ptr) |
| qos_flags = job_ptr->qos_ptr->flags; |
| if ((error_code = part_policy_valid_qos(job_ptr->part_ptr, |
| job_ptr->qos_ptr, |
| job_ptr->user_id, job_ptr)) != |
| SLURM_SUCCESS) { |
| assoc_mgr_unlock(&qos_read_lock); |
| return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| } |
| |
| /* Quick check to see if this account is allowed on this partition. */ |
| if ((error_code = part_policy_valid_acct( |
| job_ptr->part_ptr, |
| job_ptr->assoc_ptr ? job_ptr->assoc_ptr->acct : NULL, |
| job_ptr)) |
| != SLURM_SUCCESS) { |
| assoc_mgr_unlock(&qos_read_lock); |
| return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| } |
| assoc_mgr_unlock(&qos_read_lock); |
| |
| /* Quick check to see if this group is allowed on this partition. */ |
| if (!validate_group(job_ptr->part_ptr, job_ptr->user_id)) { |
| xfree(job_ptr->state_desc); |
| xstrfmtcat(job_ptr->state_desc, |
| "uid %u not in group permitted to use this partition (%s). groups allowed: %s", |
| job_ptr->user_id, job_ptr->part_ptr->name, |
| part_ptr->allow_groups); |
| debug2("%s: %s", __func__, job_ptr->state_desc); |
| job_ptr->state_reason = WAIT_ACCOUNT; |
| last_job_update = now; |
| return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| } |
| |
| if (job_ptr->priority == 0) { /* user/admin hold */ |
| if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS |
| && (job_ptr->state_reason != FAIL_BURST_BUFFER_OP) |
| && (job_ptr->state_reason != WAIT_HELD) |
| && (job_ptr->state_reason != WAIT_HELD_USER) |
| && (job_ptr->state_reason != WAIT_MAX_REQUEUE)) { |
| job_ptr->state_reason = WAIT_HELD; |
| } |
| return ESLURM_JOB_HELD; |
| } |
| |
| bb = bb_g_job_test_stage_in(job_ptr, test_only); |
| if (bb != 1) { |
| if ((bb == -1) && |
| (job_ptr->state_reason == FAIL_BURST_BUFFER_OP)) |
| return ESLURM_BURST_BUFFER_WAIT; /* Fatal BB event */ |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| if (bb == 0) |
| job_ptr->state_reason = WAIT_BURST_BUFFER_STAGING; |
| else |
| job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE; |
| return ESLURM_BURST_BUFFER_WAIT; |
| } |
| |
| if ((job_ptr->details->min_nodes == 0) && |
| (job_ptr->details->max_nodes == 0)) { |
| if (!job_ptr->burst_buffer) |
| return ESLURM_INVALID_NODE_COUNT; |
| if (!test_only) |
| _end_null_job(job_ptr); |
| return SLURM_SUCCESS; |
| } |
| |
| /* build sets of usable nodes based upon their configuration */ |
| can_reboot = node_features_g_user_update(job_ptr->user_id); |
| error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size, |
| job_node_select->err_msg, |
| test_only, can_reboot); |
| if (error_code) |
| return error_code; |
| if (node_set_ptr == NULL) /* Should never be true */ |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| |
| for (i = 0; i < node_set_size; i++) |
| _set_sched_weight(node_set_ptr + i); |
| qsort(node_set_ptr, node_set_size, sizeof(struct node_set), |
| _sort_node_set); |
| _log_node_set(job_ptr, node_set_ptr, node_set_size); |
| |
| /* ensure that selected nodes are in these node sets */ |
| if (job_ptr->details->req_node_bitmap) { |
| error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap, |
| node_set_ptr, node_set_size); |
| if (error_code) { |
| info("No nodes satisfy requirements for %pJ in partition %s", |
| job_ptr, job_ptr->part_ptr->name); |
| goto cleanup; |
| } |
| } |
| |
| /* enforce both user's and partition's node limits if the qos |
| * isn't set to override them */ |
| /* info("req: %u-%u, %u", job_ptr->details->min_nodes, */ |
| /* job_ptr->details->max_nodes, part_ptr->max_nodes); */ |
| error_code = get_node_cnts(job_ptr, qos_flags, part_ptr, |
| &min_nodes, &req_nodes, &max_nodes); |
| if ((error_code == ESLURM_ACCOUNTING_POLICY) || |
| (error_code == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE)) |
| goto cleanup; |
| else if ((error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && |
| (error_code != ESLURM_RESERVATION_MAINT)) { |
| /* Select resources for the job here */ |
| job_array_pre_sched(job_ptr); |
| if (job_ptr->job_resrcs) |
| debug2("%s: calling _get_req_features() for %pJ with not NULL job resources", |
| __func__, job_ptr); |
| error_code = _get_req_features(node_set_ptr, node_set_size, |
| &select_bitmap, job_ptr, |
| part_ptr, min_nodes, max_nodes, |
| req_nodes, test_only, |
| &preemptee_job_list, can_reboot, |
| submission); |
| } |
| |
/* Set this estimate here to give the user tools an idea
 * of how many nodes Slurm is planning on giving the job.
 * This must be done whether or not selection succeeded,
 * since it indicates how many nodes the job could run on.
 */
| if (select_bitmap) { |
| list_t *gres_list_whole_node = _handle_exclusive_gres( |
| job_ptr, select_bitmap, test_only); |
| |
| selected_node_cnt = bit_set_count(select_bitmap); |
| job_ptr->node_cnt_wag = selected_node_cnt; |
| |
| if (gres_list_whole_node) { |
| gres_list_pre_set = true; |
| gres_list_pre = job_ptr->gres_list_req; |
| job_ptr->gres_list_req = gres_list_whole_node; |
| } |
| |
| } else |
| selected_node_cnt = req_nodes; |
| |
| if (!test_only && select_bitmap && (max_powered_nodes != NO_VAL)) { |
| bitstr_t *tmp = bit_copy(select_bitmap); |
| hostlist_t *select = NULL, *need = NULL; |
| char *select_str = NULL, *need_str = NULL; |
| int32_t count, powerup_count, before_count = 0; |
| |
| /* selected and powered down */ |
| bit_and(tmp, power_down_node_bitmap); |
| powerup_count = bit_set_count(tmp); |
| if (slurm_conf.debug_flags & DEBUG_FLAG_POWER) { |
| select = bitmap2hostlist(select_bitmap); |
| select_str = slurm_hostlist_ranged_string_xmalloc( |
| select); |
| need = bitmap2hostlist(tmp); |
| need_str = slurm_hostlist_ranged_string_xmalloc(need); |
| before_count = bit_set_count(power_up_node_bitmap); |
| } |
| bit_or(tmp, power_up_node_bitmap); |
| count = bit_set_count(tmp); |
| log_flag(POWER, "Need to power up %d nodes (%s) from (%s). powered up count before: %d after: %d", |
| powerup_count, need_str, select_str, before_count, |
| count); |
| |
| if ((powerup_count > 0) && (count > max_powered_nodes)) { |
| error_code = ESLURM_MAX_POWERED_NODES; |
| log_flag(POWER, "%s: Cannot power up more nodes for %pJ due to MaxPoweredUpNodes limit", |
| __func__, job_ptr); |
| } |
| |
| FREE_NULL_BITMAP(tmp); |
| FREE_NULL_HOSTLIST(need); |
| FREE_NULL_HOSTLIST(select); |
| xfree(select_str); |
| xfree(need_str); |
| } |
| |
| memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt)); |
| tres_req_cnt[TRES_ARRAY_CPU] = |
| (uint64_t)(job_ptr->total_cpus ? |
| job_ptr->total_cpus : job_ptr->details->min_cpus); |
| tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( |
| job_ptr->job_resrcs, |
| job_ptr->details->pn_min_memory, |
| tres_req_cnt[TRES_ARRAY_CPU], |
| selected_node_cnt, job_ptr->part_ptr, |
| job_ptr->gres_list_req, |
| job_ptr->bit_flags & JOB_MEM_SET, |
| job_get_sockets_per_node(job_ptr), |
| job_ptr->details->num_tasks); |
| tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt; |
| |
| assoc_mgr_lock(&job_read_locks); |
| gres_stepmgr_set_job_tres_cnt( |
| job_ptr->gres_list_req, |
| selected_node_cnt, |
| tres_req_cnt, |
| true); |
| |
| tres_req_cnt[TRES_ARRAY_BILLING] = |
| assoc_mgr_tres_weighted(tres_req_cnt, |
| job_ptr->part_ptr->billing_weights, |
| slurm_conf.priority_flags, true); |
| |
| if (!test_only && (selected_node_cnt != NO_VAL) && |
| !acct_policy_job_runnable_post_select(job_ptr, tres_req_cnt, true)) { |
| assoc_mgr_unlock(&job_read_locks); |
/* If there was a reason we couldn't schedule beforehand, check
 * whether an accounting limit was also breached. If it was, we
 * want to override the other reason so that, when backfilling,
 * we don't reserve resources unnecessarily.
 */
| free_job_resources(&job_ptr->job_resrcs); |
| if (error_code != SLURM_SUCCESS) |
| debug2("Replacing scheduling error code for %pJ from '%s' to 'Accounting policy'", |
| job_ptr, slurm_strerror(error_code)); |
| error_code = ESLURM_ACCOUNTING_POLICY; |
| goto cleanup; |
| } |
| assoc_mgr_unlock(&job_read_locks); |
| |
| /* set up the cpu_cnt here so we can decrement it as nodes |
| * free up. total_cpus is set within _get_req_features */ |
| job_ptr->cpu_cnt = job_ptr->total_cpus; |
| |
| if (!test_only && preemptee_job_list |
| && (error_code == SLURM_SUCCESS)) { |
| job_details_t *detail_ptr = job_ptr->details; |
| time_t now = time(NULL); |
| bool kill_pending = true; |
| if ((detail_ptr->preempt_start_time != 0) && |
| (detail_ptr->preempt_start_time > |
| (now - slurm_conf.kill_wait - slurm_conf.msg_timeout))) { |
| /* Job preemption may still be in progress, |
| * do not cancel or requeue any more jobs yet */ |
| kill_pending = false; |
| } |
| _preempt_jobs(preemptee_job_list, kill_pending, &error_code, |
| job_ptr); |
| if ((error_code == ESLURM_NODES_BUSY) && kill_pending) { |
| detail_ptr->preempt_start_time = now; |
| job_ptr->preempt_in_progress = true; |
| if (job_ptr->array_recs) |
| job_ptr->array_recs->pend_run_tasks++; |
| } |
| } |
| if (error_code) { |
| /* Fatal errors for job here */ |
| if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) { |
| /* Too many nodes requested */ |
| debug3("%s: %pJ not runnable with present config", |
| __func__, job_ptr); |
| job_ptr->state_reason = WAIT_PART_NODE_LIMIT; |
| xfree(job_ptr->state_desc); |
| last_job_update = now; |
| |
| /* Non-fatal errors for job below */ |
| } else if (error_code == ESLURM_NODE_NOT_AVAIL) { |
| /* Required nodes are down or drained */ |
| char *node_str = NULL, *unavail_node = NULL; |
| bitstr_t *unavail_bitmap; |
| debug3("%s: %pJ required nodes not avail", |
| __func__, job_ptr); |
| job_ptr->state_reason = WAIT_NODE_NOT_AVAIL; |
| xfree(job_ptr->state_desc); |
| unavail_bitmap = bit_copy(avail_node_bitmap); |
| filter_by_node_owner(job_ptr, unavail_bitmap); |
| bit_not(unavail_bitmap); |
| bit_and_not(unavail_bitmap, future_node_bitmap); |
| bit_and(unavail_bitmap, part_ptr->node_bitmap); |
| bit_and_not(unavail_bitmap, up_node_bitmap); |
| if (job_ptr->details->req_node_bitmap) { |
| bit_and(unavail_bitmap, |
| job_ptr->details->req_node_bitmap); |
| } |
| if (bit_ffs(unavail_bitmap) != -1) { |
| unavail_node = bitmap2node_name(unavail_bitmap); |
| node_str = unavail_node; |
| } |
| FREE_NULL_BITMAP(unavail_bitmap); |
| if (node_str) { |
| xstrfmtcat(job_ptr->state_desc, |
| "ReqNodeNotAvail, " |
| "UnavailableNodes:%s", |
| node_str); |
| } else { |
| xstrfmtcat(job_ptr->state_desc, |
| "ReqNodeNotAvail, May be reserved " |
| "for other job"); |
| } |
| xfree(unavail_node); |
| last_job_update = now; |
| } else if (error_code == ESLURM_RESERVATION_MAINT) { |
| error_code = ESLURM_RESERVATION_BUSY; /* All reserved */ |
| job_ptr->state_reason = WAIT_NODE_NOT_AVAIL; |
| xfree(job_ptr->state_desc); |
| xstrfmtcat(job_ptr->state_desc, |
| "ReqNodeNotAvail, Reserved for maintenance"); |
| } else if ((error_code == ESLURM_RESERVATION_NOT_USABLE) || |
| (error_code == ESLURM_RESERVATION_BUSY)) { |
| job_ptr->state_reason = WAIT_RESERVATION; |
| xfree(job_ptr->state_desc); |
| } else if (error_code == ESLURM_LICENSES_UNAVAILABLE) { |
| job_ptr->state_reason = WAIT_LICENSES; |
| xfree(job_ptr->state_desc); |
| } else if ((job_ptr->state_reason == WAIT_HELD) && |
| (job_ptr->priority == 0)) { |
| /* Held by select plugin due to some failure */ |
| } else if ((error_code == |
| ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && |
| job_ptr->extra_constraints) { |
| /* |
| * If a job requested extra_constraints, then assume |
| * that the job might be runnable at some point in the |
| * future. FIXME: This is a kludge and this assumption |
| * may be wrong. |
| */ |
| job_ptr->state_reason = FAIL_CONSTRAINTS; |
| xfree(job_ptr->state_desc); |
| } else if (error_code == ESLURM_MAX_POWERED_NODES) { |
| job_ptr->state_reason = WAIT_MAX_POWERED_NODES; |
| xfree(job_ptr->state_desc); |
| } else { |
| job_ptr->state_reason = WAIT_RESOURCES; |
| xfree(job_ptr->state_desc); |
| } |
| goto cleanup; |
| } |
| |
| if (test_only) { /* set if job not highest priority */ |
| error_code = SLURM_SUCCESS; |
| goto cleanup; |
| } |
| |
| /* |
| * This job may be getting requeued, clear vestigial state information |
| * before over-writing and leaking memory or referencing old GRES or |
| * step data. |
| */ |
| job_ptr->bit_flags &= ~JOB_KILL_HURRY; |
| job_state_unset_flag(job_ptr, JOB_POWER_UP_NODE); |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| xfree(job_ptr->nodes); |
| xfree(job_ptr->sched_nodes); |
| job_ptr->exit_code = 0; |
| gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req); |
| gres_stepmgr_job_clear_alloc(job_ptr->gres_list_req_accum); |
| FREE_NULL_LIST(job_ptr->gres_list_alloc); |
| if (!job_ptr->step_list) |
| job_ptr->step_list = list_create(free_step_record); |
| |
| job_ptr->node_bitmap = select_bitmap; |
| select_bitmap = NULL; /* nothing left to free */ |
| |
| if ((error_code = _get_resv_mpi_ports(job_ptr, &orig_resv_port_cnt, |
| selected_node_cnt, now))) |
| goto cleanup; |
| |
| /* |
| * we need to have these times set to know when the endtime |
| * is for the job when we place it |
| */ |
| job_ptr->start_time = job_ptr->time_last_active = now; |
| if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT) && |
| ((job_ptr->time_limit == NO_VAL) || |
| ((job_ptr->time_limit > part_ptr->max_time) && |
| !(qos_flags & QOS_FLAG_PART_TIME_LIMIT)))) { |
| if (part_ptr->default_time != NO_VAL) |
| job_ptr->time_limit = part_ptr->default_time; |
| else |
| job_ptr->time_limit = part_ptr->max_time; |
| job_ptr->limit_set.time = 1; |
| } |
| |
| job_end_time_reset(job_ptr); |
| |
| /* |
| * job_array_post_sched() must happen before allocate_nodes() because |
| * we need the pending job array state to be copied. For example, |
| * allocate_nodes() calls license_job_get() which can modify the job's |
| * license_list if the job requested OR'd licenses. |
| */ |
| tmp_job = job_array_post_sched(job_ptr, true); |
| if (tmp_job && (tmp_job != job_ptr) && (orig_resv_port_cnt == NO_VAL16)) |
| tmp_job->resv_port_cnt = orig_resv_port_cnt; |
| |
| if (bb_g_job_begin(job_ptr) != SLURM_SUCCESS) { |
| /* Leave job queued, something is hosed */ |
| error_code = ESLURM_INVALID_BURST_BUFFER_REQUEST; |
| error("bb_g_job_begin(%pJ): %s", |
| job_ptr, slurm_strerror(error_code)); |
| job_ptr->start_time = 0; |
| job_ptr->time_last_active = 0; |
| job_ptr->end_time = 0; |
| job_ptr->priority = 0; |
| job_ptr->state_reason = WAIT_HELD; |
| last_job_update = now; |
| goto cleanup; |
| } |
| if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) { |
| /* Leave job queued, something is hosed */ |
| error("select_g_job_begin(%pJ): %m", job_ptr); |
| |
| /* Cancel previously started job */ |
| (void) bb_g_job_revoke_alloc(job_ptr); |
| |
| error_code = ESLURM_NODES_BUSY; |
| job_ptr->start_time = 0; |
| job_ptr->time_last_active = 0; |
| job_ptr->end_time = 0; |
| job_ptr->state_reason = WAIT_RESOURCES; |
| last_job_update = now; |
| goto cleanup; |
| } |
| |
| /* assign the nodes and stage_in the job */ |
| job_ptr->state_reason = WAIT_NO_REASON; |
| xfree(job_ptr->state_desc); |
| |
| if (job_ptr->job_resrcs && job_ptr->job_resrcs->nodes) { |
| job_ptr->nodes = xstrdup(job_ptr->job_resrcs->nodes); |
| } else { |
| error("Select plugin failed to set job resources, nodes"); |
| /* Do not attempt to allocate the select_bitmap nodes since |
| * select plugin failed to set job resources */ |
| |
| /* Cancel previously started job */ |
| (void) bb_g_job_revoke_alloc(job_ptr); |
| |
| error_code = ESLURM_NODES_BUSY; |
| job_ptr->start_time = 0; |
| job_ptr->time_last_active = 0; |
| job_ptr->end_time = 0; |
| job_ptr->state_reason = WAIT_RESOURCES; |
| last_job_update = now; |
| goto cleanup; |
| } |
| |
| job_ptr->db_flags &= ~SLURMDB_JOB_CLEAR_SCHED; |
| job_ptr->db_flags |= scheduler_type; |
| |
| /* This could be set in the select plugin so we want to keep the flag */ |
| configuring = IS_JOB_CONFIGURING(job_ptr); |
| |
| job_state_set(job_ptr, JOB_RUNNING); |
| job_ptr->bit_flags |= JOB_WAS_RUNNING; |
| |
| if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) { |
| error("select_g_select_nodeinfo_set(%pJ): %m", job_ptr); |
| if (!job_ptr->job_resrcs) { |
| /* If we don't exit earlier the empty job_resrcs might |
| * be dereferenced later */ |
| |
| /* Cancel previously started job */ |
| (void) bb_g_job_revoke_alloc(job_ptr); |
| |
| error_code = ESLURM_NODES_BUSY; |
| job_ptr->start_time = 0; |
| job_ptr->time_last_active = 0; |
| job_ptr->end_time = 0; |
| job_ptr->state_reason = WAIT_RESOURCES; |
| job_state_set(job_ptr, JOB_PENDING); |
| last_job_update = now; |
| goto cleanup; |
| } |
| } |
| |
| allocate_nodes(job_ptr); |
| job_array_start(job_ptr); |
| build_node_details(job_ptr, true); |
| rebuild_job_part_list(job_ptr); |
| |
| if ((job_ptr->mail_type & MAIL_JOB_BEGIN) && |
| ((job_ptr->mail_type & MAIL_ARRAY_TASKS) || |
| _first_array_task(job_ptr))) |
| mail_job_info(job_ptr, MAIL_JOB_BEGIN); |
| |
| slurmctld_diag_stats.jobs_started++; |
| |
| /* job_set_alloc_tres has to be done before acct_policy_job_begin */ |
| job_set_alloc_tres(job_ptr, false); |
| acct_policy_job_begin(job_ptr, false); |
| |
| resv_replace_update(job_ptr); |
| |
| /* |
| * If ran with slurmdbd this is handled out of band in the |
| * job if happening right away. If the job has already |
| * become eligible and registered in the db then the start message. |
| */ |
| jobacct_storage_g_job_start(acct_db_conn, job_ptr); |
| |
| jobcomp_g_record_job_start(job_ptr); |
| switch_g_job_start(job_ptr); |
| prolog_slurmctld(job_ptr); |
| reboot_job_nodes(job_ptr); |
| gs_job_start(job_ptr); |
| |
| if (bit_overlap_any(job_ptr->node_bitmap, power_down_node_bitmap)) { |
| job_state_set_flag(job_ptr, JOB_POWER_UP_NODE); |
| if (resume_job_list) { |
| uint32_t *tmp = xmalloc(sizeof(uint32_t)); |
| *tmp = job_ptr->job_id; |
| list_append(resume_job_list, tmp); |
| } |
| } |
| if (configuring || IS_JOB_POWER_UP_NODE(job_ptr) || |
| !bit_super_set(job_ptr->node_bitmap, avail_node_bitmap)) { |
| /* This handles nodes explicitly requesting node reboot */ |
| job_state_set_flag(job_ptr, JOB_CONFIGURING); |
| } |
| |
| /* |
| * Request asynchronous launch of a prolog for a |
| * non-batch job as long as the node is not configuring for |
| * a reboot first. Job state could be changed above so we need to |
| * recheck its state to see if it's currently configuring. |
| * PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC. |
| */ |
| if (!IS_JOB_CONFIGURING(job_ptr)) { |
| if (slurm_conf.prolog_flags & PROLOG_FLAG_ALLOC) |
| launch_prolog(job_ptr); |
| } |
| |
| cleanup: |
| if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap && |
| !IS_JOB_STARTED(job_ptr) && |
| (bit_ffs(job_ptr->array_recs->task_id_bitmap) != -1)) { |
| job_ptr->array_task_id = NO_VAL; |
| } |
| FREE_NULL_LIST(preemptee_job_list); |
| FREE_NULL_BITMAP(select_bitmap); |
| |
| if (node_set_ptr) { |
| for (i = 0; i < node_set_size; i++) { |
| xfree(node_set_ptr[i].features); |
| FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap); |
| FREE_NULL_BITMAP(node_set_ptr[i].feature_bits); |
| } |
| xfree(node_set_ptr); |
| } |
| |
| if (error_code != SLURM_SUCCESS) { |
| if (gres_list_pre_set && |
| (job_ptr->gres_list_req != gres_list_pre)) { |
| FREE_NULL_LIST(job_ptr->gres_list_req); |
| job_ptr->gres_list_req = gres_list_pre; |
| } |
| |
| if (orig_resv_port_cnt == NO_VAL16) |
| job_ptr->resv_port_cnt = orig_resv_port_cnt; |
| if (job_ptr->resv_ports) { |
| resv_port_job_free(job_ptr); |
| xfree(job_ptr->resv_ports); |
| } |
| FREE_NULL_BITMAP(job_ptr->node_bitmap); |
| } else |
| FREE_NULL_LIST(gres_list_pre); |
| |
| /* |
| * Unless the job is allocated resources now, we need to restore the |
| * original whole_node/share_res values since _resolve_shared_status() |
| * might have altered them during evaluation, and we don't want to |
| * propagate the changes for potential subsequent evaluations for the |
| * same job in a different partition with different configuration. |
| * |
| * NOTE: If we ever add an early return between the call to |
| * _get_req_features() and the last return below we should ensure to |
| * amend the restore logic consequently (probably copy this snippet |
| * before such early return). |
| * |
| * NOTE: We could have moved this snippet right after the call to |
| * _get_req_features(), but we need it here since after the call the |
| * error_code might change. |
| * |
| * NOTE: select_nodes() is the first common caller ancestor of the |
| * different call tree ramifications ending in _resolve_shared_status(), |
| * thus considered the appropriate spot for the save/restore logic. |
| */ |
| if (test_only || (error_code != SLURM_SUCCESS)) { |
| job_ptr->details->whole_node = orig_whole_node; |
| job_ptr->details->share_res = orig_share_res; |
| } |
| |
| return error_code; |
| } |
| |
| /* |
| * get_node_cnts - determine the number of nodes for the requested job. |
| * IN job_ptr - pointer to the job record. |
| * IN qos_flags - Flags of the job_ptr's qos. This is so we don't have to send |
| * in a pointer or lock the qos read lock before calling. |
| * IN part_ptr - pointer to the job's partition. |
| * OUT min_nodes - The minimum number of nodes for the job. |
 * OUT req_nodes - The number of nodes the select plugin should target.
| * OUT max_nodes - The max number of nodes for the job. |
| * RET SLURM_SUCCESS on success, ESLURM code from slurm_errno.h otherwise. |
| */ |
| extern int get_node_cnts(job_record_t *job_ptr, uint32_t qos_flags, |
| part_record_t *part_ptr, uint32_t *min_nodes, |
| uint32_t *req_nodes, uint32_t *max_nodes) |
| { |
| int error_code = SLURM_SUCCESS, i; |
| uint32_t acct_max_nodes; |
| uint32_t wait_reason = 0; |
| |
| xassert(job_ptr); |
| xassert(part_ptr); |
| |
| /* On BlueGene systems don't adjust the min/max node limits |
| * here. We are working on midplane values. */ |
| if (qos_flags & QOS_FLAG_PART_MIN_NODE) |
| *min_nodes = job_ptr->details->min_nodes; |
| else |
| *min_nodes = MAX(job_ptr->details->min_nodes, |
| part_ptr->min_nodes); |
| if (!job_ptr->details->max_nodes) |
| *max_nodes = part_ptr->max_nodes; |
| else if (qos_flags & QOS_FLAG_PART_MAX_NODE) |
| *max_nodes = job_ptr->details->max_nodes; |
| else |
| *max_nodes = MIN(job_ptr->details->max_nodes, |
| part_ptr->max_nodes); |
| |
| if (job_ptr->details->req_node_bitmap && job_ptr->details->max_nodes) { |
| i = bit_set_count(job_ptr->details->req_node_bitmap); |
| if (i > job_ptr->details->max_nodes) { |
| info("%pJ required node list has more nodes than the job can use (%d > %u)", |
| job_ptr, i, job_ptr->details->max_nodes); |
| error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| goto end_it; |
| } |
| } |
| |
/* Don't call functions inside MIN/MAX; the macros would evaluate
 * them multiple times. */
| acct_max_nodes = acct_policy_get_max_nodes(job_ptr, &wait_reason); |
| *max_nodes = MIN(*max_nodes, acct_max_nodes); |
| *max_nodes = MIN(*max_nodes, 500000); /* prevent overflows */ |
| |
| if (!job_ptr->limit_set.tres[TRES_ARRAY_NODE] && |
| job_ptr->details->max_nodes && |
| !(job_ptr->bit_flags & USE_MIN_NODES)) |
| *req_nodes = *max_nodes; |
| else |
| *req_nodes = *min_nodes; |
| |
| if (acct_max_nodes < *min_nodes) { |
| error_code = ESLURM_ACCOUNTING_POLICY; |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = wait_reason; |
| goto end_it; |
| } else if (*max_nodes < *min_nodes) { |
| error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; |
| goto end_it; |
| } |
| end_it: |
| return error_code; |
| } |
| |
| /* |
| * Launch prolog via RPC to slurmd. This is useful when we need to run |
 * the prolog at allocation stage. We then ask slurmd to launch the prolog
 * asynchronously and wait for the REQUEST_COMPLETE_PROLOG message from slurmd.
| */ |
| extern void launch_prolog(job_record_t *job_ptr) |
| { |
| prolog_launch_msg_t *prolog_msg_ptr; |
| uint16_t protocol_version = job_ptr->start_protocol_ver; |
| uint16_t msg_flags = 0; |
| agent_arg_t *agent_arg_ptr; |
| job_resources_t *job_resrcs_ptr; |
| slurm_cred_arg_t cred_arg; |
| node_record_t *node_ptr; |
| |
| xassert(job_ptr); |
| |
| if (job_ptr->bit_flags & EXTERNAL_JOB) |
| return; |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i)); |
| i++) { |
| if (protocol_version > node_ptr->protocol_version) |
| protocol_version = node_ptr->protocol_version; |
| if (PACK_FANOUT_ADDRS(node_ptr)) |
| msg_flags |= SLURM_PACK_ADDRS; |
| } |
| |
| prolog_msg_ptr = xmalloc(sizeof(prolog_launch_msg_t)); |
| |
| /* Locks: Write job */ |
| if ((slurm_conf.prolog_flags & PROLOG_FLAG_ALLOC) && |
| !(slurm_conf.prolog_flags & PROLOG_FLAG_NOHOLD)) { |
| job_ptr->state_reason = WAIT_PROLOG; |
| FREE_NULL_BITMAP(job_ptr->node_bitmap_pr); |
| job_ptr->node_bitmap_pr = bit_copy(job_ptr->node_bitmap); |
| } |
| |
| prolog_msg_ptr->alloc_tls_cert = xstrdup(job_ptr->alloc_tls_cert); |
| prolog_msg_ptr->job_gres_prep = |
| gres_g_prep_build_env(job_ptr->gres_list_alloc, |
| job_ptr->nodes); |
| prolog_msg_ptr->job_id = job_ptr->job_id; |
| prolog_msg_ptr->het_job_id = job_ptr->het_job_id; |
| prolog_msg_ptr->uid = job_ptr->user_id; |
| prolog_msg_ptr->gid = job_ptr->group_id; |
| if (!job_ptr->user_name) |
| job_ptr->user_name = user_from_job(job_ptr); |
| prolog_msg_ptr->nodes = xstrdup(job_ptr->nodes); |
| prolog_msg_ptr->work_dir = xstrdup(job_ptr->details->work_dir); |
| prolog_msg_ptr->x11 = job_ptr->details->x11; |
| if (prolog_msg_ptr->x11) { |
| char *x11_alloc_host = NULL; |
| prolog_msg_ptr->x11_magic_cookie = |
| xstrdup(job_ptr->details->x11_magic_cookie); |
| /* |
| * If resp_host is localhost, send slurmctld's hostname instead. |
| * This gives the compute node a better chance of getting the |
| * connection set up - otherwise it'd try to connect back to |
| * itself by mistake. |
| */ |
| if (!xstrncmp(job_ptr->resp_host, "127.", 4) || |
| !xstrcmp(job_ptr->resp_host, "::1")) { |
| char hostname[HOST_NAME_MAX]; |
| if (!gethostname(hostname, sizeof(hostname))) |
| x11_alloc_host = xstrdup(hostname); |
| } |
| if (!x11_alloc_host) |
| x11_alloc_host = xstrdup(job_ptr->resp_host); |
| prolog_msg_ptr->x11_alloc_host = x11_alloc_host; |
| prolog_msg_ptr->x11_alloc_port = job_ptr->other_port; |
| prolog_msg_ptr->x11_target = xstrdup(job_ptr->details->x11_target); |
| prolog_msg_ptr->x11_target_port = job_ptr->details->x11_target_port; |
| } |
| prolog_msg_ptr->spank_job_env_size = job_ptr->spank_job_env_size; |
| prolog_msg_ptr->spank_job_env = xduparray(job_ptr->spank_job_env_size, |
| job_ptr->spank_job_env); |
| |
| if (job_ptr->bit_flags & STEPMGR_ENABLED) { |
| node_record_t *bit_node; |
| |
| /* Only keep pointers to nodes */ |
| list_t *job_node_array = list_create(NULL); |
| for (int i = 0; |
| (bit_node = next_node_bitmap(job_ptr->node_bitmap, &i)); |
| i++) { |
| list_append(job_node_array, bit_node); |
| } |
| |
| /* |
 * Pack while we hold the locks so that we don't need to make
 * copies of job_ptr and job_node_array, since the agent queue
| * doesn't pack until sending. |
| */ |
| prolog_msg_ptr->job_ptr_buf = init_buf(BUF_SIZE); |
| job_record_pack(job_ptr, slurmctld_tres_cnt, |
| prolog_msg_ptr->job_ptr_buf, protocol_version); |
| |
| prolog_msg_ptr->job_node_array_buf = init_buf(BUF_SIZE); |
| slurm_pack_list(job_node_array, node_record_pack, |
| prolog_msg_ptr->job_node_array_buf, |
| protocol_version); |
| |
| prolog_msg_ptr->part_ptr_buf = init_buf(BUF_SIZE); |
| part_record_pack(job_ptr->part_ptr, |
| prolog_msg_ptr->part_ptr_buf, |
| protocol_version); |
| |
| FREE_NULL_LIST(job_node_array); |
| } |
| |
| xassert(job_ptr->job_resrcs); |
| job_resrcs_ptr = job_ptr->job_resrcs; |
| setup_cred_arg(&cred_arg, job_ptr); |
| cred_arg.step_id.job_id = job_ptr->job_id; |
| cred_arg.step_id.step_id = SLURM_EXTERN_CONT; |
| cred_arg.step_id.step_het_comp = NO_VAL; |
| if (job_resrcs_ptr->memory_allocated) { |
| slurm_array64_to_value_reps(job_resrcs_ptr->memory_allocated, |
| job_resrcs_ptr->nhosts, |
| &cred_arg.job_mem_alloc, |
| &cred_arg.job_mem_alloc_rep_count, |
| &cred_arg.job_mem_alloc_size); |
| } |
| |
| cred_arg.step_core_bitmap = job_resrcs_ptr->core_bitmap; |
| cred_arg.step_hostlist = job_ptr->job_resrcs->nodes; |
| |
| switch_g_extern_stepinfo(&cred_arg.switch_step, job_ptr); |
| |
| prolog_msg_ptr->cred = slurm_cred_create(&cred_arg, false, |
| protocol_version); |
| switch_g_free_stepinfo(cred_arg.switch_step); |
| xfree(cred_arg.job_mem_alloc); |
| xfree(cred_arg.job_mem_alloc_rep_count); |
| |
| if (!prolog_msg_ptr->cred) { |
| error("%s: slurm_cred_create failure for %pJ, holding job", |
| __func__, job_ptr); |
| slurm_free_prolog_launch_msg(prolog_msg_ptr); |
| job_mgr_handle_cred_failure(job_ptr); |
| return; |
| } |
| |
| agent_arg_ptr = xmalloc(sizeof(agent_arg_t)); |
| agent_arg_ptr->retry = 0; |
| agent_arg_ptr->protocol_version = protocol_version; |
| agent_arg_ptr->hostlist = hostlist_create(job_ptr->nodes); |
| agent_arg_ptr->node_count = job_ptr->node_cnt; |
| agent_arg_ptr->msg_type = REQUEST_LAUNCH_PROLOG; |
| agent_arg_ptr->msg_args = (void *) prolog_msg_ptr; |
| agent_arg_ptr->msg_flags = msg_flags; |
| |
| /* At least on a Cray we have to treat this as a real step, so |
| * this is where to do it. |
| */ |
| if (slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN) { |
| step_record_t *step_ptr = build_extern_step(job_ptr); |
| if (!step_ptr) |
| error("%s: build_extern_step failure for %pJ", |
| __func__, job_ptr); |
| } |
| |
| job_ptr->prolog_launch_time = time(NULL); |
| |
| /* Launch the RPC via agent */ |
| set_agent_arg_r_uid(agent_arg_ptr, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_arg_ptr); |
| } |
| |
| /* |
| * valid_feature_counts - validate a job's features can be satisfied |
| * by the selected nodes (NOTE: does not process MOR or XAND operators) |
| * IN job_ptr - job to operate on |
| * IN use_active - if set, then only consider nodes with the identified features |
| * active, otherwise use available features |
| * IN/OUT node_bitmap - nodes available for use, clear if unusable |
| * OUT has_mor - set if MOR/XAND found in feature expression |
| * RET SLURM_SUCCESS or error |
| */ |
| extern int valid_feature_counts(job_record_t *job_ptr, bool use_active, |
| bitstr_t *node_bitmap, bool *has_mor) |
| { |
| job_details_t *detail_ptr = job_ptr->details; |
| list_itr_t *job_feat_iter; |
| job_feature_t *job_feat_ptr; |
| int last_op = FEATURE_OP_AND, last_paren_op = FEATURE_OP_AND; |
| int last_paren_cnt = 0; |
| bitstr_t *feature_bitmap, *paren_bitmap = NULL; |
| bitstr_t *tmp_bitmap, *work_bitmap; |
| bool have_count = false, user_update; |
| int rc = SLURM_SUCCESS; |
| list_t *feature_list = NULL; |
| char *features; |
| |
| xassert(detail_ptr); |
| xassert(node_bitmap); |
| xassert(has_mor); |
| |
| /* |
 * This is used in two different ways: (1) to pick nodes where
 * features_use is set, and (2) to set the predicted start time where
 * it isn't.
| */ |
| if (detail_ptr->features_use) { |
| feature_list = detail_ptr->feature_list_use; |
| features = detail_ptr->features_use; |
| } else { |
| feature_list = detail_ptr->feature_list; |
| features = detail_ptr->features; |
| } |
| |
| *has_mor = false; |
| if (!feature_list) /* no constraints */ |
| return rc; |
| |
| user_update = node_features_g_user_update(job_ptr->user_id); |
| find_feature_nodes(feature_list, user_update); |
| feature_bitmap = bit_copy(node_bitmap); |
| work_bitmap = feature_bitmap; |
| job_feat_iter = list_iterator_create(feature_list); |
| while ((job_feat_ptr = list_next(job_feat_iter))) { |
| if (last_paren_cnt < job_feat_ptr->paren) { |
| /* Start of expression in parenthesis */ |
| /* |
| * If this pair of parentheses is inside of brackets, |
| * then this is XAND or MOR. Set last_paren_op to |
| * avoid incorrectly doing bit_and() or bit_or() at the |
| * end of parentheses. This only matters if the |
| * parentheses are the first thing inside of brackets, |
| * in which case last_op is AND or OR depending on what |
| * (if anything) came before the brackets. If the |
| * parentheses are not the first thing inside of |
| * brackets then last_op is XAND or MOR. |
| */ |
| if (job_feat_ptr->bracket && |
| (last_op != FEATURE_OP_XAND) && |
| (last_op != FEATURE_OP_MOR)) |
| last_paren_op = FEATURE_OP_XAND; |
| else |
| last_paren_op = last_op; |
| last_op = FEATURE_OP_AND; |
| if (paren_bitmap) { |
| if (job_ptr->job_id) { |
| error("%s: %pJ has bad feature expression: %s", |
| __func__, job_ptr, |
| features); |
| } else { |
| error("%s: Reservation has bad feature expression: %s", |
| __func__, features); |
| } |
| FREE_NULL_BITMAP(paren_bitmap); |
| } |
| paren_bitmap = bit_copy(node_bitmap); |
| work_bitmap = paren_bitmap; |
| } |
| |
| if (use_active) |
| tmp_bitmap = job_feat_ptr->node_bitmap_active; |
| else |
| tmp_bitmap = job_feat_ptr->node_bitmap_avail; |
| if (tmp_bitmap) { |
| /* |
| * Here we need to test the current feature's operator for |
| * MOR/XAND, not last_op. For instance, fastio&[xeon|nehalem] |
| * should ignore xeon (in valid_feature_counts), but if this |
| * were based on last_op it would see an AND operation. |
| * This only applies to features in the middle of the |
| * expression, not to the end, which is handled by the |
| * last_paren check below. |
| */ |
| if ((job_feat_ptr->op_code == FEATURE_OP_MOR) || |
| (job_feat_ptr->op_code == FEATURE_OP_XAND)) { |
| *has_mor = true; |
| } else if (last_op == FEATURE_OP_AND) { |
| bit_and(work_bitmap, tmp_bitmap); |
| } else if (last_op == FEATURE_OP_OR) { |
| bit_or(work_bitmap, tmp_bitmap); |
| } |
| } else { /* feature not found */ |
| if (last_op == FEATURE_OP_AND) |
| bit_clear_all(work_bitmap); |
| } |
| if (job_feat_ptr->count) |
| have_count = true; |
| |
| if (last_paren_cnt > job_feat_ptr->paren) { |
| /* End of expression in parenthesis */ |
| if (last_paren_op == FEATURE_OP_AND) { |
| bit_and(feature_bitmap, work_bitmap); |
| } else if (last_paren_op == FEATURE_OP_OR) { |
| bit_or(feature_bitmap, work_bitmap); |
| } else { /* FEATURE_OP_MOR or FEATURE_OP_XAND */ |
| *has_mor = true; |
| } |
| FREE_NULL_BITMAP(paren_bitmap); |
| work_bitmap = feature_bitmap; |
| } |
| |
| last_op = job_feat_ptr->op_code; |
| last_paren_cnt = job_feat_ptr->paren; |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_NODE_FEATURES) { |
| char *tmp_f, *tmp_w, *tmp_t; |
| tmp_f = bitmap2node_name(feature_bitmap); |
| tmp_w = bitmap2node_name(work_bitmap); |
| tmp_t = bitmap2node_name(tmp_bitmap); |
| log_flag(NODE_FEATURES, "%s: feature:%s feature_bitmap:%s work_bitmap:%s tmp_bitmap:%s count:%u", |
| __func__, job_feat_ptr->name, tmp_f, tmp_w, |
| tmp_t, job_feat_ptr->count); |
| xfree(tmp_f); |
| xfree(tmp_w); |
| xfree(tmp_t); |
| } |
| } |
| list_iterator_destroy(job_feat_iter); |
| if (!have_count) |
| bit_and(node_bitmap, work_bitmap); |
| FREE_NULL_BITMAP(feature_bitmap); |
| FREE_NULL_BITMAP(paren_bitmap); |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_NODE_FEATURES) { |
| char *tmp = bitmap2node_name(node_bitmap); |
| log_flag(NODE_FEATURES, "%s: NODES:%s HAS_MOR:%c status:%s", |
| __func__, tmp, (*has_mor ? 'T' : 'F'), |
| slurm_strerror(rc)); |
| xfree(tmp); |
| } |
| |
| return rc; |
| } |
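| |
| /* |
| * Illustrative note on use_active (hypothetical feature names): with |
| * use_active set, only node_bitmap_active is consulted, so a request |
| * such as "knl&quad" matches only nodes already booted in quad mode; |
| * with use_active clear, node_bitmap_avail also admits nodes that |
| * could be rebooted into quad mode. MOR/XAND terms are not resolved |
| * here; they only set *has_mor so the caller can process them later. |
| */ |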
| |
| /* |
| * job_req_node_filter - job request node filter. |
| * Clear from a bitmap the nodes which can not be used for a job, |
| * testing memory size, required features, processor count, etc. |
| * NOTE: Does not support mutually exclusive (MOR) features. |
| * It just matches the first element of a MOR list and ignores counts. |
| * IN job_ptr - pointer to the job to be scheduled |
| * IN/OUT avail_bitmap - set of nodes being considered for use |
| * RET SLURM_SUCCESS or EINVAL if the request can not be filtered |
| */ |
| extern int job_req_node_filter(job_record_t *job_ptr, |
| bitstr_t *avail_bitmap, bool test_only) |
| { |
| job_details_t *detail_ptr = job_ptr->details; |
| multi_core_data_t *mc_ptr; |
| node_record_t *node_ptr; |
| bool has_mor = false; |
| |
| if (detail_ptr == NULL) { |
| error("%s: %pJ has no details", |
| __func__, job_ptr); |
| return EINVAL; |
| } |
| |
| mc_ptr = detail_ptr->mc_ptr; |
| for (int i = 0; (node_ptr = next_node_bitmap(avail_bitmap, &i)); i++) { |
| if ((detail_ptr->pn_min_cpus > node_ptr->cpus) || |
| ((detail_ptr->pn_min_memory & (~MEM_PER_CPU)) > |
| node_ptr->real_memory) || |
| ((detail_ptr->pn_min_memory & (MEM_PER_CPU)) && |
| ((detail_ptr->pn_min_memory & (~MEM_PER_CPU)) * |
| detail_ptr->pn_min_cpus) > |
| node_ptr->real_memory) || |
| (detail_ptr->pn_min_tmp_disk > |
| node_ptr->tmp_disk)) { |
| bit_clear(avail_bitmap, i); |
| continue; |
| } |
| if (mc_ptr && |
| (((mc_ptr->sockets_per_node > node_ptr->tot_sockets) && |
| (mc_ptr->sockets_per_node != NO_VAL16)) || |
| ((mc_ptr->cores_per_socket > node_ptr->cores) && |
| (mc_ptr->cores_per_socket != NO_VAL16)) || |
| ((mc_ptr->threads_per_core > node_ptr->threads) && |
| (mc_ptr->threads_per_core != NO_VAL16)))) { |
| bit_clear(avail_bitmap, i); |
| continue; |
| } |
| } |
| |
| return valid_feature_counts(job_ptr, false, avail_bitmap, &has_mor); |
| } |
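| |
| /* |
| * Illustrative note (values assumed): pn_min_memory carries the |
| * MEM_PER_CPU flag in its high-order bit when memory was requested |
| * per CPU. With pn_min_memory = (MEM_PER_CPU | 2048) and |
| * pn_min_cpus = 4, the filter above keeps only nodes with at least |
| * 2048 * 4 = 8192 MB of real_memory. |
| */ |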
| |
| /* |
| * Split a node set record in two |
| * IN nset - array of node_set records |
| * IN config_ptr - configuration info for the nodes being added to a node set |
| * IN nset_inx_base - index of original/base node_set to split |
| * IN nset_inx - index of the new node_set record |
| * IN nset_feature_bits - feature bitmap for the new node_set record |
| * IN nset_node_bitmap - bitmap of nodes for the new node_set record |
| * IN nset_flags - flags of nodes for the new node_set record |
| */ |
| static void _split_node_set(struct node_set *nset, config_record_t *config_ptr, |
| int nset_inx_base, int nset_inx, |
| bitstr_t *nset_feature_bits, |
| bitstr_t *nset_node_bitmap, uint32_t nset_flags) |
| { |
| nset[nset_inx].cpus_per_node = config_ptr->cpus; |
| nset[nset_inx].features = xstrdup(config_ptr->feature); |
| nset[nset_inx].feature_bits = bit_copy(nset_feature_bits); |
| nset[nset_inx].flags = nset_flags; |
| nset[nset_inx].real_memory = config_ptr->real_memory; |
| nset[nset_inx].node_weight = nset[nset_inx_base].node_weight; |
| |
| /* |
| * The bitmap of this new nodeset will contain only the nodes that |
| * are present both in the original bitmap AND in the new bitmap. |
| */ |
| nset[nset_inx].my_bitmap = bit_copy(nset[nset_inx_base].my_bitmap); |
| bit_and(nset[nset_inx].my_bitmap, nset_node_bitmap); |
| nset[nset_inx].node_cnt = bit_set_count(nset[nset_inx].my_bitmap); |
| |
| /* Now we remove these nodes from the original bitmap */ |
| bit_and_not(nset[nset_inx_base].my_bitmap, nset_node_bitmap); |
| nset[nset_inx_base].node_cnt -= nset[nset_inx].node_cnt; |
| } |
| |
| /* Split from an existing node_set */ |
| static void _split_node_set2(struct node_set *nset, int idx, int *last_inx, |
| int cnt, bitstr_t *nset_bitmap, |
| uint32_t nset_flags) |
| { |
| nset[*last_inx].cpus_per_node = nset[idx].cpus_per_node; |
| nset[*last_inx].features = xstrdup(nset[idx].features); |
| nset[*last_inx].feature_bits = bit_copy(nset[idx].feature_bits); |
| nset[*last_inx].flags = nset_flags; |
| nset[*last_inx].real_memory = nset[idx].real_memory; |
| nset[*last_inx].node_weight = nset[idx].node_weight; |
| |
| nset[*last_inx].my_bitmap = bit_copy(nset[idx].my_bitmap); |
| bit_and(nset[*last_inx].my_bitmap, nset_bitmap); |
| nset[*last_inx].node_cnt = cnt; |
| |
| /* Remove the bits and count from the original set */ |
| bit_and_not(nset[idx].my_bitmap, nset_bitmap); |
| nset[idx].node_cnt -= cnt; |
| |
| (*last_inx)++; |
| } |
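| |
| /* |
| * Worked example (illustrative node names): if nset[idx] holds |
| * tux[0-9] with node_cnt = 10 and nset_bitmap selects tux[0-3] |
| * (cnt = 4), the new record at *last_inx inherits the CPU, memory |
| * and weight attributes and holds tux[0-3] with the given flags, |
| * while nset[idx] is left with tux[4-9] and node_cnt = 6. |
| */ |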
| |
| static void _apply_extra_constraints(job_record_t *job_ptr, |
| bitstr_t *usable_node_mask) |
| { |
| node_record_t *node_ptr = NULL; |
| |
| xassert(job_ptr->extra); |
| xassert(job_ptr->extra_constraints); |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(usable_node_mask, &i)); |
| i++) { |
| if (!node_ptr->extra_data) { |
| bit_clear(usable_node_mask, i); |
| continue; |
| } |
| |
| if (!extra_constraints_test(job_ptr->extra_constraints, |
| node_ptr->extra_data)) { |
| bit_clear(usable_node_mask, i); |
| continue; |
| } |
| } |
| } |
| |
| /* |
| * _build_node_list - identify which nodes could be allocated to a job |
| * based upon node features, memory, processors, etc. Note that a |
| * bitmap is set to indicate which of the job's features the |
| * nodes satisfy. |
| * IN job_ptr - pointer to the job to be scheduled |
| * OUT node_set_pptr - list of node sets which could be used for the job |
| * OUT node_set_size - number of node_set entries |
| * OUT err_msg - error message for job, caller must xfree |
| * IN test_only - true if only testing if job can be started at some point |
| * IN can_reboot - if true node can use any available feature, |
| * else job can use only active features |
| * RET error code |
| */ |
| static int _build_node_list(job_record_t *job_ptr, |
| struct node_set **node_set_pptr, |
| int *node_set_size, char **err_msg, bool test_only, |
| bool can_reboot) |
| { |
| int adj_cpus, i, node_set_inx, node_set_len, node_set_inx_base; |
| int rc, qos_cnt; |
| struct node_set *node_set_ptr, *prev_node_set_ptr; |
| config_record_t *config_ptr; |
| part_record_t *part_ptr = job_ptr->part_ptr; |
| list_itr_t *config_iterator; |
| int total_cores; |
| job_details_t *detail_ptr = job_ptr->details; |
| bitstr_t *usable_node_mask = NULL; |
| multi_core_data_t *mc_ptr = detail_ptr->mc_ptr; |
| bitstr_t *tmp_feature; |
| bitstr_t *grp_node_bitmap; |
| bool has_mor = false; |
| bool resv_overlap = false; |
| bitstr_t *node_maps[NM_TYPES] = { NULL, NULL, NULL, NULL, NULL, NULL }; |
| bitstr_t *reboot_bitmap = NULL; |
| |
| if (job_ptr->resv_name) { |
| /* |
| * Limit node selection to those in selected reservation. |
| * Assume node reboot required since we have not selected the |
| * compute nodes yet. |
| */ |
| time_t start_res = time(NULL); |
| rc = job_test_resv(job_ptr, &start_res, false, |
| &usable_node_mask, NULL, &resv_overlap, |
| true); |
| if (rc != SLURM_SUCCESS) { |
| job_ptr->state_reason = WAIT_RESERVATION; |
| xfree(job_ptr->state_desc); |
| if (rc == ESLURM_INVALID_TIME_VALUE) |
| return ESLURM_RESERVATION_NOT_USABLE; |
| |
| if (rc == ESLURM_NODES_BUSY) |
| return ESLURM_NODES_BUSY; |
| |
| if (err_msg) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("Problem using reservation"); |
| } |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } |
| if ((detail_ptr->req_node_bitmap) && |
| (!bit_super_set(detail_ptr->req_node_bitmap, |
| usable_node_mask))) { |
| job_ptr->state_reason = WAIT_RESERVATION; |
| xfree(job_ptr->state_desc); |
| FREE_NULL_BITMAP(usable_node_mask); |
| if (err_msg) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("Required nodes outside of " |
| "the reservation"); |
| } |
| return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| } |
| if (resv_overlap && bit_ffs(usable_node_mask) < 0) { |
| job_ptr->state_reason = WAIT_NODE_NOT_AVAIL; |
| xfree(job_ptr->state_desc); |
| xstrfmtcat(job_ptr->state_desc, |
| "ReqNodeNotAvail, Reserved for maintenance"); |
| FREE_NULL_BITMAP(usable_node_mask); |
| return ESLURM_RESERVATION_BUSY; /* All reserved */ |
| } |
| } |
| |
| if (detail_ptr->exc_node_bitmap) { |
| if (usable_node_mask) { |
| bit_and_not(usable_node_mask, detail_ptr->exc_node_bitmap); |
| } else { |
| usable_node_mask = |
| bit_copy(detail_ptr->exc_node_bitmap); |
| bit_not(usable_node_mask); |
| } |
| } else if (usable_node_mask == NULL) { |
| usable_node_mask = node_conf_get_active_bitmap(); |
| } |
| |
| if (!(job_ptr->bit_flags & EXTERNAL_JOB)) { |
| bit_and_not(usable_node_mask, external_node_bitmap); |
| } |
| |
| if (!test_only && job_ptr->extra_constraints) { |
| _apply_extra_constraints(job_ptr, usable_node_mask); |
| if (!bit_set_count(usable_node_mask)) { |
| rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| debug("%s: No nodes satisfy %pJ extra constraints in partition %s", |
| __func__, job_ptr, job_ptr->part_ptr->name); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_CONSTRAINTS; |
| debug2("%s: setting %pJ to \"%s\" (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(rc)); |
| FREE_NULL_BITMAP(usable_node_mask); |
| return rc; |
| } |
| } |
| |
| if ((rc = valid_feature_counts(job_ptr, false, usable_node_mask, |
| &has_mor))) { |
| info("%pJ feature requirements can not be satisfied: %s", |
| job_ptr, slurm_strerror(rc)); |
| FREE_NULL_BITMAP(usable_node_mask); |
| if (err_msg) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("Node feature requirements can not " |
| "be satisfied"); |
| } |
| return rc; |
| } |
| |
| if (can_reboot) |
| reboot_bitmap = bit_alloc(node_record_count); |
| node_set_inx = 0; |
| node_set_len = list_count(config_list) * 32 + 1; |
| node_set_ptr = xcalloc(node_set_len, sizeof(struct node_set)); |
| config_iterator = list_iterator_create(config_list); |
| while ((config_ptr = list_next(config_iterator))) { |
| bool cpus_ok = false, mem_ok = false, disk_ok = false; |
| bool job_mc_ok = false, config_filter = false; |
| total_cores = config_ptr->tot_sockets * config_ptr->cores; |
| adj_cpus = adjust_cpus_nppcu(_get_ntasks_per_core(detail_ptr), |
| detail_ptr->cpus_per_task, |
| total_cores, config_ptr->cpus); |
| if (detail_ptr->pn_min_cpus <= adj_cpus) |
| cpus_ok = true; |
| if ((detail_ptr->pn_min_memory & (~MEM_PER_CPU)) <= |
| config_ptr->real_memory) |
| mem_ok = true; |
| if (detail_ptr->pn_min_tmp_disk <= config_ptr->tmp_disk) |
| disk_ok = true; |
| if (!mc_ptr) |
| job_mc_ok = true; |
| if (mc_ptr && |
| (((mc_ptr->sockets_per_node <= config_ptr->tot_sockets) || |
| (mc_ptr->sockets_per_node == NO_VAL16)) && |
| ((mc_ptr->cores_per_socket <= config_ptr->cores) || |
| (mc_ptr->cores_per_socket == NO_VAL16)) && |
| ((mc_ptr->threads_per_core <= config_ptr->threads) || |
| (mc_ptr->threads_per_core == NO_VAL16)))) |
| job_mc_ok = true; |
| config_filter = !(cpus_ok && mem_ok && disk_ok && job_mc_ok); |
| /* |
| * since nodes can register with more resources than defined |
| * in the configuration, we want to use those higher values |
| * for scheduling, but only as needed (slower) |
| */ |
| node_set_ptr[node_set_inx].my_bitmap = |
| bit_copy(config_ptr->node_bitmap); |
| bit_and(node_set_ptr[node_set_inx].my_bitmap, |
| part_ptr->node_bitmap); |
| if (usable_node_mask) { |
| bit_and(node_set_ptr[node_set_inx].my_bitmap, |
| usable_node_mask); |
| } |
| node_set_ptr[node_set_inx].node_cnt = |
| bit_set_count(node_set_ptr[node_set_inx].my_bitmap); |
| if (node_set_ptr[node_set_inx].node_cnt == 0) { |
| debug2("%s: JobId=%u matched 0 nodes (%s) due to job partition or features", |
| __func__, job_ptr->job_id, config_ptr->nodes); |
| FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap); |
| continue; |
| } |
| |
| if (config_filter) { |
| _set_err_msg(cpus_ok, mem_ok, disk_ok, job_mc_ok, |
| err_msg); |
| debug2("%s: JobId=%u filtered all nodes (%s): %s", |
| __func__, job_ptr->job_id, config_ptr->nodes, |
| err_msg ? *err_msg : NULL); |
| FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap); |
| continue; |
| } |
| |
| if (has_mor) { |
| tmp_feature = _valid_features(job_ptr, config_ptr, |
| can_reboot, reboot_bitmap); |
| if (tmp_feature == NULL) { |
| debug2("%s: JobId=%u matched 0 nodes (%s) due to MOR job features", |
| __func__, job_ptr->job_id, |
| config_ptr->nodes); |
| FREE_NULL_BITMAP(node_set_ptr[node_set_inx]. |
| my_bitmap); |
| continue; |
| } |
| } else { |
| /* We've already filtered for AND/OR features */ |
| tmp_feature = bit_alloc(MAX_FEATURES); |
| bit_set(tmp_feature, 0); |
| } |
| /* NOTE: tmp_feature must be freed with FREE_NULL_BITMAP() to avoid a memory leak */ |
| |
| node_set_ptr[node_set_inx].cpus_per_node = |
| config_ptr->cpus; |
| node_set_ptr[node_set_inx].real_memory = |
| config_ptr->real_memory; |
| node_set_ptr[node_set_inx].node_weight = config_ptr->weight; |
| node_set_ptr[node_set_inx].features = |
| xstrdup(config_ptr->feature); |
| node_set_ptr[node_set_inx].feature_bits = tmp_feature; |
| debug2("found %u usable nodes from config containing %s", |
| node_set_ptr[node_set_inx].node_cnt, config_ptr->nodes); |
| prev_node_set_ptr = node_set_ptr + node_set_inx; |
| node_set_inx++; |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| |
| /* |
| * If we have a FLEX reservation we will want a nodeset for |
| * those nodes outside the reservation. |
| */ |
| if (job_ptr->resv_ptr && |
| (job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) && |
| job_ptr->resv_ptr->node_bitmap && |
| !bit_super_set(prev_node_set_ptr->my_bitmap, |
| job_ptr->resv_ptr->node_bitmap)) { |
| node_maps[IN_FL] = |
| bit_copy(job_ptr->resv_ptr->node_bitmap); |
| node_maps[OUT_FL] = |
| bit_copy(prev_node_set_ptr->my_bitmap); |
| bit_and_not(node_maps[OUT_FL], node_maps[IN_FL]); |
| } |
| |
| /* Identify the nodes that need reboot for use */ |
| if (!test_only && can_reboot) { |
| if (has_mor) { |
| node_maps[REBOOT] = bit_copy(reboot_bitmap); |
| } else { |
| (void) _match_feature( |
| job_ptr->details->feature_list_use, |
| &node_maps[REBOOT]); |
| } |
| /* No nodes in set require reboot */ |
| if (node_maps[REBOOT] && |
| !bit_overlap_any(prev_node_set_ptr->my_bitmap, |
| node_maps[REBOOT])) |
| FREE_NULL_BITMAP(node_maps[REBOOT]); |
| } |
| |
| /* No nodes to split from this node set */ |
| if (!node_maps[OUT_FL] && !node_maps[REBOOT]) |
| continue; |
| |
| /* Just need to split off the nodes that need reboot */ |
| if (!node_maps[OUT_FL] && node_maps[REBOOT]) { |
| if (bit_super_set(prev_node_set_ptr->my_bitmap, |
| node_maps[REBOOT])) { |
| /* All nodes in set require reboot */ |
| prev_node_set_ptr->flags = NODE_SET_REBOOT; |
| goto end_node_set; |
| } |
| node_set_inx_base = node_set_inx - 1; |
| _split_node_set(node_set_ptr, config_ptr, |
| node_set_inx_base, node_set_inx, |
| tmp_feature, node_maps[REBOOT], |
| NODE_SET_REBOOT); |
| node_set_inx++; |
| goto end_node_set; |
| } |
| |
| /* Just need to split off the nodes outside the FLEX reservation */ |
| if (node_maps[OUT_FL] && !node_maps[REBOOT]) { |
| if (bit_super_set(prev_node_set_ptr->my_bitmap, |
| node_maps[OUT_FL])) { |
| /* All nodes outside of flex reservation */ |
| prev_node_set_ptr->flags = NODE_SET_OUTSIDE_FLEX; |
| goto end_node_set; |
| } |
| node_set_inx_base = node_set_inx - 1; |
| _split_node_set(node_set_ptr, config_ptr, |
| node_set_inx_base, node_set_inx, |
| tmp_feature, node_maps[OUT_FL], |
| NODE_SET_OUTSIDE_FLEX); |
| node_set_inx++; |
| goto end_node_set; |
| } |
| |
| /* We may have to split into several subsets */ |
| if (node_maps[OUT_FL] && node_maps[REBOOT]) { |
| node_maps[IN_FL_RE] = bit_copy(node_maps[IN_FL]); |
| bit_and(node_maps[IN_FL_RE], node_maps[REBOOT]); |
| |
| node_maps[OUT_FL_RE] = bit_copy(node_maps[OUT_FL]); |
| bit_and(node_maps[OUT_FL_RE], node_maps[REBOOT]); |
| |
| node_maps[OUT_FL_NO_RE] = bit_copy(node_maps[OUT_FL]); |
| bit_and_not(node_maps[OUT_FL_NO_RE], |
| node_maps[REBOOT]); |
| } |
| |
| /* |
| * All nodes in this set should be avoided. No need to split; |
| * just set the flags and the weight. |
| */ |
| if (bit_super_set(prev_node_set_ptr->my_bitmap, |
| node_maps[IN_FL_RE])) { |
| prev_node_set_ptr->flags = NODE_SET_REBOOT; |
| goto end_node_set; |
| } |
| if (bit_super_set(prev_node_set_ptr->my_bitmap, |
| node_maps[OUT_FL_NO_RE])) { |
| prev_node_set_ptr->flags = NODE_SET_OUTSIDE_FLEX; |
| goto end_node_set; |
| } |
| if (bit_super_set(prev_node_set_ptr->my_bitmap, |
| node_maps[OUT_FL_RE])) { |
| prev_node_set_ptr->flags = (NODE_SET_OUTSIDE_FLEX | |
| NODE_SET_REBOOT); |
| goto end_node_set; |
| } |
| |
| /* |
| * At this point we split the node set record into four, |
| * in this order of priority: |
| * |
| * 1. Inside flex reservation and need to reboot |
| * 2. Outside flex reservation and NO need to reboot |
| * 3. Outside flex reservation and need to reboot |
| * 4. Available now, inside the flex reservation and NO need |
| * to reboot |
| * |
| * If there is no such reservation and no reboot is needed, |
| * the additional node sets are not created. |
| */ |
| |
| node_set_inx_base = node_set_inx - 1; |
| |
| if (node_maps[IN_FL_RE]) { |
| _split_node_set(node_set_ptr, config_ptr, |
| node_set_inx_base, node_set_inx, |
| tmp_feature, node_maps[IN_FL_RE], |
| NODE_SET_REBOOT); |
| FREE_NULL_BITMAP(node_maps[IN_FL_RE]); |
| node_set_inx++; |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| |
| if (node_maps[OUT_FL_NO_RE]) { |
| _split_node_set(node_set_ptr, config_ptr, |
| node_set_inx_base, node_set_inx, |
| tmp_feature, node_maps[OUT_FL_NO_RE], |
| (NODE_SET_OUTSIDE_FLEX)); |
| FREE_NULL_BITMAP(node_maps[OUT_FL_NO_RE]); |
| node_set_inx++; |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| |
| if (node_maps[OUT_FL_RE]) { |
| _split_node_set(node_set_ptr, config_ptr, |
| node_set_inx_base, node_set_inx, |
| tmp_feature, node_maps[OUT_FL_RE], |
| (NODE_SET_OUTSIDE_FLEX | |
| NODE_SET_REBOOT)); |
| FREE_NULL_BITMAP(node_maps[OUT_FL_RE]); |
| node_set_inx++; |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| |
| end_node_set: |
| for (i = 0; i < NM_TYPES; i++) |
| FREE_NULL_BITMAP(node_maps[i]); |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| list_iterator_destroy(config_iterator); |
| |
| /* eliminate any incomplete node_set record */ |
| xfree(node_set_ptr[node_set_inx].features); |
| FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap); |
| FREE_NULL_BITMAP(node_set_ptr[node_set_inx].feature_bits); |
| FREE_NULL_BITMAP(usable_node_mask); |
| |
| if (node_set_inx == 0) { |
| rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| info("%s: No nodes satisfy %pJ requirements in partition %s", |
| __func__, job_ptr, job_ptr->part_ptr->name); |
| xfree(node_set_ptr); |
| xfree(job_ptr->state_desc); |
| job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; |
| debug2("%s: setting %pJ to \"%s\" (%s)", |
| __func__, job_ptr, |
| job_state_reason_string(job_ptr->state_reason), |
| slurm_strerror(rc)); |
| FREE_NULL_BITMAP(reboot_bitmap); |
| return rc; |
| } |
| |
| /* |
| * Clear any message about nodes which fail to satisfy specific |
| * job requirements, since there are some nodes which can be used |
| */ |
| if (err_msg) |
| xfree(*err_msg); |
| |
| /* |
| * If any nodes are powered down or powering up, put them into a |
| * new node_sets record with a higher scheduling weight. This means |
| * we avoid scheduling jobs on powered down and powering up nodes where |
| * possible. If those are required we prefer powering up nodes over |
| * powered down nodes. |
| */ |
| for (i = (node_set_inx - 1); i >= 0; i--) { |
| int booting_cnt = bit_overlap(node_set_ptr[i].my_bitmap, |
| booting_node_bitmap); |
| if (booting_cnt == 0) |
| continue; /* no nodes powering up */ |
| if (booting_cnt == node_set_ptr[i].node_cnt) { |
| node_set_ptr[i].flags = NODE_SET_POWERING_UP; |
| continue; /* all nodes powering up */ |
| } |
| |
| /* Some nodes powering up, split record */ |
| _split_node_set2(node_set_ptr, i, &node_set_inx, booting_cnt, |
| booting_node_bitmap, NODE_SET_POWERING_UP); |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| |
| for (i = (node_set_inx-1); i >= 0; i--) { |
| int power_cnt = bit_overlap(node_set_ptr[i].my_bitmap, |
| power_down_node_bitmap); |
| if (power_cnt == 0) |
| continue; /* no nodes powered down */ |
| if (power_cnt == node_set_ptr[i].node_cnt) { |
| node_set_ptr[i].flags = NODE_SET_POWER_DN; |
| continue; /* all nodes powered down */ |
| } |
| |
| /* Some nodes powered down, others up, split record */ |
| _split_node_set2(node_set_ptr, i, &node_set_inx, power_cnt, |
| power_down_node_bitmap, NODE_SET_POWER_DN); |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| |
| grp_node_bitmap = _find_grp_node_bitmap(job_ptr); |
| |
| if (grp_node_bitmap) { |
| #if _DEBUG |
| char node_bitstr[64]; |
| bit_fmt(node_bitstr, sizeof(node_bitstr), grp_node_bitmap); |
| info("%s: _find_grp_node_bitmap() grp_node_bitmap:%s", __func__, node_bitstr); |
| #endif |
| for (i = (node_set_inx-1); i >= 0; i--) { |
| qos_cnt = bit_overlap(node_set_ptr[i].my_bitmap, |
| grp_node_bitmap); |
| if (qos_cnt == 0) { |
| node_set_ptr[i].node_weight += 1; |
| continue; /* no nodes overlap */ |
| } |
| if (qos_cnt == node_set_ptr[i].node_cnt) { |
| continue; /* all nodes overlap */ |
| } |
| /* Some nodes overlap, split record */ |
| _split_node_set2(node_set_ptr, i, &node_set_inx, |
| qos_cnt, grp_node_bitmap, |
| node_set_ptr[i].flags); |
| node_set_ptr[i].node_weight++; |
| if (node_set_inx >= node_set_len) { |
| error("%s: node_set buffer filled", __func__); |
| break; |
| } |
| } |
| FREE_NULL_BITMAP(grp_node_bitmap); |
| } |
| FREE_NULL_BITMAP(reboot_bitmap); |
| *node_set_size = node_set_inx; |
| *node_set_pptr = node_set_ptr; |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * For a given node_set, set a scheduling weight based upon a combination of |
| * node_weight and flags (e.g. try to avoid reboot). |
| * 0x30000000000 - Requires boot |
| * 0x20000000000 - Powering up |
| * 0x10000000000 - Outside of flex reservation |
| * 0x0########00 - Node weight |
| * 0x000000000## - Reserved for cons_tres, favor nodes with co-located CPU/GPU |
| */ |
| static void _set_sched_weight(struct node_set *node_set_ptr) |
| { |
| xassert(node_set_ptr); |
| |
| node_set_ptr->sched_weight = node_set_ptr->node_weight << 8; |
| node_set_ptr->sched_weight |= 0xff; |
| if ((node_set_ptr->flags & NODE_SET_REBOOT) || |
| (node_set_ptr->flags & NODE_SET_POWER_DN)) /* Boot required */ |
| node_set_ptr->sched_weight |= 0x30000000000; |
| else if ((node_set_ptr->flags & NODE_SET_POWERING_UP)) |
| node_set_ptr->sched_weight |= 0x20000000000; |
| else if (node_set_ptr->flags & NODE_SET_OUTSIDE_FLEX) |
| node_set_ptr->sched_weight |= 0x10000000000; |
| } |
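| |
| /* |
| * Worked example (illustrative): node_weight = 1 gives a base of |
| * (1 << 8) | 0xff = 0x1ff. A set needing boot then sorts as |
| * 0x300000001ff, a powering-up set as 0x200000001ff, a set outside |
| * a flex reservation as 0x100000001ff, and an immediately usable |
| * set as plain 0x1ff, so _sort_node_set() orders them: available, |
| * outside flex, powering up, needs boot. |
| */ |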
| |
| static int _sort_node_set(const void *x, const void *y) |
| { |
| struct node_set *node_set_ptr1 = (struct node_set *) x; |
| struct node_set *node_set_ptr2 = (struct node_set *) y; |
| |
| xassert(node_set_ptr1); |
| xassert(node_set_ptr2); |
| |
| if (node_set_ptr1->sched_weight < node_set_ptr2->sched_weight) |
| return -1; |
| if (node_set_ptr1->sched_weight > node_set_ptr2->sched_weight) |
| return 1; |
| return 0; |
| } |
| |
| static void _log_node_set(job_record_t *job_ptr, |
| struct node_set *node_set_ptr, |
| int node_set_size) |
| { |
| char *node_list, feature_bits[64]; |
| int i; |
| |
| if (get_log_level() < LOG_LEVEL_DEBUG2) |
| return; |
| |
| debug2("NodeSet for %pJ", job_ptr); |
| for (i = 0; i < node_set_size; i++) { |
| node_list = bitmap2node_name(node_set_ptr[i].my_bitmap); |
| if (node_set_ptr[i].feature_bits) { |
| bit_fmt(feature_bits, sizeof(feature_bits), |
| node_set_ptr[i].feature_bits); |
| } else |
| feature_bits[0] = '\0'; |
| debug2("NodeSet[%d] Nodes:%s NodeWeight:%u Flags:%u FeatureBits:%s SchedWeight:%"PRIu64, |
| i, node_list, node_set_ptr[i].node_weight, |
| node_set_ptr[i].flags, feature_bits, |
| node_set_ptr[i].sched_weight); |
| xfree(node_list); |
| } |
| } |
| |
| static void _set_err_msg(bool cpus_ok, bool mem_ok, bool disk_ok, |
| bool job_mc_ok, char **err_msg) |
| { |
| if (!err_msg) |
| return; |
| if (!cpus_ok) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("CPU count per node can not be satisfied"); |
| return; |
| } |
| if (!mem_ok) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("Memory specification can not be satisfied"); |
| return; |
| } |
| if (!disk_ok) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("Temporary disk specification can not be " |
| "satisfied"); |
| return; |
| } |
| if (!job_mc_ok) { |
| xfree(*err_msg); |
| *err_msg = xstrdup("Socket, core and/or thread specification " |
| "can not be satisfied"); |
| return; |
| } |
| } |
| |
| /* |
| * _nodes_in_sets - Determine if required nodes are included in node_set(s) |
| * IN req_bitmap - nodes specifically required by the job |
| * IN node_set_ptr - sets of valid nodes |
| * IN node_set_size - count of node_set entries |
| * RET 0 if in set, otherwise an error code |
| */ |
| static int _nodes_in_sets(bitstr_t *req_bitmap, |
| struct node_set * node_set_ptr, |
| int node_set_size) |
| { |
| bitstr_t *scratch_bitmap = NULL; |
| int error_code = SLURM_SUCCESS, i; |
| |
| for (i = 0; i < node_set_size; i++) { |
| if (scratch_bitmap) |
| bit_or(scratch_bitmap, |
| node_set_ptr[i].my_bitmap); |
| else { |
| scratch_bitmap = |
| bit_copy(node_set_ptr[i].my_bitmap); |
| } |
| } |
| |
| if ((scratch_bitmap == NULL) |
| || (bit_super_set(req_bitmap, scratch_bitmap) != 1)) |
| error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; |
| |
| FREE_NULL_BITMAP(scratch_bitmap); |
| return error_code; |
| } |
| |
| /* |
| * build_node_details - sets addresses for allocated nodes |
| * IN job_ptr - pointer to a job record |
| * IN new_alloc - set if new job allocation, cleared if state recovery |
| */ |
| extern void build_node_details(job_record_t *job_ptr, bool new_alloc) |
| { |
| hostlist_t *host_list = NULL; |
| node_record_t *node_ptr; |
| char *this_node_name; |
| int node_inx = 0; |
| |
| if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) { |
| /* No nodes allocated, we're done... */ |
| job_ptr->node_cnt = 0; |
| return; |
| } |
| |
| /* Use hostlist here to ensure ordering of info matches that of srun */ |
| if ((host_list = hostlist_create(job_ptr->nodes)) == NULL) |
| fatal("hostlist_create error for %s: %m", job_ptr->nodes); |
| job_ptr->total_nodes = job_ptr->node_cnt = hostlist_count(host_list); |
| |
| xfree(job_ptr->batch_host); |
| |
| while ((this_node_name = hostlist_shift(host_list))) { |
| if ((node_ptr = find_node_record(this_node_name))) { |
| node_inx++; |
| } else { |
| error("Invalid node %s in %pJ", |
| this_node_name, job_ptr); |
| } |
| if (!job_ptr->batch_host && !job_ptr->batch_features) { |
| /* |
| * Do not select until launch_job() as node features |
| * might be changed by node_features plugin between |
| * allocation time (now) and launch. |
| */ |
| job_ptr->batch_host = xstrdup(this_node_name); |
| } |
| free(this_node_name); |
| } |
| hostlist_destroy(host_list); |
| if (job_ptr->node_cnt != node_inx) { |
| error("Node count mismatch for %pJ (%u,%u)", |
| job_ptr, job_ptr->node_cnt, node_inx); |
| } |
| } |
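| |
| /* |
| * Illustrative note: hostlist_create()/hostlist_shift() expand a |
| * ranged string such as "tux[1-3]" into tux1, tux2, tux3 in order, |
| * which is why build_node_details() walks the hostlist rather than |
| * the node bitmap: the resulting ordering matches what srun sees. |
| */ |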
| |
| /* |
| * Set "batch_host" for this job based upon it's "batch_features" and |
| * "node_bitmap". Selection is performed on a best-effort basis (i.e. if no |
| * node satisfies the batch_features specification then pick first node). |
| * Execute this AFTER any node feature changes are made by the node_features |
| * plugin. |
| * |
| * If changes are made here, see if changes need to be made in |
| * test_job_nodes_ready(). |
| * |
| * Return SLURM_SUCCESS or error code |
| */ |
| extern int pick_batch_host(job_record_t *job_ptr) |
| { |
| int i, i_first; |
| node_record_t *node_ptr; |
| char *tmp, *tok, sep, last_sep = '&'; |
| node_feature_t *feature_ptr; |
| list_itr_t *feature_iter; |
| bitstr_t *feature_bitmap; |
| |
| if (job_ptr->batch_host) |
| return SLURM_SUCCESS; |
| |
| if (!job_ptr->node_bitmap) { |
| error("%s: %pJ lacks a node_bitmap", __func__, job_ptr); |
| return SLURM_ERROR; |
| } |
| |
| i_first = bit_ffs(job_ptr->node_bitmap); |
| if (i_first < 0) { |
| error("%s: %pJ allocated no nodes", __func__, job_ptr); |
| return SLURM_ERROR; |
| } |
| if (!job_ptr->batch_features) { |
| /* Run batch script on first node of job allocation */ |
| node_ptr = node_record_table_ptr[i_first]; |
| job_ptr->batch_host = xstrdup(node_ptr->name); |
| return SLURM_SUCCESS; |
| } |
| |
| feature_bitmap = bit_copy(job_ptr->node_bitmap); |
| tmp = xstrdup(job_ptr->batch_features); |
| tok = tmp; |
| for (i = 0; ; i++) { |
| if (tmp[i] == '&') |
| sep = '&'; |
| else if (tmp[i] == '|') |
| sep = '|'; |
| else if (tmp[i] == '\0') |
| sep = '\0'; |
| else |
| continue; |
| tmp[i] = '\0'; |
| |
| feature_iter = list_iterator_create(active_feature_list); |
| while ((feature_ptr = list_next(feature_iter))) { |
| if (xstrcmp(feature_ptr->name, tok)) |
| continue; |
| if (last_sep == '&') { |
| bit_and(feature_bitmap, |
| feature_ptr->node_bitmap); |
| } else { |
| bit_or(feature_bitmap, |
| feature_ptr->node_bitmap); |
| } |
| break; |
| } |
| list_iterator_destroy(feature_iter); |
| if (!feature_ptr) /* No match */ |
| bit_clear_all(feature_bitmap); |
| if (sep == '\0') |
| break; |
| tok = tmp + i + 1; |
| last_sep = sep; |
| } |
| xfree(tmp); |
| |
| bit_and(feature_bitmap, job_ptr->node_bitmap); |
| if ((i = bit_ffs(feature_bitmap)) >= 0) |
| node_ptr = node_record_table_ptr[i]; |
| else |
| node_ptr = node_record_table_ptr[i_first]; |
| job_ptr->batch_host = xstrdup(node_ptr->name); |
| FREE_NULL_BITMAP(feature_bitmap); |
| |
| return SLURM_SUCCESS; |
| } |
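| |
| /* |
| * Worked example (hypothetical feature names): for batch_features = |
| * "intel&gpu" the scanner above splits at '&', ANDing the nodes with |
| * active feature "intel" into feature_bitmap, then ANDing those with |
| * "gpu"; for "intel|amd" the second term is ORed instead. The result |
| * is finally ANDed with the job's node_bitmap and, if nothing |
| * matches, the first allocated node is used as a fallback. |
| */ |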
| |
| /* |
| * _valid_features - Determine if the requested features are satisfied by |
| * the available nodes. This is only used for MOR operators. |
| * IN job_ptr - job being scheduled |
| * IN config_ptr - node's configuration record |
| * IN can_reboot - if true node can use any available feature, |
| * else job can use only active features |
| * IN reboot_bitmap - bitmap of nodes requiring reboot for use (updated) |
| * RET NULL if request is not satisfied, otherwise a bitmap indicating |
| * which mutually exclusive features are satisfied. For example |
| * _valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns a bitmap with |
| * the third bit set. For another example |
| * _valid_features("[fs1|fs2|fs3|fs4]", "fs1,fs3") returns a bitmap |
| * with the first and third bits set. The function returns a bitmap |
| * with the first bit set if requirements are satisfied without a |
| * mutually exclusive feature list. |
| */ |
| static bitstr_t *_valid_features(job_record_t *job_ptr, |
| config_record_t *config_ptr, |
| bool can_reboot, bitstr_t *reboot_bitmap) |
| { |
| job_details_t *details_ptr = job_ptr->details; |
| bitstr_t *result_node_bitmap = NULL, *paren_node_bitmap = NULL; |
| bitstr_t *working_node_bitmap, *active_node_bitmap = NULL; |
| bitstr_t *tmp_node_bitmap = NULL; |
| list_itr_t *feat_iter; |
| job_feature_t *job_feat_ptr; |
| int last_op = FEATURE_OP_AND, paren_op = FEATURE_OP_AND; |
| int last_paren = 0, position = 0; |
| |
| if (details_ptr->feature_list_use == NULL) { /* no constraints */ |
| result_node_bitmap = bit_alloc(MAX_FEATURES); |
| bit_set(result_node_bitmap, 0); |
| return result_node_bitmap; |
| } |
| |
| feat_iter = list_iterator_create(details_ptr->feature_list_use); |
| while ((job_feat_ptr = list_next(feat_iter))) { |
| if (job_feat_ptr->paren > last_paren) { |
| /* Combine features within parenthesis */ |
| paren_node_bitmap = |
| bit_copy(job_feat_ptr->node_bitmap_avail); |
| if (can_reboot) |
| active_node_bitmap = bit_copy(paren_node_bitmap); |
| last_paren = job_feat_ptr->paren; |
| paren_op = job_feat_ptr->op_code; |
| /* |
| * If this pair of parentheses is inside of brackets, |
| * then this is XAND or MOR. Set last_op so that the |
| * features in parentheses are considered as XAND or |
| * MOR and are evaluated in the if at the bottom of this |
| * loop. This only matters if the parentheses are the |
| * first thing inside of brackets because last_op is |
| * initialized to AND. |
| */ |
| if (job_feat_ptr->bracket && |
| (last_op != FEATURE_OP_XAND) && |
| (last_op != FEATURE_OP_MOR)) |
| last_op = FEATURE_OP_XAND; |
| |
| while ((job_feat_ptr = list_next(feat_iter))) { |
| if ((paren_op == FEATURE_OP_AND) && |
| can_reboot) { |
| bit_and(paren_node_bitmap, |
| job_feat_ptr->node_bitmap_avail); |
| bit_and(active_node_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } else if (paren_op == FEATURE_OP_AND) { |
| bit_and(paren_node_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } else if ((paren_op == FEATURE_OP_OR) && |
| can_reboot) { |
| bit_or(paren_node_bitmap, |
| job_feat_ptr->node_bitmap_avail); |
| bit_or(active_node_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } else if (paren_op == FEATURE_OP_OR) { |
| bit_or(paren_node_bitmap, |
| job_feat_ptr->node_bitmap_active); |
| } else { |
| error("%s: Bad feature expression for %pJ: %s", |
| __func__, job_ptr, |
| details_ptr->features_use); |
| break; |
| } |
| paren_op = job_feat_ptr->op_code; |
| if (job_feat_ptr->paren < last_paren) { |
| last_paren = job_feat_ptr->paren; |
| break; |
| } |
| } |
| working_node_bitmap = paren_node_bitmap; |
| } else { |
| working_node_bitmap = job_feat_ptr->node_bitmap_avail; |
| } |
| |
| if (!job_feat_ptr) { |
| /* Ran off the end of the feature list; avoid a NULL |
| * dereference below */ |
| error("%s: Bad feature expression for %pJ: %s", |
| __func__, job_ptr, details_ptr->features_use); |
| FREE_NULL_BITMAP(active_node_bitmap); |
| FREE_NULL_BITMAP(paren_node_bitmap); |
| break; |
| } |
| if ((job_feat_ptr->op_code == FEATURE_OP_XAND) || |
| (job_feat_ptr->op_code == FEATURE_OP_MOR) || |
| ((job_feat_ptr->op_code != FEATURE_OP_XAND) && |
| (job_feat_ptr->op_code != FEATURE_OP_MOR) && |
| ((last_op == FEATURE_OP_XAND) || |
| (last_op == FEATURE_OP_MOR)))) { |
| if (bit_overlap_any(config_ptr->node_bitmap, |
| working_node_bitmap)) { |
| if (!result_node_bitmap) |
| result_node_bitmap = |
| bit_alloc(MAX_FEATURES); |
| bit_set(result_node_bitmap, position); |
| if (can_reboot && reboot_bitmap && |
| active_node_bitmap) { |
| tmp_node_bitmap = bit_copy(config_ptr-> |
| node_bitmap); |
| bit_and_not(tmp_node_bitmap, |
| active_node_bitmap); |
| bit_or(reboot_bitmap, tmp_node_bitmap); |
| FREE_NULL_BITMAP(tmp_node_bitmap); |
| } |
| } |
| position++; |
| last_op = job_feat_ptr->op_code; |
| } |
| FREE_NULL_BITMAP(active_node_bitmap); |
| FREE_NULL_BITMAP(paren_node_bitmap); |
| } |
| list_iterator_destroy(feat_iter); |
| |
| #if _DEBUG |
| { |
| char tmp[64]; |
| if (result_node_bitmap) |
| bit_fmt(tmp, sizeof(tmp), result_node_bitmap); |
| else |
| snprintf(tmp, sizeof(tmp), "NONE"); |
| info("CONFIG_FEATURE:%s FEATURE_MOR_BITS:%s", config_ptr->feature, tmp); |
| if (reboot_bitmap && (bit_ffs(reboot_bitmap) >= 0)) { |
| char *reboot_node_str = bitmap2node_name(reboot_bitmap); |
| info("REBOOT_NODES:%s", reboot_node_str); |
| xfree(reboot_node_str); |
| } |
| } |
| #endif |
| |
| return result_node_bitmap; |
| } |
| |
| /* |
| * re_kill_job - for a given job, deallocate its nodes for a second time, |
| * basically a cleanup for failed deallocate() calls |
| * IN job_ptr - pointer to terminating job (already in some COMPLETING state) |
| * globals: node_record_count - number of nodes in the system |
| * node_record_table_ptr - pointer to global node table |
| */ |
| extern void re_kill_job(job_record_t *job_ptr) |
| { |
| agent_arg_t *agent_args; |
| hostlist_t *kill_hostlist; |
| char *host_str = NULL; |
| static uint32_t last_job_id = 0; |
| node_record_t *node_ptr; |
| |
| xassert(job_ptr); |
| xassert(job_ptr->details); |
| |
| kill_hostlist = hostlist_create(NULL); |
| |
| agent_args = xmalloc(sizeof(agent_arg_t)); |
| agent_args->msg_type = REQUEST_TERMINATE_JOB; |
| agent_args->hostlist = hostlist_create(NULL); |
| agent_args->protocol_version = SLURM_PROTOCOL_VERSION; |
| agent_args->retry = 0; |
| |
| if (job_ptr->node_bitmap_cg) { |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i)); |
| i++) { |
| if (IS_NODE_DOWN(node_ptr)) { |
| /* Consider job already completed */ |
| bit_clear(job_ptr->node_bitmap_cg, |
| node_ptr->index); |
| job_update_tres_cnt(job_ptr, node_ptr->index); |
| if (node_ptr->comp_job_cnt) |
| (node_ptr->comp_job_cnt)--; |
| if ((job_ptr->node_cnt > 0) && |
| ((--job_ptr->node_cnt) == 0)) { |
| cleanup_completing(job_ptr, true); |
| last_node_update = time(NULL); |
| } |
| } else if (!IS_NODE_NO_RESPOND(node_ptr)) { |
| (void)hostlist_push_host(kill_hostlist, |
| node_ptr->name); |
| if (agent_args->protocol_version > |
| node_ptr->protocol_version) |
| agent_args->protocol_version = |
| node_ptr->protocol_version; |
| hostlist_push_host(agent_args->hostlist, |
| node_ptr->name); |
| agent_args->node_count++; |
| } |
| if (PACK_FANOUT_ADDRS(node_ptr)) |
| agent_args->msg_flags |= SLURM_PACK_ADDRS; |
| } |
| } |
| |
| if (agent_args->node_count == 0) { |
| FREE_NULL_HOSTLIST(agent_args->hostlist); |
| xfree(agent_args); |
| hostlist_destroy(kill_hostlist); |
| return; |
| } |
| hostlist_uniq(kill_hostlist); |
| host_str = hostlist_ranged_string_xmalloc(kill_hostlist); |
| if (job_ptr->job_id != last_job_id) { |
| info("Resending TERMINATE_JOB request %pJ Nodelist=%s", |
| job_ptr, host_str); |
| } else { |
| debug("Resending TERMINATE_JOB request %pJ Nodelist=%s", |
| job_ptr, host_str); |
| } |
| |
| xfree(host_str); |
| last_job_id = job_ptr->job_id; |
| hostlist_destroy(kill_hostlist); |
| agent_args->msg_args = |
| create_kill_job_msg(job_ptr, agent_args->protocol_version); |
| set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY); |
| agent_queue_request(agent_args); |
| } |