blob: 9f81c5f5db99c31c6c2a8a6874b07417e2c8baef [file] [log] [blame]
/*****************************************************************************\
* acct_policy.c - Enforce accounting policy
*****************************************************************************
* Copyright (C) 2008 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/priority.h"
#include "src/interfaces/select.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/acct_policy.h"
#define _DEBUG 0
/* Usage-adjustment operations dispatched through _adjust_limit_usage() */
enum {
	ACCT_POLICY_ADD_SUBMIT,	/* account a new job submission */
	ACCT_POLICY_REM_SUBMIT,	/* remove a previously counted submission */
	ACCT_POLICY_JOB_BEGIN,	/* job is starting; add running usage */
	ACCT_POLICY_JOB_FINI	/* job finished; remove running usage */
};
/* Outcome of testing a TRES request/usage against a limit */
typedef enum {
	TRES_USAGE_OKAY,		/* within the limit */
	TRES_USAGE_CUR_EXCEEDS_LIMIT,	/* current usage already over limit */
	TRES_USAGE_REQ_EXCEEDS_LIMIT,	/* request alone is over the limit */
	TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE /* request + usage exceeds limit */
} acct_policy_tres_usage_t;
/* Association/job pair used when evaluating heterogeneous job limits */
typedef struct het_job_limits {
	slurmdb_assoc_rec_t *assoc_ptr;	/* association for the job */
	job_record_t *job_ptr;		/* hetjob component job record */
} het_job_limits_t;
/* Bundled arguments for acct policy validation of a job description */
typedef struct acct_policy_validate_args {
	acct_policy_limit_set_t *acct_policy_limit_set; /* which limits set */
	slurmdb_assoc_rec_t *assoc_in;	/* association to validate against */
	job_desc_msg_t *job_desc;	/* job request being validated */
	slurmdb_qos_rec_t *job_qos_ptr;	/* QOS requested by the job */
	uint32_t *reason;		/* OUT - wait reason on failure */
	bool update_call;		/* true if validating a job update */
} acct_policy_validate_args_t;
/* State carried while handling job accrue-time accounting */
typedef struct {
	char *acct;		/* account name */
	slurmdb_assoc_rec_t *assoc_ptr;
	int cnt;		/* job count being applied */
	job_record_t *job_ptr;
	bool limits_filled;	/* true once used_limits_* are resolved */
	time_t now;		/* current time for accrue calculations */
	slurmdb_qos_rec_t *qos_ptr;
	uid_t uid;		/* user the limits apply to */
	slurmdb_used_limits_t *used_limits_acct; /* per-account usage */
	slurmdb_used_limits_t *used_limits_user; /* per-user usage */
} acct_policy_accrue_t;
/* Arguments for _foreach_part_qos_limit_usage() list iteration */
typedef struct {
	uint32_t job_cnt;	/* number of jobs (array task count) */
	job_record_t *job_ptr;
	list_t *part_qos_list;	/* partition QOS pointers already handled */
	int type;		/* ACCT_POLICY_* operation to apply */
	uint64_t *used_tres_run_secs; /* per-TRES reserved run seconds */
} foreach_part_qos_limit_usage_t;
/*
 * Scale a TRES limit by a QOS limit factor.
 *
 * IN/OUT limit - limit to adjust; left alone when unset/unlimited,
 *                set to INFINITE64 if the scaled value overflows
 * IN limit_factor - multiplier; values <= 0.0 are ignored
 */
static void _apply_limit_factor(uint64_t *limit, double limit_factor)
{
	double scaled;

	xassert(limit);

	/* Nothing to do for non-positive factors or unset/unlimited limits */
	if ((limit_factor <= 0.0) ||
	    (*limit == NO_VAL64) ||
	    (*limit == INFINITE64))
		return;

	/*
	 * Range-check in floating point before converting back to an
	 * integer: converting a double larger than INT64_MAX to a signed
	 * integer type is undefined behavior, so the previous
	 * "(int64_t) result < 0" overflow test was itself unreliable.
	 */
	scaled = (double)(*limit) * limit_factor;
	if (scaled >= (double) INT64_MAX) {
		/* We overflowed, setting to INFINITE */
		debug2("Factored limit overflowed setting to INFINITE");
		*limit = INFINITE64;
	} else {
		debug2("Limit adjusted from %"PRIu64" to %"PRIu64,
		       *limit, (uint64_t) scaled);
		*limit = (uint64_t) scaled;
	}
}
/*
 * Update a job's allocated node count to reflect only nodes that are not
 * already allocated to this association. Needed to enforce GrpNode limit.
 *
 * IN job_ptr - job being examined
 * IN grp_node_bitmap - nodes already in use by the object (may be NULL)
 * IN/OUT node_cnt - job node count, reduced by the overlap with
 *                   grp_node_bitmap
 */
static void _get_unique_job_node_cnt(job_record_t *job_ptr,
				     bitstr_t *grp_node_bitmap,
				     uint64_t *node_cnt)
{
	xassert(node_cnt);
#if _DEBUG
	/* Debug-only dump of both bitmaps being compared */
	char node_bitstr[64];
	if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap) {
		bit_fmt(node_bitstr, sizeof(node_bitstr),
			job_ptr->job_resrcs->node_bitmap);
		info("%s: %pJ job_resrcs->node_bitmap:%s", __func__, job_ptr,
		     node_bitstr);
	} else {
		info("%s: %pJ job_resrcs->node_bitmap:NULL", __func__,
		     job_ptr);
	}
	if (grp_node_bitmap) {
		bit_fmt(node_bitstr, sizeof(node_bitstr), grp_node_bitmap);
		info("%s: object grp_node_bitmap:%s", __func__,
		     node_bitstr);
	} else {
		info("%s: object grp_node_bitmap:NULL", __func__);
	}
#endif
	/* Preferred source: the job's actual resource allocation bitmap */
	if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap &&
	    grp_node_bitmap) {
		uint64_t overlap_cnt = bit_overlap(
			job_ptr->job_resrcs->node_bitmap, grp_node_bitmap);
		if (overlap_cnt) {
			uint64_t init_cnt = bit_set_count(
				job_ptr->job_resrcs->node_bitmap);
			*node_cnt = init_cnt - overlap_cnt;
			debug2("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
			       __func__, job_ptr, init_cnt, *node_cnt);
		}
	} else if (job_ptr->details && job_ptr->details->req_node_bitmap &&
		   grp_node_bitmap) {
		/* No allocation yet: fall back to the required node bitmap */
		uint64_t overlap_cnt = bit_overlap(
			job_ptr->details->req_node_bitmap, grp_node_bitmap);
		if (overlap_cnt <= *node_cnt) {
			*node_cnt -= overlap_cnt;
			debug2("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
			       __func__, job_ptr, *node_cnt + overlap_cnt, *node_cnt);
		}
	} else if (job_ptr->node_bitmap_preempt && grp_node_bitmap) {
		/* Last resort: nodes selected during preemption testing */
		uint64_t overlap_cnt = bit_overlap(job_ptr->node_bitmap_preempt,
						   grp_node_bitmap);
		if (overlap_cnt) {
			uint64_t init_cnt =
				bit_set_count(job_ptr->node_bitmap_preempt);
			*node_cnt = init_cnt - overlap_cnt;
			debug2("%s: %pJ unique allocated node count changed from %"PRIu64" to %"PRIu64,
			       __func__, job_ptr, init_cnt, *node_cnt);
		}
	}
}
/*
 * Fold a starting job's allocated nodes into an object's (qos, assoc, etc.)
 * node tracking: grp_node_bitmap, grp_node_job_cnt and
 * grp_used_tres[TRES_ARRAY_NODE].
 */
static void _add_usage_node_bitmap(job_record_t *job_ptr,
				   bitstr_t **grp_node_bitmap,
				   uint16_t **grp_node_job_cnt,
				   uint64_t *grp_used_tres)
{
	xassert(grp_node_bitmap);
	xassert(grp_node_job_cnt);
	xassert(grp_used_tres);

	if (job_ptr->job_resrcs && job_ptr->job_resrcs->node_bitmap) {
		/* Normal path: merge the allocation into the group usage */
		slurmdb_merge_grp_node_usage(grp_node_bitmap,
					     grp_node_job_cnt,
					     job_ptr->job_resrcs->node_bitmap,
					     NULL);
		*grp_used_tres = bit_set_count(*grp_node_bitmap);
		return;
	}

	/* No allocation bitmap -- decide whether that is expected */
	if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id) {
		/*
		 * Hetjobs reach here as part of testing before any
		 * resource allocation. See _het_job_limit_check()
		 * in src/plugins/sched/backfill/backfill.c
		 */
	} else if (job_ptr->node_cnt == 0) {
		/* Zero size jobs OK to create/destroy burst buffers */
	} else {
		error("%s: %pJ lacks allocated node bitmap", __func__,
		      job_ptr);
	}
}
/*
 * Remove a completing job's nodes from an object's (qos, assoc, etc.)
 * node tracking: grp_node_bitmap, grp_node_job_cnt and
 * grp_used_tres[TRES_ARRAY_NODE].
 */
static void _rm_usage_node_bitmap(job_record_t *job_ptr,
				  bitstr_t *grp_node_bitmap,
				  uint16_t *grp_node_job_cnt,
				  uint64_t *grp_used_tres)
{
	xassert(grp_used_tres);

	if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) {
		if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id) {
			/*
			 * Hetjobs reach here as part of testing before any
			 * resource allocation. See _het_job_limit_check()
			 * in src/plugins/sched/backfill/backfill.c
			 */
		} else if (job_ptr->node_cnt == 0) {
			/* Zero size jobs OK to create/destroy burst buffers */
		} else {
			error("%s: %pJ lacks allocated node bitmap", __func__,
			      job_ptr);
		}
		return;
	}
	if (!grp_node_bitmap) {
		error("%s: grp_node_bitmap is NULL", __func__);
		return;
	}
	if (!grp_node_job_cnt) {
		error("%s: grp_node_job_cnt is NULL", __func__);
		return;
	}

	for (int i = 0;
	     next_node_bitmap(job_ptr->job_resrcs->node_bitmap, &i); i++) {
		/*
		 * Guard against underflow: decrementing a zero uint16_t
		 * would wrap to 65535, leaving the node marked busy for
		 * this object forever. Treat it like the other usage
		 * underflow cases and clear the bit.
		 */
		if (grp_node_job_cnt[i] == 0) {
			error("%s: %pJ grp_node_job_cnt underflow on node index %d",
			      __func__, job_ptr, i);
			bit_clear(grp_node_bitmap, i);
		} else if (--grp_node_job_cnt[i] == 0)
			bit_clear(grp_node_bitmap, i);
	}
	*grp_used_tres = bit_set_count(grp_node_bitmap);
}
/*
 * Translate a generic "UNK" wait reason into the reason specific to the
 * TRES at tres_pos (CPU, memory, energy, node, billing, or a
 * gres/license/bb TRES resolved through assoc_mgr_tres_array[]).
 *
 * IN tres_pos - index into assoc_mgr_tres_array identifying the TRES
 * IN unk_reason - generic WAIT_*_UNK* reason code
 * RET the TRES-specific reason, or unk_reason when no mapping exists
 *
 * Note: the unreachable "break" statements that used to follow the
 * "return unk_reason" in each inner default were removed (dead code).
 */
static int _get_tres_state_reason(int tres_pos, int unk_reason)
{
	switch (tres_pos) {
	case TRES_ARRAY_CPU:
		switch (unk_reason) {
		case WAIT_ASSOC_GRP_UNK:
			return WAIT_ASSOC_GRP_CPU;
		case WAIT_ASSOC_GRP_UNK_MIN:
			return WAIT_ASSOC_GRP_CPU_MIN;
		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
			return WAIT_ASSOC_GRP_CPU_RUN_MIN;
		case WAIT_ASSOC_MAX_UNK_PER_JOB:
			return WAIT_ASSOC_MAX_CPU_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
			return WAIT_ASSOC_MAX_CPU_MINS_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_PER_NODE:
			return WAIT_ASSOC_MAX_CPU_PER_NODE;
		case WAIT_QOS_GRP_UNK:
			return WAIT_QOS_GRP_CPU;
		case WAIT_QOS_GRP_UNK_MIN:
			return WAIT_QOS_GRP_CPU_MIN;
		case WAIT_QOS_GRP_UNK_RUN_MIN:
			return WAIT_QOS_GRP_CPU_RUN_MIN;
		case WAIT_QOS_MAX_UNK_PER_JOB:
			return WAIT_QOS_MAX_CPU_PER_JOB;
		case WAIT_QOS_MAX_UNK_PER_NODE:
			return WAIT_QOS_MAX_CPU_PER_NODE;
		case WAIT_QOS_MAX_UNK_PER_ACCT:
			return WAIT_QOS_MAX_CPU_PER_ACCT;
		case WAIT_QOS_MAX_UNK_PER_USER:
			return WAIT_QOS_MAX_CPU_PER_USER;
		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
			return WAIT_QOS_MAX_CPU_MINS_PER_JOB;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
			return WAIT_QOS_MAX_CPU_RUN_MINS_PER_ACCT;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
			return WAIT_QOS_MAX_CPU_RUN_MINS_PER_USER;
		case WAIT_QOS_MIN_UNK:
			return WAIT_QOS_MIN_CPU;
		default:
			return unk_reason;
		}
		break;
	case TRES_ARRAY_MEM:
		switch (unk_reason) {
		case WAIT_ASSOC_GRP_UNK:
			return WAIT_ASSOC_GRP_MEM;
		case WAIT_ASSOC_GRP_UNK_MIN:
			return WAIT_ASSOC_GRP_MEM_MIN;
		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
			return WAIT_ASSOC_GRP_MEM_RUN_MIN;
		case WAIT_ASSOC_MAX_UNK_PER_JOB:
			return WAIT_ASSOC_MAX_MEM_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
			return WAIT_ASSOC_MAX_MEM_MINS_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_PER_NODE:
			return WAIT_ASSOC_MAX_MEM_PER_NODE;
		case WAIT_QOS_GRP_UNK:
			return WAIT_QOS_GRP_MEM;
		case WAIT_QOS_GRP_UNK_MIN:
			return WAIT_QOS_GRP_MEM_MIN;
		case WAIT_QOS_GRP_UNK_RUN_MIN:
			return WAIT_QOS_GRP_MEM_RUN_MIN;
		case WAIT_QOS_MAX_UNK_PER_JOB:
			return WAIT_QOS_MAX_MEM_PER_JOB;
		case WAIT_QOS_MAX_UNK_PER_NODE:
			return WAIT_QOS_MAX_MEM_PER_NODE;
		case WAIT_QOS_MAX_UNK_PER_ACCT:
			return WAIT_QOS_MAX_MEM_PER_ACCT;
		case WAIT_QOS_MAX_UNK_PER_USER:
			return WAIT_QOS_MAX_MEM_PER_USER;
		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
			return WAIT_QOS_MAX_MEM_MINS_PER_JOB;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
			return WAIT_QOS_MAX_MEM_RUN_MINS_PER_ACCT;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
			return WAIT_QOS_MAX_MEM_RUN_MINS_PER_USER;
		case WAIT_QOS_MIN_UNK:
			return WAIT_QOS_MIN_MEM;
		default:
			return unk_reason;
		}
		break;
	case TRES_ARRAY_ENERGY:
		switch (unk_reason) {
		case WAIT_ASSOC_GRP_UNK:
			return WAIT_ASSOC_GRP_ENERGY;
		case WAIT_ASSOC_GRP_UNK_MIN:
			return WAIT_ASSOC_GRP_ENERGY_MIN;
		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
			return WAIT_ASSOC_GRP_ENERGY_RUN_MIN;
		case WAIT_ASSOC_MAX_UNK_PER_JOB:
			return WAIT_ASSOC_MAX_ENERGY_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
			return WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_PER_NODE:
			return WAIT_ASSOC_MAX_ENERGY_PER_NODE;
		case WAIT_QOS_GRP_UNK:
			return WAIT_QOS_GRP_ENERGY;
		case WAIT_QOS_GRP_UNK_MIN:
			return WAIT_QOS_GRP_ENERGY_MIN;
		case WAIT_QOS_GRP_UNK_RUN_MIN:
			return WAIT_QOS_GRP_ENERGY_RUN_MIN;
		case WAIT_QOS_MAX_UNK_PER_JOB:
			return WAIT_QOS_MAX_ENERGY_PER_JOB;
		case WAIT_QOS_MAX_UNK_PER_NODE:
			return WAIT_QOS_MAX_ENERGY_PER_NODE;
		case WAIT_QOS_MAX_UNK_PER_ACCT:
			return WAIT_QOS_MAX_ENERGY_PER_ACCT;
		case WAIT_QOS_MAX_UNK_PER_USER:
			return WAIT_QOS_MAX_ENERGY_PER_USER;
		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
			return WAIT_QOS_MAX_ENERGY_MINS_PER_JOB;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
			return WAIT_QOS_MAX_ENERGY_RUN_MINS_PER_ACCT;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
			return WAIT_QOS_MAX_ENERGY_RUN_MINS_PER_USER;
		case WAIT_QOS_MIN_UNK:
			return WAIT_QOS_MIN_ENERGY;
		default:
			return unk_reason;
		}
		break;
	case TRES_ARRAY_NODE:
		/* Note: no PER_NODE variants exist for the node TRES */
		switch (unk_reason) {
		case WAIT_ASSOC_GRP_UNK:
			return WAIT_ASSOC_GRP_NODE;
		case WAIT_ASSOC_GRP_UNK_MIN:
			return WAIT_ASSOC_GRP_NODE_MIN;
		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
			return WAIT_ASSOC_GRP_NODE_RUN_MIN;
		case WAIT_ASSOC_MAX_UNK_PER_JOB:
			return WAIT_ASSOC_MAX_NODE_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
			return WAIT_ASSOC_MAX_NODE_MINS_PER_JOB;
		case WAIT_QOS_GRP_UNK:
			return WAIT_QOS_GRP_NODE;
		case WAIT_QOS_GRP_UNK_MIN:
			return WAIT_QOS_GRP_NODE_MIN;
		case WAIT_QOS_GRP_UNK_RUN_MIN:
			return WAIT_QOS_GRP_NODE_RUN_MIN;
		case WAIT_QOS_MAX_UNK_PER_JOB:
			return WAIT_QOS_MAX_NODE_PER_JOB;
		case WAIT_QOS_MAX_UNK_PER_ACCT:
			return WAIT_QOS_MAX_NODE_PER_ACCT;
		case WAIT_QOS_MAX_UNK_PER_USER:
			return WAIT_QOS_MAX_NODE_PER_USER;
		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
			return WAIT_QOS_MAX_NODE_MINS_PER_JOB;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
			return WAIT_QOS_MAX_NODE_RUN_MINS_PER_ACCT;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
			return WAIT_QOS_MAX_NODE_RUN_MINS_PER_USER;
		case WAIT_QOS_MIN_UNK:
			return WAIT_QOS_MIN_NODE;
		default:
			return unk_reason;
		}
		break;
	case TRES_ARRAY_BILLING:
		switch (unk_reason) {
		case WAIT_ASSOC_GRP_UNK:
			return WAIT_ASSOC_GRP_BILLING;
		case WAIT_ASSOC_GRP_UNK_MIN:
			return WAIT_ASSOC_GRP_BILLING_MIN;
		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
			return WAIT_ASSOC_GRP_BILLING_RUN_MIN;
		case WAIT_ASSOC_MAX_UNK_PER_JOB:
			return WAIT_ASSOC_MAX_BILLING_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
			return WAIT_ASSOC_MAX_BILLING_MINS_PER_JOB;
		case WAIT_ASSOC_MAX_UNK_PER_NODE:
			return WAIT_ASSOC_MAX_BILLING_PER_NODE;
		case WAIT_QOS_GRP_UNK:
			return WAIT_QOS_GRP_BILLING;
		case WAIT_QOS_GRP_UNK_MIN:
			return WAIT_QOS_GRP_BILLING_MIN;
		case WAIT_QOS_GRP_UNK_RUN_MIN:
			return WAIT_QOS_GRP_BILLING_RUN_MIN;
		case WAIT_QOS_MAX_UNK_PER_JOB:
			return WAIT_QOS_MAX_BILLING_PER_JOB;
		case WAIT_QOS_MAX_UNK_PER_NODE:
			return WAIT_QOS_MAX_BILLING_PER_NODE;
		case WAIT_QOS_MAX_UNK_PER_ACCT:
			return WAIT_QOS_MAX_BILLING_PER_ACCT;
		case WAIT_QOS_MAX_UNK_PER_USER:
			return WAIT_QOS_MAX_BILLING_PER_USER;
		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
			return WAIT_QOS_MAX_BILLING_MINS_PER_JOB;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
			return WAIT_QOS_MAX_BILLING_RUN_MINS_PER_ACCT;
		case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
			return WAIT_QOS_MAX_BILLING_RUN_MINS_PER_USER;
		case WAIT_QOS_MIN_UNK:
			return WAIT_QOS_MIN_BILLING;
		default:
			return unk_reason;
		}
		break;
	default:
		/* Dynamic TRES types: dispatch on the TRES type string */
		if (!xstrcmp("gres", assoc_mgr_tres_array[tres_pos]->type))
			switch (unk_reason) {
			case WAIT_ASSOC_GRP_UNK:
				return WAIT_ASSOC_GRP_GRES;
			case WAIT_ASSOC_GRP_UNK_MIN:
				return WAIT_ASSOC_GRP_GRES_MIN;
			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
				return WAIT_ASSOC_GRP_GRES_RUN_MIN;
			case WAIT_ASSOC_MAX_UNK_PER_JOB:
				return WAIT_ASSOC_MAX_GRES_PER_JOB;
			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
				return WAIT_ASSOC_MAX_GRES_MINS_PER_JOB;
			case WAIT_ASSOC_MAX_UNK_PER_NODE:
				return WAIT_ASSOC_MAX_GRES_PER_NODE;
			case WAIT_QOS_GRP_UNK:
				return WAIT_QOS_GRP_GRES;
			case WAIT_QOS_GRP_UNK_MIN:
				return WAIT_QOS_GRP_GRES_MIN;
			case WAIT_QOS_GRP_UNK_RUN_MIN:
				return WAIT_QOS_GRP_GRES_RUN_MIN;
			case WAIT_QOS_MAX_UNK_PER_JOB:
				return WAIT_QOS_MAX_GRES_PER_JOB;
			case WAIT_QOS_MAX_UNK_PER_NODE:
				return WAIT_QOS_MAX_GRES_PER_NODE;
			case WAIT_QOS_MAX_UNK_PER_ACCT:
				return WAIT_QOS_MAX_GRES_PER_ACCT;
			case WAIT_QOS_MAX_UNK_PER_USER:
				return WAIT_QOS_MAX_GRES_PER_USER;
			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
				return WAIT_QOS_MAX_GRES_MINS_PER_JOB;
			case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
				return WAIT_QOS_MAX_GRES_RUN_MINS_PER_ACCT;
			case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
				return WAIT_QOS_MAX_GRES_RUN_MINS_PER_USER;
			case WAIT_QOS_MIN_UNK:
				return WAIT_QOS_MIN_GRES;
			default:
				return unk_reason;
			}
		else if (!xstrcmp("license",
				  assoc_mgr_tres_array[tres_pos]->type))
			/* Note: no PER_NODE variants exist for licenses */
			switch (unk_reason) {
			case WAIT_ASSOC_GRP_UNK:
				return WAIT_ASSOC_GRP_LIC;
			case WAIT_ASSOC_GRP_UNK_MIN:
				return WAIT_ASSOC_GRP_LIC_MIN;
			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
				return WAIT_ASSOC_GRP_LIC_RUN_MIN;
			case WAIT_ASSOC_MAX_UNK_PER_JOB:
				return WAIT_ASSOC_MAX_LIC_PER_JOB;
			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
				return WAIT_ASSOC_MAX_LIC_MINS_PER_JOB;
			case WAIT_QOS_GRP_UNK:
				return WAIT_QOS_GRP_LIC;
			case WAIT_QOS_GRP_UNK_MIN:
				return WAIT_QOS_GRP_LIC_MIN;
			case WAIT_QOS_GRP_UNK_RUN_MIN:
				return WAIT_QOS_GRP_LIC_RUN_MIN;
			case WAIT_QOS_MAX_UNK_PER_JOB:
				return WAIT_QOS_MAX_LIC_PER_JOB;
			case WAIT_QOS_MAX_UNK_PER_ACCT:
				return WAIT_QOS_MAX_LIC_PER_ACCT;
			case WAIT_QOS_MAX_UNK_PER_USER:
				return WAIT_QOS_MAX_LIC_PER_USER;
			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
				return WAIT_QOS_MAX_LIC_MINS_PER_JOB;
			case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
				return WAIT_QOS_MAX_LIC_RUN_MINS_PER_ACCT;
			case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
				return WAIT_QOS_MAX_LIC_RUN_MINS_PER_USER;
			case WAIT_QOS_MIN_UNK:
				return WAIT_QOS_MIN_LIC;
			default:
				return unk_reason;
			}
		else if (!xstrcmp("bb", assoc_mgr_tres_array[tres_pos]->type))
			switch (unk_reason) {
			case WAIT_ASSOC_GRP_UNK:
				return WAIT_ASSOC_GRP_BB;
			case WAIT_ASSOC_GRP_UNK_MIN:
				return WAIT_ASSOC_GRP_BB_MIN;
			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
				return WAIT_ASSOC_GRP_BB_RUN_MIN;
			case WAIT_ASSOC_MAX_UNK_PER_JOB:
				return WAIT_ASSOC_MAX_BB_PER_JOB;
			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
				return WAIT_ASSOC_MAX_BB_MINS_PER_JOB;
			case WAIT_ASSOC_MAX_UNK_PER_NODE:
				return WAIT_ASSOC_MAX_BB_PER_NODE;
			case WAIT_QOS_GRP_UNK:
				return WAIT_QOS_GRP_BB;
			case WAIT_QOS_GRP_UNK_MIN:
				return WAIT_QOS_GRP_BB_MIN;
			case WAIT_QOS_GRP_UNK_RUN_MIN:
				return WAIT_QOS_GRP_BB_RUN_MIN;
			case WAIT_QOS_MAX_UNK_PER_JOB:
				return WAIT_QOS_MAX_BB_PER_JOB;
			case WAIT_QOS_MAX_UNK_PER_NODE:
				return WAIT_QOS_MAX_BB_PER_NODE;
			case WAIT_QOS_MAX_UNK_PER_ACCT:
				return WAIT_QOS_MAX_BB_PER_ACCT;
			case WAIT_QOS_MAX_UNK_PER_USER:
				return WAIT_QOS_MAX_BB_PER_USER;
			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
				return WAIT_QOS_MAX_BB_MINS_PER_JOB;
			case WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT:
				return WAIT_QOS_MAX_BB_RUN_MINS_PER_ACCT;
			case WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER:
				return WAIT_QOS_MAX_BB_RUN_MINS_PER_USER;
			case WAIT_QOS_MIN_UNK:
				return WAIT_QOS_MIN_BB;
			default:
				return unk_reason;
			}
		break;
	}
	return unk_reason;
}
/*
 * list_find_first() callback: match a slurmdb_used_limits_t by account
 * name. Returns 1 on match, 0 otherwise.
 */
static int _find_used_limits_for_acct(void *x, void *key)
{
	slurmdb_used_limits_t *limits = x;
	char *account = key;

	return xstrcmp(account, limits->acct) ? 0 : 1;
}
/*
 * list_find_first() callback: match a slurmdb_used_limits_t by uid.
 * Returns 1 on match, 0 otherwise.
 */
static int _find_used_limits_for_user(void *x, void *key)
{
	slurmdb_used_limits_t *limits = x;
	uint32_t uid = *(uint32_t *) key;

	return (limits->uid == uid) ? 1 : 0;
}
/*
 * Verify that job_ptr->assoc_ptr is present and consistent with the job's
 * assoc_id and user_id; if not, attempt to re-resolve the association from
 * the job's account/partition/uid.
 *
 * RET true if the job ends up with a valid association, false otherwise
 */
static bool _valid_job_assoc(job_record_t *job_ptr)
{
	slurmdb_assoc_rec_t assoc_rec;

	if ((job_ptr->assoc_ptr == NULL) ||
	    (job_ptr->assoc_ptr->id != job_ptr->assoc_id) ||
	    (job_ptr->assoc_ptr->uid != job_ptr->user_id)) {
		error("Invalid assoc_ptr for %pJ", job_ptr);
		memset(&assoc_rec, 0, sizeof(slurmdb_assoc_rec_t));
		/* Rebuild the lookup key from the job record */
		assoc_rec.acct = job_ptr->account;
		if (job_ptr->part_ptr)
			assoc_rec.partition = job_ptr->part_ptr->name;
		assoc_rec.uid = job_ptr->user_id;
		if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
					    accounting_enforce,
					    &job_ptr->assoc_ptr, false)) {
			info("%s: invalid account or partition for uid=%u %pJ",
			     __func__, job_ptr->user_id, job_ptr);
			return false;
		}
		job_ptr->assoc_id = assoc_rec.id;
	}
	return true;
}
/*
 * Point job_ptr->qos_ptr (and qos_id) at the first entry of the job's
 * qos_list, i.e. the highest priority QOS. No-op when the list is
 * missing or empty. Caller must hold a QOS read lock.
 */
static void _set_highest_prio_qos_ptr(job_record_t *job_ptr)
{
	slurmdb_qos_rec_t *top_qos;

	xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));

	if (!job_ptr->qos_list)
		return;
	if (!list_count(job_ptr->qos_list))
		return;

	top_qos = list_peek(job_ptr->qos_list);
	job_ptr->qos_ptr = top_qos;
	job_ptr->qos_id = top_qos->id;
}
/*
 * Adjust one QOS's usage bookkeeping (submit counts, running job counts,
 * per-TRES usage/run-seconds, node bitmaps) for a job lifecycle event.
 *
 * IN type - ACCT_POLICY_ADD_SUBMIT / REM_SUBMIT / JOB_BEGIN / JOB_FINI
 * IN job_ptr - job triggering the adjustment
 * IN qos_ptr - QOS whose usage changes; NULL is a no-op
 * IN used_tres_run_secs - per-TRES reserved run seconds (JOB_BEGIN only)
 * IN job_cnt - number of jobs affected (array task count on submit ops)
 */
static void _qos_adjust_limit_usage(int type, job_record_t *job_ptr,
				    slurmdb_qos_rec_t *qos_ptr,
				    uint64_t *used_tres_run_secs,
				    uint32_t job_cnt)
{
	slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
	int i;
	if (!qos_ptr || !job_ptr->assoc_ptr)
		return;
	/* Per-account and per-user usage records within this QOS */
	used_limits_a = acct_policy_get_acct_used_limits(
		&qos_ptr->usage->acct_limit_list,
		job_ptr->assoc_ptr->acct);
	used_limits = acct_policy_get_user_used_limits(
		&qos_ptr->usage->user_limit_list,
		job_ptr->user_id);
	switch (type) {
	case ACCT_POLICY_ADD_SUBMIT:
		/* Count the submission at QOS, user, and account level */
		qos_ptr->usage->grp_used_submit_jobs += job_cnt;
		used_limits->submit_jobs += job_cnt;
		used_limits_a->submit_jobs += job_cnt;
		break;
	case ACCT_POLICY_REM_SUBMIT:
		/* Remove the submission, guarding each counter underflow */
		if (qos_ptr->usage->grp_used_submit_jobs >= job_cnt)
			qos_ptr->usage->grp_used_submit_jobs -= job_cnt;
		else {
			qos_ptr->usage->grp_used_submit_jobs = 0;
			debug2("acct_policy_remove_job_submit: "
			       "grp_submit_jobs underflow for qos %s",
			       qos_ptr->name);
		}
		if (used_limits->submit_jobs >= job_cnt)
			used_limits->submit_jobs -= job_cnt;
		else {
			used_limits->submit_jobs = 0;
			debug2("acct_policy_remove_job_submit: "
			       "used_submit_jobs underflow for "
			       "qos %s user %d",
			       qos_ptr->name, used_limits->uid);
		}
		if (used_limits_a->submit_jobs >= job_cnt)
			used_limits_a->submit_jobs -= job_cnt;
		else {
			used_limits_a->submit_jobs = 0;
			debug2("acct_policy_remove_job_submit: "
			       "used_submit_jobs underflow for "
			       "qos %s account %s",
			       qos_ptr->name, used_limits_a->acct);
		}
		break;
	case ACCT_POLICY_JOB_BEGIN:
		/*
		 * Now that the job has started set the id correctly. This is
		 * needed when we have multiple QOS, the qos_ptr will be set
		 * correctly, but the qos_id is only set to the highest priority
		 * until now.
		 */
		if (job_ptr->qos_ptr == qos_ptr)
			job_ptr->qos_id = qos_ptr->id;
		qos_ptr->usage->grp_used_jobs++;
		for (i=0; i<slurmctld_tres_cnt; i++) {
			/* tres_alloc_cnt for ENERGY is currently after the
			 * fact, so don't add it here or you will get underflows
			 * when you remove it. If this ever changes this will
			 * have to be moved to a new TRES ARRAY probably.
			 */
			if (i == TRES_ARRAY_ENERGY)
				continue;
			if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
				continue;
			/* Add allocation to user, account and QOS usage */
			used_limits->tres[i] += job_ptr->tres_alloc_cnt[i];
			used_limits->tres_run_secs[i] += used_tres_run_secs[i];
			used_limits_a->tres[i] += job_ptr->tres_alloc_cnt[i];
			used_limits_a->tres_run_secs[i] += used_tres_run_secs[i];
			qos_ptr->usage->grp_used_tres[i] +=
				job_ptr->tres_alloc_cnt[i];
			qos_ptr->usage->grp_used_tres_run_secs[i] +=
				used_tres_run_secs[i];
			debug2("acct_policy_job_begin: after adding %pJ, qos %s grp_used_tres_run_secs(%s) is %"PRIu64,
			       job_ptr, qos_ptr->name,
			       assoc_mgr_tres_name_array[i],
			       qos_ptr->usage->grp_used_tres_run_secs[i]);
		}
		used_limits->jobs++;
		used_limits_a->jobs++;
		/* Track allocated nodes at QOS, user and account level */
		_add_usage_node_bitmap(
			job_ptr,
			&qos_ptr->usage->grp_node_bitmap,
			&qos_ptr->usage->grp_node_job_cnt,
			&qos_ptr->usage->grp_used_tres[TRES_ARRAY_NODE]);
		_add_usage_node_bitmap(
			job_ptr,
			&used_limits->node_bitmap,
			&used_limits->node_job_cnt,
			&used_limits->tres[TRES_ARRAY_NODE]);
		_add_usage_node_bitmap(
			job_ptr,
			&used_limits_a->node_bitmap,
			&used_limits_a->node_job_cnt,
			&used_limits_a->tres[TRES_ARRAY_NODE]);
		break;
	case ACCT_POLICY_JOB_FINI:
		/*
		 * If tres_alloc_cnt doesn't exist means ACCT_POLICY_JOB_BEGIN
		 * was never called so no need to clean up that which was never
		 * set up.
		 */
		if (!job_ptr->tres_alloc_cnt)
			break;
		qos_ptr->usage->grp_used_jobs--;
		if ((int32_t)qos_ptr->usage->grp_used_jobs < 0) {
			qos_ptr->usage->grp_used_jobs = 0;
			debug2("acct_policy_job_fini: used_jobs "
			       "underflow for qos %s", qos_ptr->name);
		}
		for (i=0; i<slurmctld_tres_cnt; i++) {
			if (i == TRES_ARRAY_ENERGY)
				continue;
			if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
				continue;
			/* Subtract usage, zeroing on underflow */
			if (job_ptr->tres_alloc_cnt[i] >
			    qos_ptr->usage->grp_used_tres[i]) {
				qos_ptr->usage->grp_used_tres[i] = 0;
				debug2("acct_policy_job_fini: "
				       "grp_used_tres(%s) "
				       "underflow for QOS %s",
				       assoc_mgr_tres_name_array[i],
				       qos_ptr->name);
			} else
				qos_ptr->usage->grp_used_tres[i] -=
					job_ptr->tres_alloc_cnt[i];
			if (job_ptr->tres_alloc_cnt[i] > used_limits->tres[i]) {
				used_limits->tres[i] = 0;
				debug2("acct_policy_job_fini: "
				       "used_limits->tres(%s) "
				       "underflow for qos %s user %u",
				       assoc_mgr_tres_name_array[i],
				       qos_ptr->name, used_limits->uid);
			} else
				used_limits->tres[i] -=
					job_ptr->tres_alloc_cnt[i];
			if (job_ptr->tres_alloc_cnt[i] >
			    used_limits_a->tres[i]) {
				used_limits_a->tres[i] = 0;
				debug2("acct_policy_job_fini: "
				       "used_limits->tres(%s) "
				       "underflow for qos %s account %s",
				       assoc_mgr_tres_name_array[i],
				       qos_ptr->name, used_limits_a->acct);
			} else
				used_limits_a->tres[i] -=
					job_ptr->tres_alloc_cnt[i];
		}
		if (used_limits->jobs)
			used_limits->jobs--;
		else
			debug2("acct_policy_job_fini: used_jobs "
			       "underflow for qos %s user %d",
			       qos_ptr->name, used_limits->uid);
		if (used_limits_a->jobs)
			used_limits_a->jobs--;
		else
			debug2("acct_policy_job_fini: used_jobs "
			       "underflow for qos %s account %s",
			       qos_ptr->name, used_limits_a->acct);
		/* Release the job's nodes from all three tracking levels */
		_rm_usage_node_bitmap(
			job_ptr,
			qos_ptr->usage->grp_node_bitmap,
			qos_ptr->usage->grp_node_job_cnt,
			&qos_ptr->usage->grp_used_tres[TRES_ARRAY_NODE]);
		_rm_usage_node_bitmap(
			job_ptr,
			used_limits->node_bitmap,
			used_limits->node_job_cnt,
			&used_limits->tres[TRES_ARRAY_NODE]);
		_rm_usage_node_bitmap(
			job_ptr,
			used_limits_a->node_bitmap,
			used_limits_a->node_job_cnt,
			&used_limits_a->tres[TRES_ARRAY_NODE]);
		break;
	default:
		error("acct_policy: qos unknown type %d", type);
		break;
	}
}
/*
 * list_find_first() callback: match a QOS record by pointer identity.
 * Returns 1 on match, 0 otherwise.
 */
static int _find_qos_part(void *x, void *key)
{
	return (x == key) ? 1 : 0;
}
static int _foreach_part_qos_limit_usage(void *x, void *arg)
{
part_record_t *part_ptr = x;
foreach_part_qos_limit_usage_t *part_qos_limit_usage = arg;
if (!part_ptr->qos_ptr)
return 0;
if (!part_qos_limit_usage->part_qos_list)
part_qos_limit_usage->part_qos_list = list_create(NULL);
/*
* Don't adjust usage to this partition's qos if
* it's the same as the qos of another partition
* that we already handled.
*/
if (list_find_first(part_qos_limit_usage->part_qos_list, _find_qos_part,
part_ptr->qos_ptr))
return 0;
list_push(part_qos_limit_usage->part_qos_list, part_ptr->qos_ptr);
_qos_adjust_limit_usage(part_qos_limit_usage->type,
part_qos_limit_usage->job_ptr,
part_ptr->qos_ptr,
part_qos_limit_usage->used_tres_run_secs,
part_qos_limit_usage->job_cnt);
return 0;
}
/*
 * Apply a job lifecycle event (submit add/remove, begin, finish) to all
 * usage tracking: the relevant QOS records (including multi-partition
 * submissions) and the job's association chain up to the root.
 *
 * IN type - ACCT_POLICY_ADD_SUBMIT / REM_SUBMIT / JOB_BEGIN / JOB_FINI
 * IN job_ptr - job triggering the event
 * IN assoc_locked - true if the caller already holds assoc/qos/tres locks
 */
static void _adjust_limit_usage(int type, job_record_t *job_ptr,
				bool assoc_locked)
{
	slurmdb_assoc_rec_t *assoc_ptr = NULL;
	slurmdb_qos_rec_t *orig_qos_ptr = NULL;
	assoc_mgr_lock_t locks =
		{ .assoc = WRITE_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
	uint64_t used_tres_run_secs[slurmctld_tres_cnt];
	int i;
	uint32_t job_cnt = 1;
	/* Lock state must match what the caller claims */
	if (assoc_locked) {
		xassert(verify_assoc_lock(ASSOC_LOCK, WRITE_LOCK));
		xassert(verify_assoc_lock(QOS_LOCK, WRITE_LOCK));
		xassert(verify_assoc_lock(TRES_LOCK, READ_LOCK));
	} else {
		xassert(verify_assoc_unlock(ASSOC_LOCK));
		xassert(verify_assoc_unlock(QOS_LOCK));
		xassert(verify_assoc_unlock(TRES_LOCK));
	}
	memset(used_tres_run_secs, 0, sizeof(uint64_t) * slurmctld_tres_cnt);
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
	    || !_valid_job_assoc(job_ptr))
		return;
	if (type == ACCT_POLICY_JOB_FINI)
		priority_g_job_end(job_ptr);
	else if (type == ACCT_POLICY_JOB_BEGIN) {
		/* Pre-compute per-TRES reserved run seconds for the job */
		uint64_t time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
		/*
		 * Take into account usage factor.
		 *
		 * qos_ptr is set correctly if we have a qos_list here, no need
		 * to do anything other than that.
		 */
		if (job_ptr->qos_ptr &&
		    (job_ptr->qos_ptr->usage_factor >= 0))
			time_limit_secs *= job_ptr->qos_ptr->usage_factor;
		for (i = 0; i < slurmctld_tres_cnt; i++) {
			if (i == TRES_ARRAY_ENERGY)
				continue;
			if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
				continue;
			used_tres_run_secs[i] =
				job_ptr->tres_alloc_cnt[i] * time_limit_secs;
		}
	} else if (((type == ACCT_POLICY_ADD_SUBMIT) ||
		    (type == ACCT_POLICY_REM_SUBMIT)) &&
		   job_ptr->array_recs && job_ptr->array_recs->task_cnt)
		/* Job arrays count once per task on submit events */
		job_cnt = job_ptr->array_recs->task_cnt;
	if (!assoc_locked)
		assoc_mgr_lock(&locks);
	/*
	 * This handles removal of the accrual_cnt pending on
	 * state. We do not want to call this on add submit as it could push
	 * other jobs pending waiting in line for the limit. The main call to
	 * this that handles the initial call happens in build_job_queue().
	 */
	if (type != ACCT_POLICY_ADD_SUBMIT)
		acct_policy_handle_accrue_time(job_ptr, true);
	if ((type == ACCT_POLICY_ADD_SUBMIT) ||
	    (type == ACCT_POLICY_REM_SUBMIT)) {
		/* Temporarily use the highest priority QOS; restored below */
		orig_qos_ptr = job_ptr->qos_ptr;
		_set_highest_prio_qos_ptr(job_ptr);
	}
	/*
	 * If we have submitted to multiple partitions we need to handle all of
	 * them on submit and remove if the job was cancelled before it ran
	 * (!job_ptr->tres_alloc_str).
	 */
	if (((type == ACCT_POLICY_ADD_SUBMIT) ||
	     (type == ACCT_POLICY_REM_SUBMIT)) &&
	    job_ptr->part_ptr_list &&
	    (IS_JOB_PENDING(job_ptr) || !job_ptr->tres_alloc_str)) {
		bool job_first = false;
		foreach_part_qos_limit_usage_t part_qos_limit_usage = {
			.job_cnt = job_cnt,
			.job_ptr = job_ptr,
			.part_qos_list = NULL,
			.type = type,
			.used_tres_run_secs = used_tres_run_secs,
		};
		/* Job QOS overrides partition QOS when flagged to do so */
		if (job_ptr->qos_ptr &&
		    (((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->flags
		     & QOS_FLAG_OVER_PART_QOS))
			job_first = true;
		if (job_first) {
			_qos_adjust_limit_usage(type, job_ptr, job_ptr->qos_ptr,
						used_tres_run_secs, job_cnt);
			part_qos_limit_usage.part_qos_list = list_create(NULL);
			list_push(part_qos_limit_usage.part_qos_list,
				  job_ptr->qos_ptr);
		}
		(void) list_for_each(job_ptr->part_ptr_list,
				     _foreach_part_qos_limit_usage,
				     &part_qos_limit_usage);
		/*
		 * Don't adjust usage to this job's qos if
		 * it's the same as the qos of a partition
		 * that we already handled.
		 */
		if (!job_first && job_ptr->qos_ptr &&
		    (!part_qos_limit_usage.part_qos_list ||
		     !list_find_first(part_qos_limit_usage.part_qos_list,
				      _find_qos_part,
				      job_ptr->qos_ptr)))
			_qos_adjust_limit_usage(type, job_ptr, job_ptr->qos_ptr,
						used_tres_run_secs, job_cnt);
		FREE_NULL_LIST(part_qos_limit_usage.part_qos_list);
	} else {
		slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
		/*
		 * Here if the job is starting and we had a part_ptr_list before
		 * hand we need to remove the submit from all partition qos
		 * outside of the one we actually are going to run on.
		 */
		if ((type == ACCT_POLICY_JOB_BEGIN) &&
		    job_ptr->part_ptr_list) {
			foreach_part_qos_limit_usage_t part_qos_limit_usage = {
				.job_cnt = job_cnt,
				.job_ptr = job_ptr,
				.part_qos_list = list_create(NULL),
				.type = ACCT_POLICY_REM_SUBMIT,
				.used_tres_run_secs = used_tres_run_secs,
			};
			/* Pre-seed list so the chosen QOS is not decremented */
			if (job_ptr->qos_ptr)
				list_push(part_qos_limit_usage.part_qos_list,
					  job_ptr->qos_ptr);
			if (job_ptr->part_ptr && job_ptr->part_ptr->qos_ptr &&
			    job_ptr->qos_ptr != job_ptr->part_ptr->qos_ptr)
				list_push(part_qos_limit_usage.part_qos_list,
					  job_ptr->part_ptr->qos_ptr);
			(void) list_for_each(job_ptr->part_ptr_list,
					     _foreach_part_qos_limit_usage,
					     &part_qos_limit_usage);
			FREE_NULL_LIST(part_qos_limit_usage.part_qos_list);
		}
		acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);
		_qos_adjust_limit_usage(type, job_ptr, qos_ptr_1,
					used_tres_run_secs, job_cnt);
		_qos_adjust_limit_usage(type, job_ptr, qos_ptr_2,
					used_tres_run_secs, job_cnt);
	}
	/* Walk the association chain up to the root, adjusting each level */
	assoc_ptr = job_ptr->assoc_ptr;
	while (assoc_ptr) {
		switch (type) {
		case ACCT_POLICY_ADD_SUBMIT:
			assoc_ptr->usage->used_submit_jobs += job_cnt;
			break;
		case ACCT_POLICY_REM_SUBMIT:
			if (assoc_ptr->usage->used_submit_jobs)
				assoc_ptr->usage->used_submit_jobs -= job_cnt;
			else
				debug2("acct_policy_remove_job_submit: "
				       "used_submit_jobs underflow for "
				       "account %s",
				       assoc_ptr->acct);
			break;
		case ACCT_POLICY_JOB_BEGIN:
			assoc_ptr->usage->used_jobs++;
			_add_usage_node_bitmap(
				job_ptr,
				&assoc_ptr->usage->grp_node_bitmap,
				&assoc_ptr->usage->grp_node_job_cnt,
				&assoc_ptr->usage->
				grp_used_tres[TRES_ARRAY_NODE]);
			for (i = 0; i < slurmctld_tres_cnt; i++) {
				if (i == TRES_ARRAY_ENERGY)
					continue;
				if (job_ptr->tres_alloc_cnt[i] ==
				    NO_CONSUME_VAL64)
					continue;
				/* NODE usage handled by bitmap merge above */
				if (i != TRES_ARRAY_NODE) {
					assoc_ptr->usage->grp_used_tres[i] +=
						job_ptr->tres_alloc_cnt[i];
				}
				assoc_ptr->usage->grp_used_tres_run_secs[i] +=
					used_tres_run_secs[i];
				debug2("acct_policy_job_begin: after adding %pJ, assoc %u(%s/%s/%s) grp_used_tres_run_secs(%s) is %"PRIu64,
				       job_ptr, assoc_ptr->id, assoc_ptr->acct,
				       assoc_ptr->user, assoc_ptr->partition,
				       assoc_mgr_tres_name_array[i],
				       assoc_ptr->usage->
				       grp_used_tres_run_secs[i]);
			}
			break;
		case ACCT_POLICY_JOB_FINI:
			if (assoc_ptr->usage->used_jobs)
				assoc_ptr->usage->used_jobs--;
			else
				debug2("acct_policy_job_fini: used_jobs "
				       "underflow for account %s",
				       assoc_ptr->acct);
			_rm_usage_node_bitmap(
				job_ptr,
				assoc_ptr->usage->grp_node_bitmap,
				assoc_ptr->usage->grp_node_job_cnt,
				&assoc_ptr->usage->
				grp_used_tres[TRES_ARRAY_NODE]);
			for (i = 0; i < slurmctld_tres_cnt; i++) {
				if ((i == TRES_ARRAY_ENERGY) ||
				    (i == TRES_ARRAY_NODE))
					continue;
				if (job_ptr->tres_alloc_cnt[i] ==
				    NO_CONSUME_VAL64)
					continue;
				/* Subtract usage, zeroing on underflow */
				if (job_ptr->tres_alloc_cnt[i] >
				    assoc_ptr->usage->grp_used_tres[i]) {
					assoc_ptr->usage->grp_used_tres[i] = 0;
					debug2("acct_policy_job_fini: "
					       "grp_used_tres(%s) "
					       "underflow for assoc "
					       "%u(%s/%s/%s)",
					       assoc_mgr_tres_name_array[i],
					       assoc_ptr->id, assoc_ptr->acct,
					       assoc_ptr->user,
					       assoc_ptr->partition);
				} else {
					assoc_ptr->usage->grp_used_tres[i] -=
						job_ptr->tres_alloc_cnt[i];
				}
			}
			break;
		default:
			error("acct_policy: association unknown type %d", type);
			break;
		}
		/* now handle all the group limits of the parents */
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
	}
	/*
	 * Now that we are done with accrue set things back to the way
	 * it was qos wise. Accrue limits are always based on the
	 * highest priority QOS.
	 */
	if (orig_qos_ptr && (orig_qos_ptr != job_ptr->qos_ptr)) {
		job_ptr->qos_ptr = orig_qos_ptr;
		job_ptr->qos_id = orig_qos_ptr->id;
	}
	if (!assoc_locked)
		assoc_mgr_unlock(&locks);
}
/*
 * Fill in or clamp a job's time limit from a policy limit.
 *
 * If *time_limit is unset (NO_VAL), take limit_max_time when non-zero,
 * otherwise fall back to the partition's max time (which is INFINITE
 * when the partition imposes none), and mark *limit_set_time (when
 * provided) to record that a limit supplied the value.  If the time
 * limit was previously supplied by a limit, clamp it down to
 * limit_max_time.
 */
static void _set_time_limit(uint32_t *time_limit, uint32_t part_max_time,
			    uint32_t limit_max_time, uint16_t *limit_set_time)
{
	if (*time_limit != NO_VAL) {
		/* Already set: only clamp when a limit set it earlier. */
		if (limit_set_time && *limit_set_time &&
		    (*time_limit > limit_max_time))
			*time_limit = limit_max_time;
		return;
	}

	if (limit_max_time)
		*time_limit = limit_max_time;
	else
		/* part_max_time is itself INFINITE when unlimited, so
		 * this covers both fallback cases of the contract. */
		*time_limit = part_max_time;

	if (limit_set_time)
		*limit_set_time = 1;
}
/*
 * _qos_alter_job - correct a QOS's running TRES-seconds bookkeeping
 *	after a job's charged allocation changed.
 *
 * IN job_ptr - job that was altered
 * IN qos_ptr - QOS whose usage counters are being corrected
 * IN used_tres_run_secs - TRES-seconds previously charged for this job
 * IN new_used_tres_run_secs - TRES-seconds that should now be charged
 *
 * For each TRES the difference between the old and new charge is
 * removed from (or, when the job now needs more, added to) the QOS
 * group counter and the per-account and per-user used-limit records,
 * clamping at zero so the unsigned counters never underflow.
 */
static void _qos_alter_job(job_record_t *job_ptr,
			   slurmdb_qos_rec_t *qos_ptr,
			   uint64_t *used_tres_run_secs,
			   uint64_t *new_used_tres_run_secs)
{
	int i;
	slurmdb_used_limits_t *used_limits_a = NULL, *used_limits_u = NULL;

	if (!qos_ptr || !job_ptr)
		return;

	/* Per-account and per-user usage records tracked under this QOS. */
	used_limits_a = acct_policy_get_acct_used_limits(
		&qos_ptr->usage->acct_limit_list,
		job_ptr->assoc_ptr->acct);

	used_limits_u = acct_policy_get_user_used_limits(
		&qos_ptr->usage->user_limit_list,
		job_ptr->user_id);

	for (i=0; i<slurmctld_tres_cnt; i++) {
		if (used_tres_run_secs[i] == new_used_tres_run_secs[i])
			continue;
		/*
		 * Handle the case when remaining usage is less than
		 * the original job request.
		 */
		int64_t used_tres_run_sec_decr =
			used_tres_run_secs[i] -
			new_used_tres_run_secs[i];
		/*
		 * A negative decrement means the new charge is larger, so
		 * "subtracting" it grows the counter; otherwise subtract,
		 * clamping the counter at zero when the decrement exceeds
		 * what is recorded.
		 */
		if ((used_tres_run_sec_decr < 0) ||
		    (used_tres_run_sec_decr <
		     qos_ptr->usage->grp_used_tres_run_secs[i]))
			qos_ptr->usage->grp_used_tres_run_secs[i] -=
				used_tres_run_sec_decr;
		else
			qos_ptr->usage->grp_used_tres_run_secs[i] = 0;
		if ((used_tres_run_sec_decr < 0) ||
		    (used_tres_run_sec_decr <
		     used_limits_a->tres_run_secs[i]))
			used_limits_a->tres_run_secs[i] -=
				used_tres_run_sec_decr;
		else
			used_limits_a->tres_run_secs[i] = 0;
		if ((used_tres_run_sec_decr < 0) ||
		    (used_tres_run_sec_decr <
		     used_limits_u->tres_run_secs[i]))
			used_limits_u->tres_run_secs[i] -=
				used_tres_run_sec_decr;
		else
			used_limits_u->tres_run_secs[i] = 0;
		debug2("altering %pJ QOS %s got %"PRIu64" just removed %"PRIu64" and added %"PRIu64,
		       job_ptr, qos_ptr->name,
		       qos_ptr->usage->grp_used_tres_run_secs[i],
		       used_tres_run_secs[i],
		       new_used_tres_run_secs[i]);
	}
}
/*
* _validate_tres_limits_for_assoc - validate the tres requested against limits
 * of an association as well as a QOS, skipping any limit an admin set
*
* OUT - tres_pos - if false is returned position in array of failed limit
* IN - job_tres_array - count of various TRES requested by the job
* IN - divisor - divide the job_tres_array TRES by this variable, 0 if none
* IN - assoc_tres_array - TRES limits from an association (Grp, Max, Min)
* IN - qos_tres_array - TRES limits QOS has imposed already
* IN - acct_policy_limit_set_array - limits that have been overridden
* by an admin
* IN strict_checking - If a limit needs to be enforced now or not.
* IN update_call - If this is an update or a create call
* IN max_limit - Limits are for MAX else, the limits are MIN.
*
* RET - True if no limit is violated, false otherwise with tres_pos
* being set to the position of the failed limit.
*/
static bool _validate_tres_limits_for_assoc(
	int *tres_pos,
	uint64_t *job_tres_array,
	uint64_t divisor,
	uint64_t *assoc_tres_array,
	uint64_t *qos_tres_array,
	uint16_t *admin_set_limit_tres_array,
	bool strict_checking,
	bool update_call, bool max_limit)
{
	int i;

	if (!strict_checking)
		return true;

	for (i = 0; i < g_tres_count; i++) {
		uint64_t request;

		*tres_pos = i;

		/*
		 * Skip this TRES when an admin pinned the value, a QOS
		 * limit is already in force, the association imposes no
		 * limit, or the job requested nothing (on a submit).
		 */
		if ((admin_set_limit_tres_array[i] == ADMIN_SET_LIMIT) ||
		    (qos_tres_array[i] != INFINITE64) ||
		    (assoc_tres_array[i] == INFINITE64) ||
		    (!job_tres_array[i] && !update_call))
			continue;

		request = job_tres_array[i];
		if (divisor)
			request /= divisor;

		if (max_limit) {
			if (request > assoc_tres_array[i])
				return false;
		} else if (request < assoc_tres_array[i]) {
			return false;
		}
	}

	return true;
}
/*
* _validate_tres_limits_for_qos - validate the tres requested against limits
 * of a QOS, skipping any limit an admin set
*
* OUT - tres_pos - if false is returned position in array of failed limit
* IN - job_tres_array - count of various TRES requested by the job
* IN - divisor - divide the job_tres_array TRES by this variable, 0 if none
* IN - grp_tres_array - Grp TRES limits from QOS
* IN - max_tres_array - Max/Min TRES limits from QOS
* IN/OUT - out_grp_tres_array - Grp TRES limits QOS has imposed already,
* if a new limit is found the limit is filled in.
* IN/OUT - out_max_tres_array - Max/Min TRES limits QOS has imposed already,
* if a new limit is found the limit is filled in.
* IN - acct_policy_limit_set_array - limits that have been overridden
* by an admin
* IN strict_checking - If a limit needs to be enforced now or not.
* IN max_limit - Limits are for MAX else, the limits are MIN.
*
* RET - True if no limit is violated, false otherwise with tres_pos
* being set to the position of the failed limit.
*/
static bool _validate_tres_limits_for_qos(
	int *tres_pos,
	uint64_t *job_tres_array,
	uint64_t divisor,
	uint64_t *grp_tres_array,
	uint64_t *max_tres_array,
	uint64_t *out_grp_tres_array,
	uint64_t *out_max_tres_array,
	uint16_t *admin_set_limit_tres_array,
	bool strict_checking, bool max_limit)
{
	int i;

	if (!strict_checking)
		return true;

	for (i = 0; i < g_tres_count; i++) {
		uint64_t limit, out_limit, request;

		*tres_pos = i;

		/* The effective limit is the tighter of Grp and Max
		 * when a group array was supplied. */
		if (grp_tres_array) {
			limit = MIN(grp_tres_array[i], max_tres_array[i]);
			out_limit = MIN(out_grp_tres_array[i],
					out_max_tres_array[i]);
		} else {
			limit = max_tres_array[i];
			out_limit = out_max_tres_array[i];
		}

		/* we don't need to look at this limit */
		if ((admin_set_limit_tres_array[i] == ADMIN_SET_LIMIT) ||
		    (out_limit != INFINITE64) ||
		    (limit == INFINITE64) ||
		    (job_tres_array[i] == NO_VAL64))
			continue;

		/* record the limit as enforced for later checks */
		out_max_tres_array[i] = max_tres_array[i];

		request = job_tres_array[i];
		if (divisor)
			request /= divisor;

		if (out_grp_tres_array && grp_tres_array) {
			if (out_grp_tres_array[i] == INFINITE64)
				out_grp_tres_array[i] = grp_tres_array[i];

			if (max_limit) {
				if (request > grp_tres_array[i])
					return false;
			} else if (request < grp_tres_array[i]) {
				return false;
			}
		}

		if (max_limit) {
			if (request > max_tres_array[i])
				return false;
		} else if (request < max_tres_array[i]) {
			return false;
		}
	}

	return true;
}
/* Only check the time_limits if the admin didn't set
* the timelimit.
* It is important we look at these even if strict_checking
* isn't set so we get the correct time_limit from the job.
*/
static bool _validate_time_limit(uint32_t *time_limit_in,
				 uint32_t part_max_time,
				 uint64_t tres_req_cnt,
				 uint64_t max_limit,
				 void *out_max_limit,
				 uint16_t *limit_set_time,
				 bool strict_checking,
				 bool is64)
{
	uint32_t max_time_limit;

	if (!tres_req_cnt || (((*time_limit_in) != NO_VAL) &&
			      (!strict_checking ||
			       (*limit_set_time) == ADMIN_SET_LIMIT)))
		return true;

	/*
	 * Only dereference out_max_limit at its true width.  Reading it
	 * unconditionally through a uint64_t pointer would read past the
	 * end of a 32-bit object whenever is64 is false (an out-of-bounds
	 * read, undefined behavior).
	 */
	if (is64) {
		if (((*(uint64_t *)out_max_limit) != INFINITE64) ||
		    (max_limit == INFINITE64) ||
		    (tres_req_cnt == NO_VAL64))
			return true;
	} else {
		if (((*(uint32_t *)out_max_limit) != INFINITE) ||
		    ((uint32_t)max_limit == INFINITE) ||
		    ((uint32_t)tres_req_cnt == NO_VAL))
			return true;
	}

	/* tres_req_cnt is non-zero here (checked above), so the divide
	 * is safe: limit is TRES-minutes, so minutes = limit / TRES. */
	max_time_limit = (uint32_t)(max_limit / tres_req_cnt);

	_set_time_limit(time_limit_in, part_max_time, max_time_limit,
			limit_set_time);

	/* record the limit as enforced at the caller-selected width */
	if (is64)
		(*(uint64_t *)out_max_limit) = max_limit;
	else
		(*(uint32_t *)out_max_limit) = (uint32_t)max_limit;

	if ((*time_limit_in) > max_time_limit)
		return false;

	return true;
}
/*
* _validate_tres_time_limits - validate the tres requested
* against limits of an association as well as qos skipping any limit
* an admin set
*
* OUT - tres_pos - if false is returned position in array of failed limit
* IN/OUT - time_limit_in - Job's time limit, set and returned based off limits
* if none is given.
* IN - part_max_time - Job's partition max time limit
* IN - job_tres_array - count of various TRES requested by the job
* IN - max_tres_array - Max TRES limits of association/QOS
* OUT - out_max_tres_array - Max TRES limits as set by the various TRES
* OUT - limit_set_time - set if the time_limit was set by a limit QOS/Assoc or
* otherwise.
* IN strict_checking - If a limit needs to be enforced now or not.
*
* RET - True if no limit is violated, false otherwise with tres_pos
* being set to the position of the failed limit.
*/
static bool _validate_tres_time_limits(
	int *tres_pos,
	uint32_t *time_limit_in,
	uint32_t part_max_time,
	uint64_t *job_tres_array,
	uint64_t *max_tres_array,
	uint64_t *out_max_tres_array,
	uint16_t *limit_set_time,
	bool strict_checking)
{
	int i;

	/* Nothing to enforce unless strict checking is on and an admin
	 * did not pin the time limit. */
	if (!strict_checking || (*limit_set_time) == ADMIN_SET_LIMIT)
		return true;

	/*
	 * Check each TRES; _validate_time_limit() may also derive the
	 * job's time limit from the TRES-minutes limit as a side effect.
	 */
	for (i = 0; i < g_tres_count; i++) {
		(*tres_pos) = i;

		if (!_validate_time_limit(time_limit_in, part_max_time,
					  job_tres_array[i],
					  max_tres_array[i],
					  &out_max_tres_array[i],
					  limit_set_time,
					  strict_checking, true))
			return false;
	}

	return true;
}
/*
* _validate_tres_usage_limits - validate the TRES requested against
* specified limits; when checking for safe limits, also take into
* consideration already used and currently running TRES resources
*
* OUT - tres_pos - if function returns other than TRES_USAGE_OKAY,
* position in TRES array of failed limit
* IN - tres_limit_array - count of various TRES limits to check against
* OUT - out_tres_limit_array - optional; assigned values from tres_limit_array
* when out_tres_limit_set is true,
* skipped when any of:
* 1) admin_limit_set is set and is an admin
* limit
* 2) out_tres_limit_array is set and its value
* has been changed since initially being set
* to INFINITE64
* 3) tres_limit_array is INFINITE64
* IN - tres_req_cnt - must be set; the following is checked with tres_req_cnt:
* 1) safe_limits && tres_req_cnt > tres_limit_array,
* return TRES_USAGE_REQ_EXCEEDS_LIMIT
* 2) when safe_limits and tres_usage are set:
* (tres_req_cnt + tres_usage) >
* (tres_limit_array - curr_usage),
* return TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE
* curr_usage will be 0 when not passed
* IN - tres_usage - TRES (currently running if curr_usage is set, already used
* otherwise) optional; This value is used only if
* safe_limits is true. It will be added to tres_req_cnt to
* count as extra time to observe, see tres_req_cnt section
* above for tres_usage interaction
* IN - curr_usage - TRES (already used) optional; when set, check if:
* 1) curr_usage > tres_limit_array && tres_req_cnt
* return TRES_USAGE_CUR_EXCEEDS_LIMIT
* 2) when safe_limits is true, see tres_req_cnt section
* above for curr_usage interaction
* IN - admin_limit_set - limits that have been overridden by an admin, see
* out_tres_limit_array section above for interaction
* IN - safe_limits - see tres_req_cnt section above for interaction
* IN - out_tres_limit_set - out_tres_limit_array is set as described above
* when true; out_tres_limit_array is not modified when false
* RET - TRES_USAGE_OKAY if no limit is violated, otherwise one of the other
* acct_policy_tres_usage_t enumerations with tres_pos being set to the
* position of the failed limit.
*/
static acct_policy_tres_usage_t _validate_tres_usage_limits(
	int *tres_pos,
	uint64_t *tres_limit_array,
	uint64_t *out_tres_limit_array,
	uint64_t *tres_req_cnt,
	uint64_t *tres_usage,
	uint64_t *curr_usage,
	uint16_t *admin_limit_set,
	bool safe_limits,
	bool out_tres_limit_set)
{
	int i;

	xassert(tres_limit_array);
	xassert(tres_req_cnt);

	for (i = 0; i < g_tres_count; i++) {
		uint64_t limit = tres_limit_array[i];
		uint64_t already_used;

		*tres_pos = i;

		/* Skip: admin override, a limit already recorded by an
		 * earlier check, or no limit configured for this TRES. */
		if (admin_limit_set &&
		    (admin_limit_set[i] == ADMIN_SET_LIMIT))
			continue;
		if (out_tres_limit_array &&
		    (out_tres_limit_array[i] != INFINITE64))
			continue;
		if (limit == INFINITE64)
			continue;

		if (out_tres_limit_set && out_tres_limit_array)
			out_tres_limit_array[i] = limit;

		if (curr_usage && tres_req_cnt[i] &&
		    (curr_usage[i] >= limit))
			return TRES_USAGE_CUR_EXCEEDS_LIMIT;

		if (!safe_limits)
			continue;

		if (tres_req_cnt[i] > limit)
			return TRES_USAGE_REQ_EXCEEDS_LIMIT;

		/* would the request plus running usage overrun what is
		 * left of the limit after past usage? */
		already_used = curr_usage ? curr_usage[i] : 0;
		if (tres_usage && tres_req_cnt[i] &&
		    ((tres_req_cnt[i] + tres_usage[i]) >
		     (limit - already_used)))
			return TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE;
	}

	return TRES_USAGE_OKAY;
}
/*
* _validate_tres_usage_limits_for_qos - validate the tres requested
 * against limits of a QOS, skipping any limit an admin set
*
* OUT - tres_pos - if false is returned position in array of failed limit
* IN - tres_limit_array - TRES limits from an association
* IN/OUT - out_tres_limit_array - TRES limits QOS has imposed already, if a new
* limit is found the limit is filled in.
* IN - tres_req_cnt - TRES requested from the job
* IN - tres_usage - TRES usage from the QOS (in minutes)
* IN - curr_usage - TRES usage in use right now by the QOS (running jobs)
* IN - admin_limit_set - TRES limits that have been overridden by an admin
* IN - safe_limits - if the safe flag was set on AccountingStorageEnforce
*
* RET - True if no limit is violated, false otherwise with tres_pos
* being set to the position of the failed limit.
*/
static acct_policy_tres_usage_t _validate_tres_usage_limits_for_qos(
	int *tres_pos,
	uint64_t *tres_limit_array,
	uint64_t *out_tres_limit_array,
	uint64_t *tres_req_cnt,
	uint64_t *tres_usage,
	uint64_t *curr_usage,
	uint16_t *admin_limit_set,
	bool safe_limits)
{
	/* QOS checks record each enforced limit into
	 * out_tres_limit_array (final flag true). */
	return _validate_tres_usage_limits(
		tres_pos, tres_limit_array, out_tres_limit_array,
		tres_req_cnt, tres_usage, curr_usage,
		admin_limit_set, safe_limits, true);
}
/*
* _validate_tres_usage_limits_for_assoc - validate the tres requested
* against limits of an association as well as qos skipping any limit
* an admin set
*
* OUT - tres_pos - if false is returned position in array of failed limit
* IN - tres_limit_array - TRES limits from an association
* IN - qos_tres_limit_array - TRES limits QOS has imposed already
* IN - tres_req_cnt - TRES requested from the job
* IN - tres_usage - TRES usage from the association (in minutes)
* IN - curr_usage - TRES usage in use right now by the assoc (running jobs)
* IN - admin_limit_set - TRES limits that have been overridden by an admin
* IN - safe_limits - if the safe flag was set on AccountingStorageEnforce
*
* RET - True if no limit is violated, false otherwise with tres_pos
* being set to the position of the failed limit.
*/
static acct_policy_tres_usage_t _validate_tres_usage_limits_for_assoc(
	int *tres_pos,
	uint64_t *tres_limit_array,
	uint64_t *qos_tres_limit_array,
	uint64_t *tres_req_cnt,
	uint64_t *tres_usage,
	uint64_t *curr_usage,
	uint16_t *admin_limit_set,
	bool safe_limits)
{
	/* Association checks only consult the QOS-recorded limits; they
	 * do not write them back (final flag false). */
	return _validate_tres_usage_limits(
		tres_pos, tres_limit_array, qos_tres_limit_array,
		tres_req_cnt, tres_usage, curr_usage,
		admin_limit_set, safe_limits, false);
}
/*
 * _qos_policy_validate - validate a job request against one QOS's
 *	limits at submit or update time.
 *
 * Each limit checked here is recorded into qos_out_ptr so that a
 * subsequent QOS check (e.g. a partition QOS) does not apply it again.
 *
 * IN job_desc - job submit/update request
 * IN assoc_ptr - association the job would run under
 * IN part_ptr - partition the job would run in (supplies max_time)
 * IN qos_ptr - QOS whose limits are being checked
 * IN/OUT qos_out_ptr - record of limits already enforced
 * OUT reason - optional; set to the wait reason on failure
 * IN/OUT acct_policy_limit_set - which limits were set by admin/limits
 * IN update_call - true if this is a job update rather than a submit
 * IN user_name - requesting user's name, for logging only
 * IN job_cnt - number of jobs being submitted
 * IN strict_checking - whether limits must be enforced now
 *
 * RET true if no limit is violated, false otherwise.
 */
static int _qos_policy_validate(job_desc_msg_t *job_desc,
				slurmdb_assoc_rec_t *assoc_ptr,
				part_record_t *part_ptr,
				slurmdb_qos_rec_t *qos_ptr,
				slurmdb_qos_rec_t *qos_out_ptr,
				uint32_t *reason,
				acct_policy_limit_set_t *acct_policy_limit_set,
				bool update_call,
				char *user_name,
				int job_cnt,
				bool strict_checking)
{
	int rc = true;
	int tres_pos = 0;

	if (!qos_ptr || !qos_out_ptr)
		return rc;

	if (!_validate_tres_limits_for_qos(&tres_pos,
					   job_desc->tres_req_cnt, 0,
					   NULL,
					   qos_ptr->max_tres_pa_ctld,
					   NULL,
					   qos_out_ptr->max_tres_pa_ctld,
					   acct_policy_limit_set->tres,
					   strict_checking, 1)) {
		if (job_desc->tres_req_cnt[tres_pos] >
		    qos_ptr->max_tres_pa_ctld[tres_pos]) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);

			debug2("job submit for user %s(%u): "
			       "min tres(%s) request %"PRIu64" exceeds "
			       "per-acct max tres limit %"PRIu64" for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_desc->tres_req_cnt[tres_pos],
			       qos_ptr->max_tres_pa_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	if (!_validate_tres_limits_for_qos(&tres_pos,
					   job_desc->tres_req_cnt, 0,
					   qos_ptr->grp_tres_ctld,
					   qos_ptr->max_tres_pu_ctld,
					   qos_out_ptr->grp_tres_ctld,
					   qos_out_ptr->max_tres_pu_ctld,
					   acct_policy_limit_set->tres,
					   strict_checking, 1)) {
		if (job_desc->tres_req_cnt[tres_pos] >
		    qos_ptr->max_tres_pu_ctld[tres_pos]) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_QOS_MAX_UNK_PER_USER);

			debug2("job submit for user %s(%u): "
			       "min tres(%s) request %"PRIu64" exceeds "
			       "per-user max tres limit %"PRIu64" for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_desc->tres_req_cnt[tres_pos],
			       qos_ptr->max_tres_pu_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		} else if (job_desc->tres_req_cnt[tres_pos] >
			   qos_ptr->grp_tres_ctld[tres_pos]) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_QOS_GRP_UNK);

			debug2("job submit for user %s(%u): "
			       "min tres(%s) request %"PRIu64" exceeds "
			       "group max tres limit %"PRIu64" for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_desc->tres_req_cnt[tres_pos],
			       qos_ptr->grp_tres_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	/* for validation we don't need to look at
	 * qos_ptr->grp_jobs.
	 */

	if ((qos_out_ptr->grp_submit_jobs == INFINITE) &&
	    (qos_ptr->grp_submit_jobs != INFINITE)) {

		qos_out_ptr->grp_submit_jobs = qos_ptr->grp_submit_jobs;

		if ((qos_ptr->usage->grp_used_submit_jobs + job_cnt)
		    > qos_ptr->grp_submit_jobs) {
			if (reason)
				*reason = WAIT_QOS_GRP_SUB_JOB;
			debug2("job submit for user %s(%u): group max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       qos_ptr->grp_submit_jobs,
			       qos_ptr->usage->grp_used_submit_jobs, job_cnt,
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	/* Only check the time_limits if the admin didn't set the timelimit.
	 * It is important we look at these even if strict_checking
	 * isn't set so we get the correct time_limit from the job.
	 */
	if (acct_policy_limit_set->time != ADMIN_SET_LIMIT) {
		if (!_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    qos_ptr->max_tres_mins_pj_ctld,
			    qos_out_ptr->max_tres_mins_pj_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_QOS_MAX_UNK_MINS_PER_JOB);
			debug2("job submit for user %s(%u): "
			       "tres(%s) time limit request %"PRIu64" "
			       "exceeds max per-job limit %"PRIu64" "
			       "for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       qos_ptr->max_tres_mins_pj_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}

		if (!_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    qos_ptr->grp_tres_mins_ctld,
			    qos_out_ptr->grp_tres_mins_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_QOS_GRP_UNK_MIN);
			debug2("job submit for user %s(%u): "
			       "tres(%s) time limit request %"PRIu64" "
			       "exceeds group max limit %"PRIu64" "
			       "for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       qos_ptr->grp_tres_mins_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}

		if (!_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    qos_ptr->grp_tres_run_mins_ctld,
			    qos_out_ptr->grp_tres_run_mins_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
			debug2("job submit for user %s(%u): "
			       "tres(%s) time limit request %"PRIu64" "
			       "exceeds group max running limit %"PRIu64" "
			       "for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       qos_ptr->grp_tres_run_mins_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}

		if (!_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    qos_ptr->max_tres_run_mins_pa_ctld,
			    qos_out_ptr->max_tres_run_mins_pa_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT);
			/* Note: spaces around the PRIu64 segments were
			 * missing here, garbling the log output. */
			debug2("job submit for user %s(%u): tres(%s) time limit request %"PRIu64" exceeds account max running limit %"PRIu64" for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       qos_ptr->max_tres_run_mins_pa_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}

		if (!_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    qos_ptr->max_tres_run_mins_pu_ctld,
			    qos_out_ptr->max_tres_run_mins_pu_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER);
			/* Note: spaces around the PRIu64 segments were
			 * missing here, garbling the log output. */
			debug2("job submit for user %s(%u): tres(%s) time limit request %"PRIu64" exceeds user max running limit %"PRIu64" for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       qos_ptr->max_tres_run_mins_pu_ctld[tres_pos],
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}

		if ((qos_out_ptr->max_wall_pj == INFINITE) &&
		    (qos_ptr->max_wall_pj != INFINITE) &&
		    (!update_call || (job_desc->time_limit != NO_VAL))) {
			_set_time_limit(&job_desc->time_limit,
					part_ptr->max_time,
					qos_ptr->max_wall_pj,
					&acct_policy_limit_set->time);

			qos_out_ptr->max_wall_pj = qos_ptr->max_wall_pj;

			if (strict_checking
			    && job_desc->time_limit > qos_ptr->max_wall_pj) {
				if (reason)
					*reason = WAIT_QOS_MAX_WALL_PER_JOB;
				debug2("job submit for user %s(%u): "
				       "time limit %u exceeds qos max %u",
				       user_name,
				       job_desc->user_id,
				       job_desc->time_limit,
				       qos_ptr->max_wall_pj);
				rc = false;
				goto end_it;
			}
		}

		if ((qos_out_ptr->grp_wall == INFINITE) &&
		    (qos_ptr->grp_wall != INFINITE) &&
		    (!update_call || (job_desc->time_limit != NO_VAL))) {
			_set_time_limit(&job_desc->time_limit,
					part_ptr->max_time,
					qos_ptr->grp_wall,
					&acct_policy_limit_set->time);

			qos_out_ptr->grp_wall = qos_ptr->grp_wall;

			if (strict_checking
			    && job_desc->time_limit > qos_ptr->grp_wall) {
				if (reason)
					*reason = WAIT_QOS_GRP_WALL;
				debug2("job submit for user %s(%u): "
				       "time limit %u exceeds qos grp max %u",
				       user_name,
				       job_desc->user_id,
				       job_desc->time_limit,
				       qos_ptr->grp_wall);
				rc = false;
				goto end_it;
			}
		}
	}

	if (!_validate_tres_limits_for_qos(&tres_pos,
					   job_desc->tres_req_cnt, 0,
					   NULL,
					   qos_ptr->max_tres_pj_ctld,
					   NULL,
					   qos_out_ptr->max_tres_pj_ctld,
					   acct_policy_limit_set->tres,
					   strict_checking, 1)) {
		if (reason)
			*reason = _get_tres_state_reason(
				tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);

		debug2("job submit for user %s(%u): "
		       "min tres(%s) request %"PRIu64" exceeds "
		       "per-job max tres limit %"PRIu64" for qos '%s'",
		       user_name,
		       job_desc->user_id,
		       assoc_mgr_tres_name_array[tres_pos],
		       job_desc->tres_req_cnt[tres_pos],
		       qos_ptr->max_tres_pj_ctld[tres_pos],
		       qos_ptr->name);
		rc = false;
		goto end_it;
	}

	if (!_validate_tres_limits_for_qos(&tres_pos,
					   job_desc->tres_req_cnt,
					   job_desc->tres_req_cnt[
						   TRES_ARRAY_NODE],
					   NULL,
					   qos_ptr->max_tres_pn_ctld,
					   NULL,
					   qos_out_ptr->max_tres_pn_ctld,
					   acct_policy_limit_set->tres,
					   strict_checking, 1)) {
		if (reason)
			*reason = _get_tres_state_reason(
				tres_pos, WAIT_QOS_MAX_UNK_PER_NODE);

		debug2("job submit for user %s(%u): "
		       "min tres(%s) request %"PRIu64" exceeds "
		       "per-node max tres limit %"PRIu64" for qos '%s'",
		       user_name,
		       job_desc->user_id,
		       assoc_mgr_tres_name_array[tres_pos],
		       job_desc->tres_req_cnt[tres_pos] /
		       job_desc->tres_req_cnt[TRES_ARRAY_NODE],
		       qos_ptr->max_tres_pn_ctld[tres_pos],
		       qos_ptr->name);
		rc = false;
		goto end_it;
	}

	/* for validation we don't need to look at
	 * qos_ptr->max_jobs.
	 */

	/* we don't need to check min_tres_pj here */

	if ((qos_out_ptr->max_submit_jobs_pa == INFINITE) &&
	    (qos_ptr->max_submit_jobs_pa != INFINITE)) {
		slurmdb_used_limits_t *used_limits =
			acct_policy_get_acct_used_limits(
				&qos_ptr->usage->acct_limit_list,
				assoc_ptr->acct);

		qos_out_ptr->max_submit_jobs_pa = qos_ptr->max_submit_jobs_pa;

		if ((used_limits->submit_jobs + job_cnt) >
		    qos_ptr->max_submit_jobs_pa) {
			if (reason)
				*reason = WAIT_QOS_MAX_SUB_JOB_PER_ACCT;
			debug2("job submit for account %s: qos max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
			       assoc_ptr->acct,
			       qos_ptr->max_submit_jobs_pa,
			       used_limits->submit_jobs, job_cnt,
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	if ((qos_out_ptr->max_submit_jobs_pu == INFINITE) &&
	    (qos_ptr->max_submit_jobs_pu != INFINITE)) {
		slurmdb_used_limits_t *used_limits =
			acct_policy_get_user_used_limits(
				&qos_ptr->usage->user_limit_list,
				job_desc->user_id);

		qos_out_ptr->max_submit_jobs_pu = qos_ptr->max_submit_jobs_pu;

		if ((used_limits->submit_jobs + job_cnt) >
		    qos_ptr->max_submit_jobs_pu) {
			if (reason)
				*reason = WAIT_QOS_MAX_SUB_JOB;
			debug2("job submit for user %s(%u): qos max submit job limit exceeded %u (used:%u + requested:%d) for qos '%s'",
			       user_name,
			       job_desc->user_id,
			       qos_ptr->max_submit_jobs_pu,
			       used_limits->submit_jobs, job_cnt,
			       qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	if (!_validate_tres_limits_for_qos(&tres_pos,
					   job_desc->tres_req_cnt, 0,
					   NULL,
					   qos_ptr->min_tres_pj_ctld,
					   NULL,
					   qos_out_ptr->min_tres_pj_ctld,
					   acct_policy_limit_set->tres,
					   strict_checking, 0)) {
		if (reason)
			*reason = _get_tres_state_reason(
				tres_pos, WAIT_QOS_MIN_UNK);

		debug2("job submit for user %s(%u): "
		       "min tres(%s) request %"PRIu64" exceeds "
		       "per-job max tres limit %"PRIu64" for qos '%s'",
		       user_name,
		       job_desc->user_id,
		       assoc_mgr_tres_name_array[tres_pos],
		       job_desc->tres_req_cnt[tres_pos],
		       qos_ptr->min_tres_pj_ctld[tres_pos],
		       qos_ptr->name);
		rc = false;
		goto end_it;
	}

end_it:
	return rc;
}
/*
 * _qos_job_runnable_pre_select - check whether a job may start under a
 *	QOS's limits that can be evaluated before node selection
 *	(job counts and wall-clock limits; TRES counts are checked in
 *	the post-select pass).
 *
 * IN job_ptr - job being scheduled
 * IN qos_ptr - QOS whose limits are being checked
 * IN/OUT qos_out_ptr - limits already enforced by a previous QOS; each
 *	limit checked here is recorded so it is not applied twice
 *
 * RET true if the job can run, false if it must wait (and
 *	job_ptr->state_reason is set accordingly)
 */
static int _qos_job_runnable_pre_select(job_record_t *job_ptr,
					slurmdb_qos_rec_t *qos_ptr,
					slurmdb_qos_rec_t *qos_out_ptr)
{
	uint32_t wall_mins;
	uint32_t time_limit = NO_VAL;	/* computed lazily below */
	int rc = true;
	slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
	bool safe_limits = false;
	slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;

	if (!qos_ptr || !qos_out_ptr || !assoc_ptr)
		return rc;

	/*
	 * check to see if we should be using safe limits, if so we
	 * will only start a job if there are sufficient remaining
	 * cpu-minutes for it to run to completion
	 */
	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
		safe_limits = true;

	/* group wall usage is tracked in seconds; the limit is minutes */
	wall_mins = qos_ptr->usage->grp_used_wall / 60;

	used_limits_a = acct_policy_get_acct_used_limits(
		&qos_ptr->usage->acct_limit_list,
		assoc_ptr->acct);

	used_limits = acct_policy_get_user_used_limits(
		&qos_ptr->usage->user_limit_list,
		job_ptr->user_id);

	/* we don't need to check grp_tres_mins here */

	/* we don't need to check grp_tres here */

	/* we don't need to check grp_mem here */
	if ((qos_out_ptr->grp_jobs == INFINITE) &&
	    (qos_ptr->grp_jobs != INFINITE)) {

		qos_out_ptr->grp_jobs = qos_ptr->grp_jobs;

		if (qos_ptr->usage->grp_used_jobs >= qos_ptr->grp_jobs) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_QOS_GRP_JOB;
			debug2("%pJ being held, the job is at or exceeds group max jobs limit %u with %u for QOS %s",
			       job_ptr, qos_ptr->grp_jobs,
			       qos_ptr->usage->grp_used_jobs, qos_ptr->name);

			rc = false;
			goto end_it;
		}
	}

	/* we don't need to check grp_submit_jobs here */

	/* we don't need to check grp_tres_run_mins here */

	/* we don't need to check grp_nodes here */

	/* we don't need to check submit_jobs here */

	if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
	    && (qos_out_ptr->grp_wall == INFINITE)
	    && (qos_ptr->grp_wall != INFINITE)) {
		/* Derive the job's effective time limit once; later
		 * checks reuse it if still NO_VAL. */
		if (time_limit == NO_VAL) {
			time_limit = job_ptr->time_limit;
			_set_time_limit(&time_limit,
					job_ptr->part_ptr->max_time,
					MIN(qos_ptr->grp_wall,
					    qos_ptr->max_wall_pj),
					&job_ptr->limit_set.time);

			/* Account for usage factor, if necessary */
			if ((job_ptr->qos_ptr &&
			     (job_ptr->qos_ptr->flags &
			      QOS_FLAG_USAGE_FACTOR_SAFE) &&
			     (job_ptr->qos_ptr->usage_factor >= 0)) &&
			    ((time_limit != INFINITE) ||
			     (job_ptr->qos_ptr->usage_factor < 1.0))) {
				time_limit *= job_ptr->qos_ptr->usage_factor;
			}
		}

		qos_out_ptr->grp_wall = qos_ptr->grp_wall;

		if (wall_mins >= qos_ptr->grp_wall) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_QOS_GRP_WALL;
			debug2("%pJ being held, the job is at or exceeds group wall limit %u with %u for QOS %s",
			       job_ptr, qos_ptr->grp_wall,
			       wall_mins, qos_ptr->name);
			rc = false;
			goto end_it;
		} else if (safe_limits &&
			   ((wall_mins + time_limit) > qos_ptr->grp_wall)) {
			/* safe limits: refuse to start a job that could
			 * not finish within the remaining group wall */
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_QOS_GRP_WALL;
			debug2("%pJ being held, the job request will exceed group wall limit %u if ran with %u for QOS %s",
			       job_ptr, qos_ptr->grp_wall,
			       wall_mins + time_limit, qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	/* we don't need to check max_tres_mins_pj here */

	/* we don't need to check max_tres_pj here */

	/* we don't need to check max_tres_pn here */

	/* we don't need to check min_tres_pj here */

	/* we don't need to check max_tres_pa here */

	/* we don't need to check max_tres_pu here */

	/* we don't need to check max_tres_run_mins_pa here */

	/* we don't need to check max_tres_run_mins_pu here */

	if ((qos_out_ptr->max_jobs_pa == INFINITE)
	    && (qos_ptr->max_jobs_pa != INFINITE)) {

		qos_out_ptr->max_jobs_pa = qos_ptr->max_jobs_pa;

		if (used_limits_a->jobs >= qos_ptr->max_jobs_pa) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason =
				WAIT_QOS_MAX_JOB_PER_ACCT;
			debug2("%pJ being held, the job is at or exceeds max jobs per-acct (%s) limit %u with %u for QOS %s",
			       job_ptr, used_limits_a->acct,
			       qos_ptr->max_jobs_pa,
			       used_limits_a->jobs, qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	if ((qos_out_ptr->max_jobs_pu == INFINITE)
	    && (qos_ptr->max_jobs_pu != INFINITE)) {

		qos_out_ptr->max_jobs_pu = qos_ptr->max_jobs_pu;

		if (used_limits->jobs >= qos_ptr->max_jobs_pu) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason =
				WAIT_QOS_MAX_JOB_PER_USER;
			debug2("%pJ being held, the job is at or exceeds max jobs per-user limit %u with %u for QOS %s",
			       job_ptr, qos_ptr->max_jobs_pu,
			       used_limits->jobs, qos_ptr->name);
			rc = false;
			goto end_it;
		}
	}

	/* we don't need to check submit_jobs_pa here */

	/* we don't need to check submit_jobs_pu here */

	/*
	 * if the QOS limits have changed since job
	 * submission and job can not run, then kill it
	 */
	if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
	    && (qos_out_ptr->max_wall_pj == INFINITE)
	    && (qos_ptr->max_wall_pj != INFINITE)) {
		if (time_limit == NO_VAL) {
			time_limit = job_ptr->time_limit;
			_set_time_limit(&time_limit,
					job_ptr->part_ptr->max_time,
					qos_ptr->max_wall_pj,
					&job_ptr->limit_set.time);
		}

		/* Account for usage factor, if necessary */
		/*
		 * NOTE(review): unlike the grp_wall check above, this
		 * adjustment is not guarded by (time_limit == NO_VAL),
		 * so a time_limit already computed (and scaled) by the
		 * grp_wall check may be scaled again -- confirm this is
		 * intended.
		 */
		if ((job_ptr->qos_ptr &&
		     (job_ptr->qos_ptr->flags &
		      QOS_FLAG_USAGE_FACTOR_SAFE) &&
		     (job_ptr->qos_ptr->usage_factor >= 0)) &&
		    ((time_limit != INFINITE) ||
		     (job_ptr->qos_ptr->usage_factor < 1.0))) {
			time_limit *= job_ptr->qos_ptr->usage_factor;
		}

		qos_out_ptr->max_wall_pj = qos_ptr->max_wall_pj;

		if (time_limit > qos_out_ptr->max_wall_pj) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason =
				WAIT_QOS_MAX_WALL_PER_JOB;
			debug2("%pJ being held, time limit %u exceeds QOS max wall pj %u",
			       job_ptr, time_limit, qos_out_ptr->max_wall_pj);
			rc = false;
			goto end_it;
		}
	}

end_it:
	return rc;
}
/*
 * Check a job's TRES request against the usage-based limits of the given
 * QOS after resource selection (so node counts and TRES-minute totals are
 * known).
 *
 * IN job_ptr - job being evaluated
 * IN qos_ptr - QOS whose limits are enforced here
 * IN/OUT qos_out_ptr - accumulator of the most restrictive limit already
 *	applied by a higher-priority QOS; a limit is only enforced here if
 *	the corresponding field is still unset (INFINITE) in qos_out_ptr
 * IN tres_req_cnt - TRES counts requested by the job
 * IN job_tres_time_limit - requested TRES counts multiplied by the job's
 *	time limit (TRES-minutes the job would consume)
 * RET true if the job may run; false if it must be held, in which case
 *	job_ptr->state_reason and job_ptr->qos_blocking_ptr are set
 */
static int _qos_job_runnable_post_select(job_record_t *job_ptr,
					 slurmdb_qos_rec_t *qos_ptr,
					 slurmdb_qos_rec_t *qos_out_ptr,
					 uint64_t *tres_req_cnt,
					 uint64_t *job_tres_time_limit)
{
	uint64_t tres_usage_mins[slurmctld_tres_cnt];
	uint64_t tres_run_mins[slurmctld_tres_cnt];
	uint64_t tres_run_mins_pa[slurmctld_tres_cnt];
	uint64_t tres_run_mins_pu[slurmctld_tres_cnt];
	uint64_t orig_node_cnt;
	slurmdb_used_limits_t *used_limits = NULL, *used_limits_a = NULL;
	bool safe_limits = false;
	int rc = true, i, tres_pos = 0;
	acct_policy_tres_usage_t tres_usage;
	slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
	double usage_factor = 1.0;

	if (!qos_ptr || !qos_out_ptr || !assoc_ptr)
		return rc;

	/*
	 * check to see if we should be using safe limits, if so we will only
	 * start a job if there are sufficient remaining cpu-minutes for it
	 * to run to completion
	 */
	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
		safe_limits = true;

	/* per-account and per-user usage records under this QOS */
	used_limits_a = acct_policy_get_acct_used_limits(
		&qos_ptr->usage->acct_limit_list,
		assoc_ptr->acct);

	used_limits = acct_policy_get_user_used_limits(
		&qos_ptr->usage->user_limit_list,
		job_ptr->user_id);

	/* clang needs this memset to avoid a warning */
	memset(tres_run_mins, 0, sizeof(tres_run_mins));
	memset(tres_run_mins_pa, 0, sizeof(tres_run_mins_pa));
	memset(tres_run_mins_pu, 0, sizeof(tres_run_mins_pu));
	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));

	if (job_ptr->qos_ptr &&
	    (job_ptr->qos_ptr->usage_factor >= 0))
		usage_factor = job_ptr->qos_ptr->usage_factor;

	for (i=0; i<slurmctld_tres_cnt; i++) {
		tres_run_mins[i] =
			qos_ptr->usage->grp_used_tres_run_secs[i] / 60;
		tres_run_mins_pa[i] =
			used_limits_a->tres_run_secs[i] / 60;
		tres_run_mins_pu[i] =
			used_limits->tres_run_secs[i] / 60;
		tres_usage_mins[i] =
			(uint64_t)(qos_ptr->usage->usage_tres_raw[i] / 60.0);

		/*
		 * Clear usage if factor is 0 so that jobs can run. Otherwise
		 * multiplying can cause more jobs to be run than the limit
		 * allows (e.g. usagefactor=.5).
		 */
		if (usage_factor == 0.0) {
			tres_run_mins[i] *= usage_factor;
			tres_run_mins_pa[i] *= usage_factor;
			/*
			 * BUGFIX: the per-user running minutes must be
			 * cleared as well, so the MaxTRESRunMinsPU check
			 * below is consistent with the group and per-acct
			 * checks when UsageFactor is 0.
			 */
			tres_run_mins_pu[i] *= usage_factor;
			tres_usage_mins[i] *= usage_factor;
		}
	}

	/* GrpTRESMins: group TRES-minutes (usage plus this job's demand) */
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos, qos_ptr->grp_tres_mins_ctld,
		qos_out_ptr->grp_tres_mins_ctld, job_tres_time_limit,
		tres_run_mins, tres_usage_mins, job_ptr->limit_set.tres,
		safe_limits);
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK_MIN);
		debug2("%pJ being held, QOS %s group max tres(%s) minutes limit of %"PRIu64" is already at or exceeded with %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->grp_tres_mins_ctld[tres_pos],
		       tres_usage_mins[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK_MIN);
		debug2("%pJ being held, the job is requesting more than allowed with QOS %s's group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->grp_tres_mins_ctld[tres_pos],
		       job_tres_time_limit[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		/*
		 * If we're using safe limits start
		 * the job only if there are
		 * sufficient cpu-mins left such that
		 * it will run to completion without
		 * being killed
		 */
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK_MIN);
		debug2("%pJ being held, the job is at or exceeds QOS %s's group max tres(%s) minutes of %"PRIu64" of which %"PRIu64" are still available but request is for %"PRIu64" (plus %"PRIu64" already in use) tres minutes (request tres count %"PRIu64")",
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->grp_tres_mins_ctld[tres_pos],
		       qos_ptr->grp_tres_mins_ctld[tres_pos] -
		       tres_usage_mins[tres_pos],
		       job_tres_time_limit[tres_pos],
		       tres_run_mins[tres_pos],
		       tres_req_cnt[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/*
	 * If the job's CPU limit wasn't administratively set and the QOS
	 * has a GrpCPU limit, cancel the job if its minimum CPU requirement
	 * has exceeded the limit for all CPUs usable by the QOS
	 */
	orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
	/* only count nodes not already allocated to this QOS's jobs */
	_get_unique_job_node_cnt(job_ptr, qos_ptr->usage->grp_node_bitmap,
				 &tres_req_cnt[TRES_ARRAY_NODE]);
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos,
		qos_ptr->grp_tres_ctld, qos_out_ptr->grp_tres_ctld,
		tres_req_cnt, qos_ptr->usage->grp_used_tres,
		NULL, job_ptr->limit_set.tres, true);
	tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible because the curr_usage sent in is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK);
		debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds group max tres limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       tres_req_cnt[tres_pos],
		       qos_ptr->grp_tres_ctld[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK);
		debug2("%pJ being held, if allowed the job request will exceed QOS %s group max tres(%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->grp_tres_ctld[tres_pos],
		       qos_ptr->usage->grp_used_tres[tres_pos],
		       tres_req_cnt[tres_pos]);
		rc = false;
		goto end_it;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/* we don't need to check grp_jobs here */

	/* we don't need to check grp_submit_jobs here */

	/* GrpTRESRunMins: TRES-minutes of all currently running jobs */
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos,
		qos_ptr->grp_tres_run_mins_ctld,
		qos_out_ptr->grp_tres_run_mins_ctld,
		job_tres_time_limit, tres_run_mins, NULL, NULL, true);
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible because the curr_usage sent in is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
		debug2("%pJ is being held, QOS %s group max running tres(%s) minutes request %"PRIu64" exceeds limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       job_tres_time_limit[tres_pos],
		       qos_ptr->grp_tres_run_mins_ctld[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
		debug2("%pJ being held, if allowed the job request will exceed QOS %s group max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->grp_tres_run_mins_ctld[tres_pos],
		       tres_run_mins[tres_pos],
		       job_tres_time_limit[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/* we don't need to check submit_jobs here */

	/* we don't need to check grp_wall here */

	/* MaxTRESMinsPerJob */
	if (!_validate_tres_limits_for_qos(&tres_pos,
					   job_tres_time_limit, 0,
					   NULL,
					   qos_ptr->max_tres_mins_pj_ctld,
					   NULL,
					   qos_out_ptr->max_tres_mins_pj_ctld,
					   job_ptr->limit_set.tres,
					   1, 1)) {
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_MINS_PER_JOB);
		debug2("%pJ being held, the job is requesting more than allowed with QOS %s's max tres(%s) minutes of %"PRIu64" with %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->max_tres_mins_pj_ctld[tres_pos],
		       job_tres_time_limit[tres_pos]);
		rc = false;
		goto end_it;
	}

	/* MaxTRESPerJob */
	if (!_validate_tres_limits_for_qos(&tres_pos,
					   tres_req_cnt, 0,
					   NULL,
					   qos_ptr->max_tres_pj_ctld,
					   NULL,
					   qos_out_ptr->max_tres_pj_ctld,
					   job_ptr->limit_set.tres,
					   1, 1)) {
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
		debug2("%pJ is being held, QOS %s min tres(%s) per job request %"PRIu64" exceeds max tres limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       tres_req_cnt[tres_pos],
		       qos_ptr->max_tres_pj_ctld[tres_pos]);
		rc = false;
		goto end_it;
	}

	/* MaxTRESPerNode */
	if (!_validate_tres_limits_for_qos(&tres_pos,
					   tres_req_cnt,
					   tres_req_cnt[TRES_ARRAY_NODE],
					   NULL,
					   qos_ptr->max_tres_pn_ctld,
					   NULL,
					   qos_out_ptr->max_tres_pn_ctld,
					   job_ptr->limit_set.tres,
					   1, 1)) {
		uint64_t req_per_node;
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_PER_NODE);
		req_per_node = tres_req_cnt[tres_pos];
		if (tres_req_cnt[TRES_ARRAY_NODE] > 1)
			req_per_node /= tres_req_cnt[TRES_ARRAY_NODE];
		debug2("%pJ is being held, QOS %s min tres(%s) per node request %"PRIu64" exceeds max tres limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       req_per_node,
		       qos_ptr->max_tres_pn_ctld[tres_pos]);
		rc = false;
		goto end_it;
	}

	/* MinTRESPerJob (note: lower bound, hence final argument 0) */
	if (!_validate_tres_limits_for_qos(&tres_pos,
					   tres_req_cnt, 0,
					   NULL,
					   qos_ptr->min_tres_pj_ctld,
					   NULL,
					   qos_out_ptr->min_tres_pj_ctld,
					   job_ptr->limit_set.tres,
					   1, 0)) {
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MIN_UNK);
		debug2("%pJ is being held, QOS %s min tres(%s) per job request %"PRIu64" exceeds min tres limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       tres_req_cnt[tres_pos],
		       qos_ptr->min_tres_pj_ctld[tres_pos]);
		rc = false;
		goto end_it;
	}

	/* MaxTRESPerAccount */
	orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
	_get_unique_job_node_cnt(job_ptr, used_limits_a->node_bitmap,
				 &tres_req_cnt[TRES_ARRAY_NODE]);
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos,
		qos_ptr->max_tres_pa_ctld, qos_out_ptr->max_tres_pa_ctld,
		tres_req_cnt, used_limits_a->tres,
		NULL, job_ptr->limit_set.tres, true);
	tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible because the curr_usage sent in is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		/*
		 * Hold the job if it exceeds the per-acct
		 * TRES limit for the given QOS
		 */
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
		debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds max tres per account (%s) limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       tres_req_cnt[tres_pos],
		       used_limits_a->acct,
		       qos_ptr->max_tres_pa_ctld[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		/*
		 * Hold the job if the user has exceeded the QOS per-user
		 * TRES limit with their current usage
		 */
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_PER_ACCT);
		debug2("%pJ being held, if allowed the job request will exceed QOS %s max tres(%s) per account (%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       used_limits_a->acct,
		       qos_ptr->max_tres_pa_ctld[tres_pos],
		       used_limits_a->tres[tres_pos],
		       tres_req_cnt[tres_pos]);
		rc = false;
		goto end_it;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/* MaxTRESPerUser */
	orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
	_get_unique_job_node_cnt(job_ptr, used_limits->node_bitmap,
				 &tres_req_cnt[TRES_ARRAY_NODE]);
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos,
		qos_ptr->max_tres_pu_ctld, qos_out_ptr->max_tres_pu_ctld,
		tres_req_cnt, used_limits->tres,
		NULL, job_ptr->limit_set.tres, true);
	tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible because the curr_usage sent in is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		/*
		 * Hold the job if it exceeds the per-user
		 * TRES limit for the given QOS
		 */
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
		debug2("%pJ is being held, QOS %s min tres(%s) request %"PRIu64" exceeds max tres per user limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       tres_req_cnt[tres_pos],
		       qos_ptr->max_tres_pu_ctld[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		/*
		 * Hold the job if the user has exceeded the QOS
		 * per-user TRES limit with their current usage
		 */
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
		debug2("%pJ being held, if allowed the job request will exceed QOS %s max tres(%s) per user limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->max_tres_pu_ctld[tres_pos],
		       used_limits->tres[tres_pos],
		       tres_req_cnt[tres_pos]);
		rc = false;
		goto end_it;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/* We do not need to check max_jobs_pa here */

	/* We do not need to check max_jobs_pu here */

	/* we don't need to check submit_jobs_pa here */

	/* we don't need to check submit_jobs_pu here */

	/* we don't need to check max_wall_pj here */

	/* MaxTRESRunMinsPerAccount */
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos, qos_ptr->max_tres_run_mins_pa_ctld,
		qos_out_ptr->max_tres_run_mins_pa_ctld, job_tres_time_limit,
		tres_run_mins_pa, NULL, NULL, true);
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible because the curr_usage sent in is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT);
		debug2("%pJ is being held, QOS %s account max running tres(%s) minutes request %"PRIu64" exceeds limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       job_tres_time_limit[tres_pos],
		       qos_ptr->max_tres_run_mins_pa_ctld[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_RUN_MINS_PER_ACCT);
		debug2("%pJ being held, if allowed the job request will exceed QOS %s account max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->max_tres_run_mins_pa_ctld[tres_pos],
		       tres_run_mins_pa[tres_pos],
		       job_tres_time_limit[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/* MaxTRESRunMinsPerUser */
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos, qos_ptr->max_tres_run_mins_pu_ctld,
		qos_out_ptr->max_tres_run_mins_pu_ctld, job_tres_time_limit,
		tres_run_mins_pu, NULL, NULL, true);
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible because the curr_usage sent in is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER);
		debug2("%pJ is being held, QOS %s user max running tres(%s) minutes request %"PRIu64" exceeds limit %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       job_tres_time_limit[tres_pos],
		       qos_ptr->max_tres_run_mins_pu_ctld[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = _get_tres_state_reason(
			tres_pos, WAIT_QOS_MAX_UNK_RUN_MINS_PER_USER);
		debug2("%pJ being held, if allowed the job request will exceed QOS %s user max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
		       job_ptr, qos_ptr->name,
		       assoc_mgr_tres_name_array[tres_pos],
		       qos_ptr->max_tres_run_mins_pu_ctld[tres_pos],
		       tres_run_mins_pu[tres_pos],
		       job_tres_time_limit[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

end_it:
	if (!rc)
		job_ptr->qos_blocking_ptr = qos_ptr;

	return rc;
}
/*
 * Check a RUNNING job's accrued usage against the given QOS's limits and
 * decide whether it must be timed out (killed).
 *
 * IN job_ptr - running job being checked
 * IN qos_ptr - QOS whose limits are enforced
 * IN/OUT qos_out_ptr - accumulator of limits already enforced by a
 *	higher-priority QOS; a limit is skipped here if already filled in
 * IN job_tres_usage_mins - TRES-minutes this job has consumed so far
 * RET true if the job may keep running; false if it exceeded a limit, in
 *	which case state_reason is FAIL_TIMEOUT and state_desc explains why
 */
static int _qos_job_time_out(job_record_t *job_ptr,
			     slurmdb_qos_rec_t *qos_ptr,
			     slurmdb_qos_rec_t *qos_out_ptr,
			     uint64_t *job_tres_usage_mins)
{
	uint64_t tres_usage_mins[slurmctld_tres_cnt];
	uint32_t wall_mins;
	int rc = true, tres_pos = 0, i;
	acct_policy_tres_usage_t tres_usage;
	time_t now = time(NULL);

	if (!qos_ptr || !qos_out_ptr)
		return rc;

	/*
	 * The idea here is for QOS to trump what an association has set for
	 * a limit, so if an association has a wall limit of 10 mins and the
	 * QOS has 20 mins set and the job has been running for 11 minutes it
	 * continues until 20.
	 */
	/* clang needs this memset to avoid a warning */
	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
	/* convert accumulated raw TRES seconds into minutes */
	for (i = 0; i < slurmctld_tres_cnt; i++)
		tres_usage_mins[i] =
			(uint64_t)(qos_ptr->usage->usage_tres_raw[i] / 60.0);
	wall_mins = qos_ptr->usage->grp_used_wall / 60;

	/* GrpTRESMins: total group TRES-minutes already consumed */
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos, qos_ptr->grp_tres_mins_ctld,
		qos_out_ptr->grp_tres_mins_ctld, job_tres_usage_mins,
		NULL, tres_usage_mins, NULL, false);
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		last_job_update = now;
		info("%pJ timed out, the job is at or exceeds QOS %s's group max tres(%s) minutes of %"PRIu64" with %"PRIu64"",
		     job_ptr, qos_ptr->name,
		     assoc_mgr_tres_name_array[tres_pos],
		     qos_ptr->grp_tres_mins_ctld[tres_pos],
		     tres_usage_mins[tres_pos]);
		job_ptr->state_reason = FAIL_TIMEOUT;
		xfree(job_ptr->state_desc);
		xstrfmtcat(job_ptr->state_desc, "Job is at or exceeds QOS %s's group max TRES(%s) minutes of %"PRIu64" with %"PRIu64,
			   qos_ptr->name, assoc_mgr_tres_name_array[tres_pos],
			   qos_ptr->grp_tres_mins_ctld[tres_pos],
			   tres_usage_mins[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		/* not possible safe_limits is 0 */
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		/* not possible safe_limits is 0 */
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

	/* GrpWall: total wall-clock minutes used by the group */
	if ((qos_out_ptr->grp_wall == INFINITE)
	    && (qos_ptr->grp_wall != INFINITE)) {

		qos_out_ptr->grp_wall = qos_ptr->grp_wall;

		if (wall_mins >= qos_ptr->grp_wall) {
			last_job_update = now;
			info("%pJ timed out, the job is at or exceeds QOS %s's group wall limit of %u with %u",
			     job_ptr, qos_ptr->name,
			     qos_ptr->grp_wall, wall_mins);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
			xstrfmtcat(job_ptr->state_desc, "Job is at or exceeds QOS %s's group wall limit of %u with %u",
				   qos_ptr->name, qos_ptr->grp_wall, wall_mins);
			rc = false;
			goto end_it;
		}
	}

	/* MaxTRESMinsPerJob: TRES-minutes consumed by this job alone */
	tres_usage = _validate_tres_usage_limits_for_qos(
		&tres_pos, qos_ptr->max_tres_mins_pj_ctld,
		qos_out_ptr->max_tres_mins_pj_ctld, job_tres_usage_mins,
		NULL, NULL, NULL, true);
	switch (tres_usage) {
	case TRES_USAGE_CUR_EXCEEDS_LIMIT:
		/* not possible curr_usage is NULL */
		break;
	case TRES_USAGE_REQ_EXCEEDS_LIMIT:
		last_job_update = now;
		info("%pJ timed out, the job is at or exceeds QOS %s's max tres(%s) minutes of %"PRIu64" with %"PRIu64,
		     job_ptr, qos_ptr->name,
		     assoc_mgr_tres_name_array[tres_pos],
		     qos_ptr->max_tres_mins_pj_ctld[tres_pos],
		     job_tres_usage_mins[tres_pos]);
		job_ptr->state_reason = FAIL_TIMEOUT;
		xfree(job_ptr->state_desc);
		xstrfmtcat(job_ptr->state_desc, "Job is at or exceeds QOS %s's max TRES(%s) minutes of %"PRIu64" with %"PRIu64,
			   qos_ptr->name, assoc_mgr_tres_name_array[tres_pos],
			   qos_ptr->max_tres_mins_pj_ctld[tres_pos],
			   job_tres_usage_mins[tres_pos]);
		rc = false;
		goto end_it;
		break;
	case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
		/* not possible tres_usage is NULL */
	case TRES_USAGE_OKAY:
		/* all good */
		break;
	}

end_it:
	return rc;
}
/*
 * acct_policy_add_job_submit - Note that a job has been submitted for
 *	accounting policy purposes.
 * IN job_ptr - job just submitted
 * IN assoc_locked - true if the caller already holds the assoc_mgr locks
 */
extern void acct_policy_add_job_submit(job_record_t *job_ptr, bool assoc_locked)
{
	_adjust_limit_usage(ACCT_POLICY_ADD_SUBMIT, job_ptr, assoc_locked);
}
/*
 * acct_policy_remove_job_submit - Note that a job has finished (might
 *	not had started or been allocated resources) for accounting
 *	policy purposes.
 * IN job_ptr - job being removed from the submit counts
 * IN assoc_locked - true if the caller already holds the assoc_mgr locks
 */
extern void acct_policy_remove_job_submit(job_record_t *job_ptr,
					  bool assoc_locked)
{
	_adjust_limit_usage(ACCT_POLICY_REM_SUBMIT, job_ptr, assoc_locked);
}
/*
 * acct_policy_job_begin - Note that a job is starting for accounting
 *	policy purposes.
 * IN job_ptr - job that is starting to run
 * IN assoc_locked - true if the caller already holds the assoc_mgr locks
 */
extern void acct_policy_job_begin(job_record_t *job_ptr, bool assoc_locked)
{
	_adjust_limit_usage(ACCT_POLICY_JOB_BEGIN, job_ptr, assoc_locked);
}
/*
 * acct_policy_job_fini - Note that a job is completing for accounting
 *	policy purposes.
 * IN job_ptr - job that is finishing
 * IN assoc_locked - true if the caller already holds the assoc_mgr locks
 */
extern void acct_policy_job_fini(job_record_t *job_ptr, bool assoc_locked)
{
	/* if end_time_exp == NO_VAL this has already happened */
	if (job_ptr->end_time_exp == (time_t)NO_VAL) {
		debug2("We have already ran the job_fini for %pJ", job_ptr);
		return;
	}

	_adjust_limit_usage(ACCT_POLICY_JOB_FINI, job_ptr, assoc_locked);
}
/*
 * Adjust the "running TRES-seconds" bookkeeping when a running job's time
 * limit is altered: remove the usage based on the old time limit and add
 * usage based on the new one, for both QOS records and the whole
 * association hierarchy.
 *
 * IN job_ptr - running job whose time limit is changing
 * IN new_time_limit - new time limit in minutes
 */
extern void acct_policy_alter_job(job_record_t *job_ptr,
				  uint32_t new_time_limit)
{
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	slurmdb_assoc_rec_t *assoc_ptr = NULL;
	assoc_mgr_lock_t locks =
		{ .assoc = WRITE_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
	uint64_t used_tres_run_secs[slurmctld_tres_cnt];
	uint64_t new_used_tres_run_secs[slurmctld_tres_cnt];
	uint64_t time_limit_secs, new_time_limit_secs;
	int i;

	/* nothing to do unless a running job's limit actually changes */
	if (!IS_JOB_RUNNING(job_ptr) || (job_ptr->time_limit == new_time_limit))
		return;

	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
	    || !_valid_job_assoc(job_ptr))
		return;

	time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
	new_time_limit_secs = (uint64_t)new_time_limit * 60;

	/* take into account usage factor */
	if (job_ptr->qos_ptr && (job_ptr->qos_ptr->usage_factor >= 0)) {
		time_limit_secs *= job_ptr->qos_ptr->usage_factor;
		new_time_limit_secs *= job_ptr->qos_ptr->usage_factor;
	}

	/* clang needs these memset to avoid a warning */
	memset(used_tres_run_secs, 0, sizeof(used_tres_run_secs));
	memset(new_used_tres_run_secs, 0, sizeof(new_used_tres_run_secs));
	for (i=0; i<slurmctld_tres_cnt; i++) {
		/* energy usage is not time-based; skip it */
		if (i == TRES_ARRAY_ENERGY)
			continue;
		/* skip TRES marked as not consumed by this job */
		if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
			continue;

		used_tres_run_secs[i] =
			job_ptr->tres_alloc_cnt[i] * time_limit_secs;
		new_used_tres_run_secs[i] =
			job_ptr->tres_alloc_cnt[i] * new_time_limit_secs;
	}

	assoc_mgr_lock(&locks);

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	_qos_alter_job(job_ptr, qos_ptr_1,
		       used_tres_run_secs, new_used_tres_run_secs);
	_qos_alter_job(job_ptr, qos_ptr_2,
		       used_tres_run_secs, new_used_tres_run_secs);

	/* walk up the association hierarchy adjusting each level */
	assoc_ptr = job_ptr->assoc_ptr;
	while (assoc_ptr) {
		for (i=0; i<slurmctld_tres_cnt; i++) {
			if (used_tres_run_secs[i] == new_used_tres_run_secs[i])
				continue;
			/*
			 * Handle the case when remaining usage is less than
			 * the original job request.
			 */
			int64_t used_tres_run_sec_decr =
				used_tres_run_secs[i] -
				new_used_tres_run_secs[i];

			/* guard against underflowing the unsigned counter */
			if ((used_tres_run_sec_decr < 0) ||
			    (used_tres_run_sec_decr <
			     assoc_ptr->usage->grp_used_tres_run_secs[i]))
				assoc_ptr->usage->grp_used_tres_run_secs[i] -=
					used_tres_run_sec_decr;
			else
				assoc_ptr->usage->grp_used_tres_run_secs[i] = 0;
			debug2("altering %pJ assoc %u(%s/%s/%s) got %"PRIu64" just removed %"PRIu64" and added %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_ptr->usage->grp_used_tres_run_secs[i],
			       used_tres_run_secs[i],
			       new_used_tres_run_secs[i]);
		}

		/* now handle all the group limits of the parents */
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
	}

	assoc_mgr_unlock(&locks);
}
/*
 * Record the priority threshold from in_thresh, keeping the first value
 * that was set.  An INFINITE in_thresh means "no limit configured" and is
 * ignored.
 */
static void _get_prio_thresh(uint32_t *prio_thresh, uint32_t in_thresh)
{
	if (!(*prio_thresh) && (in_thresh != INFINITE))
		*prio_thresh = in_thresh;
}
/*
 * Track the most restrictive max-jobs-accrue limit seen so far and compute
 * how many more jobs may start accruing age priority under it.
 *
 * IN/OUT max_jobs_accrue - most restrictive accrue limit found so far
 * OUT create_cnt - remaining headroom under that limit
 * IN in_accrue - candidate limit (INFINITE means unset)
 * IN in_used - jobs already accruing against the candidate limit
 */
static void _get_accrue_create_cnt(uint32_t *max_jobs_accrue, int *create_cnt,
				   uint32_t in_accrue, uint32_t in_used)
{
	/* An unset incoming limit imposes nothing. */
	if (in_accrue == INFINITE)
		return;

	/* Keep an already-recorded limit when it is at least as strict. */
	if ((*max_jobs_accrue != INFINITE) && (*max_jobs_accrue <= in_accrue))
		return;

	*max_jobs_accrue = in_accrue;
	*create_cnt = (*max_jobs_accrue > in_used) ?
		(*max_jobs_accrue - in_used) : 0;
}
/*
 * Add accrue_cnt usage (acct_policy_accrue->cnt) to the QOS, its per-acct
 * and per-user usage records, and every association up the hierarchy.
 * Signature matches list_for_each() callbacks: x is the QOS, arg the
 * acct_policy_accrue_t context.
 */
static void _add_accrue_time_internal(void *x, void *arg)
{
	slurmdb_qos_rec_t *qos_ptr = x;
	acct_policy_accrue_t *accrue = arg;
	slurmdb_assoc_rec_t *assoc_ptr;

	log_flag(ACCRUE, "%s: Adding %d to assoc_ptr %p (%p %p %p)",
		 __func__, accrue->cnt, accrue->assoc_ptr, qos_ptr,
		 accrue->used_limits_acct,
		 accrue->used_limits_user);

	if (qos_ptr)
		qos_ptr->usage->accrue_cnt += accrue->cnt;

	if (accrue->used_limits_acct)
		accrue->used_limits_acct->accrue_cnt += accrue->cnt;

	if (accrue->used_limits_user)
		accrue->used_limits_user->accrue_cnt += accrue->cnt;

	/* apply the count at every level of the association hierarchy */
	for (assoc_ptr = accrue->assoc_ptr; assoc_ptr;
	     assoc_ptr = assoc_ptr->usage->parent_assoc_ptr) {
		/* logged count is the value before this addition */
		log_flag(ACCRUE, "assoc_id %u(%s/%s/%s/%p) added %d count %d",
			 assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user,
			 assoc_ptr->partition, assoc_ptr->usage,
			 accrue->cnt,
			 assoc_ptr->usage->accrue_cnt);

		assoc_ptr->usage->accrue_cnt += accrue->cnt;
	}
}
/*
 * Subtract accrue_cnt usage (acct_policy_accrue->cnt) from the QOS, its
 * per-acct and per-user usage records, and every association up the
 * hierarchy.  Each counter is clamped at zero with an error logged on
 * underflow rather than wrapping.  Mirror of _add_accrue_time_internal().
 */
static void _remove_accrue_time_internal(void *x, void *arg)
{
	slurmdb_qos_rec_t *qos_ptr = x;
	acct_policy_accrue_t *acct_policy_accrue = arg;
	slurmdb_assoc_rec_t *assoc_ptr = acct_policy_accrue->assoc_ptr;

	log_flag(ACCRUE, "%s: Removing %d from assoc_ptr %p (%p %p %p)",
		 __func__, acct_policy_accrue->cnt, assoc_ptr, qos_ptr,
		 acct_policy_accrue->used_limits_acct,
		 acct_policy_accrue->used_limits_user);

	/* QOS-level counter, clamped at zero */
	if (qos_ptr) {
		if (qos_ptr->usage->accrue_cnt >= acct_policy_accrue->cnt)
			qos_ptr->usage->accrue_cnt -= acct_policy_accrue->cnt;
		else {
			error("%s: QOS %s accrue_cnt underflow",
			      __func__, qos_ptr->name);
			qos_ptr->usage->accrue_cnt = 0;
		}
	}

	/* per-account counter under this QOS, clamped at zero */
	if (acct_policy_accrue->used_limits_acct) {
		if (acct_policy_accrue->used_limits_acct->accrue_cnt >=
		    acct_policy_accrue->cnt)
			acct_policy_accrue->used_limits_acct->accrue_cnt -=
				acct_policy_accrue->cnt;
		else {
			if (qos_ptr) {
				error("%s: QOS %s acct %s accrue_cnt underflow",
				      __func__, qos_ptr->name,
				      acct_policy_accrue->used_limits_acct->
				      acct);
			}
			acct_policy_accrue->used_limits_acct->accrue_cnt = 0;
		}
	}

	/* per-user counter under this QOS, clamped at zero */
	if (acct_policy_accrue->used_limits_user) {
		if (acct_policy_accrue->used_limits_user->accrue_cnt >=
		    acct_policy_accrue->cnt)
			acct_policy_accrue->used_limits_user->accrue_cnt -=
				acct_policy_accrue->cnt;
		else {
			if (qos_ptr) {
				error("%s: QOS %s user %u accrue_cnt underflow",
				      __func__, qos_ptr->name,
				      acct_policy_accrue->used_limits_user->
				      uid);
			}
			acct_policy_accrue->used_limits_user->accrue_cnt = 0;
		}
	}

	/* remove the count at every level of the association hierarchy */
	while (assoc_ptr) {
		if (assoc_ptr->usage->accrue_cnt >= acct_policy_accrue->cnt) {
			log_flag(ACCRUE, "assoc_id %u(%s/%s/%s/%p) removed %d count %d",
				 assoc_ptr->id, assoc_ptr->acct,
				 assoc_ptr->user, assoc_ptr->partition,
				 assoc_ptr->usage, acct_policy_accrue->cnt,
				 assoc_ptr->usage->accrue_cnt);
			assoc_ptr->usage->accrue_cnt -= acct_policy_accrue->cnt;
		} else {
			error("%s: assoc_id %u(%s/%s/%s) accrue_cnt underflow",
			      __func__, assoc_ptr->id,
			      assoc_ptr->acct,
			      assoc_ptr->user,
			      assoc_ptr->partition);
			assoc_ptr->usage->accrue_cnt = 0;
		}
		/* now go up the hierarchy */
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
	}
}
/*
 * Lazily resolve the per-account and per-user used-limits records for the
 * given QOS into the accrue context.  The lookup happens at most once per
 * context (limits_filled guards repeats); without a QOS both pointers are
 * set to NULL.
 */
static void _fill_in_qos_used_limits(slurmdb_qos_rec_t *qos_ptr,
				     acct_policy_accrue_t *acct_policy_accrue)
{
	acct_policy_accrue_t *accrue = acct_policy_accrue;

	if (accrue->limits_filled)
		return;
	accrue->limits_filled = true;

	if (!qos_ptr) {
		accrue->used_limits_acct = NULL;
		accrue->used_limits_user = NULL;
		return;
	}

	xassert(accrue->acct);

	accrue->used_limits_acct = acct_policy_get_acct_used_limits(
		&qos_ptr->usage->acct_limit_list, accrue->acct);
	accrue->used_limits_user = acct_policy_get_user_used_limits(
		&qos_ptr->usage->user_limit_list, accrue->uid);
}
/*
 * list_for_each() callback: remove accrue time for one QOS in the list,
 * resolving its used-limits records first.  Always returns 0 so iteration
 * continues over every QOS.
 */
static int _for_each_qos_remove_accrue_time(void *x, void *arg)
{
	acct_policy_accrue_t *accrue = arg;
	slurmdb_qos_rec_t *qos_ptr = x;

	_fill_in_qos_used_limits(qos_ptr, accrue);
	_remove_accrue_time_internal(qos_ptr, accrue);

	/* Only do assoc_ptr stuff once */
	accrue->assoc_ptr = NULL;

	return 0;
}
/*
 * _acct_policy_validate - validate a job submit/update request against one
 *	partition's limits: first the (up to two) QOS records, then every
 *	association walking up the tree from assoc_in to the root.
 *
 * job_desc IN - job request being validated
 * part_ptr IN - partition being validated against
 * assoc_in IN - association the job is submitted under (must be non-NULL)
 * qos_ptr_1/qos_ptr_2 IN - QOS records in enforcement order (either or
 *	both may be NULL); limits a QOS sets are recorded in a local
 *	qos_rec so the association checks below skip limits a QOS overrode
 * reason OUT - if non-NULL, set to the wait reason on rejection
 * acct_policy_limit_set IN/OUT - records which limits were imposed on the job
 * update_call IN - true if validating an update of an existing job
 * locked IN - true if caller already holds the assoc_mgr locks
 * RET true if the request is acceptable
 */
static bool _acct_policy_validate(job_desc_msg_t *job_desc,
				  part_record_t *part_ptr,
				  slurmdb_assoc_rec_t *assoc_in,
				  slurmdb_qos_rec_t *qos_ptr_1,
				  slurmdb_qos_rec_t *qos_ptr_2,
				  uint32_t *reason,
				  acct_policy_limit_set_t *
				  acct_policy_limit_set,
				  bool update_call, bool locked)
{
	slurmdb_qos_rec_t qos_rec;	/* limits already claimed by a QOS */
	slurmdb_assoc_rec_t *assoc_ptr = assoc_in;
	int parent = 0, job_cnt = 1;
	char *user_name = NULL;
	bool rc = true;
	assoc_mgr_lock_t locks =
		{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
	bool strict_checking;
	double limit_factor = -1.0;	/* -1 means no QOS LimitFactor */
	uint64_t grp_tres_ctld[slurmctld_tres_cnt];
	uint64_t max_tres_ctld[slurmctld_tres_cnt];

	xassert(acct_policy_limit_set);

	if (!assoc_ptr) {
		error("acct_policy_validate: no assoc_ptr given for job.");
		return false;
	}
	user_name = assoc_ptr->user;

	/* A job array counts one submission per array task. */
	if (job_desc->array_bitmap)
		job_cnt = bit_set_count(job_desc->array_bitmap);

	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);

	if (!locked)
		assoc_mgr_lock(&locks);
	xassert(verify_assoc_lock(ASSOC_LOCK, READ_LOCK));
	xassert(verify_assoc_lock(QOS_LOCK, WRITE_LOCK));
	xassert(verify_assoc_lock(TRES_LOCK, READ_LOCK));

	assoc_mgr_set_qos_tres_cnt(&qos_rec);

	if (qos_ptr_1) {
		/* DenyOnLimit on either QOS forces strict checking. */
		strict_checking = (qos_ptr_1->flags & QOS_FLAG_DENY_LIMIT);
		if (qos_ptr_2 && !strict_checking)
			strict_checking =
				qos_ptr_2->flags & QOS_FLAG_DENY_LIMIT;

		if (!(rc = _qos_policy_validate(
			      job_desc, assoc_ptr, part_ptr,
			      qos_ptr_1, &qos_rec,
			      reason, acct_policy_limit_set, update_call,
			      user_name, job_cnt, strict_checking)))
			goto end_it;
		if (!(rc = _qos_policy_validate(
			      job_desc, assoc_ptr,
			      part_ptr, qos_ptr_2, &qos_rec,
			      reason, acct_policy_limit_set, update_call,
			      user_name, job_cnt, strict_checking)))
			goto end_it;

	} else /*
		* We don't have a QOS to determine if we should fail or not, so
		* we will go with strict_checking by default.
		*/
		strict_checking = true;

	/* First QOS with a LimitFactor set wins. */
	if (qos_ptr_1 && !fuzzy_equal(qos_ptr_1->limit_factor, INFINITE))
		limit_factor = qos_ptr_1->limit_factor;
	else if (qos_ptr_2 && !fuzzy_equal(qos_ptr_2->limit_factor, INFINITE))
		limit_factor = qos_ptr_2->limit_factor;

	/*
	 * Walk the association tree upward. Note: `break` and `goto end_it`
	 * are equivalent here since end_it immediately follows the loop.
	 */
	while (assoc_ptr) {
		int tres_pos = 0;

		for (int i = 0; i < slurmctld_tres_cnt; i++) {
			grp_tres_ctld[i] = assoc_ptr->grp_tres_ctld[i];
			max_tres_ctld[i] = assoc_ptr->max_tres_ctld[i];
			_apply_limit_factor(&grp_tres_ctld[i], limit_factor);
			_apply_limit_factor(&max_tres_ctld[i], limit_factor);
		}

		if (!_validate_tres_limits_for_assoc(
			    &tres_pos, job_desc->tres_req_cnt, 0,
			    grp_tres_ctld,
			    qos_rec.grp_tres_ctld,
			    acct_policy_limit_set->tres,
			    strict_checking, update_call, 1)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_ASSOC_GRP_UNK);

			debug2("job submit for user %s(%u): "
			       "min tres(%s) request %"PRIu64" exceeds "
			       "group max tres limit %"PRIu64" for account %s",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_desc->tres_req_cnt[tres_pos],
			       grp_tres_ctld[tres_pos],
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		/* for validation we don't need to look at
		 * assoc_ptr->grp_jobs.
		 */

		/* Only enforced if no QOS already set grp_submit_jobs. */
		if ((qos_rec.grp_submit_jobs == INFINITE) &&
		    (assoc_ptr->grp_submit_jobs != INFINITE) &&
		    ((assoc_ptr->usage->used_submit_jobs + job_cnt)
		     > assoc_ptr->grp_submit_jobs)) {
			if (reason)
				*reason = WAIT_ASSOC_GRP_SUB_JOB;
			debug2("job submit for user %s(%u): group max submit job limit exceeded %u (used:%u + requested:%d) for account '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_ptr->grp_submit_jobs,
			       assoc_ptr->usage->used_submit_jobs, job_cnt,
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		tres_pos = 0;
		if (!update_call && !_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    assoc_ptr->grp_tres_mins_ctld,
			    qos_rec.grp_tres_mins_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_ASSOC_GRP_UNK_MIN);
			debug2("job submit for user %s(%u): "
			       "tres(%s) time limit request %"PRIu64" "
			       "exceeds group max limit %"PRIu64" "
			       "for account '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
			       assoc_ptr->acct);
			rc = false;
			goto end_it;
		}

		tres_pos = 0;
		if (!update_call && !_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    assoc_ptr->grp_tres_run_mins_ctld,
			    qos_rec.grp_tres_run_mins_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_ASSOC_GRP_UNK_RUN_MIN);
			debug2("job submit for user %s(%u): "
			       "tres(%s) time limit request %"PRIu64" "
			       "exceeds group max running "
			       "limit %"PRIu64" for account '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       assoc_ptr->grp_tres_run_mins_ctld[tres_pos],
			       assoc_ptr->acct);
			rc = false;
			goto end_it;
		}

		if (!update_call && !_validate_time_limit(
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    1,
			    assoc_ptr->grp_wall,
			    &qos_rec.grp_wall,
			    &acct_policy_limit_set->time,
			    strict_checking, false)) {
			if (reason)
				*reason = WAIT_ASSOC_GRP_WALL;
			debug2("job submit for user %s(%u): "
			       "time limit %u exceeds max group %u for "
			       "account '%s'",
			       user_name,
			       job_desc->user_id,
			       job_desc->time_limit,
			       assoc_ptr->grp_wall,
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		/* We don't need to look at the regular limits for
		 * parents since we have pre-propagated them, so just
		 * continue with the next parent
		 */
		if (parent) {
			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
			continue;
		}

		/* for validation we don't need to look at
		 * assoc_ptr->max_cpu_mins_pj.
		 */

		tres_pos = 0;
		if (!_validate_tres_limits_for_assoc(
			    &tres_pos, job_desc->tres_req_cnt, 0,
			    max_tres_ctld,
			    qos_rec.max_tres_pj_ctld,
			    acct_policy_limit_set->tres,
			    strict_checking, update_call, 1)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);

			debug2("job submit for user %s(%u): "
			       "min tres(%s) request %"PRIu64" exceeds "
			       "max tres limit %"PRIu64" for account %s",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_desc->tres_req_cnt[tres_pos],
			       max_tres_ctld[tres_pos],
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		tres_pos = 0;
		if (!_validate_tres_limits_for_assoc(
			    &tres_pos, job_desc->tres_req_cnt,
			    job_desc->tres_req_cnt[TRES_ARRAY_NODE],
			    assoc_ptr->max_tres_pn_ctld,
			    qos_rec.max_tres_pn_ctld,
			    acct_policy_limit_set->tres,
			    strict_checking, update_call, 1)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_ASSOC_MAX_UNK_PER_NODE);

			debug2("job submit for user %s(%u): "
			       "min tres(%s) request %"PRIu64" exceeds "
			       "max tres limit %"PRIu64" per node "
			       "for account %s",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_desc->tres_req_cnt[tres_pos] /
			       job_desc->tres_req_cnt[TRES_ARRAY_NODE],
			       assoc_ptr->max_tres_pn_ctld[tres_pos],
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		/* for validation we don't need to look at
		 * assoc_ptr->max_jobs.
		 */

		if ((qos_rec.max_submit_jobs_pa == INFINITE) &&
		    (qos_rec.max_submit_jobs_pu == INFINITE) &&
		    (assoc_ptr->max_submit_jobs != INFINITE) &&
		    ((assoc_ptr->usage->used_submit_jobs + job_cnt)
		     > assoc_ptr->max_submit_jobs)) {
			if (reason)
				*reason = WAIT_ASSOC_MAX_SUB_JOB;
			debug2("job submit for user %s(%u): account max submit job limit exceeded %u (used:%u + requested:%d) for account '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_ptr->max_submit_jobs,
			       assoc_ptr->usage->used_submit_jobs, job_cnt,
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		/*
		 * NOTE(review): unlike the similar checks above, tres_pos is
		 * not reset to 0 before this call — confirm whether that is
		 * intentional.
		 */
		if (!update_call && !_validate_tres_time_limits(
			    &tres_pos,
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    job_desc->tres_req_cnt,
			    assoc_ptr->max_tres_mins_ctld,
			    qos_rec.max_tres_mins_pj_ctld,
			    &acct_policy_limit_set->time,
			    strict_checking)) {
			if (reason)
				*reason = _get_tres_state_reason(
					tres_pos,
					WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
			debug2("job submit for user %s(%u): "
			       "tres(%s) time limit request %"PRIu64" "
			       "exceeds max per-job limit %"PRIu64" "
			       "for account '%s'",
			       user_name,
			       job_desc->user_id,
			       assoc_mgr_tres_name_array[tres_pos],
			       ((uint64_t)job_desc->time_limit *
				job_desc->tres_req_cnt[tres_pos]),
			       assoc_ptr->max_tres_mins_ctld[tres_pos],
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		if (!update_call && !_validate_time_limit(
			    &job_desc->time_limit,
			    part_ptr->max_time,
			    1,
			    assoc_ptr->max_wall_pj,
			    &qos_rec.max_wall_pj,
			    &acct_policy_limit_set->time,
			    strict_checking, false)) {
			if (reason)
				*reason = WAIT_ASSOC_MAX_WALL_PER_JOB;
			debug2("job submit for user %s(%u): "
			       "time limit %u exceeds max %u for "
			       "account '%s'",
			       user_name,
			       job_desc->user_id,
			       job_desc->time_limit,
			       assoc_ptr->max_wall_pj,
			       assoc_ptr->acct);
			rc = false;
			break;
		}

		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
		parent = 1;
	}
end_it:
	if (!locked)
		assoc_mgr_unlock(&locks);
	slurmdb_free_qos_rec_members(&qos_rec);

	return rc;
}
/*
 * list_for_each() callback: validate the job request against a single
 * partition. Returns SLURM_ERROR (negative) to stop the iteration when
 * validation fails, a positive value to keep iterating.
 */
static int _list_acct_policy_validate(void *x, void *arg)
{
	part_record_t *part_ptr = x;
	acct_policy_validate_args_t *args = arg;
	slurmdb_qos_rec_t *qos_first = NULL, *qos_second = NULL;
	job_record_t tmp_job;

	/*
	 * Build a minimal job record carrying only the fields needed to
	 * determine QOS ordering for this partition.
	 */
	tmp_job.qos_ptr = args->job_qos_ptr;
	tmp_job.part_ptr = part_ptr;
	acct_policy_set_qos_order(&tmp_job, &qos_first, &qos_second);

	if (!_acct_policy_validate(args->job_desc, part_ptr, args->assoc_in,
				   qos_first, qos_second, args->reason,
				   args->acct_policy_limit_set,
				   args->update_call, true))
		return SLURM_ERROR; /* Break out of list_for_each. */

	return 1;
}
/*
* acct_policy_validate - validate that a job request can be satisfied without
* exceeding any association or QOS limit.
* job_desc IN - job descriptor being submitted
* part_ptr IN - first partition to which the job is being submitted
* part_ptr_list IN - list of partitions to which the job is being submitted
* (can be NULL)
* assoc_in IN - pointer to association to which the job is being submitted
* qos_ptr IN - pointer to QOS to which the job is being submitted
 * reason OUT - if non-NULL, set to the reason for rejecting the job
* acct_policy_limit_set IN/OUT - limits set for the job, pre-allocated storage
* is filled in by acct_policy_validate
* update_call IN - true if request to update existing job request
* RET true if valid
*/
extern bool acct_policy_validate(job_desc_msg_t *job_desc,
				 part_record_t *part_ptr,
				 list_t *part_ptr_list,
				 slurmdb_assoc_rec_t *assoc_in,
				 slurmdb_qos_rec_t *qos_ptr,
				 uint32_t *reason,
				 acct_policy_limit_set_t *acct_policy_limit_set,
				 bool update_call)
{
	bool valid = true;
	assoc_mgr_lock_t locks =
		{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
	acct_policy_validate_args_t args = {
		.acct_policy_limit_set = acct_policy_limit_set,
		.assoc_in = assoc_in, .job_desc = job_desc,
		.job_qos_ptr = qos_ptr, .reason = reason,
		.update_call = update_call };

	/* Take the assoc_mgr locks once and validate every partition. */
	assoc_mgr_lock(&locks);
	if (part_ptr_list) {
		/* A negative callback return aborts the iteration. */
		if (list_for_each(part_ptr_list, _list_acct_policy_validate,
				  &args) < 0)
			valid = false;
	} else {
		/* Single-partition request: call the helper directly. */
		if (_list_acct_policy_validate(part_ptr, &args) == SLURM_ERROR)
			valid = false;
	}
	assoc_mgr_unlock(&locks);

	return valid;
}
/*
* acct_policy_validate_het_job - validate that a hetjob as a whole (all
* components at once) can be satisfied without exceeding any association
* limit. Build a list of every job's association and QOS information then combine
* usage information for every job sharing an association and test that against
* the appropriate limit.
*
* NOTE: This test is imperfect. Each job actually has up to 3 sets of limits
* to test (association, job QOS and partition QOS). Ideally each would be tested
* independently, but that is complicated due to QOS limits overriding the
* association limits and the ability to have 3 sets of limits for each job.
* This only tests the association limit for each hetjob component based
* upon that component's job and partition QOS.
*
* NOTE: That a hetjob passes this test does not mean that it will be able
* to run. For example, this test assumes resource allocation at the CPU level.
* If each task is allocated one core, with 2 CPUs, then the CPU limit test
* would not be accurate.
*
* submit_job_list IN - list of job_record_t entries (already created)
* RET true if valid
*/
extern bool acct_policy_validate_het_job(list_t *submit_job_list)
{
	assoc_mgr_lock_t locks =
		{ .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK };
	list_t *het_job_limit_list = NULL;
	list_itr_t *iter1, *iter2;
	job_record_t *job_ptr1, *job_ptr2;
	het_job_limits_t *job_limit1, *job_limit2;
	bool rc = true;
	/*
	 * Zero-initialize so every field not explicitly filled in below is
	 * NULL/0 rather than indeterminate. Previously, if submit_job_list
	 * was empty, build_job_desc never flipped and the
	 * xfree(job_desc.tres_req_cnt) below freed an uninitialized pointer
	 * (undefined behavior); unset fields were also handed to
	 * acct_policy_validate() as garbage.
	 */
	job_desc_msg_t job_desc = { 0 };
	bool build_job_desc = true;	/* fill job_desc on first component */
	acct_policy_limit_set_t acct_policy_limit_set;
	int i, job_cnt;
	uint32_t reason = 0;
	int tres_req_size = sizeof(uint64_t) * g_tres_count;

	memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set_t));
	acct_policy_limit_set.tres =
		xmalloc(sizeof(uint16_t) * slurmctld_tres_cnt);

	/* Build list of QOS, association, and job pointers */
	het_job_limit_list = list_create(xfree_ptr);
	iter1 = list_iterator_create(submit_job_list);
	assoc_mgr_lock(&locks);
	while ((job_ptr1 = list_next(iter1))) {
		job_limit1 = xmalloc(sizeof(het_job_limits_t));
		job_limit1->assoc_ptr = job_ptr1->assoc_ptr;
		job_limit1->job_ptr = job_ptr1;
		list_append(het_job_limit_list, job_limit1);
	}
	assoc_mgr_unlock(&locks);
	list_iterator_destroy(iter1);

	iter1 = list_iterator_create(het_job_limit_list);
	while ((job_limit1 = list_next(iter1))) {
		job_ptr1 = job_limit1->job_ptr;
		if (build_job_desc) {
			build_job_desc = false;
			job_desc.time_limit = job_ptr1->time_limit;
			job_desc.tres_req_cnt = xmalloc(tres_req_size);
			job_desc.user_id = job_ptr1->user_id;
		}
		if (job_limit1->assoc_ptr) {
			/*
			 * Combine the TRES requests of every component that
			 * shares this component's association, then test the
			 * aggregate against that association's limits.
			 */
			job_cnt = 1;
			memcpy(job_desc.tres_req_cnt, job_ptr1->tres_req_cnt,
			       tres_req_size);
			iter2 = list_iterator_create(het_job_limit_list);
			while ((job_limit2 = list_next(iter2))) {
				if ((job_limit2 == job_limit1) ||
				    (job_limit2->assoc_ptr !=
				     job_limit1->assoc_ptr))
					continue;
				job_ptr2 = job_limit2->job_ptr;
				for (i = 0 ; i < g_tres_count; i++) {
					job_desc.tres_req_cnt[i] +=
						job_ptr2->tres_req_cnt[i];
				}
				job_cnt++;
			}
			list_iterator_destroy(iter2);
			if (job_cnt > 1) {
				job_desc.array_bitmap = bit_alloc(job_cnt);
				/*
				 * SET NO BITS. Make this look like zero jobs
				 * are being added. The job count was already
				 * validated when each individual component of
				 * the heterogeneous job was created.
				 */
				rc = acct_policy_validate(&job_desc,
						job_ptr1->part_ptr,
						job_ptr1->part_ptr_list,
						job_limit1->assoc_ptr,
						job_ptr1->qos_ptr,
						&reason,
						&acct_policy_limit_set,
						false);
				FREE_NULL_BITMAP(job_desc.array_bitmap);
				if (!rc)
					break;
			}
		}
	}
	list_iterator_destroy(iter1);

	xfree(job_desc.tres_req_cnt);
	FREE_NULL_LIST(het_job_limit_list);
	xfree(acct_policy_limit_set.tres);

	return rc;
}
/*
* acct_policy_job_runnable_pre_select - Determine if the specified
* job can execute right now or not depending upon accounting
* policy (e.g. running job limit for this association). If the
* association limits prevent the job from ever running (lowered
* limits since job submission), then cancel the job.
*/
extern bool acct_policy_job_runnable_pre_select(job_record_t *job_ptr,
						bool assoc_mgr_locked)
{
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	slurmdb_qos_rec_t qos_rec;	/* limits already claimed by a QOS */
	slurmdb_assoc_rec_t *assoc_ptr;
	uint32_t time_limit = NO_VAL;	/* computed lazily when first needed */
	bool rc = true;
	uint32_t wall_mins;
	bool safe_limits = false;
	int parent = 0; /* flag to tell us if we are looking at the
			 * parent or not
			 */
	assoc_mgr_lock_t locks =
		{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };

	/* check to see if we are enforcing associations */
	if (!accounting_enforce)
		return true;

	if (!_valid_job_assoc(job_ptr)) {
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = FAIL_ACCOUNT;
		return false;
	}

	/* now see if we are enforcing limits */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return true;

	/* clear old state reason */
	if (job_state_reason_check(job_ptr->state_reason, JSR_QOS_ASSOC)) {
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = WAIT_NO_REASON;
	}

	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	assoc_mgr_set_qos_tres_cnt(&qos_rec);

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	/* check the first QOS setting it's values in the qos_rec */
	if (qos_ptr_1 &&
	    !(rc = _qos_job_runnable_pre_select(job_ptr, qos_ptr_1, &qos_rec)))
		goto end_it;

	/* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
	if (qos_ptr_2 &&
	    !(rc = _qos_job_runnable_pre_select(job_ptr, qos_ptr_2, &qos_rec)))
		goto end_it;

	/*
	 * check to see if we should be using safe limits, if so we
	 * will only start a job if there are sufficient remaining
	 * cpu-minutes for it to run to completion
	 */
	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
		safe_limits = true;

	/* Walk the association tree upward, checking each level's limits. */
	assoc_ptr = job_ptr->assoc_ptr;
	while (assoc_ptr) {
		/* This only trips when the grp_used_wall is divisible
		 * by 60, i.e if a limit is 1 min and you have only
		 * accumulated 59 seconds you will still be able to
		 * get another job in as 59/60 = 0 int wise.
		 */
		wall_mins = assoc_ptr->usage->grp_used_wall / 60;

#if _DEBUG
		info("acct_job_limits: %u of %u",
		     assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs);
#endif
		/* we don't need to check grp_cpu_mins here */

		/* we don't need to check grp_cpus here */

		/* we don't need to check grp_mem here */

		/* GrpJobs: skipped if a QOS already set this limit. */
		if ((qos_rec.grp_jobs == INFINITE) &&
		    (assoc_ptr->grp_jobs != INFINITE) &&
		    (assoc_ptr->usage->used_jobs >= assoc_ptr->grp_jobs)) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_ASSOC_GRP_JOB;
			debug2("%pJ being held, assoc %u is at or exceeds group max jobs limit %u with %u for account %s",
			       job_ptr, assoc_ptr->id, assoc_ptr->grp_jobs,
			       assoc_ptr->usage->used_jobs, assoc_ptr->acct);

			rc = false;
			goto end_it;
		}

		/* we don't need to check grp_cpu_run_mins here */

		/* we don't need to check grp_nodes here */

		/* we don't need to check submit_jobs here */

		/* GrpWall: admin-set time limits bypass this check. */
		if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
		    && (qos_rec.grp_wall == INFINITE)
		    && (assoc_ptr->grp_wall != INFINITE)) {
			if (time_limit == NO_VAL) {
				time_limit = job_ptr->time_limit;
				_set_time_limit(&time_limit,
						job_ptr->part_ptr->max_time,
						MIN(assoc_ptr->grp_wall,
						    assoc_ptr->max_wall_pj),
						&job_ptr->limit_set.time);

				/* Account for usage factor, if necessary */
				if ((job_ptr->qos_ptr &&
				     (job_ptr->qos_ptr->flags &
				      QOS_FLAG_USAGE_FACTOR_SAFE) &&
				     (job_ptr->qos_ptr->usage_factor >= 0)) &&
				    ((time_limit != INFINITE) ||
				     (job_ptr->qos_ptr->usage_factor < 1.0))) {
					time_limit *=
						job_ptr->qos_ptr->usage_factor;
				}
			}

			if (wall_mins >= assoc_ptr->grp_wall) {
				xfree(job_ptr->state_desc);
				job_ptr->state_reason = WAIT_ASSOC_GRP_WALL;
				debug2("%pJ being held, assoc %u is at or exceeds group wall limit %u with %u for account %s",
				       job_ptr, assoc_ptr->id,
				       assoc_ptr->grp_wall,
				       wall_mins, assoc_ptr->acct);
				rc = false;
				goto end_it;
			} else if (safe_limits &&
				   ((wall_mins + time_limit) >
				    assoc_ptr->grp_wall)) {
				/* Safe limits: hold unless the job can run
				 * to completion within the remaining wall. */
				xfree(job_ptr->state_desc);
				job_ptr->state_reason = WAIT_ASSOC_GRP_WALL;
				debug2("%pJ being held, the job request with assoc %u will exceed group wall limit %u if ran with %u for account %s",
				       job_ptr, assoc_ptr->id,
				       assoc_ptr->grp_wall,
				       wall_mins + time_limit, assoc_ptr->acct);
				rc = false;
				goto end_it;
			}
		}

		/*
		 * We don't need to look at the regular limits for parents
		 * since we have pre-propagated them, so just continue with
		 * the next parent.
		 */
		if (parent) {
			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
			continue;
		}

		/* we don't need to check max_cpu_mins_pj here */

		/* we don't need to check max_cpus_pj here */

		/* MaxJobs: skipped if a QOS already set a per-acct/user one */
		if ((qos_rec.max_jobs_pa == INFINITE) &&
		    (qos_rec.max_jobs_pu == INFINITE) &&
		    (assoc_ptr->max_jobs != INFINITE) &&
		    (assoc_ptr->usage->used_jobs >= assoc_ptr->max_jobs)) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_ASSOC_MAX_JOBS;
			debug2("%pJ being held, assoc %u is at or exceeds max jobs limit %u with %u for account %s",
			       job_ptr, assoc_ptr->id,
			       assoc_ptr->max_jobs,
			       assoc_ptr->usage->used_jobs, assoc_ptr->acct);
			rc = false;
			goto end_it;
		}

		/* we don't need to check submit_jobs here */

		/*
		 * if the association limits have changed since job
		 * submission and job can not run, then kill it
		 */
		if ((job_ptr->limit_set.time != ADMIN_SET_LIMIT)
		    && (qos_rec.max_wall_pj == INFINITE)
		    && (assoc_ptr->max_wall_pj != INFINITE)) {
			if (time_limit == NO_VAL) {
				time_limit = job_ptr->time_limit;
				_set_time_limit(&time_limit,
						job_ptr->part_ptr->max_time,
						assoc_ptr->max_wall_pj,
						&job_ptr->limit_set.time);

				/* Account for usage factor, if necessary */
				if ((job_ptr->qos_ptr &&
				     (job_ptr->qos_ptr->flags &
				      QOS_FLAG_USAGE_FACTOR_SAFE) &&
				     (job_ptr->qos_ptr->usage_factor >= 0)) &&
				    ((time_limit != INFINITE) ||
				     (job_ptr->qos_ptr->usage_factor < 1.0))) {
					time_limit *=
						job_ptr->qos_ptr->usage_factor;
				}
			}

			if (time_limit > assoc_ptr->max_wall_pj) {
				xfree(job_ptr->state_desc);
				job_ptr->state_reason =
					WAIT_ASSOC_MAX_WALL_PER_JOB;
				/*
				 * NOTE(review): the "account max" printed
				 * here is the computed time_limit, not
				 * assoc_ptr->max_wall_pj — confirm whether
				 * that is intended.
				 */
				debug2("%pJ being held, time limit %u exceeds account max %u",
				       job_ptr, job_ptr->time_limit,
				       time_limit);
				rc = false;
				goto end_it;
			}
		}
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
		parent = 1;
	}
end_it:
	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);
	slurmdb_free_qos_rec_members(&qos_rec);

	return rc;
}
/*
* acct_policy_job_runnable_post_select - After nodes have been
* selected for the job verify the counts don't exceed aggregated limits.
*/
extern bool acct_policy_job_runnable_post_select(job_record_t *job_ptr,
						 uint64_t *tres_req_cnt,
						 bool assoc_mgr_locked)
{
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	slurmdb_qos_rec_t qos_rec;	/* limits already claimed by a QOS */
	slurmdb_assoc_rec_t *assoc_ptr;
	uint64_t grp_tres_ctld[slurmctld_tres_cnt];
	uint64_t max_tres_ctld[slurmctld_tres_cnt];
	uint64_t tres_usage_mins[slurmctld_tres_cnt];
	uint64_t tres_run_mins[slurmctld_tres_cnt];
	uint64_t job_tres_time_limit[slurmctld_tres_cnt];
	uint64_t orig_node_cnt;
	uint32_t time_limit;
	bool rc = true;
	bool safe_limits = false;
	int i, tres_pos = 0;
	acct_policy_tres_usage_t tres_usage;
	double usage_factor = 1.0;
	double limit_factor = -1.0;	/* -1 means no QOS LimitFactor */
	int parent = 0; /* flag to tell us if we are looking at the
			 * parent or not
			 */
	assoc_mgr_lock_t locks =
		{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };

	xassert(job_ptr);
	xassert(job_ptr->part_ptr);
	xassert(tres_req_cnt);

	/* check to see if we are enforcing associations */
	if (!accounting_enforce)
		return true;

	/* probably don't need to check this here */
	/* if (!_valid_job_assoc(job_ptr)) { */
	/* job_ptr->state_reason = FAIL_ACCOUNT; */
	/* return false; */
	/* } */

	/* now see if we are enforcing limits */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return true;

	/* check to see if we should be using safe limits, if so we
	 * will only start a job if there are sufficient remaining
	 * cpu-minutes for it to run to completion */
	if (accounting_enforce & ACCOUNTING_ENFORCE_SAFE)
		safe_limits = true;

	/* clear old state reason */
	if (job_state_reason_check(job_ptr->state_reason, JSR_QOS_ASSOC)) {
		xfree(job_ptr->state_desc);
		job_ptr->state_reason = WAIT_NO_REASON;
	}

	job_ptr->qos_blocking_ptr = NULL;

	/* clang needs this memset to avoid a warning */
	memset(tres_run_mins, 0, sizeof(tres_run_mins));
	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));
	memset(job_tres_time_limit, 0, sizeof(job_tres_time_limit));

	time_limit = job_ptr->time_limit;
	_set_time_limit(&time_limit, job_ptr->part_ptr->max_time,
			job_ptr->part_ptr->default_time, NULL);

	if (job_ptr->qos_ptr) {
		usage_factor = job_ptr->qos_ptr->usage_factor;

		if ((usage_factor >= 0) &&
		    (job_ptr->qos_ptr->flags & QOS_FLAG_USAGE_FACTOR_SAFE) &&
		    ((time_limit != INFINITE) || (usage_factor < 1.0))) {
			time_limit *= usage_factor;
		}
	}

	/* Projected TRES-minutes this job would consume if started now. */
	for (i=0; i<slurmctld_tres_cnt; i++)
		job_tres_time_limit[i] = (uint64_t)time_limit * tres_req_cnt[i];

	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	assoc_mgr_set_qos_tres_cnt(&qos_rec);

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	/* check the first QOS setting it's values in the qos_rec */
	if (qos_ptr_1 &&
	    !(rc = _qos_job_runnable_post_select(job_ptr, qos_ptr_1,
						 &qos_rec, tres_req_cnt,
						 job_tres_time_limit)))
		goto end_it;

	/* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
	if (qos_ptr_2 &&
	    !(rc = _qos_job_runnable_post_select(job_ptr, qos_ptr_2,
						 &qos_rec, tres_req_cnt,
						 job_tres_time_limit)))
		goto end_it;

	/* First QOS with a LimitFactor set wins. */
	if (qos_ptr_1 && !fuzzy_equal(qos_ptr_1->limit_factor, INFINITE))
		limit_factor = qos_ptr_1->limit_factor;
	else if (qos_ptr_2 && !fuzzy_equal(qos_ptr_2->limit_factor, INFINITE))
		limit_factor = qos_ptr_2->limit_factor;

	/* Walk the association tree upward, checking each level's limits. */
	assoc_ptr = job_ptr->assoc_ptr;
	while (assoc_ptr) {
		for (i = 0; i < slurmctld_tres_cnt; i++) {
			tres_usage_mins[i] =
				(uint64_t)(assoc_ptr->usage->usage_tres_raw[i]
					   / 60);
			tres_run_mins[i] =
				assoc_ptr->usage->grp_used_tres_run_secs[i] /
				60;

			/*
			 * Clear usage if factor is 0 so that jobs can run.
			 * Otherwise multiplying can cause more jobs to be run
			 * than the limit allows (e.g. usagefactor=.5).
			 */
			if (usage_factor == 0.0) {
				tres_usage_mins[i] *= usage_factor;
				tres_run_mins[i] *= usage_factor;
			}
			grp_tres_ctld[i] = assoc_ptr->grp_tres_ctld[i];
			max_tres_ctld[i] = assoc_ptr->max_tres_ctld[i];
			_apply_limit_factor(&grp_tres_ctld[i], limit_factor);
			_apply_limit_factor(&max_tres_ctld[i], limit_factor);
		}

#if _DEBUG
		info("acct_job_limits: %u of %u",
		     assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs);
#endif
		/*
		 * If the association has a GrpCPUMins limit set (and there
		 * is no QOS with GrpCPUMins set) we may hold the job
		 */
		tres_usage = _validate_tres_usage_limits_for_assoc(
			&tres_pos, assoc_ptr->grp_tres_mins_ctld,
			qos_rec.grp_tres_mins_ctld,
			job_tres_time_limit, tres_run_mins,
			tres_usage_mins, job_ptr->limit_set.tres,
			safe_limits);
		switch (tres_usage) {
		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
			debug2("%pJ being held, assoc %u(%s/%s/%s) group max tres(%s) minutes limit of %"PRIu64" is already at or exceeded with %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
			       tres_usage_mins[tres_pos]);
			rc = false;
			goto end_it;
			break;	/* unreachable after goto; defensive */
		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
			debug2("%pJ being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
			       job_tres_time_limit[tres_pos]);
			rc = false;
			goto end_it;
			break;	/* unreachable after goto; defensive */
		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
			/*
			 * If we're using safe limits start
			 * the job only if there are
			 * sufficient cpu-mins left such that
			 * it will run to completion without
			 * being killed
			 */
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
			debug2("%pJ being held, the job is at or exceeds assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" of which %"PRIu64" are still available but request is for %"PRIu64" (plus %"PRIu64" already in use) tres minutes (request tres count %"PRIu64")",
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       assoc_ptr->grp_tres_mins_ctld[tres_pos],
			       assoc_ptr->grp_tres_mins_ctld[tres_pos] -
			       tres_usage_mins[tres_pos],
			       job_tres_time_limit[tres_pos],
			       tres_run_mins[tres_pos],
			       tres_req_cnt[tres_pos]);
			rc = false;
			goto end_it;
			break;	/* unreachable after goto; defensive */
		case TRES_USAGE_OKAY:
			/* all good */
			break;
		}

		/*
		 * GrpTRES node check counts only nodes not already in use by
		 * this association (grp_node_bitmap); the requested node
		 * count is restored afterwards.
		 */
		orig_node_cnt = tres_req_cnt[TRES_ARRAY_NODE];
		_get_unique_job_node_cnt(job_ptr,
					 assoc_ptr->usage->grp_node_bitmap,
					 &tres_req_cnt[TRES_ARRAY_NODE]);
		tres_usage = _validate_tres_usage_limits_for_assoc(
			&tres_pos,
			grp_tres_ctld, qos_rec.grp_tres_ctld,
			tres_req_cnt, assoc_ptr->usage->grp_used_tres,
			NULL, job_ptr->limit_set.tres, true);
		tres_req_cnt[TRES_ARRAY_NODE] = orig_node_cnt;
		switch (tres_usage) {
		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
			/* not possible because the curr_usage sent in is NULL*/
			break;
		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK);
			debug2("%pJ is being held, assoc %u(%s/%s/%s) min tres(%s) request %"PRIu64" exceeds group max tres limit %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       tres_req_cnt[tres_pos],
			       grp_tres_ctld[tres_pos]);
			rc = false;
			goto end_it;
			break;	/* unreachable after goto; defensive */
		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK);
			debug2("%pJ being held, if allowed the job request will exceed assoc %u(%s/%s/%s) group max tres(%s) limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       grp_tres_ctld[tres_pos],
			       assoc_ptr->usage->grp_used_tres[tres_pos],
			       tres_req_cnt[tres_pos]);
			rc = false;
			goto end_it;
		case TRES_USAGE_OKAY:
			/* all good */
			break;
		}

		/* we don't need to check grp_jobs here */

		tres_usage = _validate_tres_usage_limits_for_assoc(
			&tres_pos,
			assoc_ptr->grp_tres_run_mins_ctld,
			qos_rec.grp_tres_run_mins_ctld,
			job_tres_time_limit, tres_run_mins, NULL, NULL, true);
		switch (tres_usage) {
		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
			/* not possible because the curr_usage sent in is NULL*/
			break;
		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
			debug2("%pJ is being held, assoc %u(%s/%s/%s) group max running tres(%s) minutes request %"PRIu64" exceeds limit %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       job_tres_time_limit[tres_pos],
			       assoc_ptr->grp_tres_run_mins_ctld[tres_pos]);
			rc = false;
			goto end_it;
			break;	/* unreachable after goto; defensive */
		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
			debug2("%pJ being held, if allowed the job request will exceed assoc %u(%s/%s/%s) group max running tres(%s) minutes limit %"PRIu64" with already used %"PRIu64" + requested %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       assoc_ptr->grp_tres_run_mins_ctld[tres_pos],
			       tres_run_mins[tres_pos],
			       job_tres_time_limit[tres_pos]);
			rc = false;
			goto end_it;
			break;	/* unreachable after goto; defensive */
		case TRES_USAGE_OKAY:
			/* all good */
			break;
		}

		/* we don't need to check submit_jobs here */

		/* we don't need to check grp_wall here */

		/* We don't need to look at the regular limits for
		 * parents since we have pre-propagated them, so just
		 * continue with the next parent
		 */
		if (parent) {
			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
			continue;
		}

		if (!_validate_tres_limits_for_assoc(
			    &tres_pos, job_tres_time_limit, 0,
			    assoc_ptr->max_tres_mins_ctld,
			    qos_rec.max_tres_mins_pj_ctld,
			    job_ptr->limit_set.tres,
			    1, 0, 1)) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
			debug2("%pJ being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) minutes of %"PRIu64" with %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       assoc_ptr->max_tres_mins_ctld[tres_pos],
			       job_tres_time_limit[tres_pos]);
			rc = false;
			goto end_it;
		}

		if (!_validate_tres_limits_for_assoc(
			    &tres_pos, tres_req_cnt, 0,
			    max_tres_ctld,
			    qos_rec.max_tres_pj_ctld,
			    job_ptr->limit_set.tres,
			    1, 0, 1)) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
			debug2("%pJ is being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) limit of %"PRIu64" with %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       max_tres_ctld[tres_pos],
			       tres_req_cnt[tres_pos]);
			rc = false;
			break;
		}

		if (!_validate_tres_limits_for_assoc(
			    &tres_pos, tres_req_cnt,
			    tres_req_cnt[TRES_ARRAY_NODE],
			    assoc_ptr->max_tres_pn_ctld,
			    qos_rec.max_tres_pn_ctld,
			    job_ptr->limit_set.tres,
			    1, 0, 1)) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = _get_tres_state_reason(
				tres_pos, WAIT_ASSOC_MAX_UNK_PER_NODE);
			debug2("%pJ is being held, the job is requesting more than allowed with assoc %u(%s/%s/%s) max tres(%s) per node limit of %"PRIu64" with %"PRIu64,
			       job_ptr, assoc_ptr->id, assoc_ptr->acct,
			       assoc_ptr->user, assoc_ptr->partition,
			       assoc_mgr_tres_name_array[tres_pos],
			       assoc_ptr->max_tres_pn_ctld[tres_pos],
			       tres_req_cnt[tres_pos]);
			rc = false;
			break;
		}

		/* we do not need to check max_jobs here */

		/* we don't need to check submit_jobs here */

		/* we don't need to check max_wall_pj here */

		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
		parent = 1;
	}
end_it:
	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);
	slurmdb_free_qos_rec_members(&qos_rec);

	FREE_NULL_BITMAP(job_ptr->node_bitmap_preempt);

	return rc;
}
/*
 * Determine the maximum node count the given job may use based on its
 * QOS and association limits.
 *
 * IN job_ptr - job to inspect
 * OUT wait_reason - set to the reason of the most restrictive limit found
 * RET the node limit, or INFINITE64 if no limit applies (or limits are
 *     not being enforced)
 */
extern uint32_t acct_policy_get_max_nodes(job_record_t *job_ptr,
					  uint32_t *wait_reason)
{
	uint64_t max_nodes_limit = INFINITE64, qos_max_p_limit = INFINITE64,
		grp_nodes = INFINITE64;
	assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .qos = READ_LOCK };
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
	bool parent = 0; /* flag to tell us if we are looking at the
			  * parent or not
			  */
	bool grp_set = 0;
	double limit_factor = -1.0;

	/* check to see if we are enforcing associations */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return max_nodes_limit;

	xassert(wait_reason);

	assoc_mgr_lock(&locks);

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	if (qos_ptr_1) {
		uint64_t max_nodes_pj =
			qos_ptr_1->max_tres_pj_ctld[TRES_ARRAY_NODE];
		uint64_t max_nodes_pu =
			qos_ptr_1->max_tres_pu_ctld[TRES_ARRAY_NODE];
		uint64_t max_nodes_pa =
			qos_ptr_1->max_tres_pa_ctld[TRES_ARRAY_NODE];

		grp_nodes = qos_ptr_1->grp_tres_ctld[TRES_ARRAY_NODE];

		if (!fuzzy_equal(qos_ptr_1->limit_factor, INFINITE))
			limit_factor = qos_ptr_1->limit_factor;

		/* Fill in any limit the first QOS left unset */
		if (qos_ptr_2) {
			if (max_nodes_pa == INFINITE64)
				max_nodes_pa = qos_ptr_2->max_tres_pa_ctld[
					TRES_ARRAY_NODE];
			if (max_nodes_pj == INFINITE64)
				max_nodes_pj = qos_ptr_2->max_tres_pj_ctld[
					TRES_ARRAY_NODE];
			if (max_nodes_pu == INFINITE64)
				max_nodes_pu = qos_ptr_2->max_tres_pu_ctld[
					TRES_ARRAY_NODE];
			if (grp_nodes == INFINITE64)
				grp_nodes = qos_ptr_2->grp_tres_ctld[
					TRES_ARRAY_NODE];
			/*
			 * Fix: test qos_ptr_2's limit_factor here.  The
			 * previous code re-tested qos_ptr_1's factor, which
			 * is unset (INFINITE) whenever limit_factor is still
			 * -1.0, so qos_ptr_2's factor could never be applied.
			 */
			if ((limit_factor == -1.0) &&
			    !fuzzy_equal(qos_ptr_2->limit_factor, INFINITE))
				limit_factor = qos_ptr_2->limit_factor;
		}

		if (max_nodes_pa < max_nodes_limit) {
			max_nodes_limit = max_nodes_pa;
			*wait_reason = WAIT_QOS_MAX_NODE_PER_ACCT;
		}

		if (max_nodes_pj < max_nodes_limit) {
			max_nodes_limit = max_nodes_pj;
			*wait_reason = WAIT_QOS_MAX_NODE_PER_JOB;
		}

		if (max_nodes_pu < max_nodes_limit) {
			max_nodes_limit = max_nodes_pu;
			*wait_reason = WAIT_QOS_MAX_NODE_PER_USER;
		}

		/* Remember the per-job/user/acct QOS limit before grp */
		qos_max_p_limit = max_nodes_limit;

		if (grp_nodes < max_nodes_limit) {
			max_nodes_limit = grp_nodes;
			*wait_reason = WAIT_QOS_GRP_NODE;
		}
	}

	/* We have to traverse all the associations because QOS might
	   not override a particular limit.
	*/
	while (assoc_ptr) {
		uint64_t node_limit = assoc_ptr->grp_tres_ctld[TRES_ARRAY_NODE];

		_apply_limit_factor(&node_limit, limit_factor);

		/* Assoc group limit only counts if no QOS group limit set */
		if ((!qos_ptr_1 || (grp_nodes == INFINITE64))
		    && (node_limit != INFINITE64)
		    && (node_limit < max_nodes_limit)) {
			max_nodes_limit = node_limit;
			*wait_reason = WAIT_ASSOC_GRP_NODE;
			grp_set = 1;
		}

		node_limit = assoc_ptr->max_tres_ctld[TRES_ARRAY_NODE];
		_apply_limit_factor(&node_limit, limit_factor);
		/* Per-job max only applies on the job's own association */
		if (!parent
		    && (qos_max_p_limit == INFINITE64)
		    && (node_limit != INFINITE64)
		    && (node_limit < max_nodes_limit)) {
			max_nodes_limit = node_limit;
			*wait_reason = WAIT_ASSOC_MAX_NODE_PER_JOB;
		}

		/* only check the first grp set */
		if (grp_set)
			break;

		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
		parent = 1;
	}

	assoc_mgr_unlock(&locks);
	return max_nodes_limit;
}
/*
* acct_policy_update_pending_job - Make sure the limits imposed on a job on
* submission are correct after an update to a qos or association. If
* the association/qos limits prevent the job from running (lowered
* limits since job submission), then reset its reason field.
*/
extern int acct_policy_update_pending_job(job_record_t *job_ptr)
{
	job_desc_msg_t job_desc;
	acct_policy_limit_set_t acct_policy_limit_set;
	bool update_accounting = false;
	job_details_t *details_ptr;
	int rc = SLURM_SUCCESS;
	/* VLA sized to the controller's current TRES count */
	uint64_t tres_req_cnt[slurmctld_tres_cnt];

	/* check to see if we are enforcing associations and the job
	 * is pending or if we are even enforcing limits. */
	if (!accounting_enforce || !IS_JOB_PENDING(job_ptr)
	    || !(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return SLURM_SUCCESS;

	details_ptr = job_ptr->details;

	if (!details_ptr) {
		error("acct_policy_update_pending_job: no details");
		return SLURM_ERROR;
	}

	/* set up the job desc to make sure things are the way we
	 * need.
	 */
	slurm_init_job_desc_msg(&job_desc);

	/* copy the limits set from the job the only one that
	 * acct_policy_validate changes is the time limit so we
	 * should be ok with the memcpy here */
	memcpy(&acct_policy_limit_set, &job_ptr->limit_set,
	       sizeof(acct_policy_limit_set_t));
	job_desc.tres_req_cnt = tres_req_cnt;
	/* copy all the tres requests over */
	memcpy(job_desc.tres_req_cnt, job_ptr->tres_req_cnt,
	       sizeof(uint64_t) * slurmctld_tres_cnt);

	/* Only set this value if not set from a limit */
	if (job_ptr->limit_set.time == ADMIN_SET_LIMIT)
		acct_policy_limit_set.time = job_ptr->limit_set.time;
	else if ((job_ptr->time_limit != NO_VAL) && !job_ptr->limit_set.time)
		job_desc.time_limit = job_ptr->time_limit;

	/* Re-validate; on failure state_reason was set by the call */
	if (!acct_policy_validate(&job_desc, job_ptr->part_ptr,
				  job_ptr->part_ptr_list,
				  job_ptr->assoc_ptr, job_ptr->qos_ptr,
				  &job_ptr->state_reason,
				  &acct_policy_limit_set, 0)) {
		info("%s: exceeded association/qos's cpu, node, memory or time limit for %pJ",
		     __func__, job_ptr);
		return SLURM_ERROR;
	}

	/* The only variable in acct_policy_limit_set that is changed
	 * in acct_policy_validate is the time limit so only worry
	 * about that one.
	 */

	/* If it isn't an admin set limit replace it. */
	if (!acct_policy_limit_set.time && (job_ptr->limit_set.time == 1)) {
		/* A previously limit-imposed time limit no longer applies */
		job_ptr->time_limit = NO_VAL;
		job_ptr->limit_set.time = 0;
		update_accounting = true;
	} else if (acct_policy_limit_set.time != ADMIN_SET_LIMIT) {
		if (job_ptr->time_limit != job_desc.time_limit) {
			job_ptr->time_limit = job_desc.time_limit;
			update_accounting = true;
		}
		job_ptr->limit_set.time = acct_policy_limit_set.time;
	}

	if (update_accounting) {
		last_job_update = time(NULL);
		debug("limits changed for %pJ: updating accounting", job_ptr);
		/* Update job record in accounting to reflect changes */
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
	}

	return rc;
}
/*
 * acct_policy_job_time_out - Determine if the specified job has timed
 *	out based on its QOS or association.
 */
extern bool acct_policy_job_time_out(job_record_t *job_ptr)
{
	uint64_t job_tres_usage_mins[slurmctld_tres_cnt];
	uint64_t time_delta;
	uint64_t tres_usage_mins[slurmctld_tres_cnt];
	uint32_t wall_mins, orig_node_cnt;
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	slurmdb_qos_rec_t qos_rec;
	slurmdb_assoc_rec_t *assoc = NULL;
	assoc_mgr_lock_t locks =
		{ .assoc = READ_LOCK, .qos = WRITE_LOCK, .tres = READ_LOCK };
	time_t now;
	int i, tres_pos = 0;
	acct_policy_tres_usage_t tres_usage;

	/*
	 * Now see if we are enforcing limits.  If Safe is set then
	 * return false as well since we are being safe if the limit
	 * was changed after the job was already deemed safe to start.
	 */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
	    || (accounting_enforce & ACCOUNTING_ENFORCE_SAFE))
		return false;

	slurmdb_init_qos_rec(&qos_rec, 0, INFINITE);

	assoc_mgr_lock(&locks);

	assoc_mgr_set_qos_tres_cnt(&qos_rec);

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	assoc = job_ptr->assoc_ptr;

	now = time(NULL);

	/* Wall minutes the job has actually run (suspended time excluded) */
	time_delta = (uint64_t)(((now - job_ptr->start_time) -
				 job_ptr->tot_sus_time) / 60);

	/* clang needs this memset to avoid a warning */
	/*
	 * Fix: zero each array with its own sizeof.  The previous code used
	 * sizeof(tres_usage_mins) for both, which only worked because the
	 * two VLAs happen to have identical element counts.
	 */
	memset(job_tres_usage_mins, 0, sizeof(job_tres_usage_mins));
	memset(tres_usage_mins, 0, sizeof(tres_usage_mins));

	/*
	 * find out how many CPU minutes this job has been running for.
	 * We add 1 here to make it so we can check for just > instead of
	 * >= in our checks.
	 */
	for (i = 0; i < slurmctld_tres_cnt; i++) {
		if (i == TRES_ARRAY_ENERGY)
			continue;
		if (job_ptr->tres_alloc_cnt[i] == NO_CONSUME_VAL64)
			continue;

		if (job_ptr->tres_alloc_cnt[i]) {
			job_tres_usage_mins[i] =
				(time_delta * job_ptr->tres_alloc_cnt[i]) + 1;
		}
	}

	/* check the first QOS setting it's values in the qos_rec */
	if (qos_ptr_1 && !_qos_job_time_out(job_ptr, qos_ptr_1,
					    &qos_rec, job_tres_usage_mins))
		goto job_failed;

	/* If qos_ptr_1 didn't set the value use the 2nd QOS to set the limit */
	if (qos_ptr_2 && !_qos_job_time_out(job_ptr, qos_ptr_2,
					    &qos_rec, job_tres_usage_mins))
		goto job_failed;

	/* handle any association stuff here */
	while (assoc) {
		/* Convert raw TRES seconds of the whole assoc to minutes */
		for (i = 0; i < slurmctld_tres_cnt; i++)
			tres_usage_mins[i] =
				(uint64_t)(assoc->usage->usage_tres_raw[i]
					   / 60.0);
		wall_mins = assoc->usage->grp_used_wall / 60;

		tres_usage = _validate_tres_usage_limits_for_assoc(
			&tres_pos, assoc->grp_tres_mins_ctld,
			qos_rec.grp_tres_mins_ctld, job_tres_usage_mins,
			NULL, tres_usage_mins, NULL, false);
		switch (tres_usage) {
		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
			last_job_update = now;
			info("%pJ timed out, the job is at or exceeds assoc %u(%s/%s/%s) group max tres(%s) minutes of %"PRIu64" with %"PRIu64,
			     job_ptr, assoc->id, assoc->acct,
			     assoc->user, assoc->partition,
			     assoc_mgr_tres_name_array[tres_pos],
			     assoc->grp_tres_mins_ctld[tres_pos],
			     tres_usage_mins[tres_pos]);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
			xstrfmtcat(job_ptr->state_desc, "Job is at or exceeds association (acc=%s/user=%s/part=%s) group max TRES(%s) minutes of %"PRIu64" with %"PRIu64,
				   assoc->acct, assoc->user, assoc->partition,
				   assoc_mgr_tres_name_array[tres_pos],
				   assoc->grp_tres_mins_ctld[tres_pos],
				   tres_usage_mins[tres_pos]);
			goto job_failed;
			break;
		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
			/* not possible safe_limits is 0 */
		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
			/* not possible safe_limits is 0 */
		case TRES_USAGE_OKAY:
			/* all good */
			break;
		}

		/* Assoc group wall limit only applies if the QOS set none */
		if ((qos_rec.grp_wall == INFINITE)
		    && (assoc->grp_wall != INFINITE)
		    && (wall_mins >= assoc->grp_wall)) {
			info("%pJ timed out, assoc %u is at or exceeds group wall limit %u with %u for account %s",
			     job_ptr, assoc->id, assoc->grp_wall,
			     wall_mins, assoc->acct);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
			xstrfmtcat(job_ptr->state_desc, "Job is at or exceeds association (acc=%s/user=%s/part=%s) group wall limit %u with %u",
				   assoc->acct, assoc->user, assoc->partition,
				   assoc->grp_wall, wall_mins);
			break;
		}

		/* Exclude the node count from the per-job max TRES check */
		orig_node_cnt = job_tres_usage_mins[TRES_ARRAY_NODE];
		job_tres_usage_mins[TRES_ARRAY_NODE] = 0;
		tres_usage = _validate_tres_usage_limits_for_assoc(
			&tres_pos, assoc->max_tres_mins_ctld,
			qos_rec.max_tres_mins_pj_ctld, job_tres_usage_mins,
			NULL, NULL, NULL, true);
		job_tres_usage_mins[TRES_ARRAY_NODE] = orig_node_cnt;
		switch (tres_usage) {
		case TRES_USAGE_CUR_EXCEEDS_LIMIT:
			/* not possible curr_usage is NULL */
			break;
		case TRES_USAGE_REQ_EXCEEDS_LIMIT:
			last_job_update = now;
			info("%pJ timed out, the job is at or exceeds assoc %u(%s/%s/%s) max tres(%s) minutes of %"PRIu64" with %"PRIu64,
			     job_ptr, assoc->id, assoc->acct,
			     assoc->user, assoc->partition,
			     assoc_mgr_tres_name_array[tres_pos],
			     assoc->max_tres_mins_ctld[tres_pos],
			     job_tres_usage_mins[tres_pos]);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
			xstrfmtcat(job_ptr->state_desc, "Job is at or exceeds association (acc=%s/user=%s/part=%s) max TRES(%s) minutes of %"PRIu64" with %"PRIu64,
				   assoc->acct, assoc->user, assoc->partition,
				   assoc_mgr_tres_name_array[tres_pos],
				   assoc->max_tres_mins_ctld[tres_pos],
				   job_tres_usage_mins[tres_pos]);
			goto job_failed;
			break;
		case TRES_USAGE_REQ_NOT_SAFE_WITH_USAGE:
			/* not possible tres_usage is NULL */
		case TRES_USAGE_OKAY:
			/* all good */
			break;
		}

		assoc = assoc->usage->parent_assoc_ptr;
		/* these limits don't apply to the root assoc */
		if (assoc == assoc_mgr_root_assoc)
			break;
	}
job_failed:
	assoc_mgr_unlock(&locks);
	slurmdb_free_qos_rec_members(&qos_rec);

	if (job_ptr->state_reason == FAIL_TIMEOUT)
		return true;

	return false;
}
/*
 * Compute the most restrictive max-jobs-accrue limit for a job, looking
 * first at its QOS and then up the association hierarchy, stopping at the
 * first limit found.  Also reports how many more jobs may start accruing
 * under that limit via create_cnt_ptr.
 */
static void _get_accrue_limits(acct_policy_accrue_t *acct_policy_accrue,
			       uint32_t *max_jobs_accrue_ptr,
			       int *create_cnt_ptr)
{
	job_record_t *job_ptr = acct_policy_accrue->job_ptr;
	slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr;
	bool is_parent = false;

	xassert(verify_assoc_lock(ASSOC_LOCK, WRITE_LOCK));
	xassert(verify_assoc_lock(QOS_LOCK, WRITE_LOCK));

	if (qos_ptr) {
		_fill_in_qos_used_limits(qos_ptr, acct_policy_accrue);

		/* Find the most restrictive qos limit */
		_get_accrue_create_cnt(max_jobs_accrue_ptr, create_cnt_ptr,
				       qos_ptr->grp_jobs_accrue,
				       qos_ptr->usage->accrue_cnt);
		if (acct_policy_accrue->used_limits_acct)
			_get_accrue_create_cnt(
				max_jobs_accrue_ptr, create_cnt_ptr,
				qos_ptr->max_jobs_accrue_pa,
				acct_policy_accrue->used_limits_acct->
				accrue_cnt);

		if (acct_policy_accrue->used_limits_user)
			_get_accrue_create_cnt(
				max_jobs_accrue_ptr, create_cnt_ptr,
				qos_ptr->max_jobs_accrue_pu,
				acct_policy_accrue->used_limits_user->
				accrue_cnt);
	}

	/*
	 * Walk up the hierarchy until the first limit is found, whether it
	 * came from the QOS above or from an association.
	 */
	for (slurmdb_assoc_rec_t *assoc = job_ptr->assoc_ptr;
	     assoc && (*max_jobs_accrue_ptr == INFINITE);
	     assoc = assoc->usage->parent_assoc_ptr, is_parent = true) {
		_get_accrue_create_cnt(max_jobs_accrue_ptr, create_cnt_ptr,
				       assoc->grp_jobs_accrue,
				       assoc->usage->accrue_cnt);
		/*
		 * Regular (non-group) limits are pre-propagated to parents,
		 * so only the job's own association needs this check.
		 */
		if (!is_parent)
			_get_accrue_create_cnt(max_jobs_accrue_ptr,
					       create_cnt_ptr,
					       assoc->max_jobs_accrue,
					       assoc->usage->accrue_cnt);
	}
}
/*
 * Grant accrue time to a pending job (or to tasks split off a pending
 * array job), subject to the most restrictive max-jobs-accrue limit in
 * the job's QOS/association hierarchy.
 */
static void _handle_add_accrue(acct_policy_accrue_t *acct_policy_accrue)
{
	job_record_t *job_ptr = acct_policy_accrue->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	job_record_t *old_job_ptr;
	uint32_t max_jobs_accrue = INFINITE;

	_get_accrue_limits(acct_policy_accrue, &max_jobs_accrue,
			   &acct_policy_accrue->cnt);

	/* No limit (or there is space to accrue) */
	if ((max_jobs_accrue == INFINITE) ||
	    (acct_policy_accrue->cnt &&
	     (!job_ptr->array_recs || !job_ptr->array_recs->task_cnt))) {
		if (!details_ptr->accrue_time &&
		    job_ptr->details->begin_time) {
			/*
			 * If no limit and begin_time hasn't happened yet
			 * then set accrue_time to now.
			 */
			details_ptr->accrue_time =
				((max_jobs_accrue == INFINITE) &&
				 details_ptr->begin_time) ?
				details_ptr->begin_time : time(NULL);

			/*
			 * If we have an array here and no limit we want to add
			 * all the tasks in the array.
			 */
			if (job_ptr->array_recs &&
			    job_ptr->array_recs->task_cnt)
				acct_policy_accrue->cnt =
					job_ptr->array_recs->task_cnt;
			else
				acct_policy_accrue->cnt = 1;

			_add_accrue_time_internal(job_ptr->qos_ptr,
						  acct_policy_accrue);
		}
		return;
	}

	/* Looks like we are at the limit */
	if (!acct_policy_accrue->cnt) {
		log_flag(ACCRUE, "%s: %pJ can't accrue, we are over a limit",
			 __func__, job_ptr);
		return;
	}

	/*
	 * array_recs is non-NULL here: the branch above only falls through
	 * for an array job with task_cnt set when a limit is in place.
	 */
	acct_policy_accrue->cnt = MIN(acct_policy_accrue->cnt,
				      job_ptr->array_recs->task_cnt);

	/* How many can we spin off? */
	for (int i = 0; i < acct_policy_accrue->cnt; i++) {
		/*
		 * After we split off the old_job_ptr is what we want to alter
		 * as the job_ptr returned from job_array_post_sched will be the
		 * master job_ptr for the array and we will use that to split
		 * more off if needed.
		 */
		old_job_ptr = job_ptr;

		job_array_pre_sched(job_ptr);
		job_ptr = job_array_post_sched(job_ptr, true);

		details_ptr = old_job_ptr->details;
		if (!details_ptr) {
			fatal_abort("%s: no details after split", __func__);
			return;
		}
		details_ptr->accrue_time = acct_policy_accrue->now;
		log_flag(ACCRUE, "%pJ is now accruing time %ld",
			 old_job_ptr, acct_policy_accrue->now);
	}

	/*
	 * Here we are ok to use all the same pointers from the main job_ptr as
	 * an array will always have the same pointers. If this ever changes in
	 * the future some how we will need to address it.
	 */
	_add_accrue_time_internal(job_ptr->qos_ptr, acct_policy_accrue);
}
/*
 * Reconcile a job's accrue state: remove accrue counts once the job is no
 * longer pending, then (for still-pending jobs or pending array tasks)
 * hand off to _handle_add_accrue() to grant accrue time.
 */
static void _handle_accrue_time(acct_policy_accrue_t *acct_policy_accrue)
{
	job_record_t *job_ptr = acct_policy_accrue->job_ptr;

	/* We have started running, let's clear us out of the mix. */
	if (job_ptr->details->accrue_time) {
		if (!(job_ptr->bit_flags & JOB_ACCRUE_OVER) &&
		    !IS_JOB_PENDING(job_ptr)) {
			/*
			 * Normally only single jobs come in here, but if we
			 * don't have any limits and an array is cancelled the
			 * array itself comes in so we need to remove all of it.
			 */
			if (job_ptr->array_recs &&
			    job_ptr->array_recs->task_cnt)
				acct_policy_accrue->cnt =
					job_ptr->array_recs->task_cnt;
			else
				acct_policy_accrue->cnt = 1;

			/* We only want to handle this once */
			job_ptr->bit_flags |= JOB_ACCRUE_OVER;

			(void) _for_each_qos_remove_accrue_time(
				job_ptr->qos_ptr, acct_policy_accrue);
		}

		/* We already have our time and we aren't an array, endit */
		if (!IS_JOB_PENDING(job_ptr) ||
		    !job_ptr->array_recs || !job_ptr->array_recs->task_cnt)
			return;
	} else if (!IS_JOB_PENDING(job_ptr))
		return;

	/* Still pending (or a pending array): try to accrue */
	_handle_add_accrue(acct_policy_accrue);
}
/*
 * Set or clear a job's accrue_time according to accrue-limit policy.
 *
 * With PriorityFlags=ACCRUE_ALWAYS the accrue time is simply the submit
 * time and no limits apply.  Without limit enforcement it is the begin
 * time.  Otherwise accrue handling is delegated to _handle_accrue_time()
 * using the job's highest priority QOS.
 *
 * IN job_ptr - job to handle
 * IN assoc_mgr_locked - true if the caller already holds assoc mgr locks
 * RET SLURM_SUCCESS, or SLURM_ERROR if details/assoc are missing
 */
extern int acct_policy_handle_accrue_time(job_record_t *job_ptr,
					  bool assoc_mgr_locked)
{
	job_details_t *details_ptr;
	int rc = SLURM_SUCCESS;
	time_t now = time(NULL);
	/*
	 * Designated initializers (matching the rest of this file) instead of
	 * positional ones, which silently break if assoc_mgr_lock_t's field
	 * order ever changes.
	 */
	assoc_mgr_lock_t locks = { .assoc = WRITE_LOCK, .qos = WRITE_LOCK };

	details_ptr = job_ptr->details;
	if (!details_ptr) {
		error("%s: no details", __func__);
		return SLURM_ERROR;
	}

	/*
	 * ACCRUE_ALWAYS flag will always force the accrue_time to be the
	 * submit_time (Not begin). Accrue limits don't work with this flag.
	 */
	if (slurm_conf.priority_flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) {
		if (!details_ptr->accrue_time)
			details_ptr->accrue_time = details_ptr->submit_time;
		return SLURM_SUCCESS;
	}

	/* Always set accrue_time to begin time when not enforcing limits. */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
		if (!details_ptr->accrue_time)
			details_ptr->accrue_time = details_ptr->begin_time;
		return SLURM_SUCCESS;
	}

	/*
	 * If the job is not eligible because it is either held, dependent or
	 * because its begin time is in the future don't accrue time.
	 */
	if (!job_ptr->priority || (job_ptr->bit_flags & JOB_DEPENDENT) ||
	    (details_ptr->begin_time && (details_ptr->begin_time > now)))
		return SLURM_SUCCESS;

	/* No accrue_time and the job isn't pending, bail */
	if (!details_ptr->accrue_time && !IS_JOB_PENDING(job_ptr))
		return SLURM_SUCCESS;

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	if (!job_ptr->assoc_ptr) {
		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
		      __func__, job_ptr);
		rc = SLURM_ERROR;
	} else {
		slurmdb_qos_rec_t *orig_qos_ptr = job_ptr->qos_ptr;
		acct_policy_accrue_t acct_policy_accrue = {
			.acct = job_ptr->assoc_ptr->acct,
			.assoc_ptr = job_ptr->assoc_ptr,
			.job_ptr = job_ptr,
			.now = now,
			.uid = job_ptr->user_id,
		};

		_set_highest_prio_qos_ptr(job_ptr);
		_handle_accrue_time(&acct_policy_accrue);

		/*
		 * Now that we are done with accrue set things back to the way
		 * it was qos wise. Accrue limits are always based on the
		 * highest priority QOS.
		 */
		if (job_ptr->qos_ptr != orig_qos_ptr) {
			job_ptr->qos_ptr = orig_qos_ptr;
			job_ptr->qos_id = orig_qos_ptr->id;
		}
	}

	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);

	return rc;
}
/*
 * Add accrue time to a pending job (honoring accrue limits), or clear any
 * stale accrue_time if the job is currently ineligible (held, dependent,
 * or with a future begin time).  No-op when ACCRUE_ALWAYS is set or when
 * limits are not enforced.
 *
 * IN job_ptr - job to handle
 * IN assoc_mgr_locked - true if the caller already holds assoc mgr locks
 */
extern void acct_policy_add_accrue_time(job_record_t *job_ptr,
					bool assoc_mgr_locked)
{
	slurmdb_assoc_rec_t *assoc_ptr;
	/*
	 * Designated initializers (matching the rest of this file) instead of
	 * positional ones, which silently break if assoc_mgr_lock_t's field
	 * order ever changes.
	 */
	assoc_mgr_lock_t locks = { .assoc = WRITE_LOCK, .qos = WRITE_LOCK };
	job_details_t *details_ptr = job_ptr->details;
	time_t now = time(NULL);
	acct_policy_accrue_t acct_policy_accrue = {
		.assoc_ptr = job_ptr->assoc_ptr,
		.job_ptr = job_ptr,
		.now = now,
		.uid = job_ptr->user_id,
	};

	/*
	 * ACCRUE_ALWAYS flag will always force the accrue_time to be the
	 * submit_time (Not begin). Accrue limits don't work with this flag.
	 */
	if (slurm_conf.priority_flags & PRIORITY_FLAGS_ACCRUE_ALWAYS)
		return;

	/* check to see if we are enforcing limits */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return;

	/*
	 * If the job is not eligible because it is either held, dependent or
	 * because its begin time is in the future don't accrue time.
	 */
	if (!job_ptr->priority || (job_ptr->bit_flags & JOB_DEPENDENT) ||
	    (details_ptr &&
	     (details_ptr->begin_time && (details_ptr->begin_time > now)))) {
		/*
		 * If the job was previously accruing time (for example,
		 * ACCRUE_ALWAYS could have been on or not having
		 * ACCOUNTING_ENFORCE_LIMITS), we need to remove the accrue_time.
		 */
		if (details_ptr)
			details_ptr->accrue_time = 0;
		return;
	}

	/* Job has to be pending to accrue time. */
	if (!IS_JOB_PENDING(job_ptr))
		return;

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	assoc_ptr = job_ptr->assoc_ptr;
	if (!assoc_ptr) {
		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
		      __func__, job_ptr);
		goto endit;
	}

	acct_policy_accrue.acct = job_ptr->assoc_ptr->acct;

	/* Accrue limits always come from the highest priority QOS */
	_set_highest_prio_qos_ptr(job_ptr);
	_handle_add_accrue(&acct_policy_accrue);

endit:
	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);
}
/*
 * Remove a pending job's accrue time from the QOS accounting and reset the
 * job's accrue_time.  No-op when ACCRUE_ALWAYS is set, when limits are not
 * enforced, or when the job has no accrue_time / is not pending.
 *
 * IN job_ptr - job to handle
 * IN assoc_mgr_locked - true if the caller already holds assoc mgr locks
 */
extern void acct_policy_remove_accrue_time(job_record_t *job_ptr,
					   bool assoc_mgr_locked)
{
	assoc_mgr_lock_t locks = { .assoc = WRITE_LOCK, .qos = WRITE_LOCK };
	acct_policy_accrue_t acct_policy_accrue = {
		.uid = job_ptr->user_id,
	};

	/*
	 * ACCRUE_ALWAYS flag will always force the accrue_time to be the
	 * submit_time (Not begin). Accrue limits don't work with this flag.
	 */
	if (slurm_conf.priority_flags & PRIORITY_FLAGS_ACCRUE_ALWAYS)
		return;

	/* check to see if we are enforcing limits */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return;

	if (!job_ptr->details || !job_ptr->details->accrue_time)
		return;

	/* Job has to be pending to accrue time. */
	if (!IS_JOB_PENDING(job_ptr))
		return;

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	acct_policy_accrue.assoc_ptr = job_ptr->assoc_ptr;
	if (!acct_policy_accrue.assoc_ptr) {
		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
		      __func__, job_ptr);
		goto end_it;
	}
	acct_policy_accrue.acct = acct_policy_accrue.assoc_ptr->acct;

	/*
	 * Normally only single jobs come in here, but if we don't have any
	 * limits the array itself comes in so we need to add it all.
	 */
	if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
		acct_policy_accrue.cnt = job_ptr->array_recs->task_cnt;
	else
		acct_policy_accrue.cnt = 1;

	/* Accrue limits always come from the highest priority QOS */
	_set_highest_prio_qos_ptr(job_ptr);
	(void) _for_each_qos_remove_accrue_time(
		job_ptr->qos_ptr, &acct_policy_accrue);

	/* reset the job */
	job_ptr->details->accrue_time = 0;
	job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;

end_it:
	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);
}
/*
 * Return the minimum priority threshold for the job, taken from the first
 * of (primary QOS, secondary QOS, association) that sets min_prio_thresh.
 *
 * IN job_ptr - job to inspect
 * IN assoc_mgr_locked - true if the caller already holds assoc mgr locks
 * RET the threshold, or 0 if none is set (or limits are not enforced)
 */
extern uint32_t acct_policy_get_prio_thresh(job_record_t *job_ptr,
					    bool assoc_mgr_locked)
{
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	slurmdb_assoc_rec_t *assoc_ptr;
	uint32_t prio_thresh = 0;
	/*
	 * Designated initializers (matching the rest of this file) instead of
	 * positional ones, which silently break if assoc_mgr_lock_t's field
	 * order ever changes.
	 */
	assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .qos = READ_LOCK };

	/* check to see if we are enforcing limits */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return 0;

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	assoc_ptr = job_ptr->assoc_ptr;
	if (!assoc_ptr) {
		debug("%s: no assoc_ptr, this usually means the association was removed right after the job (%pJ) was started, but didn't make it to the database before it was removed.",
		      __func__, job_ptr);
		goto endit;
	}

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	if (qos_ptr_1)
		_get_prio_thresh(&prio_thresh, qos_ptr_1->min_prio_thresh);

	if (qos_ptr_2)
		_get_prio_thresh(&prio_thresh, qos_ptr_2->min_prio_thresh);

	_get_prio_thresh(&prio_thresh, assoc_ptr->min_prio_thresh);

endit:
	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);

	return prio_thresh;
}
/*
 * Return the earliest time the job may be preempted: its start time plus
 * the preempt exempt time from the first of (primary QOS, secondary QOS,
 * slurm.conf) that sets one.  INFINITE means "not set".
 */
extern time_t acct_policy_get_preemptable_time(job_record_t *job_ptr)
{
	slurmdb_qos_rec_t *qos_ptr_1, *qos_ptr_2;
	uint32_t exempt_time = INFINITE;
	time_t start = job_ptr->start_time;

	xassert(verify_lock(CONF_LOCK, READ_LOCK));
	xassert(verify_lock(JOB_LOCK, READ_LOCK));
	xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));

	acct_policy_set_qos_order(job_ptr, &qos_ptr_1, &qos_ptr_2);

	/* priority: qos_ptr_1 > qos_ptr_2 > slurm.conf; INFINITE means none */
	if (qos_ptr_1 && (qos_ptr_1->preempt_exempt_time != INFINITE))
		exempt_time = qos_ptr_1->preempt_exempt_time;
	else if (qos_ptr_2 && (qos_ptr_2->preempt_exempt_time != INFINITE))
		exempt_time = qos_ptr_2->preempt_exempt_time;
	else if (slurm_conf.preempt_exempt_time != INFINITE)
		exempt_time = slurm_conf.preempt_exempt_time;

	return (exempt_time == INFINITE) ? start : (start + exempt_time);
}
/*
 * Return true if the job is still within its preempt exempt window and
 * therefore must not be preempted yet.
 */
extern bool acct_policy_is_job_preempt_exempt(job_record_t *job_ptr)
{
	assoc_mgr_lock_t locks = { .qos = READ_LOCK };
	time_t now = time(0);
	time_t preempt_time;

	assoc_mgr_lock(&locks);
	preempt_time = acct_policy_get_preemptable_time(job_ptr);
	assoc_mgr_unlock(&locks);

	return (now < preempt_time);
}
/*
* WARNING: Since we only look at the first partition's QOS, this function
* must only be used in places where we loop over all partitions in the job.
*/
/*
 * Establish the evaluation order of the job's QOS and its partition's QOS.
 *
 * OUT qos_ptr_1 - the dominant QOS (partition QOS unless the job QOS
 *	carries QOS_FLAG_OVER_PART_QOS), or NULL if neither is set
 * OUT qos_ptr_2 - the subordinate QOS, or NULL if absent or identical
 *	to qos_ptr_1
 */
extern void acct_policy_set_qos_order(job_record_t *job_ptr,
				      slurmdb_qos_rec_t **qos_ptr_1,
				      slurmdb_qos_rec_t **qos_ptr_2)
{
	slurmdb_qos_rec_t *job_qos, *part_qos;

	xassert(job_ptr);
	xassert(qos_ptr_1);
	xassert(qos_ptr_2);

	job_qos = job_ptr->qos_ptr;
	part_qos = job_ptr->part_ptr ? job_ptr->part_ptr->qos_ptr : NULL;

	/* Initialize incoming pointers */
	*qos_ptr_1 = NULL;
	*qos_ptr_2 = NULL;

	if (job_qos && part_qos) {
		/*
		 * If the job's QOS has the flag to over ride the partition
		 * then use that otherwise use the partition's QOS as the
		 * king.
		 */
		if (job_qos->flags & QOS_FLAG_OVER_PART_QOS) {
			*qos_ptr_1 = job_qos;
			*qos_ptr_2 = part_qos;
		} else {
			*qos_ptr_1 = part_qos;
			*qos_ptr_2 = job_qos;
		}

		/*
		 * No reason to look at the same QOS twice, actually
		 * we never want to do that ;).
		 */
		if (*qos_ptr_1 == *qos_ptr_2)
			*qos_ptr_2 = NULL;
	} else if (job_qos) {
		*qos_ptr_1 = job_qos;
	} else if (part_qos) {
		*qos_ptr_1 = part_qos;
	}
}
/*
 * Checks for a record for acct in *acct_limit_list; if
 * *acct_limit_list doesn't exist it will create it, and if the acct
 * record doesn't exist it will add it to the list.
 * In all cases the account record is returned.
 */
extern slurmdb_used_limits_t *acct_policy_get_acct_used_limits(
	list_t **acct_limit_list, char *acct)
{
	slurmdb_used_limits_t *used_limits;
	size_t tres_bytes;

	xassert(acct_limit_list);

	if (!*acct_limit_list)
		*acct_limit_list = list_create(slurmdb_destroy_used_limits);

	used_limits = list_find_first(*acct_limit_list,
				      _find_used_limits_for_acct, acct);
	if (used_limits)
		return used_limits;

	/* No record yet for this account: create one and append it */
	tres_bytes = sizeof(uint64_t) * slurmctld_tres_cnt;
	used_limits = xmalloc(sizeof(slurmdb_used_limits_t));
	used_limits->acct = xstrdup(acct);
	used_limits->tres = xmalloc(tres_bytes);
	used_limits->tres_run_secs = xmalloc(tres_bytes);
	list_append(*acct_limit_list, used_limits);

	return used_limits;
}
/*
* Checks for record in *user_limit_list of user_id if
* *user_limit_list doesn't exist it will create it, if the user_id
* record doesn't exist it will add it to the list.
* In all cases the user record is returned.
*/
extern slurmdb_used_limits_t *acct_policy_get_user_used_limits(
	list_t **user_limit_list, uint32_t user_id)
{
	slurmdb_used_limits_t *used_limits;
	size_t tres_bytes;

	xassert(user_limit_list);

	if (!*user_limit_list)
		*user_limit_list = list_create(slurmdb_destroy_used_limits);

	used_limits = list_find_first(*user_limit_list,
				      _find_used_limits_for_user, &user_id);
	if (used_limits)
		return used_limits;

	/* No record yet for this user: create one and append it */
	tres_bytes = sizeof(uint64_t) * slurmctld_tres_cnt;
	used_limits = xmalloc(sizeof(slurmdb_used_limits_t));
	used_limits->uid = user_id;
	used_limits->tres = xmalloc(tres_bytes);
	used_limits->tres_run_secs = xmalloc(tres_bytes);
	list_append(*user_limit_list, used_limits);

	return used_limits;
}