#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test that partition and job qos limits are enforced when using
# the OverPartQos flag for the job's qos
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./inc21.34_tests
set test_node ""
# Total cpus in test node
set totcpus 0
set nthreads 0
set acct test_acct
set user_name ""
set test_part "${test_name}_part"
set part_qos "${test_name}_part_qos"
set job_qos "${test_name}_job_qos"
set qostest ""
set grn GrpNodes
set grn_num 0
set grcpu GrpCpus
set grcpu_num 0
set grpcpumin GrpCPUMins
set grpcpumin_num 0
# Set grpcpurunmin_num to a multiple of CPUs per core to work with most configurations.
# Also make sure that it is at least 4 so we can add to and subtract from it.
set grpcpurunmin GrpCPURunMins
set grpcpurunmin_num 40
set grjobs GrpJobs
set grjobs_num 2
set grpmem GrpMem
set grpmem_num 100
set grsub GrpSubmit
set grsub_num 2
set grpwall GrpWall
set grpwall_num 1
set maxcpu MaxCpus
set maxcpu_num 0
# Set maxcpumin_num to a multiple of CPUs per core to work with most configurations
set maxcpumin MaxCPUMins
set maxcpumin_num 2
set maxwall MaxWall
set maxwall_num 2
set maxcpuspu MaxCPUSPerUser
set maxcpuspu_num 2
set maxnodes MaxNodes
set maxnode_num 0
set maxnodespu MaxNodesPerUser
set maxnodespu_num 0
set maxjobs MaxJobs
set maxjobs_num 2
set maxjobsub MaxSubmitJobs
set maxjobsub_num 2
set time_spacing 1
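# CPU billing weight applied to the test partition's TRESBillingWeights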
set tres_cpu_mult 2
# selectparam is 1 when a CR_CORE* allocation is configured, 0 for CR_CPU
set selectparam 0
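# Remember the current default partition so cleanup can restore it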
set def_part [default_partition]
# QOS limits to modify during the tests; -1 means the limit is cleared
array set mod_job_qos {
GrpNodes -1
GrpCpus -1
GrpJob -1
GrpSubmit -1
GrpCpuMin -1
GrpCpuRunMin -1
GrpMem -1
GrpWall -1
MaxCpus -1
MaxNode -1
MaxJobs -1
MaxSubmitJobs -1
MaxCpuMin -1
MaxWall -1
MaxCpusPerUser -1
MaxNodesPerUser -1
GrpTRES=billing -1
GrpTRESMins=billing -1
GrpTRESRunMins=billing -1
MaxTRESPerJob=billing -1
MaxTRESMinsPerJob=billing -1
MaxTRESPerUser=billing -1
}
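# The same set of limits for the partition QOS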
array set mod_part_qos {
GrpNodes -1
GrpCpus -1
GrpJob -1
GrpSubmit -1
GrpCpuMin -1
GrpCpuRunMin -1
GrpMem -1
GrpWall -1
MaxCpus -1
MaxNode -1
MaxJobs -1
MaxSubmitJobs -1
MaxCpuMin -1
MaxWall -1
MaxCpusPerUser -1
MaxNodesPerUser -1
GrpTRES=billing -1
GrpTRESMins=billing -1
GrpTRESRunMins=billing -1
MaxTRESPerJob=billing -1
MaxTRESMinsPerJob=billing -1
MaxTRESPerUser=billing -1
}
#
# Cannot run the test if OverTimeLimit is set, since we test time limits.
#
regexp "($number)" [get_config_param "OverTimeLimit"] {} overtimelim
if {$overtimelim != 0} {
skip "Cannot run this test when OverTimeLimit is set. Exiting now"
}
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
skip "This test can't be run without a usable AccountStorageType"
}
if {![param_contains [get_config_param "AccountingStorageEnforce"] "limits"]} {
skip "This test can't be run without enforcing limits"
}
if {![is_super_user]} {
skip "Test can only be ran as SlurmUser"
}
# Determine the configured SelectTypeParameters
set select_type_parameters [get_config_param "SelectTypeParameters"]
if {[param_contains $select_type_parameters "CR_SOCKET*"]} {
skip "This test is incompatible with CR_SOCKET allocations"
}
if {[param_contains $select_type_parameters "CR_ONE_TASK_PER_CORE"]} {
skip "This test is incompatible with CR_ONE_TASK_PER_CORE allocations"
}
if {[param_contains $select_type_parameters "CR_CORE_*"]} {
set selectparam 1
}
if {[param_contains [get_config_param "SelectType"] "select/linear"]} {
skip "This test is incompatible select/linear"
}
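# Record whether priority/multifactor is active; presumably the billing TRES
# subtests in inc21.34_tests depend on it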
if {[get_config_param "PriorityType"] eq "priority/multifactor"} {
set prio_multifactor 1
} else {
set prio_multifactor 0
}
proc cleanup { } {
global acct job_qos part_qos scontrol sacctmgr test_part def_part
	# Delete the test QOSes
	run_command -none "$sacctmgr -i delete qos $job_qos,$part_qos"
	# Delete the test account
	run_command -none "$sacctmgr -i delete account $acct"
	# Delete the test partition and restore the original default partition
	run_command -none "$scontrol delete partitionname=$test_part"
if {[string length $def_part]} {
run_command -none "$scontrol update partitionname=$def_part default=yes"
}
}
# Remove any vestigial data
cleanup
# Check to see that there are enough resources in the default partition
set tmpc 0
set tmpn 0
spawn $scontrol show part [default_partition]
expect {
-re "TotalCPUs=($number)" {
set tmpc [expr $expect_out(1,string) - 1]
exp_continue
}
-re "TotalNodes=($number)" {
set tmpn [expr $expect_out(1,string) - 1]
exp_continue
}
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
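# Require more than one CPU and more than one node in the default partition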
if {$tmpc == 0 || $tmpn == 0} {
skip "Not enough Nodes and/or CPUs"
}
# Get the number of nodes in the default partition, minus one.
# $maxnode_num is used as MaxNodes in the QOS and we will test requests of
# $maxnode_num + 1; otherwise EnforcePartLimits could interfere.
set num_nodes [expr [llength [get_nodes_by_state]] -1]
if {$num_nodes == 0} {
	fail "No nodes were found"
} else {
# Set QoS node values
set grn_num $num_nodes
set maxnode_num $num_nodes
set maxnodespu_num $num_nodes
}
# Create the two test QOSes
add_qos $part_qos ""
add_qos $job_qos ""
# Create a temporary partition to use for testing
spawn $scontrol create partitionname=$test_part qos=$part_qos tresbillingweights=cpu=$tres_cpu_mult default=yes \
nodes=ALL
expect {
timeout {
fail "scontrol is not responding"
}
eof {
wait
}
}
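# Submit a single-node job to the new default partition to identify a node for testing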
set got_node 0
spawn $srun -N1 printenv SLURM_NODELIST
expect {
-re "($re_word_str)" {
set test_node $expect_out(1,string)
set got_node 1
exp_continue
}
timeout {
fail "srun is not responding"
}
eof {
wait
}
}
if {$got_node != 1} {
fail "Did not get node for testing"
}
# Get the number of CPUs on the test node
lassign [get_node_cpus $test_node] totcpus nthreads
if {$totcpus == 0} {
fail "No cpus were found"
} else {
# Set QoS CPU values
set grcpu_num [expr $totcpus - $nthreads]
set grpcpumin_num $totcpus
set maxcpu_num [expr $totcpus - $nthreads]
set maxcpumin_num $totcpus
}
# Get the name of the test user
set user_name [get_my_user_name]
# Add account with qos
set acctmatch 0
spawn $sacctmgr -i add account $acct qos=$job_qos
expect {
-re "Adding Account" {
incr acctmatch
exp_continue
}
timeout {
fail "sacctmgr is not responding"
}
eof {
wait
}
}
if {$acctmatch != 1} {
fail "sacctmgr had a problem adding the account"
}
# Add user to account
spawn $sacctmgr -i create user name=$user_name account=$acct
expect {
timeout {
fail "sacctmgr not responding"
}
eof {
wait
}
}
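# part_test and qos_test below are provided by inc21.34_tests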
log_info "========== Run limit test on partition's qos limits =========="
part_test
#
# Set overpartqos flag on job's qos
#
set changed 0
spawn $sacctmgr -i mod qos $job_qos set flag=overpartqos
expect {
-re "Modified qos" {
set changed 1
exp_continue
}
timeout {
fail "sacctmgr is not resonding"
}
eof {
wait
}
}
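# Verify the OverPartQOS flag change was applied before testing the job QOS limits
if {$changed != 1} {
	fail "sacctmgr had a problem setting the OverPartQOS flag on $job_qos"
}
# With OverPartQOS set, the job QOS limits take precedence over the partition QOS limits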
log_info "========== Run limit test on job's qos limits =========="
qos_test