#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test that the OverSubscribe option in partitions is being enforced.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

set input_file_name "input"
set file_in "$test_dir/$input_file_name"
set test_part_1 "${test_name}_part_1"
set test_part_2 "${test_name}_part_2"
set test_part_3 "${test_name}_part_3"
set num_jobs 0
set cr_core 0
set cr_cpu 0
set cr_socket 0
set cpu_cnt 0
set socket_cnt 0
set thread_cnt 0
set oversubscribe_j_cnt 2
set job_id 0
set node_name ""

# TODO: Temporary variables to debug bug 12729 (remove once fixed)
set config_dir [get_conf_path]
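
# select/linear allocates whole nodes, so the per-core/CPU job counts used
# throughout this test would not apply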
if {[check_config_select "linear"]} {
	skip "Test is not compatible with a config of SelectType=select/linear"
}

if {![is_super_user]} {
	skip "This test can't be run except as SlurmUser"
}

set select_type_parameters [get_config_param "SelectTypeParameters"]
if {[param_contains $select_type_parameters "CR_SOCKET*"]} {
	skip "This test is incompatible with CR_SOCKET allocations"
}
if {[param_contains $select_type_parameters "CR_CORE*"]} {
	set cr_core 1
}
if {[param_contains $select_type_parameters "CR_CPU*"]} {
	set cr_cpu 1
}
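
# Request a tiny amount of memory per CPU so that, when memory is a tracked
# consumable resource, it never limits how many jobs fit on the node; when
# memory is not tracked, fall back to a no-op --comment option instead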
set job_mem_opt "--mem-per-cpu=4"
if {![param_contains $select_type_parameters "*MEMORY"]} {
	set job_mem_opt "--comment=no_mem"
	subskip "SelectTypeParameters needs to include MEMORY for more thorough oversubscribe testing"
}

set node_name [get_nodes_by_request "-N1 --exclusive -t1"]
if {[llength $node_name] != 1} {
	skip "This test needs to be able to submit jobs with at least -N1 --exclusive -t1"
}
if {[get_node_param $node_name "CoreSpecCount"] != "MISSING"} {
	skip "This test is incompatible with nodes that have a CoreSpecCount (e.g. $node_name)"
}
if {[get_node_param $node_name "CPUSpecList"] != "MISSING"} {
	skip "This test is incompatible with nodes that have a CPUSpecList (e.g. $node_name)"
}

proc cleanup {} {
	global job_id scontrol test_part_1 test_part_2 test_part_3

	cancel_job $job_id
	run_command -none "$scontrol delete partitionname=$test_part_1"
	run_command -none "$scontrol delete partitionname=$test_part_2"
	run_command -none "$scontrol delete partitionname=$test_part_3"

	# TODO: Temporary cleanup of debug for bug 12729 (remove once fixed)
	global config_dir
	run_command "$scontrol show node"
	restore_conf $config_dir/slurm.conf
	run_command "$scontrol reconfigure"
}
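
# Return the number of allocatable units on the node: CPUTot / ThreadsPerCore
# under CR_CPU, or Sockets * CoresPerSocket under CR_CORE. Either way this is
# the physical core count, i.e. the number of jobs expected to run at once
# without oversubscription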
proc cr_core_cpu { node } {
	global cr_cpu cr_core core_cnt socket_cnt scontrol number
	global cpu_cnt thread_cnt

	set node_conf [dict get [get_nodes $node] $node]
	set core_cnt [dict get $node_conf "CoresPerSocket"]
	set cpu_cnt [dict get $node_conf "CPUTot"]
	set socket_cnt [dict get $node_conf "Sockets"]
	set thread_cnt [dict get $node_conf "ThreadsPerCore"]

	if { $cr_cpu == 1 } {
		return [expr {$cpu_cnt / $thread_cnt}]
	} elseif { $cr_core == 1 } {
		return [expr {$core_cnt * $socket_cnt}]
	} else {
		fail "An error occurred when getting the CPU or core count"
	}
}

# Get default_queue_depth to determine how many jobs the scheduler will try to
# start in a single scheduling cycle (100 when not set in SchedulerParameters)
proc default_queue_depth { } {
	global number

	set depth 100
	set scheduler_params [get_config_param "SchedulerParameters"]
	regexp "default_queue_depth=($number)" $scheduler_params - depth
	return $depth
}
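
# Submit a sleep-job array with tasks $job to partition $part, with or without
# the --oversubscribe flag, recording the array job id in $job_id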
proc sub_job { job oversubscribe part } {
	global file_in job_id job_mem_opt

	if { $oversubscribe } {
		set job_id [submit_job -fail "-a$job -t1 $job_mem_opt -p$part --oversubscribe -o/dev/null $file_in"]
	} else {
		set job_id [submit_job -fail "-a$job -t1 $job_mem_opt -p$part -o/dev/null $file_in"]
	}
}
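
# Wait until exactly $exp_num_jobs array tasks are RUNNING or SUSPENDED
# according to squeue, failing the test if that count is never reached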
proc check_job { exp_num_jobs } {
	global squeue job_id num_jobs input_file_name

	# Wait a bit for the job to start
	wait_for_job -fail ${job_id}_0 "RUNNING"

	# Wait for the expected number of jobs to be running
	if {[wait_for {$job_cnt == $exp_num_jobs} {
		set job_cnt [regexp -all $input_file_name [run_command_output -fail "$squeue --job=$job_id --state=RUNNING,SUSPENDED -Oname --noheader"]]
	}]} {
		fail "The expected number of running jobs was not reached ($job_cnt != $exp_num_jobs). This could be due to memory or other limits."
	}
}

################################## Test Begins ##################################

# TODO: Temporarily increase logging to debug bug 12729 (remove once fixed)
save_conf $config_dir/slurm.conf
run_command -none "$bin_echo SlurmdDebug=debug3 >> $config_dir/slurm.conf"
run_command -none "$bin_echo DebugFlags=cgroup >> $config_dir/slurm.conf"
run_command "$scontrol reconfigure"
make_bash_script $file_in "sleep 10"

####################################
#
# Test partition with OverSubscribe=NO
#
####################################
log_info "Test partition with OverSubscribe=NO"

# Determine the number of cores or CPUs
set num_jobs [cr_core_cpu $node_name]
set job_start [default_queue_depth]
if { $num_jobs >= $job_start } {
	skip "default_queue_depth ($job_start) is not greater than the core count ($num_jobs), cannot run this test"
}
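
# With OverSubscribe=NO the partition never shares a core/CPU between jobs,
# so at most one job per core/CPU may run even when jobs request
# --oversubscribe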
# Create first test partition
run_command -fail "$scontrol create partitionname=$test_part_1 OverSubscribe=NO nodes=$node_name"

# Submit a job array with --oversubscribe
sub_job "0-$num_jobs" 1 $test_part_1
# Check that the correct number of jobs are running
check_job $num_jobs
cancel_job $job_id

# Submit a job array without --oversubscribe
sub_job "0-$num_jobs" 0 $test_part_1
check_job $num_jobs
cancel_job $job_id

# Delete first test partition
run_command -fail "$scontrol delete partitionname=$test_part_1"
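
# With OverSubscribe=YES:<n> a core/CPU may be shared by up to <n> jobs, but
# only among jobs that themselves request --oversubscribe; jobs that do not
# request it still get dedicated resources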

####################################
#
# Test partition with OverSubscribe=YES:2
#
####################################
set new_job_limit [expr {$num_jobs * 2}]
if {[param_contains [get_config_param "PreemptMode"] "GANG"]} {
	skip "Test partition with OverSubscribe=YES:2 incompatible with gang scheduling"
} elseif { $new_job_limit >= $job_start } {
	skip "default_queue_depth ($job_start) is not greater than the desired job count ($new_job_limit), cannot run OverSubscribe=YES:2"
} else {
	log_info "Test partition with OverSubscribe=YES:2"

	# Make a new partition with OverSubscribe=YES:2
	run_command -fail "$scontrol create partitionname=$test_part_2 OverSubscribe=YES:$oversubscribe_j_cnt nodes=$node_name"

	# Submit a job array with --oversubscribe (expect 2 jobs per core/CPU)
	sub_job "0-$new_job_limit" 1 $test_part_2
	check_job $new_job_limit
	cancel_job $job_id

	# Submit a job array without --oversubscribe (expect 1 job per core/CPU)
	sub_job "0-$num_jobs" 0 $test_part_2
	check_job $num_jobs
	cancel_job $job_id

	# Delete second test partition
	run_command -fail "$scontrol delete partitionname=$test_part_2"
}
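
# OverSubscribe=EXCLUSIVE dedicates the whole node to a single job (as if
# every job were submitted with --exclusive), so exactly one array task
# should run at a time regardless of --oversubscribe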

########################################
#
# Test partition with OverSubscribe=EXCLUSIVE
#
########################################
log_info "Test partition with OverSubscribe=EXCLUSIVE"

# Make a new partition with OverSubscribe=EXCLUSIVE
run_command -fail "$scontrol create partitionname=$test_part_3 OverSubscribe=EXCLUSIVE nodes=$node_name"

# Submit a job array with --oversubscribe (expect 1 job per node)
sub_job "0-$num_jobs" 1 $test_part_3
check_job 1
cancel_job $job_id

# Submit a job array without --oversubscribe (expect 1 job per node)
sub_job "0-$num_jobs" 0 $test_part_3
check_job 1
cancel_job $job_id

# Delete third test partition
run_command -fail "$scontrol delete partitionname=$test_part_3"