blob: b88ccdeef9bc7994eb92400b55d11e9691a9ebd3 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test that the core spec option in sbatch allocates the correct
# number of cores and that tasks spread over multiple nodes
# when there are not enough resources on one node.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Load the shared test-suite definitions.
# NOTE(review): test_dir is presumably defined by ./globals - confirm.
source ./globals

# Scratch files used by this test, all located under $test_dir
set file_in  "${test_dir}/input"
set file_out "${test_dir}/output"
set spec_in  "${test_dir}/spec_core_script.in"

# Id of the most recently submitted job; 0 means no job has been submitted
set job_id   0
# Cancel the job whose id is stored in the global job_id variable.
# NOTE(review): presumably invoked by the test framework when the test
# ends - confirm against ./globals.
proc cleanup {} {
	cancel_job $::job_id
}
#############################################################################
#
# Checks that the node uses the correct number of specialized cores
# and that the number of nodes the job uses is correct.
#
# exp_nodes = 0: job must only use the specified node
# exp_nodes = 1: job must use more than the specified node
# exp_nodes = -1: job must fail because the job exceeds the number of cores
#
#############################################################################
proc core_spec_job {task node core_spec exp_nodes} {
	# Submit a batch job with "-w$node -S$core_spec" and verify the outcome
	# selected by exp_nodes.
	#
	# task      - signed offset added to the computed task limit
	# node      - node name handed to sbatch -w
	# core_spec - specialized core count handed to sbatch -S
	# exp_nodes - 0: job must run on exactly one node
	#             1: job must run on more than one node
	#            -1: submission must be rejected
	#
	# job_id is the global variable (not a local) so that cleanup is able to
	# cancel the job submitted here if one of the checks below fails.
	global sbatch scontrol spec_in file_out number thread_cnt
	global cpu_tot job_id

	set job_id 0
	set num_nodes 0

	# Determine the number of tasks that can be run
	set cpu_used_by_spec [expr {$thread_cnt * $core_spec}]
	if {$cpu_tot > $cpu_used_by_spec} {
		set task_limit [expr {$cpu_tot - $cpu_used_by_spec}]
	} else {
		set task_limit 1
	}

	# Always request at least one task
	set ntasks [expr {abs($task_limit + $task)}]
	if {$ntasks == 0} {
		set ntasks 1
	}

	set error_chk 0
	spawn $sbatch -t1 -w$node -S$core_spec -n$ntasks -o$file_out $spec_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		-re "error" {
			# An error is only acceptable when rejection is expected
			if {$exp_nodes != -1} {
				fail "sbatch should not have produced an error"
			}
			set error_chk 1
			exp_continue
		}
		timeout {
			fail "sbatch is not responding"
		}
		eof {
			wait
		}
	}

	if {$job_id == 0 && $error_chk == 0} {
		fail "Job was not submitted"
	} elseif {$exp_nodes == -1 && $job_id != 0} {
		fail "This job should have failed but did not"
	} elseif {$exp_nodes == -1 && $error_chk != 0} {
		log_debug "This error is expected do not worry"
	} else {
		wait_for_job -fail $job_id "RUNNING"

		# The CoreSpec pattern requires a non-digit after the value so that
		# a longer number starting with the same digits is not accepted.
		set core_chk 0
		spawn $scontrol show job $job_id
		expect {
			-re "NumNodes=($number)" {
				set num_nodes $expect_out(1,string)
				exp_continue
			}
			-re "CoreSpec=${core_spec}\[^0-9\]" {
				set core_chk 1
				exp_continue
			}
			timeout {
				fail "scontrol is not responding"
			}
			eof {
				wait
			}
		}
		subtest {$core_chk != 0} "Check that the node uses the correct number of specialized cores"
		wait_for_job -fail $job_id "DONE"
	}

	if {$exp_nodes == 1} {
		subtest {$num_nodes > 1} "Check that the job uses the correct number of nodes" "Job $job_id should use more then 1 node"
	}
	if {$exp_nodes == 0} {
		subtest {$num_nodes == 1} "Check that the job uses the correct number of nodes" "Job $job_id should use only $node"
	}
}
#############################################################################
#
# Tests begin here
#
#############################################################################
# Run basic client tool checks before validating configuration

# Case 1: both specialization options supplied on the command line.
# The spelling inside the pattern is part of the text being matched and is
# kept as is. NOTE(review): presumably mirrors the sbatch message - confirm
# before changing it.
set cli_conflict_re "sbatch: fatal: -S/--core-spec and --thred-spec options are mutually exclusive"
set output [run_command_output -xfail -subtest "$sbatch --thread-spec=1 -S1 --wrap='/bin/true'"]
subtest {[regexp $cli_conflict_re $output]} "sbatch should fail for both --thread-spec and -S given"

# Case 2: both options supplied through the environment only
set env_conflict_re "Both --core-spec and --thread-spec set using environment variables. Those options are mutually exclusive"
set output [run_command_output -xfail -subtest "SBATCH_CORE_SPEC=1 SBATCH_THREAD_SPEC=1 $sbatch --wrap='/bin/true'"]
subtest {[regexp $env_conflict_re $output]} "sbatch should fail when both SBATCH_CORE_SPEC SBATCH_THREAD_SPEC set in environment and no cli given"

# Configurations under which the remainder of the test is skipped
if {[check_config_select "linear"]} {
	skip "This test is incompatible with select/linear"
}

set select_type_params [get_config_param "SelectTypeParameters"]
if {[param_contains $select_type_params "CR_SOCKET"]} {
	skip "This test is incompatible with CR_SOCKET allocations"
}

if {![get_config_param "AllowSpecResourcesUsage"]} {
	skip "AllowSpecResourcesUsage not configured to permit core specialization"
}
# Remove any vestigial files
exec $bin_rm -f $file_in $file_out $spec_in

# Batch script that prints the "scontrol show node" record of the first
# node in the job's node list. The two commands must stay on separate
# lines: ending a line of the quoted string with a backslash makes Tcl join
# it with the next one, which turns the assignment into an environment
# prefix of the second command and leaves the node name empty there.
make_bash_script $file_in "
first=\$($scontrol show hostnames \$SLURM_JOB_NODELIST | head -n1)
$scontrol show node \$first
"

# Short-lived script submitted by core_spec_job
make_bash_script $spec_in "sleep 5"
# Submit the node-probing script as an exclusive two-node job and record
# its job id; skip the test when the submission is refused.
spawn $sbatch --exclusive -t1 -N2 -o$file_out $file_in
expect {
	-re "Batch job submission failed" {
		skip "Can't test srun task distribution"
	}
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout { fail "sbatch is not responding" }
	eof     { wait }
}

# job_id still holds its initial value when no id was reported
if {$job_id == 0} {
	fail "sbatch did not submit job"
}
# Wait for the probe job's output, then extract the node description from it
wait_for_file -fail $file_out

# Defaults used for any field that is absent from the output
set first_node ""
set core_cnt   0
set socket_cnt 1
set thread_cnt 1
set cpu_tot    1

# The patterns are kept in this order on purpose; do not rearrange them.
spawn $bin_cat $file_out
expect {
	-re "NodeName=($re_word_str)" {
		set first_node $expect_out(1,string)
		exp_continue
	}
	-re "CoresPerSocket=($number)" {
		set core_cnt $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set cpu_tot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set socket_cnt $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set thread_cnt $expect_out(1,string)
		exp_continue
	}
	timeout { fail "cat is not responding" }
	eof     { wait }
}

# Convert cores per socket into cores per node
set core_cnt [expr {$core_cnt * $socket_cnt}]

if {$core_cnt == 0} {
	fail "sbatch did not find the number of cores"
}
if {$core_cnt < 4} {
	skip "Core count too low for testing ($core_cnt < 4)"
}
# The environment variable should get overwritten.
set env(SBATCH_THREAD_SPEC) 99

#
# Using the core spec within the node limits: all but two cores are
# specialized and the task offset is 0, then -2
#
log_info "Run within the specified node"
set spec_leave_two [expr {$core_cnt - 2}]
foreach task_offset {0 -2} {
	core_spec_job $task_offset $first_node $spec_leave_two 0
}

#
# Using core spec with more tasks than the node can handle. This should
# cause the tasks to spread across multiple nodes as needed
#
log_info "Spread job across multiple nodes"
foreach unreserved {2 1} {
	core_spec_job 1 $first_node [expr {$core_cnt - $unreserved}] 1
}

#
# Using core spec with more cores than the specified node has
#
log_info "Fail by trying to use more cores than exist"
foreach excess {5 7} {
	core_spec_job 1 $first_node [expr {$core_cnt + $excess}] -1
}