#!/usr/bin/env expect
############################################################################
# Purpose: Test --ntasks-per-gpu
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Batch script path, batch output path, and the list of job ids that
# cleanup has to cancel.
set file_in  "$test_dir/input"
set file_out "$test_dir/output"
set job_list {}

# Prerequisites: two nodes that can each satisfy --gres=gpu:2, and the
# select/cons_tres plugin.
set nodes [get_nodes_by_request "--gres=gpu:2 -N2"]
if {[llength $nodes] != 2} {
	skip "This test requires 2 or more GPUs on at least 2 nodes in the default partition"
}
if {![check_config_select "cons_tres"]} {
	skip "This test is only compatible with select/cons_tres"
}

# NOTE: despite its name, this flag is 1 when ConstrainDevices is anything
# other than "yes", i.e. when devices are NOT constrained.
set constrain_dev [expr {[get_config_param "ConstrainDevices"] ne "yes"}]

# Both nodes must report the same socket count.
set sockets_per_node {}
lappend sockets_per_node [get_node_param [lindex $nodes 0] "Sockets"]
lappend sockets_per_node [get_node_param [lindex $nodes 1] "Sockets"]
log_debug "`[lindex $nodes 0]` Sockets per node: [lindex $sockets_per_node 0]"
log_debug "`[lindex $nodes 1]` Sockets per node: [lindex $sockets_per_node 1]"
if {[lindex $sockets_per_node 0] != [lindex $sockets_per_node 1]} {
	skip "Nodes `[lindex $nodes 0]` and `[lindex $nodes 1]` must have the same # of sockets"
}
# Count how many lines of OUTPUT belong to the first two distinct
# digit-bearing values that appear in it.
#
# The first line containing a digit becomes match0_index; the first line
# containing a digit that differs from it becomes match1_index. Lines equal
# to either value are counted; any further distinct values are ignored.
#
# Returns a dict with the keys:
#   match        - total number of counted lines
#   match0       - number of lines equal to match0_index
#   match1       - number of lines equal to match1_index
#   match0_index - first value seen, or -1 if none
#   match1_index - second value seen, or -1 if none
proc get_count_two_gpus {output} {
	set total 0
	set first_count 0
	set second_count 0
	set first_value -1
	set second_value -1

	foreach entry [split $output "\n"] {
		set has_digit [regexp {[0-9]+} $entry]

		# Latch the first digit-bearing line as the first value.
		if {$first_value == -1 && $has_digit} {
			set first_value $entry
		}

		if {$first_value == $entry} {
			incr first_count
			incr total
		} else {
			# Only lines that differ from the first value can become
			# (or match) the second value.
			if {$second_value == -1 && $has_digit} {
				set second_value $entry
			}
			if {$second_value == $entry} {
				incr second_count
				incr total
			}
		}
	}

	return [dict create \
		match $total \
		match0 $first_count \
		match1 $second_count \
		match0_index $first_value \
		match1_index $second_value]
}
################################################################################
# Cleanup
#
# Cancels every job id recorded in the global job_list.
################################################################################
proc cleanup {} {
	cancel_job $::job_list
}
################################################################################
# Test --ntasks-per-gpu in srun
#
# Runs `printenv CUDA_VISIBLE_DEVICES` through srun and checks how the
# printed values are distributed. Assumes 2 GPUs.
#
# expected_tot - expected total number of counted output lines
# expected_gpu - expected number of output lines per GPU
# srun_args    - srun options under test
# srun_env     - optional "VAR=value" string handed to srun through env
################################################################################
proc test_srun {expected_tot expected_gpu srun_args {srun_env ""}} {
	global srun constrain_dev

	set cmd "$srun $srun_args --quiet -t1 printenv CUDA_VISIBLE_DEVICES"
	if {$srun_env ne ""} {
		set cmd "env $srun_env $cmd"
	}
	set output [run_command_output $cmd]

	# Expose match, match0, match1, match0_index and match1_index as local
	# variables; the subtest conditions below refer to them by name.
	set result [get_count_two_gpus $output]
	dict with result {}

	# constrain_dev is set at file scope to 1 when ConstrainDevices is NOT
	# "yes", so the first branch handles the unconstrained case.
	if {$constrain_dev} {
		subtest {$match0 == $expected_gpu} "GPU $match0_index should be bound to $expected_gpu" "$match0 != $expected_gpu"
		subtest {$match1 == $expected_gpu} "GPU $match1_index should be bound to $expected_gpu" "$match1 != $expected_gpu"
	} else {
		# If we constrain devices then CUDA_VISIBLE_DEVICES will always
		# equal 0 because each task will have its own cgroup, so every
		# task of both GPUs is counted under index 0.
		set expected_on_zero [expr {$expected_gpu * 2}]
		subtest {$match0_index == 0} "GPU environment variable index should be 0 because it is in task cgroup"
		subtest {$match0 == $expected_gpu*2} "GPU $match0_index should be bound to $expected_on_zero" "$match0 != $expected_on_zero"
		subtest {$match1 == 0} "There should not be any other gpu index besides 0"
	}

	subtest {$match == $expected_tot} "Total bounds should be $expected_tot" "$match != $expected_tot"
}
################################################################################
# Test sbatch + srun
#
# Submits the batch script in file_in with the given sbatch options, waits
# for the job to finish, and checks the number of counted lines in the
# job's output file.
#
# expected_tasks - expected total number of counted output lines
# args           - sbatch options under test
# env            - optional value forwarded to submit_job -env
################################################################################
proc test_sbatch {expected_tasks args {env ""}} {
	global sbatch srun file_in file_out test_name number
	global bin_cat job_list

	set submit_args "$args -t1 -o $file_out -J $test_name $file_in"
	set job_id [submit_job -fail -env $env $submit_args]
	lappend job_list $job_id

	wait_for_job -fail $job_id "DONE"
	wait_for_file -fail $file_out

	# Expose the counters (match, match0, ...) as local variables; the
	# subtest condition below refers to $match by name.
	set result [get_count_two_gpus [run_command_output -fail "$bin_cat $file_out"]]
	dict with result {}

	subtest {$match == $expected_tasks} "Number of tasks bound to 1 GPU should be $expected_tasks" "$match != $expected_tasks"
}
################################################################################
# Test invalid argument combinations
#
# Verifies that sbatch or srun rejects the given option combination.
#
# cmd  - "sbatch" or "srun"; anything else fails the test
# args - command-line options expected to be rejected
# env  - optional "VAR=value" string applied to the command's environment
################################################################################
proc test_invalid {cmd args {env ""}} {
	global srun file_out job_list

	# Suffix for the subtest descriptions when an environment is in play.
	set env_str ""
	if {$env ne ""} {
		set env_str " and env \"$env\""
	}

	if {$cmd eq "sbatch"} {
		set job_id [submit_job -xfail -env $env "$args -t1 -o $file_out --wrap='sleep 30'"]
		subtest {$job_id == 0} "sbatch should have failed due to invalid argument combination: $args$env_str" "$job_id != 0"
		# Only a job that was (wrongly) accepted needs to be cancelled
		# during cleanup; 0 means the submission was rejected.
		if {$job_id != 0} {
			lappend job_list $job_id
		}
	} elseif {$cmd eq "srun"} {
		set srun_cmd "$srun $args -t1 sleep 30"
		if {$env ne ""} {
			set srun_cmd "env $env $srun_cmd"
		}
		set rc [run_command_status $srun_cmd]
		# Report the exit status that was actually checked.
		subtest {$rc == 1} "srun should have failed due to invalid argument combination: $args$env_str" "$rc != 1"
	} else {
		fail "Command '$cmd' is invalid"
	}
}
# Create the sbatch script
make_bash_script $file_in "
$srun printenv CUDA_VISIBLE_DEVICES
"

# Tests 1.*
# Allocate tasks to fill up the # of GPUs specified for the job
foreach gpu_request {"--gpus=2" "--gpus-per-node=2" "--gres=gpu:2"} {
	testproc test_srun 4 2 "-N1 $gpu_request --ntasks-per-gpu=2"
}

# Allocate GPUs to fill up the # of tasks specified for the job
testproc test_srun 4 2 "-N1 -n4 --ntasks-per-gpu=2"

# Test env
foreach size_request {"--gpus=2" "-n4"} {
	testproc test_srun 4 2 "-N1 $size_request" "SLURM_NTASKS_PER_GPU=2"
}

# Test ntasks-per-tres as well (but leave undocumented in favor of ntasks-per-gpu)
foreach size_request {"-n4" "--gpus=2"} {
	testproc test_srun 4 2 "-N1 $size_request --ntasks-per-tres=2"
}
foreach size_request {"-n4" "--gpus=2"} {
	testproc test_srun 4 2 "-N1 $size_request" "SLURM_NTASKS_PER_TRES=2"
}

# Tests 2.*
# Note: sbatch does not take any input envs for --ntasks-per-[gpu|tres]
foreach batch_layout {
	"-N2 --ntasks-per-gpu=2 --gres=gpu:1"
	"-N1 --ntasks-per-gpu=2 --gres=gpu:2"
} {
	testproc test_sbatch 4 $batch_layout
}

# Test (undocumented) ntasks-per-tres options
foreach batch_layout {
	"-N2 --ntasks-per-tres=2 --gres=gpu:1"
	"-N1 --ntasks-per-tres=2 --gres=gpu:2"
} {
	testproc test_sbatch 4 $batch_layout
}

# Test invalid argument combinations: each option below conflicts with
# --ntasks-per-gpu, first for sbatch and then for srun.
foreach invalid_cmd {"sbatch" "srun"} {
	foreach conflicting_opt {
		"--gpus-per-task=2"
		"--gpus-per-socket=2 --sockets-per-node=1"
		"--ntasks-per-node=2"
		"--ntasks-per-tres=2"
	} {
		testproc test_invalid $invalid_cmd "--ntasks-per-gpu=2 $conflicting_opt"
	}
}

# Note that sbatch/salloc don't do input envs - only srun does
foreach {invalid_opt invalid_env} {
	"--ntasks-per-gpu=2"  "SLURM_NTASKS_PER_TRES=2"
	"--ntasks-per-tres=2" "SLURM_NTASKS_PER_GPU=2"
} {
	testproc test_invalid "srun" $invalid_opt $invalid_env
}