#!/usr/bin/env expect
############################################################################
# Purpose: Validate heterogeneous gpu job options.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Flag consulted at the end of the script; flipped to true when part of the
# test could not be run.
set is_skip false

# Several option sets below request two GPUs, so bail out early when that
# many are not available.
# NOTE(review): the meaning of the first argument (1) to
# get_highest_gres_count is defined in ./globals -- confirm there.
if {!([get_highest_gres_count 1 "gpu"] >= 2)} {
	skip "This test requires 2 or more GPUs per node in the default partition"
}
# Spawn a submission command that is expected to be rejected and record a
# subtest on whether the expected error appeared exactly once.
#
# cmd            - command line to spawn; it is expanded as a Tcl list
# expected_error - regular expression searched for in the command's output
proc submit_job_error {cmd expected_error} {
	# Number of times the expected error was seen in the output
	set err_match 0

	spawn {*}$cmd
	expect {
		-re $expected_error {
			# Keep reading so that repeated matches are counted too
			incr err_match
			exp_continue
		}
		eof {
			# Reap the spawned process once its output is exhausted
			wait
		}
		timeout {
			fail "Command ($cmd) not responding"
		}
	}

	subtest {$err_match == 1} "Job submission should return $expected_error"
}
# Spawn a submission command and return the id of the job it created.
#
# cmd - command line to spawn; it is expanded as a Tcl list
#
# Returns the text captured after "job " in the command's output. The proc
# returns as soon as that text is seen; it does not wait for the spawned
# command to finish. If the output ends without a job id, fail is called.
# NOTE(review): $number is a pattern defined in ./globals, presumably one
# matching an unsigned integer -- confirm there.
proc submit_job {cmd} {
	global number

	set job_id 0

	spawn {*}$cmd
	expect {
		-re "job ($number)" {
			set job_id $expect_out(1,string)
		}
		timeout {
			fail "Command ($cmd) not responding"
		}
		eof {
			wait
		}
	}

	# job_id still holds its initial 0 when no "job <id>" text was matched
	if {!$job_id} {
		fail "Didn't get a job_id"
	}

	return $job_id
}
# Record a subtest on whether the filtered "scontrol show job" output for a
# job matches a regular expression exactly once.
#
# job_id - id of the job to inspect
# grep   - pattern handed to grep -i to select the lines of interest
# re     - regular expression the selected output must match
proc check_job {job_id grep re} {
	global bin_bash bin_grep scontrol

	# Number of times $re matched the filtered output
	set matches 0

	# Run the pipeline through a shell so the "|" is interpreted
	set pipeline "exec $scontrol show job $job_id | $bin_grep -i '$grep'"

	spawn $bin_bash -c $pipeline
	expect {
		-re $re {
			incr matches
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			fail "scontrol not responding"
		}
	}

	subtest {$matches == 1} "Job info should contain $re"
}
# Exercise --gpu-bind on heterogeneous job submissions made with sbatch,
# srun and salloc in turn.
#
# Option sets containing an invalid binding are submitted first and must
# produce the paired error message. Valid option sets are then submitted
# and the TresBind values reported by scontrol are checked against the
# paired regular expression before the job is cancelled.
proc test_gpu_bind {} {
	global sbatch srun salloc

	log_info "Testing --gpu-bind"

	# Flat list of pairs: options to submit, error expected in the output
	set bad_cases {
		"--gpu-bind=blah : --gpu-bind=closest"
			"error: Invalid --gpu-bind argument: blah"
		"--gpu-bind=closest : --gpu-bind=blah"
			"error: Invalid --gpu-bind argument: blah"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts errmsg} $bad_cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			submit_job_error $cmdline $errmsg
		}
	}

	# Flat list of pairs: options to submit, regex expected from check_job
	set good_cases {
		"--gpu-bind=closest : --gpu-bind=closest"
			"^JobId=.*TresBind=gres/gpu:closest.*JobId=.*TresBind=gres/gpu:closest"
		"--gpu-bind=closest : --gpu-bind=map_gpu:1"
			"^JobId=.*TresBind=gres/gpu:closest.*JobId=.*TresBind=gres/gpu:map_gpu:1"
		"--gpu-bind=map_gpu:1 : --gpu-bind=closest"
			"^JobId=.*TresBind=gres/gpu:map_gpu:1.*JobId=.*TresBind=gres/gpu:closest"
		"--gpu-bind=closest : -n1"
			"^JobId=.*TresBind=gres/gpu:closest.*JobId="
		"-n1 : --gpu-bind=closest"
			"^JobId=.*JobId=.*TresBind=gres/gpu:closest"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $good_cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|TresBind" $pattern
			cancel_job $job_id
		}
	}
}
# Exercise --gpu-freq on heterogeneous job submissions made with sbatch,
# srun and salloc in turn.
#
# Option sets containing an invalid frequency are submitted first and must
# produce the paired error message. Valid option sets are then submitted
# and the TresFreq values reported by scontrol are checked against the
# paired regular expression before the job is cancelled.
proc test_gpu_freq {} {
	global sbatch srun salloc

	log_info "Testing --gpu-freq"

	# Flat list of pairs: options to submit, error expected in the output
	set bad_cases {
		"--gpu-freq=blah : --gpu-freq=low"
			"error: Invalid --gpu-freq argument: gpu:blah"
		"--gpu-freq=low : --gpu-freq=blah"
			"error: Invalid --gpu-freq argument: gpu:blah"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts errmsg} $bad_cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			submit_job_error $cmdline $errmsg
		}
	}

	# Flat list of pairs: options to submit, regex expected from check_job
	set good_cases {
		"--gpu-freq=low : --gpu-freq=low"
			"^JobId=.*TresFreq=gpu:low.*JobId=.*TresFreq=gpu:low"
		"--gpu-freq=low : --gpu-freq=medium"
			"^JobId=.*TresFreq=gpu:low.*JobId=.*TresFreq=gpu:medium"
		"--gpu-freq=medium : --gpu-freq=low"
			"^JobId=.*TresFreq=gpu:medium.*JobId=.*TresFreq=gpu:low"
		"--gpu-freq=low : -n1"
			"^JobId=.*TresFreq=gpu:low.*JobId="
		"-n1 : --gpu-freq=low"
			"^JobId=.*JobId=.*TresFreq=gpu:low"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $good_cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|TresFreq" $pattern
			cancel_job $job_id
		}
	}
}
# Exercise --cpus-per-gpu on heterogeneous job submissions made with
# sbatch, srun and salloc in turn.
#
# Each option set is submitted, the CpusPerTres values reported by scontrol
# are checked against the paired regular expression, and the job is then
# cancelled.
proc test_cpus_per_gpu {} {
	global sbatch srun salloc

	log_info "Testing --cpus-per-gpu"

	# Flat list of pairs: options to submit, regex expected from check_job
	set cases {
		"--gpus=1 --cpus-per-gpu=1 : --gpus=1 --cpus-per-gpu=2"
			"^JobId=.*CpusPerTres=.*gpu:1.*JobId=.*CpusPerTres=.*gpu:2"
		"--gpus=1 --cpus-per-gpu=2 : --gpus=1 --cpus-per-gpu=1"
			"^JobId=.*CpusPerTres=.*gpu:2.*JobId=.*CpusPerTres=.*gpu:1"
		"--gpus=1 --cpus-per-gpu=2 : -n1"
			"^JobId=.*CpusPerTres=.*gpu:2.*JobId=.*"
		"-n1 : --gpus=1 --cpus-per-gpu=2"
			"^JobId=.*JobId=.*CpusPerTres=.*gpu:2"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|CpusPerTres" $pattern
			cancel_job $job_id
		}
	}
}
# Exercise --gpus (the per-job GPU count) on heterogeneous job submissions
# made with sbatch, srun and salloc in turn.
#
# Each option set is submitted, the TresPerJob values reported by scontrol
# are checked against the regular expression at the same list position, and
# the job is then cancelled.
proc test_gpus_per_job {} {
	global sbatch srun salloc

	# The option under test is --gpus; the message previously named a
	# nonexistent "--cpus-per-job" option.
	log_info "Testing --gpus"

	# Option sets to submit
	set tests [list \
		"-n1 --gpus=1 : -n1 --gpus=2" \
		"-n1 --gpus=2 : -n1 --gpus=1" \
		"-n1 --gpus=2 : -n1" \
		"-n1 : -n1 --gpus=2" \
	]

	# Expected scontrol output, one regex per entry of $tests
	set regexes [list \
		"^JobId=.*TresPerJob=.*gpu:1.*JobId=.*TresPerJob=.*gpu:2" \
		"^JobId=.*TresPerJob=.*gpu:2.*JobId=.*TresPerJob=.*gpu:1" \
		"^JobId=.*TresPerJob=.*gpu:2.*JobId=.*" \
		"^JobId=.*JobId=.*TresPerJob=.*gpu:2" \
	]

	foreach cmd {sbatch srun salloc} {
		foreach a $tests re $regexes {
			set run ""
			if {$cmd eq "sbatch"} {
				set run "$sbatch -o/dev/null $a --wrap=hostname"
			} elseif {$cmd eq "srun"} {
				set run "$srun $a hostname"
			} else {
				set run "$salloc $a hostname"
			}

			set job_id [submit_job $run]
			check_job $job_id "^JobId\\|TresPerJob" $re
			cancel_job $job_id
		}
	}
}
# Exercise --gpus-per-node and --gres=gpu:N, separately and combined, on
# heterogeneous job submissions made with sbatch, srun and salloc in turn.
#
# Each option set is submitted, the TresPerNode values reported by scontrol
# are checked against the paired regular expression, and the job is then
# cancelled.
proc test_gpus_per_node {} {
	global sbatch srun salloc

	log_info "Testing --gpus-per-node"

	# Flat list of pairs: options to submit, regex expected from check_job
	set cases {
		"-n1 --gpus-per-node=1 : -n1 --gpus-per-node=2"
			"^JobId=.*TresPerNode=.*gpu:1.*JobId=.*TresPerNode=.*gpu:2"
		"-n1 --gpus-per-node=2 : -n1 --gpus-per-node=1"
			"^JobId=.*TresPerNode=.*gpu:2.*JobId=.*TresPerNode=.*gpu:1"
		"-n1 --gpus-per-node=2 : -n1"
			"^JobId=.*TresPerNode=.*gpu:2.*JobId=.*"
		"-n1 : -n1 --gpus-per-node=2"
			"^JobId=.*JobId=.*TresPerNode=.*gpu:2"
		"-n1 --gres=gpu:1 : -n1 --gres=gpu:2"
			"^JobId=.*TresPerNode=.*gpu:1.*JobId=.*TresPerNode=.*gpu:2"
		"-n1 --gres=gpu:2 : -n1 --gres=gpu:1"
			"^JobId=.*TresPerNode=.*gpu:2.*JobId=.*TresPerNode=.*gpu:1"
		"-n1 --gres=gpu:2 : -n1"
			"^JobId=.*TresPerNode=.*gpu:2.*JobId=.*"
		"-n1 : -n1 --gres=gpu:2"
			"^JobId=.*JobId=.*TresPerNode=.*gpu:2"
		"-n1 --gpus-per-node=1 --gres=gpu:2 : -n1 --gpus-per-node=2 --gres=gpu:1"
			"^JobId=.*TresPerNode=.*gpu:1,.*gpu:2.*JobId=.*TresPerNode=.*gpu:2,.*gpu:1"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|TresPerNode" $pattern
			cancel_job $job_id
		}
	}
}
# Exercise --gpus-per-socket on heterogeneous job submissions made with
# sbatch, srun and salloc in turn.
#
# Option sets that use --gpus-per-socket without --sockets-per-node are
# submitted first and must produce the paired error message. Complete
# option sets are then submitted and the TresPerSocket values reported by
# scontrol are checked against the paired regular expression before the job
# is cancelled.
proc test_gpus_per_socket {} {
	global sbatch srun salloc

	log_info "Testing --gpus-per-socket"

	# Flat list of pairs: options to submit, error expected in the output
	set bad_cases {
		"-n1 --gpus-per-socket=1 : -n1 "
			"--gpus-per-socket option requires --sockets-per-node specification"
		"-n1 : -n1 --gpus-per-socket=1"
			"--gpus-per-socket option requires --sockets-per-node specification"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts errmsg} $bad_cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			submit_job_error $cmdline $errmsg
		}
	}

	# Flat list of pairs: options to submit, regex expected from check_job
	set good_cases {
		"-n1 --sockets-per-node=1 --gpus-per-socket=1 : -n1 --sockets-per-node=1 --gpus-per-socket=2"
			"^JobId=.*TresPerSocket=.*gpu:1.*JobId=.*TresPerSocket=.*gpu:2"
		"-n1 --sockets-per-node=1 --gpus-per-socket=2 : -n1 --sockets-per-node=1 --gpus-per-socket=1"
			"^JobId=.*TresPerSocket=.*gpu:2.*JobId=.*TresPerSocket=.*gpu:1"
		"-n1 --sockets-per-node=1 --gpus-per-socket=2 : -n1"
			"^JobId=.*TresPerSocket=.*gpu:2.*JobId=.*"
		"-n1 : -n1 --sockets-per-node=1 --gpus-per-socket=2"
			"^JobId=.*JobId=.*TresPerSocket=.*gpu:2"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $good_cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|TresPerSocket" $pattern
			cancel_job $job_id
		}
	}
}
# Exercise --gpus-per-task on heterogeneous job submissions made with
# sbatch, srun and salloc in turn.
#
# Each option set is submitted, the TresPerTask values reported by scontrol
# are checked against the paired regular expression, and the job is then
# cancelled.
proc test_gpus_per_task {} {
	global sbatch srun salloc

	log_info "Testing --gpus-per-task"

	# Flat list of pairs: options to submit, regex expected from check_job
	set cases {
		"-n1 --gpus-per-task=1 : -n1 --gpus-per-task=2"
			"^JobId=.*TresPerTask=.*gpu=1.*JobId=.*TresPerTask=.*gpu=2"
		"-n1 --gpus-per-task=2 : -n1 --gpus-per-task=1"
			"^JobId=.*TresPerTask=.*gpu=2.*JobId=.*TresPerTask=.*gpu=1"
		"-n1 --gpus-per-task=2 : -n1"
			"^JobId=.*TresPerTask=.*gpu=2.*JobId=.*"
		"-n1 : -n1 --gpus-per-task=2"
			"^JobId=.*JobId=.*TresPerTask=.*gpu=2"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|TresPerTask" $pattern
			cancel_job $job_id
		}
	}
}
# Exercise --mem-per-gpu on heterogeneous job submissions made with sbatch,
# srun and salloc in turn.
#
# Each option set is submitted, the MemPerTres values reported by scontrol
# are checked against the paired regular expression, and the job is then
# cancelled.
proc test_mem_per_gpu {} {
	global sbatch srun salloc

	log_info "Testing --mem-per-gpu"

	# Flat list of pairs: options to submit, regex expected from check_job
	set cases {
		"-n1 --gpus=1 --mem-per-gpu=1 : -n1 --gpus=1 --mem-per-gpu=2"
			"^JobId=.*MemPerTres=.*gpu:1.*JobId=.*MemPerTres=.*gpu:2"
		"-n1 --gpus=1 --mem-per-gpu=2 : -n1 --gpus=1 --mem-per-gpu=1"
			"^JobId=.*MemPerTres=.*gpu:2.*JobId=.*MemPerTres=.*gpu:1"
		"-n1 --gpus=1 --mem-per-gpu=2 : -n1"
			"^JobId=.*MemPerTres=.*gpu:2.*JobId=.*"
		"-n1 : -n1 --gpus=1 --mem-per-gpu=2"
			"^JobId=.*JobId=.*MemPerTres=.*gpu:2"
	}
	foreach cmd {sbatch srun salloc} {
		foreach {opts pattern} $cases {
			switch -- $cmd {
				sbatch {
					set cmdline "$sbatch -o/dev/null $opts --wrap=hostname"
				}
				srun {
					set cmdline "$srun $opts hostname"
				}
				default {
					set cmdline "$salloc $opts hostname"
				}
			}
			set job_id [submit_job $cmdline]
			check_job $job_id "^JobId\\|MemPerTres" $pattern
			cancel_job $job_id
		}
	}
}
# These two groups run regardless of the configured SelectType
test_gpu_bind
test_gpu_freq

# The remaining groups are only run when SelectType=cons_tres is configured
if {![check_config_select "cons_tres"]} {
	log_warn "Some tests are skipped because they require SelectType=cons_tres."
	set is_skip true
} else {
	test_gpus_per_node
	test_cpus_per_gpu
	test_gpus_per_job
	test_gpus_per_socket
	test_gpus_per_task
	test_mem_per_gpu
}

# Report the partial run as skipped rather than as a clean pass
if {$is_skip} {
	skip "Some tests were skipped"
}