blob: 8325f649475bc5b8e9c15b29579ca80da0d85969 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Validate proper GRES operation under heavy load (many jobs)
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
############################################################################
source ./globals
# Scratch files under the test directory: two generated scripts and the
# common prefix of the per-job output files.
set file_in1 "${test_dir}/input1"
set file_in2 "${test_dir}/input2"
set file_out "${test_dir}/output"

# IDs of the jobs submitted so far; cleanup cancels whatever is listed here.
set job_list {}

# Regex fragment matching a run of digits, underscores and commas.
set number_commas {[0-9_,]+}
#
# cleanup - cancel every job recorded in the global job_list
#
proc cleanup {} {
	cancel_job $::job_list
}
# Run cleanup once before doing anything else (job_list is still empty here).
cleanup

# Skip unless the select/cons_tres plugin is configured.
if {![check_config_select cons_tres]} {
	skip "This test is only compatible with select/cons_tres"
}

# Use at most two nodes of the default partition.
set nb_nodes [get_partition_param [default_partition] TotalNodes]
if {$nb_nodes > 1} {
	set nb_nodes 2
}

# Skip when no GPU is reported for that node count.
set gpu_cnt [get_highest_gres_count $nb_nodes gpu]
if {$gpu_cnt < 1} {
	skip "This test requires 1 or more GPUs on $nb_nodes nodes of the default partition"
}

# Exactly one node must come back for a one-GPU, one-task request.
set node_name [get_nodes_by_request "--gres=gpu:1 -n1 -t1"]
if {[llength $node_name] != 1} {
	skip "This test need to be able to submit jobs with at least --gres=gpu:1"
}
# Read the CPU and socket counts of the selected node.
set cpus_per_node [get_node_param $node_name "CPUTot"]
set sockets_per_node [get_node_param $node_name "Sockets"]

# The expression is braced so that expr performs the variable substitution
# itself; unbraced, the values would be substituted by Tcl first and then
# parsed a second time as expression text.
set cpus_per_socket [expr {$cpus_per_node / $sockets_per_node}]

log_debug "GPU count is $gpu_cnt"
log_debug "Sockets per node is $sockets_per_node"
log_debug "CPUs per socket is $cpus_per_socket"

# tot_gpus is the largest --gpus value that will be requested: one node's GPU
# count, doubled when two nodes are in play, and never more than 32.
# NOTE(review): tot_cpus stays at a single node's CPU count even when tot_gpus
# covers two nodes - confirm this asymmetry is intended.
set tot_cpus $cpus_per_node
set tot_gpus $gpu_cnt
if {$nb_nodes > 1} {
	incr tot_gpus $gpu_cnt
}
if {$tot_gpus > 32} {
	set tot_gpus 32
}
#
# Build input script files
#
# file_in1 is the batch script: it launches file_in2 with one task per
# allocated node, prints the gpu lines of the detailed job record, sleeps
# briefly and exits 0.
#
make_bash_script $file_in1 [join [list \
	"$srun -l -N \$SLURM_NNODES -n \$SLURM_NNODES $file_in2" \
	"$scontrol -dd show job \$SLURM_JOB_ID | grep gpu" \
	"sleep 5" \
	"exit 0" \
	] "\n"]

# file_in2 is run by each task and reports the node name together with the
# CUDA_VISIBLE_DEVICES value seen by that task.
make_bash_script $file_in2 {echo HOST:$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:$CUDA_VISIBLE_DEVICES}
#
# Submit one job for every --gpus count from 1 through tot_gpus
# (up to 2 full nodes or 32 GPUs)
#
for {set inx 1} {$inx <= $tot_gpus} {incr inx} {
	set job_id 0
	set ofile "$file_out.$inx"

	# Options shared by every submission for this GPU count.
	set sbatch_args "--gpus=$inx -t1 -w $node_name -J $test_name --output=$ofile $file_in1"

	# With more GPUs than CPUs the job is submitted without --cpus-per-gpu
	# (assumes no configured DefCPUsPerGPU); otherwise one CPU per GPU is
	# requested explicitly.
	if {!($tot_gpus > $tot_cpus)} {
		set sbatch_args "--cpus-per-gpu=1 $sbatch_args"
	}
	set job_id [submit_job $sbatch_args]

	# Remember accepted jobs so that cleanup can cancel them later.
	if {$job_id != 0} {
		lappend job_list $job_id
	} else {
		fail "sbatch job submit failure"
	}
}
# Wait until every submitted job has reached the DONE state.
foreach job_id $job_list {
	wait_for_job -fail $job_id DONE
}

# The job submitted with --gpus=inx wrote to <file_out>.<inx>; add up the
# device counts from each CUDA_VISIBLE_DEVICES line in that file and compare
# the sum with inx.
for {set inx 1} {$inx <= $tot_gpus} {incr inx} {
	set ofile "$file_out.$inx"
	wait_for_file -fail -timeout 20 $ofile
	set output [run_command_output "$bin_cat $ofile"]

	# regexp -inline -all yields full-match/capture pairs; only the captured
	# device list is counted.
	set match 0
	foreach {re devices} [regexp -inline -all "CUDA_VISIBLE_DEVICES:($number_commas)" $output] {
		incr match [cuda_count $devices]
	}

	subtest {$match == $inx} "GPU count should be $inx in output file" "$match != $inx, file: $ofile, output: $output"
}