blob: 72c4d08f1825ea66f3aeaeb8ceb0215687fa8755 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test heterogeneous job steps
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Scratch files used by the test, all located under $test_dir: three generated
# scripts (file_in1..file_in3) and one job output file (file_out).
foreach {var_name leaf} {
	file_in1 input1
	file_in2 input2
	file_in3 input3
	file_out output
} {
	set $var_name "$test_dir/$leaf"
}
unset var_name leaf

# Ids of every job started by the test; the cleanup proc cancels them.
set job_id_list {}
set matches 0
# Return the first hostname of the job's NodeList.
#
# job_id - id of the job to query
#
# The NodeList value of the job is expanded with "scontrol show hostnames"
# and the first entry of that expansion is returned.
proc get_first_node_of_job {job_id} {
	set expanded [run_command_output -fail \
		"$::scontrol show hostnames [get_job_param $job_id NodeList]"]
	return [lindex $expanded 0]
}
#
# Requirements
#
# Exactly two nodes must be returned for a -N2 request.
set nodes [get_nodes_by_request "-N2"]
set nb_nodes [llength $nodes]
if {$nb_nodes != 2} {
	skip "Insufficient nodes in default partition ($nodes)"
}

# At least two GPUs are needed across those nodes.
set gpu_cnt [get_highest_gres_count $nb_nodes "gpu"]
if {$gpu_cnt < 2} {
	skip "This test requires 2 or more GPUs on $nb_nodes nodes of the default partition"
}

# The test is only run with the backfill scheduler.
if {![string equal [get_config_param "SchedulerType"] "sched/backfill"]} {
	skip "This test requires SchedulerType = sched/backfill"
}
# Cancel every job recorded in the global job_id_list.
proc cleanup {} {
	cancel_job $::job_id_list
}
# Generated scripts for the het job step tests:
#   file_in1 - runs a single srun with two ":"-separated components, the
#              first executing file_in2 and the second executing file_in3;
#              -l labels each output line, which the salloc subtest parses
#   file_in2 - prints FILE2
#   file_in3 - prints FILE3
make_bash_script $file_in1 "$srun -l --mpi=none $file_in2 : $file_in3"
make_bash_script $file_in2 "$bin_echo FILE2"
make_bash_script $file_in3 "$bin_echo FILE3"
log_info "SUBTEST: Confirm het job srun runs within non-het job salloc"
set matches 0
set output [run_command_output -fail "$salloc -N2 $file_in1"]

# Remember the allocation's job id so it is cancelled by cleanup.
if {![regexp "Granted job allocation ($number)" $output - job_id]} {
	fail "Allocation not granted ($output)"
} else {
	lappend job_id_list $job_id
}

# Collect every "<label>: FILE<n>" line. Label 0 must have printed FILE2 and
# label 1 must have printed FILE3; any other pairing is an error.
set labelled_hits [regexp -all -inline "($number): FILE($number)" $output]
foreach {whole label file_no} $labelled_hits {
	if {($label == 0 && $file_no == 2) || ($label == 1 && $file_no == 3)} {
		incr matches
	} else {
		fail "Invalid output ($whole from $labelled_hits)"
	}
}
subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2"
log_info "SUBTEST: Confirm het job srun runs within non-het job sbatch"
set matches 0

# Submit the het-step script as a two-node batch job and wait for its output.
set job_id [submit_job -fail "-N2 -o $file_out $file_in1"]
lappend job_id_list $job_id
wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out

# Count the FILE2 / FILE3 markers written by the two step components.
set output [run_command_output -fail "$bin_cat $file_out"]
set matches [llength [regexp -all -inline {FILE[23]} $output]]
subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2"
log_info "SUBTEST: Confirm het job srun runs on different nodes in non-het job sbatch"
set matches 0

# $file_out was already written by the previous subtest. Remove it first so
# that wait_for_file below waits for the output of THIS job rather than
# returning immediately on the stale file. Deleting a missing file is a no-op.
file delete -force $file_out

set job_id [submit_job -fail "-N2 -o $file_out --wrap=\"$srun $bin_printenv SLURMD_NODENAME\""]
lappend job_id_list $job_id
wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out

# Each task prints its SLURMD_NODENAME; two distinct lines mean two nodes.
set matches [run_command_output -fail "sort -u $file_out | wc -l"]
subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2"
log_info "SUBTEST: Confirm het job srun with gres runs in non-het job sbatch"
# The condition is braced so the helper's result is evaluated exactly once as
# an expression instead of being substituted and then re-parsed.
if {[check_config_select "cons_tres"]} {
	set matches 0

	# $file_out is reused from earlier subtests. Remove it so wait_for_file
	# below waits for this job's output rather than a stale file.
	file delete -force $file_out

	# The job asks for 2 GPUs; each of the two step components takes 1.
	set job_id [submit_job -fail "-N2 --gpus=2 -o $file_out --wrap=\"$srun --gpus=1 echo ok : --gpus=1 echo ok\""]
	lappend job_id_list $job_id
	wait_for_job -fail $job_id "DONE"
	wait_for_file -fail $file_out

	# One "ok" line is expected from each component.
	set matches [run_command_output -fail "grep ok $file_out | wc -l"]
	subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2"
} else {
	subskip "Subtest requires SelectType=select/cons_tres (Job should be granted and output match the expected)"
}
log_info "SUBTEST: Confirm het job srun fails in het job salloc"
set matches 0

# The command is expected to fail (-xfail): a het srun inside a het salloc.
set output [run_command_output -xfail -fail "$salloc $file_in1 : $file_in1"]

# The allocation itself must still have been granted; track it for cleanup.
if {![regexp "Granted job allocation ($number)" $output - job_id]} {
	fail "Allocation not granted ($output)"
} else {
	lappend job_id_list $job_id
}

# regexp yields 1 when srun reported the allocation failure, 0 otherwise.
set matches [regexp "srun: error: Allocation failure" $output]
subtest {$matches == 1} "Job should be granted but allocation should fail" "$matches != 1"
log_info "SUBTEST: Confirm het job srun fails when trying to overlap nodes"
set matches 0
set timeout $max_job_delay

# Start an interactive shell inside a two-node allocation and wait for its
# prompt. The job id is captured from the "Granted" message on the way.
spawn $salloc -N2 $bin_bash
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		lappend job_id_list $job_id
		reset_bash_prompt
		exp_continue
	}
	-re "$test_prompt" {
		# job started
	}
	timeout {
		fail "salloc not responding"
	}
}

set node [get_first_node_of_job $job_id]
log_info "node=$node for job $job_id"

# Both step components request the same node; srun is expected to reject this
# with an "overlaps with excluded" message.
send "$srun -w $node : -w $node $bin_true\r"

# The error pattern is listed BEFORE the prompt pattern. expect tries the
# patterns in order against the whole buffer, so with the prompt first an
# error message and prompt arriving in the same read would match only the
# prompt and the error would be missed. After seeing the error we keep
# waiting for the prompt so the shell is idle before "exit" is sent.
expect {
	-re "overlaps with excluded $node" {
		# set (not incr) so a repeated message still counts once
		set matches 1
		exp_continue
	}
	-re "$test_prompt" {
		# srun returned to the shell
	}
	timeout {
		fail "srun not responding"
	}
}
send "exit\r"
subtest {$matches == 1} "srun should return overlaps with excluded" "$matches != 1"