| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test heterogeneous job steps |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| source ./globals |
| |
| # Scratch files, all located under the test directory: three generated |
| # input scripts and one job output file. |
| set file_in1 "${test_dir}/input1" |
| set file_in2 "${test_dir}/input2" |
| set file_in3 "${test_dir}/input3" |
| set file_out "${test_dir}/output" |
| |
| # IDs of the jobs started by this test (cancelled by cleanup) and the |
| # match counter that each subtest resets and re-uses. |
| set job_id_list {} |
| set matches 0 |
| |
| # Return the first host name of the node list assigned to a job. |
| # |
| # Arguments: |
| #   job_id - ID of the job to look up |
| # |
| # The job's "NodeList" parameter is expanded with "scontrol show hostnames" |
| # and the first element of that output is returned. |
| proc get_first_node_of_job {job_id} { |
|     global scontrol |
| |
|     set job_nodes [get_job_param $job_id "NodeList"] |
|     set expanded [run_command_output -fail "$scontrol show hostnames $job_nodes"] |
| |
|     return [lindex $expanded 0] |
| } |
| |
| # |
| # Requirements |
| # |
| |
| # A "-N2" request must yield exactly two nodes. |
| set nodes [get_nodes_by_request "-N2"] |
| set nb_nodes [llength $nodes] |
| if {$nb_nodes != 2} { |
|     skip "Insufficient nodes in default partition ($nodes)" |
| } |
| |
| # The highest "gpu" gres count reported for that many nodes must be at least 2. |
| set gpu_cnt [get_highest_gres_count $nb_nodes "gpu"] |
| if {2 > $gpu_cnt} { |
|     skip "This test requires 2 or more GPUs on $nb_nodes nodes of the default partition" |
| } |
| |
| # The configured scheduler must be sched/backfill. |
| if {![string equal [get_config_param "SchedulerType"] "sched/backfill"]} { |
|     skip "This test requires SchedulerType = sched/backfill" |
| } |
| |
| # Cancel every job recorded in the global job_id_list. |
| proc cleanup {} { |
|     cancel_job $::job_id_list |
| } |
| |
| # het job |
| # file_in2 and file_in3 each echo a distinguishing marker; file_in1 runs one |
| # srun command line whose two ":"-separated parts launch those two scripts. |
| foreach {script marker} [list $file_in2 FILE2 $file_in3 FILE3] { |
|     make_bash_script $script "$bin_echo $marker" |
| } |
| make_bash_script $file_in1 "$srun -l --mpi=none $file_in2 : $file_in3" |
| |
| log_info "SUBTEST: Confirm het job srun runs within non-het job salloc" |
| set matches 0 |
| set output [run_command_output -fail "$salloc -N2 $file_in1"] |
| |
| # Record the granted job so it can be cancelled later; a missing grant |
| # message is reported as a failure. |
| if {![regexp "Granted job allocation ($number)" $output - job_id]} { |
|     fail "Allocation not granted ($output)" |
| } else { |
|     lappend job_id_list $job_id |
| } |
| |
| # Collect every "<label>: FILE<n>" occurrence. Label 0 must be paired with |
| # FILE2 and label 1 with FILE3; any other combination is invalid. |
| set lines_matched [regexp -all -inline "($number): FILE($number)" $output] |
| foreach {whole label suffix} $lines_matched { |
|     if {($label == 0 && $suffix == 2) || ($label == 1 && $suffix == 3)} { |
|         incr matches |
|     } else { |
|         fail "Invalid output ($whole from $lines_matched)" |
|     } |
| } |
| subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2" |
| |
| log_info "SUBTEST: Confirm het job srun runs within non-het job sbatch" |
| set matches 0 |
| |
| # Submit the batch job and remember its ID for cleanup. |
| lappend job_id_list [set job_id [submit_job -fail "-N2 -o $file_out $file_in1"]] |
| |
| # Wait for the job to finish and for its output file before reading it. |
| wait_for_job -fail $job_id "DONE" |
| wait_for_file -fail $file_out |
| set output [run_command_output -fail "$bin_cat $file_out"] |
| |
| # Count the FILE2 / FILE3 markers found in the output. |
| set matches [regexp -all -- "FILE\[23\]" $output] |
| subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2" |
| |
| log_info "SUBTEST: Confirm het job srun runs on different nodes in non-het job sbatch" |
| set matches 0 |
| |
| # The sbatch subtests all write to the same $file_out. Remove whatever an |
| # earlier job left there so that a leftover file cannot be mistaken for this |
| # job's output (a stale two-line file would satisfy the check below). |
| # "file delete" does not raise an error if the file does not exist. |
| file delete -force $file_out |
| |
| # Run srun on both nodes; each task prints the name of the node it ran on. |
| set job_id [submit_job -fail "-N2 -o $file_out --wrap=\"$srun $bin_printenv SLURMD_NODENAME\""] |
| lappend job_id_list $job_id |
| wait_for_job -fail $job_id "DONE" |
| wait_for_file -fail $file_out |
| |
| # Two distinct lines in the output mean two distinct node names. |
| set matches [run_command_output -fail "sort -u $file_out | wc -l"] |
| subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2" |
| |
| log_info "SUBTEST: Confirm het job srun with gres runs in non-het job sbatch" |
| # The condition is braced so that the command result is substituted exactly |
| # once, by the expression evaluator, instead of being re-parsed as an |
| # expression after Tcl has already substituted it. |
| if {[check_config_select "cons_tres"]} { |
|     set matches 0 |
| |
|     # The sbatch subtests all write to the same $file_out. Remove whatever an |
|     # earlier job left there so that a leftover file cannot be mistaken for |
|     # this job's output. "file delete" does not raise an error if the file |
|     # does not exist. |
|     file delete -force $file_out |
| |
|     # Two-part srun, each part requesting one GPU and printing "ok". |
|     set job_id [submit_job -fail "-N2 --gpus=2 -o $file_out --wrap=\"$srun --gpus=1 echo ok : --gpus=1 echo ok\""] |
|     lappend job_id_list $job_id |
|     wait_for_job -fail $job_id "DONE" |
|     wait_for_file -fail $file_out |
| |
|     # Count the output lines containing "ok"; one is expected per srun part. |
|     set matches [run_command_output -fail "grep ok $file_out | wc -l"] |
|     subtest {$matches == 2} "Job should be granted and output match the expected" "$matches != 2" |
| } else { |
|     subskip "Subtest requires SelectType=select/cons_tres (Job should be granted and output match the expected)" |
| } |
| |
| log_info "SUBTEST: Confirm het job srun fails in het job salloc" |
| set matches 0 |
| set output [run_command_output -xfail -fail "$salloc $file_in1 : $file_in1"] |
| |
| # The allocation itself must still be granted; remember it for cleanup. |
| if {![regexp "Granted job allocation ($number)" $output - job_id]} { |
|     fail "Allocation not granted ($output)" |
| } else { |
|     lappend job_id_list $job_id |
| } |
| |
| # regexp returns 1 when the srun allocation error is present and 0 otherwise, |
| # which is exactly the match count checked below. |
| set matches [regexp "srun: error: Allocation failure" $output] |
| subtest {$matches == 1} "Job should be granted but allocation should fail" "$matches != 1" |
| |
| log_info "SUBTEST: Confirm het job srun fails when trying to overlap nodes" |
| set matches 0 |
| # Forget the job ID left over from the previous subtest so that a missing |
| # "Granted" message is detected below instead of silently re-using it. |
| set job_id 0 |
| set timeout $max_job_delay |
| |
| # Start an interactive shell inside a two-node allocation. |
| spawn $salloc -N2 $bin_bash |
| expect { |
|     -re "Granted job allocation ($number)" { |
|         set job_id $expect_out(1,string) |
|         lappend job_id_list $job_id |
|         reset_bash_prompt |
|         exp_continue |
|     } |
|     -re "$test_prompt" { |
|         # job started |
|     } |
|     timeout { |
|         fail "salloc not responding" |
|     } |
| } |
| if {$job_id == 0} { |
|     fail "Allocation not granted" |
| } |
| |
| set node [get_first_node_of_job $job_id] |
| log_info "node=$node for job $job_id" |
| |
| # Ask for the same node in both parts of the srun command line. |
| send "$srun -w $node : -w $node $bin_true\r" |
| expect { |
|     # The error pattern must be listed before the prompt pattern: expect picks |
|     # the first listed pattern that matches, so when the error message and the |
|     # following prompt arrive in the same read the prompt would otherwise win |
|     # and the error would never be counted. |
|     -re "overlaps with excluded $node" { |
|         incr matches |
|     } |
|     -re "$test_prompt" { |
|         # srun returned without reporting the overlap |
|     } |
|     timeout { |
|         fail "srun not responding" |
|     } |
| } |
| send "exit\r" |
| subtest {$matches == 1} "srun should return overlaps with excluded" "$matches != 1" |