| #!/usr/bin/expect |
| ############################################################################ |
| # Purpose: Test of SLURM functionality |
| # Confirm node selection from within a job step on existing allocation |
| # (--nodelist, --exclude, --nodes and --nprocs options). |
| # |
| # Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR |
| # "WARNING: ..." with an explanation of why the test can't be made, OR |
| # "FAILURE: ..." otherwise with an explanation of the failure, OR |
| # anything else indicates a failure mode that must be investigated. |
| ############################################################################ |
| # Copyright (C) 2002-5 The Regents of the University of California. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # UCRL-CODE-217948. |
| # |
| # This file is part of SLURM, a resource management program. |
| # For details, see <http://www.llnl.gov/linux/slurm/>. |
| # |
| # SLURM is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with SLURM; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
| # Identity of this test, the overall exit status, the name of the |
| # scratch script file and the shell prompt text used by this test. |
| set test_id 1.86 |
| set exit_code 0 |
| set file_in "test${test_id}.input" |
| set prompt {SLURM_QA_PROMPT: } |
| |
| print_header $test_id |
| |
| # Skip the test (exit status 0) when test_front_end returns non-zero. |
| if {0 != [test_front_end]} { |
|     send_user "\nWARNING: This test is incompatable with front-end systems\n" |
|     exit 0 |
| } |
| |
| # |
| # Build input script file: it exports the test prompt as PS1 and then |
| # starts the shell named by bin_bash with --norc. |
| # |
| set script_body " |
| export PS1=\"$prompt\" |
| $bin_bash --norc |
| " |
| make_bash_script $file_in $script_body |
| |
| # |
| # Submit a 2 node job running the generated script, then wait for the |
| # shell prompt. The script file is removed on every path out of this |
| # step, whether the test continues or exits. |
| # |
| set timeout $max_job_delay |
| spawn $srun -N2 -A $file_in |
| expect { |
|     -re "More ($alpha) requested than permitted" { |
|         # Treated as "cannot run the test" rather than as a failure. |
|         # NOTE(review): the text mentions task distribution although this |
|         # test covers node selection; wording left unchanged, confirm. |
|         send_user "\nWARNING: can't test srun task distribution\n" |
|         exec $bin_rm -f $file_in |
|         exit $exit_code |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         # Remove the script here too, matching the other exit paths. |
|         exec $bin_rm -f $file_in |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "Job initiated\n" |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exec $bin_rm -f $file_in |
|         exit 1 |
|     } |
|     eof { |
|         # Reap the spawned process before reporting the failure. |
|         wait |
|         send_user "\nFAILURE: srun terminated\n" |
|         kill_srun |
|         exec $bin_rm -f $file_in |
|         exit 1 |
|     } |
| } |
| exec $bin_rm -f $file_in |
| |
| # |
| # Get node names: run one labelled job step and remember the host name |
| # printed for task 0 and for task 1. |
| # |
| set host_0 {} |
| set host_1 {} |
| send "$srun -l $bin_hostname\n" |
| expect { |
|     -re "($number): ($alpha_numeric)" { |
|         set task_id $expect_out(1,string) |
|         set task_host $expect_out(2,string) |
|         if {$task_id == 0} { |
|             set host_0 $task_host |
|         } elseif {$task_id == 1} { |
|             set host_1 $task_host |
|         } |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| |
| # |
| # Verify node count: both task 0 and task 1 must have reported a host |
| # name. Each missing name is reported, then the test stops if any |
| # failure has been recorded. |
| # |
| if {[string length $host_0] == 0} { |
|     send_user "\nFAILURE: Did not get hostname of task 0\n" |
|     set exit_code 1 |
| } |
| if {[string length $host_1] == 0} { |
|     send_user "\nFAILURE: Did not get hostname of task 1\n" |
|     set exit_code 1 |
| } |
| if {$exit_code} { |
|     exit $exit_code |
| } |
| |
| # |
| # Exclude the task 0 host: the single task of this step must then |
| # report the task 1 host. |
| # |
| set matches 0 |
| send "$srun -l -N1 -n1 --exclude=$host_0 $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         set responder $expect_out(1,string) |
|         if {[string compare $responder $host_1] != 0} { |
|             send_user "\nFAILURE: wrong node responded\n" |
|             set exit_code 1 |
|         } else { |
|             incr matches |
|         } |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {!$matches} { |
|     send_user "\nFAILURE: required node failed to respond\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Exclude the task 1 host: the single task of this step must then |
| # report the task 0 host. |
| # |
| set matches 0 |
| send "$srun -l -N1 -n1 --exclude=$host_1 $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         set responder $expect_out(1,string) |
|         if {[string compare $responder $host_0] != 0} { |
|             send_user "\nFAILURE: wrong node responded\n" |
|             set exit_code 1 |
|         } else { |
|             incr matches |
|         } |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {!$matches} { |
|     send_user "\nFAILURE: required node failed to respond\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Include specific node: request the task 0 host by name and check that |
| # this is the host that answers. |
| # |
| set matches 0 |
| send "$srun -l -N1 -n1 --nodelist=$host_0 $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         set responder $expect_out(1,string) |
|         if {[string compare $responder $host_0] != 0} { |
|             send_user "\nFAILURE: wrong node responded\n" |
|             set exit_code 1 |
|         } else { |
|             incr matches |
|         } |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {!$matches} { |
|     send_user "\nFAILURE: required node failed to respond\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Include specific node: request the task 1 host by name and check that |
| # this is the host that answers. |
| # |
| set matches 0 |
| send "$srun -l -N1 -n1 --nodelist=$host_1 $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         set responder $expect_out(1,string) |
|         if {[string compare $responder $host_1] != 0} { |
|             send_user "\nFAILURE: wrong node responded\n" |
|             set exit_code 1 |
|         } else { |
|             incr matches |
|         } |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {!$matches} { |
|     send_user "\nFAILURE: required node failed to respond\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Error test: the same host is both required and excluded, so the step |
| # must print an error and no task output. |
| # NOTE: some versions of Expect have a problem with long input |
| # lines, so the command line is typed with two send commands |
| # |
| set matches 0 |
| send "$srun -l -N1 -n1 --nodelist=$host_0" |
| send " --exclude=$host_0 $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         # Any task output means the conflicting request was accepted. |
|         set exit_code 1 |
|         send_user "\nFAILURE: wrong node responded\n" |
|         exp_continue |
|     } |
|     "error:" { |
|         incr matches |
|         send_user "This error is expected, no worries\n" |
|         exp_continue |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {!$matches} { |
|     send_user "\nFAILURE: No error for overlapping include/exclude node list\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Error test: ask for three nodes from within the two node job; the |
| # step must print an error and no task output. |
| # |
| set matches 0 |
| send "$srun -l -N3 -n3 -O $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         # Any task output means the oversized request was accepted. |
|         set exit_code 1 |
|         send_user "\nFAILURE: wrong node responded\n" |
|         exp_continue |
|     } |
|     "error:" { |
|         incr matches |
|         send_user "This error is expected, no worries\n" |
|         exp_continue |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {!$matches} { |
|     send_user "\nFAILURE: No error for exceeding node count\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Run with fewer nodes: start two tasks (-n2) restricted to one node |
| # (-N1-1). Both task labels must be seen and both must report the same |
| # host name. |
| # |
| set test_0 "" |
| set test_1 "" |
| send "$srun -l -N1-1 -n2 -O $bin_hostname\n" |
| expect { |
|     -re "($number): ($alpha_numeric)" { |
|         set host_inx $expect_out(1,string) |
|         if {$host_inx == 0} { |
|             set test_0 $expect_out(2,string) |
|         } |
|         if {$host_inx == 1} { |
|             set test_1 $expect_out(2,string) |
|         } |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
| } |
| # Check for missing output first: if the step printed no task lines at |
| # all, both names stay empty and would otherwise compare as equal, so the |
| # failed step would be counted as a pass. |
| if {[string compare $test_0 ""] == 0 || [string compare $test_1 ""] == 0} { |
|     send_user "\nFAILURE: Did not get hostname of both tasks\n" |
|     set exit_code 1 |
| } elseif {[string compare $test_0 $test_1] != 0} { |
|     send_user "\nFAILURE: Multiple nodes responded, should be only one\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Error test: Exceed task count. A first step counts the task lines it |
| # prints; starting from one, the counter ends one above that number and |
| # is then used as the task count of a second step, which must fail. |
| # |
| set too_many_tasks 1 |
| send "$srun -l -c1 $bin_hostname\n" |
| expect { |
|     -re "($number): ($alpha_numeric)" { |
|         incr too_many_tasks |
|         exp_continue |
|     } |
|     -re "Unable to contact" { |
|         send_user "\nFAILURE: slurm appears to be down\n" |
|         exit 1 |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| |
| set matches 0 |
| send "$srun -l -n $too_many_tasks $bin_hostname\n" |
| expect { |
|     -re "0: ($alpha_numeric)" { |
|         # Any task output means the oversized request was accepted. |
|         set exit_code 1 |
|         send_user "\nFAILURE: wrong node responded\n" |
|         exp_continue |
|     } |
|     "error:" { |
|         incr matches |
|         exp_continue |
|     } |
|     "$prompt" { |
|         send_user "srun completed\n" |
|     } |
|     eof { |
|         send_user "\nFAILURE: srun EOF\n" |
|         exit 1 |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| if {$matches != 0} { |
|     send_user "This error is expected, no worries\n" |
| } else { |
|     send_user "\nFAILURE: No error for exceeding processor count\n" |
|     set exit_code 1 |
| } |
| |
| # |
| # Post-processing: leave the shell, wait for the spawned srun to end, |
| # then report SUCCESS only if no check recorded a failure. |
| # |
| send "exit\n" |
| expect { |
|     -re "error.*Exit 1" { |
|         send_user "This error is expected, no worries\n" |
|         exp_continue |
|     } |
|     eof { |
|         wait |
|     } |
|     timeout { |
|         send_user "\nFAILURE: srun not responding\n" |
|         kill_srun |
|         exit 1 |
|     } |
| } |
| |
| if {!$exit_code} { |
|     send_user "\nSUCCESS\n" |
| } |
| exit $exit_code |