| #!/usr/bin/expect |
| ############################################################################ |
| # Purpose: Test of SLURM functionality |
# Submit a job directly to slurmd without use of the slurmctld scheduler
# (--no-allocate option). NOTE: Needs to run as SlurmUser or root.
| # |
| # Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR |
| # "WARNING: ..." with an explanation of why the test can't be made, OR |
| # "FAILURE: ..." otherwise with an explanation of the failure, OR |
| # anything else indicates a failure mode that must be investigated. |
| ############################################################################ |
| # Copyright (C) 2002-2006 The Regents of the University of California. |
| # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| # Written by Morris Jette <jette1@llnl.gov> |
| # UCRL-CODE-226842. |
| # |
| # This file is part of SLURM, a resource management program. |
| # For details, see <http://www.llnl.gov/linux/slurm/>. |
| # |
| # SLURM is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with SLURM; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| source ./globals |
| |
| set test_id "1.26" |
set iterations 100
| set exit_code 0 |
| |
| print_header $test_id |
| |
| if {[is_super_user] == 0} { |
	send_user "\nWARNING: This test can only be run as SlurmUser or root\n"
| exit 0 |
| } |
| |
| if {[test_bluegene] == 1} { |
	send_user "\nWARNING: This test is incompatible with BlueGene systems\n"
| exit 0 |
| } |
| |
| # |
| # Submit a 1 node job and record the node name |
| # |
# NOTE: Check explicitly for "^0:" or "\n0:". Otherwise, in srun verbose mode,
# a hostname ending in 0 can appear in the messages and be used to generate
# a bad value for host_0 below.
| # |
| set host_0 "" |
| set nodelist_name "" |
| set timeout $max_job_delay |
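# NOTE: srun's -l option labels each output line with its task rank
# (e.g. "0: hostname"), which the "^0:" and "\n0:" patterns below rely upon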
| set srun_pid [spawn $srun -v -N1 -l $bin_printenv SLURMD_NODENAME] |
| expect { |
| -re "on host ($alpha_numeric)," { |
| set nodelist_name $expect_out(1,string) |
| exp_continue |
| } |
| -re "^0: ($alpha_numeric)" { |
| set host_0 $expect_out(1,string) |
| exp_continue |
| } |
| -re "\n0: ($alpha_numeric)" { |
| set host_0 $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| send_user "\nFAILURE: srun not responding\n" |
| slow_kill $srun_pid |
| set exit_code 1 |
| } |
| eof { |
| wait |
| } |
| } |
| |
| # |
| # Verify node count |
| # |
| if {[string compare $host_0 ""] == 0} { |
| send_user "\nFAILURE: Did not get hostname of task 0\n" |
| exit 1 |
| } |
| if {[string compare $nodelist_name ""] == 0} { |
	send_user "\nFAILURE: Did not get nodelist_name of the allocation\n"
| exit 1 |
| } |
| set include_node $host_0 |
| |
| # |
| # Submit a job directly to that node |
| # |
| set host_1 "" |
| set slurm_user 1 |
| set timeout 10 |
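# NOTE: --no-allocate submits the job step directly to slurmd without a
# slurmctld-created allocation, so slurmd accepts the job credential only
# from SlurmUser or root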
| set srun_pid [spawn $srun -N1 -l --nodelist=$include_node --no-allocate -t1 $bin_printenv SLURMD_NODENAME] |
| expect { |
| -re "Invalid job credential" { |
| send_user "\nWARNING: Not SlurmUser or root.\n" |
| set slurm_user 0 |
| exp_continue |
| } |
| -re "error: .*try again" { |
| send_user "Can't avoid this possible error\n" |
| set host_1 $host_0 |
| exp_continue |
| } |
| -re "error: .*already in shared memory" { |
| send_user "Can't avoid this possible error\n" |
| set host_1 $host_0 |
| exp_continue |
| } |
| -re "error: .*exit code 1" { |
| exp_continue |
| } |
| -re "0: ($alpha_numeric)" { |
| set host_1 $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| send_user "\nWARNING: srun not responding, " |
| send_user "expected if not SlurmUser or root.\n" |
| slow_kill $srun_pid |
| set slurm_user 0 |
| } |
| eof { |
| wait |
| } |
| } |
| if {$slurm_user == 0} { |
	exit 0
| } |
| if {[string compare $host_1 $include_node]} { |
	send_user "\nFAILURE: Job ran on $host_1 rather than the requested node $include_node\n"
| set exit_code 1 |
| } |
| |
| # |
# Run three tasks at a time on the same node and do so repeatedly
| # This checks for slurmd race conditions |
| # The sleep between cycles is to make sure the job step completion |
| # logic has time to be processed (slurmd -> slurmctld messages) |
| # Note: process output in order of expected completion |
| # |
| set front_end [test_front_end] |
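# NOTE: On front-end systems, where slurmd runs on a front-end node rather
# than on each compute node, the allocated srun below may report
# "Invalid node name specified"; that error is tolerated there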
| |
| set successes 0 |
for {set inx 0} {$inx < $iterations} {incr inx} {
| exec $bin_usleep 250000 |
| |
| set failures 0 |
| set srun_pid [spawn $srun -N1 --nodelist=$include_node -t1 -l $bin_printenv SLURMD_NODENAME] |
| set alloc $spawn_id |
| |
| set srun_pid1 [spawn $srun -N1 --nodelist=$include_node -Z $bin_usleep 500000] |
| set noalloc1 $spawn_id |
| |
| set srun_pid2 [spawn $srun -N1 --nodelist=$include_node -Z $bin_usleep 250000] |
| set noalloc2 $spawn_id |
| |
| set timeout 20 |
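	# Reap output in order of expected completion: the 250 msec no-allocate
	# step should finish before the 500 msec one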
| set spawn_id $noalloc2 |
| expect { |
| -i $noalloc2 |
| -re "error: .*try again" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: .*already in shared memory" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: .*exit code 1" { |
| exp_continue |
| } |
| -re "error: elan" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: qsw_prog_init" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error:" { |
			send_user "\nFAILURE: unexpected error from srun\n"
| set failures 1 |
| exp_continue |
| } |
| timeout { |
| send_user "\nFAILURE: srun not responding\n" |
| slow_kill $srun_pid2 |
| set failures 1 |
| } |
| eof { |
| wait |
| } |
| } |
| |
| set spawn_id $noalloc1 |
| expect { |
| -i $noalloc1 |
| -re "error: .*try again" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: .*already in shared memory" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: .*exit code 1" { |
| exp_continue |
| } |
| -re "error: elan" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: qsw_prog_init" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error:" { |
			send_user "\nFAILURE: unexpected error from srun\n"
| set failures 1 |
| exp_continue |
| } |
| timeout { |
| send_user "\nFAILURE: srun not responding\n" |
| slow_kill $srun_pid1 |
| set failures 1 |
| } |
| eof { |
| wait |
| } |
| } |
| |
| set timeout $max_job_delay |
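	# The allocated job step runs through normal scheduling, hence the
	# longer $max_job_delay timeout set above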
| set spawn_id $alloc |
| expect { |
| -i $alloc |
| -re "Invalid node name specified" { |
| if {$front_end == 0} { |
				send_user "\nFAILURE: invalid node name specified\n"
| set failures 1 |
| } else { |
| send_user "\nExpected error on front-end systems\n" |
| } |
| exp_continue |
| } |
| -re "error: elan" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: qsw_prog_init" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: .*configuring interconnect" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error: update_failed_tasks" { |
| send_user "Can't avoid this possible error\n" |
| exp_continue |
| } |
| -re "error:" { |
			send_user "\nFAILURE: unexpected error from srun\n"
| set failures 1 |
| exp_continue |
| } |
| -re "0: ($alpha_numeric)" { |
| set host_0 $expect_out(1,string) |
| exp_continue |
| } |
| timeout { |
| send_user "\nFAILURE: srun not responding\n" |
| slow_kill $srun_pid |
| set failures 1 |
| } |
| eof { |
| wait |
| } |
| } |
| |
| if {$failures == 0} { |
| incr successes |
| } else { |
| set exit_code 1 |
| } |
| } |
if {$successes != $iterations} {
	send_user "\nFAILURE: only $successes of $iterations cycles completed successfully\n"
}
| |
| if {$exit_code == 0} { |
| send_user "\nSUCCESS\n" |
| } |
| exit $exit_code |