#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test of job suspend/resume.
############################################################################
# Copyright (C) 2005-2007 The Regents of the University of California.
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# Output files written by the two batch jobs
set file_out1     "$test_dir/output1"
set file_out2     "$test_dir/output2"

# Test program binary and the two batch scripts that launch it
set file_prog     "$test_name.prog"
set file_prog_sh1 "$test_dir/script1"
set file_prog_sh2 "$test_dir/script2"

# Runtime state: job ids (0 until a job has been submitted), the node
# allocated to job 1, and a flag that suspend_job raises when scontrol
# reports the operation as unsupported
set job_id1       0
set job_id2       0
set host_name     ""
set not_supported 0

# Preconditions: the test is skipped under gang scheduling and when it is
# not run by a super user
if {[param_contains [get_config_param "PreemptMode"] "GANG"]} {
	skip "This test can't be run with gang scheduling"
}
if {![is_super_user]} {
	skip "This test can't be run except as SlurmUser"
}
| |
# Test teardown: cancel both submitted jobs and remove the compiled test
# program.
# NOTE(review): presumably invoked by the test framework when the test
# ends - confirm against ./globals.
proc cleanup {} {
	global file_prog job_id1 job_id2

	set submitted_jobs [list $job_id1 $job_id2]
	cancel_job $submitted_jobs

	file delete $file_prog
}
| |
# Run "scontrol <operation> <job_id>" and inspect what it prints.
#
# Arguments:
#   job_id    - id of the job to act on
#   operation - scontrol sub-command to run; callers in this file pass
#               "suspend" or "resume"
#
# Side effects:
#   - Logs a warning and sets the global not_supported to 1 when scontrol
#     prints "Requested operation not supported".
#   - Calls fail when the output matches "error" or scontrol times out.
proc suspend_job { job_id operation } {
	global not_supported scontrol

	spawn $scontrol $operation $job_id
	expect {
		-re "Requested operation not supported" {
			# Record the limitation and keep reading; the caller decides
			# what to do with the flag
			log_warn "Job suspend not supported"
			set not_supported 1
			exp_continue
		}
		-re "error" {
			fail "scontrol error"
		}
		eof {
			# Reap the finished scontrol process
			wait
		}
		timeout {
			fail "scontrol not responding"
		}
	}
}
| |
# Verify that a job output file contains both markers this test looks
# for: "JobSuspended" and "AllDone".
#
# Arguments:
#   file_name - path of the job output file to inspect
#
# Each marker is reported through its own subtest; both share the same
# diagnostic text.
proc check_output { file_name } {
	global bin_cat

	set failure_notice "Failure may be due to use of gang scheduler, a race conditions, or the ProcTrack plugin not identifying the application as part of the job"

	# Flags flip to 1 once the corresponding marker has been seen
	set match1 0
	set match2 0

	spawn $bin_cat $file_name
	expect {
		-re "JobSuspended" {
			set match1 1
			exp_continue
		}
		-re "AllDone" {
			set match2 1
			exp_continue
		}
		eof {
			# Reap the finished cat process
			wait
		}
		timeout {
			fail "cat not responding"
		}
	}

	subtest { $match1 != 0 } "Verify job is suspended" $failure_notice
	subtest { $match2 != 0 } "Verify job runs to completion" $failure_notice
}
| |
#
# Remove any stale binary, write the two batch scripts and compile the
# test program from ${file_prog}.c
#
file delete $file_prog
make_bash_script $file_prog_sh1 "$srun ./$file_prog"
make_bash_script $file_prog_sh2 "./$file_prog"
exec $bin_cc -o $file_prog ${file_prog}.c
exec $bin_chmod 700 $file_prog

#
# Pick the --mem value for the jobs: 16 when SelectTypeParameters matches
# "*Memory" or an affinity type other than "none" is reported, 1 otherwise
#
set affinity_mem [expr {[get_affinity_types] ne "none"}]
set job_mem [expr {([param_contains [get_config_param "SelectTypeParameters"] "*Memory"] || $affinity_mem) ? 16 : 1}]
| |
#
# Submit two jobs to the same node.
# The first job launches the application through srun, the second runs
# the application directly from its batch script.
#
set job_id1 [submit_job -fail "-N1 -t2 --output=$file_out1 --mem=${job_mem} $file_prog_sh1"]

# Look up the node allocated to job 1 once it is running
wait_for_job -fail $job_id1 RUNNING
spawn $scontrol show job $job_id1
expect {
	-re " NodeList=($re_word_str)" {
		set host_name $expect_out(1,string)
		exp_continue
	}
	-re "MidplaneList=($re_word_str)" {
		set host_name $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "scontrol not responding"
	}
	eof {
		wait
	}
}
if {$host_name eq ""} {
	fail "Did not get hostname for job ($job_id1)"
}

# Submit another job to that same node. The -w option pins job 2 to the
# node job 1 is running on; without it the scheduler is free to place
# job 2 on any node and the two jobs would not be exercised together.
set job_id2 [submit_job -fail "-N1 -t2 --output=$file_out2 --mem=${job_mem} -w $host_name $file_prog_sh2"]
| |
#
# Stop job 1 and let job 2 begin,
# Stop job 2 and let job 1 complete,
# Then resume job 2
# The sleep commands are added so the program can recognize
# that it has been suspended.
# The long initial sleep is due to POE launching jobs slowly
#
sleep 10
suspend_job $job_id1 suspend

# A system without suspend support is an environment limitation, not a
# defect in Slurm, so the test is skipped rather than failed. This matches
# the gang-scheduling and super-user precondition checks at the top of the
# file and the warning-level message logged by suspend_job.
if {$not_supported == 1} {
	skip "Job suspension is not supported"
}

# Reaching this point means the first suspend was accepted
wait_for_job -fail $job_id2 RUNNING
sleep 5
suspend_job $job_id2 suspend
suspend_job $job_id1 resume
wait_for_job -fail $job_id1 DONE
suspend_job $job_id2 resume
wait_for_job -fail $job_id2 DONE

# Only inspect the output if none of the later scontrol calls reported
# the operation as unsupported
if {$not_supported == 0} {
	wait_for_file -fail $file_out1
	wait_for_file -fail $file_out2

	check_output $file_out1
	check_output $file_out2
}