blob: 7d6a134ac6070cbcdcfe5078f9d29be0abf622e7 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of SPANK plugin with hetjobs
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

#
# Paths and state used throughout the test
#
set cwd             "[$bin_pwd]"
set file_in         "$test_dir/input"
set file_out        "$test_dir/output"
set file_prog       "$cwd/${test_name}.prog"
set orig_spank_conf "$test_dir/orig_conf"
set new_spank_conf  "$test_dir/new_conf"
set spank_out       "$test_dir/spank.out"

# Stays 0 until sbatch reports a submitted job id
set job_id 0

#
# Prerequisites: the test is skipped unless run by a super user on a
# cluster configured with the backfill scheduler
#
if {![is_super_user]} {
	skip "This test must be run as SlurmUser"
}

set sched_type [get_config_param "SchedulerType"]
if {$sched_type ne "sched/backfill"} {
	skip "This test requires SchedulerType = sched/backfill"
}
#
# cleanup - undo the side effects of this test
#
# Cancels the job recorded in job_id, restores the saved plugstack.conf and
# deletes the compiled plugin shared object.
#
# NOTE(review): presumably the test framework calls this on every exit path,
# including skip/fail calls issued before spank_conf_file has been assigned
# -- confirm against globals. The restore is therefore guarded.
#
proc cleanup {} {
	global spank_conf_file
	global job_id file_prog

	cancel_job $job_id

	#
	# Restore the original plugstack, but only when the test got far
	# enough to determine where it lives; otherwise reading the variable
	# would raise a Tcl "no such variable" error during cleanup
	#
	if {[info exists spank_conf_file]} {
		restore_conf $spank_conf_file
	}

	# "file delete" is a no-op when the file does not exist
	file delete ${file_prog}.so
}
#
# Require at least two nodes in the idle, alloc or comp state
#
set usable_nodes [get_nodes_by_state idle,alloc,comp]
set node_count   [llength $usable_nodes]
if {$node_count < 2} {
	skip "Insufficient node count to run test"
}

#
# Build the plugin: drop any stale shared object, then compile
# ${file_prog}.c into ${file_prog}.so against the Slurm headers
#
exec $bin_rm -f ${file_prog}.so
exec $bin_cc -fPIC -shared \
	-I${slurm_dir}/include \
	-o ${file_prog}.so \
	${file_prog}.c
#
# Compare the version reported by the local scontrol binary with the
# SLURM_VERSION configuration parameter; skip the test when they differ
#
set output [run_command_output -fail -nolog "$scontrol -V"]
if {![regexp "slurm ($re_word_str)" $output - loc_slurm_ver]} {
	# Without this check loc_slurm_ver would be left undefined and the
	# comparison below would abort with a raw Tcl error
	fail "Unable to determine the local Slurm version from '$scontrol -V' output ($output)"
}

# Versions are strings, so compare them with "ne" rather than the
# numeric-capable "!=" operator
set ctld_slurm_ver [get_config_param "SLURM_VERSION"]
if {$ctld_slurm_ver ne $loc_slurm_ver} {
	skip "Slurmctld ($ctld_slurm_ver) and local Slurm ($loc_slurm_ver) versions are not the same, can not continue"
}
#
# Find plugstack.conf in the Slurm configuration directory, save the
# original through save_conf, then append a "required" entry that loads
# the freshly built plugin with $spank_out as its argument
#
set spank_conf_file "[get_conf_path]/plugstack.conf"
save_conf $spank_conf_file

set plugstack_entry "required ${file_prog}.so ${spank_out}"
run_command -fail "$bin_echo '$plugstack_entry' >> $spank_conf_file"
#
# Test of locally logged messages.
# Sleep to allow for NFS delays in propagating $spank_conf_file
#
# NOTE: This test will fail if plugstack.conf is not in a shared location.
#       The login node (where the test is started) will modify the file and
#       the slurmd/slurmstepd on the compute node will not get the updated
#       configuration.
#
exec $bin_sleep 30

log_info "Test locally logged messages........."

#
# Print the checksum of the local plugstack.conf so it can be compared
# with the checksums computed on the compute nodes by the batch script
#
log_debug "Checksum of local $spank_conf_file"
spawn $bin_sum $spank_conf_file
expect {
	timeout {
		fail "$bin_sum not responding"
	}
	eof {
		wait
	}
}
#
# Batch script: one srun spanning het groups 0 and 1 that checksums
# plugstack.conf, passing the plugin's srun test option (value 5)
#
make_bash_script $file_in "
$srun --mpi=none --test_suite_srun=5 --het-group=0,1 $bin_sum $spank_conf_file
"

#
# Submit the script with the plugin's sbatch test option (value 4) and
# count the plugin messages printed by sbatch itself
#
set matches 0
spawn $sbatch --test_suite_sbatch=4 -t1 -o $file_out -n1 : -n1 $file_in
expect {
	-re "_test_opt_process_sbatch: opt_arg_sbatch=4" {
		incr matches
		exp_continue
	}
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "slurm_spank_exit: opt_arg_sbatch=4 opt_arg_srun=0" {
		incr matches
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}

# job_id is still 0 when sbatch never printed "Submitted batch job"
if {$job_id == 0} {
	fail "Batch submit failure"
}

# The option-processing and exit messages must be seen three times in total
if {$matches != 3} {
	fail "Spank options not processed by sbatch ($matches != 3)"
}
#
# Wait for the job to finish, then scan its output file for the messages
# logged by the plugin in the sbatch and srun contexts
#
wait_for_job -fail $job_id "DONE"

# NOTE: spank logs from sbatch and srun would be intermingled here
wait_for_file -fail $file_out

set matches        0
set matches_sbatch 0
set matches_srun   0

spawn $bin_cat $file_out
expect {
	-re "error" {
		fail "Some error happened"
	}
	-re "_test_opt_process_(.*?=$number)" {
		# Classify the option-processing message by the text captured
		# after the "_test_opt_process_" prefix; anything else is ignored
		switch -exact -- $expect_out(1,string) {
			"sbatch: opt_arg_sbatch=4" {
				incr matches_sbatch
			}
			"srun: opt_arg_srun=5" {
				incr matches_srun
			}
		}
		exp_continue
	}
	-re "slurm_spank_local_user_init" {
		incr matches
		exp_continue
	}
	-re "slurm_spank_exit: opt_arg_sbatch=4 opt_arg_srun=5" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}

#
# Each counter must hit its expected value exactly
#
set failure_addendum "This may be due to NFS synchronization problems: Multiple processes on different nodes writing to the same file. The SlurmdLogFile on each node should include SPANK logs for each step"

if {$matches_sbatch != 2} {
	fail "Local (sbatch) sbatch spank plugin failure ($matches_sbatch != 2). $failure_addendum"
}

if {$matches_srun != 2} {
	fail "Local (srun) srun spank plugin failure ($matches_srun != 2). $failure_addendum"
}

if {$matches != 2} {
	fail "Local (srun) spank plugin failure ($matches != 2). $failure_addendum"
}

log_debug "Local (srun) spank plugin success"
#
# Scan the file written by the plugin on the slurmd side ($spank_out,
# passed as the plugin argument in plugstack.conf) and count its messages
#
wait_for_file -fail $spank_out

set matches            0
set matches_sbatch     0
set matches_srun       0
set matches_spank_exit 0
set matches_spank_init 0

spawn $bin_sort $spank_out
expect {
	-re "slurm_spank_(\\S+): opt_arg_sbatch=($number) opt_arg_srun=($number)" {
		set spank_type $expect_out(1,string)
		set sbatch_arg $expect_out(2,string)
		set srun_arg   $expect_out(3,string)

		# Count by callback type
		switch -exact -- $spank_type {
			"exit" {
				# Skip (possible) external job containers
				if {$sbatch_arg eq "4"} {
					incr matches_spank_exit
				}
			}
			"task_init" {
				incr matches_spank_init
			}
		}

		# Count by option values: srun=0 is attributed to sbatch,
		# srun=5 to srun; both require sbatch=4
		if {$sbatch_arg eq "4"} {
			switch -exact -- $srun_arg {
				"0" {
					incr matches_sbatch
				}
				"5" {
					incr matches_srun
				}
			}
		}
		exp_continue
	}
	-re "spank_get_item: argv" {
		incr matches
		exp_continue
	}
	-re "spank_get_item: my_uid=" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}

#
# Each counter must hit its expected value exactly
#
set failure_addendum "Check for matching checksums on the plugstack.conf file. Different checksums could indicate file system delays"

if {$matches_spank_exit != 3} {
	fail "Remote (slurmd ) spank_exit spank plugin failure ($matches_spank_exit != 3). $failure_addendum"
}

if {$matches_spank_init != 3} {
	fail "Remote (slurmd) spank_init spank plugin failure ($matches_spank_init != 3). $failure_addendum"
}

if {$matches_sbatch != 2} {
	fail "Remote (slurmd) sbatch spank plugin failure ($matches_sbatch != 2). $failure_addendum"
}

if {$matches_srun != 4} {
	fail "Remote (slurmd) srun spank plugin failure ($matches_srun != 4). $failure_addendum"
}

if {$matches != 8} {
	fail "Remote (slurmd) spank plugin failure ($matches != 8). $failure_addendum"
}

log_info "Remote (slurmd) spank plugin success"