blob: 9db602dfc3278bb9e9cf25ec8845f97c9795fd85 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of SPANK error codes
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Pull in the shared testsuite definitions used throughout this script
source ./globals

# Files created or modified by this test
set file_in         "${test_dir}/input"
set file_out        "${test_dir}/output"
set file_prog       "[$bin_pwd]/${test_name}.prog"
set shared_lib      "${file_prog}.so"
set new_spank_conf  "${test_dir}/new_spank_conf"
set spank_out       "${test_dir}/spank.out"
set spank_conf_file "[get_conf_path]/plugstack.conf"

# Result of matching ProctrackType against "*pgid"; several expectations
# in the scenario tables below depend on it
set pgid [param_contains [get_config_param -slurm "ProctrackType"] "*pgid"]

# The test is run against exactly one node
set testnode [get_nodes_by_request "-N1"]
if {[llength $testnode] != 1} {
	skip "Test needs to be able to submit jobs to 1 node"
}

if {![is_super_user]} {
	skip "This test must be run as SlurmUser"
}

#
# The slurmctld version and the version of the local commands must match
#
set slurmctld_version [get_config_param "SLURM_VERSION"]
set scontrol_V_output [run_command_output -fail "$scontrol -V"]

if {![regexp "slurm ($re_word_str)" $scontrol_V_output {} local_slurm_version]} {
	fail "Unable to determine local slurm version"
}

if {$slurmctld_version ne $local_slurm_version} {
	skip "slurmctld ($slurmctld_version) and local Slurm ($local_slurm_version) versions are not the same. Cannot continue."
}
# Report whether the test node's State contains "DRAIN".
#
# Returns 0 when no test node is set; otherwise the result of matching
# DRAIN against the node's "State" parameter (1 on match, 0 otherwise).
proc is_node_drain {} {
	global testnode

	if {$testnode != ""} {
		return [regexp {DRAIN} [get_node_param $testnode "State"]]
	}

	return 0
}
# Resume the test node through scontrol, but only when is_node_drain
# reports it as draining/drained. The test name is recorded as the reason.
proc resume_node {} {
	global testnode scontrol test_name

	if {![is_node_drain]} {
		return
	}

	run_command -fail "$scontrol update nodename=$testnode state=resume reason=$test_name"
}
# Undo what this test changed: resume the test node if it is drained, put
# the saved plugstack.conf back, reconfigure, and delete the built plugin.
proc cleanup { } {
	global shared_lib
	global spank_conf_file

	resume_node

	# Bring back the plugstack saved before the test replaced it
	restore_conf $spank_conf_file
	reconfigure

	# Remove the shared library built for this test
	file delete $shared_lib
}
#
# Compile the SPANK plugin as a shared library
#
cd $test_dir
log_debug "Using slurm_dir $slurm_dir"
compile_against_libslurm -shared $file_prog "-ggdb3 -Wall"

#
# Save the current plugstack.conf, then replace it with one that loads
# only the plugin built above
#
save_conf $spank_conf_file

# Create the plugin's output file up front (as the user running the test)
# to avoid root umask issues
exec $bin_touch $spank_out

exec $bin_echo "required $shared_lib ${spank_out}" > $new_spank_conf
run_command_output -fail "$bin_cp -f $new_spank_conf $spank_conf_file"
reconfigure

# Make sure neither trigger variable is set yet, so that SPANK does not
# kill any job before a scenario asks for it
if {[info exists env(TEST_FUNC)]} {
	unset env(TEST_FUNC)
}
if {[info exists env(TEST_CTXT)]} {
	unset env(TEST_CTXT)
}
# Run the common checks for one SPANK failure scenario.
#
# Arguments:
#   function        - full callback name; the "slurm_spank_" prefix is
#                     stripped to build the expected error message
#   context         - accepted for symmetry with the callers; not used here
#   expected_rc     - value the incoming rc must equal
#   reports         - rc expected in the "failed with rc=" message found in
#                     output (the parsed value stays 0 when no message matches)
#   drains          - whether the test node is expected to be in DRAIN state
#   fails           - whether the job is expected to be FAILED or CANCELLED
#   job_id_assigned - whether a job id is expected to exist
#   job_id          - job id extracted by the caller (0 when none)
#   rc              - return code obtained by the caller
#   output          - text searched for the SPANK error message
#
# Returns the value of the final (node drain) subtest.
proc process_result {function context expected_rc reports drains fails job_id_assigned job_id rc output} {
	subtest {$rc == $expected_rc} "Verify return code reported:$rc == expected:$expected_rc"

	# The API failure being searched for looks like:
	# srun: error: spank: required plugin test7.24.prog.so: init() failed with rc=-3000
	set prefix_last [expr {[string length "slurm_spank_"] - 1}]
	set short_function [string replace $function 0 $prefix_last]
	set spank_error_re "error: spank: required plugin \\S+: $short_function\\\(\\\) failed with rc=(-?\\d+)"

	# From here on rc holds the code parsed from the message (0 if absent)
	set rc 0

	# TODO: Temporary debug traces for rare failures.
	# Remove once ticket 19090 is fixed.
	log_debug "Expected RC: ${reports}"
	log_debug "Output: ${output}"
	log_debug "Regexp string: $spank_error_re"

	regexp $spank_error_re $output wholematch rc
	subtest {$rc == $reports} \
	    [concat "SPANK API failure report expected rc:$reports == rc:$rc" ]

	if {!$job_id_assigned} {
		subtest {$job_id == 0} "jobid should not have been assigned"
	} elseif {[subtest {$job_id > 0} "jobid != 0"]} {
		# Compare the final job state with the expectation
		set jobs_dict [get_jobs $job_id]
		set job_state [dict get $jobs_dict $job_id "JobState"]
		set failed [expr {$job_state eq "FAILED" || $job_state eq "CANCELLED"}]
		set failstr [expr {$fails ? "FAILED" : "!FAILED"}]
		subtest {$fails == $failed} "JobId=$job_id state: expected=$failstr == actual=$job_state"
	}

	# Finally compare the node's drain state with the expectation
	set drained [is_node_drain]
	set wanted_txt [expr {$drains ? "should drain" : "should not drain"}]
	set actual_txt [expr {$drained ? "drained" : "did not drain"}]
	subtest {$drains == $drained} \
	    [concat "expect node to drain: " $wanted_txt " == " $actual_txt]
}
# Run one interactive (srun/salloc) scenario.
#
# TEST_FUNC and TEST_CTXT are exported through env before the command is
# run. The job id is taken from the "[Job: N] Found (function,context)"
# line in the command output.
#
# Arguments:
#   command - command line to run
#   function, context, expected_rc, reports, drains, fails,
#   job_id_assigned - passed through unchanged to process_result
#
# Returns the value of process_result, or $::RETURN_ERROR when the expected
# line is missing from the output.
proc test_interactive {command function context expected_rc reports drains fails job_id_assigned} {
	global scontrol number test_name
	global env

	resume_node

	# Select the callback and context for this scenario, then run the command
	set env(TEST_FUNC) $function
	set env(TEST_CTXT) $context
	set result [run_command -none "$command"]
	set rc     [dict get $result exit_code]
	set output [dict get $result output]

	# The regexp inside the subtest also captures job_id on a match
	set logs_found [subtest {[regexp "\\\[Job: ($number)\\\] Found \\\($function,$context\\\)" $output {} job_id]} "verify SPANK plugin printed debug logs"]
	if {$logs_found} {
		wait_for_job $job_id "DONE"
		return [process_result $function $context $expected_rc $reports $drains $fails $job_id_assigned $job_id $rc $output]
	}

	return $::RETURN_ERROR
}
# Run one sbatch scenario.
#
# TEST_FUNC and TEST_CTXT are exported through env before $sbatch_test is
# run. The sbatch exit code, the presence of a job id and any SPANK error
# printed by sbatch are checked first; the remaining checks are delegated
# to process_result, fed with the job's output file when one is expected.
#
# Arguments:
#   function        - full SPANK callback name (slurm_spank_*)
#   context         - value exported as TEST_CTXT
#   expected_rc     - passed to process_result
#   sbatch_rc       - expected exit code of the sbatch command
#   sbatch_reports  - expected rc in sbatch's own "failed with rc=" message;
#                     when sbatch prints no such message the sbatch exit code
#                     is compared instead
#   reports, drains, fails - passed to process_result
#   job_id_assigned - whether sbatch is expected to print a job id
#   outputs         - whether the job is expected to produce $file_out
#
# Returns the value of process_result, or $::RETURN_ERROR when the expected
# output file is missing or empty.
proc test_batch {function context expected_rc sbatch_rc sbatch_reports reports drains fails job_id_assigned outputs} {
	global number file_out bin_cat bin_rm
	global env sbatch_test

	resume_node

	# Select the callback and context for this scenario, then submit
	set env(TEST_FUNC) "$function"
	set env(TEST_CTXT) "$context"
	set result [run_command -none "$sbatch_test"]
	set rc [dict get $result exit_code]
	set output [dict get $result output]
	subtest {$rc == $sbatch_rc} "Verify sbatch return code:$rc == expected:$sbatch_rc"

	# Extract job id from command output; it stays 0 when none was printed
	set job_id 0
	regexp "Submitted batch job ($number)" $output {} job_id
	subtest {$job_id_assigned == ($job_id != 0)} "job_id != 0"

	# Look for api failure. On a match rc is replaced by the reported code,
	# otherwise it keeps the sbatch exit code.
	set short_function [string replace $function 0 [expr {[string length "slurm_spank_"]-1}]]
	regexp "error: spank: required plugin \\S+: $short_function\\\(\\\) failed with rc=(-?\\d+)" $output wholematch rc
	subtest {$rc == $sbatch_reports} \
	    "SPANK API failure report expected sbatch rc:$sbatch_reports == reported:$rc"

	# There is nothing to wait for when the submission produced no job id
	if {$job_id != 0} {
		wait_for_job $job_id "DONE"
	}

	# Job must run to produce file output, which in some cases it should not
	if {!$outputs} {
		subtest {![file exists $file_out]} "verify output not made"
		return [process_result $function $context $expected_rc $reports $drains $fails $job_id_assigned $job_id $rc ""]
	}

	if {[wait_for_file -timeout 5 -pollinterval .1 $file_out]} {
		# Reported under the same name as the subtest below
		subfail "verify output made" "output missing"
		return $::RETURN_ERROR
	}

	set output [run_command_output -fail "$bin_cat $file_out"]
	# Remove the output file so we don't use it in the next tuple
	exec $bin_rm -f $file_out

	if {[subtest {$output != ""} "verify output made"]} {
		return [process_result $function $context $expected_rc $reports $drains $fails $job_id_assigned $job_id $rc $output]
	}

	return $::RETURN_ERROR
}
# srun scenarios. The values after the context on each line are, in order:
#   expected_rc  reports  drains  fails  job_id_assigned
log_info "Testing srun.........."
set srun_test "$srun -K -I10 -W10 -w $testnode -t1 true"

testproc test_interactive $srun_test slurm_spank_init                 local  $::RETURN_ERROR   -3000 0 1 0
testproc test_interactive $srun_test slurm_spank_init_post_opt        local  $::RETURN_ERROR   -3000 0 1 0
testproc test_interactive $srun_test slurm_spank_local_user_init      local  $::RETURN_ERROR   -3000 0 1 1
testproc test_interactive $srun_test slurm_spank_user_init            remote $::RETURN_ERROR   -3000 0 1 1
testproc test_interactive $srun_test slurm_spank_task_init_privileged remote $::RETURN_ERROR   -3000 0 1 1

# The expectations for task_post_fork differ when pgid is set
if {$pgid} {
	testproc test_interactive $srun_test slurm_spank_task_post_fork remote $::RETURN_TIMEOUT -3000 0 0 1
} else {
	testproc test_interactive $srun_test slurm_spank_task_post_fork remote $::RETURN_ERROR   -3000 0 1 1
}

testproc test_interactive $srun_test slurm_spank_task_init            remote $::RETURN_ERROR   -3000 0 1 1
testproc test_interactive $srun_test slurm_spank_task_exit            remote $::RETURN_SUCCESS -3000 0 0 1
testproc test_interactive $srun_test slurm_spank_exit                 local  $::RETURN_SUCCESS -3000 0 1 0
# salloc scenarios. The values after the context on each line are, in order:
#   expected_rc  reports  drains  fails  job_id_assigned
log_info "Testing salloc.........."
set salloc_test "$salloc -I10 -w$testnode -t1 $srun true"

testproc test_interactive $salloc_test slurm_spank_init                 allocator $::RETURN_ERROR   -3000 0 1 0
testproc test_interactive $salloc_test slurm_spank_init_post_opt        allocator $::RETURN_ERROR   -3000 0 1 0
testproc test_interactive $salloc_test slurm_spank_init                 local     $::RETURN_ERROR   -3000 0 1 0
testproc test_interactive $salloc_test slurm_spank_init_post_opt        local     $::RETURN_ERROR   -3000 0 1 0
testproc test_interactive $salloc_test slurm_spank_local_user_init      local     $::RETURN_ERROR   -3000 0 1 1
testproc test_interactive $salloc_test slurm_spank_user_init            remote    $::RETURN_ERROR   -3000 0 1 1
testproc test_interactive $salloc_test slurm_spank_task_init_privileged remote    $::RETURN_ERROR   -3000 0 1 1

# The expectations for task_post_fork differ when pgid is set
if {$pgid} {
	testproc test_interactive $salloc_test slurm_spank_task_post_fork remote $::RETURN_TIMEOUT -3000 0 0 1
} else {
	testproc test_interactive $salloc_test slurm_spank_task_post_fork remote $::RETURN_ERROR   -3000 0 1 1
}

testproc test_interactive $salloc_test slurm_spank_task_init            remote    $::RETURN_ERROR   -3000 0 1 1
testproc test_interactive $salloc_test slurm_spank_task_exit            remote    $::RETURN_SUCCESS -3000 0 0 1
testproc test_interactive $salloc_test slurm_spank_exit                 local     $::RETURN_SUCCESS -3000 0 1 0
testproc test_interactive $salloc_test slurm_spank_exit                 allocator $::RETURN_SUCCESS -3000 0 1 0
# sbatch scenarios. The values after the context on each line are, in order:
#   expected_rc  sbatch_rc  sbatch_reports  reports  drains  fails
#   job_id_assigned  outputs
log_info "Testing sbatch.........."
make_bash_script $file_in "$srun $bin_echo IT_RAN"
set sbatch_test "$sbatch --no-requeue -W -w $testnode -t1 -o $file_out -e $file_out $file_in"

testproc test_batch slurm_spank_init                 allocator -3000             1 -3000 0     0 1 0 0
testproc test_batch slurm_spank_init_post_opt        allocator -3000             1 -3000 0     0 1 0 0
testproc test_batch slurm_spank_init                 local     1                 1 1     -3000 0 1 1 1
testproc test_batch slurm_spank_init_post_opt        local     $::RETURN_ERROR   1 1     -3000 0 1 1 1
testproc test_batch slurm_spank_local_user_init      local     $::RETURN_ERROR   1 1     -3000 0 1 1 1

# The expectations for user_init differ when pgid is set
if {$pgid} {
	testproc test_batch slurm_spank_user_init remote $::RETURN_SUCCESS 0 0 -3000 0 0 1 1
} else {
	testproc test_batch slurm_spank_user_init remote $::RETURN_ERROR   1 1 -3000 1 1 1 1
}

testproc test_batch slurm_spank_task_init_privileged remote    $::RETURN_ERROR   1 1     -3000 0 1 1 1

# The expectations for task_post_fork differ when pgid is set
if {$pgid} {
	testproc test_batch slurm_spank_task_post_fork remote $::RETURN_TIMEOUT 0 0 -3000 1 0 1 1
} else {
	testproc test_batch slurm_spank_task_post_fork remote $::RETURN_ERROR   1 1 -3000 1 1 1 1
}

testproc test_batch slurm_spank_task_init            remote    $::RETURN_ERROR   1 1     -3000 0 1 1 1
testproc test_batch slurm_spank_task_exit            remote    $::RETURN_SUCCESS 0 0     -3000 0 0 1 1
testproc test_batch slurm_spank_exit                 local     $::RETURN_SUCCESS 0 0     -3000 0 0 1 1
testproc test_batch slurm_spank_exit                 allocator -3000             0 -3000 0     0 0 1 1