#!/usr/bin/env expect
############################################################################
# Purpose: Test local and remote job dependencies
#
# Reqs: 1. Using slurmdbd accounting storage type and is up
# 2. fed_slurm_base is defined in globals.local - set to directory that
# has access to each federation configure (fedc1, fedc2, fedc3).
# Eg.
# fedr/slurm/ (src)
# fedr/fed1/bin
# fedr/fed1/sbin
# fedr/fed1/etc
# fedr/fed1/...
# fedr/fed2/...
# fedr/fed3/...
# 3. controllers are up and running.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
# Short aliases for the three federation cluster names (fedc1..fedc3 are
# expected to be set by the globals files sourced above).
set c1 $fedc1
set c2 $fedc2
set c3 $fedc3

# Script-level job id placeholders.
set job_id1 0
set job_id2 0

set user_name [get_my_user_name]

# Batch scripts handed to sbatch by the tests below.
set file_in_long  "${test_dir}/long.in"
set file_in_short "${test_dir}/short.in"

# scancel/scontrol binaries taken from the first cluster's install tree.
set my_scancel  "$fed_slurm_base/${c1}/bin/scancel"
set my_scontrol "$fed_slurm_base/${c1}/bin/scontrol"

# Filled in by get_job_dependency on every call.
set reason {}
set dependency {}

set fed_name "fed_${test_name}"
set all_clusters [join [list $c1 $c2 $c3] ","]
###############################################################################
# Functions
###############################################################################
# Cancel every job submitted by this test on all federation clusters.
#
# Jobs are selected by name. submit_job names each job "${test_name}_job",
# so the very same expression is used here to guarantee that the name
# passed to scancel matches what was submitted.
#
# Runs scancel against $all_clusters, waits for it to exit, then pauses for
# five seconds before returning.
proc cancel_federation_jobs { } {
	global my_scancel all_clusters test_name

	spawn $my_scancel -M$all_clusters --jobname ${test_name}_job
	expect {
		eof {
			# Reap the scancel process.
			wait
		}
	}
	sleep 5
}
# Cancel one job and block until it is reported as finished.
#
# job_id   - id handed verbatim to scancel
# clusters - cluster name(s) passed on to wait_for_fed_job for the
#            "DONE" wait
#
# Returns whatever wait_for_fed_job returns.
proc cancel_job { job_id clusters } {
	global my_scancel

	spawn $my_scancel $job_id
	# Read scancel's output until it exits, then reap it.
	expect eof { wait }

	wait_for_fed_job $job_id "DONE" $clusters
}
# Submit a one-minute batch job named "${test_name}_job" and return its id.
#
# options - extra sbatch options as a single string (may hold several words)
# cdir    - cluster directory under $fed_slurm_base whose sbatch is used
# file_in - batch script to submit
#
# Calls fail if sbatch times out or no job id could be parsed.
proc submit_job { options cdir file_in } {
	global sbatch number fed_slurm_base test_name

	# Use the per-cluster sbatch, or the default one when
	# check_federation_setup reports false.
	set cluster_sbatch "$fed_slurm_base/$cdir/bin/sbatch"
	if {[check_federation_setup]} {
		set my_sbatch $cluster_sbatch
	} else {
		set my_sbatch $sbatch
	}

	# The command is assembled as one string and then expanded into words
	# by {*}, which is what splits $options into individual arguments.
	set words [list $my_sbatch "--job-name=${test_name}_job" -t1 \
		$options --output=/dev/null $file_in]
	set command [join $words " "]

	set job_id 0
	spawn {*}$command
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}

	if {$job_id == 0} {
		fail "Failed to submit job"
	}
	return $job_id
}
# Query "scontrol show job" and extract the job's Reason and Dependency.
#
# job_id - job to inspect
#
# Side effects: overwrites the globals $reason and $dependency (both are
# reset to "" first, so they stay empty when nothing matches).
# Returns the dependency string. Calls fail if scontrol times out.
proc get_job_dependency { job_id } {
	global my_scontrol reason dependency re_word_str

	set reason ""
	set dependency ""

	set reason_match "\[a-zA-Z_\]+"

	# Possible dependency syntax:
	#   <type>:<jobid>+<time>(state)
	#   <type>:<jobid>(state)
	#   singleton(unfulfilled)
	# Notes:
	# * Multiple dependencies are separated by a comma or a question mark
	# * The state for singleton will only ever be "unfulfilled"
	# * The state is either failed or unfulfilled. Fulfilled dependencies
	#   are cleared from the list
	# * When there are no dependencies, it will be this string: "(null)"
	#
	# The resulting regex has this shape:
	#   (<type>:<jobid_time>\(<state>\),*\?*|singleton\(unfulfilled\),*\?*)+|\(null\)
	#
	# Each fragment below is a complete regex piece on its own. In
	# particular $state does not end in a dangling backslash that would
	# silently escape the first character of whatever follows it.
	set type "\[a-zA-Z_\]+"
	set jobid_time "\[0-9_*+\]+"
	set state "\\($re_word_str\\)"
	set delim ",*\\?*"
	set depend_regex "($type:$jobid_time$state$delim|singleton\\(unfulfilled\\)$delim)+"
	set no_depend "\\(null\\)"
	set depend_match "$depend_regex|$no_depend"

	# Keep the scontrol output out of the test log; only the summary line
	# below is reported.
	log_user 0
	spawn $my_scontrol show job $job_id
	expect {
		-re "Reason=($reason_match) Dependency=($depend_match)" {
			set reason $expect_out(1,string)
			set dependency $expect_out(2,string)
			exp_continue
		}
		timeout {
			fail "scontrol not responding"
		}
		eof {
			wait
		}
	}
	log_user 1

	log_info "job $job_id; actual reason: \"$reason\"; dependency: \"$dependency\""
	return $dependency
}
# Compare a job's current reason/dependency with the expected values.
#
# Refreshes the globals $reason and $dependency via get_job_dependency.
# Returns 0 when both strings match exactly, 1 otherwise.
proc check_depend { job_id expected_reason expected_dependency } {
	global reason dependency

	get_job_dependency $job_id

	set reason_ok [expr {$reason eq $expected_reason}]
	set depend_ok [expr {$dependency eq $expected_dependency}]
	return [expr {!($reason_ok && $depend_ok)}]
}
# Poll a job until its reason and dependency equal the expected values.
#
# job_id              - job to poll
# expected_reason     - expected "Reason=" value
# expected_dependency - expected "Dependency=" value
#
# Returns 0 as soon as check_depend reports a match. Calls fail when the
# job reaches DependencyNeverSatisfied although that was not expected, or
# when $max_delay seconds of polling have elapsed without a match.
proc wait_for_depend { job_id expected_reason expected_dependency } {
	global reason dependency

	set error 0
	set my_delay 0

	# By default we test remote dependencies every 30 seconds.
	# Once the dependencies are cleared, we have to wait for the job to be
	# scheduled which could be another 30-60 seconds depending on the
	# main scheduler and backfill scheduler timing.
	set max_delay 120
	set poll_interval 3

	# DependencyNeverSatisfied is only an error when the caller is not
	# waiting for exactly that reason.
	set want_never_satisfied \
		[expr {$expected_reason eq "DependencyNeverSatisfied"}]

	log_info "job $job_id; expected reason: \"$expected_reason\"; dependency: \"$expected_dependency\""

	while 1 {
		if {![check_depend $job_id $expected_reason $expected_dependency]} {
			return 0
		}

		if {(!$want_never_satisfied) && $reason eq "DependencyNeverSatisfied"} {
			log_error "Job dependency failed, but it shouldn't have."
			set error 1
		}
		if {$my_delay >= $max_delay} {
			log_info "delay $my_delay max $max_delay"
			log_error "Timeout waiting for dependency to change."
			set error 1
		}
		if {$error} {
			fail "Job ($job_id) actual (reason=\"$reason\"; dependency=\"$dependency\") != expected (reason=\"$expected_reason\"; dependency=\"$expected_dependency\")"
		}

		exec sleep $poll_interval
		incr my_delay $poll_interval
	}
}
# Wait for a job to reach a state and insist it happened on one cluster.
#
# Wraps wait_for_fed_job and calls fail when the cluster name it returns
# differs from $cluster.
proc my_wait_for_fed_job { job_id state cluster } {
	set seen_on [wait_for_fed_job $job_id $state $cluster]
	if {$seen_on eq $cluster} {
		return
	}
	fail "Job ($job_id) did not reach state $state on cluster ($cluster)"
}
# Report whether a job record exists locally on the given cluster.
#
# job_id  - job to look up
# cluster - cluster queried with "scontrol -M<cluster> --local show job"
#
# Returns 1 when scontrol prints the job record and 0 when it prints
# "Invalid job id specified". Calls fail if scontrol times out or exits
# without producing either answer, so callers always get a boolean.
proc is_job_on_cluster { job_id cluster } {
	global my_scontrol

	# -1 means "no recognisable answer seen yet".
	set found -1

	log_user 0
	spawn $my_scontrol -M$cluster --local -o show job $job_id
	expect {
		-re "JobId=$job_id" {
			set found 1
			exp_continue
		}
		-re "Invalid job id specified" {
			set found 0
			exp_continue
		}
		timeout {
			log_user 1
			fail "scontrol not responding"
		}
		eof {
			# Always reap scontrol, whichever answer was seen.
			wait
		}
	}
	# Restore logging on every path, including plain eof.
	log_user 1

	if {$found == 1} {
		log_info "Found job $job_id in cluster $cluster"
		return 1
	}
	if {$found == 0} {
		log_info "Did not find job $job_id in cluster $cluster"
		return 0
	}
	fail "Unable to determine whether job ($job_id) is in cluster ($cluster)"
}
# "after" dependency with both jobs on cluster $c1.
#
# The parent is submitted with --begin=now+5. The child must first show
# the unfulfilled dependency, and once the parent is RUNNING the
# dependency must clear and the child must run. Both jobs are cancelled
# at the end.
proc test_local_after { } {
	global c1 file_in_long

	log_info "#############################################################"
	log_info "# Test local after"
	log_info "#############################################################"

	# Local dependency succeeds
	log_info "after: test that local dependency succeeds:"
	set parent [submit_job "-M$c1 --begin=now+5" $c1 $file_in_long]
	set child [submit_job "--depend=after:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "after:${parent}(unfulfilled)"

	wait_for_depend $child "Dependency" $pending_dep
	my_wait_for_fed_job $parent "RUNNING" $c1
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1

	cancel_job $parent $c1
	cancel_job $child $c1
}
# "after" dependency on a job that lives on another cluster, followed by
# an "after:<jobid>+<time>" check.
#
# Part 1: parent on $c2, child on $c1; the dependency must clear once the
#         parent is RUNNING.
# Part 2: both jobs on $c1 with "after:<parent>+1"; the child must still be
#         dependent after 45 seconds and must eventually be released.
proc test_remote_after { } {
	global c1 c2 file_in_long bin_sleep

	log_info "#############################################################"
	log_info "# Test remote after"
	log_info "#############################################################"

	# Remote dependency succeeds
	log_info "after: test that remote dependency succeeds:"
	set parent [submit_job "-M$c2 --begin=now+5" $c2 $file_in_long]
	set child [submit_job "--depend=after:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "after:${parent}(unfulfilled)"

	wait_for_depend $child "Dependency" $pending_dep
	my_wait_for_fed_job $parent "RUNNING" $c2
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $parent $c2
	cancel_job $child $c1

	# Test after with a time attached. file_in_long sleeps for 60 seconds.
	log_info "after: test that a after+time works:"
	set parent [submit_job "-M$c1" $c1 $file_in_long]
	set child [submit_job "--depend=after:${parent}+1 -M$c1" $c1 \
		$file_in_long]
	set timed_dep "after:${parent}+1(unfulfilled)"

	my_wait_for_fed_job $parent "RUNNING" $c1
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" $timed_dep

	log_info "Check that job $child is still dependent after 45 seconds"
	exec $bin_sleep 45
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" $timed_dep

	log_info "Wait for job $child dependency to be fulfilled"
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $child $c1

	# After dependency never fails.
}
# "afterany" dependency with both jobs on cluster $c1.
#
# The child must stay dependent while the parent runs and must be
# released once the parent has been cancelled.
proc test_local_afterany { } {
	global c1 file_in_long

	log_info "#############################################################"
	log_info "# Test local afterany"
	log_info "#############################################################"

	# Local dependency succeeds
	log_info "afterany: test that local dependency succeeds:"
	set parent [submit_job "-M$c1" $c1 $file_in_long]
	set child [submit_job "--depend=afterany:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "afterany:${parent}(unfulfilled)"

	wait_for_depend $child "Dependency" $pending_dep
	my_wait_for_fed_job $parent "RUNNING" $c1

	cancel_job $parent $c1
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1

	cancel_job $child $c1
}
# "afterany" dependency on a remote job, plus the legacy
# "--depend=jobid[,jobid...]" syntax.
#
# Part 1: parent on $c2, child on $c1; cancelling the parent releases the
#         child.
# Part 2: bare job ids given to --depend must be reported as afterany
#         dependencies; all test jobs are then cancelled by name.
proc test_remote_afterany { } {
	global c1 c2 file_in_long

	log_info "#############################################################"
	log_info "# Test remote afterany"
	log_info "#############################################################"

	# Remote dependency succeeds
	log_info "afterany: test that remote dependency succeeds:"
	set parent [submit_job "-M$c2" $c2 $file_in_long]
	set child [submit_job "--depend=afterany:$parent -M$c1" $c1 \
		$file_in_long]

	wait_for_depend $child "Dependency" "afterany:${parent}(unfulfilled)"
	my_wait_for_fed_job $parent "RUNNING" $c2
	cancel_job $parent $c2
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $child $c1

	# Test old syntax: --depend=jobid,jobid
	log_info "afterany: test old syntax: --depend=jobid\[,jobid,jobid...\]"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "--depend=$job_a -M$c1" $c1 \
		$file_in_long]
	set job_c [submit_job "--depend=$job_a,$job_b -M$c1" $c1 \
		$file_in_long]
	set dep_on_a "afterany:${job_a}(unfulfilled)"
	set dep_on_b "afterany:${job_b}(unfulfilled)"

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "PENDING" $c1
	my_wait_for_fed_job $job_c "PENDING" $c1
	wait_for_depend $job_b "Dependency" $dep_on_a
	wait_for_depend $job_c "Dependency" "$dep_on_a,$dep_on_b"

	cancel_federation_jobs
}
# "aftercorr" dependency between two 2-task job arrays on cluster $c1.
#
# Success case: the parent array runs the short script; once both parent
#               tasks are DONE both child tasks must be released and run.
# Failure case: each parent task is cancelled in turn and the matching
#               child task must become DependencyNeverSatisfied.
proc test_local_aftercorr { } {
	global c1 kill_invalid_depend file_in_long file_in_short

	# Array task ids used by every --array=1-2 submission below.
	set tasks {1 2}

	log_info "#############################################################"
	log_info "# Test local aftercorr"
	log_info "#############################################################"

	# Local dependency succeeds
	log_info "aftercorr: test that local dependency succeeds:"
	set parent [submit_job "-M$c1 --array=1-2" $c1 $file_in_short]
	set child [submit_job "--depend=aftercorr:$parent -M$c1 \
		--array=1-2" $c1 $file_in_long]
	set pending_dep "aftercorr:${parent}_*(unfulfilled)"

	foreach t $tasks {
		my_wait_for_fed_job "${parent}_$t" "RUNNING" $c1
	}
	foreach t $tasks {
		my_wait_for_fed_job "${child}_$t" "PENDING" $c1
	}
	foreach t $tasks {
		wait_for_depend "${child}_$t" "Dependency" $pending_dep
	}
	foreach t $tasks {
		my_wait_for_fed_job "${parent}_$t" "DONE" $c1
	}
	foreach t $tasks {
		wait_for_depend "${child}_$t" "None" "(null)"
	}
	foreach t $tasks {
		my_wait_for_fed_job "${child}_$t" "RUNNING" $c1
	}
	cancel_job $child $c1

	# Local dependency fails
	log_info "aftercorr: test that local dependency fails:"
	set parent [submit_job "-M$c1 --array=1-2" $c1 $file_in_long]
	set child [submit_job "--depend=aftercorr:$parent -M$c1 \
		--array=1-2" $c1 $file_in_long]
	set pending_dep "aftercorr:${parent}_*(unfulfilled)"
	set failed_dep "aftercorr:${parent}_*(failed)"

	foreach t $tasks {
		my_wait_for_fed_job "${parent}_$t" "RUNNING" $c1
	}
	foreach t $tasks {
		my_wait_for_fed_job "${child}_$t" "PENDING" $c1
	}
	foreach t $tasks {
		wait_for_depend "${child}_$t" "Dependency" $pending_dep
	}
	# Cancel one parent task at a time and check its counterpart.
	foreach t $tasks {
		cancel_job "${parent}_$t" $c1
		wait_for_depend "${child}_$t" "DependencyNeverSatisfied" \
			$failed_dep
	}

	if {!$kill_invalid_depend} {
		cancel_job "$child" $c1
	}
}
# "aftercorr" dependency where the parent array is on $c2 and the
# dependent array is on $c1.
#
# Mirrors the local aftercorr test: a success case with the short parent
# script and a failure case where the parent tasks are cancelled one by
# one.
proc test_remote_aftercorr { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	# Array task ids used by every --array=1-2 submission below.
	set tasks {1 2}

	log_info "#############################################################"
	log_info "# Test remote aftercorr"
	log_info "#############################################################"

	# Remote dependency succeeds
	log_info "aftercorr: test that remote dependency succeeds:"
	set parent [submit_job "-M$c2 --array=1-2" $c2 $file_in_short]
	set child [submit_job "--depend=aftercorr:$parent -M$c1 \
		--array=1-2" $c1 $file_in_long]
	# The dependency on the remote side has _*, but the dependency locally
	# doesn't because it couldn't find the remote job.
	set pending_dep "aftercorr:${parent}(unfulfilled)"

	foreach t $tasks {
		my_wait_for_fed_job "${parent}_$t" "RUNNING" $c2
	}
	foreach t $tasks {
		my_wait_for_fed_job "${child}_$t" "PENDING" $c1
	}
	foreach t $tasks {
		wait_for_depend "${child}_$t" "Dependency" $pending_dep
	}
	foreach t $tasks {
		my_wait_for_fed_job "${parent}_$t" "DONE" $c2
	}
	foreach t $tasks {
		wait_for_depend "${child}_$t" "None" "(null)"
	}
	foreach t $tasks {
		my_wait_for_fed_job "${child}_$t" "RUNNING" $c1
	}
	cancel_job $child $c1

	# Remote dependency fails
	log_info "aftercorr: test that remote dependency fails:"
	set parent [submit_job "-M$c2 --array=1-2" $c2 $file_in_long]
	set child [submit_job "--depend=aftercorr:$parent -M$c1 \
		--array=1-2" $c1 $file_in_long]
	set pending_dep "aftercorr:${parent}(unfulfilled)"
	set failed_dep "aftercorr:${parent}(failed)"

	foreach t $tasks {
		my_wait_for_fed_job "${parent}_$t" "RUNNING" $c2
	}
	foreach t $tasks {
		my_wait_for_fed_job "${child}_$t" "PENDING" $c1
	}
	foreach t $tasks {
		wait_for_depend "${child}_$t" "Dependency" $pending_dep
	}
	# Cancel one parent task at a time and check its counterpart.
	foreach t $tasks {
		cancel_job "${parent}_$t" $c2
		wait_for_depend "${child}_$t" "DependencyNeverSatisfied" \
			$failed_dep
	}

	if {!$kill_invalid_depend} {
		cancel_job "$child" $c1
	}
}
# "afterok" dependency with both jobs on cluster $c1.
#
# Success case: the parent runs the short script to completion and the
#               child must then be released.
# Failure case: the parent is cancelled and the child must become
#               DependencyNeverSatisfied.
proc test_local_afterok { } {
	global c1 kill_invalid_depend file_in_long file_in_short

	log_info "#############################################################"
	log_info "# Test local afterok"
	log_info "#############################################################"

	# Local dependency succeeds
	log_info "afterok: test that local dependency succeeds:"
	set parent [submit_job "-M$c1" $c1 $file_in_short]
	set child [submit_job "--depend=afterok:$parent -M$c1" $c1 \
		$file_in_long]

	my_wait_for_fed_job $parent "RUNNING" $c1
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" "afterok:${parent}(unfulfilled)"
	my_wait_for_fed_job $parent "DONE" $c1
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $child $c1

	# Local dependency fails
	log_info "afterok: test that local dependency fails:"
	set parent [submit_job "-M$c1" $c1 $file_in_long]
	set child [submit_job "--depend=afterok:$parent -M$c1" $c1 \
		$file_in_long]

	my_wait_for_fed_job $parent "RUNNING" $c1
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" "afterok:${parent}(unfulfilled)"
	cancel_job $parent $c1
	wait_for_depend $child "DependencyNeverSatisfied" \
		"afterok:${parent}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $child $c1
	}
}
# "afterok" dependency where the parent is on $c2 and the child on $c1.
#
# Success case: the short parent finishes and the child is released.
# Failure case: the parent is cancelled and the child must become
#               DependencyNeverSatisfied.
proc test_remote_afterok { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	log_info "#############################################################"
	log_info "# Test remote afterok"
	log_info "#############################################################"

	# Remote dependency succeeds
	log_info "afterok: test that remote dependency succeeds:"
	set parent [submit_job "-M$c2" $c2 $file_in_short]
	set child [submit_job "--depend=afterok:$parent -M$c1" $c1 \
		$file_in_long]

	my_wait_for_fed_job $parent "RUNNING" $c2
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" "afterok:${parent}(unfulfilled)"
	my_wait_for_fed_job $parent "DONE" $c2
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $child $c1

	# Remote dependency fails
	log_info "afterok: test that remote dependency fails"
	set parent [submit_job "-M$c2" $c2 $file_in_long]
	set child [submit_job "--depend=afterok:$parent -M$c1" $c1 \
		$file_in_long]

	my_wait_for_fed_job $parent "RUNNING" $c2
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" "afterok:${parent}(unfulfilled)"
	cancel_job $parent $c2
	wait_for_depend $child "DependencyNeverSatisfied" \
		"afterok:${parent}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $child $c1
	}
}
# "afternotok" dependency with both jobs on cluster $c1.
#
# Success case: the parent is cancelled, so the child must be released.
# Failure case: the short parent completes, so the child must become
#               DependencyNeverSatisfied.
proc test_local_afternotok { } {
	global c1 kill_invalid_depend file_in_long file_in_short

	log_info "#############################################################"
	log_info "# Test local afternotok"
	log_info "#############################################################"

	# Local dependency succeeds
	log_info "afternotok: test that local dependency succeeds:"
	set parent [submit_job "-M$c1" $c1 $file_in_long]
	set child [submit_job "--depend=afternotok:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "afternotok:${parent}(unfulfilled)"

	my_wait_for_fed_job $parent "RUNNING" $c1
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" $pending_dep
	cancel_job $parent $c1
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $child $c1

	# Local dependency fails
	log_info "afternotok: test that local dependency fails:"
	set parent [submit_job "-M$c1" $c1 $file_in_short]
	set child [submit_job "--depend=afternotok:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "afternotok:${parent}(unfulfilled)"

	my_wait_for_fed_job $parent "RUNNING" $c1
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" $pending_dep
	my_wait_for_fed_job $parent "DONE" $c1
	wait_for_depend $child "DependencyNeverSatisfied" \
		"afternotok:${parent}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $child $c1
	}
}
# "afternotok" dependency where the parent is on $c2 and the child on $c1.
#
# Success case: the parent is cancelled, so the child must be released.
# Failure case: the short parent completes, so the child must become
#               DependencyNeverSatisfied.
proc test_remote_afternotok { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	log_info "#############################################################"
	log_info "# Test remote afternotok"
	log_info "#############################################################"

	# Remote dependency succeeds
	log_info "afternotok: test that remote dependency succeeds:"
	set parent [submit_job "-M$c2" $c2 $file_in_long]
	set child [submit_job "--depend=afternotok:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "afternotok:${parent}(unfulfilled)"

	my_wait_for_fed_job $parent "RUNNING" $c2
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" $pending_dep
	cancel_job $parent $c2
	wait_for_depend $child "None" "(null)"
	my_wait_for_fed_job $child "RUNNING" $c1
	cancel_job $child $c1

	# Remote dependency fails
	log_info "afternotok: test that remote dependency fails"
	set parent [submit_job "-M$c2" $c2 $file_in_short]
	set child [submit_job "--depend=afternotok:$parent -M$c1" $c1 \
		$file_in_long]
	set pending_dep "afternotok:${parent}(unfulfilled)"

	my_wait_for_fed_job $parent "RUNNING" $c2
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" $pending_dep
	my_wait_for_fed_job $parent "DONE" $c2
	wait_for_depend $child "DependencyNeverSatisfied" \
		"afternotok:${parent}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $child $c1
	}
}
# "singleton" dependency with both jobs on cluster $c1.
#
# The second job must stay dependent while the first one runs and must
# be released once the first job has been cancelled.
proc test_local_singleton { } {
	global c1 file_in_long

	log_info "#############################################################"
	log_info "# Test local singleton"
	log_info "#############################################################"

	# Test one cluster
	log_info "singleton: test on one cluster"
	set running_job [submit_job "-M$c1" $c1 $file_in_long]
	set singleton_job [submit_job "--depend=singleton -M$c1" $c1 \
		$file_in_long]

	my_wait_for_fed_job $running_job "RUNNING" $c1
	my_wait_for_fed_job $singleton_job "PENDING" $c1
	wait_for_depend $singleton_job "Dependency" "singleton(unfulfilled)"

	cancel_job $running_job $c1
	wait_for_depend $singleton_job "None" "(null)"
	my_wait_for_fed_job $singleton_job "RUNNING" $c1

	cancel_job $singleton_job $c1
}
# "singleton" dependency in the presence of jobs on other clusters.
#
# Which scenario runs depends on $disable_remote_singleton:
#   set   - a job running on $c2 must not hold back the singleton job.
#   unset - the singleton job must wait until the jobs on all three
#           clusters are gone.
proc test_remote_singleton { } {
	global c1 c2 c3 disable_remote_singleton file_in_long

	log_info "#############################################################"
	log_info "# Test remote singleton"
	log_info "#############################################################"

	set pending_dep "singleton(unfulfilled)"

	# Test multiple clusters
	if {$disable_remote_singleton} {
		# Test that remote jobs don't affect the singleton dependency
		log_info "singleton: test that disable_remote_singleton works"
		set local_job [submit_job "-M$c1" $c1 $file_in_long]
		set remote_job [submit_job "-M$c2" $c2 $file_in_long]
		set singleton_job [submit_job "--depend=singleton -M$c1" $c1 \
			$file_in_long]

		my_wait_for_fed_job $local_job "RUNNING" $c1
		my_wait_for_fed_job $remote_job "RUNNING" $c2
		my_wait_for_fed_job $singleton_job "PENDING" $c1
		wait_for_depend $singleton_job "Dependency" $pending_dep

		# Cancel the local job - the singleton job should start
		# running even though the remote job is running on another
		# cluster
		cancel_job $local_job $c1
		wait_for_depend $singleton_job "None" "(null)"
		my_wait_for_fed_job $singleton_job "RUNNING" $c1

		cancel_job $remote_job $c2
		cancel_job $singleton_job $c1
		return
	}

	# Test that singleton doesn't get cleared until jobs on all
	# clusters are done
	log_info "singleton: test with jobs on all clusters"
	set job_on_c1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_on_c2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_on_c3 [submit_job "-M$c3" $c3 $file_in_long]
	set singleton_job [submit_job "--depend=singleton -M$c1" $c1 \
		$file_in_long]

	my_wait_for_fed_job $job_on_c1 "RUNNING" $c1
	my_wait_for_fed_job $job_on_c2 "RUNNING" $c2
	my_wait_for_fed_job $job_on_c3 "RUNNING" $c3
	my_wait_for_fed_job $singleton_job "PENDING" $c1
	wait_for_depend $singleton_job "Dependency" $pending_dep

	# The singleton job shouldn't start until the other three are done.
	# Test that it starts when a remote job is finished last.
	cancel_job $job_on_c1 $c1
	# Should still have the same dependency
	wait_for_depend $singleton_job "Dependency" $pending_dep
	cancel_job $job_on_c2 $c2
	cancel_job $job_on_c3 $c3

	# Now the dependency should be cleared
	wait_for_depend $singleton_job "None" "(null)"
	my_wait_for_fed_job $singleton_job "RUNNING" $c1
	cancel_job $singleton_job $c1
}
# Test adding/removing clusters from the federation.
#
# Removing a cluster from a federation should cause dependencies on
# jobs on that cluster to fail.
# Adding a cluster to a federation means that any singleton dependencies
# have to be fulfilled on that cluster.
#
# Cluster $c3 is removed from $fed_name during the test. It is added back
# on every path before returning, including the early return taken when
# $disable_remote_singleton is set, so the procs that run afterwards see
# the same three-cluster federation this one started with.
proc test_add_remove_clusters { } {
	global c1 c2 c3 fed_name file_in_long \
		disable_remote_singleton kill_invalid_depend

	log_info "#############################################################"
	log_info "# Test adding/removing a cluster from the federation"
	log_info "#############################################################"

	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job "-M$c3" $c3 $file_in_long]
	set job_id4 [submit_job "--depend=afterok:$job_id3 -M$c1" $c1 \
		$file_in_long]
	set job_id5 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]

	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "RUNNING" $c3
	my_wait_for_fed_job $job_id4 "PENDING" $c1
	wait_for_depend $job_id4 "Dependency" "afterok:$job_id3\(unfulfilled\)"
	wait_for_depend $job_id5 "Dependency" "singleton(unfulfilled)"

	log_info "Test that removing cluster $c3 from fed $fed_name makes dependencies on jobs on $c3 fail"
	if {[remove_cluster_from_fed $c3 $fed_name]} {
		fail "Unable to remove cluster ($c3) from federation ($fed_name)"
	}
	wait_for_depend $job_id4 "DependencyNeverSatisfied" \
		"afterok:$job_id3\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id4 $c1
	}

	if { $disable_remote_singleton } {
		# The remote singleton check below does not apply, but the
		# federation must still be put back the way it was found.
		log_info "Adding cluster $c3 back to federation $fed_name"
		if {[add_cluster_to_fed $c3 $fed_name]} {
			fail "Unable to add cluster ($c3) to federation ($fed_name)"
		}
		cancel_job $job_id1 $c1
		cancel_job $job_id2 $c2
		cancel_job $job_id3 $c3
		cancel_job $job_id5 $c1
		return
	}

	log_info "Test that the singleton dependency was resent back to cluster $c3 when it was added back to the federation."
	if {[add_cluster_to_fed $c3 $fed_name]} {
		fail "Unable to add cluster ($c3) to federation ($fed_name)"
	}
	cancel_job $job_id1 $c1
	cancel_job $job_id2 $c2
	cancel_job $job_id3 $c3
	wait_for_depend $job_id5 "None" "(null)"
	my_wait_for_fed_job $job_id5 "RUNNING" $c1
	cancel_job $job_id5 $c1
}
# Dependent job submitted to all three clusters at once.
#
# While the dependency is unfulfilled the job must not be found on $c2
# or $c3. After the parent is cancelled the job (held by --begin=now+60)
# must be PENDING on each of $c1, $c2 and $c3.
proc test_submit_to_all_clusters { } {
	global c1 c2 c3 file_in_long

	log_info "#############################################################"
	log_info "# Test submitting a dependent job to all clusters"
	log_info "#############################################################"
	log_info "Test that a dependent job is only on its origin cluster while dependent and that it is submitted to all clusters when its dependency is cleared."

	set parent [submit_job "-M$c2" $c2 $file_in_long]
	set child [submit_job "--depend=afternotok:$parent -M$c1,$c2,$c3 \
		--begin=now+60" $c1 $file_in_long]

	my_wait_for_fed_job $parent "RUNNING" $c2
	my_wait_for_fed_job $child "PENDING" $c1
	wait_for_depend $child "Dependency" \
		"afternotok:${parent}(unfulfilled)"

	log_info "Test that job $child is not on clusters $c2 or $c3."
	if {[is_job_on_cluster $child $c2] || [is_job_on_cluster $child $c3]} {
		fail "Job ($child) is in cluster ($c2 and/or $c3) when it shouldn't be."
	}

	log_info "Test that job $child is submitted to all sibling clusters $c2 and $c3 when its dependency is fulfilled."
	cancel_job $parent $c2
	wait_for_depend $child "BeginTime" "(null)"
	my_wait_for_fed_job $child "PENDING" $c1
	my_wait_for_fed_job $child "PENDING" $c2
	my_wait_for_fed_job $child "PENDING" $c3

	cancel_job $child "$c1,$c2,$c3"
}
# OR ("?") dependency lists where every job is on cluster $c1.
#
# Part 1: afternotok?afternotok - one fulfilled branch releases the job.
# Part 2: afterok?afterok - the job only becomes
#         DependencyNeverSatisfied after both branches have failed.
proc test_local_or_dependencies { } {
	global c1 file_in_long kill_invalid_depend

	log_info "#############################################################"
	log_info "# Test local OR dependencies"
	log_info "#############################################################"

	log_info "OR dependencies: Test that one fulfilled dependency makes the whole dependency fulfilled:"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "-M$c1" $c1 $file_in_long]
	set waiter [submit_job \
		"--depend=afternotok:${job_a}?afternotok:${job_b} -M$c1" \
		$c1 $file_in_long]

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "RUNNING" $c1
	my_wait_for_fed_job $waiter "PENDING" $c1
	wait_for_depend $waiter "Dependency" \
		"afternotok:${job_a}(unfulfilled)?afternotok:${job_b}(unfulfilled)"
	cancel_job $job_b $c1
	wait_for_depend $waiter "None" "(null)"
	my_wait_for_fed_job $waiter "RUNNING" $c1
	cancel_job $job_a $c1
	cancel_job $waiter $c1

	log_info "OR dependencies: Test that the dependency doesn't fail until all dependencies have failed:"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "-M$c1" $c1 $file_in_long]
	set waiter [submit_job "--depend=afterok:${job_a}?afterok:${job_b} \
		-M$c1" $c1 $file_in_long]

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "RUNNING" $c1
	my_wait_for_fed_job $waiter "PENDING" $c1
	wait_for_depend $waiter "Dependency" \
		"afterok:${job_a}(unfulfilled)?afterok:${job_b}(unfulfilled)"
	cancel_job $job_a $c1
	wait_for_depend $waiter "Dependency" \
		"afterok:${job_a}(failed)?afterok:${job_b}(unfulfilled)"
	cancel_job $job_b $c1
	wait_for_depend $waiter "DependencyNeverSatisfied" \
		"afterok:${job_a}(failed)?afterok:${job_b}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $waiter $c1
	}
}
# OR ("?") dependency lists mixing a local job ($c1) and a remote job
# ($c2).
#
# Part 1: afternotok?afternotok - cancelling the remote job releases the
#         dependent job.
# Part 2: afterok?afterok - the job only becomes
#         DependencyNeverSatisfied after both branches have failed.
proc test_remote_or_dependencies { } {
	global c1 c2 file_in_long kill_invalid_depend

	log_info "#############################################################"
	log_info "# Test remote OR dependencies"
	log_info "#############################################################"

	log_info "OR dependencies: Test that one fulfilled dependency makes the whole dependency fulfilled:"
	set local_job [submit_job "-M$c1" $c1 $file_in_long]
	set remote_job [submit_job "-M$c2" $c2 $file_in_long]
	set waiter [submit_job \
		"--depend=afternotok:${local_job}?afternotok:${remote_job} -M$c1" \
		$c1 $file_in_long]

	my_wait_for_fed_job $local_job "RUNNING" $c1
	my_wait_for_fed_job $remote_job "RUNNING" $c2
	my_wait_for_fed_job $waiter "PENDING" $c1
	wait_for_depend $waiter "Dependency" \
		"afternotok:${local_job}(unfulfilled)?afternotok:${remote_job}(unfulfilled)"
	cancel_job $remote_job $c2
	wait_for_depend $waiter "None" "(null)"
	my_wait_for_fed_job $waiter "RUNNING" $c1
	cancel_job $local_job $c1
	cancel_job $waiter $c1

	log_info "OR dependencies: Test that the dependency doesn't fail until all dependencies have failed:"
	set local_job [submit_job "-M$c1" $c1 $file_in_long]
	set remote_job [submit_job "-M$c2" $c2 $file_in_long]
	set waiter [submit_job "--depend=afterok:${local_job}?afterok:${remote_job} \
		-M$c1" $c1 $file_in_long]

	my_wait_for_fed_job $local_job "RUNNING" $c1
	my_wait_for_fed_job $remote_job "RUNNING" $c2
	my_wait_for_fed_job $waiter "PENDING" $c1
	wait_for_depend $waiter "Dependency" \
		"afterok:${local_job}(unfulfilled)?afterok:${remote_job}(unfulfilled)"
	cancel_job $local_job $c1
	wait_for_depend $waiter "Dependency" \
		"afterok:${local_job}(failed)?afterok:${remote_job}(unfulfilled)"
	cancel_job $remote_job $c2
	wait_for_depend $waiter "DependencyNeverSatisfied" \
		"afterok:${local_job}(failed)?afterok:${remote_job}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $waiter $c1
	}
}
# AND (",") dependency lists where every job is on cluster $c1.
#
# Part 1: afternotok,afternotok - the job is released only after both
#         dependencies are fulfilled; a fulfilled one drops off the list.
# Part 2: afterok,afterok - a single failed dependency makes the whole
#         list DependencyNeverSatisfied.
proc test_local_and_dependencies { } {
	global c1 file_in_long kill_invalid_depend

	log_info "#############################################################"
	log_info "# Test local AND dependencies"
	log_info "#############################################################"

	log_info "AND dependencies: Test that the dependency isn't fulfilled until all dependencies are fulfilled:"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "-M$c1" $c1 $file_in_long]
	set waiter [submit_job \
		"--depend=afternotok:${job_a},afternotok:${job_b} -M$c1" \
		$c1 $file_in_long]

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "RUNNING" $c1
	my_wait_for_fed_job $waiter "PENDING" $c1
	wait_for_depend $waiter "Dependency" \
		"afternotok:${job_a}(unfulfilled),afternotok:${job_b}(unfulfilled)"
	cancel_job $job_a $c1
	wait_for_depend $waiter "Dependency" \
		"afternotok:${job_b}(unfulfilled)"
	cancel_job $job_b $c1
	wait_for_depend $waiter "None" "(null)"
	my_wait_for_fed_job $waiter "RUNNING" $c1
	cancel_job $waiter $c1

	log_info "AND dependencies: Test that the whole dependency fails when a single dependency fails:"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "-M$c1" $c1 $file_in_long]
	set waiter [submit_job "--depend=afterok:${job_a},afterok:${job_b} \
		-M$c1" $c1 $file_in_long]

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "RUNNING" $c1
	my_wait_for_fed_job $waiter "PENDING" $c1
	wait_for_depend $waiter "Dependency" \
		"afterok:${job_a}(unfulfilled),afterok:${job_b}(unfulfilled)"
	cancel_job $job_b $c1
	wait_for_depend $waiter "DependencyNeverSatisfied" \
		"afterok:${job_a}(unfulfilled),afterok:${job_b}(failed)"

	if {!$kill_invalid_depend} {
		cancel_job $waiter $c1
	}
	cancel_job $job_a $c1
}
# Exercise AND (comma-separated) dependencies where one of the jobs depended
# on is submitted to cluster $c2 while the dependent job is on cluster $c1.
#
# Part 1: one long job is started on $c1 and one on $c2, then a third job is
#         submitted to $c1 with an afternotok dependency on both. The two jobs
#         are cancelled one at a time and the reason/dependency string
#         reported for the third job is checked after each step, ending with
#         the third job running.
# Part 2: same setup using afterok. Only the job on $c2 is cancelled and the
#         third job is expected to report DependencyNeverSatisfied.
#
# Globals read: c1, c2, file_in_long, kill_invalid_depend.
proc test_remote_and_dependencies { } {
	global c1 c2 file_in_long kill_invalid_depend

	log_info "#############################################################"
	log_info "# Test remote AND dependencies"
	log_info "#############################################################"

	# ---- Part 1: afternotok on a local and a remote job ----
	log_info "AND dependencies: Test that the dependency isn't fulfilled until all dependencies are fulfilled:"
	set local_opt "-M$c1"
	set remote_opt "-M$c2"
	set local_id [submit_job $local_opt $c1 $file_in_long]
	set remote_id [submit_job $remote_opt $c2 $file_in_long]
	set depend_opt "--depend=afternotok:$local_id,afternotok:$remote_id -M$c1"
	set dependent_id [submit_job $depend_opt $c1 $file_in_long]

	my_wait_for_fed_job $local_id "RUNNING" $c1
	my_wait_for_fed_job $remote_id "RUNNING" $c2
	my_wait_for_fed_job $dependent_id "PENDING" $c1

	set local_pending "afternotok:$local_id\(unfulfilled\)"
	set remote_pending "afternotok:$remote_id\(unfulfilled\)"
	wait_for_depend $dependent_id "Dependency" \
		"$local_pending,$remote_pending"

	# After cancelling the local job only the remote dependency is expected
	# to remain in the dependency string.
	cancel_job $local_id $c1
	wait_for_depend $dependent_id "Dependency" $remote_pending

	# After cancelling the remote job no dependency is expected to remain
	# and the dependent job is expected to start.
	cancel_job $remote_id $c2
	wait_for_depend $dependent_id "None" "(null)"
	my_wait_for_fed_job $dependent_id "RUNNING" $c1
	cancel_job $dependent_id $c1

	# ---- Part 2: afterok on both jobs, the remote one fails ----
	log_info "AND dependencies: Test that the whole dependency fails when a single dependency fails:"
	set local_id [submit_job $local_opt $c1 $file_in_long]
	set remote_id [submit_job $remote_opt $c2 $file_in_long]
	set depend_opt "--depend=afterok:$local_id,afterok:$remote_id \
		-M$c1"
	set dependent_id [submit_job $depend_opt $c1 $file_in_long]

	my_wait_for_fed_job $local_id "RUNNING" $c1
	my_wait_for_fed_job $remote_id "RUNNING" $c2
	my_wait_for_fed_job $dependent_id "PENDING" $c1

	set local_pending "afterok:$local_id\(unfulfilled\)"
	wait_for_depend $dependent_id "Dependency" \
		"$local_pending,afterok:$remote_id\(unfulfilled\)"

	cancel_job $remote_id $c2
	wait_for_depend $dependent_id "DependencyNeverSatisfied" \
		"$local_pending,afterok:$remote_id\(failed\)"

	# NOTE(review): presumably the controller removes the dependent job
	# itself when kill_invalid_depend is set - confirm.
	if { !$kill_invalid_depend } {
		cancel_job $dependent_id $c1
	}
	cancel_job $local_id $c1
}
# Pre-flight checks, executed when the script is loaded.
#
# The test is skipped unless AccountingStorageType is
# accounting_storage/slurmdbd.
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}

# When a federation environment is configured, require all clusters to be up
# and (re)create the test federation. Otherwise fall back to the single local
# cluster and the regular scancel/scontrol binaries.
if {[check_federation_setup]} {
	if {![check_federation_up]} {
		skip "All of the clusters in the federation must be up"
	}

	# Remove any federation of the same name before setting it up again.
	delete_federations $fed_name
	# A true result from setup_federation is treated as failure.
	if {[setup_federation $fed_name]} {
		fail "Unable to set up federation ($fed_name)"
	}
} else {
	log_warn "Not running remote dependency tests"
	set c1 [get_config_param "ClusterName"]
	set all_clusters "$c1"
	set my_scancel $scancel
	set my_scontrol $scontrol
}
# Test teardown: delete the test federation, cancel the federation jobs and
# remove the two generated job scripts.
#
# Globals read: fed_name, bin_rm, file_in_long, file_in_short.
proc cleanup {} {
	delete_federations $::fed_name
	cancel_federation_jobs
	exec $::bin_rm -f $::file_in_long $::file_in_short
}
###############################################################################
# Begin test
###############################################################################
# Use file_in_short when we have to wait for the job to end.
# Use file_in_long everywhere else.
# Generate the two job scripts used by the subtests: a 60 second sleep and a
# 5 second sleep.
make_bash_script $file_in_long "$bin_sleep 60"
make_bash_script $file_in_short "$bin_sleep 5"

# Record which DependencyParameters options are enabled; the subtests read
# these flags.
set kill_invalid_depend [param_contains \
	[get_config_param "DependencyParameters"] "kill_invalid_depend"]
set disable_remote_singleton [param_contains \
	[get_config_param "DependencyParameters"] "disable_remote_singleton"]
log_info "kill_invalid_depend: $kill_invalid_depend; disable_remote_singleton: $disable_remote_singleton\n"

cancel_federation_jobs

# Local subtests, run in the listed order.
# --depend=afterburstbuffer is tested in test35.6
# Test --depend=expand in another test
foreach dependency_subtest {
	test_local_after
	test_local_afterany
	test_local_aftercorr
	test_local_afterok
	test_local_afternotok
	test_local_singleton
	test_local_or_dependencies
	test_local_and_dependencies
} {
	$dependency_subtest
}

# Remote subtests need a federation; without one the test is reported as
# skipped.
if {[check_federation_setup]} {
	foreach dependency_subtest {
		test_remote_after
		test_remote_afterany
		test_remote_aftercorr
		test_remote_afterok
		test_remote_afternotok
		test_remote_singleton
		test_remote_or_dependencies
		test_remote_and_dependencies
		test_add_remove_clusters
		test_submit_to_all_clusters
	} {
		$dependency_subtest
	}
} else {
	skip "Subtests with federations cannot be run"
}