| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test local and remote job dependencies |
| # |
| # Reqs: 1. Using slurmdbd accounting storage type and is up |
| # 2. fed_slurm_base is defined in globals.local - set to directory that |
| # has access to each federation configure (fedc1, fedc2, fedc3). |
| # Eg. |
| # fedr/slurm/ (src) |
| # fedr/fed1/bin |
| # fedr/fed1/sbin |
| # fedr/fed1/etc |
| # fedr/fed1/... |
| # fedr/fed2/... |
| # fedr/fed3/... |
| # 3. controllers are up and running. |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| source ./globals |
| source ./globals_accounting |
| source ./globals_federation |
| |
# Short aliases for the three federated clusters and the comma separated
# list used with -M.
lassign [list $fedc1 $fedc2 $fedc3] c1 c2 c3
set all_clusters	[join [list $c1 $c2 $c3] ","]

# Identity of this test run.
set user_name		[get_my_user_name]
set fed_name		"fed_$test_name"

# Batch scripts used by the jobs.
set file_in_long	"$test_dir/long.in"
set file_in_short	"$test_dir/short.in"

# Client commands are taken from the first cluster's installation.
set my_scancel		"${fed_slurm_base}/$c1/bin/scancel"
set my_scontrol		"${fed_slurm_base}/$c1/bin/scontrol"

# Scratch state; reason/dependency are filled in by get_job_dependency.
set job_id1		0
set job_id2		0
set reason		""
set dependency		""
| |
| ############################################################################### |
| # Functions |
| ############################################################################### |
| |
#
# Cancel every job submitted by this test on all federated clusters.
#
# Jobs are selected by name, so the name passed to scancel is built exactly
# the same way as the --job-name that submit_job gives to sbatch. After
# scancel exits the proc sleeps for 5 seconds before returning.
#
proc cancel_federation_jobs { } {
	global my_scancel all_clusters test_name

	spawn $my_scancel -M$all_clusters --jobname ${test_name}_job
	expect {
		eof {
			wait
		}
	}
	sleep 5
}
| |
#
# Cancel a single job and block until it is reported as finished.
#
# job_id   - job (or array job / array task) to cancel
# clusters - cluster name(s) handed to wait_for_fed_job when waiting for
#            the job to reach the "DONE" state
#
proc cancel_job { job_id clusters } {
	global my_scancel

	spawn $my_scancel $job_id
	expect eof { wait }

	wait_for_fed_job $job_id "DONE" $clusters
}
| |
#
# Submit a batch job with sbatch and return its job id.
#
# options - extra sbatch options as one whitespace separated string
# cdir    - cluster directory under fed_slurm_base whose sbatch is used
# file_in - batch script to submit
#
# The job is always named "${test_name}_job", limited to one minute (-t1)
# and has its output discarded. The test fails if sbatch does not respond
# or no job id is reported.
#
proc submit_job { options cdir file_in } {
	global bin_sleep sbatch number fed_slurm_base test_name

	# Use the requested cluster's sbatch when the federation setup is
	# available, otherwise fall back to the default sbatch.
	if {[check_federation_setup]} {
		set my_sbatch "${fed_slurm_base}/$cdir/bin/sbatch"
	} else {
		set my_sbatch $sbatch
	}

	set command "$my_sbatch --job-name=${test_name}_job -t1"
	append command " $options --output=/dev/null $file_in"

	set submitted_id 0
	spawn {*}$command
	expect {
		-re "Submitted batch job ($number)" {
			set submitted_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}

	if {$submitted_id == 0} {
		fail "Failed to submit job"
	}
	return $submitted_id
}
| |
#
# Read the Reason and Dependency fields of a job from "scontrol show job".
#
# job_id - job to query
#
# Stores the parsed values in the globals "reason" and "dependency" (both are
# reset to "" first, and stay empty if the fields are not found) and returns
# the dependency string. The test fails if scontrol does not respond.
#
proc get_job_dependency { job_id } {
	global my_scontrol reason dependency re_word_str

	set reason ""
	set dependency ""
	set reason_match "\[a-zA-Z_\]+"

	# Possible dependency syntax:
	# <type>:<jobid>+<time>(state)
	# <type>:<jobid>(state)
	# singleton(unfulfilled)
	# Notes:
	# * Multiple dependencies are separated by a comma or a question mark
	# * The state for singleton will only ever be "unfulfilled"
	# * The state is either failed or unfulfilled. Fulfilled dependencies
	#   are cleared from the list
	# * When there are no dependencies, it will be this string: "(null)"
	#
	# The regex built below handles dependencies separated by comma or
	# question mark:
	# ([a-zA-Z_]+:[0-9_*+]+\([a-zA-Z]+\),*\?*|singleton\(unfulfilled\),*\?*)+|\(null\)
	set type "\[a-zA-Z_\]+"
	set jobid_time "\[0-9_*+\]+"
	set state "\\($re_word_str\\)"
	set delim ",*\\?*"
	set depend_regex "\($type:$jobid_time$state$delim\|singleton\\(unfulfilled\\)$delim\)+"
	set no_depend "\\(null\\)"
	set depend_match "$depend_regex\|$no_depend"

	# Keep the verbose scontrol output out of the test log.
	log_user 0
	spawn $my_scontrol show job $job_id
	expect {
		-re "Reason=($reason_match) Dependency=($depend_match)" {
			set reason $expect_out(1,string)
			set dependency $expect_out(2,string)
			exp_continue
		}
		timeout {
			# Re-enable output before failing, as is_job_on_cluster
			# does.
			log_user 1
			fail "scontrol not responding"
		}
		eof {
			wait
		}
	}
	log_user 1
	log_info "job $job_id; actual reason: \"$reason\"; dependency: \"$dependency\""
	return $dependency
}
| |
#
# Compare a job's current reason and dependency with the expected values.
#
# Calls get_job_dependency, which refreshes the globals "reason" and
# "dependency" for job_id.
#
# Returns 0 when both values equal the expected ones, 1 otherwise.
#
proc check_depend { job_id expected_reason expected_dependency } {
	global reason dependency

	get_job_dependency $job_id

	set as_expected [expr {"$reason" eq "$expected_reason" &&
			       "$dependency" eq "$expected_dependency"}]
	return [expr {!$as_expected}]
}
| |
#
# Poll a job until its reason and dependency match the expected values.
#
# job_id              - job to watch
# expected_reason     - expected value of the job's Reason field
# expected_dependency - expected value of the job's Dependency field
#
# Returns 0 as soon as both values match. The test fails if the job reports
# "DependencyNeverSatisfied" when that was not the expected reason, or if the
# values still differ after max_delay seconds of polling.
#
proc wait_for_depend { job_id expected_reason expected_dependency } {
	global reason dependency

	set error 0
	set my_delay 0
	# By default we test remote dependencies every 30 seconds.
	# Once the dependencies are cleared, we have to wait for the job to be
	# scheduled which could be another 30-60 seconds depending on the
	# main scheduler and backfill scheduler timing.
	set max_delay 120
	set poll_interval 3

	set want_never_satisfied \
		[expr {$expected_reason eq "DependencyNeverSatisfied"}]

	log_info "job $job_id; expected reason: \"$expected_reason\"; dependency: \"$expected_dependency\""

	while 1 {
		if {![check_depend $job_id $expected_reason $expected_dependency]} {
			return 0
		}

		# A dependency that has failed will not recover, so there is
		# no point in polling any longer.
		if {(!$want_never_satisfied) && $reason eq "DependencyNeverSatisfied"} {
			log_error "Job dependency failed, but it shouldn't have."
			set error 1
		}
		if {$my_delay >= $max_delay} {
			log_info "delay $my_delay max $max_delay"
			log_error "Timeout waiting for dependency to change."
			set error 1
		}

		if {$error} {
			fail "Job ($job_id) actual (reason=\"$reason\"; dependency=\"$dependency\") != expected (reason=\"$expected_reason\"; dependency=\"$expected_dependency\")"
		}

		exec sleep $poll_interval
		incr my_delay $poll_interval
	}
}
| |
#
# Wait for a job to reach a state on one specific cluster.
#
# Wraps wait_for_fed_job and fails the test unless the cluster it returns is
# exactly the requested one.
#
proc my_wait_for_fed_job { job_id state cluster } {
	set reached_on [wait_for_fed_job $job_id $state $cluster]
	if {$cluster eq $reached_on} {
		return
	}
	fail "Job ($job_id) did not reach state $state on cluster ($cluster)"
}
| |
#
# Report whether a job is known to a given cluster.
#
# job_id  - job to look for
# cluster - cluster queried with "scontrol -M<cluster> --local show job"
#
# Returns 1 if scontrol printed a record for the job and 0 if it reported
# "Invalid job id specified". The test fails if scontrol does not respond or
# if its output contains neither answer.
#
proc is_job_on_cluster { job_id cluster } {
	global my_scontrol

	# -1: undetermined, 0: not found, 1: found. The first answer seen
	# wins.
	set found -1

	log_user 0
	spawn $my_scontrol -M$cluster --local -o show job $job_id
	expect {
		-re "JobId=$job_id" {
			if {$found < 0} {
				set found 1
			}
			exp_continue
		}
		-re "Invalid job id specified" {
			if {$found < 0} {
				set found 0
			}
			exp_continue
		}
		timeout {
			log_user 1
			fail "scontrol not responding"
		}
		eof {
			# Always reap the spawned process, whatever it printed.
			wait
		}
	}
	log_user 1

	if {$found == 1} {
		log_info "Found job $job_id in cluster $cluster"
		return 1
	}
	if {$found == 0} {
		log_info "Did not find job $job_id in cluster $cluster"
		return 0
	}

	# Neither answer was seen: do not guess, callers need a boolean.
	fail "Unable to determine whether job $job_id is in cluster $cluster"
	return 0
}
| |
#
# "after" dependency with both jobs on the local cluster: the dependent job
# is expected to lose its dependency once the first job is running.
#
proc test_local_after { } {
	global c1 file_in_long

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local after"
	log_info $rule

	# Local dependency succeeds
	log_info "after: test that local dependency succeeds:"
	set first_job [submit_job "-M$c1 --begin=now+5" $c1 $file_in_long]
	set dep_job [submit_job "--depend=after:$first_job -M$c1" $c1 \
			     $file_in_long]

	wait_for_depend $dep_job "Dependency" "after:${first_job}(unfulfilled)"
	my_wait_for_fed_job $first_job "RUNNING" $c1
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1

	# Clean up both jobs.
	cancel_job $first_job $c1
	cancel_job $dep_job $c1
}
| |
#
# "after" dependency on a job running on another cluster, followed by an
# "after:<jobid>+<time>" dependency.
#
proc test_remote_after { } {
	global c1 c2 file_in_long bin_sleep

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote after"
	log_info $rule

	# Remote dependency succeeds
	log_info "after: test that remote dependency succeeds:"
	set remote_job [submit_job "-M$c2 --begin=now+5" $c2 $file_in_long]
	set dep_job [submit_job "--depend=after:$remote_job -M$c1" $c1 \
			     $file_in_long]

	wait_for_depend $dep_job "Dependency" "after:${remote_job}(unfulfilled)"
	my_wait_for_fed_job $remote_job "RUNNING" $c2
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $remote_job $c2
	cancel_job $dep_job $c1

	# Test after with a time attached. file_in_long sleeps for 60 seconds.
	log_info "after: test that a after+time works:"
	set timed_job [submit_job "-M$c1" $c1 $file_in_long]
	set delayed_job [submit_job "--depend=after:${timed_job}+1 -M$c1" $c1 \
				 $file_in_long]
	set still_waiting "after:${timed_job}+1(unfulfilled)"

	my_wait_for_fed_job $timed_job "RUNNING" $c1
	my_wait_for_fed_job $delayed_job "PENDING" $c1
	wait_for_depend $delayed_job "Dependency" $still_waiting

	log_info "Check that job $delayed_job is still dependent after 45 seconds"
	exec $bin_sleep 45
	my_wait_for_fed_job $delayed_job "PENDING" $c1
	wait_for_depend $delayed_job "Dependency" $still_waiting

	log_info "Wait for job $delayed_job dependency to be fulfilled"
	wait_for_depend $delayed_job "None" "(null)"
	my_wait_for_fed_job $delayed_job "RUNNING" $c1
	cancel_job $delayed_job $c1

	# After dependency never fails.
}
| |
#
# "afterany" dependency with both jobs on the local cluster: cancelling the
# first job is expected to release the dependent job.
#
proc test_local_afterany { } {
	global c1 file_in_long

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local afterany"
	log_info $rule

	# Local dependency succeeds
	log_info "afterany: test that local dependency succeeds:"
	set first_job [submit_job "-M$c1" $c1 $file_in_long]
	set dep_job [submit_job "--depend=afterany:$first_job -M$c1" $c1 \
			     $file_in_long]

	wait_for_depend $dep_job "Dependency" \
		"afterany:${first_job}(unfulfilled)"
	my_wait_for_fed_job $first_job "RUNNING" $c1

	# Ending the first job in any way satisfies afterany.
	cancel_job $first_job $c1
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $dep_job $c1
}
| |
#
# "afterany" dependency on a job on another cluster, followed by a check of
# the old "--depend=jobid[,jobid...]" syntax, which is reported as afterany.
#
proc test_remote_afterany { } {
	global c1 c2 file_in_long

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote afterany"
	log_info $rule

	# Remote dependency succeeds
	log_info "afterany: test that remote dependency succeeds:"
	set remote_job [submit_job "-M$c2" $c2 $file_in_long]
	set dep_job [submit_job "--depend=afterany:$remote_job -M$c1" $c1 \
			     $file_in_long]

	wait_for_depend $dep_job "Dependency" \
		"afterany:${remote_job}(unfulfilled)"
	my_wait_for_fed_job $remote_job "RUNNING" $c2
	cancel_job $remote_job $c2
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $dep_job $c1

	# Test old syntax: --depend=jobid,jobid
	log_info "afterany: test old syntax: --depend=jobid\[,jobid,jobid...\]"
	set base_job [submit_job "-M$c1" $c1 $file_in_long]
	set second_job [submit_job "--depend=$base_job -M$c1" $c1 \
				$file_in_long]
	set third_job [submit_job "--depend=$base_job,$second_job -M$c1" $c1 \
			       $file_in_long]

	set on_base "afterany:${base_job}(unfulfilled)"
	set on_second "afterany:${second_job}(unfulfilled)"

	my_wait_for_fed_job $base_job "RUNNING" $c1
	my_wait_for_fed_job $second_job "PENDING" $c1
	my_wait_for_fed_job $third_job "PENDING" $c1
	wait_for_depend $second_job "Dependency" $on_base
	wait_for_depend $third_job "Dependency" "$on_base,$on_second"
	cancel_federation_jobs
}
| |
#
# "aftercorr" dependency between two job arrays on the local cluster, first
# with the leading array completing and then with its tasks being cancelled.
#
proc test_local_aftercorr { } {
	global c1 kill_invalid_depend file_in_long file_in_short

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local aftercorr"
	log_info $rule

	# Both arrays have the tasks 1 and 2.
	set tasks {1 2}

	# Local dependency succeeds
	log_info "aftercorr: test that local dependency succeeds:"
	set lead_array [submit_job "-M$c1 --array=1-2" $c1 $file_in_short]
	set dep_array [submit_job \
		"--depend=aftercorr:$lead_array -M$c1 --array=1-2" \
		$c1 $file_in_long]
	set unfulfilled "aftercorr:${lead_array}_*(unfulfilled)"

	foreach task $tasks {
		my_wait_for_fed_job "${lead_array}_$task" "RUNNING" $c1
	}
	foreach task $tasks {
		my_wait_for_fed_job "${dep_array}_$task" "PENDING" $c1
	}
	foreach task $tasks {
		wait_for_depend "${dep_array}_$task" "Dependency" $unfulfilled
	}

	foreach task $tasks {
		my_wait_for_fed_job "${lead_array}_$task" "DONE" $c1
	}
	foreach task $tasks {
		wait_for_depend "${dep_array}_$task" "None" "(null)"
	}
	foreach task $tasks {
		my_wait_for_fed_job "${dep_array}_$task" "RUNNING" $c1
	}
	cancel_job $dep_array $c1

	# Local dependency fails
	log_info "aftercorr: test that local dependency fails:"
	set lead_array [submit_job "-M$c1 --array=1-2" $c1 $file_in_long]
	set dep_array [submit_job \
		"--depend=aftercorr:$lead_array -M$c1 --array=1-2" \
		$c1 $file_in_long]
	set unfulfilled "aftercorr:${lead_array}_*(unfulfilled)"
	set failed "aftercorr:${lead_array}_*(failed)"

	foreach task $tasks {
		my_wait_for_fed_job "${lead_array}_$task" "RUNNING" $c1
	}
	foreach task $tasks {
		my_wait_for_fed_job "${dep_array}_$task" "PENDING" $c1
	}
	foreach task $tasks {
		wait_for_depend "${dep_array}_$task" "Dependency" $unfulfilled
	}

	# Cancel each leading task, then check the matching dependent task.
	foreach task $tasks {
		cancel_job "${lead_array}_$task" $c1
		wait_for_depend "${dep_array}_$task" \
			"DependencyNeverSatisfied" $failed
	}
	if {!$kill_invalid_depend} {
		cancel_job "$dep_array" $c1
	}
}
| |
#
# "aftercorr" dependency on a job array that runs on another cluster, first
# with the leading array completing and then with its tasks being cancelled.
#
proc test_remote_aftercorr { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote aftercorr"
	log_info $rule

	# Both arrays have the tasks 1 and 2.
	set tasks {1 2}

	# Remote dependency succeeds
	log_info "aftercorr: test that remote dependency succeeds:"
	set lead_array [submit_job "-M$c2 --array=1-2" $c2 $file_in_short]
	set dep_array [submit_job \
		"--depend=aftercorr:$lead_array -M$c1 --array=1-2" \
		$c1 $file_in_long]
	# The dependency on the remote side has _*, but the dependency locally
	# doesn't because it couldn't find the remote job.
	set unfulfilled "aftercorr:${lead_array}(unfulfilled)"

	foreach task $tasks {
		my_wait_for_fed_job "${lead_array}_$task" "RUNNING" $c2
	}
	foreach task $tasks {
		my_wait_for_fed_job "${dep_array}_$task" "PENDING" $c1
	}
	foreach task $tasks {
		wait_for_depend "${dep_array}_$task" "Dependency" $unfulfilled
	}

	foreach task $tasks {
		my_wait_for_fed_job "${lead_array}_$task" "DONE" $c2
	}
	foreach task $tasks {
		wait_for_depend "${dep_array}_$task" "None" "(null)"
	}
	foreach task $tasks {
		my_wait_for_fed_job "${dep_array}_$task" "RUNNING" $c1
	}
	cancel_job $dep_array $c1

	# Remote dependency fails
	log_info "aftercorr: test that remote dependency fails:"
	set lead_array [submit_job "-M$c2 --array=1-2" $c2 $file_in_long]
	set dep_array [submit_job \
		"--depend=aftercorr:$lead_array -M$c1 --array=1-2" \
		$c1 $file_in_long]
	set unfulfilled "aftercorr:${lead_array}(unfulfilled)"
	set failed "aftercorr:${lead_array}(failed)"

	foreach task $tasks {
		my_wait_for_fed_job "${lead_array}_$task" "RUNNING" $c2
	}
	foreach task $tasks {
		my_wait_for_fed_job "${dep_array}_$task" "PENDING" $c1
	}
	foreach task $tasks {
		wait_for_depend "${dep_array}_$task" "Dependency" $unfulfilled
	}

	# Cancel each remote task, then check the matching dependent task.
	foreach task $tasks {
		cancel_job "${lead_array}_$task" $c2
		wait_for_depend "${dep_array}_$task" \
			"DependencyNeverSatisfied" $failed
	}
	if {!$kill_invalid_depend} {
		cancel_job "$dep_array" $c1
	}
}
| |
#
# "afterok" dependency with both jobs on the local cluster: fulfilled when
# the first job completes, failed when the first job is cancelled.
#
proc test_local_afterok { } {
	global c1 kill_invalid_depend file_in_long file_in_short

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local afterok"
	log_info $rule

	# Local dependency succeeds
	log_info "afterok: test that local dependency succeeds:"
	set first_job [submit_job "-M$c1" $c1 $file_in_short]
	set dep_job [submit_job "--depend=afterok:$first_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $first_job "RUNNING" $c1
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afterok:${first_job}(unfulfilled)"
	my_wait_for_fed_job $first_job "DONE" $c1
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $dep_job $c1

	# Local dependency fails
	log_info "afterok: test that local dependency fails:"
	set first_job [submit_job "-M$c1" $c1 $file_in_long]
	set dep_job [submit_job "--depend=afterok:$first_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $first_job "RUNNING" $c1
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afterok:${first_job}(unfulfilled)"
	cancel_job $first_job $c1
	wait_for_depend $dep_job "DependencyNeverSatisfied" \
		"afterok:${first_job}(failed)"
	if {!$kill_invalid_depend} {
		cancel_job $dep_job $c1
	}
}
| |
#
# "afterok" dependency on a job on another cluster: fulfilled when the
# remote job completes, failed when the remote job is cancelled.
#
proc test_remote_afterok { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote afterok"
	log_info $rule

	# Remote dependency succeeds
	log_info "afterok: test that remote dependency succeeds:"
	set remote_job [submit_job "-M$c2" $c2 $file_in_short]
	set dep_job [submit_job "--depend=afterok:$remote_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $remote_job "RUNNING" $c2
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afterok:${remote_job}(unfulfilled)"
	my_wait_for_fed_job $remote_job "DONE" $c2
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $dep_job $c1

	# Remote dependency fails
	log_info "afterok: test that remote dependency fails"
	set remote_job [submit_job "-M$c2" $c2 $file_in_long]
	set dep_job [submit_job "--depend=afterok:$remote_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $remote_job "RUNNING" $c2
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afterok:${remote_job}(unfulfilled)"
	cancel_job $remote_job $c2
	wait_for_depend $dep_job "DependencyNeverSatisfied" \
		"afterok:${remote_job}(failed)"
	if {!$kill_invalid_depend} {
		cancel_job $dep_job $c1
	}
}
| |
#
# "afternotok" dependency with both jobs on the local cluster: fulfilled
# when the first job is cancelled, failed when the first job completes.
#
proc test_local_afternotok { } {
	global c1 kill_invalid_depend file_in_long file_in_short

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local afternotok"
	log_info $rule

	# Local dependency succeeds
	log_info "afternotok: test that local dependency succeeds:"
	set first_job [submit_job "-M$c1" $c1 $file_in_long]
	set dep_job [submit_job "--depend=afternotok:$first_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $first_job "RUNNING" $c1
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afternotok:${first_job}(unfulfilled)"
	cancel_job $first_job $c1
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $dep_job $c1

	# Local dependency fails
	log_info "afternotok: test that local dependency fails:"
	set first_job [submit_job "-M$c1" $c1 $file_in_short]
	set dep_job [submit_job "--depend=afternotok:$first_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $first_job "RUNNING" $c1
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afternotok:${first_job}(unfulfilled)"
	my_wait_for_fed_job $first_job "DONE" $c1
	wait_for_depend $dep_job "DependencyNeverSatisfied" \
		"afternotok:${first_job}(failed)"
	if {!$kill_invalid_depend} {
		cancel_job $dep_job $c1
	}
}
| |
#
# "afternotok" dependency on a job on another cluster: fulfilled when the
# remote job is cancelled, failed when the remote job completes.
#
proc test_remote_afternotok { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote afternotok"
	log_info $rule

	# Remote dependency succeeds
	log_info "afternotok: test that remote dependency succeeds:"
	set remote_job [submit_job "-M$c2" $c2 $file_in_long]
	set dep_job [submit_job "--depend=afternotok:$remote_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $remote_job "RUNNING" $c2
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afternotok:${remote_job}(unfulfilled)"
	cancel_job $remote_job $c2
	wait_for_depend $dep_job "None" "(null)"
	my_wait_for_fed_job $dep_job "RUNNING" $c1
	cancel_job $dep_job $c1

	# Remote dependency fails
	log_info "afternotok: test that remote dependency fails"
	set remote_job [submit_job "-M$c2" $c2 $file_in_short]
	set dep_job [submit_job "--depend=afternotok:$remote_job -M$c1" $c1 \
			     $file_in_long]

	my_wait_for_fed_job $remote_job "RUNNING" $c2
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afternotok:${remote_job}(unfulfilled)"
	my_wait_for_fed_job $remote_job "DONE" $c2
	wait_for_depend $dep_job "DependencyNeverSatisfied" \
		"afternotok:${remote_job}(failed)"
	if {!$kill_invalid_depend} {
		cancel_job $dep_job $c1
	}
}
| |
#
# "singleton" dependency with both jobs on the local cluster: the second job
# is expected to stay dependent until the first job is cancelled.
#
proc test_local_singleton { } {
	global c1 file_in_long

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local singleton"
	log_info $rule

	# Test one cluster
	log_info "singleton: test on one cluster"
	set running_job [submit_job "-M$c1" $c1 $file_in_long]
	set single_job [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]

	my_wait_for_fed_job $running_job "RUNNING" $c1
	my_wait_for_fed_job $single_job "PENDING" $c1
	wait_for_depend $single_job "Dependency" "singleton(unfulfilled)"

	cancel_job $running_job $c1
	wait_for_depend $single_job "None" "(null)"
	my_wait_for_fed_job $single_job "RUNNING" $c1
	cancel_job $single_job $c1
}
| |
#
# "singleton" dependency with same-named jobs on several clusters. Which
# scenario runs depends on the global disable_remote_singleton.
#
proc test_remote_singleton { } {
	global c1 c2 c3 disable_remote_singleton file_in_long

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote singleton"
	log_info $rule

	set pending_dep "singleton(unfulfilled)"

	# Test multiple clusters
	if {!$disable_remote_singleton} {
		# Test that singleton doesn't get cleared until jobs on all
		# clusters are done
		log_info "singleton: test with jobs on all clusters"
		set job_on_c1 [submit_job "-M$c1" $c1 $file_in_long]
		set job_on_c2 [submit_job "-M$c2" $c2 $file_in_long]
		set job_on_c3 [submit_job "-M$c3" $c3 $file_in_long]
		set single_job [submit_job "--depend=singleton -M$c1" $c1 \
					$file_in_long]

		my_wait_for_fed_job $job_on_c1 "RUNNING" $c1
		my_wait_for_fed_job $job_on_c2 "RUNNING" $c2
		my_wait_for_fed_job $job_on_c3 "RUNNING" $c3
		my_wait_for_fed_job $single_job "PENDING" $c1
		wait_for_depend $single_job "Dependency" $pending_dep

		# The singleton job shouldn't start until the jobs on all
		# three clusters are done.
		# Test that it starts when a remote job is finished last.
		cancel_job $job_on_c1 $c1
		# Should still have the same dependency
		wait_for_depend $single_job "Dependency" $pending_dep
		cancel_job $job_on_c2 $c2
		cancel_job $job_on_c3 $c3
		# Now the dependency should be cleared
		wait_for_depend $single_job "None" "(null)"
		my_wait_for_fed_job $single_job "RUNNING" $c1
		cancel_job $single_job $c1
	} else {
		# Test that remote jobs don't affect the singleton dependency
		log_info "singleton: test that disable_remote_singleton works"
		set job_on_c1 [submit_job "-M$c1" $c1 $file_in_long]
		set job_on_c2 [submit_job "-M$c2" $c2 $file_in_long]
		set single_job [submit_job "--depend=singleton -M$c1" $c1 \
					$file_in_long]

		my_wait_for_fed_job $job_on_c1 "RUNNING" $c1
		my_wait_for_fed_job $job_on_c2 "RUNNING" $c2
		my_wait_for_fed_job $single_job "PENDING" $c1
		wait_for_depend $single_job "Dependency" $pending_dep
		# Cancel the local job - the singleton job should start
		# running even though the other job is still running on
		# another cluster
		cancel_job $job_on_c1 $c1
		wait_for_depend $single_job "None" "(null)"
		my_wait_for_fed_job $single_job "RUNNING" $c1
		cancel_job $job_on_c2 $c2
		cancel_job $single_job $c1
	}
}
| |
#
# Test adding/removing clusters from the federation.
#
# Removing a cluster from a federation should cause dependencies on jobs on
# that cluster to fail. Adding a cluster to a federation means that any
# singleton dependencies have to be fulfilled on that cluster.
#
# Cluster $c3 is removed from federation $fed_name during the test and is
# added back on every path before the proc returns, so the federation
# membership is the same as on entry.
#
proc test_add_remove_clusters { } {
	global c1 c2 c3 fed_name file_in_long \
	       disable_remote_singleton kill_invalid_depend

	log_info "#############################################################"
	log_info "# Test adding/removing a cluster from the federation"
	log_info "#############################################################"

	# One running job per cluster, one job depending on the job on $c3
	# and one singleton job.
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job "-M$c3" $c3 $file_in_long]
	set job_id4 [submit_job "--depend=afterok:$job_id3 -M$c1" $c1 \
			     $file_in_long]
	set job_id5 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "RUNNING" $c3
	my_wait_for_fed_job $job_id4 "PENDING" $c1
	wait_for_depend $job_id4 "Dependency" "afterok:$job_id3\(unfulfilled\)"
	wait_for_depend $job_id5 "Dependency" "singleton(unfulfilled)"

	log_info "Test that removing cluster $c3 from fed $fed_name makes dependencies on jobs on $c3 fail"
	if {[remove_cluster_from_fed $c3 $fed_name]} {
		fail "Unable to remove cluster ($c3) from federation ($fed_name)"
	}
	wait_for_depend $job_id4 "DependencyNeverSatisfied" \
		"afterok:$job_id3\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id4 $c1
	}

	if { $disable_remote_singleton } {
		# Nothing more to check in this mode, but do not leave the
		# federation without $c3.
		if {[add_cluster_to_fed $c3 $fed_name]} {
			fail "Unable to add cluster ($c3) to federation ($fed_name)"
		}
		cancel_job $job_id1 $c1
		cancel_job $job_id2 $c2
		cancel_job $job_id3 $c3
		cancel_job $job_id5 $c1
		return
	}

	log_info "Test that the singleton dependency was resent back to cluster $c3 when it was added back to the federation."
	if {[add_cluster_to_fed $c3 $fed_name]} {
		fail "Unable to add cluster ($c3) to federation ($fed_name)"
	}
	cancel_job $job_id1 $c1
	cancel_job $job_id2 $c2
	cancel_job $job_id3 $c3
	wait_for_depend $job_id5 "None" "(null)"
	my_wait_for_fed_job $job_id5 "RUNNING" $c1
	cancel_job $job_id5 $c1
}
| |
#
# Submit a dependent job to all three clusters and check that it only shows
# up on the sibling clusters once its dependency has been cleared.
#
proc test_submit_to_all_clusters { } {
	global c1 c2 c3 file_in_long

	set rule "#############################################################"
	log_info $rule
	log_info "# Test submitting a dependent job to all clusters"
	log_info $rule

	log_info "Test that a dependent job is only on its origin cluster while dependent and that it is submitted to all clusters when its dependency is cleared."
	set blocker [submit_job "-M$c2" $c2 $file_in_long]
	set dep_options "--depend=afternotok:$blocker -M$c1,$c2,$c3"
	append dep_options " --begin=now+60"
	set dep_job [submit_job $dep_options $c1 $file_in_long]

	my_wait_for_fed_job $blocker "RUNNING" $c2
	my_wait_for_fed_job $dep_job "PENDING" $c1
	wait_for_depend $dep_job "Dependency" \
		"afternotok:${blocker}(unfulfilled)"

	log_info "Test that job $dep_job is not on clusters $c2 or $c3."
	if {[is_job_on_cluster $dep_job $c2] ||
	    [is_job_on_cluster $dep_job $c3]} {
		fail "Job ($dep_job) is in cluster ($c2 and/or $c3) when it shouldn't be."
	}

	log_info "Test that job $dep_job is submitted to all sibling clusters $c2 and $c3 when its dependency is fulfilled."
	cancel_job $blocker $c2
	# The job was submitted with --begin=now+60, hence BeginTime.
	wait_for_depend $dep_job "BeginTime" "(null)"
	my_wait_for_fed_job $dep_job "PENDING" "$c1"
	my_wait_for_fed_job $dep_job "PENDING" "$c2"
	my_wait_for_fed_job $dep_job "PENDING" "$c3"
	cancel_job $dep_job "$c1,$c2,$c3"
}
| |
#
# OR ("?") dependencies with all jobs on the local cluster: one fulfilled
# alternative releases the job, and the dependency only fails once every
# alternative has failed.
#
proc test_local_or_dependencies { } {
	global c1 file_in_long kill_invalid_depend

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local OR dependencies"
	log_info $rule

	log_info "OR dependencies: Test that one fulfilled dependency makes the whole dependency fulfilled:"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "-M$c1" $c1 $file_in_long]
	set or_job [submit_job \
		"--depend=afternotok:$job_a?afternotok:$job_b -M$c1" \
		$c1 $file_in_long]

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "RUNNING" $c1
	my_wait_for_fed_job $or_job "PENDING" $c1
	wait_for_depend $or_job "Dependency" \
		"afternotok:${job_a}(unfulfilled)?afternotok:${job_b}(unfulfilled)"

	# Cancelling one of the two jobs is enough.
	cancel_job $job_b $c1
	wait_for_depend $or_job "None" "(null)"
	my_wait_for_fed_job $or_job "RUNNING" $c1
	cancel_job $job_a $c1
	cancel_job $or_job $c1

	log_info "OR dependencies: Test that the dependency doesn't fail until all dependencies have failed:"
	set job_a [submit_job "-M$c1" $c1 $file_in_long]
	set job_b [submit_job "-M$c1" $c1 $file_in_long]
	set or_job [submit_job \
		"--depend=afterok:$job_a?afterok:$job_b -M$c1" \
		$c1 $file_in_long]

	my_wait_for_fed_job $job_a "RUNNING" $c1
	my_wait_for_fed_job $job_b "RUNNING" $c1
	my_wait_for_fed_job $or_job "PENDING" $c1
	wait_for_depend $or_job "Dependency" \
		"afterok:${job_a}(unfulfilled)?afterok:${job_b}(unfulfilled)"

	# First alternative fails, the job stays dependent on the second.
	cancel_job $job_a $c1
	wait_for_depend $or_job "Dependency" \
		"afterok:${job_a}(failed)?afterok:${job_b}(unfulfilled)"
	# Second alternative fails as well.
	cancel_job $job_b $c1
	wait_for_depend $or_job "DependencyNeverSatisfied" \
		"afterok:${job_a}(failed)?afterok:${job_b}(failed)"
	if {!$kill_invalid_depend} {
		cancel_job $or_job $c1
	}
}
| |
# test_remote_or_dependencies - OR ("?"-separated) dependencies where one
# dependee runs on a different cluster than the dependent job.
#
# The first dependee and the dependent job are submitted to $c1, the second
# dependee to $c2.
#
# Part 1: two afternotok dependencies; cancelling the dependee on $c2 is
#         expected to clear the dependency (reason "None") and let the
#         dependent job reach RUNNING on $c1.
# Part 2: two afterok dependencies; the reason must stay "Dependency" after
#         the $c1 dependee is cancelled and only become
#         "DependencyNeverSatisfied" once the $c2 dependee is cancelled too.
#
# Globals read: c1, c2, file_in_long, kill_invalid_depend
proc test_remote_or_dependencies { } {
	global c1 c2 file_in_long kill_invalid_depend

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote OR dependencies"
	log_info $rule

	set on_c1 "-M$c1"
	set on_c2 "-M$c2"

	# ---- Part 1: a single fulfilled entry releases the job ----
	log_info "OR dependencies: Test that one fulfilled dependency makes the whole dependency fulfilled:"
	set local_dependee  [submit_job $on_c1 $c1 $file_in_long]
	set remote_dependee [submit_job $on_c2 $c2 $file_in_long]
	set dep_spec "afternotok:${local_dependee}?afternotok:${remote_dependee}"
	set dependent [submit_job "--depend=$dep_spec $on_c1" $c1 $file_in_long]

	foreach {id state cluster} [list \
			$local_dependee  "RUNNING" $c1 \
			$remote_dependee "RUNNING" $c2 \
			$dependent       "PENDING" $c1] {
		my_wait_for_fed_job $id $state $cluster
	}
	wait_for_depend $dependent "Dependency" \
		[format "afternotok:%s(unfulfilled)?afternotok:%s(unfulfilled)" \
			$local_dependee $remote_dependee]

	# Only the remote dependee is cancelled before the dependency is
	# expected to be gone; the local one is still running at that point.
	cancel_job $remote_dependee $c2
	wait_for_depend $dependent "None" "(null)"
	my_wait_for_fed_job $dependent "RUNNING" $c1
	cancel_job $local_dependee $c1
	cancel_job $dependent $c1

	# ---- Part 2: failure requires every entry to fail ----
	log_info "OR dependencies: Test that the dependency doesn't fail until all dependencies have failed:"
	set local_dependee  [submit_job $on_c1 $c1 $file_in_long]
	set remote_dependee [submit_job $on_c2 $c2 $file_in_long]
	set dep_spec "afterok:${local_dependee}?afterok:${remote_dependee}"
	set dependent [submit_job "--depend=$dep_spec \
		$on_c1" $c1 $file_in_long]

	foreach {id state cluster} [list \
			$local_dependee  "RUNNING" $c1 \
			$remote_dependee "RUNNING" $c2 \
			$dependent       "PENDING" $c1] {
		my_wait_for_fed_job $id $state $cluster
	}

	# Expected dependency string: job ids are fixed, only the per-entry
	# status changes as the dependees get cancelled.
	set status_fmt "afterok:%s(%s)?afterok:%s(%s)"
	wait_for_depend $dependent "Dependency" \
		[format $status_fmt $local_dependee unfulfilled \
			$remote_dependee unfulfilled]

	cancel_job $local_dependee $c1
	wait_for_depend $dependent "Dependency" \
		[format $status_fmt $local_dependee failed \
			$remote_dependee unfulfilled]
	cancel_job $remote_dependee $c2
	wait_for_depend $dependent "DependencyNeverSatisfied" \
		[format $status_fmt $local_dependee failed \
			$remote_dependee failed]

	# The dependent job is only cancelled by hand when kill_invalid_depend
	# is off (presumably the controller removes it otherwise - confirm).
	if { !$kill_invalid_depend } {
		cancel_job $dependent $c1
	}
}
| |
# test_local_and_dependencies - AND (","-separated) dependencies where every
# job is submitted to cluster $c1.
#
# Part 1: the dependent job lists two afternotok dependencies. After the
#         first dependee is cancelled only the second entry is expected to
#         remain in the dependency string; after the second is cancelled
#         the reason must become "None" and the job must reach RUNNING.
# Part 2: the dependent job lists two afterok dependencies. Cancelling just
#         one dependee is expected to yield "DependencyNeverSatisfied"
#         while the other entry is still reported as "unfulfilled".
#
# Globals read: c1, file_in_long, kill_invalid_depend
proc test_local_and_dependencies { } {
	global c1 file_in_long kill_invalid_depend

	set rule "#############################################################"
	log_info $rule
	log_info "# Test local AND dependencies"
	log_info $rule

	set on_c1 "-M$c1"

	# ---- Part 1: every entry must be fulfilled ----
	log_info "AND dependencies: Test that the dependency isn't fulfilled until all dependencies are fulfilled:"
	set dependee_a [submit_job $on_c1 $c1 $file_in_long]
	set dependee_b [submit_job $on_c1 $c1 $file_in_long]
	set dep_spec "afternotok:${dependee_a},afternotok:${dependee_b}"
	set dependent [submit_job "--depend=$dep_spec $on_c1" $c1 $file_in_long]

	foreach {id state} [list \
			$dependee_a "RUNNING" \
			$dependee_b "RUNNING" \
			$dependent  "PENDING"] {
		my_wait_for_fed_job $id $state $c1
	}
	wait_for_depend $dependent "Dependency" \
		[format "afternotok:%s(unfulfilled),afternotok:%s(unfulfilled)" \
			$dependee_a $dependee_b]

	# Once the first dependee is cancelled its entry is expected to
	# disappear from the dependency string, leaving only the second one.
	cancel_job $dependee_a $c1
	wait_for_depend $dependent "Dependency" \
		[format "afternotok:%s(unfulfilled)" $dependee_b]
	cancel_job $dependee_b $c1
	wait_for_depend $dependent "None" "(null)"
	my_wait_for_fed_job $dependent "RUNNING" $c1
	cancel_job $dependent $c1

	# ---- Part 2: one failed entry fails the whole list ----
	log_info "AND dependencies: Test that the whole dependency fails when a single dependency fails:"
	set dependee_a [submit_job $on_c1 $c1 $file_in_long]
	set dependee_b [submit_job $on_c1 $c1 $file_in_long]
	set dep_spec "afterok:${dependee_a},afterok:${dependee_b}"
	set dependent [submit_job "--depend=$dep_spec \
		$on_c1" $c1 $file_in_long]

	foreach {id state} [list \
			$dependee_a "RUNNING" \
			$dependee_b "RUNNING" \
			$dependent  "PENDING"] {
		my_wait_for_fed_job $id $state $c1
	}

	set status_fmt "afterok:%s(%s),afterok:%s(%s)"
	wait_for_depend $dependent "Dependency" \
		[format $status_fmt $dependee_a unfulfilled $dependee_b unfulfilled]

	cancel_job $dependee_b $c1
	wait_for_depend $dependent "DependencyNeverSatisfied" \
		[format $status_fmt $dependee_a unfulfilled $dependee_b failed]

	# The dependent job is only cancelled by hand when kill_invalid_depend
	# is off (presumably the controller removes it otherwise - confirm).
	if { !$kill_invalid_depend } {
		cancel_job $dependent $c1
	}
	# The first dependee was never cancelled above; stop it now.
	cancel_job $dependee_a $c1
}
| |
# test_remote_and_dependencies - AND (","-separated) dependencies where one
# dependee runs on a different cluster than the dependent job.
#
# The first dependee and the dependent job are submitted to $c1, the second
# dependee to $c2.
#
# Part 1: two afternotok dependencies. After the $c1 dependee is cancelled
#         only the $c2 entry is expected to remain in the dependency string;
#         after the $c2 dependee is cancelled the reason must become "None"
#         and the dependent job must reach RUNNING on $c1.
# Part 2: two afterok dependencies. Cancelling only the $c2 dependee is
#         expected to yield "DependencyNeverSatisfied" while the $c1 entry
#         is still reported as "unfulfilled".
#
# Globals read: c1, c2, file_in_long, kill_invalid_depend
proc test_remote_and_dependencies { } {
	global c1 c2 file_in_long kill_invalid_depend

	set rule "#############################################################"
	log_info $rule
	log_info "# Test remote AND dependencies"
	log_info $rule

	set on_c1 "-M$c1"
	set on_c2 "-M$c2"

	# ---- Part 1: every entry must be fulfilled ----
	log_info "AND dependencies: Test that the dependency isn't fulfilled until all dependencies are fulfilled:"
	set local_dependee  [submit_job $on_c1 $c1 $file_in_long]
	set remote_dependee [submit_job $on_c2 $c2 $file_in_long]
	set dep_spec "afternotok:${local_dependee},afternotok:${remote_dependee}"
	set dependent [submit_job "--depend=$dep_spec $on_c1" $c1 $file_in_long]

	foreach {id state cluster} [list \
			$local_dependee  "RUNNING" $c1 \
			$remote_dependee "RUNNING" $c2 \
			$dependent       "PENDING" $c1] {
		my_wait_for_fed_job $id $state $cluster
	}
	wait_for_depend $dependent "Dependency" \
		[format "afternotok:%s(unfulfilled),afternotok:%s(unfulfilled)" \
			$local_dependee $remote_dependee]

	# Once the local dependee is cancelled its entry is expected to
	# disappear from the dependency string, leaving only the remote one.
	cancel_job $local_dependee $c1
	wait_for_depend $dependent "Dependency" \
		[format "afternotok:%s(unfulfilled)" $remote_dependee]
	cancel_job $remote_dependee $c2
	wait_for_depend $dependent "None" "(null)"
	my_wait_for_fed_job $dependent "RUNNING" $c1
	cancel_job $dependent $c1

	# ---- Part 2: one failed entry fails the whole list ----
	log_info "AND dependencies: Test that the whole dependency fails when a single dependency fails:"
	set local_dependee  [submit_job $on_c1 $c1 $file_in_long]
	set remote_dependee [submit_job $on_c2 $c2 $file_in_long]
	set dep_spec "afterok:${local_dependee},afterok:${remote_dependee}"
	set dependent [submit_job "--depend=$dep_spec \
		$on_c1" $c1 $file_in_long]

	foreach {id state cluster} [list \
			$local_dependee  "RUNNING" $c1 \
			$remote_dependee "RUNNING" $c2 \
			$dependent       "PENDING" $c1] {
		my_wait_for_fed_job $id $state $cluster
	}

	set status_fmt "afterok:%s(%s),afterok:%s(%s)"
	wait_for_depend $dependent "Dependency" \
		[format $status_fmt $local_dependee unfulfilled \
			$remote_dependee unfulfilled]

	cancel_job $remote_dependee $c2
	wait_for_depend $dependent "DependencyNeverSatisfied" \
		[format $status_fmt $local_dependee unfulfilled \
			$remote_dependee failed]

	# The dependent job is only cancelled by hand when kill_invalid_depend
	# is off (presumably the controller removes it otherwise - confirm).
	if { !$kill_invalid_depend } {
		cancel_job $dependent $c1
	}
	# The local dependee was never cancelled above; stop it now.
	cancel_job $local_dependee $c1
}
| |
# Prerequisite: the test only runs with slurmdbd as accounting storage.
# The message names the parameter that is actually checked
# (AccountingStorageType) so the skip reason can be looked up in slurm.conf.
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}

if {[check_federation_setup]} {
	# Federated run: all clusters have to be up before the federation is
	# rebuilt.
	if {![check_federation_up]} {
		skip "All of the clusters in the federation must be up"
	}

	# Remove any federation left over under this name, then recreate it.
	delete_federations $fed_name
	# Braced condition: the helper's result is evaluated once by expr
	# instead of being substituted into the expression text first.
	if {[setup_federation $fed_name]} {
		fail "Unable to set up federation ($fed_name)"
	}
} else {
	# Non-federated fallback: point the cluster name and the scancel /
	# scontrol commands used by this test at the local cluster's defaults.
	log_warn "Not running remote dependency tests"
	set c1 [get_config_param "ClusterName"]
	set all_clusters "$c1"
	set my_scancel $scancel
	set my_scontrol $scontrol
}
| |
# cleanup - undo what this test set up.
#
# Runs three steps in order: delete the federation named $fed_name, cancel
# the federation jobs, and remove the two generated job scripts.
#
# Globals read: fed_name, bin_rm, file_in_long, file_in_short
proc cleanup {} {
	global fed_name
	global bin_rm
	global file_in_long file_in_short

	# Federation teardown first, then any jobs still queued or running.
	delete_federations $fed_name
	cancel_federation_jobs

	# -f keeps rm quiet when a script was never created.
	exec $bin_rm -f $file_in_long $file_in_short
}
| |
###############################################################################
# Begin test
###############################################################################

# Two job scripts are generated: the short one (5 second sleep) is for
# subtests that have to wait for a job to end, the long one (60 second
# sleep) is used everywhere else.
make_bash_script $file_in_long "$bin_sleep 60"
make_bash_script $file_in_short "$bin_sleep 5"

# Record which DependencyParameters options are set; the subtests read these
# flags as globals.
set kill_invalid_depend [param_contains \
	[get_config_param "DependencyParameters"] "kill_invalid_depend"]
set disable_remote_singleton [param_contains \
	[get_config_param "DependencyParameters"] "disable_remote_singleton"]
log_info "kill_invalid_depend: $kill_invalid_depend; disable_remote_singleton: $disable_remote_singleton\n"

# Start without leftover jobs.
cancel_federation_jobs

# Local subtests, run in this order.
# Not covered here: --depend=afterburstbuffer (tested in test35.6) and
# --depend=expand (tested in another test).
foreach dep_subtest {
	test_local_after
	test_local_afterany
	test_local_aftercorr
	test_local_afterok
	test_local_afternotok
	test_local_singleton
	test_local_or_dependencies
	test_local_and_dependencies
} {
	$dep_subtest
}

# Remote subtests need a federation; without one the test is reported as
# skipped at this point.
if {[check_federation_setup]} {
	foreach dep_subtest {
		test_remote_after
		test_remote_afterany
		test_remote_aftercorr
		test_remote_afterok
		test_remote_afternotok
		test_remote_singleton
		test_remote_or_dependencies
		test_remote_and_dependencies
		test_add_remove_clusters
		test_submit_to_all_clusters
	} {
		$dep_subtest
	}
} else {
	skip "Subtests with federations cannot be run"
}