| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test removing clusters from federation |
| # |
| # Reqs: 1. Using slurmdbd accounting storage type and is up |
| # 2. fed_slurm_base is defined in globals.local - set to directory that |
| # has access to each federation configure (fedc1, fedc2, fedc3). |
| # Eg. |
| # fedr/slurm/ (src) |
| # fedr/fed1/bin |
| # fedr/fed1/sbin |
| # fedr/fed1/etc |
| # fedr/fed1/... |
| # fedr/fed2/... |
| # fedr/fed3/... |
| # 3. controllers are up and running. |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| source ./globals |
| source ./globals_accounting |
| source ./globals_federation |
| |
# Name of the federation this test creates, modifies and deletes.
set fed_name "feda"
# Set to the invoking user's name once the prerequisites have been checked.
set user_name ""
set srun_job_cnt 0

# Client commands are taken from a specific cluster's install tree so that the
# cluster a job is submitted from is known. Everything comes from $fedc1
# except srun, which is taken from $fedc2.
set my_sbatch "${fed_slurm_base}/$fedc1/bin/sbatch"
set my_squeue "${fed_slurm_base}/$fedc1/bin/squeue"
set my_sacctmgr "${fed_slurm_base}/$fedc1/bin/sacctmgr"
set my_srun "${fed_slurm_base}/$fedc2/bin/srun"
set my_sacct "${fed_slurm_base}/$fedc1/bin/sacct"
set my_scontrol "${fed_slurm_base}/$fedc1/bin/scontrol"

#
# Check accounting config and bail if not found.
#

# The message names the parameter that is actually checked
# (AccountingStorageType) so the skip reason can be looked up in slurm.conf.
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}

if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}
| |
| |
#
# Give each federated cluster its own cluster feature:
#   $fedc1 -> fa, $fedc2 -> fb, $fedc3 -> fc
#
# For every cluster this runs "sacctmgr -i modify cluster <c> set features=<f>"
# and requires four pieces of output: the "Setting" header, the
# "Feature = <f>" line, the "Modified cluster..." header and the cluster name.
# The test fails if sacctmgr times out or any of those pieces is missing.
#
proc setup_cluster_features {} {
	global sacctmgr fedc1 fedc2 fedc3 eol

	# One pass per (cluster, feature) pair instead of three pasted stanzas.
	foreach {cluster feature} [list $fedc1 fa $fedc2 fb $fedc3 fc] {
		set matches 0
		spawn $sacctmgr -i modify cluster $cluster set features=$feature
		expect {
			-re "Setting$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Feature\\s+=\\s+$feature" {
				incr matches
				exp_continue
			}
			-re "Modified cluster...$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+$cluster$eol" {
				incr matches
				exp_continue
			}
			timeout {
				fail "sacctmgr mod not responding"
			}
			eof {
				wait
			}
		}
		if {$matches != 4} {
			fail "Unexpected error setting features=$feature on $cluster (got $matches of 4 expected matches)"
		}
	}
}
| |
#
# Send "I'm Alive" to a job with "scontrol notify" and check that the srun
# attached to that job prints it.
#
# fed    - cluster passed to scontrol via -M
# job_id - job to notify
# index  - key into srun_spawn_id selecting the srun session to listen on
# regex  - pattern the srun session must print
#
# Fails the test if the srun session times out or closes without printing
# something that matches regex.
#
proc scontrol { fed job_id index regex } {
	global my_scontrol eol srun_spawn_id

	log_debug "Executing $my_scontrol -M$fed notify $job_id I'm Alive"
	log_debug "Output: [exec $my_scontrol -M$fed notify $job_id I'm Alive]"

	# Listen on the srun session stored under this index, not on whatever
	# was spawned most recently.
	set spawn_id $srun_spawn_id($index)

	set seen 0
	expect {
		-re $regex {
			set seen 1
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	if {!$seen} {
		fail "srun failure (expected $regex)"
	}
}
| |
#
# Submit a 10-node exclusive batch job that wraps "sleep 300", using the
# sbatch binary currently named by the global my_sbatch.
#
# options - extra sbatch options as a single string (may be empty); it is
#           split into words the same way a Tcl list is
#
# Returns the job id parsed from "Submitted batch job <id>".
# Fails the test if sbatch times out or never prints that line.
#
proc sbatch { options } {
	global number my_sbatch

	set matches 0
	set job_id 0
	# Build argv as a list so each argument stays one word without relying
	# on string quoting.
	set command [list $my_sbatch -N10 --exclusive -o/dev/null {*}$options --wrap "sleep 300"]
	set regex "Submitted batch job ($number)"
	spawn {*}$command
	expect {
		-re $regex {
			incr matches
			set job_id $expect_out(1,string)
			# Keep reading until eof so the sbatch child is reaped by
			# the wait below instead of being left behind.
			exp_continue
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Batch submit failure (expected $regex)"
	}

	return $job_id
}
| |
#
# Launch "srun --exclusive -N10 <options> sleep 300" using the srun binary
# currently named by the global my_srun, and remember its session.
#
# options - extra srun options as a single string (may be empty)
# index   - key under which the spawn id is stored in srun_spawn_id so the
#           session can be read again later
#
# Returns the first number srun prints (presumably the job id from its
# "queued"/"allocated" message - confirm against srun output).
# Fails the test if srun times out or exits before printing a number.
# On success the session is left open; nothing waits for eof.
#
proc srun { options index } {
	global number srun_spawn_id bin_sleep my_srun

	set regex "($number)"
	set command "$my_srun --exclusive -N10 $options sleep 300"

	spawn {*}$command
	set srun_spawn_id($index) $spawn_id

	set job_id 0
	set found 0
	expect {
		-re $regex {
			set found 1
			set job_id $expect_out(1,string)
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	if {!$found} {
		fail "srun failure (expected $regex)"
	}

	return $job_id
}
| |
#
# Run squeue against one cluster and count how often regex matches its output.
#
# fed   - cluster passed to squeue via -M
# m     - number of matches required
# regex - pattern to count; callers pass an alternation with one branch per
#         expected job line
#
# squeue prints: compact state, name, job id, viable siblings, active
# siblings. Fails the test if squeue times out or the match count is not m.
#
proc squeue { fed m regex } {
	global my_squeue

	set matches 0
	set command "$my_squeue -Ostatecompact:.4,name:.10,jobid:.10,siblingsviable:.20,siblingsactive:.20 -a -M$fed"
	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			exp_continue
		}
		timeout {
			# Report a hung squeue as such, not as a count mismatch.
			fail "squeue not responding"
		}
		eof {
			# Reap the child, as the other helpers in this file do.
			wait
		}
	}
	if {$matches != $m} {
		fail "Unexpected error in squeue (expected $regex: Matched $matches/$m times)"
	}
}
| |
#
# Cancel every job owned by $user_name on all three federated clusters, then
# pause five seconds before returning.
#
proc cancel_federation_jobs { } {
	global scancel user_name fedc1 fedc2 fedc3

	set all_clusters [join [list $fedc1 $fedc2 $fedc3] ","]
	spawn $scancel -M$all_clusters --user $user_name
	expect eof { wait }

	sleep 5
}
#
# Cancel a single job.
#
# job_id  - job to cancel
# options - passed to scancel as one argument ahead of the job id
#           (callers pass "-M<cluster>")
#
proc cancel_job { job_id options } {
	global scancel

	spawn $scancel $options $job_id
	expect eof { wait }
}
| |
#
# Return the clusters to a clean state: cancel this user's jobs on every
# federated cluster, remove the test's "*.out" files, and delete the test
# federation.
#
proc cleanup { } {
	# The original list contained a stray "global" token, which declared a
	# variable literally named "global"; only real variables are listed now.
	global fed_name bin_rm test_name bin_bash

	cancel_federation_jobs
	# Run through bash so the shell expands the glob.
	exec $bin_bash -c "$bin_rm -f $test_name*.out"

	delete_federations $fed_name
}
| |
# Start test

# Both prerequisites are checked in order; the first one that does not hold
# skips the test with its own reason.
foreach {fed_prereq fed_prereq_msg} {
	check_federation_setup
	"This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
	check_federation_up
	"This test can't be run without all clusters up"
} {
	if {![$fed_prereq]} {
		skip $fed_prereq_msg
	}
}

# cancel_federation_jobs (used by cleanup) needs to know whose jobs to cancel.
set user_name [get_my_user_name]

# Remove existing setup
cleanup

# Add clusters to federation
if {[setup_federation $fed_name]} {
	fail "Failed to setup federation"
}
setup_cluster_features
| |
log_info "################################################################"
log_info "Test sacctmgr remove federation"
log_info "################################################################"

cancel_federation_jobs

sleep 5

set srun_spawn_id(0) 0

# Fill each cluster with one running job: j(0) and j(1) are pinned with -M,
# j(2) is unconstrained and is expected to run on the remaining cluster.
foreach {job_idx job_opts job_cluster} [list \
	0 -M$fedc1 $fedc1 \
	1 -M$fedc2 $fedc2 \
	2 ""       $fedc3 \
] {
	set j($job_idx) [sbatch $job_opts]
	wait_for_fed_job $j($job_idx) RUNNING $job_cluster
}

# Jobs restricted to a subset of the clusters through cluster features.
foreach {job_idx job_opts} {
	6  --cluster-constraint=fb,fc
	7  --cluster-constraint=fb,fc
	8  --cluster-constraint=fb,fc
	9  --cluster-constraint=fb
	10 --cluster-constraint=fa
} {
	set j($job_idx) [sbatch $job_opts]
}

# Unconstrained jobs, one submitted from each cluster's own sbatch: j(3) from
# $fedc1 (my_sbatch is still the $fedc1 binary), j(4) from $fedc2, j(5) from
# $fedc3.
set j(3) [sbatch ""]

foreach {job_idx job_origin} [list 4 $fedc2 5 $fedc3] {
	set my_sbatch "${fed_slurm_base}/$job_origin/bin/sbatch"
	set j($job_idx) [sbatch ""]
}

srun "" 0

sleep 5

# Verify jobs are where they should be
# O:  origin
# R:  running
# PD: Pending
# RV: Revoked
#
#     fed1          fed2          fed3
#     j(0)O:R
#                   j(1)O:R
#     j(2)O:RV                    j(2)R
#     j(3)O:PD      j(3)PD        j(3)PD
#     j(4)PD        j(4)O:PD      j(4)PD
#     j(5)PD        j(5)PD        j(5)O:PD
#
#     j(6)O:RV      j(6)PD        j(6)PD
#     j(7)O:RV      j(7)PD        j(7)PD
#     j(8)O:RV      j(8)PD        j(8)PD
#     j(9)O:RV      j(9)PD
#     j(10)O:PD
#
#     srun:PD       srun:O:PD     srun:PD

# Column fragments: a single cluster name, "NA", and the srun job's name.
set f1 "\\s+$fedc1"
set f2 "\\s+$fedc2"
set f3 "\\s+$fedc3"
set n "\\s+NA"
set srun_job "sleep\\s+\\d+"

# Sibling lists: all three clusters, and the fb/fc pair.
set sib "\\s+$fedc1,$fedc2,$fedc3"
set sib2 "\\s+$fedc2,$fedc3"

# One alternation branch per expected squeue line:
# <job id><viable siblings><active siblings>.
set r1 [join [list \
	"$j(0)$f1$f1" \
	"$j(2)$sib$f3" \
	"$j(3)$sib$sib" \
	"$j(4)$sib$n" \
	"$j(5)$sib$n" \
	"$srun_job$sib$n" \
	"$j(6)$sib2$sib2" \
	"$j(7)$sib2$sib2" \
	"$j(8)$sib2$sib2" \
	"$j(9)$f2$f2" \
	"$j(10)$f1$f1" \
] "|"]
set r2 [join [list \
	"$j(1)$f2$f2" \
	"$j(3)$sib$n" \
	"$j(4)$sib$sib" \
	"$j(5)$sib$n" \
	"$srun_job$sib$sib" \
	"$j(6)$sib2$n" \
	"$j(7)$sib2$n" \
	"$j(8)$sib2$n" \
	"$j(9)$f2$n" \
] "|"]
set r3 [join [list \
	"$j(2)$sib$f3" \
	"$j(3)$sib$n" \
	"$j(4)$sib$n" \
	"$j(5)$sib$sib" \
	"$srun_job$sib$n" \
] "|"]

squeue $fedc1 11 $r1
squeue $fedc2 9 $r2
squeue $fedc3 5 $r3
| |
log_info "################################################################"
log_info "Remove $fedc1"
log_info "################################################################"
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc1

sleep 5

# Verify:
# 1. If the origin is removed, pending jobs are left on the origin cluster and
#    removed from the non-origin clusters.
# 2. If a sibling job is on a non-origin sibling that is the only viable
#    sibling, the job becomes non-federated and remains pending, and the
#    origin tracking job is removed.
# 3. If a job has multiple viable siblings and the origin is not one of them,
#    the non-origin jobs stay federated and the origin tracking job is
#    removed. These jobs will schedule amongst themselves to start.
# 4. Non-origin running jobs are still running and no longer have federation
#    information (e.g. viable and active siblings).
# 5. Non-origin jobs have the removed cluster removed from their viable and
#    active siblings.
# 6. Pending sruns don't get a terminated message when siblings are revoked.
# 7. Jobs schedule amongst the siblings when the origin is removed.
# 8. Jobs run when they have only one viable sibling and the origin is removed.
#
#     fed1          fed2          fed3
#     j(0)O:R
#                   j(1)O:R
#                                 j(2)R
#     j(3)O:PD
#                   j(4)O:PD      j(4)PD
#                   j(5)PD        j(5)O:PD
#                   srun:O:PD     srun:PD
#
#                   j(6)PD        j(6)PD
#                   j(7)PD        j(7)PD
#                   j(8)PD        j(8)PD
#                   j(9)PD
#     j(10)O:PD

set sib "\\s+$fedc2,$fedc3"

set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(1)$f2$f2|$j(4)$sib$sib|$j(5)$sib$n|$srun_job$sib$sib|$j(6)$sib$n|$j(7)$sib$n|$j(8)$sib$n|$j(9)$n$n"
set r3 "$j(2)$n$n|$j(4)$sib$n|$j(5)$sib$sib|$srun_job$sib$n|$j(6)$sib$n|$j(7)$sib$n|$j(8)$sib$n"

squeue "$fedc1" "3" $r1
squeue "$fedc2" "8" $r2
squeue "$fedc3" "7" $r3

# Check the database.
# Test that job4 is marked REVOKED on the removed cluster (fed1) and still
# pending on the other clusters.
set matches 0

set r1 "REVOKED$f1"
set r2 "PENDING$f2"
set r3 "PENDING$f3"

spawn $my_sacct -o state,cluster -j $j(4) -M$fedc1,$fedc2,$fedc3 -D
expect {
	# Match on $r3, not the bare cluster fragment $f3, so the fed3 record
	# only counts when it is in the expected state.
	-re "$r1|$r2|$r3" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if {$matches != 3} {
	fail "Unexpected error in sacct ($matches != 3)"
}

# Test that job3 is marked REVOKED on the sibling clusters (fed2,fed3) and
# pending on the origin cluster.
set matches 0

set r1 "PENDING$f1"
set r2 "REVOKED$f2"
set r3 "REVOKED$f3"

spawn $my_sacct -o state,cluster -j $j(3) -M$fedc1,$fedc2,$fedc3 -D
expect {
	# Same fix as above: require the state on fed3 as well.
	-re "$r1|$r2|$r3" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if {$matches != 3} {
	fail "Unexpected error in sacct ($matches != 3)"
}


# Cancel running job on fed2 so that j(6) will schedule amongst fed2 and fed3
# to start the job.
cancel_job $j(1) "-M$fedc2"

set run_cluster [wait_for_fed_job $j(6) RUNNING $fedc2]
if {$run_cluster eq ""} {
	fail "Didn't find running job on cluster"
}
| |
| |
| |
log_info "################################################################"
log_info "Remove $fedc2"
log_info "################################################################"
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc2

sleep 5

# Verify:
#     fed1          fed2          fed3
#     j(0)O:R
#                   j(6)R
#                                 j(2)R
#     j(3)O:PD
#                   j(4)O:PD
#                                 j(5)O:PD
#                   srun:O:PD
#
#                                 j(7)PD
#                                 j(8)PD
#                   j(9)PD
#     j(10)O:PD

# The pending srun must not have been told its job was terminated. Watch its
# session with a one second timeout, then put the timeout back.
set old_timeout $timeout
set timeout 1
set matches 0
set spawn_id $srun_spawn_id(0)
expect {
	-re "srun: Force Terminated job $number" {
		incr matches
		exp_continue
	}
	eof {
	}
}
if {$matches > 0} {
	fail "srun got signaled when it shouldn't have"
}
set timeout $old_timeout

# One alternation branch per expected squeue line:
# <job id><viable siblings><active siblings>.
set r1 [join [list "$j(0)$n$n" "$j(3)$n$n" "$j(10)$n$n"] "|"]
set r2 [join [list "$j(6)$n$n" "$j(4)$n$n" "$srun_job$n$n" "$j(9)$n$n"] "|"]
set r3 [join [list "$j(2)$n$n" "$j(5)$f3$f3" "$j(7)$f3$n" "$j(8)$f3$n"] "|"]

squeue $fedc1 3 $r1
squeue $fedc2 4 $r2
squeue $fedc3 4 $r3


# Cancel the running job on fed3 so that j(7) will be scheduled on fed3.
# j(7) is still considered a federated job but only has one sibling.
cancel_job $j(2) "-M$fedc3"

set run_cluster [wait_for_fed_job $j(7) RUNNING $fedc3]
if {$run_cluster eq ""} {
	fail "Didn't find running job on cluster"
}
| |
log_info "################################################################"
log_info "Remove $fedc3"
log_info "################################################################"
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc3

sleep 5

# Verify:
#     fed1          fed2          fed3
#     j(0)O:R
#                   j(6)R
#     j(3)O:PD
#                   j(4)O:PD
#                                 j(5)O:PD
#                   srun:O:PD
#
#                                 j(7)R
#                                 j(8)PD
#                   j(9)PD
#     j(10)O:PD

# Every remaining job is expected to show "NA" for both sibling columns.
set r1 [join [list "$j(0)$n$n" "$j(3)$n$n" "$j(10)$n$n"] "|"]
set r2 [join [list "$j(6)$n$n" "$j(4)$n$n" "$srun_job$n$n" "$j(9)$n$n"] "|"]
set r3 [join [list "$j(5)$n$n" "$j(7)$n$n" "$j(8)$n$n"] "|"]

squeue $fedc1 3 $r1
squeue $fedc2 4 $r2
squeue $fedc3 3 $r3
| |
log_info "################################################################"
log_info "Test revoked origin jobs are removed."
log_info "################################################################"

# Start from scratch:
#  - start jobs on all clusters,
#  - remove the non-origin clusters,
#  - running jobs on the non-origin clusters should remain running,
#  - revoked tracking jobs on the origin should go away.
cancel_federation_jobs

exec $my_sacctmgr mod fed $fed_name set -i clusters=$fedc1,$fedc2,$fedc3

# Test whether origin tracking jobs are removed when the remote cluster is.

# All sruns in this section are submitted from fedc1.
set my_srun "${fed_slurm_base}/$fedc1/bin/srun"

set j(0) [srun -M$fedc1 1]
wait_for_fed_job $j(0) RUNNING $fedc1
set j(1) [srun "" 2]
set clus1 [wait_for_fed_job $j(1) RUNNING $fedc2,$fedc3]
set j(2) [srun "" 3]
set clus2 [wait_for_fed_job $j(2) RUNNING $fedc3,$fedc2]

# clus1 and clus2 hold whatever wait_for_fed_job returned for j(1) and j(2).
# Expected on fedc1 (the only cluster queried below):
#   j(0)  active sibling fedc1
#   j(1)  active sibling clus1
#   j(2)  active sibling clus2

set f1 "\\s*$fedc1"
set f2 "\\s*$clus1"
set f3 "\\s*$clus2"
set n "\\s*NA"

# Three cluster names drawn from fedc1/clus1/clus2, each with optional commas.
set sib "\\s*($fedc1,*|$clus1,*|$clus2,*){3}"

set r1 [join [list "$j(0)$f1$f1" "$j(1)$sib$f2" "$j(2)$sib$f3"] "|"]

squeue $fedc1 3 $r1

# Remove clus1: j(1) is no longer expected in fedc1's queue.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$clus1

sleep 5

set sib "\\s*$fedc1,$clus2"

set r1 [join [list "$j(0)$f1$f1" "$j(2)$sib$f3"] "|"]

squeue $fedc1 2 $r1

# Remove clus2: only j(0) is expected in fedc1's queue.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$clus2

sleep 5

set r1 "$j(0)$f1$f1"

squeue $fedc1 1 $r1

spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc1

# Make sure none of the jobs (sruns) have terminated: each one must still
# relay a notify message.
foreach {alive_cluster alive_job alive_idx} [list \
	$fedc1 $j(0) 1 \
	$clus1 $j(1) 2 \
	$clus2 $j(2) 3 \
] {
	scontrol $alive_cluster $alive_job $alive_idx "srun: I'm Alive"
}