#!/usr/bin/env expect
############################################################################
# Purpose: Test removing clusters from federation
#
# Reqs:    1. Using slurmdbd accounting storage type and it is up.
#          2. fed_slurm_base is defined in globals.local - set to the
#             directory that has access to each federation cluster's
#             install tree (fedc1, fedc2, fedc3).
#             E.g.
#               fedr/slurm/ (src)
#               fedr/fed1/bin
#               fedr/fed1/sbin
#               fedr/fed1/etc
#               fedr/fed1/...
#               fedr/fed2/...
#               fedr/fed3/...
#          3. Controllers are up and running.
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
# Test-wide settings.
set fed_name     "feda"
set user_name    ""
set srun_job_cnt 0

# Client commands, taken from the per-cluster install trees below
# fed_slurm_base. Everything comes from the first cluster's bin directory
# except srun, which comes from the second cluster's.
set my_sbatch   [join [list $fed_slurm_base $fedc1 bin sbatch]   "/"]
set my_squeue   [join [list $fed_slurm_base $fedc1 bin squeue]   "/"]
set my_sacctmgr [join [list $fed_slurm_base $fedc1 bin sacctmgr] "/"]
set my_sacct    [join [list $fed_slurm_base $fedc1 bin sacct]    "/"]
set my_scontrol [join [list $fed_slurm_base $fedc1 bin scontrol] "/"]
set my_srun     [join [list $fed_slurm_base $fedc2 bin srun]     "/"]

#
# Skip the test unless accounting goes through slurmdbd and the caller is
# an accounting administrator.
#
if {![string equal [get_config_param "AccountingStorageType"] "accounting_storage/slurmdbd"]} {
	skip "This test can't be run without a usable AccountStorageType"
}
if {![string equal [get_admin_level] "Administrator"]} {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}
# Give each federation cluster its own feature (fedc1 -> fa, fedc2 -> fb,
# fedc3 -> fc) with "sacctmgr -i modify cluster ... set features=...".
#
# For every cluster the sacctmgr output must produce exactly four matches:
# the "Setting" line, the "Feature = <name>" line, the "Modified cluster..."
# line and the line naming the cluster. Any other count, or a timeout,
# fails the test.
proc setup_cluster_features {} {
	global sacctmgr fedc1 fedc2 fedc3 eol

	foreach {cluster feature} [list $fedc1 fa $fedc2 fb $fedc3 fc] {
		set matches 0
		spawn $sacctmgr -i modify cluster $cluster set features=$feature
		expect {
			-re "Setting$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Feature\\s+=\\s+$feature" {
				incr matches
				exp_continue
			}
			-re "Modified cluster...$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+$cluster$eol" {
				incr matches
				exp_continue
			}
			timeout {
				fail "sacctmgr mod not responding"
			}
			eof {
				wait
			}
		}
		if {$matches != 4} {
			fail "Unexpected error (got $matches)"
		}
	}
}
# Send "I'm Alive" to a job with "scontrol notify" and check that the
# matching srun session prints the expected text.
#
# fed    - cluster passed to scontrol via -M
# job_id - job to notify
# index  - key into srun_spawn_id selecting the srun session to read from
# regex  - pattern that must show up in that srun's output
proc scontrol { fed job_id index regex } {
	global my_scontrol eol srun_spawn_id

	log_debug "Executing $my_scontrol -M$fed notify $job_id I'm Alive"
	set notify_output [exec $my_scontrol -M$fed notify $job_id I'm Alive]
	log_debug "Output: $notify_output"

	# Read from the srun that was started earlier, not from a new process.
	set spawn_id $srun_spawn_id($index)
	set seen 0
	expect {
		-re "$regex" {
			set seen 1
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}
	if {!$seen} {
		fail "srun failure (expected $regex)"
	}
}
# Submit an exclusive 10-node batch job that wraps "sleep 300", using the
# sbatch binary currently stored in my_sbatch.
#
# options - extra sbatch arguments, inserted before --wrap
#
# Returns the job id parsed from "Submitted batch job <id>". The test fails
# if sbatch times out or never prints that line.
proc sbatch { options } {
	global number my_sbatch

	set command "$my_sbatch -N10 --exclusive -o/dev/null $options --wrap \"sleep 300\""
	set regex "Submitted batch job ($number)"
	set job_id 0
	set submitted 0

	spawn {*}$command
	expect {
		-re "$regex" {
			set job_id $expect_out(1,string)
			set submitted 1
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}
	if {!$submitted} {
		fail "Batch submit failure (expected $regex)"
	}
	return $job_id
}
# Start an exclusive 10-node "srun ... sleep 300" with the srun binary
# currently stored in my_srun, and remember its spawn id for later reads.
#
# options - extra srun arguments, inserted before the sleep command
# index   - key under which the spawn id is stored in srun_spawn_id
#
# Returns the first number seen in srun's output, which the callers use as
# the job id. The test fails if srun times out or prints no number.
proc srun { options index } {
	global number srun_spawn_id my_srun

	set command "$my_srun --exclusive -N10 $options sleep 300"
	set regex "($number)"
	set job_id 0
	set started 0

	spawn {*}$command
	set srun_spawn_id($index) $spawn_id
	expect {
		-re "$regex" {
			set job_id $expect_out(1,string)
			set started 1
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}
	if {!$started} {
		fail "srun failure (expected $regex)"
	}
	return $job_id
}
# Run squeue against one cluster and count how many times the output
# matches a pattern.
#
# fed   - cluster passed to squeue via -M
# m     - number of matches that must be found
# regex - pattern applied to the squeue output; the output columns are
#         compact state, name, job id, viable siblings, active siblings
#
# The test fails if squeue stops responding or the match count is not m.
proc squeue { fed m regex } {
	global my_squeue

	set matches 0
	set command "$my_squeue -Ostatecompact:.4,name:.10,jobid:.10,siblingsviable:.20,siblingsactive:.20 -a -M$fed"
	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != $m} {
		fail "Unexpected error in squeue (expected $regex: Matched $matches/$m times)"
	}
}
# Cancel every job owned by user_name on all three federation clusters,
# then pause for five seconds before returning.
proc cancel_federation_jobs { } {
	global scancel user_name fedc1 fedc2 fedc3

	set clusters [join [list $fedc1 $fedc2 $fedc3] ","]
	spawn $scancel -M$clusters --user $user_name
	expect eof {
		wait
	}
	sleep 5
}
# Cancel a single job with scancel.
#
# job_id  - job to cancel
# options - one extra scancel argument placed before the job id
#           (the callers pass a -M<cluster> selector)
proc cancel_job { job_id options } {
	global scancel

	spawn $scancel $options $job_id
	expect eof {
		wait
	}
}
# Return the system to a clean state: cancel the user's jobs on every
# federation cluster, delete this test's *.out files and remove the test
# federation.
proc cleanup { } {
	global fed_name bin_rm test_name bin_bash

	cancel_federation_jobs
	# Go through bash so that the *.out glob gets expanded.
	exec $bin_bash -c "$bin_rm -f $test_name*.out"
	delete_federations $fed_name
}
# Start test
# Skip unless the federation environment is configured and every cluster
# answers.
if {![check_federation_setup]} {
skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}
if {![check_federation_up]} {
skip "This test can't be run without all clusters up"
}
# cancel_federation_jobs (called from cleanup) cancels by user name, so it
# has to be known before the first cleanup.
set user_name [get_my_user_name]
# Remove existing setup
cleanup
# Add clusters to federation
# A true result from setup_federation is treated as failure.
if {[setup_federation $fed_name]} {
fail "Failed to setup federation"
}
# Tag the clusters with features fa/fb/fc; the --cluster-constraint
# submissions below rely on them.
setup_cluster_features
log_info "################################################################"
log_info "Test sacctmgr remove federation"
log_info "################################################################"
# Begin with no jobs of ours on any cluster.
cancel_federation_jobs
sleep 5
set srun_spawn_id(0) 0
# Each job asks for 10 nodes exclusively (see proc sbatch). j(0) and j(1)
# are pinned to fed1 and fed2 with -M; j(2) has no cluster selection and is
# expected to start on fed3.
set j(0) [sbatch -M$fedc1]
wait_for_fed_job $j(0) RUNNING $fedc1
set j(1) [sbatch -M$fedc2]
wait_for_fed_job $j(1) RUNNING $fedc2
set j(2) [sbatch ""]
wait_for_fed_job $j(2) RUNNING $fedc3
# Jobs restricted by cluster feature (fa/fb/fc were assigned to
# fed1/fed2/fed3 by setup_cluster_features). All are submitted through
# fed1's sbatch.
set j(6) [sbatch "--cluster-constraint=fb,fc"]
set j(7) [sbatch "--cluster-constraint=fb,fc"]
set j(8) [sbatch "--cluster-constraint=fb,fc"]
set j(9) [sbatch "--cluster-constraint=fb"]
set j(10) [sbatch "--cluster-constraint=fa"]
# Unconstrained jobs, one submitted through each cluster's sbatch so that
# each cluster is the origin of one of them.
set j(3) [sbatch ""]
set my_sbatch "${fed_slurm_base}/$fedc2/bin/sbatch"
set j(4) [sbatch ""]
set my_sbatch "${fed_slurm_base}/$fedc3/bin/sbatch"
set j(5) [sbatch ""]
# An srun started with fed2's srun binary; its spawn id is stored in
# srun_spawn_id(0) and read again after fed2 is removed.
srun "" 0
sleep 5
# Verify jobs are where they should be
# O: origin
# R: running
# PD: Pending
# RV: Revoked
#
#   fed1          fed2          fed3
#   j(0)O:R
#                 j(1)O:R
#   j(2)O:RV                    j(2)R
#   j(3)O:PD      j(3)PD        j(3)PD
#   j(4)PD        j(4)O:PD      j(4)PD
#   j(5)PD        j(5)PD        j(5)O:PD
#
#   j(6)O:RV      j(6)PD        j(6)PD
#   j(7)O:RV      j(7)PD        j(7)PD
#   j(8)O:RV      j(8)PD        j(8)PD
#   j(9)O:RV      j(9)PD
#   j(10)O:PD
#
#   srun:PD       srun:O:PD     srun:PD
#
# Every alternative in r1/r2/r3 is "<job id><viable siblings><active
# siblings>", following the column order requested in proc squeue. "NA"
# is matched where no sibling value is expected.
set f1 "\\s+$fedc1"
set f2 "\\s+$fedc2"
set f3 "\\s+$fedc3"
set n "\\s+NA"
# The srun job is recognised by its name ("sleep") followed by its job id.
set srun_job "sleep\\s+\\d+"
set sib "\\s+$fedc1,$fedc2,$fedc3"
set sib2 "\\s+$fedc2,$fedc3"
set r1 "$j(0)$f1$f1|$j(2)$sib$f3|$j(3)$sib$sib|$j(4)$sib$n|$j(5)$sib$n|$srun_job$sib$n|$j(6)$sib2$sib2|$j(7)$sib2$sib2|$j(8)$sib2$sib2|$j(9)$f2$f2|$j(10)$f1$f1"
set r2 "$j(1)$f2$f2|$j(3)$sib$n|$j(4)$sib$sib|$j(5)$sib$n|$srun_job$sib$sib|$j(6)$sib2$n|$j(7)$sib2$n|$j(8)$sib2$n|$j(9)$f2$n"
set r3 "$j(2)$sib$f3|$j(3)$sib$n|$j(4)$sib$n|$j(5)$sib$sib|$srun_job$sib$n"
# The second argument is the number of matches squeue must find.
squeue "$fedc1" "11" $r1
squeue "$fedc2" "9" $r2
squeue "$fedc3" "5" $r3
log_info "################################################################"
log_info "Remove $fedc1"
log_info "################################################################"
# The sacctmgr output is not read; the sleep presumably gives the change
# time to reach the clusters before the checks below.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc1
sleep 5
# Verify:
# 1. If the origin is removed, pending jobs are left on the origin cluster
#    and removed from the non-origin clusters.
# 2. If there is a sibling job on a non-origin sibling and that sibling is
#    the only viable sibling, then that job becomes non-federated and remains
#    pending, and the origin tracking job is removed.
# 3. If a job has multiple viable siblings and the origin is not a viable
#    sibling, then the non-origin jobs stay federated and the origin
#    tracking job is removed. These jobs will schedule amongst themselves
#    to start.
# 4. Non-origin running jobs are still running and don't have federation
#    information (e.g. viable and active siblings).
# 5. Non-origin jobs have the removed cluster removed from the viable and
#    active siblings.
# 6. Pending sruns don't get a terminated message when siblings are
#    revoked.
# 7. Jobs schedule amongst the siblings when the origin is removed.
# 8. Jobs run when they are on the only viable sibling and the origin is
#    removed.
#
#   fed1          fed2          fed3
#   j(0)O:R
#                 j(1)O:R
#                               j(2)R
#   j(3)O:PD
#                 j(4)O:PD      j(4)PD
#                 j(5)PD        j(5)O:PD
#                 srun:O:PD     srun:PD
#
#                 j(6)PD        j(6)PD
#                 j(7)PD        j(7)PD
#                 j(8)PD        j(8)PD
#                 j(9)PD
#   j(10)O:PD
#
# Jobs that are still federated now list only fed2 and fed3 as siblings;
# jobs that dropped out of the federation show NA in both sibling columns.
set sib "\\s+$fedc2,$fedc3"
set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(1)$f2$f2|$j(4)$sib$sib|$j(5)$sib$n|$srun_job$sib$sib|$j(6)$sib$n|$j(7)$sib$n|$j(8)$sib$n|$j(9)$n$n"
set r3 "$j(2)$n$n|$j(4)$sib$n|$j(5)$sib$sib|$srun_job$sib$n|$j(6)$sib$n|$j(7)$sib$n|$j(8)$sib$n"
squeue "$fedc1" "3" $r1
squeue "$fedc2" "8" $r2
squeue "$fedc3" "7" $r3
# Check the accounting database.
#
# Each check below expects exactly three "<state> <cluster>" records, one
# per cluster, and names the required state for every cluster. The third
# alternative must be $r3: matching on the bare cluster name would accept
# any state on fed3.
#
# Test that job4 is marked REVOKED on the removed cluster (fed1) and still
# pending on the other clusters.
set matches 0
set r1 "REVOKED$f1"
set r2 "PENDING$f2"
set r3 "PENDING$f3"
spawn $my_sacct -o state,cluster -j $j(4) -M$fedc1,$fedc2,$fedc3 -D
expect {
	-re "$r1|$r2|$r3" {
		incr matches
		exp_continue
	}
}
if {$matches != 3} {
	fail "Unexpected error in sacct ($matches != 3)"
}
# Test that job3 is marked REVOKED on the sibling clusters (fed2, fed3) and
# pending on the origin cluster.
set matches 0
set r1 "PENDING$f1"
set r2 "REVOKED$f2"
set r3 "REVOKED$f3"
spawn $my_sacct -o state,cluster -j $j(3) -M$fedc1,$fedc2,$fedc3 -D
expect {
	-re "$r1|$r2|$r3" {
		incr matches
		exp_continue
	}
}
if {$matches != 3} {
	fail "Unexpected error in sacct"
}
# Cancel the running job on fed2 so that j(6) will schedule amongst fed2 and
# fed3 to start the job. The wait below requires it to be running on fed2.
cancel_job $j(1) "-M$fedc2"
set run_cluster [wait_for_fed_job $j(6) RUNNING $fedc2]
if {$run_cluster eq ""} {
fail "Didn't find running job on cluster"
}
log_info "################################################################"
log_info "Remove $fedc2"
log_info "################################################################"
# As with the removal of fed1, the sacctmgr output is not read.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc2
sleep 5
# Verify:
#
#   fed1          fed2          fed3
#   j(0)O:R
#                 j(6)R
#                               j(2)R
#   j(3)O:PD
#                 j(4)O:PD
#                               j(5)O:PD
#                 srun:O:PD
#
#                               j(7)PD
#                               j(8)PD
#                 j(9)PD
#   j(10)O:PD
#
# First make sure the pending srun was not force-terminated when its
# siblings went away. The timeout is dropped to one second for this read
# only and put back afterwards.
set old_timeout $timeout
set timeout 1
set matches 0
set spawn_id $srun_spawn_id(0)
expect {
-re "srun: Force Terminated job $number" {
incr matches
exp_continue
}
eof {}
}
if {$matches != 0} {
fail "srun got signaled when it shouldn't have"
}
set timeout $old_timeout
# Jobs on fed1 and fed2 show NA in both sibling columns. On fed3, j(5),
# j(7) and j(8) still list fed3 as their viable sibling.
set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(6)$n$n|$j(4)$n$n|$srun_job$n$n|$j(9)$n$n"
set r3 "$j(2)$n$n|$j(5)$f3$f3|$j(7)$f3$n|$j(8)$f3$n"
squeue "$fedc1" "3" $r1
squeue "$fedc2" "4" $r2
squeue "$fedc3" "4" $r3
# Cancel the running job on fed3 so that j(7) will be scheduled on fed3.
# j(7) is still considered a federated job but only has one sibling.
cancel_job $j(2) "-M$fedc3"
set run_cluster [wait_for_fed_job $j(7) RUNNING $fedc3]
if {$run_cluster eq ""} {
fail "Didn't find running job on cluster"
}
log_info "################################################################"
log_info "Remove $fedc3"
log_info "################################################################"
# The sacctmgr output is not read here either.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc3
sleep 5
# Verify:
#
#   fed1          fed2          fed3
#   j(0)O:R
#                 j(6)R
#   j(3)O:PD
#                 j(4)O:PD
#                               j(5)O:PD
#                 srun:O:PD
#
#                               j(7)R
#                               j(8)PD
#                 j(9)PD
#   j(10)O:PD
#
# With the last cluster removed, every remaining job is expected to show
# NA in both sibling columns.
set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(6)$n$n|$j(4)$n$n|$srun_job$n$n|$j(9)$n$n"
set r3 "$j(5)$n$n|$j(7)$n$n|$j(8)$n$n"
squeue "$fedc1" "3" $r1
squeue "$fedc2" "4" $r2
squeue "$fedc3" "3" $r3
log_info "################################################################"
log_info "Test revoked origin jobs are removed."
log_info "################################################################"
# Start from scratch:
# - start jobs on all clusters,
# - remove the non-origin clusters,
# - running jobs on the non-origin clusters should remain running,
# - revoked tracking jobs on the origin should go away.
cancel_federation_jobs
# Put all three clusters back into the federation.
exec $my_sacctmgr mod fed $fed_name set -i clusters=$fedc1,$fedc2,$fedc3
# Test whether origin tracking jobs are removed when the remote cluster is.
# All three sruns are launched with fed1's srun binary. Their spawn ids are
# kept under indexes 1-3 for the final "I'm Alive" checks.
set my_srun "${fed_slurm_base}/$fedc1/bin/srun"
set j(0) [srun -M$fedc1 1]
wait_for_fed_job $j(0) RUNNING $fedc1
# clus1 and clus2 record where j(1) and j(2) were found running; each may
# be either fed2 or fed3.
set j(1) [srun "" 2]
set clus1 [wait_for_fed_job $j(1) RUNNING $fedc2,$fedc3]
set j(2) [srun "" 3]
set clus2 [wait_for_fed_job $j(2) RUNNING $fedc3,$fedc2]
# Expected on fed1, as encoded in r1 below:
#   j(0): viable and active sibling is fed1
#   j(1): all three clusters viable, active sibling is clus1
#   j(2): all three clusters viable, active sibling is clus2
set f1 "\\s*$fedc1"
set f2 "\\s*$clus1"
set f3 "\\s*$clus2"
set n "\\s*NA"
# Accept the three cluster names in any order in the viable-siblings column.
set sib "\\s*($fedc1,*|$clus1,*|$clus2,*){3}"
set r1 "$j(0)$f1$f1|$j(1)$sib$f2|$j(2)$sib$f3"
squeue "$fedc1" "3" $r1
# Remove clus1: fed1 should no longer list j(1), and j(2) should list only
# fed1 and clus2 as viable siblings.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$clus1
sleep 5
set sib "\\s*$fedc1,$clus2"
set r1 "$j(0)$f1$f1|$j(2)$sib$f3"
squeue "$fedc1" "2" $r1
# Remove clus2: only j(0) should be left on fed1.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$clus2
sleep 5
set r1 "$j(0)$f1$f1"
squeue "$fedc1" "1" $r1
# Finally remove the origin cluster itself.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc1
# Make sure none of the jobs (sruns) have terminated: notify each one and
# expect its srun to print the message.
scontrol $fedc1 $j(0) 1 "srun: I'm Alive"
scontrol $clus1 $j(1) 2 "srun: I'm Alive"
scontrol $clus2 $j(2) 3 "srun: I'm Alive"