| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Test Federated Cluster States |
| # |
| # Reqs: 1. Using slurmdbd accounting storage type and is up |
| # 2. fed_slurm_base is defined in globals.local - set to directory that |
| # has access to each federation configure (fedc1, fedc2, fedc3). |
| # Eg. |
| # fedr/slurm/ (src) |
| # fedr/fed1/bin |
| # fedr/fed1/sbin |
| # fedr/fed1/etc |
| # fedr/fed1/... |
| # fedr/fed2/... |
| # fedr/fed3/... |
| # 3. controllers are up and running. |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| # |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the included file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with Slurm; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| source ./globals |
| source ./globals_accounting |
| source ./globals_federation |
| |
# Test-wide settings.
set fed_name     "feda"
set user_name    ""
set srun_job_cnt 0
set drain_time   10

# Client commands (my_scontrol, my_sacctmgr, my_sbatch, my_srun, my_salloc,
# my_squeue) are all resolved from the bin directory of the first federation
# cluster's install tree under fed_slurm_base.
foreach fed_tool {scontrol sacctmgr sbatch srun salloc squeue} {
	set my_$fed_tool "${fed_slurm_base}/$fedc1/bin/$fed_tool"
}
unset fed_tool
| |
#
# Check accounting config and bail if not found.
# The test only runs when accounting is stored through slurmdbd.
#
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}

#
# The test modifies federations and cluster states through sacctmgr, so it is
# skipped unless the caller is an accounting Administrator.
#
if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}
| |
#
# Run "sacctmgr show fed $fed_name --noheader" and fail the test unless the
# output matches the given pattern.
#
# regex - regular expression the command output must match
#
proc sacctmgr_show { regex } {
	global my_sacctmgr fed_name

	set matches 0

	spawn $my_sacctmgr show fed $fed_name --noheader
	expect {
		-re "$regex" {
			# No exp_continue here: the first match ends the expect,
			# so the counter can only ever be 0 or 1.
			incr matches
		}
		timeout {
			fail "sacctmgr show not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Didn't match regex ($regex: $matches)"
	}
}
| |
#
# Set the federation state of a cluster with
# "sacctmgr mod cluster <cluster> set fedstate=<state> -i" and fail the test
# unless sacctmgr echoes all four expected confirmation lines
# ("Setting", the FedState assignment, "Modified cluster..." and the
# cluster name).
#
# state   - fedstate value to set (e.g. ACTIVE, INACTIVE, DRAIN, DRAIN+REMOVE)
# cluster - cluster to modify; defaults to $fedc1 when omitted or empty
#
proc sacctmgr_mod { state {cluster ""} } {
	global my_sacctmgr eol fedc1

	if {$cluster eq ""} {
		set cluster $fedc1
	}

	# The state is handed to sacctmgr verbatim but has to be matched
	# literally in its output, so regex metacharacters (such as the "+"
	# in DRAIN+REMOVE) are backslash-escaped before building the pattern.
	regsub -all {[^[:alnum:]_]} $state {\\&} state_re

	set matches 0

	spawn $my_sacctmgr mod cluster $cluster set fedstate=$state -i
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+FedState\\s+=\\s+$state_re" {
			incr matches
			exp_continue
		}
		-re "Modified cluster...$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+$cluster$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr mod not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 4} {
		fail "Unexpected error (got $matches)"
	}
}
| |
#
# Run "scontrol show <category>" with the scontrol binary in $my_scontrol and
# fail the test unless the output matches the given pattern.
#
# category - entity passed to "scontrol show" (this test uses "fed" and "jobs")
# regex    - regular expression the command output must match
#
# NOTE(review): callers in this file also pass an empty regex; an empty
# pattern presumably matches at once, so such a call checks nothing about
# the output - confirm that is intended.
#
proc scontrol { category regex } {
	global my_scontrol

	set found 0
	spawn $my_scontrol show $category
	expect {
		-re "$regex" { incr found }
		timeout      { fail "scontrol not responding" }
		eof          { wait }
	}

	if {$found == 1} {
		return
	}
	fail "Didn't match regex ($regex: $found)"
}
| |
#
# Submit an exclusive batch job ("sleep 10") that asks for $node_count nodes
# and check the submission result.
#
# mfed  - when true, add "-M$fedc1" to the sbatch command line
# error - when true, the submission is expected to be rejected with
#         "System submissions disabled"; otherwise it must succeed
#
# Returns the job id reported by sbatch, or 0 when a rejection was expected.
# Fails the test when sbatch does not print the expected message.
#
proc sbatch { mfed error } {
	global number node_count my_sbatch fedc1

	# Build argv as a real Tcl list so that no element (for instance an
	# install path containing whitespace) is re-split when it is expanded
	# with {*} for spawn.
	set command [list $my_sbatch -N$node_count --exclusive \
		--output=/dev/null --error=/dev/null -t300 --requeue \
		--wrap "sleep 10"]
	if {$mfed} {
		lappend command -M$fedc1
	}

	if {$error} {
		set regex "sbatch: error: Batch job submission failed: System submissions disabled"
	} else {
		set regex "Submitted batch job ($number)"
	}

	set matches 0
	set job_id 0

	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			# Only the success pattern carries a capture group.
			if {!$error} {
				set job_id $expect_out(1,string)
			}
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Batch submit failure (expected: $regex)"
	}

	# Fixed pause before returning; presumably gives the controllers time
	# to act on the submission before the caller inspects the queue.
	sleep 3
	return $job_id
}
| |
#
# Run "squeue --noheader -Ostatecompact:.5" with the squeue binary in
# $my_squeue and fail the test unless the listed job states match the
# given pattern.
#
# regex - regular expression the squeue output must match
#
# NOTE(review): callers in this file also pass an empty regex; an empty
# pattern presumably matches at once, so such a call does not prove that
# the queue is empty - confirm that is intended.
#
proc squeue { regex } {
	global my_squeue

	set matches 0

	spawn $my_squeue --noheader -Ostatecompact:.5
	expect {
		-re "$regex" {
			incr matches
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Unexpected error in squeue (expected $regex)"
	}
}
| |
#
# Cancel every job owned by $user_name on the three federation clusters
# (fedc1, fedc2, fedc3) with a single scancel call, then pause five seconds.
#
proc cancel_federation_jobs { } {
	global scancel user_name fedc1 fedc2 fedc3

	set all_clusters [join [list $fedc1 $fedc2 $fedc3] ","]

	spawn $scancel -M$all_clusters --user $user_name
	# scancel output is not checked; just let the command finish.
	expect {
		eof { wait }
	}

	sleep 5
}
| |
#
# Return to a clean state: cancel the user's jobs on all federation clusters,
# delete the test federation and remove this test's "*.out" files.
#
proc cleanup { } {
	global fed_name bin_bash bin_rm test_name

	cancel_federation_jobs
	delete_federations $fed_name

	# The glob has to be expanded by a shell, hence bash -c.
	set remove_outputs "$bin_rm -f $test_name*.out"
	exec $bin_bash -c $remove_outputs
}
| |
# Start test

# The test is skipped unless the federation helpers report that the
# federation clusters are configured and running.
if {![check_federation_setup]} {
	skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}
if {![check_federation_up]} {
	skip "This test can't be run without all clusters up"
}

# Needed by cancel_federation_jobs, which cleanup calls.
set user_name [get_my_user_name]

# Remove existing setup
cleanup

# Get number of nodes per cluster: count the nodes reported in the
# idle, alloc or comp state. sbatch requests this many nodes per job.
set node_count [llength [get_nodes_by_state idle,alloc,comp]]
| |
log_info "################################################################"
log_info "Setup cluster features"
log_info "################################################################"

set job_id 0
set matches 0

# Create the test federation with fedc1 and fedc2 as members. exec raises
# an error (aborting the script) if sacctmgr fails.
log_trace "Creating federation $fed_name with clusters $fedc1,$fedc2"
exec $my_sacctmgr add Federation $fed_name clusters=$fedc1,$fedc2 -i
log_trace "Federation $fed_name created"

# Both sacctmgr and scontrol must list two ACTIVE entries for the federation.
sacctmgr_show ".+ACTIVE.+ACTIVE.+"

scontrol fed ".+ACTIVE.+ACTIVE.+"
| |
# Submit two jobs to fedc1 while it is ACTIVE; the second job id is kept so
# the job can be tracked once the cluster is re-activated.
sbatch 1 0

set job_id [sbatch 1 0]

# Switch fedc1 to INACTIVE.
sacctmgr_mod INACTIVE

# Expect one pending and one running job now ...
squeue "PD\\s+R\\s+"

sleep 10

# ... and, after the 10 second pause, a pending job in the listing.
squeue "PD\\s+"

# New batch submissions must be rejected while INACTIVE.
sbatch 1 1

# srun and salloc must be rejected as well, each with its own
# "System submissions disabled" error.
foreach {client client_bin denied_re} [list \
	srun   $my_srun   "srun: error: Unable to allocate resources: System submissions disabled" \
	salloc $my_salloc "salloc: error: Job submit/allocate failed: System submissions disabled"] {

	set matches 0

	spawn $client_bin hostname
	expect {
		-re $denied_re {
			incr matches
			exp_continue
		}
		timeout {
			fail "$client not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "Unexpected error (got $matches)"
	}
}
| |
#
# Back to ACTIVE: the job left pending during the INACTIVE phase must reach
# RUNNING, and new submissions are accepted again.
#
sacctmgr_mod ACTIVE
wait_for_fed_job $job_id RUNNING $fedc1,$fedc2
squeue "R\\s+"

sbatch 1 0
squeue "R\\s+"

# Queue two more jobs before draining.
sbatch 1 0
sbatch 1 0

#
# DRAIN: new submissions are rejected while the jobs already in the queue
# are expected to keep running to completion.
#
sacctmgr_mod DRAIN
sbatch 1 1

sacctmgr_show "\\s+DRAIN.+ACTIVE\\s+"
scontrol fed ".+DRAIN.+ACTIVE.+"

squeue "PD\\s+R\\s+"
sleep 10
squeue "R\\s+"
sleep 10
# NOTE(review): the empty pattern presumably matches immediately, so this
# call does not verify that the queue is empty - confirm.
squeue ""

#
# DRAINED: after waiting drain_time seconds both scontrol and sacctmgr must
# report fedc1 as DRAINED.
#
log_info "Sleep $drain_time seconds until system is drained"
sleep $drain_time

scontrol fed ".+DRAINED.+ACTIVE.+"
sacctmgr_show "\\s+DRAINED.+ACTIVE\\s+"

# Re-activate and check that a fresh job runs.
sacctmgr_mod ACTIVE
sbatch 1 0
squeue "R\\s+"
| |
#
# DRAIN+REMOVE on fedc1. The "+" has to be escaped in the output pattern,
# so the sacctmgr dialogue is spelled out here. All four confirmation
# lines must be seen.
#
set matches 0

spawn $my_sacctmgr mod cluster $fedc1 set fedstate=DRAIN+REMOVE -i
expect {
	-re "Setting$eol"                           { incr matches; exp_continue }
	-re "^\\s+FedState\\s+=\\s+DRAIN\\+REMOVE" { incr matches; exp_continue }
	-re "Modified cluster...$eol"               { incr matches; exp_continue }
	-re "^\\s+$fedc1$eol"                       { incr matches; exp_continue }
	timeout {
		fail "sacctmgr mod not responding"
	}
	eof {
		wait
	}
}
if {$matches != 4} {
	fail "Unexpected error (got $matches)"
}

# Submissions must be rejected while the cluster drains.
sbatch 1 1

# NOTE(review): the empty patterns passed to squeue and "scontrol fed" below
# presumably match immediately and therefore check nothing - confirm.
squeue ""

log_info "Sleep $drain_time seconds until system is removed"
sleep $drain_time

scontrol fed ""

# Expect a lone ACTIVE entry in the federation listing once fedc1 is removed.
sacctmgr_show "\\s+ACTIVE\\s+"

# Once the system is drained, the controller will mark all jobs as non-requeueable.
scontrol jobs "Requeue=0.+"
| |
| |
# Make sure we can submit to removed cluster.

set job_id [sbatch 1 0]

# The test treats any bits set above the low 26 bits of the job id as the
# mark of a federated job id; a cluster that has left the federation must
# not hand one out. The braced condition is evaluated once by "if" itself,
# avoiding the double substitution of an unbraced expr.
if {$job_id >> 26} {
	fail "Removed cluster still giving federated job id"
}

wait_for_fed_job $job_id DONE $fedc1
| |
#
# Add fedc1 back to the federation and check the four confirmation lines
# printed by sacctmgr.
#
set matches 0

spawn $my_sacctmgr mod fed $fed_name set clusters+=$fedc1 -i
expect {
	-re "Setting$eol"                          { incr matches; exp_continue }
	-re "^\\s+Cluster\\s+\\+=\\s+$fedc1"      { incr matches; exp_continue }
	-re "Modified federation...$eol"           { incr matches; exp_continue }
	-re "^\\s+$fed_name$eol"                   { incr matches; exp_continue }
	timeout {
		fail "sacctmgr mod not responding"
	}
	eof {
		wait
	}
}
if {$matches != 4} {
	fail "Unexpected error (got $matches)"
}

# The federation must list two ACTIVE entries again.
sacctmgr_show "\\s+ACTIVE.+ACTIVE\\s+"
| |
#
# Drain the second cluster (fedc2). sacctmgr_mod always targets fedc1, so
# the dialogue is spelled out here; all four confirmation lines must be seen.
#
set matches 0

spawn $my_sacctmgr mod cluster $fedc2 set fedstate=DRAIN -i
expect {
	-re "Setting$eol"                   { incr matches; exp_continue }
	-re "^\\s+FedState\\s+=\\s+DRAIN"  { incr matches; exp_continue }
	-re "Modified cluster...$eol"       { incr matches; exp_continue }
	-re "^\\s+$fedc2$eol"               { incr matches; exp_continue }
	timeout {
		fail "sacctmgr mod not responding"
	}
	eof {
		wait
	}
}
if {$matches != 4} {
	fail "Unexpected error (got $matches)"
}

# The federation listing must now show ACTIVE followed by DRAIN.
sacctmgr_show "\\s+ACTIVE.+DRAIN\\s+"
scontrol fed ".+ACTIVE.+DRAIN.+"

# Submit without -M: the first job is expected to run, and after the second
# submission one job is pending and one running.
sbatch 0 0
squeue "R\\s+"

sbatch 0 0
squeue "PD\\s+R\\s+"