| #!/usr/bin/env expect |
| ############################################################################ |
| # Purpose: Establish global state information for Slurm federation tests |
| # |
| # To define site-specific state information, set the values in a file |
| # named 'globals.local'. Those values will override any specified here. |
| # for example: |
| # |
| # $ cat globals.local |
| # set slurm_dir "/usr/local" |
| # set mpicc "/usr/local/bin/mpicc" |
| # |
| ############################################################################ |
| # Copyright (C) SchedMD LLC. |
| |
| # This file is part of Slurm, a resource management program. |
| # For details, see <https://slurm.schedmd.com/>. |
| # Please also read the supplied file: DISCLAIMER. |
| # |
| # Slurm is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 2 of the License, or (at your option) |
| # any later version. |
| # |
| # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| # details. |
| # |
| # You should have received a copy of the GNU General Public License along |
| # with SLURM; if not, write to the Free Software Foundation, Inc., |
| # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ############################################################################ |
| |
| source ./globals |
| |
# Federation test settings. Every value defaults to the empty string here;
# check_federation_setup treats any empty value as "federation testing is
# not configured".
# NOTE(review): cset is not defined in this file -- presumably it comes from
# ./globals and only assigns when the variable is not already set; confirm.
cset fed_slurm_base {}
cset fedc1 {}
cset fedc2 {}
cset fedc3 {}
| |
#
# Report whether the federation test settings have been filled in.
# RET: true when fed_slurm_base, fedc1, fedc2 and fedc3 are all non-empty,
#      false as soon as one of them is the empty string.
#
proc check_federation_setup { } {
	global fed_slurm_base fedc1 fedc2 fedc3

	if {$fed_slurm_base ne "" && $fedc1 ne "" && $fedc2 ne "" && $fedc3 ne ""} {
		return true
	}
	return false
}
| |
| |
#
# Create a federation and move clusters fedc1, fedc2 and fedc3 into it,
# clearing each cluster's features.
# IN: fed_name - name of the federation to create.
# RET: RETURN_SUCCESS, or RETURN_ERROR when sacctmgr's output does not
#      confirm the federation creation or one of the cluster modifications.
# NOTE: fail is called if sacctmgr stops responding.
#
proc setup_federation { fed_name } {
	global sacctmgr fedc1 fedc2 fedc3 eol

	set rc $::RETURN_SUCCESS

	# Step 1: create the federation. Two confirmation lines are expected:
	# the "Adding Federation(s)" banner and the federation name itself.
	spawn $sacctmgr -i add federation $fed_name
	set matches 0
	expect {
		-re "Adding Federation\\(s\\)$eol" {
			incr matches
			exp_continue
		}
		-re "$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 2} {
		log_error "Failed to create federation"
		return $::RETURN_ERROR
	}

	# Step 2: attach each cluster. Five confirmation lines are expected per
	# cluster: "Setting", the Feature line, the Federation line,
	# "Modified cluster..." and the cluster name.
	set count 0
	foreach cluster [list $fedc1 $fedc2 $fedc3] {
		incr count
		spawn $sacctmgr -i mod cluster $cluster set federation=$fed_name features=
		set matches 0
		expect {
			-re "Setting$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Feature\\s+=\\s+$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Federation\\s+=\\s+$fed_name$eol" {
				incr matches
				exp_continue
			}
			-re "Modified cluster...$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+$cluster$eol" {
				incr matches
				exp_continue
			}
			timeout {
				# This is the "mod cluster" command, not "add".
				fail "sacctmgr mod not responding"
			}
			eof {
				wait
			}
		}
		if {$matches != 5} {
			log_error "Failed to add $cluster to federation"
			set rc $::RETURN_ERROR
			break
		}

		# NOTE(review): pause after the second and third cluster; the
		# reason is not visible here -- presumably it gives the sibling
		# controllers time to synchronize. Confirm before changing.
		if {$count > 1} {
			sleep 5
		}
	}
	return $rc
}
| |
| |
#
# Check whether a cluster answers "scontrol show config".
# IN: cname - cluster name; its scontrol is expected at
#             $fed_slurm_base/$cname/bin/scontrol.
# RET: true if the "Configuration data as of" banner was seen,
#      false otherwise.
# NOTE: dialogue logging is switched off during the check and switched back
#       on (log_user 1) before returning.
#
proc check_cluster_up { cname } {
	global fed_slurm_base

	set rc true
	set matches 0
	# Local "timeout" bounds the expect below to 2 seconds.
	set timeout 2
	set my_scontrol "${fed_slurm_base}/$cname/bin/scontrol"

	log_user 0
	set my_pid [spawn $my_scontrol show config]
	expect {
		"Configuration data as of" {
			set matches 1
			# Keep reading until eof so the child is reaped by the
			# wait below instead of being left with an open spawn id.
			exp_continue
		}
		timeout {
			slow_kill $my_pid
			# A timeout after the banner was already seen still
			# counts as "up"; only a silent cluster is a failure.
			if {$matches == 0} {
				log_warn "$cname not responding"
				set rc false
			}
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		log_error "$cname not responding"
		set rc false
	}
	log_user 1
	return $rc
}
| |
| |
#
# Verify that clusters fedc1, fedc2 and fedc3 all respond.
# The clusters are probed in order with check_cluster_up and probing stops
# at the first one that is down.
# RET: true if every cluster is up, false (after a warning) otherwise.
#
proc check_federation_up {} {
	global fedc1 fedc2 fedc3

	foreach member [list $fedc1 $fedc2 $fedc3] {
		if {[check_cluster_up $member]} {
			continue
		}
		log_warn "This test can't be run if any clusters--$fedc1, $fedc2, or $fedc3--are down"
		return false
	}
	return true
}
| |
| |
#
# Delete federations with "sacctmgr -i delete federation".
# IN: names - federation name(s), handed to sacctmgr as one argument
#             (presumably a comma-separated list -- verify against callers).
# RET: RETURN_SUCCESS, or RETURN_ERROR when sacctmgr reports a problem or
#      its output does not contain exactly one "Deleting federation" /
#      " Nothing deleted" confirmation.
# NOTE: fail is called if sacctmgr stops responding.
#
proc delete_federations { names } {
	global sacctmgr
	set matches 0
	set rc $::RETURN_SUCCESS
	set object "federation"
	spawn $sacctmgr -i delete $object $names
	expect {
		-re "privilege to perform this action" {
			log_error "Don't have privileges"
			set rc $::RETURN_ERROR
		}
		-re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" {
			log_error "There was a problem with the sacctmgr command"
			set rc $::RETURN_ERROR
		}
		-re "Problem getting" {
			log_error "There was a problem getting information from the database"
			set rc $::RETURN_ERROR
		}
		-re "Problem adding" {
			log_error "There was an unknown problem"
			set rc $::RETURN_ERROR
		}
		-re "No associations" {
			log_error "Your command didn't return anything"
			set rc $::RETURN_ERROR
		}
		-re "Deleting $object" {
			incr matches
			exp_continue
		}
		-re " Nothing deleted" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr delete not responding"
		}
		eof {
			wait
		}
	}

	# An unexpected confirmation count is a failure too: report it through
	# the return code instead of only logging it.
	if {!$rc && $matches != 1} {
		log_error "sacctmgr had a problem deleting $object. Got $matches"
		set rc $::RETURN_ERROR
	}

	return $rc
}
| |
| |
#
# Collect sacctmgr's view of the clusters that belong to a federation.
# IN: fed_name - name of the federation to query.
# RET: flat "name value" list as produced by [array get]: each name is a
#      cluster name and each value is a dict with the keys id, host, port,
#      features and state.
# NOTE: fail is called if sacctmgr stops responding, or if fewer than two
#       lines (the column header plus cluster rows) were matched.
#
proc get_clusterfed_info { fed_name } {
	global sacctmgr eol
	set matches 0
	array set clusters {}
	spawn $sacctmgr show cluster federation=$fed_name \
		format="cluster%20,federation%20,id,controlhost,controlport,features,fedstate"
	expect {
		-re "Cluster\\s+Federation\\s+ID\\s+ControlHost\\s+ControlPort\\s+Features\\s+FedState $eol" {
			incr matches
			exp_continue
		}
		-re "\\s+(\\S+)\\s+$fed_name\\s+(\\d+)\\s+(\\S+)\\s+(\\d+)\\s+(\\S*)\\s+(\\S*) $eol" {
			# Captures: 1=cluster 2=id 3=control host 4=control port
			# 5=features 6=federation state (5 and 6 may be empty).
			set cluster_name $expect_out(1,string)
			set clusters($cluster_name) [dict create \
				id       $expect_out(2,string) \
				host     $expect_out(3,string) \
				port     $expect_out(4,string) \
				features $expect_out(5,string) \
				state    $expect_out(6,string)]
			incr matches
			exp_continue
		}
		timeout {
			# This is a "show" query, not an "add".
			fail "sacctmgr show not responding"
		}
		eof {
			wait
		}
	}
	if {$matches < 2} {
		fail "Didn't match enough clusters for federation ($fed_name) ($matches < 2)"
	}

	return [array get clusters]
}
| |
| |
#
# Add a single cluster to the given federation.
# IN: cname - name of cluster to add to federation.
# IN: fed_name - name of federation to add cluster to.
# RET: RETURN_SUCCESS, or RETURN_ERROR when sacctmgr's output does not
#      contain all four expected confirmation lines.
# NOTE: fail is called if sacctmgr stops responding.
#
proc add_cluster_to_fed {cname fed_name} {
	global sacctmgr eol

	set rc $::RETURN_SUCCESS
	set matches 0
	spawn $sacctmgr -i modify federation $fed_name set clusters+=$cname
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "Cluster\\s+ \\+= $cname$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 4} {
		log_error "Failed to add $cname to $fed_name ($matches != 4)"
		# Assign to rc itself; "set $rc ..." would write to a variable
		# named after rc's value and leave the return code unchanged.
		set rc $::RETURN_ERROR
	}

	return $rc
}
| |
| |
#
# Remove a single cluster from the given federation.
# IN: cname - name of cluster to remove from the federation.
# IN: fed_name - name of federation to remove cluster from.
# RET: RETURN_SUCCESS, or RETURN_ERROR when sacctmgr's output does not
#      contain all four expected confirmation lines.
# NOTE: fail is called if sacctmgr stops responding.
#
proc remove_cluster_from_fed {cname fed_name} {
	global sacctmgr eol

	set rc $::RETURN_SUCCESS
	set matches 0
	spawn $sacctmgr -i modify federation $fed_name set clusters-=$cname
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "Cluster\\s+ -= $cname$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			# This command removes a cluster; it is a "modify", not an "add".
			fail "sacctmgr modify not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 4} {
		log_error "Failed to remove $cname from $fed_name"
		# Assign to rc itself; "set $rc ..." would write to a variable
		# named after rc's value and leave the return code unchanged.
		set rc $::RETURN_ERROR
	}

	return $rc
}
| |
| |
################################################################
#
# NAME
#	wait_for_fed_job - waits for a job to reach the desired state
#
# SYNOPSIS
#	wait_for_fed_job ?options? job_id desired_state clusters
#
# DESCRIPTION
#	Wait for a previously submitted Slurm job to reach the desired state.
#	Each cluster is polled in turn with
#	"scontrol -M<cluster> --local -a -o show job <job_id>".
#
# OPTIONS
#	-timeout <integer_number>
#		time in seconds to wait for the job to be in the desired state
#		before timing out (default is 360)
#	-pollinterval <integer_number>
#		time in seconds between each job state check (default is 1)
#
# ARGUMENTS
#	job_id
#		The Slurm job id of a job we want to wait for.
#	desired_state
#		The state you want the job to attain before
#		returning.  Currently supports:
#			DONE any terminated state
#			PENDING job is pending
#			REVOKED job is revoked
#			RUNNING job is running
#			SPECIAL_EXIT job is in the special exit state
#			SUSPENDED job is suspended
#	clusters
#		Comma-separated list of clusters to wait for. If empty ""
#		the default clusters of fedc1, fedc2 and fedc3 will be used.
#
# RETURN VALUE
#	The name of the first cluster on which the job is found
#	in the desired state. Empty string indicates a failure:
#	an unsupported desired_state, a job_id of 0, a job that has
#	already terminated while another state was wanted, or a timeout.
#
# NOTE
#	We sleep for two seconds before replying that a job is
#	done to give time for I/O completion (stdout/stderr files)
#
################################################################

proc wait_for_fed_job args {
	global scontrol fedc1 fedc2 fedc3

	set timeout 360
	set poll_interval 1
	# Consume leading options; each one takes the following word as value.
	while {[llength $args]} {
		switch -glob -- [lindex $args 0] {
			-time*  {set args [lassign $args - timeout]}
			-poll*  {set args [lassign $args - poll_interval]}
			-*      {fail "Unknown option: [lindex $args 0]"}
			default break
		}
	}
	set argument_count [llength $args]
	if {$argument_count != 3} {
		fail "Invalid number of arguments ($argument_count): $args"
	} else {
		lassign $args job_id desired_state clusters
	}

	# First verify that desired_state is supported
	switch $desired_state {
		"DONE" {}
		"PENDING" {}
		"REVOKED" {}
		"RUNNING" {}
		"SUSPENDED" {}
		"SPECIAL_EXIT" {}
		default {
			log_error "Invalid desired state: $desired_state"
			return ""
		}
	}

	if {$job_id == 0} {
		log_error "Invalid job ID: $job_id"
		return ""
	}

	# Seconds spent sleeping between polling rounds so far.
	set my_delay 0

	set spec_clusters [list $fedc1 $fedc2 $fedc3]
	if {$clusters ne ""} {
		set spec_clusters [split $clusters ","]
	}
	log_debug "Checking for job '$job_id' in state '$desired_state' on [join $spec_clusters ,]"

	while 1 {
		foreach cluster $spec_clusters {
			log_debug "Checking $cluster"
			# Only the first line of the one-line (-o) output is needed.
			set fd [open "|$scontrol -M$cluster --local -a -o show job $job_id"]
			gets $fd line
			# close reports the child's stderr/exit status as an error;
			# a cluster that does not know the job is not fatal here.
			catch {close $fd}
			if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} {
				log_error "$desired_state not found on cluster $cluster"
				continue
			}

			switch $state {
				"NOT_FOUND" -
				"CANCELLED" -
				"DEADLINE" -
				"FAILED" -
				"TIMEOUT" -
				"NODE_FAIL" -
				"PREEMPTED" -
				"COMPLETED" {
					# Terminated: success for DONE, otherwise the
					# wanted state can no longer be reached.
					if {$desired_state eq "DONE"} {
						log_debug "Job $job_id is DONE ($state) on $cluster"
						sleep 2
						return $cluster
					}
					if {$desired_state eq "RUNNING"} {
						log_warn "Job $job_id is $state but we wanted RUNNING"
					}
					if {$desired_state eq "SUSPENDED"} {
						log_warn "Job $job_id is $state but we wanted SUSPENDED"
					}
					return ""
				}
				"PENDING" -
				"REVOKED" -
				"RUNNING" -
				"SPECIAL_EXIT" -
				"SUSPENDED" {
					# Live states share one rule: return on an exact
					# match, otherwise warn and keep polling.
					if {$desired_state eq $state} {
						log_debug "Job $job_id is $state on $cluster"
						return $cluster
					}
					log_warn "Job $job_id is in state $state, but desired state $desired_state"
				}
				default {
					log_debug "Job $job_id is in state $state. Desired state was $desired_state"
				}
			}
		}

		if { $my_delay > $timeout } {
			log_error "Timeout waiting for job state $desired_state"
			return ""
		}

		exec sleep $poll_interval
		# Braced so the operands are substituted once, by expr itself.
		set my_delay [expr {$my_delay + $poll_interval}]
	}
}