blob: 09b47dfcb277925b7952fe9ccabf75def37b5b34 [file] [log] [blame]
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Validate that sacct -D shows correct job steps and states
# when a job is requeued
############################################################################
# Copyright (C) SchedMD LLC.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
# Job under test; starts at 0 and is overwritten once the job is submitted.
# The cleanup proc reads this global.
set job_id 0

# Node the job will be pinned to and whose state the test toggles.
set node [get_nodes_by_request -fail "-t1 --exclusive"]

# The sacct checks below count both job and step records, so neither may be
# disabled through AccountingStorageEnforce.
set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
if {[param_contains $accounting_storage_enforce "nosteps"]} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
} elseif {[param_contains $accounting_storage_enforce "nojobs"]} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
}

# The test updates node state with scontrol, so it is restricted to the
# super user.
if {![is_super_user]} {
	skip "Test can only be run as SlurmUser"
}
# Cancel the job recorded in the global job_id.
proc cleanup {} {
	cancel_job $::job_id
}
# Change the state of the test node with "scontrol update".
#
# Arguments:
#	state  - node state to request (the script passes "down" and "resume")
#	reason - reason string passed to scontrol
#
# Fails the test when scontrol reports "Invalid node state", unless the
# request was "resume" and ReturnToService is 2, in which case the error is
# only logged as a warning.
proc mod_state { state reason } {
	global scontrol node

	set update_cmd "$scontrol update nodename=$node state=$state reason=$reason"
	set response [run_command_output $update_cmd]

	# Nothing to report when scontrol accepted the state change.
	if {![regexp "Invalid node state" $response]} {
		return
	}

	# A rejected "resume" is tolerated when ReturnToService is 2.
	if {$state eq "resume" && [get_config_param "ReturnToService"] == 2} {
		log_warn "This error is expected, no worries"
		return
	}

	fail "Problem changing node state"
}
# Check how many batch step records sacct reports for the job.
#
# Arguments:
#	num - expected number of occurrences of "batch" in the sacct output
#
# Queries sacct for the job's ".batch" step with -D (duplicates) and records
# a subtest comparing the number found against num.
proc check_step { num } {
	global sacct job_id

	set sacct_cmd "$sacct --job=${job_id}.batch -D --start=now-15minutes --noheader --format=jobid -P"
	set listing [run_command_output -fail $sacct_cmd]

	# Every reported batch step contributes one "batch" token to the listing.
	set steps [regexp -all "batch" $listing]
	subtest {$num == $steps} "Check number of steps" "$steps != $num"
}
# Count the sacct records (job and step lines) of the current job that are
# in a given state.
#
# Arguments:
#	states - state name to look for; it is inserted into a regular
#		 expression, so it may itself be a regex fragment
#
# Returns the number of "JobID|State" records in the sacct output whose
# state starts with the given value.
proc check_sacct_states {states} {
	global job_id sacct

	# This test will requeue jobs making those jobs be eligible in the
	# future from sacct's perspective. Since sacct only shows eligible
	# jobs we have to specify end in the future.
	set output [run_command_output -fail "$sacct --job=$job_id --duplicates --parsable2 --start=now-15minutes --end=tomorrow --noheader -o JobID,State"]

	# NOTE: Skip "extern" job container optionally spawned by "PrologFlags=contain"
	#
	# The pattern is written in braces so that the backslashes reach the
	# regex engine untouched. Inside a double-quoted word Tcl reduces "\."
	# to a bare ".", which would match any character rather than the
	# literal dot separating the job id from the step name.
	set record_re {[0-9_]+(\.(?!extern)[a-z]+)*\|}
	append record_re $states

	return [regexp -all $record_re $output]
}
# The sacct queries in this test are only run against slurmdbd accounting
# storage; skip for any other AccountingStorageType.
if {![string equal [get_config_param "AccountingStorageType"] "accounting_storage/slurmdbd"]} {
	skip "Not using accounting_storage/slurmdbd"
}
# Main test sequence. The job is requeued twice: first implicitly by setting
# its node down (Tests 1-3), then explicitly with "scontrol requeue"
# (Tests 4-6). After each transition the per-state record counts reported by
# sacct --duplicates are compared with the expected values.
#
# Submit job to be requeued
log_info "Test 1"
# One-node exclusive job pinned to $node, submitted with --requeue, that
# sleeps for 20 seconds
set job_id [submit_job -fail "-N1 -w$node --exclusive -o/dev/null --requeue --wrap='$bin_sleep 20'"]
wait_for_job -fail $job_id "RUNNING"
# Wait for batch script to start (after message delays, prologs, etc.)
sleep 5
# Set the node that the job is running on to down
mod_state "down" "$test_name"
# Wait a little bit for node state to change
sleep 5
# Set the node back to resume
mod_state "resume" "$test_name"
# Check the job state: after the node went down the job is expected to be
# back in PENDING
log_info "Test 2"
wait_for_job -fail $job_id "PENDING"
# Wait for the state changes to propagate to the database for sacct
sleep 5
# The job state should be NODE_FAIL
set fail_count [check_sacct_states "NODE_FAIL"]
subtest {$fail_count == 1} "Test NODE_FAIL count" "$fail_count != 1"
# The batch step state should be CANCELLED
set canc_count [check_sacct_states "CANCELLED"]
subtest {$canc_count == 1} "Test CANCELLED count" "$canc_count != 1"
# The requeued job state should be PENDING
set pend_count [check_sacct_states "PENDING"]
subtest {$pend_count == 1} "Test PENDING count" "$pend_count != 1"
wait_for_job -fail $job_id "RUNNING"
# Wait for batch script to start (after message delays, prologs, etc.)
sleep 5
# Second run of the job: the NODE_FAIL and CANCELLED records from the first
# run must still be reported unchanged
log_info "Test 3"
set fail_count [check_sacct_states "NODE_FAIL"]
subtest {$fail_count == 1} "Test NODE_FAIL count" "$fail_count != 1"
set canc_count [check_sacct_states "CANCELLED"]
subtest {$canc_count == 1} "Test CANCELLED count" "$canc_count != 1"
set run_count [check_sacct_states "RUNNING"]
# The requeued job and its batch step should now be running.
subtest {$run_count == 2} "Test RUNNING count" "$run_count != 2"
# Requeue the job explicitly this time
run_command -fail "$scontrol requeue $job_id"
# Wait a bit for the job to be requeued then check its state
sleep 8
wait_for_job -fail $job_id "PENDING"
# Wait for the state changes to propagate to the database for sacct
sleep 5
# After the explicit requeue: one NODE_FAIL and one REQUEUE job record, two
# CANCELLED records and one PENDING record are expected
log_info "Test 4"
set fail_count [check_sacct_states "NODE_FAIL"]
subtest {$fail_count == 1} "Test NODE_FAIL count" "$fail_count != 1"
set req_count [check_sacct_states "REQUEUE"]
subtest {$req_count == 1} "Test REQUEUE count" "$req_count != 1"
# The first and second batch steps should both show CANCELLED
set canc_count [check_sacct_states "CANCELLED"]
subtest {$canc_count == 2} "Test CANCELLED count" "$canc_count != 2"
set pend_count [check_sacct_states "PENDING"]
subtest {$pend_count == 1} "Test PENDING count" "$pend_count != 1"
wait_for_job -fail $job_id "RUNNING"
# Wait for batch script to start (after message delays, prologs, etc.)
sleep 5
# Check for steps after requeue. There should be 3 batch steps - the first 2
# that are CANCELLED, and now the last one that is running.
check_step 3
# Third run of the job: the earlier records must be unchanged while the
# current run shows up as RUNNING
log_info "Test 5"
set fail_count [check_sacct_states "NODE_FAIL"]
subtest {$fail_count == 1} "Test NODE_FAIL count" "$fail_count != 1"
set req_count [check_sacct_states "REQUEUE"]
subtest {$req_count == 1} "Test REQUEUE count" "$req_count != 1"
set canc_count [check_sacct_states "CANCELLED"]
subtest {$canc_count == 2} "Test CANCELLED count" "$canc_count != 2"
# The job and its batch step should be RUNNING
set run_count [check_sacct_states "RUNNING"]
subtest {$run_count == 2} "Test RUNNING count" "$run_count != 2"
# Let the third run finish
wait_for_job -fail $job_id "DONE"
# Check steps after job has completed
check_step 3
# Final state: the earlier records are unchanged and the last run (job plus
# batch step) is expected to be reported as COMPLETED
log_info "Test 6"
set fail_count [check_sacct_states "NODE_FAIL"]
subtest {$fail_count == 1} "Test NODE_FAIL count" "$fail_count != 1"
set req_count [check_sacct_states "REQUEUE"]
subtest {$req_count == 1} "Test REQUEUE count" "$req_count != 1"
set canc_count [check_sacct_states "CANCELLED"]
subtest {$canc_count == 2} "Test CANCELLED count" "$canc_count != 2"
set comp_count [check_sacct_states "COMPLETED"]
subtest {$comp_count == 2} "Test COMPLETED count" "$comp_count != 2"