############################################################################
# Copyright (C) SchedMD LLC.
############################################################################
import atf
import pytest
import time

# Note that node_prefix needs to be known or handled properly in the s2n variant
node_prefix = "node"

# Note that power_interval must be at least 5 seconds; 10 seconds is recommended
# for the later tests
power_interval = 10

suspend_time = 10
suspend_timeout = 10
resume_timeout = 10
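
# For reference, a rough sketch of the power-save settings that the setup
# fixture below feeds into slurm.conf (values come from the constants above;
# the exact rendering is left to atf.require_config_parameter):
#
#   ResumeProgram=/bin/true
#   SuspendProgram=/bin/true
#   SuspendTime=10
#   SuspendTimeout=10
#   ResumeTimeout=10
#   SlurmctldParameters=idle_on_node_suspend
#
# With /bin/true as the resume/suspend programs nothing is actually powered on
# or off; the tests start and kill slurmd by hand to simulate cloud instances.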


@pytest.fixture(scope="module", autouse=True)
def setup():
    atf.require_auto_config("Runs slurmd on same machine as slurmctld")
    atf.require_config_parameter("NodeFeaturesPlugins", "node_features/helpers")
    atf.require_config_parameter("SelectType", "select/cons_tres")
    atf.require_config_parameter("SelectTypeParameters", "CR_CPU")
    atf.require_config_parameter("TreeWidth", 65533)
    atf.require_config_parameter("ResumeProgram", "/bin/true")
    atf.require_config_parameter("SuspendProgram", "/bin/true")

    # Time for the cloud node to sit idle with no jobs before being told to
    # power down
    atf.require_config_parameter("SuspendTime", suspend_time)

    # Time allowed for the cloud node to finish POWERING_DOWN
    atf.require_config_parameter("SuspendTimeout", suspend_timeout)

    # Time to wait for the cloud node to power up and register with slurmctld
    # after being ALLOCATED a job
    atf.require_config_parameter("ResumeTimeout", resume_timeout)

    # Mark nodes as IDLE, regardless of current state, when suspending nodes
    # with SuspendProgram so that nodes will be eligible to be resumed at a
    # later time
    atf.require_config_parameter_includes("SlurmctldParameters", "idle_on_node_suspend")

    # Register the cloud node in slurm.conf
    atf.require_config_parameter(
        "NodeName", {f"{node_prefix}1": {"Feature": "f1,nf1", "State": "CLOUD"}}
    )

    # Create a nodeset of the nodes that have feature f1 and assign it to partitions
    atf.require_config_parameter("Nodeset", {"ns1": {"Feature": "f1"}})
    atf.require_config_parameter(
        "PartitionName",
        {
            "primary": {"Nodes": "ALL"},
            "cloud1": {"Nodes": "ns1"},
            "powerdownonidle": {"Nodes": "ns1", "PowerDownOnIdle": "yes"},
        },
    )

    # Define the nf1 feature in 'helpers.conf'; used for testing node features
    atf.require_config_parameter(
        "NodeName",
        {f"{node_prefix}1": {"Feature": "nf1", "Helper": "/bin/true"}},
        source="helpers",
    )

    # Don't run the usual atf.require_slurm_running() because the tests start
    # slurmds manually
    atf.start_slurmctld(clean=True)

    yield

    # Have to manually kill the slurmctld and slurmds in the teardown
    kill_slurmds()
    kill_slurmctld()


@pytest.fixture(scope="function", autouse=True)
def cloud_state():
    """stop all jobs and reset cloud node state after each test"""
    yield

    atf.cancel_all_jobs()
    kill_slurmds()
    atf.restart_slurmctld(clean=True)
    time.sleep(2)


# Helper teardown functions


# Since our teardown doesn't seem to remove the cloud nodes' slurmds, we need
# to kill the slurmds manually
def kill_slurmds():
    get_slurmd_processes = atf.run_command(
        f"pidof {atf.properties['slurm-sbin-dir']}/slurmd"
    )
    pids = get_slurmd_processes["stdout"].strip().split()
    for pid in pids:
        atf.run_command(f"kill {pid}", fatal=True, user="root")


# Since our teardown doesn't seem to remove slurmctlds started with
# "atf.start_slurmctld()", we need to kill the slurmctld manually
def kill_slurmctld():
    get_slurmctld_process = atf.run_command(
        f"pidof {atf.properties['slurm-sbin-dir']}/slurmctld"
    )
    pids = get_slurmctld_process["stdout"].strip().split()
    for pid in pids:
        atf.run_command(f"kill {pid}", fatal=True, user="root")


# Tests


# Test state cycle of cloud nodes: POWERED_DOWN, POWERING_UP, IDLE,
# POWERING_DOWN, POWERED_DOWN
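# Roughly, the cycle exercised here looks like this (driven by the job below
# and the SuspendTime/SuspendTimeout/ResumeTimeout values configured above):
#
#   POWERED_DOWN --(job allocated)--> POWERING_UP --(slurmd registers)--> IDLE
#   IDLE --(idle for SuspendTime)--> POWERING_DOWN --(SuspendTimeout)--> POWERED_DOWN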
def test_cloud_state_cycle():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # Schedule a job to cloud node's partition, transitioning node to ALLOCATED
    # and POWERING_UP state
    job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun hostname'", fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "ALLOCATED", timeout=5, fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "CONFIGURING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)
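
    # With ResumeProgram set to /bin/true nothing actually boots, so the test
    # launches a slurmd for the cloud node by hand (named with -N, with its
    # node configuration passed via --conf); this stands in for the cloud
    # instance coming up and registering with slurmctld.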
    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # The cloud node takes up to PERIODIC_TIMEOUT seconds to register and
    # resume, so give the job PERIODIC_TIMEOUT + 15 seconds to finish
    atf.wait_for_job_state(
        job_id, "COMPLETED", timeout=atf.PERIODIC_TIMEOUT + 15, fatal=True
    )
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # Make sure the cloud node starts suspending once it hasn't received a job
    # for suspend_time
    atf.wait_for_node_state_any(
        f"{node_prefix}1",
        ["POWER_DOWN", "POWERING_DOWN"],
        timeout=suspend_time + 5,
        fatal=True,
    )

    # Make sure the cloud node is fully POWERED_DOWN by suspend_timeout
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERED_DOWN", timeout=suspend_timeout + 5, fatal=True
    )
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"


# Test that cloud node powers down if exceeding resume_timeout
def test_resume_timeout():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # Schedule a job to cloud node's partition, transitioning node to ALLOCATED
    # and POWERING_UP state
    job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun hostname'", fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "ALLOCATED", timeout=5, fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "CONFIGURING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP"

    # Never spin up the accompanying slurmd, waiting resume_timeout for cloud
    # node to be DOWN
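    # Since no slurmd is ever started for the node (and ResumeProgram is just
    # /bin/true), it can never register, so slurmctld should mark it DOWN once
    # resume_timeout expires and then power it back down.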
    atf.wait_for_node_state(
        f"{node_prefix}1", "DOWN", timeout=resume_timeout + 5, fatal=True
    )

    # Assert surpassing resume_timeout correctly set cloud node's state
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Cloud node didn't enter POWERED_DOWN state immediately after surpassing resume timeout"


# Test scontrol setting cloud node state using POWER_UP and POWER_DOWN
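# In short: 'scontrol update nodename=... state=POWER_UP' asks slurmctld to run
# the resume sequence for the node right away, while state=POWER_DOWN requests a
# normal (graceful) power down through the suspend path; both are checked below.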
def test_scontrol_power_up_down():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # scontrol POWER_UP
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_UP",
        fatal=True,
        user="slurm",
    )
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_UP", "POWERING_UP"]) & node_state
    ), "Cloud node should enter powering up process after scontrol command"
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)

    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # scontrol POWER_DOWN
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_DOWN",
        fatal=True,
        user="slurm",
    )
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_DOWN", "POWERING_DOWN"]) & node_state
    ), "POWER_DOWN should immediately be added to cloud node's state"
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_DOWN", fatal=True)
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERED_DOWN", timeout=suspend_timeout + 5, fatal=True
    )
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"


# Test scontrol setting cloud node state using POWER_DOWN_ASAP
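# POWER_DOWN_ASAP drains the node (no new work is scheduled to it), lets the
# job that is already running finish, and only then powers the node down; the
# DRAIN and POWER_DOWN assertions below check exactly that sequence.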
def test_scontrol_power_down_asap():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # scontrol POWER_UP
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_UP",
        fatal=True,
        user="slurm",
    )
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_UP", "POWERING_UP"]) & node_state
    ), "Cloud node should enter powering up process after scontrol command"
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)

    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # Submit job in preparation for POWER_DOWN_ASAP
    job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun sleep 5'", fatal=True)
    atf.wait_for_job_state(
        job_id, "RUNNING", timeout=atf.PERIODIC_TIMEOUT + 5, fatal=True
    )
    assert "ALLOCATED" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Cloud node should be ALLOCATED while running a job"

    # scontrol POWER_DOWN_ASAP
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_DOWN_ASAP",
        fatal=True,
        user="slurm",
    )
    assert "POWER_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "POWER_DOWN should immediately be added to cloud node's state"
    assert "DRAIN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Cloud node should be in DRAIN state to prepare for POWER_DOWN state"

    # Wait for job to finish and then assert cloud node becomes POWERED_DOWN
    atf.wait_for_job_state(job_id, "COMPLETED", fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_DOWN", fatal=True)
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERED_DOWN", timeout=suspend_timeout + 5, fatal=True
    )


# Test scontrol setting cloud node state using POWER_DOWN_FORCE and RESUME
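# Unlike POWER_DOWN_ASAP, POWER_DOWN_FORCE does not wait for running work: the
# job is requeued and the node starts powering down immediately. RESUME is then
# used to cut the powering-down sequence short so the node returns to a usable
# POWERED_DOWN state.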
def test_scontrol_power_down_force_and_resume():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # scontrol POWER_UP
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_UP",
        fatal=True,
        user="slurm",
    )
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_UP", "POWERING_UP"]) & node_state
    ), "Cloud node should enter powering up process after scontrol command"
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)

    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # Submit job in preparation for POWER_DOWN_FORCE to cancel
    job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun sleep 300'", fatal=True)
    atf.wait_for_job_state(
        job_id, "RUNNING", timeout=atf.PERIODIC_TIMEOUT + 5, fatal=True
    )
    assert "ALLOCATED" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Cloud node should be ALLOCATED while running a job"

    # scontrol POWER_DOWN_FORCE when cloud node is already up
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_DOWN_FORCE",
        fatal=True,
        user="slurm",
    )
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # Make sure job is requeued and cloud node is POWERED_DOWN
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_DOWN", "POWERING_DOWN"]) & node_state
    ), "Cloud node should enter powering down process immediately after scontrol command"
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_DOWN", fatal=True)
    assert atf.wait_for_job_state(
        job_id, "PENDING", timeout=10, fatal=True
    ), "Job should be requeued and PENDING after node is powered down"

    # Test that scontrol RESUME sets the cloud node to POWERED_DOWN when it's
    # already POWERING_DOWN
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=RESUME",
        fatal=True,
        user="slurm",
    )
    atf.wait_for_node_state(f"{node_prefix}1", "POWERED_DOWN", timeout=5, fatal=True)


# Test cloud nodes POWER_DOWN and then power up with different ActiveFeatures
# to handle jobs
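# The nf1 feature is provided by the node_features/helpers plugin (helpers.conf
# above), so it is only activated when the node is power cycled. A job
# constrained to nf1 therefore can't use the node while it's up with only f1
# active: the node is first powered down and then brought back up with nf1
# active, which is the sequence this test walks through.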
def test_node_features():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # Periodically make sure that the AvailableFeatures for the cloud node are correct
    available_features = sorted(["f1", "nf1"])
    assert available_features == sorted(
        atf.get_node_parameter(f"{node_prefix}1", "features")
    ), "Cloud node's AvailableFeatures don't match what was set in slurm.conf"

    # Schedule a job to cloud node's partition, transitioning node to ALLOCATED
    # and POWERING_UP state
    job_id = atf.submit_job_sbatch("-p cloud1 -C f1 --wrap 'srun hostname'", fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "ALLOCATED", timeout=5, fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "CONFIGURING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)

    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # Assert cloud node maintains expected configuration of features (not nf1)
    assert available_features == sorted(
        atf.get_node_parameter(f"{node_prefix}1", "features")
    ), "Cloud node's AvailableFeatures don't match what was set in slurm.conf"
    assert ["f1"] == atf.get_node_parameter(
        f"{node_prefix}1", "active_features"
    ), "Cloud node should only have the 'f1' feature when none are explicitly requested"

    # The cloud node takes up to PERIODIC_TIMEOUT seconds to register and
    # resume, so give the job PERIODIC_TIMEOUT + 15 seconds to finish
    atf.wait_for_job_state(
        job_id, "COMPLETED", timeout=atf.PERIODIC_TIMEOUT + 15, fatal=True
    )
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # Now submit a job requiring the nf1 feature, which the cloud node doesn't
    # currently have active
    job_id = atf.submit_job_sbatch(
        f"-p cloud1 -C nf1 -w {node_prefix}1 --wrap 'srun hostname'", fatal=True
    )

    # Make sure cloud node enters POWERING_DOWN state without running the job
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERING_DOWN", timeout=suspend_time + 15, fatal=True
    )
    assert "PENDING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Job should be pending because there are no cloud nodes with the requested feature active"

    # Cloud node should enter "POWERED_DOWN" state
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERED_DOWN", timeout=suspend_timeout + 5, fatal=True
    )

    # Should become ALLOCATED once the scheduler runs, regardless of whether the
    # node was already PLANNED or not
    atf.wait_for_node_state(
        f"{node_prefix}1", "ALLOCATED", timeout=atf.PERIODIC_TIMEOUT + 15, fatal=True
    )

    # Cloud node then enters POWERING_UP once the power thread runs
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "CONFIGURING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)

    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # Test that the cloud node now has the activated feature needed to run the job
    assert available_features == sorted(
        atf.get_node_parameter(f"{node_prefix}1", "features")
    ), "Cloud node's AvailableFeatures don't match what was set in slurm.conf"
    assert available_features == sorted(
        atf.get_node_parameter(f"{node_prefix}1", "active_features")
    ), "Cloud node should have both of its available features active"

    # The cloud node takes up to PERIODIC_TIMEOUT seconds to register and
    # resume, so give the job PERIODIC_TIMEOUT + 15 seconds to finish
    atf.wait_for_job_state(
        job_id, "COMPLETED", timeout=atf.PERIODIC_TIMEOUT + 15, fatal=True
    )
    atf.wait_for_node_state(f"{node_prefix}1", "IDLE", timeout=5, fatal=True)


# Test partition flag 'PowerDownOnIdle=yes'
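# With 'PowerDownOnIdle=yes' on the partition, the node should begin powering
# down as soon as it becomes idle after its job, instead of waiting out the
# SuspendTime used by the other partitions.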
def test_power_down_on_idle():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # Schedule a job to cloud node's partition, transitioning node to ALLOCATED
    # and POWERING_UP state
    job_id = atf.submit_job_sbatch(
        "-p powerdownonidle --wrap 'srun hostname'", fatal=True
    )
    atf.wait_for_node_state(f"{node_prefix}1", "ALLOCATED", timeout=5, fatal=True)
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
    assert "CONFIGURING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP"

    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
    # Remove once bug 16459 is fixed.
    time.sleep(2)

    # Register the new slurmd
    atf.run_command(
        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
        fatal=True,
        user="root",
    )

    # Make sure the cloud node resumes
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERING_UP",
        reverse=True,
        timeout=resume_timeout + 5,
        fatal=True,
    )

    # The cloud node takes up to PERIODIC_TIMEOUT seconds to register and
    # resume, so give the job PERIODIC_TIMEOUT + 15 seconds to finish
    atf.wait_for_job_state(
        job_id, "COMPLETED", timeout=atf.PERIODIC_TIMEOUT + 15, fatal=True
    )

    # Immediately upon job completion and becoming IDLE, cloud node should
    # POWER_DOWN
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_DOWN", "POWERING_DOWN"]) & node_state
    ), "Cloud node wasn't immediately POWER_DOWN once idle, contrary to the 'PowerDownOnIdle=yes' flag on its partition"
    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_DOWN", fatal=True)
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERED_DOWN", timeout=suspend_timeout + 5, fatal=True
    )
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"


# Make the check-in interval of the power save thread longer so we have time to
# POWER_DOWN_FORCE a cloud node that's already POWERED_DOWN and ALLOCATED before
# it's powered up. Kept as the last test because it changes the slurm.conf file
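# In effect the relevant slurm.conf entry becomes something like
# 'SlurmctldParameters=idle_on_node_suspend,power_save_min_interval=10', so the
# power save thread only wakes every power_interval seconds and the node stays
# ALLOCATED+POWERED_DOWN long enough for the scontrol call below to hit it.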
def test_scontrol_power_down_force():
    assert "CLOUD" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node state should always contain CLOUD"
    assert "IDLE" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in IDLE state"
    assert "POWERED_DOWN" in atf.get_node_parameter(
        f"{node_prefix}1", "state"
    ), "CLOUD node must be in POWERED_DOWN state"

    # Change the power save thread's minimum interval and restart slurmctld
    atf.require_config_parameter_includes(
        "SlurmctldParameters", f"power_save_min_interval={power_interval}"
    )
    atf.restart_slurmctld(clean=True)

    # Submit a job, get it assigned to the cloud node, and make sure everything
    # goes well before the cloud node enters POWERING_UP state
    job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun sleep 300'", fatal=True)
    atf.wait_for_node_state(
        f"{node_prefix}1", "ALLOCATED", timeout=5, poll_interval=0.1, fatal=True
    )
    assert "CONFIGURING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Submitted job should be CONFIGURING while there is a corresponding ALLOCATED cloud node"

    # Now POWER_DOWN_FORCE the cloud node, make sure it enters POWER_DOWN and
    # never POWERING_UP, and ensure that the job gets requeued
    assert atf.wait_for_node_state(
        f"{node_prefix}1", "POWERED_DOWN"
    ), "Couldn't run POWER_DOWN_FORCE on cloud node before it left POWERED_DOWN state. Try making power_interval longer to give the test more time"
    atf.run_command(
        f"scontrol update nodename={node_prefix}1 state=POWER_DOWN_FORCE",
        fatal=True,
        user="slurm",
    )
    assert atf.wait_for_node_state(
        f"{node_prefix}1", "IDLE"
    ), "Per 'SlurmctldParameters=idle_on_node_suspend' in slurm.conf, cloud node should always be in IDLE state except when ALLOCATED/MIXED for an assigned job"

    # Handle a slight (and pretty unlikely) race condition where the node could
    # already be POWERING_DOWN before the check
    node_state = set(atf.get_node_parameter(f"{node_prefix}1", "state"))
    assert (
        set(["POWER_DOWN", "POWERING_DOWN"]) & node_state
    ), "POWER_DOWN should immediately be added to cloud node's state"
    assert "PENDING" == atf.get_job_parameter(
        job_id, "JobState", default="NOT_FOUND", quiet=True
    ), "Job should be PENDING after being requeued"

    # Make sure the node finishes powering down correctly
    atf.wait_for_node_state(
        f"{node_prefix}1", "POWERING_DOWN", timeout=power_interval + 5, fatal=True
    )
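    # Note: on slurmctld releases before 24.05, completing the power down may
    # take up to one extra power_save_min_interval cycle (see Ticket 19895),
    # which is what the version check below allows for.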
    power_down_timeout = suspend_timeout + 5
    # FIXED: Ticket 19895
    if atf.get_version("sbin/slurmctld") < (24, 5):
        power_down_timeout += power_interval
    atf.wait_for_node_state(
        f"{node_prefix}1",
        "POWERED_DOWN",
        timeout=power_down_timeout,
        fatal=True,
    )