# blob: 96ff717d82930d7150d7945f7472244b76f43439 [file] [edit]
############################################################################
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
############################################################################
"""Regression: an EXTERNAL node must not enter NOT_RESPONDING after resume.
EXTERNAL nodes have no slurmd, so the controller must not flag them as
NODE_STATE_NO_RESPOND when transitioning back to IDLE via `state=resume`.
"""
import atf
import pytest
@pytest.fixture(scope="module", autouse=True)
def setup():
    """Declare this module's test-environment requirements.

    Runs once per module and is applied automatically to every test in it.
    The calls are issued in a fixed order: the auto-config requirement first,
    then each required configuration parameter, and finally the requirement
    that Slurm is running.
    """
    atf.require_auto_config("Wants to create EXTERNAL dynamic nodes")

    # (parameter name, required value) pairs, applied in this order.
    required_parameters = (
        ("MaxNodeCount", 8),
        ("SelectType", "select/cons_tres"),
        ("SelectTypeParameters", "CR_CPU"),
    )
    for parameter, value in required_parameters:
        atf.require_config_parameter(parameter, value)

    atf.require_slurm_running()
@pytest.fixture
def external_node():
    """Provide a freshly created EXTERNAL node and remove it afterwards.

    Yields:
        The name of the node created with ``state=external``.

    Both the create and the delete command are run as the user stored under
    ``atf.properties["slurm-user"]``.
    """
    node_name = "ext_resume_test"
    run_as = atf.properties["slurm-user"]

    create_cmd = f"scontrol create nodename={node_name} state=external"
    delete_cmd = f"scontrol delete nodename={node_name}"

    # Setup: creation is run with fatal=True (presumably aborts the test on
    # failure -- confirm against atf.run_command).
    atf.run_command(create_cmd, user=run_as, fatal=True)

    yield node_name

    # Teardown: deletion is run quietly and without fatal=True.
    atf.run_command(delete_cmd, user=run_as, quiet=True)
def _states(node):
    """Return the ``state`` parameter of *node* in list form.

    A plain string is wrapped in a one-element list; any other value returned
    by ``atf.get_node_parameter`` is passed through unchanged.
    """
    value = atf.get_node_parameter(node, "state")
    return [value] if isinstance(value, str) else value
@pytest.mark.xfail(
    atf.get_version("bin/scontrol") < (25, 11),
    reason="Ticket 50831: EXTERNAL node resume fix is only present in slurmctld 25.11+, but scontrol required due issue #50689 of NODE_STATE_EXTERNAL missing with --json",
)
@pytest.mark.parametrize("pre_state", ["drain", "down"])
def test_resume_external_no_not_responding(external_node, pre_state):
    """Check that resuming an EXTERNAL node never leaves it NOT_RESPONDING.

    Steps, for each parametrized ``pre_state`` ("drain" or "down"):
      1. Verify the fresh node is EXTERNAL and not NOT_RESPONDING.
      2. Put the node into ``pre_state`` and verify the matching flag is set
         without NOT_RESPONDING.
      3. Resume the node and verify it is IDLE, the pre-state flag is gone,
         and NOT_RESPONDING is still absent.

    Marked xfail when the scontrol version is below (25, 11).
    """
    node = external_node
    run_as = atf.properties["slurm-user"]

    def run_checked(command):
        # Every scontrol update in this test runs as the slurm user with
        # fatal=True.
        atf.run_command(command, user=run_as, fatal=True)

    # Step 1: sanity-check the freshly created node.
    initial = _states(node)
    assert "EXTERNAL" in initial, f"Setup error: {node} not EXTERNAL: {initial!r}"
    assert (
        "NOT_RESPONDING" not in initial
    ), f"Fresh EXTERNAL {node} should not be NOT_RESPONDING: {initial!r}"

    # Step 2: move the node into the requested pre-resume state.
    run_checked(
        f"scontrol update nodename={node} state={pre_state} reason=test_external_resume"
    )

    # State flag expected to appear for the requested pre-state.
    if pre_state == "drain":
        expected_pre = "DRAIN"
    else:
        expected_pre = "DOWN"

    st = _states(node)
    assert expected_pre in st, (
        f"EXTERNAL {node} should be {expected_pre} after state={pre_state}; "
        f"state={st!r}"
    )
    assert "NOT_RESPONDING" not in st, (
        f"EXTERNAL {node} must not be NOT_RESPONDING after state={pre_state}; "
        f"state={st!r}"
    )

    # Step 3: resume and re-read the state.
    run_checked(f"scontrol update nodename={node} state=resume")

    st = _states(node)
    assert "IDLE" in st, (
        f"EXTERNAL {node} should be IDLE after resume from {pre_state}; "
        f"state={st!r}"
    )
    assert expected_pre not in st, (
        f"EXTERNAL {node} should no longer be {expected_pre} after resume; "
        f"state={st!r}"
    )
    assert "NOT_RESPONDING" not in st, (
        f"EXTERNAL {node} must not be NOT_RESPONDING after resume from "
        f"{pre_state}; state={st!r}"
    )