| ############################################################################ |
| # Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved |
| ############################################################################ |
| """Validate the slurmd-supplied-field first-registration gate for |
| dynamic-normal nodes. The same gate |
| (node_ptr->boot_time > node_ptr->last_response in |
| validate_node_specs()) governs four fields delivered by slurmd: |
| topology (via dynamic_conf Topology=...), instance_id (--instance-id), |
| instance_type (--instance-type), and extra (--extra). |
| |
| The gate fires on the first registration we see for the node |
| (last_response is 0, slurmd's reported boot_time is positive) and on |
| an actual node reboot (boot_time advances past last_response). It is |
| the same condition the existing reboot-detection branch uses to mark |
| a node "unexpectedly rebooted". Steady-state pings and slurmd |
| restarts without an actual reboot have boot_time < last_response and |
| are skipped, so admin overrides via scontrol update node ... are not |
| clobbered. last_response is state-saved, so the gate also survives |
| slurmctld restart: admin overrides persist across controller |
| downtime. |
| |
Topology scenarios (a/c/d/e/f/g/m) check two views in parallel: the
Topology= string on the node (scontrol show node), and the leaf
switch the node lives under (scontrol show topology). The first
reads node_ptr->topology_str directly, the second proves the
topology plugin's add/remove path also ran. Scenario h does not look
at topology; it checks that a reboot during slurmctld downtime marks
the node DOWN with a reboot-related reason.
| |
instance_id / instance_type / extra scenarios (i/j/k/l for first
registration and admin-override survival, n/o/p for the reboot case)
cover the sibling field-apply paths in the same gated block; each
field has its own if-statement inside the gate, so per-field coverage
guards against one being broken without breaking the others.
| """ |
| import re |
| |
| import atf |
| import pytest |
| |
| |
# Base port for the dynamic slurmd instances started by this module. Each
# test passes PORT_BASE plus its own distinct offset as "Port=" in --conf.
PORT_BASE = 61500
| |
| |
@pytest.fixture(scope="module", autouse=True)
def setup():
    """Module-wide fixture: declare version/config requirements, install a
    tree topology with three leaf switches, and make sure Slurm is running."""
    atf.require_auto_config("creates a custom topology.yaml and restarts slurmctld")

    # Both daemons need the 26.11 behavior under test.
    version_reason = (
        "being able to change dynamic topology after reboot added in 26.11"
    )
    for daemon in ("sbin/slurmctld", "sbin/slurmd"):
        atf.require_version((26, 11), daemon, reason=version_reason)

    required_parameters = (
        ("SelectType", "select/cons_tres"),
        ("SelectTypeParameters", "CR_Core_Memory"),
        # One static node to bootstrap; one dynamic-normal node per test.
        ("MaxNodeCount", 2),
        # Scenario h depends on the "Node unexpectedly rebooted" branch in
        # validate_node_specs(), which is skipped when ReturnToService=2
        # (RETURN_TO_SERVICE_ALL). Force a value that lets the branch fire.
        ("ReturnToService", "1"),
    )
    for parameter, value in required_parameters:
        atf.require_config_parameter(parameter, value)

    # A tree topology with three named leaf switches, so the dynamic
    # node's --conf "Topology=tree_topo:sw_*" places it under that
    # switch and we can verify placement via scontrol show topology.
    # node_ptr->topology_str is interpreted by the multi-topology
    # dispatcher as "<topology_name>:<unit>" (see
    # src/interfaces/topology.c:topology_g_add_rm_node), so the
    # tree_topo: prefix is required.
    topology_yaml = """
- topology: tree_topo
  cluster_default: true
  tree:
    switches:
      - switch: sw_root
        children: sw_alpha,sw_plain,sw_gamma
      - switch: sw_alpha
      - switch: sw_plain
      - switch: sw_gamma
"""
    atf.require_config_file("topology.yaml", topology_yaml)

    atf.require_slurm_running()
| |
| |
def _slurmd_cmd(name, port, conf_extra="", slurmd_args=""):
    """Return the shell command that launches a dynamic (-Z) slurmd.

    `name` is passed to -N, `port` becomes "Port=<port>" inside --conf,
    `conf_extra` (if non-empty) is appended to the --conf string, and
    `slurmd_args` (if non-empty) is appended after the --conf option."""
    conf_items = [f"Port={port}"]
    if conf_extra:
        conf_items.append(conf_extra)
    conf = " ".join(conf_items)

    slurmd = f"{atf.properties['slurm-sbin-dir']}/slurmd"
    pieces = [f"{slurmd} -N {name} -Z --conf '{conf}'"]
    if slurmd_args:
        pieces.append(slurmd_args)
    return " ".join(pieces)
| |
| |
def _start_dynamic_slurmd(name, port, conf_extra="", slurmd_args=""):
    """Launch a dynamic-normal slurmd as root, wait for the node to show up
    in the node list, then require it to reach IDLE."""
    launch_cmd = _slurmd_cmd(name, port, conf_extra, slurmd_args)
    atf.run_command(launch_cmd, user="root", fatal=True)

    def _node_is_listed():
        return name in atf.get_nodes(quiet=True)

    def _as_is(found):
        return found

    # The node only exists on the controller after its first registration.
    atf.repeat_until(_node_is_listed, _as_is, timeout=30, fatal=True)

    reached_idle = atf.wait_for_node_state(name, "IDLE", timeout=30, fatal=True)
    assert reached_idle, f"Dynamic node {name} should reach IDLE state"
| |
| |
def _kill_dynamic_slurmd(name):
    """Kill the slurmd for the given dynamic node and wait for the process
    to exit so we can safely start it again.

    Best effort: returns silently when no matching process is found, and
    does not fail if the process is still present after the 15s wait."""
    pattern = f"{atf.properties['slurm-sbin-dir']}/slurmd -N {name} -Z"

    def _matching_pids():
        out = atf.run_command_output(f"pgrep -f '{pattern}'", fatal=False)
        # pgrep prints one PID per line; split() also handles empty output.
        return out.split()

    pids = _matching_pids()
    if not pids:
        return
    # Join the PIDs with spaces: pasting pgrep's raw multi-line output into
    # the command would put every PID after the first on its own line, where
    # the shell runs it as a command instead of passing it to kill.
    atf.run_command(f"kill {' '.join(pids)}", user="root", fatal=False)
    atf.repeat_until(
        _matching_pids,
        lambda remaining: not remaining,
        timeout=15,
        fatal=False,
    )
| |
| |
def _delete_dynamic_node(name):
    """Test cleanup: stop the node's slurmd, then delete the node from the
    controller. The delete is non-fatal so cleanup never masks a failure."""
    _kill_dynamic_slurmd(name)
    delete_cmd = f"scontrol delete NodeName={name}"
    atf.run_command(delete_cmd, user="slurm", fatal=False)
| |
| |
def _wait_for_slurmd_reregister(name, old_start_time, timeout=30):
    """Block until the node reports a slurmd_start_time that is set and
    different from `old_start_time`, i.e. a freshly started slurmd has
    registered with the controller; fail the test after `timeout` seconds.

    Waiting for IDLE alone is not enough when the node was already IDLE
    before slurmd was killed: the state may never change, so a caller
    could read stale field values before the new registration arrives."""

    def _current_start_time():
        return atf.get_node_parameter(name, "slurmd_start_time")

    def _is_fresh(start_time):
        return start_time and start_time != old_start_time

    if atf.repeat_until(
        _current_start_time, _is_fresh, timeout=timeout, fatal=False
    ):
        return

    pytest.fail(
        f"Slurmd did not re-register on {name} within {timeout}s "
        f"(slurmd_start_time still {old_start_time})"
    )
| |
| |
def _node_topology(name):
    """Look up the "topology" parameter currently reported for node `name`
    (whatever atf.get_node_parameter returns, possibly None)."""
    topology = atf.get_node_parameter(name, "topology")
    return topology
| |
| |
def _assert_topology(name, expected, timeout=10):
    """Poll the node's topology until it equals `expected`; on timeout fail
    the test, reporting the value actually observed.

    expected=None means "cleared": any falsy value (None or empty) passes."""
    if expected is None:

        def _matches(topo):
            return not topo

    else:

        def _matches(topo):
            return topo == expected

    if atf.repeat_until(
        lambda: _node_topology(name), _matches, timeout=timeout, fatal=False
    ):
        return

    pytest.fail(
        f"Expected topology {expected!r} on {name}, got "
        f"{_node_topology(name)!r}"
    )
| |
| |
def _switch_for_node(name):
    """Find the leaf switch that lists node `name`.

    Scans 'scontrol show topology' line by line and considers only
    Level=0 (leaf) entries, since only those list nodes. Returns the
    SwitchName of the first match, or None if no leaf contains the node."""
    leaf_line = re.compile(r"SwitchName=(\S+) Level=0 .*Nodes=(\S+)")
    topology_dump = atf.run_command_output("scontrol show topology", fatal=False)
    for entry in topology_dump.splitlines():
        found = leaf_line.match(entry)
        if found is None:
            continue
        switch_name, node_range = found.groups()
        # Nodes= may be a range expression, so expand it before testing.
        if name in atf.node_range_to_list(node_range):
            return switch_name
    return None
| |
| |
def _assert_node_under_switch(name, expected, timeout=10):
    """Poll until the leaf switch containing `name` equals `expected`; on
    timeout fail the test with the observed switch and a topology dump.

    expected=None asserts the node sits under no configured leaf switch
    (i.e. its topology was cleared)."""

    def _current_switch():
        return _switch_for_node(name)

    def _is_expected(switch):
        return switch == expected

    if atf.repeat_until(
        _current_switch, _is_expected, timeout=timeout, fatal=False
    ):
        return

    topo_dump = atf.run_command_output("scontrol show topology", fatal=False)
    pytest.fail(
        f"Expected {name} under switch {expected!r}, got "
        f"{_switch_for_node(name)!r}. scontrol show topology:\n"
        f"{topo_dump}"
    )
| |
| |
def test_baseline_topology_from_slurmd_conf():
    """Scenario a: a dynamic-normal slurmd started with
    Topology=tree_topo:sw_alpha in --conf reports that topology string and
    is placed under leaf switch sw_alpha."""
    node = "node10"
    switch = "sw_alpha"
    topology = f"tree_topo:{switch}"
    try:
        _start_dynamic_slurmd(node, PORT_BASE, f"Topology={topology}")
        _assert_topology(node, topology)
        _assert_node_under_switch(node, switch)
    finally:
        _delete_dynamic_node(node)
| |
| |
def test_admin_override_survives_slurmd_ping():
    """Scenario c: once an admin has set Topology= via scontrol update, a
    later slurmd registration that still reports the original Topology
    must not overwrite the admin's value.

    The re-registration is forced by killing slurmd and starting it again
    with the same --conf; from the controller's perspective this is a
    registration RPC carrying the original Topology, which is exactly
    the ping case for the new validate_node_specs() gate."""
    node = "node11"
    slurmd_port = PORT_BASE + 1
    slurmd_conf = "Topology=tree_topo:sw_alpha"
    try:
        _start_dynamic_slurmd(node, slurmd_port, slurmd_conf)
        _assert_topology(node, "tree_topo:sw_alpha")
        _assert_node_under_switch(node, "sw_alpha")

        # Admin moves the node to a different leaf switch.
        override_cmd = f"scontrol update NodeName={node} Topology=tree_topo:sw_plain"
        atf.run_command(override_cmd, user="slurm", fatal=True)
        _assert_topology(node, "tree_topo:sw_plain")
        _assert_node_under_switch(node, "sw_plain")

        # Re-register with the same --conf (slurmd still says sw_alpha).
        previous_start = atf.get_node_parameter(node, "slurmd_start_time")
        _kill_dynamic_slurmd(node)
        restart_cmd = _slurmd_cmd(node, slurmd_port, slurmd_conf)
        atf.run_command(restart_cmd, user="root", fatal=True)
        _wait_for_slurmd_reregister(node, previous_start)

        # The admin override must still be in effect.
        _assert_topology(node, "tree_topo:sw_plain")
        _assert_node_under_switch(node, "sw_plain")
    finally:
        _delete_dynamic_node(node)
| |
| |
def test_admin_override_survives_slurmd_restart():
    """Scenario d: an admin Topology override survives an explicit slurmd
    kill/restart that uses the same --conf.

    Differs from scenario c only in framing (full daemon restart vs. a
    registration RPC); both hit the same controller code path, so this
    test mainly guards against future drift if that ever changes."""
    node = "node12"
    slurmd_port = PORT_BASE + 2
    original_conf = "Topology=tree_topo:sw_alpha"
    override_topology = "tree_topo:sw_plain"
    override_switch = "sw_plain"

    def _check_override_in_place():
        _assert_topology(node, override_topology)
        _assert_node_under_switch(node, override_switch)

    try:
        _start_dynamic_slurmd(node, slurmd_port, original_conf)
        atf.run_command(
            f"scontrol update NodeName={node} Topology={override_topology}",
            user="slurm",
            fatal=True,
        )
        _check_override_in_place()

        # Bounce slurmd; it re-reports the original topology.
        start_before = atf.get_node_parameter(node, "slurmd_start_time")
        _kill_dynamic_slurmd(node)
        atf.run_command(
            _slurmd_cmd(node, slurmd_port, original_conf),
            user="root",
            fatal=True,
        )
        _wait_for_slurmd_reregister(node, start_before)
        _check_override_in_place()
    finally:
        _delete_dynamic_node(node)
| |
| |
def test_slurmd_new_topology_ignored():
    """Scenario e: when slurmd restarts with a different Topology=, the
    new value is IGNORED -- slurmd does not take topology ownership
    back. To re-apply a new Topology= the operator must scontrol delete
    the node and let it re-register."""
    name = "node13"
    port = PORT_BASE + 3
    try:
        _start_dynamic_slurmd(name, port, "Topology=tree_topo:sw_alpha")
        _assert_topology(name, "tree_topo:sw_alpha")
        _assert_node_under_switch(name, "sw_alpha")

        # Remember the current slurmd start time so we can tell when the
        # restarted slurmd has actually registered.
        old_start = atf.get_node_parameter(name, "slurmd_start_time")
        _kill_dynamic_slurmd(name)
        atf.run_command(
            _slurmd_cmd(name, port, "Topology=tree_topo:sw_gamma"),
            user="root",
            fatal=True,
        )
        # The node was already IDLE, so waiting for IDLE alone could return
        # before the new registration is processed and the "unchanged"
        # checks below would pass on stale data. Wait for the new slurmd
        # to register first.
        _wait_for_slurmd_reregister(name, old_start)
        assert atf.wait_for_node_state(name, "IDLE", timeout=30, fatal=True)
        # Slurmd's new Topology= must be ignored: the node stays under
        # the topology assigned on the first registration.
        _assert_topology(name, "tree_topo:sw_alpha")
        _assert_node_under_switch(name, "sw_alpha")
    finally:
        _delete_dynamic_node(name)
| |
| |
def test_topology_retained_when_slurmd_drops_topology():
    """Scenario f: when slurmd restarts without Topology= in --conf,
    the topology assigned on the first registration is retained
    (slurmd's drop is not a take-back signal)."""
    name = "node14"
    port = PORT_BASE + 4
    try:
        _start_dynamic_slurmd(name, port, "Topology=tree_topo:sw_alpha")
        _assert_topology(name, "tree_topo:sw_alpha")
        _assert_node_under_switch(name, "sw_alpha")

        # Remember the current slurmd start time so we can tell when the
        # restarted slurmd has actually registered.
        old_start = atf.get_node_parameter(name, "slurmd_start_time")
        _kill_dynamic_slurmd(name)
        # Restart slurmd without Topology= -- the prior value must stay.
        atf.run_command(_slurmd_cmd(name, port), user="root", fatal=True)
        # The node was already IDLE, so an IDLE wait alone could return
        # before the new registration arrives and the "retained" checks
        # would pass on stale data. Wait for the re-registration first.
        _wait_for_slurmd_reregister(name, old_start)
        assert atf.wait_for_node_state(name, "IDLE", timeout=30, fatal=True)
        _assert_topology(name, "tree_topo:sw_alpha")
        _assert_node_under_switch(name, "sw_alpha")
    finally:
        _delete_dynamic_node(name)
| |
| |
def test_admin_override_survives_slurmctld_restart():
    """Scenario g: last_response is persisted in the node state file,
    so an admin override placed after slurmd's first registration must
    survive a slurmctld restart and the next slurmd re-registration:
    on restore the gate (boot_time > last_response) correctly skips
    when slurmd reports the same OS boot it had before the controller
    went down."""
    name = "node15"
    port = PORT_BASE + 5
    try:
        _start_dynamic_slurmd(name, port, "Topology=tree_topo:sw_alpha")
        atf.run_command(
            f"scontrol update NodeName={name} Topology=tree_topo:sw_plain",
            user="slurm",
            fatal=True,
        )
        _assert_topology(name, "tree_topo:sw_plain")
        _assert_node_under_switch(name, "sw_plain")

        # Capture the running slurmd's start time BEFORE restarting the
        # controller. If the old slurmd re-registers after the restart it
        # reports this same value, so it cannot be mistaken for the new
        # slurmd started below.
        old_start = atf.get_node_parameter(name, "slurmd_start_time")

        # Restart slurmctld -- boot_time must be reloaded from the
        # state file so the next slurmd registration is recognized as
        # "not the first" and leaves topology_str alone.
        atf.restart_slurmctld(clean=False)

        # Force a slurmd re-registration carrying the same --conf.
        _kill_dynamic_slurmd(name)
        atf.run_command(
            _slurmd_cmd(name, port, "Topology=tree_topo:sw_alpha"),
            user="root",
            fatal=True,
        )
        # An IDLE wait alone can return before the new registration is
        # processed (the node may already be IDLE), which would let the
        # checks below pass on stale data. Wait for re-registration first.
        _wait_for_slurmd_reregister(name, old_start)
        assert atf.wait_for_node_state(name, "IDLE", timeout=30, fatal=True)
        _assert_topology(name, "tree_topo:sw_plain")
        _assert_node_under_switch(name, "sw_plain")
    finally:
        _delete_dynamic_node(name)
| |
| |
def test_reboot_during_slurmctld_downtime_detected():
    """Scenario h: a node reboot during slurmctld downtime must be
    detected on the next registration after the controller comes
    back up. State-saved boot_time makes the "Node unexpectedly
    rebooted" branch in validate_node_specs fire correctly across
    controller restart, where previously boot_time reset to 0 on
    every controller start and the check could not fire on the
    first post-restart registration.

    Simulated by killing slurmd while slurmctld is down and
    restarting it with -b, which sets slurmd's conf->boot_time to
    "now" (slurmd.c) so the controller derives a boot_time later
    than the saved last_response."""
    name = "node16"
    port = PORT_BASE + 6
    # Tracks whether this test has left slurmctld stopped, so the cleanup
    # below can bring it back if the test fails while it is down.
    slurmctld_stopped = False
    try:
        # _start_dynamic_slurmd() already waits for the node to be IDLE.
        _start_dynamic_slurmd(name, port, "Topology=tree_topo:sw_alpha")

        # Stop slurmctld; slurmd keeps running but its pings fail.
        atf.stop_slurmctld()
        slurmctld_stopped = True

        # Kill slurmd and restart with -b. The -b flag sets slurmd's
        # conf->boot_time = time(NULL), so on next registration the
        # controller-side boot_time = now - up_time will be slurmd's
        # restart time, which is later than the last_response we have
        # in the state file from the pre-shutdown registration.
        _kill_dynamic_slurmd(name)
        atf.run_command(
            _slurmd_cmd(name, port, "Topology=tree_topo:sw_alpha") + " -b",
            user="root",
            fatal=True,
        )

        # Bring slurmctld back; state is restored (including boot_time
        # and last_response). Slurmd's pending registration is then
        # processed and the reboot-detection branch should fire.
        atf.start_slurmctld(clean=False)
        slurmctld_stopped = False

        assert atf.wait_for_node_state(
            name, "DOWN", timeout=30, fatal=True
        ), "Node should be marked DOWN after reboot-during-downtime"

        reason = atf.get_node_parameter(name, "reason")
        assert (
            reason and "rebooted" in reason.lower()
        ), f"Expected reboot-related reason on {name}, got: {reason!r}"
    finally:
        # If the test failed while the controller was down, restart it so
        # the node cleanup works and later tests in the module still have
        # a running slurmctld.
        if slurmctld_stopped:
            atf.start_slurmctld(clean=False)
        _delete_dynamic_node(name)
| |
| |
| # ---------------------------------------------------------------------------- |
| # instance_id / instance_type / extra share the same first-registration gate |
| # (boot_time > last_response) as topology. The scenarios below exercise the |
| # parallel field-apply paths in the unified gated block in |
| # validate_node_specs(): each field is its own if-statement inside the gate, |
| # so per-field coverage guards against one being broken without breaking the |
| # others. |
| # ---------------------------------------------------------------------------- |
| |
| |
def _assert_node_field(name, field, expected, timeout=10):
    """Poll node parameter `field` on `name` until it equals `expected`;
    on timeout fail the test, reporting the value read at that point."""

    def _read_field():
        return atf.get_node_parameter(name, field)

    def _is_expected(value):
        return value == expected

    if atf.repeat_until(_read_field, _is_expected, timeout=timeout, fatal=False):
        return

    pytest.fail(
        f"Expected {field}={expected!r} on {name}, got "
        f"{atf.get_node_parameter(name, field)!r}"
    )
| |
| |
def test_baseline_instance_id_type_extra_from_slurmd_conf():
    """Scenario i: a dynamic-normal slurmd started with --instance-id,
    --instance-type and --extra populates the three matching node fields
    on its first registration."""
    node = "node17"
    expected_fields = (
        ("instance_id", "id-i"),
        ("instance_type", "type-i"),
        ("extra", "extra-i"),
    )
    try:
        _start_dynamic_slurmd(
            node,
            PORT_BASE + 7,
            slurmd_args="--instance-id=id-i --instance-type=type-i --extra=extra-i",
        )
        for field, value in expected_fields:
            _assert_node_field(node, field, value)
    finally:
        _delete_dynamic_node(node)
| |
| |
@pytest.mark.parametrize(
    "name, field,slurmd_flag,scontrol_kw,orig,admin",
    [
        ("node18", "instance_id", "--instance-id", "InstanceId", "id-orig", "id-admin"),
        (
            "node19",
            "instance_type",
            "--instance-type",
            "InstanceType",
            "type-orig",
            "type-admin",
        ),
        ("node20", "extra", "--extra", "Extra", "extra-orig", "extra-admin"),
    ],
)
def test_admin_override_survives_slurmd_restart_field(
    name, field, slurmd_flag, scontrol_kw, orig, admin
):
    """Scenarios j/k/l: an admin's scontrol update of the field survives a
    slurmd kill/restart that re-reports the original value. Each field is
    its own pytest case so a per-field regression localizes to one apply
    block in the unified gate."""
    # Each parametrized case gets its own port: PORT_BASE + 8, + 9, + 10.
    field_slot = ("instance_id", "instance_type", "extra").index(field)
    port = PORT_BASE + 8 + field_slot
    original_args = f"{slurmd_flag}={orig}"
    try:
        _start_dynamic_slurmd(name, port, slurmd_args=original_args)
        _assert_node_field(name, field, orig)

        # Admin overrides the slurmd-supplied value.
        override_cmd = f"scontrol update NodeName={name} {scontrol_kw}={admin}"
        atf.run_command(override_cmd, user="slurm", fatal=True)
        _assert_node_field(name, field, admin)

        # Bounce slurmd with the original value and wait for it to register.
        start_before = atf.get_node_parameter(name, "slurmd_start_time")
        _kill_dynamic_slurmd(name)
        restart_cmd = _slurmd_cmd(name, port, slurmd_args=original_args)
        atf.run_command(restart_cmd, user="root", fatal=True)
        _wait_for_slurmd_reregister(name, start_before)

        _assert_node_field(name, field, admin)
    finally:
        _delete_dynamic_node(name)
| |
| |
def test_slurmd_reboot_applies_new_topology():
    """Scenario m: restarting slurmd with -b (simulating a node reboot)
    and a different Topology= in --conf must apply the new topology,
    even over an admin override placed via scontrol after the first
    registration. This covers the reboot half of the first-reg/reboot
    gate: boot_time advances past last_response and the field-apply
    block in validate_node_specs re-runs for any node type. The
    reboot-detection branch also marks the node DOWN with "unexpectedly
    rebooted", but the field-apply block runs first in the same call,
    so topology_str ends up with slurmd's reboot value (overriding the
    admin's pre-reboot setting).

    This scenario was added because without the topology-on-reboot
    change, the apply block is gated on
    IS_NODE_CLOUD && (was_powering_up || was_powered_down) and never
    fires for dynamic-normal nodes -- subsequent slurmd reboots
    silently keep the admin's pre-reboot topology."""
    node = "node21"
    slurmd_port = PORT_BASE + 11

    def _expect_placement(switch):
        # Check both views: the node's Topology= string and its leaf switch.
        _assert_topology(node, f"tree_topo:{switch}")
        _assert_node_under_switch(node, switch)

    try:
        _start_dynamic_slurmd(node, slurmd_port, "Topology=tree_topo:sw_alpha")
        _expect_placement("sw_alpha")

        # Admin override before the reboot. On a normal (no-reboot)
        # slurmd restart this would survive (see scenarios c/d), but
        # a real reboot brings in slurmd's new view.
        override_cmd = f"scontrol update NodeName={node} Topology=tree_topo:sw_plain"
        atf.run_command(override_cmd, user="slurm", fatal=True)
        _expect_placement("sw_plain")

        # Kill slurmd and restart with -b AND a different Topology=.
        # The -b sets conf->boot_time = now so the controller-derived
        # boot_time on the next registration is later than the saved
        # last_response, firing the gate.
        _kill_dynamic_slurmd(node)
        reboot_cmd = (
            _slurmd_cmd(node, slurmd_port, "Topology=tree_topo:sw_gamma") + " -b"
        )
        atf.run_command(reboot_cmd, user="root", fatal=True)

        # The node is marked DOWN by the reboot-detection branch, but
        # topology_str is updated by the field-apply block earlier in
        # the same validate_node_specs() call -- replacing the admin
        # override with slurmd's reboot value.
        went_down = atf.wait_for_node_state(node, "DOWN", timeout=30, fatal=True)
        assert went_down, "Node should be marked DOWN after simulated reboot"
        _expect_placement("sw_gamma")
    finally:
        _delete_dynamic_node(node)
| |
| |
@pytest.mark.parametrize(
    "name,field,slurmd_flag,scontrol_kw,orig,admin,new",
    [
        (
            "node22",
            "instance_id",
            "--instance-id",
            "InstanceId",
            "id-orig",
            "id-admin",
            "id-new",
        ),
        (
            "node23",
            "instance_type",
            "--instance-type",
            "InstanceType",
            "type-orig",
            "type-admin",
            "type-new",
        ),
        (
            "node24",
            "extra",
            "--extra",
            "Extra",
            "extra-orig",
            "extra-admin",
            "extra-new",
        ),
    ],
)
def test_slurmd_reboot_applies_new_field(
    name, field, slurmd_flag, scontrol_kw, orig, admin, new
):
    """Scenarios n/o/p: restarting slurmd with -b (simulating a node
    reboot) and a different value for the field applies the new value,
    even over an admin override placed between the original registration
    and the reboot. Mirrors scenario m for the corresponding field-apply
    path in the unified gated block. Each field is its own pytest case
    so a per-field regression localizes."""
    # Each parametrized case gets its own port: PORT_BASE + 12, + 13, + 14.
    field_slot = ("instance_id", "instance_type", "extra").index(field)
    port = PORT_BASE + 12 + field_slot
    try:
        _start_dynamic_slurmd(name, port, slurmd_args=f"{slurmd_flag}={orig}")
        _assert_node_field(name, field, orig)

        # Admin overrides the slurmd-supplied value before the "reboot".
        override_cmd = f"scontrol update NodeName={name} {scontrol_kw}={admin}"
        atf.run_command(override_cmd, user="slurm", fatal=True)
        _assert_node_field(name, field, admin)

        # Simulated reboot: restart slurmd with -b and a new field value.
        _kill_dynamic_slurmd(name)
        reboot_cmd = _slurmd_cmd(name, port, slurmd_args=f"{slurmd_flag}={new} -b")
        atf.run_command(reboot_cmd, user="root", fatal=True)

        went_down = atf.wait_for_node_state(name, "DOWN", timeout=30, fatal=True)
        assert went_down, "Node should be marked DOWN after simulated reboot"
        _assert_node_field(name, field, new)
    finally:
        _delete_dynamic_node(name)