blob: 36a21989dfd25f9516097b0c94fff4891e1488f6 [file] [log] [blame] [edit]
############################################################################
# Copyright (C) SchedMD LLC.
############################################################################
import atf
import pytest
import pexpect
suser = atf.properties["slurm-user"]
# Setup
@pytest.fixture(scope="module", autouse=True)
def setup():
atf.require_slurm_running()
def test_single_job():
"""Submit job directly to slurmd without use of slurmctld scheduler."""
node_dict = atf.get_nodes()
node = list(node_dict.keys())[0]
job_output = atf.run_job_output(
f"-N1 --nodelist={node} --no-allocate printenv SLURMD_NODENAME", user=suser
)
assert (
job_output.strip("\n") == node
), f"The job failed to print out the node name: {node}"
def test_multiple_jobs():
"""
Run three tasks at a time on some node and do so repeatedly
This checks for slurmd race conditions
logic has time to be processed (slurmd -> slurmctld messages)
Note: process output in order of expected completion
"""
node_dict = atf.get_nodes()
node = list(node_dict.keys())[0]
for it in range(100):
child1 = pexpect.spawn(f"srun -N1 --nodelist={node} true")
child2 = pexpect.spawn(
f'sudo -u {suser} bash -lc "srun -N1 --nodelist={node} -Z sleep 0.5"'
)
child3 = pexpect.spawn(
f'sudo -u {suser} bash -lc "srun -N1 --nodelist={node} -Z sleep 0.25"'
)
pattern_index = child2.expect(
[r"error:.*configuring interconnect", r"error:", pexpect.EOF],
timeout=atf.default_command_timeout,
)
assert pattern_index != 1, f"Child 2 failed to run on iteration {it}"
pattern_index = child3.expect(
[r"error:.*configuring interconnect", r"error:", pexpect.EOF],
timeout=atf.default_command_timeout,
)
assert pattern_index != 1, f"Child 3 failed to run on iteration {it}"
pattern_index = child1.expect(
[r"error:.*configuring interconnect", r"error:", pexpect.EOF],
timeout=atf.default_command_timeout,
)
assert pattern_index != 1, f"Child 1 failed to run on iteration {it}"
# Ensure that jobs ended
for child in [child1, child2, child3]:
atf.repeat_until(
lambda: child1.isalive(),
lambda alive: not alive,
fatal=True,
)