blob: fcb0ae9c5a879f7b6b0e91545732c4b90cf8a272 [file] [edit]
############################################################################
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
############################################################################
import os
import re
import atf
import pytest
# Number of nodes the tests require; setup() also derives the number of tty
# devices used as GPU device files from it (two per node).
node_count = 1
# Captures the leading numeric label (group 1) and the CPU list (group 2) from
# a line such as "0: Cpus_allowed_list: 0-3".
cpu_allowed_list_regex = re.compile(r"([0-9]+):\s*Cpus_allowed_list:\s*([0-9\-,]+)")
# Setup
@pytest.fixture(scope="module", autouse=True)
def setup():
    """Declare the configuration this module needs, then require Slurm running.

    Requirements, in order: the "gpu" GRES type, select/cons_tres with CR_CPU,
    the cgroup+affinity task plugins, ``node_count`` nodes with 4 CPUs and
    gpu:2 each, and a gres "Name" entry whose File points at tty devices.
    """
    atf.require_config_parameter_includes("GresTypes", "gpu")

    scheduler_settings = (
        ("SelectType", "select/cons_tres"),
        ("SelectTypeParameters", "CR_CPU"),
        ("TaskPlugin", "cgroup,affinity"),
    )
    for parameter, value in scheduler_settings:
        atf.require_config_parameter(parameter, value)

    atf.require_nodes(node_count, [("CPUs", 4), ("Gres", "gpu:2")])

    # Each node is required to have gpu:2, so node_count * 2 tty devices are
    # needed to serve as the "GPU" device files listed in the gres entry below.
    tty_count = node_count * 2
    for tty_index in range(tty_count):
        atf.require_tty(tty_index)

    gpu_gres = {"gpu": {"File": f"/dev/tty[0-{node_count * 2 - 1}]"}}
    atf.require_config_parameter("Name", gpu_gres, source="gres")

    atf.require_slurm_running()
@pytest.fixture(scope="function", autouse=True)
def cleanup_state(setup):
    """Delete the "output.txt" file after each test if one exists.

    Requests the ``setup`` fixture and does nothing before the test runs; all
    of the work happens after the ``yield``.
    """
    yield

    # Teardown: the sbatch/salloc tests write their job output to this file.
    leftover = "output.txt"
    if not os.path.exists(leftover):
        return
    os.unlink(leftover)
def parse_cpurange(s):
    """Expand a CPU list string such as "0,2-3" into a tuple of CPU numbers.

    The string is a comma-separated list whose items are either a single
    number or an inclusive "start-end" range.

    Returns:
        A tuple of ints in the order the items appear, e.g. ``(0, 2, 3)``.

    Raises:
        ValueError: if an item is not a number or a well-formed range.
    """
    cpus = []
    for item in s.split(","):
        bounds = [int(piece) for piece in item.split("-")]
        if len(bounds) == 1:
            # Plain number, no dash in the item.
            cpus.append(bounds[0])
            continue
        # Exactly two bounds are expected; anything else fails to unpack.
        first, last = bounds
        cpus.extend(range(first, last + 1))
    return tuple(cpus)
def get_task_cpus(output):
    """Collect the Cpus_allowed_list value reported for each label in *output*.

    Each newline-separated line is searched with ``cpu_allowed_list_regex``;
    lines that do not match are ignored.

    Returns:
        A dict mapping the label string (e.g. "0") to the raw CPU list string
        (e.g. "0-3"). If a label appears more than once, the last line wins.
    """
    cpus_by_task = {}
    for text_line in output.split("\n"):
        found = cpu_allowed_list_regex.search(text_line)
        if found is None:
            continue
        task_label, cpu_list = found.groups()
        cpus_by_task[task_label] = cpu_list
    return cpus_by_task
def test_step_exclusivity_default_exact():
    """
    A job run with plain --exclusive gets a whole node allocation with all
    GRES, while the exact behavior still slices the CPUs and GPUs per task.

    Expected Result:
    0: Cpus_allowed_list: 0
    1: Cpus_allowed_list: 1
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    job_args = '--exclusive --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    output = atf.run_job_output(job_args)

    # Each task must be limited to exactly one CPU.
    cpus_by_task = get_task_cpus(output)
    for task_id in ("0", "1"):
        assert len(parse_cpurange(cpus_by_task[task_id])) == 1

    # Only the presence of a GPU index per task is checked, not its value.
    gpu_patterns = (
        r"0: CUDA_VISIBLE_DEVICES=\d",
        r"1: CUDA_VISIBLE_DEVICES=\d",
    )
    for gpu_pattern in gpu_patterns:
        assert re.search(gpu_pattern, output) is not None
@pytest.mark.skipif(
    atf.get_version("bin/srun") < (26, 5),
    reason="Ticket 24115: The srun option --exclusive=allocation was added in 26.05+",
)
def test_step_exclusivity_allocation_only():
    """
    User can run a job specifying --exclusive=allocation and get a whole node
    allocation with all GRES but with all CPUs per task

    Expected Result:
    0: Cpus_allowed_list: 0-3
    1: Cpus_allowed_list: 0-3
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    output = atf.run_job_output(
        '--exclusive=allocation --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    )

    # Without exact behavior every task should see all 4 CPUs of the node.
    task_cpus = get_task_cpus(output)
    assert len(parse_cpurange(task_cpus["0"])) == 4
    assert len(parse_cpurange(task_cpus["1"])) == 4

    # Only the presence of a GPU index per task is checked, not its value.
    assert re.search(r"0: CUDA_VISIBLE_DEVICES=\d", output) is not None
    assert re.search(r"1: CUDA_VISIBLE_DEVICES=\d", output) is not None
@pytest.mark.skipif(
    atf.get_version("bin/srun") < (26, 5),
    reason="Ticket 24115: The srun option --exclusive=allocation was added in 26.05+",
)
def test_exclusive_exact_forward():
    """
    User can run a job specifying --exclusive=allocation --exact and get a
    whole node allocation with all GRES and retain exact behavior

    Expected Result:
    0: Cpus_allowed_list: 0
    1: Cpus_allowed_list: 1
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    output = atf.run_job_output(
        '--exclusive=allocation --exact --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    )

    # --exact must still limit each task to a single CPU.
    task_cpus = get_task_cpus(output)
    assert len(parse_cpurange(task_cpus["0"])) == 1
    assert len(parse_cpurange(task_cpus["1"])) == 1

    # Only the presence of a GPU index per task is checked, not its value.
    assert re.search(r"0: CUDA_VISIBLE_DEVICES=\d", output) is not None
    assert re.search(r"1: CUDA_VISIBLE_DEVICES=\d", output) is not None
@pytest.mark.skipif(
    atf.get_version("bin/srun") < (26, 5),
    reason="Ticket 24115: The srun option --exclusive=allocation was added in 26.05+",
)
def test_exclusive_exact_reverse():
    """
    User can run a job specifying --exact --exclusive=allocation and get a
    whole node allocation with all GRES and retain exact behavior

    Same check as test_exclusive_exact_forward, with the two options given in
    the opposite order.

    Expected Result:
    0: Cpus_allowed_list: 0
    1: Cpus_allowed_list: 1
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    output = atf.run_job_output(
        '--exact --exclusive=allocation --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    )

    # --exact must still limit each task to a single CPU.
    task_cpus = get_task_cpus(output)
    assert len(parse_cpurange(task_cpus["0"])) == 1
    assert len(parse_cpurange(task_cpus["1"])) == 1

    # Only the presence of a GPU index per task is checked, not its value.
    assert re.search(r"0: CUDA_VISIBLE_DEVICES=\d", output) is not None
    assert re.search(r"1: CUDA_VISIBLE_DEVICES=\d", output) is not None
@pytest.mark.skipif(
    atf.get_version("bin/srun") < (26, 5),
    reason="Ticket 24115: The srun option --exclusive=allocation was added in 26.05+",
)
def test_sbatch_normal_with_exclusive_allocation():
    """
    User can submit a batch job specifying --exclusive=allocation and get a
    whole node allocation with all GRES and all CPUs by default (no exact)

    This test is to ensure the use of the option does not break standard
    behavior. The --exclusive=allocation option is not intended to affect
    sbatch in any way.

    Expected Result:
    0: Cpus_allowed_list: 0-3
    1: Cpus_allowed_list: 0-3
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    # The job writes its output here; the cleanup_state fixture removes it.
    output_file = "output.txt"
    job_id = atf.submit_job_sbatch(
        f"--exclusive=allocation --gpus-per-task=1 -n 2 -N 1 --output={output_file} --wrap='srun --label /bin/bash -c \"grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES\"'"
    )
    assert job_id != 0
    assert atf.wait_for_job_state(job_id, "DONE")

    with open(output_file, "r") as rfp:
        output = rfp.read().strip()

    # Every task should see all 4 CPUs of the node.
    task_cpus = get_task_cpus(output)
    assert len(parse_cpurange(task_cpus["0"])) == 4
    assert len(parse_cpurange(task_cpus["1"])) == 4

    # Only the presence of a GPU index per task is checked, not its value.
    assert re.search(r"0: CUDA_VISIBLE_DEVICES=\d", output) is not None
    assert re.search(r"1: CUDA_VISIBLE_DEVICES=\d", output) is not None
@pytest.mark.skipif(
    atf.get_version("bin/srun") < (26, 5),
    reason="Ticket 24115: The srun option --exclusive=allocation was added in 26.05+",
)
def test_salloc_normal_with_exclusive_allocation():
    """
    User can submit an interactive salloc job specifying --exclusive=allocation
    and get a whole node allocation with all GRES and all CPUs by default
    (no exact)

    This test is to ensure the use of the option does not break standard
    behavior. The --exclusive=allocation option is not intended to affect
    salloc in any way.

    Expected Result:
    0: Cpus_allowed_list: 0-3
    1: Cpus_allowed_list: 0-3
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    # The srun step writes its output here; the cleanup_state fixture removes it.
    output_file = "output.txt"
    job_id = atf.submit_job_salloc(
        f'--exclusive=allocation --gpus-per-task=1 -n 2 -N 1 srun --output={output_file} --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    )
    assert job_id != 0
    assert atf.wait_for_job_state(job_id, "DONE")

    with open(output_file, "r") as rfp:
        output = rfp.read().strip()

    # Every task should see all 4 CPUs of the node.
    task_cpus = get_task_cpus(output)
    assert len(parse_cpurange(task_cpus["0"])) == 4
    assert len(parse_cpurange(task_cpus["1"])) == 4

    # Only the presence of a GPU index per task is checked, not its value.
    assert re.search(r"0: CUDA_VISIBLE_DEVICES=\d", output) is not None
    assert re.search(r"1: CUDA_VISIBLE_DEVICES=\d", output) is not None