blob: 239f8f191a435651c5e12980cef1fd0ff49b55c6 [file] [edit]
############################################################################
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
############################################################################
import re
import atf
import pytest
# Number of nodes these tests require; passed to atf.require_nodes() and used
# in setup() to size the /dev/tty-backed "gpu" gres devices (node_count * 2).
node_count = 1
# Matches one labelled line of the form "<label>: Cpus_allowed_list: <list>".
# Group 1 is the numeric label before the colon, group 2 the raw CPU list
# (digits, '-' and ','), e.g. "0-3" or "0,2".
cpu_allowed_list_regex = re.compile(r"([0-9]+):\s*Cpus_allowed_list:\s*([0-9\-,]+)")
# Setup
@pytest.fixture(scope="module", autouse=True)
def setup():
    """Declare the configuration this module needs, then require Slurm running.

    Module-scoped and autouse, so it runs once before the tests in this file.
    The exact effect of each ``atf.require_*`` call is defined by the ``atf``
    helper module.
    """
    # The "gpu" gres is backed by /dev/tty device files rather than real GPUs:
    # two per node, matching the "gpu:2" Gres required of every node below.
    gpu_total = node_count * 2

    # slurm.conf prerequisites.
    atf.require_config_parameter_includes("GresTypes", "gpu")
    atf.require_config_parameter("SelectType", "select/cons_tres")
    atf.require_config_parameter("SelectTypeParameters", "CR_CPU")
    atf.require_config_parameter("TaskPlugin", "cgroup,affinity")
    atf.require_config_parameter_includes(
        "LaunchParameters", "srun_exclusive_allocation"
    )

    # Node shape: 4 CPUs and 2 "gpu" gres on each required node.
    atf.require_nodes(node_count, [("CPUs", 4), ("Gres", "gpu:2")])

    # One tty per fake GPU (node_count * 2 in total).
    for tty_num in range(gpu_total):
        atf.require_tty(tty_num)

    # gres configuration: point the gpu gres at those tty files.
    atf.require_config_parameter(
        "Name", {"gpu": {"File": f"/dev/tty[0-{gpu_total - 1}]"}}, source="gres"
    )

    atf.require_slurm_running()
def parse_cpurange(s):
    """Expand a CPU list string such as ``"0-3,8"`` into a tuple of ints.

    Args:
        s: Comma-separated list whose items are either a single CPU number
            (``"5"``) or an inclusive range (``"0-3"``).

    Returns:
        Tuple of every CPU number named by ``s``, in the order written.
        An empty or whitespace-only string yields ``()``.

    Raises:
        ValueError: If an item is neither an integer nor a ``start-end`` pair
            of integers.
    """
    cpus = []
    for block in s.split(","):
        block = block.strip()
        # Skip empty segments ("" or a stray/trailing comma) instead of
        # failing on int("").
        if not block:
            continue
        start, dash, end = block.partition("-")
        if dash:
            # Ranges are inclusive on both ends.
            cpus.extend(range(int(start), int(end) + 1))
        else:
            cpus.append(int(block))
    return tuple(cpus)
def get_task_cpus(output):
    """Map each labelled task to its ``Cpus_allowed_list`` value.

    Every line of ``output`` is searched with ``cpu_allowed_list_regex``;
    lines that do not match are ignored.

    Args:
        output: Multi-line text containing lines such as
            ``"0: Cpus_allowed_list: 0-3"``.

    Returns:
        Dict mapping the label (as a string, e.g. ``"0"``) to the raw CPU
        list string (e.g. ``"0-3"``). If a label appears on several lines,
        the last one wins.
    """
    line_matches = (
        cpu_allowed_list_regex.search(line) for line in output.split("\n")
    )
    return {found.group(1): found.group(2) for found in line_matches if found}
@pytest.mark.skipif(
    atf.get_version("bin/srun") < (26, 5),
    reason="Ticket 24115: LaunchParameters=srun_exclusive_allocation was added in 26.05+",
)
def test_srun_exclusive_allocation_launch_param():
    """
    User can run a job specifying --exclusive and get a whole node
    allocation with all GRES but with all CPUs per task, so long as
    slurm.conf LaunchParameters=srun_exclusive_allocation parameter
    is set

    Expected Result:
    0: Cpus_allowed_list: 0-3
    1: Cpus_allowed_list: 0-3
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    output = atf.run_job_output(
        '--exclusive --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    )
    task_cpus = get_task_cpus(output)

    # Without --exact, each task is allowed on all 4 CPUs of the node.
    assert len(parse_cpurange(task_cpus["0"])) == 4
    assert len(parse_cpurange(task_cpus["1"])) == 4

    # Each labelled task must report a CUDA_VISIBLE_DEVICES value.
    assert re.search(r"0: CUDA_VISIBLE_DEVICES=\d", output) is not None
    assert re.search(r"1: CUDA_VISIBLE_DEVICES=\d", output) is not None
def test_srun_exclusive_allocation_with_exact():
    """
    User can run a job specifying --exclusive --exact and get a whole node
    allocation with all GRES but specific CPUs set per task, even with
    LaunchParameters=srun_exclusive_allocation parameter set.

    Expected Result:
    0: Cpus_allowed_list: 0
    1: Cpus_allowed_list: 1
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1
    """
    srun_args = '--exclusive --exact --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    output = atf.run_job_output(srun_args)
    cpus_by_task = get_task_cpus(output)

    # With --exact, each task is restricted to a single CPU.
    for task_id in ("0", "1"):
        assert len(parse_cpurange(cpus_by_task[task_id])) == 1

    # Each labelled task must report a CUDA_VISIBLE_DEVICES value.
    for gpu_pattern in (
        r"0: CUDA_VISIBLE_DEVICES=\d",
        r"1: CUDA_VISIBLE_DEVICES=\d",
    ):
        assert re.search(gpu_pattern, output) is not None
def test_srun_exclusive_allocation_with_exact_reverse():
    """
    User can run a job specifying --exact --exclusive and get a whole node
    allocation with all GRES but specific CPUs set per task, even if
    LaunchParameters=srun_exclusive_allocation parameter is set.

    Expected Result:
    0: Cpus_allowed_list: 0
    1: Cpus_allowed_list: 1
    0: CUDA_VISIBLE_DEVICES=0
    1: CUDA_VISIBLE_DEVICES=1

    Moreover, doing this with `--exact` first ensures that `--exclusive` is not
    overwriting anything and making it so we get different behavior depending
    on order of the arguments.
    """
    srun_args = '--exact --exclusive --gpus-per-task=1 -n 2 -N 1 --label /bin/bash -c "grep Cpus_allowed_list /proc/self/status && env | grep CUDA_VISIBLE_DEVICES"'
    output = atf.run_job_output(srun_args)
    cpus_by_task = get_task_cpus(output)

    # Argument order must not matter: each task is still limited to one CPU.
    for task_id in ("0", "1"):
        assert len(parse_cpurange(cpus_by_task[task_id])) == 1

    # Each labelled task must report a CUDA_VISIBLE_DEVICES value.
    for gpu_pattern in (
        r"0: CUDA_VISIBLE_DEVICES=\d",
        r"1: CUDA_VISIBLE_DEVICES=\d",
    ):
        assert re.search(gpu_pattern, output) is not None