Revert the recent changes intended to fix job step task layout problems with plane distribution.
These changes probably made the problem worse.
diff --git a/NEWS b/NEWS
index c2f8a5e..3ac5091 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,6 @@
-- Correct logic that tells slurmstepd how many nodes are associated with a
job step. The bug resulted in the job step completion message from
slurmstepd to slurmctld being delayed by one minute.
- -- Fix some anomalies with respect to task layout with plane distribution.
* Changes in SLURM 2.1.16
=========================
diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c
index 59bc405..52d91be 100644
--- a/src/common/slurm_step_layout.c
+++ b/src/common/slurm_step_layout.c
@@ -668,7 +668,6 @@
uint16_t *cpus)
{
int i, j, k, taskid = 0;
- bool over_subscribe = false;
debug3("_task_layout_plane plane_size %u node_cnt %u task_cnt %u",
step_layout->plane_size,
@@ -686,25 +685,16 @@
}
taskid = 0;
- for (i=0; i<step_layout->node_cnt; i++)
- taskid += cpus[i];
- if (taskid >= step_layout->task_cnt)
- over_subscribe = true;
-
- taskid = 0;
for (j=0; taskid<step_layout->task_cnt; j++) { /* cycle counter */
for (i=0; ((i<step_layout->node_cnt)
&& (taskid<step_layout->task_cnt)); i++) {
/* assign a block of 'plane_size' tasks to this node */
- for (k=0; ((k<step_layout->plane_size) &&
- (taskid<step_layout->task_cnt)); k++) {
+ for (k=0; ((k<step_layout->plane_size)
+ && (taskid<step_layout->task_cnt)); k++) {
step_layout->tids[i][step_layout->tasks[i]] =
taskid;
taskid++;
step_layout->tasks[i]++;
- if (!over_subscribe &&
- (step_layout->tasks[i] >= cpus[i]))
- break;
}
}
}
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 4aeefbc..27881cd 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -579,24 +579,6 @@
task_cnt /= cpus_per_task;
total_tasks = MIN(total_tasks, task_cnt);
}
-
- if (step_spec->plane_size != (uint16_t) NO_VAL) {
- if (avail_tasks < step_spec->plane_size)
- avail_tasks = 0;
- else {
- /* Round count down */
- avail_tasks /= step_spec->plane_size;
- avail_tasks *= step_spec->plane_size;
- }
- if (total_tasks < step_spec->plane_size)
- total_tasks = 0;
- else {
- /* Round count down */
- total_tasks /= step_spec->plane_size;
- total_tasks *= step_spec->plane_size;
- }
- }
-
if ((avail_tasks <= 0) ||
((selected_nodes == NULL) &&
(nodes_picked_cnt >= step_spec->node_count) &&
@@ -1531,12 +1513,6 @@
if (step_ptr->exclusive) {
usable_cpus = job_resrcs_ptr->cpus[pos] -
job_resrcs_ptr->cpus_used[pos];
- if (plane_size &&
- ((uint16_t)plane_size != (uint16_t)NO_VAL)){
- /* Round count down */
- usable_cpus /= plane_size;
- usable_cpus *= plane_size;
- }
} else
usable_cpus = job_resrcs_ptr->cpus[pos];
if (step_ptr->mem_per_cpu && _is_mem_resv()) {