blob: 5cccdb05eadbf8aa59372182a23e01e9ef6d6395 [file] [log] [blame] [edit]
/*****************************************************************************\
* part_data.c - Functions for structures dealing with partitions unique
* to the select plugin.
*****************************************************************************
* Copyright (C) SchedMD LLC.
* Derived in large part from select/cons_[res|tres] plugins
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "select_cons_tres.h"
#include "src/common/xstring.h"
/* Head of the global linked list of per-partition resource usage records */
part_res_record_t *select_part_record = NULL;
/*
 * Pairs a job's resource record with the index of its first allocated core;
 * used only to sort jobs while repacking rows in
 * part_data_build_row_bitmaps().
 */
typedef struct {
	int jstart;			/* global index of the job's first
					 * allocated core (coremap offset of
					 * first node + first core bit) */
	struct job_resources *tmpjobs;	/* the job's resource allocation */
} sort_support_t;
/* Sort jobs by first set core, then size (CPU count) */
static int _compare_support(const void *v1, const void *v2)
{
sort_support_t *s1, *s2;
int rc;
s1 = (sort_support_t *) v1;
s2 = (sort_support_t *) v2;
rc = slurm_sort_int_list_asc(&s1->jstart, &s2->jstart);
if (rc)
return rc;
rc = slurm_sort_uint32_list_asc(&s1->tmpjobs->ncpus, &s2->tmpjobs->ncpus);
if (rc)
return rc;
return 0;
}
static int _sort_part_prio(void *x, void *y)
{
part_res_record_t *part1 = *(part_res_record_t **) x;
part_res_record_t *part2 = *(part_res_record_t **) y;
if (part1->part_ptr->priority_tier > part2->part_ptr->priority_tier)
return -1;
if (part1->part_ptr->priority_tier < part2->part_ptr->priority_tier)
return 1;
return 0;
}
/* helper function for part_data_sort_res() */
static void _swap_rows(part_row_data_t *a, part_row_data_t *b)
{
	/* plain struct assignment swaps all members, pointers included */
	part_row_data_t tmp = *a;

	*a = *b;
	*b = tmp;
}
/* Clear a row's allocated-core bitmaps and its cached set-bit count */
static void _reset_part_row_bitmap(part_row_data_t *row)
{
	clear_core_array(row->row_bitmap);
	row->row_set_count = 0;
}
/*
 * Add job resource use to the partition data structure
 * IN job - resource allocation to record
 * IN/OUT r_ptr - row receiving the job
 */
extern void part_data_add_job_to_row(struct job_resources *job,
				     part_row_data_t *r_ptr)
{
	/* a bitmap left over from an emptied row is stale - wipe it first */
	if (r_ptr->row_bitmap && !r_ptr->num_jobs)
		_reset_part_row_bitmap(r_ptr);

	job_res_add_cores(job, r_ptr);

	/* grow the job list in chunks of 8 entries when it fills up */
	if (r_ptr->num_jobs >= r_ptr->job_list_size) {
		r_ptr->job_list_size += 8;
		xrealloc(r_ptr->job_list,
			 r_ptr->job_list_size * sizeof(struct job_resources *));
	}
	r_ptr->job_list[r_ptr->num_jobs] = job;
	r_ptr->num_jobs++;
}
/*
 * part_data_build_row_bitmaps: A job has been removed from the given partition,
 *                              so the row_bitmap(s) need to be reconstructed.
 *                              Optimize the jobs into the least number of rows,
 *                              and make the lower rows as dense as possible.
 *
 * IN p_ptr - the partition that has jobs to be optimized
 * IN job_ptr - pointer to single job removed, pass NULL to completely rebuild
 */
extern void part_data_build_row_bitmaps(part_res_record_t *p_ptr,
					job_record_t *job_ptr)
{
	uint32_t i, j, num_jobs;
	int x;
	part_row_data_t *this_row, *orig_row;
	sort_support_t *ss;

	p_ptr->rebuild_rows = false;

	if (!p_ptr->row)
		return;

	if (p_ptr->num_rows == 1) {
		/* Single-row partition: nothing to repack, handle in place */
		this_row = p_ptr->row;
		if (this_row->num_jobs == 0) {
			_reset_part_row_bitmap(this_row);
		} else {
			if (job_ptr) { /* just remove the job */
				xassert(job_ptr->job_resrcs);
				job_res_rm_cores(job_ptr->job_resrcs,
						 this_row);
			} else { /* totally rebuild the bitmap */
				_reset_part_row_bitmap(this_row);
				for (j = 0; j < this_row->num_jobs; j++) {
					job_res_add_cores(
						this_row->job_list[j],
						this_row);
				}
			}
		}
		return;
	}

	/* gather data: count jobs across all rows */
	num_jobs = 0;
	for (i = 0; i < p_ptr->num_rows; i++) {
		num_jobs += p_ptr->row[i].num_jobs;
	}
	if (num_jobs == 0) {
		for (i = 0; i < p_ptr->num_rows; i++)
			_reset_part_row_bitmap(&p_ptr->row[i]);
		return;
	}

	if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("DEBUG: (before):");
		part_data_dump_res(p_ptr);
	}
	debug3("reshuffling %u jobs", num_jobs);

	/* make a copy, in case we cannot do better than this */
	orig_row = part_data_dup_row(p_ptr->row, p_ptr->num_rows);
	if (orig_row == NULL)
		return;

	/* create a master job list and clear out ALL row data */
	ss = xcalloc(num_jobs, sizeof(sort_support_t));
	x = 0;
	for (i = 0; i < p_ptr->num_rows; i++) {
		for (j = 0; j < p_ptr->row[i].num_jobs; j++) {
			ss[x].tmpjobs = p_ptr->row[i].job_list[j];
			p_ptr->row[i].job_list[j] = NULL;
			/*
			 * jstart = global index of the job's first allocated
			 * core: the coremap offset of its first node plus the
			 * first set bit of its core_bitmap
			 */
			ss[x].jstart = bit_ffs(ss[x].tmpjobs->node_bitmap);
			ss[x].jstart = cr_get_coremap_offset(ss[x].jstart);
			ss[x].jstart += bit_ffs(ss[x].tmpjobs->core_bitmap);
			x++;
		}
		p_ptr->row[i].num_jobs = 0;
		_reset_part_row_bitmap(&p_ptr->row[i]);
	}

	/*
	 * VERY difficult: Optimal placement of jobs in the matrix
	 * - how to order jobs to be added to the matrix?
	 * - "by size" does not guarantee optimal placement
	 *
	 * - for now, try sorting jobs by first bit set
	 * - if job allocations stay "in blocks", then this should work OK
	 * - may still get scenarios where jobs should switch rows
	 */
	qsort(ss, num_jobs, sizeof(sort_support_t), _compare_support);
	if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		/* log each job's sorted position and bitmaps */
		for (i = 0; i < num_jobs; i++) {
			char cstr[64], nstr[64];
			if (ss[i].tmpjobs->core_bitmap) {
				bit_fmt(cstr, (sizeof(cstr) - 1) ,
					ss[i].tmpjobs->core_bitmap);
			} else
				sprintf(cstr, "[no core_bitmap]");
			if (ss[i].tmpjobs->node_bitmap) {
				bit_fmt(nstr, (sizeof(nstr) - 1),
					ss[i].tmpjobs->node_bitmap);
			} else
				sprintf(nstr, "[no node_bitmap]");
			info("DEBUG: jstart %d job nb %s cb %s",
			     ss[i].jstart, nstr, cstr);
		}
	}

	/* add jobs to the rows (first-fit, re-sorting rows after each add) */
	for (j = 0; j < num_jobs; j++) {
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (job_res_fit_in_row(
				    ss[j].tmpjobs, &(p_ptr->row[i]))) {
				/* job fits in row, so add it */
				part_data_add_job_to_row(
					ss[j].tmpjobs, &(p_ptr->row[i]));
				ss[j].tmpjobs = NULL;
				break;
			}
		}

		/* job should have been added, so shuffle the rows */
		part_data_sort_res(p_ptr);
	}

	/* test for dangling jobs (tmpjobs still set => job was never placed) */
	for (j = 0; j < num_jobs; j++) {
		if (ss[j].tmpjobs)
			break;
	}
	if (j < num_jobs) {
		/*
		 * we found a dangling job, which means our packing
		 * algorithm couldn't improve upon the existing layout.
		 * Thus, we'll restore the original layout here
		 */
		debug3("dangling job found");

		if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("DEBUG: (post-algorithm):");
			part_data_dump_res(p_ptr);
		}

		part_data_destroy_row(p_ptr->row, p_ptr->num_rows);
		p_ptr->row = orig_row;
		orig_row = NULL;	/* ownership moved back to p_ptr */

		/* still need to rebuild row_bitmaps */
		for (i = 0; i < p_ptr->num_rows; i++) {
			_reset_part_row_bitmap(&p_ptr->row[i]);
			if (p_ptr->row[i].num_jobs == 0)
				continue;
			for (j = 0; j < p_ptr->row[i].num_jobs; j++) {
				job_res_add_cores(p_ptr->row[i].job_list[j],
						  &p_ptr->row[i]);
			}
		}
	}

	if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("DEBUG: (after):");
		part_data_dump_res(p_ptr);
	}

	/* orig_row is still set only if the repack succeeded */
	if (orig_row)
		part_data_destroy_row(orig_row, p_ptr->num_rows);
	xfree(ss);

	return;

	/* LEFTOVER DESIGN THOUGHTS, PRESERVED HERE */

	/*
	 * 1. sort jobs by size
	 * 2. only load core bitmaps with largest jobs that conflict
	 * 3. sort rows by set count
	 * 4. add remaining jobs, starting with fullest rows
	 * 5. compute set count: if disparity between rows got closer, then
	 *    switch non-conflicting jobs that were added
	 */

	/*
	 * Step 1: remove empty rows between non-empty rows
	 * Step 2: try to collapse rows
	 * Step 3: sort rows by size
	 * Step 4: try to swap jobs from different rows to pack rows
	 */

	/*
	 * WORK IN PROGRESS - more optimization should go here, such as:
	 *
	 * - try collapsing jobs from higher rows to lower rows
	 *
	 * - produce a load array to identify cores with less load. Test
	 *   to see if those cores are in the lower row. If not, try to swap
	 *   those jobs with jobs in the lower row. If the job can be swapped
	 *   AND the lower row set_count increases, then SUCCESS! else swap
	 *   back. The goal is to pack the lower rows and "bubble up" clear
	 *   bits to the higher rows.
	 */
}
/* (re)create the global select_part_record array */
extern void part_data_create_array(void)
{
list_t *part_rec_list = NULL;
list_itr_t *part_iterator;
part_record_t *p_ptr;
part_res_record_t *this_ptr, *last_ptr = NULL;
int num_parts;
part_data_destroy_res(select_part_record);
select_part_record = NULL;
num_parts = list_count(part_list);
if (!num_parts)
return;
info("%s: preparing for %d partitions", plugin_type, num_parts);
part_rec_list = list_create(NULL);
part_iterator = list_iterator_create(part_list);
while ((p_ptr = list_next(part_iterator))) {
this_ptr = xmalloc(sizeof(part_res_record_t));
this_ptr->part_ptr = p_ptr;
this_ptr->num_rows = p_ptr->max_share;
if (this_ptr->num_rows & SHARED_FORCE)
this_ptr->num_rows &= (~SHARED_FORCE);
if (preempt_by_qos) /* Add row for QOS preemption */
this_ptr->num_rows++;
/* SHARED=EXCLUSIVE sets max_share = 0 */
if (this_ptr->num_rows < 1)
this_ptr->num_rows = 1;
/* we'll leave the 'row' array blank for now */
this_ptr->row = NULL;
this_ptr->rebuild_rows = false;
list_append(part_rec_list, this_ptr);
}
list_iterator_destroy(part_iterator);
/* Sort the select_part_records by priority */
list_sort(part_rec_list, _sort_part_prio);
part_iterator = list_iterator_create(part_rec_list);
while ((this_ptr = list_next(part_iterator))) {
if (last_ptr)
last_ptr->next = this_ptr;
else
select_part_record = this_ptr;
last_ptr = this_ptr;
}
list_iterator_destroy(part_iterator);
FREE_NULL_LIST(part_rec_list);
}
/* Delete the given list of partition data */
extern void part_data_destroy_res(part_res_record_t *this_ptr)
{
	part_res_record_t *next;

	/* walk the chain, freeing each record after saving its successor */
	for (; this_ptr; this_ptr = next) {
		next = this_ptr->next;
		this_ptr->part_ptr = NULL;	/* not owned; do not free */
		if (this_ptr->row) {
			part_data_destroy_row(this_ptr->row,
					      this_ptr->num_rows);
			this_ptr->row = NULL;
		}
		xfree(this_ptr);
	}
}
/*
 * Delete the given partition row data
 * IN row - array of row records to release (NULL is a no-op)
 * IN num_rows - number of entries in row
 */
extern void part_data_destroy_row(part_row_data_t *row, uint16_t num_rows)
{
	uint32_t r;

	/* guard against a NULL array; row[r] below would otherwise crash */
	if (!row)
		return;
	for (r = 0; r < num_rows; r++) {
		free_core_array(&row[r].row_bitmap);
		xfree(row[r].job_list);
	}
	xfree(row);
}
/*
 * Log contents of partition structure (SelectType debugging aid):
 * one line per row listing allocated cores for up to 4 nodes.
 */
extern void part_data_dump_res(part_res_record_t *p_ptr)
{
	uint32_t n, r;
	node_record_t *node_ptr;

	info("part:%s rows:%u prio:%u ", p_ptr->part_ptr->name, p_ptr->num_rows,
	     p_ptr->part_ptr->priority_tier);
	xassert(node_record_count);

	if (!p_ptr->row)
		return;

	for (r = 0; r < p_ptr->num_rows; r++) {
		char str[64]; /* print first 64 bits of bitmaps */
		char *sep = "", *tmp = NULL;
		int max_nodes_rep = 4; /* max 4 allocated nodes to report */

		if (!p_ptr->row[r].row_bitmap)
			continue;
		for (n = 0; n < node_record_count; n++) {
			if (!p_ptr->row[r].row_bitmap[n] ||
			    !bit_set_count(p_ptr->row[r].row_bitmap[n]))
				continue;
			node_ptr = node_record_table_ptr[n];
			bit_fmt(str, sizeof(str), p_ptr->row[r].row_bitmap[n]);
			xstrfmtcat(tmp, "%salloc_cores[%s]:%s",
				   sep, node_ptr->name, str);
			sep = ",";
			if (--max_nodes_rep == 0)
				break;
		}
		/*
		 * tmp is still NULL when no node in this row has any set
		 * bits; passing NULL for "%s" is undefined behavior, so
		 * substitute an empty string.
		 */
		info(" row:%u num_jobs:%u: %s", r, p_ptr->row[r].num_jobs,
		     tmp ? tmp : "");
		xfree(tmp);
	}
}
/*
 * Create a duplicate part_res_record list
 *
 * IN orig_ptr - list to duplicate
 * IN node_map - row data is deep-copied only for partitions whose
 *               node_bitmap overlaps this map; all other records (including
 *               every record when node_map is NULL) are left empty and
 *               flagged rebuild_rows for rebuild on demand
 * RET newly allocated list (caller must free), or NULL if orig_ptr is NULL
 */
extern part_res_record_t *part_data_dup_res(
	part_res_record_t *orig_ptr, bitstr_t *node_map)
{
	part_res_record_t *new_part_ptr, *new_ptr;

	if (orig_ptr == NULL)
		return NULL;

	new_part_ptr = xmalloc(sizeof(part_res_record_t));
	new_ptr = new_part_ptr;

	while (orig_ptr) {
		new_ptr->part_ptr = orig_ptr->part_ptr;
		if (node_map && orig_ptr->part_ptr->node_bitmap &&
		    bit_overlap_any(node_map,
				    orig_ptr->part_ptr->node_bitmap)) {
			if (orig_ptr->rebuild_rows) {
				/* This shouldn't happen. */
				part_data_rebuild_rows(orig_ptr);
			}
			/* deep-copy the row data for this partition */
			new_ptr->num_rows = orig_ptr->num_rows;
			new_ptr->row = part_data_dup_row(orig_ptr->row,
							 orig_ptr->num_rows);
			new_ptr->rebuild_rows = orig_ptr->rebuild_rows;
		} else
			/* xmalloc zeroed row/num_rows; just mark for rebuild */
			new_ptr->rebuild_rows = true;
		if (orig_ptr->next) {
			new_ptr->next = xmalloc(sizeof(part_res_record_t));
			new_ptr = new_ptr->next;
		}
		orig_ptr = orig_ptr->next;
	}
	return new_part_ptr;
}
/* sort the rows of a partition from "most allocated" to "least allocated" */
extern void part_data_sort_res(part_res_record_t *p_ptr)
{
	uint32_t a, b;

	if (!p_ptr->row)
		return;

	/* simple exchange sort; num_rows is small, so O(n^2) is fine */
	for (a = 0; a < p_ptr->num_rows; a++) {
		for (b = a + 1; b < p_ptr->num_rows; b++) {
			if (p_ptr->row[b].row_set_count >
			    p_ptr->row[a].row_set_count)
				_swap_rows(&p_ptr->row[a], &p_ptr->row[b]);
		}
	}
}
/* Create a duplicate part_row_data struct */
extern part_row_data_t *part_data_dup_row(part_row_data_t *orig_row,
					  uint16_t num_rows)
{
	part_row_data_t *dup;
	int r, n;

	if (!orig_row || !num_rows)
		return NULL;

	dup = xcalloc(num_rows, sizeof(part_row_data_t));
	for (r = 0; r < num_rows; r++) {
		dup[r].num_jobs = orig_row[r].num_jobs;
		dup[r].job_list_size = orig_row[r].job_list_size;
		if (orig_row[r].row_bitmap) {
			/* deep-copy the per-node core bitmaps */
			dup[r].row_bitmap = build_core_array();
			for (n = 0; n < node_record_count; n++) {
				if (!orig_row[r].row_bitmap[n])
					continue;
				dup[r].row_bitmap[n] =
					bit_copy(orig_row[r].row_bitmap[n]);
			}
			dup[r].row_set_count = orig_row[r].row_set_count;
		}
		if (!dup[r].job_list_size)
			continue;
		/* copy the job list (pointers only; jobs are not owned) */
		dup[r].job_list = xcalloc(dup[r].job_list_size,
					  sizeof(struct job_resources *));
		memcpy(dup[r].job_list, orig_row[r].job_list,
		       sizeof(struct job_resources *) * dup[r].num_jobs);
	}
	return dup;
}
/* rebuild rows for every record in the list that is flagged stale */
extern void part_data_rebuild_rows(part_res_record_t *part_ptr)
{
	while (part_ptr) {
		if (part_ptr->rebuild_rows)
			part_data_build_row_bitmaps(part_ptr, NULL);
		part_ptr = part_ptr->next;
	}
}