/*****************************************************************************\
* select_linear.c - node selection plugin for simple one-dimensional
* address space. Selects nodes for a job so as to minimize the number
* of sets of consecutive nodes using a best-fit algorithm.
*****************************************************************************
* Copyright (C) 2004-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Portions Copyright (C) 2010 SchedMD <http://www.schedmd.com>.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <https://computing.llnl.gov/linux/slurm/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
# if HAVE_STDINT_H
# include <stdint.h>
# endif
# if HAVE_INTTYPES_H
# include <inttypes.h>
# endif
#endif
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>
#include "src/common/slurm_xlator.h" /* Must be first */
#include "src/common/gres.h"
#include "src/common/job_resources.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_resource_info.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/proc_req.h"
#include "src/plugins/select/linear/select_linear.h"
#define NO_SHARE_LIMIT 0xfffe /* job count meaning no effective share limit */
#define NODEINFO_MAGIC 0x82ad /* magic value for select_nodeinfo records */
#define RUN_JOB_INCR 16 /* growth increment for the job id arrays */
#define SELECT_DEBUG 0 /* set to 1 for verbose debug logging */
/* These are defined here so that, when we link with something other than
* the slurmctld, we will have these symbols defined. They will get
* overwritten when linking with the slurmctld.
*/
#if defined (__APPLE__)
slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
struct node_record *node_record_table_ptr __attribute__((weak_import));
List part_list __attribute__((weak_import));
List job_list __attribute__((weak_import));
int node_record_count __attribute__((weak_import));
time_t last_node_update __attribute__((weak_import));
struct switch_record *switch_record_table __attribute__((weak_import));
int switch_record_cnt __attribute__((weak_import));
#else
slurm_ctl_conf_t slurmctld_conf;
struct node_record *node_record_table_ptr;
List part_list;
List job_list;
int node_record_count;
time_t last_node_update;
struct switch_record *switch_record_table;
int switch_record_cnt;
#endif
struct select_nodeinfo {
uint16_t magic; /* magic number */
uint16_t alloc_cpus; /* count of allocated CPUs on this node */
};
static int _add_job_to_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
int suspended);
static void _add_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap);
static int _cr_job_list_sort(void *x, void *y);
static void _dump_node_cr(struct cr_record *cr_ptr);
static struct cr_record *_dup_cr(struct cr_record *cr_ptr);
static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static void _free_cr(struct cr_record *cr_ptr);
static uint16_t _get_avail_cpus(struct job_record *job_ptr, int index);
static uint16_t _get_total_cpus(int index);
static void _init_node_cr(void);
static int _job_count_bitmap(struct cr_record *cr_ptr,
struct job_record *job_ptr,
bitstr_t * bitmap, bitstr_t * jobmap,
int run_job_cnt, int tot_job_cnt, uint16_t mode);
static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static int _rm_job_from_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
bool remove_all);
static int _rm_job_from_one_node(struct job_record *job_ptr,
struct node_record *node_ptr, char *pre_err);
static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list);
static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, int max_share);
static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list);
extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(uint32_t size);
extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo);
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore the plugin.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. SLURM uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "select" for SLURM node selection) and <method>
* is a description of how this plugin satisfies that application. SLURM will
* only load select plugins if the plugin_type string has a
* prefix of "select/".
*
* plugin_version - an unsigned 32-bit integer giving the version number
* of the plugin. If major and minor revisions are desired, the major
* version number may be multiplied by a suitable magnitude constant such
* as 100 or 1000. Various SLURM versions will likely require a certain
* minimum version for their plugins as the node selection API matures.
*/
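/* For example, using a magnitude constant of 100, major version 1 and
* minor version 20 could be encoded as plugin_version = 120. */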
const char plugin_name[] = "Linear node selection plugin";
const char plugin_type[] = "select/linear";
const uint32_t plugin_id = 102;
const uint32_t plugin_version = 90;
static struct node_record *select_node_ptr = NULL;
static int select_node_cnt = 0;
static uint16_t select_fast_schedule;
static uint16_t cr_type;
/* Record of resources consumed on each node including job details */
static struct cr_record *cr_ptr = NULL;
static pthread_mutex_t cr_mutex = PTHREAD_MUTEX_INITIALIZER;
#ifdef HAVE_XCPU
#define XCPU_POLL_TIME 120
static pthread_t xcpu_thread = 0;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static int agent_fini = 0;
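/* XCPU monitoring agent, compiled only when HAVE_XCPU is defined: every
* XCPU_POLL_TIME seconds, stat() each node's xcpu/clone file under XCPU_DIR
* and drain (via slurm_drain_nodes) any node whose clone file cannot be
* accessed. */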
static void *xcpu_agent(void *args)
{
int i;
static time_t last_xcpu_test;
char clone_path[128], down_node_list[512];
struct stat buf;
time_t now;
last_xcpu_test = time(NULL) + XCPU_POLL_TIME;
while (!agent_fini) {
now = time(NULL);
if (difftime(now, last_xcpu_test) >= XCPU_POLL_TIME) {
debug3("Running XCPU node state test");
down_node_list[0] = '\0';
for (i=0; i<select_node_cnt; i++) {
snprintf(clone_path, sizeof(clone_path),
"%s/%s/xcpu/clone", XCPU_DIR,
select_node_ptr[i].name);
if (stat(clone_path, &buf) == 0)
continue;
error("stat %s: %m", clone_path);
if ((strlen(select_node_ptr[i].name) +
strlen(down_node_list) + 2) <
sizeof(down_node_list)) {
if (down_node_list[0] != '\0')
strcat(down_node_list,",");
strcat(down_node_list,
select_node_ptr[i].name);
} else
error("down_node_list overflow");
}
if (down_node_list[0]) {
slurm_drain_nodes(
down_node_list,
"select_linear: Can not stat XCPU ",
slurm_get_slurm_user_id());
}
last_xcpu_test = now;
}
sleep(1);
}
return NULL;
}
static int _init_status_pthread(void)
{
pthread_attr_t attr;
slurm_mutex_lock( &thread_flag_mutex );
if ( xcpu_thread ) {
debug2("XCPU thread already running, not starting another");
slurm_mutex_unlock( &thread_flag_mutex );
return SLURM_ERROR;
}
slurm_attr_init( &attr );
pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_DETACHED );
pthread_create( &xcpu_thread, &attr, xcpu_agent, NULL);
slurm_mutex_unlock( &thread_flag_mutex );
slurm_attr_destroy( &attr );
return SLURM_SUCCESS;
}
static int _fini_status_pthread(void)
{
int i, rc = SLURM_SUCCESS;
slurm_mutex_lock( &thread_flag_mutex );
if ( xcpu_thread ) {
agent_fini = 1;
for (i=0; i<4; i++) {
sleep(1);
if (pthread_kill(xcpu_thread, 0)) {
xcpu_thread = 0;
break;
}
}
if ( xcpu_thread ) {
error("could not kill XCPU agent thread");
rc = SLURM_ERROR;
}
}
slurm_mutex_unlock( &thread_flag_mutex );
return rc;
}
#endif
/* Add job id to the record of running jobs (run_job_ids) */
static void _add_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
int i;
if (cr_ptr->run_job_ids == NULL) { /* create new array */
cr_ptr->run_job_len = RUN_JOB_INCR;
cr_ptr->run_job_ids = xmalloc(sizeof(uint32_t) *
cr_ptr->run_job_len);
cr_ptr->run_job_ids[0] = job_id;
return;
}
for (i=0; i<cr_ptr->run_job_len; i++) {
if (cr_ptr->run_job_ids[i])
continue;
/* fill in hole */
cr_ptr->run_job_ids[i] = job_id;
return;
}
/* expand array and add to end */
cr_ptr->run_job_len += RUN_JOB_INCR;
xrealloc(cr_ptr->run_job_ids, sizeof(uint32_t) * cr_ptr->run_job_len);
cr_ptr->run_job_ids[i] = job_id;
}
/* Add job id to the record of jobs running or suspended (tot_job_ids) */
static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
int i;
if (cr_ptr->tot_job_ids == NULL) { /* create new array */
cr_ptr->tot_job_len = RUN_JOB_INCR;
cr_ptr->tot_job_ids = xmalloc(sizeof(uint32_t) *
cr_ptr->tot_job_len);
cr_ptr->tot_job_ids[0] = job_id;
return;
}
for (i=0; i<cr_ptr->tot_job_len; i++) {
if (cr_ptr->tot_job_ids[i])
continue;
/* fill in hole */
cr_ptr->tot_job_ids[i] = job_id;
return;
}
/* expand array and add to end */
cr_ptr->tot_job_len += RUN_JOB_INCR;
xrealloc(cr_ptr->tot_job_ids, sizeof(uint32_t) * cr_ptr->tot_job_len);
cr_ptr->tot_job_ids[i] = job_id;
}
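/* Common helper for _rem_run_job() and _test_run_job(): scan the
* run_job_ids array for job_id, optionally zeroing each matching entry.
* RET true if the job id was found */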
static bool _ck_run_job(struct cr_record *cr_ptr, uint32_t job_id,
bool clear_it)
{
int i;
bool rc = false;
if ((cr_ptr->run_job_ids == NULL) || (cr_ptr->run_job_len == 0))
return rc;
for (i=0; i<cr_ptr->run_job_len; i++) {
if (cr_ptr->run_job_ids[i] != job_id)
continue;
if (clear_it)
cr_ptr->run_job_ids[i] = 0;
rc = true;
}
return rc;
}
/* Remove job id from record of jobs running,
* RET true if successful, false if the job was not running */
static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_run_job(cr_ptr, job_id, true);
}
/* Test for job id in record of jobs running,
* RET true if successful, false if the job was not running */
static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_run_job(cr_ptr, job_id, false);
}
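/* Common helper for _rem_tot_job() and _test_tot_job(): scan the
* tot_job_ids array for job_id, optionally zeroing each matching entry.
* RET true if the job id was found */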
static bool _ck_tot_job(struct cr_record *cr_ptr, uint32_t job_id,
bool clear_it)
{
int i;
bool rc = false;
if ((cr_ptr->tot_job_ids == NULL) || (cr_ptr->tot_job_len == 0))
return rc;
for (i=0; i<cr_ptr->tot_job_len; i++) {
if (cr_ptr->tot_job_ids[i] != job_id)
continue;
if (clear_it)
cr_ptr->tot_job_ids[i] = 0;
rc = true;
}
return rc;
}
/* Remove job id from record of jobs running or suspended,
* RET true if successful, false if the job was not found */
static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_tot_job(cr_ptr, job_id, true);
}
/* Test for job id in record of jobs running or suspended,
* RET true if successful, false if the job was not found */
static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_tot_job(cr_ptr, job_id, false);
}
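/* Test if enough nodes remain to satisfy the request. When req_nodes
* exceeds min_nodes, only (rem_nodes + min_nodes - req_nodes) additional
* nodes are strictly required. For example, with min_nodes=2, req_nodes=4
* and rem_nodes=4, two available nodes are sufficient. */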
static bool _enough_nodes(int avail_nodes, int rem_nodes,
uint32_t min_nodes, uint32_t req_nodes)
{
int needed_nodes;
if (req_nodes > min_nodes)
needed_nodes = rem_nodes + min_nodes - req_nodes;
else
needed_nodes = rem_nodes;
return(avail_nodes >= needed_nodes);
}
/*
* _get_avail_cpus - Get the number of "available" cpus on a node
* given the number of cpus_per_task and the maximum number of
* sockets, cores, and threads. Note that the value of
* cpus is the lowest-level logical processor (LLLP).
* IN job_ptr - pointer to job being scheduled
* IN index - index of node's configuration information in select_node_ptr
*/
static uint16_t _get_avail_cpus(struct job_record *job_ptr, int index)
{
struct node_record *node_ptr;
uint16_t avail_cpus;
uint16_t cpus, sockets, cores, threads;
uint16_t cpus_per_task = 1;
uint16_t ntasks_per_node = 0, ntasks_per_socket, ntasks_per_core;
uint16_t min_sockets, min_cores, min_threads;
multi_core_data_t *mc_ptr = NULL;
if (job_ptr->details == NULL)
return (uint16_t) 0;
if (job_ptr->details->cpus_per_task)
cpus_per_task = job_ptr->details->cpus_per_task;
if (job_ptr->details->ntasks_per_node)
ntasks_per_node = job_ptr->details->ntasks_per_node;
if ((mc_ptr = job_ptr->details->mc_ptr)) {
ntasks_per_socket = mc_ptr->ntasks_per_socket;
ntasks_per_core = mc_ptr->ntasks_per_core;
min_sockets = mc_ptr->sockets_per_node;
min_cores = mc_ptr->cores_per_socket;
min_threads = mc_ptr->threads_per_core;
} else {
ntasks_per_socket = 0;
ntasks_per_core = 0;
min_sockets = (uint16_t) NO_VAL;
min_cores = (uint16_t) NO_VAL;
min_threads = (uint16_t) NO_VAL;
}
node_ptr = select_node_ptr + index;
if (select_fast_schedule) { /* don't bother checking each node */
cpus = node_ptr->config_ptr->cpus;
sockets = node_ptr->config_ptr->sockets;
cores = node_ptr->config_ptr->cores;
threads = node_ptr->config_ptr->threads;
} else {
cpus = node_ptr->cpus;
sockets = node_ptr->sockets;
cores = node_ptr->cores;
threads = node_ptr->threads;
}
#if SELECT_DEBUG
info("host %s HW_ cpus %u sockets %u cores %u threads %u ",
node_ptr->name, cpus, sockets, cores, threads);
#endif
avail_cpus = slurm_get_avail_procs(
min_sockets, min_cores, min_threads, cpus_per_task,
ntasks_per_node, ntasks_per_socket, ntasks_per_core,
&cpus, &sockets, &cores, &threads, NULL,
CR_CPU, job_ptr->job_id, node_ptr->name);
#if SELECT_DEBUG
debug("avail_cpus index %d = %d (out of %d %d %d %d)",
index, avail_cpus, cpus, sockets, cores, threads);
#endif
return(avail_cpus);
}
/*
* _get_total_cpus - Get the total number of cpus on a node
* Note that the value of cpus is the lowest-level logical
* processor (LLLP).
* IN index - index of node's configuration information in select_node_ptr
*/
static uint16_t _get_total_cpus(int index)
{
struct node_record *node_ptr = &(select_node_ptr[index]);
if (select_fast_schedule)
return node_ptr->config_ptr->cpus;
else
return node_ptr->cpus;
}
/* Build the full job_resources_t structure for a job based upon the nodes
* allocated to it (the bitmap) and the job's memory requirement */
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap)
{
int i, j, k;
int first_bit, last_bit;
uint32_t node_cpus, total_cpus = 0, node_cnt;
struct node_record *node_ptr;
uint32_t job_memory_cpu = 0, job_memory_node = 0;
job_resources_t *job_resrcs_ptr;
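/* The MEM_PER_CPU flag bit in pn_min_memory distinguishes a per-CPU
* memory limit from a per-node limit; mask the flag off to recover the
* actual value */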
if (job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU)
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
else
job_memory_node = job_ptr->details->pn_min_memory;
}
if (job_ptr->job_resrcs) /* Old struct due to job requeue */
free_job_resources(&job_ptr->job_resrcs);
node_cnt = bit_set_count(bitmap);
job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources();
job_resrcs_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t) * node_cnt);
job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t) * node_cnt);
job_resrcs_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt);
job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
job_resrcs_ptr->memory_allocated = xmalloc(sizeof(uint32_t) * node_cnt);
job_resrcs_ptr->memory_used = xmalloc(sizeof(uint32_t) * node_cnt);
job_resrcs_ptr->nhosts = node_cnt;
job_resrcs_ptr->node_bitmap = bit_copy(bitmap);
job_resrcs_ptr->nodes = bitmap2node_name(bitmap);
if (job_resrcs_ptr->node_bitmap == NULL)
fatal("bit_copy malloc failure");
job_resrcs_ptr->ncpus = job_ptr->total_cpus;
if (build_job_resources(job_resrcs_ptr, (void *)select_node_ptr,
select_fast_schedule))
error("_build_select_struct: build_job_resources: %m");
first_bit = bit_ffs(bitmap);
last_bit = bit_fls(bitmap);
if (last_bit == -1)
last_bit = -2; /* no bits set */
for (i=first_bit, j=0, k=-1; i<=last_bit; i++) {
if (!bit_test(bitmap, i))
continue;
node_ptr = &(select_node_ptr[i]);
if (select_fast_schedule)
node_cpus = node_ptr->config_ptr->cpus;
else
node_cpus = node_ptr->cpus;
job_resrcs_ptr->cpus[j] = node_cpus;
if ((k == -1) ||
(job_resrcs_ptr->cpu_array_value[k] != node_cpus)) {
job_resrcs_ptr->cpu_array_cnt++;
job_resrcs_ptr->cpu_array_reps[++k] = 1;
job_resrcs_ptr->cpu_array_value[k] = node_cpus;
} else
job_resrcs_ptr->cpu_array_reps[k]++;
total_cpus += node_cpus;
if (job_memory_node) {
job_resrcs_ptr->memory_allocated[j] = job_memory_node;
} else if (job_memory_cpu) {
job_resrcs_ptr->memory_allocated[j] =
job_memory_cpu * node_cpus;
}
if (set_job_resources_node(job_resrcs_ptr, j)) {
error("_build_select_struct: set_job_resources_node: "
"%m");
}
j++;
}
if (job_resrcs_ptr->ncpus != total_cpus) {
error("_build_select_struct: ncpus mismatch %u != %u",
job_resrcs_ptr->ncpus, total_cpus);
}
}
/*
* Set the bits in 'jobmap' that correspond to bits in the 'bitmap'
* for nodes running no more than 'run_job_cnt' jobs and having no more
* than 'tot_job_cnt' total (running plus suspended) jobs; clear the
* remaining bits. RET the count of bits set in 'jobmap'.
*/
static int _job_count_bitmap(struct cr_record *cr_ptr,
struct job_record *job_ptr,
bitstr_t * bitmap, bitstr_t * jobmap,
int run_job_cnt, int tot_job_cnt, uint16_t mode)
{
int i, i_first, i_last;
int count = 0, total_jobs, total_run_jobs;
struct part_cr_record *part_cr_ptr;
struct node_record *node_ptr;
uint32_t job_memory_cpu = 0, job_memory_node = 0;
uint32_t alloc_mem = 0, job_mem = 0, avail_mem = 0;
uint32_t cpu_cnt, gres_cpus;
List gres_list;
bool use_total_gres = true;
xassert(cr_ptr);
xassert(cr_ptr->nodes);
if (mode != SELECT_MODE_TEST_ONLY) {
use_total_gres = false;
if (job_ptr->details->pn_min_memory &&
(cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory
& (~MEM_PER_CPU);
} else {
job_memory_node = job_ptr->details->
pn_min_memory;
}
}
}
i_first = bit_ffs(bitmap);
i_last = bit_fls(bitmap);
if (i_first == -1) /* job has no nodes */
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(bitmap, i)) {
bit_clear(jobmap, i);
continue;
}
node_ptr = node_record_table_ptr + i;
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
gres_cpus = gres_plugin_job_test(job_ptr->gres_list,
gres_list, use_total_gres,
NULL, 0, 0, job_ptr->job_id,
node_ptr->name);
if ((gres_cpus != NO_VAL) && (gres_cpus < cpu_cnt)) {
bit_clear(jobmap, i);
continue;
}
if (mode == SELECT_MODE_TEST_ONLY) {
bit_set(jobmap, i);
count++;
continue; /* No need to test other resources */
}
if (job_memory_cpu || job_memory_node) {
alloc_mem = cr_ptr->nodes[i].alloc_memory;
if (select_fast_schedule) {
avail_mem = node_ptr->config_ptr->real_memory;
if (job_memory_cpu)
job_mem = job_memory_cpu * cpu_cnt;
else
job_mem = job_memory_node;
} else {
avail_mem = node_ptr->real_memory;
if (job_memory_cpu)
job_mem = job_memory_cpu * cpu_cnt;
else
job_mem = job_memory_node;
}
if ((alloc_mem + job_mem) > avail_mem) {
bit_clear(jobmap, i);
continue;
}
}
if ((mode != SELECT_MODE_TEST_ONLY) &&
(cr_ptr->nodes[i].exclusive_cnt != 0)) {
/* already reserved by some exclusive job */
bit_clear(jobmap, i);
continue;
}
total_jobs = 0;
total_run_jobs = 0;
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
total_run_jobs += part_cr_ptr->run_job_cnt;
total_jobs += part_cr_ptr->tot_job_cnt;
part_cr_ptr = part_cr_ptr->next;
}
if ((total_run_jobs <= run_job_cnt) &&
(total_jobs <= tot_job_cnt)) {
bit_set(jobmap, i);
count++;
} else {
bit_clear(jobmap, i);
}
}
return count;
}
/* _find_job_mate - does most of the real work for select_p_job_test(),
* in trying to find a suitable job to mate this one with. This is
* a pretty simple algorithm now, but could try to match the job
* with multiple jobs that add up to the proper size or a single
* job plus a few idle nodes. */
static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes)
{
ListIterator job_iterator;
struct job_record *job_scan_ptr;
int rc = EINVAL;
job_iterator = list_iterator_create(job_list);
while ((job_scan_ptr = (struct job_record *) list_next(job_iterator))) {
if ((!IS_JOB_RUNNING(job_scan_ptr)) ||
(job_scan_ptr->node_cnt != req_nodes) ||
(job_scan_ptr->total_cpus <
job_ptr->details->min_cpus) ||
(!bit_super_set(job_scan_ptr->node_bitmap, bitmap)))
continue;
if (job_scan_ptr->details && job_ptr->details &&
(job_scan_ptr->details->contiguous !=
job_ptr->details->contiguous))
continue;
if (job_ptr->details->req_node_bitmap &&
(!bit_super_set(job_ptr->details->req_node_bitmap,
job_scan_ptr->node_bitmap)))
continue; /* Required nodes missing from job */
if (job_ptr->details->exc_node_bitmap &&
(bit_overlap(job_ptr->details->exc_node_bitmap,
job_scan_ptr->node_bitmap) != 0))
continue; /* Excluded nodes in this job */
bit_and(bitmap, job_scan_ptr->node_bitmap);
job_ptr->total_cpus = job_scan_ptr->total_cpus;
rc = SLURM_SUCCESS;
break;
}
list_iterator_destroy(job_iterator);
return rc;
}
/* _job_test - does most of the real work for select_p_job_test(), which
* pretty much just handles load-leveling and max_share logic */
static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes)
{
int i, index, error_code = EINVAL, sufficient;
int *consec_nodes; /* how many nodes we can add from this
* consecutive set of nodes */
int *consec_cpus; /* how many CPUs we can add from this
* consecutive set of nodes */
int *consec_start; /* where this consecutive set starts (index) */
int *consec_end; /* where this consecutive set ends (index) */
int *consec_req; /* are nodes from this set required
* (in req_bitmap) */
int consec_index, consec_size;
int rem_cpus, rem_nodes; /* remaining resources desired */
int best_fit_nodes, best_fit_cpus, best_fit_req;
int best_fit_location = 0, best_fit_sufficient;
int avail_cpus, alloc_cpus = 0, total_cpus = 0;
if (bit_set_count(bitmap) < min_nodes)
return error_code;
if ((job_ptr->details->req_node_bitmap) &&
(!bit_super_set(job_ptr->details->req_node_bitmap, bitmap)))
return error_code;
if (switch_record_cnt && switch_record_table) {
/* Perform optimized resource selection based upon topology */
return _job_test_topo(job_ptr, bitmap,
min_nodes, max_nodes, req_nodes);
}
consec_index = 0;
consec_size = 50; /* start allocation for 50 sets of
* consecutive nodes */
consec_cpus = xmalloc(sizeof(int) * consec_size);
consec_nodes = xmalloc(sizeof(int) * consec_size);
consec_start = xmalloc(sizeof(int) * consec_size);
consec_end = xmalloc(sizeof(int) * consec_size);
consec_req = xmalloc(sizeof(int) * consec_size);
/* Build table with information about sets of consecutive nodes */
consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
consec_req[consec_index] = -1; /* no required nodes here by default */
rem_cpus = job_ptr->details->min_cpus;
if (req_nodes > min_nodes)
rem_nodes = req_nodes;
else
rem_nodes = min_nodes;
for (index = 0; index < select_node_cnt; index++) {
if (bit_test(bitmap, index)) {
if (consec_nodes[consec_index] == 0)
consec_start[consec_index] = index;
avail_cpus = _get_avail_cpus(job_ptr, index);
if (job_ptr->details->req_node_bitmap &&
(max_nodes > 0) &&
bit_test(job_ptr->details->req_node_bitmap,index)){
if (consec_req[consec_index] == -1) {
/* first required node in set */
consec_req[consec_index] = index;
}
rem_nodes--;
max_nodes--;
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(index);
} else { /* node not required (yet) */
bit_clear(bitmap, index);
consec_cpus[consec_index] += avail_cpus;
consec_nodes[consec_index]++;
}
} else if (consec_nodes[consec_index] == 0) {
consec_req[consec_index] = -1;
/* already picked up any required nodes */
/* re-use this record */
} else {
consec_end[consec_index] = index - 1;
if (++consec_index >= consec_size) {
consec_size *= 2;
xrealloc(consec_cpus,
sizeof(int) * consec_size);
xrealloc(consec_nodes,
sizeof(int) * consec_size);
xrealloc(consec_start,
sizeof(int) * consec_size);
xrealloc(consec_end,
sizeof(int) * consec_size);
xrealloc(consec_req,
sizeof(int) * consec_size);
}
consec_cpus[consec_index] = 0;
consec_nodes[consec_index] = 0;
consec_req[consec_index] = -1;
}
}
if (consec_nodes[consec_index] != 0)
consec_end[consec_index++] = index - 1;
#if SELECT_DEBUG
/* don't compile this, it slows things down too much */
debug3("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes);
for (i = 0; i < consec_index; i++) {
if (consec_req[i] != -1)
debug3("start=%s, end=%s, nodes=%d, cpus=%d, req=%s",
select_node_ptr[consec_start[i]].name,
select_node_ptr[consec_end[i]].name,
consec_nodes[i], consec_cpus[i],
select_node_ptr[consec_req[i]].name);
else
debug3("start=%s, end=%s, nodes=%d, cpus=%d",
select_node_ptr[consec_start[i]].name,
select_node_ptr[consec_end[i]].name,
consec_nodes[i], consec_cpus[i]);
}
#endif
/* accumulate nodes from these sets of consecutive nodes until */
/* sufficient resources have been accumulated */
while (consec_index && (max_nodes > 0)) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
best_fit_req = -1; /* first required node, -1 if none */
for (i = 0; i < consec_index; i++) {
if (consec_nodes[i] == 0)
continue; /* no usable nodes here */
if (job_ptr->details->contiguous &&
job_ptr->details->req_node_bitmap &&
(consec_req[i] == -1))
continue; /* no required nodes here */
sufficient = (consec_cpus[i] >= rem_cpus) &&
_enough_nodes(consec_nodes[i], rem_nodes,
min_nodes, req_nodes);
/* if first possibility OR */
/* contains required nodes OR */
/* first set large enough for request OR */
/* tightest fit (less resource waste) OR */
/* nothing yet large enough, but this is biggest */
if ((best_fit_nodes == 0) ||
((best_fit_req == -1) && (consec_req[i] != -1)) ||
(sufficient && (best_fit_sufficient == 0)) ||
(sufficient && (consec_cpus[i] < best_fit_cpus)) ||
((sufficient == 0) &&
(consec_cpus[i] > best_fit_cpus))) {
best_fit_cpus = consec_cpus[i];
best_fit_nodes = consec_nodes[i];
best_fit_location = i;
best_fit_req = consec_req[i];
best_fit_sufficient = sufficient;
}
if (job_ptr->details->contiguous &&
job_ptr->details->req_node_bitmap) {
/* Must wait for all required nodes to be
* in a single consecutive block */
int j, other_blocks = 0;
for (j = (i+1); j < consec_index; j++) {
if (consec_req[j] != -1) {
other_blocks = 1;
break;
}
}
if (other_blocks) {
best_fit_nodes = 0;
break;
}
}
}
if (best_fit_nodes == 0)
break;
if (job_ptr->details->contiguous &&
((best_fit_cpus < rem_cpus) ||
(!_enough_nodes(best_fit_nodes, rem_nodes,
min_nodes, req_nodes))))
break; /* no hole large enough */
if (best_fit_req != -1) {
/* This collection of nodes includes required ones;
* select nodes from this set, first working up
* then down from the required nodes */
for (i = best_fit_req;
i <= consec_end[best_fit_location]; i++) {
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
if (bit_test(bitmap, i))
continue;
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
for (i = (best_fit_req - 1);
i >= consec_start[best_fit_location]; i--) {
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
if (bit_test(bitmap, i))
continue;
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
} else {
for (i = consec_start[best_fit_location];
i <= consec_end[best_fit_location]; i++) {
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
if (bit_test(bitmap, i))
continue;
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
}
if (job_ptr->details->contiguous ||
((rem_nodes <= 0) && (rem_cpus <= 0))) {
error_code = SLURM_SUCCESS;
break;
}
consec_cpus[best_fit_location] = 0;
consec_nodes[best_fit_location] = 0;
}
if (error_code && (rem_cpus <= 0) &&
_enough_nodes(0, rem_nodes, min_nodes, req_nodes)) {
error_code = SLURM_SUCCESS;
}
if (error_code == SLURM_SUCCESS) {
/* job's total_cpus is needed for SELECT_MODE_WILL_RUN */
job_ptr->total_cpus = total_cpus;
}
xfree(consec_cpus);
xfree(consec_nodes);
xfree(consec_start);
xfree(consec_end);
xfree(consec_req);
return error_code;
}
/*
* _job_test_topo - A topology aware version of _job_test()
* NOTE: The logic here is almost identical to that of _eval_nodes_topo() in
* select/cons_res/job_test.c. Any bug found here is probably also there.
*/
static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes)
{
bitstr_t **switches_bitmap; /* nodes on this switch */
int *switches_cpu_cnt; /* total CPUs on switch */
int *switches_node_cnt; /* total nodes on switch */
int *switches_required; /* set if has required node */
bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */
bitstr_t *req_nodes_bitmap = NULL;
int rem_cpus, rem_nodes; /* remaining resources desired */
int avail_cpus, alloc_cpus = 0, total_cpus = 0;
int i, j, rc = SLURM_SUCCESS;
int best_fit_inx, first, last;
int best_fit_nodes, best_fit_cpus;
int best_fit_location = 0, best_fit_sufficient;
bool sufficient;
rem_cpus = job_ptr->details->min_cpus;
if (req_nodes > min_nodes)
rem_nodes = req_nodes;
else
rem_nodes = min_nodes;
if (job_ptr->details->req_node_bitmap) {
req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
i = bit_set_count(req_nodes_bitmap);
if (i > max_nodes) {
info("job %u requires more nodes than currently "
"available (%u>%u)",
job_ptr->job_id, i, max_nodes);
rc = EINVAL;
goto fini;
}
}
/* Construct a set of switch array entries,
* use the same indexes as switch_record_table in slurmctld */
switches_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt);
switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switches_required = xmalloc(sizeof(int) * switch_record_cnt);
avail_nodes_bitmap = bit_alloc(node_record_count);
for (i=0; i<switch_record_cnt; i++) {
switches_bitmap[i] = bit_copy(switch_record_table[i].
node_bitmap);
bit_and(switches_bitmap[i], bitmap);
bit_or(avail_nodes_bitmap, switches_bitmap[i]);
switches_node_cnt[i] = bit_set_count(switches_bitmap[i]);
if (req_nodes_bitmap &&
bit_overlap(req_nodes_bitmap, switches_bitmap[i])) {
switches_required[i] = 1;
}
}
bit_nclear(bitmap, 0, node_record_count - 1);
#if SELECT_DEBUG
/* Don't compile this, it slows things down too much */
for (i=0; i<switch_record_cnt; i++) {
char *node_names = NULL;
if (switches_node_cnt[i])
node_names = bitmap2node_name(switches_bitmap[i]);
debug("switch=%s nodes=%u:%s required:%u speed=%u",
switch_record_table[i].name,
switches_node_cnt[i], node_names,
switches_required[i],
switch_record_table[i].link_speed);
xfree(node_names);
}
#endif
if (req_nodes_bitmap &&
(!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) {
info("job %u requires nodes not available on any switch",
job_ptr->job_id);
rc = EINVAL;
goto fini;
}
if (req_nodes_bitmap) {
/* Accumulate specific required resources, if any */
first = bit_ffs(req_nodes_bitmap);
last = bit_fls(req_nodes_bitmap);
for (i=first; ((i<=last) && (first>=0)); i++) {
if (!bit_test(req_nodes_bitmap, i))
continue;
if (max_nodes <= 0) {
info("job %u requires nodes than allowed",
job_ptr->job_id);
rc = EINVAL;
goto fini;
}
bit_set(bitmap, i);
bit_clear(avail_nodes_bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
for (j=0; j<switch_record_cnt; j++) {
if (!bit_test(switches_bitmap[j], i))
continue;
bit_clear(switches_bitmap[j], i);
switches_node_cnt[j]--;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0))
goto fini;
/* Accumulate additional resources from leafs that
* contain required nodes */
for (j=0; j<switch_record_cnt; j++) {
if ((switch_record_table[j].level != 0) ||
(switches_node_cnt[j] == 0) ||
(switches_required[j] == 0)) {
continue;
}
while ((max_nodes > 0) &&
((rem_nodes > 0) || (rem_cpus > 0))) {
i = bit_ffs(switches_bitmap[j]);
if (i == -1)
break;
bit_clear(switches_bitmap[j], i);
switches_node_cnt[j]--;
if (bit_test(bitmap, i)) {
/* node on multiple leaf switches
* and already selected */
continue;
}
bit_set(bitmap, i);
bit_clear(avail_nodes_bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0))
goto fini;
/* Update bitmaps and node counts for higher-level switches */
for (j=0; j<switch_record_cnt; j++) {
if (switches_node_cnt[j] == 0)
continue;
first = bit_ffs(switches_bitmap[j]);
if (first < 0)
continue;
last = bit_fls(switches_bitmap[j]);
for (i=first; i<=last; i++) {
if (!bit_test(switches_bitmap[j], i))
continue;
if (!bit_test(avail_nodes_bitmap, i)) {
/* cleared from lower level */
bit_clear(switches_bitmap[j], i);
switches_node_cnt[j]--;
} else {
switches_cpu_cnt[j] +=
_get_avail_cpus(job_ptr, i);
}
}
}
} else {
/* No specific required nodes, calculate CPU counts */
for (j=0; j<switch_record_cnt; j++) {
first = bit_ffs(switches_bitmap[j]);
if (first < 0)
continue;
last = bit_fls(switches_bitmap[j]);
for (i=first; i<=last; i++) {
if (!bit_test(switches_bitmap[j], i))
continue;
switches_cpu_cnt[j] +=
_get_avail_cpus(job_ptr, i);
}
}
}
/* Determine lowest level switch satisfying request with best fit */
best_fit_inx = -1;
for (j=0; j<switch_record_cnt; j++) {
if ((switches_cpu_cnt[j] < rem_cpus) ||
(!_enough_nodes(switches_node_cnt[j], rem_nodes,
min_nodes, req_nodes)))
continue;
if ((best_fit_inx == -1) ||
(switch_record_table[j].level <
switch_record_table[best_fit_inx].level) ||
((switch_record_table[j].level ==
switch_record_table[best_fit_inx].level) &&
(switches_node_cnt[j] < switches_node_cnt[best_fit_inx])))
best_fit_inx = j;
}
if (best_fit_inx == -1) {
debug("_job_test_topo: could not find resources for job %u",
job_ptr->job_id);
rc = EINVAL;
goto fini;
}
bit_and(avail_nodes_bitmap, switches_bitmap[best_fit_inx]);
/* Identify usable leafs (within higher switch having best fit) */
for (j=0; j<switch_record_cnt; j++) {
if ((switch_record_table[j].level != 0) ||
(!bit_super_set(switches_bitmap[j],
switches_bitmap[best_fit_inx]))) {
switches_node_cnt[j] = 0;
}
}
/* Select resources from these leafs on a best-fit basis */
while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
for (j=0; j<switch_record_cnt; j++) {
if (switches_node_cnt[j] == 0)
continue;
sufficient = (switches_cpu_cnt[j] >= rem_cpus) &&
_enough_nodes(switches_node_cnt[j],
rem_nodes, min_nodes,
req_nodes);
/* If first possibility OR */
/* first set large enough for request OR */
/* tightest fit (less resource waste) OR */
/* nothing yet large enough, but this is biggest */
if ((best_fit_nodes == 0) ||
(sufficient && (best_fit_sufficient == 0)) ||
(sufficient &&
(switches_cpu_cnt[j] < best_fit_cpus)) ||
((sufficient == 0) &&
(switches_cpu_cnt[j] > best_fit_cpus))) {
best_fit_cpus = switches_cpu_cnt[j];
best_fit_nodes = switches_node_cnt[j];
best_fit_location = j;
best_fit_sufficient = sufficient;
}
}
if (best_fit_nodes == 0)
break;
/* Select the nodes to use from this leaf */
first = bit_ffs(switches_bitmap[best_fit_location]);
last = bit_fls(switches_bitmap[best_fit_location]);
for (i=first; ((i<=last) && (first>=0)); i++) {
if (!bit_test(switches_bitmap[best_fit_location], i))
continue;
bit_clear(switches_bitmap[best_fit_location], i);
switches_node_cnt[best_fit_location]--;
avail_cpus = _get_avail_cpus(job_ptr, i);
switches_cpu_cnt[best_fit_location] -= avail_cpus;
if (bit_test(bitmap, i)) {
/* node on multiple leaf switches
* and already selected */
continue;
}
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
}
switches_node_cnt[best_fit_location] = 0;
}
if ((rem_cpus <= 0) &&
_enough_nodes(0, rem_nodes, min_nodes, req_nodes)) {
rc = SLURM_SUCCESS;
} else
rc = EINVAL;
fini: if (rc == SLURM_SUCCESS) {
/* Job's total_cpus is needed for SELECT_MODE_WILL_RUN */
job_ptr->total_cpus = total_cpus;
}
FREE_NULL_BITMAP(avail_nodes_bitmap);
FREE_NULL_BITMAP(req_nodes_bitmap);
for (i=0; i<switch_record_cnt; i++)
FREE_NULL_BITMAP(switches_bitmap[i]);
xfree(switches_bitmap);
xfree(switches_cpu_cnt);
xfree(switches_node_cnt);
xfree(switches_required);
return rc;
}
/*
* deallocate resources that were assigned to this job
*
* if remove_all = false: the job has been suspended, so just deallocate CPUs
* if remove_all = true: deallocate all resources
*/
static int _rm_job_from_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
bool remove_all)
{
int i, i_first, i_last, node_offset, rc = SLURM_SUCCESS;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
uint32_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
bool exclusive, is_job_running;
uint16_t cpu_cnt;
struct node_record *node_ptr;
List gres_list;
if (cr_ptr == NULL) {
error("%s: cr_ptr not initialized", pre_err);
return SLURM_ERROR;
}
if (_rem_tot_job(cr_ptr, job_ptr->job_id) == 0) {
info("select/linear: job %u has no resources allocated",
job_ptr->job_id);
return SLURM_ERROR;
}
if (remove_all && job_ptr->details &&
job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
} else
job_memory_node = job_ptr->details->pn_min_memory;
}
if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
error("job %u lacks a job_resources struct", job_ptr->job_id);
return SLURM_ERROR;
}
is_job_running = _rem_run_job(cr_ptr, job_ptr->job_id);
exclusive = (job_ptr->details->shared == 0);
i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap);
if (i_first == -1) /* job has no nodes */
i_last = -2;
node_offset = -1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
if (!bit_test(job_ptr->node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (job_memory_cpu)
job_memory = job_memory_cpu * cpu_cnt;
else
job_memory = job_memory_node;
if (cr_ptr->nodes[i].alloc_memory >= job_memory)
cr_ptr->nodes[i].alloc_memory -= job_memory;
else {
/* This can be the result of FastSchedule=0 and
* the node being configured with fewer CPUs than
* actually exist. The job allocation set when
* slurmctld restarts may be based upon a lower CPU
* count than when the job gets deallocated. */
if (select_fast_schedule ||
(node_ptr->config_ptr->cpus == node_ptr->cpus)) {
error("%s: memory underflow for node %s",
pre_err, node_ptr->name);
} else {
debug("%s: memory underflow for node %s",
pre_err, node_ptr->name);
}
cr_ptr->nodes[i].alloc_memory = 0;
}
if (remove_all) {
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
gres_plugin_job_dealloc(job_ptr->gres_list, gres_list,
node_offset, job_ptr->job_id,
node_ptr->name);
gres_plugin_node_state_log(gres_list, node_ptr->name);
}
if (exclusive) {
if (cr_ptr->nodes[i].exclusive_cnt)
cr_ptr->nodes[i].exclusive_cnt--;
else {
error("%s: exclusive_cnt underflow for "
"node %s", pre_err, node_ptr->name);
}
}
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (!is_job_running)
/* cancelled job already suspended */;
else if (part_cr_ptr->run_job_cnt > 0)
part_cr_ptr->run_job_cnt--;
else {
error("%s: run_job_cnt underflow for node %s",
pre_err, node_ptr->name);
}
if (remove_all) {
if (part_cr_ptr->tot_job_cnt > 0)
part_cr_ptr->tot_job_cnt--;
else {
error("%s: tot_job_cnt underflow "
"for node %s",
pre_err, node_ptr->name);
}
if ((part_cr_ptr->tot_job_cnt == 0) &&
(part_cr_ptr->run_job_cnt)) {
part_cr_ptr->run_job_cnt = 0;
error("%s: run_job_cnt out of sync "
"for node %s",
pre_err, node_ptr->name);
}
}
break;
}
if (part_cr_ptr == NULL) {
if (job_ptr->part_nodes_missing) {
;
} else if (job_ptr->part_ptr) {
info("%s: job %u and its partition %s "
"no longer contain node %s",
pre_err, job_ptr->job_id,
job_ptr->partition, node_ptr->name);
} else {
info("%s: job %u has no pointer to partition "
"%s and node %s",
pre_err, job_ptr->job_id,
job_ptr->partition, node_ptr->name);
}
job_ptr->part_nodes_missing = true;
rc = SLURM_ERROR;
}
}
return rc;
}
/*
* deallocate resources that were assigned to this job on one node
*/
static int _rm_job_from_one_node(struct job_record *job_ptr,
struct node_record *node_ptr, char *pre_err)
{
int i, node_inx, node_offset, rc = SLURM_SUCCESS;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
uint32_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
bool exclusive, is_job_running;
int first_bit, last_bit;
uint16_t cpu_cnt;
List gres_list;
if (cr_ptr == NULL) {
error("%s: cr_ptr not initialized", pre_err);
return SLURM_ERROR;
}
if (_test_tot_job(cr_ptr, job_ptr->job_id) == 0) {
info("select/linear: job %u has no resources allocated",
job_ptr->job_id);
return SLURM_ERROR;
}
if (job_ptr->details &&
job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
} else
job_memory_node = job_ptr->details->pn_min_memory;
}
if ((job_ptr->job_resrcs == NULL) ||
(job_ptr->job_resrcs->cpus == NULL)) {
error("job %u lacks a job_resources struct", job_ptr->job_id);
return SLURM_ERROR;
}
job_resrcs_ptr = job_ptr->job_resrcs;
node_inx = node_ptr - node_record_table_ptr;
if (!bit_test(job_resrcs_ptr->node_bitmap, node_inx)) {
error("job %u allocated nodes (%s) which have been removed "
"from slurm.conf",
job_ptr->job_id, node_ptr->name);
return SLURM_ERROR;
}
first_bit = bit_ffs(job_resrcs_ptr->node_bitmap);
last_bit = node_inx;
node_offset = -1;
for (i = first_bit; i <= node_inx; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
}
if (job_resrcs_ptr->cpus[node_offset] == 0) {
error("duplicate relinquish of node %s by job %u",
node_ptr->name, job_ptr->job_id);
return SLURM_ERROR;
}
job_resrcs_ptr->cpus[node_offset] = 0;
build_job_resources_cpu_array(job_resrcs_ptr);
is_job_running = _test_run_job(cr_ptr, job_ptr->job_id);
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (job_memory_cpu)
job_memory = job_memory_cpu * cpu_cnt;
else
job_memory = job_memory_node;
if (cr_ptr->nodes[node_inx].alloc_memory >= job_memory)
cr_ptr->nodes[node_inx].alloc_memory -= job_memory;
else {
cr_ptr->nodes[node_inx].alloc_memory = 0;
error("%s: memory underflow for node %s",
pre_err, node_ptr->name);
}
if (cr_ptr->nodes[node_inx].gres_list)
gres_list = cr_ptr->nodes[node_inx].gres_list;
else
gres_list = node_ptr->gres_list;
gres_plugin_job_dealloc(job_ptr->gres_list, gres_list, node_offset,
job_ptr->job_id, node_ptr->name);
gres_plugin_node_state_log(gres_list, node_ptr->name);
exclusive = (job_ptr->details->shared == 0);
if (exclusive) {
if (cr_ptr->nodes[node_inx].exclusive_cnt)
cr_ptr->nodes[node_inx].exclusive_cnt--;
else {
error("%s: exclusive_cnt underflow for node %s",
pre_err, node_ptr->name);
}
}
part_cr_ptr = cr_ptr->nodes[node_inx].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (!is_job_running)
/* cancelled job already suspended */;
else if (part_cr_ptr->run_job_cnt > 0)
part_cr_ptr->run_job_cnt--;
else {
error("%s: run_job_cnt underflow for node %s",
pre_err, node_ptr->name);
}
if (part_cr_ptr->tot_job_cnt > 0)
part_cr_ptr->tot_job_cnt--;
else {
error("%s: tot_job_cnt underflow for node %s",
pre_err, node_ptr->name);
}
if ((part_cr_ptr->tot_job_cnt == 0) &&
(part_cr_ptr->run_job_cnt)) {
part_cr_ptr->run_job_cnt = 0;
error("%s: run_job_cnt out of sync for node %s",
pre_err, node_ptr->name);
}
break;
}
if (part_cr_ptr == NULL) {
if (job_ptr->part_ptr) {
error("%s: Could not find partition %s for node %s",
pre_err, job_ptr->part_ptr->name, node_ptr->name);
} else {
error("%s: no partition ptr given for job %u and node %s",
pre_err, job_ptr->job_id, node_ptr->name);
}
rc = SLURM_ERROR;
}
return rc;
}
/*
* allocate resources to the given job
*
* if alloc_all = 0: the job has been suspended, so just re-allocate CPUs
* if alloc_all = 1: allocate all resources (CPUs and memory)
*/
static int _add_job_to_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
int alloc_all)
{
int i, i_first, i_last, node_cnt, node_offset, rc = SLURM_SUCCESS;
bool exclusive;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
uint32_t job_memory_cpu = 0, job_memory_node = 0;
uint16_t cpu_cnt;
struct node_record *node_ptr;
List gres_list;
if (cr_ptr == NULL) {
error("%s: cr_ptr not initialized", pre_err);
return SLURM_ERROR;
}
if (alloc_all && job_ptr->details &&
job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
} else
job_memory_node = job_ptr->details->pn_min_memory;
}
if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
error("job %u lacks a job_resources struct", job_ptr->job_id);
return SLURM_ERROR;
}
exclusive = (job_ptr->details->shared == 0);
if (alloc_all)
_add_run_job(cr_ptr, job_ptr->job_id);
_add_tot_job(cr_ptr, job_ptr->job_id);
i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap);
node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap);
if (i_first == -1) /* job has no nodes */
i_last = -2;
node_offset = -1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
if (!bit_test(job_ptr->node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (job_memory_cpu) {
cr_ptr->nodes[i].alloc_memory += job_memory_cpu *
cpu_cnt;
} else
cr_ptr->nodes[i].alloc_memory += job_memory_node;
if (alloc_all) {
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
gres_plugin_job_alloc(job_ptr->gres_list, gres_list,
node_cnt, node_offset, cpu_cnt,
job_ptr->job_id, node_ptr->name);
gres_plugin_node_state_log(gres_list, node_ptr->name);
}
if (exclusive)
cr_ptr->nodes[i].exclusive_cnt++;
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (alloc_all)
part_cr_ptr->run_job_cnt++;
part_cr_ptr->tot_job_cnt++;
break;
}
if (part_cr_ptr == NULL) {
info("%s: job %u could not find partition %s for "
"node %s",
pre_err, job_ptr->job_id, job_ptr->partition,
node_ptr->name);
job_ptr->part_nodes_missing = true;
rc = SLURM_ERROR;
}
}
return rc;
}
static void _free_cr(struct cr_record *cr_ptr)
{
int i;
struct part_cr_record *part_cr_ptr1, *part_cr_ptr2;
if (cr_ptr == NULL)
return;
for (i = 0; i < select_node_cnt; i++) {
part_cr_ptr1 = cr_ptr->nodes[i].parts;
while (part_cr_ptr1) {
part_cr_ptr2 = part_cr_ptr1->next;
xfree(part_cr_ptr1);
part_cr_ptr1 = part_cr_ptr2;
}
if (cr_ptr->nodes[i].gres_list)
list_destroy(cr_ptr->nodes[i].gres_list);
}
xfree(cr_ptr->nodes);
xfree(cr_ptr->run_job_ids);
xfree(cr_ptr->tot_job_ids);
xfree(cr_ptr);
}
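/* Log the contents of the consumable resource record: running and
* allocated job ids plus each node's allocated memory, exclusive count,
* per-partition job counts and GRES state. Produces output only when
* compiled with SELECT_DEBUG set to a non-zero value. */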
static void _dump_node_cr(struct cr_record *cr_ptr)
{
#if SELECT_DEBUG
int i;
struct part_cr_record *part_cr_ptr;
struct node_record *node_ptr;
List gres_list;
if ((cr_ptr == NULL) || (cr_ptr->nodes == NULL))
return;
for (i = 0; i < cr_ptr->run_job_len; i++) {
if (cr_ptr->run_job_ids[i])
info("Running job:%u", cr_ptr->run_job_ids[i]);
}
for (i = 0; i < cr_ptr->tot_job_len; i++) {
if (cr_ptr->tot_job_ids[i])
info("Alloc job:%u", cr_ptr->tot_job_ids[i]);
}
for (i = 0; i < select_node_cnt; i++) {
node_ptr = node_record_table_ptr + i;
info("Node:%s exclusive_cnt:%u alloc_mem:%u",
node_ptr->name, cr_ptr->nodes[i].exclusive_cnt,
cr_ptr->nodes[i].alloc_memory);
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
info(" Part:%s run:%u tot:%u",
part_cr_ptr->part_ptr->name,
part_cr_ptr->run_job_cnt,
part_cr_ptr->tot_job_cnt);
part_cr_ptr = part_cr_ptr->next;
}
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
if (gres_list)
gres_plugin_node_state_log(gres_list, node_ptr->name);
}
#endif
}
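/* Duplicate a cr_record, including each node's partition records and
* GRES state. Used to build a scratch copy of the allocation state,
* e.g. for simulating preemption in _run_now(). */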
static struct cr_record *_dup_cr(struct cr_record *cr_ptr)
{
int i;
struct cr_record *new_cr_ptr;
struct part_cr_record *part_cr_ptr, *new_part_cr_ptr;
struct node_record *node_ptr;
List gres_list;
if (cr_ptr == NULL)
return NULL;
new_cr_ptr = xmalloc(sizeof(struct cr_record));
new_cr_ptr->run_job_len = cr_ptr->run_job_len;
i = sizeof(uint32_t) * cr_ptr->run_job_len;
new_cr_ptr->run_job_ids = xmalloc(i);
memcpy(new_cr_ptr->run_job_ids, cr_ptr->run_job_ids, i);
new_cr_ptr->tot_job_len = cr_ptr->tot_job_len;
i = sizeof(uint32_t) * cr_ptr->tot_job_len;
new_cr_ptr->tot_job_ids = xmalloc(i);
memcpy(new_cr_ptr->tot_job_ids, cr_ptr->tot_job_ids, i);
new_cr_ptr->nodes = xmalloc(select_node_cnt *
sizeof(struct node_cr_record));
for (i = 0; i < select_node_cnt; i++) {
node_ptr = node_record_table_ptr + i;
new_cr_ptr->nodes[i].alloc_memory = cr_ptr->nodes[i].
alloc_memory;
new_cr_ptr->nodes[i].exclusive_cnt = cr_ptr->nodes[i].
exclusive_cnt;
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
new_part_cr_ptr =
xmalloc(sizeof(struct part_cr_record));
new_part_cr_ptr->part_ptr = part_cr_ptr->part_ptr;
new_part_cr_ptr->run_job_cnt = part_cr_ptr->run_job_cnt;
new_part_cr_ptr->tot_job_cnt = part_cr_ptr->tot_job_cnt;
new_part_cr_ptr->next = new_cr_ptr->nodes[i].
parts;
new_cr_ptr->nodes[i].parts = new_part_cr_ptr;
part_cr_ptr = part_cr_ptr->next;
}
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
new_cr_ptr->nodes[i].gres_list =
gres_plugin_node_state_dup(gres_list);
}
return new_cr_ptr;
}
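/* Initialize the global cr_ptr record: build per-node partition records
* from part_list, then replay every running and suspended job from
* job_list to restore memory, GRES, exclusive and per-partition job
* count state. */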
static void _init_node_cr(void)
{
struct part_record *part_ptr;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
struct node_record *node_ptr;
ListIterator part_iterator;
struct job_record *job_ptr;
ListIterator job_iterator;
uint32_t job_memory_cpu, job_memory_node;
int exclusive, i, i_first, i_last, node_offset;
if (cr_ptr)
return;
cr_ptr = xmalloc(sizeof(struct cr_record));
cr_ptr->nodes = xmalloc(select_node_cnt
* sizeof(struct node_cr_record));
/* build partition records */
part_iterator = list_iterator_create(part_list);
while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
for (i = 0; i < select_node_cnt; i++) {
if (part_ptr->node_bitmap == NULL)
break;
if (!bit_test(part_ptr->node_bitmap, i))
continue;
part_cr_ptr = xmalloc(sizeof(struct part_cr_record));
part_cr_ptr->next = cr_ptr->nodes[i].parts;
part_cr_ptr->part_ptr = part_ptr;
cr_ptr->nodes[i].parts = part_cr_ptr;
}
}
list_iterator_destroy(part_iterator);
/* Clear existing node Gres allocations */
for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
i++, node_ptr++) {
gres_plugin_node_state_dealloc_all(node_ptr->gres_list);
}
/* record running and suspended jobs in node_cr_records */
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))
continue;
if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
error("job %u lacks a job_resources struct",
job_ptr->job_id);
continue;
}
if (IS_JOB_RUNNING(job_ptr) ||
(IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0)))
_add_run_job(cr_ptr, job_ptr->job_id);
_add_tot_job(cr_ptr, job_ptr->job_id);
job_memory_cpu = 0;
job_memory_node = 0;
if (job_ptr->details && job_ptr->details->pn_min_memory &&
(cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->
pn_min_memory &
(~MEM_PER_CPU);
} else {
job_memory_node = job_ptr->details->
pn_min_memory;
}
}
/* Use job_resrcs_ptr->node_bitmap rather than
* job_ptr->node_bitmap which can have DOWN nodes
* cleared from the bitmap */
if (job_resrcs_ptr->node_bitmap == NULL)
continue;
exclusive = (job_ptr->details->shared == 0);
node_offset = -1;
i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap);
if (i_first == -1)
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
node_ptr = node_record_table_ptr + i;
if (exclusive)
cr_ptr->nodes[i].exclusive_cnt++;
if (job_memory_cpu == 0) {
cr_ptr->nodes[i].alloc_memory +=
job_memory_node;
} else if (select_fast_schedule) {
cr_ptr->nodes[i].alloc_memory +=
job_memory_cpu *
node_record_table_ptr[i].
config_ptr->cpus;
} else {
cr_ptr->nodes[i].alloc_memory +=
job_memory_cpu *
node_record_table_ptr[i].cpus;
}
if (bit_test(job_ptr->node_bitmap, i)) {
gres_plugin_job_alloc(job_ptr->gres_list,
node_ptr->gres_list,
job_resrcs_ptr->nhosts,
node_offset,
job_resrcs_ptr->
cpus[node_offset],
job_ptr->job_id,
node_ptr->name);
}
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr !=
job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (IS_JOB_RUNNING(job_ptr) ||
(IS_JOB_SUSPENDED(job_ptr) &&
(job_ptr->priority != 0))) {
/* Running or being gang scheduled */
part_cr_ptr->run_job_cnt++;
}
part_cr_ptr->tot_job_cnt++;
break;
}
if (part_cr_ptr == NULL) {
info("_init_node_cr: job %u could not find "
"partition %s for node %s",
job_ptr->job_id, job_ptr->partition,
node_ptr->name);
job_ptr->part_nodes_missing = true;
}
}
}
list_iterator_destroy(job_iterator);
_dump_node_cr(cr_ptr);
}
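/* List find function: return 1 if the list entry x is the job record
* passed as key, otherwise 0 */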
static int _find_job (void *x, void *key)
{
struct job_record *job_ptr = (struct job_record *) x;
if (job_ptr == (struct job_record *) key)
return 1;
return 0;
}
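/* Return true if job_ptr appears in the list of preemptable job
* candidates */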
static bool _is_preemptable(struct job_record *job_ptr,
List preemptee_candidates)
{
if (!preemptee_candidates)
return false;
if (list_find_first(preemptee_candidates, _find_job, job_ptr))
return true;
return false;
}
/* Determine if a job can ever run */
static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, int max_share)
{
bitstr_t *orig_map;
int i, rc = SLURM_ERROR;
uint32_t save_mem;
orig_map = bit_copy(bitmap);
if (!orig_map)
fatal("bit_copy: malloc failure");
/* Try to run with currently available nodes */
i = _job_count_bitmap(cr_ptr, job_ptr, orig_map, bitmap,
NO_SHARE_LIMIT, NO_SHARE_LIMIT,
SELECT_MODE_TEST_ONLY);
if (i >= min_nodes) {
save_mem = job_ptr->details->pn_min_memory;
job_ptr->details->pn_min_memory = 0;
rc = _job_test(job_ptr, bitmap, min_nodes,
max_nodes, req_nodes);
job_ptr->details->pn_min_memory = save_mem;
}
FREE_NULL_BITMAP(orig_map);
return rc;
}
/* Allocate resources for a job now, if possible */
static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list)
{
bitstr_t *orig_map;
int max_run_job, j, sus_jobs, rc = EINVAL, prev_cnt = -1;
struct job_record *tmp_job_ptr;
ListIterator job_iterator, preemptee_iterator;
struct cr_record *exp_cr;
orig_map = bit_copy(bitmap);
if (!orig_map)
fatal("bit_copy: malloc failure");
for (max_run_job=0; ((max_run_job<max_share) && (rc != SLURM_SUCCESS));
max_run_job++) {
bool last_iteration = (max_run_job == (max_share - 1));
		/* Up to two passes over the suspended-job limit
		 * (sus_jobs = 0, then 4); on the last max_run_job
		 * iteration the limit is raised to NO_SHARE_LIMIT below */
		for (sus_jobs=0; ((sus_jobs<5) && (rc != SLURM_SUCCESS));
		     sus_jobs+=4) {
if (last_iteration)
sus_jobs = NO_SHARE_LIMIT;
j = _job_count_bitmap(cr_ptr, job_ptr,
orig_map, bitmap,
max_run_job,
max_run_job + sus_jobs,
SELECT_MODE_RUN_NOW);
#if SELECT_DEBUG
{
char *node_list = bitmap2node_name(bitmap);
info("_run_job %u iter:%d cnt:%d nodes:%s",
job_ptr->job_id, max_run_job, j,
node_list);
xfree(node_list);
}
#endif
if ((j == prev_cnt) || (j < min_nodes))
continue;
prev_cnt = j;
if (max_run_job > 0) {
				/* We need to share. Try to find a
				 * suitable job to share nodes with */
rc = _find_job_mate(job_ptr, bitmap,
min_nodes,
max_nodes, req_nodes);
if (rc == SLURM_SUCCESS)
break;
}
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
}
}
if ((rc != SLURM_SUCCESS) && preemptee_candidates &&
(exp_cr = _dup_cr(cr_ptr))) {
/* Remove all preemptable jobs from simulated environment */
job_iterator = list_iterator_create(job_list);
while ((tmp_job_ptr = (struct job_record *)
list_next(job_iterator))) {
if (!IS_JOB_RUNNING(tmp_job_ptr) &&
!IS_JOB_SUSPENDED(tmp_job_ptr))
continue;
if (_is_preemptable(tmp_job_ptr,
preemptee_candidates)) {
bool remove_all = false;
uint16_t mode;
mode = slurm_job_preempt_mode(tmp_job_ptr);
if ((mode == PREEMPT_MODE_REQUEUE) ||
(mode == PREEMPT_MODE_CHECKPOINT) ||
(mode == PREEMPT_MODE_CANCEL))
remove_all = true;
/* Remove preemptable job now */
_rm_job_from_nodes(exp_cr, tmp_job_ptr,
"_run_now",
remove_all);
j = _job_count_bitmap(exp_cr, job_ptr,
orig_map, bitmap,
(max_share - 1),
NO_SHARE_LIMIT,
SELECT_MODE_RUN_NOW);
if (j < min_nodes)
continue;
rc = _job_test(job_ptr, bitmap, min_nodes,
max_nodes, req_nodes);
if (rc == SLURM_SUCCESS)
break;
}
}
list_iterator_destroy(job_iterator);
if ((rc == SLURM_SUCCESS) && preemptee_job_list &&
preemptee_candidates) {
/* Build list of preemptee jobs whose resources are
* actually used */
if (*preemptee_job_list == NULL) {
*preemptee_job_list = list_create(NULL);
if (*preemptee_job_list == NULL)
fatal("list_create malloc failure");
}
preemptee_iterator = list_iterator_create(
preemptee_candidates);
while ((tmp_job_ptr = (struct job_record *)
list_next(preemptee_iterator))) {
if (bit_overlap(bitmap,
tmp_job_ptr->node_bitmap) == 0)
continue;
list_append(*preemptee_job_list,
tmp_job_ptr);
}
list_iterator_destroy(preemptee_iterator);
}
_free_cr(exp_cr);
}
if (rc == SLURM_SUCCESS)
_build_select_struct(job_ptr, bitmap);
FREE_NULL_BITMAP(orig_map);
return rc;
}
/* Determine where and when the job at job_ptr can begin execution.
 * A scratch cr_record structure is updated to reflect each running job
 * terminating at the end of its time limit, and the pending job is
 * tested against it after each removal. Used by SLURM's sched/backfill
 * plugin and Moab. */
static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list)
{
struct cr_record *exp_cr;
struct job_record *tmp_job_ptr;
List cr_job_list;
ListIterator job_iterator, preemptee_iterator;
bitstr_t *orig_map;
int i, max_run_jobs, rc = SLURM_ERROR;
time_t now = time(NULL);
max_run_jobs = MAX((max_share - 1), 1); /* exclude this job */
orig_map = bit_copy(bitmap);
if (!orig_map)
fatal("bit_copy: malloc failure");
/* Try to run with currently available nodes */
i = _job_count_bitmap(cr_ptr, job_ptr, orig_map, bitmap,
max_run_jobs, NO_SHARE_LIMIT,
SELECT_MODE_WILL_RUN);
if (i >= min_nodes) {
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
if (rc == SLURM_SUCCESS) {
FREE_NULL_BITMAP(orig_map);
job_ptr->start_time = time(NULL);
return SLURM_SUCCESS;
}
}
/* Job is still pending. Simulate termination of jobs one at a time
* to determine when and where the job can start. */
exp_cr = _dup_cr(cr_ptr);
if (exp_cr == NULL) {
FREE_NULL_BITMAP(orig_map);
return SLURM_ERROR;
}
/* Build list of running and suspended jobs */
cr_job_list = list_create(NULL);
if (!cr_job_list)
fatal("list_create: memory allocation failure");
job_iterator = list_iterator_create(job_list);
while ((tmp_job_ptr = (struct job_record *) list_next(job_iterator))) {
if (!IS_JOB_RUNNING(tmp_job_ptr) &&
!IS_JOB_SUSPENDED(tmp_job_ptr))
continue;
if (tmp_job_ptr->end_time == 0) {
error("Job %u has zero end_time", tmp_job_ptr->job_id);
continue;
}
if (_is_preemptable(tmp_job_ptr, preemptee_candidates)) {
uint16_t mode = slurm_job_preempt_mode(tmp_job_ptr);
bool remove_all = false;
if ((mode == PREEMPT_MODE_REQUEUE) ||
(mode == PREEMPT_MODE_CHECKPOINT) ||
(mode == PREEMPT_MODE_CANCEL))
remove_all = true;
/* Remove preemptable job now */
_rm_job_from_nodes(exp_cr, tmp_job_ptr,
"_will_run_test", remove_all);
} else
list_append(cr_job_list, tmp_job_ptr);
}
list_iterator_destroy(job_iterator);
/* Test with all preemptable jobs gone */
if (preemptee_candidates) {
i = _job_count_bitmap(exp_cr, job_ptr, orig_map, bitmap,
max_run_jobs, NO_SHARE_LIMIT,
SELECT_MODE_RUN_NOW);
if (i >= min_nodes) {
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
if (rc == SLURM_SUCCESS)
job_ptr->start_time = now + 1;
}
}
	/* Remove the running jobs one at a time from exp_cr and try
	 * scheduling the pending job after each removal */
if (rc != SLURM_SUCCESS) {
list_sort(cr_job_list, _cr_job_list_sort);
job_iterator = list_iterator_create(cr_job_list);
while ((tmp_job_ptr = (struct job_record *)
list_next(job_iterator))) {
_rm_job_from_nodes(exp_cr, tmp_job_ptr,
"_will_run_test", true);
i = _job_count_bitmap(exp_cr, job_ptr, orig_map,
bitmap, max_run_jobs,
NO_SHARE_LIMIT,
SELECT_MODE_RUN_NOW);
if (i < min_nodes)
continue;
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
if (rc != SLURM_SUCCESS)
continue;
if (tmp_job_ptr->end_time <= now)
job_ptr->start_time = now + 1;
else
job_ptr->start_time = tmp_job_ptr->end_time;
break;
}
list_iterator_destroy(job_iterator);
}
if ((rc == SLURM_SUCCESS) && preemptee_job_list &&
preemptee_candidates) {
/* Build list of preemptee jobs whose resources are
		 * actually used. The list is returned even if the jobs
		 * are not killed by this select plugin, but by Moab or
		 * some other external scheduler. */
if (*preemptee_job_list == NULL) {
*preemptee_job_list = list_create(NULL);
if (*preemptee_job_list == NULL)
fatal("list_create malloc failure");
}
		preemptee_iterator =
			list_iterator_create(preemptee_candidates);
while ((tmp_job_ptr = (struct job_record *)
list_next(preemptee_iterator))) {
if (bit_overlap(bitmap, tmp_job_ptr->node_bitmap) == 0)
continue;
list_append(*preemptee_job_list, tmp_job_ptr);
}
list_iterator_destroy(preemptee_iterator);
}
list_destroy(cr_job_list);
_free_cr(exp_cr);
FREE_NULL_BITMAP(orig_map);
return rc;
}
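/* Usage sketch (illustrative only, assuming a caller that already holds
 * the appropriate slurmctld locks and a usable availability bitmap): a
 * backfill-style test asks for SELECT_MODE_WILL_RUN and, on success,
 * reads the projected start time from job_ptr->start_time.
 *
 *	List preemptees = NULL;
 *	int rc = select_p_job_test(job_ptr, avail_bitmap,
 *				   min_nodes, max_nodes, req_nodes,
 *				   SELECT_MODE_WILL_RUN,
 *				   preemptee_candidates, &preemptees);
 *	if (rc == SLURM_SUCCESS)
 *		info("job %u could start at %ld",
 *		     job_ptr->job_id, (long) job_ptr->start_time);
 *	if (preemptees)
 *		list_destroy(preemptees);
 *
 * In the live system this path is reached through the generic
 * node_select API (select_g_job_test), not by calling the plugin
 * symbol directly.
 */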
static int _cr_job_list_sort(void *x, void *y)
{
struct job_record *job1_ptr = (struct job_record *) x;
struct job_record *job2_ptr = (struct job_record *) y;
return (int) difftime(job1_ptr->end_time, job2_ptr->end_time);
}
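/* Note: _will_run_test() above sorts its simulated job list with this
 * comparator, e.g. list_sort(cr_job_list, _cr_job_list_sort), so jobs
 * are released in order of ascending end_time and the earliest feasible
 * start time for the pending job is found first. */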
/*
* init() is called when the plugin is loaded, before any other functions
* are called. Put global initialization here.
*/
extern int init ( void )
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
rc = _init_status_pthread();
#endif
cr_type = slurmctld_conf.select_type_param;
return rc;
}
extern int fini ( void )
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
rc = _fini_status_pthread();
#endif
slurm_mutex_lock(&cr_mutex);
_free_cr(cr_ptr);
cr_ptr = NULL;
slurm_mutex_unlock(&cr_mutex);
return rc;
}
/*
* The remainder of this file implements the standard SLURM
* node selection API.
*/
extern int select_p_state_save(char *dir_name)
{
return SLURM_SUCCESS;
}
extern int select_p_state_restore(char *dir_name)
{
return SLURM_SUCCESS;
}
extern int select_p_job_init(List job_list)
{
return SLURM_SUCCESS;
}
extern int select_p_node_init(struct node_record *node_ptr, int node_cnt)
{
if (node_ptr == NULL) {
error("select_p_node_init: node_ptr == NULL");
return SLURM_ERROR;
}
if (node_cnt < 0) {
error("select_p_node_init: node_cnt < 0");
return SLURM_ERROR;
}
/* NOTE: We free the consumable resources info here, but
* can't rebuild it since the partition and node structures
* have not yet had node bitmaps reset. */
slurm_mutex_lock(&cr_mutex);
_free_cr(cr_ptr);
cr_ptr = NULL;
slurm_mutex_unlock(&cr_mutex);
select_node_ptr = node_ptr;
select_node_cnt = node_cnt;
select_fast_schedule = slurm_get_fast_schedule();
return SLURM_SUCCESS;
}
extern int select_p_block_init(List part_list)
{
return SLURM_SUCCESS;
}
/*
* select_p_job_test - Given a specification of scheduling requirements,
* identify the nodes which "best" satisfy the request.
* "best" is defined as either single set of consecutive nodes satisfying
* the request and leaving the minimum number of unused nodes OR
* the fewest number of consecutive node sets
* IN/OUT job_ptr - pointer to job being considered for initiation,
* set's start_time when job expected to start
* IN/OUT bitmap - usable nodes are set on input, nodes not required to
* satisfy the request are cleared, other left set
* IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes
 * IN req_nodes - requested (or desired) count of nodes
* IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
* SELECT_MODE_TEST_ONLY: test if job can ever run
* SELECT_MODE_WILL_RUN: determine when and where job can run
* IN preemptee_candidates - List of pointers to jobs which can be preempted.
* IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
* jobs to be preempted to initiate the pending job. Not set
* if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL.
* RET zero on success, EINVAL otherwise
* globals (passed via select_p_node_init):
* node_record_count - count of nodes configured
* node_record_table_ptr - pointer to global node table
* NOTE: the job information that is considered for scheduling includes:
* req_node_bitmap: bitmap of specific nodes required by the job
* contiguous: allocated nodes must be sequentially located
* num_cpus: minimum number of processors required by the job
 * NOTE: bitmap must be a superset of the job's required nodes at the time that
* select_p_job_test is called
*/
extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, uint16_t mode,
List preemptee_candidates,
List *preemptee_job_list)
{
int max_share = 0, rc = EINVAL;
xassert(bitmap);
if (job_ptr->details == NULL)
return EINVAL;
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL) {
_init_node_cr();
if (cr_ptr == NULL) {
slurm_mutex_unlock(&cr_mutex);
error("select_p_job_test: cr_ptr not initialized");
return SLURM_ERROR;
}
}
if (bit_set_count(bitmap) < min_nodes) {
slurm_mutex_unlock(&cr_mutex);
return EINVAL;
}
if (job_ptr->details->shared)
max_share = job_ptr->part_ptr->max_share & ~SHARED_FORCE;
else /* ((shared == 0) || (shared == (uint16_t) NO_VAL)) */
max_share = 1;
if (mode == SELECT_MODE_WILL_RUN) {
rc = _will_run_test(job_ptr, bitmap, min_nodes, max_nodes,
max_share, req_nodes,
preemptee_candidates, preemptee_job_list);
} else if (mode == SELECT_MODE_TEST_ONLY) {
rc = _test_only(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes, max_share);
} else if (mode == SELECT_MODE_RUN_NOW) {
rc = _run_now(job_ptr, bitmap, min_nodes, max_nodes,
max_share, req_nodes,
preemptee_candidates, preemptee_job_list);
} else
fatal("select_p_job_test: Mode %d is invalid", mode);
slurm_mutex_unlock(&cr_mutex);
return rc;
}
extern int select_p_job_begin(struct job_record *job_ptr)
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
int i;
char clone_path[128];
xassert(job_ptr);
xassert(job_ptr->node_bitmap);
for (i=0; i<select_node_cnt; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
snprintf(clone_path, sizeof(clone_path),
"%s/%s/xcpu/clone", XCPU_DIR,
select_node_ptr[i].name);
if (chown(clone_path, (uid_t)job_ptr->user_id,
(gid_t)job_ptr->group_id)) {
error("chown %s: %m", clone_path);
rc = SLURM_ERROR;
} else {
debug("chown %s to %u", clone_path,
job_ptr->user_id);
}
}
#endif
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_add_job_to_nodes(cr_ptr, job_ptr, "select_p_job_begin", 1);
gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);
slurm_mutex_unlock(&cr_mutex);
return rc;
}
/* Determine if allocated nodes are usable (powered up) */
extern int select_p_job_ready(struct job_record *job_ptr)
{
int i, i_first, i_last;
struct node_record *node_ptr;
if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
/* Gang scheduling might suspend job immediately */
return 0;
}
if ((job_ptr->node_bitmap == NULL) ||
((i_first = bit_ffs(job_ptr->node_bitmap)) == -1))
return READY_NODE_STATE;
i_last = bit_fls(job_ptr->node_bitmap);
for (i = i_first; i <= i_last; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
node_ptr = node_record_table_ptr + i;
if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr))
return 0;
}
return READY_NODE_STATE;
}
extern int select_p_job_resized(struct job_record *job_ptr,
struct node_record *node_ptr)
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
int i = node_ptr - node_record_table_ptr;
char clone_path[128];
	/* Reset ownership of the node's xcpu clone file only if the node
	 * was part of the job's allocation */
	if (bit_test(job_ptr->node_bitmap, i)) {
		snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone",
			 XCPU_DIR, node_ptr->name);
		if (chown(clone_path, (uid_t)0, (gid_t)0)) {
			error("chown %s: %m", clone_path);
			rc = SLURM_ERROR;
		} else
			debug("chown %s to 0", clone_path);
	}
#endif
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_rm_job_from_one_node(job_ptr, node_ptr, "select_p_job_resized");
slurm_mutex_unlock(&cr_mutex);
return rc;
}
extern int select_p_job_fini(struct job_record *job_ptr)
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
int i;
char clone_path[128];
for (i=0; i<select_node_cnt; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone",
XCPU_DIR, select_node_ptr[i].name);
if (chown(clone_path, (uid_t)0, (gid_t)0)) {
error("chown %s: %m", clone_path);
rc = SLURM_ERROR;
} else {
debug("chown %s to 0", clone_path);
}
}
#endif
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_rm_job_from_nodes(cr_ptr, job_ptr, "select_p_job_fini", true);
slurm_mutex_unlock(&cr_mutex);
return rc;
}
extern int select_p_job_suspend(struct job_record *job_ptr)
{
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_rm_job_from_nodes(cr_ptr, job_ptr, "select_p_job_suspend", false);
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}
extern int select_p_job_resume(struct job_record *job_ptr)
{
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_add_job_to_nodes(cr_ptr, job_ptr, "select_p_job_resume", 0);
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}
extern int select_p_pack_select_info(time_t last_query_time,
uint16_t show_flags, Buf *buffer_ptr,
uint16_t protocol_version)
{
/* This function is always invalid on normal Linux clusters */
return SLURM_ERROR;
}
extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo,
Buf buffer,
uint16_t protocol_version)
{
pack16(nodeinfo->alloc_cpus, buffer);
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo,
Buf buffer,
uint16_t protocol_version)
{
select_nodeinfo_t *nodeinfo_ptr = NULL;
nodeinfo_ptr = select_p_select_nodeinfo_alloc(NO_VAL);
*nodeinfo = nodeinfo_ptr;
safe_unpack16(&nodeinfo_ptr->alloc_cpus, buffer);
return SLURM_SUCCESS;
unpack_error:
error("select_nodeinfo_unpack: error unpacking here");
select_p_select_nodeinfo_free(nodeinfo_ptr);
*nodeinfo = NULL;
return SLURM_ERROR;
}
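/* Round-trip sketch (illustrative, not called by the plugin): the pack
 * and unpack routines above are symmetric, so a nodeinfo record can be
 * serialized into a Buf and recovered from it. Buffer helpers
 * (init_buf, set_buf_offset, free_buf) come from src/common/pack.h.
 *
 *	select_nodeinfo_t *in, *out = NULL;
 *	Buf buf = init_buf(64);
 *	in = select_p_select_nodeinfo_alloc(NO_VAL);
 *	in->alloc_cpus = 8;
 *	select_p_select_nodeinfo_pack(in, buf, SLURM_PROTOCOL_VERSION);
 *	set_buf_offset(buf, 0);		(rewind before unpacking)
 *	select_p_select_nodeinfo_unpack(&out, buf, SLURM_PROTOCOL_VERSION);
 *	(out->alloc_cpus is now 8)
 *	select_p_select_nodeinfo_free(in);
 *	select_p_select_nodeinfo_free(out);
 *	free_buf(buf);
 */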
extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(uint32_t size)
{
select_nodeinfo_t *nodeinfo = xmalloc(sizeof(struct select_nodeinfo));
nodeinfo->magic = NODEINFO_MAGIC;
return nodeinfo;
}
extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo)
{
	if (nodeinfo) {
if (nodeinfo->magic != NODEINFO_MAGIC) {
error("select_p_select_nodeinfo_free: "
"nodeinfo magic bad");
return EINVAL;
}
nodeinfo->magic = 0;
xfree(nodeinfo);
}
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_set_all(time_t last_query_time)
{
struct node_record *node_ptr = NULL;
int i=0;
static time_t last_set_all = 0;
	/* Only refresh this data when last_node_update is newer than
	 * the last time we set it up. */
	if (last_set_all && (last_node_update < last_set_all)) {
debug2("Node select info for set all hasn't "
"changed since %ld",
(long)last_set_all);
return SLURM_NO_CHANGE_IN_DATA;
}
last_set_all = last_node_update;
for (i=0; i<node_record_count; i++) {
select_nodeinfo_t *nodeinfo = NULL;
node_ptr = node_record_table_ptr + i;
		/* We have to use the '_g_' call here to make sure we get
		 * the correct data to work on, i.e. Cray systems call this
		 * plugin from within select/cray, which has its own
		 * struct. */
select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
SELECT_NODEDATA_PTR, 0,
(void *)&nodeinfo);
		if (!nodeinfo) {
error("no nodeinfo returned from structure");
continue;
}
if ((node_ptr->node_state & NODE_STATE_COMPLETING) ||
(node_ptr->node_state == NODE_STATE_ALLOCATED)) {
if (slurmctld_conf.fast_schedule)
nodeinfo->alloc_cpus =
node_ptr->config_ptr->cpus;
else
nodeinfo->alloc_cpus = node_ptr->cpus;
} else
nodeinfo->alloc_cpus = 0;
}
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_set(struct job_record *job_ptr)
{
xassert(job_ptr);
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo,
enum select_nodedata_type dinfo,
enum node_states state,
void *data)
{
int rc = SLURM_SUCCESS;
uint16_t *uint16 = (uint16_t *) data;
select_nodeinfo_t **select_nodeinfo = (select_nodeinfo_t **) data;
if (nodeinfo == NULL) {
error("get_nodeinfo: nodeinfo not set");
return SLURM_ERROR;
}
if (nodeinfo->magic != NODEINFO_MAGIC) {
error("get_nodeinfo: nodeinfo magic bad");
return SLURM_ERROR;
}
switch (dinfo) {
case SELECT_NODEDATA_SUBGRP_SIZE:
*uint16 = 0;
break;
case SELECT_NODEDATA_SUBCNT:
if (state == NODE_STATE_ALLOCATED)
*uint16 = nodeinfo->alloc_cpus;
else
*uint16 = 0;
break;
case SELECT_NODEDATA_PTR:
*select_nodeinfo = nodeinfo;
break;
default:
error("Unsupported option %d for get_nodeinfo.", dinfo);
rc = SLURM_ERROR;
break;
}
return rc;
}
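/* Query sketch (illustrative): select_p_select_nodeinfo_set_all() above
 * fills in alloc_cpus, and a caller can read it back through the
 * SELECT_NODEDATA_SUBCNT case of select_p_select_nodeinfo_get(), which
 * only reports a non-zero count for NODE_STATE_ALLOCATED.
 *
 *	uint16_t alloc_cpus = 0;
 *	if (select_p_select_nodeinfo_get(nodeinfo,
 *					 SELECT_NODEDATA_SUBCNT,
 *					 NODE_STATE_ALLOCATED,
 *					 &alloc_cpus) == SLURM_SUCCESS)
 *		debug2("node has %u allocated CPUs", alloc_cpus);
 *
 * In slurmctld this is normally reached through the generic
 * select_g_select_nodeinfo_get() wrapper rather than by calling the
 * plugin symbol directly.
 */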
extern select_jobinfo_t *select_p_select_jobinfo_alloc(void)
{
	return NULL;
}
extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo,
enum select_jobdata_type data_type,
void *data)
{
return SLURM_SUCCESS;
}
extern int select_p_select_jobinfo_get (select_jobinfo_t *jobinfo,
enum select_jobdata_type data_type,
void *data)
{
return SLURM_ERROR;
}
extern select_jobinfo_t *select_p_select_jobinfo_copy(
select_jobinfo_t *jobinfo)
{
return NULL;
}
extern int select_p_select_jobinfo_free (select_jobinfo_t *jobinfo)
{
return SLURM_SUCCESS;
}
extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
uint16_t protocol_version)
{
return SLURM_SUCCESS;
}
extern int select_p_select_jobinfo_unpack(select_jobinfo_t **jobinfo,
Buf buffer,
uint16_t protocol_version)
{
return SLURM_SUCCESS;
}
extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo,
char *buf, size_t size, int mode)
{
if (buf && size) {
buf[0] = '\0';
return buf;
} else
return NULL;
}
extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo,
int mode)
{
return NULL;
}
extern int select_p_update_block (update_part_msg_t *part_desc_ptr)
{
return SLURM_SUCCESS;
}
extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr)
{
return SLURM_SUCCESS;
}
extern int select_p_get_info_from_plugin (enum select_jobdata_type info,
struct job_record *job_ptr,
void *data)
{
return SLURM_SUCCESS;
}
extern int select_p_update_node_config (int index)
{
return SLURM_SUCCESS;
}
extern int select_p_update_node_state (int index, uint16_t state)
{
return SLURM_SUCCESS;
}
extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data)
{
return SLURM_SUCCESS;
}
extern int select_p_reconfigure(void)
{
slurm_mutex_lock(&cr_mutex);
_free_cr(cr_ptr);
cr_ptr = NULL;
_init_node_cr();
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}