/*****************************************************************************
* gang.c - Gang scheduler functions.
*****************************************************************************
* Copyright (C) 2008 Hewlett-Packard Development Company, L.P.
* Copyright (C) SchedMD LLC.
* Written by Chris Holmes
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
/*
 * Gang scheduler plugin for Slurm
 */
#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include "./gang.h"
#include "slurm/slurm.h"
#include "src/common/bitstring.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/xstring.h"
#include "src/interfaces/preempt.h"
#include "src/interfaces/select.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"
/* global timeslicer thread variables */
static bool thread_running = false;
static bool thread_shutdown = false;
static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t timeslicer_thread_id = (pthread_t) 0;
static list_t *preempt_job_list = NULL;
/* timeslicer flags and structures */
enum entity_type {
GS_NODE,
GS_SOCKET,
GS_CORE,
GS_CPU, /* Without task affinity */
GS_CPU2 /* With task affinity */
};
enum gs_flags {
GS_SUSPEND,
GS_RESUME,
GS_NO_PART,
GS_SUCCESS,
GS_ACTIVE,
GS_NO_ACTIVE,
GS_FILLER
};
struct gs_job {
uint32_t job_id;
job_record_t *job_ptr;
uint16_t sig_state;
uint16_t row_state;
};
struct gs_part {
char *part_name;
uint16_t priority; /* Job priority tier */
uint32_t num_jobs;
struct gs_job **job_list;
uint32_t job_list_size;
uint32_t num_shadows;
struct gs_job **shadow; /* see '"Shadow" Design' below */
uint32_t shadow_size;
uint32_t jobs_active;
bitstr_t *active_resmap;
uint16_t *active_cpus;
uint16_t array_size;
struct gs_part *next;
};
/******************************************
*
* SUMMARY OF DATA MANAGEMENT
*
* For GS_CORE: job_ptr->job_resrcs->{node,core}_bitmap
* For GS_CPU: job_ptr->job_resrcs->{node_bitmap, cpus}
* For GS_CPU2: job_ptr->job_resrcs->{node,core}_bitmap
* For GS_SOCKET: job_ptr->job_resrcs->{node,core}_bitmap
* For GS_NODE: job_ptr->job_resrcs->node_bitmap only
*
* EVALUATION ALGORITHM
*
 * For GS_NODE, GS_SOCKET, GS_CORE, and GS_CPU2 the bits CANNOT conflict.
 * For GS_CPU: if bits conflict, make sure the sum of CPUs per
 *             resource does not exceed the physical resource count
*
*
 * The core_bitmap and the cpus array contain allocated values
* ONLY. For every bit set in node_bitmap, there is a corresponding
* element in cpus and a set of elements in the core_bitmap.
*
******************************************
*
* "Shadow" Design to support Preemption
*
* Jobs in higher priority partitions "cast shadows" on the active
* rows of lower priority partitions. The effect is that jobs that
* are "caught" in these shadows are preempted (suspended)
* indefinitely until the "shadow" disappears. When constructing
* the active row of a partition, any jobs in the 'shadow' array
* are applied first.
*
******************************************
*/
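/*
 * Illustrative example of the shadow design (hypothetical partitions):
 *
 * Partition "hi" has PriorityTier=2, partition "lo" has PriorityTier=1,
 * and both span the same nodes. When a job starts in "hi" it is added
 * as a shadow to "lo" (see _cast_shadow()). When "lo" next rebuilds its
 * active row, the shadow's resources are applied first, so any "lo" job
 * that overlaps them fails _job_fits_in_active_row() and is suspended
 * until the "hi" job finishes and _clear_shadow() removes the shadow.
 */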
/* global variables */
static uint32_t timeslicer_seconds = 0;
static uint16_t gr_type = GS_NODE;
static list_t *gs_part_list = NULL;
static uint32_t default_job_list_size = 64;
static pthread_mutex_t data_mutex = PTHREAD_MUTEX_INITIALIZER;
static uint32_t num_sorted_part = 0;
/* function declarations */
static void *_timeslicer_thread(void *arg);
static char *_print_flag(int flag)
{
switch (flag) {
case GS_SUSPEND:
return "GS_SUSPEND";
case GS_RESUME:
return "GS_RESUME";
case GS_NO_PART:
return "GS_NO_PART";
case GS_SUCCESS:
return "GS_SUCCESS";
case GS_ACTIVE:
return "GS_ACTIVE";
case GS_NO_ACTIVE:
return "GS_NO_ACTIVE";
case GS_FILLER:
return "GS_FILLER";
default:
return "unknown";
}
}
static void _print_jobs(struct gs_part *p_ptr)
{
int i;
if (slurm_conf.debug_flags & DEBUG_FLAG_GANG) {
info("gang: part %s has %u jobs, %u shadows:",
p_ptr->part_name, p_ptr->num_jobs, p_ptr->num_shadows);
for (i = 0; i < p_ptr->num_shadows; i++) {
info("gang: shadow %pJ row_s %s, sig_s %s",
p_ptr->shadow[i]->job_ptr,
_print_flag(p_ptr->shadow[i]->row_state),
_print_flag(p_ptr->shadow[i]->sig_state));
}
for (i = 0; i < p_ptr->num_jobs; i++) {
info("gang: %pJ row_s %s, sig_s %s",
p_ptr->job_list[i]->job_ptr,
_print_flag(p_ptr->job_list[i]->row_state),
_print_flag(p_ptr->job_list[i]->sig_state));
}
if (p_ptr->active_resmap) {
int s = bit_size(p_ptr->active_resmap);
i = bit_set_count(p_ptr->active_resmap);
info("gang: active resmap has %d of %d bits set",
i, s);
}
}
}
static uint16_t _get_gr_type(void)
{
if (slurm_conf.select_type_param & SELECT_CORE)
return GS_CORE;
if (slurm_conf.select_type_param & SELECT_CPU) {
if (!xstrcmp(slurm_conf.task_plugin, "task/none"))
return GS_CPU;
return GS_CPU2;
}
if (slurm_conf.select_type_param & SELECT_SOCKET)
return GS_SOCKET;
/* note that SELECT_MEMORY is node-level scheduling with
* memory management */
return GS_NODE;
}
static uint16_t _get_part_gr_type(part_record_t *part_ptr)
{
if (part_ptr) {
if (part_ptr->cr_type & SELECT_CORE)
return GS_CORE;
if (part_ptr->cr_type & SELECT_CPU) {
if (!xstrcmp(slurm_conf.task_plugin, "task/none"))
return GS_CPU;
return GS_CPU2;
}
if (part_ptr->cr_type & SELECT_SOCKET)
return GS_SOCKET;
}
/* Use global configuration */
return gr_type;
}
static uint16_t _get_phys_bit_cnt(int node_index)
{
node_record_t *node_ptr = node_record_table_ptr[node_index];
if (gr_type == GS_CPU)
return node_ptr->cpus;
return node_ptr->tot_cores;
}
static uint16_t _get_socket_cnt(int node_index)
{
node_record_t *node_ptr = node_record_table_ptr[node_index];
return node_ptr->config_ptr->tot_sockets;
}
static void _destroy_parts(void *x)
{
int i;
struct gs_part *gs_part_ptr = (struct gs_part *) x;
xfree(gs_part_ptr->part_name);
for (i = 0; i < gs_part_ptr->num_jobs; i++)
xfree(gs_part_ptr->job_list[i]);
xfree(gs_part_ptr->shadow);
FREE_NULL_BITMAP(gs_part_ptr->active_resmap);
xfree(gs_part_ptr->active_cpus);
xfree(gs_part_ptr->job_list);
xfree(gs_part_ptr);
}
/* Build the gs_part_list. The job_list will be created later,
* once a job is added. */
static void _build_parts(void)
{
list_itr_t *part_iterator;
part_record_t *p_ptr;
struct gs_part *gs_part_ptr;
int num_parts;
FREE_NULL_LIST(gs_part_list);
/* reset the sorted list, since it's currently
* pointing to partitions we just destroyed */
num_sorted_part = 0;
num_parts = list_count(part_list);
if (num_parts == 0)
return;
gs_part_list = list_create(_destroy_parts);
part_iterator = list_iterator_create(part_list);
while ((p_ptr = list_next(part_iterator))) {
gs_part_ptr = xmalloc(sizeof(struct gs_part));
gs_part_ptr->part_name = xstrdup(p_ptr->name);
gs_part_ptr->priority = p_ptr->priority_tier;
/* everything else is already set to zero/NULL */
list_append(gs_part_list, gs_part_ptr);
}
list_iterator_destroy(part_iterator);
}
/* Find the gs_part entity with the given name */
static int _find_gs_part(void *x, void *key)
{
struct gs_part *gs_part_ptr = (struct gs_part *) x;
char *name = (char *) key;
if (!xstrcmp(name, gs_part_ptr->part_name))
return 1;
return 0;
}
/* Find the job_list index of the given job_id in the given partition */
static int _find_job_index(struct gs_part *p_ptr, uint32_t job_id)
{
int i;
for (i = 0; i < p_ptr->num_jobs; i++) {
if (p_ptr->job_list[i]->job_ptr->job_id == job_id)
return i;
}
return -1;
}
/* Return 1 if job "cpu count" fits in this row, else return 0 */
static int _can_cpus_fit(job_record_t *job_ptr, struct gs_part *p_ptr)
{
uint16_t *p_cpus, *j_cpus;
job_resources_t *job_res = job_ptr->job_resrcs;
if (gr_type != GS_CPU)
return 0;
p_cpus = p_ptr->active_cpus;
j_cpus = job_res->cpus;
if (!p_cpus || !j_cpus)
return 0;
for (int j = 0, i = 0; next_node_bitmap(job_res->node_bitmap, &i);
i++) {
if (p_cpus[i] + j_cpus[j] > _get_phys_bit_cnt(i))
return 0;
j++;
}
return 1;
}
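/*
 * Worked example for _can_cpus_fit() (hypothetical numbers): a node
 * offers 8 CPUs (_get_phys_bit_cnt(i) == 8), the active row already
 * holds 5 CPUs on it (p_cpus[i] == 5), and the candidate job wants 4
 * CPUs there (j_cpus[j] == 4). Since 5 + 4 > 8 the job does not fit
 * and 0 is returned; a request of 3 CPUs or fewer would fit.
 */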
/* Return 1 if job fits in this row, else return 0 */
static int _job_fits_in_active_row(job_record_t *job_ptr,
struct gs_part *p_ptr)
{
job_resources_t *job_res = job_ptr->job_resrcs;
int count;
bitstr_t *job_map;
uint16_t job_gr_type;
if ((p_ptr->active_resmap == NULL) || (p_ptr->jobs_active == 0))
return 1;
job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
(job_gr_type == GS_SOCKET)) {
return job_fits_into_cores(job_res, p_ptr->active_resmap);
}
/* job_gr_type == GS_NODE || job_gr_type == GS_CPU */
job_map = bit_copy(job_res->node_bitmap);
bit_and(job_map, p_ptr->active_resmap);
/* any set bits indicate contention for the same resource */
count = bit_set_count(job_map);
log_flag(GANG, "gang: %s: %d bits conflict", __func__, count);
FREE_NULL_BITMAP(job_map);
if (count == 0)
return 1;
if (job_gr_type == GS_CPU) {
/* For GS_CPU we check the CPU arrays */
return _can_cpus_fit(job_ptr, p_ptr);
}
return 0;
}
/* A helper function for _add_job_to_active() for GS_SOCKET:
 * a job has just been added to p_ptr->active_resmap, so set all cores of
 * each used socket to avoid activating another job on the same socket */
static void _fill_sockets(bitstr_t *job_nodemap, struct gs_part *p_ptr)
{
uint32_t c, i;
int n, first_bit, last_bit;
if (!job_nodemap || !p_ptr || !p_ptr->active_resmap)
return;
first_bit = bit_ffs(job_nodemap);
last_bit = bit_fls(job_nodemap);
if ((first_bit < 0) || (last_bit < 0))
fatal("gang: _afill_sockets: nodeless job?");
for (c = 0, n = 0; n < first_bit; n++) {
c += _get_phys_bit_cnt(n);
}
for (n = first_bit; n <= last_bit; n++) {
uint16_t s, socks, cps, cores_per_node;
cores_per_node = _get_phys_bit_cnt(n);
if (bit_test(job_nodemap, n) == 0) {
c += cores_per_node;
continue;
}
socks = _get_socket_cnt(n);
cps = cores_per_node / socks;
for (s = 0; s < socks; s++) {
for (i = c; i < c+cps; i++) {
if (bit_test(p_ptr->active_resmap, i))
break;
}
if (i < c+cps) {
/* set all bits on this used socket */
bit_nset(p_ptr->active_resmap, c, c+cps-1);
}
c += cps;
}
}
}
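/*
 * Illustrative core layout for _fill_sockets() (hypothetical hardware):
 * a node with 2 sockets of 4 cores each occupies 8 consecutive bits in
 * active_resmap, starting at offset c. If the new job set only bit c+1
 * (socket 0, core 1), the loops above set bits c..c+3 so the whole
 * socket is treated as busy, while bits c+4..c+7 (socket 1) remain
 * available to other jobs.
 */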
/* Add the given job to the "active" structures of
* the given partition and increment the run count */
static void _add_job_to_active(job_record_t *job_ptr, struct gs_part *p_ptr)
{
job_resources_t *job_res = job_ptr->job_resrcs;
uint16_t job_gr_type;
/* add job to active_resmap */
job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
(job_gr_type == GS_SOCKET)) {
if (p_ptr->jobs_active == 0 && p_ptr->active_resmap)
bit_clear_all(p_ptr->active_resmap);
add_job_to_cores(job_res, &(p_ptr->active_resmap));
if (job_gr_type == GS_SOCKET)
_fill_sockets(job_res->node_bitmap, p_ptr);
} else { /* GS_NODE or GS_CPU */
if (!p_ptr->active_resmap) {
log_flag(GANG, "gang: %s: %pJ first",
__func__, job_ptr);
p_ptr->active_resmap = bit_copy(job_res->node_bitmap);
} else if (p_ptr->jobs_active == 0) {
log_flag(GANG, "gang: %s: %pJ copied",
__func__, job_ptr);
bit_copybits(p_ptr->active_resmap,
job_res->node_bitmap);
} else {
log_flag(GANG, "gang: %s: adding %pJ",
__func__, job_ptr);
bit_or(p_ptr->active_resmap, job_res->node_bitmap);
}
}
/* add job to the active_cpus array */
if (job_gr_type == GS_CPU) {
uint32_t i, a, sz = bit_size(p_ptr->active_resmap);
if (!p_ptr->active_cpus) {
/* create active_cpus array */
p_ptr->active_cpus = xmalloc(sz * sizeof(uint16_t));
}
if (p_ptr->jobs_active == 0) {
/* overwrite the existing values in active_cpus */
for (a = 0, i = 0; i < sz; i++) {
if (bit_test(job_res->node_bitmap, i)) {
p_ptr->active_cpus[i] =
job_res->cpus[a++];
} else {
p_ptr->active_cpus[i] = 0;
}
}
} else {
/* add job to existing jobs in the active cpus */
for (a = 0, i = 0; i < sz; i++) {
if (bit_test(job_res->node_bitmap, i)) {
uint16_t limit = _get_phys_bit_cnt(i);
p_ptr->active_cpus[i] +=
job_res->cpus[a++];
/* when adding shadows, the resources
* may get overcommitted */
if (p_ptr->active_cpus[i] > limit)
p_ptr->active_cpus[i] = limit;
}
}
}
}
p_ptr->jobs_active += 1;
}
static int _suspend_job(job_record_t *job_ptr)
{
int rc;
suspend_msg_t msg;
memset(&msg, 0, sizeof(msg));
msg.job_id = job_ptr->job_id;
msg.job_id_str = NULL;
msg.op = SUSPEND_JOB;
rc = job_suspend(NULL, &msg, 0, false, NO_VAL16);
/* job_suspend() returns ESLURM_DISABLED if job is already suspended */
if (rc == SLURM_SUCCESS) {
if (slurm_conf.debug_flags & DEBUG_FLAG_GANG)
info("gang: suspending %pJ", job_ptr);
else
debug("gang: suspending %pJ", job_ptr);
} else if (rc != ESLURM_DISABLED) {
info("gang: suspending %pJ: %s", job_ptr, slurm_strerror(rc));
}
return rc;
}
static void _resume_job(job_record_t *job_ptr)
{
int rc;
suspend_msg_t msg;
memset(&msg, 0, sizeof(msg));
msg.job_id = job_ptr->job_id;
msg.job_id_str = NULL;
msg.op = RESUME_JOB;
rc = job_suspend(NULL, &msg, 0, false, NO_VAL16);
if (rc == SLURM_SUCCESS) {
if (slurm_conf.debug_flags & DEBUG_FLAG_GANG)
info("gang: resuming %pJ", job_ptr);
else
debug("gang: resuming %pJ", job_ptr);
} else if (rc != ESLURM_ALREADY_DONE) {
error("gang: resuming %pJ: %s", job_ptr, slurm_strerror(rc));
}
}
static void _preempt_job_queue(uint32_t job_id)
{
uint32_t *tmp_id = xmalloc(sizeof(uint32_t));
*tmp_id = job_id;
list_append(preempt_job_list, tmp_id);
}
static void _preempt_job_dequeue(void)
{
job_record_t *job_ptr;
uint32_t job_id, *tmp_id;
uint16_t preempt_mode;
xassert(preempt_job_list);
while ((tmp_id = list_pop(preempt_job_list))) {
int rc = SLURM_ERROR;
job_id = *tmp_id;
xfree(tmp_id);
if ((job_ptr = find_job_record(job_id)) == NULL) {
error("%s could not find JobId=%u",
__func__, job_id);
continue;
}
preempt_mode = slurm_job_preempt_mode(job_ptr);
if (preempt_mode == PREEMPT_MODE_SUSPEND) {
if ((rc = _suspend_job(job_ptr)) == ESLURM_DISABLED)
rc = SLURM_SUCCESS;
} else if (preempt_mode == PREEMPT_MODE_CANCEL) {
rc = job_signal(job_ptr, SIGKILL, 0, 0, true);
if (rc == SLURM_SUCCESS) {
info("preempted %pJ has been killed", job_ptr);
}
} else if ((preempt_mode == PREEMPT_MODE_REQUEUE) &&
job_ptr->batch_flag && job_ptr->details &&
(job_ptr->details->requeue > 0)) {
rc = job_requeue(0, job_ptr->job_id, NULL, true, 0);
if (rc == SLURM_SUCCESS) {
info("preempted %pJ has been requeued",
job_ptr);
} else
error("preempted %pJ could not be requeued: %s",
job_ptr, slurm_strerror(rc));
} else if (preempt_mode == PREEMPT_MODE_OFF) {
error("Invalid preempt_mode %u for %pJ",
preempt_mode, job_ptr);
continue;
}
if (rc != SLURM_SUCCESS) {
rc = job_signal(job_ptr, SIGKILL, 0, 0, true);
if (rc == SLURM_SUCCESS)
info("%s: preempted %pJ had to be killed",
				     __func__, job_ptr);
else {
info("%s: preempted %pJ kill failure %s",
__func__, job_ptr, slurm_strerror(rc));
}
}
}
}
/* Sort in the reverse of the order defined by list.h so as to generate
 * a list in descending order rather than ascending order */
static int _sort_partitions(void *part1, void *part2)
{
struct gs_part *g1;
struct gs_part *g2;
g1 = *(struct gs_part **)part1;
g2 = *(struct gs_part **)part2;
return slurm_sort_uint16_list_desc(&g1->priority, &g2->priority);
}
/* Scan the partition list. Add the given job as a "shadow" to every
* partition with a lower priority than the given partition */
static void _cast_shadow(struct gs_job *j_ptr, uint16_t priority)
{
list_itr_t *part_iterator;
struct gs_part *p_ptr;
int i;
part_iterator = list_iterator_create(gs_part_list);
while ((p_ptr = list_next(part_iterator))) {
if (p_ptr->priority >= priority)
continue;
/* This partition has a lower priority, so add
* the job as a "Shadow" */
if (!p_ptr->shadow) {
p_ptr->shadow_size = default_job_list_size;
p_ptr->shadow = xmalloc(p_ptr->shadow_size *
sizeof(struct gs_job *));
/* 'shadow' is initialized to be NULL filled */
} else {
/* does this shadow already exist? */
for (i = 0; i < p_ptr->num_shadows; i++) {
if (p_ptr->shadow[i] == j_ptr)
break;
}
if (i < p_ptr->num_shadows)
continue;
}
		if ((p_ptr->num_shadows + 1) >= p_ptr->shadow_size) {
p_ptr->shadow_size *= 2;
xrealloc(p_ptr->shadow, p_ptr->shadow_size *
sizeof(struct gs_job *));
}
p_ptr->shadow[p_ptr->num_shadows++] = j_ptr;
}
list_iterator_destroy(part_iterator);
}
/* Remove the given job as a "shadow" from all partitions */
static void _clear_shadow(struct gs_job *j_ptr)
{
list_itr_t *part_iterator;
struct gs_part *p_ptr;
int i;
part_iterator = list_iterator_create(gs_part_list);
while ((p_ptr = list_next(part_iterator))) {
if (!p_ptr->shadow)
continue;
for (i = 0; i < p_ptr->num_shadows; i++) {
if (p_ptr->shadow[i] == j_ptr)
break;
}
if (i >= p_ptr->num_shadows)
/* job not found */
continue;
p_ptr->num_shadows--;
/* shift all other jobs down */
for (; i < p_ptr->num_shadows; i++)
p_ptr->shadow[i] = p_ptr->shadow[i+1];
p_ptr->shadow[p_ptr->num_shadows] = NULL;
}
list_iterator_destroy(part_iterator);
}
/* Rebuild the active row BUT preserve the order of existing jobs.
* This is called after one or more jobs have been removed from
* the partition or if a higher priority "shadow" has been added
* which could preempt running jobs.
*/
static void _update_active_row(struct gs_part *p_ptr, int add_new_jobs)
{
int i;
struct gs_job *j_ptr;
uint16_t preempt_mode;
log_flag(GANG, "gang: update_active_row: rebuilding part %s...",
p_ptr->part_name);
/* rebuild the active row, starting with any shadows */
p_ptr->jobs_active = 0;
for (i = 0; p_ptr->shadow && p_ptr->shadow[i]; i++) {
_add_job_to_active(p_ptr->shadow[i]->job_ptr, p_ptr);
}
/* attempt to add the existing 'active' jobs */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if (j_ptr->row_state != GS_ACTIVE)
continue;
if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
_add_job_to_active(j_ptr->job_ptr, p_ptr);
_cast_shadow(j_ptr, p_ptr->priority);
} else {
/* this job has been preempted by a shadow job.
			 * suspend it and preserve its job_list order */
if (j_ptr->sig_state != GS_SUSPEND) {
preempt_mode =
slurm_job_preempt_mode(j_ptr->job_ptr);
if (p_ptr->num_shadows &&
(preempt_mode != PREEMPT_MODE_OFF) &&
(preempt_mode != PREEMPT_MODE_SUSPEND)) {
_preempt_job_queue(j_ptr->job_id);
} else
_suspend_job(j_ptr->job_ptr);
j_ptr->sig_state = GS_SUSPEND;
_clear_shadow(j_ptr);
}
j_ptr->row_state = GS_NO_ACTIVE;
}
}
/* attempt to add the existing 'filler' jobs */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if (j_ptr->row_state != GS_FILLER)
continue;
if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
_add_job_to_active(j_ptr->job_ptr, p_ptr);
_cast_shadow(j_ptr, p_ptr->priority);
} else {
/* this job has been preempted by a shadow job.
			 * suspend it and preserve its job_list order */
if (j_ptr->sig_state != GS_SUSPEND) {
preempt_mode =
slurm_job_preempt_mode(j_ptr->job_ptr);
if (p_ptr->num_shadows &&
(preempt_mode != PREEMPT_MODE_OFF) &&
(preempt_mode != PREEMPT_MODE_SUSPEND)) {
_preempt_job_queue(j_ptr->job_id);
} else
_suspend_job(j_ptr->job_ptr);
j_ptr->sig_state = GS_SUSPEND;
_clear_shadow(j_ptr);
}
j_ptr->row_state = GS_NO_ACTIVE;
}
}
if (!add_new_jobs)
return;
/* attempt to add any new jobs */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if ((j_ptr->row_state != GS_NO_ACTIVE) ||
(j_ptr->job_ptr->priority == 0))
continue;
if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
_add_job_to_active(j_ptr->job_ptr, p_ptr);
_cast_shadow(j_ptr, p_ptr->priority);
/* note that this job is a "filler" for this row,
* blocked by a higher priority job */
j_ptr->row_state = GS_FILLER;
/* resume the job */
if (j_ptr->sig_state == GS_SUSPEND) {
_resume_job(j_ptr->job_ptr);
j_ptr->sig_state = GS_RESUME;
}
}
}
}
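/*
 * Example of a "filler" job (hypothetical): jobs A and B are
 * timeslicing on the same nodes when job C arrives on nodes neither of
 * them uses. C fits into the current active row, so it is started
 * immediately with row_state GS_FILLER rather than waiting for the next
 * rotation; _cycle_job_list() later folds it into the normal rotation
 * by resetting GS_FILLER to GS_NO_ACTIVE.
 */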
/* rebuild all active rows without reordering jobs:
* - attempt to preserve running jobs
* - suspend any jobs that have been "shadowed" (preempted)
* - resume any "filler" jobs that can be found
*/
static void _update_all_active_rows(void)
{
list_itr_t *part_iterator;
struct gs_part *p_ptr;
/* Sort the partitions. This way the shadows of any high-priority
* jobs are appropriately adjusted before the lower priority
* partitions are updated */
list_sort(gs_part_list, _sort_partitions);
part_iterator = list_iterator_create(gs_part_list);
while ((p_ptr = list_next(part_iterator)))
_update_active_row(p_ptr, 1);
list_iterator_destroy(part_iterator);
}
/* Remove the given job from the given partition
 * IN job_id - job to remove
 * IN p_ptr - GS partition structure
 * IN fini - true if the job is finished (i.e. not to be resumed)
 */
static void _remove_job_from_part(uint32_t job_id, struct gs_part *p_ptr,
bool fini)
{
int i;
struct gs_job *j_ptr;
if (!job_id || !p_ptr)
return;
/* find the job in the job_list */
i = _find_job_index(p_ptr, job_id);
if (i < 0)
/* job not found */
return;
j_ptr = p_ptr->job_list[i];
log_flag(GANG, "gang: %s: removing %pJ from %s",
__func__, j_ptr->job_ptr, p_ptr->part_name);
/* remove any shadow first */
_clear_shadow(j_ptr);
/* remove the job from the job_list by shifting everyone else down */
p_ptr->num_jobs--;
for (; i < p_ptr->num_jobs; i++) {
p_ptr->job_list[i] = p_ptr->job_list[i+1];
}
p_ptr->job_list[i] = NULL;
/* make sure the job is not suspended by gang, and then delete it */
if (!fini && (j_ptr->sig_state == GS_SUSPEND) &&
j_ptr->job_ptr->priority) {
log_flag(GANG, "gang: %s: resuming suspended %pJ",
__func__, j_ptr->job_ptr);
_resume_job(j_ptr->job_ptr);
}
j_ptr->job_ptr = NULL;
xfree(j_ptr);
}
/* Add the given job to the given partition, and if it remains running
* then "cast it's shadow" over the active row of any partition with a
* lower priority than the given partition. Return the sig state of the
* job (GS_SUSPEND or GS_RESUME) */
static uint16_t _add_job_to_part(struct gs_part *p_ptr, job_record_t *job_ptr)
{
int i;
struct gs_job *j_ptr;
uint16_t preempt_mode;
xassert(p_ptr);
xassert(job_ptr->job_id > 0);
xassert(job_ptr->job_resrcs);
xassert(job_ptr->job_resrcs->node_bitmap);
xassert(job_ptr->job_resrcs->core_bitmap);
log_flag(GANG, "gang: %s: adding %pJ to %s",
__func__, job_ptr, p_ptr->part_name);
/* take care of any memory needs */
if (!p_ptr->job_list) {
p_ptr->job_list_size = default_job_list_size;
p_ptr->job_list = xmalloc(p_ptr->job_list_size *
sizeof(struct gs_job *));
/* job_list is initialized to be NULL filled */
}
/* protect against duplicates */
i = _find_job_index(p_ptr, job_ptr->job_id);
if (i >= 0) {
/* This job already exists, but the resource allocation
* may have changed. In any case, remove the existing
* job before adding this new one.
*/
log_flag(GANG, "gang: %s: duplicate %pJ detected",
__func__, job_ptr);
_remove_job_from_part(job_ptr->job_id, p_ptr, false);
_update_active_row(p_ptr, 0);
}
/* more memory management */
if ((p_ptr->num_jobs + 1) == p_ptr->job_list_size) {
p_ptr->job_list_size *= 2;
xrealloc(p_ptr->job_list, p_ptr->job_list_size *
sizeof(struct gs_job *));
/* enlarged job_list is initialized to be NULL filled */
}
j_ptr = xmalloc(sizeof(struct gs_job));
/* gather job info */
j_ptr->job_id = job_ptr->job_id;
j_ptr->job_ptr = job_ptr;
j_ptr->sig_state = GS_RESUME; /* all jobs are running initially */
j_ptr->row_state = GS_NO_ACTIVE; /* job is not in the active row */
/* append this job to the job_list */
p_ptr->job_list[p_ptr->num_jobs++] = j_ptr;
/* determine the immediate fate of this job (run or suspend) */
if (!IS_JOB_SUSPENDED(job_ptr) &&
_job_fits_in_active_row(job_ptr, p_ptr)) {
log_flag(GANG, "gang: %s: %pJ remains running",
__func__, job_ptr);
_add_job_to_active(job_ptr, p_ptr);
/* note that this job is a "filler" for this row */
j_ptr->row_state = GS_FILLER;
/* all jobs begin in the run state, so
* there's no need to signal this job */
		/* since this job is running we need to "cast its shadow"
* over lower priority partitions */
_cast_shadow(j_ptr, p_ptr->priority);
} else {
log_flag(GANG, "gang: %s: suspending %pJ",
__func__, job_ptr);
preempt_mode = slurm_job_preempt_mode(job_ptr);
if (p_ptr->num_shadows &&
(preempt_mode != PREEMPT_MODE_OFF) &&
(preempt_mode != PREEMPT_MODE_SUSPEND)) {
_preempt_job_queue(job_ptr->job_id);
} else
_suspend_job(job_ptr);
j_ptr->sig_state = GS_SUSPEND;
}
_print_jobs(p_ptr);
return j_ptr->sig_state;
}
/* Ensure that all jobs running in Slurm are accounted for.
 * This procedure assumes that the gs data has already been
 * locked by the caller!
*/
static void _scan_slurm_job_list(void)
{
job_record_t *job_ptr;
struct gs_part *p_ptr;
int i;
list_itr_t *job_iterator;
char *part_name;
if (!job_list) { /* no jobs */
log_flag(GANG, "gang: %s: job_list NULL", __func__);
return;
}
log_flag(GANG, "gang: %s: job_list exists...", __func__);
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
log_flag(GANG, "gang: %s: checking %pJ",
__func__, job_ptr);
/* Exclude HetJobs from gang operation. */
if (job_ptr->het_job_id)
continue;
if (IS_JOB_PENDING(job_ptr))
continue;
if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority == 0))
continue; /* not suspended by gang */
if (job_ptr->part_ptr && job_ptr->part_ptr->name)
part_name = job_ptr->part_ptr->name;
else
part_name = job_ptr->partition;
if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) {
/* are we tracking this job already? */
p_ptr = list_find_first(gs_part_list, _find_gs_part,
part_name);
if (!p_ptr) /* no partition */
continue;
i = _find_job_index(p_ptr, job_ptr->job_id);
if (i >= 0) /* we're tracking it, so continue */
continue;
/* We're not tracking this job. Resume it if it's
* suspended, and then add it to the job list. */
_add_job_to_part(p_ptr, job_ptr);
continue;
}
/* if the job is not pending, suspended, or running, then
* it's completing or completed. Make sure we've released
* this job */
p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name);
if (!p_ptr) /* no partition */
continue;
_remove_job_from_part(job_ptr->job_id, p_ptr, false);
}
list_iterator_destroy(job_iterator);
/* now that all of the old jobs have been flushed out,
* update the active row of all partitions */
_update_all_active_rows();
}
/****************************
* Slurm Timeslicer Hooks
*
* Here is a summary of the primary activities that occur
* within this plugin:
*
* gs_init: initialize plugin
*
* gs_job_start: a new allocation has been created
* gs_job_fini: an existing allocation has been cleared
* gs_reconfig: refresh partition and job data
* _cycle_job_list: timeslicer thread is rotating jobs
*
* gs_fini: terminate plugin
*
***************************/
static void _spawn_timeslicer_thread(void)
{
slurm_mutex_lock( &thread_flag_mutex );
if (thread_running) {
error("timeslicer thread already running, not starting "
"another");
slurm_mutex_unlock(&thread_flag_mutex);
return;
}
slurm_thread_create(&timeslicer_thread_id, _timeslicer_thread, NULL);
thread_running = true;
slurm_mutex_unlock(&thread_flag_mutex);
}
/* Initialize data structures and start the gang scheduling thread */
extern void gs_init(void)
{
if (!(slurm_conf.preempt_mode & PREEMPT_MODE_GANG))
return;
if (timeslicer_thread_id)
return;
/* initialize global variables */
log_flag(GANG, "gang: entering gs_init");
timeslicer_seconds = slurm_conf.sched_time_slice;
gr_type = _get_gr_type();
preempt_job_list = list_create(xfree_ptr);
slurm_mutex_lock(&data_mutex);
_build_parts();
/* load any currently running jobs */
_scan_slurm_job_list();
slurm_mutex_unlock(&data_mutex);
/* spawn the timeslicer thread */
_spawn_timeslicer_thread();
log_flag(GANG, "gang: leaving gs_init");
}
/* Terminate the gang scheduling thread and free its data structures */
extern void gs_fini(void)
{
/* terminate the timeslicer thread */
log_flag(GANG, "gang: entering gs_fini");
slurm_mutex_lock(&thread_flag_mutex);
if (thread_running) {
slurm_mutex_lock(&term_lock);
thread_shutdown = true;
slurm_cond_signal(&term_cond);
slurm_mutex_unlock(&term_lock);
slurm_mutex_unlock(&thread_flag_mutex);
usleep(120000);
if (timeslicer_thread_id)
error("gang: timeslicer pthread still running");
else {
slurm_mutex_lock(&thread_flag_mutex);
thread_running = false;
slurm_mutex_unlock(&thread_flag_mutex);
slurm_mutex_lock(&term_lock);
thread_shutdown = false;
slurm_mutex_unlock(&term_lock);
}
} else {
slurm_mutex_unlock(&thread_flag_mutex);
}
FREE_NULL_LIST(preempt_job_list);
slurm_mutex_lock(&data_mutex);
FREE_NULL_LIST(gs_part_list);
gs_part_list = NULL;
slurm_mutex_unlock(&data_mutex);
log_flag(GANG, "gang: leaving gs_fini");
}
/* Notify the gang scheduler that a job has been resumed or started.
* In either case, add the job to gang scheduling. */
extern void gs_job_start(job_record_t *job_ptr)
{
struct gs_part *p_ptr;
uint16_t job_sig_state;
char *part_name;
if (!(slurm_conf.preempt_mode & PREEMPT_MODE_GANG))
return;
/* Exclude HetJobs from gang operation. */
if (job_ptr->het_job_id)
return;
log_flag(GANG, "gang: entering %s for %pJ", __func__, job_ptr);
/* add job to partition */
if (job_ptr->part_ptr && job_ptr->part_ptr->name)
part_name = job_ptr->part_ptr->name;
else
part_name = job_ptr->partition;
slurm_mutex_lock(&data_mutex);
p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name);
if (p_ptr) {
job_sig_state = _add_job_to_part(p_ptr, job_ptr);
/* if this job is running then check for preemption */
if (job_sig_state == GS_RESUME)
_update_all_active_rows();
}
slurm_mutex_unlock(&data_mutex);
if (!p_ptr) {
/*
* No partition was found for this job, so let it run
* uninterrupted (what else can we do?)
*/
error("gang: could not find partition %s for %pJ",
part_name, job_ptr);
}
_preempt_job_dequeue(); /* MUST BE OUTSIDE OF data_mutex lock */
log_flag(GANG, "gang: leaving gs_job_start");
}
/* Gang scheduling has been disabled by change in configuration,
* resume any suspended jobs */
extern void gs_wake_jobs(void)
{
job_record_t *job_ptr;
list_itr_t *job_iterator;
if (!job_list) /* no jobs */
return;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = list_next(job_iterator))) {
/* Exclude HetJobs from gang operation. */
if (job_ptr->het_job_id)
continue;
if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0)) {
info("gang waking preempted %pJ", job_ptr);
_resume_job(job_ptr);
}
}
list_iterator_destroy(job_iterator);
}
/* Notify the gang scheduler that a job has been suspended or completed.
* In either case, remove the job from gang scheduling. */
extern void gs_job_fini(job_record_t *job_ptr)
{
struct gs_part *p_ptr;
char *part_name;
if (!(slurm_conf.preempt_mode & PREEMPT_MODE_GANG))
return;
/* Exclude HetJobs from gang operation. */
if (job_ptr->het_job_id)
return;
log_flag(GANG, "gang: entering %s for %pJ", __func__, job_ptr);
if (job_ptr->part_ptr && job_ptr->part_ptr->name)
part_name = job_ptr->part_ptr->name;
else
part_name = job_ptr->partition;
slurm_mutex_lock(&data_mutex);
p_ptr = list_find_first(gs_part_list, _find_gs_part, part_name);
if (!p_ptr) {
slurm_mutex_unlock(&data_mutex);
log_flag(GANG, "gang: leaving gs_job_fini");
return;
}
/* remove job from the partition */
_remove_job_from_part(job_ptr->job_id, p_ptr, true);
/* this job may have preempted other jobs, so
* check by updating all active rows */
_update_all_active_rows();
slurm_mutex_unlock(&data_mutex);
log_flag(GANG, "gang: leaving gs_job_fini");
}
/* rebuild data structures from scratch
*
* A reconfigure can affect this plugin in these ways:
* - partitions can be added or removed
* - this affects the gs_part_list
* - nodes can be removed from a partition, or added to a partition
* - this affects the size of the active resmap
*
* Here's the plan:
* 1. save a copy of the global structures, and then construct
* new ones.
* 2. load the new partition structures with existing jobs,
* confirming the job exists and resizing their resmaps
* (if necessary).
* 3. make sure all partitions are accounted for. If a partition
* was removed, make sure any jobs that were in the queue and
* that were suspended are resumed. Conversely, if a partition
* was added, check for existing jobs that may be contending
* for resources that we could begin timeslicing.
* 4. delete the old global structures and return.
*/
extern void gs_reconfig(void)
{
int i;
list_itr_t *part_iterator;
struct gs_part *p_ptr, *newp_ptr;
list_t *old_part_list = NULL;
job_record_t *job_ptr;
struct gs_job *j_ptr;
if (!(slurm_conf.preempt_mode & PREEMPT_MODE_GANG))
return;
if (!timeslicer_thread_id) {
/* gs_init() will be called later from read_slurm_conf()
* if we are enabling gang scheduling via reconfiguration */
return;
}
log_flag(GANG, "gang: entering gs_reconfig");
slurm_mutex_lock(&data_mutex);
old_part_list = gs_part_list;
gs_part_list = NULL;
/* reset global data */
gr_type = _get_gr_type();
_build_parts();
/* scan the old part list and add existing jobs to the new list */
part_iterator = list_iterator_create(old_part_list);
while ((p_ptr = list_next(part_iterator))) {
newp_ptr = (struct gs_part *) list_find_first(gs_part_list,
_find_gs_part,
p_ptr->part_name);
if (!newp_ptr) {
/* this partition was removed, so resume
* any jobs suspended by gang and continue */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if ((j_ptr->sig_state == GS_SUSPEND) &&
(j_ptr->job_ptr->priority != 0)) {
info("resuming job in missing part %s",
p_ptr->part_name);
_resume_job(j_ptr->job_ptr);
j_ptr->sig_state = GS_RESUME;
}
}
continue;
}
if (p_ptr->num_jobs == 0)
/* no jobs to transfer */
continue;
		/* We need to transfer the jobs from p_ptr to newp_ptr and
		 * adjust their resmaps (if necessary). Then we need to create
		 * the active resmap and adjust the state of each job (if
		 * necessary). NOTE: there could be jobs that only overlap
		 * on nodes that are no longer in the partition, but we're
		 * not going to worry about those cases.
		 *
		 * Add the jobs from p_ptr into newp_ptr in their current
		 * order to preserve the state of timeslicing.
		 */
for (i = 0; i < p_ptr->num_jobs; i++) {
job_ptr = find_job_record(p_ptr->job_list[i]->job_id);
if (job_ptr == NULL) {
/* job no longer exists in Slurm, so drop it */
continue;
}
if (IS_JOB_SUSPENDED(job_ptr) &&
(job_ptr->priority == 0))
continue; /* not suspended by gang */
/* transfer the job as long as it is still active */
if (IS_JOB_SUSPENDED(job_ptr) ||
IS_JOB_RUNNING(job_ptr)) {
_add_job_to_part(newp_ptr, job_ptr);
}
}
}
list_iterator_destroy(part_iterator);
/* confirm all jobs. Scan the master job_list and confirm that we
* are tracking all jobs */
_scan_slurm_job_list();
FREE_NULL_LIST(old_part_list);
slurm_mutex_unlock(&data_mutex);
_preempt_job_dequeue(); /* MUST BE OUTSIDE OF data_mutex lock */
log_flag(GANG, "gang: leaving gs_reconfig");
}
/************************************
* Timeslicer Functions
***********************************/
/* Build the active row from the job_list.
* The job_list is assumed to be sorted */
static void _build_active_row(struct gs_part *p_ptr)
{
int i;
struct gs_job *j_ptr;
log_flag(GANG, "gang: entering %s", __func__);
p_ptr->jobs_active = 0;
if (p_ptr->num_jobs == 0)
return;
/* apply all shadow jobs first */
for (i = 0; i < p_ptr->num_shadows; i++) {
_add_job_to_active(p_ptr->shadow[i]->job_ptr, p_ptr);
}
/* attempt to add jobs from the job_list in the current order */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if (j_ptr->job_ptr->priority == 0)
continue;
if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
_add_job_to_active(j_ptr->job_ptr, p_ptr);
j_ptr->row_state = GS_ACTIVE;
}
}
log_flag(GANG, "gang: leaving %s", __func__);
}
/* _cycle_job_list
*
* This is the heart of the timeslicer. The algorithm works as follows:
*
* 1. Each new job is added to the end of the job list, so the earliest job
* is at the front of the list.
* 2. Any "shadow" jobs are first applied to the active_resmap. Then the
* active_resmap is filled out by starting with the first job in the list,
* and adding to it any job that doesn't conflict with the resources.
* 3. When the timeslice has passed, all jobs that were added to the active
* resmap are moved to the back of the list (preserving their order among
* each other).
* 4. Loop back to step 2, starting with the new "first job in the list".
*/
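/*
 * Worked example (hypothetical jobs): the list holds A, B, C in that
 * order, where A and B fit together but C conflicts with both. The
 * first pass activates A and B and suspends C. When the timeslice
 * expires, A and B (the GS_ACTIVE jobs) move to the back, giving
 * C, A, B. The next pass activates C, so each job eventually receives
 * a share of the timeslices.
 */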
static void _cycle_job_list(struct gs_part *p_ptr)
{
int i, j;
struct gs_job *j_ptr;
uint16_t preempt_mode;
log_flag(GANG, "gang: entering %s", __func__);
/* re-prioritize the job_list and set all row_states to GS_NO_ACTIVE */
for (i = 0; i < p_ptr->num_jobs; i++) {
while (p_ptr->job_list[i]->row_state == GS_ACTIVE) {
/* move this job to the back row and "deactivate" it */
j_ptr = p_ptr->job_list[i];
j_ptr->row_state = GS_NO_ACTIVE;
for (j = i; j+1 < p_ptr->num_jobs; j++) {
p_ptr->job_list[j] = p_ptr->job_list[j+1];
}
p_ptr->job_list[j] = j_ptr;
}
if (p_ptr->job_list[i]->row_state == GS_FILLER)
p_ptr->job_list[i]->row_state = GS_NO_ACTIVE;
}
log_flag(GANG, "gang: %s reordered job list:", __func__);
/* Rebuild the active row. */
_build_active_row(p_ptr);
log_flag(GANG, "gang: %s new active job list:", __func__);
_print_jobs(p_ptr);
/* Suspend running jobs that are GS_NO_ACTIVE */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if ((j_ptr->row_state == GS_NO_ACTIVE) &&
(j_ptr->sig_state == GS_RESUME)) {
log_flag(GANG, "gang: %s: suspending %pJ",
__func__, j_ptr->job_ptr);
preempt_mode = slurm_job_preempt_mode(j_ptr->job_ptr);
if (p_ptr->num_shadows &&
(preempt_mode != PREEMPT_MODE_OFF) &&
(preempt_mode != PREEMPT_MODE_SUSPEND)) {
_preempt_job_queue(j_ptr->job_id);
} else
_suspend_job(j_ptr->job_ptr);
j_ptr->sig_state = GS_SUSPEND;
_clear_shadow(j_ptr);
}
}
/* Resume suspended jobs that are GS_ACTIVE */
for (i = 0; i < p_ptr->num_jobs; i++) {
j_ptr = p_ptr->job_list[i];
if ((j_ptr->row_state == GS_ACTIVE) &&
(j_ptr->sig_state == GS_SUSPEND) &&
(j_ptr->job_ptr->priority != 0)) { /* Redundant check */
log_flag(GANG, "gang: %s: resuming %pJ",
__func__, j_ptr->job_ptr);
_resume_job(j_ptr->job_ptr);
j_ptr->sig_state = GS_RESUME;
_cast_shadow(j_ptr, p_ptr->priority);
}
}
log_flag(GANG, "gang: leaving %s", __func__);
}
static void _slice_sleep(void)
{
struct timespec ts = {0, 0};
struct timeval now;
gettimeofday(&now, NULL);
ts.tv_sec = now.tv_sec + timeslicer_seconds;
ts.tv_nsec = now.tv_usec * 1000;
slurm_mutex_lock(&term_lock);
if (!thread_shutdown)
slurm_cond_timedwait(&term_cond, &term_lock, &ts);
slurm_mutex_unlock(&term_lock);
}
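/*
 * Note: ts above is an absolute wall-clock deadline, per
 * pthread_cond_timedwait() semantics, so the timeslicer wakes after
 * timeslicer_seconds or as soon as gs_fini() signals term_cond,
 * whichever comes first.
 */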
/* The timeslicer thread */
static void *_timeslicer_thread(void *arg)
{
	/* Write lock on jobs; read locks on nodes and federation */
slurmctld_lock_t job_write_lock = {
NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK, READ_LOCK };
list_itr_t *part_iterator;
struct gs_part *p_ptr;
log_flag(GANG, "gang: starting timeslicer loop");
while (!thread_shutdown) {
_slice_sleep();
if (thread_shutdown)
break;
lock_slurmctld(job_write_lock);
slurm_mutex_lock(&data_mutex);
list_sort(gs_part_list, _sort_partitions);
/* scan each partition... */
log_flag(GANG, "gang: %s: scanning partitions", __func__);
part_iterator = list_iterator_create(gs_part_list);
while ((p_ptr = list_next(part_iterator))) {
log_flag(GANG, "gang: %s: part %s: run %u total %u",
__func__, p_ptr->part_name,
p_ptr->jobs_active, p_ptr->num_jobs);
if (p_ptr->jobs_active <
(p_ptr->num_jobs + p_ptr->num_shadows)) {
_cycle_job_list(p_ptr);
}
}
list_iterator_destroy(part_iterator);
slurm_mutex_unlock(&data_mutex);
/* Preempt jobs that were formerly only suspended */
_preempt_job_dequeue(); /* MUST BE OUTSIDE data_mutex lock */
unlock_slurmctld(job_write_lock);
}
timeslicer_thread_id = (pthread_t) 0;
return NULL;
}