blob: e8a3df033d403ebc3b7e6bb1f8cccd2d6ef0854d [file] [log] [blame] [edit]
/*****************************************************************************\
* power_save.c - support node power saving mode. Nodes which have been
* idle for an extended period of time will be placed into a power saving
* mode by running an arbitrary script. This script can lower the voltage
* or frequency of the nodes or can completely power the nodes off.
* When the node is restored to normal operation, another script will be
* executed. Many parameters are available to control this mode of operation.
*****************************************************************************
* Copyright (C) 2007 The Regents of the University of California.
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <https://computing.llnl.gov/linux/slurm/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include "src/common/bitstring.h"
#include "src/common/macros.h"
#include "src/common/xstring.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"
#if defined (HAVE_DECL_STRSIGNAL) && !HAVE_DECL_STRSIGNAL
# ifndef strsignal
extern char *strsignal(int);
# endif
#endif /* defined HAVE_DECL_STRSIGNAL && !HAVE_DECL_STRSIGNAL */
#define _DEBUG 0
#define PID_CNT 10
#define MAX_SHUTDOWN_DELAY 120 /* seconds to wait for child procs
* to exit after daemon shutdown
* request, then orphan or kill proc */
/* Records for tracking processes forked to suspend/resume nodes */
pid_t child_pid[PID_CNT]; /* pid of process */
time_t child_time[PID_CNT]; /* start time of process */
pthread_mutex_t power_mutex = PTHREAD_MUTEX_INITIALIZER;
bool power_save_enabled = false;
int idle_time, suspend_rate, resume_timeout, resume_rate, suspend_timeout;
char *suspend_prog = NULL, *resume_prog = NULL;
char *exc_nodes = NULL, *exc_parts = NULL;
time_t last_config = (time_t) 0, last_suspend = (time_t) 0;
uint16_t slurmd_timeout;
bitstr_t *exc_node_bitmap = NULL, *suspend_node_bitmap = NULL;
int suspend_cnt, resume_cnt;
float suspend_cnt_f, resume_cnt_f;
static void _clear_power_config(void);
static void _do_power_work(time_t now);
static void _do_resume(char *host);
static void _do_suspend(char *host);
static int _init_power_config(void);
static void *_init_power_save(void *arg);
static int _kill_procs(void);
static int _reap_procs(void);
static void _re_wake(void);
static pid_t _run_prog(char *prog, char *arg);
static void _shutdown_power(void);
static bool _valid_prog(char *file_name);
/* Perform any power change work to nodes */
static void _do_power_work(time_t now)
{
static time_t last_log = 0, last_work_scan = 0;
int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
time_t delta_t;
uint16_t susp_state;
bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
struct node_record *node_ptr;
bool run_suspend = false;
/* Set limit on counts of nodes to have state changed */
delta_t = now - last_work_scan;
if (delta_t >= 60) {
suspend_cnt_f = 0.0;
resume_cnt_f = 0.0;
} else {
float rate = (60 - delta_t) / 60.0;
suspend_cnt_f *= rate;
resume_cnt_f *= rate;
}
suspend_cnt = (suspend_cnt_f + 0.5);
resume_cnt = (resume_cnt_f + 0.5);
if (now > (last_suspend + suspend_timeout)) {
/* ready to start another round of node suspends */
run_suspend = true;
if (last_suspend) {
bit_nclear(suspend_node_bitmap, 0,
(node_record_count - 1));
last_suspend = (time_t) 0;
}
}
last_work_scan = now;
/* Build bitmaps identifying each node which should change state */
for (i=0, node_ptr=node_record_table_ptr;
i<node_record_count; i++, node_ptr++) {
susp_state = IS_NODE_POWER_SAVE(node_ptr);
if (susp_state)
susp_total++;
/* Resume nodes as appropriate */
if (susp_state &&
((resume_rate == 0) || (resume_cnt < resume_rate)) &&
(bit_test(suspend_node_bitmap, i) == 0) &&
(IS_NODE_ALLOCATED(node_ptr) ||
(node_ptr->last_idle > (now - idle_time)))) {
if (wake_node_bitmap == NULL) {
wake_node_bitmap =
bit_alloc(node_record_count);
}
wake_cnt++;
resume_cnt++;
resume_cnt_f++;
node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
node_ptr->node_state |= NODE_STATE_POWER_UP;
node_ptr->node_state |= NODE_STATE_NO_RESPOND;
bit_clear(power_node_bitmap, i);
bit_clear(avail_node_bitmap, i);
node_ptr->last_response = now + resume_timeout;
bit_set(wake_node_bitmap, i);
}
/* Suspend nodes as appropriate */
if (run_suspend &&
(susp_state == 0) &&
((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
IS_NODE_IDLE(node_ptr) &&
(node_ptr->sus_job_cnt == 0) &&
(!IS_NODE_COMPLETING(node_ptr)) &&
(!IS_NODE_POWER_UP(node_ptr)) &&
(node_ptr->last_idle < (now - idle_time)) &&
((exc_node_bitmap == NULL) ||
(bit_test(exc_node_bitmap, i) == 0))) {
if (sleep_node_bitmap == NULL) {
sleep_node_bitmap =
bit_alloc(node_record_count);
}
sleep_cnt++;
suspend_cnt++;
suspend_cnt_f++;
node_ptr->node_state |= NODE_STATE_POWER_SAVE;
bit_set(power_node_bitmap, i);
bit_set(sleep_node_bitmap, i);
bit_set(suspend_node_bitmap, i);
last_suspend = now;
}
}
if (((now - last_log) > 600) && (susp_total > 0)) {
info("Power save mode: %d nodes", susp_total);
last_log = now;
}
if (sleep_node_bitmap) {
char *nodes;
nodes = bitmap2node_name(sleep_node_bitmap);
if (nodes)
_do_suspend(nodes);
else
error("power_save: bitmap2nodename");
xfree(nodes);
FREE_NULL_BITMAP(sleep_node_bitmap);
/* last_node_update could be changed already by another thread!
last_node_update = now; */
}
if (wake_node_bitmap) {
char *nodes;
nodes = bitmap2node_name(wake_node_bitmap);
if (nodes)
_do_resume(nodes);
else
error("power_save: bitmap2nodename");
xfree(nodes);
FREE_NULL_BITMAP(wake_node_bitmap);
/* last_node_update could be changed already by another thread!
last_node_update = now; */
}
}
/* If slurmctld crashes, the node state that it recovers could differ
* from the actual hardware state (e.g. ResumeProgram failed to complete).
* To address that, when a node that should be powered up for a running
* job is not responding, they try running ResumeProgram again. */
static void _re_wake(void)
{
struct node_record *node_ptr;
bitstr_t *wake_node_bitmap = NULL;
int i;
node_ptr = node_record_table_ptr;
for (i=0; i<node_record_count; i++, node_ptr++) {
if (IS_NODE_ALLOCATED(node_ptr) &&
IS_NODE_NO_RESPOND(node_ptr) &&
!IS_NODE_POWER_SAVE(node_ptr) &&
(bit_test(suspend_node_bitmap, i) == 0)) {
if (wake_node_bitmap == NULL) {
wake_node_bitmap =
bit_alloc(node_record_count);
}
bit_set(wake_node_bitmap, i);
}
}
if (wake_node_bitmap) {
char *nodes;
nodes = bitmap2node_name(wake_node_bitmap);
if (nodes) {
info("power_save: rewaking nodes %s", nodes);
_run_prog(resume_prog, nodes);
} else
error("power_save: bitmap2nodename");
xfree(nodes);
FREE_NULL_BITMAP(wake_node_bitmap);
}
}
static void _do_resume(char *host)
{
#if _DEBUG
info("power_save: waking nodes %s", host);
#else
verbose("power_save: waking nodes %s", host);
#endif
_run_prog(resume_prog, host);
}
static void _do_suspend(char *host)
{
#if _DEBUG
info("power_save: suspending nodes %s", host);
#else
verbose("power_save: suspending nodes %s", host);
#endif
_run_prog(suspend_prog, host);
}
/* run a suspend or resume program
* prog IN - program to run
* arg IN - program arguments, the hostlist expression
*/
static pid_t _run_prog(char *prog, char *arg)
{
int i;
char program[1024], arg0[1024], arg1[1024], *pname;
pid_t child;
if (prog == NULL) /* disabled, useful for testing */
return -1;
strncpy(program, prog, sizeof(program));
pname = strrchr(program, '/');
if (pname == NULL)
pname = program;
else
pname++;
strncpy(arg0, pname, sizeof(arg0));
strncpy(arg1, arg, sizeof(arg1));
child = fork();
if (child == 0) {
for (i=0; i<128; i++)
close(i);
#ifdef SETPGRP_TWO_ARGS
setpgrp(0, 0);
#else
setpgrp();
#endif
execl(program, arg0, arg1, NULL);
exit(1);
} else if (child < 0) {
error("fork: %m");
} else {
/* save the pid */
for (i=0; i<PID_CNT; i++) {
if (child_pid[i])
continue;
child_pid[i] = child;
child_time[i] = time(NULL);
break;
}
if (i == PID_CNT)
error("power_save: filled child_pid array");
}
return child;
}
/* reap child processes previously forked to modify node state.
* return the count of empty slots in the child_pid array */
static int _reap_procs(void)
{
int empties = 0, delay, i, max_timeout, rc, status;
max_timeout = MAX(suspend_timeout, resume_timeout);
for (i=0; i<PID_CNT; i++) {
if (child_pid[i] == 0) {
empties++;
continue;
}
rc = waitpid(child_pid[i], &status, WNOHANG);
if (rc == 0)
continue;
delay = difftime(time(NULL), child_time[i]);
if (delay > max_timeout) {
info("power_save: program %d ran for %d sec",
(int) child_pid[i], delay);
}
if (WIFEXITED(status)) {
rc = WEXITSTATUS(status);
if (rc != 0) {
error("power_save: program exit status of %d",
rc);
} else
ping_nodes_now = true;
} else if (WIFSIGNALED(status)) {
error("power_save: program signalled: %s",
strsignal(WTERMSIG(status)));
}
child_pid[i] = 0;
child_time[i] = (time_t) 0;
}
return empties;
}
/* kill (or orphan) child processes previously forked to modify node state.
* return the count of killed/orphaned processes */
static int _kill_procs(void)
{
int killed = 0, i, rc, status;
for (i=0; i<PID_CNT; i++) {
if (child_pid[i] == 0)
continue;
rc = waitpid(child_pid[i], &status, WNOHANG);
if (rc == 0) {
#ifdef POWER_SAVE_KILL_PROCS
error("power_save: killing process %d",
child_pid[i]);
kill((0-child_pid[i]), SIGKILL);
#else
error("power_save: orphaning process %d",
child_pid[i]);
#endif
killed++;
} else {
/* process already completed */
}
child_pid[i] = 0;
child_time[i] = (time_t) 0;
}
return killed;
}
/* shutdown power save daemons */
static void _shutdown_power(void)
{
int i, proc_cnt, max_timeout;
max_timeout = MAX(suspend_timeout, resume_timeout);
/* Try to avoid orphan processes */
for (i=0; ; i++) {
proc_cnt = PID_CNT - _reap_procs();
if (proc_cnt == 0) /* all procs completed */
break;
if (i >= max_timeout) {
error("power_save: orphaning %d processes which are "
"not terminating so slurmctld can exit",
proc_cnt);
_kill_procs();
break;
} else if (i == 2) {
info("power_save: waiting for %d processes to "
"complete", proc_cnt);
} else if (i % 5 == 0) {
debug("power_save: waiting for %d processes to "
"complete", proc_cnt);
}
sleep(1);
}
}
/* Free all allocated memory */
static void _clear_power_config(void)
{
xfree(suspend_prog);
xfree(resume_prog);
xfree(exc_nodes);
xfree(exc_parts);
FREE_NULL_BITMAP(exc_node_bitmap);
}
/* Initialize power_save module parameters.
* Return 0 on valid configuration to run power saving,
* otherwise log the problem and return -1 */
static int _init_power_config(void)
{
slurm_ctl_conf_t *conf = slurm_conf_lock();
last_config = slurmctld_conf.last_update;
idle_time = conf->suspend_time - 1;
suspend_rate = conf->suspend_rate;
resume_timeout = conf->resume_timeout;
resume_rate = conf->resume_rate;
slurmd_timeout = conf->slurmd_timeout;
suspend_timeout = conf->suspend_timeout;
_clear_power_config();
if (conf->suspend_program)
suspend_prog = xstrdup(conf->suspend_program);
if (conf->resume_program)
resume_prog = xstrdup(conf->resume_program);
if (conf->suspend_exc_nodes)
exc_nodes = xstrdup(conf->suspend_exc_nodes);
if (conf->suspend_exc_parts)
exc_parts = xstrdup(conf->suspend_exc_parts);
slurm_conf_unlock();
if (idle_time < 0) { /* not an error */
debug("power_save module disabled, SuspendTime < 0");
return -1;
}
if (suspend_rate < 1) {
error("power_save module disabled, SuspendRate < 1");
return -1;
}
if (resume_rate < 1) {
error("power_save module disabled, ResumeRate < 1");
return -1;
}
if (suspend_prog == NULL) {
error("power_save module disabled, NULL SuspendProgram");
return -1;
} else if (!_valid_prog(suspend_prog)) {
error("power_save module disabled, invalid SuspendProgram %s",
suspend_prog);
return -1;
}
if (resume_prog == NULL) {
error("power_save module disabled, NULL ResumeProgram");
return -1;
} else if (!_valid_prog(resume_prog)) {
error("power_save module disabled, invalid ResumeProgram %s",
resume_prog);
return -1;
}
if (exc_nodes &&
(node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
error("power_save module disabled, "
"invalid SuspendExcNodes %s", exc_nodes);
return -1;
}
if (exc_parts) {
char *tmp = NULL, *one_part = NULL, *part_list = NULL;
struct part_record *part_ptr = NULL;
int rc = 0;
part_list = xstrdup(exc_parts);
one_part = strtok_r(part_list, ",", &tmp);
while (one_part != NULL) {
part_ptr = find_part_record(one_part);
if (!part_ptr) {
error("power_save module disabled, "
"invalid SuspendExcPart %s",
one_part);
rc = -1;
break;
}
if (exc_node_bitmap)
bit_or(exc_node_bitmap, part_ptr->node_bitmap);
else
exc_node_bitmap = bit_copy(part_ptr->
node_bitmap);
one_part = strtok_r(NULL, ",", &tmp);
}
xfree(part_list);
if (rc)
return rc;
}
if (exc_node_bitmap) {
char *tmp = bitmap2node_name(exc_node_bitmap);
debug("power_save module, excluded nodes %s", tmp);
xfree(tmp);
}
return 0;
}
static bool _valid_prog(char *file_name)
{
struct stat buf;
if (file_name[0] != '/') {
debug("power_save program %s not absolute pathname",
file_name);
return false;
}
if (access(file_name, X_OK) != 0) {
debug("power_save program %s not executable", file_name);
return false;
}
if (stat(file_name, &buf)) {
debug("power_save program %s not found", file_name);
return false;
}
if (buf.st_mode & 022) {
debug("power_save program %s has group or "
"world write permission",
file_name);
return false;
}
return true;
}
/* start_power_mgr - Start power management thread as needed. The thread
* terminates automatically at slurmctld shutdown time.
* IN thread_id - pointer to thread ID of the started pthread.
*/
extern void start_power_mgr(pthread_t *thread_id)
{
pthread_attr_t thread_attr;
slurm_mutex_lock(&power_mutex);
if (power_save_enabled) { /* Already running */
slurm_mutex_unlock(&power_mutex);
return;
}
power_save_enabled = true;
slurm_mutex_unlock(&power_mutex);
slurm_attr_init(&thread_attr);
while (pthread_create(thread_id, &thread_attr, _init_power_save,
NULL)) {
error("pthread_create %m");
sleep(1);
}
slurm_attr_destroy(&thread_attr);
}
/*
* init_power_save - Onitialize the power save module. Started as a
* pthread. Terminates automatically at slurmctld shutdown time.
* Input and output are unused.
*/
static void *_init_power_save(void *arg)
{
/* Locks: Read nodes */
slurmctld_lock_t node_read_lock = {
NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
/* Locks: Write nodes */
slurmctld_lock_t node_write_lock = {
NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
time_t now, boot_time = 0, last_power_scan = 0;
if (_init_power_config())
goto fini;
suspend_node_bitmap = bit_alloc(node_record_count);
if (suspend_node_bitmap == NULL)
fatal("power_save: malloc error");
while (slurmctld_config.shutdown_time == 0) {
sleep(1);
if (_reap_procs() < 2) {
debug("power_save programs getting backlogged");
continue;
}
if ((last_config != slurmctld_conf.last_update) &&
(_init_power_config())) {
info("power_save mode has been disabled due to "
"configuration changes");
goto fini;
}
now = time(NULL);
if (boot_time == 0)
boot_time = now;
/* Only run every 60 seconds or after a node state change,
* whichever happens first */
if ((last_node_update >= last_power_scan) ||
(now >= (last_power_scan + 60))) {
lock_slurmctld(node_write_lock);
_do_power_work(now);
unlock_slurmctld(node_write_lock);
last_power_scan = now;
}
if (slurmd_timeout &&
(now > (boot_time + (slurmd_timeout / 2)))) {
lock_slurmctld(node_read_lock);
_re_wake();
unlock_slurmctld(node_read_lock);
/* prevent additional executions */
boot_time += (365 * 24 * 60 * 60);
slurmd_timeout = 0;
}
}
fini: _clear_power_config();
FREE_NULL_BITMAP(suspend_node_bitmap);
_shutdown_power();
slurm_mutex_lock(&power_mutex);
power_save_enabled = false;
slurm_mutex_unlock(&power_mutex);
pthread_exit(NULL);
return NULL;
}