blob: 14125b049d8e8fb3075ac295175ccc703ab45a4b [file] [log] [blame]
/*****************************************************************************\
* cpu_frequency.c - support for srun option --cpu-freq=<frequency>
*****************************************************************************
* Copyright (C) 2012 Bull
* Written by Don Albert, <don.albert@bull.com>
* Modified by Rod Schultz, <rod.schultz@bull.com> for min-max:gov
* Modified by Janne Blomqvist, <janne.blomqvist@aalto.fi> for
* intel_pstate support
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "slurm/slurm.h"
#include "src/common/cpu_frequency.h"
#include "src/common/env.h"
#include "src/common/fd.h"
#include "src/common/log.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/read_config.h"
#include "src/common/slurm_resource_info.h"
#include "src/slurmd/slurmd/slurmd.h"
#define PATH_TO_CPU "/sys/devices/system/cpu/"
#define LINE_LEN 100
#define FREQ_LIST_MAX 64
#define GOV_NAME_LEN 24
#define GOV_CONSERVATIVE 0x01
#define GOV_ONDEMAND 0x02
#define GOV_PERFORMANCE 0x04
#define GOV_POWERSAVE 0x08
#define GOV_USERSPACE 0x10
#define GOV_SCHEDUTIL 0x20
static uint16_t cpu_freq_count = 0;
static int set_batch_freq = -1;
static struct cpu_freq_data {
uint8_t avail_governors;
uint8_t nfreq;
bool org_set;
uint32_t avail_freq[FREQ_LIST_MAX];
char org_governor[GOV_NAME_LEN];
char new_governor[GOV_NAME_LEN];
uint32_t org_frequency;
uint32_t new_frequency;
uint32_t org_min_freq;
uint32_t new_min_freq;
uint32_t org_max_freq;
uint32_t new_max_freq;
} * cpufreq = NULL;
static char *slurmd_spooldir = NULL;
static int _cpu_freq_cpu_avail(int cpx);
static int _cpu_freq_current_state(int cpx);
static uint16_t _cpu_freq_next_cpu(char **core_range, uint16_t *cpx,
uint16_t *start, uint16_t *end);
static uint32_t _cpu_freq_get_scaling_freq(int cpuidx, char* option);
static void _cpu_freq_init_data(int cpx);
static void _cpu_freq_setup_data(stepd_step_rec_t *step, int cpx);
static bool _cpu_freq_test_scaling_freq(int cpuidx, char *option);
static int _derive_avail_freq(int cpuidx);
static int _fd_lock_retry(int fd);
static int _fd_lock_retry(int fd)
{
int i, rc;
for (i = 0; i < 10; i++) {
if (i)
usleep(1000); /* 1000 usec */
rc = fd_get_write_lock(fd);
if (rc == 0)
break;
if ((errno != EACCES) && (errno != EAGAIN))
break; /* Lock held by other job */
}
return rc;
}
/* This set of locks it designed to prevent race conditions when changing
* CPU frequency or govorner. Specifically, when a job ends it should only
* reset CPU frequency if it was the last job to set the CPU frequency.
* with gang scheduling and cancellation of suspended or running jobs there
* can be timing issues.
* _set_cpu_owner_lock - set specified job to own the CPU, file locked at exit
* _test_cpu_owner_lock - test if the specified job owns the CPU
*/
static int _set_cpu_owner_lock(int cpu_id, uint32_t job_id)
{
char tmp[PATH_MAX];
int fd;
snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir);
if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) {
error("mkdir failed: %m %s",tmp);
return -1;
}
snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id);
fd = open(tmp, O_CREAT | O_RDWR, 0600);
if (fd < 0) {
error("%s: open: %m %s", __func__, tmp);
return fd;
}
if (_fd_lock_retry(fd) < 0)
error("%s: fd_get_write_lock: %m %s", __func__, tmp);
safe_write(fd, &job_id, sizeof(job_id));
return fd;
rwfail:
error("%s: write: %m %s", __func__, tmp);
return fd;
}
/* Test if specified job ID owns this CPU for frequency/governor control
* RET 0 if owner, -1 otherwise */
static int _test_cpu_owner_lock(int cpu_id, uint32_t job_id)
{
char tmp[PATH_MAX];
uint32_t in_job_id;
int fd;
snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir);
if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) {
error("%s: mkdir failed: %m %s", __func__, tmp);
return -1;
}
snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id);
fd = open(tmp, O_RDWR, 0600);
if (fd < 0) {
if (errno != ENOENT) /* Race condition */
error("%s: open: %m %s", __func__, tmp);
return -1;
}
if (_fd_lock_retry(fd) < 0) {
error("%s: fd_get_write_lock: %m %s", __func__, tmp);
close(fd);
return -1;
}
safe_read(fd, &in_job_id, sizeof(in_job_id));
(void) fd_release_lock(fd);
if (job_id != in_job_id) {
/* Result of various race conditions */
debug("%s: CPU %d now owned by job %u rather than job %u",
__func__, cpu_id, in_job_id, job_id);
close(fd);
return -1;
}
close(fd);
debug2("%s: CPU %d owned by job %u as expected",
__func__, cpu_id, job_id);
return 0;
rwfail:
error("%s: read: %m %s", __func__, tmp);
(void) fd_release_lock(fd);
close(fd);
return -1;
}
/*
* Try do build a table of available frequencies based upon the min/max values
*/
static int _derive_avail_freq(int cpuidx)
{
uint32_t min_freq, max_freq, delta_freq;
int i;
min_freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_min_freq");
if (min_freq == 0)
return SLURM_ERROR;
max_freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_max_freq");
if (max_freq == 0)
return SLURM_ERROR;
delta_freq = (max_freq - min_freq) / (FREQ_LIST_MAX - 1);
for (i = 0; i < (FREQ_LIST_MAX - 1); i++)
cpufreq[cpuidx].avail_freq[i] = min_freq + (delta_freq * i);
cpufreq[cpuidx].avail_freq[FREQ_LIST_MAX - 1] = max_freq;
cpufreq[cpuidx].nfreq = FREQ_LIST_MAX;
return SLURM_SUCCESS;
}
/*
* Find available frequencies on this cpu
* IN cpuidx - cpu to query
* Return: SLURM_SUCCESS or SLURM_ERROR
* avail_freq array will be in strictly ascending order
*/
static int
_cpu_freq_cpu_avail(int cpuidx)
{
FILE *fp = NULL;
char path[PATH_MAX];
int i, j, k;
uint32_t freq;
bool all_avail = false;
snprintf(path, sizeof(path), PATH_TO_CPU
"cpu%u/cpufreq/scaling_available_frequencies", cpuidx);
if ( ( fp = fopen(path, "r") ) == NULL ) {
/*
* Don't log an error here, scaling_available_frequencies
* does not exist when using the intel_pstate driver.
* Derive values from min/max values
*/
return _derive_avail_freq(cpuidx);
}
for (i = 0; i < (FREQ_LIST_MAX-1); i++) {
if ( fscanf(fp, "%u", &freq) == EOF) {
all_avail = true;
break;
}
/* make sure list is sorted */
for (j = 0; j < i; j++) {
if (freq < cpufreq[cpuidx].avail_freq[j]) {
for (k = i; k >= j; k--) {
cpufreq[cpuidx].avail_freq[k+1] =
cpufreq[cpuidx].avail_freq[k];
}
break;
}
}
cpufreq[cpuidx].avail_freq[j] = freq;
}
cpufreq[cpuidx].nfreq = i;
fclose(fp);
if (!all_avail)
error("all available frequencies not scanned");
return SLURM_SUCCESS;
}
/*
* called to check if the node supports setting CPU frequency
* if so, initialize fields in cpu_freq_data structure
*/
extern void
cpu_freq_init(slurmd_conf_t *conf)
{
char path[PATH_MAX];
struct stat statbuf;
FILE *fp;
char value[LINE_LEN];
unsigned int i, j;
xfree(slurmd_spooldir);
slurmd_spooldir = xstrdup(conf->spooldir);
if (running_in_slurmstepd())
return;
/* check for cpufreq support */
if ( stat(PATH_TO_CPU "cpu0/cpufreq", &statbuf) != 0 ) {
info("CPU frequency setting not configured for this node");
return;
}
if (!S_ISDIR(statbuf.st_mode)) {
error(PATH_TO_CPU "cpu0/cpufreq not a directory");
return;
}
/* get the cpu frequency info into the cpu_freq_data structure */
cpu_freq_count = conf->block_map_size;
if (!cpufreq) {
int cpuidx;
cpufreq = (struct cpu_freq_data *)
xmalloc(cpu_freq_count *
sizeof(struct cpu_freq_data));
for (cpuidx = 0; cpuidx < cpu_freq_count; cpuidx++)
_cpu_freq_init_data(cpuidx);
}
debug2("Gathering cpu frequency information for %u cpus",
cpu_freq_count);
for (i = 0; i < cpu_freq_count; i++) {
snprintf(path, sizeof(path),
PATH_TO_CPU
"cpu%u/cpufreq/scaling_available_governors", i);
if ((fp = fopen(path, "r")) == NULL)
continue;
if (fgets(value, LINE_LEN, fp) == NULL) {
fclose(fp);
continue;
}
if (strstr(value, "conservative")) {
cpufreq[i].avail_governors |= GOV_CONSERVATIVE;
if (i == 0)
log_flag(CPU_FREQ, "cpu_freq: Conservative governor defined on cpu 0");
}
if (strstr(value, "ondemand")) {
cpufreq[i].avail_governors |= GOV_ONDEMAND;
if (i == 0)
log_flag(CPU_FREQ, "cpu_freq: OnDemand governor defined on cpu 0");
}
if (strstr(value, "performance")) {
cpufreq[i].avail_governors |= GOV_PERFORMANCE;
if (i == 0)
log_flag(CPU_FREQ, "cpu_freq: Performance governor defined on cpu 0");
}
if (strstr(value, "powersave")) {
cpufreq[i].avail_governors |= GOV_POWERSAVE;
if (i == 0)
log_flag(CPU_FREQ, "cpu_freq: PowerSave governor defined on cpu 0");
}
if (strstr(value, "userspace")) {
cpufreq[i].avail_governors |= GOV_USERSPACE;
if (i == 0)
log_flag(CPU_FREQ, "cpu_freq: UserSpace governor defined on cpu 0");
}
if (strstr(value, "schedutil")) {
cpufreq[i].avail_governors |= GOV_SCHEDUTIL;
if (i == 0)
log_flag(CPU_FREQ, "cpu_freq: SchedUtil governor defined on cpu 0");
}
fclose(fp);
if (_cpu_freq_cpu_avail(i) == SLURM_ERROR)
continue;
if ((i == 0) &&
(slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ)) {
for (j = 0; j < cpufreq[i].nfreq; j++) {
info("cpu_freq: frequency %u defined on cpu 0",
cpufreq[i].avail_freq[j]);
}
}
}
return;
}
extern void
cpu_freq_fini(void)
{
xfree(cpufreq);
xfree(slurmd_spooldir);
}
/*
* Send the cpu_frequency table info to slurmstepd
*/
extern void
cpu_freq_send_info(int fd)
{
if (cpu_freq_count) {
safe_write(fd, &cpu_freq_count, sizeof(uint16_t));
safe_write(fd, cpufreq,
(cpu_freq_count * sizeof(struct cpu_freq_data)));
} else {
safe_write(fd, &cpu_freq_count, sizeof(uint16_t));
}
return;
rwfail:
error("Unable to send CPU frequency information for %u CPUs",
cpu_freq_count);
return;
}
/*
* Receive the cpu_frequency table info from slurmd
*/
extern void
cpu_freq_recv_info(int fd)
{
safe_read(fd, &cpu_freq_count, sizeof(uint16_t));
if (cpu_freq_count) {
if (!cpufreq) {
cpufreq = (struct cpu_freq_data *)
xmalloc(cpu_freq_count *
sizeof(struct cpu_freq_data));
}
safe_read(fd, cpufreq,
(cpu_freq_count * sizeof(struct cpu_freq_data)));
debug2("Received CPU frequency information for %u CPUs",
cpu_freq_count);
}
return;
rwfail:
error("Unable to receive CPU frequency information for %u CPUs",
cpu_freq_count);
cpu_freq_count = 0;
return;
}
/*
* Validate the cpus and select the frequency to set
* Called from task/affinity code with task launch request containing
* a pointer to a hex map string of the cpus to be used by this step
*/
extern void
cpu_freq_cpuset_validate(stepd_step_rec_t *step)
{
int cpuidx, cpu_num;
bitstr_t *cpus_to_set;
bitstr_t *cpu_map;
char *cpu_bind;
char *cpu_str;
char *savestr = NULL;
char cpu_bind_type_string[128];
if (set_batch_freq == -1) {
if (xstrcasestr(slurm_conf.launch_params,
"batch_step_set_cpu_freq"))
set_batch_freq = 1;
else
set_batch_freq = 0;
}
if (((step->step_id.step_id == SLURM_BATCH_SCRIPT) &&
!set_batch_freq) ||
(step->step_id.step_id == SLURM_INTERACTIVE_STEP) ||
(step->step_id.step_id == SLURM_EXTERN_CONT))
return;
slurm_sprint_cpu_bind_type(cpu_bind_type_string, step->cpu_bind_type);
log_flag(CPU_FREQ, "%s: request: min=(%12d %8x) max=(%12d %8x) governor=%8x",
__func__, step->cpu_freq_min, step->cpu_freq_min,
step->cpu_freq_max, step->cpu_freq_max, step->cpu_freq_gov);
log_flag(CPU_FREQ, " jobid=%u, stepid=%u, tasks=%u cpu/task=%u, cpus=%u",
step->step_id.job_id, step->step_id.step_id,
step->node_tasks, step->cpus_per_task,
step->cpus);
log_flag(CPU_FREQ, " cpu_bind_type=%4x, cpu_bind map=%s",
step->cpu_bind_type, step->cpu_bind);
if (!cpu_freq_count)
return;
if (step->cpu_bind == NULL) {
/*
* slurm_verify_cpu_bind will set cpu_bind to NULL for manual
* binding that doesn't require an argument
*/
if (!((step->cpu_bind_type & CPU_BIND_NONE) ||
(step->cpu_bind_type & CPU_BIND_LDRANK)))
error("cpu_freq_cpuset_validate: cpu_bind string is null");
return;
}
cpu_bind = xstrdup(step->cpu_bind);
if ( (cpu_str = strtok_r(cpu_bind, ",", &savestr) ) == NULL) {
error("cpu_freq_cpuset_validate: cpu_bind string invalid");
xfree(cpu_bind);
return;
}
cpu_map = bit_alloc(cpu_freq_count);
cpus_to_set = bit_alloc(cpu_freq_count);
do {
debug3(" cpu_str = %s", cpu_str);
if ((step->cpu_bind_type & CPU_BIND_MAP) == CPU_BIND_MAP) {
cpu_num = atoi(cpu_str);
if (cpu_num >= cpu_freq_count) {
error("cpu_freq_cpuset_validate: invalid cpu "
"number %d", cpu_num);
FREE_NULL_BITMAP(cpu_map);
FREE_NULL_BITMAP(cpus_to_set);
xfree(cpu_bind);
return;
}
bit_set(cpu_map, (bitoff_t)cpu_num);
} else {
if (bit_unfmt_hexmask(cpu_map, cpu_str) == -1) {
error("cpu_freq_cpuset_validate: invalid cpu "
"mask %s", cpu_bind);
FREE_NULL_BITMAP(cpu_map);
FREE_NULL_BITMAP(cpus_to_set);
xfree(cpu_bind);
return;
}
}
bit_or(cpus_to_set, cpu_map);
} while ( (cpu_str = strtok_r(NULL, ",", &savestr) ) != NULL);
for (cpuidx = 0; cpuidx < cpu_freq_count; cpuidx++) {
if (bit_test(cpus_to_set, cpuidx)) {
_cpu_freq_setup_data(step, cpuidx);
}
}
cpu_freq_set(step);
FREE_NULL_BITMAP(cpu_map);
FREE_NULL_BITMAP(cpus_to_set);
xfree(cpu_bind);
return;
}
/*
* Validate the cpus and select the frequency to set
* Called from task cgroup cpuset code with string containing
* the list of cpus to be used by this step
*/
extern void
cpu_freq_cgroup_validate(stepd_step_rec_t *step, char *step_alloc_cores)
{
uint16_t start = USHRT_MAX;
uint16_t end = USHRT_MAX;
uint16_t cpuidx = 0;
char *core_range;
if (set_batch_freq == -1) {
if (xstrcasestr(slurm_conf.launch_params,
"batch_step_set_cpu_freq"))
set_batch_freq = 1;
else
set_batch_freq = 0;
}
if (((step->step_id.step_id == SLURM_BATCH_SCRIPT) &&
!set_batch_freq) ||
(step->step_id.step_id == SLURM_INTERACTIVE_STEP) ||
(step->step_id.step_id == SLURM_EXTERN_CONT))
return;
log_flag(CPU_FREQ, "%s: request: min=(%12d %8x) max=(%12d %8x) governor=%8x",
__func__, step->cpu_freq_min, step->cpu_freq_min,
step->cpu_freq_max, step->cpu_freq_max, step->cpu_freq_gov);
log_flag(CPU_FREQ, " jobid=%u, stepid=%u, tasks=%u cpu/task=%u, cpus=%u",
step->step_id.job_id, step->step_id.step_id,
step->node_tasks, step->cpus_per_task,
step->cpus);
log_flag(CPU_FREQ, " cpu_bind_type=%4x, cpu_bind map=%s",
step->cpu_bind_type, step->cpu_bind);
log_flag(CPU_FREQ, " step logical cores = %s, step physical cores = %s",
step->step_alloc_cores, step_alloc_cores);
if (!cpu_freq_count)
return;
/* set entries in cpu frequency table for this step's cpus */
core_range = step_alloc_cores;
while ( (cpuidx = _cpu_freq_next_cpu(&core_range, &cpuidx,
&start, &end)) != USHRT_MAX) {
if (cpuidx >= cpu_freq_count) {
error("cpu_freq_validate: index %u exceeds cpu count %u",
cpuidx, cpu_freq_count);
return;
}
_cpu_freq_setup_data(step, cpuidx);
}
cpu_freq_set(step);
return;
}
/*
* get the next number in a range
* assumes range is well-formed, i.e., monotonically increasing,
* no leading/trailing punctuation, either comma separated or dash
* separated: e.g., "4-6,8,10,13-15"
*/
uint16_t
_cpu_freq_next_cpu(char **core_range, uint16_t *cpuidx,
uint16_t *start, uint16_t *end)
{
int i;
char *p;
p = *core_range;
if (*start == USHRT_MAX) {
if (*p == '\0')
return USHRT_MAX;
if (*p == ',')
p++;
i = 0;
while ( isdigit(*p) ) {
i = i*10 + (*p - '0');
p++;
}
*core_range = p;
*start = i;
return i;
}
if (*end == USHRT_MAX) {
switch (*p)
{
case '-' :
p++;
i = 0;
while ( isdigit(*p) ) {
i = i*10 + (*p - '0');
p++;
}
*core_range = p;
*end = i;
break;
case ',':
p++;
i = 0;
while ( isdigit(*p) ) {
i = i*10 + (*p - '0');
p++;
}
*start = i;
*end = USHRT_MAX;
*core_range = p;
return i;
case '\0' :
return USHRT_MAX;
}
}
i = *cpuidx;
if ( i < *end ) {
i++;
if ( i == *end) {
*start = USHRT_MAX;
*end = USHRT_MAX;
}
}
return i;
}
/*
* Find current governor on this cpu
*
* Return: SLURM_SUCCESS or SLURM_ERROR
*/
static int
_cpu_freq_get_cur_gov(int cpuidx)
{
FILE *fp = NULL;
char path[PATH_MAX], gov_value[LINE_LEN];
int j;
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_governor", cpuidx);
if ((fp = fopen(path, "r")) == NULL) {
error("%s: Could not open scaling_governor", __func__);
return SLURM_ERROR;
}
if (fgets(gov_value, LINE_LEN, fp) == NULL) {
error("%s: Could not read scaling_governor", __func__);
fclose(fp);
return SLURM_ERROR;
}
if (strlen(gov_value) >= GOV_NAME_LEN) {
error("%s: scaling_governor is to long", __func__);
fclose(fp);
return SLURM_ERROR;
}
strcpy(cpufreq[cpuidx].org_governor, gov_value);
fclose(fp);
j = strlen(cpufreq[cpuidx].org_governor);
if ((j > 0) && (cpufreq[cpuidx].org_governor[j - 1] == '\n'))
cpufreq[cpuidx].org_governor[j - 1] = '\0';
return SLURM_SUCCESS;
}
/*
* set cpu governor
*/
static int
_cpu_freq_set_gov(stepd_step_rec_t *step, int cpuidx, char *gov)
{
char path[PATH_MAX];
FILE *fp;
int fd, rc;
rc = SLURM_SUCCESS;
snprintf(path, sizeof(path), PATH_TO_CPU
"cpu%u/cpufreq/scaling_governor", cpuidx);
fd = _set_cpu_owner_lock(cpuidx, step->step_id.job_id);
if ((fp = fopen(path, "w"))) {
fputs(gov, fp);
fputc('\n', fp);
fclose(fp);
} else {
error("%s: Can not set CPU governor: %m", __func__);
rc = SLURM_ERROR;
}
if (fd >= 0) {
(void) fd_release_lock(fd);
(void) close(fd);
}
return rc;
}
/*
* get one of scalling_min_freq, scaling_max_freq, cpuinfo_cur_freq
*
* Return: value of scaling_min_freq, or 0 on error
*/
static uint32_t
_cpu_freq_get_scaling_freq(int cpuidx, char* option)
{
FILE *fp = NULL;
char path[PATH_MAX];
uint32_t freq;
/* get the value from 'option' */
snprintf(path, sizeof(path), PATH_TO_CPU
"cpu%u/cpufreq/%s", cpuidx, option);
if ( ( fp = fopen(path, "r") ) == NULL ) {
error("%s: Could not open %s", __func__, option);
return 0;
}
if (fscanf (fp, "%u", &freq) < 1) {
error("%s: Could not read %s", __func__, option);
fclose(fp);
return 0;
}
fclose(fp);
return freq;
}
/*
* test for existence of cpufreq file
*
* Return: true if file found
*/
static bool
_cpu_freq_test_scaling_freq(int cpuidx, char *option)
{
char path[PATH_MAX];
struct stat stat_buf;
/* get the value from 'option' */
snprintf(path, sizeof(path), PATH_TO_CPU
"cpu%u/cpufreq/%s", cpuidx, option);
if (stat(path, &stat_buf) == 0)
return true;
return false;
}
/*
* set one of scalling_min_freq, scaling_max_freq, scaling_setspeed
* -- assume governor already set to userspace ---
*
*/
static int
_cpu_freq_set_scaling_freq(stepd_step_rec_t *step, int cpx, uint32_t freq,
char* option)
{
char path[PATH_MAX];
FILE *fp;
int fd, rc;
uint32_t newfreq;
rc = SLURM_SUCCESS;
snprintf(path, sizeof(path), PATH_TO_CPU
"cpu%u/cpufreq/%s", cpx, option);
fd = _set_cpu_owner_lock(cpx, step->step_id.job_id);
if ((fp = fopen(path, "w"))) {
fprintf(fp, "%u\n", freq);
fclose(fp);
} else {
error("%s: Can not set %s: %m", __func__, option);
rc = SLURM_ERROR;
}
if (fd >= 0) {
(void) fd_release_lock(fd);
(void) close(fd);
}
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) {
newfreq = _cpu_freq_get_scaling_freq(cpx, option);
if (newfreq != freq) {
error("Failed to set freq_scaling %s to %u (org=%u)",
option, freq, newfreq);
}
}
return rc;
}
/*
* Get current state
*
* IN: cpuidx - cpu to query
* Return: SLURM_SUCCESS or SLURM_ERROR
*/
static int
_cpu_freq_current_state(int cpuidx)
{
static int freq_file = -1;
uint32_t freq;
if (cpufreq[cpuidx].org_set) {
/*
* The current state was already loaded for this cpu.
* Likely caused by stacked task plugins. Prevent
* overwriting the original values so they can be
* restored correctly after job completion.
*/
return SLURM_SUCCESS;
}
/*
* Getting 'previous' values using the 'scaling' values rather
* than the 'cpuinfo' values.
* The 'cpuinfo' values are read only. min/max seem to be raw
* hardware capability.
* The 'scaling' values are set by the governor.
* For the current frequency, use the cpuinfo_cur_freq file
* since the intel_pstate driver doesn't necessarily create
* the scaling_cur_freq file.
*/
if (freq_file == -1) {
if (_cpu_freq_test_scaling_freq(cpuidx, "cpuinfo_cur_freq"))
freq_file = 0;
else /* Use "scaling_cur_freq" */
freq_file = 1;
}
if (freq_file == 0)
freq = _cpu_freq_get_scaling_freq(cpuidx, "cpuinfo_cur_freq");
else
freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_cur_freq");
if (freq == 0)
return SLURM_ERROR;
cpufreq[cpuidx].org_frequency = freq;
freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_min_freq");
if (freq == 0)
return SLURM_ERROR;
cpufreq[cpuidx].org_min_freq = freq;
freq = _cpu_freq_get_scaling_freq(cpuidx, "scaling_max_freq");
if (freq == 0)
return SLURM_ERROR;
cpufreq[cpuidx].org_max_freq = freq;
if (_cpu_freq_get_cur_gov(cpuidx) == SLURM_SUCCESS) {
cpufreq[cpuidx].org_set = true;
return SLURM_SUCCESS;
} else {
return SLURM_ERROR;
}
}
/*
* Copy string representation of a governor into cpufreq structure for a cpu.
*/
static int
_cpu_freq_govspec_string(uint32_t cpu_freq, int cpuidx)
{
if ((cpu_freq & CPU_FREQ_RANGE_FLAG) == 0)
return SLURM_ERROR;
switch(cpu_freq)
{
case CPU_FREQ_CONSERVATIVE:
if (cpufreq[cpuidx].avail_governors & GOV_CONSERVATIVE)
strcpy(cpufreq[cpuidx].new_governor, "conservative");
return SLURM_SUCCESS;
case CPU_FREQ_ONDEMAND:
if (cpufreq[cpuidx].avail_governors & GOV_ONDEMAND)
strcpy(cpufreq[cpuidx].new_governor,"ondemand");
return SLURM_SUCCESS;
case CPU_FREQ_PERFORMANCE:
if (cpufreq[cpuidx].avail_governors & GOV_PERFORMANCE)
strcpy(cpufreq[cpuidx].new_governor, "performance");
return SLURM_SUCCESS;
case CPU_FREQ_POWERSAVE:
if (cpufreq[cpuidx].avail_governors & GOV_POWERSAVE)
strcpy(cpufreq[cpuidx].new_governor, "powersave");
return SLURM_SUCCESS;
case CPU_FREQ_USERSPACE:
if (cpufreq[cpuidx].avail_governors & GOV_USERSPACE)
strcpy(cpufreq[cpuidx].new_governor, "userspace");
return SLURM_SUCCESS;
case CPU_FREQ_SCHEDUTIL:
if (cpufreq[cpuidx].avail_governors & GOV_SCHEDUTIL)
strcpy(cpufreq[cpuidx].new_governor, "schedutil");
return SLURM_SUCCESS;
default:
return SLURM_ERROR;
}
}
/*
* Convert frequency_spec into an actual frequency
* Returns -- frequency from avail frequency list, or NO_VAL
*/
uint32_t
_cpu_freq_freqspec_num(uint32_t cpu_freq, int cpuidx)
{
int fx, j;
if (!cpufreq || !cpufreq[cpuidx].nfreq)
return NO_VAL;
/* assume the frequency list is in ascending order */
if (cpu_freq & CPU_FREQ_RANGE_FLAG) { /* Named values */
switch(cpu_freq)
{
case CPU_FREQ_LOW :
return cpufreq[cpuidx].avail_freq[0];
case CPU_FREQ_MEDIUM :
if (cpufreq[cpuidx].nfreq == 1)
return cpufreq[cpuidx].avail_freq[0];
fx = (cpufreq[cpuidx].nfreq - 1) / 2;
return cpufreq[cpuidx].avail_freq[fx];
case CPU_FREQ_HIGHM1 :
if (cpufreq[cpuidx].nfreq == 1)
return cpufreq[cpuidx].avail_freq[0];
fx = cpufreq[cpuidx].nfreq - 2;
return cpufreq[cpuidx].avail_freq[fx];
case CPU_FREQ_HIGH :
fx = cpufreq[cpuidx].nfreq - 1;
return cpufreq[cpuidx].avail_freq[fx];
default:
return NO_VAL;
}
}
/* check for request above or below available values */
if (cpu_freq < cpufreq[cpuidx].avail_freq[0]) {
error("Rounding requested frequency %d "
"up to lowest available %d", cpu_freq,
cpufreq[cpuidx].avail_freq[0]);
return cpufreq[cpuidx].avail_freq[0];
} else if (cpufreq[cpuidx].avail_freq[cpufreq[cpuidx].nfreq - 1]
< cpu_freq) {
error("Rounding requested frequency %d "
"down to highest available %d", cpu_freq,
cpufreq[cpuidx].avail_freq[cpufreq[cpuidx].nfreq - 1]);
return cpufreq[cpuidx].avail_freq[cpufreq[cpuidx].nfreq - 1];
}
/* check for frequency, round up if no exact match */
for (j = 0; j < cpufreq[cpuidx].nfreq; ) {
if (cpu_freq == cpufreq[cpuidx].avail_freq[j]) {
return cpufreq[cpuidx].avail_freq[j];
}
j++; /* step up to next element to round up *
* safe to advance due to bounds checks above here */
if (cpu_freq < cpufreq[cpuidx].avail_freq[j]) {
info("Rounding requested frequency %d "
"up to next available %d", cpu_freq,
cpufreq[cpuidx].avail_freq[j]);
return cpufreq[cpuidx].avail_freq[j];
}
}
/* loop above must return due to previous bounds checks
* but return NO_VAL here anyways to silence compiler warnings */
return NO_VAL;
}
/*
* Initialize data structure
*/
static void
_cpu_freq_init_data(int cpx)
{
/* avail_governors -- set at initialization */
cpufreq[cpx].org_governor[0] = '\0';
cpufreq[cpx].new_governor[0] = '\0';
cpufreq[cpx].org_frequency = NO_VAL;
cpufreq[cpx].new_frequency = NO_VAL;
cpufreq[cpx].org_min_freq = NO_VAL;
cpufreq[cpx].new_min_freq = NO_VAL;
cpufreq[cpx].org_max_freq = NO_VAL;
cpufreq[cpx].new_max_freq = NO_VAL;
cpufreq[cpx].org_set = false;
}
/*
* Set either current frequency (speed)
* Or min/max governor base on --cpu-freq parameter
*/
static void
_cpu_freq_setup_data(stepd_step_rec_t *step, int cpx)
{
uint32_t freq;
/* If no --cpu-freq, use default governor from conf file. */
if (step->cpu_freq_gov == NO_VAL)
step->cpu_freq_gov = slurm_conf.cpu_freq_def;
if (step->cpu_freq_gov == NO_VAL)
return;
/* Get current state */
if (_cpu_freq_current_state(cpx) == SLURM_ERROR)
return;
if (step->cpu_freq_min == NO_VAL &&
step->cpu_freq_max != NO_VAL &&
step->cpu_freq_gov == NO_VAL) {
/* Pre version 15.08 behavior */
freq = _cpu_freq_freqspec_num(step->cpu_freq_max, cpx);
cpufreq[cpx].new_frequency = freq;
goto newfreq;
}
if (step->cpu_freq_gov == CPU_FREQ_USERSPACE) {
_cpu_freq_govspec_string(step->cpu_freq_gov, cpx);
if (step->cpu_freq_max == NO_VAL) {
return; /* pre version 15.08 behavior. */
}
/* Power capping */
freq = _cpu_freq_freqspec_num(step->cpu_freq_max, cpx);
cpufreq[cpx].new_frequency = freq;
cpufreq[cpx].new_min_freq = freq;
cpufreq[cpx].new_max_freq = freq;
return;
}
if (step->cpu_freq_min != NO_VAL && step->cpu_freq_max != NO_VAL) {
freq = _cpu_freq_freqspec_num(step->cpu_freq_min, cpx);
cpufreq[cpx].new_min_freq = freq;
freq = _cpu_freq_freqspec_num(step->cpu_freq_max, cpx);
cpufreq[cpx].new_max_freq = freq;
}
if (step->cpu_freq_gov != NO_VAL) {
_cpu_freq_govspec_string(step->cpu_freq_gov, cpx);
}
newfreq:
/* Make sure a 'new' frequency is within scaling min/max */
if (cpufreq[cpx].new_frequency != NO_VAL) {
if (cpufreq[cpx].new_frequency < cpufreq[cpx].org_min_freq) {
cpufreq[cpx].new_min_freq = cpufreq[cpx].new_frequency;
}
if (cpufreq[cpx].new_frequency > cpufreq[cpx].org_max_freq) {
cpufreq[cpx].new_max_freq = cpufreq[cpx].new_frequency;
}
}
}
/*
* check an argument against valid governors.
*
* Input: - arg - string value of governor
* - illegal - combination of enums for governors not allowed.
* Returns - enum of governor found
* - or 0 if not found
*/
static uint32_t
_cpu_freq_check_gov(const char* arg, uint32_t illegal)
{
uint32_t rc = 0;
if (xstrncasecmp(arg, "co", 2) == 0) {
rc = CPU_FREQ_CONSERVATIVE;
} else if (xstrncasecmp(arg, "perf", 4) == 0) {
rc = CPU_FREQ_PERFORMANCE;
} else if (xstrncasecmp(arg, "pow", 3) == 0) {
rc = CPU_FREQ_POWERSAVE;
} else if (xstrncasecmp(arg, "user", 4) == 0) {
rc = CPU_FREQ_USERSPACE;
} else if (xstrncasecmp(arg, "onde", 4) == 0) {
rc = CPU_FREQ_ONDEMAND;
} else if (xstrncasecmp(arg, "sche", 4) == 0) {
rc = CPU_FREQ_SCHEDUTIL;
}
rc &= (~illegal);
if (rc == 0)
return 0;
return (rc | CPU_FREQ_RANGE_FLAG);
}
/*
* check an argument for a frequency or frequency synonym.
*
* Input: - arg - string value of frequency
*
* Returns - frequency
* - enum for synonym
* 0 on error.
*/
static uint32_t
_cpu_freq_check_freq(const char* arg)
{
char *end;
uint32_t frequency;
if (xstrncasecmp(arg, "lo", 2) == 0) {
return CPU_FREQ_LOW;
} else if (xstrncasecmp(arg, "him1", 4) == 0 ||
xstrncasecmp(arg, "highm1", 6) == 0) {
return CPU_FREQ_HIGHM1;
} else if (xstrncasecmp(arg, "hi", 2) == 0) {
return CPU_FREQ_HIGH;
} else if (xstrncasecmp(arg, "med", 3) == 0) {
return CPU_FREQ_MEDIUM;
}
frequency = strtoul(arg, &end, 10);
if ((*end != '\0') ||
((frequency == 0) && (errno == EINVAL))) {
error("unrecognized --cpu-freq argument \"%s\"", arg);
return 0;
}
return frequency;
}
/*
* set cpu frequency if possible for each cpu of the job step
*/
extern void
cpu_freq_set(stepd_step_rec_t *step)
{
char freq_detail[100];
uint32_t freq;
int i, rc;
if ((!cpu_freq_count) || (!cpufreq))
return;
for (i = 0; i < cpu_freq_count; i++) {
if (cpufreq[i].new_frequency == NO_VAL
&& cpufreq[i].new_min_freq == NO_VAL
&& cpufreq[i].new_max_freq == NO_VAL
&& cpufreq[i].new_governor[0] == '\0')
continue; /* Nothing to set on this CPU */
log_flag(CPU_FREQ, "cpu_freq: current_state cpu=%d org_min=%u org_freq=%u org_max=%u org_gpv=%s",
i, cpufreq[i].org_min_freq, cpufreq[i].org_frequency,
cpufreq[i].org_max_freq, cpufreq[i].org_governor);
/* Max must be set before min, per
* www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt
*/
if (cpufreq[i].new_max_freq != NO_VAL ) {
freq = cpufreq[i].new_max_freq;
if (cpufreq[i].org_frequency > freq) {
/* The current frequency is > requested max,
* Set it so it is in range
* have to go to UserSpace to do it. */
rc = _cpu_freq_set_gov(step, i, "userspace");
if (rc == SLURM_ERROR)
return;
rc = _cpu_freq_set_scaling_freq(step, i, freq,
"scaling_setspeed");
if (rc == SLURM_ERROR)
continue;
if (cpufreq[i].new_governor[0] == '\0') {
/* Not requesting new gov, so restore */
rc = _cpu_freq_set_gov(step, i,
cpufreq[i].org_governor);
if (rc == SLURM_ERROR)
continue;
}
}
rc = _cpu_freq_set_scaling_freq(step, i, freq,
"scaling_max_freq");
if (rc == SLURM_ERROR)
continue;
}
if (cpufreq[i].new_min_freq != NO_VAL) {
freq = cpufreq[i].new_min_freq;
if (cpufreq[i].org_frequency < freq) {
/* The current frequency is < requested min,
* Set it so it is in range
* have to go to UserSpace to do it. */
rc = _cpu_freq_set_gov(step, i, "userspace");
if (rc == SLURM_ERROR)
continue;
rc = _cpu_freq_set_scaling_freq(step, i, freq,
"scaling_setspeed");
if (rc == SLURM_ERROR)
continue;
if (cpufreq[i].new_governor[0] == '\0') {
/* Not requesting new gov, so restore */
rc= _cpu_freq_set_gov(step, i,
cpufreq[i].org_governor);
if (rc == SLURM_ERROR)
continue;
}
}
rc= _cpu_freq_set_scaling_freq(step, i, freq,
"scaling_min_freq");
if (rc == SLURM_ERROR)
continue;
}
if (cpufreq[i].new_frequency != NO_VAL) {
if (xstrcmp(cpufreq[i].org_governor,"userspace")) {
rc = _cpu_freq_set_gov(step, i, "userspace");
if (rc == SLURM_ERROR)
continue;
}
rc = _cpu_freq_set_scaling_freq(step, i,
cpufreq[i].new_frequency,
"scaling_setspeed");
if (rc == SLURM_ERROR)
continue;
}
if (cpufreq[i].new_governor[0] != '\0') {
rc = _cpu_freq_set_gov(step, i, cpufreq[i].new_governor);
if (rc == SLURM_ERROR)
continue;
}
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) {
cpu_freq_debug(NULL, NULL,
freq_detail, sizeof(freq_detail),
NO_VAL, cpufreq[i].new_min_freq,
cpufreq[i].new_max_freq,
cpufreq[i].new_frequency);
if (cpufreq[i].new_governor[0] != '\0') {
info("cpu_freq: set cpu=%d %s Governor=%s",
i, freq_detail, cpufreq[i].new_governor);
} else {
info("cpu_freq: reset cpu=%d %s", i,
freq_detail);
}
}
}
}
/*
* reset the cpus used by the process to their
* default frequency and governor type
*/
extern void
cpu_freq_reset(stepd_step_rec_t *step)
{
int i, rc, fd;
char freq_detail[100];
if ((!cpu_freq_count) || (!cpufreq))
return;
for (i = 0; i < cpu_freq_count; i++) {
if (cpufreq[i].new_frequency == NO_VAL
&& cpufreq[i].new_min_freq == NO_VAL
&& cpufreq[i].new_max_freq == NO_VAL
&& cpufreq[i].new_governor[0] == '\0')
continue; /* Nothing to reset on this CPU */
fd = _test_cpu_owner_lock(i, step->step_id.job_id);
if (fd < 0)
continue;
if (cpufreq[i].new_frequency != NO_VAL) {
rc = _cpu_freq_set_gov(step, i, "userspace");
if (rc == SLURM_ERROR)
continue;
rc = _cpu_freq_set_scaling_freq(step, i,
cpufreq[i].org_frequency,
"scaling_setspeed");
if (rc == SLURM_ERROR)
continue;
cpufreq[i].new_governor[0] = 'u'; /* force gov reset */
}
/* Max must be set before min, per
* www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt
*/
if (cpufreq[i].new_max_freq != NO_VAL) {
rc = _cpu_freq_set_scaling_freq(step, i,
cpufreq[i].org_max_freq,
"scaling_max_freq");
if (rc == SLURM_ERROR)
continue;
}
if (cpufreq[i].new_min_freq != NO_VAL) {
rc = _cpu_freq_set_scaling_freq(step, i,
cpufreq[i].org_min_freq,
"scaling_min_freq");
if (rc == SLURM_ERROR)
continue;
}
if (cpufreq[i].new_governor[0] != '\0') {
rc = _cpu_freq_set_gov(
step, i, cpufreq[i].org_governor);
if (rc == SLURM_ERROR)
continue;
}
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) {
cpu_freq_debug(NULL, NULL,
freq_detail, sizeof(freq_detail),
NO_VAL, cpufreq[i].org_min_freq,
cpufreq[i].org_max_freq,
cpufreq[i].org_frequency);
if (cpufreq[i].new_governor[0] != '\0') {
info("cpu_freq: reset cpu=%d %s Governor=%s",
i, freq_detail, cpufreq[i].org_governor);
} else {
info("cpu_freq: reset cpu=%d %s", i,
freq_detail);
}
}
}
}
/* Convert a cpu_freq number to its equivalent string */
extern void
cpu_freq_to_string(char *buf, int buf_size, uint32_t cpu_freq)
{
if (cpu_freq == CPU_FREQ_LOW)
snprintf(buf, buf_size, "Low");
else if (cpu_freq == CPU_FREQ_MEDIUM)
snprintf(buf, buf_size, "Medium");
else if (cpu_freq == CPU_FREQ_HIGHM1)
snprintf(buf, buf_size, "Highm1");
else if (cpu_freq == CPU_FREQ_HIGH)
snprintf(buf, buf_size, "High");
else if (cpu_freq == CPU_FREQ_CONSERVATIVE)
snprintf(buf, buf_size, "Conservative");
else if (cpu_freq == CPU_FREQ_PERFORMANCE)
snprintf(buf, buf_size, "Performance");
else if (cpu_freq == CPU_FREQ_POWERSAVE)
snprintf(buf, buf_size, "PowerSave");
else if (cpu_freq == CPU_FREQ_USERSPACE)
snprintf(buf, buf_size, "UserSpace");
else if (cpu_freq == CPU_FREQ_ONDEMAND)
snprintf(buf, buf_size, "OnDemand");
else if (cpu_freq == CPU_FREQ_SCHEDUTIL)
snprintf(buf, buf_size, "SchedUtil");
else if (cpu_freq & CPU_FREQ_RANGE_FLAG)
snprintf(buf, buf_size, "Unknown");
else if (fuzzy_equal(cpu_freq, NO_VAL)) {
if (buf_size > 0)
buf[0] = '\0';
} else
convert_num_unit2((double)cpu_freq, buf, buf_size,
UNIT_KILO, NO_VAL, 1000, 0);
}
extern char *cpu_freq_to_cmdline(uint32_t min, uint32_t max, uint32_t gov)
{
char bfgov[32], bfmin[32], bfmax[32];
char *bfall = NULL;
bfgov[0] = '\0';
bfmin[0] = '\0';
bfmax[0] = '\0';
/*
* Default value from command line is NO_VAL,
* Default value from slurmstepd for batch jobs is 0
* Convert slurmstepd values to command line ones.
*/
if (min == 0)
min = NO_VAL;
if (max == 0)
max = NO_VAL;
if (gov == 0)
gov = NO_VAL;
if ((min == NO_VAL) && (max == NO_VAL) && (gov == NO_VAL))
return NULL;
if (min != NO_VAL) {
if (min & CPU_FREQ_RANGE_FLAG) {
cpu_freq_to_string(bfmin, sizeof(bfmin), min);
} else {
snprintf(bfmin, 32, "%u", min);
}
}
if (max != NO_VAL) {
if (max & CPU_FREQ_RANGE_FLAG) {
cpu_freq_to_string(bfmax, sizeof(bfmax), max);
} else {
snprintf(bfmax, 32, "%u", max);
}
}
if (gov != NO_VAL) {
cpu_freq_to_string(bfgov, sizeof(bfgov), gov);
}
if ((min != NO_VAL) && (max != NO_VAL) && (gov != NO_VAL)) {
xstrfmtcat(bfall, "%s-%s:%s", bfmin, bfmax, bfgov);
} else if ((min != NO_VAL) && (max != NO_VAL)) {
xstrfmtcat(bfall, "%s-%s", bfmin, bfmax);
} else if (max != NO_VAL) {
xstrcat(bfall, bfmax);
} else if (gov != NO_VAL) {
xstrcat(bfall, bfgov);
}
return bfall;
}
/*
* Set environment variables associated with the frequency variables.
*/
extern int cpu_freq_set_env(char *var, uint32_t min, uint32_t max,
uint32_t gov)
{
char *bfall = cpu_freq_to_cmdline(min, max, gov);
if (bfall && setenvf(NULL, var, "%s", bfall)) {
xfree(bfall);
error("Unable to set %s", var);
return SLURM_ERROR;
}
xfree(bfall);
return SLURM_SUCCESS;
}
/* Convert a composite cpu governor enum to its equivalent string
*
* Input: - buf - buffer to contain string
* - bufsz - size of buffer
* - gpvs - composite enum of governors
*/
extern void
cpu_freq_govlist_to_string(char* buf, uint16_t bufsz, uint32_t govs)
{
char *list = NULL;
char *sep = "", *pos = NULL;
if ((govs & CPU_FREQ_CONSERVATIVE) == CPU_FREQ_CONSERVATIVE) {
xstrfmtcatat(list, &pos, "%s%s", sep, "Conservative");
sep = ",";
}
if ((govs & CPU_FREQ_ONDEMAND) == CPU_FREQ_ONDEMAND) {
xstrfmtcatat(list, &pos, "%s%s", sep, "OnDemand");
sep = ",";
}
if ((govs & CPU_FREQ_PERFORMANCE) == CPU_FREQ_PERFORMANCE) {
xstrfmtcatat(list, &pos, "%s%s", sep, "Performance");
sep = ",";
}
if ((govs & CPU_FREQ_POWERSAVE) == CPU_FREQ_POWERSAVE) {
xstrfmtcatat(list, &pos, "%s%s", sep, "PowerSave");
sep = ",";
}
if ((govs & CPU_FREQ_SCHEDUTIL) == CPU_FREQ_SCHEDUTIL) {
xstrfmtcatat(list, &pos, "%s%s", sep, "SchedUtil");
sep = ",";
}
if ((govs & CPU_FREQ_USERSPACE) == CPU_FREQ_USERSPACE) {
xstrfmtcatat(list, &pos, "%s%s", sep, "UserSpace");
sep = ",";
}
if (list) {
strlcpy(buf, list, bufsz);
xfree(list);
} else {
strlcpy(buf, "No Governors defined", bufsz);
}
}
/*
* Verify slurm.conf CpuFreqDef option
*
* Input: - arg - governor/frequency value to check:
* valid governor, low, medium, highm1, high,
* or numeric frequency
* - freq - pointer to corresponding enum or numeric value
* Returns - -1 on error, else 0
*/
extern int
cpu_freq_verify_def(const char *arg, uint32_t *freq)
{
uint32_t cpufreq = 0;
cpufreq = _cpu_freq_check_gov(arg, CPU_FREQ_USERSPACE);
if (cpufreq) {
debug3("cpu_freq_verify_def: %s set", arg);
*freq = cpufreq;
return 0;
}
error("%s: CpuFreqDef=%s invalid", __func__, arg);
return -1;
}
/*
* Verify slurm.conf CpuFreqGovernors list
*
* Input: - arg - string list of governors
* - govs - pointer to composite of enum for each governor in list
* Returns - -1 on error, else 0
*/
extern int
cpu_freq_verify_govlist(const char *arg, uint32_t *govs)
{
char *list, *gov, *savestr = NULL;
uint32_t agov;
*govs = 0;
if (arg == NULL) {
error("cpu_freq_verify_govlist: governor list is empty");
return -1;
}
list = xstrdup(arg);
if ( (gov = strtok_r(list, ",", &savestr) ) == NULL) {
error("cpu_freq_verify_govlist: governor list '%s' invalid",
arg);
return -1;
}
do {
debug3("cpu_freq_verify_govlist: gov = %s", gov);
agov = _cpu_freq_check_gov(gov, 0);
if (agov == 0) {
error("cpu_freq_verify_govlist: governor '%s' invalid",
gov);
return -1;
}
*govs |= agov;
} while ( (gov = strtok_r(NULL, ",", &savestr) ) != NULL);
xfree(list);
return 0;
}
/*
* Verify cpu_freq command line option
*
* --cpu-freq=arg
* where arg is p1[-p2][:p3]
*
* - p1 can be [#### | low | medium | high | highm1]
* which will set the current frequency, and set the governor to
* UserSpace.
* - p1 can be [Conservative | OnDemand | Performance | PowerSave | UserSpace]
* which will set the governor to the corresponding value.
* - When p2 is present, p1 will be the minimum frequency and p2 will be
* the maximum. The governor cannot be UserSpace, so CpuFreqDef must be set in
* slurm.conf if there's no p3.
* - p2 can be [#### | medium | high | highm1] p2 must be greater than p1.
* - If the current frequency is < min, it will be set to min.
* Likewise, if the current frequency is > max, it will be set to max.
* - p3 can be [Conservative | OnDemand | Performance | PowerSave | UserSpace]
* which will set the governor to the corresponding value.
* When p3 is UserSpace, p2 must be empty.
* p2 will have been set by PowerCapping.
*
* returns -1 on error, 0 otherwise
*/
extern int
cpu_freq_verify_cmdline(const char *arg,
uint32_t *cpu_freq_min,
uint32_t *cpu_freq_max,
uint32_t *cpu_freq_gov)
{
char *poscolon, *posdash;
char *p1=NULL, *p2=NULL, *p3=NULL;
uint32_t frequency;
int rc = 0;
if (arg == NULL || cpu_freq_min == NULL || cpu_freq_max == NULL
|| cpu_freq_gov == NULL) {
return -1;
}
*cpu_freq_min = NO_VAL;
*cpu_freq_max = NO_VAL;
*cpu_freq_gov = NO_VAL;
poscolon = strchr(arg,':');
if (poscolon) {
p3 = xstrdup((poscolon+1));
}
posdash = strchr(arg,'-');
if (posdash) {
p1 = xstrndup(arg, (posdash-arg));
if (poscolon) {
p2 = xstrndup((posdash+1), ((poscolon-posdash)-1));
} else {
p2 = xstrdup((posdash+1));
}
} else {
if (poscolon) {
p1 = xstrndup(arg, (poscolon-arg));
} else {
p1 = xstrdup(arg);
}
}
frequency = _cpu_freq_check_gov(p1, 0);
if (frequency != 0) {
if (p3) {
error("governor cannot be specified twice "
"%s{-}:%s in --cpu-freq", p1, p3);
rc = -1;
goto clean;
}
*cpu_freq_gov = frequency;
} else {
frequency = _cpu_freq_check_freq(p1);
if (frequency == 0) {
rc = -1;
goto clean;
}
*cpu_freq_max = frequency;
}
if (p2) {
if (!p3 && (slurm_conf.cpu_freq_def == NO_VAL)) {
/*
* If the user specified a range without a governor,
* (even if userspace is not set), we won't accept the
* request. We don't know how the cpus are set and we
* won't decide which one to set for the user. Note that
* a range is valid for multiple governors.
*/
error("You must explicitly choose a governor when defining a range. Please specify only one value for the desired frequency (p1) or choose a specific governor (p3).");
rc = -1;
goto clean;
}
frequency = _cpu_freq_check_freq(p2);
if (frequency == 0) {
rc = -1;
goto clean;
}
*cpu_freq_min = *cpu_freq_max;
*cpu_freq_max = frequency;
if (*cpu_freq_max < *cpu_freq_min) {
error("min cpu-frec (%s) must be < max cpu-freq (%s)",
p1, p2);
rc = -1;
goto clean;
}
}
if (p3) {
frequency = _cpu_freq_check_gov(p3, 0);
if (frequency == 0) {
error("illegal governor: %s on --cpu-freq", p3);
rc = -1;
goto clean;
}
if (!p2) {
if (frequency != CPU_FREQ_USERSPACE) {
error("gov on cpu-frec (%s) illegal without max",
p3);
rc = -1;
goto clean;
}
} else {
if (frequency == CPU_FREQ_USERSPACE) {
error("%s governor does not support a range. Please specify only one value for the desired frequency (p1) or choose a different governor.",
p3);
rc = -1;
goto clean;
}
}
*cpu_freq_gov = frequency;
} else if (p2 && (*cpu_freq_gov == NO_VAL) &&
(slurm_conf.cpu_freq_def != NO_VAL)) {
/*
* No governor specified and a range is specified.
* Use slurm.conf CpuFreqDef if defined. Note that this cannot
* be UserSpace.
*/
*cpu_freq_gov = slurm_conf.cpu_freq_def;
}
/* Also force this in case we specify just one frequency. */
if ((*cpu_freq_gov == NO_VAL) && !p2 && !p3)
*cpu_freq_gov = CPU_FREQ_USERSPACE;
clean:
if (*cpu_freq_gov != NO_VAL) {
if (((*cpu_freq_gov & slurm_conf.cpu_freq_govs)
& ~CPU_FREQ_RANGE_FLAG) == 0) {
error("governor of %s is not allowed in slurm.conf",
arg);
*cpu_freq_gov = NO_VAL;
rc = -1;
}
}
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_FREQ) {
cpu_freq_debug("command", "NO_VAL", NULL, 0,
*cpu_freq_gov, *cpu_freq_min,
*cpu_freq_max, NO_VAL);
}
xfree(p1);
xfree(p2);
xfree(p3);
return rc;
}
/*
* Convert frequency parameters to strings
* Typically called to produce string for a log or reporting utility.
*
* When label!=NULL, info message is put to log. This is convenient for
* inserting debug calls to verify values in structures or messages.
* noval_str==NULL allows missing parameters not to be reported.
* freq_str is a buffer to hold the composite string for all input values.
* freq_len is length of freq_str
* gov is a governor value
* min is a minimum value
* max is a maximum value
* freq is a (current) frequency value.
*
* Returns 0 if all parameters are NO_VAL (or 0)
*/
extern int
cpu_freq_debug(char* label, char* noval_str, char* freq_str, int freq_len,
uint32_t gov, uint32_t min, uint32_t max, uint32_t freq)
{
int rc = 0;
char bfgov[64], bfmin[32], bfmax[32], bffreq[32];
char *sep1 = " ", *sep2 = " ", *sep3 = " ";
bfgov[0] = '\0';
bfmin[0] = '\0';
bfmax[0] = '\0';
bffreq[0] = '\0';
if (freq != NO_VAL && freq != 0) {
rc = 1;
sprintf(bffreq, "cur_freq=%u", freq);
} else {
sep1 = "";
}
if ((min != NO_VAL) && (min != 0)) {
rc = 1;
if (min & CPU_FREQ_RANGE_FLAG) {
strcpy(bfmin, "CPU_min_freq=");
cpu_freq_to_string(&bfmin[13], (sizeof(bfmin)-13), min);
} else {
sprintf(bfmin, "CPU_min_freq=%u", min);
}
} else if (noval_str) {
if (strlen(noval_str) >= sizeof(bfmin)) {
error("%s: minimum CPU frequency string too large",
__func__);
} else {
strlcpy(bfmin, noval_str, sizeof(bfmin));
}
} else {
sep2 = "";
}
if ((max != NO_VAL) && (max != 0)) {
rc = 1;
if (max & CPU_FREQ_RANGE_FLAG) {
strcpy(bfmax, "CPU_max_freq=");
cpu_freq_to_string(&bfmax[13], (sizeof(bfmax)-13), max);
} else {
sprintf(bfmax, "CPU_max_freq=%u", max);
}
} else if (noval_str) {
if (strlen(noval_str) >= sizeof(bfmax)) {
error("%s: maximum CPU frequency string too large",
__func__);
} else {
strlcpy(bfmax, noval_str, sizeof(bfmax));
}
} else {
sep3 = "";
}
if ((gov != NO_VAL) && (gov != 0)) {
rc = 1;
strcpy(bfgov, "Governor=");
cpu_freq_to_string(&bfgov[9], (sizeof(bfgov)-9), gov);
} else if (noval_str) {
if (strlen(noval_str) >= sizeof(bfgov)) {
error("%s: max CPU governor string too large",
__func__);
} else {
strlcpy(bfgov, noval_str, sizeof(bfgov));
}
}
if (rc) {
if (freq_str) {
snprintf(freq_str, freq_len, "%s%s%s%s%s%s%s",
bffreq, sep1, bfmin, sep2, bfmax, sep3, bfgov);
}
} else {
if (freq_str)
freq_str[0] = '\0';
}
if (label) {
info("cpu-freq: %s :: %s%s%s%s%s%s%s", label,
bffreq, sep1, bfmin, sep2, bfmax, sep3, bfgov);
}
return rc;
}