blob: 28744c58187242ba5b2d8c7f100f931f98656df4 [file] [log] [blame]
/*****************************************************************************\
* cpu_frequency.c - support for srun option --cpu-freq=<frequency>
*****************************************************************************
* Copyright (C) 2012 Bull
* Written by Don Albert, <don.albert@bull.com>
*
* This file is part of SLURM, a resource management program.
* For details, see <http://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#if HAVE_CONFIG_H
#include "config.h"
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <stdlib.h>
#include <limits.h>
#include "slurm/slurm.h"
#include "src/common/xcpuinfo.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/cpu_frequency.h"
#define PATH_TO_CPU "/sys/devices/system/cpu/"
#define LINE_LEN 100
#define SYSFS_PATH_MAX 255
#define FREQ_LIST_MAX 16
#define GOV_NAME_LEN 24
static uint16_t cpu_freq_count = 0;
static struct cpu_freq_data {
uint32_t frequency_to_set;
uint32_t reset_frequency;
char reset_governor[GOV_NAME_LEN];
} * cpufreq = NULL;
static void _cpu_freq_find_valid(uint32_t cpu_freq, int cpuidx);
static uint16_t _cpu_freq_next_cpu(char **core_range, uint16_t *cpuidx,
uint16_t *start, uint16_t *end);
/*
* called to check if the node supports setting cpu frequency
* if so, initialize fields in cpu_freq_data structure
*/
extern void
cpu_freq_init(slurmd_conf_t *conf)
{
char path[SYSFS_PATH_MAX];
struct stat statbuf;
FILE *fp;
char value[LINE_LEN];
unsigned int i, j;
/* check for cpufreq support */
if ( stat(PATH_TO_CPU "cpu0/cpufreq", &statbuf) != 0 ) {
info("CPU frequency setting not configured for this node");
return;
}
if (!S_ISDIR(statbuf.st_mode)) {
error(PATH_TO_CPU "cpu0/cpufreq not a directory");
return;
}
/* get the cpu frequency info into the cpu_freq_data structure */
cpu_freq_count = conf->block_map_size;
if (!cpufreq) {
cpufreq = (struct cpu_freq_data *)
xmalloc(cpu_freq_count *
sizeof(struct cpu_freq_data));
}
info("Gathering cpu frequency information for %u cpus", cpu_freq_count);
for (i = 0; i < cpu_freq_count; i++) {
cpufreq[i].frequency_to_set = 0;
cpufreq[i].reset_frequency = 0;
snprintf(path, sizeof(path),
PATH_TO_CPU
"cpu%u/cpufreq/scaling_available_governors", i);
if ( ( fp = fopen(path, "r") ) == NULL )
continue;
if (fgets(value, LINE_LEN, fp) == NULL) {
fclose(fp);
continue;
}
if (strstr(value, "userspace") == NULL) {
fclose(fp);
continue;
}
fclose(fp);
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_governor", i);
if ( ( fp = fopen(path, "r") ) == NULL )
continue;
if (fgets(value, LINE_LEN, fp) == NULL) {
fclose(fp);
continue;
}
if (strlen(value) >= GOV_NAME_LEN) {
fclose(fp);
continue;
}
strcpy(cpufreq[i].reset_governor, value);
fclose(fp);
j = strlen(cpufreq[i].reset_governor);
if ((j > 0) && (cpufreq[i].reset_governor[j - 1] == '\n'))
cpufreq[i].reset_governor[j - 1] = '\0';
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_min_freq", i);
if ( ( fp = fopen(path, "r") ) == NULL )
continue;
if (fscanf (fp, "%u", &cpufreq[i].reset_frequency) < 0) {
error("cpu_freq_cgroup_valid: Could not read "
"scaling_min_freq");
}
fclose(fp);
debug("cpu_freq_init: cpu %u, reset freq: %u, "
"reset governor: %s",
i,cpufreq[i].reset_frequency,cpufreq[i].reset_governor);
}
return;
}
extern void
cpu_freq_fini(void)
{
xfree(cpufreq);
}
/*
* Send the cpu_frequency table info to slurmstepd
*/
void
cpu_freq_send_info(int fd)
{
if (cpu_freq_count) {
safe_write(fd, &cpu_freq_count, sizeof(uint16_t));
safe_write(fd, cpufreq,
(cpu_freq_count * sizeof(struct cpu_freq_data)));
} else {
safe_write(fd, &cpu_freq_count, sizeof(uint16_t));
}
return;
rwfail:
error("Unable to send cpu frequency information for %u cpus",
cpu_freq_count);
return;
}
/*
* Receive the cpu_frequency table info from slurmd
*/
void
cpu_freq_recv_info(int fd)
{
safe_read(fd, &cpu_freq_count, sizeof(uint16_t));
if (cpu_freq_count) {
if (!cpufreq) {
cpufreq = (struct cpu_freq_data *)
xmalloc(cpu_freq_count *
sizeof(struct cpu_freq_data));
}
safe_read(fd, cpufreq,
(cpu_freq_count * sizeof(struct cpu_freq_data)));
info("Received cpu frequency information for %u cpus",
cpu_freq_count);
}
return;
rwfail:
error("Unable to recv cpu frequency information for %u cpus",
cpu_freq_count);
cpu_freq_count = 0;
return;
}
/*
* Validate the cpus and select the frequency to set
* Called from task cpuset code with task launch request containing
* a pointer to a hex map string of the cpus to be used by this step
*/
void
cpu_freq_cpuset_validate(stepd_step_rec_t *job)
{
int cpuidx, cpu_num;
bitstr_t *cpus_to_set;
bitstr_t *cpu_map;
char *cpu_bind;
char *cpu_str;
char *savestr = NULL;
debug2("cpu_freq_cpuset_validate: request = %12d %8x",
job->cpu_freq, job->cpu_freq);
debug2(" jobid=%u, stepid=%u, tasks=%u cpu/task=%u, cpus=%u",
job->jobid, job->stepid, job->node_tasks,
job->cpus_per_task,job->cpus);
debug2(" cpu_bind_type=%4x, cpu_bind map=%s",
job->cpu_bind_type, job->cpu_bind);
if (!cpu_freq_count)
return;
if (job->cpu_bind == NULL) {
error("cpu_freq_cpuset_validate: cpu_bind string is null");
return;
}
cpu_bind = xstrdup(job->cpu_bind);
if ( (cpu_str = strtok_r(cpu_bind, ",", &savestr) ) == NULL) {
error("cpu_freq_cpuset_validate: cpu_bind string invalid");
xfree(cpu_bind);
return;
}
cpu_map = (bitstr_t *) bit_alloc(cpu_freq_count);
cpus_to_set = (bitstr_t *) bit_alloc(cpu_freq_count);
do {
debug3(" cpu_str = %s", cpu_str);
if ((job->cpu_bind_type & CPU_BIND_MAP) == CPU_BIND_MAP) {
cpu_num = atoi(cpu_str);
if (cpu_num >= cpu_freq_count) {
error("cpu_freq_cpuset_validate: invalid cpu "
"number %d", cpu_num);
bit_free(cpu_map);
bit_free(cpus_to_set);
xfree(cpu_bind);
return;
}
bit_set(cpu_map, (bitoff_t)cpu_num);
} else {
if (bit_unfmt_hexmask(cpu_map, cpu_str) == -1) {
error("cpu_freq_cpuset_validate: invalid cpu "
"mask %s", cpu_bind);
bit_free(cpu_map);
bit_free(cpus_to_set);
xfree(cpu_bind);
return;
}
}
bit_or(cpus_to_set, cpu_map);
} while ( (cpu_str = strtok_r(NULL, ",", &savestr) ) != NULL);
for (cpuidx=0; cpuidx < cpu_freq_count; cpuidx++) {
if (bit_test(cpus_to_set, cpuidx)) {
_cpu_freq_find_valid(job->cpu_freq, cpuidx);
}
}
cpu_freq_set(job);
bit_free(cpu_map);
bit_free(cpus_to_set);
xfree(cpu_bind);
return;
}
/*
* Validate the cpus and select the frequency to set
* Called from task cgroup cpuset code with string containing
* the list of cpus to be used by this step
*/
void
cpu_freq_cgroup_validate(stepd_step_rec_t *job, char *step_alloc_cores)
{
uint16_t start = USHRT_MAX;
uint16_t end = USHRT_MAX;
uint16_t cpuidx = 0;
char *core_range;
debug2("cpu_freq_cgroup_validate: request value = %12d %8x",
job->cpu_freq, job->cpu_freq);
debug2(" jobid=%u, stepid=%u, tasks=%u cpu/task=%u, cpus=%u",
job->jobid,job->stepid,job->node_tasks,
job->cpus_per_task,job->cpus);
debug2(" cpu_bind_type=%4x, cpu_bind map=%s",
job->cpu_bind_type, job->cpu_bind);
debug2(" step logical cores = %s, step physical cores = %s",
job->step_alloc_cores, step_alloc_cores);
if (!cpu_freq_count)
return;
/* set entries in cpu frequency table for this step's cpus */
core_range = step_alloc_cores;
while ( (cpuidx = _cpu_freq_next_cpu(&core_range, &cpuidx,
&start, &end)) != USHRT_MAX) {
if (cpuidx >= cpu_freq_count) {
error("cpu_freq_validate: index %u exceeds cpu count %u",
cpuidx, cpu_freq_count);
return;
}
_cpu_freq_find_valid(job->cpu_freq, cpuidx);
}
cpu_freq_set(job);
return;
}
/*
* get the next number in a range
* assumes range is well-formed, i.e., monotonically increasing,
* no leading/trailing punctuation, either comma separated or dash
* separated: e.g., "4-6,8,10,13-15"
*/
uint16_t
_cpu_freq_next_cpu(char **core_range, uint16_t *cpuidx,
uint16_t *start, uint16_t *end)
{
int i;
char *p;
p = *core_range;
if (*start == USHRT_MAX) {
if (*p == '\0')
return USHRT_MAX;
if (*p == ',')
p++;
i = 0;
while ( isdigit(*p) ) {
i = i*10 + (*p - '0');
p++;
}
*core_range = p;
*start = i;
return i;
}
if (*end == USHRT_MAX) {
switch (*p)
{
case '-' :
p++;
i = 0;
while ( isdigit(*p) ) {
i = i*10 + (*p - '0');
p++;
}
*core_range = p;
*end = i;
break;
case ',':
p++;
i = 0;
while ( isdigit(*p) ) {
i = i*10 + (*p - '0');
p++;
}
*start = i;
*end = USHRT_MAX;
*core_range = p;
return i;
case '\0' :
return USHRT_MAX;
}
}
i = *cpuidx;
if ( i < *end ) {
i++;
if ( i == *end) {
*start = USHRT_MAX;
*end = USHRT_MAX;
}
}
return i;
}
/*
* Compute the right frequency value to set, based on request
*
* input: job record containing cpu frequency parameter
* input: index to current cpu entry in cpu_freq_data table
*
* sets "frequency_to_set" table entry if valid value found
*/
void
_cpu_freq_find_valid(uint32_t cpu_freq, int cpuidx)
{
unsigned int j, freq_med = 0;
uint32_t freq_list[FREQ_LIST_MAX] = { 0 };
char path[SYSFS_PATH_MAX];
FILE *fp;
/* see if user requested "high" "medium" or "low" */
if (cpu_freq & CPU_FREQ_RANGE_FLAG) {
switch(cpu_freq)
{
case CPU_FREQ_LOW :
/* get the value from scale min freq */
snprintf(path, sizeof(path),
PATH_TO_CPU
"cpu%u/cpufreq/scaling_min_freq", cpuidx);
if ( ( fp = fopen(path, "r") ) == NULL ) {
error("cpu_freq_cgroup_valid: Could not open "
"scaling_min_freq");
return;
}
if (fscanf (fp, "%u",
&cpufreq[cpuidx].frequency_to_set) < 1) {
error("cpu_freq_cgroup_valid: Could not read "
"scaling_min_freq");
}
break;
case CPU_FREQ_MEDIUM :
case CPU_FREQ_HIGHM1 :
snprintf(path, sizeof(path),
PATH_TO_CPU
"cpu%u/cpufreq/scaling_available_frequencies",
cpuidx);
if ( ( fp = fopen(path, "r") ) == NULL ) {
error("cpu_freq_cgroup_valid: Could not open "
"scaling_available_frequencies");
return;
}
for (j = 0; j < FREQ_LIST_MAX; j++) {
if ( fscanf(fp, "%u", &freq_list[j]) == EOF)
break;
freq_med = (j + 1) / 2;
}
if (cpu_freq == CPU_FREQ_MEDIUM) {
cpufreq[cpuidx].frequency_to_set =
freq_list[freq_med];
} else {
cpufreq[cpuidx].frequency_to_set =
freq_list[j > 0 ? j-1 : 0];
}
break;
case CPU_FREQ_HIGH :
/* get the value from scale max freq */
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_max_freq",
cpuidx);
if ( ( fp = fopen(path, "r") ) == NULL ) {
error("cpu_freq_cgroup_valid: Could not open "
"scaling_max_freq");
return;
}
if (fscanf (fp, "%u",
&cpufreq[cpuidx].frequency_to_set) < 1) {
error("cpu_freq_cgroup_valid: Could not read "
"scaling_max_freq");
}
break;
default :
error("cpu_freq_cgroup_valid: "
"invalid cpu_freq value %u", cpu_freq);
return;
}
fclose(fp);
} else {
/* find legal value close to requested value */
snprintf(path, sizeof(path),
PATH_TO_CPU
"cpu%u/cpufreq/scaling_available_frequencies", cpuidx);
if ( ( fp = fopen(path, "r") ) == NULL )
return;
for (j = 0; j < FREQ_LIST_MAX; j++) {
if ( fscanf(fp, "%u", &freq_list[j]) == EOF)
break;
if (cpu_freq == freq_list[j]) {
cpufreq[cpuidx].frequency_to_set = freq_list[j];
break;
}
if (j > 0) {
if (freq_list[j] > freq_list[j-1] ) {
/* ascending order */
if ((cpu_freq > freq_list[j-1]) &&
(cpu_freq < freq_list[j])) {
cpufreq[cpuidx].frequency_to_set =
freq_list[j];
break;
}
} else {
/* descending order */
if ((cpu_freq > freq_list[j]) &&
(cpu_freq < freq_list[j-1])) {
cpufreq[cpuidx].frequency_to_set =
freq_list[j];
break;
}
}
}
}
fclose(fp);
}
debug3("cpu_freq_cgroup_validate: cpu %u, frequency to set: %u",
cpuidx, cpufreq[cpuidx].frequency_to_set);
return;
}
/*
* verify cpu_freq parameter
*
* in addition to a numeric frequency value, we allow the user
* to specify "low", "medium", or "high" frequency
*
* returns -1 on error, 0 otherwise
*/
int
cpu_freq_verify_param(const char *arg, uint32_t *cpu_freq)
{
char *end;
uint32_t frequency;
if (arg == NULL) {
return 0;
}
if ( (frequency = strtoul(arg, &end, 10) )) {
*cpu_freq = frequency;
return 0;
}
if (strncasecmp(arg, "lo", 2) == 0) {
*cpu_freq = CPU_FREQ_LOW;
return 0;
} else if (strncasecmp(arg, "him1", 4) == 0 ||
strncasecmp(arg, "highm1", 6) == 0) {
*cpu_freq = CPU_FREQ_HIGHM1;
return 0;
} else if (strncasecmp(arg, "hi", 2) == 0) {
*cpu_freq = CPU_FREQ_HIGH;
return 0;
} else if (strncasecmp(arg, "med", 3) == 0) {
*cpu_freq = CPU_FREQ_MEDIUM;
return 0;
}
error("unrecognized --cpu-freq argument \"%s\"", arg);
return -1;
}
/*
* set cpu frequency if possible for each cpu of the job step
*/
void
cpu_freq_set(stepd_step_rec_t *job)
{
char path[SYSFS_PATH_MAX];
FILE *fp;
char value[LINE_LEN];
unsigned int i,j;
if ((!cpu_freq_count) || (!cpufreq))
return;
j = 0;
for (i = 0; i < cpu_freq_count; i++) {
if (cpufreq[i].frequency_to_set == 0)
continue;
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_governor", i);
if ( ( fp = fopen(path, "w") ) == NULL )
continue;
fputs("userspace\n", fp);
fclose(fp);
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_setspeed", i);
snprintf(value, LINE_LEN, "%u", cpufreq[i].frequency_to_set);
if ( ( fp = fopen(path, "w") ) == NULL )
continue;
fputs(value, fp);
fclose(fp);
j++;
debug2("cpu_freq_set: cpu %u, frequency: %u",
i,cpufreq[i].frequency_to_set);
}
debug("cpu_freq_set: #cpus set = %u", j);
}
/*
* reset the cpus used by the process to their
* default frequency and governor type
*/
void
cpu_freq_reset(stepd_step_rec_t *job)
{
char path[SYSFS_PATH_MAX];
FILE *fp;
char value[LINE_LEN];
unsigned int i, j;
if ((!cpu_freq_count) || (!cpufreq))
return;
j = 0;
for (i = 0; i < cpu_freq_count; i++) {
if (cpufreq[i].frequency_to_set == 0)
continue;
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_setspeed", i);
snprintf(value, LINE_LEN, "%u", cpufreq[i].reset_frequency);
if ( ( fp = fopen(path, "w") ) == NULL )
continue;
fputs(value, fp);
fclose(fp);
snprintf(path, sizeof(path),
PATH_TO_CPU "cpu%u/cpufreq/scaling_governor", i);
if ( ( fp = fopen(path, "w") ) == NULL )
continue;
fputs(cpufreq[i].reset_governor, fp);
fputc('\n', fp);
fclose(fp);
j++;
debug3("cpu_freq_reset: "
"cpu %u, frequency reset: %u, governor reset: %s",
i,cpufreq[i].reset_frequency,cpufreq[i].reset_governor);
}
debug("cpu_freq_reset: #cpus reset = %u", j);
}