| /*****************************************************************************\ |
| * set_oomadj.c - prevent slurmd/slurmstepd from being killed by the |
| * kernel OOM killer |
| ***************************************************************************** |
| * Written by Hongjia Cao, National University of Defense Technology, China. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <unistd.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include "src/common/log.h" |
| #include "src/common/env.h" |
| |
| #if !defined(__FreeBSD__) |
/*
 * set_oom_adj - set the kernel OOM killer adjustment of the calling process
 * IN adj - requested adjustment, expressed on the oom_score_adj scale
 *	[-1000,1000]
 * RET 0 on success, -1 if the proc file could not be opened or written
 *
 * The value is written to /proc/self/oom_score_adj. When that file does not
 * exist, /proc/self/oom_adj is used instead and the value is rescaled to
 * [-17,15].
 */
extern int set_oom_adj(int adj)
{
	int fd;
	int len;
	ssize_t rc;
	char oom_adj[16];
	char *oom_adj_file = "/proc/self/oom_score_adj";

	fd = open(oom_adj_file, O_WRONLY);
	if ((fd < 0) && (errno == ENOENT)) {
		debug("%s not found. Falling back to oom_adj",
		      oom_adj_file);
		oom_adj_file = "/proc/self/oom_adj";
		fd = open(oom_adj_file, O_WRONLY);
		if ((fd < 0) && (errno == ENOENT)) {
			error("%s not found", oom_adj_file);
			return -1;
		}
		/* Convert range from [-1000,1000] to [-17,15]
		 * for use with older Linux kernel before 2.6.36 */
		if (adj < 0)
			adj = (adj * 17) / 1000;
		else if (adj > 0)
			adj = (adj * 15) / 1000;
	}
	if (fd < 0) {
		/* errno still holds the result of the last open() attempt */
		error("failed to open %s: %m", oom_adj_file);
		return -1;
	}

	len = snprintf(oom_adj, sizeof(oom_adj), "%d", adj);
	if ((len < 0) || ((size_t) len >= sizeof(oom_adj))) {
		close(fd);
		return -1;
	}

	/* Retry only when interrupted by a signal */
	do {
		rc = write(fd, oom_adj, (size_t) len);
	} while ((rc < 0) && (errno == EINTR));

	if (rc < 0) {
		/*
		 * Report the failure instead of claiming success; log before
		 * close() so that %m still reflects the write() error.
		 */
		error("failed to write %s to %s: %m", oom_adj, oom_adj_file);
		close(fd);
		return -1;
	}
	close(fd);

	return 0;
}
| |
/*
 * set_oom_adj_env - publish the slurmstepd OOM adjustment in the environment
 * IN adj - value stored in SLURMSTEPD_OOM_ADJ (via setenvfs()) when that
 *	variable is not already present
 */
extern void set_oom_adj_env(int adj)
{
	/*
	 * An existing setting (e.g. from /etc/sysconfig/slurm) takes
	 * precedence and is left untouched.
	 */
	if (getenv("SLURMSTEPD_OOM_ADJ") != NULL)
		return;

	/*
	 * slurmstepd needs a lower OOM score, otherwise the OOM Killer might
	 * pick it when the application uses more memory than permitted. It
	 * should stay killable, but be the last process the kernel chooses.
	 */
	setenvfs("SLURMSTEPD_OOM_ADJ=%d", adj);
}
| |
| #else /* __FreeBSD__ */ |
| |
/*
 * set_oom_adj - FreeBSD stub
 * IN adj - ignored
 * RET always 0
 */
extern int set_oom_adj(int adj)
{
	const int rc = 0;

	/* FreeBSD does not handle OOM the same way Linux does, so the
	 * requested adjustment is simply discarded */
	(void) adj;

	return rc;
}
| |
/*
 * set_oom_adj_env - FreeBSD stub
 * IN adj - ignored
 *
 * Does nothing: no environment variable is set on this platform.
 */
extern void set_oom_adj_env(int adj)
{
	(void) adj;	/* unused argument */
}
| #endif |