blob: 6f69a9b89506325b6cb676da28587f91c5039fc4 [file] [log] [blame]
/* Support for specifying IO affinity by various means.
Copyright 2010 Intel Corporation
Author: Andi Kleen
libnuma is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; version
2.1.
libnuma is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should find a copy of v2.1 of the GNU Lesser General Public License
somewhere on your Linux system; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
/* Notebook:
- Separate real errors from no NUMA with fallback
- Infiniband
- FCoE?
- Support for other special IO devices
- Specifying cpu subsets inside the IO node?
- Handle multiple IO nodes (needs kernel changes)
- Better support for multi-path IO?
*/
#define _GNU_SOURCE 1
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <netdb.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <dirent.h>
#include <linux/rtnetlink.h>
#include <linux/netlink.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <ctype.h>
#include <assert.h>
#include <regex.h>
#include <sys/sysmacros.h>
#include "numa.h"
#include "numaint.h"
#include "sysfs.h"
#include "affinity.h"
#include "rtnetlink.h"
/* Report whether S contains a character that must not appear in a
   sysfs device name ('/' or '.').
   Returns 1 if such a character is present, 0 otherwise. */
static int badchar(const char *s)
{
	/* strcspn() stops at the first forbidden character, or at the
	   terminating NUL when there is none. */
	return s[strcspn(s, "/.")] != '\0';
}
/* Emit the warning for a failed node mask lookup of device DEV.
   RET is the negative result of the lookup; -2 selects the
   "Kernel does not know" message, anything else the "Cannot read"
   message. CLS is the device class used in the message and may be
   NULL, in which case it is treated as an empty string.
   Always returns -1. */
static int node_parse_failure(int ret, char *cls, const char *dev)
{
	char *kind = cls ? cls : "";

	if (ret != -2) {
		numa_warn(W_node_parse2,
			  "Cannot read node mask for %s device `%s'",
			  kind, dev);
		return -1;
	}

	/* Only put a separating blank in front of a non-empty class. */
	numa_warn(W_node_parse1,
		  "Kernel does not know node mask for%s%s device `%s'",
		  kind[0] != '\0' ? " " : "", kind, dev);
	return -1;
}
/* Generic sysfs class lookup.
   Fill MASK with the node mask of device DEV in sysfs class CLS.
   Leading white space in DEV is skipped; a name containing '/' or
   '.' is rejected.
   Returns a non-negative value on success and -1 (after issuing a
   warning) on failure. */
static int
affinity_class(struct bitmask *mask, char *cls, const char *dev)
{
	int ret;
	/* <ctype.h> classifiers need an unsigned char value. */
	while (isspace((unsigned char)*dev))
		dev++;
	if (badchar(dev)) {
		numa_warn(W_badchar, "Illegal characters in `%s' specification",
			  dev);
		return -1;
	}
	/* Somewhat hackish: extract device from symlink path.
	   Better would be a direct backlink. This knows slightly too
	   much about the actual sysfs layout. */
	char path[1024];
	char *fn = NULL;
	ssize_t len = -1;
	if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0)
		/* Leave one byte free for the terminator added below. */
		len = readlink(fn, path, sizeof path - 1);
	else
		fn = NULL;	/* contents are undefined after a failed asprintf */
	free(fn);
	if (len > 0) {
		regex_t re;
		regmatch_t match[2];
		/* readlink() does not NUL-terminate its result. */
		path[len] = 0;
		if (regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/",
			    REG_EXTENDED) == 0) {
			ret = regexec(&re, path, 2, match, 0);
			regfree(&re);
			if (ret == 0) {
				char *p;
				assert(match[0].rm_so > 0);
				assert(match[0].rm_eo > 0);
				/* Cut the path right after the '/' that
				   follows the matched PCI device. */
				path[match[1].rm_eo + 1] = 0;
				p = path + match[0].rm_so;
				ret = sysfs_node_read(mask, "/sys/%s/numa_node", p);
				if (ret < 0)
					return node_parse_failure(ret, NULL, p);
				return ret;
			}
		}
	}
	/* No usable PCI path in the symlink: fall back to the class
	   device's own "device" link. */
	ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node",
			      cls, dev);
	if (ret < 0)
		return node_parse_failure(ret, cls, dev);
	return 0;
}
/* Turn file (or device node) into class name.
   FILE is stat()ed to obtain a device number: the device holding the
   file for ordinary files, or the device itself for block and
   character device nodes. The matching sysfs class directory is then
   searched for an entry whose "dev" attribute carries that number and
   the lookup is handed to affinity_class().
   The incoming CLS argument is ignored; the class is derived from the
   file type.
   Returns the result of affinity_class() for the matching entry, or
   -1 (after issuing a warning) on failure. */
static int affinity_file(struct bitmask *mask, char *cls, const char *file)
{
	struct stat st;
	DIR *dir;
	struct dirent *dep;
	dev_t d;
	/* Large enough for "/sys/class/<cls>/<d_name>/dev". */
	char fn[1024];

	if (stat(file, &st) < 0) {
		numa_warn(W_blockdev1, "Cannot stat file %s", file);
		return -1;
	}
	cls = "block";
	d = st.st_dev;
	if (S_ISCHR(st.st_mode)) {
		/* Better choice than misc? Most likely misc will not work
		   anyways unless the kernel is fixed. */
		cls = "misc";
		d = st.st_rdev;
	} else if (S_ISBLK(st.st_mode))
		d = st.st_rdev;

	snprintf(fn, sizeof fn, "/sys/class/%s", cls);
	dir = opendir(fn);
	if (!dir) {
		numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs",
			  cls);
		return -1;
	}
	while ((dep = readdir(dir)) != NULL) {
		const char *name = dep->d_name;
		unsigned maj = 0, min = 0;
		char *dev;
		int len, n, ret;

		if (*name == '.')
			continue;
		/* Read the "dev" attribute from the class that is being
		   enumerated, not unconditionally from "block". */
		len = snprintf(fn, sizeof fn, "/sys/class/%s/%s/dev",
			       cls, name);
		if (len < 0 || (size_t)len >= sizeof fn)
			break;
		n = -1;
		dev = sysfs_read(fn);
		if (dev) {
			n = sscanf(dev, "%u:%u", &maj, &min);
			free(dev);
		}
		if (n != 2) {
			numa_warn(W_blockdev3, "Cannot parse sysfs device %s",
				  name);
			continue;
		}
		if (major(d) != maj || minor(d) != min)
			continue;
		ret = affinity_class(mask, cls, name);
		closedir(dir);
		return ret;
	}
	closedir(dir);
	/* Report the device number that was searched for, not whatever
	   the last sysfs entry happened to contain. */
	numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'",
		  (unsigned)major(d), (unsigned)minor(d), file);
	return -1;
}
/* Look up interface of route using rtnetlink.
   Sends an RTM_GETROUTE request for destination DST and scans the
   attributes of the reply for RTA_OIF. On success the interface
   index is stored in *IIFP and 0 is returned; otherwise a warning is
   issued and -1 is returned. */
static int find_route(struct sockaddr *dst, int *iifp)
{
	const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg));
	struct sockaddr_nl adr = {
		.nl_family = AF_NETLINK,
	};
	struct {
		struct nlmsghdr msg;
		struct rtmsg rt;
		char buf[256];
	} req = {
		.msg = {
			.nlmsg_len = hdrlen,
			.nlmsg_type = RTM_GETROUTE,
			.nlmsg_flags = NLM_F_REQUEST,
		},
		.rt = {
			.rtm_family = dst->sa_family,
		},
	};
	struct rtattr *attr;

	if (rta_put_address(&req.msg, RTA_DST, dst) < 0) {
		numa_warn(W_netlink1, "Cannot handle network family %x",
			  dst->sa_family);
		return -1;
	}

	if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) {
		numa_warn(W_netlink2, "Cannot request rtnetlink route: %s",
			  strerror(errno));
		return -1;
	}

	/* Fish the interface out of the netlink soup. */
	for (attr = rta_get(&req.msg, NULL, hdrlen);
	     attr != NULL;
	     attr = rta_get(&req.msg, attr, hdrlen)) {
		if (attr->rta_type != RTA_OIF)
			continue;
		memcpy(iifp, RTA_DATA(attr), sizeof(int));
		return 0;
	}

	numa_warn(W_netlink3, "rtnetlink query did not return interface");
	return -1;
}
/* Translate interface index IIF into an interface name.
   The index is stored in IFR and a SIOCGIFNAME ioctl is issued on a
   temporary datagram socket.
   Returns the ioctl result, or -1 if the socket cannot be created. */
static int iif_to_name(int iif, struct ifreq *ifr)
{
	int fd = socket(PF_INET, SOCK_DGRAM, 0);

	if (fd >= 0) {
		int rc;

		ifr->ifr_ifindex = iif;
		rc = ioctl(fd, SIOCGIFNAME, ifr);
		close(fd);
		return rc;
	}
	return -1;
}
/* Resolve an IP address to the nodes of a network device.
   Only simple setups are attempted: no multi-path, no bonding etc.
   In such configurations either the first interface or none at all
   is chosen.
   ID is resolved with getaddrinfo(), the outgoing interface of the
   route to the first result is looked up, and that interface is
   passed to affinity_class() as a "net" class device. CLS is unused.
   Returns the result of affinity_class(), or -1 on failure. */
static int affinity_ip(struct bitmask *mask, char *cls, const char *id)
{
	struct addrinfo *res;
	struct ifreq ifr;
	int ifindex;
	int err;

	err = getaddrinfo(id, NULL, NULL, &res);
	if (err != 0) {
		numa_warn(W_net1, "Cannot resolve %s: %s",
			  id, gai_strerror(err));
		return -1;
	}

	/* find_route() issues its own warnings. */
	if (find_route(res->ai_addr, &ifindex) < 0) {
		freeaddrinfo(res);
		return -1;
	}

	if (iif_to_name(ifindex, &ifr) < 0) {
		numa_warn(W_net2, "Cannot resolve network interface %d", ifindex);
		freeaddrinfo(res);
		return -1;
	}

	freeaddrinfo(res);
	return affinity_class(mask, "net", ifr.ifr_name);
}
/* Look up affinity for a PCI device.
   ID is parsed as hexadecimal "seg:bus:dev.func"; the function and
   the segment are both optional and default to 0. CLS is only used
   for the warning on a failed lookup.
   Returns 0 on success, -1 (after issuing a warning) on failure. */
static int affinity_pci(struct bitmask *mask, char *cls, const char *id)
{
	unsigned seg, bus, dev, func;
	int ret;
	int n = sscanf(id, "%x:%x:%x.%x", &seg, &bus, &dev, &func);

	if (n == 4 || n == 3) {
		/* Func is optional. */
		if (n == 3)
			func = 0;
	} else {
		/* Segment is optional too */
		n = sscanf(id, "%x:%x.%x", &bus, &dev, &func);
		if (n != 3 && n != 2) {
			numa_warn(W_pci1, "Cannot parse PCI device `%s'", id);
			return -1;
		}
		seg = 0;
		if (n == 2)
			func = 0;
	}

	ret = sysfs_node_read(mask,
			      "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node",
			      seg, bus, seg, bus, dev, func);
	return ret < 0 ? node_parse_failure(ret, cls, id) : 0;
}
/* Describes one kind of affinity specification:
   first   - first character of NAME, compared before the full prefix
   name    - prefix that introduces this kind of specification
   cls     - class string passed through to the handler (may be NULL)
   handler - function that resolves the text following the prefix */
struct handler {
	char first;
	char *name;
	char *cls;
	int (*handler)(struct bitmask *mask, char *cls, const char *desc);
};

/* Table of known prefixes; an entry with a zero `first' ends it. */
static struct handler handlers[] = {
	{ .first = 'n', .name = "netdev:", .cls = "net",   .handler = affinity_class },
	{ .first = 'i', .name = "ip:",     .cls = NULL,    .handler = affinity_ip },
	{ .first = 'f', .name = "file:",   .cls = NULL,    .handler = affinity_file },
	{ .first = 'b', .name = "block:",  .cls = "block", .handler = affinity_class },
	{ .first = 'p', .name = "pci:",    .cls = NULL,    .handler = affinity_pci },
	{ 0 }
};
/* Resolve the affinity specification ID into MASK.
   ID is matched against the prefixes in the handlers table; the text
   after the first matching prefix is passed to that entry's handler
   and the handler's result is returned. If no prefix matches,
   NO_IO_AFFINITY is returned. */
hidden int resolve_affinity(const char *id, struct bitmask *mask)
{
	struct handler *h = &handlers[0];

	while (h->first) {
		/* Cheap first-character test before the full prefix compare. */
		if (id[0] == h->first) {
			int len = strlen(h->name);

			if (strncmp(id, h->name, len) == 0) {
				const char *desc = id + len;
				int ret = h->handler(mask, h->cls, desc);

				if (ret == -2) {
					numa_warn(W_nonode, "Kernel does not know node for %s\n",
						  desc);
				}
				return ret;
			}
		}
		h++;
	}
	return NO_IO_AFFINITY;
}