| /* Support for specifying IO affinity by various means. |
| Copyright 2010 Intel Corporation |
| Author: Andi Kleen |
| |
| libnuma is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; version |
| 2.1. |
| |
| libnuma is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should find a copy of v2.1 of the GNU Lesser General Public License |
| somewhere on your Linux system; if not, write to the Free Software |
| Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
| |
| /* Notebook: |
| - Separate real errors from no NUMA with fallback |
| - Infiniband |
| - FCoE? |
| - Support for other special IO devices |
| - Specifying cpu subsets inside the IO node? |
| - Handle multiple IO nodes (needs kernel changes) |
| - Better support for multi-path IO? |
| */ |
| #define _GNU_SOURCE 1 |
| #include <string.h> |
| #include <errno.h> |
| #include <sys/stat.h> |
| #include <netdb.h> |
| #include <unistd.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/socket.h> |
| #include <sys/ioctl.h> |
| #include <net/if.h> |
| #include <dirent.h> |
| #include <linux/rtnetlink.h> |
| #include <linux/netlink.h> |
| #include <sys/types.h> |
| #include <sys/sysmacros.h> |
| #include <ctype.h> |
| #include <assert.h> |
| #include <regex.h> |
| #include <sys/sysmacros.h> |
| #include "numa.h" |
| #include "numaint.h" |
| #include "sysfs.h" |
| #include "affinity.h" |
| #include "rtnetlink.h" |
| |
/* Tell whether a device name contains characters that must not appear
   in it.  Returns 1 if s holds a '/' or a '.', 0 otherwise. */
static int badchar(const char *s)
{
	return strpbrk(s, "/.") != NULL ? 1 : 0;
}
| |
| static int node_parse_failure(int ret, char *cls, const char *dev) |
| { |
| if (!cls) |
| cls = ""; |
| if (ret == -2) |
| numa_warn(W_node_parse1, |
| "Kernel does not know node mask for%s%s device `%s'", |
| *cls ? " " : "", cls, dev); |
| else |
| numa_warn(W_node_parse2, |
| "Cannot read node mask for %s device `%s'", |
| cls, dev); |
| return -1; |
| } |
| |
| /* Generic sysfs class lookup */ |
| static int |
| affinity_class(struct bitmask *mask, char *cls, const char *dev) |
| { |
| int ret; |
| while (isspace(*dev)) |
| dev++; |
| if (badchar(dev)) { |
| numa_warn(W_badchar, "Illegal characters in `%s' specification", |
| dev); |
| return -1; |
| } |
| |
| /* Somewhat hackish: extract device from symlink path. |
| Better would be a direct backlink. This knows slightly too |
| much about the actual sysfs layout. */ |
| char path[1024]; |
| char *fn = NULL; |
| if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0 && |
| readlink(fn, path, sizeof path) > 0) { |
| regex_t re; |
| regmatch_t match[2]; |
| char *p; |
| |
| regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/", |
| REG_EXTENDED); |
| ret = regexec(&re, path, 2, match, 0); |
| regfree(&re); |
| if (ret == 0) { |
| free(fn); |
| assert(match[0].rm_so > 0); |
| assert(match[0].rm_eo > 0); |
| path[match[1].rm_eo + 1] = 0; |
| p = path + match[0].rm_so; |
| ret = sysfs_node_read(mask, "/sys/%s/numa_node", p); |
| if (ret < 0) |
| return node_parse_failure(ret, NULL, p); |
| return ret; |
| } |
| } |
| free(fn); |
| |
| ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node", |
| cls, dev); |
| if (ret < 0) |
| return node_parse_failure(ret, cls, dev); |
| return 0; |
| } |
| |
| /* Turn file (or device node) into class name */ |
| static int affinity_file(struct bitmask *mask, char *cls, const char *file) |
| { |
| struct stat st; |
| DIR *dir; |
| int n; |
| unsigned maj = 0, min = 0; |
| dev_t d; |
| struct dirent *dep; |
| |
| cls = "block"; |
| char fn[sizeof("/sys/class/") + strlen(cls)]; |
| if (stat(file, &st) < 0) { |
| numa_warn(W_blockdev1, "Cannot stat file %s", file); |
| return -1; |
| } |
| d = st.st_dev; |
| if (S_ISCHR(st.st_mode)) { |
| /* Better choice than misc? Most likely misc will not work |
| anyways unless the kernel is fixed. */ |
| cls = "misc"; |
| d = st.st_rdev; |
| } else if (S_ISBLK(st.st_mode)) |
| d = st.st_rdev; |
| |
| sprintf(fn, "/sys/class/%s", cls); |
| dir = opendir(fn); |
| if (!dir) { |
| numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs", |
| cls); |
| return -1; |
| } |
| while ((dep = readdir(dir)) != NULL) { |
| char *name = dep->d_name; |
| int ret; |
| |
| if (*name == '.') |
| continue; |
| char *dev; |
| char fn2[sizeof("/sys/class/block//dev") + strlen(name)]; |
| |
| n = -1; |
| if (sprintf(fn2, "/sys/class/block/%s/dev", name) < 0) |
| break; |
| dev = sysfs_read(fn2); |
| if (dev) { |
| n = sscanf(dev, "%u:%u", &maj, &min); |
| free(dev); |
| } |
| if (n != 2) { |
| numa_warn(W_blockdev3, "Cannot parse sysfs device %s", |
| name); |
| continue; |
| } |
| |
| if (major(d) != maj || minor(d) != min) |
| continue; |
| |
| ret = affinity_class(mask, "block", name); |
| closedir(dir); |
| return ret; |
| } |
| closedir(dir); |
| numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'", |
| maj, min, file); |
| return -1; |
| } |
| |
| /* Look up interface of route using rtnetlink. */ |
| static int find_route(struct sockaddr *dst, int *iifp) |
| { |
| struct rtattr *rta; |
| const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg)); |
| struct { |
| struct nlmsghdr msg; |
| struct rtmsg rt; |
| char buf[256]; |
| } req = { |
| .msg = { |
| .nlmsg_len = hdrlen, |
| .nlmsg_type = RTM_GETROUTE, |
| .nlmsg_flags = NLM_F_REQUEST, |
| }, |
| .rt = { |
| .rtm_family = dst->sa_family, |
| }, |
| }; |
| struct sockaddr_nl adr = { |
| .nl_family = AF_NETLINK, |
| }; |
| |
| if (rta_put_address(&req.msg, RTA_DST, dst) < 0) { |
| numa_warn(W_netlink1, "Cannot handle network family %x", |
| dst->sa_family); |
| return -1; |
| } |
| |
| if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) { |
| numa_warn(W_netlink2, "Cannot request rtnetlink route: %s", |
| strerror(errno)); |
| return -1; |
| } |
| |
| /* Fish the interface out of the netlink soup. */ |
| rta = NULL; |
| while ((rta = rta_get(&req.msg, rta, hdrlen)) != NULL) { |
| if (rta->rta_type == RTA_OIF) { |
| memcpy(iifp, RTA_DATA(rta), sizeof(int)); |
| return 0; |
| } |
| } |
| |
| numa_warn(W_netlink3, "rtnetlink query did not return interface"); |
| return -1; |
| } |
| |
/* Ask the kernel for the name of interface index iif.

   Stores iif in ifr->ifr_ifindex and issues SIOCGIFNAME on a temporary
   datagram socket.  Returns -1 if the socket cannot be created,
   otherwise the return value of the ioctl. */
static int iif_to_name(int iif, struct ifreq *ifr)
{
	int rc;
	int fd;

	fd = socket(PF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	ifr->ifr_ifindex = iif;
	rc = ioctl(fd, SIOCGIFNAME, ifr);

	/* The socket is only needed for the one ioctl. */
	close(fd);
	return rc;
}
| |
/* Resolve an IP address to the nodes of a network device.
   This generally only attempts to handle simple cases:
   no multi-path, no bonding etc. In these cases only
   the first interface or none is chosen.

   id is resolved with getaddrinfo(); the first returned address is
   routed with find_route(), the resulting interface index is turned
   into a name and that name is looked up in the "net" class.  The cls
   argument is not used.  Returns the result of affinity_class(), or -1
   if any of the earlier steps fails. */
static int affinity_ip(struct bitmask *mask, char *cls, const char *id)
{
	struct addrinfo *res;
	struct ifreq ifr;
	int ifindex;
	int have_name = 0;
	int err;

	err = getaddrinfo(id, NULL, NULL, &res);
	if (err != 0) {
		numa_warn(W_net1, "Cannot resolve %s: %s",
			  id, gai_strerror(err));
		return -1;
	}

	if (find_route(res->ai_addr, &ifindex) >= 0) {
		if (iif_to_name(ifindex, &ifr) >= 0)
			have_name = 1;
		else
			numa_warn(W_net2, "Cannot resolve network interface %d",
				  ifindex);
	}

	/* The address list is not needed past this point on any path. */
	freeaddrinfo(res);

	if (!have_name)
		return -1;
	return affinity_class(mask, "net", ifr.ifr_name);
}
| |
| /* Look up affinity for a PCI device */ |
| static int affinity_pci(struct bitmask *mask, char *cls, const char *id) |
| { |
| unsigned seg, bus, dev, func; |
| int n, ret; |
| |
| /* Func is optional. */ |
| if ((n = sscanf(id, "%x:%x:%x.%x",&seg,&bus,&dev,&func)) == 4 || n == 3) { |
| if (n == 3) |
| func = 0; |
| } |
| /* Segment is optional too */ |
| else if ((n = sscanf(id, "%x:%x.%x",&bus,&dev,&func)) == 3 || n == 2) { |
| seg = 0; |
| if (n == 2) |
| func = 0; |
| } else { |
| numa_warn(W_pci1, "Cannot parse PCI device `%s'", id); |
| return -1; |
| } |
| ret = sysfs_node_read(mask, |
| "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node", |
| seg, bus, seg, bus, dev, func); |
| if (ret < 0) |
| return node_parse_failure(ret, cls, id); |
| return 0; |
| } |
| |
| static struct handler { |
| char first; |
| char *name; |
| char *cls; |
| int (*handler)(struct bitmask *mask, char *cls, const char *desc); |
| } handlers[] = { |
| { 'n', "netdev:", "net", affinity_class }, |
| { 'i', "ip:", NULL, affinity_ip }, |
| { 'f', "file:", NULL, affinity_file }, |
| { 'b', "block:", "block", affinity_class }, |
| { 'p', "pci:", NULL, affinity_pci }, |
| {} |
| }; |
| |
| hidden int resolve_affinity(const char *id, struct bitmask *mask) |
| { |
| struct handler *h; |
| |
| for (h = &handlers[0]; h->first; h++) { |
| int len; |
| if (id[0] != h->first) |
| continue; |
| len = strlen(h->name); |
| if (!strncmp(id, h->name, len)) { |
| int ret = h->handler(mask, h->cls, id + len); |
| if (ret == -2) { |
| numa_warn(W_nonode, "Kernel does not know node for %s\n", |
| id + len); |
| } |
| return ret; |
| } |
| } |
| return NO_IO_AFFINITY; |
| } |