| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
| */ |
| |
| #include <ctype.h> |
| #include <dirent.h> |
| #include <fcntl.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/efi_partition.h> |
| |
| #ifdef HAVE_LIBUDEV |
| #include <libudev.h> |
| #endif |
| |
| #include <libzutil.h> |
| |
| /* |
| * Append partition suffix to an otherwise fully qualified device path. |
| * This is used to generate the name the full path as its stored in |
| * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length |
| * of 'path' will be returned on error a negative value is returned. |
| */ |
| int |
| zfs_append_partition(char *path, size_t max_len) |
| { |
| int len = strlen(path); |
| |
| if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || |
| (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { |
| if (len + 6 >= max_len) |
| return (-1); |
| |
| (void) strcat(path, "-part1"); |
| len += 6; |
| } else { |
| if (len + 2 >= max_len) |
| return (-1); |
| |
| if (isdigit(path[len-1])) { |
| (void) strcat(path, "p1"); |
| len += 2; |
| } else { |
| (void) strcat(path, "1"); |
| len += 1; |
| } |
| } |
| |
| return (len); |
| } |
| |
/*
 * Return a newly allocated copy of 'path' with any partition suffix removed.
 *
 * Three suffix forms are recognized, where X is a non-empty string of digits
 * that runs to the end of the name:
 *
 *   "-partX"  anywhere except at the very start of the name
 *   "pX"      when the 'p' is the last 'p' in the name, sits at index 2 or
 *             later, and is directly preceded by a digit (e.g. "md0p1")
 *   "X"       when the name is "hd", "sd", "vd" or "xvd" followed by
 *             letters (e.g. "sda1", "xvdb2")
 *
 * The forms are tried in that order and only the first one whose prefix
 * matches is considered; if its tail is not all digits the name is returned
 * unchanged.
 *
 * Returns NULL if 'path' is NULL or the copy cannot be allocated.  The
 * caller must free the returned string.
 */
char *
zfs_strip_partition(char *path)
{
	char *tmp;
	char *part = NULL, *d = NULL;

	if (path == NULL)
		return (NULL);

	tmp = strdup(path);
	if (!tmp)
		return (NULL);

	/*
	 * In every branch 'part' ends up at the first character to cut and
	 * 'd' at the first character that must be a digit.  The ctype calls
	 * take an unsigned char because a negative char is undefined there.
	 */
	if ((part = strstr(tmp, "-part")) && part != tmp) {
		d = part + 5;
	} else if ((part = strrchr(tmp, 'p')) &&
	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
		d = part + 1;
	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
	    tmp[1] == 'd') {
		/* Skip the drive letters; the cut point follows them. */
		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
	} else if (strncmp("xvd", tmp, 3) == 0) {
		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
	}

	/* Only truncate when at least one digit follows and nothing else. */
	if (part && d && *d != '\0') {
		for (; isdigit((unsigned char)*d); d++) { }
		if (*d == '\0')
			*part = '\0';
	}

	return (tmp);
}
| |
/*
 * Same as zfs_strip_partition(), but the name may carry a leading directory
 * such as "/dev/".  Only the component after the last '/' is stripped; a
 * path without any '/' is treated as a bare device name.
 *
 * path:    /dev/sda1
 * returns: /dev/sda
 *
 * Returns NULL on allocation failure.  The returned string must be freed.
 */
static char *
zfs_strip_partition_path(char *path)
{
	char *newpath = strdup(path);
	char *slash;
	char *sd_offset;
	char *new_sd;

	if (!newpath)
		return (NULL);

	/*
	 * Point to the "sda1" part of "/dev/sda1".  Without a '/' the whole
	 * string is the device name; adding 1 to a NULL strrchr() result
	 * would yield an invalid pointer.
	 */
	slash = strrchr(newpath, '/');
	sd_offset = (slash != NULL) ? slash + 1 : newpath;

	/* Get our new name "sda" */
	new_sd = zfs_strip_partition(sd_offset);
	if (!new_sd) {
		free(newpath);
		return (NULL);
	}

	/*
	 * Paste the "sda" where "sda1" was.  The copy is bounded by the
	 * space the original name occupied.
	 */
	strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);

	/* Free temporary "sda" */
	free(new_sd);

	return (newpath);
}
| |
/*
 * Strip the directory portion of a device path.
 *
 * Returns a pointer into 'path' just past its last '/'.  When 'path'
 * contains no '/', 'path' itself is returned.  Nothing is allocated, so the
 * result must not be freed and is only valid as long as 'path' is.
 */
char *
zfs_strip_path(char *path)
{
	char *slash = strrchr(path, '/');

	/* strrchr() yields NULL for a bare name; never offset from NULL. */
	return ((slash != NULL) ? slash + 1 : path);
}
| |
/*
 * Read the first line of a sysfs file into an allocated buffer, dropping
 * the trailing newline if there is one.  At most 4095 bytes are read.
 *
 * This is useful for reading sysfs files that return a single string.
 * Returns an allocated string on success, or NULL if the file cannot be
 * opened, nothing can be read from it, or the copy cannot be allocated.
 * The returned buffer must be freed by the caller.
 */
static char *
zfs_read_sysfs_file(char *filepath)
{
	char buf[4096];	/* all sysfs files report 4k size */
	char *str = NULL;
	FILE *fp;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		return (NULL);

	if (fgets(buf, sizeof (buf), fp) != NULL) {
		size_t len = strlen(buf);

		/*
		 * Remove the last newline (if any).  'len' is zero when the
		 * data starts with a NUL byte, so check it before looking
		 * at buf[len - 1].
		 */
		if (len > 0 && buf[len - 1] == '\n')
			buf[len - 1] = '\0';

		str = strdup(buf);
	}

	(void) fclose(fp);

	return (str);
}
| |
/*
 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
 * the drive (in /sys/bus/pci/slots).
 *
 * For example:
 *	dev_name: "nvme0n1"
 *	returns:  "/sys/bus/pci/slots/0"
 *
 * Any leading directory (like "/dev/") is ignored.  The remaining name must
 * start with "nvme", otherwise NULL is returned.
 *
 * The slot is found by comparing /sys/block/<dev>/device/address against
 * the 'address' file of every numerically named slot directory.
 *
 * Returned string must be freed.  Returns NULL on error or no sysfs path.
 */
static char *
zfs_get_pci_slots_sys_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *address1 = NULL;
	char *address2 = NULL;
	char *path = NULL;
	char buf[MAXPATHLEN];
	char *tmp;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	tmp = strrchr(dev_name, '/');
	if (tmp != NULL)
		dev_name = tmp + 1;    /* +1 since we want the chr after '/' */

	if (strncmp("nvme", dev_name, 4) != 0)
		return (NULL);

	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
	    dev_name);

	address1 = zfs_read_sysfs_file(buf);
	if (!address1)
		return (NULL);

	/*
	 * /sys/block/nvme0n1/device/address format will
	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
	 * "0000:01:00".  Just NUL terminate at the '.' so they match.
	 */
	tmp = strrchr(address1, '.');
	if (tmp != NULL)
		*tmp = '\0';

	dp = opendir("/sys/bus/pci/slots/");
	if (dp == NULL) {
		free(address1);
		return (NULL);
	}

	/*
	 * Look through all the /sys/bus/pci/slots/ subdirs
	 */
	while ((ep = readdir(dp))) {
		/*
		 * We only care about directory names that are a single number.
		 * Sometimes there's other directories like
		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
		 */
		if (!zfs_isnumber(ep->d_name))
			continue;

		(void) snprintf(buf, sizeof (buf),
		    "/sys/bus/pci/slots/%s/address", ep->d_name);

		address2 = zfs_read_sysfs_file(buf);
		if (!address2)
			continue;

		if (strcmp(address1, address2) == 0) {
			/* Addresses match, we're all done */
			free(address2);

			/*
			 * If asprintf() fails 'path' is undefined, so reset
			 * it and stop rather than scanning on and possibly
			 * returning an indeterminate pointer.
			 */
			if (asprintf(&path, "/sys/bus/pci/slots/%s",
			    ep->d_name) == -1)
				path = NULL;
			break;
		}
		free(address2);
	}

	(void) closedir(dp);
	free(address1);

	return (path);
}
| |
/*
 * Given a dev name like "sda", return the full enclosure sysfs path to
 * the disk.  You can also pass in the name with "/dev" prepended
 * to it (like /dev/sda).  This works for both JBODs and NVMe PCI devices.
 *
 * For example, disk "sda" in enclosure slot 1:
 *	dev_name: "sda"
 *	returns:  "/sys/class/enclosure/1:0:3:0/Slot 1"
 *
 * Or:
 *
 *	dev_name: "nvme0n1"
 *	returns:  "/sys/bus/pci/slots/0"
 *
 * 'dev_name' must be a non-devicemapper device.
 *
 * The enclosure is located through the "enclosure_device" symlink(s) in
 * /sys/block/<dev>/device.  If several entries match, the last one that
 * resolves wins.  When no enclosure path is found the PCI slot lookup is
 * tried instead.
 *
 * Returned string must be freed.  Returns NULL on error.
 */
char *
zfs_get_enclosure_sysfs_path(const char *dev_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char buf[MAXPATHLEN];
	const char *slash;
	char *dev_dir = NULL;
	char *link_path = NULL;
	char *encl;
	char *path = NULL;
	ssize_t size;

	if (dev_name == NULL)
		return (NULL);

	/* If they preface 'dev' with a path (like "/dev") then strip it off */
	slash = strrchr(dev_name, '/');
	if (slash != NULL)
		dev_name = slash + 1;  /* +1 since we want the chr after '/' */

	if (asprintf(&dev_dir, "/sys/block/%s/device", dev_name) == -1 ||
	    dev_dir == NULL) {
		/* If asprintf() fails, 'dev_dir' is undefined */
		dev_dir = NULL;
		goto end;
	}

	dp = opendir(dev_dir);
	if (dp == NULL)
		goto end;

	/*
	 * Look though all sysfs entries in /sys/block/<dev>/device for
	 * the enclosure symlink.
	 */
	while ((ep = readdir(dp))) {
		/* Ignore everything that's not our enclosure_device link */
		if (strstr(ep->d_name, "enclosure_device") == NULL)
			continue;

		/* Release the name built for a previous matching entry. */
		free(link_path);
		if (asprintf(&link_path, "%s/%s", dev_dir, ep->d_name) == -1) {
			link_path = NULL;
			break;
		}

		size = readlink(link_path, buf, sizeof (buf));

		/* Did readlink fail or crop the link name? */
		if (size == -1 || (size_t)size >= sizeof (buf))
			break;

		/*
		 * We got a valid link.  readlink() doesn't terminate strings
		 * so we have to do it.
		 */
		buf[size] = '\0';

		/*
		 * Our link will look like:
		 *
		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
		 *
		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
		 */
		encl = strstr(buf, "enclosure");
		if (encl == NULL)
			break;

		/* Drop any result from an earlier entry before replacing it */
		free(path);
		if (asprintf(&path, "/sys/class/%s", encl) == -1) {
			/* If asprintf() fails, 'path' is undefined */
			path = NULL;
			break;
		}
	}

end:
	free(link_path);
	free(dev_dir);

	if (dp != NULL)
		(void) closedir(dp);

	if (!path) {
		/*
		 * This particular disk isn't in a JBOD.  It could be an NVMe
		 * drive.  If so, look up the NVMe device's path in
		 * /sys/bus/pci/slots/.  Within that directory is a 'attention'
		 * file which controls the NVMe fault LED.
		 */
		path = zfs_get_pci_slots_sys_path(dev_name);
	}

	return (path);
}
| |
/*
 * Allocate and return the underlying device name for a device mapper device.
 *
 * For example, dm_name = "/dev/dm-0" could return "/dev/sda".  Symlinks to a
 * DM device (like /dev/disk/by-vdev/A0) are also allowed.
 *
 * If the DM device has multiple underlying devices (like with multipath
 * DM devices), then favor underlying devices that have a symlink back to
 * their enclosure device in sysfs.  This will be useful for the zedlet
 * scripts that toggle the fault LED.
 *
 * The underlying devices are taken from /sys/block/<dm-N>/slaves/, so a
 * name that does not resolve to a device with such a directory yields NULL.
 *
 * Returns an underlying device name, or NULL on error or no match.  If
 * dm_name is not a DM device then return NULL.
 *
 * NOTE: The returned name string must be *freed*.
 */
static char *
dm_get_underlying_path(const char *dm_name)
{
	DIR *dp = NULL;
	struct dirent *ep;
	char *realp;
	char *slash;
	const char *dev_str;
	char *slaves_dir = NULL;
	char *path = NULL;
	char *first_path = NULL;
	char *enclosure_path;

	if (dm_name == NULL)
		return (NULL);

	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
	realp = realpath(dm_name, NULL);
	if (realp == NULL)
		return (NULL);

	/*
	 * If they preface 'dev' with a path (like "/dev") then strip it off.
	 * We just want the 'dm-N' part.  Without a '/' the resolved name is
	 * already bare, so use it as is rather than a NULL pointer.
	 */
	slash = strrchr(realp, '/');
	dev_str = (slash != NULL) ? slash + 1 : realp;

	if (asprintf(&slaves_dir, "/sys/block/%s/slaves/", dev_str) == -1) {
		/* If asprintf() fails, 'slaves_dir' is undefined */
		slaves_dir = NULL;
		goto end;
	}

	dp = opendir(slaves_dir);
	if (dp == NULL)
		goto end;

	/*
	 * A device-mapper device can have multiple paths to it (multipath).
	 * Favor paths that have a symlink back to their enclosure device.
	 * We have to do this since some enclosures may only provide a symlink
	 * back for one underlying path to a disk and not the other.
	 *
	 * If no paths have links back to their enclosure, then just return the
	 * first path.
	 */
	while ((ep = readdir(dp))) {
		if (ep->d_type == DT_DIR)	/* skip "." and ".." dirs */
			continue;

		if (!first_path)
			first_path = strdup(ep->d_name);

		enclosure_path = zfs_get_enclosure_sysfs_path(ep->d_name);
		if (!enclosure_path)
			continue;

		if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
			path = NULL;
		free(enclosure_path);
		break;
	}

end:
	if (dp != NULL)
		(void) closedir(dp);
	free(slaves_dir);
	free(realp);

	if (!path && first_path) {
		/*
		 * None of the underlying paths had a link back to their
		 * enclosure devices.  Throw up our hands and return the first
		 * underlying path.
		 */
		if (asprintf(&path, "/dev/%s", first_path) == -1)
			path = NULL;
	}

	free(first_path);
	return (path);
}
| |
| /* |
| * Return B_TRUE if device is a device mapper or multipath device. |
| * Return B_FALSE if not. |
| */ |
| boolean_t |
| zfs_dev_is_dm(const char *dev_name) |
| { |
| |
| char *tmp; |
| tmp = dm_get_underlying_path(dev_name); |
| if (tmp == NULL) |
| return (B_FALSE); |
| |
| free(tmp); |
| return (B_TRUE); |
| } |
| |
| /* |
| * By "whole disk" we mean an entire physical disk (something we can |
| * label, toggle the write cache on, etc.) as opposed to the full |
| * capacity of a pseudo-device such as lofi or did. We act as if we |
| * are labeling the disk, which should be a pretty good test of whether |
| * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if |
| * it isn't. |
| */ |
| boolean_t |
| zfs_dev_is_whole_disk(const char *dev_name) |
| { |
| struct dk_gpt *label = NULL; |
| int fd; |
| |
| if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) |
| return (B_FALSE); |
| |
| if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { |
| (void) close(fd); |
| return (B_FALSE); |
| } |
| |
| efi_free(label); |
| (void) close(fd); |
| |
| return (B_TRUE); |
| } |
| |
/*
 * Look up the underlying device for a device name.
 *
 * A name may be a symlink to a device, a partition device, or a multipath
 * device, while the caller wants the device underneath it.  The name is
 * first resolved as a device mapper device; if that yields nothing it is
 * simply resolved with realpath().  Any partition suffix is then removed
 * from the result.  A name that already is the underlying device comes back
 * unchanged.
 *
 * For example:
 *
 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
 *	dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
 *	returns:  /dev/sda
 *
 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
 *	dev_name: /dev/mapper/mpatha
 *	returns:  /dev/sda (as chosen by dm_get_underlying_path())
 *
 * 3. /dev/sda (already the underlying device)
 *	dev_name: /dev/sda
 *	returns:  /dev/sda
 *
 * 4. /dev/dm-3 (mapped to /dev/sda)
 *	dev_name: /dev/dm-3
 *	returns:  /dev/sda
 *
 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
 *	dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
 *	returns:  /dev/sdb
 *
 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../../sda2
 *	dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
 *	returns:  /dev/sda
 *
 * Returns the underlying device name, or NULL on error or no match.
 *
 * NOTE: The returned name string must be *freed*.
 */
char *
zfs_get_underlying_path(const char *dev_name)
{
	char *resolved;
	char *stripped;

	if (dev_name == NULL)
		return (NULL);

	resolved = dm_get_underlying_path(dev_name);
	if (resolved == NULL) {
		/* dev_name not a DM device, so just un-symlinkize it */
		resolved = realpath(dev_name, NULL);
		if (resolved == NULL)
			return (NULL);
	}

	stripped = zfs_strip_partition_path(resolved);
	free(resolved);

	return (stripped);
}
| |
| |
| #ifdef HAVE_LIBUDEV |
| |
| /* |
| * A disk is considered a multipath whole disk when: |
| * DEVNAME key value has "dm-" |
| * DM_UUID key exists and starts with 'mpath-' |
| * ID_PART_TABLE_TYPE key does not exist or is not gpt |
| * ID_FS_LABEL key does not exist (disk isn't labeled) |
| */ |
| static boolean_t |
| is_mpath_udev_sane(struct udev_device *dev) |
| { |
| const char *devname, *type, *uuid, *label; |
| |
| devname = udev_device_get_property_value(dev, "DEVNAME"); |
| type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); |
| uuid = udev_device_get_property_value(dev, "DM_UUID"); |
| label = udev_device_get_property_value(dev, "ID_FS_LABEL"); |
| |
| if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && |
| ((type == NULL) || (strcmp(type, "gpt") != 0)) && |
| ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) && |
| (label == NULL)) { |
| return (B_TRUE); |
| } |
| |
| return (B_FALSE); |
| } |
| |
| /* |
| * Check if a disk is a multipath "blank" disk: |
| * |
| * 1. The disk has udev values that suggest it's a multipath disk |
| * 2. The disk is not currently labeled with a filesystem of any type |
| * 3. There are no partitions on the disk |
| */ |
| boolean_t |
| is_mpath_whole_disk(const char *path) |
| { |
| struct udev *udev; |
| struct udev_device *dev = NULL; |
| char nodepath[MAXPATHLEN]; |
| char *sysname; |
| |
| if (realpath(path, nodepath) == NULL) |
| return (B_FALSE); |
| sysname = strrchr(nodepath, '/') + 1; |
| if (strncmp(sysname, "dm-", 3) != 0) |
| return (B_FALSE); |
| if ((udev = udev_new()) == NULL) |
| return (B_FALSE); |
| if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", |
| sysname)) == NULL) { |
| udev_device_unref(dev); |
| return (B_FALSE); |
| } |
| |
| /* Sanity check some udev values */ |
| boolean_t is_sane = is_mpath_udev_sane(dev); |
| udev_device_unref(dev); |
| |
| return (is_sane); |
| } |
| |
| #else /* HAVE_LIBUDEV */ |
| |
| /* ARGSUSED */ |
| boolean_t |
| is_mpath_whole_disk(const char *path) |
| { |
| return (B_FALSE); |
| } |
| |
| #endif /* HAVE_LIBUDEV */ |