| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
| * |
| * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> |
| * All rights reserved. |
| * |
| * Portions Copyright 2010 Robert Milkowski |
| * |
| * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
| * Copyright (c) 2012, 2017 by Delphix. All rights reserved. |
| * Copyright (c) 2013, Joyent, Inc. All rights reserved. |
| * Copyright (c) 2014 Integros [integros.com] |
| */ |
| |
| /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ |
| |
| /* |
| * ZFS volume emulation driver. |
| * |
| * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. |
| * Volumes are accessed through the symbolic links named: |
| * |
| * /dev/zvol/<pool_name>/<dataset_name> |
| * |
| * Volumes are persistent through reboot. No user command needs to be |
| * run before opening and using a device. |
| * |
| * On FreeBSD ZVOLs are simply GEOM providers like any other storage device |
| * in the system. Except when they're simply character devices (volmode=dev). |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/param.h> |
| #include <sys/kernel.h> |
| #include <sys/errno.h> |
| #include <sys/uio.h> |
| #include <sys/bio.h> |
| #include <sys/buf.h> |
| #include <sys/kmem.h> |
| #include <sys/conf.h> |
| #include <sys/cmn_err.h> |
| #include <sys/stat.h> |
| #include <sys/proc.h> |
| #include <sys/zap.h> |
| #include <sys/spa.h> |
| #include <sys/spa_impl.h> |
| #include <sys/zio.h> |
| #include <sys/disk.h> |
| #include <sys/dmu_traverse.h> |
| #include <sys/dnode.h> |
| #include <sys/dsl_dataset.h> |
| #include <sys/dsl_prop.h> |
| #include <sys/dsl_dir.h> |
| #include <sys/byteorder.h> |
| #include <sys/sunddi.h> |
| #include <sys/dirent.h> |
| #include <sys/policy.h> |
| #include <sys/queue.h> |
| #include <sys/fs/zfs.h> |
| #include <sys/zfs_ioctl.h> |
| #include <sys/zil.h> |
| #include <sys/zfs_znode.h> |
| #include <sys/zfs_rlock.h> |
| #include <sys/vdev_impl.h> |
| #include <sys/vdev_raidz.h> |
| #include <sys/zvol.h> |
| #include <sys/zil_impl.h> |
| #include <sys/dataset_kstats.h> |
| #include <sys/dbuf.h> |
| #include <sys/dmu_tx.h> |
| #include <sys/zfeature.h> |
| #include <sys/zio_checksum.h> |
| #include <sys/zil_impl.h> |
| #include <sys/filio.h> |
| |
| #include <geom/geom.h> |
| #include <sys/zvol.h> |
| #include <sys/zvol_impl.h> |
| |
| #include "zfs_namecheck.h" |
| |
| #define ZVOL_DUMPSIZE "dumpsize" |
| |
| #ifdef ZVOL_LOCK_DEBUG |
| #define ZVOL_RW_READER RW_WRITER |
| #define ZVOL_RW_READ_HELD RW_WRITE_HELD |
| #else |
| #define ZVOL_RW_READER RW_READER |
| #define ZVOL_RW_READ_HELD RW_READ_HELD |
| #endif |
| |
| enum zvol_geom_state { |
| ZVOL_GEOM_UNINIT, |
| ZVOL_GEOM_STOPPED, |
| ZVOL_GEOM_RUNNING, |
| }; |
| |
| struct zvol_state_os { |
| #define zso_dev _zso_state._zso_dev |
| #define zso_geom _zso_state._zso_geom |
| union { |
| /* volmode=dev */ |
| struct zvol_state_dev { |
| struct cdev *zsd_cdev; |
| uint64_t zsd_sync_cnt; |
| } _zso_dev; |
| |
| /* volmode=geom */ |
| struct zvol_state_geom { |
| struct g_provider *zsg_provider; |
| struct bio_queue_head zsg_queue; |
| struct mtx zsg_queue_mtx; |
| enum zvol_geom_state zsg_state; |
| } _zso_geom; |
| } _zso_state; |
| int zso_dying; |
| }; |
| |
| static uint32_t zvol_minors; |
| |
| SYSCTL_DECL(_vfs_zfs); |
| SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); |
| SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, |
| "Expose as GEOM providers (1), device files (2) or neither"); |
| static boolean_t zpool_on_zvol = B_FALSE; |
| SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, |
| "Allow zpools to use zvols as vdevs (DANGEROUS)"); |
| |
| /* |
| * Toggle unmap functionality. |
| */ |
| boolean_t zvol_unmap_enabled = B_TRUE; |
| |
| SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, |
| &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); |
| |
| /* |
| * zvol maximum transfer in one DMU tx. |
| */ |
| int zvol_maxphys = DMU_MAX_ACCESS / 2; |
| |
| static void zvol_ensure_zilog(zvol_state_t *zv); |
| |
| static d_open_t zvol_cdev_open; |
| static d_close_t zvol_cdev_close; |
| static d_ioctl_t zvol_cdev_ioctl; |
| static d_read_t zvol_cdev_read; |
| static d_write_t zvol_cdev_write; |
| static d_strategy_t zvol_geom_bio_strategy; |
| |
| static struct cdevsw zvol_cdevsw = { |
| .d_name = "zvol", |
| .d_version = D_VERSION, |
| .d_flags = D_DISK | D_TRACKCLOSE, |
| .d_open = zvol_cdev_open, |
| .d_close = zvol_cdev_close, |
| .d_ioctl = zvol_cdev_ioctl, |
| .d_read = zvol_cdev_read, |
| .d_write = zvol_cdev_write, |
| .d_strategy = zvol_geom_bio_strategy, |
| }; |
| |
| extern uint_t zfs_geom_probe_vdev_key; |
| |
| struct g_class zfs_zvol_class = { |
| .name = "ZFS::ZVOL", |
| .version = G_VERSION, |
| }; |
| |
| DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); |
| |
| static int zvol_geom_open(struct g_provider *pp, int flag, int count); |
| static int zvol_geom_close(struct g_provider *pp, int flag, int count); |
| static void zvol_geom_run(zvol_state_t *zv); |
| static void zvol_geom_destroy(zvol_state_t *zv); |
| static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); |
| static void zvol_geom_worker(void *arg); |
| static void zvol_geom_bio_start(struct bio *bp); |
| static int zvol_geom_bio_getattr(struct bio *bp); |
| /* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ |
| |
| /* |
| * GEOM mode implementation |
| */ |
| |
| /*ARGSUSED*/ |
| static int |
| zvol_geom_open(struct g_provider *pp, int flag, int count) |
| { |
| zvol_state_t *zv; |
| int err = 0; |
| boolean_t drop_suspend = B_FALSE; |
| |
| if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { |
| /* |
| * if zfs_geom_probe_vdev_key is set, that means that zfs is |
| * attempting to probe geom providers while looking for a |
| * replacement for a missing VDEV. In this case, the |
| * spa_namespace_lock will not be held, but it is still illegal |
| * to use a zvol as a vdev. Deadlocks can result if another |
| * thread has spa_namespace_lock |
| */ |
| return (SET_ERROR(EOPNOTSUPP)); |
| } |
| |
| retry: |
| rw_enter(&zvol_state_lock, ZVOL_RW_READER); |
| /* |
| * Obtain a copy of private under zvol_state_lock to make sure either |
| * the result of zvol free code setting private to NULL is observed, |
| * or the zv is protected from being freed because of the positive |
| * zv_open_count. |
| */ |
| zv = pp->private; |
| if (zv == NULL) { |
| rw_exit(&zvol_state_lock); |
| err = SET_ERROR(ENXIO); |
| goto out_locked; |
| } |
| |
| mutex_enter(&zv->zv_state_lock); |
| if (zv->zv_zso->zso_dying) { |
| rw_exit(&zvol_state_lock); |
| err = SET_ERROR(ENXIO); |
| goto out_zv_locked; |
| } |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); |
| |
| /* |
| * make sure zvol is not suspended during first open |
| * (hold zv_suspend_lock) and respect proper lock acquisition |
| * ordering - zv_suspend_lock before zv_state_lock |
| */ |
| if (zv->zv_open_count == 0) { |
| drop_suspend = B_TRUE; |
| if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { |
| mutex_exit(&zv->zv_state_lock); |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| mutex_enter(&zv->zv_state_lock); |
| /* check to see if zv_suspend_lock is needed */ |
| if (zv->zv_open_count != 0) { |
| rw_exit(&zv->zv_suspend_lock); |
| drop_suspend = B_FALSE; |
| } |
| } |
| } |
| rw_exit(&zvol_state_lock); |
| |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| if (zv->zv_open_count == 0) { |
| boolean_t drop_namespace = B_FALSE; |
| |
| ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); |
| |
| /* |
| * Take spa_namespace_lock to prevent lock inversion when |
| * zvols from one pool are opened as vdevs in another. |
| */ |
| if (!mutex_owned(&spa_namespace_lock)) { |
| if (!mutex_tryenter(&spa_namespace_lock)) { |
| mutex_exit(&zv->zv_state_lock); |
| rw_exit(&zv->zv_suspend_lock); |
| kern_yield(PRI_USER); |
| goto retry; |
| } else { |
| drop_namespace = B_TRUE; |
| } |
| } |
| err = zvol_first_open(zv, !(flag & FWRITE)); |
| if (drop_namespace) |
| mutex_exit(&spa_namespace_lock); |
| if (err) |
| goto out_zv_locked; |
| pp->mediasize = zv->zv_volsize; |
| pp->stripeoffset = 0; |
| pp->stripesize = zv->zv_volblocksize; |
| } |
| |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| /* |
| * Check for a bad on-disk format version now since we |
| * lied about owning the dataset readonly before. |
| */ |
| if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || |
| dmu_objset_incompatible_encryption_version(zv->zv_objset))) { |
| err = SET_ERROR(EROFS); |
| goto out_opened; |
| } |
| if (zv->zv_flags & ZVOL_EXCL) { |
| err = SET_ERROR(EBUSY); |
| goto out_opened; |
| } |
| #ifdef FEXCL |
| if (flag & FEXCL) { |
| if (zv->zv_open_count != 0) { |
| err = SET_ERROR(EBUSY); |
| goto out_opened; |
| } |
| zv->zv_flags |= ZVOL_EXCL; |
| } |
| #endif |
| |
| zv->zv_open_count += count; |
| out_opened: |
| if (zv->zv_open_count == 0) { |
| zvol_last_close(zv); |
| wakeup(zv); |
| } |
| out_zv_locked: |
| mutex_exit(&zv->zv_state_lock); |
| out_locked: |
| if (drop_suspend) |
| rw_exit(&zv->zv_suspend_lock); |
| return (err); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zvol_geom_close(struct g_provider *pp, int flag, int count) |
| { |
| zvol_state_t *zv; |
| boolean_t drop_suspend = B_TRUE; |
| int new_open_count; |
| |
| rw_enter(&zvol_state_lock, ZVOL_RW_READER); |
| zv = pp->private; |
| if (zv == NULL) { |
| rw_exit(&zvol_state_lock); |
| return (SET_ERROR(ENXIO)); |
| } |
| |
| mutex_enter(&zv->zv_state_lock); |
| if (zv->zv_flags & ZVOL_EXCL) { |
| ASSERT3U(zv->zv_open_count, ==, 1); |
| zv->zv_flags &= ~ZVOL_EXCL; |
| } |
| |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); |
| |
| /* |
| * If the open count is zero, this is a spurious close. |
| * That indicates a bug in the kernel / DDI framework. |
| */ |
| ASSERT3U(zv->zv_open_count, >, 0); |
| |
| /* |
| * make sure zvol is not suspended during last close |
| * (hold zv_suspend_lock) and respect proper lock acquisition |
| * ordering - zv_suspend_lock before zv_state_lock |
| */ |
| new_open_count = zv->zv_open_count - count; |
| if (new_open_count == 0) { |
| if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { |
| mutex_exit(&zv->zv_state_lock); |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| mutex_enter(&zv->zv_state_lock); |
| /* check to see if zv_suspend_lock is needed */ |
| new_open_count = zv->zv_open_count - count; |
| if (new_open_count != 0) { |
| rw_exit(&zv->zv_suspend_lock); |
| drop_suspend = B_FALSE; |
| } |
| } |
| } else { |
| drop_suspend = B_FALSE; |
| } |
| rw_exit(&zvol_state_lock); |
| |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| /* |
| * You may get multiple opens, but only one close. |
| */ |
| zv->zv_open_count = new_open_count; |
| if (zv->zv_open_count == 0) { |
| ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); |
| zvol_last_close(zv); |
| wakeup(zv); |
| } |
| |
| mutex_exit(&zv->zv_state_lock); |
| |
| if (drop_suspend) |
| rw_exit(&zv->zv_suspend_lock); |
| return (0); |
| } |
| |
| static void |
| zvol_geom_run(zvol_state_t *zv) |
| { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp = zsg->zsg_provider; |
| |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); |
| |
| g_error_provider(pp, 0); |
| |
| kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, |
| "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); |
| } |
| |
| static void |
| zvol_geom_destroy(zvol_state_t *zv) |
| { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp = zsg->zsg_provider; |
| |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); |
| |
| g_topology_assert(); |
| |
| mutex_enter(&zv->zv_state_lock); |
| VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING); |
| mutex_exit(&zv->zv_state_lock); |
| zsg->zsg_provider = NULL; |
| g_wither_geom(pp->geom, ENXIO); |
| } |
| |
| void |
| zvol_wait_close(zvol_state_t *zv) |
| { |
| |
| if (zv->zv_volmode != ZFS_VOLMODE_GEOM) |
| return; |
| mutex_enter(&zv->zv_state_lock); |
| zv->zv_zso->zso_dying = B_TRUE; |
| |
| if (zv->zv_open_count) |
| msleep(zv, &zv->zv_state_lock, |
| PRIBIO, "zvol:dying", 10*hz); |
| mutex_exit(&zv->zv_state_lock); |
| } |
| |
| |
| static int |
| zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) |
| { |
| int count, error, flags; |
| |
| g_topology_assert(); |
| |
| /* |
| * To make it easier we expect either open or close, but not both |
| * at the same time. |
| */ |
| KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || |
| (acr <= 0 && acw <= 0 && ace <= 0), |
| ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", |
| pp->name, acr, acw, ace)); |
| |
| if (pp->private == NULL) { |
| if (acr <= 0 && acw <= 0 && ace <= 0) |
| return (0); |
| return (pp->error); |
| } |
| |
| /* |
| * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if |
| * ace != 0, because GEOM already handles that and handles it a bit |
| * differently. GEOM allows for multiple read/exclusive consumers and |
| * ZFS allows only one exclusive consumer, no matter if it is reader or |
| * writer. I like better the way GEOM works so I'll leave it for GEOM |
| * to decide what to do. |
| */ |
| |
| count = acr + acw + ace; |
| if (count == 0) |
| return (0); |
| |
| flags = 0; |
| if (acr != 0 || ace != 0) |
| flags |= FREAD; |
| if (acw != 0) |
| flags |= FWRITE; |
| |
| g_topology_unlock(); |
| if (count > 0) |
| error = zvol_geom_open(pp, flags, count); |
| else |
| error = zvol_geom_close(pp, flags, -count); |
| g_topology_lock(); |
| return (error); |
| } |
| |
| static void |
| zvol_geom_worker(void *arg) |
| { |
| zvol_state_t *zv = arg; |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct bio *bp; |
| |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); |
| |
| thread_lock(curthread); |
| sched_prio(curthread, PRIBIO); |
| thread_unlock(curthread); |
| |
| for (;;) { |
| mtx_lock(&zsg->zsg_queue_mtx); |
| bp = bioq_takefirst(&zsg->zsg_queue); |
| if (bp == NULL) { |
| if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { |
| zsg->zsg_state = ZVOL_GEOM_RUNNING; |
| wakeup(&zsg->zsg_state); |
| mtx_unlock(&zsg->zsg_queue_mtx); |
| kthread_exit(); |
| } |
| msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, |
| PRIBIO | PDROP, "zvol:io", 0); |
| continue; |
| } |
| mtx_unlock(&zsg->zsg_queue_mtx); |
| zvol_geom_bio_strategy(bp); |
| } |
| } |
| |
| static void |
| zvol_geom_bio_start(struct bio *bp) |
| { |
| zvol_state_t *zv = bp->bio_to->private; |
| struct zvol_state_geom *zsg; |
| boolean_t first; |
| |
| if (zv == NULL) { |
| g_io_deliver(bp, ENXIO); |
| return; |
| } |
| if (bp->bio_cmd == BIO_GETATTR) { |
| if (zvol_geom_bio_getattr(bp)) |
| g_io_deliver(bp, EOPNOTSUPP); |
| return; |
| } |
| |
| if (!THREAD_CAN_SLEEP()) { |
| zsg = &zv->zv_zso->zso_geom; |
| mtx_lock(&zsg->zsg_queue_mtx); |
| first = (bioq_first(&zsg->zsg_queue) == NULL); |
| bioq_insert_tail(&zsg->zsg_queue, bp); |
| mtx_unlock(&zsg->zsg_queue_mtx); |
| if (first) |
| wakeup_one(&zsg->zsg_queue); |
| return; |
| } |
| |
| zvol_geom_bio_strategy(bp); |
| } |
| |
| static int |
| zvol_geom_bio_getattr(struct bio *bp) |
| { |
| zvol_state_t *zv; |
| |
| zv = bp->bio_to->private; |
| ASSERT3P(zv, !=, NULL); |
| |
| spa_t *spa = dmu_objset_spa(zv->zv_objset); |
| uint64_t refd, avail, usedobjs, availobjs; |
| |
| if (g_handleattr_int(bp, "GEOM::candelete", 1)) |
| return (0); |
| if (strcmp(bp->bio_attribute, "blocksavail") == 0) { |
| dmu_objset_space(zv->zv_objset, &refd, &avail, |
| &usedobjs, &availobjs); |
| if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) |
| return (0); |
| } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { |
| dmu_objset_space(zv->zv_objset, &refd, &avail, |
| &usedobjs, &availobjs); |
| if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) |
| return (0); |
| } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { |
| avail = metaslab_class_get_space(spa_normal_class(spa)); |
| avail -= metaslab_class_get_alloc(spa_normal_class(spa)); |
| if (g_handleattr_off_t(bp, "poolblocksavail", |
| avail / DEV_BSIZE)) |
| return (0); |
| } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { |
| refd = metaslab_class_get_alloc(spa_normal_class(spa)); |
| if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) |
| return (0); |
| } |
| return (1); |
| } |
| |
| static void |
| zvol_geom_bio_strategy(struct bio *bp) |
| { |
| zvol_state_t *zv; |
| uint64_t off, volsize; |
| size_t resid; |
| char *addr; |
| objset_t *os; |
| zfs_locked_range_t *lr; |
| int error = 0; |
| boolean_t doread = B_FALSE; |
| boolean_t is_dumpified; |
| boolean_t sync; |
| |
| if (bp->bio_to) |
| zv = bp->bio_to->private; |
| else |
| zv = bp->bio_dev->si_drv2; |
| |
| if (zv == NULL) { |
| error = SET_ERROR(ENXIO); |
| goto out; |
| } |
| |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| |
| switch (bp->bio_cmd) { |
| case BIO_READ: |
| doread = B_TRUE; |
| break; |
| case BIO_WRITE: |
| case BIO_FLUSH: |
| case BIO_DELETE: |
| if (zv->zv_flags & ZVOL_RDONLY) { |
| error = SET_ERROR(EROFS); |
| goto resume; |
| } |
| zvol_ensure_zilog(zv); |
| if (bp->bio_cmd == BIO_FLUSH) |
| goto sync; |
| break; |
| default: |
| error = SET_ERROR(EOPNOTSUPP); |
| goto resume; |
| } |
| |
| off = bp->bio_offset; |
| volsize = zv->zv_volsize; |
| |
| os = zv->zv_objset; |
| ASSERT3P(os, !=, NULL); |
| |
| addr = bp->bio_data; |
| resid = bp->bio_length; |
| |
| if (resid > 0 && off >= volsize) { |
| error = SET_ERROR(EIO); |
| goto resume; |
| } |
| |
| is_dumpified = B_FALSE; |
| sync = !doread && !is_dumpified && |
| zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
| |
| /* |
| * There must be no buffer changes when doing a dmu_sync() because |
| * we can't change the data whilst calculating the checksum. |
| */ |
| lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, |
| doread ? RL_READER : RL_WRITER); |
| |
| if (bp->bio_cmd == BIO_DELETE) { |
| dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); |
| error = dmu_tx_assign(tx, TXG_WAIT); |
| if (error != 0) { |
| dmu_tx_abort(tx); |
| } else { |
| zvol_log_truncate(zv, tx, off, resid, sync); |
| dmu_tx_commit(tx); |
| error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, |
| off, resid); |
| resid = 0; |
| } |
| goto unlock; |
| } |
| while (resid != 0 && off < volsize) { |
| size_t size = MIN(resid, zvol_maxphys); |
| if (doread) { |
| error = dmu_read(os, ZVOL_OBJ, off, size, addr, |
| DMU_READ_PREFETCH); |
| } else { |
| dmu_tx_t *tx = dmu_tx_create(os); |
| dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); |
| error = dmu_tx_assign(tx, TXG_WAIT); |
| if (error) { |
| dmu_tx_abort(tx); |
| } else { |
| dmu_write(os, ZVOL_OBJ, off, size, addr, tx); |
| zvol_log_write(zv, tx, off, size, sync); |
| dmu_tx_commit(tx); |
| } |
| } |
| if (error) { |
| /* convert checksum errors into IO errors */ |
| if (error == ECKSUM) |
| error = SET_ERROR(EIO); |
| break; |
| } |
| off += size; |
| addr += size; |
| resid -= size; |
| } |
| unlock: |
| zfs_rangelock_exit(lr); |
| |
| bp->bio_completed = bp->bio_length - resid; |
| if (bp->bio_completed < bp->bio_length && off > volsize) |
| error = SET_ERROR(EINVAL); |
| |
| switch (bp->bio_cmd) { |
| case BIO_FLUSH: |
| break; |
| case BIO_READ: |
| dataset_kstats_update_read_kstats(&zv->zv_kstat, |
| bp->bio_completed); |
| break; |
| case BIO_WRITE: |
| dataset_kstats_update_write_kstats(&zv->zv_kstat, |
| bp->bio_completed); |
| break; |
| case BIO_DELETE: |
| break; |
| default: |
| break; |
| } |
| |
| if (sync) { |
| sync: |
| zil_commit(zv->zv_zilog, ZVOL_OBJ); |
| } |
| resume: |
| rw_exit(&zv->zv_suspend_lock); |
| out: |
| if (bp->bio_to) |
| g_io_deliver(bp, error); |
| else |
| biofinish(bp, NULL, error); |
| } |
| |
| /* |
| * Character device mode implementation |
| */ |
| |
| static int |
| zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) |
| { |
| zvol_state_t *zv; |
| uint64_t volsize; |
| zfs_locked_range_t *lr; |
| int error = 0; |
| zfs_uio_t uio; |
| |
| zfs_uio_init(&uio, uio_s); |
| |
| zv = dev->si_drv2; |
| |
| volsize = zv->zv_volsize; |
| /* |
| * uio_loffset == volsize isn't an error as |
| * it's required for EOF processing. |
| */ |
| if (zfs_uio_resid(&uio) > 0 && |
| (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) |
| return (SET_ERROR(EIO)); |
| |
| ssize_t start_resid = zfs_uio_resid(&uio); |
| lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), |
| zfs_uio_resid(&uio), RL_READER); |
| while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { |
| uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); |
| |
| /* don't read past the end */ |
| if (bytes > volsize - zfs_uio_offset(&uio)) |
| bytes = volsize - zfs_uio_offset(&uio); |
| |
| error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); |
| if (error) { |
| /* convert checksum errors into IO errors */ |
| if (error == ECKSUM) |
| error = SET_ERROR(EIO); |
| break; |
| } |
| } |
| zfs_rangelock_exit(lr); |
| int64_t nread = start_resid - zfs_uio_resid(&uio); |
| dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); |
| |
| return (error); |
| } |
| |
| static int |
| zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) |
| { |
| zvol_state_t *zv; |
| uint64_t volsize; |
| zfs_locked_range_t *lr; |
| int error = 0; |
| boolean_t sync; |
| zfs_uio_t uio; |
| |
| zv = dev->si_drv2; |
| |
| volsize = zv->zv_volsize; |
| |
| zfs_uio_init(&uio, uio_s); |
| |
| if (zfs_uio_resid(&uio) > 0 && |
| (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) |
| return (SET_ERROR(EIO)); |
| |
| ssize_t start_resid = zfs_uio_resid(&uio); |
| sync = (ioflag & IO_SYNC) || |
| (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); |
| |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| zvol_ensure_zilog(zv); |
| |
| lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), |
| zfs_uio_resid(&uio), RL_WRITER); |
| while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { |
| uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); |
| uint64_t off = zfs_uio_offset(&uio); |
| dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); |
| |
| if (bytes > volsize - off) /* don't write past the end */ |
| bytes = volsize - off; |
| |
| dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); |
| error = dmu_tx_assign(tx, TXG_WAIT); |
| if (error) { |
| dmu_tx_abort(tx); |
| break; |
| } |
| error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); |
| if (error == 0) |
| zvol_log_write(zv, tx, off, bytes, sync); |
| dmu_tx_commit(tx); |
| |
| if (error) |
| break; |
| } |
| zfs_rangelock_exit(lr); |
| int64_t nwritten = start_resid - zfs_uio_resid(&uio); |
| dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); |
| if (sync) |
| zil_commit(zv->zv_zilog, ZVOL_OBJ); |
| rw_exit(&zv->zv_suspend_lock); |
| return (error); |
| } |
| |
| static int |
| zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) |
| { |
| zvol_state_t *zv; |
| struct zvol_state_dev *zsd; |
| int err = 0; |
| boolean_t drop_suspend = B_FALSE; |
| |
| retry: |
| rw_enter(&zvol_state_lock, ZVOL_RW_READER); |
| /* |
| * Obtain a copy of si_drv2 under zvol_state_lock to make sure either |
| * the result of zvol free code setting si_drv2 to NULL is observed, |
| * or the zv is protected from being freed because of the positive |
| * zv_open_count. |
| */ |
| zv = dev->si_drv2; |
| if (zv == NULL) { |
| rw_exit(&zvol_state_lock); |
| err = SET_ERROR(ENXIO); |
| goto out_locked; |
| } |
| |
| mutex_enter(&zv->zv_state_lock); |
| if (zv->zv_zso->zso_dying) { |
| rw_exit(&zvol_state_lock); |
| err = SET_ERROR(ENXIO); |
| goto out_zv_locked; |
| } |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); |
| |
| /* |
| * make sure zvol is not suspended during first open |
| * (hold zv_suspend_lock) and respect proper lock acquisition |
| * ordering - zv_suspend_lock before zv_state_lock |
| */ |
| if (zv->zv_open_count == 0) { |
| drop_suspend = B_TRUE; |
| if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { |
| mutex_exit(&zv->zv_state_lock); |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| mutex_enter(&zv->zv_state_lock); |
| /* check to see if zv_suspend_lock is needed */ |
| if (zv->zv_open_count != 0) { |
| rw_exit(&zv->zv_suspend_lock); |
| drop_suspend = B_FALSE; |
| } |
| } |
| } |
| rw_exit(&zvol_state_lock); |
| |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| if (zv->zv_open_count == 0) { |
| boolean_t drop_namespace = B_FALSE; |
| |
| ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); |
| |
| /* |
| * Take spa_namespace_lock to prevent lock inversion when |
| * zvols from one pool are opened as vdevs in another. |
| */ |
| if (!mutex_owned(&spa_namespace_lock)) { |
| if (!mutex_tryenter(&spa_namespace_lock)) { |
| mutex_exit(&zv->zv_state_lock); |
| rw_exit(&zv->zv_suspend_lock); |
| kern_yield(PRI_USER); |
| goto retry; |
| } else { |
| drop_namespace = B_TRUE; |
| } |
| } |
| err = zvol_first_open(zv, !(flags & FWRITE)); |
| if (drop_namespace) |
| mutex_exit(&spa_namespace_lock); |
| if (err) |
| goto out_zv_locked; |
| } |
| |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { |
| err = SET_ERROR(EROFS); |
| goto out_opened; |
| } |
| if (zv->zv_flags & ZVOL_EXCL) { |
| err = SET_ERROR(EBUSY); |
| goto out_opened; |
| } |
| #ifdef FEXCL |
| if (flags & FEXCL) { |
| if (zv->zv_open_count != 0) { |
| err = SET_ERROR(EBUSY); |
| goto out_opened; |
| } |
| zv->zv_flags |= ZVOL_EXCL; |
| } |
| #endif |
| |
| zv->zv_open_count++; |
| if (flags & (FSYNC | FDSYNC)) { |
| zsd = &zv->zv_zso->zso_dev; |
| zsd->zsd_sync_cnt++; |
| if (zsd->zsd_sync_cnt == 1 && |
| (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) |
| zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); |
| } |
| out_opened: |
| if (zv->zv_open_count == 0) { |
| zvol_last_close(zv); |
| wakeup(zv); |
| } |
| out_zv_locked: |
| mutex_exit(&zv->zv_state_lock); |
| out_locked: |
| if (drop_suspend) |
| rw_exit(&zv->zv_suspend_lock); |
| return (err); |
| } |
| |
| static int |
| zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) |
| { |
| zvol_state_t *zv; |
| struct zvol_state_dev *zsd; |
| boolean_t drop_suspend = B_TRUE; |
| |
| rw_enter(&zvol_state_lock, ZVOL_RW_READER); |
| zv = dev->si_drv2; |
| if (zv == NULL) { |
| rw_exit(&zvol_state_lock); |
| return (SET_ERROR(ENXIO)); |
| } |
| |
| mutex_enter(&zv->zv_state_lock); |
| if (zv->zv_flags & ZVOL_EXCL) { |
| ASSERT3U(zv->zv_open_count, ==, 1); |
| zv->zv_flags &= ~ZVOL_EXCL; |
| } |
| |
| ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); |
| |
| /* |
| * If the open count is zero, this is a spurious close. |
| * That indicates a bug in the kernel / DDI framework. |
| */ |
| ASSERT3U(zv->zv_open_count, >, 0); |
| /* |
| * make sure zvol is not suspended during last close |
| * (hold zv_suspend_lock) and respect proper lock acquisition |
| * ordering - zv_suspend_lock before zv_state_lock |
| */ |
| if (zv->zv_open_count == 1) { |
| if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { |
| mutex_exit(&zv->zv_state_lock); |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| mutex_enter(&zv->zv_state_lock); |
| /* check to see if zv_suspend_lock is needed */ |
| if (zv->zv_open_count != 1) { |
| rw_exit(&zv->zv_suspend_lock); |
| drop_suspend = B_FALSE; |
| } |
| } |
| } else { |
| drop_suspend = B_FALSE; |
| } |
| rw_exit(&zvol_state_lock); |
| |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| /* |
| * You may get multiple opens, but only one close. |
| */ |
| zv->zv_open_count--; |
| if (flags & (FSYNC | FDSYNC)) { |
| zsd = &zv->zv_zso->zso_dev; |
| zsd->zsd_sync_cnt--; |
| } |
| |
| if (zv->zv_open_count == 0) { |
| ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); |
| zvol_last_close(zv); |
| wakeup(zv); |
| } |
| |
| mutex_exit(&zv->zv_state_lock); |
| |
| if (drop_suspend) |
| rw_exit(&zv->zv_suspend_lock); |
| return (0); |
| } |
| |
| static int |
| zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, |
| int fflag, struct thread *td) |
| { |
| zvol_state_t *zv; |
| zfs_locked_range_t *lr; |
| off_t offset, length; |
| int error; |
| boolean_t sync; |
| |
| zv = dev->si_drv2; |
| |
| error = 0; |
| KASSERT(zv->zv_open_count > 0, |
| ("Device with zero access count in %s", __func__)); |
| |
| switch (cmd) { |
| case DIOCGSECTORSIZE: |
| *(uint32_t *)data = DEV_BSIZE; |
| break; |
| case DIOCGMEDIASIZE: |
| *(off_t *)data = zv->zv_volsize; |
| break; |
| case DIOCGFLUSH: |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| if (zv->zv_zilog != NULL) |
| zil_commit(zv->zv_zilog, ZVOL_OBJ); |
| rw_exit(&zv->zv_suspend_lock); |
| break; |
| case DIOCGDELETE: |
| if (!zvol_unmap_enabled) |
| break; |
| |
| offset = ((off_t *)data)[0]; |
| length = ((off_t *)data)[1]; |
| if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || |
| offset < 0 || offset >= zv->zv_volsize || |
| length <= 0) { |
| printf("%s: offset=%jd length=%jd\n", __func__, offset, |
| length); |
| error = SET_ERROR(EINVAL); |
| break; |
| } |
| rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); |
| zvol_ensure_zilog(zv); |
| lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, |
| RL_WRITER); |
| dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); |
| error = dmu_tx_assign(tx, TXG_WAIT); |
| if (error != 0) { |
| sync = FALSE; |
| dmu_tx_abort(tx); |
| } else { |
| sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); |
| zvol_log_truncate(zv, tx, offset, length, sync); |
| dmu_tx_commit(tx); |
| error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, |
| offset, length); |
| } |
| zfs_rangelock_exit(lr); |
| if (sync) |
| zil_commit(zv->zv_zilog, ZVOL_OBJ); |
| rw_exit(&zv->zv_suspend_lock); |
| break; |
| case DIOCGSTRIPESIZE: |
| *(off_t *)data = zv->zv_volblocksize; |
| break; |
| case DIOCGSTRIPEOFFSET: |
| *(off_t *)data = 0; |
| break; |
| case DIOCGATTR: { |
| spa_t *spa = dmu_objset_spa(zv->zv_objset); |
| struct diocgattr_arg *arg = (struct diocgattr_arg *)data; |
| uint64_t refd, avail, usedobjs, availobjs; |
| |
| if (strcmp(arg->name, "GEOM::candelete") == 0) |
| arg->value.i = 1; |
| else if (strcmp(arg->name, "blocksavail") == 0) { |
| dmu_objset_space(zv->zv_objset, &refd, &avail, |
| &usedobjs, &availobjs); |
| arg->value.off = avail / DEV_BSIZE; |
| } else if (strcmp(arg->name, "blocksused") == 0) { |
| dmu_objset_space(zv->zv_objset, &refd, &avail, |
| &usedobjs, &availobjs); |
| arg->value.off = refd / DEV_BSIZE; |
| } else if (strcmp(arg->name, "poolblocksavail") == 0) { |
| avail = metaslab_class_get_space(spa_normal_class(spa)); |
| avail -= metaslab_class_get_alloc( |
| spa_normal_class(spa)); |
| arg->value.off = avail / DEV_BSIZE; |
| } else if (strcmp(arg->name, "poolblocksused") == 0) { |
| refd = metaslab_class_get_alloc(spa_normal_class(spa)); |
| arg->value.off = refd / DEV_BSIZE; |
| } else |
| error = SET_ERROR(ENOIOCTL); |
| break; |
| } |
| case FIOSEEKHOLE: |
| case FIOSEEKDATA: { |
| off_t *off = (off_t *)data; |
| uint64_t noff; |
| boolean_t hole; |
| |
| hole = (cmd == FIOSEEKHOLE); |
| noff = *off; |
| lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, |
| RL_READER); |
| error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); |
| zfs_rangelock_exit(lr); |
| *off = noff; |
| break; |
| } |
| default: |
| error = SET_ERROR(ENOIOCTL); |
| } |
| |
| return (error); |
| } |
| |
| /* |
| * Misc. helpers |
| */ |
| |
| static void |
| zvol_ensure_zilog(zvol_state_t *zv) |
| { |
| ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); |
| |
| /* |
| * Open a ZIL if this is the first time we have written to this |
| * zvol. We protect zv->zv_zilog with zv_suspend_lock rather |
| * than zv_state_lock so that we don't need to acquire an |
| * additional lock in this path. |
| */ |
| if (zv->zv_zilog == NULL) { |
| if (!rw_tryupgrade(&zv->zv_suspend_lock)) { |
| rw_exit(&zv->zv_suspend_lock); |
| rw_enter(&zv->zv_suspend_lock, RW_WRITER); |
| } |
| if (zv->zv_zilog == NULL) { |
| zv->zv_zilog = zil_open(zv->zv_objset, |
| zvol_get_data); |
| zv->zv_flags |= ZVOL_WRITTEN_TO; |
| /* replay / destroy done in zvol_create_minor_impl() */ |
| VERIFY0(zv->zv_zilog->zl_header->zh_flags & |
| ZIL_REPLAY_NEEDED); |
| } |
| rw_downgrade(&zv->zv_suspend_lock); |
| } |
| } |
| |
| static boolean_t |
| zvol_is_zvol_impl(const char *device) |
| { |
| return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); |
| } |
| |
| static void |
| zvol_rename_minor(zvol_state_t *zv, const char *newname) |
| { |
| ASSERT(RW_LOCK_HELD(&zvol_state_lock)); |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| |
| /* move to new hashtable entry */ |
| zv->zv_hash = zvol_name_hash(zv->zv_name); |
| hlist_del(&zv->zv_hlink); |
| hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); |
| |
| if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp = zsg->zsg_provider; |
| struct g_geom *gp; |
| |
| g_topology_lock(); |
| gp = pp->geom; |
| ASSERT3P(gp, !=, NULL); |
| |
| zsg->zsg_provider = NULL; |
| g_wither_provider(pp, ENXIO); |
| |
| pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); |
| pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; |
| pp->sectorsize = DEV_BSIZE; |
| pp->mediasize = zv->zv_volsize; |
| pp->private = zv; |
| zsg->zsg_provider = pp; |
| g_error_provider(pp, 0); |
| g_topology_unlock(); |
| } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { |
| struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; |
| struct cdev *dev; |
| struct make_dev_args args; |
| |
| dev = zsd->zsd_cdev; |
| if (dev != NULL) { |
| destroy_dev(dev); |
| dev = zsd->zsd_cdev = NULL; |
| if (zv->zv_open_count > 0) { |
| zv->zv_flags &= ~ZVOL_EXCL; |
| zv->zv_open_count = 0; |
| /* XXX need suspend lock but lock order */ |
| zvol_last_close(zv); |
| } |
| } |
| |
| make_dev_args_init(&args); |
| args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; |
| args.mda_devsw = &zvol_cdevsw; |
| args.mda_cr = NULL; |
| args.mda_uid = UID_ROOT; |
| args.mda_gid = GID_OPERATOR; |
| args.mda_mode = 0640; |
| args.mda_si_drv2 = zv; |
| if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) |
| == 0) { |
| #if __FreeBSD_version > 1300130 |
| dev->si_iosize_max = maxphys; |
| #else |
| dev->si_iosize_max = MAXPHYS; |
| #endif |
| zsd->zsd_cdev = dev; |
| } |
| } |
| strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); |
| } |
| |
| /* |
| * Remove minor node for the specified volume. |
| */ |
| static void |
| zvol_free(zvol_state_t *zv) |
| { |
| ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); |
| ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); |
| ASSERT0(zv->zv_open_count); |
| |
| ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); |
| |
| rw_destroy(&zv->zv_suspend_lock); |
| zfs_rangelock_fini(&zv->zv_rangelock); |
| |
| if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp __maybe_unused = zsg->zsg_provider; |
| |
| ASSERT3P(pp->private, ==, NULL); |
| |
| g_topology_lock(); |
| zvol_geom_destroy(zv); |
| g_topology_unlock(); |
| mtx_destroy(&zsg->zsg_queue_mtx); |
| } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { |
| struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; |
| struct cdev *dev = zsd->zsd_cdev; |
| |
| if (dev != NULL) { |
| ASSERT3P(dev->si_drv2, ==, NULL); |
| destroy_dev(dev); |
| } |
| } |
| |
| mutex_destroy(&zv->zv_state_lock); |
| dataset_kstats_destroy(&zv->zv_kstat); |
| kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); |
| kmem_free(zv, sizeof (zvol_state_t)); |
| zvol_minors--; |
| } |
| |
| /* |
| * Create a minor node (plus a whole lot more) for the specified volume. |
| */ |
| static int |
| zvol_create_minor_impl(const char *name) |
| { |
| zvol_state_t *zv; |
| objset_t *os; |
| dmu_object_info_t *doi; |
| uint64_t volsize; |
| uint64_t volmode, hash; |
| int error; |
| |
| ZFS_LOG(1, "Creating ZVOL %s...", name); |
| hash = zvol_name_hash(name); |
| if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { |
| ASSERT(MUTEX_HELD(&zv->zv_state_lock)); |
| mutex_exit(&zv->zv_state_lock); |
| return (SET_ERROR(EEXIST)); |
| } |
| |
| DROP_GIANT(); |
| |
| doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); |
| |
| /* lie and say we're read-only */ |
| error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); |
| if (error) |
| goto out_doi; |
| |
| error = dmu_object_info(os, ZVOL_OBJ, doi); |
| if (error) |
| goto out_dmu_objset_disown; |
| |
| error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); |
| if (error) |
| goto out_dmu_objset_disown; |
| |
| error = dsl_prop_get_integer(name, |
| zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); |
| if (error || volmode == ZFS_VOLMODE_DEFAULT) |
| volmode = zvol_volmode; |
| error = 0; |
| |
| /* |
| * zvol_alloc equivalent ... |
| */ |
| zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); |
| zv->zv_hash = hash; |
| mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); |
| zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); |
| zv->zv_volmode = volmode; |
| if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp; |
| struct g_geom *gp; |
| |
| zsg->zsg_state = ZVOL_GEOM_UNINIT; |
| mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); |
| |
| g_topology_lock(); |
| gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); |
| gp->start = zvol_geom_bio_start; |
| gp->access = zvol_geom_access; |
| pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); |
| pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; |
| pp->sectorsize = DEV_BSIZE; |
| pp->mediasize = 0; |
| pp->private = zv; |
| |
| zsg->zsg_provider = pp; |
| bioq_init(&zsg->zsg_queue); |
| } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { |
| struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; |
| struct cdev *dev; |
| struct make_dev_args args; |
| |
| make_dev_args_init(&args); |
| args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; |
| args.mda_devsw = &zvol_cdevsw; |
| args.mda_cr = NULL; |
| args.mda_uid = UID_ROOT; |
| args.mda_gid = GID_OPERATOR; |
| args.mda_mode = 0640; |
| args.mda_si_drv2 = zv; |
| if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) |
| == 0) { |
| #if __FreeBSD_version > 1300130 |
| dev->si_iosize_max = maxphys; |
| #else |
| dev->si_iosize_max = MAXPHYS; |
| #endif |
| zsd->zsd_cdev = dev; |
| } |
| } |
| (void) strlcpy(zv->zv_name, name, MAXPATHLEN); |
| rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); |
| zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); |
| |
| if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) |
| zv->zv_flags |= ZVOL_RDONLY; |
| |
| zv->zv_volblocksize = doi->doi_data_block_size; |
| zv->zv_volsize = volsize; |
| zv->zv_objset = os; |
| |
| ASSERT3P(zv->zv_zilog, ==, NULL); |
| zv->zv_zilog = zil_open(os, zvol_get_data); |
| if (spa_writeable(dmu_objset_spa(os))) { |
| if (zil_replay_disable) |
| zil_destroy(zv->zv_zilog, B_FALSE); |
| else |
| zil_replay(os, zv, zvol_replay_vector); |
| } |
| zil_close(zv->zv_zilog); |
| zv->zv_zilog = NULL; |
| ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); |
| dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); |
| |
| /* TODO: prefetch for geom tasting */ |
| |
| zv->zv_objset = NULL; |
| out_dmu_objset_disown: |
| dmu_objset_disown(os, B_TRUE, FTAG); |
| |
| if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { |
| zvol_geom_run(zv); |
| g_topology_unlock(); |
| } |
| out_doi: |
| kmem_free(doi, sizeof (dmu_object_info_t)); |
| if (error == 0) { |
| rw_enter(&zvol_state_lock, RW_WRITER); |
| zvol_insert(zv); |
| zvol_minors++; |
| rw_exit(&zvol_state_lock); |
| ZFS_LOG(1, "ZVOL %s created.", name); |
| } |
| PICKUP_GIANT(); |
| return (error); |
| } |
| |
| static void |
| zvol_clear_private(zvol_state_t *zv) |
| { |
| ASSERT(RW_LOCK_HELD(&zvol_state_lock)); |
| if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp = zsg->zsg_provider; |
| |
| if (pp->private == NULL) /* already cleared */ |
| return; |
| |
| mtx_lock(&zsg->zsg_queue_mtx); |
| zsg->zsg_state = ZVOL_GEOM_STOPPED; |
| pp->private = NULL; |
| wakeup_one(&zsg->zsg_queue); |
| while (zsg->zsg_state != ZVOL_GEOM_RUNNING) |
| msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, |
| 0, "zvol:w", 0); |
| mtx_unlock(&zsg->zsg_queue_mtx); |
| ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); |
| } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { |
| struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; |
| struct cdev *dev = zsd->zsd_cdev; |
| |
| if (dev != NULL) |
| dev->si_drv2 = NULL; |
| } |
| } |
| |
| static int |
| zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) |
| { |
| zv->zv_volsize = volsize; |
| if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { |
| struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; |
| struct g_provider *pp = zsg->zsg_provider; |
| |
| g_topology_lock(); |
| |
| if (pp->private == NULL) { |
| g_topology_unlock(); |
| return (SET_ERROR(ENXIO)); |
| } |
| |
| /* |
| * Do not invoke resize event when initial size was zero. |
| * ZVOL initializes the size on first open, this is not |
| * real resizing. |
| */ |
| if (pp->mediasize == 0) |
| pp->mediasize = zv->zv_volsize; |
| else |
| g_resize_provider(pp, zv->zv_volsize); |
| |
| g_topology_unlock(); |
| } |
| return (0); |
| } |
| |
| static void |
| zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) |
| { |
| // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); |
| } |
| |
| static void |
| zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) |
| { |
| // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); |
| } |
| |
| const static zvol_platform_ops_t zvol_freebsd_ops = { |
| .zv_free = zvol_free, |
| .zv_rename_minor = zvol_rename_minor, |
| .zv_create_minor = zvol_create_minor_impl, |
| .zv_update_volsize = zvol_update_volsize, |
| .zv_clear_private = zvol_clear_private, |
| .zv_is_zvol = zvol_is_zvol_impl, |
| .zv_set_disk_ro = zvol_set_disk_ro_impl, |
| .zv_set_capacity = zvol_set_capacity_impl, |
| }; |
| |
| /* |
| * Public interfaces |
| */ |
| |
| int |
| zvol_busy(void) |
| { |
| return (zvol_minors != 0); |
| } |
| |
| int |
| zvol_init(void) |
| { |
| zvol_init_impl(); |
| zvol_register_ops(&zvol_freebsd_ops); |
| return (0); |
| } |
| |
| void |
| zvol_fini(void) |
| { |
| zvol_fini_impl(); |
| } |