xref: /freebsd-src/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c (revision 7a7741af18d6c8a804cc643cb7ecda9d730c6aa6)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy /*
22eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23eda14cbcSMatt Macy  *
24eda14cbcSMatt Macy  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25eda14cbcSMatt Macy  * All rights reserved.
26eda14cbcSMatt Macy  *
27eda14cbcSMatt Macy  * Portions Copyright 2010 Robert Milkowski
28eda14cbcSMatt Macy  *
29eda14cbcSMatt Macy  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
30eda14cbcSMatt Macy  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
31eda14cbcSMatt Macy  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
32eda14cbcSMatt Macy  * Copyright (c) 2014 Integros [integros.com]
33ce4dcb97SMartin Matuska  * Copyright (c) 2024, Klara, Inc.
34eda14cbcSMatt Macy  */
35eda14cbcSMatt Macy 
36eda14cbcSMatt Macy /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
37eda14cbcSMatt Macy 
/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character devices
 * (volmode=dev).
 */
52eda14cbcSMatt Macy 
53eda14cbcSMatt Macy #include <sys/types.h>
54eda14cbcSMatt Macy #include <sys/param.h>
55eda14cbcSMatt Macy #include <sys/kernel.h>
56eda14cbcSMatt Macy #include <sys/errno.h>
57eda14cbcSMatt Macy #include <sys/uio.h>
58eda14cbcSMatt Macy #include <sys/bio.h>
59eda14cbcSMatt Macy #include <sys/buf.h>
60eda14cbcSMatt Macy #include <sys/kmem.h>
61eda14cbcSMatt Macy #include <sys/conf.h>
62eda14cbcSMatt Macy #include <sys/cmn_err.h>
63eda14cbcSMatt Macy #include <sys/stat.h>
64eda14cbcSMatt Macy #include <sys/proc.h>
65eda14cbcSMatt Macy #include <sys/zap.h>
66eda14cbcSMatt Macy #include <sys/spa.h>
67eda14cbcSMatt Macy #include <sys/spa_impl.h>
68eda14cbcSMatt Macy #include <sys/zio.h>
69eda14cbcSMatt Macy #include <sys/disk.h>
70eda14cbcSMatt Macy #include <sys/dmu_traverse.h>
71eda14cbcSMatt Macy #include <sys/dnode.h>
72eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
73eda14cbcSMatt Macy #include <sys/dsl_prop.h>
74eda14cbcSMatt Macy #include <sys/dsl_dir.h>
75eda14cbcSMatt Macy #include <sys/byteorder.h>
76eda14cbcSMatt Macy #include <sys/sunddi.h>
77eda14cbcSMatt Macy #include <sys/dirent.h>
78eda14cbcSMatt Macy #include <sys/policy.h>
79eda14cbcSMatt Macy #include <sys/queue.h>
80eda14cbcSMatt Macy #include <sys/fs/zfs.h>
81eda14cbcSMatt Macy #include <sys/zfs_ioctl.h>
82eda14cbcSMatt Macy #include <sys/zil.h>
83eda14cbcSMatt Macy #include <sys/zfs_znode.h>
84eda14cbcSMatt Macy #include <sys/zfs_rlock.h>
85eda14cbcSMatt Macy #include <sys/vdev_impl.h>
86eda14cbcSMatt Macy #include <sys/vdev_raidz.h>
87eda14cbcSMatt Macy #include <sys/zvol.h>
88eda14cbcSMatt Macy #include <sys/zil_impl.h>
89eda14cbcSMatt Macy #include <sys/dataset_kstats.h>
90eda14cbcSMatt Macy #include <sys/dbuf.h>
91eda14cbcSMatt Macy #include <sys/dmu_tx.h>
92eda14cbcSMatt Macy #include <sys/zfeature.h>
93eda14cbcSMatt Macy #include <sys/zio_checksum.h>
94eda14cbcSMatt Macy #include <sys/zil_impl.h>
95eda14cbcSMatt Macy #include <sys/filio.h>
96c7046f76SMartin Matuska #include <sys/freebsd_event.h>
97eda14cbcSMatt Macy 
98eda14cbcSMatt Macy #include <geom/geom.h>
99eda14cbcSMatt Macy #include <sys/zvol.h>
100eda14cbcSMatt Macy #include <sys/zvol_impl.h>
101eda14cbcSMatt Macy 
102eda14cbcSMatt Macy #include "zfs_namecheck.h"
103eda14cbcSMatt Macy 
/* NOTE(review): not referenced in the visible part of this file. */
#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * Lock mode and "is held" predicate used for the reader-side acquisitions of
 * zvol_state_lock and zv_suspend_lock in this file.  When ZVOL_LOCK_DEBUG is
 * defined they expand to the writer variants, so every such acquisition
 * becomes exclusive; otherwise they are the ordinary reader variants.
 */
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
113eda14cbcSMatt Macy 
/*
 * Value stored in zsg_state for volmode=geom zvols.
 *
 * As used in the visible part of this file: when zvol_geom_worker() finds its
 * queue empty and the state is ZVOL_GEOM_STOPPED, it sets the state to
 * ZVOL_GEOM_RUNNING, wakes sleepers on &zsg_state and exits; and
 * zvol_geom_destroy() verifies the state is ZVOL_GEOM_RUNNING.
 * NOTE(review): where UNINIT and STOPPED are assigned is outside this excerpt.
 */
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};
119eda14cbcSMatt Macy 
/*
 * FreeBSD-specific per-zvol state, reached through zv->zv_zso.
 *
 * The dev and geom members share storage in a union; the zso_dev and zso_geom
 * macros give short names for the two arms.  Code in this file asserts
 * zv_volmode == ZFS_VOLMODE_GEOM before touching the geom arm.
 */
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			/* si_note holds the knotes added by zvol_cdev_kqfilter(). */
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			/* Bios deferred to zvol_geom_worker(). */
			struct bio_queue_head zsg_queue;
			/* Protects zsg_queue; the worker also reads zsg_state under it. */
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	/* Set by zvol_wait_close(); makes zvol_geom_open() fail with ENXIO. */
	int zso_dying;
};
140eda14cbcSMatt Macy 
/* NOTE(review): not modified in the visible part of this file. */
static uint32_t zvol_minors;

/* Tunables below live under the vfs.zfs.vol sysctl node. */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
/* vfs.zfs.vol.mode; zvol_volmode itself is defined outside this file. */
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
/*
 * vfs.zfs.vol.recursive; when false, zvol_geom_open() refuses opens made
 * while ZFS is probing GEOM providers for a vdev.
 */
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

/* vfs.zfs.vol.unmap_enabled */
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
163eda14cbcSMatt Macy 
static void zvol_ensure_zilog(zvol_state_t *zv);

/* Character-device entry points (volmode=dev). */
static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

/*
 * Device switch for zvol character devices.  The strategy entry point is
 * shared with the GEOM path: zvol_geom_bio_strategy() serves both.
 */
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void		zvol_filter_detach(struct knote *kn);
static int		zvol_filter_vnode(struct knote *kn, long hint);

/* Filter operations installed on knotes by zvol_cdev_kqfilter(). */
static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};
195eda14cbcSMatt Macy 
/*
 * Thread-specific-data key defined elsewhere; zvol_geom_open() treats a
 * non-NULL value as "ZFS is currently probing GEOM providers for a vdev".
 */
extern uint_t zfs_geom_probe_vdev_key;

/* GEOM class under which zvol providers are registered. */
struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

/* GEOM-mode helpers implemented below. */
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */
214eda14cbcSMatt Macy 
215eda14cbcSMatt Macy /*
216eda14cbcSMatt Macy  * GEOM mode implementation
217eda14cbcSMatt Macy  */
218eda14cbcSMatt Macy 
219eda14cbcSMatt Macy static int
220eda14cbcSMatt Macy zvol_geom_open(struct g_provider *pp, int flag, int count)
221eda14cbcSMatt Macy {
222eda14cbcSMatt Macy 	zvol_state_t *zv;
223eda14cbcSMatt Macy 	int err = 0;
2247877fdebSMatt Macy 	boolean_t drop_suspend = B_FALSE;
225eda14cbcSMatt Macy 
226eda14cbcSMatt Macy 	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
227eda14cbcSMatt Macy 		/*
228e92ffd9bSMartin Matuska 		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
229eda14cbcSMatt Macy 		 * attempting to probe geom providers while looking for a
230eda14cbcSMatt Macy 		 * replacement for a missing VDEV.  In this case, the
231eda14cbcSMatt Macy 		 * spa_namespace_lock will not be held, but it is still illegal
232eda14cbcSMatt Macy 		 * to use a zvol as a vdev.  Deadlocks can result if another
233e92ffd9bSMartin Matuska 		 * thread has spa_namespace_lock.
234eda14cbcSMatt Macy 		 */
235eda14cbcSMatt Macy 		return (SET_ERROR(EOPNOTSUPP));
236eda14cbcSMatt Macy 	}
237eda14cbcSMatt Macy 
238eac7052fSMatt Macy retry:
239eda14cbcSMatt Macy 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
240e92ffd9bSMartin Matuska 	/*
241e92ffd9bSMartin Matuska 	 * Obtain a copy of private under zvol_state_lock to make sure either
242e92ffd9bSMartin Matuska 	 * the result of zvol free code setting private to NULL is observed,
243e92ffd9bSMartin Matuska 	 * or the zv is protected from being freed because of the positive
244e92ffd9bSMartin Matuska 	 * zv_open_count.
245e92ffd9bSMartin Matuska 	 */
246eda14cbcSMatt Macy 	zv = pp->private;
247eda14cbcSMatt Macy 	if (zv == NULL) {
248eda14cbcSMatt Macy 		rw_exit(&zvol_state_lock);
2497877fdebSMatt Macy 		err = SET_ERROR(ENXIO);
2507877fdebSMatt Macy 		goto out_locked;
251eda14cbcSMatt Macy 	}
252eda14cbcSMatt Macy 
253eda14cbcSMatt Macy 	mutex_enter(&zv->zv_state_lock);
254ce4dcb97SMartin Matuska 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
2557877fdebSMatt Macy 		rw_exit(&zvol_state_lock);
2567877fdebSMatt Macy 		err = SET_ERROR(ENXIO);
2577877fdebSMatt Macy 		goto out_zv_locked;
2587877fdebSMatt Macy 	}
2597877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
260eda14cbcSMatt Macy 
261eda14cbcSMatt Macy 	/*
262e92ffd9bSMartin Matuska 	 * Make sure zvol is not suspended during first open
263eda14cbcSMatt Macy 	 * (hold zv_suspend_lock) and respect proper lock acquisition
264e92ffd9bSMartin Matuska 	 * ordering - zv_suspend_lock before zv_state_lock.
265eda14cbcSMatt Macy 	 */
266eda14cbcSMatt Macy 	if (zv->zv_open_count == 0) {
2677877fdebSMatt Macy 		drop_suspend = B_TRUE;
268eda14cbcSMatt Macy 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
269eda14cbcSMatt Macy 			mutex_exit(&zv->zv_state_lock);
270eda14cbcSMatt Macy 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
271eda14cbcSMatt Macy 			mutex_enter(&zv->zv_state_lock);
272e92ffd9bSMartin Matuska 			/* Check to see if zv_suspend_lock is needed. */
273eda14cbcSMatt Macy 			if (zv->zv_open_count != 0) {
274eda14cbcSMatt Macy 				rw_exit(&zv->zv_suspend_lock);
275eda14cbcSMatt Macy 				drop_suspend = B_FALSE;
276eda14cbcSMatt Macy 			}
277eda14cbcSMatt Macy 		}
278eda14cbcSMatt Macy 	}
279eda14cbcSMatt Macy 	rw_exit(&zvol_state_lock);
280eda14cbcSMatt Macy 
281eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
282eda14cbcSMatt Macy 
283eda14cbcSMatt Macy 	if (zv->zv_open_count == 0) {
284e92ffd9bSMartin Matuska 		boolean_t drop_namespace = B_FALSE;
285e92ffd9bSMartin Matuska 
286eda14cbcSMatt Macy 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
287e92ffd9bSMartin Matuska 
288e92ffd9bSMartin Matuska 		/*
289e92ffd9bSMartin Matuska 		 * Take spa_namespace_lock to prevent lock inversion when
290e92ffd9bSMartin Matuska 		 * zvols from one pool are opened as vdevs in another.
291e92ffd9bSMartin Matuska 		 */
292e92ffd9bSMartin Matuska 		if (!mutex_owned(&spa_namespace_lock)) {
293e92ffd9bSMartin Matuska 			if (!mutex_tryenter(&spa_namespace_lock)) {
294e92ffd9bSMartin Matuska 				mutex_exit(&zv->zv_state_lock);
295e92ffd9bSMartin Matuska 				rw_exit(&zv->zv_suspend_lock);
29675e1fea6SMartin Matuska 				drop_suspend = B_FALSE;
297e92ffd9bSMartin Matuska 				kern_yield(PRI_USER);
298e92ffd9bSMartin Matuska 				goto retry;
299e92ffd9bSMartin Matuska 			} else {
300e92ffd9bSMartin Matuska 				drop_namespace = B_TRUE;
301e92ffd9bSMartin Matuska 			}
302e92ffd9bSMartin Matuska 		}
303eda14cbcSMatt Macy 		err = zvol_first_open(zv, !(flag & FWRITE));
304e92ffd9bSMartin Matuska 		if (drop_namespace)
305e92ffd9bSMartin Matuska 			mutex_exit(&spa_namespace_lock);
306eda14cbcSMatt Macy 		if (err)
3077877fdebSMatt Macy 			goto out_zv_locked;
308eda14cbcSMatt Macy 		pp->mediasize = zv->zv_volsize;
309eda14cbcSMatt Macy 		pp->stripeoffset = 0;
310eda14cbcSMatt Macy 		pp->stripesize = zv->zv_volblocksize;
311eda14cbcSMatt Macy 	}
312eda14cbcSMatt Macy 
313e92ffd9bSMartin Matuska 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
314e92ffd9bSMartin Matuska 
315eda14cbcSMatt Macy 	/*
316eda14cbcSMatt Macy 	 * Check for a bad on-disk format version now since we
317eda14cbcSMatt Macy 	 * lied about owning the dataset readonly before.
318eda14cbcSMatt Macy 	 */
319eda14cbcSMatt Macy 	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
320eda14cbcSMatt Macy 	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
3217877fdebSMatt Macy 		err = SET_ERROR(EROFS);
3227877fdebSMatt Macy 		goto out_opened;
323eda14cbcSMatt Macy 	}
324eda14cbcSMatt Macy 	if (zv->zv_flags & ZVOL_EXCL) {
3257877fdebSMatt Macy 		err = SET_ERROR(EBUSY);
3267877fdebSMatt Macy 		goto out_opened;
327eda14cbcSMatt Macy 	}
328716fd348SMartin Matuska 	if (flag & O_EXCL) {
329eda14cbcSMatt Macy 		if (zv->zv_open_count != 0) {
3307877fdebSMatt Macy 			err = SET_ERROR(EBUSY);
3317877fdebSMatt Macy 			goto out_opened;
332eda14cbcSMatt Macy 		}
333eda14cbcSMatt Macy 		zv->zv_flags |= ZVOL_EXCL;
334eda14cbcSMatt Macy 	}
335eda14cbcSMatt Macy 
336eda14cbcSMatt Macy 	zv->zv_open_count += count;
3377877fdebSMatt Macy out_opened:
3387877fdebSMatt Macy 	if (zv->zv_open_count == 0) {
339eda14cbcSMatt Macy 		zvol_last_close(zv);
3407877fdebSMatt Macy 		wakeup(zv);
3417877fdebSMatt Macy 	}
3427877fdebSMatt Macy out_zv_locked:
3437877fdebSMatt Macy 	mutex_exit(&zv->zv_state_lock);
3447877fdebSMatt Macy out_locked:
345eda14cbcSMatt Macy 	if (drop_suspend)
346eda14cbcSMatt Macy 		rw_exit(&zv->zv_suspend_lock);
3477877fdebSMatt Macy 	return (err);
348eda14cbcSMatt Macy }
349eda14cbcSMatt Macy 
350eda14cbcSMatt Macy static int
351eda14cbcSMatt Macy zvol_geom_close(struct g_provider *pp, int flag, int count)
352eda14cbcSMatt Macy {
353c03c5b1cSMartin Matuska 	(void) flag;
354eda14cbcSMatt Macy 	zvol_state_t *zv;
355eda14cbcSMatt Macy 	boolean_t drop_suspend = B_TRUE;
3567877fdebSMatt Macy 	int new_open_count;
357eda14cbcSMatt Macy 
358eda14cbcSMatt Macy 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
359eda14cbcSMatt Macy 	zv = pp->private;
360eda14cbcSMatt Macy 	if (zv == NULL) {
361eda14cbcSMatt Macy 		rw_exit(&zvol_state_lock);
362eda14cbcSMatt Macy 		return (SET_ERROR(ENXIO));
363eda14cbcSMatt Macy 	}
364eda14cbcSMatt Macy 
365eda14cbcSMatt Macy 	mutex_enter(&zv->zv_state_lock);
366eda14cbcSMatt Macy 	if (zv->zv_flags & ZVOL_EXCL) {
3677877fdebSMatt Macy 		ASSERT3U(zv->zv_open_count, ==, 1);
368eda14cbcSMatt Macy 		zv->zv_flags &= ~ZVOL_EXCL;
369eda14cbcSMatt Macy 	}
370eda14cbcSMatt Macy 
3717877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
372eda14cbcSMatt Macy 
373eda14cbcSMatt Macy 	/*
374eda14cbcSMatt Macy 	 * If the open count is zero, this is a spurious close.
375eda14cbcSMatt Macy 	 * That indicates a bug in the kernel / DDI framework.
376eda14cbcSMatt Macy 	 */
3777877fdebSMatt Macy 	ASSERT3U(zv->zv_open_count, >, 0);
378eda14cbcSMatt Macy 
379eda14cbcSMatt Macy 	/*
380e92ffd9bSMartin Matuska 	 * Make sure zvol is not suspended during last close
381eda14cbcSMatt Macy 	 * (hold zv_suspend_lock) and respect proper lock acquisition
382e92ffd9bSMartin Matuska 	 * ordering - zv_suspend_lock before zv_state_lock.
383eda14cbcSMatt Macy 	 */
3847877fdebSMatt Macy 	new_open_count = zv->zv_open_count - count;
3857877fdebSMatt Macy 	if (new_open_count == 0) {
386eda14cbcSMatt Macy 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
387eda14cbcSMatt Macy 			mutex_exit(&zv->zv_state_lock);
388eda14cbcSMatt Macy 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
389eda14cbcSMatt Macy 			mutex_enter(&zv->zv_state_lock);
390e92ffd9bSMartin Matuska 			/* Check to see if zv_suspend_lock is needed. */
3917877fdebSMatt Macy 			new_open_count = zv->zv_open_count - count;
3927877fdebSMatt Macy 			if (new_open_count != 0) {
393eda14cbcSMatt Macy 				rw_exit(&zv->zv_suspend_lock);
394eda14cbcSMatt Macy 				drop_suspend = B_FALSE;
395eda14cbcSMatt Macy 			}
396eda14cbcSMatt Macy 		}
397eda14cbcSMatt Macy 	} else {
398eda14cbcSMatt Macy 		drop_suspend = B_FALSE;
399eda14cbcSMatt Macy 	}
400eda14cbcSMatt Macy 	rw_exit(&zvol_state_lock);
401eda14cbcSMatt Macy 
402eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
403eda14cbcSMatt Macy 
404eda14cbcSMatt Macy 	/*
405eda14cbcSMatt Macy 	 * You may get multiple opens, but only one close.
406eda14cbcSMatt Macy 	 */
4077877fdebSMatt Macy 	zv->zv_open_count = new_open_count;
408eda14cbcSMatt Macy 	if (zv->zv_open_count == 0) {
409eda14cbcSMatt Macy 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
410eda14cbcSMatt Macy 		zvol_last_close(zv);
4117877fdebSMatt Macy 		wakeup(zv);
412eda14cbcSMatt Macy 	}
413eda14cbcSMatt Macy 
414eda14cbcSMatt Macy 	mutex_exit(&zv->zv_state_lock);
415eda14cbcSMatt Macy 
416eda14cbcSMatt Macy 	if (drop_suspend)
417eda14cbcSMatt Macy 		rw_exit(&zv->zv_suspend_lock);
418eda14cbcSMatt Macy 	return (0);
419eda14cbcSMatt Macy }
420eda14cbcSMatt Macy 
421eda14cbcSMatt Macy static void
422eda14cbcSMatt Macy zvol_geom_run(zvol_state_t *zv)
423eda14cbcSMatt Macy {
424eda14cbcSMatt Macy 	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
425eda14cbcSMatt Macy 	struct g_provider *pp = zsg->zsg_provider;
426eda14cbcSMatt Macy 
4277877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
428eda14cbcSMatt Macy 
429eda14cbcSMatt Macy 	g_error_provider(pp, 0);
430eda14cbcSMatt Macy 
431eda14cbcSMatt Macy 	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
432eda14cbcSMatt Macy 	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
433eda14cbcSMatt Macy }
434eda14cbcSMatt Macy 
435eda14cbcSMatt Macy static void
436eda14cbcSMatt Macy zvol_geom_destroy(zvol_state_t *zv)
437eda14cbcSMatt Macy {
438eda14cbcSMatt Macy 	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
439eda14cbcSMatt Macy 	struct g_provider *pp = zsg->zsg_provider;
440eda14cbcSMatt Macy 
4417877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
442eda14cbcSMatt Macy 
443eda14cbcSMatt Macy 	g_topology_assert();
444eda14cbcSMatt Macy 
445eda14cbcSMatt Macy 	mutex_enter(&zv->zv_state_lock);
44616038816SMartin Matuska 	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
447eda14cbcSMatt Macy 	mutex_exit(&zv->zv_state_lock);
448eda14cbcSMatt Macy 	zsg->zsg_provider = NULL;
449eda14cbcSMatt Macy 	g_wither_geom(pp->geom, ENXIO);
450eda14cbcSMatt Macy }
451eda14cbcSMatt Macy 
4527877fdebSMatt Macy void
4537877fdebSMatt Macy zvol_wait_close(zvol_state_t *zv)
4547877fdebSMatt Macy {
4557877fdebSMatt Macy 
4567877fdebSMatt Macy 	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
4577877fdebSMatt Macy 		return;
4587877fdebSMatt Macy 	mutex_enter(&zv->zv_state_lock);
4597877fdebSMatt Macy 	zv->zv_zso->zso_dying = B_TRUE;
4607877fdebSMatt Macy 
4617877fdebSMatt Macy 	if (zv->zv_open_count)
4627877fdebSMatt Macy 		msleep(zv, &zv->zv_state_lock,
4637877fdebSMatt Macy 		    PRIBIO, "zvol:dying", 10*hz);
4647877fdebSMatt Macy 	mutex_exit(&zv->zv_state_lock);
4657877fdebSMatt Macy }
4667877fdebSMatt Macy 
4677877fdebSMatt Macy 
468eda14cbcSMatt Macy static int
469eda14cbcSMatt Macy zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
470eda14cbcSMatt Macy {
471eda14cbcSMatt Macy 	int count, error, flags;
472eda14cbcSMatt Macy 
473eda14cbcSMatt Macy 	g_topology_assert();
474eda14cbcSMatt Macy 
475eda14cbcSMatt Macy 	/*
476eda14cbcSMatt Macy 	 * To make it easier we expect either open or close, but not both
477eda14cbcSMatt Macy 	 * at the same time.
478eda14cbcSMatt Macy 	 */
479eda14cbcSMatt Macy 	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
480eda14cbcSMatt Macy 	    (acr <= 0 && acw <= 0 && ace <= 0),
481eda14cbcSMatt Macy 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
482eda14cbcSMatt Macy 	    pp->name, acr, acw, ace));
483eda14cbcSMatt Macy 
484eda14cbcSMatt Macy 	if (pp->private == NULL) {
485eda14cbcSMatt Macy 		if (acr <= 0 && acw <= 0 && ace <= 0)
486eda14cbcSMatt Macy 			return (0);
487eda14cbcSMatt Macy 		return (pp->error);
488eda14cbcSMatt Macy 	}
489eda14cbcSMatt Macy 
490eda14cbcSMatt Macy 	/*
491eda14cbcSMatt Macy 	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
492eda14cbcSMatt Macy 	 * ace != 0, because GEOM already handles that and handles it a bit
493eda14cbcSMatt Macy 	 * differently. GEOM allows for multiple read/exclusive consumers and
494eda14cbcSMatt Macy 	 * ZFS allows only one exclusive consumer, no matter if it is reader or
495eda14cbcSMatt Macy 	 * writer. I like better the way GEOM works so I'll leave it for GEOM
496eda14cbcSMatt Macy 	 * to decide what to do.
497eda14cbcSMatt Macy 	 */
498eda14cbcSMatt Macy 
499eda14cbcSMatt Macy 	count = acr + acw + ace;
500eda14cbcSMatt Macy 	if (count == 0)
501eda14cbcSMatt Macy 		return (0);
502eda14cbcSMatt Macy 
503eda14cbcSMatt Macy 	flags = 0;
504eda14cbcSMatt Macy 	if (acr != 0 || ace != 0)
505eda14cbcSMatt Macy 		flags |= FREAD;
506eda14cbcSMatt Macy 	if (acw != 0)
507eda14cbcSMatt Macy 		flags |= FWRITE;
508eda14cbcSMatt Macy 
509eda14cbcSMatt Macy 	g_topology_unlock();
510eda14cbcSMatt Macy 	if (count > 0)
511eda14cbcSMatt Macy 		error = zvol_geom_open(pp, flags, count);
512eda14cbcSMatt Macy 	else
513eda14cbcSMatt Macy 		error = zvol_geom_close(pp, flags, -count);
514eda14cbcSMatt Macy 	g_topology_lock();
515eda14cbcSMatt Macy 	return (error);
516eda14cbcSMatt Macy }
517eda14cbcSMatt Macy 
518eda14cbcSMatt Macy static void
519eda14cbcSMatt Macy zvol_geom_worker(void *arg)
520eda14cbcSMatt Macy {
521eda14cbcSMatt Macy 	zvol_state_t *zv = arg;
522eda14cbcSMatt Macy 	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
523eda14cbcSMatt Macy 	struct bio *bp;
524eda14cbcSMatt Macy 
5257877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
526eda14cbcSMatt Macy 
527eda14cbcSMatt Macy 	thread_lock(curthread);
528eda14cbcSMatt Macy 	sched_prio(curthread, PRIBIO);
529eda14cbcSMatt Macy 	thread_unlock(curthread);
530eda14cbcSMatt Macy 
531eda14cbcSMatt Macy 	for (;;) {
532eda14cbcSMatt Macy 		mtx_lock(&zsg->zsg_queue_mtx);
533eda14cbcSMatt Macy 		bp = bioq_takefirst(&zsg->zsg_queue);
534eda14cbcSMatt Macy 		if (bp == NULL) {
535eda14cbcSMatt Macy 			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
536eda14cbcSMatt Macy 				zsg->zsg_state = ZVOL_GEOM_RUNNING;
537eda14cbcSMatt Macy 				wakeup(&zsg->zsg_state);
538eda14cbcSMatt Macy 				mtx_unlock(&zsg->zsg_queue_mtx);
539eda14cbcSMatt Macy 				kthread_exit();
540eda14cbcSMatt Macy 			}
541eda14cbcSMatt Macy 			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
542eda14cbcSMatt Macy 			    PRIBIO | PDROP, "zvol:io", 0);
543eda14cbcSMatt Macy 			continue;
544eda14cbcSMatt Macy 		}
545eda14cbcSMatt Macy 		mtx_unlock(&zsg->zsg_queue_mtx);
546eda14cbcSMatt Macy 		zvol_geom_bio_strategy(bp);
547eda14cbcSMatt Macy 	}
548eda14cbcSMatt Macy }
549eda14cbcSMatt Macy 
550eda14cbcSMatt Macy static void
551eda14cbcSMatt Macy zvol_geom_bio_start(struct bio *bp)
552eda14cbcSMatt Macy {
553eda14cbcSMatt Macy 	zvol_state_t *zv = bp->bio_to->private;
5547877fdebSMatt Macy 	struct zvol_state_geom *zsg;
555eda14cbcSMatt Macy 	boolean_t first;
556eda14cbcSMatt Macy 
5577877fdebSMatt Macy 	if (zv == NULL) {
5587877fdebSMatt Macy 		g_io_deliver(bp, ENXIO);
5597877fdebSMatt Macy 		return;
5607877fdebSMatt Macy 	}
561eda14cbcSMatt Macy 	if (bp->bio_cmd == BIO_GETATTR) {
562eda14cbcSMatt Macy 		if (zvol_geom_bio_getattr(bp))
563eda14cbcSMatt Macy 			g_io_deliver(bp, EOPNOTSUPP);
564eda14cbcSMatt Macy 		return;
565eda14cbcSMatt Macy 	}
566eda14cbcSMatt Macy 
567eda14cbcSMatt Macy 	if (!THREAD_CAN_SLEEP()) {
5687877fdebSMatt Macy 		zsg = &zv->zv_zso->zso_geom;
569eda14cbcSMatt Macy 		mtx_lock(&zsg->zsg_queue_mtx);
570eda14cbcSMatt Macy 		first = (bioq_first(&zsg->zsg_queue) == NULL);
571eda14cbcSMatt Macy 		bioq_insert_tail(&zsg->zsg_queue, bp);
572eda14cbcSMatt Macy 		mtx_unlock(&zsg->zsg_queue_mtx);
573eda14cbcSMatt Macy 		if (first)
574eda14cbcSMatt Macy 			wakeup_one(&zsg->zsg_queue);
575eda14cbcSMatt Macy 		return;
576eda14cbcSMatt Macy 	}
577eda14cbcSMatt Macy 
578eda14cbcSMatt Macy 	zvol_geom_bio_strategy(bp);
579eda14cbcSMatt Macy }
580eda14cbcSMatt Macy 
581eda14cbcSMatt Macy static int
582eda14cbcSMatt Macy zvol_geom_bio_getattr(struct bio *bp)
583eda14cbcSMatt Macy {
584eda14cbcSMatt Macy 	zvol_state_t *zv;
585eda14cbcSMatt Macy 
586eda14cbcSMatt Macy 	zv = bp->bio_to->private;
5877877fdebSMatt Macy 	ASSERT3P(zv, !=, NULL);
588eda14cbcSMatt Macy 
589eda14cbcSMatt Macy 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
590eda14cbcSMatt Macy 	uint64_t refd, avail, usedobjs, availobjs;
591eda14cbcSMatt Macy 
592eda14cbcSMatt Macy 	if (g_handleattr_int(bp, "GEOM::candelete", 1))
593eda14cbcSMatt Macy 		return (0);
594eda14cbcSMatt Macy 	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
595eda14cbcSMatt Macy 		dmu_objset_space(zv->zv_objset, &refd, &avail,
596eda14cbcSMatt Macy 		    &usedobjs, &availobjs);
597eda14cbcSMatt Macy 		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
598eda14cbcSMatt Macy 			return (0);
599eda14cbcSMatt Macy 	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
600eda14cbcSMatt Macy 		dmu_objset_space(zv->zv_objset, &refd, &avail,
601eda14cbcSMatt Macy 		    &usedobjs, &availobjs);
602eda14cbcSMatt Macy 		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
603eda14cbcSMatt Macy 			return (0);
604eda14cbcSMatt Macy 	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
605eda14cbcSMatt Macy 		avail = metaslab_class_get_space(spa_normal_class(spa));
606eda14cbcSMatt Macy 		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
607eda14cbcSMatt Macy 		if (g_handleattr_off_t(bp, "poolblocksavail",
608eda14cbcSMatt Macy 		    avail / DEV_BSIZE))
609eda14cbcSMatt Macy 			return (0);
610eda14cbcSMatt Macy 	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
611eda14cbcSMatt Macy 		refd = metaslab_class_get_alloc(spa_normal_class(spa));
612eda14cbcSMatt Macy 		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
613eda14cbcSMatt Macy 			return (0);
614eda14cbcSMatt Macy 	}
615eda14cbcSMatt Macy 	return (1);
616eda14cbcSMatt Macy }
617eda14cbcSMatt Macy 
618eda14cbcSMatt Macy static void
619c7046f76SMartin Matuska zvol_filter_detach(struct knote *kn)
620c7046f76SMartin Matuska {
621c7046f76SMartin Matuska 	zvol_state_t *zv;
622c7046f76SMartin Matuska 	struct zvol_state_dev *zsd;
623c7046f76SMartin Matuska 
624c7046f76SMartin Matuska 	zv = kn->kn_hook;
625c7046f76SMartin Matuska 	zsd = &zv->zv_zso->zso_dev;
626c7046f76SMartin Matuska 
627c7046f76SMartin Matuska 	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
628c7046f76SMartin Matuska }
629c7046f76SMartin Matuska 
630c7046f76SMartin Matuska static int
631c7046f76SMartin Matuska zvol_filter_vnode(struct knote *kn, long hint)
632c7046f76SMartin Matuska {
633c7046f76SMartin Matuska 	kn->kn_fflags |= kn->kn_sfflags & hint;
634c7046f76SMartin Matuska 
635c7046f76SMartin Matuska 	return (kn->kn_fflags != 0);
636c7046f76SMartin Matuska }
637c7046f76SMartin Matuska 
638c7046f76SMartin Matuska static int
639c7046f76SMartin Matuska zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
640c7046f76SMartin Matuska {
641c7046f76SMartin Matuska 	zvol_state_t *zv;
642c7046f76SMartin Matuska 	struct zvol_state_dev *zsd;
643c7046f76SMartin Matuska 
644c7046f76SMartin Matuska 	zv = dev->si_drv2;
645c7046f76SMartin Matuska 	zsd = &zv->zv_zso->zso_dev;
646c7046f76SMartin Matuska 
647c7046f76SMartin Matuska 	if (kn->kn_filter != EVFILT_VNODE)
648c7046f76SMartin Matuska 		return (EINVAL);
649c7046f76SMartin Matuska 
650c7046f76SMartin Matuska 	/* XXX: extend support for other NOTE_* events */
651c7046f76SMartin Matuska 	if (kn->kn_sfflags != NOTE_ATTRIB)
652c7046f76SMartin Matuska 		return (EINVAL);
653c7046f76SMartin Matuska 
654c7046f76SMartin Matuska 	kn->kn_fop = &zvol_filterops_vnode;
655c7046f76SMartin Matuska 	kn->kn_hook = zv;
656c7046f76SMartin Matuska 	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
657c7046f76SMartin Matuska 
658c7046f76SMartin Matuska 	return (0);
659c7046f76SMartin Matuska }
660c7046f76SMartin Matuska 
661c7046f76SMartin Matuska static void
662eda14cbcSMatt Macy zvol_geom_bio_strategy(struct bio *bp)
663eda14cbcSMatt Macy {
664eda14cbcSMatt Macy 	zvol_state_t *zv;
665eda14cbcSMatt Macy 	uint64_t off, volsize;
666eda14cbcSMatt Macy 	size_t resid;
667eda14cbcSMatt Macy 	char *addr;
668eda14cbcSMatt Macy 	objset_t *os;
669eda14cbcSMatt Macy 	zfs_locked_range_t *lr;
670eda14cbcSMatt Macy 	int error = 0;
671eda14cbcSMatt Macy 	boolean_t doread = B_FALSE;
672eda14cbcSMatt Macy 	boolean_t is_dumpified;
673f8b1db88SMartin Matuska 	boolean_t commit;
674eda14cbcSMatt Macy 
675eda14cbcSMatt Macy 	if (bp->bio_to)
676eda14cbcSMatt Macy 		zv = bp->bio_to->private;
677eda14cbcSMatt Macy 	else
678eda14cbcSMatt Macy 		zv = bp->bio_dev->si_drv2;
679eda14cbcSMatt Macy 
680eda14cbcSMatt Macy 	if (zv == NULL) {
681eda14cbcSMatt Macy 		error = SET_ERROR(ENXIO);
682eda14cbcSMatt Macy 		goto out;
683eda14cbcSMatt Macy 	}
684eda14cbcSMatt Macy 
685eda14cbcSMatt Macy 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
686eda14cbcSMatt Macy 
687ce4dcb97SMartin Matuska 	if (zv->zv_flags & ZVOL_REMOVING) {
688ce4dcb97SMartin Matuska 		error = SET_ERROR(ENXIO);
689ce4dcb97SMartin Matuska 		goto resume;
690ce4dcb97SMartin Matuska 	}
691ce4dcb97SMartin Matuska 
692eda14cbcSMatt Macy 	switch (bp->bio_cmd) {
693eda14cbcSMatt Macy 	case BIO_READ:
694eda14cbcSMatt Macy 		doread = B_TRUE;
695eda14cbcSMatt Macy 		break;
696eda14cbcSMatt Macy 	case BIO_WRITE:
697eda14cbcSMatt Macy 	case BIO_FLUSH:
698eda14cbcSMatt Macy 	case BIO_DELETE:
699eda14cbcSMatt Macy 		if (zv->zv_flags & ZVOL_RDONLY) {
700eda14cbcSMatt Macy 			error = SET_ERROR(EROFS);
701eda14cbcSMatt Macy 			goto resume;
702eda14cbcSMatt Macy 		}
703eda14cbcSMatt Macy 		zvol_ensure_zilog(zv);
704eda14cbcSMatt Macy 		if (bp->bio_cmd == BIO_FLUSH)
705f8b1db88SMartin Matuska 			goto commit;
706eda14cbcSMatt Macy 		break;
707eda14cbcSMatt Macy 	default:
7087877fdebSMatt Macy 		error = SET_ERROR(EOPNOTSUPP);
709eda14cbcSMatt Macy 		goto resume;
710eda14cbcSMatt Macy 	}
711eda14cbcSMatt Macy 
712eda14cbcSMatt Macy 	off = bp->bio_offset;
713eda14cbcSMatt Macy 	volsize = zv->zv_volsize;
714eda14cbcSMatt Macy 
715eda14cbcSMatt Macy 	os = zv->zv_objset;
7167877fdebSMatt Macy 	ASSERT3P(os, !=, NULL);
717eda14cbcSMatt Macy 
718eda14cbcSMatt Macy 	addr = bp->bio_data;
719eda14cbcSMatt Macy 	resid = bp->bio_length;
720eda14cbcSMatt Macy 
721eac7052fSMatt Macy 	if (resid > 0 && off >= volsize) {
722eda14cbcSMatt Macy 		error = SET_ERROR(EIO);
723eda14cbcSMatt Macy 		goto resume;
724eda14cbcSMatt Macy 	}
725eda14cbcSMatt Macy 
726eda14cbcSMatt Macy 	is_dumpified = B_FALSE;
727f8b1db88SMartin Matuska 	commit = !doread && !is_dumpified &&
728eda14cbcSMatt Macy 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
729eda14cbcSMatt Macy 
730eda14cbcSMatt Macy 	/*
731eda14cbcSMatt Macy 	 * There must be no buffer changes when doing a dmu_sync() because
732eda14cbcSMatt Macy 	 * we can't change the data whilst calculating the checksum.
733eda14cbcSMatt Macy 	 */
734eda14cbcSMatt Macy 	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
735eda14cbcSMatt Macy 	    doread ? RL_READER : RL_WRITER);
736eda14cbcSMatt Macy 
737eda14cbcSMatt Macy 	if (bp->bio_cmd == BIO_DELETE) {
738eda14cbcSMatt Macy 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
739eda14cbcSMatt Macy 		error = dmu_tx_assign(tx, TXG_WAIT);
740eda14cbcSMatt Macy 		if (error != 0) {
741eda14cbcSMatt Macy 			dmu_tx_abort(tx);
742eda14cbcSMatt Macy 		} else {
743f8b1db88SMartin Matuska 			zvol_log_truncate(zv, tx, off, resid);
744eda14cbcSMatt Macy 			dmu_tx_commit(tx);
745eda14cbcSMatt Macy 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
746eda14cbcSMatt Macy 			    off, resid);
747eda14cbcSMatt Macy 			resid = 0;
748eda14cbcSMatt Macy 		}
749eda14cbcSMatt Macy 		goto unlock;
750eda14cbcSMatt Macy 	}
751eda14cbcSMatt Macy 	while (resid != 0 && off < volsize) {
752eda14cbcSMatt Macy 		size_t size = MIN(resid, zvol_maxphys);
753eda14cbcSMatt Macy 		if (doread) {
754eda14cbcSMatt Macy 			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
755eda14cbcSMatt Macy 			    DMU_READ_PREFETCH);
756eda14cbcSMatt Macy 		} else {
757eda14cbcSMatt Macy 			dmu_tx_t *tx = dmu_tx_create(os);
758eda14cbcSMatt Macy 			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
759eda14cbcSMatt Macy 			error = dmu_tx_assign(tx, TXG_WAIT);
760eda14cbcSMatt Macy 			if (error) {
761eda14cbcSMatt Macy 				dmu_tx_abort(tx);
762eda14cbcSMatt Macy 			} else {
763eda14cbcSMatt Macy 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
764f8b1db88SMartin Matuska 				zvol_log_write(zv, tx, off, size, commit);
765eda14cbcSMatt Macy 				dmu_tx_commit(tx);
766eda14cbcSMatt Macy 			}
767eda14cbcSMatt Macy 		}
768eda14cbcSMatt Macy 		if (error) {
769e92ffd9bSMartin Matuska 			/* Convert checksum errors into IO errors. */
770eda14cbcSMatt Macy 			if (error == ECKSUM)
771eda14cbcSMatt Macy 				error = SET_ERROR(EIO);
772eda14cbcSMatt Macy 			break;
773eda14cbcSMatt Macy 		}
774eda14cbcSMatt Macy 		off += size;
775eda14cbcSMatt Macy 		addr += size;
776eda14cbcSMatt Macy 		resid -= size;
777eda14cbcSMatt Macy 	}
778eda14cbcSMatt Macy unlock:
779eda14cbcSMatt Macy 	zfs_rangelock_exit(lr);
780eda14cbcSMatt Macy 
781eda14cbcSMatt Macy 	bp->bio_completed = bp->bio_length - resid;
782eda14cbcSMatt Macy 	if (bp->bio_completed < bp->bio_length && off > volsize)
7837877fdebSMatt Macy 		error = SET_ERROR(EINVAL);
784eda14cbcSMatt Macy 
785eda14cbcSMatt Macy 	switch (bp->bio_cmd) {
786eda14cbcSMatt Macy 	case BIO_FLUSH:
787eda14cbcSMatt Macy 		break;
788eda14cbcSMatt Macy 	case BIO_READ:
789eda14cbcSMatt Macy 		dataset_kstats_update_read_kstats(&zv->zv_kstat,
790eda14cbcSMatt Macy 		    bp->bio_completed);
791eda14cbcSMatt Macy 		break;
792eda14cbcSMatt Macy 	case BIO_WRITE:
793eda14cbcSMatt Macy 		dataset_kstats_update_write_kstats(&zv->zv_kstat,
794eda14cbcSMatt Macy 		    bp->bio_completed);
795eda14cbcSMatt Macy 		break;
796eda14cbcSMatt Macy 	case BIO_DELETE:
797eda14cbcSMatt Macy 		break;
798eda14cbcSMatt Macy 	default:
799eda14cbcSMatt Macy 		break;
800eda14cbcSMatt Macy 	}
801eda14cbcSMatt Macy 
802f8b1db88SMartin Matuska 	if (commit) {
803f8b1db88SMartin Matuska commit:
804eda14cbcSMatt Macy 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
805eda14cbcSMatt Macy 	}
806eda14cbcSMatt Macy resume:
807eda14cbcSMatt Macy 	rw_exit(&zv->zv_suspend_lock);
808eda14cbcSMatt Macy out:
809eda14cbcSMatt Macy 	if (bp->bio_to)
810eda14cbcSMatt Macy 		g_io_deliver(bp, error);
811eda14cbcSMatt Macy 	else
812eda14cbcSMatt Macy 		biofinish(bp, NULL, error);
813eda14cbcSMatt Macy }
814eda14cbcSMatt Macy 
815eda14cbcSMatt Macy /*
816eda14cbcSMatt Macy  * Character device mode implementation
817eda14cbcSMatt Macy  */
818eda14cbcSMatt Macy 
819eda14cbcSMatt Macy static int
820184c1b94SMartin Matuska zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
821eda14cbcSMatt Macy {
822eda14cbcSMatt Macy 	zvol_state_t *zv;
823eda14cbcSMatt Macy 	uint64_t volsize;
824eda14cbcSMatt Macy 	zfs_locked_range_t *lr;
825eda14cbcSMatt Macy 	int error = 0;
826184c1b94SMartin Matuska 	zfs_uio_t uio;
827184c1b94SMartin Matuska 
828184c1b94SMartin Matuska 	zfs_uio_init(&uio, uio_s);
829eda14cbcSMatt Macy 
830eda14cbcSMatt Macy 	zv = dev->si_drv2;
831eda14cbcSMatt Macy 
832eda14cbcSMatt Macy 	volsize = zv->zv_volsize;
833eda14cbcSMatt Macy 	/*
834eda14cbcSMatt Macy 	 * uio_loffset == volsize isn't an error as
83516038816SMartin Matuska 	 * it's required for EOF processing.
836eda14cbcSMatt Macy 	 */
837184c1b94SMartin Matuska 	if (zfs_uio_resid(&uio) > 0 &&
838184c1b94SMartin Matuska 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
839eda14cbcSMatt Macy 		return (SET_ERROR(EIO));
840eda14cbcSMatt Macy 
841e639e0d2SMartin Matuska 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
84216038816SMartin Matuska 	ssize_t start_resid = zfs_uio_resid(&uio);
843184c1b94SMartin Matuska 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
844184c1b94SMartin Matuska 	    zfs_uio_resid(&uio), RL_READER);
845184c1b94SMartin Matuska 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
846184c1b94SMartin Matuska 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
847eda14cbcSMatt Macy 
848e92ffd9bSMartin Matuska 		/* Don't read past the end. */
849184c1b94SMartin Matuska 		if (bytes > volsize - zfs_uio_offset(&uio))
850184c1b94SMartin Matuska 			bytes = volsize - zfs_uio_offset(&uio);
851eda14cbcSMatt Macy 
852184c1b94SMartin Matuska 		error =  dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
853eda14cbcSMatt Macy 		if (error) {
854e92ffd9bSMartin Matuska 			/* Convert checksum errors into IO errors. */
855eda14cbcSMatt Macy 			if (error == ECKSUM)
856eda14cbcSMatt Macy 				error = SET_ERROR(EIO);
857eda14cbcSMatt Macy 			break;
858eda14cbcSMatt Macy 		}
859eda14cbcSMatt Macy 	}
860eda14cbcSMatt Macy 	zfs_rangelock_exit(lr);
86116038816SMartin Matuska 	int64_t nread = start_resid - zfs_uio_resid(&uio);
86216038816SMartin Matuska 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
863e639e0d2SMartin Matuska 	rw_exit(&zv->zv_suspend_lock);
864eda14cbcSMatt Macy 
865eda14cbcSMatt Macy 	return (error);
866eda14cbcSMatt Macy }
867eda14cbcSMatt Macy 
868eda14cbcSMatt Macy static int
869184c1b94SMartin Matuska zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
870eda14cbcSMatt Macy {
871eda14cbcSMatt Macy 	zvol_state_t *zv;
872eda14cbcSMatt Macy 	uint64_t volsize;
873eda14cbcSMatt Macy 	zfs_locked_range_t *lr;
874eda14cbcSMatt Macy 	int error = 0;
875f8b1db88SMartin Matuska 	boolean_t commit;
876184c1b94SMartin Matuska 	zfs_uio_t uio;
877eda14cbcSMatt Macy 
878eda14cbcSMatt Macy 	zv = dev->si_drv2;
879eda14cbcSMatt Macy 
880eda14cbcSMatt Macy 	volsize = zv->zv_volsize;
881eda14cbcSMatt Macy 
882184c1b94SMartin Matuska 	zfs_uio_init(&uio, uio_s);
883184c1b94SMartin Matuska 
884184c1b94SMartin Matuska 	if (zfs_uio_resid(&uio) > 0 &&
885184c1b94SMartin Matuska 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
886eda14cbcSMatt Macy 		return (SET_ERROR(EIO));
887eda14cbcSMatt Macy 
88816038816SMartin Matuska 	ssize_t start_resid = zfs_uio_resid(&uio);
889f8b1db88SMartin Matuska 	commit = (ioflag & IO_SYNC) ||
890eda14cbcSMatt Macy 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
891eda14cbcSMatt Macy 
892eda14cbcSMatt Macy 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
893eda14cbcSMatt Macy 	zvol_ensure_zilog(zv);
894eda14cbcSMatt Macy 
895184c1b94SMartin Matuska 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
896184c1b94SMartin Matuska 	    zfs_uio_resid(&uio), RL_WRITER);
897184c1b94SMartin Matuska 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
898184c1b94SMartin Matuska 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
899184c1b94SMartin Matuska 		uint64_t off = zfs_uio_offset(&uio);
900eda14cbcSMatt Macy 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
901eda14cbcSMatt Macy 
902e92ffd9bSMartin Matuska 		if (bytes > volsize - off)	/* Don't write past the end. */
903eda14cbcSMatt Macy 			bytes = volsize - off;
904eda14cbcSMatt Macy 
905eda14cbcSMatt Macy 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
906eda14cbcSMatt Macy 		error = dmu_tx_assign(tx, TXG_WAIT);
907eda14cbcSMatt Macy 		if (error) {
908eda14cbcSMatt Macy 			dmu_tx_abort(tx);
909eda14cbcSMatt Macy 			break;
910eda14cbcSMatt Macy 		}
911184c1b94SMartin Matuska 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
912eda14cbcSMatt Macy 		if (error == 0)
913f8b1db88SMartin Matuska 			zvol_log_write(zv, tx, off, bytes, commit);
914eda14cbcSMatt Macy 		dmu_tx_commit(tx);
915eda14cbcSMatt Macy 
916eda14cbcSMatt Macy 		if (error)
917eda14cbcSMatt Macy 			break;
918eda14cbcSMatt Macy 	}
919eda14cbcSMatt Macy 	zfs_rangelock_exit(lr);
92016038816SMartin Matuska 	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
92116038816SMartin Matuska 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
922f8b1db88SMartin Matuska 	if (commit)
923eda14cbcSMatt Macy 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
924eda14cbcSMatt Macy 	rw_exit(&zv->zv_suspend_lock);
925*7a7741afSMartin Matuska 
926eda14cbcSMatt Macy 	return (error);
927eda14cbcSMatt Macy }
928eda14cbcSMatt Macy 
929eda14cbcSMatt Macy static int
930eda14cbcSMatt Macy zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
931eda14cbcSMatt Macy {
932eda14cbcSMatt Macy 	zvol_state_t *zv;
933eda14cbcSMatt Macy 	int err = 0;
9347877fdebSMatt Macy 	boolean_t drop_suspend = B_FALSE;
935eda14cbcSMatt Macy 
9367877fdebSMatt Macy retry:
937eda14cbcSMatt Macy 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
938e92ffd9bSMartin Matuska 	/*
939e92ffd9bSMartin Matuska 	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
940e92ffd9bSMartin Matuska 	 * the result of zvol free code setting si_drv2 to NULL is observed,
941e92ffd9bSMartin Matuska 	 * or the zv is protected from being freed because of the positive
942e92ffd9bSMartin Matuska 	 * zv_open_count.
943e92ffd9bSMartin Matuska 	 */
944eda14cbcSMatt Macy 	zv = dev->si_drv2;
945eda14cbcSMatt Macy 	if (zv == NULL) {
946eda14cbcSMatt Macy 		rw_exit(&zvol_state_lock);
9477877fdebSMatt Macy 		err = SET_ERROR(ENXIO);
9487877fdebSMatt Macy 		goto out_locked;
949eda14cbcSMatt Macy 	}
950eda14cbcSMatt Macy 
951eda14cbcSMatt Macy 	mutex_enter(&zv->zv_state_lock);
952e92ffd9bSMartin Matuska 	if (zv->zv_zso->zso_dying) {
953e92ffd9bSMartin Matuska 		rw_exit(&zvol_state_lock);
954e92ffd9bSMartin Matuska 		err = SET_ERROR(ENXIO);
955e92ffd9bSMartin Matuska 		goto out_zv_locked;
956e92ffd9bSMartin Matuska 	}
9577877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
958eda14cbcSMatt Macy 
959eda14cbcSMatt Macy 	/*
960e92ffd9bSMartin Matuska 	 * Make sure zvol is not suspended during first open
961eda14cbcSMatt Macy 	 * (hold zv_suspend_lock) and respect proper lock acquisition
962e92ffd9bSMartin Matuska 	 * ordering - zv_suspend_lock before zv_state_lock.
963eda14cbcSMatt Macy 	 */
964eda14cbcSMatt Macy 	if (zv->zv_open_count == 0) {
9657877fdebSMatt Macy 		drop_suspend = B_TRUE;
966eda14cbcSMatt Macy 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
967eda14cbcSMatt Macy 			mutex_exit(&zv->zv_state_lock);
968eda14cbcSMatt Macy 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
969eda14cbcSMatt Macy 			mutex_enter(&zv->zv_state_lock);
970e92ffd9bSMartin Matuska 			/* Check to see if zv_suspend_lock is needed. */
971eda14cbcSMatt Macy 			if (zv->zv_open_count != 0) {
972eda14cbcSMatt Macy 				rw_exit(&zv->zv_suspend_lock);
973eda14cbcSMatt Macy 				drop_suspend = B_FALSE;
974eda14cbcSMatt Macy 			}
975eda14cbcSMatt Macy 		}
976eda14cbcSMatt Macy 	}
977eda14cbcSMatt Macy 	rw_exit(&zvol_state_lock);
978eda14cbcSMatt Macy 
979eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
980eda14cbcSMatt Macy 
981eda14cbcSMatt Macy 	if (zv->zv_open_count == 0) {
982e92ffd9bSMartin Matuska 		boolean_t drop_namespace = B_FALSE;
983e92ffd9bSMartin Matuska 
984eda14cbcSMatt Macy 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
985e92ffd9bSMartin Matuska 
986e92ffd9bSMartin Matuska 		/*
987e92ffd9bSMartin Matuska 		 * Take spa_namespace_lock to prevent lock inversion when
988e92ffd9bSMartin Matuska 		 * zvols from one pool are opened as vdevs in another.
989e92ffd9bSMartin Matuska 		 */
990e92ffd9bSMartin Matuska 		if (!mutex_owned(&spa_namespace_lock)) {
991e92ffd9bSMartin Matuska 			if (!mutex_tryenter(&spa_namespace_lock)) {
99247e46b11SRyan Moeller 				mutex_exit(&zv->zv_state_lock);
99347e46b11SRyan Moeller 				rw_exit(&zv->zv_suspend_lock);
99475e1fea6SMartin Matuska 				drop_suspend = B_FALSE;
995e92ffd9bSMartin Matuska 				kern_yield(PRI_USER);
996e92ffd9bSMartin Matuska 				goto retry;
997e92ffd9bSMartin Matuska 			} else {
998e92ffd9bSMartin Matuska 				drop_namespace = B_TRUE;
999e92ffd9bSMartin Matuska 			}
1000e92ffd9bSMartin Matuska 		}
1001eda14cbcSMatt Macy 		err = zvol_first_open(zv, !(flags & FWRITE));
1002e92ffd9bSMartin Matuska 		if (drop_namespace)
1003e92ffd9bSMartin Matuska 			mutex_exit(&spa_namespace_lock);
1004eda14cbcSMatt Macy 		if (err)
10057877fdebSMatt Macy 			goto out_zv_locked;
1006eda14cbcSMatt Macy 	}
1007eda14cbcSMatt Macy 
1008e92ffd9bSMartin Matuska 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1009e92ffd9bSMartin Matuska 
1010eda14cbcSMatt Macy 	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
10117877fdebSMatt Macy 		err = SET_ERROR(EROFS);
1012eda14cbcSMatt Macy 		goto out_opened;
1013eda14cbcSMatt Macy 	}
1014eda14cbcSMatt Macy 	if (zv->zv_flags & ZVOL_EXCL) {
10157877fdebSMatt Macy 		err = SET_ERROR(EBUSY);
1016eda14cbcSMatt Macy 		goto out_opened;
1017eda14cbcSMatt Macy 	}
1018716fd348SMartin Matuska 	if (flags & O_EXCL) {
1019eda14cbcSMatt Macy 		if (zv->zv_open_count != 0) {
10207877fdebSMatt Macy 			err = SET_ERROR(EBUSY);
1021eda14cbcSMatt Macy 			goto out_opened;
1022eda14cbcSMatt Macy 		}
1023eda14cbcSMatt Macy 		zv->zv_flags |= ZVOL_EXCL;
1024eda14cbcSMatt Macy 	}
1025eda14cbcSMatt Macy 
1026eda14cbcSMatt Macy 	zv->zv_open_count++;
1027eda14cbcSMatt Macy out_opened:
10287877fdebSMatt Macy 	if (zv->zv_open_count == 0) {
1029eda14cbcSMatt Macy 		zvol_last_close(zv);
10307877fdebSMatt Macy 		wakeup(zv);
10317877fdebSMatt Macy 	}
10327877fdebSMatt Macy out_zv_locked:
1033eda14cbcSMatt Macy 	mutex_exit(&zv->zv_state_lock);
10347877fdebSMatt Macy out_locked:
1035eda14cbcSMatt Macy 	if (drop_suspend)
1036eda14cbcSMatt Macy 		rw_exit(&zv->zv_suspend_lock);
10377877fdebSMatt Macy 	return (err);
1038eda14cbcSMatt Macy }
1039eda14cbcSMatt Macy 
1040eda14cbcSMatt Macy static int
1041eda14cbcSMatt Macy zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
1042eda14cbcSMatt Macy {
1043eda14cbcSMatt Macy 	zvol_state_t *zv;
1044eda14cbcSMatt Macy 	boolean_t drop_suspend = B_TRUE;
1045eda14cbcSMatt Macy 
1046eda14cbcSMatt Macy 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
1047eda14cbcSMatt Macy 	zv = dev->si_drv2;
1048eda14cbcSMatt Macy 	if (zv == NULL) {
1049eda14cbcSMatt Macy 		rw_exit(&zvol_state_lock);
1050eda14cbcSMatt Macy 		return (SET_ERROR(ENXIO));
1051eda14cbcSMatt Macy 	}
1052eda14cbcSMatt Macy 
1053eda14cbcSMatt Macy 	mutex_enter(&zv->zv_state_lock);
1054eda14cbcSMatt Macy 	if (zv->zv_flags & ZVOL_EXCL) {
10557877fdebSMatt Macy 		ASSERT3U(zv->zv_open_count, ==, 1);
1056eda14cbcSMatt Macy 		zv->zv_flags &= ~ZVOL_EXCL;
1057eda14cbcSMatt Macy 	}
1058eda14cbcSMatt Macy 
10597877fdebSMatt Macy 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
1060eda14cbcSMatt Macy 
1061eda14cbcSMatt Macy 	/*
1062eda14cbcSMatt Macy 	 * If the open count is zero, this is a spurious close.
1063eda14cbcSMatt Macy 	 * That indicates a bug in the kernel / DDI framework.
1064eda14cbcSMatt Macy 	 */
10657877fdebSMatt Macy 	ASSERT3U(zv->zv_open_count, >, 0);
1066eda14cbcSMatt Macy 	/*
1067e92ffd9bSMartin Matuska 	 * Make sure zvol is not suspended during last close
1068eda14cbcSMatt Macy 	 * (hold zv_suspend_lock) and respect proper lock acquisition
1069e92ffd9bSMartin Matuska 	 * ordering - zv_suspend_lock before zv_state_lock.
1070eda14cbcSMatt Macy 	 */
1071eda14cbcSMatt Macy 	if (zv->zv_open_count == 1) {
1072eda14cbcSMatt Macy 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
1073eda14cbcSMatt Macy 			mutex_exit(&zv->zv_state_lock);
1074eda14cbcSMatt Macy 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1075eda14cbcSMatt Macy 			mutex_enter(&zv->zv_state_lock);
1076e92ffd9bSMartin Matuska 			/* Check to see if zv_suspend_lock is needed. */
1077eda14cbcSMatt Macy 			if (zv->zv_open_count != 1) {
1078eda14cbcSMatt Macy 				rw_exit(&zv->zv_suspend_lock);
1079eda14cbcSMatt Macy 				drop_suspend = B_FALSE;
1080eda14cbcSMatt Macy 			}
1081eda14cbcSMatt Macy 		}
1082eda14cbcSMatt Macy 	} else {
1083eda14cbcSMatt Macy 		drop_suspend = B_FALSE;
1084eda14cbcSMatt Macy 	}
1085eda14cbcSMatt Macy 	rw_exit(&zvol_state_lock);
1086eda14cbcSMatt Macy 
1087eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1088eda14cbcSMatt Macy 
1089eda14cbcSMatt Macy 	/*
1090eda14cbcSMatt Macy 	 * You may get multiple opens, but only one close.
1091eda14cbcSMatt Macy 	 */
1092eda14cbcSMatt Macy 	zv->zv_open_count--;
1093eda14cbcSMatt Macy 
1094eda14cbcSMatt Macy 	if (zv->zv_open_count == 0) {
1095eda14cbcSMatt Macy 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1096eda14cbcSMatt Macy 		zvol_last_close(zv);
10977877fdebSMatt Macy 		wakeup(zv);
1098eda14cbcSMatt Macy 	}
1099eda14cbcSMatt Macy 
1100eda14cbcSMatt Macy 	mutex_exit(&zv->zv_state_lock);
1101eda14cbcSMatt Macy 
1102eda14cbcSMatt Macy 	if (drop_suspend)
1103eda14cbcSMatt Macy 		rw_exit(&zv->zv_suspend_lock);
1104eda14cbcSMatt Macy 	return (0);
1105eda14cbcSMatt Macy }
1106eda14cbcSMatt Macy 
1107eda14cbcSMatt Macy static int
1108eda14cbcSMatt Macy zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
1109eda14cbcSMatt Macy     int fflag, struct thread *td)
1110eda14cbcSMatt Macy {
1111eda14cbcSMatt Macy 	zvol_state_t *zv;
1112eda14cbcSMatt Macy 	zfs_locked_range_t *lr;
1113eda14cbcSMatt Macy 	off_t offset, length;
1114e92ffd9bSMartin Matuska 	int error;
1115eda14cbcSMatt Macy 	boolean_t sync;
1116eda14cbcSMatt Macy 
1117eda14cbcSMatt Macy 	zv = dev->si_drv2;
1118eda14cbcSMatt Macy 
1119eda14cbcSMatt Macy 	error = 0;
1120eda14cbcSMatt Macy 	KASSERT(zv->zv_open_count > 0,
1121eda14cbcSMatt Macy 	    ("Device with zero access count in %s", __func__));
1122eda14cbcSMatt Macy 
1123eda14cbcSMatt Macy 	switch (cmd) {
1124eda14cbcSMatt Macy 	case DIOCGSECTORSIZE:
1125eda14cbcSMatt Macy 		*(uint32_t *)data = DEV_BSIZE;
1126eda14cbcSMatt Macy 		break;
1127eda14cbcSMatt Macy 	case DIOCGMEDIASIZE:
1128eda14cbcSMatt Macy 		*(off_t *)data = zv->zv_volsize;
1129eda14cbcSMatt Macy 		break;
1130eda14cbcSMatt Macy 	case DIOCGFLUSH:
1131eda14cbcSMatt Macy 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1132eda14cbcSMatt Macy 		if (zv->zv_zilog != NULL)
1133eda14cbcSMatt Macy 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
1134eda14cbcSMatt Macy 		rw_exit(&zv->zv_suspend_lock);
1135eda14cbcSMatt Macy 		break;
1136eda14cbcSMatt Macy 	case DIOCGDELETE:
1137eda14cbcSMatt Macy 		if (!zvol_unmap_enabled)
1138eda14cbcSMatt Macy 			break;
1139eda14cbcSMatt Macy 
1140eda14cbcSMatt Macy 		offset = ((off_t *)data)[0];
1141eda14cbcSMatt Macy 		length = ((off_t *)data)[1];
1142eda14cbcSMatt Macy 		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1143eda14cbcSMatt Macy 		    offset < 0 || offset >= zv->zv_volsize ||
1144eda14cbcSMatt Macy 		    length <= 0) {
1145eda14cbcSMatt Macy 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
1146eda14cbcSMatt Macy 			    length);
11477877fdebSMatt Macy 			error = SET_ERROR(EINVAL);
1148eda14cbcSMatt Macy 			break;
1149eda14cbcSMatt Macy 		}
1150eda14cbcSMatt Macy 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1151eda14cbcSMatt Macy 		zvol_ensure_zilog(zv);
1152eda14cbcSMatt Macy 		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1153eda14cbcSMatt Macy 		    RL_WRITER);
1154eda14cbcSMatt Macy 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1155eda14cbcSMatt Macy 		error = dmu_tx_assign(tx, TXG_WAIT);
1156eda14cbcSMatt Macy 		if (error != 0) {
1157eda14cbcSMatt Macy 			sync = FALSE;
1158eda14cbcSMatt Macy 			dmu_tx_abort(tx);
1159eda14cbcSMatt Macy 		} else {
1160eda14cbcSMatt Macy 			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1161f8b1db88SMartin Matuska 			zvol_log_truncate(zv, tx, offset, length);
1162eda14cbcSMatt Macy 			dmu_tx_commit(tx);
1163eda14cbcSMatt Macy 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1164eda14cbcSMatt Macy 			    offset, length);
1165eda14cbcSMatt Macy 		}
1166eda14cbcSMatt Macy 		zfs_rangelock_exit(lr);
1167eda14cbcSMatt Macy 		if (sync)
1168eda14cbcSMatt Macy 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
1169eda14cbcSMatt Macy 		rw_exit(&zv->zv_suspend_lock);
1170eda14cbcSMatt Macy 		break;
1171eda14cbcSMatt Macy 	case DIOCGSTRIPESIZE:
1172eda14cbcSMatt Macy 		*(off_t *)data = zv->zv_volblocksize;
1173eda14cbcSMatt Macy 		break;
1174eda14cbcSMatt Macy 	case DIOCGSTRIPEOFFSET:
1175eda14cbcSMatt Macy 		*(off_t *)data = 0;
1176eda14cbcSMatt Macy 		break;
1177eda14cbcSMatt Macy 	case DIOCGATTR: {
1178eda14cbcSMatt Macy 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
1179eda14cbcSMatt Macy 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1180eda14cbcSMatt Macy 		uint64_t refd, avail, usedobjs, availobjs;
1181eda14cbcSMatt Macy 
1182eda14cbcSMatt Macy 		if (strcmp(arg->name, "GEOM::candelete") == 0)
1183eda14cbcSMatt Macy 			arg->value.i = 1;
1184eda14cbcSMatt Macy 		else if (strcmp(arg->name, "blocksavail") == 0) {
1185eda14cbcSMatt Macy 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1186eda14cbcSMatt Macy 			    &usedobjs, &availobjs);
1187eda14cbcSMatt Macy 			arg->value.off = avail / DEV_BSIZE;
1188eda14cbcSMatt Macy 		} else if (strcmp(arg->name, "blocksused") == 0) {
1189eda14cbcSMatt Macy 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1190eda14cbcSMatt Macy 			    &usedobjs, &availobjs);
1191eda14cbcSMatt Macy 			arg->value.off = refd / DEV_BSIZE;
1192eda14cbcSMatt Macy 		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
1193eda14cbcSMatt Macy 			avail = metaslab_class_get_space(spa_normal_class(spa));
1194eda14cbcSMatt Macy 			avail -= metaslab_class_get_alloc(
1195eda14cbcSMatt Macy 			    spa_normal_class(spa));
1196eda14cbcSMatt Macy 			arg->value.off = avail / DEV_BSIZE;
1197eda14cbcSMatt Macy 		} else if (strcmp(arg->name, "poolblocksused") == 0) {
1198eda14cbcSMatt Macy 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
1199eda14cbcSMatt Macy 			arg->value.off = refd / DEV_BSIZE;
1200eda14cbcSMatt Macy 		} else
12017877fdebSMatt Macy 			error = SET_ERROR(ENOIOCTL);
1202eda14cbcSMatt Macy 		break;
1203eda14cbcSMatt Macy 	}
1204eda14cbcSMatt Macy 	case FIOSEEKHOLE:
1205eda14cbcSMatt Macy 	case FIOSEEKDATA: {
1206eda14cbcSMatt Macy 		off_t *off = (off_t *)data;
1207eda14cbcSMatt Macy 		uint64_t noff;
1208eda14cbcSMatt Macy 		boolean_t hole;
1209eda14cbcSMatt Macy 
1210eda14cbcSMatt Macy 		hole = (cmd == FIOSEEKHOLE);
1211eda14cbcSMatt Macy 		noff = *off;
12122a58b312SMartin Matuska 		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
12132a58b312SMartin Matuska 		    RL_READER);
1214eda14cbcSMatt Macy 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
12152a58b312SMartin Matuska 		zfs_rangelock_exit(lr);
1216eda14cbcSMatt Macy 		*off = noff;
1217eda14cbcSMatt Macy 		break;
1218eda14cbcSMatt Macy 	}
1219eda14cbcSMatt Macy 	default:
12207877fdebSMatt Macy 		error = SET_ERROR(ENOIOCTL);
1221eda14cbcSMatt Macy 	}
1222eda14cbcSMatt Macy 
1223eda14cbcSMatt Macy 	return (error);
1224eda14cbcSMatt Macy }
1225eda14cbcSMatt Macy 
1226eda14cbcSMatt Macy /*
1227eda14cbcSMatt Macy  * Misc. helpers
1228eda14cbcSMatt Macy  */
1229eda14cbcSMatt Macy 
1230eda14cbcSMatt Macy static void
1231eda14cbcSMatt Macy zvol_ensure_zilog(zvol_state_t *zv)
1232eda14cbcSMatt Macy {
1233eda14cbcSMatt Macy 	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1234eda14cbcSMatt Macy 
1235eda14cbcSMatt Macy 	/*
1236eda14cbcSMatt Macy 	 * Open a ZIL if this is the first time we have written to this
1237eda14cbcSMatt Macy 	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1238eda14cbcSMatt Macy 	 * than zv_state_lock so that we don't need to acquire an
1239eda14cbcSMatt Macy 	 * additional lock in this path.
1240eda14cbcSMatt Macy 	 */
1241eda14cbcSMatt Macy 	if (zv->zv_zilog == NULL) {
1242eda14cbcSMatt Macy 		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1243eda14cbcSMatt Macy 			rw_exit(&zv->zv_suspend_lock);
1244eda14cbcSMatt Macy 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1245eda14cbcSMatt Macy 		}
1246eda14cbcSMatt Macy 		if (zv->zv_zilog == NULL) {
1247eda14cbcSMatt Macy 			zv->zv_zilog = zil_open(zv->zv_objset,
1248271171e0SMartin Matuska 			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1249eda14cbcSMatt Macy 			zv->zv_flags |= ZVOL_WRITTEN_TO;
1250c03c5b1cSMartin Matuska 			/* replay / destroy done in zvol_os_create_minor() */
125116038816SMartin Matuska 			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
125216038816SMartin Matuska 			    ZIL_REPLAY_NEEDED);
1253eda14cbcSMatt Macy 		}
1254eda14cbcSMatt Macy 		rw_downgrade(&zv->zv_suspend_lock);
1255eda14cbcSMatt Macy 	}
1256eda14cbcSMatt Macy }
1257eda14cbcSMatt Macy 
1258c03c5b1cSMartin Matuska boolean_t
1259c03c5b1cSMartin Matuska zvol_os_is_zvol(const char *device)
1260eda14cbcSMatt Macy {
1261eda14cbcSMatt Macy 	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1262eda14cbcSMatt Macy }
1263eda14cbcSMatt Macy 
1264c03c5b1cSMartin Matuska void
1265c03c5b1cSMartin Matuska zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1266eda14cbcSMatt Macy {
1267eda14cbcSMatt Macy 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1268eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1269eda14cbcSMatt Macy 
1270e92ffd9bSMartin Matuska 	/* Move to a new hashtable entry.  */
1271b985c9caSMartin Matuska 	zv->zv_hash = zvol_name_hash(newname);
1272eda14cbcSMatt Macy 	hlist_del(&zv->zv_hlink);
1273eda14cbcSMatt Macy 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1274eda14cbcSMatt Macy 
12757877fdebSMatt Macy 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1276eda14cbcSMatt Macy 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1277eda14cbcSMatt Macy 		struct g_provider *pp = zsg->zsg_provider;
1278eda14cbcSMatt Macy 		struct g_geom *gp;
1279eda14cbcSMatt Macy 
1280eda14cbcSMatt Macy 		g_topology_lock();
1281eda14cbcSMatt Macy 		gp = pp->geom;
12827877fdebSMatt Macy 		ASSERT3P(gp, !=, NULL);
1283eda14cbcSMatt Macy 
1284eda14cbcSMatt Macy 		zsg->zsg_provider = NULL;
1285eda14cbcSMatt Macy 		g_wither_provider(pp, ENXIO);
1286eda14cbcSMatt Macy 
1287eda14cbcSMatt Macy 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1288eda14cbcSMatt Macy 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1289eda14cbcSMatt Macy 		pp->sectorsize = DEV_BSIZE;
1290eda14cbcSMatt Macy 		pp->mediasize = zv->zv_volsize;
1291eda14cbcSMatt Macy 		pp->private = zv;
1292eda14cbcSMatt Macy 		zsg->zsg_provider = pp;
1293eda14cbcSMatt Macy 		g_error_provider(pp, 0);
1294eda14cbcSMatt Macy 		g_topology_unlock();
12957877fdebSMatt Macy 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1296eda14cbcSMatt Macy 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1297eda14cbcSMatt Macy 		struct cdev *dev;
1298eda14cbcSMatt Macy 		struct make_dev_args args;
1299eda14cbcSMatt Macy 
1300eda14cbcSMatt Macy 		dev = zsd->zsd_cdev;
1301eda14cbcSMatt Macy 		if (dev != NULL) {
1302eda14cbcSMatt Macy 			destroy_dev(dev);
1303eda14cbcSMatt Macy 			dev = zsd->zsd_cdev = NULL;
1304eda14cbcSMatt Macy 			if (zv->zv_open_count > 0) {
1305eda14cbcSMatt Macy 				zv->zv_flags &= ~ZVOL_EXCL;
1306eda14cbcSMatt Macy 				zv->zv_open_count = 0;
1307eda14cbcSMatt Macy 				/* XXX  need suspend lock but lock order */
1308eda14cbcSMatt Macy 				zvol_last_close(zv);
1309eda14cbcSMatt Macy 			}
1310eda14cbcSMatt Macy 		}
1311eda14cbcSMatt Macy 
1312eda14cbcSMatt Macy 		make_dev_args_init(&args);
1313eda14cbcSMatt Macy 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1314eda14cbcSMatt Macy 		args.mda_devsw = &zvol_cdevsw;
1315eda14cbcSMatt Macy 		args.mda_cr = NULL;
1316eda14cbcSMatt Macy 		args.mda_uid = UID_ROOT;
1317eda14cbcSMatt Macy 		args.mda_gid = GID_OPERATOR;
1318eda14cbcSMatt Macy 		args.mda_mode = 0640;
1319eda14cbcSMatt Macy 		args.mda_si_drv2 = zv;
1320eda14cbcSMatt Macy 		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
1321eda14cbcSMatt Macy 		    == 0) {
1322cd853791SKonstantin Belousov 			dev->si_iosize_max = maxphys;
1323eda14cbcSMatt Macy 			zsd->zsd_cdev = dev;
1324eda14cbcSMatt Macy 		}
1325eda14cbcSMatt Macy 	}
1326eda14cbcSMatt Macy 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
132714c2e0a0SMartin Matuska 	dataset_kstats_rename(&zv->zv_kstat, newname);
1328eda14cbcSMatt Macy }
1329eda14cbcSMatt Macy 
1330eda14cbcSMatt Macy /*
1331eda14cbcSMatt Macy  * Remove minor node for the specified volume.
1332eda14cbcSMatt Macy  */
1333c03c5b1cSMartin Matuska void
1334c03c5b1cSMartin Matuska zvol_os_free(zvol_state_t *zv)
1335eda14cbcSMatt Macy {
1336eda14cbcSMatt Macy 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1337eda14cbcSMatt Macy 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
13387877fdebSMatt Macy 	ASSERT0(zv->zv_open_count);
1339eda14cbcSMatt Macy 
1340eda14cbcSMatt Macy 	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1341eda14cbcSMatt Macy 
1342eda14cbcSMatt Macy 	rw_destroy(&zv->zv_suspend_lock);
1343eda14cbcSMatt Macy 	zfs_rangelock_fini(&zv->zv_rangelock);
1344eda14cbcSMatt Macy 
13457877fdebSMatt Macy 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1346eda14cbcSMatt Macy 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
13477877fdebSMatt Macy 		struct g_provider *pp __maybe_unused = zsg->zsg_provider;
13487877fdebSMatt Macy 
13497877fdebSMatt Macy 		ASSERT3P(pp->private, ==, NULL);
1350eda14cbcSMatt Macy 
1351eda14cbcSMatt Macy 		g_topology_lock();
1352eda14cbcSMatt Macy 		zvol_geom_destroy(zv);
1353eda14cbcSMatt Macy 		g_topology_unlock();
1354eda14cbcSMatt Macy 		mtx_destroy(&zsg->zsg_queue_mtx);
13557877fdebSMatt Macy 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1356eda14cbcSMatt Macy 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1357eda14cbcSMatt Macy 		struct cdev *dev = zsd->zsd_cdev;
1358eda14cbcSMatt Macy 
13593f9d360cSMartin Matuska 		if (dev != NULL) {
13607877fdebSMatt Macy 			ASSERT3P(dev->si_drv2, ==, NULL);
1361eda14cbcSMatt Macy 			destroy_dev(dev);
1362c7046f76SMartin Matuska 			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
1363c7046f76SMartin Matuska 			knlist_destroy(&zsd->zsd_selinfo.si_note);
1364eda14cbcSMatt Macy 		}
13653f9d360cSMartin Matuska 	}
1366eda14cbcSMatt Macy 
1367eda14cbcSMatt Macy 	mutex_destroy(&zv->zv_state_lock);
1368ce4dcb97SMartin Matuska 	cv_destroy(&zv->zv_removing_cv);
1369eda14cbcSMatt Macy 	dataset_kstats_destroy(&zv->zv_kstat);
1370eda14cbcSMatt Macy 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1371eda14cbcSMatt Macy 	kmem_free(zv, sizeof (zvol_state_t));
1372eda14cbcSMatt Macy 	zvol_minors--;
1373eda14cbcSMatt Macy }
1374eda14cbcSMatt Macy 
1375eda14cbcSMatt Macy /*
1376eda14cbcSMatt Macy  * Create a minor node (plus a whole lot more) for the specified volume.
1377eda14cbcSMatt Macy  */
1378c03c5b1cSMartin Matuska int
1379c03c5b1cSMartin Matuska zvol_os_create_minor(const char *name)
1380eda14cbcSMatt Macy {
1381eda14cbcSMatt Macy 	zvol_state_t *zv;
1382eda14cbcSMatt Macy 	objset_t *os;
1383eda14cbcSMatt Macy 	dmu_object_info_t *doi;
1384eda14cbcSMatt Macy 	uint64_t volsize;
1385eda14cbcSMatt Macy 	uint64_t volmode, hash;
1386eda14cbcSMatt Macy 	int error;
1387dbd5678dSMartin Matuska 	bool replayed_zil = B_FALSE;
1388eda14cbcSMatt Macy 
1389eda14cbcSMatt Macy 	ZFS_LOG(1, "Creating ZVOL %s...", name);
1390eda14cbcSMatt Macy 	hash = zvol_name_hash(name);
1391eda14cbcSMatt Macy 	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1392eda14cbcSMatt Macy 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1393eda14cbcSMatt Macy 		mutex_exit(&zv->zv_state_lock);
1394eda14cbcSMatt Macy 		return (SET_ERROR(EEXIST));
1395eda14cbcSMatt Macy 	}
1396eda14cbcSMatt Macy 
1397eda14cbcSMatt Macy 	DROP_GIANT();
13987877fdebSMatt Macy 
1399eda14cbcSMatt Macy 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1400eda14cbcSMatt Macy 
1401e92ffd9bSMartin Matuska 	/* Lie and say we're read-only. */
14027877fdebSMatt Macy 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1403eda14cbcSMatt Macy 	if (error)
1404eda14cbcSMatt Macy 		goto out_doi;
1405eda14cbcSMatt Macy 
1406eda14cbcSMatt Macy 	error = dmu_object_info(os, ZVOL_OBJ, doi);
1407eda14cbcSMatt Macy 	if (error)
1408eda14cbcSMatt Macy 		goto out_dmu_objset_disown;
1409eda14cbcSMatt Macy 
1410eda14cbcSMatt Macy 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1411eda14cbcSMatt Macy 	if (error)
1412eda14cbcSMatt Macy 		goto out_dmu_objset_disown;
1413eda14cbcSMatt Macy 
1414eda14cbcSMatt Macy 	error = dsl_prop_get_integer(name,
1415eda14cbcSMatt Macy 	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
14167877fdebSMatt Macy 	if (error || volmode == ZFS_VOLMODE_DEFAULT)
1417eda14cbcSMatt Macy 		volmode = zvol_volmode;
14187877fdebSMatt Macy 	error = 0;
14197877fdebSMatt Macy 
1420eda14cbcSMatt Macy 	/*
1421eda14cbcSMatt Macy 	 * zvol_alloc equivalent ...
1422eda14cbcSMatt Macy 	 */
1423eda14cbcSMatt Macy 	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1424eda14cbcSMatt Macy 	zv->zv_hash = hash;
1425eda14cbcSMatt Macy 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1426ce4dcb97SMartin Matuska 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1427eda14cbcSMatt Macy 	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
14287877fdebSMatt Macy 	zv->zv_volmode = volmode;
14297877fdebSMatt Macy 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1430eda14cbcSMatt Macy 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1431eda14cbcSMatt Macy 		struct g_provider *pp;
1432eda14cbcSMatt Macy 		struct g_geom *gp;
1433eda14cbcSMatt Macy 
1434eda14cbcSMatt Macy 		zsg->zsg_state = ZVOL_GEOM_UNINIT;
1435eda14cbcSMatt Macy 		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
1436eda14cbcSMatt Macy 
1437eda14cbcSMatt Macy 		g_topology_lock();
1438eda14cbcSMatt Macy 		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1439eda14cbcSMatt Macy 		gp->start = zvol_geom_bio_start;
1440eda14cbcSMatt Macy 		gp->access = zvol_geom_access;
1441eda14cbcSMatt Macy 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1442eda14cbcSMatt Macy 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1443eda14cbcSMatt Macy 		pp->sectorsize = DEV_BSIZE;
1444eda14cbcSMatt Macy 		pp->mediasize = 0;
1445eda14cbcSMatt Macy 		pp->private = zv;
1446eda14cbcSMatt Macy 
1447eda14cbcSMatt Macy 		zsg->zsg_provider = pp;
1448eda14cbcSMatt Macy 		bioq_init(&zsg->zsg_queue);
14497877fdebSMatt Macy 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1450eda14cbcSMatt Macy 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1451eda14cbcSMatt Macy 		struct cdev *dev;
1452eda14cbcSMatt Macy 		struct make_dev_args args;
1453eda14cbcSMatt Macy 
1454eda14cbcSMatt Macy 		make_dev_args_init(&args);
1455eda14cbcSMatt Macy 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1456eda14cbcSMatt Macy 		args.mda_devsw = &zvol_cdevsw;
1457eda14cbcSMatt Macy 		args.mda_cr = NULL;
1458eda14cbcSMatt Macy 		args.mda_uid = UID_ROOT;
1459eda14cbcSMatt Macy 		args.mda_gid = GID_OPERATOR;
1460eda14cbcSMatt Macy 		args.mda_mode = 0640;
1461eda14cbcSMatt Macy 		args.mda_si_drv2 = zv;
14623f9d360cSMartin Matuska 		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
14633f9d360cSMartin Matuska 		    == 0) {
1464cd853791SKonstantin Belousov 			dev->si_iosize_max = maxphys;
1465eda14cbcSMatt Macy 			zsd->zsd_cdev = dev;
1466c7046f76SMartin Matuska 			knlist_init_sx(&zsd->zsd_selinfo.si_note,
1467c7046f76SMartin Matuska 			    &zv->zv_state_lock);
1468eda14cbcSMatt Macy 		}
14693f9d360cSMartin Matuska 	}
1470eda14cbcSMatt Macy 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1471eda14cbcSMatt Macy 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1472eda14cbcSMatt Macy 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1473eda14cbcSMatt Macy 
1474eda14cbcSMatt Macy 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1475eda14cbcSMatt Macy 		zv->zv_flags |= ZVOL_RDONLY;
1476eda14cbcSMatt Macy 
1477eda14cbcSMatt Macy 	zv->zv_volblocksize = doi->doi_data_block_size;
1478eda14cbcSMatt Macy 	zv->zv_volsize = volsize;
1479eda14cbcSMatt Macy 	zv->zv_objset = os;
1480eda14cbcSMatt Macy 
1481271171e0SMartin Matuska 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1482271171e0SMartin Matuska 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1483271171e0SMartin Matuska 	if (error)
1484271171e0SMartin Matuska 		goto out_dmu_objset_disown;
14859db44a8eSMartin Matuska 	ASSERT3P(zv->zv_zilog, ==, NULL);
1486271171e0SMartin Matuska 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1487eda14cbcSMatt Macy 	if (spa_writeable(dmu_objset_spa(os))) {
1488eda14cbcSMatt Macy 		if (zil_replay_disable)
1489dbd5678dSMartin Matuska 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1490eda14cbcSMatt Macy 		else
1491dbd5678dSMartin Matuska 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1492eda14cbcSMatt Macy 	}
1493dbd5678dSMartin Matuska 	if (replayed_zil)
14949db44a8eSMartin Matuska 		zil_close(zv->zv_zilog);
14959db44a8eSMartin Matuska 	zv->zv_zilog = NULL;
1496eda14cbcSMatt Macy 
14977877fdebSMatt Macy 	/* TODO: prefetch for geom tasting */
1498eda14cbcSMatt Macy 
1499eda14cbcSMatt Macy 	zv->zv_objset = NULL;
1500eda14cbcSMatt Macy out_dmu_objset_disown:
1501eda14cbcSMatt Macy 	dmu_objset_disown(os, B_TRUE, FTAG);
1502eda14cbcSMatt Macy 
15037877fdebSMatt Macy 	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
1504eda14cbcSMatt Macy 		zvol_geom_run(zv);
1505eda14cbcSMatt Macy 		g_topology_unlock();
1506eda14cbcSMatt Macy 	}
1507eda14cbcSMatt Macy out_doi:
1508eda14cbcSMatt Macy 	kmem_free(doi, sizeof (dmu_object_info_t));
1509eda14cbcSMatt Macy 	if (error == 0) {
1510eda14cbcSMatt Macy 		rw_enter(&zvol_state_lock, RW_WRITER);
1511eda14cbcSMatt Macy 		zvol_insert(zv);
1512eda14cbcSMatt Macy 		zvol_minors++;
1513eda14cbcSMatt Macy 		rw_exit(&zvol_state_lock);
1514eda14cbcSMatt Macy 		ZFS_LOG(1, "ZVOL %s created.", name);
15157877fdebSMatt Macy 	}
1516eda14cbcSMatt Macy 	PICKUP_GIANT();
1517eda14cbcSMatt Macy 	return (error);
1518eda14cbcSMatt Macy }
1519eda14cbcSMatt Macy 
1520c03c5b1cSMartin Matuska void
1521c03c5b1cSMartin Matuska zvol_os_clear_private(zvol_state_t *zv)
1522eda14cbcSMatt Macy {
1523eda14cbcSMatt Macy 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
15247877fdebSMatt Macy 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1525eda14cbcSMatt Macy 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1526eda14cbcSMatt Macy 		struct g_provider *pp = zsg->zsg_provider;
1527eda14cbcSMatt Macy 
15287877fdebSMatt Macy 		if (pp->private == NULL) /* already cleared */
1529eda14cbcSMatt Macy 			return;
1530eda14cbcSMatt Macy 
1531eda14cbcSMatt Macy 		mtx_lock(&zsg->zsg_queue_mtx);
1532eda14cbcSMatt Macy 		zsg->zsg_state = ZVOL_GEOM_STOPPED;
1533eda14cbcSMatt Macy 		pp->private = NULL;
1534eda14cbcSMatt Macy 		wakeup_one(&zsg->zsg_queue);
1535eda14cbcSMatt Macy 		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
15367877fdebSMatt Macy 			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
1537eda14cbcSMatt Macy 			    0, "zvol:w", 0);
1538eda14cbcSMatt Macy 		mtx_unlock(&zsg->zsg_queue_mtx);
1539eda14cbcSMatt Macy 		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
15407877fdebSMatt Macy 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
15417877fdebSMatt Macy 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
15427877fdebSMatt Macy 		struct cdev *dev = zsd->zsd_cdev;
15437877fdebSMatt Macy 
15443f9d360cSMartin Matuska 		if (dev != NULL)
15457877fdebSMatt Macy 			dev->si_drv2 = NULL;
1546eda14cbcSMatt Macy 	}
1547eda14cbcSMatt Macy }
1548eda14cbcSMatt Macy 
1549c03c5b1cSMartin Matuska int
1550c03c5b1cSMartin Matuska zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1551eda14cbcSMatt Macy {
1552eda14cbcSMatt Macy 	zv->zv_volsize = volsize;
15537877fdebSMatt Macy 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1554eda14cbcSMatt Macy 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1555eda14cbcSMatt Macy 		struct g_provider *pp = zsg->zsg_provider;
1556eda14cbcSMatt Macy 
1557eda14cbcSMatt Macy 		g_topology_lock();
1558eda14cbcSMatt Macy 
15597877fdebSMatt Macy 		if (pp->private == NULL) {
15607877fdebSMatt Macy 			g_topology_unlock();
15617877fdebSMatt Macy 			return (SET_ERROR(ENXIO));
15627877fdebSMatt Macy 		}
15637877fdebSMatt Macy 
1564eda14cbcSMatt Macy 		/*
1565eda14cbcSMatt Macy 		 * Do not invoke resize event when initial size was zero.
1566eda14cbcSMatt Macy 		 * ZVOL initializes the size on first open, this is not
1567eda14cbcSMatt Macy 		 * real resizing.
1568eda14cbcSMatt Macy 		 */
1569eda14cbcSMatt Macy 		if (pp->mediasize == 0)
1570eda14cbcSMatt Macy 			pp->mediasize = zv->zv_volsize;
1571eda14cbcSMatt Macy 		else
1572eda14cbcSMatt Macy 			g_resize_provider(pp, zv->zv_volsize);
1573eda14cbcSMatt Macy 
1574eda14cbcSMatt Macy 		g_topology_unlock();
1575c7046f76SMartin Matuska 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1576c7046f76SMartin Matuska 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1577c7046f76SMartin Matuska 
1578c7046f76SMartin Matuska 		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1579eda14cbcSMatt Macy 	}
1580eda14cbcSMatt Macy 	return (0);
1581eda14cbcSMatt Macy }
1582eda14cbcSMatt Macy 
1583c03c5b1cSMartin Matuska void
1584c03c5b1cSMartin Matuska zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1585eda14cbcSMatt Macy {
1586eda14cbcSMatt Macy 	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
1587eda14cbcSMatt Macy }
1588eda14cbcSMatt Macy 
1589c03c5b1cSMartin Matuska void
1590c03c5b1cSMartin Matuska zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1591eda14cbcSMatt Macy {
1592eda14cbcSMatt Macy 	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
1593eda14cbcSMatt Macy }
1594eda14cbcSMatt Macy 
1595eda14cbcSMatt Macy /*
1596eda14cbcSMatt Macy  * Public interfaces
1597eda14cbcSMatt Macy  */
1598eda14cbcSMatt Macy 
1599eda14cbcSMatt Macy int
1600eda14cbcSMatt Macy zvol_busy(void)
1601eda14cbcSMatt Macy {
1602eda14cbcSMatt Macy 	return (zvol_minors != 0);
1603eda14cbcSMatt Macy }
1604eda14cbcSMatt Macy 
/*
 * Initialise the zvol subsystem by delegating to zvol_init_impl().
 * Always returns 0.
 *
 * NOTE(review): any value returned by zvol_init_impl() is discarded
 * here -- confirm that is intended.
 */
int
zvol_init(void)
{
	zvol_init_impl();

	return (0);
}
1611eda14cbcSMatt Macy 
/*
 * Shut down the zvol subsystem by delegating to zvol_fini_impl().
 */
void
zvol_fini(void)
{

	zvol_fini_impl();
}
1617