/*
 * freebsd-src: sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
 * (revision 2a66634d1bc6d7401adafad4a3be7b9ac6bab8b3)
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through device nodes named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character
 * devices (volmode=dev).
 */
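
/*
 * For example (an illustration; the pool and dataset names are
 * hypothetical):
 *
 *	zfs create -V 10G tank/vol
 *
 * creates a 10 GB volume that this driver exposes as /dev/zvol/tank/vol.
 */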

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
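
/*
 * A note on the block above (our reading; the intent is not documented
 * here): with ZVOL_LOCK_DEBUG defined, nominal readers of
 * zv_suspend_lock take it as writers, so overlapping acquisitions
 * serialize and misuse surfaces as writer-vs-writer contention that
 * the rwlock debugging machinery can flag.
 */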

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");
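
/*
 * For illustration, these knobs land in the vfs.zfs.vol.* sysctl tree
 * and, being CTLFLAG_RWTUN, can also be set as loader tunables:
 *
 *	sysctl vfs.zfs.vol.mode=2	# expose new zvols as /dev files
 *	sysctl vfs.zfs.vol.recursive=1	# allow zpools on zvols (DANGEROUS)
 */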

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};
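
/*
 * Note: D_TRACKCLOSE asks devfs to call d_close for every close(2) of
 * the device, not only the last one, which is what lets
 * zvol_cdev_close() keep zv_open_count accurate.
 */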

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure that
	 * either we observe the result of the zvol free code setting
	 * private to NULL, or the zv is protected from being freed by its
	 * positive zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}


static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To keep this simple we expect either an open or a close, but
	 * not both at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_geom_open()/zvol_geom_close()
	 * if ace != 0, because GEOM already handles that and handles it a
	 * bit differently. GEOM allows for multiple read/exclusive consumers,
	 * while ZFS allows only one exclusive consumer, no matter whether it
	 * is a reader or a writer. GEOM's behavior is preferable here, so we
	 * leave the decision to GEOM.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}
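
/*
 * Worked example: a consumer opening read/write (r1w1e0) arrives here
 * as acr=1, acw=1, ace=0, giving count=2 and flags=FREAD|FWRITE, hence
 * zvol_geom_open(pp, FREAD | FWRITE, 2).  The matching close
 * (acr=-1, acw=-1, ace=0) gives count=-2, hence
 * zvol_geom_close(pp, FREAD | FWRITE, 2).
 */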

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}
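
/*
 * A kernel-side consumer could query these attributes with the
 * g_getattr() convenience macro, e.g. (a sketch, assuming an attached
 * and opened consumer `cp`):
 *
 *	int candelete;
 *
 *	if (g_getattr("GEOM::candelete", cp, &candelete) == 0 &&
 *	    candelete != 0) {
 *		// the provider accepts BIO_DELETE
 *	}
 */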

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

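	/*
	 * Note: BIO_FLUSH requests jump from the command switch above
	 * directly to the "sync" label inside this if-block; the
	 * zil_commit() is all a flush needs, and the label placement
	 * shares that call between the flush and sync-write paths.
	 */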
	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure that
	 * either we observe the result of the zvol free code setting
	 * si_drv2 to NULL, or the zv is protected from being freed by its
	 * positive zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = B_FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
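
/*
 * Illustrative userland sketch (not part of this driver; the device
 * path and sizes are hypothetical): querying the volume size and
 * unmapping a range through the character device.
 *
 *	#include <sys/types.h>
 *	#include <sys/disk.h>
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	void
 *	example(void)
 *	{
 *		off_t mediasize, args[2];
 *		int fd = open("/dev/zvol/tank/vol", O_RDWR);
 *
 *		if (fd < 0)
 *			return;
 *		// Volume size in bytes (DIOCGMEDIASIZE above).
 *		(void) ioctl(fd, DIOCGMEDIASIZE, &mediasize);
 *		// Unmap the first 1 MiB (DIOCGDELETE above); both the
 *		// offset and the length must be DEV_BSIZE-aligned.
 *		args[0] = 0;
 *		args[1] = 1024 * 1024;
 *		(void) ioctl(fd, DIOCGDELETE, args);
 *		(void) close(fd);
 *	}
 */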

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
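
/*
 * The upgrade-then-recheck above is double-checked locking: if the
 * rw_tryupgrade() fails we must drop and retake zv_suspend_lock as a
 * writer, and another thread may have opened the ZIL in that window,
 * hence the second zv->zv_zilog == NULL test before zil_open().
 */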

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
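
/*
 * E.g. "/dev/zvol/tank/vol" matches, assuming ZVOL_DIR expands to the
 * "/dev/zvol/" device prefix, while "/dev/ada0" does not.
 */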

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry.  */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX  need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not emit a resize event when the initial size was
		 * zero: the ZVOL initializes its size on first open, and
		 * that is not a real resize.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}