xref: /freebsd-src/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c (revision 371f152c7ff76b99acbdf0decf5aa5ca1cc45bd6)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
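 *
 * For example (assuming a pool named "tank"), creating a volume with
 *
 *	# zfs create -V 10G tank/vol
 *
 * makes it available as /dev/zvol/tank/vol.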
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device in
 * the system, except when volmode=dev, in which case they are exposed as
 * plain character devices.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

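/*
 * When ZVOL_LOCK_DEBUG is defined, reader acquisitions of the suspend lock
 * are promoted to writer acquisitions, presumably so that ownership can be
 * asserted and lock misuse is easier to catch.
 */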
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

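/*
 * Per-OS zvol state.  zso_volmode selects which member of the union is
 * valid: the cdev state for volmode=dev or the GEOM provider state for
 * volmode=geom.
 */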
struct zvol_state_os {
	int zso_volmode;
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

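/*
 * The open and close paths below share a locking pattern: take
 * zvol_state_lock to translate the provider into a zvol_state_t, take
 * zv_suspend_lock (only around first open and last close, and always
 * before zv_state_lock to respect the lock order), then drop
 * zvol_state_lock before doing the actual work.
 */
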
/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, ZFS is attempting to
		 * probe GEOM providers while looking for a replacement for
		 * a missing VDEV.  In this case the spa_namespace_lock will
		 * not be held, but it is still illegal to use a zvol as a
		 * vdev.  Deadlocks can result if another thread has the
		 * spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_mutex;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset read-only before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = EROFS;
		goto out_open_count;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out_open_count;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = EBUSY;
			goto out_open_count;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_mutex:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);

	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if ((zv->zv_open_count - count) == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count -= count;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_geom_open()/zvol_geom_close()
	 * if ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers,
	 * while ZFS allows only one exclusive consumer, no matter whether it
	 * is a reader or a writer.  I like the way GEOM works better, so I'll
	 * leave it to GEOM to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

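/*
 * Worker thread that services bios queued by zvol_geom_bio_start() from
 * contexts that cannot sleep.  It exits when zvol_clear_private() moves
 * zsg_state to ZVOL_GEOM_STOPPED.
 */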
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	boolean_t first;

	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

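/*
 * Answer BIO_GETATTR queries (GEOM::candelete and the blocks
 * available/used attributes).  Returns 0 if the attribute was handled,
 * nonzero otherwise.
 */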
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

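/*
 * Main I/O entry point: translate BIO_READ/BIO_WRITE/BIO_DELETE/BIO_FLUSH
 * requests into DMU operations under the zvol's range lock, then deliver
 * the bio back to GEOM (or biofinish() it on the cdev path).
 */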
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = EOPNOTSUPP;
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = EINVAL;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

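/*
 * Read from the volume in chunks of at most DMU_MAX_ACCESS / 2 bytes under
 * a reader range lock; an offset exactly at volsize reads zero bytes (EOF).
 */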
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = EROFS;
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = EBUSY;
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_opened:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);
	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

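/*
 * Character device ioctls.  These mirror the GEOM attributes above:
 * sector/media/stripe geometry, flush (DIOCGFLUSH), delete/TRIM
 * (DIOCGDELETE), DIOCGATTR, and hole/data seeking.
 */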
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = EINVAL;
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = ENOIOCTL;
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = ENOIOCTL;
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

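/*
 * Rename the minor: rehash the zvol under its new name, then recreate the
 * GEOM provider or character device to match.
 */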
static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to the new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT(gp != NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = MAXPHYS;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count == 0);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);

	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();
	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso->zso_volmode = volmode;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		/* TODO: NULL check? */
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error != 0) {
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_giant;
		}
		dev->si_iosize_max = MAXPHYS;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* XXX do prefetch */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		if (error == 0)
			zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
	}
	ZFS_LOG(1, "ZVOL %s created.", name);
out_giant:
	PICKUP_GIANT();
	return (error);
}

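/*
 * Detach the zvol from its GEOM provider: clear pp->private so new
 * lookups fail with ENXIO, then stop the worker thread.
 */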
static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state,
			    &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	}
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return (0);

		g_topology_lock();

		/*
		 * Do not invoke the resize event when the initial size was
		 * zero.  The ZVOL initializes its size on first open; this
		 * is not a real resize.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}