xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  *
24  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25  * All rights reserved.
26  *
27  * Portions Copyright 2010 Robert Milkowski
28  *
29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
30  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
31  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
32  * Copyright (c) 2014 Integros [integros.com]
33  */
34 
35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
36 
37 /*
38  * ZFS volume emulation driver.
39  *
40  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
41  * Volumes are accessed through the symbolic links named:
42  *
43  * /dev/zvol/dsk/<pool_name>/<dataset_name>
44  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
45  *
46  * These links are created by the /dev filesystem (sdev_zvolops.c).
47  * Volumes are persistent through reboot.  No user command needs to be
48  * run before opening and using a device.
49  *
50  * FreeBSD notes.
51  * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
52  * in the system.
53  */
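
/*
 * Illustration (standard zfs(1M)/zfs(8) usage, not part of this file):
 *
 *	# zfs create -V 10G tank/vol
 *	# newfs /dev/zvol/rdsk/tank/vol		(illumos raw device)
 *	# newfs /dev/zvol/tank/vol		(FreeBSD GEOM provider)
 */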
54 
55 #include <sys/types.h>
56 #include <sys/param.h>
57 #include <sys/kernel.h>
58 #include <sys/errno.h>
59 #include <sys/uio.h>
60 #include <sys/buf.h>
61 #include <sys/open.h>
62 #include <sys/kmem.h>
63 #include <sys/conf.h>
64 #include <sys/cmn_err.h>
65 #include <sys/stat.h>
66 #include <sys/zap.h>
67 #include <sys/spa.h>
68 #include <sys/spa_impl.h>
69 #include <sys/zio.h>
70 #include <sys/disk.h>
71 #include <sys/dmu_traverse.h>
72 #include <sys/dnode.h>
73 #include <sys/dsl_dataset.h>
74 #include <sys/dsl_prop.h>
75 #include <sys/dkio.h>
76 #include <sys/byteorder.h>
77 #include <sys/sunddi.h>
78 #include <sys/dirent.h>
79 #include <sys/policy.h>
80 #include <sys/queue.h>
81 #include <sys/fs/zfs.h>
82 #include <sys/zfs_ioctl.h>
83 #include <sys/zil.h>
84 #include <sys/refcount.h>
85 #include <sys/zfs_znode.h>
86 #include <sys/zfs_rlock.h>
87 #include <sys/vdev_impl.h>
88 #include <sys/vdev_raidz.h>
89 #include <sys/zvol.h>
90 #include <sys/zil_impl.h>
91 #include <sys/dbuf.h>
92 #include <sys/dmu_tx.h>
93 #include <sys/zfeature.h>
94 #include <sys/zio_checksum.h>
95 #include <sys/filio.h>
96 
97 #include "zfs_namecheck.h"
98 
99 #ifdef __FreeBSD__
100 #include <sys/bio.h>
101 #include <geom/geom.h>
102 
103 struct g_class zfs_zvol_class = {
104 	.name = "ZFS::ZVOL",
105 	.version = G_VERSION,
106 };
107 
108 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
109 #endif
110 
111 #ifdef __NetBSD__
112 #include <sys/pathname.h>
113 #include <prop/proplib.h>
114 
115 #define	DROP_GIANT()	/* nothing */
116 #define	PICKUP_GIANT()	/* nothing */
117 
118 void	zvol_minphys(struct buf *);
119 static struct dkdriver zvol_dkdriver = { zvol_strategy, zvol_minphys };
120 
121 #define	bioerror(bp, er)	((bp)->b_error = (er))
122 #define	b_edev			b_dev
123 #endif
124 
125 void *zfsdev_state;
126 static char *zvol_tag = "zvol_tag";
127 
128 #define	ZVOL_DUMPSIZE		"dumpsize"
129 
130 #ifdef __FreeBSD__
131 /*
132  * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
133  * spa_namespace_lock in the ZVOL code.
134  */
135 #define zfsdev_state_lock spa_namespace_lock
136 #else
137 /*
138  * This lock protects the zfsdev_state structure from being modified
139  * while it's being used, e.g. an open that comes in before a create
140  * finishes.  It also protects temporary opens of the dataset so that,
141  * e.g., an open doesn't get a spurious EBUSY.
142  */
143 kmutex_t zfsdev_state_lock;
144 #endif
145 static uint32_t zvol_minors;
146 
147 #ifndef illumos
148 SYSCTL_DECL(_vfs_zfs);
149 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
150 static int	volmode = ZFS_VOLMODE_GEOM;
151 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
152     "Expose as GEOM providers (1), device files (2) or neither");
153 static boolean_t zpool_on_zvol = B_FALSE;
154 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
155     "Allow zpools to use zvols as vdevs (DANGEROUS)");
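
/*
 * Illustration: these knobs appear as vfs.zfs.vol.* sysctls, e.g.
 * "sysctl vfs.zfs.vol.mode=2" makes newly created zvols default to
 * plain device files (ZFS_VOLMODE_DEV) instead of GEOM providers.
 */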
156 
157 #endif
158 typedef struct zvol_extent {
159 	list_node_t	ze_node;
160 	dva_t		ze_dva;		/* dva associated with this extent */
161 	uint64_t	ze_nblks;	/* number of blocks in extent */
162 } zvol_extent_t;
163 
164 /*
165  * The in-core state of each volume.
166  */
167 typedef struct zvol_state {
168 #ifndef illumos
169 	LIST_ENTRY(zvol_state)	zv_links;
170 #endif
171 	char		zv_name[MAXPATHLEN]; /* pool/dataset name */
172 	uint64_t	zv_volsize;	/* amount of space we advertise */
173 	uint64_t	zv_volblocksize; /* volume block size */
174 #ifdef __FreeBSD__
175 	struct cdev	*zv_dev;	/* non-GEOM device */
176 	struct g_provider *zv_provider;	/* GEOM provider */
177 #else
178 	minor_t		zv_minor;	/* minor number */
179 #endif
180 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
181 	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
182 	objset_t	*zv_objset;	/* objset handle */
183 #if defined(illumos) || defined(__NetBSD__)
184 	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
185 #endif
186 	uint32_t	zv_total_opens;	/* total open count */
187 	uint32_t	zv_sync_cnt;	/* synchronous open count */
188 	zilog_t		*zv_zilog;	/* ZIL handle */
189 	list_t		zv_extents;	/* List of extents for dump */
190 	znode_t		zv_znode;	/* for range locking */
191 	dmu_buf_t	*zv_dbuf;	/* bonus handle */
192 #ifdef __FreeBSD__
193 	int		zv_state;
194 	int		zv_volmode;	/* Provide GEOM or cdev */
195 	struct bio_queue_head zv_queue;
196 	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
197 #endif
198 #ifdef __NetBSD__
199 	struct disk	zv_dk;		/* disk statistics */
200 	kmutex_t	zv_dklock;	/* disk statistics */
201 #endif
202 } zvol_state_t;
203 
204 #ifndef illumos
205 static LIST_HEAD(, zvol_state) all_zvols;
206 #endif
207 /*
208  * zvol specific flags
209  */
210 #define	ZVOL_RDONLY	0x1
211 #define	ZVOL_DUMPIFIED	0x2
212 #define	ZVOL_EXCL	0x4
213 #define	ZVOL_WCE	0x8
214 
215 /*
216  * zvol maximum transfer in one DMU tx.
217  */
218 int zvol_maxphys = DMU_MAX_ACCESS/2;
219 
220 /*
221  * Toggle unmap functionality.
222  */
223 boolean_t zvol_unmap_enabled = B_TRUE;
224 
225 /*
226  * If true, unmaps requested as synchronous are executed synchronously,
227  * otherwise all unmaps are asynchronous.
228  */
229 boolean_t zvol_unmap_sync_enabled = B_FALSE;
230 
231 #ifdef __FreeBSD__
232 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
233     &zvol_unmap_enabled, 0,
234     "Enable UNMAP functionality");
235 
236 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN,
237     &zvol_unmap_sync_enabled, 0,
238     "UNMAPs requested as sync are executed synchronously");
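
/*
 * Illustration: "sysctl vfs.zfs.vol.unmap_enabled=0" disables UNMAP
 * support at run time; unmap_sync_enabled only affects whether unmaps
 * requested as synchronous actually wait for the free to complete.
 */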
239 
240 static d_open_t		zvol_d_open;
241 static d_close_t	zvol_d_close;
242 static d_read_t		zvol_read;
243 static d_write_t	zvol_write;
244 static d_ioctl_t	zvol_d_ioctl;
245 static d_strategy_t	zvol_strategy;
246 
247 static struct cdevsw zvol_cdevsw = {
248 	.d_version =	D_VERSION,
249 	.d_open =	zvol_d_open,
250 	.d_close =	zvol_d_close,
251 	.d_read =	zvol_read,
252 	.d_write =	zvol_write,
253 	.d_ioctl =	zvol_d_ioctl,
254 	.d_strategy =	zvol_strategy,
255 	.d_name =	"zvol",
256 	.d_flags =	D_DISK | D_TRACKCLOSE,
257 };
258 
259 static void zvol_geom_run(zvol_state_t *zv);
260 static void zvol_geom_destroy(zvol_state_t *zv);
261 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
262 static void zvol_geom_start(struct bio *bp);
263 static void zvol_geom_worker(void *arg);
264 static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
265     uint64_t len, boolean_t sync);
266 #endif /* __FreeBSD__ */
267 #ifdef __NetBSD__
268 /* XXXNETBSD need devsw, etc */
269 #endif
270 
271 extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
272     nvlist_t *, nvlist_t *);
273 static int zvol_remove_zv(zvol_state_t *);
274 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
275 static int zvol_dumpify(zvol_state_t *zv);
276 static int zvol_dump_fini(zvol_state_t *zv);
277 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
278 
279 static void
280 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
281 {
282 #ifdef illumos
283 	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
284 
285 	zv->zv_volsize = volsize;
286 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
287 	    "Size", volsize) == DDI_SUCCESS);
288 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
289 	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
290 
291 	/* Notify specfs to invalidate the cached size */
292 	spec_size_invalidate(dev, VBLK);
293 	spec_size_invalidate(dev, VCHR);
294 #endif /* illumos */
295 #ifdef __FreeBSD__
296 	zv->zv_volsize = volsize;
297 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
298 		struct g_provider *pp;
299 
300 		pp = zv->zv_provider;
301 		if (pp == NULL)
302 			return;
303 		g_topology_lock();
304 
305 		/*
306 		 * Do not invoke the resize event when the initial size was
307 		 * zero: ZVOL initializes the size on first open, so this
308 		 * is not a real resize.
309 		 */
310 		if (pp->mediasize == 0)
311 			pp->mediasize = zv->zv_volsize;
312 		else
313 			g_resize_provider(pp, zv->zv_volsize);
314 		g_topology_unlock();
315 	}
316 #endif /* __FreeBSD__ */
317 #ifdef __NetBSD__
318 	prop_dictionary_t disk_info, odisk_info, geom;
319 	struct disk *disk;
320 
321 	disk = &zv->zv_dk;
322 
323 	disk_info = prop_dictionary_create();
324 	geom = prop_dictionary_create();
325 
326 	prop_dictionary_set_cstring_nocopy(disk_info, "type", "ESDI");
327 	prop_dictionary_set_uint64(geom, "sectors-per-unit", zv->zv_volsize);
328 	prop_dictionary_set_uint32(geom, "sector-size",
329 	    DEV_BSIZE /* XXX 512? */);
330 	prop_dictionary_set_uint32(geom, "sectors-per-track", 32);
331 	prop_dictionary_set_uint32(geom, "tracks-per-cylinder", 64);
332 	prop_dictionary_set_uint32(geom, "cylinders-per-unit", zv->zv_volsize / 2048);
333 	prop_dictionary_set(disk_info, "geometry", geom);
334 	prop_object_release(geom);
335 
336 	odisk_info = disk->dk_info;
337 	disk->dk_info = disk_info;
338 
339 	if (odisk_info != NULL)
340 		prop_object_release(odisk_info);
341 #endif
342 }
343 
344 
345 int
346 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
347 {
348 	if (volsize == 0)
349 		return (SET_ERROR(EINVAL));
350 
351 	if (volsize % blocksize != 0)
352 		return (SET_ERROR(EINVAL));
353 
354 #ifdef _ILP32
355 	if (volsize - 1 > SPEC_MAXOFFSET_T)
356 		return (SET_ERROR(EOVERFLOW));
357 #endif
358 	return (0);
359 }
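
/*
 * e.g. volsize=1G with volblocksize=8K passes (1G is a multiple of 8K);
 * volsize=1000000 with volblocksize=8192 fails with EINVAL, since
 * 1000000 % 8192 == 576.
 */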
360 
361 int
362 zvol_check_volblocksize(uint64_t volblocksize)
363 {
364 	if (volblocksize < SPA_MINBLOCKSIZE ||
365 	    volblocksize > SPA_OLD_MAXBLOCKSIZE ||
366 	    !ISP2(volblocksize))
367 		return (SET_ERROR(EDOM));
368 
369 	return (0);
370 }
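
/*
 * e.g. volblocksize=4096 is accepted; 3072 (not a power of two) and
 * 256K (above SPA_OLD_MAXBLOCKSIZE, 128K) are both rejected with EDOM.
 */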
371 
372 int
373 zvol_get_stats(objset_t *os, nvlist_t *nv)
374 {
375 	int error;
376 	dmu_object_info_t doi;
377 	uint64_t val;
378 
379 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
380 	if (error)
381 		return (error);
382 
383 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
384 
385 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
386 
387 	if (error == 0) {
388 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
389 		    doi.doi_data_block_size);
390 	}
391 
392 	return (error);
393 }
394 
395 static zvol_state_t *
396 zvol_minor_lookup(const char *name)
397 {
398 #ifdef illumos
399 	minor_t minor;
400 #endif
401 	zvol_state_t *zv;
402 
403 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
404 
405 #ifdef illumos
406 	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++)
407 #else
408 	LIST_FOREACH(zv, &all_zvols, zv_links)
409 #endif
410 	{
411 #ifdef illumos
412 		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
413 		if (zv == NULL)
414 			continue;
415 #endif
416 
417 		if (strcmp(zv->zv_name, name) == 0)
418 			return (zv);
419 	}
420 
421 	return (NULL);
422 }
423 
424 /* extent mapping arg */
425 struct maparg {
426 	zvol_state_t	*ma_zv;
427 	uint64_t	ma_blks;
428 };
429 
430 /*ARGSUSED*/
431 static int
432 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
433     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
434 {
435 	struct maparg *ma = arg;
436 	zvol_extent_t *ze;
437 	int bs = ma->ma_zv->zv_volblocksize;
438 
439 	if (bp == NULL || BP_IS_HOLE(bp) ||
440 	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
441 		return (0);
442 
443 	VERIFY(!BP_IS_EMBEDDED(bp));
444 
445 	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
446 	ma->ma_blks++;
447 
448 	/* Abort immediately if we have encountered gang blocks */
449 	if (BP_IS_GANG(bp))
450 		return (SET_ERROR(EFRAGS));
451 
452 	/*
453 	 * See if the block is at the end of the previous extent.
454 	 */
455 	ze = list_tail(&ma->ma_zv->zv_extents);
456 	if (ze &&
457 	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
458 	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
459 	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
460 		ze->ze_nblks++;
461 		return (0);
462 	}
463 
464 	dprintf_bp(bp, "%s", "next blkptr:");
465 
466 	/* start a new extent */
467 	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
468 	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
469 	ze->ze_nblks = 1;
470 	list_insert_tail(&ma->ma_zv->zv_extents, ze);
471 	return (0);
472 }
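
/*
 * Illustration: for a volume whose blocks were preallocated back to
 * back on a single vdev, the callback above coalesces the whole device
 * into one zvol_extent_t with ze_nblks == volsize / volblocksize.
 */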
473 
474 static void
475 zvol_free_extents(zvol_state_t *zv)
476 {
477 	zvol_extent_t *ze;
478 
479 	while ((ze = list_head(&zv->zv_extents)) != NULL) {
480 		list_remove(&zv->zv_extents, ze);
481 		kmem_free(ze, sizeof (zvol_extent_t));
482 	}
483 }
484 
485 static int
486 zvol_get_lbas(zvol_state_t *zv)
487 {
488 	objset_t *os = zv->zv_objset;
489 	struct maparg	ma;
490 	int		err;
491 
492 	ma.ma_zv = zv;
493 	ma.ma_blks = 0;
494 	zvol_free_extents(zv);
495 
496 	/* commit any in-flight changes before traversing the dataset */
497 	txg_wait_synced(dmu_objset_pool(os), 0);
498 	err = traverse_dataset(dmu_objset_ds(os), 0,
499 	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
500 	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
501 		zvol_free_extents(zv);
502 		return (err ? err : EIO);
503 	}
504 
505 	return (0);
506 }
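
/*
 * The extent list built here is consumed by zvol_dumpio() below, which
 * translates a volume offset into a physical vdev offset so that dump
 * I/O can bypass the DMU.
 */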
507 
508 /* ARGSUSED */
509 void
510 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
511 {
512 	zfs_creat_t *zct = arg;
513 	nvlist_t *nvprops = zct->zct_props;
514 	int error;
515 	uint64_t volblocksize, volsize;
516 
517 	VERIFY(nvlist_lookup_uint64(nvprops,
518 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
519 	if (nvlist_lookup_uint64(nvprops,
520 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
521 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
522 
523 	/*
524 	 * These properties must be removed from the list so the generic
525 	 * property setting step won't apply to them.
526 	 */
527 	VERIFY(nvlist_remove_all(nvprops,
528 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
529 	(void) nvlist_remove_all(nvprops,
530 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
531 
532 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
533 	    DMU_OT_NONE, 0, tx);
534 	ASSERT(error == 0);
535 
536 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
537 	    DMU_OT_NONE, 0, tx);
538 	ASSERT(error == 0);
539 
540 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
541 	ASSERT(error == 0);
542 }
543 
544 /*
545  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
546  * implement DKIOCFREE/free-long-range.
547  */
548 static int
549 zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
550 {
551 	uint64_t offset, length;
552 
553 	if (byteswap)
554 		byteswap_uint64_array(lr, sizeof (*lr));
555 
556 	offset = lr->lr_offset;
557 	length = lr->lr_length;
558 
559 	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
560 }
561 
562 /*
563  * Replay a TX_WRITE ZIL transaction that didn't get committed
564  * after a system failure.
565  */
566 static int
567 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
568 {
569 	objset_t *os = zv->zv_objset;
570 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
571 	uint64_t offset, length;
572 	dmu_tx_t *tx;
573 	int error;
574 
575 	if (byteswap)
576 		byteswap_uint64_array(lr, sizeof (*lr));
577 
578 	offset = lr->lr_offset;
579 	length = lr->lr_length;
580 
581 	/* If it's a dmu_sync() block, write the whole block */
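	/* (e.g. an 8K block: a 512-byte write at 10240 becomes 8K at 8192) */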
582 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
583 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
584 		if (length < blocksize) {
585 			offset -= offset % blocksize;
586 			length = blocksize;
587 		}
588 	}
589 
590 	tx = dmu_tx_create(os);
591 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
592 	error = dmu_tx_assign(tx, TXG_WAIT);
593 	if (error) {
594 		dmu_tx_abort(tx);
595 	} else {
596 		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
597 		dmu_tx_commit(tx);
598 	}
599 
600 	return (error);
601 }
602 
603 /* ARGSUSED */
604 static int
605 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
606 {
607 	return (SET_ERROR(ENOTSUP));
608 }
609 
610 /*
611  * Callback vectors for replaying records.
612  * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
613  */
614 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
615 	zvol_replay_err,	/* 0 no such transaction type */
616 	zvol_replay_err,	/* TX_CREATE */
617 	zvol_replay_err,	/* TX_MKDIR */
618 	zvol_replay_err,	/* TX_MKXATTR */
619 	zvol_replay_err,	/* TX_SYMLINK */
620 	zvol_replay_err,	/* TX_REMOVE */
621 	zvol_replay_err,	/* TX_RMDIR */
622 	zvol_replay_err,	/* TX_LINK */
623 	zvol_replay_err,	/* TX_RENAME */
624 	zvol_replay_write,	/* TX_WRITE */
625 	zvol_replay_truncate,	/* TX_TRUNCATE */
626 	zvol_replay_err,	/* TX_SETATTR */
627 	zvol_replay_err,	/* TX_ACL */
628 	zvol_replay_err,	/* TX_CREATE_ACL */
629 	zvol_replay_err,	/* TX_CREATE_ATTR */
630 	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
631 	zvol_replay_err,	/* TX_MKDIR_ACL */
632 	zvol_replay_err,	/* TX_MKDIR_ATTR */
633 	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
634 	zvol_replay_err,	/* TX_WRITE2 */
635 };
636 
637 #ifdef illumos
638 int
639 zvol_name2minor(const char *name, minor_t *minor)
640 {
641 	zvol_state_t *zv;
642 
643 	mutex_enter(&zfsdev_state_lock);
644 	zv = zvol_minor_lookup(name);
645 	if (minor && zv)
646 		*minor = zv->zv_minor;
647 	mutex_exit(&zfsdev_state_lock);
648 	return (zv ? 0 : -1);
649 }
650 #endif	/* illumos */
651 
652 /*
653  * Create a minor node (plus a whole lot more) for the specified volume.
654  */
655 int
656 zvol_create_minor(const char *name)
657 {
658 	zfs_soft_state_t *zs;
659 	zvol_state_t *zv;
660 	objset_t *os;
661 	int error;
662 #ifdef illumos
663 	dmu_object_info_t doi;
664 	minor_t minor = 0;
665 	char chrbuf[30], blkbuf[30];
666 #endif
667 #ifdef __FreeBSD__
668 	struct g_provider *pp;
669 	struct g_geom *gp;
670 	uint64_t mode;
671 
672 	ZFS_LOG(1, "Creating ZVOL %s...", name);
673 #endif
674 #ifdef __NetBSD__
675 	dmu_object_info_t doi;
676 	minor_t minor = 0;
677 	vnode_t *vp = NULL;
678 	char *devpath;
679 	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1;
680 #endif
681 
682 	mutex_enter(&zfsdev_state_lock);
683 
684 	if (zvol_minor_lookup(name) != NULL) {
685 		mutex_exit(&zfsdev_state_lock);
686 		return (SET_ERROR(EEXIST));
687 	}
688 
689 	/* lie and say we're read-only */
690 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
691 
692 	if (error) {
693 		mutex_exit(&zfsdev_state_lock);
694 		return (error);
695 	}
696 
697 #ifdef illumos
698 	if ((minor = zfsdev_minor_alloc()) == 0) {
699 		dmu_objset_disown(os, FTAG);
700 		mutex_exit(&zfsdev_state_lock);
701 		return (SET_ERROR(ENXIO));
702 	}
703 
704 	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
705 		dmu_objset_disown(os, FTAG);
706 		mutex_exit(&zfsdev_state_lock);
707 		return (SET_ERROR(EAGAIN));
708 	}
709 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
710 	    (char *)name);
711 
712 	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
713 
714 	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
715 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
716 		ddi_soft_state_free(zfsdev_state, minor);
717 		dmu_objset_disown(os, FTAG);
718 		mutex_exit(&zfsdev_state_lock);
719 		return (SET_ERROR(EAGAIN));
720 	}
721 
722 	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
723 
724 	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
725 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
726 		ddi_remove_minor_node(zfs_dip, chrbuf);
727 		ddi_soft_state_free(zfsdev_state, minor);
728 		dmu_objset_disown(os, FTAG);
729 		mutex_exit(&zfsdev_state_lock);
730 		return (SET_ERROR(EAGAIN));
731 	}
732 
733 	zs = ddi_get_soft_state(zfsdev_state, minor);
734 	zs->zss_type = ZSST_ZVOL;
735 	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
736 #endif /* illumos */
737 
738 #ifdef __FreeBSD__
739 	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
740 	zv->zv_state = 0;
741 	error = dsl_prop_get_integer(name,
742 	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
743 	if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
744 		mode = volmode;
745 
746 	DROP_GIANT();
747 	zv->zv_volmode = mode;
748 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
749 		g_topology_lock();
750 		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
751 		gp->start = zvol_geom_start;
752 		gp->access = zvol_geom_access;
753 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
754 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
755 		pp->sectorsize = DEV_BSIZE;
756 		pp->mediasize = 0;
757 		pp->private = zv;
758 
759 		zv->zv_provider = pp;
760 		bioq_init(&zv->zv_queue);
761 		mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
762 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
763 		struct make_dev_args args;
764 
765 		make_dev_args_init(&args);
766 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
767 		args.mda_devsw = &zvol_cdevsw;
768 		args.mda_cr = NULL;
769 		args.mda_uid = UID_ROOT;
770 		args.mda_gid = GID_OPERATOR;
771 		args.mda_mode = 0640;
772 		args.mda_si_drv2 = zv;
773 		error = make_dev_s(&args, &zv->zv_dev,
774 		    "%s/%s", ZVOL_DRIVER, name);
775 		if (error != 0) {
776 			kmem_free(zv, sizeof(*zv));
777 			dmu_objset_disown(os, FTAG);
778 			mutex_exit(&zfsdev_state_lock);
779 			return (error);
780 		}
781 		zv->zv_dev->si_iosize_max = MAXPHYS;
782 	}
783 	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
784 #endif /* __FreeBSD__ */
785 
786 #ifdef __NetBSD__
787 
788 	/*
789 	 * If there's an existing /dev/zvol symlink, try to use the
790 	 * same minor number we used last time.
791 	 */
792 	devpath = kmem_alloc(devpathlen, KM_SLEEP);
793 
794 	/* Get full path to ZFS volume disk device */
795 	(void) snprintf(devpath, devpathlen, "%s/%s", ZVOL_FULL_DEV_DIR, name);
796 
797 	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
798 
799 	if (error == 0 && vp->v_type != VBLK) {
800 		error = EINVAL;
801 	}
802 
803 	if (error == 0) {
804 		struct stat sb;
805 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
806 		error = vn_stat(vp, &sb);
807 		VOP_UNLOCK(vp, 0);
808 		if (error == 0) {
809 			minor = getminor(sb.st_rdev);
810 		}
811 	}
812 
813 	if (vp != NULL)
814 		VN_RELE(vp);
815 
816 	/*
817 	 * If we found a minor but it's already in use, we must pick a new one.
818 	 */
819 
820 	if (minor != 0 && zfsdev_get_soft_state(minor, ZSST_ZVOL) != NULL)
821 		minor = 0;
822 
823 	if (minor == 0)
824 		minor = zfsdev_minor_alloc();
825 
826 	if (minor == 0) {
827 		dmu_objset_disown(os, FTAG);
828 		mutex_exit(&zfsdev_state_lock);
829 		kmem_free(devpath, devpathlen);
830 		return (ENXIO);
831 	}
832 
833 	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
834 		dmu_objset_disown(os, FTAG);
835 		mutex_exit(&zfsdev_state_lock);
836 		kmem_free(devpath, devpathlen);
837 		return (EAGAIN);
838 	}
839 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
840 	    (char *)name);
841 
842 	if (ddi_create_minor_node(zfs_dip, (char *)name, S_IFCHR,
843 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
844 		ddi_soft_state_free(zfsdev_state, minor);
845 		dmu_objset_disown(os, FTAG);
846 		mutex_exit(&zfsdev_state_lock);
847 		kmem_free(devpath, devpathlen);
848 		return (EAGAIN);
849 	}
850 
851 	if (ddi_create_minor_node(zfs_dip, (char *)name, S_IFBLK,
852 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
853 		ddi_remove_minor_node(zfs_dip, (char *)name);
854 		ddi_soft_state_free(zfsdev_state, minor);
855 		dmu_objset_disown(os, FTAG);
856 		mutex_exit(&zfsdev_state_lock);
857 		kmem_free(devpath, devpathlen);
858 		return (EAGAIN);
859 	}
860 	zs = ddi_get_soft_state(zfsdev_state, minor);
861 	zs->zss_type = ZSST_ZVOL;
862 	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
863 
864 	disk_init(&zv->zv_dk, name, &zvol_dkdriver);
865 	disk_attach(&zv->zv_dk);
866 	mutex_init(&zv->zv_dklock, NULL, MUTEX_DEFAULT, NULL);
867 
868 	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
869 #endif /* __NetBSD__ */
870 
871 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
872 	zv->zv_min_bs = DEV_BSHIFT;
873 #ifdef illumos
874 	zv->zv_minor = minor;
875 #endif
876 	zv->zv_objset = os;
877 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
878 		zv->zv_flags |= ZVOL_RDONLY;
879 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
880 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
881 	    sizeof (rl_t), offsetof(rl_t, r_node));
882 	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
883 	    offsetof(zvol_extent_t, ze_node));
884 #if defined(illumos) || defined(__NetBSD__)
885 	/* get and cache the blocksize */
886 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
887 	ASSERT(error == 0);
888 	zv->zv_volblocksize = doi.doi_data_block_size;
889 #endif
890 
891 	if (spa_writeable(dmu_objset_spa(os))) {
892 		if (zil_replay_disable)
893 			zil_destroy(dmu_objset_zil(os), B_FALSE);
894 		else
895 			zil_replay(os, zv, zvol_replay_vector);
896 	}
897 	dmu_objset_disown(os, FTAG);
898 	zv->zv_objset = NULL;
899 
900 	zvol_minors++;
901 
902 	mutex_exit(&zfsdev_state_lock);
903 #ifdef __FreeBSD__
904 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
905 		zvol_geom_run(zv);
906 		g_topology_unlock();
907 	}
908 	PICKUP_GIANT();
909 
910 	ZFS_LOG(1, "ZVOL %s created.", name);
911 #endif
912 	return (0);
913 }
914 
915 /*
916  * Remove minor node for the specified volume.
917  */
918 static int
919 zvol_remove_zv(zvol_state_t *zv)
920 {
921 
922 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
923 	if (zv->zv_total_opens != 0)
924 		return (SET_ERROR(EBUSY));
925 
926 #ifdef illumos
927 	char nmbuf[20];
928 	minor_t minor = zv->zv_minor;
929 
930 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
931 	ddi_remove_minor_node(zfs_dip, nmbuf);
932 
933 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
934 	ddi_remove_minor_node(zfs_dip, nmbuf);
935 #endif
936 #ifdef __FreeBSD__
937 	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
938 
939 	LIST_REMOVE(zv, zv_links);
940 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
941 		g_topology_lock();
942 		zvol_geom_destroy(zv);
943 		g_topology_unlock();
944 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
945 		if (zv->zv_dev != NULL)
946 			destroy_dev(zv->zv_dev);
947 	}
948 #endif
949 #ifdef __NetBSD__
950 	char nmbuf[20];
951 	minor_t minor = zv->zv_minor;
952 
953 	/* XXXNETBSD needs changes here */
954 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", zv->zv_minor);
955 	ddi_remove_minor_node(zfs_dip, nmbuf);
956 
957 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", zv->zv_minor);
958 	ddi_remove_minor_node(zfs_dip, nmbuf);
959 #endif
960 
961 	avl_destroy(&zv->zv_znode.z_range_avl);
962 	mutex_destroy(&zv->zv_znode.z_range_lock);
963 
964 	kmem_free(zv, sizeof (zvol_state_t));
965 #ifdef illumos
966 	ddi_soft_state_free(zfsdev_state, minor);
967 #endif
968 #ifdef __NetBSD__
969 	ddi_soft_state_free(zfsdev_state, minor);
970 #endif
971 	zvol_minors--;
972 	return (0);
973 }
974 
975 int
976 zvol_remove_minor(const char *name)
977 {
978 	zvol_state_t *zv;
979 	int rc;
980 
981 	mutex_enter(&zfsdev_state_lock);
982 	if ((zv = zvol_minor_lookup(name)) == NULL) {
983 		mutex_exit(&zfsdev_state_lock);
984 		return (SET_ERROR(ENXIO));
985 	}
986 	rc = zvol_remove_zv(zv);
987 	mutex_exit(&zfsdev_state_lock);
988 	return (rc);
989 }
990 
991 int
992 zvol_first_open(zvol_state_t *zv)
993 {
994 	dmu_object_info_t doi;
995 	objset_t *os;
996 	uint64_t volsize;
997 	int error;
998 	uint64_t readonly;
999 
1000 	/* lie and say we're read-only */
1001 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
1002 	    zvol_tag, &os);
1003 	if (error)
1004 		return (error);
1005 
1006 	zv->zv_objset = os;
1007 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1008 	if (error) {
1009 		ASSERT(error == 0);
1010 		dmu_objset_disown(os, zvol_tag);
1011 		return (error);
1012 	}
1013 
1014 	/* get and cache the blocksize */
1015 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
1016 	if (error) {
1017 		ASSERT(error == 0);
1018 		dmu_objset_disown(os, zvol_tag);
1019 		return (error);
1020 	}
1021 	zv->zv_volblocksize = doi.doi_data_block_size;
1022 
1023 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
1024 	if (error) {
1025 		dmu_objset_disown(os, zvol_tag);
1026 		return (error);
1027 	}
1028 
1029 	zvol_size_changed(zv, volsize);
1030 	zv->zv_zilog = zil_open(os, zvol_get_data);
1031 
1032 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
1033 	    NULL) == 0);
1034 	if (readonly || dmu_objset_is_snapshot(os) ||
1035 	    !spa_writeable(dmu_objset_spa(os)))
1036 		zv->zv_flags |= ZVOL_RDONLY;
1037 	else
1038 		zv->zv_flags &= ~ZVOL_RDONLY;
1039 	return (error);
1040 }
1041 
1042 void
1043 zvol_last_close(zvol_state_t *zv)
1044 {
1045 	zil_close(zv->zv_zilog);
1046 	zv->zv_zilog = NULL;
1047 
1048 	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
1049 	zv->zv_dbuf = NULL;
1050 
1051 	/*
1052 	 * Evict cached data
1053 	 */
1054 	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
1055 	    !(zv->zv_flags & ZVOL_RDONLY))
1056 		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1057 	dmu_objset_evict_dbufs(zv->zv_objset);
1058 
1059 	dmu_objset_disown(zv->zv_objset, zvol_tag);
1060 	zv->zv_objset = NULL;
1061 #ifdef __NetBSD__xxx
1062 	/* the old code has this here, but it's in the wrong place. */
1063 	disk_detach(&zv->zv_dk);
1064 	disk_destroy(&zv->zv_dk);
1065 	mutex_destroy(&zv->zv_dklock);
1066 #endif
1067 }
1068 
1069 #ifdef illumos
1070 int
1071 zvol_prealloc(zvol_state_t *zv)
1072 {
1073 	objset_t *os = zv->zv_objset;
1074 	dmu_tx_t *tx;
1075 	uint64_t refd, avail, usedobjs, availobjs;
1076 	uint64_t resid = zv->zv_volsize;
1077 	uint64_t off = 0;
1078 
1079 	/* Check the space usage before attempting to allocate the space */
1080 	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
1081 	if (avail < zv->zv_volsize)
1082 		return (SET_ERROR(ENOSPC));
1083 
1084 	/* Free old extents if they exist */
1085 	zvol_free_extents(zv);
1086 
1087 	while (resid != 0) {
1088 		int error;
1089 		uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
1090 
1091 		tx = dmu_tx_create(os);
1092 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1093 		error = dmu_tx_assign(tx, TXG_WAIT);
1094 		if (error) {
1095 			dmu_tx_abort(tx);
1096 			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
1097 			return (error);
1098 		}
1099 		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
1100 		dmu_tx_commit(tx);
1101 		off += bytes;
1102 		resid -= bytes;
1103 	}
1104 	txg_wait_synced(dmu_objset_pool(os), 0);
1105 
1106 	return (0);
1107 }
1108 #endif	/* illumos */
1109 
1110 static int
1111 zvol_update_volsize(objset_t *os, uint64_t volsize)
1112 {
1113 	dmu_tx_t *tx;
1114 	int error;
1115 
1116 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1117 
1118 	tx = dmu_tx_create(os);
1119 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1120 	dmu_tx_mark_netfree(tx);
1121 	error = dmu_tx_assign(tx, TXG_WAIT);
1122 	if (error) {
1123 		dmu_tx_abort(tx);
1124 		return (error);
1125 	}
1126 
1127 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
1128 	    &volsize, tx);
1129 	dmu_tx_commit(tx);
1130 
1131 	if (error == 0)
1132 		error = dmu_free_long_range(os,
1133 		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
1134 	return (error);
1135 }
1136 
1137 void
1138 zvol_remove_minors(const char *name)
1139 {
1140 #ifdef illumos
1141 	zvol_state_t *zv;
1142 	char *namebuf;
1143 	minor_t minor;
1144 
1145 	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
1146 	(void) strncpy(namebuf, name, strlen(name));
1147 	(void) strcat(namebuf, "/");
1148 	mutex_enter(&zfsdev_state_lock);
1149 	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
1150 
1151 		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1152 		if (zv == NULL)
1153 			continue;
1154 		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
1155 			(void) zvol_remove_zv(zv);
1156 	}
1157 	kmem_free(namebuf, strlen(name) + 2);
1158 
1159 	mutex_exit(&zfsdev_state_lock);
1160 #else	/* !illumos */
1161 	zvol_state_t *zv, *tzv;
1162 	size_t namelen;
1163 
1164 	namelen = strlen(name);
1165 
1166 	DROP_GIANT();
1167 	mutex_enter(&zfsdev_state_lock);
1168 
1169 	LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
1170 		if (strcmp(zv->zv_name, name) == 0 ||
1171 		    (strncmp(zv->zv_name, name, namelen) == 0 &&
1172 		    strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
1173 		    zv->zv_name[namelen] == '@'))) {
1174 			(void) zvol_remove_zv(zv);
1175 		}
1176 	}
1177 
1178 	mutex_exit(&zfsdev_state_lock);
1179 	PICKUP_GIANT();
1180 #endif	/* illumos */
1181 }
1182 
1183 static int
1184 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
1185 {
1186 	uint64_t old_volsize = 0ULL;
1187 	int error = 0;
1188 
1189 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1190 
1191 	/*
1192 	 * Reinitialize the dump area to the new size.  If we fail to
1193 	 * resize the dump area, restore it to its original size.  We
1194 	 * must set the new volsize prior to calling dumpvp_resize() to
1195 	 * ensure that the device's stale size(9P) is not visible to
1196 	 * the dump subsystem.
1197 	 */
1198 	old_volsize = zv->zv_volsize;
1199 	zvol_size_changed(zv, volsize);
1200 
1201 #ifdef ZVOL_DUMP
1202 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1203 		if ((error = zvol_dumpify(zv)) != 0 ||
1204 		    (error = dumpvp_resize()) != 0) {
1205 			int dumpify_error;
1206 
1207 			(void) zvol_update_volsize(zv->zv_objset, old_volsize);
1208 			zvol_size_changed(zv, old_volsize);
1209 			dumpify_error = zvol_dumpify(zv);
1210 			error = dumpify_error ? dumpify_error : error;
1211 		}
1212 	}
1213 #endif	/* ZVOL_DUMP */
1214 
1215 #ifdef illumos
1216 	/*
1217 	 * Generate a LUN expansion event.
1218 	 */
1219 	if (error == 0) {
1220 		sysevent_id_t eid;
1221 		nvlist_t *attr;
1222 		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1223 
1224 		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
1225 		    zv->zv_minor);
1226 
1227 		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1228 		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
1229 
1230 		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
1231 		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
1232 
1233 		nvlist_free(attr);
1234 		kmem_free(physpath, MAXPATHLEN);
1235 	}
1236 #endif	/* illumos */
1237 	return (error);
1238 }
1239 
1240 int
1241 zvol_set_volsize(const char *name, uint64_t volsize)
1242 {
1243 	zvol_state_t *zv = NULL;
1244 	objset_t *os;
1245 	int error;
1246 	dmu_object_info_t doi;
1247 	uint64_t readonly;
1248 	boolean_t owned = B_FALSE;
1249 
1250 	error = dsl_prop_get_integer(name,
1251 	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
1252 	if (error != 0)
1253 		return (error);
1254 	if (readonly)
1255 		return (SET_ERROR(EROFS));
1256 
1257 	mutex_enter(&zfsdev_state_lock);
1258 	zv = zvol_minor_lookup(name);
1259 
1260 	if (zv == NULL || zv->zv_objset == NULL) {
1261 		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
1262 		    FTAG, &os)) != 0) {
1263 			mutex_exit(&zfsdev_state_lock);
1264 			return (error);
1265 		}
1266 		owned = B_TRUE;
1267 		if (zv != NULL)
1268 			zv->zv_objset = os;
1269 	} else {
1270 		os = zv->zv_objset;
1271 	}
1272 
1273 	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
1274 	    (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
1275 		goto out;
1276 
1277 	error = zvol_update_volsize(os, volsize);
1278 
1279 	if (error == 0 && zv != NULL)
1280 		error = zvol_update_live_volsize(zv, volsize);
1281 out:
1282 	if (owned) {
1283 		dmu_objset_disown(os, FTAG);
1284 		if (zv != NULL)
1285 			zv->zv_objset = NULL;
1286 	}
1287 	mutex_exit(&zfsdev_state_lock);
1288 	return (error);
1289 }
1290 
1291 /*ARGSUSED*/
1292 #ifdef illumos
1293 int
1294 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
1295 #endif
1296 #ifdef __FreeBSD__
1297 static int
1298 zvol_open(struct g_provider *pp, int flag, int count)
1299 #endif
1300 #ifdef __NetBSD__
1301 int
1302 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
1303 #endif
1304 {
1305 	zvol_state_t *zv;
1306 	int err = 0;
1307 
1308 #ifdef illumos
1309 	mutex_enter(&zfsdev_state_lock);
1310 
1311 	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
1312 	if (zv == NULL) {
1313 		mutex_exit(&zfsdev_state_lock);
1314 		return (SET_ERROR(ENXIO));
1315 	}
1316 
1317 	if (zv->zv_total_opens == 0)
1318 		err = zvol_first_open(zv);
1319 	if (err) {
1320 		mutex_exit(&zfsdev_state_lock);
1321 		return (err);
1322 	}
1323 #endif /* illumos */
1324 #ifdef __FreeBSD__
1325 	boolean_t locked = B_FALSE;
1326 
1327 	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
1328 		/*
1329 		 * If zfs_geom_probe_vdev_key is set, it means that zfs is
1330 		 * attempting to probe geom providers while looking for a
1331 		 * replacement for a missing VDEV.  In this case, the
1332 		 * spa_namespace_lock will not be held, but it is still illegal
1333 		 * to use a zvol as a vdev.  Deadlocks can result if another
1334 		 * thread has spa_namespace_lock
1335 		 */
1336 		return (EOPNOTSUPP);
1337 	}
1338 	/*
1339 	 * Protect against recursively entering spa_namespace_lock
1340 	 * when spa_open() is used for a pool backed by local ZVOLs.
1341 	 * This is needed since we replaced upstream zfsdev_state_lock
1342 	 * with spa_namespace_lock in the ZVOL code.
1343 	 * We are using the same trick as spa_open().
1344 	 * Note that calls in zvol_first_open which need to resolve
1345 	 * pool name to a spa object will enter spa_open()
1346 	 * recursively, but that function already has all the
1347 	 * necessary protection.
1348 	 */
1349 	if (!MUTEX_HELD(&zfsdev_state_lock)) {
1350 		mutex_enter(&zfsdev_state_lock);
1351 		locked = B_TRUE;
1352 	}
1353 
1354 	zv = pp->private;
1355 	if (zv == NULL) {
1356 		if (locked)
1357 			mutex_exit(&zfsdev_state_lock);
1358 		return (SET_ERROR(ENXIO));
1359 	}
1360 
1361 	if (zv->zv_total_opens == 0) {
1362 		err = zvol_first_open(zv);
1363 		if (err) {
1364 			if (locked)
1365 				mutex_exit(&zfsdev_state_lock);
1366 			return (err);
1367 		}
1368 		pp->mediasize = zv->zv_volsize;
1369 		pp->stripeoffset = 0;
1370 		pp->stripesize = zv->zv_volblocksize;
1371 	}
1372 #endif /* __FreeBSD__ */
1373 #ifdef __NetBSD__
1374 	mutex_enter(&zfsdev_state_lock);
1375 
1376 	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
1377 	if (zv == NULL) {
1378 		mutex_exit(&zfsdev_state_lock);
1379 		return (SET_ERROR(ENXIO));
1380 	}
1381 
1382 	if (zv->zv_total_opens == 0)
1383 		err = zvol_first_open(zv);
1384 	if (err) {
1385 		mutex_exit(&zfsdev_state_lock);
1386 		return (err);
1387 	}
1388 #endif
1389 
1390 	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
1391 		err = SET_ERROR(EROFS);
1392 		goto out;
1393 	}
1394 	if (zv->zv_flags & ZVOL_EXCL) {
1395 		err = SET_ERROR(EBUSY);
1396 		goto out;
1397 	}
1398 #ifdef FEXCL
1399 	if (flag & FEXCL) {
1400 		if (zv->zv_total_opens != 0) {
1401 			err = SET_ERROR(EBUSY);
1402 			goto out;
1403 		}
1404 		zv->zv_flags |= ZVOL_EXCL;
1405 	}
1406 #endif
1407 
1408 #ifdef illumos
1409 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
1410 		zv->zv_open_count[otyp]++;
1411 		zv->zv_total_opens++;
1412 	}
1413 	mutex_exit(&zfsdev_state_lock);
1414 #endif
1415 #ifdef __FreeBSD__
1416 	zv->zv_total_opens += count;
1417 	if (locked)
1418 		mutex_exit(&zfsdev_state_lock);
1419 #endif
1420 #ifdef __NetBSD__
1421 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
1422 		zv->zv_open_count[otyp]++;
1423 		zv->zv_total_opens++;
1424 	}
1425 	mutex_exit(&zfsdev_state_lock);
1426 #endif
1427 
1428 	return (err);
1429 out:
1430 	if (zv->zv_total_opens == 0)
1431 		zvol_last_close(zv);
1432 #ifdef __FreeBSD__
1433 	if (locked)
1434 #endif
1435 		mutex_exit(&zfsdev_state_lock);
1436 
1437 	return (err);
1438 }
1439 
1440 /*ARGSUSED*/
1441 #if defined(illumos) || defined(__NetBSD__)
1442 int
1443 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
1444 #endif
1445 #ifdef __FreeBSD__
1446 static int
1447 zvol_close(struct g_provider *pp, int flag, int count)
1448 #endif
1449 {
1450 #if defined(illumos) || defined(__NetBSD__)
1451 	minor_t minor = getminor(dev);
1452 	zvol_state_t *zv;
1453 	int error = 0;
1454 
1455 	mutex_enter(&zfsdev_state_lock);
1456 
1457 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1458 	if (zv == NULL) {
1459 		mutex_exit(&zfsdev_state_lock);
1460 		return (SET_ERROR(ENXIO));
1461 	}
1462 #endif /* illumos || __NetBSD__ */
1463 #ifdef __FreeBSD__
1464 	zvol_state_t *zv;
1465 	int error = 0;
1466 	boolean_t locked = B_FALSE;
1467 
1468 	/* See comment in zvol_open(). */
1469 	if (!MUTEX_HELD(&zfsdev_state_lock)) {
1470 		mutex_enter(&zfsdev_state_lock);
1471 		locked = B_TRUE;
1472 	}
1473 
1474 	zv = pp->private;
1475 	if (zv == NULL) {
1476 		if (locked)
1477 			mutex_exit(&zfsdev_state_lock);
1478 		return (SET_ERROR(ENXIO));
1479 	}
1480 #endif /* __FreeBSD__ */
1481 
1482 	if (zv->zv_flags & ZVOL_EXCL) {
1483 		ASSERT(zv->zv_total_opens == 1);
1484 		zv->zv_flags &= ~ZVOL_EXCL;
1485 	}
1486 
1487 	/*
1488 	 * If the open count is zero, this is a spurious close.
1489 	 * That indicates a bug in the kernel / DDI framework.
1490 	 */
1491 #if defined(illumos) || defined(__NetBSD__)
1492 	ASSERT(zv->zv_open_count[otyp] != 0);
1493 #endif
1494 	ASSERT(zv->zv_total_opens != 0);
1495 
1496 	/*
1497 	 * You may get multiple opens, but only one close.
1498 	 */
1499 #if defined(illumos) || defined(__NetBSD__)
1500 	zv->zv_open_count[otyp]--;
1501 	zv->zv_total_opens--;
1502 #else
1503 	zv->zv_total_opens -= count;
1504 #endif
1505 
1506 	if (zv->zv_total_opens == 0)
1507 		zvol_last_close(zv);
1508 
1509 #if defined(illumos) || defined(__NetBSD__)
1510 	mutex_exit(&zfsdev_state_lock);
1511 #else
1512 	if (locked)
1513 		mutex_exit(&zfsdev_state_lock);
1514 #endif
1515 	return (error);
1516 }
1517 
1518 static void
1519 zvol_get_done(zgd_t *zgd, int error)
1520 {
1521 	if (zgd->zgd_db)
1522 		dmu_buf_rele(zgd->zgd_db, zgd);
1523 
1524 	zfs_range_unlock(zgd->zgd_rl);
1525 
1526 	if (error == 0 && zgd->zgd_bp)
1527 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1528 
1529 	kmem_free(zgd, sizeof (zgd_t));
1530 }
1531 
1532 /*
1533  * Get data to generate a TX_WRITE intent log record.
1534  */
1535 static int
1536 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1537 {
1538 	zvol_state_t *zv = arg;
1539 	objset_t *os = zv->zv_objset;
1540 	uint64_t object = ZVOL_OBJ;
1541 	uint64_t offset = lr->lr_offset;
1542 	uint64_t size = lr->lr_length;	/* length of user data */
1543 	blkptr_t *bp = &lr->lr_blkptr;
1544 	dmu_buf_t *db;
1545 	zgd_t *zgd;
1546 	int error;
1547 
1548 	ASSERT(zio != NULL);
1549 	ASSERT(size != 0);
1550 
1551 	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1552 	zgd->zgd_zilog = zv->zv_zilog;
1553 	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
1554 
1555 	/*
1556 	 * Write records come in two flavors: immediate and indirect.
1557 	 * For small writes it's cheaper to store the data with the
1558 	 * log record (immediate); for large writes it's cheaper to
1559 	 * sync the data and get a pointer to it (indirect) so that
1560 	 * we don't have to write the data twice.
1561 	 */
1562 	if (buf != NULL) {	/* immediate write */
1563 		error = dmu_read(os, object, offset, size, buf,
1564 		    DMU_READ_NO_PREFETCH);
1565 	} else {
1566 		size = zv->zv_volblocksize;
1567 		offset = P2ALIGN(offset, size);
1568 		error = dmu_buf_hold(os, object, offset, zgd, &db,
1569 		    DMU_READ_NO_PREFETCH);
1570 		if (error == 0) {
1571 			blkptr_t *obp = dmu_buf_get_blkptr(db);
1572 			if (obp) {
1573 				ASSERT(BP_IS_HOLE(bp));
1574 				*bp = *obp;
1575 			}
1576 
1577 			zgd->zgd_db = db;
1578 			zgd->zgd_bp = bp;
1579 
1580 			ASSERT(db->db_offset == offset);
1581 			ASSERT(db->db_size == size);
1582 
1583 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1584 			    zvol_get_done, zgd);
1585 
1586 			if (error == 0)
1587 				return (0);
1588 		}
1589 	}
1590 
1591 	zvol_get_done(zgd, error);
1592 
1593 	return (error);
1594 }
1595 
1596 /*
1597  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1598  *
1599  * We store data in the log buffers if it's small enough.
1600  * Otherwise we will later flush the data out via dmu_sync().
1601  */
1602 ssize_t zvol_immediate_write_sz = 32768;
1603 #ifdef _KERNEL
1604 SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
1605     &zvol_immediate_write_sz, 0, "Minimum size for an indirect log write");
1606 #endif
1607 
1608 static void
1609 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1610     boolean_t sync)
1611 {
1612 	uint32_t blocksize = zv->zv_volblocksize;
1613 	zilog_t *zilog = zv->zv_zilog;
1614 	itx_wr_state_t write_state;
1615 
1616 	if (zil_replaying(zilog, tx))
1617 		return;
1618 
1619 	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1620 		write_state = WR_INDIRECT;
1621 	else if (!spa_has_slogs(zilog->zl_spa) &&
1622 	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
1623 		write_state = WR_INDIRECT;
1624 	else if (sync)
1625 		write_state = WR_COPIED;
1626 	else
1627 		write_state = WR_NEED_COPY;
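	/*
	 * e.g. with no separate log device and a 128K volblocksize (above
	 * zvol_immediate_write_sz), a full-block write goes out as
	 * WR_INDIRECT, while a small synchronous write is copied into the
	 * itx itself (WR_COPIED).
	 */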
1628 
1629 	while (resid) {
1630 		itx_t *itx;
1631 		lr_write_t *lr;
1632 		itx_wr_state_t wr_state = write_state;
1633 		ssize_t len = resid;
1634 
1635 		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
1636 			wr_state = WR_NEED_COPY;
1637 		else if (wr_state == WR_INDIRECT)
1638 			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
1639 
1640 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1641 		    (wr_state == WR_COPIED ? len : 0));
1642 		lr = (lr_write_t *)&itx->itx_lr;
1643 		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
1644 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1645 			zil_itx_destroy(itx);
1646 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1647 			lr = (lr_write_t *)&itx->itx_lr;
1648 			wr_state = WR_NEED_COPY;
1649 		}
1650 
1651 		itx->itx_wr_state = wr_state;
1652 		lr->lr_foid = ZVOL_OBJ;
1653 		lr->lr_offset = off;
1654 		lr->lr_length = len;
1655 		lr->lr_blkoff = 0;
1656 		BP_ZERO(&lr->lr_blkptr);
1657 
1658 		itx->itx_private = zv;
1659 
1660 		if (!sync && (zv->zv_sync_cnt == 0))
1661 			itx->itx_sync = B_FALSE;
1662 
1663 		zil_itx_assign(zilog, itx, tx);
1664 
1665 		off += len;
1666 		resid -= len;
1667 	}
1668 }
1669 
1670 #ifdef illumos
1671 static int
1672 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1673     uint64_t size, boolean_t doread, boolean_t isdump)
1674 {
1675 	vdev_disk_t *dvd;
1676 	int c;
1677 	int numerrors = 0;
1678 
1679 	if (vd->vdev_ops == &vdev_mirror_ops ||
1680 	    vd->vdev_ops == &vdev_replacing_ops ||
1681 	    vd->vdev_ops == &vdev_spare_ops) {
1682 		for (c = 0; c < vd->vdev_children; c++) {
1683 			int err = zvol_dumpio_vdev(vd->vdev_child[c],
1684 			    addr, offset, origoffset, size, doread, isdump);
1685 			if (err != 0) {
1686 				numerrors++;
1687 			} else if (doread) {
1688 				break;
1689 			}
1690 		}
1691 	}
1692 
1693 	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1694 		return (numerrors < vd->vdev_children ? 0 : EIO);
1695 
1696 	if (doread && !vdev_readable(vd))
1697 		return (SET_ERROR(EIO));
1698 	else if (!doread && !vdev_writeable(vd))
1699 		return (SET_ERROR(EIO));
1700 
1701 	if (vd->vdev_ops == &vdev_raidz_ops) {
1702 		return (vdev_raidz_physio(vd,
1703 		    addr, size, offset, origoffset, doread, isdump));
1704 	}
1705 
1706 	offset += VDEV_LABEL_START_SIZE;
1707 
1708 	if (ddi_in_panic() || isdump) {
1709 		ASSERT(!doread);
1710 		if (doread)
1711 			return (SET_ERROR(EIO));
1712 		dvd = vd->vdev_tsd;
1713 		ASSERT3P(dvd, !=, NULL);
1714 		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1715 		    lbtodb(size)));
1716 	} else {
1717 		dvd = vd->vdev_tsd;
1718 		ASSERT3P(dvd, !=, NULL);
1719 		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1720 		    offset, doread ? B_READ : B_WRITE));
1721 	}
1722 }
1723 
1724 static int
1725 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1726     boolean_t doread, boolean_t isdump)
1727 {
1728 	vdev_t *vd;
1729 	int error;
1730 	zvol_extent_t *ze;
1731 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1732 
1733 	/* Must be sector aligned, and not straddle a block boundary. */
1734 	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1735 	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1736 		return (SET_ERROR(EINVAL));
1737 	}
1738 	ASSERT(size <= zv->zv_volblocksize);
1739 
1740 	/* Locate the extent this belongs to */
1741 	ze = list_head(&zv->zv_extents);
1742 	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1743 		offset -= ze->ze_nblks * zv->zv_volblocksize;
1744 		ze = list_next(&zv->zv_extents, ze);
1745 	}
1746 
1747 	if (ze == NULL)
1748 		return (SET_ERROR(EINVAL));
1749 
1750 	if (!ddi_in_panic())
1751 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1752 
1753 	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1754 	offset += DVA_GET_OFFSET(&ze->ze_dva);
1755 	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1756 	    size, doread, isdump);
1757 
1758 	if (!ddi_in_panic())
1759 		spa_config_exit(spa, SCL_STATE, FTAG);
1760 
1761 	return (error);
1762 }
1763 #else /* !illumos */
1764 static inline int
1765 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1766     boolean_t doread, boolean_t isdump)
1767 {
1768 	return (0);
1769 }
1770 #endif /* illumos */
1771 
1772 #ifdef illumos
1773 int
1774 zvol_strategy(buf_t *bp)
1775 #endif
1776 #ifdef __FreeBSD__
1777 void
1778 zvol_strategy(struct bio *bp)
1779 #endif
1780 #ifdef __NetBSD__
1781 void
1782 zvol_strategy(buf_t *bp)
1783 #endif
1784 {
1785 	zvol_state_t *zv;
1786 	uint64_t off, volsize;
1787 	size_t resid;
1788 	char *addr;
1789 	objset_t *os;
1790 	rl_t *rl;
1791 	int error = 0;
1792 #ifdef illumos
1793 	boolean_t doread = bp->b_flags & B_READ;
1794 #else
1795 	boolean_t doread = 0;
1796 #endif
1797 	boolean_t is_dumpified;
1798 	boolean_t sync;
1799 
1800 #ifdef illumos
1801 	zfs_soft_state_t *zs = NULL;
1802 
1803 	if (getminor(bp->b_edev) == 0) {
1804 		error = SET_ERROR(EINVAL);
1805 	} else {
1806 		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
1807 		if (zs == NULL)
1808 			error = SET_ERROR(ENXIO);
1809 		else if (zs->zss_type != ZSST_ZVOL)
1810 			error = SET_ERROR(EINVAL);
1811 	}
1812 
1813 	if (error) {
1814 		bioerror(bp, error);
1815 		biodone(bp);
1816 		return (0);
1817 	}
1818 
1819 	zv = zs->zss_data;
1820 
1821 	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
1822 		bioerror(bp, EROFS);
1823 		biodone(bp);
1824 		return (0);
1825 	}
1826 
1827 	off = ldbtob(bp->b_blkno);
1828 #endif /* illumos */
1829 #ifdef __FreeBSD__
1830 	if (bp->bio_to)
1831 		zv = bp->bio_to->private;
1832 	else
1833 		zv = bp->bio_dev->si_drv2;
1834 
1835 	if (zv == NULL) {
1836 		error = SET_ERROR(ENXIO);
1837 		goto out;
1838 	}
1839 
1840 	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
1841 		error = SET_ERROR(EROFS);
1842 		goto out;
1843 	}
1844 
1845 	switch (bp->bio_cmd) {
1846 	case BIO_FLUSH:
1847 		goto sync;
1848 	case BIO_READ:
1849 		doread = 1;
1850 	case BIO_WRITE:
1851 	case BIO_DELETE:
1852 		break;
1853 	default:
1854 		error = EOPNOTSUPP;
1855 		goto out;
1856 	}
1857 
1858 	off = bp->bio_offset;
1859 #endif /* __FreeBSD__ */
1860 #ifdef __NetBSD__
1861 	zfs_soft_state_t *zs = NULL;
1862 
1863 	if (getminor(bp->b_edev) == 0) {
1864 		error = SET_ERROR(EINVAL);
1865 	} else {
1866 		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
1867 		if (zs == NULL)
1868 			error = SET_ERROR(ENXIO);
1869 		else if (zs->zss_type != ZSST_ZVOL)
1870 			error = SET_ERROR(EINVAL);
1871 	}
1872 
1873 	if (error) {
1874 		bioerror(bp, error);
1875 		biodone(bp);
1876 		return;
1877 	}
1878 
1879 	zv = zs->zss_data;
1880 
1881 	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
1882 		bioerror(bp, EROFS);
1883 		biodone(bp);
1884 		return;
1885 	}
1886 	off = (uint64_t)bp->b_blkno * DEV_BSIZE;
1887 #endif
1888 
1889 	volsize = zv->zv_volsize;
1890 
1891 	os = zv->zv_objset;
1892 	ASSERT(os != NULL);
1893 
1894 #ifdef illumos
1895 	bp_mapin(bp);
1896 	addr = bp->b_un.b_addr;
1897 	resid = bp->b_bcount;
1898 
1899 	if (resid > 0 && (off < 0 || off >= volsize)) {
1900 		bioerror(bp, EIO);
1901 		biodone(bp);
1902 		return (0);
1903 	}
1904 
1905 	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1906 	sync = ((!(bp->b_flags & B_ASYNC) &&
1907 	    !(zv->zv_flags & ZVOL_WCE)) ||
1908 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1909 	    !doread && !is_dumpified;
1910 #endif /* illumos */
1911 #ifdef __FreeBSD__
1912 	addr = bp->bio_data;
1913 	resid = bp->bio_length;
1914 
1915 	if (resid > 0 && (off < 0 || off >= volsize)) {
1916 		error = SET_ERROR(EIO);
1917 		goto out;
1918 	}
1919 
1920 	is_dumpified = B_FALSE;
1921 	sync = !doread && !is_dumpified &&
1922 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
1923 #endif /* __FreeBSD__ */
1924 #ifdef __NetBSD__
1925 	addr = bp->b_data;
1926 	resid = bp->b_bcount;
1927 
1928 	if (resid > 0 && off >= volsize) {
1929 		bioerror(bp, EIO);
1930 		biodone(bp);
1931 		return;
1932 	}
1933 
1934 	is_dumpified = B_FALSE;
1935 	sync = ((!(bp->b_flags & B_ASYNC) &&
1936 	    !(zv->zv_flags & ZVOL_WCE)) ||
1937 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1938 	    !doread && !is_dumpified;
1939 
1940 	mutex_enter(&zv->zv_dklock);
1941 	disk_busy(&zv->zv_dk);
1942 	mutex_exit(&zv->zv_dklock);
1943 #endif
1944 
1945 	/*
1946 	 * There must be no buffer changes when doing a dmu_sync() because
1947 	 * we can't change the data whilst calculating the checksum.
1948 	 */
1949 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
1950 	    doread ? RL_READER : RL_WRITER);
1951 
1952 #ifdef __FreeBSD__
1953 	if (bp->bio_cmd == BIO_DELETE) {
1954 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1955 		error = dmu_tx_assign(tx, TXG_WAIT);
1956 		if (error != 0) {
1957 			dmu_tx_abort(tx);
1958 		} else {
1959 			zvol_log_truncate(zv, tx, off, resid, sync);
1960 			dmu_tx_commit(tx);
1961 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1962 			    off, resid);
1963 			resid = 0;
1964 		}
1965 		goto unlock;
1966 	}
1967 #endif
1968 	while (resid != 0 && off < volsize) {
1969 		size_t size = MIN(resid, zvol_maxphys);
1970 		if (is_dumpified) {
1971 			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1972 			error = zvol_dumpio(zv, addr, off, size,
1973 			    doread, B_FALSE);
1974 		} else if (doread) {
1975 			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1976 			    DMU_READ_PREFETCH);
1977 		} else {
1978 			dmu_tx_t *tx = dmu_tx_create(os);
1979 			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1980 			error = dmu_tx_assign(tx, TXG_WAIT);
1981 			if (error) {
1982 				dmu_tx_abort(tx);
1983 			} else {
1984 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1985 				zvol_log_write(zv, tx, off, size, sync);
1986 				dmu_tx_commit(tx);
1987 			}
1988 		}
1989 		if (error) {
1990 			/* convert checksum errors into IO errors */
1991 			if (error == ECKSUM)
1992 				error = SET_ERROR(EIO);
1993 			break;
1994 		}
1995 		off += size;
1996 		addr += size;
1997 		resid -= size;
1998 	}
1999 #ifdef __FreeBSD__
2000 unlock:
2001 #endif
2002 	zfs_range_unlock(rl);
2003 
2004 #ifdef illumos
2005 	if ((bp->b_resid = resid) == bp->b_bcount)
2006 		bioerror(bp, off > volsize ? EINVAL : error);
2007 
2008 	if (sync)
2009 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2010 	biodone(bp);
2011 
2012 	return (0);
2013 #endif /* illumos */
2014 #ifdef __FreeBSD__
2015 	bp->bio_completed = bp->bio_length - resid;
2016 	if (bp->bio_completed < bp->bio_length && off > volsize)
2017 		error = EINVAL;
2018 
2019 	if (sync) {
2020 sync:
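		/* BIO_FLUSH jumps straight to this label to commit the ZIL. */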
2021 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2022 	}
2023 out:
2024 	if (bp->bio_to)
2025 		g_io_deliver(bp, error);
2026 	else
2027 		biofinish(bp, NULL, error);
2028 #endif /* __FreeBSD__ */
2029 #ifdef __NetBSD__
2030 	if ((bp->b_resid = resid) == bp->b_bcount)
2031 		bioerror(bp, off > volsize ? EINVAL : error);
2032 
2033 	if (sync)
2034 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2035 	mutex_enter(&zv->zv_dklock);
2036 	disk_unbusy(&zv->zv_dk, bp->b_bcount - bp->b_resid, doread);
2037 	mutex_exit(&zv->zv_dklock);
2038 	biodone(bp);
2039 #endif /* __NetBSD__ */
2040 }
2041 
2042 #if defined(illumos) || defined(__NetBSD__)
2043 /*
2044  * Set the buffer count to the zvol maximum transfer.
2045  * Using our own routine instead of the default minphys()
2046  * means that for larger writes we write bigger buffers on X86
2047  * (128K instead of 56K) and flush the disk write cache less often
2048  * (every zvol_maxphys - currently 1MB) instead of minphys (currently
2049  * 56K on X86 and 128K on sparc).
2050  */
2051 void
2052 zvol_minphys(struct buf *bp)
2053 {
2054 	if (bp->b_bcount > zvol_maxphys)
2055 		bp->b_bcount = zvol_maxphys;
2056 }
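
/*
 * Usage sketch (mirrors the physio(9) calls in zvol_read()/zvol_write()):
 *
 *	error = physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uio);
 *
 * physio() applies the minphys routine to each buffer before invoking the
 * strategy routine, so no single transfer exceeds zvol_maxphys.
 */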
2057 #endif
2058 
2059 #ifdef illumos
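/*
 * Dump entry point: copies crash dump data via zvol_dumpio(), which uses
 * the extent map built at dumpify time rather than the normal DMU path.
 */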
2060 int
2061 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
2062 {
2063 	minor_t minor = getminor(dev);
2064 	zvol_state_t *zv;
2065 	int error = 0;
2066 	uint64_t size;
2067 	uint64_t boff;
2068 	uint64_t resid;
2069 
2070 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
2071 	if (zv == NULL)
2072 		return (SET_ERROR(ENXIO));
2073 
2074 	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
2075 		return (SET_ERROR(EINVAL));
2076 
2077 	boff = ldbtob(blkno);
2078 	resid = ldbtob(nblocks);
2079 
2080 	VERIFY3U(boff + resid, <=, zv->zv_volsize);
2081 
2082 	while (resid) {
2083 		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
2084 		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
2085 		if (error)
2086 			break;
2087 		boff += size;
2088 		addr += size;
2089 		resid -= size;
2090 	}
2091 
2092 	return (error);
2093 }
2094 #endif
2095 
2096 /*ARGSUSED*/
2097 #if defined(illumos) || defined(__NetBSD__)
2098 int
2099 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
2100 #endif
2101 #ifdef __FreeBSD__
2102 int
2103 zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
2104 #endif
2105 {
2106 	zvol_state_t *zv;
2107 	uint64_t volsize;
2108 	rl_t *rl;
2109 	int error = 0;
2110 
2111 #if defined(illumos) || defined(__NetBSD__)
2112 	minor_t minor = getminor(dev);
2113 
2114 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
2115 	if (zv == NULL)
2116 		return (SET_ERROR(ENXIO));
2117 #else
2118 	zv = dev->si_drv2;
2119 #endif
2120 
2121 	volsize = zv->zv_volsize;
2122 	/* uio_loffset == volsize isn't an error, as it's required for EOF processing. */
2123 	if (uio->uio_resid > 0 &&
2124 	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
2125 		return (SET_ERROR(EIO));
2126 
2127 #ifdef illumos
2128 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
2129 		error = physio(zvol_strategy, NULL, dev, B_READ,
2130 		    zvol_minphys, uio);
2131 		return (error);
2132 	}
2133 #endif
2134 
2135 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
2136 	    RL_READER);
2137 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
2138 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
2139 
2140 		/* don't read past the end */
2141 		if (bytes > volsize - uio->uio_loffset)
2142 			bytes = volsize - uio->uio_loffset;
2143 
2144 		error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
2145 		if (error) {
2146 			/* convert checksum errors into IO errors */
2147 			if (error == ECKSUM)
2148 				error = SET_ERROR(EIO);
2149 			break;
2150 		}
2151 	}
2152 	zfs_range_unlock(rl);
2153 	return (error);
2154 }
2155 
2156 /*ARGSUSED*/
2157 #if defined(illumos) || defined(__NetBSD__)
2158 int
2159 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
2160 #else
2161 int
2162 zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
2163 #endif
2164 {
2165 	zvol_state_t *zv;
2166 	uint64_t volsize;
2167 	rl_t *rl;
2168 	int error = 0;
2169 	boolean_t sync;
2170 
2171 #if defined(illumos) || defined(__NetBSD__)
2172 	minor_t minor = getminor(dev);
2173 
2174 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
2175 	if (zv == NULL)
2176 		return (SET_ERROR(ENXIO));
2177 #else
2178 	zv = dev->si_drv2;
2179 #endif
2180 
2181 	volsize = zv->zv_volsize;
2182 	/* uio_loffset == volsize isn't an error, as it's required for EOF processing. */
2183 	if (uio->uio_resid > 0 &&
2184 	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
2185 		return (SET_ERROR(EIO));
2186 
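	/*
	 * Platform-specific setup: on illumos a dumpified zvol is routed
	 * through physio(), and each platform then decides whether writes
	 * must be committed to the ZIL before we return (write cache
	 * disabled, IO_SYNC, or unconditionally on NetBSD; a sync=always
	 * dataset forces it everywhere).
	 */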
2187 #ifdef illumos
2188 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
2189 		error = physio(zvol_strategy, NULL, dev, B_WRITE,
2190 		    zvol_minphys, uio);
2191 		return (error);
2192 	}
2193 
2194 	sync = !(zv->zv_flags & ZVOL_WCE) ||
2195 #endif
2196 #ifdef __FreeBSD__
2197 	sync = (ioflag & IO_SYNC) ||
2198 #endif
2199 #ifdef __NetBSD__
2200 	sync = 1 ||
2201 #endif
2202 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
2203 
2204 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
2205 	    RL_WRITER);
2206 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
2207 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
2208 		uint64_t off = uio->uio_loffset;
2209 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
2210 
2211 		if (bytes > volsize - off)	/* don't write past the end */
2212 			bytes = volsize - off;
2213 
2214 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
2215 		error = dmu_tx_assign(tx, TXG_WAIT);
2216 		if (error) {
2217 			dmu_tx_abort(tx);
2218 			break;
2219 		}
2220 		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
2221 		if (error == 0)
2222 			zvol_log_write(zv, tx, off, bytes, sync);
2223 		dmu_tx_commit(tx);
2224 
2225 		if (error)
2226 			break;
2227 	}
2228 	zfs_range_unlock(rl);
2229 	if (sync)
2230 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2231 	return (error);
2232 }
2233 
2234 #ifdef illumos
2235 int
2236 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
2237 {
2238 	struct uuid uuid = EFI_RESERVED;
2239 	efi_gpe_t gpe = { 0 };
2240 	uint32_t crc;
2241 	dk_efi_t efi;
2242 	int length;
2243 	char *ptr;
2244 
2245 	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
2246 		return (SET_ERROR(EFAULT));
2247 	ptr = (char *)(uintptr_t)efi.dki_data_64;
2248 	length = efi.dki_length;
2249 	/*
2250 	 * Some clients may attempt to request a PMBR for the
2251 	 * zvol.  Currently this interface will return EINVAL to
2252 	 * such requests.  These requests could be supported by
2253 	 * adding a check for lba == 0 and consing up an appropriate
2254 	 * PMBR.
2255 	 */
2256 	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
2257 		return (SET_ERROR(EINVAL));
2258 
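	/*
	 * Layout produced below: LBA 1 carries the GPT header and LBA 2 the
	 * single partition entry; the partition spans LBA 34 through
	 * (vs >> bs) - 1, i.e. the remainder of the volume.
	 */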
2259 	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
2260 	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
2261 	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
2262 
2263 	if (efi.dki_lba == 1) {
2264 		efi_gpt_t gpt = { 0 };
2265 
2266 		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
2267 		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
2268 		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
2269 		gpt.efi_gpt_MyLBA = LE_64(1ULL);
2270 		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
2271 		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
2272 		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
2273 		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
2274 		gpt.efi_gpt_SizeOfPartitionEntry =
2275 		    LE_32(sizeof (efi_gpe_t));
2276 		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
2277 		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2278 		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
2279 		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2280 		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
2281 		    flag))
2282 			return (SET_ERROR(EFAULT));
2283 		ptr += sizeof (gpt);
2284 		length -= sizeof (gpt);
2285 	}
2286 	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
2287 	    length), flag))
2288 		return (SET_ERROR(EFAULT));
2289 	return (0);
2290 }
2291 
2292 /*
2293  * BEGIN entry points to allow external callers access to the volume.
2294  */
2295 /*
2296  * Return the volume parameters needed for access from an external caller.
2297  * These values are invariant as long as the volume is held open.
2298  */
2299 int
2300 zvol_get_volume_params(minor_t minor, uint64_t *blksize,
2301     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
2302     void **rl_hdl, void **bonus_hdl)
2303 {
2304 	zvol_state_t *zv;
2305 
2306 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
2307 	if (zv == NULL)
2308 		return (SET_ERROR(ENXIO));
2309 	if (zv->zv_flags & ZVOL_DUMPIFIED)
2310 		return (SET_ERROR(ENXIO));
2311 
2312 	ASSERT(blksize && max_xfer_len && minor_hdl &&
2313 	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
2314 
2315 	*blksize = zv->zv_volblocksize;
2316 	*max_xfer_len = (uint64_t)zvol_maxphys;
2317 	*minor_hdl = zv;
2318 	*objset_hdl = zv->zv_objset;
2319 	*zil_hdl = zv->zv_zilog;
2320 	*rl_hdl = &zv->zv_znode;
2321 	*bonus_hdl = zv->zv_dbuf;
2322 	return (0);
2323 }
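
/*
 * Caller's view (sketch only; the variable names are hypothetical):
 *
 *	uint64_t blksize, maxlen, size;
 *	int wce;
 *	void *mhdl, *oshdl, *zilhdl, *rlhdl, *bonushdl;
 *
 *	if (zvol_get_volume_params(minor, &blksize, &maxlen, &mhdl,
 *	    &oshdl, &zilhdl, &rlhdl, &bonushdl) == 0) {
 *		size = zvol_get_volume_size(mhdl);
 *		wce = zvol_get_volume_wce(mhdl);
 *	}
 *
 * The handles remain valid for as long as the volume is held open.
 */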
2324 
2325 /*
2326  * Return the current volume size to an external caller.
2327  * The size can change while the volume is open.
2328  */
2329 uint64_t
2330 zvol_get_volume_size(void *minor_hdl)
2331 {
2332 	zvol_state_t *zv = minor_hdl;
2333 
2334 	return (zv->zv_volsize);
2335 }
2336 
2337 /*
2338  * Return the current WCE setting to an external caller.
2339  * The WCE setting can change while the volume is open.
2340  */
2341 int
2342 zvol_get_volume_wce(void *minor_hdl)
2343 {
2344 	zvol_state_t *zv = minor_hdl;
2345 
2346 	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
2347 }
2348 
2349 /*
2350  * Entry point for external callers to zvol_log_write
2351  */
2352 void
2353 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
2354     boolean_t sync)
2355 {
2356 	zvol_state_t *zv = minor_hdl;
2357 
2358 	zvol_log_write(zv, tx, off, resid, sync);
2359 }
2360 /*
2361  * END entry points to allow external callers access to the volume.
2362  */
2363 #endif	/* illumos */
2364 
2365 /*
2366  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
2367  */
2368 static void
2369 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
2370     boolean_t sync)
2371 {
2372 	itx_t *itx;
2373 	lr_truncate_t *lr;
2374 	zilog_t *zilog = zv->zv_zilog;
2375 
2376 	if (zil_replaying(zilog, tx))
2377 		return;
2378 
2379 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
2380 	lr = (lr_truncate_t *)&itx->itx_lr;
2381 	lr->lr_foid = ZVOL_OBJ;
2382 	lr->lr_offset = off;
2383 	lr->lr_length = len;
2384 
2385 	itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
2386 	zil_itx_assign(zilog, itx, tx);
2387 }
2388 
2389 #ifdef illumos
2390 /*
2391  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
2392  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
2393  */
2394 /*ARGSUSED*/
2395 int
2396 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
2397 {
2398 	zvol_state_t *zv;
2399 	struct dk_callback *dkc;
2400 	int error = 0;
2401 	rl_t *rl;
2402 
2403 	mutex_enter(&zfsdev_state_lock);
2404 
2405 	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
2406 
2407 	if (zv == NULL) {
2408 		mutex_exit(&zfsdev_state_lock);
2409 		return (SET_ERROR(ENXIO));
2410 	}
2411 	ASSERT(zv->zv_total_opens > 0);
2412 
2413 	switch (cmd) {
2414 
2415 	case DKIOCINFO:
2416 	{
2417 		struct dk_cinfo dki;
2418 
2419 		bzero(&dki, sizeof (dki));
2420 		(void) strcpy(dki.dki_cname, "zvol");
2421 		(void) strcpy(dki.dki_dname, "zvol");
2422 		dki.dki_ctype = DKC_UNKNOWN;
2423 		dki.dki_unit = getminor(dev);
2424 		dki.dki_maxtransfer =
2425 		    1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
2426 		mutex_exit(&zfsdev_state_lock);
2427 		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
2428 			error = SET_ERROR(EFAULT);
2429 		return (error);
2430 	}
2431 
2432 	case DKIOCGMEDIAINFO:
2433 	{
2434 		struct dk_minfo dkm;
2435 
2436 		bzero(&dkm, sizeof (dkm));
2437 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
2438 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
2439 		dkm.dki_media_type = DK_UNKNOWN;
2440 		mutex_exit(&zfsdev_state_lock);
2441 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
2442 			error = SET_ERROR(EFAULT);
2443 		return (error);
2444 	}
2445 
2446 	case DKIOCGMEDIAINFOEXT:
2447 	{
2448 		struct dk_minfo_ext dkmext;
2449 
2450 		bzero(&dkmext, sizeof (dkmext));
2451 		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
2452 		dkmext.dki_pbsize = zv->zv_volblocksize;
2453 		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
2454 		dkmext.dki_media_type = DK_UNKNOWN;
2455 		mutex_exit(&zfsdev_state_lock);
2456 		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
2457 			error = SET_ERROR(EFAULT);
2458 		return (error);
2459 	}
2460 
2461 	case DKIOCGETEFI:
2462 	{
2463 		uint64_t vs = zv->zv_volsize;
2464 		uint8_t bs = zv->zv_min_bs;
2465 
2466 		mutex_exit(&zfsdev_state_lock);
2467 		error = zvol_getefi((void *)arg, flag, vs, bs);
2468 		return (error);
2469 	}
2470 
2471 	case DKIOCFLUSHWRITECACHE:
2472 		dkc = (struct dk_callback *)arg;
2473 		mutex_exit(&zfsdev_state_lock);
2474 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2475 		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
2476 			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
2477 			error = 0;
2478 		}
2479 		return (error);
2480 
2481 	case DKIOCGETWCE:
2482 	{
2483 		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
2484 		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
2485 		    flag))
2486 			error = SET_ERROR(EFAULT);
2487 		break;
2488 	}
2489 	case DKIOCSETWCE:
2490 	{
2491 		int wce;
2492 		if (ddi_copyin((void *)arg, &wce, sizeof (int),
2493 		    flag)) {
2494 			error = SET_ERROR(EFAULT);
2495 			break;
2496 		}
2497 		if (wce) {
2498 			zv->zv_flags |= ZVOL_WCE;
2499 			mutex_exit(&zfsdev_state_lock);
2500 		} else {
2501 			zv->zv_flags &= ~ZVOL_WCE;
2502 			mutex_exit(&zfsdev_state_lock);
2503 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2504 		}
2505 		return (0);
2506 	}
2507 
2508 	case DKIOCGGEOM:
2509 	case DKIOCGVTOC:
2510 		/*
2511 		 * commands using these (like prtvtoc) expect ENOTSUP
2512 		 * since we're emulating an EFI label
2513 		 */
2514 		error = SET_ERROR(ENOTSUP);
2515 		break;
2516 
2517 	case DKIOCDUMPINIT:
2518 		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
2519 		    RL_WRITER);
2520 		error = zvol_dumpify(zv);
2521 		zfs_range_unlock(rl);
2522 		break;
2523 
2524 	case DKIOCDUMPFINI:
2525 		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
2526 			break;
2527 		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
2528 		    RL_WRITER);
2529 		error = zvol_dump_fini(zv);
2530 		zfs_range_unlock(rl);
2531 		break;
2532 
2533 	case DKIOCFREE:
2534 	{
2535 		dkioc_free_t df;
2536 		dmu_tx_t *tx;
2537 
2538 		if (!zvol_unmap_enabled)
2539 			break;
2540 
2541 		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
2542 			error = SET_ERROR(EFAULT);
2543 			break;
2544 		}
2545 
2546 		/*
2547 		 * Apply Postel's Law to length-checking.  If they overshoot,
2548 		 * just blank out until the end, if there's a need to blank
2549 		 * out anything.
2550 		 */
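		/*
		 * E.g. a df_start one sector short of zv_volsize with a
		 * large df_length frees only that final sector;
		 * dmu_free_long_range() stops at the end of the object.
		 */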
2551 		if (df.df_start >= zv->zv_volsize)
2552 			break;	/* No need to do anything... */
2553 
2554 		mutex_exit(&zfsdev_state_lock);
2555 
2556 		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
2557 		    RL_WRITER);
2558 		tx = dmu_tx_create(zv->zv_objset);
2559 		dmu_tx_mark_netfree(tx);
2560 		error = dmu_tx_assign(tx, TXG_WAIT);
2561 		if (error != 0) {
2562 			dmu_tx_abort(tx);
2563 		} else {
2564 			zvol_log_truncate(zv, tx, df.df_start,
2565 			    df.df_length, B_TRUE);
2566 			dmu_tx_commit(tx);
2567 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
2568 			    df.df_start, df.df_length);
2569 		}
2570 
2571 		zfs_range_unlock(rl);
2572 
2573 		/*
2574 		 * If the write-cache is disabled, 'sync' property
2575 		 * is set to 'always', or if the caller is asking for
2576 		 * a synchronous free, commit this operation to the zil.
2577 		 * This will sync any previous uncommitted writes to the
2578 		 * zvol object.
2579 		 * Can be overridden by the zvol_unmap_sync_enabled tunable.
2580 		 */
2581 		if ((error == 0) && zvol_unmap_sync_enabled &&
2582 		    (!(zv->zv_flags & ZVOL_WCE) ||
2583 		    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
2584 		    (df.df_flags & DF_WAIT_SYNC))) {
2585 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2586 		}
2587 
2588 		return (error);
2589 	}
2590 
2591 	default:
2592 		error = SET_ERROR(ENOTTY);
2593 		break;
2594 
2595 	}
2596 	mutex_exit(&zfsdev_state_lock);
2597 	return (error);
2598 }
2599 #endif	/* illumos */
2600 
2601 int
2602 zvol_busy(void)
2603 {
2604 	return (zvol_minors != 0);
2605 }
2606 
2607 void
2608 zvol_init(void)
2609 {
2610 	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
2611 	    1) == 0);
2612 #ifndef __FreeBSD__
2613 	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
2614 #endif
2615 	ZFS_LOG(1, "ZVOL Initialized.");
2616 }
2617 
2618 void
2619 zvol_fini(void)
2620 {
2621 #ifndef __FreeBSD__
2622 	mutex_destroy(&zfsdev_state_lock);
2623 #endif
2624 	ddi_soft_state_fini(&zfsdev_state);
2625 	ZFS_LOG(1, "ZVOL Deinitialized.");
2626 }
2627 
2628 #ifdef illumos
2629 /*ARGSUSED*/
2630 static int
2631 zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
2632 {
2633 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
2634 
2635 	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2636 		return (1);
2637 	return (0);
2638 }
2639 
2640 /*ARGSUSED*/
2641 static void
2642 zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
2643 {
2644 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
2645 
2646 	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
2647 }
2648 
2649 static int
2650 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
2651 {
2652 	dmu_tx_t *tx;
2653 	int error;
2654 	objset_t *os = zv->zv_objset;
2655 	spa_t *spa = dmu_objset_spa(os);
2656 	vdev_t *vd = spa->spa_root_vdev;
2657 	nvlist_t *nv = NULL;
2658 	uint64_t version = spa_version(spa);
2659 	uint64_t checksum, compress, refresrv, vbs, dedup;
2660 
2661 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
2662 	ASSERT(vd->vdev_ops == &vdev_root_ops);
2663 
2664 	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
2665 	    DMU_OBJECT_END);
2666 	if (error != 0)
2667 		return (error);
2668 	/* wait for dmu_free_long_range to actually free the blocks */
2669 	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2670 
2671 	/*
2672 	 * If the pool on which the dump device is being initialized has more
2673 	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
2674 	 * enabled.  If so, bump that feature's counter to indicate that the
2675 	 * feature is active. We also check the vdev type to handle the
2676 	 * following case:
2677 	 *   # zpool create test raidz disk1 disk2 disk3
2678 	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
2679 	 *   the raidz vdev itself has 3 children.
2680 	 */
2681 	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
2682 		if (!spa_feature_is_enabled(spa,
2683 		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2684 			return (SET_ERROR(ENOTSUP));
2685 		(void) dsl_sync_task(spa_name(spa),
2686 		    zfs_mvdev_dump_feature_check,
2687 		    zfs_mvdev_dump_activate_feature_sync, NULL,
2688 		    2, ZFS_SPACE_CHECK_RESERVED);
2689 	}
2690 
2691 	if (!resize) {
2692 		error = dsl_prop_get_integer(zv->zv_name,
2693 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
2694 		if (error == 0) {
2695 			error = dsl_prop_get_integer(zv->zv_name,
2696 			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
2697 			    NULL);
2698 		}
2699 		if (error == 0) {
2700 			error = dsl_prop_get_integer(zv->zv_name,
2701 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2702 			    &refresrv, NULL);
2703 		}
2704 		if (error == 0) {
2705 			error = dsl_prop_get_integer(zv->zv_name,
2706 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
2707 			    NULL);
2708 		}
2709 		if (version >= SPA_VERSION_DEDUP && error == 0) {
2710 			error = dsl_prop_get_integer(zv->zv_name,
2711 			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
2712 		}
2713 	}
2714 	if (error != 0)
2715 		return (error);
2716 
2717 	tx = dmu_tx_create(os);
2718 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2719 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2720 	error = dmu_tx_assign(tx, TXG_WAIT);
2721 	if (error != 0) {
2722 		dmu_tx_abort(tx);
2723 		return (error);
2724 	}
2725 
2726 	/*
2727 	 * If we are resizing the dump device then we only need to
2728 	 * update the refreservation to match the newly updated
2729 	 * zvol size. Otherwise, we save off the original state of the
2730 	 * zvol so that we can restore it if the zvol is ever undumpified.
2731 	 */
2732 	if (resize) {
2733 		error = zap_update(os, ZVOL_ZAP_OBJ,
2734 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2735 		    &zv->zv_volsize, tx);
2736 	} else {
2737 		error = zap_update(os, ZVOL_ZAP_OBJ,
2738 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
2739 		    &compress, tx);
2740 		if (error == 0) {
2741 			error = zap_update(os, ZVOL_ZAP_OBJ,
2742 			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
2743 			    &checksum, tx);
2744 		}
2745 		if (error == 0) {
2746 			error = zap_update(os, ZVOL_ZAP_OBJ,
2747 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2748 			    &refresrv, tx);
2749 		}
2750 		if (error == 0) {
2751 			error = zap_update(os, ZVOL_ZAP_OBJ,
2752 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
2753 			    &vbs, tx);
2754 		}
2755 		if (error == 0) {
2756 			error = dmu_object_set_blocksize(
2757 			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
2758 		}
2759 		if (version >= SPA_VERSION_DEDUP && error == 0) {
2760 			error = zap_update(os, ZVOL_ZAP_OBJ,
2761 			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2762 			    &dedup, tx);
2763 		}
2764 		if (error == 0)
2765 			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
2766 	}
2767 	dmu_tx_commit(tx);
2768 
2769 	/*
2770 	 * We only need to update the zvol's properties if we are initializing
2771 	 * the dump area for the first time.
2772 	 */
2773 	if (error == 0 && !resize) {
2774 		/*
2775 		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2776 		 * function.  Otherwise, use the old default -- OFF.
2777 		 */
2778 		checksum = spa_feature_is_active(spa,
2779 		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2780 		    ZIO_CHECKSUM_OFF;
2781 
2782 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2783 		VERIFY(nvlist_add_uint64(nv,
2784 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2785 		VERIFY(nvlist_add_uint64(nv,
2786 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2787 		    ZIO_COMPRESS_OFF) == 0);
2788 		VERIFY(nvlist_add_uint64(nv,
2789 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2790 		    checksum) == 0);
2791 		if (version >= SPA_VERSION_DEDUP) {
2792 			VERIFY(nvlist_add_uint64(nv,
2793 			    zfs_prop_to_name(ZFS_PROP_DEDUP),
2794 			    ZIO_CHECKSUM_OFF) == 0);
2795 		}
2796 
2797 		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2798 		    nv, NULL);
2799 		nvlist_free(nv);
2800 	}
2801 
2802 	/* Allocate the space for the dump */
2803 	if (error == 0)
2804 		error = zvol_prealloc(zv);
2805 	return (error);
2806 }
2807 
2808 static int
2809 zvol_dumpify(zvol_state_t *zv)
2810 {
2811 	int error = 0;
2812 	uint64_t dumpsize = 0;
2813 	dmu_tx_t *tx;
2814 	objset_t *os = zv->zv_objset;
2815 
2816 	if (zv->zv_flags & ZVOL_RDONLY)
2817 		return (SET_ERROR(EROFS));
2818 
2819 	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2820 	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2821 		boolean_t resize = (dumpsize > 0);
2822 
2823 		if ((error = zvol_dump_init(zv, resize)) != 0) {
2824 			(void) zvol_dump_fini(zv);
2825 			return (error);
2826 		}
2827 	}
2828 
2829 	/*
2830 	 * Build up our lba mapping.
2831 	 */
2832 	error = zvol_get_lbas(zv);
2833 	if (error) {
2834 		(void) zvol_dump_fini(zv);
2835 		return (error);
2836 	}
2837 
2838 	tx = dmu_tx_create(os);
2839 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2840 	error = dmu_tx_assign(tx, TXG_WAIT);
2841 	if (error) {
2842 		dmu_tx_abort(tx);
2843 		(void) zvol_dump_fini(zv);
2844 		return (error);
2845 	}
2846 
2847 	zv->zv_flags |= ZVOL_DUMPIFIED;
2848 	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2849 	    &zv->zv_volsize, tx);
2850 	dmu_tx_commit(tx);
2851 
2852 	if (error) {
2853 		(void) zvol_dump_fini(zv);
2854 		return (error);
2855 	}
2856 
2857 	txg_wait_synced(dmu_objset_pool(os), 0);
2858 	return (0);
2859 }
2860 
2861 static int
2862 zvol_dump_fini(zvol_state_t *zv)
2863 {
2864 	dmu_tx_t *tx;
2865 	objset_t *os = zv->zv_objset;
2866 	nvlist_t *nv;
2867 	int error = 0;
2868 	uint64_t checksum, compress, refresrv, vbs, dedup;
2869 	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2870 
2871 	/*
2872 	 * Attempt to restore the zvol back to its pre-dumpified state.
2873 	 * This is a best-effort attempt as it's possible that not all
2874 	 * of these properties were initialized during the dumpify process
2875 	 * (i.e. error during zvol_dump_init).
2876 	 */
2877 
2878 	tx = dmu_tx_create(os);
2879 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2880 	error = dmu_tx_assign(tx, TXG_WAIT);
2881 	if (error) {
2882 		dmu_tx_abort(tx);
2883 		return (error);
2884 	}
2885 	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2886 	dmu_tx_commit(tx);
2887 
2888 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2889 	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2890 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2891 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2892 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2893 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2894 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2895 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2896 
2897 	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2898 	(void) nvlist_add_uint64(nv,
2899 	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2900 	(void) nvlist_add_uint64(nv,
2901 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2902 	(void) nvlist_add_uint64(nv,
2903 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2904 	if (version >= SPA_VERSION_DEDUP &&
2905 	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2906 	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2907 		(void) nvlist_add_uint64(nv,
2908 		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2909 	}
2910 	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2911 	    nv, NULL);
2912 	nvlist_free(nv);
2913 
2914 	zvol_free_extents(zv);
2915 	zv->zv_flags &= ~ZVOL_DUMPIFIED;
2916 	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2917 	/* wait for dmu_free_long_range to actually free the blocks */
2918 	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2919 	tx = dmu_tx_create(os);
2920 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2921 	error = dmu_tx_assign(tx, TXG_WAIT);
2922 	if (error) {
2923 		dmu_tx_abort(tx);
2924 		return (error);
2925 	}
2926 	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2927 		zv->zv_volblocksize = vbs;
2928 	dmu_tx_commit(tx);
2929 
2930 	return (0);
2931 }
2932 #endif /* illumos */
2933 
2934 #ifdef __FreeBSD__
2935 static void
2936 zvol_geom_run(zvol_state_t *zv)
2937 {
2938 	struct g_provider *pp;
2939 
2940 	pp = zv->zv_provider;
2941 	g_error_provider(pp, 0);
2942 
2943 	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
2944 	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
2945 }
2946 
2947 static void
2948 zvol_geom_destroy(zvol_state_t *zv)
2949 {
2950 	struct g_provider *pp;
2951 
2952 	g_topology_assert();
2953 
2954 	mtx_lock(&zv->zv_queue_mtx);
2955 	zv->zv_state = 1;
2956 	wakeup_one(&zv->zv_queue);
2957 	while (zv->zv_state != 2)
2958 		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
2959 	mtx_destroy(&zv->zv_queue_mtx);
2960 
2961 	pp = zv->zv_provider;
2962 	zv->zv_provider = NULL;
2963 	pp->private = NULL;
2964 	g_wither_geom(pp->geom, ENXIO);
2965 }
2966 
2967 static int
2968 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
2969 {
2970 	int count, error, flags;
2971 
2972 	g_topology_assert();
2973 
2974 	/*
2975 	 * To make it easier we expect either open or close, but not both
2976 	 * at the same time.
2977 	 */
2978 	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
2979 	    (acr <= 0 && acw <= 0 && ace <= 0),
2980 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
2981 	    pp->name, acr, acw, ace));
2982 
2983 	if (pp->private == NULL) {
2984 		if (acr <= 0 && acw <= 0 && ace <= 0)
2985 			return (0);
2986 		return (pp->error);
2987 	}
2988 
2989 	/*
2990 	 * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
2991 	 * because GEOM already handles that and handles it a bit differently.
2992 	 * GEOM allows for multiple read/exclusive consumers and ZFS allows
2993 	 * only one exclusive consumer, no matter if it is reader or writer.
2994 	 * I like the way GEOM works better, so I'll leave it to GEOM to
2995 	 * decide what to do.
2996 	 */
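	/*
	 * Example: a consumer opening r1w1e0 arrives here as acr=1, acw=1,
	 * ace=0, so count = 2 and flags = FREAD | FWRITE; the matching
	 * close shows up with the same counts negated.
	 */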
2997 
2998 	count = acr + acw + ace;
2999 	if (count == 0)
3000 		return (0);
3001 
3002 	flags = 0;
3003 	if (acr != 0 || ace != 0)
3004 		flags |= FREAD;
3005 	if (acw != 0)
3006 		flags |= FWRITE;
3007 
3008 	g_topology_unlock();
3009 	if (count > 0)
3010 		error = zvol_open(pp, flags, count);
3011 	else
3012 		error = zvol_close(pp, flags, -count);
3013 	g_topology_lock();
3014 	return (error);
3015 }
3016 
3017 static void
3018 zvol_geom_start(struct bio *bp)
3019 {
3020 	zvol_state_t *zv;
3021 	boolean_t first;
3022 
3023 	zv = bp->bio_to->private;
3024 	ASSERT(zv != NULL);
3025 	switch (bp->bio_cmd) {
3026 	case BIO_FLUSH:
3027 		if (!THREAD_CAN_SLEEP())
3028 			goto enqueue;
3029 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
3030 		g_io_deliver(bp, 0);
3031 		break;
3032 	case BIO_READ:
3033 	case BIO_WRITE:
3034 	case BIO_DELETE:
3035 		if (!THREAD_CAN_SLEEP())
3036 			goto enqueue;
3037 		zvol_strategy(bp);
3038 		break;
3039 	case BIO_GETATTR: {
3040 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
3041 		uint64_t refd, avail, usedobjs, availobjs, val;
3042 
3043 		if (g_handleattr_int(bp, "GEOM::candelete", 1))
3044 			return;
3045 		if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
3046 			dmu_objset_space(zv->zv_objset, &refd, &avail,
3047 			    &usedobjs, &availobjs);
3048 			if (g_handleattr_off_t(bp, "blocksavail",
3049 			    avail / DEV_BSIZE))
3050 				return;
3051 		} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
3052 			dmu_objset_space(zv->zv_objset, &refd, &avail,
3053 			    &usedobjs, &availobjs);
3054 			if (g_handleattr_off_t(bp, "blocksused",
3055 			    refd / DEV_BSIZE))
3056 				return;
3057 		} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
3058 			avail = metaslab_class_get_space(spa_normal_class(spa));
3059 			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
3060 			if (g_handleattr_off_t(bp, "poolblocksavail",
3061 			    avail / DEV_BSIZE))
3062 				return;
3063 		} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
3064 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
3065 			if (g_handleattr_off_t(bp, "poolblocksused",
3066 			    refd / DEV_BSIZE))
3067 				return;
3068 		}
3069 		/* FALLTHROUGH */
3070 	}
3071 	default:
3072 		g_io_deliver(bp, EOPNOTSUPP);
3073 		break;
3074 	}
3075 	return;
3076 
3077 enqueue:
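	/*
	 * Defer the bio to the worker thread; wake it only on the
	 * empty -> non-empty transition, since it drains the whole queue
	 * once it is running.
	 */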
3078 	mtx_lock(&zv->zv_queue_mtx);
3079 	first = (bioq_first(&zv->zv_queue) == NULL);
3080 	bioq_insert_tail(&zv->zv_queue, bp);
3081 	mtx_unlock(&zv->zv_queue_mtx);
3082 	if (first)
3083 		wakeup_one(&zv->zv_queue);
3084 }
3085 
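/*
 * Per-zvol worker thread: services bios deferred by zvol_geom_start()
 * when the originating thread was not allowed to sleep.
 */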
3086 static void
3087 zvol_geom_worker(void *arg)
3088 {
3089 	zvol_state_t *zv;
3090 	struct bio *bp;
3091 
3092 	thread_lock(curthread);
3093 	sched_prio(curthread, PRIBIO);
3094 	thread_unlock(curthread);
3095 
3096 	zv = arg;
3097 	for (;;) {
3098 		mtx_lock(&zv->zv_queue_mtx);
3099 		bp = bioq_takefirst(&zv->zv_queue);
3100 		if (bp == NULL) {
3101 			if (zv->zv_state == 1) {
3102 				zv->zv_state = 2;
3103 				wakeup(&zv->zv_state);
3104 				mtx_unlock(&zv->zv_queue_mtx);
3105 				kthread_exit();
3106 			}
3107 			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
3108 			    "zvol:io", 0);
3109 			continue;
3110 		}
3111 		mtx_unlock(&zv->zv_queue_mtx);
3112 		switch (bp->bio_cmd) {
3113 		case BIO_FLUSH:
3114 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
3115 			g_io_deliver(bp, 0);
3116 			break;
3117 		case BIO_READ:
3118 		case BIO_WRITE:
3119 		case BIO_DELETE:
3120 			zvol_strategy(bp);
3121 			break;
3122 		default:
3123 			g_io_deliver(bp, EOPNOTSUPP);
3124 			break;
3125 		}
3126 	}
3127 }
3128 
3129 extern boolean_t dataset_name_hidden(const char *name);
3130 
3131 static int
3132 zvol_create_snapshots(objset_t *os, const char *name)
3133 {
3134 	uint64_t cookie, obj;
3135 	char *sname;
3136 	int error, len;
3137 
3138 	cookie = obj = 0;
3139 	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3140 
3141 #if 0
3142 	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
3143 	    DS_FIND_SNAPSHOTS);
3144 #endif
3145 
3146 	for (;;) {
3147 		len = snprintf(sname, MAXPATHLEN, "%s@", name);
3148 		if (len >= MAXPATHLEN) {
3149 			dmu_objset_rele(os, FTAG);
3150 			error = ENAMETOOLONG;
3151 			break;
3152 		}
3153 
3154 		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
3155 		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
3156 		    sname + len, &obj, &cookie, NULL);
3157 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
3158 		if (error != 0) {
3159 			if (error == ENOENT)
3160 				error = 0;
3161 			break;
3162 		}
3163 
3164 		error = zvol_create_minor(sname);
3165 		if (error != 0 && error != EEXIST) {
3166 			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
3167 			    sname, error);
3168 			break;
3169 		}
3170 	}
3171 
3172 	kmem_free(sname, MAXPATHLEN);
3173 	return (error);
3174 }
3175 
3176 int
3177 zvol_create_minors(const char *name)
3178 {
3179 	uint64_t cookie;
3180 	objset_t *os;
3181 	char *osname, *p;
3182 	int error, len;
3183 
3184 	if (dataset_name_hidden(name))
3185 		return (0);
3186 
3187 	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
3188 		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
3189 		    name, error);
3190 		return (error);
3191 	}
3192 	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
3193 		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
3194 		dsl_pool_rele(dmu_objset_pool(os), FTAG);
3195 		error = zvol_create_minor(name);
3196 		if (error == 0 || error == EEXIST) {
3197 			error = zvol_create_snapshots(os, name);
3198 		} else {
3199 			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
3200 			    name, error);
3201 		}
3202 		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
3203 		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
3204 		return (error);
3205 	}
3206 	if (dmu_objset_type(os) != DMU_OST_ZFS) {
3207 		dmu_objset_rele(os, FTAG);
3208 		return (0);
3209 	}
3210 
3211 	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3212 	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
3213 		dmu_objset_rele(os, FTAG);
3214 		kmem_free(osname, MAXPATHLEN);
3215 		return (ENOENT);
3216 	}
3217 	p = osname + strlen(osname);
3218 	len = MAXPATHLEN - (p - osname);
3219 
3220 #if 0
3221 	/* Prefetch the datasets. */
3222 	cookie = 0;
3223 	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
3224 		if (!dataset_name_hidden(osname))
3225 			(void) dmu_objset_prefetch(osname, NULL);
3226 	}
3227 #endif
3228 
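	/*
	 * Recurse into child datasets.  The hold on this objset is dropped
	 * around each recursive call so that zvol_create_minors() can take
	 * its own holds; it is re-acquired before the next iteration.
	 */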
3229 	cookie = 0;
3230 	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
3231 	    &cookie) == 0) {
3232 		dmu_objset_rele(os, FTAG);
3233 		(void) zvol_create_minors(osname);
3234 		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
3235 			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
3236 			    name, error);
3237 			return (error);
3238 		}
3239 	}
3240 
3241 	dmu_objset_rele(os, FTAG);
3242 	kmem_free(osname, MAXPATHLEN);
3243 	return (0);
3244 }
3245 
3246 static void
3247 zvol_rename_minor(zvol_state_t *zv, const char *newname)
3248 {
3249 	struct g_geom *gp;
3250 	struct g_provider *pp;
3251 	struct cdev *dev;
3252 
3253 	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
3254 
3255 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
3256 		g_topology_lock();
3257 		pp = zv->zv_provider;
3258 		ASSERT(pp != NULL);
3259 		gp = pp->geom;
3260 		ASSERT(gp != NULL);
3261 
3262 		zv->zv_provider = NULL;
3263 		g_wither_provider(pp, ENXIO);
3264 
3265 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
3266 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
3267 		pp->sectorsize = DEV_BSIZE;
3268 		pp->mediasize = zv->zv_volsize;
3269 		pp->private = zv;
3270 		zv->zv_provider = pp;
3271 		g_error_provider(pp, 0);
3272 		g_topology_unlock();
3273 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
3274 		struct make_dev_args args;
3275 
3276 		if ((dev = zv->zv_dev) != NULL) {
3277 			zv->zv_dev = NULL;
3278 			destroy_dev(dev);
3279 			if (zv->zv_total_opens > 0) {
3280 				zv->zv_flags &= ~ZVOL_EXCL;
3281 				zv->zv_total_opens = 0;
3282 				zvol_last_close(zv);
3283 			}
3284 		}
3285 
3286 		make_dev_args_init(&args);
3287 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
3288 		args.mda_devsw = &zvol_cdevsw;
3289 		args.mda_cr = NULL;
3290 		args.mda_uid = UID_ROOT;
3291 		args.mda_gid = GID_OPERATOR;
3292 		args.mda_mode = 0640;
3293 		args.mda_si_drv2 = zv;
3294 		if (make_dev_s(&args, &zv->zv_dev,
3295 		    "%s/%s", ZVOL_DRIVER, newname) == 0)
3296 			zv->zv_dev->si_iosize_max = MAXPHYS;
3297 	}
3298 	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
3299 }
3300 
3301 void
3302 zvol_rename_minors(const char *oldname, const char *newname)
3303 {
3304 	char name[MAXPATHLEN];
3305 	struct g_provider *pp;
3306 	struct g_geom *gp;
3307 	size_t oldnamelen, newnamelen;
3308 	zvol_state_t *zv;
3309 	char *namebuf;
3310 	boolean_t locked = B_FALSE;
3311 
3312 	oldnamelen = strlen(oldname);
3313 	newnamelen = strlen(newname);
3314 
3315 	DROP_GIANT();
3316 	/* See comment in zvol_open(). */
3317 	if (!MUTEX_HELD(&zfsdev_state_lock)) {
3318 		mutex_enter(&zfsdev_state_lock);
3319 		locked = B_TRUE;
3320 	}
3321 
3322 	LIST_FOREACH(zv, &all_zvols, zv_links) {
3323 		if (strcmp(zv->zv_name, oldname) == 0) {
3324 			zvol_rename_minor(zv, newname);
3325 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
3326 		    (zv->zv_name[oldnamelen] == '/' ||
3327 		     zv->zv_name[oldnamelen] == '@')) {
3328 			snprintf(name, sizeof(name), "%s%c%s", newname,
3329 			    zv->zv_name[oldnamelen],
3330 			    zv->zv_name + oldnamelen + 1);
3331 			zvol_rename_minor(zv, name);
3332 		}
3333 	}
3334 
3335 	if (locked)
3336 		mutex_exit(&zfsdev_state_lock);
3337 	PICKUP_GIANT();
3338 }
3339 
3340 static int
3341 zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
3342 {
3343 	zvol_state_t *zv = dev->si_drv2;
3344 	int err = 0;
3345 
3346 	mutex_enter(&zfsdev_state_lock);
3347 	if (zv->zv_total_opens == 0)
3348 		err = zvol_first_open(zv);
3349 	if (err) {
3350 		mutex_exit(&zfsdev_state_lock);
3351 		return (err);
3352 	}
3353 	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
3354 		err = SET_ERROR(EROFS);
3355 		goto out;
3356 	}
3357 	if (zv->zv_flags & ZVOL_EXCL) {
3358 		err = SET_ERROR(EBUSY);
3359 		goto out;
3360 	}
3361 #ifdef FEXCL
3362 	if (flags & FEXCL) {
3363 		if (zv->zv_total_opens != 0) {
3364 			err = SET_ERROR(EBUSY);
3365 			goto out;
3366 		}
3367 		zv->zv_flags |= ZVOL_EXCL;
3368 	}
3369 #endif
3370 
3371 	zv->zv_total_opens++;
3372 	if (flags & (FSYNC | FDSYNC)) {
3373 		zv->zv_sync_cnt++;
3374 		if (zv->zv_sync_cnt == 1)
3375 			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
3376 	}
3377 	mutex_exit(&zfsdev_state_lock);
3378 	return (err);
3379 out:
3380 	if (zv->zv_total_opens == 0)
3381 		zvol_last_close(zv);
3382 	mutex_exit(&zfsdev_state_lock);
3383 	return (err);
3384 }
3385 
3386 static int
3387 zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
3388 {
3389 	zvol_state_t *zv = dev->si_drv2;
3390 
3391 	mutex_enter(&zfsdev_state_lock);
3392 	if (zv->zv_flags & ZVOL_EXCL) {
3393 		ASSERT(zv->zv_total_opens == 1);
3394 		zv->zv_flags &= ~ZVOL_EXCL;
3395 	}
3396 
3397 	/*
3398 	 * If the open count is zero, this is a spurious close.
3399 	 * That indicates a bug in the kernel / DDI framework.
3400 	 */
3401 	ASSERT(zv->zv_total_opens != 0);
3402 
3403 	/*
3404 	 * You may get multiple opens, but only one close.
3405 	 */
3406 	zv->zv_total_opens--;
3407 	if (flags & (FSYNC | FDSYNC))
3408 		zv->zv_sync_cnt--;
3409 
3410 	if (zv->zv_total_opens == 0)
3411 		zvol_last_close(zv);
3412 
3413 	mutex_exit(&zfsdev_state_lock);
3414 	return (0);
3415 }
3416 
3417 static int
3418 zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
3419 {
3420 	zvol_state_t *zv;
3421 	rl_t *rl;
3422 	off_t offset, length;
3423 	int i, error;
3424 	boolean_t sync;
3425 
3426 	zv = dev->si_drv2;
3427 
3428 	error = 0;
3429 	KASSERT(zv->zv_total_opens > 0,
3430 	    ("Device with zero access count in zvol_d_ioctl"));
3431 
3432 	i = IOCPARM_LEN(cmd);
3433 	switch (cmd) {
3434 	case DIOCGSECTORSIZE:
3435 		*(u_int *)data = DEV_BSIZE;
3436 		break;
3437 	case DIOCGMEDIASIZE:
3438 		*(off_t *)data = zv->zv_volsize;
3439 		break;
3440 	case DIOCGFLUSH:
3441 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
3442 		break;
3443 	case DIOCGDELETE:
3444 		if (!zvol_unmap_enabled)
3445 			break;
3446 
3447 		offset = ((off_t *)data)[0];
3448 		length = ((off_t *)data)[1];
3449 		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
3450 		    offset < 0 || offset >= zv->zv_volsize ||
3451 		    length <= 0) {
3452 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
3453 			    length);
3454 			error = EINVAL;
3455 			break;
3456 		}
3457 
3458 		rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
3459 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
3460 		error = dmu_tx_assign(tx, TXG_WAIT);
3461 		if (error != 0) {
3462 			sync = FALSE;
3463 			dmu_tx_abort(tx);
3464 		} else {
3465 			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
3466 			zvol_log_truncate(zv, tx, offset, length, sync);
3467 			dmu_tx_commit(tx);
3468 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
3469 			    offset, length);
3470 		}
3471 		zfs_range_unlock(rl);
3472 		if (sync)
3473 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
3474 		break;
3475 	case DIOCGSTRIPESIZE:
3476 		*(off_t *)data = zv->zv_volblocksize;
3477 		break;
3478 	case DIOCGSTRIPEOFFSET:
3479 		*(off_t *)data = 0;
3480 		break;
3481 	case DIOCGATTR: {
3482 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
3483 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
3484 		uint64_t refd, avail, usedobjs, availobjs;
3485 
3486 		if (strcmp(arg->name, "GEOM::candelete") == 0)
3487 			arg->value.i = 1;
3488 		else if (strcmp(arg->name, "blocksavail") == 0) {
3489 			dmu_objset_space(zv->zv_objset, &refd, &avail,
3490 			    &usedobjs, &availobjs);
3491 			arg->value.off = avail / DEV_BSIZE;
3492 		} else if (strcmp(arg->name, "blocksused") == 0) {
3493 			dmu_objset_space(zv->zv_objset, &refd, &avail,
3494 			    &usedobjs, &availobjs);
3495 			arg->value.off = refd / DEV_BSIZE;
3496 		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
3497 			avail = metaslab_class_get_space(spa_normal_class(spa));
3498 			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
3499 			arg->value.off = avail / DEV_BSIZE;
3500 		} else if (strcmp(arg->name, "poolblocksused") == 0) {
3501 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
3502 			arg->value.off = refd / DEV_BSIZE;
3503 		} else
3504 			error = ENOIOCTL;
3505 		break;
3506 	}
3507 	case FIOSEEKHOLE:
3508 	case FIOSEEKDATA: {
3509 		off_t *off = (off_t *)data;
3510 		uint64_t noff;
3511 		boolean_t hole;
3512 
3513 		hole = (cmd == FIOSEEKHOLE);
3514 		noff = *off;
3515 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
3516 		*off = noff;
3517 		break;
3518 	}
3519 	default:
3520 		error = ENOIOCTL;
3521 	}
3522 
3523 	return (error);
3524 }
3525 #endif /* __FreeBSD__ */
3526 
3527 #ifdef __NetBSD__
3528 /*ARGSUSED*/
3529 int
3530 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
3531 {
3532 	zvol_state_t *zv;
3533 	int error = 0;
3534 
3535 	mutex_enter(&zfsdev_state_lock);
3536 
3537 	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
3538 
3539 	if (zv == NULL) {
3540 		mutex_exit(&zfsdev_state_lock);
3541 		return (ENXIO);
3542 	}
3543 
3544 	switch (cmd) {
3545 	case DIOCGWEDGEINFO:
3546 	{
3547 		struct dkwedge_info *dkw = (void *) arg;
3548 
3549 		strlcpy(dkw->dkw_devname, zv->zv_name, 16);
3550 		strlcpy(dkw->dkw_wname, zv->zv_name, MAXPATHLEN);
3551 		strlcpy(dkw->dkw_parent, zv->zv_name, 16);
3552 
3553 		dkw->dkw_offset = 0;
3554 		/* XXX NetBSD supports only a DEV_BSIZE device block size,
3555 		   so report the wedge size in DEV_BSIZE units. */
3556 		dkw->dkw_size = (zv->zv_volsize / DEV_BSIZE);
3557 		dprintf("dkw %"PRIu64" volsize %"PRIu64" volblock %"PRIu64" \n",
3558 		    dkw->dkw_size, zv->zv_volsize, zv->zv_volblocksize);
3559 		strcpy(dkw->dkw_ptype, DKW_PTYPE_FFS);
3560 
3561 		break;
3562 	}
3563 
3564 	case DIOCGDISKINFO:
3565 	{
3566 		struct plistref *pref = (struct plistref *) arg;
3567 
3568 		if (zv->zv_dk.dk_info == NULL) {
3569 			mutex_exit(&zfsdev_state_lock);
3570 			return (ENOTSUP);
3571 		} else
3572 			prop_dictionary_copyout_ioctl(pref, cmd,
3573 			    zv->zv_dk.dk_info);
3574 
3575 		break;
3576 	}
3577 
3578 	default:
3579 		aprint_debug("unknown disk_ioctl called\n");
3580 		error = ENOTTY;
3581 		break;
3582 	}
3583 
3584 	mutex_exit(&zfsdev_state_lock);
3585 	return (error);
3586 }
3587 #endif /* __NetBSD__ */
3588