1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 226423Sgw25295 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens /* 29789Sahrens * ZFS volume emulation driver. 30789Sahrens * 31789Sahrens * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 32789Sahrens * Volumes are accessed through the symbolic links named: 33789Sahrens * 34789Sahrens * /dev/zvol/dsk/<pool_name>/<dataset_name> 35789Sahrens * /dev/zvol/rdsk/<pool_name>/<dataset_name> 36789Sahrens * 37789Sahrens * These links are created by the ZFS-specific devfsadm link generator. 38789Sahrens * Volumes are persistent through reboot. No user command needs to be 39789Sahrens * run before opening and using a device. 40789Sahrens */ 41789Sahrens 42789Sahrens #include <sys/types.h> 43789Sahrens #include <sys/param.h> 44789Sahrens #include <sys/errno.h> 45789Sahrens #include <sys/uio.h> 46789Sahrens #include <sys/buf.h> 47789Sahrens #include <sys/modctl.h> 48789Sahrens #include <sys/open.h> 49789Sahrens #include <sys/kmem.h> 50789Sahrens #include <sys/conf.h> 51789Sahrens #include <sys/cmn_err.h> 52789Sahrens #include <sys/stat.h> 53789Sahrens #include <sys/zap.h> 54789Sahrens #include <sys/spa.h> 55789Sahrens #include <sys/zio.h> 566423Sgw25295 #include <sys/dmu_traverse.h> 576423Sgw25295 #include <sys/dnode.h> 586423Sgw25295 #include <sys/dsl_dataset.h> 59789Sahrens #include <sys/dsl_prop.h> 60789Sahrens #include <sys/dkio.h> 61789Sahrens #include <sys/efi_partition.h> 62789Sahrens #include <sys/byteorder.h> 63789Sahrens #include <sys/pathname.h> 64789Sahrens #include <sys/ddi.h> 65789Sahrens #include <sys/sunddi.h> 66789Sahrens #include <sys/crc32.h> 67789Sahrens #include <sys/dirent.h> 68789Sahrens #include <sys/policy.h> 69789Sahrens #include <sys/fs/zfs.h> 70789Sahrens #include <sys/zfs_ioctl.h> 71789Sahrens #include <sys/mkdev.h> 721141Sperrin #include <sys/zil.h> 732237Smaybee #include <sys/refcount.h> 743755Sperrin #include <sys/zfs_znode.h> 753755Sperrin #include <sys/zfs_rlock.h> 766423Sgw25295 #include <sys/vdev_disk.h> 776423Sgw25295 #include <sys/vdev_impl.h> 786423Sgw25295 #include <sys/zvol.h> 796423Sgw25295 #include <sys/dumphdr.h> 80789Sahrens 81789Sahrens #include "zfs_namecheck.h" 82789Sahrens 836423Sgw25295 static void *zvol_state; 84789Sahrens 856423Sgw25295 #define ZVOL_DUMPSIZE "dumpsize" 86789Sahrens 87789Sahrens /* 88789Sahrens * This lock protects the zvol_state structure from being modified 89789Sahrens * while it's being used, e.g. an open that comes in before a create 90789Sahrens * finishes. It also protects temporary opens of the dataset so that, 91789Sahrens * e.g., an open doesn't get a spurious EBUSY. 92789Sahrens */ 93789Sahrens static kmutex_t zvol_state_lock; 94789Sahrens static uint32_t zvol_minors; 95789Sahrens 966423Sgw25295 #define NUM_EXTENTS ((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t)) 976423Sgw25295 986423Sgw25295 typedef struct zvol_extent { 996423Sgw25295 dva_t ze_dva; /* dva associated with this extent */ 1006423Sgw25295 uint64_t ze_stride; /* extent stride */ 1016423Sgw25295 uint64_t ze_size; /* number of blocks in extent */ 1026423Sgw25295 } zvol_extent_t; 1036423Sgw25295 1046423Sgw25295 /* 1056423Sgw25295 * The list of extents associated with the dump device 1066423Sgw25295 */ 1076423Sgw25295 typedef struct zvol_ext_list { 1086423Sgw25295 zvol_extent_t zl_extents[NUM_EXTENTS]; 1096423Sgw25295 struct zvol_ext_list *zl_next; 1106423Sgw25295 } zvol_ext_list_t; 1116423Sgw25295 112789Sahrens /* 113789Sahrens * The in-core state of each volume. 114789Sahrens */ 115789Sahrens typedef struct zvol_state { 116789Sahrens char zv_name[MAXPATHLEN]; /* pool/dd name */ 117789Sahrens uint64_t zv_volsize; /* amount of space we advertise */ 1183063Sperrin uint64_t zv_volblocksize; /* volume block size */ 119789Sahrens minor_t zv_minor; /* minor number */ 120789Sahrens uint8_t zv_min_bs; /* minimum addressable block shift */ 1216423Sgw25295 uint8_t zv_flags; /* readonly; dumpified */ 122789Sahrens objset_t *zv_objset; /* objset handle */ 123789Sahrens uint32_t zv_mode; /* DS_MODE_* flags at open time */ 124789Sahrens uint32_t zv_open_count[OTYPCNT]; /* open counts */ 125789Sahrens uint32_t zv_total_opens; /* total open count */ 1261141Sperrin zilog_t *zv_zilog; /* ZIL handle */ 1276423Sgw25295 zvol_ext_list_t *zv_list; /* List of extents for dump */ 1281141Sperrin uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ 1293755Sperrin znode_t zv_znode; /* for range locking */ 130789Sahrens } zvol_state_t; 131789Sahrens 1323063Sperrin /* 1336423Sgw25295 * zvol specific flags 1346423Sgw25295 */ 1356423Sgw25295 #define ZVOL_RDONLY 0x1 1366423Sgw25295 #define ZVOL_DUMPIFIED 0x2 1376423Sgw25295 1386423Sgw25295 /* 1393063Sperrin * zvol maximum transfer in one DMU tx. 1403063Sperrin */ 1413063Sperrin int zvol_maxphys = DMU_MAX_ACCESS/2; 1423063Sperrin 1436423Sgw25295 extern int zfs_set_prop_nvlist(const char *, nvlist_t *); 1443638Sbillm static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); 1456423Sgw25295 static int zvol_dumpify(zvol_state_t *zv); 1466423Sgw25295 static int zvol_dump_fini(zvol_state_t *zv); 1476423Sgw25295 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); 1483063Sperrin 149789Sahrens static void 1504787Sahrens zvol_size_changed(zvol_state_t *zv, major_t maj) 151789Sahrens { 1524787Sahrens dev_t dev = makedevice(maj, zv->zv_minor); 153789Sahrens 154789Sahrens VERIFY(ddi_prop_update_int64(dev, zfs_dip, 155789Sahrens "Size", zv->zv_volsize) == DDI_SUCCESS); 156789Sahrens VERIFY(ddi_prop_update_int64(dev, zfs_dip, 157789Sahrens "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); 1586423Sgw25295 1596423Sgw25295 /* Notify specfs to invalidate the cached size */ 1606423Sgw25295 spec_size_invalidate(dev, VBLK); 1616423Sgw25295 spec_size_invalidate(dev, VCHR); 162789Sahrens } 163789Sahrens 164789Sahrens int 1652676Seschrock zvol_check_volsize(uint64_t volsize, uint64_t blocksize) 166789Sahrens { 1672676Seschrock if (volsize == 0) 168789Sahrens return (EINVAL); 169789Sahrens 1702676Seschrock if (volsize % blocksize != 0) 1711133Seschrock return (EINVAL); 1721133Seschrock 173789Sahrens #ifdef _ILP32 1742676Seschrock if (volsize - 1 > SPEC_MAXOFFSET_T) 175789Sahrens return (EOVERFLOW); 176789Sahrens #endif 177789Sahrens return (0); 178789Sahrens } 179789Sahrens 180789Sahrens int 1812676Seschrock zvol_check_volblocksize(uint64_t volblocksize) 182789Sahrens { 1832676Seschrock if (volblocksize < SPA_MINBLOCKSIZE || 1842676Seschrock volblocksize > SPA_MAXBLOCKSIZE || 1852676Seschrock !ISP2(volblocksize)) 186789Sahrens return (EDOM); 187789Sahrens 188789Sahrens return (0); 189789Sahrens } 190789Sahrens 191789Sahrens static void 192789Sahrens zvol_readonly_changed_cb(void *arg, uint64_t newval) 193789Sahrens { 194789Sahrens zvol_state_t *zv = arg; 195789Sahrens 1966423Sgw25295 if (newval) 1976423Sgw25295 zv->zv_flags |= ZVOL_RDONLY; 1986423Sgw25295 else 1996423Sgw25295 zv->zv_flags &= ~ZVOL_RDONLY; 200789Sahrens } 201789Sahrens 202789Sahrens int 2032885Sahrens zvol_get_stats(objset_t *os, nvlist_t *nv) 204789Sahrens { 205789Sahrens int error; 206789Sahrens dmu_object_info_t doi; 2072885Sahrens uint64_t val; 208789Sahrens 209789Sahrens 2102885Sahrens error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); 211789Sahrens if (error) 212789Sahrens return (error); 213789Sahrens 2142885Sahrens dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); 2152885Sahrens 216789Sahrens error = dmu_object_info(os, ZVOL_OBJ, &doi); 217789Sahrens 2182885Sahrens if (error == 0) { 2192885Sahrens dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, 2202885Sahrens doi.doi_data_block_size); 2212885Sahrens } 222789Sahrens 223789Sahrens return (error); 224789Sahrens } 225789Sahrens 226789Sahrens /* 227789Sahrens * Find a free minor number. 228789Sahrens */ 229789Sahrens static minor_t 230789Sahrens zvol_minor_alloc(void) 231789Sahrens { 232789Sahrens minor_t minor; 233789Sahrens 234789Sahrens ASSERT(MUTEX_HELD(&zvol_state_lock)); 235789Sahrens 236789Sahrens for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) 237789Sahrens if (ddi_get_soft_state(zvol_state, minor) == NULL) 238789Sahrens return (minor); 239789Sahrens 240789Sahrens return (0); 241789Sahrens } 242789Sahrens 243789Sahrens static zvol_state_t * 2442676Seschrock zvol_minor_lookup(const char *name) 245789Sahrens { 246789Sahrens minor_t minor; 247789Sahrens zvol_state_t *zv; 248789Sahrens 249789Sahrens ASSERT(MUTEX_HELD(&zvol_state_lock)); 250789Sahrens 251789Sahrens for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { 252789Sahrens zv = ddi_get_soft_state(zvol_state, minor); 253789Sahrens if (zv == NULL) 254789Sahrens continue; 255789Sahrens if (strcmp(zv->zv_name, name) == 0) 256789Sahrens break; 257789Sahrens } 258789Sahrens 259789Sahrens return (zv); 260789Sahrens } 261789Sahrens 2626423Sgw25295 void 2636423Sgw25295 zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp) 2646423Sgw25295 { 2656423Sgw25295 ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ 2666423Sgw25295 ze->ze_stride = 0; 2676423Sgw25295 ze->ze_size = 1; 2686423Sgw25295 } 2696423Sgw25295 2706423Sgw25295 /* extent mapping arg */ 2716423Sgw25295 struct maparg { 2726423Sgw25295 zvol_ext_list_t *ma_list; 2736423Sgw25295 zvol_extent_t *ma_extent; 2746423Sgw25295 int ma_gang; 2756423Sgw25295 }; 2766423Sgw25295 2776423Sgw25295 /*ARGSUSED*/ 2786423Sgw25295 static int 2796423Sgw25295 zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg) 2806423Sgw25295 { 2816423Sgw25295 zbookmark_t *zb = &bc->bc_bookmark; 2826423Sgw25295 blkptr_t *bp = &bc->bc_blkptr; 2836423Sgw25295 void *data = bc->bc_data; 2846423Sgw25295 dnode_phys_t *dnp = bc->bc_dnode; 2856423Sgw25295 struct maparg *ma = (struct maparg *)arg; 2866423Sgw25295 uint64_t stride; 2876423Sgw25295 2886423Sgw25295 /* If there is an error, then keep trying to make progress */ 2896423Sgw25295 if (bc->bc_errno) 2906423Sgw25295 return (ERESTART); 2916423Sgw25295 2926423Sgw25295 #ifdef ZFS_DEBUG 2936423Sgw25295 if (zb->zb_level == -1) { 2946423Sgw25295 ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); 2956423Sgw25295 ASSERT3U(BP_GET_LEVEL(bp), ==, 0); 2966423Sgw25295 } else { 2976423Sgw25295 ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); 2986423Sgw25295 ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); 2996423Sgw25295 } 3006423Sgw25295 3016423Sgw25295 if (zb->zb_level > 0) { 3026423Sgw25295 uint64_t fill = 0; 3036423Sgw25295 blkptr_t *bpx, *bpend; 3046423Sgw25295 3056423Sgw25295 for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx); 3066423Sgw25295 bpx < bpend; bpx++) { 3076423Sgw25295 if (bpx->blk_birth != 0) { 3086423Sgw25295 fill += bpx->blk_fill; 3096423Sgw25295 } else { 3106423Sgw25295 ASSERT(bpx->blk_fill == 0); 3116423Sgw25295 } 3126423Sgw25295 } 3136423Sgw25295 ASSERT3U(fill, ==, bp->blk_fill); 3146423Sgw25295 } 3156423Sgw25295 3166423Sgw25295 if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) { 3176423Sgw25295 uint64_t fill = 0; 3186423Sgw25295 dnode_phys_t *dnx, *dnend; 3196423Sgw25295 3206423Sgw25295 for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT); 3216423Sgw25295 dnx < dnend; dnx++) { 3226423Sgw25295 if (dnx->dn_type != DMU_OT_NONE) 3236423Sgw25295 fill++; 3246423Sgw25295 } 3256423Sgw25295 ASSERT3U(fill, ==, bp->blk_fill); 3266423Sgw25295 } 3276423Sgw25295 #endif 3286423Sgw25295 3296423Sgw25295 if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE) 3306423Sgw25295 return (0); 3316423Sgw25295 3326423Sgw25295 /* Abort immediately if we have encountered gang blocks */ 3336423Sgw25295 if (BP_IS_GANG(bp)) { 3346423Sgw25295 ma->ma_gang++; 3356423Sgw25295 return (EINTR); 3366423Sgw25295 } 3376423Sgw25295 3386423Sgw25295 /* first time? */ 3396423Sgw25295 if (ma->ma_extent->ze_size == 0) { 3406423Sgw25295 zvol_init_extent(ma->ma_extent, bp); 3416423Sgw25295 return (0); 3426423Sgw25295 } 3436423Sgw25295 3446423Sgw25295 stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) - 3456423Sgw25295 ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) + 3466423Sgw25295 (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride)); 3476423Sgw25295 if (DVA_GET_VDEV(BP_IDENTITY(bp)) == 3486423Sgw25295 DVA_GET_VDEV(&ma->ma_extent->ze_dva)) { 3496423Sgw25295 if (ma->ma_extent->ze_stride == 0) { 3506423Sgw25295 /* second block in this extent */ 3516423Sgw25295 ma->ma_extent->ze_stride = stride; 3526423Sgw25295 ma->ma_extent->ze_size++; 3536423Sgw25295 return (0); 3546423Sgw25295 } else if (ma->ma_extent->ze_stride == stride) { 3556423Sgw25295 /* 3566423Sgw25295 * the block we allocated has the same 3576423Sgw25295 * stride 3586423Sgw25295 */ 3596423Sgw25295 ma->ma_extent->ze_size++; 3606423Sgw25295 return (0); 3616423Sgw25295 } 3626423Sgw25295 } 3636423Sgw25295 3646423Sgw25295 /* 3656423Sgw25295 * dtrace -n 'zfs-dprintf 3666423Sgw25295 * /stringof(arg0) == "zvol.c"/ 3676423Sgw25295 * { 3686423Sgw25295 * printf("%s: %s", stringof(arg1), stringof(arg3)) 3696423Sgw25295 * } ' 3706423Sgw25295 */ 3716423Sgw25295 dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n", 3726423Sgw25295 ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride); 3736423Sgw25295 dprintf_bp(bp, "%s", "next blkptr:"); 3746423Sgw25295 /* start a new extent */ 3756423Sgw25295 if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) { 3766423Sgw25295 ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t), 3776423Sgw25295 KM_SLEEP); 3786423Sgw25295 ma->ma_list = ma->ma_list->zl_next; 3796423Sgw25295 ma->ma_extent = &ma->ma_list->zl_extents[0]; 3806423Sgw25295 } else { 3816423Sgw25295 ma->ma_extent++; 3826423Sgw25295 } 3836423Sgw25295 zvol_init_extent(ma->ma_extent, bp); 3846423Sgw25295 return (0); 3856423Sgw25295 } 3866423Sgw25295 3874543Smarks /* ARGSUSED */ 388789Sahrens void 3894543Smarks zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 390789Sahrens { 3915331Samw zfs_creat_t *zct = arg; 3925331Samw nvlist_t *nvprops = zct->zct_props; 393789Sahrens int error; 3942676Seschrock uint64_t volblocksize, volsize; 395789Sahrens 3964543Smarks VERIFY(nvlist_lookup_uint64(nvprops, 3972676Seschrock zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); 3984543Smarks if (nvlist_lookup_uint64(nvprops, 3992676Seschrock zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) 4002676Seschrock volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); 4012676Seschrock 4022676Seschrock /* 4036423Sgw25295 * These properties must be removed from the list so the generic 4042676Seschrock * property setting step won't apply to them. 4052676Seschrock */ 4064543Smarks VERIFY(nvlist_remove_all(nvprops, 4072676Seschrock zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); 4084543Smarks (void) nvlist_remove_all(nvprops, 4092676Seschrock zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); 4102676Seschrock 4112676Seschrock error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, 412789Sahrens DMU_OT_NONE, 0, tx); 413789Sahrens ASSERT(error == 0); 414789Sahrens 415789Sahrens error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, 416789Sahrens DMU_OT_NONE, 0, tx); 417789Sahrens ASSERT(error == 0); 418789Sahrens 4192676Seschrock error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); 420789Sahrens ASSERT(error == 0); 421789Sahrens } 422789Sahrens 423789Sahrens /* 4241141Sperrin * Replay a TX_WRITE ZIL transaction that didn't get committed 4251141Sperrin * after a system failure 4261141Sperrin */ 4271141Sperrin static int 4281141Sperrin zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) 4291141Sperrin { 4301141Sperrin objset_t *os = zv->zv_objset; 4311141Sperrin char *data = (char *)(lr + 1); /* data follows lr_write_t */ 4321141Sperrin uint64_t off = lr->lr_offset; 4331141Sperrin uint64_t len = lr->lr_length; 4341141Sperrin dmu_tx_t *tx; 4351141Sperrin int error; 4361141Sperrin 4371141Sperrin if (byteswap) 4381141Sperrin byteswap_uint64_array(lr, sizeof (*lr)); 4391141Sperrin 4401141Sperrin tx = dmu_tx_create(os); 4411141Sperrin dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); 4421141Sperrin error = dmu_tx_assign(tx, zv->zv_txg_assign); 4431141Sperrin if (error) { 4441141Sperrin dmu_tx_abort(tx); 4451141Sperrin } else { 4461141Sperrin dmu_write(os, ZVOL_OBJ, off, len, data, tx); 4471141Sperrin dmu_tx_commit(tx); 4481141Sperrin } 4491141Sperrin 4501141Sperrin return (error); 4511141Sperrin } 4521141Sperrin 4531141Sperrin /* ARGSUSED */ 4541141Sperrin static int 4551141Sperrin zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) 4561141Sperrin { 4571141Sperrin return (ENOTSUP); 4581141Sperrin } 4591141Sperrin 4601141Sperrin /* 4611141Sperrin * Callback vectors for replaying records. 4621141Sperrin * Only TX_WRITE is needed for zvol. 4631141Sperrin */ 4641141Sperrin zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { 4651141Sperrin zvol_replay_err, /* 0 no such transaction type */ 4661141Sperrin zvol_replay_err, /* TX_CREATE */ 4671141Sperrin zvol_replay_err, /* TX_MKDIR */ 4681141Sperrin zvol_replay_err, /* TX_MKXATTR */ 4691141Sperrin zvol_replay_err, /* TX_SYMLINK */ 4701141Sperrin zvol_replay_err, /* TX_REMOVE */ 4711141Sperrin zvol_replay_err, /* TX_RMDIR */ 4721141Sperrin zvol_replay_err, /* TX_LINK */ 4731141Sperrin zvol_replay_err, /* TX_RENAME */ 4741141Sperrin zvol_replay_write, /* TX_WRITE */ 4751141Sperrin zvol_replay_err, /* TX_TRUNCATE */ 4761141Sperrin zvol_replay_err, /* TX_SETATTR */ 4771141Sperrin zvol_replay_err, /* TX_ACL */ 4781141Sperrin }; 4791141Sperrin 4801141Sperrin /* 4816423Sgw25295 * reconstruct dva that gets us to the desired offset (offset 4826423Sgw25295 * is in bytes) 4836423Sgw25295 */ 4846423Sgw25295 int 4856423Sgw25295 zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva) 4866423Sgw25295 { 4876423Sgw25295 zvol_ext_list_t *zl; 4886423Sgw25295 zvol_extent_t *ze; 4896423Sgw25295 int idx; 4906423Sgw25295 uint64_t tmp; 4916423Sgw25295 4926423Sgw25295 if ((zl = zv->zv_list) == NULL) 4936423Sgw25295 return (EIO); 4946423Sgw25295 idx = 0; 4956423Sgw25295 ze = &zl->zl_extents[0]; 4966423Sgw25295 while (offset >= ze->ze_size * zv->zv_volblocksize) { 4976423Sgw25295 offset -= ze->ze_size * zv->zv_volblocksize; 4986423Sgw25295 4996423Sgw25295 if (idx == NUM_EXTENTS - 1) { 5006423Sgw25295 /* we've reached the end of this array */ 5016423Sgw25295 ASSERT(zl->zl_next != NULL); 5026423Sgw25295 if (zl->zl_next == NULL) 5036423Sgw25295 return (-1); 5046423Sgw25295 zl = zl->zl_next; 5056423Sgw25295 ze = &zl->zl_extents[0]; 5066423Sgw25295 idx = 0; 5076423Sgw25295 } else { 5086423Sgw25295 ze++; 5096423Sgw25295 idx++; 5106423Sgw25295 } 5116423Sgw25295 } 5126423Sgw25295 DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva)); 5136423Sgw25295 tmp = DVA_GET_OFFSET((&ze->ze_dva)); 5146423Sgw25295 tmp += (ze->ze_stride * (offset / zv->zv_volblocksize)); 5156423Sgw25295 DVA_SET_OFFSET(dva, tmp); 5166423Sgw25295 return (0); 5176423Sgw25295 } 5186423Sgw25295 5196423Sgw25295 static void 5206423Sgw25295 zvol_free_extents(zvol_state_t *zv) 5216423Sgw25295 { 5226423Sgw25295 zvol_ext_list_t *zl; 5236423Sgw25295 zvol_ext_list_t *tmp; 5246423Sgw25295 5256423Sgw25295 if (zv->zv_list != NULL) { 5266423Sgw25295 zl = zv->zv_list; 5276423Sgw25295 while (zl != NULL) { 5286423Sgw25295 tmp = zl->zl_next; 5296423Sgw25295 kmem_free(zl, sizeof (zvol_ext_list_t)); 5306423Sgw25295 zl = tmp; 5316423Sgw25295 } 5326423Sgw25295 zv->zv_list = NULL; 5336423Sgw25295 } 5346423Sgw25295 } 5356423Sgw25295 5366423Sgw25295 int 5376423Sgw25295 zvol_get_lbas(zvol_state_t *zv) 5386423Sgw25295 { 5396423Sgw25295 struct maparg ma; 5406423Sgw25295 zvol_ext_list_t *zl; 5416423Sgw25295 zvol_extent_t *ze; 5426423Sgw25295 uint64_t blocks = 0; 5436423Sgw25295 int err; 5446423Sgw25295 5456423Sgw25295 ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP); 5466423Sgw25295 ma.ma_extent = &ma.ma_list->zl_extents[0]; 5476423Sgw25295 ma.ma_gang = 0; 5486423Sgw25295 zv->zv_list = ma.ma_list; 5496423Sgw25295 5506423Sgw25295 err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma); 5516423Sgw25295 if (err == EINTR && ma.ma_gang) { 5526423Sgw25295 /* 5536423Sgw25295 * We currently don't support dump devices when the pool 5546423Sgw25295 * is so fragmented that our allocation has resulted in 5556423Sgw25295 * gang blocks. 5566423Sgw25295 */ 5576423Sgw25295 zvol_free_extents(zv); 5586423Sgw25295 return (EFRAGS); 5596423Sgw25295 } 5606423Sgw25295 ASSERT3U(err, ==, 0); 5616423Sgw25295 5626423Sgw25295 ze = &zl->zl_extents[0]; 5636423Sgw25295 while (ze) { 5646423Sgw25295 blocks += ze->ze_size; 5656423Sgw25295 if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) { 5666423Sgw25295 zl = zl->zl_next; 5676423Sgw25295 ze = &zl->zl_extents[0]; 5686423Sgw25295 } else { 5696423Sgw25295 ze++; 5706423Sgw25295 } 5716423Sgw25295 } 5726423Sgw25295 if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) { 5736423Sgw25295 zvol_free_extents(zv); 5746423Sgw25295 return (EIO); 5756423Sgw25295 } 5766423Sgw25295 5776423Sgw25295 return (0); 5786423Sgw25295 } 5796423Sgw25295 5806423Sgw25295 /* 5816423Sgw25295 * Create a minor node (plus a whole lot more) for the specified volume. 582789Sahrens */ 583789Sahrens int 5844787Sahrens zvol_create_minor(const char *name, major_t maj) 585789Sahrens { 586789Sahrens zvol_state_t *zv; 587789Sahrens objset_t *os; 5883063Sperrin dmu_object_info_t doi; 589789Sahrens uint64_t volsize; 590789Sahrens minor_t minor = 0; 591789Sahrens struct pathname linkpath; 5926689Smaybee int ds_mode = DS_MODE_OWNER; 593789Sahrens vnode_t *vp = NULL; 594789Sahrens char *devpath; 5956423Sgw25295 size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1; 596789Sahrens char chrbuf[30], blkbuf[30]; 597789Sahrens int error; 598789Sahrens 599789Sahrens mutex_enter(&zvol_state_lock); 600789Sahrens 601789Sahrens if ((zv = zvol_minor_lookup(name)) != NULL) { 602789Sahrens mutex_exit(&zvol_state_lock); 603789Sahrens return (EEXIST); 604789Sahrens } 605789Sahrens 606789Sahrens if (strchr(name, '@') != 0) 607789Sahrens ds_mode |= DS_MODE_READONLY; 608789Sahrens 609789Sahrens error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); 610789Sahrens 611789Sahrens if (error) { 612789Sahrens mutex_exit(&zvol_state_lock); 613789Sahrens return (error); 614789Sahrens } 615789Sahrens 616789Sahrens error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 617789Sahrens 618789Sahrens if (error) { 619789Sahrens dmu_objset_close(os); 620789Sahrens mutex_exit(&zvol_state_lock); 621789Sahrens return (error); 622789Sahrens } 623789Sahrens 624789Sahrens /* 625789Sahrens * If there's an existing /dev/zvol symlink, try to use the 626789Sahrens * same minor number we used last time. 627789Sahrens */ 628789Sahrens devpath = kmem_alloc(devpathlen, KM_SLEEP); 629789Sahrens 6306423Sgw25295 (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name); 631789Sahrens 632789Sahrens error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); 633789Sahrens 634789Sahrens kmem_free(devpath, devpathlen); 635789Sahrens 636789Sahrens if (error == 0 && vp->v_type != VLNK) 637789Sahrens error = EINVAL; 638789Sahrens 639789Sahrens if (error == 0) { 640789Sahrens pn_alloc(&linkpath); 641789Sahrens error = pn_getsymlink(vp, &linkpath, kcred); 642789Sahrens if (error == 0) { 643789Sahrens char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); 644789Sahrens if (ms != NULL) { 645789Sahrens ms += strlen(ZVOL_PSEUDO_DEV); 646789Sahrens minor = stoi(&ms); 647789Sahrens } 648789Sahrens } 649789Sahrens pn_free(&linkpath); 650789Sahrens } 651789Sahrens 652789Sahrens if (vp != NULL) 653789Sahrens VN_RELE(vp); 654789Sahrens 655789Sahrens /* 656789Sahrens * If we found a minor but it's already in use, we must pick a new one. 657789Sahrens */ 658789Sahrens if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) 659789Sahrens minor = 0; 660789Sahrens 661789Sahrens if (minor == 0) 662789Sahrens minor = zvol_minor_alloc(); 663789Sahrens 664789Sahrens if (minor == 0) { 665789Sahrens dmu_objset_close(os); 666789Sahrens mutex_exit(&zvol_state_lock); 667789Sahrens return (ENXIO); 668789Sahrens } 669789Sahrens 670789Sahrens if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { 671789Sahrens dmu_objset_close(os); 672789Sahrens mutex_exit(&zvol_state_lock); 673789Sahrens return (EAGAIN); 674789Sahrens } 675789Sahrens 6762676Seschrock (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, 6772676Seschrock (char *)name); 678789Sahrens 679789Sahrens (void) sprintf(chrbuf, "%uc,raw", minor); 680789Sahrens 681789Sahrens if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, 682789Sahrens minor, DDI_PSEUDO, 0) == DDI_FAILURE) { 683789Sahrens ddi_soft_state_free(zvol_state, minor); 684789Sahrens dmu_objset_close(os); 685789Sahrens mutex_exit(&zvol_state_lock); 686789Sahrens return (EAGAIN); 687789Sahrens } 688789Sahrens 689789Sahrens (void) sprintf(blkbuf, "%uc", minor); 690789Sahrens 691789Sahrens if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, 692789Sahrens minor, DDI_PSEUDO, 0) == DDI_FAILURE) { 693789Sahrens ddi_remove_minor_node(zfs_dip, chrbuf); 694789Sahrens ddi_soft_state_free(zvol_state, minor); 695789Sahrens dmu_objset_close(os); 696789Sahrens mutex_exit(&zvol_state_lock); 697789Sahrens return (EAGAIN); 698789Sahrens } 699789Sahrens 700789Sahrens zv = ddi_get_soft_state(zvol_state, minor); 701789Sahrens 702789Sahrens (void) strcpy(zv->zv_name, name); 703789Sahrens zv->zv_min_bs = DEV_BSHIFT; 704789Sahrens zv->zv_minor = minor; 705789Sahrens zv->zv_volsize = volsize; 706789Sahrens zv->zv_objset = os; 707789Sahrens zv->zv_mode = ds_mode; 7083063Sperrin zv->zv_zilog = zil_open(os, zvol_get_data); 7093755Sperrin mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); 7103755Sperrin avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, 7113755Sperrin sizeof (rl_t), offsetof(rl_t, r_node)); 7123063Sperrin /* get and cache the blocksize */ 7133063Sperrin error = dmu_object_info(os, ZVOL_OBJ, &doi); 7143063Sperrin ASSERT(error == 0); 7153063Sperrin zv->zv_volblocksize = doi.doi_data_block_size; 7161141Sperrin 7173461Sahrens zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector); 7184787Sahrens zvol_size_changed(zv, maj); 719789Sahrens 7201544Seschrock /* XXX this should handle the possible i/o error */ 721789Sahrens VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), 722789Sahrens "readonly", zvol_readonly_changed_cb, zv) == 0); 723789Sahrens 724789Sahrens zvol_minors++; 725789Sahrens 726789Sahrens mutex_exit(&zvol_state_lock); 727789Sahrens 728789Sahrens return (0); 729789Sahrens } 730789Sahrens 731789Sahrens /* 732789Sahrens * Remove minor node for the specified volume. 733789Sahrens */ 734789Sahrens int 7352676Seschrock zvol_remove_minor(const char *name) 736789Sahrens { 737789Sahrens zvol_state_t *zv; 738789Sahrens char namebuf[30]; 739789Sahrens 740789Sahrens mutex_enter(&zvol_state_lock); 741789Sahrens 7422676Seschrock if ((zv = zvol_minor_lookup(name)) == NULL) { 743789Sahrens mutex_exit(&zvol_state_lock); 744789Sahrens return (ENXIO); 745789Sahrens } 746789Sahrens 747789Sahrens if (zv->zv_total_opens != 0) { 748789Sahrens mutex_exit(&zvol_state_lock); 749789Sahrens return (EBUSY); 750789Sahrens } 751789Sahrens 752789Sahrens (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); 753789Sahrens ddi_remove_minor_node(zfs_dip, namebuf); 754789Sahrens 755789Sahrens (void) sprintf(namebuf, "%uc", zv->zv_minor); 756789Sahrens ddi_remove_minor_node(zfs_dip, namebuf); 757789Sahrens 758789Sahrens VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), 759789Sahrens "readonly", zvol_readonly_changed_cb, zv) == 0); 760789Sahrens 7611141Sperrin zil_close(zv->zv_zilog); 7621141Sperrin zv->zv_zilog = NULL; 763789Sahrens dmu_objset_close(zv->zv_objset); 764789Sahrens zv->zv_objset = NULL; 7653755Sperrin avl_destroy(&zv->zv_znode.z_range_avl); 7663755Sperrin mutex_destroy(&zv->zv_znode.z_range_lock); 767789Sahrens 768789Sahrens ddi_soft_state_free(zvol_state, zv->zv_minor); 769789Sahrens 770789Sahrens zvol_minors--; 771789Sahrens 772789Sahrens mutex_exit(&zvol_state_lock); 773789Sahrens 774789Sahrens return (0); 775789Sahrens } 776789Sahrens 7776423Sgw25295 int 7786423Sgw25295 zvol_prealloc(zvol_state_t *zv) 7796423Sgw25295 { 7806423Sgw25295 objset_t *os = zv->zv_objset; 7816423Sgw25295 dmu_tx_t *tx; 7826423Sgw25295 void *data; 7836423Sgw25295 uint64_t refd, avail, usedobjs, availobjs; 7846423Sgw25295 uint64_t resid = zv->zv_volsize; 7856423Sgw25295 uint64_t off = 0; 7866423Sgw25295 7876423Sgw25295 /* Check the space usage before attempting to allocate the space */ 7886423Sgw25295 dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); 7896423Sgw25295 if (avail < zv->zv_volsize) 7906423Sgw25295 return (ENOSPC); 7916423Sgw25295 7926423Sgw25295 /* Free old extents if they exist */ 7936423Sgw25295 zvol_free_extents(zv); 7946423Sgw25295 7956423Sgw25295 /* allocate the blocks by writing each one */ 7966423Sgw25295 data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP); 7976423Sgw25295 7986423Sgw25295 while (resid != 0) { 7996423Sgw25295 int error; 8006423Sgw25295 uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE); 8016423Sgw25295 8026423Sgw25295 tx = dmu_tx_create(os); 8036423Sgw25295 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); 8046423Sgw25295 error = dmu_tx_assign(tx, TXG_WAIT); 8056423Sgw25295 if (error) { 8066423Sgw25295 dmu_tx_abort(tx); 8076423Sgw25295 kmem_free(data, SPA_MAXBLOCKSIZE); 808*6992Smaybee (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); 8096423Sgw25295 return (error); 8106423Sgw25295 } 8116423Sgw25295 dmu_write(os, ZVOL_OBJ, off, bytes, data, tx); 8126423Sgw25295 dmu_tx_commit(tx); 8136423Sgw25295 off += bytes; 8146423Sgw25295 resid -= bytes; 8156423Sgw25295 } 8166423Sgw25295 kmem_free(data, SPA_MAXBLOCKSIZE); 8176423Sgw25295 txg_wait_synced(dmu_objset_pool(os), 0); 8186423Sgw25295 8196423Sgw25295 return (0); 8206423Sgw25295 } 8216423Sgw25295 8226423Sgw25295 int 8236423Sgw25295 zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) 8246423Sgw25295 { 8256423Sgw25295 dmu_tx_t *tx; 8266423Sgw25295 int error; 8276423Sgw25295 8286423Sgw25295 ASSERT(MUTEX_HELD(&zvol_state_lock)); 8296423Sgw25295 8306423Sgw25295 tx = dmu_tx_create(zv->zv_objset); 8316423Sgw25295 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); 8326423Sgw25295 error = dmu_tx_assign(tx, TXG_WAIT); 8336423Sgw25295 if (error) { 8346423Sgw25295 dmu_tx_abort(tx); 8356423Sgw25295 return (error); 8366423Sgw25295 } 8376423Sgw25295 8386423Sgw25295 error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, 8396423Sgw25295 &volsize, tx); 8406423Sgw25295 dmu_tx_commit(tx); 8416423Sgw25295 8426423Sgw25295 if (error == 0) 843*6992Smaybee error = dmu_free_long_range(zv->zv_objset, 844*6992Smaybee ZVOL_OBJ, volsize, DMU_OBJECT_END); 8456423Sgw25295 8466423Sgw25295 if (error == 0) { 8476423Sgw25295 zv->zv_volsize = volsize; 8486423Sgw25295 zvol_size_changed(zv, maj); 8496423Sgw25295 } 8506423Sgw25295 return (error); 8516423Sgw25295 } 8526423Sgw25295 853789Sahrens int 8544787Sahrens zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) 855789Sahrens { 856789Sahrens zvol_state_t *zv; 857789Sahrens int error; 8581133Seschrock dmu_object_info_t doi; 8596423Sgw25295 uint64_t old_volsize = 0ULL; 860789Sahrens 861789Sahrens mutex_enter(&zvol_state_lock); 862789Sahrens 8632676Seschrock if ((zv = zvol_minor_lookup(name)) == NULL) { 864789Sahrens mutex_exit(&zvol_state_lock); 865789Sahrens return (ENXIO); 866789Sahrens } 8676423Sgw25295 old_volsize = zv->zv_volsize; 868789Sahrens 8691133Seschrock if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || 8702676Seschrock (error = zvol_check_volsize(volsize, 8712676Seschrock doi.doi_data_block_size)) != 0) { 8721133Seschrock mutex_exit(&zvol_state_lock); 8731133Seschrock return (error); 8741133Seschrock } 8751133Seschrock 8766423Sgw25295 if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { 877789Sahrens mutex_exit(&zvol_state_lock); 878789Sahrens return (EROFS); 879789Sahrens } 880789Sahrens 8816423Sgw25295 error = zvol_update_volsize(zv, maj, volsize); 882789Sahrens 8836423Sgw25295 /* 8846423Sgw25295 * Reinitialize the dump area to the new size. If we 8856423Sgw25295 * failed to resize the dump area then restore the it back to 8866423Sgw25295 * it's original size. 8876423Sgw25295 */ 8886423Sgw25295 if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) { 8896423Sgw25295 if ((error = zvol_dumpify(zv)) != 0 || 8906423Sgw25295 (error = dumpvp_resize()) != 0) { 8916423Sgw25295 (void) zvol_update_volsize(zv, maj, old_volsize); 8926423Sgw25295 error = zvol_dumpify(zv); 8936423Sgw25295 } 894789Sahrens } 895789Sahrens 896789Sahrens mutex_exit(&zvol_state_lock); 897789Sahrens 898789Sahrens return (error); 899789Sahrens } 900789Sahrens 901789Sahrens int 9022676Seschrock zvol_set_volblocksize(const char *name, uint64_t volblocksize) 903789Sahrens { 904789Sahrens zvol_state_t *zv; 905789Sahrens dmu_tx_t *tx; 906789Sahrens int error; 907789Sahrens 908789Sahrens mutex_enter(&zvol_state_lock); 909789Sahrens 9102676Seschrock if ((zv = zvol_minor_lookup(name)) == NULL) { 911789Sahrens mutex_exit(&zvol_state_lock); 912789Sahrens return (ENXIO); 913789Sahrens } 9146423Sgw25295 if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { 915789Sahrens mutex_exit(&zvol_state_lock); 916789Sahrens return (EROFS); 917789Sahrens } 918789Sahrens 919789Sahrens tx = dmu_tx_create(zv->zv_objset); 920789Sahrens dmu_tx_hold_bonus(tx, ZVOL_OBJ); 921789Sahrens error = dmu_tx_assign(tx, TXG_WAIT); 922789Sahrens if (error) { 923789Sahrens dmu_tx_abort(tx); 924789Sahrens } else { 925789Sahrens error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, 9262676Seschrock volblocksize, 0, tx); 927789Sahrens if (error == ENOTSUP) 928789Sahrens error = EBUSY; 929789Sahrens dmu_tx_commit(tx); 930789Sahrens } 931789Sahrens 932789Sahrens mutex_exit(&zvol_state_lock); 933789Sahrens 934789Sahrens return (error); 935789Sahrens } 936789Sahrens 937789Sahrens /*ARGSUSED*/ 938789Sahrens int 939789Sahrens zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) 940789Sahrens { 941789Sahrens minor_t minor = getminor(*devp); 942789Sahrens zvol_state_t *zv; 943789Sahrens 944789Sahrens if (minor == 0) /* This is the control device */ 945789Sahrens return (0); 946789Sahrens 947789Sahrens mutex_enter(&zvol_state_lock); 948789Sahrens 949789Sahrens zv = ddi_get_soft_state(zvol_state, minor); 950789Sahrens if (zv == NULL) { 951789Sahrens mutex_exit(&zvol_state_lock); 952789Sahrens return (ENXIO); 953789Sahrens } 954789Sahrens 955789Sahrens ASSERT(zv->zv_objset != NULL); 956789Sahrens 957789Sahrens if ((flag & FWRITE) && 9586423Sgw25295 (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) { 959789Sahrens mutex_exit(&zvol_state_lock); 960789Sahrens return (EROFS); 961789Sahrens } 962789Sahrens 963789Sahrens if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { 964789Sahrens zv->zv_open_count[otyp]++; 965789Sahrens zv->zv_total_opens++; 966789Sahrens } 967789Sahrens 968789Sahrens mutex_exit(&zvol_state_lock); 969789Sahrens 970789Sahrens return (0); 971789Sahrens } 972789Sahrens 973789Sahrens /*ARGSUSED*/ 974789Sahrens int 975789Sahrens zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) 976789Sahrens { 977789Sahrens minor_t minor = getminor(dev); 978789Sahrens zvol_state_t *zv; 979789Sahrens 980789Sahrens if (minor == 0) /* This is the control device */ 981789Sahrens return (0); 982789Sahrens 983789Sahrens mutex_enter(&zvol_state_lock); 984789Sahrens 985789Sahrens zv = ddi_get_soft_state(zvol_state, minor); 986789Sahrens if (zv == NULL) { 987789Sahrens mutex_exit(&zvol_state_lock); 988789Sahrens return (ENXIO); 989789Sahrens } 990789Sahrens 991789Sahrens /* 992789Sahrens * The next statement is a workaround for the following DDI bug: 993789Sahrens * 6343604 specfs race: multiple "last-close" of the same device 994789Sahrens */ 995789Sahrens if (zv->zv_total_opens == 0) { 996789Sahrens mutex_exit(&zvol_state_lock); 997789Sahrens return (0); 998789Sahrens } 999789Sahrens 1000789Sahrens /* 1001789Sahrens * If the open count is zero, this is a spurious close. 1002789Sahrens * That indicates a bug in the kernel / DDI framework. 1003789Sahrens */ 1004789Sahrens ASSERT(zv->zv_open_count[otyp] != 0); 1005789Sahrens ASSERT(zv->zv_total_opens != 0); 1006789Sahrens 1007789Sahrens /* 1008789Sahrens * You may get multiple opens, but only one close. 1009789Sahrens */ 1010789Sahrens zv->zv_open_count[otyp]--; 1011789Sahrens zv->zv_total_opens--; 1012789Sahrens 1013789Sahrens mutex_exit(&zvol_state_lock); 1014789Sahrens 1015789Sahrens return (0); 1016789Sahrens } 1017789Sahrens 10183638Sbillm static void 10193063Sperrin zvol_get_done(dmu_buf_t *db, void *vzgd) 10203063Sperrin { 10213063Sperrin zgd_t *zgd = (zgd_t *)vzgd; 10223755Sperrin rl_t *rl = zgd->zgd_rl; 10233063Sperrin 10243063Sperrin dmu_buf_rele(db, vzgd); 10253755Sperrin zfs_range_unlock(rl); 10265688Sbonwick zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 10273063Sperrin kmem_free(zgd, sizeof (zgd_t)); 10283063Sperrin } 10293063Sperrin 10303063Sperrin /* 10313063Sperrin * Get data to generate a TX_WRITE intent log record. 10323063Sperrin */ 10333638Sbillm static int 10343063Sperrin zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 10353063Sperrin { 10363063Sperrin zvol_state_t *zv = arg; 10373063Sperrin objset_t *os = zv->zv_objset; 10383063Sperrin dmu_buf_t *db; 10393755Sperrin rl_t *rl; 10403063Sperrin zgd_t *zgd; 10413755Sperrin uint64_t boff; /* block starting offset */ 10423755Sperrin int dlen = lr->lr_length; /* length of user data */ 10433063Sperrin int error; 10443063Sperrin 10453063Sperrin ASSERT(zio); 10463755Sperrin ASSERT(dlen != 0); 10473638Sbillm 10483755Sperrin /* 10493755Sperrin * Write records come in two flavors: immediate and indirect. 10503755Sperrin * For small writes it's cheaper to store the data with the 10513755Sperrin * log record (immediate); for large writes it's cheaper to 10523755Sperrin * sync the data and get a pointer to it (indirect) so that 10533755Sperrin * we don't have to write the data twice. 10543755Sperrin */ 10553755Sperrin if (buf != NULL) /* immediate write */ 10563755Sperrin return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); 10573063Sperrin 10583063Sperrin zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 10593063Sperrin zgd->zgd_zilog = zv->zv_zilog; 10603063Sperrin zgd->zgd_bp = &lr->lr_blkptr; 10613063Sperrin 10623755Sperrin /* 10633755Sperrin * Lock the range of the block to ensure that when the data is 10646423Sgw25295 * written out and its checksum is being calculated that no other 10653755Sperrin * thread can change the block. 10663755Sperrin */ 10673755Sperrin boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); 10683755Sperrin rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, 10693755Sperrin RL_READER); 10703755Sperrin zgd->zgd_rl = rl; 10713755Sperrin 10723063Sperrin VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); 10733063Sperrin error = dmu_sync(zio, db, &lr->lr_blkptr, 10743063Sperrin lr->lr_common.lrc_txg, zvol_get_done, zgd); 10753638Sbillm if (error == 0) 10765688Sbonwick zil_add_block(zv->zv_zilog, &lr->lr_blkptr); 10773063Sperrin /* 10783063Sperrin * If we get EINPROGRESS, then we need to wait for a 10793063Sperrin * write IO initiated by dmu_sync() to complete before 10803063Sperrin * we can release this dbuf. We will finish everything 10813063Sperrin * up in the zvol_get_done() callback. 10823063Sperrin */ 10833063Sperrin if (error == EINPROGRESS) 10843063Sperrin return (0); 10853063Sperrin dmu_buf_rele(db, zgd); 10863755Sperrin zfs_range_unlock(rl); 10873063Sperrin kmem_free(zgd, sizeof (zgd_t)); 10883063Sperrin return (error); 10893063Sperrin } 10903063Sperrin 10911861Sperrin /* 10921861Sperrin * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. 10931141Sperrin * 10941141Sperrin * We store data in the log buffers if it's small enough. 10953063Sperrin * Otherwise we will later flush the data out via dmu_sync(). 10961141Sperrin */ 10973063Sperrin ssize_t zvol_immediate_write_sz = 32768; 10981141Sperrin 10993638Sbillm static void 11003638Sbillm zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) 11011141Sperrin { 11023638Sbillm uint32_t blocksize = zv->zv_volblocksize; 11031141Sperrin lr_write_t *lr; 11041861Sperrin 11051861Sperrin while (len) { 11063638Sbillm ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); 11073638Sbillm itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 11083638Sbillm 11093638Sbillm itx->itx_wr_state = 11103638Sbillm len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; 11113638Sbillm itx->itx_private = zv; 11123638Sbillm lr = (lr_write_t *)&itx->itx_lr; 11133638Sbillm lr->lr_foid = ZVOL_OBJ; 11143638Sbillm lr->lr_offset = off; 11153638Sbillm lr->lr_length = nbytes; 11163638Sbillm lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); 11173638Sbillm BP_ZERO(&lr->lr_blkptr); 11183638Sbillm 11193638Sbillm (void) zil_itx_assign(zv->zv_zilog, itx, tx); 11201861Sperrin len -= nbytes; 11211861Sperrin off += nbytes; 11221141Sperrin } 11231141Sperrin } 11241141Sperrin 1125789Sahrens int 11266423Sgw25295 zvol_dumpio(vdev_t *vd, uint64_t size, uint64_t offset, void *addr, 11276423Sgw25295 int bflags, int isdump) 11286423Sgw25295 { 11296423Sgw25295 vdev_disk_t *dvd; 11306423Sgw25295 int direction; 11316423Sgw25295 int c; 11326423Sgw25295 int numerrors = 0; 11336423Sgw25295 11346423Sgw25295 for (c = 0; c < vd->vdev_children; c++) { 11356423Sgw25295 if (zvol_dumpio(vd->vdev_child[c], size, offset, 11366423Sgw25295 addr, bflags, isdump) != 0) { 11376423Sgw25295 numerrors++; 11386423Sgw25295 } else if (bflags & B_READ) { 11396423Sgw25295 break; 11406423Sgw25295 } 11416423Sgw25295 } 11426423Sgw25295 11436423Sgw25295 if (!vd->vdev_ops->vdev_op_leaf) 11446423Sgw25295 return (numerrors < vd->vdev_children ? 0 : EIO); 11456423Sgw25295 11466423Sgw25295 if (!vdev_writeable(vd)) 11476423Sgw25295 return (EIO); 11486423Sgw25295 11496423Sgw25295 dvd = vd->vdev_tsd; 11506423Sgw25295 ASSERT3P(dvd, !=, NULL); 11516423Sgw25295 direction = bflags & (B_WRITE | B_READ); 11526423Sgw25295 ASSERT(ISP2(direction)); 11536423Sgw25295 offset += VDEV_LABEL_START_SIZE; 11546423Sgw25295 11556423Sgw25295 if (ddi_in_panic() || isdump) { 11566423Sgw25295 if (direction & B_READ) 11576423Sgw25295 return (EIO); 11586423Sgw25295 return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), 11596423Sgw25295 lbtodb(size))); 11606423Sgw25295 } else { 11616423Sgw25295 return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, 11626423Sgw25295 direction)); 11636423Sgw25295 } 11646423Sgw25295 } 11656423Sgw25295 11666423Sgw25295 int 11676423Sgw25295 zvol_physio(zvol_state_t *zv, int bflags, uint64_t off, 11686423Sgw25295 uint64_t size, void *addr, int isdump) 11696423Sgw25295 { 11706423Sgw25295 dva_t dva; 11716423Sgw25295 vdev_t *vd; 11726423Sgw25295 int error; 11736423Sgw25295 spa_t *spa = dmu_objset_spa(zv->zv_objset); 11746423Sgw25295 11756423Sgw25295 ASSERT(size <= zv->zv_volblocksize); 11766423Sgw25295 11776423Sgw25295 /* restrict requests to multiples of the system block size */ 11786423Sgw25295 if (P2PHASE(off, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE)) 11796423Sgw25295 return (EINVAL); 11806423Sgw25295 11816423Sgw25295 if (zvol_get_dva(zv, off, &dva) != 0) 11826423Sgw25295 return (EIO); 11836423Sgw25295 11846423Sgw25295 spa_config_enter(spa, RW_READER, FTAG); 11856423Sgw25295 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); 11866423Sgw25295 11876423Sgw25295 error = zvol_dumpio(vd, size, 11886423Sgw25295 DVA_GET_OFFSET(&dva) + (off % zv->zv_volblocksize), 11896423Sgw25295 addr, bflags & (B_READ | B_WRITE | B_PHYS), isdump); 11906423Sgw25295 11916423Sgw25295 spa_config_exit(spa, FTAG); 11926423Sgw25295 return (error); 11936423Sgw25295 } 11946423Sgw25295 11956423Sgw25295 int 1196789Sahrens zvol_strategy(buf_t *bp) 1197789Sahrens { 1198789Sahrens zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); 1199789Sahrens uint64_t off, volsize; 1200789Sahrens size_t size, resid; 1201789Sahrens char *addr; 12021141Sperrin objset_t *os; 12033755Sperrin rl_t *rl; 1204789Sahrens int error = 0; 12056423Sgw25295 boolean_t reading, is_dump = zv->zv_flags & ZVOL_DUMPIFIED; 1206789Sahrens 1207789Sahrens if (zv == NULL) { 1208789Sahrens bioerror(bp, ENXIO); 1209789Sahrens biodone(bp); 1210789Sahrens return (0); 1211789Sahrens } 1212789Sahrens 1213789Sahrens if (getminor(bp->b_edev) == 0) { 1214789Sahrens bioerror(bp, EINVAL); 1215789Sahrens biodone(bp); 1216789Sahrens return (0); 1217789Sahrens } 1218789Sahrens 12196423Sgw25295 if (!(bp->b_flags & B_READ) && 12206423Sgw25295 (zv->zv_flags & ZVOL_RDONLY || 12216423Sgw25295 zv->zv_mode & DS_MODE_READONLY)) { 1222789Sahrens bioerror(bp, EROFS); 1223789Sahrens biodone(bp); 1224789Sahrens return (0); 1225789Sahrens } 1226789Sahrens 1227789Sahrens off = ldbtob(bp->b_blkno); 1228789Sahrens volsize = zv->zv_volsize; 1229789Sahrens 12301141Sperrin os = zv->zv_objset; 12311141Sperrin ASSERT(os != NULL); 1232789Sahrens 1233789Sahrens bp_mapin(bp); 1234789Sahrens addr = bp->b_un.b_addr; 1235789Sahrens resid = bp->b_bcount; 1236789Sahrens 12371861Sperrin /* 12381861Sperrin * There must be no buffer changes when doing a dmu_sync() because 12391861Sperrin * we can't change the data whilst calculating the checksum. 12401861Sperrin */ 12411861Sperrin reading = bp->b_flags & B_READ; 12423755Sperrin rl = zfs_range_lock(&zv->zv_znode, off, resid, 12433755Sperrin reading ? RL_READER : RL_WRITER); 12441861Sperrin 12456423Sgw25295 if (resid > volsize - off) /* don't write past the end */ 12466423Sgw25295 resid = volsize - off; 12476423Sgw25295 1248789Sahrens while (resid != 0 && off < volsize) { 1249789Sahrens 12506423Sgw25295 size = MIN(resid, zvol_maxphys); 12516423Sgw25295 if (is_dump) { 12526423Sgw25295 /* can't straddle a block boundary */ 12536423Sgw25295 size = MIN(size, P2END(off, zv->zv_volblocksize) - off); 12546423Sgw25295 error = zvol_physio(zv, bp->b_flags, off, size, 12556423Sgw25295 addr, 0); 12566423Sgw25295 } else if (reading) { 12571861Sperrin error = dmu_read(os, ZVOL_OBJ, off, size, addr); 1258789Sahrens } else { 12591141Sperrin dmu_tx_t *tx = dmu_tx_create(os); 1260789Sahrens dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); 1261789Sahrens error = dmu_tx_assign(tx, TXG_WAIT); 1262789Sahrens if (error) { 1263789Sahrens dmu_tx_abort(tx); 1264789Sahrens } else { 12651141Sperrin dmu_write(os, ZVOL_OBJ, off, size, addr, tx); 12663638Sbillm zvol_log_write(zv, tx, off, size); 1267789Sahrens dmu_tx_commit(tx); 1268789Sahrens } 1269789Sahrens } 1270789Sahrens if (error) 1271789Sahrens break; 1272789Sahrens off += size; 1273789Sahrens addr += size; 1274789Sahrens resid -= size; 1275789Sahrens } 12763755Sperrin zfs_range_unlock(rl); 1277789Sahrens 1278789Sahrens if ((bp->b_resid = resid) == bp->b_bcount) 1279789Sahrens bioerror(bp, off > volsize ? EINVAL : error); 1280789Sahrens 12816423Sgw25295 if (!(bp->b_flags & B_ASYNC) && !reading && !zil_disable && !is_dump) 12823638Sbillm zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); 12833638Sbillm biodone(bp); 12841141Sperrin 1285789Sahrens return (0); 1286789Sahrens } 1287789Sahrens 12883063Sperrin /* 12893063Sperrin * Set the buffer count to the zvol maximum transfer. 12903063Sperrin * Using our own routine instead of the default minphys() 12913063Sperrin * means that for larger writes we write bigger buffers on X86 12923063Sperrin * (128K instead of 56K) and flush the disk write cache less often 12933063Sperrin * (every zvol_maxphys - currently 1MB) instead of minphys (currently 12943063Sperrin * 56K on X86 and 128K on sparc). 12953063Sperrin */ 12963063Sperrin void 12973063Sperrin zvol_minphys(struct buf *bp) 12983063Sperrin { 12993063Sperrin if (bp->b_bcount > zvol_maxphys) 13003063Sperrin bp->b_bcount = zvol_maxphys; 13013063Sperrin } 13023063Sperrin 13036423Sgw25295 int 13046423Sgw25295 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) 13056423Sgw25295 { 13066423Sgw25295 minor_t minor = getminor(dev); 13076423Sgw25295 zvol_state_t *zv; 13086423Sgw25295 int error = 0; 13096423Sgw25295 uint64_t size; 13106423Sgw25295 uint64_t boff; 13116423Sgw25295 uint64_t resid; 13126423Sgw25295 13136423Sgw25295 if (minor == 0) /* This is the control device */ 13146423Sgw25295 return (ENXIO); 13156423Sgw25295 13166423Sgw25295 zv = ddi_get_soft_state(zvol_state, minor); 13176423Sgw25295 if (zv == NULL) 13186423Sgw25295 return (ENXIO); 13196423Sgw25295 13206423Sgw25295 boff = ldbtob(blkno); 13216423Sgw25295 resid = ldbtob(nblocks); 13226423Sgw25295 if (boff + resid > zv->zv_volsize) { 13236423Sgw25295 /* dump should know better than to write here */ 13246423Sgw25295 ASSERT(blkno + resid <= zv->zv_volsize); 13256423Sgw25295 return (EIO); 13266423Sgw25295 } 13276423Sgw25295 while (resid) { 13286423Sgw25295 /* can't straddle a block boundary */ 13296423Sgw25295 size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); 13306423Sgw25295 13316423Sgw25295 error = zvol_physio(zv, B_WRITE, boff, size, addr, 1); 13326423Sgw25295 if (error) 13336423Sgw25295 break; 13346423Sgw25295 boff += size; 13356423Sgw25295 addr += size; 13366423Sgw25295 resid -= size; 13376423Sgw25295 } 13386423Sgw25295 13396423Sgw25295 return (error); 13406423Sgw25295 } 13416423Sgw25295 1342789Sahrens /*ARGSUSED*/ 1343789Sahrens int 13443638Sbillm zvol_read(dev_t dev, uio_t *uio, cred_t *cr) 1345789Sahrens { 13464107Sgw25295 minor_t minor = getminor(dev); 13474107Sgw25295 zvol_state_t *zv; 13483755Sperrin rl_t *rl; 13493638Sbillm int error = 0; 13503638Sbillm 13514107Sgw25295 if (minor == 0) /* This is the control device */ 13524107Sgw25295 return (ENXIO); 13534107Sgw25295 13544107Sgw25295 zv = ddi_get_soft_state(zvol_state, minor); 13554107Sgw25295 if (zv == NULL) 13564107Sgw25295 return (ENXIO); 13574107Sgw25295 13583755Sperrin rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, 13593755Sperrin RL_READER); 13603638Sbillm while (uio->uio_resid > 0) { 13613638Sbillm uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); 13623638Sbillm 13633638Sbillm error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); 13643638Sbillm if (error) 13653638Sbillm break; 13663638Sbillm } 13673755Sperrin zfs_range_unlock(rl); 13683638Sbillm return (error); 1369789Sahrens } 1370789Sahrens 1371789Sahrens /*ARGSUSED*/ 1372789Sahrens int 13733638Sbillm zvol_write(dev_t dev, uio_t *uio, cred_t *cr) 1374789Sahrens { 13754107Sgw25295 minor_t minor = getminor(dev); 13764107Sgw25295 zvol_state_t *zv; 13773755Sperrin rl_t *rl; 13783638Sbillm int error = 0; 13793638Sbillm 13804107Sgw25295 if (minor == 0) /* This is the control device */ 13814107Sgw25295 return (ENXIO); 13824107Sgw25295 13834107Sgw25295 zv = ddi_get_soft_state(zvol_state, minor); 13844107Sgw25295 if (zv == NULL) 13854107Sgw25295 return (ENXIO); 13864107Sgw25295 13876423Sgw25295 if (zv->zv_flags & ZVOL_DUMPIFIED) { 13886423Sgw25295 error = physio(zvol_strategy, NULL, dev, B_WRITE, 13896423Sgw25295 zvol_minphys, uio); 13906423Sgw25295 return (error); 13916423Sgw25295 } 13926423Sgw25295 13933755Sperrin rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, 13943755Sperrin RL_WRITER); 13953638Sbillm while (uio->uio_resid > 0) { 13963638Sbillm uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); 13973638Sbillm uint64_t off = uio->uio_loffset; 1398789Sahrens 13993638Sbillm dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 14003638Sbillm dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); 14013638Sbillm error = dmu_tx_assign(tx, TXG_WAIT); 14023638Sbillm if (error) { 14033638Sbillm dmu_tx_abort(tx); 14043638Sbillm break; 14053638Sbillm } 14063638Sbillm error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); 14073638Sbillm if (error == 0) 14083638Sbillm zvol_log_write(zv, tx, off, bytes); 14093638Sbillm dmu_tx_commit(tx); 14103638Sbillm 14113638Sbillm if (error) 14123638Sbillm break; 14133638Sbillm } 14143755Sperrin zfs_range_unlock(rl); 14153638Sbillm return (error); 1416789Sahrens } 1417789Sahrens 1418789Sahrens /* 1419789Sahrens * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). 1420789Sahrens */ 1421789Sahrens /*ARGSUSED*/ 1422789Sahrens int 1423789Sahrens zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) 1424789Sahrens { 1425789Sahrens zvol_state_t *zv; 14263897Smaybee struct dk_cinfo dki; 1427789Sahrens struct dk_minfo dkm; 1428789Sahrens dk_efi_t efi; 14293897Smaybee struct dk_callback *dkc; 1430789Sahrens struct uuid uuid = EFI_RESERVED; 1431789Sahrens uint32_t crc; 1432789Sahrens int error = 0; 14336423Sgw25295 rl_t *rl; 1434789Sahrens 1435789Sahrens mutex_enter(&zvol_state_lock); 1436789Sahrens 1437789Sahrens zv = ddi_get_soft_state(zvol_state, getminor(dev)); 1438789Sahrens 1439789Sahrens if (zv == NULL) { 1440789Sahrens mutex_exit(&zvol_state_lock); 1441789Sahrens return (ENXIO); 1442789Sahrens } 1443789Sahrens 1444789Sahrens switch (cmd) { 1445789Sahrens 1446789Sahrens case DKIOCINFO: 14473897Smaybee bzero(&dki, sizeof (dki)); 14483897Smaybee (void) strcpy(dki.dki_cname, "zvol"); 14493897Smaybee (void) strcpy(dki.dki_dname, "zvol"); 14503897Smaybee dki.dki_ctype = DKC_UNKNOWN; 14513897Smaybee dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs); 1452789Sahrens mutex_exit(&zvol_state_lock); 14533897Smaybee if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) 1454789Sahrens error = EFAULT; 1455789Sahrens return (error); 1456789Sahrens 1457789Sahrens case DKIOCGMEDIAINFO: 1458789Sahrens bzero(&dkm, sizeof (dkm)); 1459789Sahrens dkm.dki_lbsize = 1U << zv->zv_min_bs; 1460789Sahrens dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; 1461789Sahrens dkm.dki_media_type = DK_UNKNOWN; 1462789Sahrens mutex_exit(&zvol_state_lock); 1463789Sahrens if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) 1464789Sahrens error = EFAULT; 1465789Sahrens return (error); 1466789Sahrens 1467789Sahrens case DKIOCGETEFI: 1468789Sahrens if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) { 1469789Sahrens mutex_exit(&zvol_state_lock); 1470789Sahrens return (EFAULT); 1471789Sahrens } 1472789Sahrens efi.dki_data = (void *)(uintptr_t)efi.dki_data_64; 1473789Sahrens 14743016Smaybee /* 14753016Smaybee * Some clients may attempt to request a PMBR for the 14763016Smaybee * zvol. Currently this interface will return ENOTTY to 14773016Smaybee * such requests. These requests could be supported by 14783016Smaybee * adding a check for lba == 0 and consing up an appropriate 14796423Sgw25295 * PMBR. 14803016Smaybee */ 14813016Smaybee if (efi.dki_lba == 1) { 14823016Smaybee efi_gpt_t gpt; 14833016Smaybee efi_gpe_t gpe; 14843016Smaybee 14853016Smaybee bzero(&gpt, sizeof (gpt)); 14863016Smaybee bzero(&gpe, sizeof (gpe)); 1487789Sahrens 14883016Smaybee if (efi.dki_length < sizeof (gpt)) { 14893016Smaybee mutex_exit(&zvol_state_lock); 14903016Smaybee return (EINVAL); 14913016Smaybee } 14923016Smaybee 14933016Smaybee gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); 14943016Smaybee gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 14953016Smaybee gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); 14963016Smaybee gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); 14973016Smaybee gpt.efi_gpt_LastUsableLBA = 14983016Smaybee LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); 14993016Smaybee gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); 15003080Smaybee gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); 15013016Smaybee gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe)); 1502789Sahrens 15033016Smaybee UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); 15043016Smaybee gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA; 15053016Smaybee gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA; 15063016Smaybee 15073016Smaybee CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); 15083016Smaybee gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 15093016Smaybee 15103016Smaybee CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); 15113016Smaybee gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); 15123016Smaybee 15133016Smaybee mutex_exit(&zvol_state_lock); 15143016Smaybee if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag)) 15153016Smaybee error = EFAULT; 15163016Smaybee } else if (efi.dki_lba == 2) { 15173016Smaybee efi_gpe_t gpe; 15183016Smaybee 15193016Smaybee bzero(&gpe, sizeof (gpe)); 1520789Sahrens 15213016Smaybee if (efi.dki_length < sizeof (gpe)) { 15223016Smaybee mutex_exit(&zvol_state_lock); 15233016Smaybee return (EINVAL); 15243016Smaybee } 1525789Sahrens 15263016Smaybee UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); 15273016Smaybee gpe.efi_gpe_StartingLBA = LE_64(34ULL); 15283016Smaybee gpe.efi_gpe_EndingLBA = 15293016Smaybee LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); 1530789Sahrens 15313016Smaybee mutex_exit(&zvol_state_lock); 15323016Smaybee if (ddi_copyout(&gpe, efi.dki_data, sizeof (gpe), flag)) 15333016Smaybee error = EFAULT; 15343016Smaybee } else { 15353016Smaybee mutex_exit(&zvol_state_lock); 15363016Smaybee error = EINVAL; 15373016Smaybee } 1538789Sahrens return (error); 1539789Sahrens 15403638Sbillm case DKIOCFLUSHWRITECACHE: 15413897Smaybee dkc = (struct dk_callback *)arg; 15423638Sbillm zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); 15433897Smaybee if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { 15443897Smaybee (*dkc->dkc_callback)(dkc->dkc_cookie, error); 15453897Smaybee error = 0; 15463897Smaybee } 15473638Sbillm break; 15483638Sbillm 15493245Smaybee case DKIOCGGEOM: 15503245Smaybee case DKIOCGVTOC: 15516423Sgw25295 /* 15526423Sgw25295 * commands using these (like prtvtoc) expect ENOTSUP 15536423Sgw25295 * since we're emulating an EFI label 15546423Sgw25295 */ 15553245Smaybee error = ENOTSUP; 15563245Smaybee break; 15573245Smaybee 15586423Sgw25295 case DKIOCDUMPINIT: 15596423Sgw25295 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, 15606423Sgw25295 RL_WRITER); 15616423Sgw25295 error = zvol_dumpify(zv); 15626423Sgw25295 zfs_range_unlock(rl); 15636423Sgw25295 break; 15646423Sgw25295 15656423Sgw25295 case DKIOCDUMPFINI: 15666423Sgw25295 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, 15676423Sgw25295 RL_WRITER); 15686423Sgw25295 error = zvol_dump_fini(zv); 15696423Sgw25295 zfs_range_unlock(rl); 15706423Sgw25295 break; 15716423Sgw25295 1572789Sahrens default: 15733016Smaybee error = ENOTTY; 1574789Sahrens break; 1575789Sahrens 1576789Sahrens } 1577789Sahrens mutex_exit(&zvol_state_lock); 1578789Sahrens return (error); 1579789Sahrens } 1580789Sahrens 1581789Sahrens int 1582789Sahrens zvol_busy(void) 1583789Sahrens { 1584789Sahrens return (zvol_minors != 0); 1585789Sahrens } 1586789Sahrens 1587789Sahrens void 1588789Sahrens zvol_init(void) 1589789Sahrens { 1590789Sahrens VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0); 1591789Sahrens mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); 1592789Sahrens } 1593789Sahrens 1594789Sahrens void 1595789Sahrens zvol_fini(void) 1596789Sahrens { 1597789Sahrens mutex_destroy(&zvol_state_lock); 1598789Sahrens ddi_soft_state_fini(&zvol_state); 1599789Sahrens } 16006423Sgw25295 16016423Sgw25295 static boolean_t 16026423Sgw25295 zvol_is_swap(zvol_state_t *zv) 16036423Sgw25295 { 16046423Sgw25295 vnode_t *vp; 16056423Sgw25295 boolean_t ret = B_FALSE; 16066423Sgw25295 char *devpath; 16076423Sgw25295 size_t devpathlen; 16086423Sgw25295 int error; 16096423Sgw25295 16106423Sgw25295 devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; 16116423Sgw25295 devpath = kmem_alloc(devpathlen, KM_SLEEP); 16126423Sgw25295 (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); 16136423Sgw25295 error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 16146423Sgw25295 kmem_free(devpath, devpathlen); 16156423Sgw25295 16166423Sgw25295 ret = !error && IS_SWAPVP(common_specvp(vp)); 16176423Sgw25295 16186423Sgw25295 if (vp != NULL) 16196423Sgw25295 VN_RELE(vp); 16206423Sgw25295 16216423Sgw25295 return (ret); 16226423Sgw25295 } 16236423Sgw25295 16246423Sgw25295 static int 16256423Sgw25295 zvol_dump_init(zvol_state_t *zv, boolean_t resize) 16266423Sgw25295 { 16276423Sgw25295 dmu_tx_t *tx; 16286423Sgw25295 int error = 0; 16296423Sgw25295 objset_t *os = zv->zv_objset; 16306423Sgw25295 nvlist_t *nv = NULL; 16316423Sgw25295 uint64_t checksum, compress, refresrv; 16326423Sgw25295 16336423Sgw25295 ASSERT(MUTEX_HELD(&zvol_state_lock)); 16346423Sgw25295 16356423Sgw25295 tx = dmu_tx_create(os); 16366423Sgw25295 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); 16376423Sgw25295 error = dmu_tx_assign(tx, TXG_WAIT); 16386423Sgw25295 if (error) { 16396423Sgw25295 dmu_tx_abort(tx); 16406423Sgw25295 return (error); 16416423Sgw25295 } 16426423Sgw25295 16436423Sgw25295 /* 16446423Sgw25295 * If we are resizing the dump device then we only need to 16456423Sgw25295 * update the refreservation to match the newly updated 16466423Sgw25295 * zvolsize. Otherwise, we save off the original state of the 16476423Sgw25295 * zvol so that we can restore them if the zvol is ever undumpified. 16486423Sgw25295 */ 16496423Sgw25295 if (resize) { 16506423Sgw25295 error = zap_update(os, ZVOL_ZAP_OBJ, 16516423Sgw25295 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, 16526423Sgw25295 &zv->zv_volsize, tx); 16536423Sgw25295 } else { 16546423Sgw25295 error = dsl_prop_get_integer(zv->zv_name, 16556423Sgw25295 zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); 16566423Sgw25295 error = error ? error : dsl_prop_get_integer(zv->zv_name, 16576423Sgw25295 zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); 16586423Sgw25295 error = error ? error : dsl_prop_get_integer(zv->zv_name, 16596423Sgw25295 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); 16606423Sgw25295 16616423Sgw25295 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, 16626423Sgw25295 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, 16636423Sgw25295 &compress, tx); 16646423Sgw25295 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, 16656423Sgw25295 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); 16666423Sgw25295 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, 16676423Sgw25295 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, 16686423Sgw25295 &refresrv, tx); 16696423Sgw25295 } 16706423Sgw25295 dmu_tx_commit(tx); 16716423Sgw25295 16726423Sgw25295 /* Truncate the file */ 16736423Sgw25295 if (!error) 1674*6992Smaybee error = dmu_free_long_range(zv->zv_objset, 1675*6992Smaybee ZVOL_OBJ, 0, DMU_OBJECT_END); 16766423Sgw25295 16776423Sgw25295 if (error) 16786423Sgw25295 return (error); 16796423Sgw25295 16806423Sgw25295 /* 16816423Sgw25295 * We only need update the zvol's property if we are initializing 16826423Sgw25295 * the dump area for the first time. 16836423Sgw25295 */ 16846423Sgw25295 if (!resize) { 16856423Sgw25295 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 16866423Sgw25295 VERIFY(nvlist_add_uint64(nv, 16876423Sgw25295 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); 16886423Sgw25295 VERIFY(nvlist_add_uint64(nv, 16896423Sgw25295 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 16906423Sgw25295 ZIO_COMPRESS_OFF) == 0); 16916423Sgw25295 VERIFY(nvlist_add_uint64(nv, 16926423Sgw25295 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 16936423Sgw25295 ZIO_CHECKSUM_OFF) == 0); 16946423Sgw25295 16956423Sgw25295 error = zfs_set_prop_nvlist(zv->zv_name, nv); 16966423Sgw25295 nvlist_free(nv); 16976423Sgw25295 16986423Sgw25295 if (error) 16996423Sgw25295 return (error); 17006423Sgw25295 } 17016423Sgw25295 17026423Sgw25295 /* Allocate the space for the dump */ 17036423Sgw25295 error = zvol_prealloc(zv); 17046423Sgw25295 return (error); 17056423Sgw25295 } 17066423Sgw25295 17076423Sgw25295 static int 17086423Sgw25295 zvol_dumpify(zvol_state_t *zv) 17096423Sgw25295 { 17106423Sgw25295 int error = 0; 17116423Sgw25295 uint64_t dumpsize = 0; 17126423Sgw25295 dmu_tx_t *tx; 17136423Sgw25295 objset_t *os = zv->zv_objset; 17146423Sgw25295 17156423Sgw25295 if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) 17166423Sgw25295 return (EROFS); 17176423Sgw25295 17186423Sgw25295 /* 17196423Sgw25295 * We do not support swap devices acting as dump devices. 17206423Sgw25295 */ 17216423Sgw25295 if (zvol_is_swap(zv)) 17226423Sgw25295 return (ENOTSUP); 17236423Sgw25295 17246423Sgw25295 if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 17256423Sgw25295 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { 17266423Sgw25295 boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; 17276423Sgw25295 17286423Sgw25295 if ((error = zvol_dump_init(zv, resize)) != 0) { 17296423Sgw25295 (void) zvol_dump_fini(zv); 17306423Sgw25295 return (error); 17316423Sgw25295 } 17326423Sgw25295 } 17336423Sgw25295 17346423Sgw25295 /* 17356423Sgw25295 * Build up our lba mapping. 17366423Sgw25295 */ 17376423Sgw25295 error = zvol_get_lbas(zv); 17386423Sgw25295 if (error) { 17396423Sgw25295 (void) zvol_dump_fini(zv); 17406423Sgw25295 return (error); 17416423Sgw25295 } 17426423Sgw25295 17436423Sgw25295 tx = dmu_tx_create(os); 17446423Sgw25295 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); 17456423Sgw25295 error = dmu_tx_assign(tx, TXG_WAIT); 17466423Sgw25295 if (error) { 17476423Sgw25295 dmu_tx_abort(tx); 17486423Sgw25295 (void) zvol_dump_fini(zv); 17496423Sgw25295 return (error); 17506423Sgw25295 } 17516423Sgw25295 17526423Sgw25295 zv->zv_flags |= ZVOL_DUMPIFIED; 17536423Sgw25295 error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, 17546423Sgw25295 &zv->zv_volsize, tx); 17556423Sgw25295 dmu_tx_commit(tx); 17566423Sgw25295 17576423Sgw25295 if (error) { 17586423Sgw25295 (void) zvol_dump_fini(zv); 17596423Sgw25295 return (error); 17606423Sgw25295 } 17616423Sgw25295 17626423Sgw25295 txg_wait_synced(dmu_objset_pool(os), 0); 17636423Sgw25295 return (0); 17646423Sgw25295 } 17656423Sgw25295 17666423Sgw25295 static int 17676423Sgw25295 zvol_dump_fini(zvol_state_t *zv) 17686423Sgw25295 { 17696423Sgw25295 dmu_tx_t *tx; 17706423Sgw25295 objset_t *os = zv->zv_objset; 17716423Sgw25295 nvlist_t *nv; 17726423Sgw25295 int error = 0; 17736423Sgw25295 uint64_t checksum, compress, refresrv; 17746423Sgw25295 17756423Sgw25295 tx = dmu_tx_create(os); 17766423Sgw25295 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); 17776423Sgw25295 error = dmu_tx_assign(tx, TXG_WAIT); 17786423Sgw25295 if (error) { 17796423Sgw25295 dmu_tx_abort(tx); 17806423Sgw25295 return (error); 17816423Sgw25295 } 17826423Sgw25295 17836423Sgw25295 /* 17846423Sgw25295 * Attempt to restore the zvol back to its pre-dumpified state. 17856423Sgw25295 * This is a best-effort attempt as it's possible that not all 17866423Sgw25295 * of these properties were initialized during the dumpify process 17876423Sgw25295 * (i.e. error during zvol_dump_init). 17886423Sgw25295 */ 17896423Sgw25295 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 17906423Sgw25295 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); 17916423Sgw25295 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 17926423Sgw25295 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); 17936423Sgw25295 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 17946423Sgw25295 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); 17956423Sgw25295 17966423Sgw25295 (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); 17976423Sgw25295 zvol_free_extents(zv); 1798*6992Smaybee (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); 17996423Sgw25295 zv->zv_flags &= ~ZVOL_DUMPIFIED; 18006423Sgw25295 dmu_tx_commit(tx); 18016423Sgw25295 18026423Sgw25295 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 18036423Sgw25295 (void) nvlist_add_uint64(nv, 18046423Sgw25295 zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); 18056423Sgw25295 (void) nvlist_add_uint64(nv, 18066423Sgw25295 zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); 18076423Sgw25295 (void) nvlist_add_uint64(nv, 18086423Sgw25295 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); 18096423Sgw25295 (void) zfs_set_prop_nvlist(zv->zv_name, nv); 18106423Sgw25295 nvlist_free(nv); 18116423Sgw25295 18126423Sgw25295 return (0); 18136423Sgw25295 } 1814