1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 232c48331dSMatt Macy * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy #include <sys/zfs_context.h> 27eda14cbcSMatt Macy #include <sys/spa.h> 28eda14cbcSMatt Macy #include <sys/file.h> 29eda14cbcSMatt Macy #include <sys/vdev_file.h> 30eda14cbcSMatt Macy #include <sys/vdev_impl.h> 31eda14cbcSMatt Macy #include <sys/zio.h> 32eda14cbcSMatt Macy #include <sys/fs/zfs.h> 33eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h> 34eda14cbcSMatt Macy #include <sys/abd.h> 35eda14cbcSMatt Macy #include <sys/stat.h> 36eda14cbcSMatt Macy 37eda14cbcSMatt Macy /* 38eda14cbcSMatt Macy * Virtual device vector for files. 39eda14cbcSMatt Macy */ 40eda14cbcSMatt Macy 41eda14cbcSMatt Macy static taskq_t *vdev_file_taskq; 42eda14cbcSMatt Macy 43dbd5678dSMartin Matuska static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; 44dbd5678dSMartin Matuska static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; 452c48331dSMatt Macy 46eda14cbcSMatt Macy void 47eda14cbcSMatt Macy vdev_file_init(void) 48eda14cbcSMatt Macy { 49eda14cbcSMatt Macy vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), 50eda14cbcSMatt Macy minclsyspri, max_ncpus, INT_MAX, 0); 51eda14cbcSMatt Macy } 52eda14cbcSMatt Macy 53eda14cbcSMatt Macy void 54eda14cbcSMatt Macy vdev_file_fini(void) 55eda14cbcSMatt Macy { 56eda14cbcSMatt Macy taskq_destroy(vdev_file_taskq); 57eda14cbcSMatt Macy } 58eda14cbcSMatt Macy 59eda14cbcSMatt Macy static void 60eda14cbcSMatt Macy vdev_file_hold(vdev_t *vd) 61eda14cbcSMatt Macy { 6216038816SMartin Matuska ASSERT3P(vd->vdev_path, !=, NULL); 63eda14cbcSMatt Macy } 64eda14cbcSMatt Macy 65eda14cbcSMatt Macy static void 66eda14cbcSMatt Macy vdev_file_rele(vdev_t *vd) 67eda14cbcSMatt Macy { 6816038816SMartin Matuska ASSERT3P(vd->vdev_path, !=, NULL); 69eda14cbcSMatt Macy } 70eda14cbcSMatt Macy 71eda14cbcSMatt Macy static mode_t 72eda14cbcSMatt Macy vdev_file_open_mode(spa_mode_t spa_mode) 73eda14cbcSMatt Macy { 74eda14cbcSMatt Macy mode_t mode = 0; 75eda14cbcSMatt Macy 76eda14cbcSMatt Macy if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { 77eda14cbcSMatt Macy mode = O_RDWR; 78eda14cbcSMatt Macy } else if (spa_mode & SPA_MODE_READ) { 79eda14cbcSMatt Macy mode = O_RDONLY; 80eda14cbcSMatt Macy } else if (spa_mode & SPA_MODE_WRITE) { 81eda14cbcSMatt Macy mode = O_WRONLY; 82eda14cbcSMatt Macy } 83eda14cbcSMatt Macy 84eda14cbcSMatt Macy return (mode | O_LARGEFILE); 85eda14cbcSMatt Macy } 86eda14cbcSMatt Macy 87eda14cbcSMatt Macy static int 88eda14cbcSMatt Macy vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 89eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 90eda14cbcSMatt Macy { 91eda14cbcSMatt Macy vdev_file_t *vf; 92eda14cbcSMatt Macy zfs_file_t *fp; 93eda14cbcSMatt Macy zfs_file_attr_t zfa; 94eda14cbcSMatt Macy int error; 95eda14cbcSMatt Macy 96eda14cbcSMatt Macy /* 97eda14cbcSMatt Macy * Rotational optimizations only make sense on block devices. 98eda14cbcSMatt Macy */ 99eda14cbcSMatt Macy vd->vdev_nonrot = B_TRUE; 100eda14cbcSMatt Macy 101eda14cbcSMatt Macy /* 102eda14cbcSMatt Macy * Allow TRIM on file based vdevs. This may not always be supported, 103eda14cbcSMatt Macy * since it depends on your kernel version and underlying filesystem 104eda14cbcSMatt Macy * type but it is always safe to attempt. 105eda14cbcSMatt Macy */ 106eda14cbcSMatt Macy vd->vdev_has_trim = B_TRUE; 107eda14cbcSMatt Macy 108eda14cbcSMatt Macy /* 109eda14cbcSMatt Macy * Disable secure TRIM on file based vdevs. There is no way to 110eda14cbcSMatt Macy * request this behavior from the underlying filesystem. 111eda14cbcSMatt Macy */ 112eda14cbcSMatt Macy vd->vdev_has_securetrim = B_FALSE; 113eda14cbcSMatt Macy 114eda14cbcSMatt Macy /* 115eda14cbcSMatt Macy * We must have a pathname, and it must be absolute. 116eda14cbcSMatt Macy */ 117eda14cbcSMatt Macy if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 118eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 119eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 120eda14cbcSMatt Macy } 121eda14cbcSMatt Macy 122eda14cbcSMatt Macy /* 123eda14cbcSMatt Macy * Reopen the device if it's not currently open. Otherwise, 124eda14cbcSMatt Macy * just update the physical size of the device. 125eda14cbcSMatt Macy */ 126eda14cbcSMatt Macy if (vd->vdev_tsd != NULL) { 127eda14cbcSMatt Macy ASSERT(vd->vdev_reopening); 128eda14cbcSMatt Macy vf = vd->vdev_tsd; 129eda14cbcSMatt Macy goto skip_open; 130eda14cbcSMatt Macy } 131eda14cbcSMatt Macy 132eda14cbcSMatt Macy vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 133eda14cbcSMatt Macy 134eda14cbcSMatt Macy /* 135eda14cbcSMatt Macy * We always open the files from the root of the global zone, even if 136eda14cbcSMatt Macy * we're in a local zone. If the user has gotten to this point, the 137eda14cbcSMatt Macy * administrator has already decided that the pool should be available 138eda14cbcSMatt Macy * to local zone users, so the underlying devices should be as well. 139eda14cbcSMatt Macy */ 14016038816SMartin Matuska ASSERT3P(vd->vdev_path, !=, NULL); 14116038816SMartin Matuska ASSERT(vd->vdev_path[0] == '/'); 142eda14cbcSMatt Macy 143eda14cbcSMatt Macy error = zfs_file_open(vd->vdev_path, 144eda14cbcSMatt Macy vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); 145eda14cbcSMatt Macy if (error) { 146eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 147eda14cbcSMatt Macy return (error); 148eda14cbcSMatt Macy } 149eda14cbcSMatt Macy 150eda14cbcSMatt Macy vf->vf_file = fp; 151eda14cbcSMatt Macy 152eda14cbcSMatt Macy #ifdef _KERNEL 153eda14cbcSMatt Macy /* 154eda14cbcSMatt Macy * Make sure it's a regular file. 155eda14cbcSMatt Macy */ 156eda14cbcSMatt Macy if (zfs_file_getattr(fp, &zfa)) { 157eda14cbcSMatt Macy return (SET_ERROR(ENODEV)); 158eda14cbcSMatt Macy } 159eda14cbcSMatt Macy if (!S_ISREG(zfa.zfa_mode)) { 160eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 161eda14cbcSMatt Macy return (SET_ERROR(ENODEV)); 162eda14cbcSMatt Macy } 163eda14cbcSMatt Macy #endif 164eda14cbcSMatt Macy 165eda14cbcSMatt Macy skip_open: 166eda14cbcSMatt Macy 167eda14cbcSMatt Macy error = zfs_file_getattr(vf->vf_file, &zfa); 168eda14cbcSMatt Macy if (error) { 169eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 170eda14cbcSMatt Macy return (error); 171eda14cbcSMatt Macy } 172eda14cbcSMatt Macy 173eda14cbcSMatt Macy *max_psize = *psize = zfa.zfa_size; 1742c48331dSMatt Macy *logical_ashift = vdev_file_logical_ashift; 1752c48331dSMatt Macy *physical_ashift = vdev_file_physical_ashift; 176eda14cbcSMatt Macy 177eda14cbcSMatt Macy return (0); 178eda14cbcSMatt Macy } 179eda14cbcSMatt Macy 180eda14cbcSMatt Macy static void 181eda14cbcSMatt Macy vdev_file_close(vdev_t *vd) 182eda14cbcSMatt Macy { 183eda14cbcSMatt Macy vdev_file_t *vf = vd->vdev_tsd; 184eda14cbcSMatt Macy 185eda14cbcSMatt Macy if (vd->vdev_reopening || vf == NULL) 186eda14cbcSMatt Macy return; 187eda14cbcSMatt Macy 188eda14cbcSMatt Macy if (vf->vf_file != NULL) { 189eda14cbcSMatt Macy zfs_file_close(vf->vf_file); 190eda14cbcSMatt Macy } 191eda14cbcSMatt Macy 192eda14cbcSMatt Macy vd->vdev_delayed_close = B_FALSE; 193eda14cbcSMatt Macy kmem_free(vf, sizeof (vdev_file_t)); 194eda14cbcSMatt Macy vd->vdev_tsd = NULL; 195eda14cbcSMatt Macy } 196eda14cbcSMatt Macy 197eda14cbcSMatt Macy /* 198eda14cbcSMatt Macy * Implements the interrupt side for file vdev types. This routine will be 199eda14cbcSMatt Macy * called when the I/O completes allowing us to transfer the I/O to the 200eda14cbcSMatt Macy * interrupt taskqs. For consistency, the code structure mimics disk vdev 201eda14cbcSMatt Macy * types. 202eda14cbcSMatt Macy */ 203eda14cbcSMatt Macy static void 204eda14cbcSMatt Macy vdev_file_io_intr(zio_t *zio) 205eda14cbcSMatt Macy { 206eda14cbcSMatt Macy zio_delay_interrupt(zio); 207eda14cbcSMatt Macy } 208eda14cbcSMatt Macy 209eda14cbcSMatt Macy static void 210eda14cbcSMatt Macy vdev_file_io_strategy(void *arg) 211eda14cbcSMatt Macy { 212eda14cbcSMatt Macy zio_t *zio = arg; 213eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 214eda14cbcSMatt Macy vdev_file_t *vf; 215eda14cbcSMatt Macy void *buf; 216eda14cbcSMatt Macy ssize_t resid; 217eda14cbcSMatt Macy loff_t off; 218eda14cbcSMatt Macy ssize_t size; 219eda14cbcSMatt Macy int err; 220eda14cbcSMatt Macy 221eda14cbcSMatt Macy off = zio->io_offset; 222eda14cbcSMatt Macy size = zio->io_size; 223eda14cbcSMatt Macy resid = 0; 224eda14cbcSMatt Macy 225eda14cbcSMatt Macy vf = vd->vdev_tsd; 226eda14cbcSMatt Macy 227eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 228eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_READ) { 229eda14cbcSMatt Macy buf = abd_borrow_buf(zio->io_abd, zio->io_size); 230eda14cbcSMatt Macy err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); 231eda14cbcSMatt Macy abd_return_buf_copy(zio->io_abd, buf, size); 232eda14cbcSMatt Macy } else { 233eda14cbcSMatt Macy buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); 234eda14cbcSMatt Macy err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); 235eda14cbcSMatt Macy abd_return_buf(zio->io_abd, buf, size); 236eda14cbcSMatt Macy } 237e92ffd9bSMartin Matuska zio->io_error = err; 238eda14cbcSMatt Macy if (resid != 0 && zio->io_error == 0) 239eda14cbcSMatt Macy zio->io_error = ENOSPC; 240eda14cbcSMatt Macy 241eda14cbcSMatt Macy vdev_file_io_intr(zio); 242eda14cbcSMatt Macy } 243eda14cbcSMatt Macy 244eda14cbcSMatt Macy static void 245eda14cbcSMatt Macy vdev_file_io_start(zio_t *zio) 246eda14cbcSMatt Macy { 247eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 248eda14cbcSMatt Macy vdev_file_t *vf = vd->vdev_tsd; 249eda14cbcSMatt Macy 2501719886fSMartin Matuska if (zio->io_type == ZIO_TYPE_FLUSH) { 251eda14cbcSMatt Macy /* XXPOLICY */ 252eda14cbcSMatt Macy if (!vdev_readable(vd)) { 253eda14cbcSMatt Macy zio->io_error = SET_ERROR(ENXIO); 254eda14cbcSMatt Macy zio_interrupt(zio); 255eda14cbcSMatt Macy return; 256eda14cbcSMatt Macy } 257eda14cbcSMatt Macy 2581719886fSMartin Matuska zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC); 259eda14cbcSMatt Macy 260eda14cbcSMatt Macy zio_execute(zio); 261eda14cbcSMatt Macy return; 262eda14cbcSMatt Macy } else if (zio->io_type == ZIO_TYPE_TRIM) { 263eda14cbcSMatt Macy ASSERT3U(zio->io_size, !=, 0); 264*7a7741afSMartin Matuska zio->io_error = zfs_file_deallocate(vf->vf_file, 265*7a7741afSMartin Matuska zio->io_offset, zio->io_size); 266eda14cbcSMatt Macy zio_execute(zio); 267eda14cbcSMatt Macy return; 268eda14cbcSMatt Macy } 269eda14cbcSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 270eda14cbcSMatt Macy zio->io_target_timestamp = zio_handle_io_delay(zio); 271eda14cbcSMatt Macy 272eda14cbcSMatt Macy VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, 273eda14cbcSMatt Macy TQ_SLEEP), !=, 0); 274eda14cbcSMatt Macy } 275eda14cbcSMatt Macy 276eda14cbcSMatt Macy static void 277eda14cbcSMatt Macy vdev_file_io_done(zio_t *zio) 278eda14cbcSMatt Macy { 279e92ffd9bSMartin Matuska (void) zio; 280eda14cbcSMatt Macy } 281eda14cbcSMatt Macy 282eda14cbcSMatt Macy vdev_ops_t vdev_file_ops = { 2837877fdebSMatt Macy .vdev_op_init = NULL, 2847877fdebSMatt Macy .vdev_op_fini = NULL, 2857877fdebSMatt Macy .vdev_op_open = vdev_file_open, 2867877fdebSMatt Macy .vdev_op_close = vdev_file_close, 2877877fdebSMatt Macy .vdev_op_asize = vdev_default_asize, 2887877fdebSMatt Macy .vdev_op_min_asize = vdev_default_min_asize, 2897877fdebSMatt Macy .vdev_op_min_alloc = NULL, 2907877fdebSMatt Macy .vdev_op_io_start = vdev_file_io_start, 2917877fdebSMatt Macy .vdev_op_io_done = vdev_file_io_done, 2927877fdebSMatt Macy .vdev_op_state_change = NULL, 2937877fdebSMatt Macy .vdev_op_need_resilver = NULL, 2947877fdebSMatt Macy .vdev_op_hold = vdev_file_hold, 2957877fdebSMatt Macy .vdev_op_rele = vdev_file_rele, 2967877fdebSMatt Macy .vdev_op_remap = NULL, 2977877fdebSMatt Macy .vdev_op_xlate = vdev_default_xlate, 2987877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 2997877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 3007877fdebSMatt Macy .vdev_op_config_generate = NULL, 3017877fdebSMatt Macy .vdev_op_nparity = NULL, 3027877fdebSMatt Macy .vdev_op_ndisks = NULL, 3037877fdebSMatt Macy .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ 3047877fdebSMatt Macy .vdev_op_leaf = B_TRUE /* leaf vdev */ 305eda14cbcSMatt Macy }; 306eda14cbcSMatt Macy 307eda14cbcSMatt Macy /* 308eda14cbcSMatt Macy * From userland we access disks just like files. 309eda14cbcSMatt Macy */ 310eda14cbcSMatt Macy #ifndef _KERNEL 311eda14cbcSMatt Macy 312eda14cbcSMatt Macy vdev_ops_t vdev_disk_ops = { 3137877fdebSMatt Macy .vdev_op_init = NULL, 3147877fdebSMatt Macy .vdev_op_fini = NULL, 3157877fdebSMatt Macy .vdev_op_open = vdev_file_open, 3167877fdebSMatt Macy .vdev_op_close = vdev_file_close, 3177877fdebSMatt Macy .vdev_op_asize = vdev_default_asize, 3187877fdebSMatt Macy .vdev_op_min_asize = vdev_default_min_asize, 3197877fdebSMatt Macy .vdev_op_min_alloc = NULL, 3207877fdebSMatt Macy .vdev_op_io_start = vdev_file_io_start, 3217877fdebSMatt Macy .vdev_op_io_done = vdev_file_io_done, 3227877fdebSMatt Macy .vdev_op_state_change = NULL, 3237877fdebSMatt Macy .vdev_op_need_resilver = NULL, 3247877fdebSMatt Macy .vdev_op_hold = vdev_file_hold, 3257877fdebSMatt Macy .vdev_op_rele = vdev_file_rele, 3267877fdebSMatt Macy .vdev_op_remap = NULL, 3277877fdebSMatt Macy .vdev_op_xlate = vdev_default_xlate, 3287877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 3297877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 3307877fdebSMatt Macy .vdev_op_config_generate = NULL, 3317877fdebSMatt Macy .vdev_op_nparity = NULL, 3327877fdebSMatt Macy .vdev_op_ndisks = NULL, 3337877fdebSMatt Macy .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 3347877fdebSMatt Macy .vdev_op_leaf = B_TRUE /* leaf vdev */ 335eda14cbcSMatt Macy }; 336eda14cbcSMatt Macy 337eda14cbcSMatt Macy #endif 3382c48331dSMatt Macy 339dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, 3402c48331dSMatt Macy "Logical ashift for file-based devices"); 341dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, 3422c48331dSMatt Macy "Physical ashift for file-based devices"); 343