1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 232c48331dSMatt Macy * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy #include <sys/zfs_context.h> 27eda14cbcSMatt Macy #include <sys/spa.h> 28eda14cbcSMatt Macy #include <sys/spa_impl.h> 29eda14cbcSMatt Macy #include <sys/vdev_file.h> 30eda14cbcSMatt Macy #include <sys/vdev_impl.h> 31eda14cbcSMatt Macy #include <sys/vdev_trim.h> 32eda14cbcSMatt Macy #include <sys/zio.h> 33eda14cbcSMatt Macy #include <sys/fs/zfs.h> 34eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h> 35eda14cbcSMatt Macy #include <sys/abd.h> 36eda14cbcSMatt Macy #include <sys/vnode.h> 37eda14cbcSMatt Macy #include <sys/zfs_file.h> 38eda14cbcSMatt Macy #ifdef _KERNEL 39eda14cbcSMatt Macy #include <linux/falloc.h> 40*5c65a0a9SMartin Matuska #include <sys/fcntl.h> 41*5c65a0a9SMartin Matuska #else 42*5c65a0a9SMartin Matuska #include <fcntl.h> 43eda14cbcSMatt Macy #endif 44eda14cbcSMatt Macy /* 45eda14cbcSMatt Macy * Virtual device vector for files. 46eda14cbcSMatt Macy */ 47eda14cbcSMatt Macy 48eda14cbcSMatt Macy static taskq_t *vdev_file_taskq; 49eda14cbcSMatt Macy 502c48331dSMatt Macy /* 512c48331dSMatt Macy * By default, the logical/physical ashift for file vdevs is set to 522c48331dSMatt Macy * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9) 532c48331dSMatt Macy * blocksizes. Users may opt to change one or both of these for testing 542c48331dSMatt Macy * or performance reasons. Care should be taken as these values will 552c48331dSMatt Macy * impact the vdev_ashift setting which can only be set at vdev creation 562c48331dSMatt Macy * time. 572c48331dSMatt Macy */ 58dbd5678dSMartin Matuska static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; 59dbd5678dSMartin Matuska static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; 602c48331dSMatt Macy 61eda14cbcSMatt Macy static void 62eda14cbcSMatt Macy vdev_file_hold(vdev_t *vd) 63eda14cbcSMatt Macy { 64eda14cbcSMatt Macy ASSERT(vd->vdev_path != NULL); 65eda14cbcSMatt Macy } 66eda14cbcSMatt Macy 67eda14cbcSMatt Macy static void 68eda14cbcSMatt Macy vdev_file_rele(vdev_t *vd) 69eda14cbcSMatt Macy { 70eda14cbcSMatt Macy ASSERT(vd->vdev_path != NULL); 71eda14cbcSMatt Macy } 72eda14cbcSMatt Macy 73eda14cbcSMatt Macy static mode_t 74eda14cbcSMatt Macy vdev_file_open_mode(spa_mode_t spa_mode) 75eda14cbcSMatt Macy { 76eda14cbcSMatt Macy mode_t mode = 0; 77eda14cbcSMatt Macy 78eda14cbcSMatt Macy if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { 79eda14cbcSMatt Macy mode = O_RDWR; 80eda14cbcSMatt Macy } else if (spa_mode & SPA_MODE_READ) { 81eda14cbcSMatt Macy mode = O_RDONLY; 82eda14cbcSMatt Macy } else if (spa_mode & SPA_MODE_WRITE) { 83eda14cbcSMatt Macy mode = O_WRONLY; 84eda14cbcSMatt Macy } 85eda14cbcSMatt Macy 86eda14cbcSMatt Macy return (mode | O_LARGEFILE); 87eda14cbcSMatt Macy } 88eda14cbcSMatt Macy 89eda14cbcSMatt Macy static int 90eda14cbcSMatt Macy vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 91eda14cbcSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift) 92eda14cbcSMatt Macy { 93eda14cbcSMatt Macy vdev_file_t *vf; 94eda14cbcSMatt Macy zfs_file_t *fp; 95eda14cbcSMatt Macy zfs_file_attr_t zfa; 96eda14cbcSMatt Macy int error; 97eda14cbcSMatt Macy 98eda14cbcSMatt Macy /* 99eda14cbcSMatt Macy * Rotational optimizations only make sense on block devices. 100eda14cbcSMatt Macy */ 101eda14cbcSMatt Macy vd->vdev_nonrot = B_TRUE; 102eda14cbcSMatt Macy 103eda14cbcSMatt Macy /* 104eda14cbcSMatt Macy * Allow TRIM on file based vdevs. This may not always be supported, 105eda14cbcSMatt Macy * since it depends on your kernel version and underlying filesystem 106eda14cbcSMatt Macy * type but it is always safe to attempt. 107eda14cbcSMatt Macy */ 108eda14cbcSMatt Macy vd->vdev_has_trim = B_TRUE; 109eda14cbcSMatt Macy 110eda14cbcSMatt Macy /* 111eda14cbcSMatt Macy * Disable secure TRIM on file based vdevs. There is no way to 112eda14cbcSMatt Macy * request this behavior from the underlying filesystem. 113eda14cbcSMatt Macy */ 114eda14cbcSMatt Macy vd->vdev_has_securetrim = B_FALSE; 115eda14cbcSMatt Macy 116eda14cbcSMatt Macy /* 117eda14cbcSMatt Macy * We must have a pathname, and it must be absolute. 118eda14cbcSMatt Macy */ 119eda14cbcSMatt Macy if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 120eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 121eda14cbcSMatt Macy return (SET_ERROR(EINVAL)); 122eda14cbcSMatt Macy } 123eda14cbcSMatt Macy 124eda14cbcSMatt Macy /* 125eda14cbcSMatt Macy * Reopen the device if it's not currently open. Otherwise, 126eda14cbcSMatt Macy * just update the physical size of the device. 127eda14cbcSMatt Macy */ 128eda14cbcSMatt Macy if (vd->vdev_tsd != NULL) { 129eda14cbcSMatt Macy ASSERT(vd->vdev_reopening); 130eda14cbcSMatt Macy vf = vd->vdev_tsd; 131eda14cbcSMatt Macy goto skip_open; 132eda14cbcSMatt Macy } 133eda14cbcSMatt Macy 134eda14cbcSMatt Macy vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 135eda14cbcSMatt Macy 136eda14cbcSMatt Macy /* 137eda14cbcSMatt Macy * We always open the files from the root of the global zone, even if 138eda14cbcSMatt Macy * we're in a local zone. If the user has gotten to this point, the 139eda14cbcSMatt Macy * administrator has already decided that the pool should be available 140eda14cbcSMatt Macy * to local zone users, so the underlying devices should be as well. 141eda14cbcSMatt Macy */ 142eda14cbcSMatt Macy ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 143eda14cbcSMatt Macy 144eda14cbcSMatt Macy error = zfs_file_open(vd->vdev_path, 145eda14cbcSMatt Macy vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); 146eda14cbcSMatt Macy if (error) { 147eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 148eda14cbcSMatt Macy return (error); 149eda14cbcSMatt Macy } 150eda14cbcSMatt Macy 151eda14cbcSMatt Macy vf->vf_file = fp; 152eda14cbcSMatt Macy 153eda14cbcSMatt Macy #ifdef _KERNEL 154eda14cbcSMatt Macy /* 155eda14cbcSMatt Macy * Make sure it's a regular file. 156eda14cbcSMatt Macy */ 157eda14cbcSMatt Macy if (zfs_file_getattr(fp, &zfa)) { 158eda14cbcSMatt Macy return (SET_ERROR(ENODEV)); 159eda14cbcSMatt Macy } 160eda14cbcSMatt Macy if (!S_ISREG(zfa.zfa_mode)) { 161eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 162eda14cbcSMatt Macy return (SET_ERROR(ENODEV)); 163eda14cbcSMatt Macy } 164eda14cbcSMatt Macy #endif 165eda14cbcSMatt Macy 166eda14cbcSMatt Macy skip_open: 167eda14cbcSMatt Macy 168eda14cbcSMatt Macy error = zfs_file_getattr(vf->vf_file, &zfa); 169eda14cbcSMatt Macy if (error) { 170eda14cbcSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 171eda14cbcSMatt Macy return (error); 172eda14cbcSMatt Macy } 173eda14cbcSMatt Macy 174eda14cbcSMatt Macy *max_psize = *psize = zfa.zfa_size; 1752c48331dSMatt Macy *logical_ashift = vdev_file_logical_ashift; 1762c48331dSMatt Macy *physical_ashift = vdev_file_physical_ashift; 177eda14cbcSMatt Macy 178eda14cbcSMatt Macy return (0); 179eda14cbcSMatt Macy } 180eda14cbcSMatt Macy 181eda14cbcSMatt Macy static void 182eda14cbcSMatt Macy vdev_file_close(vdev_t *vd) 183eda14cbcSMatt Macy { 184eda14cbcSMatt Macy vdev_file_t *vf = vd->vdev_tsd; 185eda14cbcSMatt Macy 186eda14cbcSMatt Macy if (vd->vdev_reopening || vf == NULL) 187eda14cbcSMatt Macy return; 188eda14cbcSMatt Macy 189eda14cbcSMatt Macy if (vf->vf_file != NULL) { 190eda14cbcSMatt Macy (void) zfs_file_close(vf->vf_file); 191eda14cbcSMatt Macy } 192eda14cbcSMatt Macy 193eda14cbcSMatt Macy vd->vdev_delayed_close = B_FALSE; 194eda14cbcSMatt Macy kmem_free(vf, sizeof (vdev_file_t)); 195eda14cbcSMatt Macy vd->vdev_tsd = NULL; 196eda14cbcSMatt Macy } 197eda14cbcSMatt Macy 198eda14cbcSMatt Macy static void 199eda14cbcSMatt Macy vdev_file_io_strategy(void *arg) 200eda14cbcSMatt Macy { 201eda14cbcSMatt Macy zio_t *zio = (zio_t *)arg; 202eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 203eda14cbcSMatt Macy vdev_file_t *vf = vd->vdev_tsd; 204eda14cbcSMatt Macy ssize_t resid; 205eda14cbcSMatt Macy void *buf; 206eda14cbcSMatt Macy loff_t off; 207eda14cbcSMatt Macy ssize_t size; 208eda14cbcSMatt Macy int err; 209eda14cbcSMatt Macy 210eda14cbcSMatt Macy off = zio->io_offset; 211eda14cbcSMatt Macy size = zio->io_size; 212eda14cbcSMatt Macy resid = 0; 213eda14cbcSMatt Macy 214eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_READ) { 215eda14cbcSMatt Macy buf = abd_borrow_buf(zio->io_abd, zio->io_size); 216eda14cbcSMatt Macy err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); 217eda14cbcSMatt Macy abd_return_buf_copy(zio->io_abd, buf, size); 218eda14cbcSMatt Macy } else { 219eda14cbcSMatt Macy buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); 220eda14cbcSMatt Macy err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); 221eda14cbcSMatt Macy abd_return_buf(zio->io_abd, buf, size); 222eda14cbcSMatt Macy } 223eda14cbcSMatt Macy zio->io_error = err; 224eda14cbcSMatt Macy if (resid != 0 && zio->io_error == 0) 225eda14cbcSMatt Macy zio->io_error = SET_ERROR(ENOSPC); 226eda14cbcSMatt Macy 227eda14cbcSMatt Macy zio_delay_interrupt(zio); 228eda14cbcSMatt Macy } 229eda14cbcSMatt Macy 230eda14cbcSMatt Macy static void 231eda14cbcSMatt Macy vdev_file_io_fsync(void *arg) 232eda14cbcSMatt Macy { 233eda14cbcSMatt Macy zio_t *zio = (zio_t *)arg; 234eda14cbcSMatt Macy vdev_file_t *vf = zio->io_vd->vdev_tsd; 235eda14cbcSMatt Macy 236eda14cbcSMatt Macy zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); 237eda14cbcSMatt Macy 238eda14cbcSMatt Macy zio_interrupt(zio); 239eda14cbcSMatt Macy } 240eda14cbcSMatt Macy 241eda14cbcSMatt Macy static void 242eda14cbcSMatt Macy vdev_file_io_start(zio_t *zio) 243eda14cbcSMatt Macy { 244eda14cbcSMatt Macy vdev_t *vd = zio->io_vd; 245eda14cbcSMatt Macy vdev_file_t *vf = vd->vdev_tsd; 246eda14cbcSMatt Macy 2471719886fSMartin Matuska if (zio->io_type == ZIO_TYPE_FLUSH) { 248eda14cbcSMatt Macy /* XXPOLICY */ 249eda14cbcSMatt Macy if (!vdev_readable(vd)) { 250eda14cbcSMatt Macy zio->io_error = SET_ERROR(ENXIO); 251eda14cbcSMatt Macy zio_interrupt(zio); 252eda14cbcSMatt Macy return; 253eda14cbcSMatt Macy } 254eda14cbcSMatt Macy 2551719886fSMartin Matuska if (zfs_nocacheflush) { 2561719886fSMartin Matuska zio_execute(zio); 2571719886fSMartin Matuska return; 2581719886fSMartin Matuska } 259eda14cbcSMatt Macy 260eda14cbcSMatt Macy /* 261eda14cbcSMatt Macy * We cannot safely call vfs_fsync() when PF_FSTRANS 262eda14cbcSMatt Macy * is set in the current context. Filesystems like 263eda14cbcSMatt Macy * XFS include sanity checks to verify it is not 264eda14cbcSMatt Macy * already set, see xfs_vm_writepage(). Therefore 265eda14cbcSMatt Macy * the sync must be dispatched to a different context. 266eda14cbcSMatt Macy */ 267eda14cbcSMatt Macy if (__spl_pf_fstrans_check()) { 268eda14cbcSMatt Macy VERIFY3U(taskq_dispatch(vdev_file_taskq, 269eda14cbcSMatt Macy vdev_file_io_fsync, zio, TQ_SLEEP), !=, 270eda14cbcSMatt Macy TASKQID_INVALID); 271eda14cbcSMatt Macy return; 272eda14cbcSMatt Macy } 273eda14cbcSMatt Macy 2741719886fSMartin Matuska zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); 275eda14cbcSMatt Macy 276eda14cbcSMatt Macy zio_execute(zio); 277eda14cbcSMatt Macy return; 278eda14cbcSMatt Macy } else if (zio->io_type == ZIO_TYPE_TRIM) { 279eda14cbcSMatt Macy ASSERT3U(zio->io_size, !=, 0); 2807a7741afSMartin Matuska zio->io_error = zfs_file_deallocate(vf->vf_file, 2817a7741afSMartin Matuska zio->io_offset, zio->io_size); 282eda14cbcSMatt Macy zio_execute(zio); 283eda14cbcSMatt Macy return; 284eda14cbcSMatt Macy } 285eda14cbcSMatt Macy 286eda14cbcSMatt Macy zio->io_target_timestamp = zio_handle_io_delay(zio); 287eda14cbcSMatt Macy 288eda14cbcSMatt Macy VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, 289eda14cbcSMatt Macy TQ_SLEEP), !=, TASKQID_INVALID); 290eda14cbcSMatt Macy } 291eda14cbcSMatt Macy 292eda14cbcSMatt Macy static void 293eda14cbcSMatt Macy vdev_file_io_done(zio_t *zio) 294eda14cbcSMatt Macy { 295e92ffd9bSMartin Matuska (void) zio; 296eda14cbcSMatt Macy } 297eda14cbcSMatt Macy 298eda14cbcSMatt Macy vdev_ops_t vdev_file_ops = { 2997877fdebSMatt Macy .vdev_op_init = NULL, 3007877fdebSMatt Macy .vdev_op_fini = NULL, 301eda14cbcSMatt Macy .vdev_op_open = vdev_file_open, 302eda14cbcSMatt Macy .vdev_op_close = vdev_file_close, 303eda14cbcSMatt Macy .vdev_op_asize = vdev_default_asize, 3047877fdebSMatt Macy .vdev_op_min_asize = vdev_default_min_asize, 3057877fdebSMatt Macy .vdev_op_min_alloc = NULL, 306eda14cbcSMatt Macy .vdev_op_io_start = vdev_file_io_start, 307eda14cbcSMatt Macy .vdev_op_io_done = vdev_file_io_done, 308eda14cbcSMatt Macy .vdev_op_state_change = NULL, 309eda14cbcSMatt Macy .vdev_op_need_resilver = NULL, 310eda14cbcSMatt Macy .vdev_op_hold = vdev_file_hold, 311eda14cbcSMatt Macy .vdev_op_rele = vdev_file_rele, 312eda14cbcSMatt Macy .vdev_op_remap = NULL, 313eda14cbcSMatt Macy .vdev_op_xlate = vdev_default_xlate, 3147877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 3157877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 3167877fdebSMatt Macy .vdev_op_config_generate = NULL, 3177877fdebSMatt Macy .vdev_op_nparity = NULL, 3187877fdebSMatt Macy .vdev_op_ndisks = NULL, 319eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ 320eda14cbcSMatt Macy .vdev_op_leaf = B_TRUE /* leaf vdev */ 321eda14cbcSMatt Macy }; 322eda14cbcSMatt Macy 323eda14cbcSMatt Macy void 324eda14cbcSMatt Macy vdev_file_init(void) 325eda14cbcSMatt Macy { 326eda14cbcSMatt Macy vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), 327eda14cbcSMatt Macy minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); 328eda14cbcSMatt Macy 329eda14cbcSMatt Macy VERIFY(vdev_file_taskq); 330eda14cbcSMatt Macy } 331eda14cbcSMatt Macy 332eda14cbcSMatt Macy void 333eda14cbcSMatt Macy vdev_file_fini(void) 334eda14cbcSMatt Macy { 335eda14cbcSMatt Macy taskq_destroy(vdev_file_taskq); 336eda14cbcSMatt Macy } 337eda14cbcSMatt Macy 338eda14cbcSMatt Macy /* 339eda14cbcSMatt Macy * From userland we access disks just like files. 340eda14cbcSMatt Macy */ 341eda14cbcSMatt Macy #ifndef _KERNEL 342eda14cbcSMatt Macy 343eda14cbcSMatt Macy vdev_ops_t vdev_disk_ops = { 3447877fdebSMatt Macy .vdev_op_init = NULL, 3457877fdebSMatt Macy .vdev_op_fini = NULL, 346eda14cbcSMatt Macy .vdev_op_open = vdev_file_open, 347eda14cbcSMatt Macy .vdev_op_close = vdev_file_close, 348eda14cbcSMatt Macy .vdev_op_asize = vdev_default_asize, 3497877fdebSMatt Macy .vdev_op_min_asize = vdev_default_min_asize, 3507877fdebSMatt Macy .vdev_op_min_alloc = NULL, 351eda14cbcSMatt Macy .vdev_op_io_start = vdev_file_io_start, 352eda14cbcSMatt Macy .vdev_op_io_done = vdev_file_io_done, 353eda14cbcSMatt Macy .vdev_op_state_change = NULL, 354eda14cbcSMatt Macy .vdev_op_need_resilver = NULL, 355eda14cbcSMatt Macy .vdev_op_hold = vdev_file_hold, 356eda14cbcSMatt Macy .vdev_op_rele = vdev_file_rele, 357eda14cbcSMatt Macy .vdev_op_remap = NULL, 358eda14cbcSMatt Macy .vdev_op_xlate = vdev_default_xlate, 3597877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 3607877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 3617877fdebSMatt Macy .vdev_op_config_generate = NULL, 3627877fdebSMatt Macy .vdev_op_nparity = NULL, 3637877fdebSMatt Macy .vdev_op_ndisks = NULL, 364eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 365eda14cbcSMatt Macy .vdev_op_leaf = B_TRUE /* leaf vdev */ 366eda14cbcSMatt Macy }; 367eda14cbcSMatt Macy 368eda14cbcSMatt Macy #endif 3692c48331dSMatt Macy 370dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, 3712c48331dSMatt Macy "Logical ashift for file-based devices"); 372dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, 3732c48331dSMatt Macy "Physical ashift for file-based devices"); 374