17877fdebSMatt Macy /* 27877fdebSMatt Macy * CDDL HEADER START 37877fdebSMatt Macy * 47877fdebSMatt Macy * The contents of this file are subject to the terms of the 57877fdebSMatt Macy * Common Development and Distribution License (the "License"). 67877fdebSMatt Macy * You may not use this file except in compliance with the License. 77877fdebSMatt Macy * 87877fdebSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 107877fdebSMatt Macy * See the License for the specific language governing permissions 117877fdebSMatt Macy * and limitations under the License. 127877fdebSMatt Macy * 137877fdebSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 147877fdebSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157877fdebSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 167877fdebSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 177877fdebSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 187877fdebSMatt Macy * 197877fdebSMatt Macy * CDDL HEADER END 207877fdebSMatt Macy */ 217877fdebSMatt Macy 227877fdebSMatt Macy /* 237877fdebSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 247877fdebSMatt Macy * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 257877fdebSMatt Macy * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 267877fdebSMatt Macy * Copyright 2017 Nexenta Systems, Inc. 272a58b312SMartin Matuska * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek 287877fdebSMatt Macy */ 297877fdebSMatt Macy 307877fdebSMatt Macy /* Portions Copyright 2007 Jeremy Teo */ 317877fdebSMatt Macy /* Portions Copyright 2010 Robert Milkowski */ 327877fdebSMatt Macy 337877fdebSMatt Macy #include <sys/types.h> 347877fdebSMatt Macy #include <sys/param.h> 357877fdebSMatt Macy #include <sys/time.h> 367877fdebSMatt Macy #include <sys/sysmacros.h> 377877fdebSMatt Macy #include <sys/vfs.h> 387877fdebSMatt Macy #include <sys/file.h> 397877fdebSMatt Macy #include <sys/stat.h> 407877fdebSMatt Macy #include <sys/kmem.h> 417877fdebSMatt Macy #include <sys/cmn_err.h> 427877fdebSMatt Macy #include <sys/errno.h> 437877fdebSMatt Macy #include <sys/zfs_dir.h> 447877fdebSMatt Macy #include <sys/zfs_acl.h> 457877fdebSMatt Macy #include <sys/zfs_ioctl.h> 467877fdebSMatt Macy #include <sys/fs/zfs.h> 477877fdebSMatt Macy #include <sys/dmu.h> 487877fdebSMatt Macy #include <sys/dmu_objset.h> 493494f7c0SMartin Matuska #include <sys/dsl_crypt.h> 507877fdebSMatt Macy #include <sys/spa.h> 517877fdebSMatt Macy #include <sys/txg.h> 527877fdebSMatt Macy #include <sys/dbuf.h> 537877fdebSMatt Macy #include <sys/policy.h> 542a58b312SMartin Matuska #include <sys/zfeature.h> 557877fdebSMatt Macy #include <sys/zfs_vnops.h> 567877fdebSMatt Macy #include <sys/zfs_quota.h> 577877fdebSMatt Macy #include <sys/zfs_vfsops.h> 587877fdebSMatt Macy #include <sys/zfs_znode.h> 597877fdebSMatt Macy 60a4e5e010SMartin Matuska /* 615c65a0a9SMartin Matuska * Enables access to the block cloning feature. If this setting is 0, then even 625c65a0a9SMartin Matuska * if feature@block_cloning is enabled, using functions and system calls that 635c65a0a9SMartin Matuska * attempt to clone blocks will act as though the feature is disabled. 
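 *
 * Illustrative userland sketch (not part of this file): cloning is
 * typically reached through copy_file_range(2), which transparently
 * falls back to an ordinary copy when cloning is disabled:
 *
 *	ssize_t n = copy_file_range(src_fd, NULL, dst_fd, NULL, len, 0);
 *	// With zfs_bclone_enabled == 0 the call still succeeds, but the
 *	// data is copied rather than cloned.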
64a4e5e010SMartin Matuska  */
65a4e5e010SMartin Matuska int zfs_bclone_enabled = 1;
66a4e5e010SMartin Matuska 
67a4e5e010SMartin Matuska /*
68a4e5e010SMartin Matuska  * When set, zfs_clone_range() waits for dirty data to be written to disk.
69a4e5e010SMartin Matuska  * This allows the clone operation to reliably succeed when a file is modified
70a4e5e010SMartin Matuska  * and then immediately cloned. For small files this may be slower than making
71a4e5e010SMartin Matuska  * a copy of the file and is therefore not the default. However, in certain
72a4e5e010SMartin Matuska  * scenarios this behavior may be desirable, so a tunable is provided.
73a4e5e010SMartin Matuska  */
74*dd215568SMartin Matuska int zfs_bclone_wait_dirty = 0;
75a4e5e010SMartin Matuska 
76a4e5e010SMartin Matuska /*
777a7741afSMartin Matuska  * Enable Direct I/O. If this setting is 0, then all I/O requests will be
787a7741afSMartin Matuska  * directed through the ARC, as though the dataset property direct was
797a7741afSMartin Matuska  * set to disabled.
8087bf66d4SMartin Matuska  *
8187bf66d4SMartin Matuska  * Disabled by default on FreeBSD until a potential range locking issue in
8287bf66d4SMartin Matuska  * zfs_getpages() can be resolved.
837a7741afSMartin Matuska  */
8487bf66d4SMartin Matuska #ifdef __FreeBSD__
857a7741afSMartin Matuska static int zfs_dio_enabled = 0;
8687bf66d4SMartin Matuska #else
8787bf66d4SMartin Matuska static int zfs_dio_enabled = 1;
8887bf66d4SMartin Matuska #endif
897a7741afSMartin Matuska 
907a7741afSMartin Matuska 
917a7741afSMartin Matuska /*
92a4e5e010SMartin Matuska  * Maximum bytes to read per chunk in zfs_read().
93a4e5e010SMartin Matuska  */
94a4e5e010SMartin Matuska static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
957877fdebSMatt Macy 
967877fdebSMatt Macy int
977877fdebSMatt Macy zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
987877fdebSMatt Macy {
99c7046f76SMartin Matuska 	int error = 0;
1007877fdebSMatt Macy 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1017877fdebSMatt Macy 
1027877fdebSMatt Macy 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
103c7046f76SMartin Matuska 		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
104f8b1db88SMartin Matuska 			return (error);
105716fd348SMartin Matuska 		atomic_inc_32(&zp->z_sync_writes_cnt);
1067877fdebSMatt Macy 		zil_commit(zfsvfs->z_log, zp->z_id);
107716fd348SMartin Matuska 		atomic_dec_32(&zp->z_sync_writes_cnt);
108c7046f76SMartin Matuska 		zfs_exit(zfsvfs, FTAG);
1097877fdebSMatt Macy 	}
110c7046f76SMartin Matuska 	return (error);
1117877fdebSMatt Macy }
1127877fdebSMatt Macy 
1137877fdebSMatt Macy 
1147877fdebSMatt Macy #if defined(SEEK_HOLE) && defined(SEEK_DATA)
1157877fdebSMatt Macy /*
1167877fdebSMatt Macy  * Lseek support for finding holes (cmd == SEEK_HOLE) and
1177877fdebSMatt Macy  * data (cmd == SEEK_DATA). "off" is an in/out parameter.
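 *
 * Illustrative userland sketch (not part of this file): a sparse-file
 * scanner can alternate the two commands to enumerate data runs:
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);
 *	while (data >= 0) {
 *		off_t hole = lseek(fd, data, SEEK_HOLE);
 *		// [data, hole) is a populated region of the file
 *		data = lseek(fd, hole, SEEK_DATA);	// ENXIO past EOF
 *	}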
1187877fdebSMatt Macy */ 1197877fdebSMatt Macy static int 1207877fdebSMatt Macy zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) 1217877fdebSMatt Macy { 12281b22a98SMartin Matuska zfs_locked_range_t *lr; 1237877fdebSMatt Macy uint64_t noff = (uint64_t)*off; /* new offset */ 1247877fdebSMatt Macy uint64_t file_sz; 1257877fdebSMatt Macy int error; 1267877fdebSMatt Macy boolean_t hole; 1277877fdebSMatt Macy 1287877fdebSMatt Macy file_sz = zp->z_size; 1297877fdebSMatt Macy if (noff >= file_sz) { 1307877fdebSMatt Macy return (SET_ERROR(ENXIO)); 1317877fdebSMatt Macy } 1327877fdebSMatt Macy 1337877fdebSMatt Macy if (cmd == F_SEEK_HOLE) 1347877fdebSMatt Macy hole = B_TRUE; 1357877fdebSMatt Macy else 1367877fdebSMatt Macy hole = B_FALSE; 1377877fdebSMatt Macy 13881b22a98SMartin Matuska /* Flush any mmap()'d data to disk */ 139c9539b89SMartin Matuska if (zn_has_cached_data(zp, 0, file_sz - 1)) 140783d3ff6SMartin Matuska zn_flush_cached_data(zp, B_TRUE); 14181b22a98SMartin Matuska 1422a58b312SMartin Matuska lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); 1437877fdebSMatt Macy error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); 14481b22a98SMartin Matuska zfs_rangelock_exit(lr); 1457877fdebSMatt Macy 1467877fdebSMatt Macy if (error == ESRCH) 1477877fdebSMatt Macy return (SET_ERROR(ENXIO)); 1487877fdebSMatt Macy 14981b22a98SMartin Matuska /* File was dirty, so fall back to using generic logic */ 1507877fdebSMatt Macy if (error == EBUSY) { 1517877fdebSMatt Macy if (hole) 1527877fdebSMatt Macy *off = file_sz; 1537877fdebSMatt Macy 1547877fdebSMatt Macy return (0); 1557877fdebSMatt Macy } 1567877fdebSMatt Macy 1577877fdebSMatt Macy /* 1587877fdebSMatt Macy * We could find a hole that begins after the logical end-of-file, 1597877fdebSMatt Macy * because dmu_offset_next() only works on whole blocks. If the 1607877fdebSMatt Macy * EOF falls mid-block, then indicate that the "virtual hole" 1617877fdebSMatt Macy * at the end of the file begins at the logical EOF, rather than 1627877fdebSMatt Macy * at the end of the last block. 
1637877fdebSMatt Macy  */
1647877fdebSMatt Macy 	if (noff > file_sz) {
1657877fdebSMatt Macy 		ASSERT(hole);
1667877fdebSMatt Macy 		noff = file_sz;
1677877fdebSMatt Macy 	}
1687877fdebSMatt Macy 
1697877fdebSMatt Macy 	if (noff < *off)
1707877fdebSMatt Macy 		return (error);
1717877fdebSMatt Macy 	*off = noff;
1727877fdebSMatt Macy 	return (error);
1737877fdebSMatt Macy }
1747877fdebSMatt Macy 
1757877fdebSMatt Macy int
1767877fdebSMatt Macy zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
1777877fdebSMatt Macy {
1787877fdebSMatt Macy 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1797877fdebSMatt Macy 	int error;
1807877fdebSMatt Macy 
181c7046f76SMartin Matuska 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
182c7046f76SMartin Matuska 		return (error);
1837877fdebSMatt Macy 
1847877fdebSMatt Macy 	error = zfs_holey_common(zp, cmd, off);
1857877fdebSMatt Macy 
186c7046f76SMartin Matuska 	zfs_exit(zfsvfs, FTAG);
1877877fdebSMatt Macy 	return (error);
1887877fdebSMatt Macy }
1897877fdebSMatt Macy #endif /* SEEK_HOLE && SEEK_DATA */
1907877fdebSMatt Macy 
1917877fdebSMatt Macy int
1927877fdebSMatt Macy zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
1937877fdebSMatt Macy {
1947877fdebSMatt Macy 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1957877fdebSMatt Macy 	int error;
1967877fdebSMatt Macy 
197c7046f76SMartin Matuska 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
198c7046f76SMartin Matuska 		return (error);
1997877fdebSMatt Macy 
2007877fdebSMatt Macy 	if (flag & V_ACE_MASK)
201dbd5678dSMartin Matuska #if defined(__linux__)
202dbd5678dSMartin Matuska 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
203d411c1d6SMartin Matuska 		    zfs_init_idmap);
204dbd5678dSMartin Matuska #else
205dbd5678dSMartin Matuska 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
206dbd5678dSMartin Matuska 		    NULL);
207dbd5678dSMartin Matuska #endif
2087877fdebSMatt Macy 	else
209dbd5678dSMartin Matuska #if defined(__linux__)
210d411c1d6SMartin Matuska 		error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
211dbd5678dSMartin Matuska #else
212dbd5678dSMartin Matuska 		error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
213dbd5678dSMartin Matuska #endif
2147877fdebSMatt Macy 
215c7046f76SMartin Matuska 	zfs_exit(zfsvfs, FTAG);
2167877fdebSMatt Macy 	return (error);
2177877fdebSMatt Macy }
2187877fdebSMatt Macy 
2197877fdebSMatt Macy /*
2207a7741afSMartin Matuska  * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
2217a7741afSMartin Matuska  * the "direct" dataset property). When the request comes from the property,
2227a7741afSMartin Matuska  * only apply the O_DIRECT flag to correctly aligned I/O requests. The
2237a7741afSMartin Matuska  * rationale for this is that it allows the property to be safely set on a
2247a7741afSMartin Matuska  * dataset without forcing all of the applications to be aware of the
2257a7741afSMartin Matuska  * alignment restrictions. When O_DIRECT is explicitly requested by an
2267a7741afSMartin Matuska  * application, return EINVAL if the request is unaligned. In all cases, if
2277a7741afSMartin Matuska  * the range for this request has been mmap'ed then we will perform buffered
2287a7741afSMartin Matuska  * I/O to keep the mapped region synchronized with the ARC.
2297a7741afSMartin Matuska  *
2307a7741afSMartin Matuska  * It is possible that a file's pages could be mmap'ed after it is checked
2317a7741afSMartin Matuska  * here. If so, that is handled accordingly in zfs_write(). See comments in
2327a7741afSMartin Matuska  * the following area for how this is handled:
2337a7741afSMartin Matuska  * zfs_write() -> update_pages()
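 *
 * Illustrative userland sketch (not part of this file), assuming a 4K
 * PAGE_SIZE: an explicit O_DIRECT request must be aligned in offset,
 * length, and buffer address, or it fails with EINVAL:
 *
 *	int fd = open(path, O_RDWR | O_DIRECT);
 *	void *buf;
 *	posix_memalign(&buf, 4096, 4096);
 *	pwrite(fd, buf, 4096, 0);	// eligible for Direct I/O
 *	pwrite(fd, buf, 512, 100);	// unaligned: EINVAL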
2347a7741afSMartin Matuska  */
2357a7741afSMartin Matuska static int
2367a7741afSMartin Matuska zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
2377a7741afSMartin Matuska     int *ioflagp)
2387a7741afSMartin Matuska {
2397a7741afSMartin Matuska 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
2407a7741afSMartin Matuska 	objset_t *os = zfsvfs->z_os;
2417a7741afSMartin Matuska 	int ioflag = *ioflagp;
2427a7741afSMartin Matuska 	int error = 0;
2437a7741afSMartin Matuska 
2447a7741afSMartin Matuska 	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
2457a7741afSMartin Matuska 	    zn_has_cached_data(zp, zfs_uio_offset(uio),
2467a7741afSMartin Matuska 	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
2477a7741afSMartin Matuska 		/*
2487a7741afSMartin Matuska 		 * Direct I/O is disabled or the region is mmap'ed. In either
2497a7741afSMartin Matuska 		 * case the I/O request will just be directed through the ARC.
2507a7741afSMartin Matuska 		 */
2517a7741afSMartin Matuska 		ioflag &= ~O_DIRECT;
2527a7741afSMartin Matuska 		goto out;
2537a7741afSMartin Matuska 	} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
2547a7741afSMartin Matuska 	    zfs_uio_page_aligned(uio) &&
2557a7741afSMartin Matuska 	    zfs_uio_aligned(uio, PAGE_SIZE)) {
2567a7741afSMartin Matuska 		if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
2577a7741afSMartin Matuska 		    (rw == UIO_READ)) {
2587a7741afSMartin Matuska 			ioflag |= O_DIRECT;
2597a7741afSMartin Matuska 		}
2607a7741afSMartin Matuska 	} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
2617a7741afSMartin Matuska 		/*
2627a7741afSMartin Matuska 		 * Direct I/O was requested through direct=always, but it
2637a7741afSMartin Matuska 		 * is not properly PAGE_SIZE aligned. The request will be
2647a7741afSMartin Matuska 		 * directed through the ARC.
2657a7741afSMartin Matuska 		 */
2667a7741afSMartin Matuska 		ioflag &= ~O_DIRECT;
2677a7741afSMartin Matuska 	}
2687a7741afSMartin Matuska 
2697a7741afSMartin Matuska 	if (ioflag & O_DIRECT) {
2707a7741afSMartin Matuska 		if (!zfs_uio_page_aligned(uio) ||
2717a7741afSMartin Matuska 		    !zfs_uio_aligned(uio, PAGE_SIZE)) {
2727a7741afSMartin Matuska 			error = SET_ERROR(EINVAL);
2737a7741afSMartin Matuska 			goto out;
2747a7741afSMartin Matuska 		}
2757a7741afSMartin Matuska 
2767a7741afSMartin Matuska 		error = zfs_uio_get_dio_pages_alloc(uio, rw);
2777a7741afSMartin Matuska 		if (error) {
2787a7741afSMartin Matuska 			goto out;
2797a7741afSMartin Matuska 		}
2807a7741afSMartin Matuska 	}
2817a7741afSMartin Matuska 
2827a7741afSMartin Matuska 	IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
2837a7741afSMartin Matuska 	ASSERT0(error);
2847a7741afSMartin Matuska 
2857a7741afSMartin Matuska out:
2867a7741afSMartin Matuska 	*ioflagp = ioflag;
2877a7741afSMartin Matuska 	return (error);
2887a7741afSMartin Matuska }
2897a7741afSMartin Matuska 
2907a7741afSMartin Matuska /*
2917877fdebSMatt Macy  * Read bytes from specified file into supplied buffer.
2927877fdebSMatt Macy  *
2937877fdebSMatt Macy  * IN:	zp	- inode of file to be read from.
2947877fdebSMatt Macy  *	uio	- structure supplying read location, range info,
2957877fdebSMatt Macy  *		  and return buffer.
2967877fdebSMatt Macy  *	ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
2977877fdebSMatt Macy  *		  O_DIRECT flag; used to bypass page cache.
2987877fdebSMatt Macy  *	cr	- credentials of caller.
2997877fdebSMatt Macy * 3007877fdebSMatt Macy * OUT: uio - updated offset and range, buffer filled. 3017877fdebSMatt Macy * 3027877fdebSMatt Macy * RETURN: 0 on success, error code on failure. 3037877fdebSMatt Macy * 3047877fdebSMatt Macy * Side Effects: 3057877fdebSMatt Macy * inode - atime updated if byte count > 0 3067877fdebSMatt Macy */ 3077877fdebSMatt Macy int 308184c1b94SMartin Matuska zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) 3097877fdebSMatt Macy { 310c03c5b1cSMartin Matuska (void) cr; 3117877fdebSMatt Macy int error = 0; 3127877fdebSMatt Macy boolean_t frsync = B_FALSE; 31387bf66d4SMartin Matuska boolean_t dio_checksum_failure = B_FALSE; 3147877fdebSMatt Macy 3157877fdebSMatt Macy zfsvfs_t *zfsvfs = ZTOZSB(zp); 316c7046f76SMartin Matuska if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 317c7046f76SMartin Matuska return (error); 3187877fdebSMatt Macy 3197877fdebSMatt Macy if (zp->z_pflags & ZFS_AV_QUARANTINED) { 320c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 3217877fdebSMatt Macy return (SET_ERROR(EACCES)); 3227877fdebSMatt Macy } 3237877fdebSMatt Macy 3247877fdebSMatt Macy /* We don't copy out anything useful for directories. */ 3257877fdebSMatt Macy if (Z_ISDIR(ZTOTYPE(zp))) { 326c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 3277877fdebSMatt Macy return (SET_ERROR(EISDIR)); 3287877fdebSMatt Macy } 3297877fdebSMatt Macy 3307877fdebSMatt Macy /* 3317877fdebSMatt Macy * Validate file offset 3327877fdebSMatt Macy */ 333184c1b94SMartin Matuska if (zfs_uio_offset(uio) < (offset_t)0) { 334c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 3357877fdebSMatt Macy return (SET_ERROR(EINVAL)); 3367877fdebSMatt Macy } 3377877fdebSMatt Macy 3387877fdebSMatt Macy /* 3397877fdebSMatt Macy * Fasttrack empty reads 3407877fdebSMatt Macy */ 341184c1b94SMartin Matuska if (zfs_uio_resid(uio) == 0) { 342c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 3437877fdebSMatt Macy return (0); 3447877fdebSMatt Macy } 3457877fdebSMatt Macy 3467877fdebSMatt Macy #ifdef FRSYNC 3477877fdebSMatt Macy /* 3487877fdebSMatt Macy * If we're in FRSYNC mode, sync out this znode before reading it. 3497877fdebSMatt Macy * Only do this for non-snapshots. 3507877fdebSMatt Macy * 3517877fdebSMatt Macy * Some platforms do not support FRSYNC and instead map it 3527877fdebSMatt Macy * to O_SYNC, which results in unnecessary calls to zil_commit. We 3537877fdebSMatt Macy * only honor FRSYNC requests on platforms which support it. 3547877fdebSMatt Macy */ 3557877fdebSMatt Macy frsync = !!(ioflag & FRSYNC); 3567877fdebSMatt Macy #endif 3577877fdebSMatt Macy if (zfsvfs->z_log && 3587877fdebSMatt Macy (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) 3597877fdebSMatt Macy zil_commit(zfsvfs->z_log, zp->z_id); 3607877fdebSMatt Macy 3617877fdebSMatt Macy /* 3627877fdebSMatt Macy * Lock the range against changes. 3637877fdebSMatt Macy */ 3647877fdebSMatt Macy zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 365184c1b94SMartin Matuska zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); 3667877fdebSMatt Macy 3677877fdebSMatt Macy /* 3687877fdebSMatt Macy * If we are reading past end-of-file we can skip 3697877fdebSMatt Macy * to the end; but we might still need to set atime. 
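 *
 * (Illustratively: a pread(2) at an offset at or past EOF returns 0
 * bytes, yet this path may still refresh the file's access time.)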
3707877fdebSMatt Macy 	 */
371184c1b94SMartin Matuska 	if (zfs_uio_offset(uio) >= zp->z_size) {
3727877fdebSMatt Macy 		error = 0;
3737877fdebSMatt Macy 		goto out;
3747877fdebSMatt Macy 	}
375184c1b94SMartin Matuska 	ASSERT(zfs_uio_offset(uio) < zp->z_size);
3767a7741afSMartin Matuska 
3777a7741afSMartin Matuska 	/*
3787a7741afSMartin Matuska 	 * Setting up Direct I/O if requested.
3797a7741afSMartin Matuska 	 */
3807a7741afSMartin Matuska 	error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
3817a7741afSMartin Matuska 	if (error) {
3827a7741afSMartin Matuska 		goto out;
3837a7741afSMartin Matuska 	}
3847a7741afSMartin Matuska 
38581b22a98SMartin Matuska #if defined(__linux__)
38681b22a98SMartin Matuska 	ssize_t start_offset = zfs_uio_offset(uio);
38781b22a98SMartin Matuska #endif
3887a7741afSMartin Matuska 	ssize_t chunk_size = zfs_vnops_read_chunk_size;
389184c1b94SMartin Matuska 	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
3907877fdebSMatt Macy 	ssize_t start_resid = n;
3917a7741afSMartin Matuska 	ssize_t dio_remaining_resid = 0;
3927a7741afSMartin Matuska 
3937a7741afSMartin Matuska 	if (uio->uio_extflg & UIO_DIRECT) {
3947a7741afSMartin Matuska 		/*
3957a7741afSMartin Matuska 		 * All pages for an O_DIRECT request have already been mapped
3967a7741afSMartin Matuska 		 * so there's no compelling reason to handle this uio in
3977a7741afSMartin Matuska 		 * smaller chunks.
3987a7741afSMartin Matuska 		 */
3997a7741afSMartin Matuska 		chunk_size = DMU_MAX_ACCESS;
4007a7741afSMartin Matuska 
4017a7741afSMartin Matuska 		/*
4027a7741afSMartin Matuska 		 * In the event that the O_DIRECT request is reading the entire
4037a7741afSMartin Matuska 		 * file, it is possible the file's length is not page-size
4047a7741afSMartin Matuska 		 * aligned. However, lower layers expect that the Direct I/O
4057a7741afSMartin Matuska 		 * request is page-aligned. In this case, as much of the file
4067a7741afSMartin Matuska 		 * as can be read using Direct I/O is read that way, and the
4077a7741afSMartin Matuska 		 * remaining amount will be read through the ARC.
4087a7741afSMartin Matuska 		 *
4097a7741afSMartin Matuska 		 * This is still consistent with the semantics of Direct I/O in
4107a7741afSMartin Matuska 		 * ZFS, as at a minimum the I/O request must be page-aligned.
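 *
 * A worked example of the split below (illustrative, assuming a 4K
 * PAGE_SIZE): for n == 10000 bytes,
 *
 *	P2ALIGN_TYPED(10000, 4096, ssize_t) == 8192
 *	dio_remaining_resid == 10000 - 8192 == 1808
 *
 * so 8192 bytes are read via Direct I/O and the trailing 1808 bytes
 * are read through the ARC.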
4117a7741afSMartin Matuska 		 */
4127a7741afSMartin Matuska 		dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
4137a7741afSMartin Matuska 		if (dio_remaining_resid != 0)
4147a7741afSMartin Matuska 			n -= dio_remaining_resid;
4157a7741afSMartin Matuska 	}
4167877fdebSMatt Macy 
4177877fdebSMatt Macy 	while (n > 0) {
4187a7741afSMartin Matuska 		ssize_t nbytes = MIN(n, chunk_size -
4197a7741afSMartin Matuska 		    P2PHASE(zfs_uio_offset(uio), chunk_size));
4207877fdebSMatt Macy #ifdef UIO_NOCOPY
421184c1b94SMartin Matuska 		if (zfs_uio_segflg(uio) == UIO_NOCOPY)
4227877fdebSMatt Macy 			error = mappedread_sf(zp, nbytes, uio);
4237877fdebSMatt Macy 		else
4247877fdebSMatt Macy #endif
425c9539b89SMartin Matuska 		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
4267a7741afSMartin Matuska 		    zfs_uio_offset(uio) + nbytes - 1)) {
4277877fdebSMatt Macy 			error = mappedread(zp, nbytes, uio);
4287877fdebSMatt Macy 		} else {
4297877fdebSMatt Macy 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
4307877fdebSMatt Macy 			    uio, nbytes);
4317877fdebSMatt Macy 		}
4327877fdebSMatt Macy 
4337877fdebSMatt Macy 		if (error) {
4347877fdebSMatt Macy 			/* convert checksum errors into IO errors */
43587bf66d4SMartin Matuska 			if (error == ECKSUM) {
43687bf66d4SMartin Matuska 				/*
43787bf66d4SMartin Matuska 				 * If a Direct I/O read returned a checksum
43887bf66d4SMartin Matuska 				 * verify error, then it must be treated as
43987bf66d4SMartin Matuska 				 * suspicious. The contents of the buffer could
44087bf66d4SMartin Matuska 				 * have been manipulated while the I/O was in
44187bf66d4SMartin Matuska 				 * flight. In this case, the remainder of the
44287bf66d4SMartin Matuska 				 * I/O request will just be reissued through
44387bf66d4SMartin Matuska 				 * the ARC.
44487bf66d4SMartin Matuska 				 */
44587bf66d4SMartin Matuska 				if (uio->uio_extflg & UIO_DIRECT) {
44687bf66d4SMartin Matuska 					dio_checksum_failure = B_TRUE;
44787bf66d4SMartin Matuska 					uio->uio_extflg &= ~UIO_DIRECT;
44887bf66d4SMartin Matuska 					n += dio_remaining_resid;
44987bf66d4SMartin Matuska 					dio_remaining_resid = 0;
45087bf66d4SMartin Matuska 					continue;
45187bf66d4SMartin Matuska 				} else {
4527877fdebSMatt Macy 					error = SET_ERROR(EIO);
45387bf66d4SMartin Matuska 				}
45487bf66d4SMartin Matuska 			}
45581b22a98SMartin Matuska 
45681b22a98SMartin Matuska #if defined(__linux__)
45781b22a98SMartin Matuska 			/*
45881b22a98SMartin Matuska 			 * If we actually read some bytes, bubbling EFAULT
45981b22a98SMartin Matuska 			 * up to become EAGAIN isn't what we want here...
46081b22a98SMartin Matuska 			 *
46181b22a98SMartin Matuska 			 * ...on Linux, at least. On FreeBSD, doing this breaks.
46281b22a98SMartin Matuska 			 */
46381b22a98SMartin Matuska 			if (error == EFAULT &&
46481b22a98SMartin Matuska 			    (zfs_uio_offset(uio) - start_offset) != 0)
46581b22a98SMartin Matuska 				error = 0;
46681b22a98SMartin Matuska #endif
4677877fdebSMatt Macy 			break;
4687877fdebSMatt Macy 		}
4697877fdebSMatt Macy 
4707877fdebSMatt Macy 		n -= nbytes;
4717877fdebSMatt Macy 	}
4727877fdebSMatt Macy 
4737a7741afSMartin Matuska 	if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
4747a7741afSMartin Matuska 	    dio_remaining_resid != 0) {
4757a7741afSMartin Matuska 		/*
4767a7741afSMartin Matuska 		 * Temporarily remove the UIO_DIRECT flag from the UIO so the
4777a7741afSMartin Matuska 		 * remainder of the file can be read using the ARC.
4787a7741afSMartin Matuska */ 4797a7741afSMartin Matuska uio->uio_extflg &= ~UIO_DIRECT; 4807a7741afSMartin Matuska 4817a7741afSMartin Matuska if (zn_has_cached_data(zp, zfs_uio_offset(uio), 4827a7741afSMartin Matuska zfs_uio_offset(uio) + dio_remaining_resid - 1)) { 4837a7741afSMartin Matuska error = mappedread(zp, dio_remaining_resid, uio); 4847a7741afSMartin Matuska } else { 4857a7741afSMartin Matuska error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, 4867a7741afSMartin Matuska dio_remaining_resid); 4877a7741afSMartin Matuska } 4887a7741afSMartin Matuska uio->uio_extflg |= UIO_DIRECT; 4897a7741afSMartin Matuska 4907a7741afSMartin Matuska if (error != 0) 4917a7741afSMartin Matuska n += dio_remaining_resid; 4927a7741afSMartin Matuska } else if (error && (uio->uio_extflg & UIO_DIRECT)) { 4937a7741afSMartin Matuska n += dio_remaining_resid; 4947a7741afSMartin Matuska } 4957877fdebSMatt Macy int64_t nread = start_resid - n; 4967a7741afSMartin Matuska 4977877fdebSMatt Macy dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); 4987877fdebSMatt Macy out: 4997877fdebSMatt Macy zfs_rangelock_exit(lr); 5007877fdebSMatt Macy 50187bf66d4SMartin Matuska if (dio_checksum_failure == B_TRUE) 50287bf66d4SMartin Matuska uio->uio_extflg |= UIO_DIRECT; 50387bf66d4SMartin Matuska 5047a7741afSMartin Matuska /* 5057a7741afSMartin Matuska * Cleanup for Direct I/O if requested. 5067a7741afSMartin Matuska */ 5077a7741afSMartin Matuska if (uio->uio_extflg & UIO_DIRECT) 5087a7741afSMartin Matuska zfs_uio_free_dio_pages(uio, UIO_READ); 5097a7741afSMartin Matuska 5107877fdebSMatt Macy ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 511c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 5127877fdebSMatt Macy return (error); 5137877fdebSMatt Macy } 5147877fdebSMatt Macy 515c03c5b1cSMartin Matuska static void 516c03c5b1cSMartin Matuska zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, 517c03c5b1cSMartin Matuska uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) 518c03c5b1cSMartin Matuska { 519c03c5b1cSMartin Matuska zilog_t *zilog = zfsvfs->z_log; 520c03c5b1cSMartin Matuska const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 521c03c5b1cSMartin Matuska 522c03c5b1cSMartin Matuska ASSERT(clear_setid_bits_txgp != NULL); 523c03c5b1cSMartin Matuska ASSERT(tx != NULL); 524c03c5b1cSMartin Matuska 525c03c5b1cSMartin Matuska /* 526c03c5b1cSMartin Matuska * Clear Set-UID/Set-GID bits on successful write if not 527c03c5b1cSMartin Matuska * privileged and at least one of the execute bits is set. 528c03c5b1cSMartin Matuska * 529c03c5b1cSMartin Matuska * It would be nice to do this after all writes have 530c03c5b1cSMartin Matuska * been done, but that would still expose the ISUID/ISGID 531c03c5b1cSMartin Matuska * to another app after the partial write is committed. 532c03c5b1cSMartin Matuska * 533c03c5b1cSMartin Matuska * Note: we don't call zfs_fuid_map_id() here because 534c03c5b1cSMartin Matuska * user 0 is not an ephemeral uid. 
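 *
 * For example (illustrative): an unprivileged write to a binary with
 * mode 04755 (setuid root) clears S_ISUID, leaving mode 0755, the same
 * way a traditional Unix filesystem revokes setid bits on write.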
535c03c5b1cSMartin Matuska 	 */
536c03c5b1cSMartin Matuska 	mutex_enter(&zp->z_acl_lock);
537c03c5b1cSMartin Matuska 	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
538c03c5b1cSMartin Matuska 	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
539c03c5b1cSMartin Matuska 	    secpolicy_vnode_setid_retain(zp, cr,
540c03c5b1cSMartin Matuska 	    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
541c03c5b1cSMartin Matuska 		uint64_t newmode;
542c03c5b1cSMartin Matuska 
543c03c5b1cSMartin Matuska 		zp->z_mode &= ~(S_ISUID | S_ISGID);
544c03c5b1cSMartin Matuska 		newmode = zp->z_mode;
545c03c5b1cSMartin Matuska 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
546c03c5b1cSMartin Matuska 		    (void *)&newmode, sizeof (uint64_t), tx);
547c03c5b1cSMartin Matuska 
548c03c5b1cSMartin Matuska 		mutex_exit(&zp->z_acl_lock);
549c03c5b1cSMartin Matuska 
550c03c5b1cSMartin Matuska 		/*
551c03c5b1cSMartin Matuska 		 * Make sure SUID/SGID bits will be removed when we replay the
552c03c5b1cSMartin Matuska 		 * log. If the setid bits keep coming back, don't log more
553c03c5b1cSMartin Matuska 		 * than one TX_SETATTR per transaction group.
554c03c5b1cSMartin Matuska 		 */
555c03c5b1cSMartin Matuska 		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
556da5137abSMartin Matuska 			vattr_t va = {0};
557c03c5b1cSMartin Matuska 
558716fd348SMartin Matuska 			va.va_mask = ATTR_MODE;
559c03c5b1cSMartin Matuska 			va.va_nodeid = zp->z_id;
560c03c5b1cSMartin Matuska 			va.va_mode = newmode;
561716fd348SMartin Matuska 			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
562716fd348SMartin Matuska 			    ATTR_MODE, NULL);
563c03c5b1cSMartin Matuska 			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
564c03c5b1cSMartin Matuska 		}
565c03c5b1cSMartin Matuska 	} else {
566c03c5b1cSMartin Matuska 		mutex_exit(&zp->z_acl_lock);
567c03c5b1cSMartin Matuska 	}
568c03c5b1cSMartin Matuska }
569c03c5b1cSMartin Matuska 
5707877fdebSMatt Macy /*
5717877fdebSMatt Macy  * Write the bytes to a file.
5727877fdebSMatt Macy  *
5737877fdebSMatt Macy  * IN:	zp	- znode of file to be written to.
5747877fdebSMatt Macy  *	uio	- structure supplying write location, range info,
5757877fdebSMatt Macy  *		  and data buffer.
5767877fdebSMatt Macy  *	ioflag	- O_APPEND flag set if in append mode.
5777877fdebSMatt Macy  *		  O_DIRECT flag; used to bypass page cache.
5787877fdebSMatt Macy  *	cr	- credentials of caller.
5797877fdebSMatt Macy  *
5807877fdebSMatt Macy  * OUT:	uio	- updated offset and range.
5817877fdebSMatt Macy * 5827877fdebSMatt Macy * RETURN: 0 if success 5837877fdebSMatt Macy * error code if failure 5847877fdebSMatt Macy * 5857877fdebSMatt Macy * Timestamps: 5867877fdebSMatt Macy * ip - ctime|mtime updated if byte count > 0 5877877fdebSMatt Macy */ 5887877fdebSMatt Macy int 589184c1b94SMartin Matuska zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) 5907877fdebSMatt Macy { 591e92ffd9bSMartin Matuska int error = 0, error1; 592184c1b94SMartin Matuska ssize_t start_resid = zfs_uio_resid(uio); 593c03c5b1cSMartin Matuska uint64_t clear_setid_bits_txg = 0; 5947a7741afSMartin Matuska boolean_t o_direct_defer = B_FALSE; 5957877fdebSMatt Macy 5967877fdebSMatt Macy /* 5977877fdebSMatt Macy * Fasttrack empty write 5987877fdebSMatt Macy */ 5997877fdebSMatt Macy ssize_t n = start_resid; 6007877fdebSMatt Macy if (n == 0) 6017877fdebSMatt Macy return (0); 6027877fdebSMatt Macy 6037877fdebSMatt Macy zfsvfs_t *zfsvfs = ZTOZSB(zp); 604c7046f76SMartin Matuska if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 605c7046f76SMartin Matuska return (error); 6067877fdebSMatt Macy 6077877fdebSMatt Macy sa_bulk_attr_t bulk[4]; 6087877fdebSMatt Macy int count = 0; 6097877fdebSMatt Macy uint64_t mtime[2], ctime[2]; 6107877fdebSMatt Macy SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 6117877fdebSMatt Macy SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 6127877fdebSMatt Macy SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 6137877fdebSMatt Macy &zp->z_size, 8); 6147877fdebSMatt Macy SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 6157877fdebSMatt Macy &zp->z_pflags, 8); 6167877fdebSMatt Macy 6177877fdebSMatt Macy /* 6187877fdebSMatt Macy * Callers might not be able to detect properly that we are read-only, 6197877fdebSMatt Macy * so check it explicitly here. 6207877fdebSMatt Macy */ 6217877fdebSMatt Macy if (zfs_is_readonly(zfsvfs)) { 622c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 6237877fdebSMatt Macy return (SET_ERROR(EROFS)); 6247877fdebSMatt Macy } 6257877fdebSMatt Macy 6267877fdebSMatt Macy /* 6279db44a8eSMartin Matuska * If immutable or not appending then return EPERM. 6289db44a8eSMartin Matuska * Intentionally allow ZFS_READONLY through here. 6299db44a8eSMartin Matuska * See zfs_zaccess_common() 6307877fdebSMatt Macy */ 6319db44a8eSMartin Matuska if ((zp->z_pflags & ZFS_IMMUTABLE) || 6327877fdebSMatt Macy ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && 633184c1b94SMartin Matuska (zfs_uio_offset(uio) < zp->z_size))) { 634c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 6357877fdebSMatt Macy return (SET_ERROR(EPERM)); 6367877fdebSMatt Macy } 6377877fdebSMatt Macy 6387877fdebSMatt Macy /* 6397877fdebSMatt Macy * Validate file offset 6407877fdebSMatt Macy */ 641184c1b94SMartin Matuska offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); 6427877fdebSMatt Macy if (woff < 0) { 643c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 6447877fdebSMatt Macy return (SET_ERROR(EINVAL)); 6457877fdebSMatt Macy } 6467877fdebSMatt Macy 6477877fdebSMatt Macy /* 6487a7741afSMartin Matuska * Setting up Direct I/O if requested. 
6497a7741afSMartin Matuska 	 */
6507a7741afSMartin Matuska 	error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
6517a7741afSMartin Matuska 	if (error) {
6527a7741afSMartin Matuska 		zfs_exit(zfsvfs, FTAG);
6537a7741afSMartin Matuska 		return (SET_ERROR(error));
6547a7741afSMartin Matuska 	}
6557a7741afSMartin Matuska 
6567a7741afSMartin Matuska 	/*
6577877fdebSMatt Macy 	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
6587877fdebSMatt Macy 	 * don't hold up txg.
6597877fdebSMatt Macy 	 */
6607b5e6873SMartin Matuska 	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
6617b5e6873SMartin Matuska 	if (zfs_uio_prefaultpages(pfbytes, uio)) {
662c7046f76SMartin Matuska 		zfs_exit(zfsvfs, FTAG);
6637877fdebSMatt Macy 		return (SET_ERROR(EFAULT));
6647877fdebSMatt Macy 	}
6657877fdebSMatt Macy 
6667877fdebSMatt Macy 	/*
6677877fdebSMatt Macy 	 * If in append mode, set the io offset pointer to eof.
6687877fdebSMatt Macy 	 */
6697877fdebSMatt Macy 	zfs_locked_range_t *lr;
6707877fdebSMatt Macy 	if (ioflag & O_APPEND) {
6717877fdebSMatt Macy 		/*
6727877fdebSMatt Macy 		 * Obtain an appending range lock to guarantee file append
6737877fdebSMatt Macy 		 * semantics. We reset the write offset once we have the lock.
6747877fdebSMatt Macy 		 */
6757877fdebSMatt Macy 		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
6767877fdebSMatt Macy 		woff = lr->lr_offset;
6777877fdebSMatt Macy 		if (lr->lr_length == UINT64_MAX) {
6787877fdebSMatt Macy 			/*
6797877fdebSMatt Macy 			 * We overlocked the file because this write will cause
6807877fdebSMatt Macy 			 * the file block size to increase.
6817877fdebSMatt Macy 			 * Note that zp_size cannot change with this lock held.
6827877fdebSMatt Macy 			 */
6837877fdebSMatt Macy 			woff = zp->z_size;
6847877fdebSMatt Macy 		}
685184c1b94SMartin Matuska 		zfs_uio_setoffset(uio, woff);
6867a7741afSMartin Matuska 		/*
6877a7741afSMartin Matuska 		 * We need to update the starting offset as well because it is
6887a7741afSMartin Matuska 		 * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
6897a7741afSMartin Matuska 		 * layers.
6907a7741afSMartin Matuska 		 */
6917a7741afSMartin Matuska 		zfs_uio_setsoffset(uio, woff);
6927877fdebSMatt Macy 	} else {
6937877fdebSMatt Macy 		/*
6947877fdebSMatt Macy 		 * Note that if the file block size will change as a result of
6957877fdebSMatt Macy 		 * this write, then this range lock will lock the entire file
6967877fdebSMatt Macy 		 * so that we can re-write the block safely.
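 *
 * For example (illustrative): if a file's single block is still
 * smaller than the recordsize, almost any write can grow that block,
 * and the rewrite affects the entire file contents, so a lock scoped
 * to just [woff, woff + n) would not be sufficient.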
6977877fdebSMatt Macy 		 */
6987877fdebSMatt Macy 		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
6997877fdebSMatt Macy 	}
7007877fdebSMatt Macy 
7012a58b312SMartin Matuska 	if (zn_rlimit_fsize_uio(zp, uio)) {
7027877fdebSMatt Macy 		zfs_rangelock_exit(lr);
703c7046f76SMartin Matuska 		zfs_exit(zfsvfs, FTAG);
7047877fdebSMatt Macy 		return (SET_ERROR(EFBIG));
7057877fdebSMatt Macy 	}
7067877fdebSMatt Macy 
7077877fdebSMatt Macy 	const rlim64_t limit = MAXOFFSET_T;
7087877fdebSMatt Macy 
7097877fdebSMatt Macy 	if (woff >= limit) {
7107877fdebSMatt Macy 		zfs_rangelock_exit(lr);
711c7046f76SMartin Matuska 		zfs_exit(zfsvfs, FTAG);
7127877fdebSMatt Macy 		return (SET_ERROR(EFBIG));
7137877fdebSMatt Macy 	}
7147877fdebSMatt Macy 
7157877fdebSMatt Macy 	if (n > limit - woff)
7167877fdebSMatt Macy 		n = limit - woff;
7177877fdebSMatt Macy 
7187877fdebSMatt Macy 	uint64_t end_size = MAX(zp->z_size, woff + n);
7197877fdebSMatt Macy 	zilog_t *zilog = zfsvfs->z_log;
720f8b1db88SMartin Matuska 	boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
721f8b1db88SMartin Matuska 	    (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);
7227877fdebSMatt Macy 
7237877fdebSMatt Macy 	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
7247877fdebSMatt Macy 	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
7257877fdebSMatt Macy 	const uint64_t projid = zp->z_projid;
7267877fdebSMatt Macy 
7277877fdebSMatt Macy 	/*
7287a7741afSMartin Matuska 	 * In the event we are increasing the file block size
7297a7741afSMartin Matuska 	 * (lr_length == UINT64_MAX), we will direct the write to the ARC.
7307a7741afSMartin Matuska 	 * Because zfs_grow_blocksize() will read from the ARC in order to
7317a7741afSMartin Matuska 	 * grow the dbuf, we avoid doing Direct I/O here as that would cause
7327a7741afSMartin Matuska 	 * data written to disk to be overwritten by data in the ARC during
7337a7741afSMartin Matuska 	 * the sync phase. Besides writing data twice to disk, we also
7347a7741afSMartin Matuska 	 * want to avoid consistency concerns between data in the ARC and
7357a7741afSMartin Matuska 	 * on disk while growing the file's blocksize.
7367a7741afSMartin Matuska 	 *
7377a7741afSMartin Matuska 	 * We will only temporarily remove Direct I/O and put it back after
7387a7741afSMartin Matuska 	 * we have grown the blocksize. We do this in the event a request
7397a7741afSMartin Matuska 	 * is larger than max_blksz, so further requests to
7407a7741afSMartin Matuska 	 * dmu_write_uio_dbuf() will still issue the requests using Direct
7417a7741afSMartin Matuska 	 * I/O.
7427a7741afSMartin Matuska 	 *
7437a7741afSMartin Matuska 	 * As an example:
7447a7741afSMartin Matuska 	 * The first block of the file is being written as a 4k request with
7457a7741afSMartin Matuska 	 * a recordsize of 1K. The first 1K issued in the loop below will go
7467a7741afSMartin Matuska 	 * through the ARC; however, the following 3 1K requests will
7477a7741afSMartin Matuska 	 * use Direct I/O.
7487a7741afSMartin Matuska 	 */
7497a7741afSMartin Matuska 	if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
7507a7741afSMartin Matuska 		uio->uio_extflg &= ~UIO_DIRECT;
7517a7741afSMartin Matuska 		o_direct_defer = B_TRUE;
7527a7741afSMartin Matuska 	}
7537a7741afSMartin Matuska 
7547a7741afSMartin Matuska 	/*
7557877fdebSMatt Macy 	 * Write the file in reasonable size chunks. Each chunk is written
7567877fdebSMatt Macy 	 * in a separate transaction; this keeps the intent log records small
7577877fdebSMatt Macy 	 * and allows us to do more fine-grained space accounting.
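 *
 * A worked example of the block-size growth logic below (illustrative,
 * assuming the default 128K z_max_blksz and SPA_MINBLOCKSIZE of 512):
 * extending a file whose single 1K block must grow to hold 5000 bytes
 * gives
 *
 *	blksz = MIN(131072, P2ROUNDUP(5000, 512)) = 5120
 *	blksz = MAX(5120, 1024) = 5120
 *
 * so the file's block is grown to 5120 bytes rather than jumping
 * straight to the full recordsize.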
7587877fdebSMatt Macy */ 7597877fdebSMatt Macy while (n > 0) { 760184c1b94SMartin Matuska woff = zfs_uio_offset(uio); 7617877fdebSMatt Macy 7627877fdebSMatt Macy if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 7637877fdebSMatt Macy zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 7647877fdebSMatt Macy (projid != ZFS_DEFAULT_PROJID && 7657877fdebSMatt Macy zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 7667877fdebSMatt Macy projid))) { 7677877fdebSMatt Macy error = SET_ERROR(EDQUOT); 7687877fdebSMatt Macy break; 7697877fdebSMatt Macy } 7707877fdebSMatt Macy 7717b5e6873SMartin Matuska uint64_t blksz; 7727b5e6873SMartin Matuska if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { 7737b5e6873SMartin Matuska if (zp->z_blksz > zfsvfs->z_max_blksz && 7747b5e6873SMartin Matuska !ISP2(zp->z_blksz)) { 7757b5e6873SMartin Matuska /* 7767b5e6873SMartin Matuska * File's blocksize is already larger than the 7777b5e6873SMartin Matuska * "recordsize" property. Only let it grow to 7787b5e6873SMartin Matuska * the next power of 2. 7797b5e6873SMartin Matuska */ 7807b5e6873SMartin Matuska blksz = 1 << highbit64(zp->z_blksz); 7817b5e6873SMartin Matuska } else { 7827b5e6873SMartin Matuska blksz = zfsvfs->z_max_blksz; 7837b5e6873SMartin Matuska } 7847b5e6873SMartin Matuska blksz = MIN(blksz, P2ROUNDUP(end_size, 7857b5e6873SMartin Matuska SPA_MINBLOCKSIZE)); 7867b5e6873SMartin Matuska blksz = MAX(blksz, zp->z_blksz); 7877b5e6873SMartin Matuska } else { 7887b5e6873SMartin Matuska blksz = zp->z_blksz; 7897b5e6873SMartin Matuska } 7907b5e6873SMartin Matuska 7917877fdebSMatt Macy arc_buf_t *abuf = NULL; 7927b5e6873SMartin Matuska ssize_t nbytes = n; 7937b5e6873SMartin Matuska if (n >= blksz && woff >= zp->z_size && 7947b5e6873SMartin Matuska P2PHASE(woff, blksz) == 0 && 7957a7741afSMartin Matuska !(uio->uio_extflg & UIO_DIRECT) && 7967b5e6873SMartin Matuska (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { 7977877fdebSMatt Macy /* 7987877fdebSMatt Macy * This write covers a full block. "Borrow" a buffer 7997877fdebSMatt Macy * from the dmu so that we can fill it before we enter 8007877fdebSMatt Macy * a transaction. This avoids the possibility of 8017877fdebSMatt Macy * holding up the transaction if the data copy hangs 8027877fdebSMatt Macy * up on a pagefault (e.g., from an NFS server mapping). 8037877fdebSMatt Macy */ 8047877fdebSMatt Macy abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 8057b5e6873SMartin Matuska blksz); 8067877fdebSMatt Macy ASSERT(abuf != NULL); 8077b5e6873SMartin Matuska ASSERT(arc_buf_size(abuf) == blksz); 8087b5e6873SMartin Matuska if ((error = zfs_uiocopy(abuf->b_data, blksz, 8097b5e6873SMartin Matuska UIO_WRITE, uio, &nbytes))) { 8107877fdebSMatt Macy dmu_return_arcbuf(abuf); 8117877fdebSMatt Macy break; 8127877fdebSMatt Macy } 8137b5e6873SMartin Matuska ASSERT3S(nbytes, ==, blksz); 8147b5e6873SMartin Matuska } else { 8157b5e6873SMartin Matuska nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - 8167b5e6873SMartin Matuska P2PHASE(woff, blksz)); 8177b5e6873SMartin Matuska if (pfbytes < nbytes) { 8187b5e6873SMartin Matuska if (zfs_uio_prefaultpages(nbytes, uio)) { 8197b5e6873SMartin Matuska error = SET_ERROR(EFAULT); 8207b5e6873SMartin Matuska break; 8217b5e6873SMartin Matuska } 8227b5e6873SMartin Matuska pfbytes = nbytes; 8237b5e6873SMartin Matuska } 8247877fdebSMatt Macy } 8257877fdebSMatt Macy 8267877fdebSMatt Macy /* 8277877fdebSMatt Macy * Start a transaction. 
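 *
 * The calls below follow the standard DMU transaction lifecycle:
 * create the tx, declare the holds it needs, assign it to a txg
 * (aborting on failure), make the changes, then commit.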
8287877fdebSMatt Macy */ 8297877fdebSMatt Macy dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 8307877fdebSMatt Macy dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 8317877fdebSMatt Macy dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 8327877fdebSMatt Macy DB_DNODE_ENTER(db); 8337b5e6873SMartin Matuska dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); 8347877fdebSMatt Macy DB_DNODE_EXIT(db); 8357877fdebSMatt Macy zfs_sa_upgrade_txholds(tx, zp); 8367877fdebSMatt Macy error = dmu_tx_assign(tx, TXG_WAIT); 8377877fdebSMatt Macy if (error) { 8387877fdebSMatt Macy dmu_tx_abort(tx); 8397877fdebSMatt Macy if (abuf != NULL) 8407877fdebSMatt Macy dmu_return_arcbuf(abuf); 8417877fdebSMatt Macy break; 8427877fdebSMatt Macy } 8437877fdebSMatt Macy 8447877fdebSMatt Macy /* 845c03c5b1cSMartin Matuska * NB: We must call zfs_clear_setid_bits_if_necessary before 846c03c5b1cSMartin Matuska * committing the transaction! 847c03c5b1cSMartin Matuska */ 848c03c5b1cSMartin Matuska 849c03c5b1cSMartin Matuska /* 8507877fdebSMatt Macy * If rangelock_enter() over-locked we grow the blocksize 8517877fdebSMatt Macy * and then reduce the lock range. This will only happen 8527877fdebSMatt Macy * on the first iteration since rangelock_reduce() will 8537877fdebSMatt Macy * shrink down lr_length to the appropriate size. 8547877fdebSMatt Macy */ 8557877fdebSMatt Macy if (lr->lr_length == UINT64_MAX) { 8567b5e6873SMartin Matuska zfs_grow_blocksize(zp, blksz, tx); 8577877fdebSMatt Macy zfs_rangelock_reduce(lr, woff, n); 8587877fdebSMatt Macy } 8597877fdebSMatt Macy 8607877fdebSMatt Macy ssize_t tx_bytes; 8617877fdebSMatt Macy if (abuf == NULL) { 862184c1b94SMartin Matuska tx_bytes = zfs_uio_resid(uio); 863184c1b94SMartin Matuska zfs_uio_fault_disable(uio, B_TRUE); 8647877fdebSMatt Macy error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 8657877fdebSMatt Macy uio, nbytes, tx); 866184c1b94SMartin Matuska zfs_uio_fault_disable(uio, B_FALSE); 8677877fdebSMatt Macy #ifdef __linux__ 8687877fdebSMatt Macy if (error == EFAULT) { 869c03c5b1cSMartin Matuska zfs_clear_setid_bits_if_necessary(zfsvfs, zp, 870c03c5b1cSMartin Matuska cr, &clear_setid_bits_txg, tx); 8717877fdebSMatt Macy dmu_tx_commit(tx); 8727877fdebSMatt Macy /* 8737877fdebSMatt Macy * Account for partial writes before 8747877fdebSMatt Macy * continuing the loop. 8757877fdebSMatt Macy * Update needs to occur before the next 876184c1b94SMartin Matuska * zfs_uio_prefaultpages, or prefaultpages may 8777877fdebSMatt Macy * error, and we may break the loop early. 8787877fdebSMatt Macy */ 879184c1b94SMartin Matuska n -= tx_bytes - zfs_uio_resid(uio); 8807b5e6873SMartin Matuska pfbytes -= tx_bytes - zfs_uio_resid(uio); 8817877fdebSMatt Macy continue; 8827877fdebSMatt Macy } 8837877fdebSMatt Macy #endif 884e92ffd9bSMartin Matuska /* 885e92ffd9bSMartin Matuska * On FreeBSD, EFAULT should be propagated back to the 886e92ffd9bSMartin Matuska * VFS, which will handle faulting and will retry. 887e92ffd9bSMartin Matuska */ 888e92ffd9bSMartin Matuska if (error != 0 && error != EFAULT) { 889c03c5b1cSMartin Matuska zfs_clear_setid_bits_if_necessary(zfsvfs, zp, 890c03c5b1cSMartin Matuska cr, &clear_setid_bits_txg, tx); 8917877fdebSMatt Macy dmu_tx_commit(tx); 8927877fdebSMatt Macy break; 8937877fdebSMatt Macy } 894184c1b94SMartin Matuska tx_bytes -= zfs_uio_resid(uio); 8957877fdebSMatt Macy } else { 8967877fdebSMatt Macy /* 8977877fdebSMatt Macy * Thus, we're writing a full block at a block-aligned 8987877fdebSMatt Macy * offset and extending the file past EOF. 
8997877fdebSMatt Macy 			 *
9007877fdebSMatt Macy 			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
9017877fdebSMatt Macy 			 * arc buffer to a dbuf.
9027877fdebSMatt Macy 			 */
9037877fdebSMatt Macy 			error = dmu_assign_arcbuf_by_dbuf(
9047877fdebSMatt Macy 			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
9057877fdebSMatt Macy 			if (error != 0) {
906c03c5b1cSMartin Matuska 				/*
907c03c5b1cSMartin Matuska 				 * XXX This might not be necessary if
908c03c5b1cSMartin Matuska 				 * dmu_assign_arcbuf_by_dbuf is guaranteed
909c03c5b1cSMartin Matuska 				 * to be atomic.
910c03c5b1cSMartin Matuska 				 */
911c03c5b1cSMartin Matuska 				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
912c03c5b1cSMartin Matuska 				    cr, &clear_setid_bits_txg, tx);
9137877fdebSMatt Macy 				dmu_return_arcbuf(abuf);
9147877fdebSMatt Macy 				dmu_tx_commit(tx);
9157877fdebSMatt Macy 				break;
9167877fdebSMatt Macy 			}
917184c1b94SMartin Matuska 			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
918184c1b94SMartin Matuska 			zfs_uioskip(uio, nbytes);
9197877fdebSMatt Macy 			tx_bytes = nbytes;
9207877fdebSMatt Macy 		}
9217a7741afSMartin Matuska 		/*
9227a7741afSMartin Matuska 		 * There is a window where a file's pages can be mmap'ed after
9237a7741afSMartin Matuska 		 * zfs_setup_direct() is called. This is due to the fact that
9247a7741afSMartin Matuska 		 * the rangelock in this function is acquired after calling
9257a7741afSMartin Matuska 		 * zfs_setup_direct(). This is done so that
9267a7741afSMartin Matuska 		 * zfs_uio_prefaultpages() does not attempt to fault in pages
9277a7741afSMartin Matuska 		 * on Linux for Direct I/O requests. This is not necessary as
9287a7741afSMartin Matuska 		 * the pages are pinned in memory and can not be faulted out.
9297a7741afSMartin Matuska 		 * Ideally, the rangelock would be held before calling
9307a7741afSMartin Matuska 		 * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
9317a7741afSMartin Matuska 		 * this can lead to a deadlock as zfs_getpage() also acquires
9327a7741afSMartin Matuska 		 * the rangelock as a RL_WRITER and prefaulting the pages can
9337a7741afSMartin Matuska 		 * lead to zfs_getpage() being called.
9347a7741afSMartin Matuska 		 *
9357a7741afSMartin Matuska 		 * In the case of the pages being mapped after
9367a7741afSMartin Matuska 		 * zfs_setup_direct() is called, the call to update_pages()
9377a7741afSMartin Matuska 		 * will still be made to make sure there is consistency between
9387a7741afSMartin Matuska 		 * the ARC and the Linux page cache. This is an unfortunate
9397a7741afSMartin Matuska 		 * situation as the data will be read back into the ARC after
9407a7741afSMartin Matuska 		 * the Direct I/O write has completed, but this is the penalty
9417a7741afSMartin Matuska 		 * for writing to a mmap'ed region of a file using Direct I/O.
9427a7741afSMartin Matuska 		 */
943c9539b89SMartin Matuska 		if (tx_bytes &&
9447a7741afSMartin Matuska 		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
9457877fdebSMatt Macy 			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
9467877fdebSMatt Macy 		}
9477877fdebSMatt Macy 
9487877fdebSMatt Macy 		/*
9497877fdebSMatt Macy 		 * If we made no progress, we're done. If we made even
9507877fdebSMatt Macy 		 * partial progress, update the znode and ZIL accordingly.
9517877fdebSMatt Macy */ 9527877fdebSMatt Macy if (tx_bytes == 0) { 9537877fdebSMatt Macy (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 9547877fdebSMatt Macy (void *)&zp->z_size, sizeof (uint64_t), tx); 9557877fdebSMatt Macy dmu_tx_commit(tx); 9567877fdebSMatt Macy ASSERT(error != 0); 9577877fdebSMatt Macy break; 9587877fdebSMatt Macy } 9597877fdebSMatt Macy 960c03c5b1cSMartin Matuska zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, 961c03c5b1cSMartin Matuska &clear_setid_bits_txg, tx); 9627877fdebSMatt Macy 9637877fdebSMatt Macy zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 9647877fdebSMatt Macy 9657877fdebSMatt Macy /* 9667877fdebSMatt Macy * Update the file size (zp_size) if it has changed; 9677877fdebSMatt Macy * account for possible concurrent updates. 9687877fdebSMatt Macy */ 969184c1b94SMartin Matuska while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { 9707877fdebSMatt Macy (void) atomic_cas_64(&zp->z_size, end_size, 971184c1b94SMartin Matuska zfs_uio_offset(uio)); 972e92ffd9bSMartin Matuska ASSERT(error == 0 || error == EFAULT); 9737877fdebSMatt Macy } 9747877fdebSMatt Macy /* 9757877fdebSMatt Macy * If we are replaying and eof is non zero then force 9767877fdebSMatt Macy * the file size to the specified eof. Note, there's no 9777877fdebSMatt Macy * concurrency during replay. 9787877fdebSMatt Macy */ 9797877fdebSMatt Macy if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 9807877fdebSMatt Macy zp->z_size = zfsvfs->z_replay_eof; 9817877fdebSMatt Macy 982e92ffd9bSMartin Matuska error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 983e92ffd9bSMartin Matuska if (error1 != 0) 984e92ffd9bSMartin Matuska /* Avoid clobbering EFAULT. */ 985e92ffd9bSMartin Matuska error = error1; 9867877fdebSMatt Macy 987c03c5b1cSMartin Matuska /* 988c03c5b1cSMartin Matuska * NB: During replay, the TX_SETATTR record logged by 989c03c5b1cSMartin Matuska * zfs_clear_setid_bits_if_necessary must precede any of 990c03c5b1cSMartin Matuska * the TX_WRITE records logged here. 991c03c5b1cSMartin Matuska */ 992f8b1db88SMartin Matuska zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, 9937a7741afSMartin Matuska uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, 9947a7741afSMartin Matuska NULL); 995c03c5b1cSMartin Matuska 9967877fdebSMatt Macy dmu_tx_commit(tx); 9977877fdebSMatt Macy 9987a7741afSMartin Matuska /* 9997a7741afSMartin Matuska * Direct I/O was deferred in order to grow the first block. 10007a7741afSMartin Matuska * At this point it can be re-enabled for subsequent writes. 
10017a7741afSMartin Matuska */ 10027a7741afSMartin Matuska if (o_direct_defer) { 10037a7741afSMartin Matuska ASSERT(ioflag & O_DIRECT); 10047a7741afSMartin Matuska uio->uio_extflg |= UIO_DIRECT; 10057a7741afSMartin Matuska o_direct_defer = B_FALSE; 10067a7741afSMartin Matuska } 10077a7741afSMartin Matuska 10087877fdebSMatt Macy if (error != 0) 10097877fdebSMatt Macy break; 10107877fdebSMatt Macy ASSERT3S(tx_bytes, ==, nbytes); 10117877fdebSMatt Macy n -= nbytes; 10127b5e6873SMartin Matuska pfbytes -= nbytes; 10137877fdebSMatt Macy } 10147877fdebSMatt Macy 10157a7741afSMartin Matuska if (o_direct_defer) { 10167a7741afSMartin Matuska ASSERT(ioflag & O_DIRECT); 10177a7741afSMartin Matuska uio->uio_extflg |= UIO_DIRECT; 10187a7741afSMartin Matuska o_direct_defer = B_FALSE; 10197a7741afSMartin Matuska } 10207a7741afSMartin Matuska 1021184c1b94SMartin Matuska zfs_znode_update_vfs(zp); 10227877fdebSMatt Macy zfs_rangelock_exit(lr); 10237877fdebSMatt Macy 10247877fdebSMatt Macy /* 10257a7741afSMartin Matuska * Cleanup for Direct I/O if requested. 10267a7741afSMartin Matuska */ 10277a7741afSMartin Matuska if (uio->uio_extflg & UIO_DIRECT) 10287a7741afSMartin Matuska zfs_uio_free_dio_pages(uio, UIO_WRITE); 10297a7741afSMartin Matuska 10307a7741afSMartin Matuska /* 10317877fdebSMatt Macy * If we're in replay mode, or we made no progress, or the 10327877fdebSMatt Macy * uio data is inaccessible return an error. Otherwise, it's 10337877fdebSMatt Macy * at least a partial write, so it's successful. 10347877fdebSMatt Macy */ 1035184c1b94SMartin Matuska if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || 10367877fdebSMatt Macy error == EFAULT) { 1037c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 10387877fdebSMatt Macy return (error); 10397877fdebSMatt Macy } 10407877fdebSMatt Macy 1041f8b1db88SMartin Matuska if (commit) 10427877fdebSMatt Macy zil_commit(zilog, zp->z_id); 10437877fdebSMatt Macy 10447a7741afSMartin Matuska int64_t nwritten = start_resid - zfs_uio_resid(uio); 10457877fdebSMatt Macy dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); 10467877fdebSMatt Macy 1047c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 10487877fdebSMatt Macy return (0); 10497877fdebSMatt Macy } 10507877fdebSMatt Macy 10517877fdebSMatt Macy int 10527877fdebSMatt Macy zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) 10537877fdebSMatt Macy { 10547877fdebSMatt Macy zfsvfs_t *zfsvfs = ZTOZSB(zp); 10557877fdebSMatt Macy int error; 10567877fdebSMatt Macy boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 10577877fdebSMatt Macy 1058c7046f76SMartin Matuska if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1059c7046f76SMartin Matuska return (error); 10607877fdebSMatt Macy error = zfs_getacl(zp, vsecp, skipaclchk, cr); 1061c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 10627877fdebSMatt Macy 10637877fdebSMatt Macy return (error); 10647877fdebSMatt Macy } 10657877fdebSMatt Macy 10667877fdebSMatt Macy int 10677877fdebSMatt Macy zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) 10687877fdebSMatt Macy { 10697877fdebSMatt Macy zfsvfs_t *zfsvfs = ZTOZSB(zp); 10707877fdebSMatt Macy int error; 10717877fdebSMatt Macy boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 107278ae60b4SMartin Matuska zilog_t *zilog; 10737877fdebSMatt Macy 1074c7046f76SMartin Matuska if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1075c7046f76SMartin Matuska return (error); 107678ae60b4SMartin Matuska zilog = zfsvfs->z_log; 10777877fdebSMatt Macy error = zfs_setacl(zp, vsecp, skipaclchk, cr); 10787877fdebSMatt Macy 10797877fdebSMatt Macy if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 10807877fdebSMatt Macy zil_commit(zilog, 0); 10817877fdebSMatt Macy 1082c7046f76SMartin Matuska zfs_exit(zfsvfs, FTAG); 10837877fdebSMatt Macy return (error); 10847877fdebSMatt Macy } 10857877fdebSMatt Macy 10867877fdebSMatt Macy #ifdef ZFS_DEBUG 10877877fdebSMatt Macy static int zil_fault_io = 0; 10887877fdebSMatt Macy #endif 10897877fdebSMatt Macy 10907877fdebSMatt Macy static void zfs_get_done(zgd_t *zgd, int error); 10917877fdebSMatt Macy 10927877fdebSMatt Macy /* 10937877fdebSMatt Macy * Get data to generate a TX_WRITE intent log record. 10947877fdebSMatt Macy */ 10957877fdebSMatt Macy int 1096f9693befSMartin Matuska zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, 1097f9693befSMartin Matuska struct lwb *lwb, zio_t *zio) 10987877fdebSMatt Macy { 10997877fdebSMatt Macy zfsvfs_t *zfsvfs = arg; 11007877fdebSMatt Macy objset_t *os = zfsvfs->z_os; 11017877fdebSMatt Macy znode_t *zp; 11027877fdebSMatt Macy uint64_t object = lr->lr_foid; 11037877fdebSMatt Macy uint64_t offset = lr->lr_offset; 11047877fdebSMatt Macy uint64_t size = lr->lr_length; 11057877fdebSMatt Macy zgd_t *zgd; 11067877fdebSMatt Macy int error = 0; 1107f9693befSMartin Matuska uint64_t zp_gen; 11087877fdebSMatt Macy 11097877fdebSMatt Macy ASSERT3P(lwb, !=, NULL); 11107877fdebSMatt Macy ASSERT3U(size, !=, 0); 11117877fdebSMatt Macy 11127877fdebSMatt Macy /* 11137877fdebSMatt Macy * Nothing to do if the file has been removed 11147877fdebSMatt Macy */ 11157877fdebSMatt Macy if (zfs_zget(zfsvfs, object, &zp) != 0) 11167877fdebSMatt Macy return (SET_ERROR(ENOENT)); 11177877fdebSMatt Macy if (zp->z_unlinked) { 11187877fdebSMatt Macy /* 11197877fdebSMatt Macy * Release the vnode asynchronously as we currently have the 11207877fdebSMatt Macy * txg stopped from syncing. 11217877fdebSMatt Macy */ 11227877fdebSMatt Macy zfs_zrele_async(zp); 11237877fdebSMatt Macy return (SET_ERROR(ENOENT)); 11247877fdebSMatt Macy } 1125f9693befSMartin Matuska /* check if generation number matches */ 1126f9693befSMartin Matuska if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1127f9693befSMartin Matuska sizeof (zp_gen)) != 0) { 1128f9693befSMartin Matuska zfs_zrele_async(zp); 1129f9693befSMartin Matuska return (SET_ERROR(EIO)); 1130f9693befSMartin Matuska } 1131f9693befSMartin Matuska if (zp_gen != gen) { 1132f9693befSMartin Matuska zfs_zrele_async(zp); 1133f9693befSMartin Matuska return (SET_ERROR(ENOENT)); 1134f9693befSMartin Matuska } 11357877fdebSMatt Macy 113615f0b8c3SMartin Matuska zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 11377877fdebSMatt Macy zgd->zgd_lwb = lwb; 11387877fdebSMatt Macy zgd->zgd_private = zp; 11397877fdebSMatt Macy 11407877fdebSMatt Macy /* 11417877fdebSMatt Macy * Write records come in two flavors: immediate and indirect. 11427877fdebSMatt Macy * For small writes it's cheaper to store the data with the 11437877fdebSMatt Macy * log record (immediate); for large writes it's cheaper to 11447877fdebSMatt Macy * sync the data and get a pointer to it (indirect) so that 11457877fdebSMatt Macy * we don't have to write the data twice. 
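 *
 * A rough decision sketch (illustrative only; the actual cutover is
 * made when the record is logged and also depends on logbias and the
 * zfs_immediate_write_sz tunable):
 *
 *	if (write is small)
 *		copy the data into the TX_WRITE record;	  // immediate
 *	else
 *		dmu_sync() the block and log its blkptr;  // indirect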
11467877fdebSMatt Macy */
11477877fdebSMatt Macy if (buf != NULL) { /* immediate write */
11487a7741afSMartin Matuska zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
11497a7741afSMartin Matuska size, RL_READER);
11507877fdebSMatt Macy /* test for truncation needs to be done while range locked */
11517877fdebSMatt Macy if (offset >= zp->z_size) {
11527877fdebSMatt Macy error = SET_ERROR(ENOENT);
11537877fdebSMatt Macy } else {
11547877fdebSMatt Macy error = dmu_read(os, object, offset, size, buf,
11557877fdebSMatt Macy DMU_READ_NO_PREFETCH);
11567877fdebSMatt Macy }
11577877fdebSMatt Macy ASSERT(error == 0 || error == ENOENT);
11587877fdebSMatt Macy } else { /* indirect write */
1159315ee00fSMartin Matuska ASSERT3P(zio, !=, NULL);
11607877fdebSMatt Macy /*
11617877fdebSMatt Macy * Have to lock the whole block to ensure that no one can
11627877fdebSMatt Macy * change the data while it's written out and its checksum
11637877fdebSMatt Macy * is being calculated. We need to re-check the block size
11647877fdebSMatt Macy * after we get the lock, in case it has changed!
11657877fdebSMatt Macy */
11667877fdebSMatt Macy for (;;) {
11677877fdebSMatt Macy uint64_t blkoff;
11687877fdebSMatt Macy size = zp->z_blksz;
11697877fdebSMatt Macy blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
11707877fdebSMatt Macy offset -= blkoff;
11717877fdebSMatt Macy zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
11727877fdebSMatt Macy offset, size, RL_READER);
11737877fdebSMatt Macy if (zp->z_blksz == size)
11747877fdebSMatt Macy break;
11757877fdebSMatt Macy offset += blkoff;
11767877fdebSMatt Macy zfs_rangelock_exit(zgd->zgd_lr);
11777877fdebSMatt Macy }
11787877fdebSMatt Macy /* test for truncation needs to be done while range locked */
11797877fdebSMatt Macy if (lr->lr_offset >= zp->z_size)
11807877fdebSMatt Macy error = SET_ERROR(ENOENT);
11817877fdebSMatt Macy #ifdef ZFS_DEBUG
11827877fdebSMatt Macy if (zil_fault_io) {
11837877fdebSMatt Macy error = SET_ERROR(EIO);
11847877fdebSMatt Macy zil_fault_io = 0;
11857877fdebSMatt Macy }
11867877fdebSMatt Macy #endif
11877a7741afSMartin Matuska 
11887a7741afSMartin Matuska dmu_buf_t *dbp;
11897877fdebSMatt Macy if (error == 0)
1190315ee00fSMartin Matuska error = dmu_buf_hold_noread(os, object, offset, zgd,
11917a7741afSMartin Matuska &dbp);
11927877fdebSMatt Macy 
11937877fdebSMatt Macy if (error == 0) {
11947a7741afSMartin Matuska zgd->zgd_db = dbp;
11957a7741afSMartin Matuska dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
11967a7741afSMartin Matuska boolean_t direct_write = B_FALSE;
11977a7741afSMartin Matuska mutex_enter(&db->db_mtx);
11987a7741afSMartin Matuska dbuf_dirty_record_t *dr =
11997a7741afSMartin Matuska dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
12007a7741afSMartin Matuska if (dr != NULL && dr->dt.dl.dr_diowrite)
12017a7741afSMartin Matuska direct_write = B_TRUE;
12027a7741afSMartin Matuska mutex_exit(&db->db_mtx);
12037877fdebSMatt Macy 
12047a7741afSMartin Matuska /*
12057a7741afSMartin Matuska * All Direct I/O writes will have already completed and
12067a7741afSMartin Matuska * the block pointer can be immediately stored in the
12077a7741afSMartin Matuska * log record.
12087a7741afSMartin Matuska */
12097a7741afSMartin Matuska if (direct_write) {
12107a7741afSMartin Matuska /*
12117a7741afSMartin Matuska * A Direct I/O write always covers an entire
12127a7741afSMartin Matuska * block.
12137a7741afSMartin Matuska */ 12147a7741afSMartin Matuska ASSERT3U(dbp->db_size, ==, zp->z_blksz); 12157a7741afSMartin Matuska lr->lr_blkptr = dr->dt.dl.dr_overridden_by; 12167a7741afSMartin Matuska zfs_get_done(zgd, 0); 12177a7741afSMartin Matuska return (0); 12187a7741afSMartin Matuska } 12197a7741afSMartin Matuska 12207a7741afSMartin Matuska blkptr_t *bp = &lr->lr_blkptr; 12217877fdebSMatt Macy zgd->zgd_bp = bp; 12227877fdebSMatt Macy 12237a7741afSMartin Matuska ASSERT3U(dbp->db_offset, ==, offset); 12247a7741afSMartin Matuska ASSERT3U(dbp->db_size, ==, size); 12257877fdebSMatt Macy 12267877fdebSMatt Macy error = dmu_sync(zio, lr->lr_common.lrc_txg, 12277877fdebSMatt Macy zfs_get_done, zgd); 12287877fdebSMatt Macy ASSERT(error || lr->lr_length <= size); 12297877fdebSMatt Macy 12307877fdebSMatt Macy /* 12317877fdebSMatt Macy * On success, we need to wait for the write I/O 12327877fdebSMatt Macy * initiated by dmu_sync() to complete before we can 12337877fdebSMatt Macy * release this dbuf. We will finish everything up 12347877fdebSMatt Macy * in the zfs_get_done() callback. 12357877fdebSMatt Macy */ 12367877fdebSMatt Macy if (error == 0) 12377877fdebSMatt Macy return (0); 12387877fdebSMatt Macy 12397877fdebSMatt Macy if (error == EALREADY) { 12407877fdebSMatt Macy lr->lr_common.lrc_txtype = TX_WRITE2; 12417877fdebSMatt Macy /* 12427877fdebSMatt Macy * TX_WRITE2 relies on the data previously 12437877fdebSMatt Macy * written by the TX_WRITE that caused 12447877fdebSMatt Macy * EALREADY. We zero out the BP because 12457877fdebSMatt Macy * it is the old, currently-on-disk BP. 12467877fdebSMatt Macy */ 12477877fdebSMatt Macy zgd->zgd_bp = NULL; 12487877fdebSMatt Macy BP_ZERO(bp); 12497877fdebSMatt Macy error = 0; 12507877fdebSMatt Macy } 12517877fdebSMatt Macy } 12527877fdebSMatt Macy } 12537877fdebSMatt Macy 12547877fdebSMatt Macy zfs_get_done(zgd, error); 12557877fdebSMatt Macy 12567877fdebSMatt Macy return (error); 12577877fdebSMatt Macy } 12587877fdebSMatt Macy 12597877fdebSMatt Macy static void 12607877fdebSMatt Macy zfs_get_done(zgd_t *zgd, int error) 12617877fdebSMatt Macy { 1262c03c5b1cSMartin Matuska (void) error; 12637877fdebSMatt Macy znode_t *zp = zgd->zgd_private; 12647877fdebSMatt Macy 12657877fdebSMatt Macy if (zgd->zgd_db) 12667877fdebSMatt Macy dmu_buf_rele(zgd->zgd_db, zgd); 12677877fdebSMatt Macy 12687877fdebSMatt Macy zfs_rangelock_exit(zgd->zgd_lr); 12697877fdebSMatt Macy 12707877fdebSMatt Macy /* 12717877fdebSMatt Macy * Release the vnode asynchronously as we currently have the 12727877fdebSMatt Macy * txg stopped from syncing. 12737877fdebSMatt Macy */ 12747877fdebSMatt Macy zfs_zrele_async(zp); 12757877fdebSMatt Macy 12767877fdebSMatt Macy kmem_free(zgd, sizeof (zgd_t)); 12777877fdebSMatt Macy } 12787877fdebSMatt Macy 12792a58b312SMartin Matuska static int 12802a58b312SMartin Matuska zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) 12812a58b312SMartin Matuska { 12822a58b312SMartin Matuska int error; 12832a58b312SMartin Matuska 12842a58b312SMartin Matuska /* Swap. Not sure if the order of zfs_enter()s is important. 
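 * The swap at least makes the lock acquisition order deterministic
 * (ordered by zfsvfs pointer address), which is the standard way to
 * avoid ABBA deadlocks when two locks must be taken together.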
*/
12852a58b312SMartin Matuska if (zfsvfs1 > zfsvfs2) {
12862a58b312SMartin Matuska zfsvfs_t *tmpzfsvfs;
12872a58b312SMartin Matuska 
12882a58b312SMartin Matuska tmpzfsvfs = zfsvfs2;
12892a58b312SMartin Matuska zfsvfs2 = zfsvfs1;
12902a58b312SMartin Matuska zfsvfs1 = tmpzfsvfs;
12912a58b312SMartin Matuska }
12922a58b312SMartin Matuska 
12932a58b312SMartin Matuska error = zfs_enter(zfsvfs1, tag);
12942a58b312SMartin Matuska if (error != 0)
12952a58b312SMartin Matuska return (error);
12962a58b312SMartin Matuska if (zfsvfs1 != zfsvfs2) {
12972a58b312SMartin Matuska error = zfs_enter(zfsvfs2, tag);
12982a58b312SMartin Matuska if (error != 0) {
12992a58b312SMartin Matuska zfs_exit(zfsvfs1, tag);
13002a58b312SMartin Matuska return (error);
13012a58b312SMartin Matuska }
13022a58b312SMartin Matuska }
13032a58b312SMartin Matuska 
13042a58b312SMartin Matuska return (0);
13052a58b312SMartin Matuska }
13062a58b312SMartin Matuska 
13072a58b312SMartin Matuska static void
13082a58b312SMartin Matuska zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
13092a58b312SMartin Matuska {
13102a58b312SMartin Matuska 
13112a58b312SMartin Matuska zfs_exit(zfsvfs1, tag);
13122a58b312SMartin Matuska if (zfsvfs1 != zfsvfs2)
13132a58b312SMartin Matuska zfs_exit(zfsvfs2, tag);
13142a58b312SMartin Matuska }
13152a58b312SMartin Matuska 
13162a58b312SMartin Matuska /*
13172a58b312SMartin Matuska * We split each clone request into chunks that can fit into a single ZIL
13182a58b312SMartin Matuska * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
13192a58b312SMartin Matuska * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
13202a58b312SMartin Matuska * us room for storing 1022 block pointers.
13212a58b312SMartin Matuska *
13222a58b312SMartin Matuska * On success, the function returns the number of bytes copied in *lenp.
13232a58b312SMartin Matuska * Note that it doesn't return how many bytes are left to be copied.
1324315ee00fSMartin Matuska * On errors caused by file system or BRT limitations, `EINVAL` is
1325315ee00fSMartin Matuska * returned. In most cases this means the caller requested bad
1326315ee00fSMartin Matuska * parameters: it might be possible to clone the file, but some
1327315ee00fSMartin Matuska * parameters don't match the requirements.
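 *
 * For illustration, assuming sizeof (blkptr_t) == 128: 130816 / 128 =
 * 1022 block pointers per chunk. With the default 128 KiB recordsize,
 * a single chunk (and thus a single transaction in the loop below) can
 * clone up to 1022 * 128 KiB, a bit under 128 MiB.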
13282a58b312SMartin Matuska */
13292a58b312SMartin Matuska int
13302a58b312SMartin Matuska zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
13312a58b312SMartin Matuska uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
13322a58b312SMartin Matuska {
13332a58b312SMartin Matuska zfsvfs_t *inzfsvfs, *outzfsvfs;
13342a58b312SMartin Matuska objset_t *inos, *outos;
13352a58b312SMartin Matuska zfs_locked_range_t *inlr, *outlr;
13362a58b312SMartin Matuska dmu_buf_impl_t *db;
13372a58b312SMartin Matuska dmu_tx_t *tx;
13382a58b312SMartin Matuska zilog_t *zilog;
13392a58b312SMartin Matuska uint64_t inoff, outoff, len, done;
13402a58b312SMartin Matuska uint64_t outsize, size;
13412a58b312SMartin Matuska int error;
13422a58b312SMartin Matuska int count = 0;
13432a58b312SMartin Matuska sa_bulk_attr_t bulk[3];
13442a58b312SMartin Matuska uint64_t mtime[2], ctime[2];
13452a58b312SMartin Matuska uint64_t uid, gid, projid;
13462a58b312SMartin Matuska blkptr_t *bps;
13472a58b312SMartin Matuska size_t maxblocks, nbps;
13482a58b312SMartin Matuska uint_t inblksz;
13492a58b312SMartin Matuska uint64_t clear_setid_bits_txg = 0;
1350a4e5e010SMartin Matuska uint64_t last_synced_txg = 0;
13512a58b312SMartin Matuska 
13522a58b312SMartin Matuska inoff = *inoffp;
13532a58b312SMartin Matuska outoff = *outoffp;
13542a58b312SMartin Matuska len = *lenp;
13552a58b312SMartin Matuska done = 0;
13562a58b312SMartin Matuska 
13572a58b312SMartin Matuska inzfsvfs = ZTOZSB(inzp);
13582a58b312SMartin Matuska outzfsvfs = ZTOZSB(outzp);
1359e639e0d2SMartin Matuska 
1360e639e0d2SMartin Matuska /*
1361e639e0d2SMartin Matuska * We need to call zfs_enter() potentially on two different datasets,
1362e639e0d2SMartin Matuska * so we need a dedicated function for that.
1363e639e0d2SMartin Matuska */
1364e639e0d2SMartin Matuska error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
1365e639e0d2SMartin Matuska if (error != 0)
1366e639e0d2SMartin Matuska return (error);
1367e639e0d2SMartin Matuska 
13682a58b312SMartin Matuska inos = inzfsvfs->z_os;
13692a58b312SMartin Matuska outos = outzfsvfs->z_os;
13702a58b312SMartin Matuska 
13712a58b312SMartin Matuska /*
13722a58b312SMartin Matuska * Both source and destination have to belong to the same storage pool.
13732a58b312SMartin Matuska */
13742a58b312SMartin Matuska if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
13752a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
13762a58b312SMartin Matuska return (SET_ERROR(EXDEV));
13772a58b312SMartin Matuska }
13782a58b312SMartin Matuska 
1379315ee00fSMartin Matuska /*
1380315ee00fSMartin Matuska * outos and inos belong to the same storage pool (see the check just
1381315ee00fSMartin Matuska * above), so testing the feature on one of them is enough.
1382315ee00fSMartin Matuska */
1383315ee00fSMartin Matuska if (!spa_feature_is_enabled(dmu_objset_spa(outos),
1384315ee00fSMartin Matuska SPA_FEATURE_BLOCK_CLONING)) {
1385315ee00fSMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1386315ee00fSMartin Matuska return (SET_ERROR(EOPNOTSUPP));
1387315ee00fSMartin Matuska }
1388315ee00fSMartin Matuska 
13892a58b312SMartin Matuska ASSERT(!outzfsvfs->z_replay);
13902a58b312SMartin Matuska 
1391f8b1db88SMartin Matuska /*
1392f8b1db88SMartin Matuska * Block cloning from an unencrypted dataset into an encrypted
1393f8b1db88SMartin Matuska * dataset and vice versa is not supported.
1394f8b1db88SMartin Matuska */
1395f8b1db88SMartin Matuska if (inos->os_encrypted != outos->os_encrypted) {
1396f8b1db88SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1397f8b1db88SMartin Matuska return (SET_ERROR(EXDEV));
1398f8b1db88SMartin Matuska }
1399f8b1db88SMartin Matuska 
14003494f7c0SMartin Matuska /*
14013494f7c0SMartin Matuska * Cloning across encrypted datasets is possible only if they
14023494f7c0SMartin Matuska * share the same master key.
14033494f7c0SMartin Matuska */
14043494f7c0SMartin Matuska if (inos != outos && inos->os_encrypted &&
14053494f7c0SMartin Matuska !dmu_objset_crypto_key_equal(inos, outos)) {
14063494f7c0SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14073494f7c0SMartin Matuska return (SET_ERROR(EXDEV));
14083494f7c0SMartin Matuska }
14093494f7c0SMartin Matuska 
14102a58b312SMartin Matuska error = zfs_verify_zp(inzp);
14112a58b312SMartin Matuska if (error == 0)
14122a58b312SMartin Matuska error = zfs_verify_zp(outzp);
14132a58b312SMartin Matuska if (error != 0) {
14142a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14152a58b312SMartin Matuska return (error);
14162a58b312SMartin Matuska }
14172a58b312SMartin Matuska 
14182a58b312SMartin Matuska /*
14192a58b312SMartin Matuska * We don't copy the source file's flags, which is why we don't allow
14202a58b312SMartin Matuska * cloning files that are in quarantine.
14212a58b312SMartin Matuska */
14222a58b312SMartin Matuska if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
14232a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14242a58b312SMartin Matuska return (SET_ERROR(EACCES));
14252a58b312SMartin Matuska }
14262a58b312SMartin Matuska 
14272a58b312SMartin Matuska if (inoff >= inzp->z_size) {
14282a58b312SMartin Matuska *lenp = 0;
14292a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14302a58b312SMartin Matuska return (0);
14312a58b312SMartin Matuska }
14322a58b312SMartin Matuska if (len > inzp->z_size - inoff) {
14332a58b312SMartin Matuska len = inzp->z_size - inoff;
14342a58b312SMartin Matuska }
14352a58b312SMartin Matuska if (len == 0) {
14362a58b312SMartin Matuska *lenp = 0;
14372a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14382a58b312SMartin Matuska return (0);
14392a58b312SMartin Matuska }
14402a58b312SMartin Matuska 
14412a58b312SMartin Matuska /*
14422a58b312SMartin Matuska * Callers might not be able to detect properly that we are read-only,
14432a58b312SMartin Matuska * so check it explicitly here.
14442a58b312SMartin Matuska */
14452a58b312SMartin Matuska if (zfs_is_readonly(outzfsvfs)) {
14462a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14472a58b312SMartin Matuska return (SET_ERROR(EROFS));
14482a58b312SMartin Matuska }
14492a58b312SMartin Matuska 
14502a58b312SMartin Matuska /*
14512a58b312SMartin Matuska * If immutable or not appending, then return EPERM.
14522a58b312SMartin Matuska * Intentionally allow ZFS_READONLY through here.
14532a58b312SMartin Matuska * See zfs_zaccess_common()
14542a58b312SMartin Matuska */
14552a58b312SMartin Matuska if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
14562a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14572a58b312SMartin Matuska return (SET_ERROR(EPERM));
14582a58b312SMartin Matuska }
14592a58b312SMartin Matuska 
14602a58b312SMartin Matuska /*
14612a58b312SMartin Matuska * The ranges must not overlap if we are cloning within the same file.
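 * For example (illustrative values): with inzp == outzp, inoff = 0,
 * outoff = 64 KiB and len = 128 KiB, the ranges [0, 128 KiB) and
 * [64 KiB, 192 KiB) intersect, so the request is rejected with EINVAL.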
14622a58b312SMartin Matuska */
14632a58b312SMartin Matuska if (inzp == outzp) {
14642a58b312SMartin Matuska if (inoff < outoff + len && outoff < inoff + len) {
14652a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
14662a58b312SMartin Matuska return (SET_ERROR(EINVAL));
14672a58b312SMartin Matuska }
14682a58b312SMartin Matuska }
14692a58b312SMartin Matuska 
1470783d3ff6SMartin Matuska /* Flush any mmap()'d data to disk */
1471783d3ff6SMartin Matuska if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
1472783d3ff6SMartin Matuska zn_flush_cached_data(inzp, B_TRUE);
1473783d3ff6SMartin Matuska 
14742a58b312SMartin Matuska /*
14752a58b312SMartin Matuska * Maintain predictable lock order.
14762a58b312SMartin Matuska */
14772a58b312SMartin Matuska if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
14782a58b312SMartin Matuska inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
14792a58b312SMartin Matuska RL_READER);
14802a58b312SMartin Matuska outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
14812a58b312SMartin Matuska RL_WRITER);
14822a58b312SMartin Matuska } else {
14832a58b312SMartin Matuska outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
14842a58b312SMartin Matuska RL_WRITER);
14852a58b312SMartin Matuska inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
14862a58b312SMartin Matuska RL_READER);
14872a58b312SMartin Matuska }
14882a58b312SMartin Matuska 
14892a58b312SMartin Matuska inblksz = inzp->z_blksz;
14902a58b312SMartin Matuska 
14912a58b312SMartin Matuska /*
1492f552d7adSMartin Matuska * We cannot clone into a file with a different block size if we can't
1493f552d7adSMartin Matuska * grow it (block size is already bigger, has more than one block, or
1494f552d7adSMartin Matuska * not locked for growth). There are other possible reasons for the
1495f552d7adSMartin Matuska * grow to fail, but we cover what we can before opening the
1496f552d7adSMartin Matuska * transaction and detect the rest after we try to do it.
14972a58b312SMartin Matuska */
1498f552d7adSMartin Matuska if (inblksz < outzp->z_blksz) {
1499f552d7adSMartin Matuska error = SET_ERROR(EINVAL);
1500f552d7adSMartin Matuska goto unlock;
1501f552d7adSMartin Matuska }
15023159b89bSMartin Matuska if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
1503f552d7adSMartin Matuska outlr->lr_length != UINT64_MAX)) {
15043159b89bSMartin Matuska error = SET_ERROR(EINVAL);
15053159b89bSMartin Matuska goto unlock;
15063159b89bSMartin Matuska }
15073159b89bSMartin Matuska 
15083159b89bSMartin Matuska /*
15093159b89bSMartin Matuska * Block size must be a power of 2 if the destination offset != 0.
15103159b89bSMartin Matuska * A file cannot consist of multiple blocks of non-power-of-2 size.
15113159b89bSMartin Matuska */
15123159b89bSMartin Matuska if (outoff != 0 && !ISP2(inblksz)) {
1513315ee00fSMartin Matuska error = SET_ERROR(EINVAL);
15142a58b312SMartin Matuska goto unlock;
15152a58b312SMartin Matuska }
15162a58b312SMartin Matuska 
15172a58b312SMartin Matuska /*
15182a58b312SMartin Matuska * Offsets and len must be at block boundaries.
15192a58b312SMartin Matuska */
15202a58b312SMartin Matuska if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
1521315ee00fSMartin Matuska error = SET_ERROR(EINVAL);
15222a58b312SMartin Matuska goto unlock;
15232a58b312SMartin Matuska }
15242a58b312SMartin Matuska /*
15252a58b312SMartin Matuska * Length must be a multiple of blksz, except at the end of the file.
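 * For example (illustrative, assuming a 128 KiB block size): cloning a
 * 300 KiB source file in full (two full 128 KiB blocks plus a 44 KiB
 * tail) is allowed when the tail also lands at or beyond the
 * destination's EOF, while cloning just a 44 KiB range out of the
 * middle of the file is rejected.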
15262a58b312SMartin Matuska */
15272a58b312SMartin Matuska if ((len % inblksz) != 0 &&
15282a58b312SMartin Matuska (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
1529315ee00fSMartin Matuska error = SET_ERROR(EINVAL);
15302a58b312SMartin Matuska goto unlock;
15312a58b312SMartin Matuska }
15322a58b312SMartin Matuska 
1533f7a5903dSMartin Matuska /*
1534f7a5903dSMartin Matuska * If we are copying only one block and it is smaller than the
1535f7a5903dSMartin Matuska * recordsize property, do not allow the destination to grow beyond
1536f7a5903dSMartin Matuska * one block if it is not there yet. Otherwise the destination will
1537f7a5903dSMartin Matuska * get stuck with that block size forever, which can be as small as
1538f7a5903dSMartin Matuska * 512 bytes, no matter how big the destination grows later.
1539f7a5903dSMartin Matuska */
1540f7a5903dSMartin Matuska if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
1541f7a5903dSMartin Matuska outzp->z_size <= inblksz && outoff + len > inblksz) {
1542f7a5903dSMartin Matuska error = SET_ERROR(EINVAL);
1543f7a5903dSMartin Matuska goto unlock;
1544f7a5903dSMartin Matuska }
1545f7a5903dSMartin Matuska 
15462a58b312SMartin Matuska error = zn_rlimit_fsize(outoff + len);
15472a58b312SMartin Matuska if (error != 0) {
15482a58b312SMartin Matuska goto unlock;
15492a58b312SMartin Matuska }
15502a58b312SMartin Matuska 
15512a58b312SMartin Matuska if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
15522a58b312SMartin Matuska error = SET_ERROR(EFBIG);
15532a58b312SMartin Matuska goto unlock;
15542a58b312SMartin Matuska }
15552a58b312SMartin Matuska 
15562a58b312SMartin Matuska SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
15572a58b312SMartin Matuska &mtime, 16);
15582a58b312SMartin Matuska SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
15592a58b312SMartin Matuska &ctime, 16);
15602a58b312SMartin Matuska SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
15612a58b312SMartin Matuska &outzp->z_size, 8);
15622a58b312SMartin Matuska 
15632a58b312SMartin Matuska zilog = outzfsvfs->z_log;
15642a58b312SMartin Matuska maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
15652a58b312SMartin Matuska sizeof (bps[0]);
15662a58b312SMartin Matuska 
15672a58b312SMartin Matuska uid = KUID_TO_SUID(ZTOUID(outzp));
15682a58b312SMartin Matuska gid = KGID_TO_SGID(ZTOGID(outzp));
15692a58b312SMartin Matuska projid = outzp->z_projid;
15702a58b312SMartin Matuska 
1571315ee00fSMartin Matuska bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
15722a58b312SMartin Matuska 
15732a58b312SMartin Matuska /*
15742a58b312SMartin Matuska * Clone the file in reasonably sized chunks. Each chunk is cloned
15752a58b312SMartin Matuska * in a separate transaction; this keeps the intent log records small
15762a58b312SMartin Matuska * and allows us to do more fine-grained space accounting.
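 *
 * A typical way to reach this code from userland is copy_file_range(2),
 * which FreeBSD and Linux may service via block cloning when both
 * files live on the same ZFS pool, e.g. (sketch, error handling
 * omitted):
 *
 *	ssize_t n = copy_file_range(infd, NULL, outfd, NULL, len, 0);
 *
 * A short count from that call simply means the caller should loop,
 * which mirrors the chunked loop below.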
15772a58b312SMartin Matuska */
15782a58b312SMartin Matuska while (len > 0) {
15792a58b312SMartin Matuska size = MIN(inblksz * maxblocks, len);
15802a58b312SMartin Matuska 
15812a58b312SMartin Matuska if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
15822a58b312SMartin Matuska uid) ||
15832a58b312SMartin Matuska zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
15842a58b312SMartin Matuska gid) ||
15852a58b312SMartin Matuska (projid != ZFS_DEFAULT_PROJID &&
15862a58b312SMartin Matuska zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
15872a58b312SMartin Matuska projid))) {
15882a58b312SMartin Matuska error = SET_ERROR(EDQUOT);
15892a58b312SMartin Matuska break;
15902a58b312SMartin Matuska }
15912a58b312SMartin Matuska 
15922a58b312SMartin Matuska nbps = maxblocks;
1593a4e5e010SMartin Matuska last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
1594e639e0d2SMartin Matuska error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
15952a58b312SMartin Matuska &nbps);
15962a58b312SMartin Matuska if (error != 0) {
15972a58b312SMartin Matuska /*
1598315ee00fSMartin Matuska * If we are trying to clone a block that was created
1599a4e5e010SMartin Matuska * in the current transaction group, the error will be
1600a4e5e010SMartin Matuska * EAGAIN here. Depending on zfs_bclone_wait_dirty, either
1601a4e5e010SMartin Matuska * return a shortened range to the caller so it can
1602a4e5e010SMartin Matuska * fall back, or wait for the next TXG and check again.
16032a58b312SMartin Matuska */
1604a4e5e010SMartin Matuska if (error == EAGAIN && zfs_bclone_wait_dirty) {
1605a4e5e010SMartin Matuska txg_wait_synced(dmu_objset_pool(inos),
1606a4e5e010SMartin Matuska last_synced_txg + 1);
1607a4e5e010SMartin Matuska continue;
1608a4e5e010SMartin Matuska }
1609a4e5e010SMartin Matuska 
16102a58b312SMartin Matuska break;
16112a58b312SMartin Matuska }
16122a58b312SMartin Matuska 
1613e639e0d2SMartin Matuska /*
1614e639e0d2SMartin Matuska * Start a transaction.
1615e639e0d2SMartin Matuska */
1616e639e0d2SMartin Matuska tx = dmu_tx_create(outos);
16172a58b312SMartin Matuska dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
16182a58b312SMartin Matuska db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
16192a58b312SMartin Matuska DB_DNODE_ENTER(db);
16202a58b312SMartin Matuska dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
16212a58b312SMartin Matuska DB_DNODE_EXIT(db);
16222a58b312SMartin Matuska zfs_sa_upgrade_txholds(tx, outzp);
16232a58b312SMartin Matuska error = dmu_tx_assign(tx, TXG_WAIT);
16242a58b312SMartin Matuska if (error != 0) {
16252a58b312SMartin Matuska dmu_tx_abort(tx);
16262a58b312SMartin Matuska break;
16272a58b312SMartin Matuska }
16282a58b312SMartin Matuska 
16292a58b312SMartin Matuska /*
1630f552d7adSMartin Matuska * Copy the source znode's block size. This is done only if the
1631f552d7adSMartin Matuska * whole znode is locked (see zfs_rangelock_cb()) and only
1632f552d7adSMartin Matuska * on the first iteration since zfs_rangelock_reduce() will
1633f552d7adSMartin Matuska * shrink down lr_length to the appropriate size.
16342a58b312SMartin Matuska */
16352a58b312SMartin Matuska if (outlr->lr_length == UINT64_MAX) {
16362a58b312SMartin Matuska zfs_grow_blocksize(outzp, inblksz, tx);
1637f552d7adSMartin Matuska 
1638f552d7adSMartin Matuska /*
1639f552d7adSMartin Matuska * Block growth may fail for many reasons we cannot
1640f552d7adSMartin Matuska * predict here. If it happens, the cloning is doomed.
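 * (For instance, the DMU may refuse the new block size if the
 * dnode is not in a state that allows changing it;
 * zfs_grow_blocksize() itself reports no error, which is why
 * the resulting block size is re-checked below.)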
1641f552d7adSMartin Matuska */ 1642f552d7adSMartin Matuska if (inblksz != outzp->z_blksz) { 1643f552d7adSMartin Matuska error = SET_ERROR(EINVAL); 1644f552d7adSMartin Matuska dmu_tx_abort(tx); 1645f552d7adSMartin Matuska break; 1646f552d7adSMartin Matuska } 1647f552d7adSMartin Matuska 16482a58b312SMartin Matuska /* 16492a58b312SMartin Matuska * Round range lock up to the block boundary, so we 16502a58b312SMartin Matuska * prevent appends until we are done. 16512a58b312SMartin Matuska */ 16522a58b312SMartin Matuska zfs_rangelock_reduce(outlr, outoff, 16532a58b312SMartin Matuska ((len - 1) / inblksz + 1) * inblksz); 16542a58b312SMartin Matuska } 16552a58b312SMartin Matuska 1656e639e0d2SMartin Matuska error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, 1657525fe93dSMartin Matuska bps, nbps); 1658e639e0d2SMartin Matuska if (error != 0) { 1659e639e0d2SMartin Matuska dmu_tx_commit(tx); 1660e639e0d2SMartin Matuska break; 1661e639e0d2SMartin Matuska } 16622a58b312SMartin Matuska 166378ae60b4SMartin Matuska if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { 166478ae60b4SMartin Matuska update_pages(outzp, outoff, size, outos); 166578ae60b4SMartin Matuska } 166678ae60b4SMartin Matuska 16672a58b312SMartin Matuska zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, 16682a58b312SMartin Matuska &clear_setid_bits_txg, tx); 16692a58b312SMartin Matuska 16702a58b312SMartin Matuska zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); 16712a58b312SMartin Matuska 16722a58b312SMartin Matuska /* 16732a58b312SMartin Matuska * Update the file size (zp_size) if it has changed; 16742a58b312SMartin Matuska * account for possible concurrent updates. 16752a58b312SMartin Matuska */ 16762a58b312SMartin Matuska while ((outsize = outzp->z_size) < outoff + size) { 16772a58b312SMartin Matuska (void) atomic_cas_64(&outzp->z_size, outsize, 16782a58b312SMartin Matuska outoff + size); 16792a58b312SMartin Matuska } 16802a58b312SMartin Matuska 16812a58b312SMartin Matuska error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); 16822a58b312SMartin Matuska 16832a58b312SMartin Matuska zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, 16842a58b312SMartin Matuska size, inblksz, bps, nbps); 16852a58b312SMartin Matuska 16862a58b312SMartin Matuska dmu_tx_commit(tx); 16872a58b312SMartin Matuska 16882a58b312SMartin Matuska if (error != 0) 16892a58b312SMartin Matuska break; 16902a58b312SMartin Matuska 16912a58b312SMartin Matuska inoff += size; 16922a58b312SMartin Matuska outoff += size; 16932a58b312SMartin Matuska len -= size; 16942a58b312SMartin Matuska done += size; 1695aca928a5SMartin Matuska 1696aca928a5SMartin Matuska if (issig()) { 1697aca928a5SMartin Matuska error = SET_ERROR(EINTR); 1698aca928a5SMartin Matuska break; 1699aca928a5SMartin Matuska } 17002a58b312SMartin Matuska } 17012a58b312SMartin Matuska 1702315ee00fSMartin Matuska vmem_free(bps, sizeof (bps[0]) * maxblocks); 17032a58b312SMartin Matuska zfs_znode_update_vfs(outzp); 17042a58b312SMartin Matuska 17052a58b312SMartin Matuska unlock: 17062a58b312SMartin Matuska zfs_rangelock_exit(outlr); 17072a58b312SMartin Matuska zfs_rangelock_exit(inlr); 17082a58b312SMartin Matuska 17092a58b312SMartin Matuska if (done > 0) { 17102a58b312SMartin Matuska /* 17112a58b312SMartin Matuska * If we have made at least partial progress, reset the error. 
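 * For example (illustrative): if EDQUOT is hit after three of five
 * chunks were cloned, we still return success, with *inoffp and
 * *outoffp advanced and *lenp reduced to the bytes actually cloned;
 * the caller is expected to retry or fall back for the remainder.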
17122a58b312SMartin Matuska */
17132a58b312SMartin Matuska error = 0;
17142a58b312SMartin Matuska 
17152a58b312SMartin Matuska ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
17162a58b312SMartin Matuska 
17172a58b312SMartin Matuska if (outos->os_sync == ZFS_SYNC_ALWAYS) {
17182a58b312SMartin Matuska zil_commit(zilog, outzp->z_id);
17192a58b312SMartin Matuska }
17202a58b312SMartin Matuska 
17212a58b312SMartin Matuska *inoffp += done;
17222a58b312SMartin Matuska *outoffp += done;
17232a58b312SMartin Matuska *lenp = done;
17243159b89bSMartin Matuska } else {
17253159b89bSMartin Matuska /*
17263159b89bSMartin Matuska * If we made no progress, there must be a good reason.
17273159b89bSMartin Matuska * EOF is handled explicitly above, before the loop.
17283159b89bSMartin Matuska */
17293159b89bSMartin Matuska ASSERT3S(error, !=, 0);
17302a58b312SMartin Matuska }
17312a58b312SMartin Matuska 
17322a58b312SMartin Matuska zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
17332a58b312SMartin Matuska 
17342a58b312SMartin Matuska return (error);
17352a58b312SMartin Matuska }
17362a58b312SMartin Matuska 
17372a58b312SMartin Matuska /*
17382a58b312SMartin Matuska * The usual pattern would be to call zfs_clone_range() from
17392a58b312SMartin Matuska * zfs_replay_clone(), but we cannot do that, because when replaying we
17402a58b312SMartin Matuska * don't have the source znode available; hence a dedicated replay function.
17412a58b312SMartin Matuska */
17422a58b312SMartin Matuska int
17432a58b312SMartin Matuska zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
17442a58b312SMartin Matuska const blkptr_t *bps, size_t nbps)
17452a58b312SMartin Matuska {
17462a58b312SMartin Matuska zfsvfs_t *zfsvfs;
17472a58b312SMartin Matuska dmu_buf_impl_t *db;
17482a58b312SMartin Matuska dmu_tx_t *tx;
17492a58b312SMartin Matuska int error;
17502a58b312SMartin Matuska int count = 0;
17512a58b312SMartin Matuska sa_bulk_attr_t bulk[3];
17522a58b312SMartin Matuska uint64_t mtime[2], ctime[2];
17532a58b312SMartin Matuska 
17542a58b312SMartin Matuska ASSERT3U(off, <, MAXOFFSET_T);
17552a58b312SMartin Matuska ASSERT3U(len, >, 0);
17562a58b312SMartin Matuska ASSERT3U(nbps, >, 0);
17572a58b312SMartin Matuska 
17582a58b312SMartin Matuska zfsvfs = ZTOZSB(zp);
17592a58b312SMartin Matuska 
17602a58b312SMartin Matuska ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
17612a58b312SMartin Matuska SPA_FEATURE_BLOCK_CLONING));
17622a58b312SMartin Matuska 
17632a58b312SMartin Matuska if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
17642a58b312SMartin Matuska return (error);
17652a58b312SMartin Matuska 
17662a58b312SMartin Matuska ASSERT(zfsvfs->z_replay);
17672a58b312SMartin Matuska ASSERT(!zfs_is_readonly(zfsvfs));
17682a58b312SMartin Matuska 
17692a58b312SMartin Matuska if ((off % blksz) != 0) {
17702a58b312SMartin Matuska zfs_exit(zfsvfs, FTAG);
17712a58b312SMartin Matuska return (SET_ERROR(EINVAL));
17722a58b312SMartin Matuska }
17732a58b312SMartin Matuska 
17742a58b312SMartin Matuska SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
17752a58b312SMartin Matuska SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
17762a58b312SMartin Matuska SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
17772a58b312SMartin Matuska &zp->z_size, 8);
17782a58b312SMartin Matuska 
17792a58b312SMartin Matuska /*
17802a58b312SMartin Matuska * Start a transaction.
17812a58b312SMartin Matuska */
17822a58b312SMartin Matuska tx = dmu_tx_create(zfsvfs->z_os);
17832a58b312SMartin Matuska 
17842a58b312SMartin Matuska dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
17852a58b312SMartin Matuska db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
17862a58b312SMartin Matuska DB_DNODE_ENTER(db);
17872a58b312SMartin Matuska dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
17882a58b312SMartin Matuska DB_DNODE_EXIT(db);
17892a58b312SMartin Matuska zfs_sa_upgrade_txholds(tx, zp);
17902a58b312SMartin Matuska error = dmu_tx_assign(tx, TXG_WAIT);
17912a58b312SMartin Matuska if (error != 0) {
17922a58b312SMartin Matuska dmu_tx_abort(tx);
17932a58b312SMartin Matuska zfs_exit(zfsvfs, FTAG);
17942a58b312SMartin Matuska return (error);
17952a58b312SMartin Matuska }
17962a58b312SMartin Matuska 
17972a58b312SMartin Matuska if (zp->z_blksz < blksz)
17982a58b312SMartin Matuska zfs_grow_blocksize(zp, blksz, tx);
17992a58b312SMartin Matuska 
1800525fe93dSMartin Matuska dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
18012a58b312SMartin Matuska 
18022a58b312SMartin Matuska zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
18032a58b312SMartin Matuska 
18042a58b312SMartin Matuska if (zp->z_size < off + len)
18052a58b312SMartin Matuska zp->z_size = off + len;
18062a58b312SMartin Matuska 
18072a58b312SMartin Matuska error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
18082a58b312SMartin Matuska 
18092a58b312SMartin Matuska /*
18102a58b312SMartin Matuska * zil_replaying() not only checks if we are replaying the ZIL, but also
18112a58b312SMartin Matuska * updates the ZIL header to record replay progress.
18122a58b312SMartin Matuska */
18132a58b312SMartin Matuska VERIFY(zil_replaying(zfsvfs->z_log, tx));
18142a58b312SMartin Matuska 
18152a58b312SMartin Matuska dmu_tx_commit(tx);
18162a58b312SMartin Matuska 
18172a58b312SMartin Matuska zfs_znode_update_vfs(zp);
18182a58b312SMartin Matuska 
18192a58b312SMartin Matuska zfs_exit(zfsvfs, FTAG);
18202a58b312SMartin Matuska 
18212a58b312SMartin Matuska return (error);
18222a58b312SMartin Matuska }
18232a58b312SMartin Matuska 
18247877fdebSMatt Macy EXPORT_SYMBOL(zfs_access);
18257877fdebSMatt Macy EXPORT_SYMBOL(zfs_fsync);
18267877fdebSMatt Macy EXPORT_SYMBOL(zfs_holey);
18277877fdebSMatt Macy EXPORT_SYMBOL(zfs_read);
18287877fdebSMatt Macy EXPORT_SYMBOL(zfs_write);
18297877fdebSMatt Macy EXPORT_SYMBOL(zfs_getsecattr);
18307877fdebSMatt Macy EXPORT_SYMBOL(zfs_setsecattr);
18312a58b312SMartin Matuska EXPORT_SYMBOL(zfs_clone_range);
18322a58b312SMartin Matuska EXPORT_SYMBOL(zfs_clone_range_replay);
18337877fdebSMatt Macy 
1834dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
18357877fdebSMatt Macy "Bytes to read per chunk");
1836a4e5e010SMartin Matuska 
1837a4e5e010SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
1838a4e5e010SMartin Matuska "Enable block cloning");
1839a4e5e010SMartin Matuska 
1840a4e5e010SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
1841a4e5e010SMartin Matuska "Wait for dirty blocks when cloning");
18427a7741afSMartin Matuska 
18437a7741afSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
18447a7741afSMartin Matuska "Enable Direct I/O");
1845
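
/*
 * Usage note (a sketch; names may vary by platform and version): the
 * tunables declared above are exposed through the usual ZFS module
 * parameter mechanisms, so they would typically be set as:
 *
 *	# FreeBSD
 *	sysctl vfs.zfs.bclone_enabled=1
 *	sysctl vfs.zfs.bclone_wait_dirty=1
 *
 *	# Linux
 *	echo 1 > /sys/module/zfs/parameters/zfs_bclone_enabled
 *	echo 1 > /sys/module/zfs/parameters/zfs_bclone_wait_dirty
 *
 * Verify the exact names with `sysctl -a | grep bclone` or
 * `ls /sys/module/zfs/parameters` before relying on them.
 */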