/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */


#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_rlock.h>
#include <sys/cred.h>
#include <sys/zpl.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <linux/mm_compat.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1) A check must be made in each zfs thread for a mounted file system.
 *	This is done while avoiding races, using zfs_enter(zfsvfs).
 *	A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with zfs_verify_zp(zp).  Both of these macros
 *	can cause the calling function to return EIO.
 *
 *  (2) zrele() should always be the last thing except for zil_commit() (if
 *	necessary) and zfs_exit().  This is for 3 reasons: First, if it's the
 *	last reference, the vnode/znode can be freed, so the zp may point to
 *	freed memory.  Second, the last reference will call zfs_zinactive(),
 *	which may induce a lot of work -- pushing cached pages (which acquires
 *	range locks) and syncing out cached atime changes.  Third,
 *	zfs_zinactive() may require a new tx, which could deadlock the system
 *	if you were already holding one.  This deadlock occurs because the tx
 *	currently being operated on prevents a txg from syncing, which
 *	prevents the new tx from progressing, resulting in a deadlock.  If you
 *	must call zrele() within a tx, use zfs_zrele_async().  Note that iput()
 *	is a synonym for zrele().
 *
 *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *	dmu_tx_assign().  This is critical because we don't want to block
 *	while holding locks.
 *
 *	If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing
 *	to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5) If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6) At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	zfs_enter(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		zrele(...);		// release held znodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		zfs_exit(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	zrele(...);			// release held znodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	zfs_exit(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */
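
/*
 * Open a file.  Enforces the ZFS_APPENDONLY attribute and counts
 * synchronous opens so that earlier async ZIL transactions can be
 * converted to synchronous ones.
 *
 * IN:	ip	- inode of file to open.
 *	mode	- open mode.
 *	flag	- file flags (O_APPEND, O_SYNC).
 *	cr	- credentials of caller.
 *
 * RETURN:	0 on success, error code on failure.
 */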
int
zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
{
	(void) cr;
	znode_t	*zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Honor ZFS_APPENDONLY file attribute */
	if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & O_APPEND) == 0)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Keep a count of the synchronous opens in the znode.  On first
	 * synchronous open we must convert all previous async transactions
	 * into sync to keep correct ordering.
	 */
	if (flag & O_SYNC) {
		if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
			zil_async_to_sync(zfsvfs->z_log, zp->z_id);
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

int
zfs_close(struct inode *ip, int flag, cred_t *cr)
{
	(void) cr;
	znode_t	*zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Decrement the synchronous opens in the znode */
	if (flag & O_SYNC)
		atomic_dec_32(&zp->z_sync_cnt);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

#if defined(_KERNEL)

static int zfs_fillpage(struct inode *ip, struct page *pp);

/*
 * When a file is memory mapped, we must keep the I/O data synchronized
 * between the DMU cache and the memory mapped pages.  Update all mapped
 * pages with the contents of the corresponding dmu buffer.
 */
void
update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	int64_t off = start & (PAGE_SIZE - 1);

	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t nbytes = MIN(PAGE_SIZE - off, len);

		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
		if (pp) {
			if (mapping_writably_mapped(mp))
				flush_dcache_page(pp);

			void *pb = kmap(pp);
			int error = dmu_read(os, zp->z_id, start + off,
			    nbytes, pb + off, DMU_READ_PREFETCH);
			kunmap(pp);

			if (error) {
				SetPageError(pp);
				ClearPageUptodate(pp);
			} else {
				ClearPageError(pp);
				SetPageUptodate(pp);
				if (!PagePrivate(pp)) {
					/*
					 * Set private bit so page migration
					 * will wait for us to finish writeback
					 * before calling migrate_folio().
					 */
					SetPagePrivate(pp);
					get_page(pp);
				}

				if (mapping_writably_mapped(mp))
					flush_dcache_page(pp);

				mark_page_accessed(pp);
			}

			unlock_page(pp);
			put_page(pp);
		}

		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the I/O data synchronized
 * between the DMU cache and the memory mapped pages.  Preferentially read
 * from memory mapped pages, otherwise fall back to reading through the dmu.
 */
int
mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
{
	struct inode *ip = ZTOI(zp);
	struct address_space *mp = ip->i_mapping;
	int64_t start = uio->uio_loffset;
	int64_t off = start & (PAGE_SIZE - 1);
	int len = nbytes;
	int error = 0;

	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t bytes = MIN(PAGE_SIZE - off, len);

		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
		if (pp) {

			/*
			 * If filemap_fault() retries there exists a window
			 * where the page will be unlocked and not up to date.
			 * In this case we must try and fill the page.
			 */
			if (unlikely(!PageUptodate(pp))) {
				error = zfs_fillpage(ip, pp);
				if (error) {
					unlock_page(pp);
					put_page(pp);
					return (error);
				}
			}

			ASSERT(PageUptodate(pp) || PageDirty(pp));

			unlock_page(pp);

			void *pb = kmap(pp);
			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
			kunmap(pp);

			if (mapping_writably_mapped(mp))
				flush_dcache_page(pp);

			mark_page_accessed(pp);
			put_page(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}

		len -= bytes;
		off = 0;

		if (error)
			break;
	}

	return (error);
}
#endif /* _KERNEL */

static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;

/*
 * Write the bytes to a file.
 *
 * IN:	zp	- znode of file to be written to
 *	data	- bytes to write
 *	len	- number of bytes to write
 *	pos	- offset to start writing at
 *
 * OUT:	resid	- remaining bytes to write
 *
 * RETURN:	0 if success
 *		positive error code if failure.  EIO is returned
 *		for a short write when residp isn't provided.
 *
 * Timestamps:
 *	zp - ctime|mtime updated if byte count > 0
 */
int
zfs_write_simple(znode_t *zp, const void *data, size_t len,
    loff_t pos, size_t *residp)
{
	fstrans_cookie_t cookie;
	int error;

	struct iovec iov;
	iov.iov_base = (void *)data;
	iov.iov_len = len;

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);

	cookie = spl_fstrans_mark();
	error = zfs_write(zp, &uio, 0, kcred);
	spl_fstrans_unmark(cookie);

	if (error == 0) {
		if (residp != NULL)
			*residp = zfs_uio_resid(&uio);
		else if (zfs_uio_resid(&uio) != 0)
			error = SET_ERROR(EIO);
	}

	return (error);
}

static void
zfs_rele_async_task(void *arg)
{
	iput(arg);
}
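
/*
 * Drop a znode reference without risking a synchronous iput().  If ours
 * would be the last reference, the release is dispatched to a taskq
 * rather than performed inline.  See the Big Rules above for why a
 * synchronous iput() inside a tx can deadlock.
 */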
void
zfs_zrele_async(znode_t *zp)
{
	struct inode *ip = ZTOI(zp);
	objset_t *os = ITOZSB(ip)->z_os;

	ASSERT(atomic_read(&ip->i_count) > 0);
	ASSERT(os != NULL);

	/*
	 * If decrementing the count would put us at 0, we can't do it inline
	 * here, because that would be synchronous.  Instead, dispatch an iput
	 * to run later.
	 *
	 * For more information on the dangers of a synchronous iput, see the
	 * header comment of this file.
	 */
	if (!atomic_add_unless(&ip->i_count, -1, 1)) {
		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
		    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
	}
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held inode reference for it.
 *
 * IN:	zdp	- znode of directory to search.
 *	nm	- name of entry to lookup.
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	cr	- credentials of caller.
 *	direntflags - directory lookup flags
 *	realpnp	- returned pathname.
 *
 * OUT:	zpp	- znode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
int
zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
	int error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*zpp = zdp;
				zhold(*zpp);
				return (0);
			}
			return (error);
		}
	}

	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
		return (error);

	*zpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * We don't allow recursive attributes...
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}

		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
		    B_TRUE, cr, zfs_init_idmap))) {
			zrele(*zpp);
			*zpp = NULL;
		}

		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */
	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
	if ((error == 0) && (*zpp))
		zfs_znode_update_vfs(*zpp);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Perform a linear search in directory for the name of specific inode.
 * Note we don't pass in the buffer size of name because it's hardcoded to
 * NAME_MAX+1 (256) in Linux.
 *
 * IN:	dzp	- znode of directory to search.
 *	zp	- znode of the target
 *
 * OUT:	name	- dentry name of the target
 *
 * RETURN:	0 on success, error code on failure.
 */
int
zfs_get_name(znode_t *dzp, char *name, znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	int error = 0;

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);

	if ((error = zfs_verify_zp(zp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/* ctldir znodes should have had their name set in zfs_vget */
	if (dzp->z_is_ctldir || zp->z_is_ctldir) {
		zfs_exit(zfsvfs, FTAG);
		return (ENOENT);
	}

	/* buffer len is hardcoded to 256 in Linux kernel */
	error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id,
	    ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the ip of the created or trunc'd file.
 *
 * IN:	dzp	- znode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- file flag.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	zpp	- znode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated if new entry created
 *	zp - ctime|mtime always, atime if new
 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	uid_t		uid;
	gid_t		gid;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;
	boolean_t	skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
		    mnt_ns))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since we failed to add the directory entry for it,
			 * delete the newly created dnode.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
		    mnt_ns))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
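
/*
 * Create a new unnamed (tmpfile-style) file object.  The new object is
 * placed directly in the unlinked set, so it is destroyed automatically
 * unless it is later linked into a directory.
 *
 * IN:	dip	- inode of directory the file is created in.
 *	vap	- attributes of new file.
 *	cr	- credentials of caller.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	ipp	- inode of created file.
 *
 * RETURN:	0 on success, error code on failure.
 */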
int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t		*zp = NULL, *dzp = ITOZ(dip);
	zfsvfs_t	*zfsvfs = ITOZSB(dip);
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	uid_t		uid;
	gid_t		gid;
	zfs_acl_ids_t	acl_ids;
	uint64_t	projid = ZFS_DEFAULT_PROJID;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

top:
	*ipp = NULL;

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		goto out;
	}

	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
		goto out;
	have_acl = B_TRUE;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Add to unlinked set */
	zp->z_unlinked = B_TRUE;
	zfs_unlinked_add(zp, tx);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);
out:

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*ipp = ZTOI(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dzp	- znode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime
 *	ip - ctime (if nlink > 0)
 */
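
/*
 * Written over SA_ZPL_XATTR when a znode that lacks a SA layout has its
 * xattr directory object deleted; such znodes cannot drop the attribute
 * outright, so it is zeroed instead.
 */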
static uint64_t null_xattr = 0;

int
zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
{
	znode_t		*zp;
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	uint64_t	links;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp))) {
		if (realnmp)
			pn_free(realnmp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	mutex_enter(&zp->z_lock);
	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
	    !zn_has_cached_data(zp, 0, LLONG_MAX);
	mutex_exit(&zp->z_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the inode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space.
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			if (xzp)
				zrele(xzp);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		zrele(zp);
		if (xzp)
			zrele(xzp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed.  Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
		    !zn_has_cached_data(zp, 0, LLONG_MAX) &&
		    xattr_obj == xattr_obj_unlinked &&
		    zfs_external_acl(zp) == acl_obj;
		VERIFY_IMPLY(xattr_obj_unlinked, xzp);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = B_TRUE;
			clear_nlink(ZTOI(xzp));
			links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &links, sizeof (links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		/*
		 * Add to the unlinked set because a new reference could be
		 * taken concurrently resulting in a deferred destruction.
		 */
		zfs_unlinked_add(zp, tx);
		mutex_exit(&zp->z_lock);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);
	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);

	if (delete_now)
		zrele(zp);
	else
		zfs_zrele_async(zp);

	if (xzp) {
		zfs_znode_update_vfs(xzp);
		zfs_zrele_async(xzp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Create a new directory and insert it into dzp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dzp	- znode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	zpp	- znode of created directory.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 *	zpp - ctime|mtime|atime updated
 */
int
zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
    cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
{
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	waited = B_FALSE;

	ASSERT(S_ISDIR(vap->va_mode));

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (dirname == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (dzp->z_pflags & ZFS_XATTR) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids, mnt_ns)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*zpp = NULL;

	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
	    mnt_ns))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	/*
	 * Now put new name in parent dir.
	 */
	error = zfs_link_create(dl, zp, tx, ZNEW);
	if (error != 0) {
		zfs_znode_delete(zp, tx);
		remove_inode_hash(ZTOI(zp));
		goto out;
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	*zpp = zp;

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

out:
	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (error != 0) {
		zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
	}
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dzp	- znode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- inode of current working directory.
 *	cr	- credentials of caller.
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 */
int
zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
    int flags)
{
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
		goto out;
	}

	if (!S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	if (zp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			goto top;
		}
		dmu_tx_abort(tx);
		zrele(zp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
		    B_FALSE);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);
	zrele(zp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Read directory entries from the given directory cursor position and emit
 * name and position for each entry.
 *
 * IN:	ip	- inode of directory to read.
 *	ctx	- directory entry context.
 *	cr	- credentials of caller.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap are always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
int
zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
{
	(void) cr;
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	objset_t	*os;
	zap_cursor_t	zc;
	zap_attribute_t	*zap;
	int		error;
	uint8_t		prefetch;
	uint8_t		type;
	int		done = 0;
	uint64_t	parent;
	uint64_t	offset; /* must be unsigned; checks for < 1 */

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0)
		goto out;

	/*
	 * Quit if directory has been removed (posix)
	 */
	if (zp->z_unlinked)
		goto out;

	error = 0;
	os = zfsvfs->z_os;
	offset = ctx->pos;
	prefetch = zp->z_zn_prefetch;
	zap = zap_attribute_long_alloc();

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Transform to file-system independent format
	 */
	while (!done) {
		uint64_t objnum;
		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap->za_name, ".");
			zap->za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap->za_name, "..");
			zap->za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
			zap->za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if ((error = zap_cursor_retrieve(&zc, zap))) {
				if (error == ENOENT)
					break;
				else
					goto update;
			}

			/*
			 * Allow multiple entries provided the first entry is
			 * the object id.  Non-zpl consumers may safely make
			 * use of the additional space.
			 *
			 * XXX: This should be a feature flag for compatibility
			 */
			if (zap->za_integer_length != 8 ||
			    zap->za_num_integers == 0) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld, "
				    "length = %d, num = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset,
				    zap->za_integer_length,
				    (u_longlong_t)zap->za_num_integers);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
			type = ZFS_DIRENT_TYPE(zap->za_first_integer);
		}

		done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name),
		    objnum, type);
		if (done)
			break;

		if (prefetch)
			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		ctx->pos = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

update:
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
	if (error == ENOENT)
		error = 0;
out:
	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Get the basic file attributes and place them in the provided kstat
 * structure.  The inode is assumed to be the authoritative source
 * for most of the attributes.  However, the znode currently has the
 * authoritative atime, blksize, and block count.
 *
 * IN:	ip	- inode of file.
 *
 * OUT:	sp	- kstat values.
 *
 * RETURN:	0 (always succeeds)
 */
int
#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
    struct kstat *sp)
#else
zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
#endif
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint32_t blksize;
	u_longlong_t nblocks;
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	mutex_enter(&zp->z_lock);

#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
	zpl_generic_fillattr(user_ns, request_mask, ip, sp);
#else
	zpl_generic_fillattr(user_ns, ip, sp);
#endif
	/*
	 * +1 link count for root inode with visible '.zfs' directory.
	 */
	if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
		if (sp->nlink < ZFS_LINK_MAX)
			sp->nlink++;

	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	sp->blksize = blksize;
	sp->blocks = nblocks;

	if (unlikely(zp->z_blksz == 0)) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		sp->blksize = zfsvfs->z_max_blksz;
	}

	mutex_exit(&zp->z_lock);

	/*
	 * Required to prevent NFS client from detecting different inode
	 * numbers of snapshot root dentry before and after snapshot mount.
	 */
	if (zfsvfs->z_issnap) {
		if (ip->i_sb->s_root->d_inode == ip)
			sp->ino = ZFSCTL_INO_SNAPDIRS -
			    dmu_objset_id(zfsvfs->z_os);
	}

	zfs_exit(zfsvfs, FTAG);

	return (0);
}

/*
 * For the operation of changing a file's user/group/project, we need to
 * handle not only the main object that is assigned to the file directly,
 * but also the ones that are used by the file via the hidden xattr directory.
 *
 * Because the xattr directory may contain many EA entries, it may be
 * impossible to change all of them in the same transaction as the main
 * object's user/group/project attributes.  Instead we change them via
 * multiple independent transactions, one by one.  This may not be an ideal
 * solution, but we have no better idea yet.
 */
static int
zfs_setattr_dir(znode_t *dzp)
{
	struct inode	*dxip = ZTOI(dzp);
	struct inode	*xip = NULL;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	objset_t	*os = zfsvfs->z_os;
	zap_cursor_t	zc;
	zap_attribute_t	*zap;
	zfs_dirlock_t	*dl;
	znode_t		*zp = NULL;
	dmu_tx_t	*tx = NULL;
	uint64_t	uid, gid;
	sa_bulk_attr_t	bulk[4];
	int		count;
	int		err;

	zap = zap_attribute_alloc();
	zap_cursor_init(&zc, os, dzp->z_id);
	while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
		count = 0;
		if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
			err = ENXIO;
			break;
		}

		err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp,
		    ZEXISTS, NULL, NULL);
		if (err == ENOENT)
			goto next;
		if (err)
			break;

		xip = ZTOI(zp);
		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
		    zp->z_projid == dzp->z_projid)
			goto next;

		tx = dmu_tx_create(os);
		if (!(zp->z_pflags & ZFS_PROJID))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);

		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err)
			break;

		mutex_enter(&dzp->z_lock);

		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
			xip->i_uid = dxip->i_uid;
			uid = zfs_uid_read(dxip);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &uid, sizeof (uid));
		}

		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
			xip->i_gid = dxip->i_gid;
			gid = zfs_gid_read(dxip);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
			    &gid, sizeof (gid));
		}

		uint64_t projid = dzp->z_projid;
		if (zp->z_projid != projid) {
			if (!(zp->z_pflags & ZFS_PROJID)) {
				err = sa_add_projid(zp->z_sa_hdl, tx, projid);
				if (unlikely(err == EEXIST)) {
					err = 0;
				} else if (err != 0) {
					goto sa_add_projid_err;
				} else {
					projid = ZFS_INVALID_PROJID;
				}
			}

			if (projid != ZFS_INVALID_PROJID) {
				zp->z_projid = projid;
				SA_ADD_BULK_ATTR(bulk, count,
				    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
				    sizeof (zp->z_projid));
			}
		}

sa_add_projid_err:
		mutex_exit(&dzp->z_lock);

		if (likely(count > 0)) {
			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
			dmu_tx_commit(tx);
		} else if (projid == ZFS_INVALID_PROJID) {
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
		tx = NULL;
		if (err != 0 && err != ENOENT)
			break;

next:
		if (zp) {
			zrele(zp);
			zp = NULL;
			zfs_dirent_unlock(dl);
		}
		zap_cursor_advance(&zc);
	}

	if (tx)
		dmu_tx_abort(tx);
	if (zp) {
		zrele(zp);
		zfs_dirent_unlock(dl);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);

	return (err == ENOENT ? 0 : err);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	zp	- znode of file to be modified.
 *	vap	- new attribute values.
 *		  If ATTR_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	mnt_ns	- user namespace of the mount
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime updated, mtime updated if size changed.
 */
int
zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
{
	struct inode	*ip;
	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
	objset_t	*os;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	xvattr_t	*tmpxvattr;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask = 0;
	int		trim_mask = 0;
	uint64_t	new_mode;
	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
	uint64_t	xattr_obj;
	uint64_t	mtime[2], ctime[2], atime[2];
	uint64_t	projid = ZFS_INVALID_PROJID;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err, err2 = 0;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp;
	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t	fuid_dirtied = B_FALSE;
	boolean_t	handle_eadir = B_FALSE;
	sa_bulk_attr_t	*bulk, *xattr_bulk;
	int		count = 0, xattr_count = 0, bulks = 8;

	if (mask == 0)
		return (0);

	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (err);
	ip = ZTOI(zp);
	os = zfsvfs->z_os;

	/*
	 * If this is a xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);
	if (xoap != NULL && (mask & ATTR_XVATTR)) {
		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
			if (!dmu_objset_projectquota_enabled(os) ||
			    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
				zfs_exit(zfsvfs, FTAG);
				return (SET_ERROR(ENOTSUP));
			}

			projid = xoap->xoa_projid;
			if (unlikely(projid == ZFS_INVALID_PROJID)) {
				zfs_exit(zfsvfs, FTAG);
				return (SET_ERROR(EINVAL));
			}

			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
				projid = ZFS_INVALID_PROJID;
			else
				need_policy = TRUE;
		}

		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
		    (xoap->xoa_projinherit !=
		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
		    (!dmu_objset_projectquota_enabled(os) ||
		    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(ENOTSUP));
		}
	}

	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & ATTR_XVATTR))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
	xva_init(tmpxvattr);

	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		err = SET_ERROR(EPERM);
		goto out3;
	}

	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
		err = SET_ERROR(EPERM);
		goto out3;
	}

	/*
	 * Verify timestamps don't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (ATTR_ATIME | ATTR_MTIME)) {
		if (((mask & ATTR_ATIME) &&
		    TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & ATTR_MTIME) &&
		    TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			err = SET_ERROR(EOVERFLOW);
			goto out3;
		}
	}

top:
	attrzp = NULL;
	aclp = NULL;

	/* Can this be moved to before the top label? */
	if (zfs_is_readonly(zfsvfs)) {
		err = SET_ERROR(EROFS);
		goto out3;
	}

	/*
	 * First validate permissions
	 */
	if (mask & ATTR_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
		    mnt_ns);
		if (err)
			goto out3;

		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err)
			goto out3;
	}

	if (mask & (ATTR_ATIME|ATTR_MTIME) ||
	    ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr, mnt_ns);
	}

	if (mask & (ATTR_UID|ATTR_GID)) {
		int	idmask = (mask & (ATTR_UID|ATTR_GID));
		int	take_owner;
		int	take_group;
		uid_t	uid;
		gid_t	gid;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */
		if (!(mask & ATTR_MODE))
			vap->va_mode = zp->z_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */
		uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
		    vap->va_uid);
		gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
		    vap->va_gid);
		take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
		take_group = (mask & ATTR_GID) &&
		    zfs_groupmember(zfsvfs, gid, cr);

		/*
		 * If both ATTR_UID and ATTR_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
2132 * 2133 * Otherwise, send the check through secpolicy_vnode_setattr() 2134 * 2135 */ 2136 2137 if (((idmask == (ATTR_UID|ATTR_GID)) && 2138 take_owner && take_group) || 2139 ((idmask == ATTR_UID) && take_owner) || 2140 ((idmask == ATTR_GID) && take_group)) { 2141 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2142 skipaclchk, cr, mnt_ns) == 0) { 2143 /* 2144 * Remove setuid/setgid for non-privileged users 2145 */ 2146 (void) secpolicy_setid_clear(vap, cr); 2147 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2148 } else { 2149 need_policy = TRUE; 2150 } 2151 } else { 2152 need_policy = TRUE; 2153 } 2154 } 2155 2156 mutex_enter(&zp->z_lock); 2157 oldva.va_mode = zp->z_mode; 2158 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2159 if (mask & ATTR_XVATTR) { 2160 /* 2161 * Update xvattr mask to include only those attributes 2162 * that are actually changing. 2163 * 2164 * the bits will be restored prior to actually setting 2165 * the attributes so the caller thinks they were set. 2166 */ 2167 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2168 if (xoap->xoa_appendonly != 2169 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2170 need_policy = TRUE; 2171 } else { 2172 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2173 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2174 } 2175 } 2176 2177 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2178 if (xoap->xoa_projinherit != 2179 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2180 need_policy = TRUE; 2181 } else { 2182 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2183 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2184 } 2185 } 2186 2187 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2188 if (xoap->xoa_nounlink != 2189 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2190 need_policy = TRUE; 2191 } else { 2192 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2193 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2194 } 2195 } 2196 2197 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2198 if (xoap->xoa_immutable != 2199 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2200 need_policy = TRUE; 2201 } else { 2202 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2203 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2204 } 2205 } 2206 2207 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2208 if (xoap->xoa_nodump != 2209 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2210 need_policy = TRUE; 2211 } else { 2212 XVA_CLR_REQ(xvap, XAT_NODUMP); 2213 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2214 } 2215 } 2216 2217 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2218 if (xoap->xoa_av_modified != 2219 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2220 need_policy = TRUE; 2221 } else { 2222 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2223 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2224 } 2225 } 2226 2227 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2228 if ((!S_ISREG(ip->i_mode) && 2229 xoap->xoa_av_quarantined) || 2230 xoap->xoa_av_quarantined != 2231 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2232 need_policy = TRUE; 2233 } else { 2234 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2235 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2236 } 2237 } 2238 2239 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2240 mutex_exit(&zp->z_lock); 2241 err = SET_ERROR(EPERM); 2242 goto out3; 2243 } 2244 2245 if (need_policy == FALSE && 2246 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2247 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2248 need_policy = TRUE; 2249 } 2250 } 2251 2252 mutex_exit(&zp->z_lock); 2253 2254 if (mask & ATTR_MODE) { 2255 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2256 mnt_ns) == 0) { 2257 err = secpolicy_setid_setsticky_clear(ip, vap, 2258 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2259 if (err) 2260 goto out3; 2261 trim_mask |= ATTR_MODE; 
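			/*
			 * A brief note on the trim/restore dance used below
			 * (descriptive only): any bits accumulated in
			 * trim_mask are removed from vap->va_mask and stashed
			 * in saved_mask before secpolicy_vnode_setattr()
			 * runs, so the policy check cannot revoke what
			 * zfs_zaccess() already granted, and are restored
			 * afterwards so the full mask is still applied.
			 */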
} else { 2263 need_policy = TRUE; 2264 } 2265 } 2266 2267 if (need_policy) { 2268 /* 2269 * If trim_mask is set then take ownership 2270 * has been granted or write_acl is present and the user 2271 * has the ability to modify the mode. In that case remove 2272 * UID|GID and/or MODE from mask so that 2273 * secpolicy_vnode_setattr() doesn't revoke it. 2274 */ 2275 2276 if (trim_mask) { 2277 saved_mask = vap->va_mask; 2278 vap->va_mask &= ~trim_mask; 2279 } 2280 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2281 zfs_zaccess_unix, zp); 2282 if (err) 2283 goto out3; 2284 2285 if (trim_mask) 2286 vap->va_mask |= saved_mask; 2287 } 2288 2289 /* 2290 * secpolicy_vnode_setattr() or take ownership may have 2291 * changed va_mask 2292 */ 2293 mask = vap->va_mask; 2294 2295 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2296 handle_eadir = B_TRUE; 2297 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2298 &xattr_obj, sizeof (xattr_obj)); 2299 2300 if (err == 0 && xattr_obj) { 2301 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2302 if (err) 2303 goto out2; 2304 } 2305 if (mask & ATTR_UID) { 2306 new_kuid = zfs_fuid_create(zfsvfs, 2307 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2308 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2309 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2310 new_kuid)) { 2311 if (attrzp) 2312 zrele(attrzp); 2313 err = SET_ERROR(EDQUOT); 2314 goto out2; 2315 } 2316 } 2317 2318 if (mask & ATTR_GID) { 2319 new_kgid = zfs_fuid_create(zfsvfs, 2320 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2321 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2322 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2323 new_kgid)) { 2324 if (attrzp) 2325 zrele(attrzp); 2326 err = SET_ERROR(EDQUOT); 2327 goto out2; 2328 } 2329 } 2330 2331 if (projid != ZFS_INVALID_PROJID && 2332 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2333 if (attrzp) 2334 zrele(attrzp); 2335 err = EDQUOT; 2336 goto out2; 2337 } 2338 } 2339 tx = dmu_tx_create(os); 2340 2341 if (mask & ATTR_MODE) { 2342 uint64_t pmode = zp->z_mode; 2343 uint64_t acl_obj; 2344 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2345 2346 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2347 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2348 err = EPERM; 2349 goto out; 2350 } 2351 2352 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2353 goto out; 2354 2355 mutex_enter(&zp->z_lock); 2356 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2357 /* 2358 * Are we upgrading ACL from old V0 format 2359 * to V1 format?
2360 */ 2361 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2362 zfs_znode_acl_version(zp) == 2363 ZFS_ACL_VERSION_INITIAL) { 2364 dmu_tx_hold_free(tx, acl_obj, 0, 2365 DMU_OBJECT_END); 2366 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2367 0, aclp->z_acl_bytes); 2368 } else { 2369 dmu_tx_hold_write(tx, acl_obj, 0, 2370 aclp->z_acl_bytes); 2371 } 2372 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2373 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2374 0, aclp->z_acl_bytes); 2375 } 2376 mutex_exit(&zp->z_lock); 2377 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2378 } else { 2379 if (((mask & ATTR_XVATTR) && 2380 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2381 (projid != ZFS_INVALID_PROJID && 2382 !(zp->z_pflags & ZFS_PROJID))) 2383 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2384 else 2385 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2386 } 2387 2388 if (attrzp) { 2389 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2390 } 2391 2392 fuid_dirtied = zfsvfs->z_fuid_dirty; 2393 if (fuid_dirtied) 2394 zfs_fuid_txhold(zfsvfs, tx); 2395 2396 zfs_sa_upgrade_txholds(tx, zp); 2397 2398 err = dmu_tx_assign(tx, TXG_WAIT); 2399 if (err) 2400 goto out; 2401 2402 count = 0; 2403 /* 2404 * Set each attribute requested. 2405 * We group settings according to the locks they need to acquire. 2406 * 2407 * Note: you cannot set ctime directly, although it will be 2408 * updated as a side-effect of calling this function. 2409 */ 2410 2411 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2412 /* 2413 * For an existing object upgraded from an old system, the 2414 * on-disk layout has no slot for the project ID attribute, 2415 * yet the quota accounting logic needs to access related slots 2416 * by offset directly. So we adjust the old object's layout 2417 * to place the project ID at a unified, fixed offset.
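 *
 * A condensed sketch of the upgrade step below (illustrative only;
 * note that sa_add_projid() returning EEXIST simply means the layout
 * already has the slot, so it is not treated as an error):
 *
 *	err = sa_add_projid(hdl, tx, projid);
 *	if (err == EEXIST)
 *		err = 0;	// slot already present; update it below
 *	else if (err == 0)
 *		projid = ZFS_INVALID_PROJID;	// stored; skip the update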
2418 */ 2419 if (attrzp) 2420 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2421 if (err == 0) 2422 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2423 2424 if (unlikely(err == EEXIST)) 2425 err = 0; 2426 else if (err != 0) 2427 goto out; 2428 else 2429 projid = ZFS_INVALID_PROJID; 2430 } 2431 2432 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2433 mutex_enter(&zp->z_acl_lock); 2434 mutex_enter(&zp->z_lock); 2435 2436 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2437 &zp->z_pflags, sizeof (zp->z_pflags)); 2438 2439 if (attrzp) { 2440 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2441 mutex_enter(&attrzp->z_acl_lock); 2442 mutex_enter(&attrzp->z_lock); 2443 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2444 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2445 sizeof (attrzp->z_pflags)); 2446 if (projid != ZFS_INVALID_PROJID) { 2447 attrzp->z_projid = projid; 2448 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2449 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2450 sizeof (attrzp->z_projid)); 2451 } 2452 } 2453 2454 if (mask & (ATTR_UID|ATTR_GID)) { 2455 2456 if (mask & ATTR_UID) { 2457 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2458 new_uid = zfs_uid_read(ZTOI(zp)); 2459 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2460 &new_uid, sizeof (new_uid)); 2461 if (attrzp) { 2462 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2463 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2464 sizeof (new_uid)); 2465 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2466 } 2467 } 2468 2469 if (mask & ATTR_GID) { 2470 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2471 new_gid = zfs_gid_read(ZTOI(zp)); 2472 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2473 NULL, &new_gid, sizeof (new_gid)); 2474 if (attrzp) { 2475 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2476 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2477 sizeof (new_gid)); 2478 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2479 } 2480 } 2481 if (!(mask & ATTR_MODE)) { 2482 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2483 NULL, &new_mode, sizeof (new_mode)); 2484 new_mode = zp->z_mode; 2485 } 2486 err = zfs_acl_chown_setattr(zp); 2487 ASSERT(err == 0); 2488 if (attrzp) { 2489 err = zfs_acl_chown_setattr(attrzp); 2490 ASSERT(err == 0); 2491 } 2492 } 2493 2494 if (mask & ATTR_MODE) { 2495 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2496 &new_mode, sizeof (new_mode)); 2497 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2498 ASSERT3P(aclp, !=, NULL); 2499 err = zfs_aclset_common(zp, aclp, cr, tx); 2500 ASSERT0(err); 2501 if (zp->z_acl_cached) 2502 zfs_acl_free(zp->z_acl_cached); 2503 zp->z_acl_cached = aclp; 2504 aclp = NULL; 2505 } 2506 2507 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2508 zp->z_atime_dirty = B_FALSE; 2509 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); 2510 ZFS_TIME_ENCODE(&tmp_atime, atime); 2511 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2512 &atime, sizeof (atime)); 2513 } 2514 2515 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2516 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2517 zpl_inode_set_mtime_to_ts(ZTOI(zp), 2518 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); 2519 2520 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2521 mtime, sizeof (mtime)); 2522 } 2523 2524 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2525 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2526 zpl_inode_set_ctime_to_ts(ZTOI(zp), 2527 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); 2528 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2529 ctime, sizeof (ctime)); 2530 } 2531 2532 if (projid != ZFS_INVALID_PROJID) { 2533 
zp->z_projid = projid; 2534 SA_ADD_BULK_ATTR(bulk, count, 2535 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2536 sizeof (zp->z_projid)); 2537 } 2538 2539 if (attrzp && mask) { 2540 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2541 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2542 sizeof (ctime)); 2543 } 2544 2545 /* 2546 * Do this after setting timestamps to prevent timestamp 2547 * update from toggling bit 2548 */ 2549 2550 if (xoap && (mask & ATTR_XVATTR)) { 2551 2552 /* 2553 * restore trimmed off masks 2554 * so that return masks can be set for caller. 2555 */ 2556 2557 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2558 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2559 } 2560 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2561 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2562 } 2563 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2564 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2565 } 2566 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2567 XVA_SET_REQ(xvap, XAT_NODUMP); 2568 } 2569 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2570 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2571 } 2572 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2573 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2574 } 2575 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2576 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2577 } 2578 2579 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2580 ASSERT(S_ISREG(ip->i_mode)); 2581 2582 zfs_xvattr_set(zp, xvap, tx); 2583 } 2584 2585 if (fuid_dirtied) 2586 zfs_fuid_sync(zfsvfs, tx); 2587 2588 if (mask != 0) 2589 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2590 2591 mutex_exit(&zp->z_lock); 2592 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2593 mutex_exit(&zp->z_acl_lock); 2594 2595 if (attrzp) { 2596 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2597 mutex_exit(&attrzp->z_acl_lock); 2598 mutex_exit(&attrzp->z_lock); 2599 } 2600 out: 2601 if (err == 0 && xattr_count > 0) { 2602 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2603 xattr_count, tx); 2604 ASSERT(err2 == 0); 2605 } 2606 2607 if (aclp) 2608 zfs_acl_free(aclp); 2609 2610 if (fuidp) { 2611 zfs_fuid_info_free(fuidp); 2612 fuidp = NULL; 2613 } 2614 2615 if (err) { 2616 dmu_tx_abort(tx); 2617 if (attrzp) 2618 zrele(attrzp); 2619 if (err == ERESTART) 2620 goto top; 2621 } else { 2622 if (count > 0) 2623 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2624 dmu_tx_commit(tx); 2625 if (attrzp) { 2626 if (err2 == 0 && handle_eadir) 2627 err = zfs_setattr_dir(attrzp); 2628 zrele(attrzp); 2629 } 2630 zfs_znode_update_vfs(zp); 2631 } 2632 2633 out2: 2634 if (os->os_sync == ZFS_SYNC_ALWAYS) 2635 zil_commit(zilog, 0); 2636 2637 out3: 2638 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); 2639 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); 2640 kmem_free(tmpxvattr, sizeof (xvattr_t)); 2641 zfs_exit(zfsvfs, FTAG); 2642 return (err); 2643 } 2644 2645 typedef struct zfs_zlock { 2646 krwlock_t *zl_rwlock; /* lock we acquired */ 2647 znode_t *zl_znode; /* znode we held */ 2648 struct zfs_zlock *zl_next; /* next in list */ 2649 } zfs_zlock_t; 2650 2651 /* 2652 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2653 */ 2654 static void 2655 zfs_rename_unlock(zfs_zlock_t **zlpp) 2656 { 2657 zfs_zlock_t *zl; 2658 2659 while ((zl = *zlpp) != NULL) { 2660 if (zl->zl_znode != NULL) 2661 zfs_zrele_async(zl->zl_znode); 2662 rw_exit(zl->zl_rwlock); 2663 *zlpp = zl->zl_next; 2664 kmem_free(zl, sizeof (*zl)); 2665 } 2666 } 2667 2668 /* 2669 * Search back through the directory tree, using the ".." entries. 
2670 * Lock each directory in the chain to prevent concurrent renames. 2671 * Fail any attempt to move a directory into one of its own descendants. 2672 * XXX - z_parent_lock can overlap with map or grow locks 2673 */ 2674 static int 2675 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2676 { 2677 zfs_zlock_t *zl; 2678 znode_t *zp = tdzp; 2679 uint64_t rootid = ZTOZSB(zp)->z_root; 2680 uint64_t oidp = zp->z_id; 2681 krwlock_t *rwlp = &szp->z_parent_lock; 2682 krw_t rw = RW_WRITER; 2683 2684 /* 2685 * First pass write-locks szp and compares to zp->z_id. 2686 * Later passes read-lock zp and compare to zp->z_parent. 2687 */ 2688 do { 2689 if (!rw_tryenter(rwlp, rw)) { 2690 /* 2691 * Another thread is renaming in this path. 2692 * Note that if we are a WRITER, we don't have any 2693 * parent_locks held yet. 2694 */ 2695 if (rw == RW_READER && zp->z_id > szp->z_id) { 2696 /* 2697 * Drop our locks and restart 2698 */ 2699 zfs_rename_unlock(&zl); 2700 *zlpp = NULL; 2701 zp = tdzp; 2702 oidp = zp->z_id; 2703 rwlp = &szp->z_parent_lock; 2704 rw = RW_WRITER; 2705 continue; 2706 } else { 2707 /* 2708 * Wait for the other thread to drop its locks 2709 */ 2710 rw_enter(rwlp, rw); 2711 } 2712 } 2713 2714 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2715 zl->zl_rwlock = rwlp; 2716 zl->zl_znode = NULL; 2717 zl->zl_next = *zlpp; 2718 *zlpp = zl; 2719 2720 if (oidp == szp->z_id) /* We're a descendant of szp */ 2721 return (SET_ERROR(EINVAL)); 2722 2723 if (oidp == rootid) /* We've hit the top */ 2724 return (0); 2725 2726 if (rw == RW_READER) { /* i.e. not the first pass */ 2727 int error = zfs_zget(ZTOZSB(zp), oidp, &zp); 2728 if (error) 2729 return (error); 2730 zl->zl_znode = zp; 2731 } 2732 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), 2733 &oidp, sizeof (oidp)); 2734 rwlp = &zp->z_parent_lock; 2735 rw = RW_READER; 2736 2737 } while (zp->z_id != sdzp->z_id); 2738 2739 return (0); 2740 } 2741 2742 /* 2743 * Move an entry from the provided source directory to the target 2744 * directory. Change the entry name as indicated. 2745 * 2746 * IN: sdzp - Source directory containing the "old entry". 2747 * snm - Old entry name. 2748 * tdzp - Target directory to contain the "new entry". 2749 * tnm - New entry name. 2750 * cr - credentials of caller. 2751 * flags - case flags 2752 * rflags - RENAME_* flags 2753 * wo_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). 2754 * mnt_ns - user namespace of the mount 2755 * 2756 * RETURN: 0 on success, error code on failure. 2757 * 2758 * Timestamps: 2759 * sdzp,tdzp - ctime|mtime updated 2760 */ 2761 int 2762 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2763 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) 2764 { 2765 znode_t *szp, *tzp; 2766 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2767 zilog_t *zilog; 2768 zfs_dirlock_t *sdl, *tdl; 2769 dmu_tx_t *tx; 2770 zfs_zlock_t *zl; 2771 int cmp, serr, terr; 2772 int error = 0; 2773 int zflg = 0; 2774 boolean_t waited = B_FALSE; 2775 /* Needed for whiteout inode creation. */ 2776 boolean_t fuid_dirtied; 2777 zfs_acl_ids_t acl_ids; 2778 boolean_t have_acl = B_FALSE; 2779 znode_t *wzp = NULL; 2780 2781 2782 if (snm == NULL || tnm == NULL) 2783 return (SET_ERROR(EINVAL)); 2784 2785 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2786 return (SET_ERROR(EINVAL)); 2787 2788 /* Already checked by Linux VFS, but just to make sure.
*/ 2789 if (rflags & RENAME_EXCHANGE && 2790 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2791 return (SET_ERROR(EINVAL)); 2792 2793 /* 2794 * Make sure we get wo_vap iff RENAME_WHITEOUT is set, and that it's the 2795 * right kind of vattr_t for the whiteout file. These are set 2796 * internally by ZFS so should never be incorrect. 2797 */ 2798 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2799 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2800 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2801 2802 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2803 return (error); 2804 zilog = zfsvfs->z_log; 2805 2806 if ((error = zfs_verify_zp(tdzp)) != 0) { 2807 zfs_exit(zfsvfs, FTAG); 2808 return (error); 2809 } 2810 2811 /* 2812 * We check i_sb because snapshots and the ctldir must have different 2813 * super blocks. 2814 */ 2815 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2816 zfsctl_is_node(ZTOI(tdzp))) { 2817 zfs_exit(zfsvfs, FTAG); 2818 return (SET_ERROR(EXDEV)); 2819 } 2820 2821 if (zfsvfs->z_utf8 && u8_validate(tnm, 2822 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2823 zfs_exit(zfsvfs, FTAG); 2824 return (SET_ERROR(EILSEQ)); 2825 } 2826 2827 if (flags & FIGNORECASE) 2828 zflg |= ZCILOOK; 2829 2830 top: 2831 szp = NULL; 2832 tzp = NULL; 2833 zl = NULL; 2834 2835 /* 2836 * This is to prevent the creation of links into attribute space 2837 * by renaming a linked file into/out of an attribute directory. 2838 * See the comment in zfs_link() for why this is considered bad. 2839 */ 2840 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2841 zfs_exit(zfsvfs, FTAG); 2842 return (SET_ERROR(EINVAL)); 2843 } 2844 2845 /* 2846 * Lock source and target directory entries. To prevent deadlock, 2847 * a lock ordering must be defined. We lock the directory with 2848 * the smallest object id first, or if it's a tie, the one with 2849 * the lexically first name. 2850 */ 2851 if (sdzp->z_id < tdzp->z_id) { 2852 cmp = -1; 2853 } else if (sdzp->z_id > tdzp->z_id) { 2854 cmp = 1; 2855 } else { 2856 /* 2857 * First compare the two name arguments without 2858 * considering any case folding. 2859 */ 2860 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2861 2862 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2863 ASSERT(error == 0 || !zfsvfs->z_utf8); 2864 if (cmp == 0) { 2865 /* 2866 * POSIX: "If the old argument and the new argument 2867 * both refer to links to the same existing file, 2868 * the rename() function shall return successfully 2869 * and perform no other action." 2870 */ 2871 zfs_exit(zfsvfs, FTAG); 2872 return (0); 2873 } 2874 /* 2875 * If the file system is case-folding, then we may 2876 * have some more checking to do. A case-folding file 2877 * system is either supporting mixed case sensitivity 2878 * access or is completely case-insensitive. Note 2879 * that the file system is always case preserving. 2880 * 2881 * In mixed sensitivity mode case sensitive behavior 2882 * is the default. FIGNORECASE must be used to 2883 * explicitly request case insensitive behavior. 2884 * 2885 * If the source and target names provided differ only 2886 * by case (e.g., a request to rename 'tim' to 'Tim'), 2887 * we will treat this as a special case in the 2888 * case-insensitive mode: as long as the source name 2889 * is an exact match, we will allow this to proceed as 2890 * a name-change request.
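 *
 * In that situation the code below sets ZCIEXACT (require an exact
 * match on the source name) and clears ZCILOOK, making the source
 * lookup effectively case-sensitive.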
2891 */ 2892 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2893 (zfsvfs->z_case == ZFS_CASE_MIXED && 2894 flags & FIGNORECASE)) && 2895 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2896 &error) == 0) { 2897 /* 2898 * case-preserving rename request; require exact 2899 * name matches 2900 */ 2901 zflg |= ZCIEXACT; 2902 zflg &= ~ZCILOOK; 2903 } 2904 } 2905 2906 /* 2907 * If the source and destination directories are the same, we should 2908 * grab the z_name_lock of that directory only once. 2909 */ 2910 if (sdzp == tdzp) { 2911 zflg |= ZHAVELOCK; 2912 rw_enter(&sdzp->z_name_lock, RW_READER); 2913 } 2914 2915 if (cmp < 0) { 2916 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2917 ZEXISTS | zflg, NULL, NULL); 2918 terr = zfs_dirent_lock(&tdl, 2919 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2920 } else { 2921 terr = zfs_dirent_lock(&tdl, 2922 tdzp, tnm, &tzp, zflg, NULL, NULL); 2923 serr = zfs_dirent_lock(&sdl, 2924 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2925 NULL, NULL); 2926 } 2927 2928 if (serr) { 2929 /* 2930 * Source entry invalid or not there. 2931 */ 2932 if (!terr) { 2933 zfs_dirent_unlock(tdl); 2934 if (tzp) 2935 zrele(tzp); 2936 } 2937 2938 if (sdzp == tdzp) 2939 rw_exit(&sdzp->z_name_lock); 2940 2941 if (strcmp(snm, "..") == 0) 2942 serr = EINVAL; 2943 zfs_exit(zfsvfs, FTAG); 2944 return (serr); 2945 } 2946 if (terr) { 2947 zfs_dirent_unlock(sdl); 2948 zrele(szp); 2949 2950 if (sdzp == tdzp) 2951 rw_exit(&sdzp->z_name_lock); 2952 2953 if (strcmp(tnm, "..") == 0) 2954 terr = EINVAL; 2955 zfs_exit(zfsvfs, FTAG); 2956 return (terr); 2957 } 2958 2959 /* 2960 * Project inheritance means that if the directory has ZFS_PROJINHERIT 2961 * set, then its descendant directories will inherit not only the 2962 * project ID, but also the ZFS_PROJINHERIT flag. In that case, we 2963 * only allow renames into our tree when the project IDs are the 2964 * same. 2965 */ 2966 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2967 tdzp->z_projid != szp->z_projid) { 2968 error = SET_ERROR(EXDEV); 2969 goto out; 2970 } 2971 2972 /* 2973 * Must have write access at the source to remove the old entry 2974 * and write access at the target to create the new entry. 2975 * Note that if target and source are the same, this can be 2976 * done in a single check. 2977 */ 2978 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2979 goto out; 2980 2981 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2982 /* 2983 * Check to make sure rename is valid. 2984 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2985 */ 2986 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2987 goto out; 2988 } 2989 2990 /* 2991 * Does target exist? 2992 */ 2993 if (tzp) { 2994 if (rflags & RENAME_NOREPLACE) { 2995 error = SET_ERROR(EEXIST); 2996 goto out; 2997 } 2998 /* 2999 * Source and target must be the same type (unless exchanging). 3000 */ 3001 if (!(rflags & RENAME_EXCHANGE)) { 3002 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 3003 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 3004 3005 if (s_is_dir != t_is_dir) { 3006 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 3007 goto out; 3008 } 3009 } 3010 /* 3011 * POSIX dictates that when the source and target 3012 * entries refer to the same file object, rename 3013 * must do nothing and exit without error. 3014 */ 3015 if (szp->z_id == tzp->z_id) { 3016 error = 0; 3017 goto out; 3018 } 3019 } else if (rflags & RENAME_EXCHANGE) { 3020 /* Target must exist for RENAME_EXCHANGE.
*/ 3021 error = SET_ERROR(ENOENT); 3022 goto out; 3023 } 3024 3025 /* Set up inode creation for RENAME_WHITEOUT. */ 3026 if (rflags & RENAME_WHITEOUT) { 3027 /* 3028 * Whiteout files are not regular files or directories, so to 3029 * match zfs_create() we do not inherit the project id. 3030 */ 3031 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 3032 3033 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 3034 if (error) 3035 goto out; 3036 3037 if (!have_acl) { 3038 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 3039 &acl_ids, mnt_ns); 3040 if (error) 3041 goto out; 3042 have_acl = B_TRUE; 3043 } 3044 3045 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 3046 error = SET_ERROR(EDQUOT); 3047 goto out; 3048 } 3049 } 3050 3051 tx = dmu_tx_create(zfsvfs->z_os); 3052 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3053 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3054 dmu_tx_hold_zap(tx, sdzp->z_id, 3055 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); 3056 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3057 if (sdzp != tdzp) { 3058 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3059 zfs_sa_upgrade_txholds(tx, tdzp); 3060 } 3061 if (tzp) { 3062 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3063 zfs_sa_upgrade_txholds(tx, tzp); 3064 } 3065 if (rflags & RENAME_WHITEOUT) { 3066 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3067 ZFS_SA_BASE_ATTR_SIZE); 3068 3069 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 3070 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3071 if (!zfsvfs->z_use_sa && 3072 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3073 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3074 0, acl_ids.z_aclp->z_acl_bytes); 3075 } 3076 } 3077 fuid_dirtied = zfsvfs->z_fuid_dirty; 3078 if (fuid_dirtied) 3079 zfs_fuid_txhold(zfsvfs, tx); 3080 zfs_sa_upgrade_txholds(tx, szp); 3081 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3082 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3083 if (error) { 3084 if (zl != NULL) 3085 zfs_rename_unlock(&zl); 3086 zfs_dirent_unlock(sdl); 3087 zfs_dirent_unlock(tdl); 3088 3089 if (sdzp == tdzp) 3090 rw_exit(&sdzp->z_name_lock); 3091 3092 if (error == ERESTART) { 3093 waited = B_TRUE; 3094 dmu_tx_wait(tx); 3095 dmu_tx_abort(tx); 3096 zrele(szp); 3097 if (tzp) 3098 zrele(tzp); 3099 goto top; 3100 } 3101 dmu_tx_abort(tx); 3102 zrele(szp); 3103 if (tzp) 3104 zrele(tzp); 3105 zfs_exit(zfsvfs, FTAG); 3106 return (error); 3107 } 3108 3109 /* 3110 * Unlink the source. 3111 */ 3112 szp->z_pflags |= ZFS_AV_MODIFIED; 3113 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3114 szp->z_pflags |= ZFS_PROJINHERIT; 3115 3116 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3117 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3118 VERIFY0(error); 3119 3120 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3121 if (error) 3122 goto commit; 3123 3124 /* 3125 * Unlink the target. 3126 */ 3127 if (tzp) { 3128 int tzflg = zflg; 3129 3130 if (rflags & RENAME_EXCHANGE) { 3131 /* This inode will be re-linked soon. */ 3132 tzflg |= ZRENAMING; 3133 3134 tzp->z_pflags |= ZFS_AV_MODIFIED; 3135 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3136 tzp->z_pflags |= ZFS_PROJINHERIT; 3137 3138 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3139 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3140 ASSERT0(error); 3141 } 3142 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3143 if (error) 3144 goto commit_link_szp; 3145 } 3146 3147 /* 3148 * Create the new target links: 3149 * * We always link the target. 
3150 * * RENAME_EXCHANGE: Link the old target to the source. 3151 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3152 */ 3153 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3154 if (error) { 3155 /* 3156 * If we have removed the existing target, a subsequent call to 3157 * zfs_link_create() to add back the same entry, but with a new 3158 * dnode (szp), should not fail. 3159 */ 3160 ASSERT3P(tzp, ==, NULL); 3161 goto commit_link_tzp; 3162 } 3163 3164 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3165 case RENAME_EXCHANGE: 3166 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3167 /* 3168 * The same argument as zfs_link_create() failing for 3169 * szp applies here, since the source directory must 3170 * have had an entry we are replacing. 3171 */ 3172 ASSERT0(error); 3173 if (error) 3174 goto commit_unlink_td_szp; 3175 break; 3176 case RENAME_WHITEOUT: 3177 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3178 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3179 if (error) { 3180 zfs_znode_delete(wzp, tx); 3181 remove_inode_hash(ZTOI(wzp)); 3182 goto commit_unlink_td_szp; 3183 } 3184 break; 3185 } 3186 3187 if (fuid_dirtied) 3188 zfs_fuid_sync(zfsvfs, tx); 3189 3190 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3191 case RENAME_EXCHANGE: 3192 zfs_log_rename_exchange(zilog, tx, 3193 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3194 tdzp, tdl->dl_name, szp); 3195 break; 3196 case RENAME_WHITEOUT: 3197 zfs_log_rename_whiteout(zilog, tx, 3198 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3199 tdzp, tdl->dl_name, szp, wzp); 3200 break; 3201 default: 3202 ASSERT0(rflags & ~RENAME_NOREPLACE); 3203 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3204 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3205 break; 3206 } 3207 3208 commit: 3209 dmu_tx_commit(tx); 3210 out: 3211 if (have_acl) 3212 zfs_acl_ids_free(&acl_ids); 3213 3214 zfs_znode_update_vfs(sdzp); 3215 if (sdzp == tdzp) 3216 rw_exit(&sdzp->z_name_lock); 3217 3218 if (sdzp != tdzp) 3219 zfs_znode_update_vfs(tdzp); 3220 3221 zfs_znode_update_vfs(szp); 3222 zrele(szp); 3223 if (wzp) { 3224 zfs_znode_update_vfs(wzp); 3225 zrele(wzp); 3226 } 3227 if (tzp) { 3228 zfs_znode_update_vfs(tzp); 3229 zrele(tzp); 3230 } 3231 3232 if (zl != NULL) 3233 zfs_rename_unlock(&zl); 3234 3235 zfs_dirent_unlock(sdl); 3236 zfs_dirent_unlock(tdl); 3237 3238 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3239 zil_commit(zilog, 0); 3240 3241 zfs_exit(zfsvfs, FTAG); 3242 return (error); 3243 3244 /* 3245 * Clean-up path for broken link state. 3246 * 3247 * At this point we are in a (very) bad state, so we need to do our 3248 * best to correct the state. In particular, all of the nlinks are 3249 * wrong because we were destroying and creating links with ZRENAMING. 3250 * 3251 * In some form, all of these operations have to resolve the state: 3252 * 3253 * * link_destroy() *must* succeed. Fortunately, this is very likely 3254 * since we only just created it. 3255 * 3256 * * link_create()s are allowed to fail (though they shouldn't because 3257 * we only just unlinked them and are putting the entries back 3258 * during clean-up). But if they fail, we can just forcefully drop 3259 * the nlink value to (at the very least) avoid broken nlink values 3260 * -- though in the case of non-empty directories we will have to 3261 * panic (otherwise we'd have a leaked directory with a broken ..). 
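 *
 * A rough sketch of the unwind order (labels as used below):
 *
 *	commit_unlink_td_szp:	undo the new link of szp into tdzp
 *	commit_link_tzp:	put tzp back, else force its nlink down
 *	commit_link_szp:	put szp back, else force its nlink down
 *	...then fall through to commit: to commit the tx regardless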
3262 */ 3263 commit_unlink_td_szp: 3264 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3265 commit_link_tzp: 3266 if (tzp) { 3267 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3268 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3269 } 3270 commit_link_szp: 3271 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3272 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3273 goto commit; 3274 } 3275 3276 /* 3277 * Insert the indicated symbolic reference entry into the directory. 3278 * 3279 * IN: dzp - Directory to contain new symbolic link. 3280 * name - Name of directory entry in dzp. 3281 * vap - Attributes of new entry. 3282 * link - Name for new symlink entry. 3283 * cr - credentials of caller. 3284 * flags - case flags 3285 * mnt_ns - user namespace of the mount 3286 * 3287 * OUT: zpp - Znode for new symbolic link. 3288 * 3289 * RETURN: 0 on success, error code on failure. 3290 * 3291 * Timestamps: 3292 * dzp - ctime|mtime updated 3293 */ 3294 int 3295 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3296 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) 3297 { 3298 znode_t *zp; 3299 zfs_dirlock_t *dl; 3300 dmu_tx_t *tx; 3301 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3302 zilog_t *zilog; 3303 uint64_t len = strlen(link); 3304 int error; 3305 int zflg = ZNEW; 3306 zfs_acl_ids_t acl_ids; 3307 boolean_t fuid_dirtied; 3308 uint64_t txtype = TX_SYMLINK; 3309 boolean_t waited = B_FALSE; 3310 3311 ASSERT(S_ISLNK(vap->va_mode)); 3312 3313 if (name == NULL) 3314 return (SET_ERROR(EINVAL)); 3315 3316 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3317 return (error); 3318 zilog = zfsvfs->z_log; 3319 3320 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3321 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3322 zfs_exit(zfsvfs, FTAG); 3323 return (SET_ERROR(EILSEQ)); 3324 } 3325 if (flags & FIGNORECASE) 3326 zflg |= ZCILOOK; 3327 3328 if (len > MAXPATHLEN) { 3329 zfs_exit(zfsvfs, FTAG); 3330 return (SET_ERROR(ENAMETOOLONG)); 3331 } 3332 3333 if ((error = zfs_acl_ids_create(dzp, 0, 3334 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3335 zfs_exit(zfsvfs, FTAG); 3336 return (error); 3337 } 3338 top: 3339 *zpp = NULL; 3340 3341 /* 3342 * Attempt to lock directory; fail if entry already exists. 3343 */ 3344 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3345 if (error) { 3346 zfs_acl_ids_free(&acl_ids); 3347 zfs_exit(zfsvfs, FTAG); 3348 return (error); 3349 } 3350 3351 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3352 zfs_acl_ids_free(&acl_ids); 3353 zfs_dirent_unlock(dl); 3354 zfs_exit(zfsvfs, FTAG); 3355 return (error); 3356 } 3357 3358 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3359 zfs_acl_ids_free(&acl_ids); 3360 zfs_dirent_unlock(dl); 3361 zfs_exit(zfsvfs, FTAG); 3362 return (SET_ERROR(EDQUOT)); 3363 } 3364 tx = dmu_tx_create(zfsvfs->z_os); 3365 fuid_dirtied = zfsvfs->z_fuid_dirty; 3366 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3367 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3368 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3369 ZFS_SA_BASE_ATTR_SIZE + len); 3370 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3371 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3372 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3373 acl_ids.z_aclp->z_acl_bytes); 3374 } 3375 if (fuid_dirtied) 3376 zfs_fuid_txhold(zfsvfs, tx); 3377 error = dmu_tx_assign(tx, (waited ?
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3378 if (error) { 3379 zfs_dirent_unlock(dl); 3380 if (error == ERESTART) { 3381 waited = B_TRUE; 3382 dmu_tx_wait(tx); 3383 dmu_tx_abort(tx); 3384 goto top; 3385 } 3386 zfs_acl_ids_free(&acl_ids); 3387 dmu_tx_abort(tx); 3388 zfs_exit(zfsvfs, FTAG); 3389 return (error); 3390 } 3391 3392 /* 3393 * Create a new object for the symlink. 3394 * for version 4 ZPL datasets the symlink will be an SA attribute 3395 */ 3396 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3397 3398 if (fuid_dirtied) 3399 zfs_fuid_sync(zfsvfs, tx); 3400 3401 mutex_enter(&zp->z_lock); 3402 if (zp->z_is_sa) 3403 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3404 link, len, tx); 3405 else 3406 zfs_sa_symlink(zp, link, len, tx); 3407 mutex_exit(&zp->z_lock); 3408 3409 zp->z_size = len; 3410 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3411 &zp->z_size, sizeof (zp->z_size), tx); 3412 /* 3413 * Insert the new object into the directory. 3414 */ 3415 error = zfs_link_create(dl, zp, tx, ZNEW); 3416 if (error != 0) { 3417 zfs_znode_delete(zp, tx); 3418 remove_inode_hash(ZTOI(zp)); 3419 } else { 3420 if (flags & FIGNORECASE) 3421 txtype |= TX_CI; 3422 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3423 3424 zfs_znode_update_vfs(dzp); 3425 zfs_znode_update_vfs(zp); 3426 } 3427 3428 zfs_acl_ids_free(&acl_ids); 3429 3430 dmu_tx_commit(tx); 3431 3432 zfs_dirent_unlock(dl); 3433 3434 if (error == 0) { 3435 *zpp = zp; 3436 3437 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3438 zil_commit(zilog, 0); 3439 } else { 3440 zrele(zp); 3441 } 3442 3443 zfs_exit(zfsvfs, FTAG); 3444 return (error); 3445 } 3446 3447 /* 3448 * Return, in the buffer contained in the provided uio structure, 3449 * the symbolic path referred to by ip. 3450 * 3451 * IN: ip - inode of symbolic link 3452 * uio - structure to contain the link path. 3453 * cr - credentials of caller. 3454 * 3455 * RETURN: 0 if success 3456 * error code if failure 3457 * 3458 * Timestamps: 3459 * ip - atime updated 3460 */ 3461 int 3462 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) 3463 { 3464 (void) cr; 3465 znode_t *zp = ITOZ(ip); 3466 zfsvfs_t *zfsvfs = ITOZSB(ip); 3467 int error; 3468 3469 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3470 return (error); 3471 3472 mutex_enter(&zp->z_lock); 3473 if (zp->z_is_sa) 3474 error = sa_lookup_uio(zp->z_sa_hdl, 3475 SA_ZPL_SYMLINK(zfsvfs), uio); 3476 else 3477 error = zfs_sa_readlink(zp, uio); 3478 mutex_exit(&zp->z_lock); 3479 3480 zfs_exit(zfsvfs, FTAG); 3481 return (error); 3482 } 3483 3484 /* 3485 * Insert a new entry into directory tdzp referencing szp. 3486 * 3487 * IN: tdzp - Directory to contain new entry. 3488 * szp - znode of new entry. 3489 * name - name of new entry. 3490 * cr - credentials of caller. 3491 * flags - case flags. 
3492 * 3493 * RETURN: 0 if success 3494 * error code if failure 3495 * 3496 * Timestamps: 3497 * tdzp - ctime|mtime updated 3498 * szp - ctime updated 3499 */ 3500 int 3501 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, 3502 int flags) 3503 { 3504 struct inode *sip = ZTOI(szp); 3505 znode_t *tzp; 3506 zfsvfs_t *zfsvfs = ZTOZSB(tdzp); 3507 zilog_t *zilog; 3508 zfs_dirlock_t *dl; 3509 dmu_tx_t *tx; 3510 int error; 3511 int zf = ZNEW; 3512 uint64_t parent; 3513 uid_t owner; 3514 boolean_t waited = B_FALSE; 3515 boolean_t is_tmpfile = 0; 3516 uint64_t txg; 3517 3518 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); 3519 3520 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); 3521 3522 if (name == NULL) 3523 return (SET_ERROR(EINVAL)); 3524 3525 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) 3526 return (error); 3527 zilog = zfsvfs->z_log; 3528 3529 /* 3530 * POSIX dictates that we return EPERM here. 3531 * Better choices include ENOTSUP or EISDIR. 3532 */ 3533 if (S_ISDIR(sip->i_mode)) { 3534 zfs_exit(zfsvfs, FTAG); 3535 return (SET_ERROR(EPERM)); 3536 } 3537 3538 if ((error = zfs_verify_zp(szp)) != 0) { 3539 zfs_exit(zfsvfs, FTAG); 3540 return (error); 3541 } 3542 3543 /* 3544 * Project inheritance means that if the directory has ZFS_PROJINHERIT 3545 * set, then its descendant directories will inherit not only the 3546 * project ID, but also the ZFS_PROJINHERIT flag. In that case, we 3547 * only allow hard link creation in our tree when the project IDs are 3548 * the same. 3549 */ 3550 if (tdzp->z_pflags & ZFS_PROJINHERIT && 3551 tdzp->z_projid != szp->z_projid) { 3552 zfs_exit(zfsvfs, FTAG); 3553 return (SET_ERROR(EXDEV)); 3554 } 3555 3556 /* 3557 * We check i_sb because snapshots and the ctldir must have different 3558 * super blocks. 3559 */ 3560 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { 3561 zfs_exit(zfsvfs, FTAG); 3562 return (SET_ERROR(EXDEV)); 3563 } 3564 3565 /* Prevent links to .zfs/shares files */ 3566 3567 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 3568 &parent, sizeof (uint64_t))) != 0) { 3569 zfs_exit(zfsvfs, FTAG); 3570 return (error); 3571 } 3572 if (parent == zfsvfs->z_shares_dir) { 3573 zfs_exit(zfsvfs, FTAG); 3574 return (SET_ERROR(EPERM)); 3575 } 3576 3577 if (zfsvfs->z_utf8 && u8_validate(name, 3578 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3579 zfs_exit(zfsvfs, FTAG); 3580 return (SET_ERROR(EILSEQ)); 3581 } 3582 if (flags & FIGNORECASE) 3583 zf |= ZCILOOK; 3584 3585 /* 3586 * We do not support links between attributes and non-attributes 3587 * because of the potential security risk of creating links 3588 * into "normal" file space in order to circumvent restrictions 3589 * imposed in attribute space. 3590 */ 3591 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { 3592 zfs_exit(zfsvfs, FTAG); 3593 return (SET_ERROR(EINVAL)); 3594 } 3595 3596 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), 3597 cr, ZFS_OWNER); 3598 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 3599 zfs_exit(zfsvfs, FTAG); 3600 return (SET_ERROR(EPERM)); 3601 } 3602 3603 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, 3604 zfs_init_idmap))) { 3605 zfs_exit(zfsvfs, FTAG); 3606 return (error); 3607 } 3608 3609 top: 3610 /* 3611 * Attempt to lock directory; fail if entry already exists.
3612 */ 3613 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); 3614 if (error) { 3615 zfs_exit(zfsvfs, FTAG); 3616 return (error); 3617 } 3618 3619 tx = dmu_tx_create(zfsvfs->z_os); 3620 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3621 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); 3622 if (is_tmpfile) 3623 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3624 3625 zfs_sa_upgrade_txholds(tx, szp); 3626 zfs_sa_upgrade_txholds(tx, tdzp); 3627 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3628 if (error) { 3629 zfs_dirent_unlock(dl); 3630 if (error == ERESTART) { 3631 waited = B_TRUE; 3632 dmu_tx_wait(tx); 3633 dmu_tx_abort(tx); 3634 goto top; 3635 } 3636 dmu_tx_abort(tx); 3637 zfs_exit(zfsvfs, FTAG); 3638 return (error); 3639 } 3640 /* unmark z_unlinked so zfs_link_create will not reject */ 3641 if (is_tmpfile) 3642 szp->z_unlinked = B_FALSE; 3643 error = zfs_link_create(dl, szp, tx, 0); 3644 3645 if (error == 0) { 3646 uint64_t txtype = TX_LINK; 3647 /* 3648 * tmpfile is created to be in z_unlinkedobj, so remove it. 3649 * Also, we don't log in ZIL, because all previous file 3650 * operation on the tmpfile are ignored by ZIL. Instead we 3651 * always wait for txg to sync to make sure all previous 3652 * operation are sync safe. 3653 */ 3654 if (is_tmpfile) { 3655 VERIFY(zap_remove_int(zfsvfs->z_os, 3656 zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); 3657 } else { 3658 if (flags & FIGNORECASE) 3659 txtype |= TX_CI; 3660 zfs_log_link(zilog, tx, txtype, tdzp, szp, name); 3661 } 3662 } else if (is_tmpfile) { 3663 /* restore z_unlinked since when linking failed */ 3664 szp->z_unlinked = B_TRUE; 3665 } 3666 txg = dmu_tx_get_txg(tx); 3667 dmu_tx_commit(tx); 3668 3669 zfs_dirent_unlock(dl); 3670 3671 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3672 zil_commit(zilog, 0); 3673 3674 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) 3675 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); 3676 3677 zfs_znode_update_vfs(tdzp); 3678 zfs_znode_update_vfs(szp); 3679 zfs_exit(zfsvfs, FTAG); 3680 return (error); 3681 } 3682 3683 static void 3684 zfs_putpage_sync_commit_cb(void *arg) 3685 { 3686 struct page *pp = arg; 3687 3688 ClearPageError(pp); 3689 end_page_writeback(pp); 3690 } 3691 3692 static void 3693 zfs_putpage_async_commit_cb(void *arg) 3694 { 3695 struct page *pp = arg; 3696 znode_t *zp = ITOZ(pp->mapping->host); 3697 3698 ClearPageError(pp); 3699 end_page_writeback(pp); 3700 atomic_dec_32(&zp->z_async_writes_cnt); 3701 } 3702 3703 /* 3704 * Push a page out to disk, once the page is on stable storage the 3705 * registered commit callback will be run as notification of completion. 3706 * 3707 * IN: ip - page mapped for inode. 3708 * pp - page to push (page is locked) 3709 * wbc - writeback control data 3710 * for_sync - does the caller intend to wait synchronously for the 3711 * page writeback to complete? 
3712 * 3713 * RETURN: 0 if success 3714 * error code if failure 3715 * 3716 * Timestamps: 3717 * ip - ctime|mtime updated 3718 */ 3719 int 3720 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, 3721 boolean_t for_sync) 3722 { 3723 znode_t *zp = ITOZ(ip); 3724 zfsvfs_t *zfsvfs = ITOZSB(ip); 3725 loff_t offset; 3726 loff_t pgoff; 3727 unsigned int pglen; 3728 dmu_tx_t *tx; 3729 caddr_t va; 3730 int err = 0; 3731 uint64_t mtime[2], ctime[2]; 3732 inode_timespec_t tmp_ts; 3733 sa_bulk_attr_t bulk[3]; 3734 int cnt = 0; 3735 struct address_space *mapping; 3736 3737 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3738 return (err); 3739 3740 ASSERT(PageLocked(pp)); 3741 3742 pgoff = page_offset(pp); /* Page byte-offset in file */ 3743 offset = i_size_read(ip); /* File length in bytes */ 3744 pglen = MIN(PAGE_SIZE, /* Page length in bytes */ 3745 P2ROUNDUP(offset, PAGE_SIZE)-pgoff); 3746 3747 /* Page is beyond end of file */ 3748 if (pgoff >= offset) { 3749 unlock_page(pp); 3750 zfs_exit(zfsvfs, FTAG); 3751 return (0); 3752 } 3753 3754 /* Truncate page length to end of file */ 3755 if (pgoff + pglen > offset) 3756 pglen = offset - pgoff; 3757 3758 #if 0 3759 /* 3760 * FIXME: Allow mmap writes past its quota. The correct fix 3761 * is to register a page_mkwrite() handler to count the page 3762 * against its quota when it is about to be dirtied. 3763 */ 3764 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 3765 KUID_TO_SUID(ip->i_uid)) || 3766 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3767 KGID_TO_SGID(ip->i_gid)) || 3768 (zp->z_projid != ZFS_DEFAULT_PROJID && 3769 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 3770 zp->z_projid))) { 3771 err = EDQUOT; 3772 } 3773 #endif 3774 3775 /* 3776 * The ordering here is critical and must adhere to the following 3777 * rules in order to avoid deadlocking in either zfs_read() or 3778 * zfs_free_range() due to a lock inversion. 3779 * 3780 * 1) The page must be unlocked prior to acquiring the range lock. 3781 * This is critical because zfs_read() calls find_lock_page() 3782 * which may block on the page lock while holding the range lock. 3783 * 3784 * 2) Before setting or clearing write back on a page the range lock 3785 * must be held in order to prevent a lock inversion with the 3786 * zfs_free_range() function. 3787 * 3788 * This presents a problem because upon entering this function the 3789 * page lock is already held. To safely acquire the range lock the 3790 * page lock must be dropped. This creates a window where another 3791 * process could truncate, invalidate, dirty, or write out the page. 3792 * 3793 * Therefore, after successfully reacquiring the range and page locks 3794 * the current page state is checked. In the common case everything 3795 * will be as is expected and it can be written out. However, if 3796 * the page state has changed it must be handled accordingly. 
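 *
 * Condensed, the sequence below is (a descriptive sketch only):
 *
 *	redirty_page_for_writepage(wbc, pp);
 *	unlock_page(pp);			// rule (1)
 *	lr = zfs_rangelock_enter(...);		// rule (2)
 *	lock_page(pp);
 *	recheck mapping, PageDirty() and PageWriteback() before writing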
3797 */ 3798 mapping = pp->mapping; 3799 redirty_page_for_writepage(wbc, pp); 3800 unlock_page(pp); 3801 3802 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 3803 pgoff, pglen, RL_WRITER); 3804 lock_page(pp); 3805 3806 /* Page mapping changed or it was no longer dirty, we're done */ 3807 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { 3808 unlock_page(pp); 3809 zfs_rangelock_exit(lr); 3810 zfs_exit(zfsvfs, FTAG); 3811 return (0); 3812 } 3813 3814 /* Another process started writeback; block if required */ 3815 if (PageWriteback(pp)) { 3816 unlock_page(pp); 3817 zfs_rangelock_exit(lr); 3818 3819 if (wbc->sync_mode != WB_SYNC_NONE) { 3820 /* 3821 * Speed up any non-sync page writebacks since 3822 * they may take several seconds to complete. 3823 * Refer to the comment in zpl_fsync() for details. 3824 */ 3825 if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { 3826 zil_commit(zfsvfs->z_log, zp->z_id); 3827 } 3828 3829 if (PageWriteback(pp)) 3830 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT 3831 folio_wait_bit(page_folio(pp), PG_writeback); 3832 #else 3833 wait_on_page_bit(pp, PG_writeback); 3834 #endif 3835 } 3836 3837 zfs_exit(zfsvfs, FTAG); 3838 return (0); 3839 } 3840 3841 /* Clear the dirty flag now that the required locks are held */ 3842 if (!clear_page_dirty_for_io(pp)) { 3843 unlock_page(pp); 3844 zfs_rangelock_exit(lr); 3845 zfs_exit(zfsvfs, FTAG); 3846 return (0); 3847 } 3848 3849 /* 3850 * Counterpart for redirty_page_for_writepage() above. This page 3851 * was in fact not skipped and should not be counted as if it were. 3852 */ 3853 wbc->pages_skipped--; 3854 if (!for_sync) 3855 atomic_inc_32(&zp->z_async_writes_cnt); 3856 set_page_writeback(pp); 3857 unlock_page(pp); 3858 3859 tx = dmu_tx_create(zfsvfs->z_os); 3860 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); 3861 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3862 zfs_sa_upgrade_txholds(tx, zp); 3863 3864 err = dmu_tx_assign(tx, TXG_WAIT); 3865 if (err != 0) { 3866 dmu_tx_abort(tx); 3867 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO 3868 filemap_dirty_folio(page_mapping(pp), page_folio(pp)); 3869 #else 3870 __set_page_dirty_nobuffers(pp); 3871 #endif 3872 ClearPageError(pp); 3873 end_page_writeback(pp); 3874 if (!for_sync) 3875 atomic_dec_32(&zp->z_async_writes_cnt); 3876 zfs_rangelock_exit(lr); 3877 zfs_exit(zfsvfs, FTAG); 3878 return (err); 3879 } 3880 3881 va = kmap(pp); 3882 ASSERT3U(pglen, <=, PAGE_SIZE); 3883 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); 3884 kunmap(pp); 3885 3886 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3887 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3888 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, 3889 &zp->z_pflags, 8); 3890 3891 /* Preserve the mtime and ctime provided by the inode */ 3892 tmp_ts = zpl_inode_get_mtime(ip); 3893 ZFS_TIME_ENCODE(&tmp_ts, mtime); 3894 tmp_ts = zpl_inode_get_ctime(ip); 3895 ZFS_TIME_ENCODE(&tmp_ts, ctime); 3896 zp->z_atime_dirty = B_FALSE; 3897 zp->z_seq++; 3898 3899 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3900 3901 boolean_t commit = B_FALSE; 3902 if (wbc->sync_mode != WB_SYNC_NONE) { 3903 /* 3904 * Note that this is rarely called under writepages(), because 3905 * writepages() normally handles the entire commit for 3906 * performance reasons.
3907 */ 3908 commit = B_TRUE; 3909 } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { 3910 /* 3911 * If the caller does not intend to wait synchronously 3912 * for this page writeback to complete and there are active 3913 * synchronous calls on this file, do a commit so that 3914 * the latter don't accidentally end up waiting for 3915 * our writeback to complete. Refer to the comment in 3916 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. 3917 */ 3918 commit = B_TRUE; 3919 } 3920 3921 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, 3922 B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : 3923 zfs_putpage_async_commit_cb, pp); 3924 3925 dmu_tx_commit(tx); 3926 3927 zfs_rangelock_exit(lr); 3928 3929 if (commit) 3930 zil_commit(zfsvfs->z_log, zp->z_id); 3931 3932 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); 3933 3934 zfs_exit(zfsvfs, FTAG); 3935 return (err); 3936 } 3937 3938 /* 3939 * Update the system attributes when the inode has been dirtied. For the 3940 * moment we only update the mode, atime, mtime, and ctime. 3941 */ 3942 int 3943 zfs_dirty_inode(struct inode *ip, int flags) 3944 { 3945 znode_t *zp = ITOZ(ip); 3946 zfsvfs_t *zfsvfs = ITOZSB(ip); 3947 dmu_tx_t *tx; 3948 uint64_t mode, atime[2], mtime[2], ctime[2]; 3949 inode_timespec_t tmp_ts; 3950 sa_bulk_attr_t bulk[4]; 3951 int error = 0; 3952 int cnt = 0; 3953 3954 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 3955 return (0); 3956 3957 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3958 return (error); 3959 3960 #ifdef I_DIRTY_TIME 3961 /* 3962 * This is the lazytime semantic introduced in Linux 4.0. 3963 * This path is only taken from update_time() when lazytime is set. 3964 * (Note, I_DIRTY_SYNC will also be set if not lazytime.) 3965 * Fortunately mtime and ctime are managed within ZFS itself, so we 3966 * only need to dirty atime. 3967 */ 3968 if (flags == I_DIRTY_TIME) { 3969 zp->z_atime_dirty = B_TRUE; 3970 goto out; 3971 } 3972 #endif 3973 3974 tx = dmu_tx_create(zfsvfs->z_os); 3975 3976 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3977 zfs_sa_upgrade_txholds(tx, zp); 3978 3979 error = dmu_tx_assign(tx, TXG_WAIT); 3980 if (error) { 3981 dmu_tx_abort(tx); 3982 goto out; 3983 } 3984 3985 mutex_enter(&zp->z_lock); 3986 zp->z_atime_dirty = B_FALSE; 3987 3988 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 3989 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 3990 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3991 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3992 3993 /* Preserve the mode, atime, mtime, and ctime provided by the inode */ 3994 tmp_ts = zpl_inode_get_atime(ip); 3995 ZFS_TIME_ENCODE(&tmp_ts, atime); 3996 tmp_ts = zpl_inode_get_mtime(ip); 3997 ZFS_TIME_ENCODE(&tmp_ts, mtime); 3998 tmp_ts = zpl_inode_get_ctime(ip); 3999 ZFS_TIME_ENCODE(&tmp_ts, ctime); 4000 mode = ip->i_mode; 4001 4002 zp->z_mode = mode; 4003 4004 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 4005 mutex_exit(&zp->z_lock); 4006 4007 dmu_tx_commit(tx); 4008 out: 4009 zfs_exit(zfsvfs, FTAG); 4010 return (error); 4011 } 4012 4013 void 4014 zfs_inactive(struct inode *ip) 4015 { 4016 znode_t *zp = ITOZ(ip); 4017 zfsvfs_t *zfsvfs = ITOZSB(ip); 4018 uint64_t atime[2]; 4019 int error; 4020 int need_unlock = 0; 4021 4022 /* Only read lock if we haven't already write locked, e.g.
rollback */ 4023 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 4024 need_unlock = 1; 4025 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4026 } 4027 if (zp->z_sa_hdl == NULL) { 4028 if (need_unlock) 4029 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4030 return; 4031 } 4032 4033 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 4034 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4035 4036 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4037 zfs_sa_upgrade_txholds(tx, zp); 4038 error = dmu_tx_assign(tx, TXG_WAIT); 4039 if (error) { 4040 dmu_tx_abort(tx); 4041 } else { 4042 inode_timespec_t tmp_atime; 4043 tmp_atime = zpl_inode_get_atime(ip); 4044 ZFS_TIME_ENCODE(&tmp_atime, atime); 4045 mutex_enter(&zp->z_lock); 4046 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4047 (void *)&atime, sizeof (atime), tx); 4048 zp->z_atime_dirty = B_FALSE; 4049 mutex_exit(&zp->z_lock); 4050 dmu_tx_commit(tx); 4051 } 4052 } 4053 4054 zfs_zinactive(zp); 4055 if (need_unlock) 4056 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4057 } 4058 4059 /* 4060 * Fill pages with data from the disk. 4061 */ 4062 static int 4063 zfs_fillpage(struct inode *ip, struct page *pp) 4064 { 4065 znode_t *zp = ITOZ(ip); 4066 zfsvfs_t *zfsvfs = ITOZSB(ip); 4067 loff_t i_size = i_size_read(ip); 4068 u_offset_t io_off = page_offset(pp); 4069 size_t io_len = PAGE_SIZE; 4070 4071 ASSERT3U(io_off, <, i_size); 4072 4073 if (io_off + io_len > i_size) 4074 io_len = i_size - io_off; 4075 4076 void *va = kmap(pp); 4077 int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, 4078 io_len, va, DMU_READ_PREFETCH); 4079 if (io_len != PAGE_SIZE) 4080 memset((char *)va + io_len, 0, PAGE_SIZE - io_len); 4081 kunmap(pp); 4082 4083 if (error) { 4084 /* convert checksum errors into IO errors */ 4085 if (error == ECKSUM) 4086 error = SET_ERROR(EIO); 4087 4088 SetPageError(pp); 4089 ClearPageUptodate(pp); 4090 } else { 4091 ClearPageError(pp); 4092 SetPageUptodate(pp); 4093 if (!PagePrivate(pp)) { 4094 /* 4095 * Set private bit so page migration will wait for us to 4096 * finish writeback before calling migrate_folio(). 4097 */ 4098 SetPagePrivate(pp); 4099 get_page(pp); 4100 } 4101 } 4102 4103 return (error); 4104 } 4105 4106 /* 4107 * Uses zfs_fillpage() to read data from the file and fill the page. 4108 * 4109 * IN: ip - inode of file to get data from. 4110 * pp - page to read 4111 * 4112 * RETURN: 0 on success, error code on failure. 4113 * 4114 * Timestamps: 4115 * ip - atime updated 4116 */ 4117 int 4118 zfs_getpage(struct inode *ip, struct page *pp) 4119 { 4120 zfsvfs_t *zfsvfs = ITOZSB(ip); 4121 znode_t *zp = ITOZ(ip); 4122 int error; 4123 loff_t i_size = i_size_read(ip); 4124 u_offset_t io_off = page_offset(pp); 4125 size_t io_len = PAGE_SIZE; 4126 4127 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4128 return (error); 4129 4130 ASSERT3U(io_off, <, i_size); 4131 4132 if (io_off + io_len > i_size) 4133 io_len = i_size - io_off; 4134 4135 /* 4136 * It is important to hold the rangelock here because it is possible 4137 * a Direct I/O write or block clone might be taking place at the same 4138 * time that a page is being faulted in through filemap_fault(). With 4139 * Direct I/O writes and block cloning db->db_data will be set to NULL 4140 * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the 4141 * rangelock is not held, then there is a race between faulting in a 4142 * page and writing out a Direct I/O write or block cloning.
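(For example, filemap_fault() bringing a page in while zfs_clone_range() or an O_DIRECT write invalidates the same block's dbuf.)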

/*
 * Uses zfs_fillpage() to read data from the file and fill the page.
 *
 * IN:	ip	- inode of file to get data from.
 *	pp	- page to read
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	ip - atime updated
 */
int
zfs_getpage(struct inode *ip, struct page *pp)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	znode_t *zp = ITOZ(ip);
	int error;
	loff_t i_size = i_size_read(ip);
	u_offset_t io_off = page_offset(pp);
	size_t io_len = PAGE_SIZE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	ASSERT3U(io_off, <, i_size);

	if (io_off + io_len > i_size)
		io_len = i_size - io_off;

	/*
	 * It is important to hold the rangelock here because it is possible
	 * a Direct I/O write or block clone might be taking place at the same
	 * time that a page is being faulted in through filemap_fault(). With
	 * Direct I/O writes and block cloning db->db_data will be set to NULL
	 * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
	 * rangelock is not held, then there is a race between faulting in a
	 * page and writing out a Direct I/O write or block clone. Without
	 * the rangelock a NULL pointer dereference can occur in
	 * dmu_read_impl() for db->db_data during the memcpy operation when
	 * zfs_fillpage() calls dmu_read().
	 */
	zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
	    io_off, io_len, RL_READER);
	if (lr == NULL) {
		/*
		 * It is important to drop the page lock before grabbing the
		 * rangelock to avoid another deadlock between here and
		 * zfs_write() -> update_pages(). update_pages() holds both the
		 * rangelock and the page lock.
		 */
		get_page(pp);
		unlock_page(pp);
		lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
		    io_len, RL_READER);
		lock_page(pp);
		put_page(pp);
	}
	error = zfs_fillpage(ip, pp);
	zfs_rangelock_exit(lr);

	if (error == 0)
		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
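
/*
 * The lock-ordering dance above, schematically (a simplified sketch of
 * the deadlock being avoided, not a trace of actual threads):
 *
 *	Thread A (page fault)		Thread B (zfs_write)
 *	---------------------		--------------------
 *	holds page lock			holds rangelock
 *	wants rangelock (RL_READER)	wants page lock in update_pages()
 *
 * If A blocked on the rangelock while still holding the page lock, A
 * and B would wait on each other forever. Instead, A first attempts a
 * non-blocking zfs_rangelock_tryenter(); on failure it drops the page
 * lock (pinning the page with get_page() so it cannot be freed), blocks
 * on the rangelock, and only then re-takes the page lock.
 */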
4257 */ 4258 if (zfs_is_readonly(zfsvfs)) { 4259 zfs_exit(zfsvfs, FTAG); 4260 return (SET_ERROR(EROFS)); 4261 } 4262 4263 if (bfp->l_len < 0) { 4264 zfs_exit(zfsvfs, FTAG); 4265 return (SET_ERROR(EINVAL)); 4266 } 4267 4268 /* 4269 * Permissions aren't checked on Solaris because on this OS 4270 * zfs_space() can only be called with an opened file handle. 4271 * On Linux we can get here through truncate_range() which 4272 * operates directly on inodes, so we need to check access rights. 4273 */ 4274 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4275 zfs_init_idmap))) { 4276 zfs_exit(zfsvfs, FTAG); 4277 return (error); 4278 } 4279 4280 off = bfp->l_start; 4281 len = bfp->l_len; /* 0 means from off to end of file */ 4282 4283 error = zfs_freesp(zp, off, len, flag, TRUE); 4284 4285 zfs_exit(zfsvfs, FTAG); 4286 return (error); 4287 } 4288 4289 int 4290 zfs_fid(struct inode *ip, fid_t *fidp) 4291 { 4292 znode_t *zp = ITOZ(ip); 4293 zfsvfs_t *zfsvfs = ITOZSB(ip); 4294 uint32_t gen; 4295 uint64_t gen64; 4296 uint64_t object = zp->z_id; 4297 zfid_short_t *zfid; 4298 int size, i, error; 4299 4300 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4301 return (error); 4302 4303 if (fidp->fid_len < SHORT_FID_LEN) { 4304 fidp->fid_len = SHORT_FID_LEN; 4305 zfs_exit(zfsvfs, FTAG); 4306 return (SET_ERROR(ENOSPC)); 4307 } 4308 4309 if ((error = zfs_verify_zp(zp)) != 0) { 4310 zfs_exit(zfsvfs, FTAG); 4311 return (error); 4312 } 4313 4314 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4315 &gen64, sizeof (uint64_t))) != 0) { 4316 zfs_exit(zfsvfs, FTAG); 4317 return (error); 4318 } 4319 4320 gen = (uint32_t)gen64; 4321 4322 size = SHORT_FID_LEN; 4323 4324 zfid = (zfid_short_t *)fidp; 4325 4326 zfid->zf_len = size; 4327 4328 for (i = 0; i < sizeof (zfid->zf_object); i++) 4329 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4330 4331 /* Must have a non-zero generation number to distinguish from .zfs */ 4332 if (gen == 0) 4333 gen = 1; 4334 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4335 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4336 4337 zfs_exit(zfsvfs, FTAG); 4338 return (0); 4339 } 4340 4341 #if defined(_KERNEL) 4342 EXPORT_SYMBOL(zfs_open); 4343 EXPORT_SYMBOL(zfs_close); 4344 EXPORT_SYMBOL(zfs_lookup); 4345 EXPORT_SYMBOL(zfs_create); 4346 EXPORT_SYMBOL(zfs_tmpfile); 4347 EXPORT_SYMBOL(zfs_remove); 4348 EXPORT_SYMBOL(zfs_mkdir); 4349 EXPORT_SYMBOL(zfs_rmdir); 4350 EXPORT_SYMBOL(zfs_readdir); 4351 EXPORT_SYMBOL(zfs_getattr_fast); 4352 EXPORT_SYMBOL(zfs_setattr); 4353 EXPORT_SYMBOL(zfs_rename); 4354 EXPORT_SYMBOL(zfs_symlink); 4355 EXPORT_SYMBOL(zfs_readlink); 4356 EXPORT_SYMBOL(zfs_link); 4357 EXPORT_SYMBOL(zfs_inactive); 4358 EXPORT_SYMBOL(zfs_space); 4359 EXPORT_SYMBOL(zfs_fid); 4360 EXPORT_SYMBOL(zfs_getpage); 4361 EXPORT_SYMBOL(zfs_putpage); 4362 EXPORT_SYMBOL(zfs_dirty_inode); 4363 EXPORT_SYMBOL(zfs_map); 4364 4365 /* CSTYLED */ 4366 module_param(zfs_delete_blocks, ulong, 0644); 4367 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4368 #endif 4369