1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29 /* Portions Copyright 2007 Jeremy Teo */ 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/kmem.h> 41 #include <sys/taskq.h> 42 #include <sys/uio.h> 43 #include <sys/vmsystm.h> 44 #include <sys/atomic.h> 45 #include <sys/pathname.h> 46 #include <sys/cmn_err.h> 47 #include <sys/errno.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zfs_acl.h> 50 #include <sys/zfs_ioctl.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_objset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/policy.h> 60 #include <sys/sunddi.h> 61 #include <sys/sid.h> 62 #include <sys/zfs_ctldir.h> 63 #include <sys/zfs_fuid.h> 64 #include <sys/zfs_quota.h> 65 #include <sys/zfs_sa.h> 66 #include <sys/zfs_vnops.h> 67 #include <sys/zfs_rlock.h> 68 #include <sys/cred.h> 69 #include <sys/zpl.h> 70 #include <sys/zil.h> 71 #include <sys/sa_impl.h> 72 73 /* 74 * Programming rules. 75 * 76 * Each vnode op performs some logical unit of work. To do this, the ZPL must 77 * properly lock its in-core state, create a DMU transaction, do the work, 78 * record this work in the intent log (ZIL), commit the DMU transaction, 79 * and wait for the intent log to commit if it is a synchronous operation. 80 * Moreover, the vnode ops must work in both normal and log replay context. 81 * The ordering of events is important to avoid deadlocks and references 82 * to freed memory. The example below illustrates the following Big Rules: 83 * 84 * (1) A check must be made in each zfs thread for a mounted file system. 85 * This is done avoiding races using zfs_enter(zfsvfs). 86 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes 87 * must be checked with zfs_verify_zp(zp). 
Both of these macros 88 * can return EIO from the calling function. 89 * 90 * (2) zrele() should always be the last thing except for zil_commit() (if 91 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 92 * last reference, the vnode/znode can be freed, so the zp may point to 93 * freed memory. Second, the last reference will call zfs_zinactive(), 94 * which may induce a lot of work -- pushing cached pages (which acquires 95 * range locks) and syncing out cached atime changes. Third, 96 * zfs_zinactive() may require a new tx, which could deadlock the system 97 * if you were already holding one. This deadlock occurs because the tx 98 * currently being operated on prevents a txg from syncing, which 99 * prevents the new tx from progressing, resulting in a deadlock. If you 100 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 101 * is a synonym for zrele(). 102 * 103 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 104 * as they can span dmu_tx_assign() calls. 105 * 106 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 107 * dmu_tx_assign(). This is critical because we don't want to block 108 * while holding locks. 109 * 110 * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This 111 * reduces lock contention and CPU usage when we must wait (note that if 112 * throughput is constrained by the storage, nearly every transaction 113 * must wait). 114 * 115 * Note, in particular, that if a lock is sometimes acquired before 116 * the tx assigns, and sometimes after (e.g. z_lock), then failing 117 * to use a non-blocking assign can deadlock the system. The scenario: 118 * 119 * Thread A has grabbed a lock before calling dmu_tx_assign(). 120 * Thread B is in an already-assigned tx, and blocks for this lock. 121 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 122 * forever, because the previous txg can't quiesce until B's tx commits. 
123 * 124 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 125 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 126 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 127 * to indicate that this operation has already called dmu_tx_wait(). 128 * This will ensure that we don't retry forever, waiting a short bit 129 * each time. 130 * 131 * (5) If the operation succeeded, generate the intent log entry for it 132 * before dropping locks. This ensures that the ordering of events 133 * in the intent log matches the order in which they actually occurred. 134 * During ZIL replay the zfs_log_* functions will update the sequence 135 * number to indicate the zil transaction has replayed. 136 * 137 * (6) At the end of each vnode op, the DMU tx must always commit, 138 * regardless of whether there were any errors. 139 * 140 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 141 * to ensure that synchronous semantics are provided when necessary. 142 * 143 * In general, this is how things should be ordered in each vnode op: 144 * 145 * zfs_enter(zfsvfs); // exit if unmounted 146 * top: 147 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 148 * rw_enter(...); // grab any other locks you need 149 * tx = dmu_tx_create(...); // get DMU tx 150 * dmu_tx_hold_*(); // hold each object you might modify 151 * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 152 * if (error) { 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * zrele(...); // release held znodes 156 * if (error == ERESTART) { 157 * waited = B_TRUE; 158 * dmu_tx_wait(tx); 159 * dmu_tx_abort(tx); 160 * goto top; 161 * } 162 * dmu_tx_abort(tx); // abort DMU tx 163 * zfs_exit(zfsvfs); // finished in zfs 164 * return (error); // really out of space 165 * } 166 * error = do_real_work(); // do whatever this VOP does 167 * if (error == 0) 168 * zfs_log_*(...); // on success, make ZIL entry 169 * dmu_tx_commit(tx); // commit DMU tx -- error or not 170 * rw_exit(...); // drop locks 171 * zfs_dirent_unlock(dl); // unlock directory entry 172 * zrele(...); // release held znodes 173 * zil_commit(zilog, foid); // synchronous when necessary 174 * zfs_exit(zfsvfs); // finished in zfs 175 * return (error); // done, report error 176 */ 177 int 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) 179 { 180 (void) cr; 181 znode_t *zp = ITOZ(ip); 182 zfsvfs_t *zfsvfs = ITOZSB(ip); 183 int error; 184 185 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 186 return (error); 187 188 /* Honor ZFS_APPENDONLY file attribute */ 189 if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && 190 ((flag & O_APPEND) == 0)) { 191 zfs_exit(zfsvfs, FTAG); 192 return (SET_ERROR(EPERM)); 193 } 194 195 /* Keep a count of the synchronous opens in the znode */ 196 if (flag & O_SYNC) 197 atomic_inc_32(&zp->z_sync_cnt); 198 199 zfs_exit(zfsvfs, FTAG); 200 return (0); 201 } 202 203 int 204 zfs_close(struct inode *ip, int flag, cred_t *cr) 205 { 206 (void) cr; 207 znode_t *zp = ITOZ(ip); 208 zfsvfs_t *zfsvfs = ITOZSB(ip); 209 int error; 210 211 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 212 return (error); 213 214 /* Decrement the synchronous opens in the znode */ 215 if (flag & O_SYNC) 216 atomic_dec_32(&zp->z_sync_cnt); 217 218 zfs_exit(zfsvfs, FTAG); 219 return (0); 220 } 221 222 
#if defined(_KERNEL) 223 /* 224 * When a file is memory mapped, we must keep the IO data synchronized 225 * between the DMU cache and the memory mapped pages. What this means: 226 * 227 * On Write: If we find a memory mapped page, we write to *both* 228 * the page and the dmu buffer. 229 */ 230 void 231 update_pages(znode_t *zp, int64_t start, int len, objset_t *os) 232 { 233 struct inode *ip = ZTOI(zp); 234 struct address_space *mp = ip->i_mapping; 235 struct page *pp; 236 uint64_t nbytes; 237 int64_t off; 238 void *pb; 239 240 off = start & (PAGE_SIZE-1); 241 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 242 nbytes = MIN(PAGE_SIZE - off, len); 243 244 pp = find_lock_page(mp, start >> PAGE_SHIFT); 245 if (pp) { 246 if (mapping_writably_mapped(mp)) 247 flush_dcache_page(pp); 248 249 pb = kmap(pp); 250 (void) dmu_read(os, zp->z_id, start + off, nbytes, 251 pb + off, DMU_READ_PREFETCH); 252 kunmap(pp); 253 254 if (mapping_writably_mapped(mp)) 255 flush_dcache_page(pp); 256 257 mark_page_accessed(pp); 258 SetPageUptodate(pp); 259 ClearPageError(pp); 260 unlock_page(pp); 261 put_page(pp); 262 } 263 264 len -= nbytes; 265 off = 0; 266 } 267 } 268 269 /* 270 * When a file is memory mapped, we must keep the IO data synchronized 271 * between the DMU cache and the memory mapped pages. What this means: 272 * 273 * On Read: We "read" preferentially from memory mapped pages, 274 * else we default from the dmu buffer. 275 * 276 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 277 * the file is memory mapped. 
278 */ 279 int 280 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) 281 { 282 struct inode *ip = ZTOI(zp); 283 struct address_space *mp = ip->i_mapping; 284 struct page *pp; 285 int64_t start, off; 286 uint64_t bytes; 287 int len = nbytes; 288 int error = 0; 289 void *pb; 290 291 start = uio->uio_loffset; 292 off = start & (PAGE_SIZE-1); 293 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 294 bytes = MIN(PAGE_SIZE - off, len); 295 296 pp = find_lock_page(mp, start >> PAGE_SHIFT); 297 if (pp) { 298 ASSERT(PageUptodate(pp)); 299 unlock_page(pp); 300 301 pb = kmap(pp); 302 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); 303 kunmap(pp); 304 305 if (mapping_writably_mapped(mp)) 306 flush_dcache_page(pp); 307 308 mark_page_accessed(pp); 309 put_page(pp); 310 } else { 311 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 312 uio, bytes); 313 } 314 315 len -= bytes; 316 off = 0; 317 if (error) 318 break; 319 } 320 return (error); 321 } 322 #endif /* _KERNEL */ 323 324 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; 325 326 /* 327 * Write the bytes to a file. 328 * 329 * IN: zp - znode of file to be written to 330 * data - bytes to write 331 * len - number of bytes to write 332 * pos - offset to start writing at 333 * 334 * OUT: resid - remaining bytes to write 335 * 336 * RETURN: 0 if success 337 * positive error code if failure. EIO is returned 338 * for a short write when residp isn't provided. 
339 * 340 * Timestamps: 341 * zp - ctime|mtime updated if byte count > 0 342 */ 343 int 344 zfs_write_simple(znode_t *zp, const void *data, size_t len, 345 loff_t pos, size_t *residp) 346 { 347 fstrans_cookie_t cookie; 348 int error; 349 350 struct iovec iov; 351 iov.iov_base = (void *)data; 352 iov.iov_len = len; 353 354 zfs_uio_t uio; 355 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); 356 357 cookie = spl_fstrans_mark(); 358 error = zfs_write(zp, &uio, 0, kcred); 359 spl_fstrans_unmark(cookie); 360 361 if (error == 0) { 362 if (residp != NULL) 363 *residp = zfs_uio_resid(&uio); 364 else if (zfs_uio_resid(&uio) != 0) 365 error = SET_ERROR(EIO); 366 } 367 368 return (error); 369 } 370 371 static void 372 zfs_rele_async_task(void *arg) 373 { 374 iput(arg); 375 } 376 377 void 378 zfs_zrele_async(znode_t *zp) 379 { 380 struct inode *ip = ZTOI(zp); 381 objset_t *os = ITOZSB(ip)->z_os; 382 383 ASSERT(atomic_read(&ip->i_count) > 0); 384 ASSERT(os != NULL); 385 386 /* 387 * If decrementing the count would put us at 0, we can't do it inline 388 * here, because that would be synchronous. Instead, dispatch an iput 389 * to run later. 390 * 391 * For more information on the dangers of a synchronous iput, see the 392 * header comment of this file. 393 */ 394 if (!atomic_add_unless(&ip->i_count, -1, 1)) { 395 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), 396 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); 397 } 398 } 399 400 401 /* 402 * Lookup an entry in a directory, or an extended attribute directory. 403 * If it exists, return a held inode reference for it. 404 * 405 * IN: zdp - znode of directory to search. 406 * nm - name of entry to lookup. 407 * flags - LOOKUP_XATTR set if looking for an attribute. 408 * cr - credentials of caller. 409 * direntflags - directory lookup flags 410 * realpnp - returned pathname. 411 * 412 * OUT: zpp - znode of located entry, NULL if not found. 413 * 414 * RETURN: 0 on success, error code on failure. 
 *
 *	Timestamps:
 *		NA
 */
int
zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
	int error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 *
	 * Note that every return inside this block happens before
	 * zfs_enter_verify_zp() is called, so no zfs_exit() is needed,
	 * and that *zpp is only written on the success path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		/* An empty name or "." resolves to the directory itself. */
		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*zpp = zdp;
				zhold(*zpp);
				return (0);
			}
			return (error);
		}
	}

	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
		return (error);

	*zpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}

		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 * On failure the hold obtained above is dropped and *zpp
		 * is reset so the caller never sees a stale pointer.
		 */

		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
		    B_TRUE, cr, kcred->user_ns))) {
			zrele(*zpp);
			*zpp = NULL;
		}

		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
	    kcred->user_ns))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/* On UTF-8-only file systems reject names that fail validation. */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
	if ((error == 0) && (*zpp))
		zfs_znode_update_vfs(*zpp);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the znode of the created or trunc'd file.
 *
 *	IN:	dzp	- znode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- file flag.
 *		vsecp	- ACL to be set
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	zpp	- znode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated if new entry created
 *	 zp - ctime|mtime always, atime if new
 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zuserns_t *mnt_ns)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	objset_t *os;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	/* B_TRUE while acl_ids holds state that must be freed on exit. */
	boolean_t have_acl = B_FALSE;
	/* Set once dmu_tx_wait() has been called for an ERESTART retry. */
	boolean_t waited = B_FALSE;
	boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* On UTF-8-only file systems reject names that fail validation. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * Retry point: re-entered after dmu_tx_assign() returns ERESTART.
	 * acl_ids (if have_acl is set) survives across retries.
	 */
top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			/* Any lock failure on ".." is reported as EISDIR. */
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
		    mnt_ns))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/* Build the ACL ids only once; retries reuse them. */
		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		/*
		 * The dirent lock is held here, so the assign must not
		 * block (see rule 4 in the file header).  On ERESTART the
		 * lock is dropped, we wait, and start over from top.
		 */
		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since, we failed to add the directory entry for it,
			 * delete the newly created dnode.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		/* Log the create before the tx commits and locks drop. */
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		/* ACL ids from an earlier pass are not needed any more. */
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
		    mnt_ns))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	/* On failure drop the hold on zp; on success hand it to the caller. */
	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Create an unnamed file under the file system of directory dip.
 *
 * The new znode is created with the IS_TMPFILE flag, marked z_unlinked
 * and added to the unlinked set; no directory entry is created here.
 *
 *	IN:	dip	- inode of the directory used for access checks
 *			  and ACL / project id inheritance.
 *		vap	- attributes of new file.
 *		excl	- unused.
 *		mode	- unused.
 *		cr	- credentials of caller.
 *		flag	- unused.
 *		vsecp	- ACL to be set
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	ipp	- inode of created file, NULL on failure.
 *
 *	RETURN:	0 on success, error code on failure.
 */
int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zuserns_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t *zp = NULL, *dzp = ITOZ(dip);
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	boolean_t fuid_dirtied;
	/* B_TRUE while acl_ids holds state that must be freed on exit. */
	boolean_t have_acl = B_FALSE;
	/* Set once dmu_tx_wait() has been called for an ERESTART retry. */
	boolean_t waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/* Retry point: re-entered after dmu_tx_assign() returns ERESTART. */
top:
	*ipp = NULL;

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		goto out;
	}

	/* Build the ACL ids only once; retries reuse them. */
	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
		goto out;
	have_acl = B_TRUE;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	/* The new znode is inserted into the unlinked set below. */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Add to unlinked set */
	zp->z_unlinked = B_TRUE;
	zfs_unlinked_add(zp, tx);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);
out:

	/* On failure drop the hold on zp; on success hand it to the caller. */
	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*ipp = ZTOI(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dzp	- znode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		flags	- case flags.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime
 *	 ip - ctime (if nlink > 0)
 */

/* Zero value written as the xattr object id on non-SA znodes (see below). */
static uint64_t null_xattr = 0;

int
zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
{
	znode_t *zp;
	znode_t *xzp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	uint64_t acl_obj, xattr_obj;
	uint64_t xattr_obj_unlinked = 0;
	uint64_t obj = 0;
	uint64_t links;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	/*
	 * may_delete_now is sampled before the tx is assigned and decides
	 * which holds are taken; delete_now is the final decision made
	 * after the link has been destroyed.  delete_now stays FALSE on
	 * every early "goto out".
	 */
	boolean_t may_delete_now, delete_now = FALSE;
	boolean_t unlinked, toobig = FALSE;
	uint64_t txtype;
	pathname_t *realnmp = NULL;
	pathname_t realnm;
	int error;
	int zflg = ZEXISTS;
	/* Set once dmu_tx_wait() has been called for an ERESTART retry. */
	boolean_t waited = B_FALSE;

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

	/* Retry point: re-entered after dmu_tx_assign() returns ERESTART. */
top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp))) {
		if (realnmp)
			pn_free(realnmp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/* Only our own hold on the inode and no cached data for it. */
	mutex_enter(&zp->z_lock);
	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
	    !zn_has_cached_data(zp, 0, LLONG_MAX);
	mutex_exit(&zp->z_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the inode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		/*
		 * NOTE(review): the zfs_zget() result is only checked by
		 * ASSERT0(); confirm xzp cannot be left unusable here when
		 * assertions are compiled out.
		 */
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	/*
	 * The dirent lock is held here, so the assign must not block (see
	 * rule 4 in the file header).  Both failure paths drop the dirent
	 * lock and the holds on zp and xzp; ERESTART then retries from top,
	 * anything else is returned to the caller.
	 */
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			if (xzp)
				zrele(xzp);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		zrele(zp);
		if (xzp)
			zrele(xzp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed.  Could have been deleted due to
		 * zfs_sa_upgrade().
		 *
		 * z_lock stays held when this block is left; it is dropped
		 * in whichever of the two branches below runs.
		 */
		mutex_enter(&zp->z_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
		    !zn_has_cached_data(zp, 0, LLONG_MAX) &&
		    xattr_obj == xattr_obj_unlinked &&
		    zfs_external_acl(zp) == acl_obj;
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			/*
			 * Unlink the xattr directory as well: zero its link
			 * count, add it to the unlinked set and detach it
			 * from zp.
			 */
			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = B_TRUE;
			clear_nlink(ZTOI(xzp));
			links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &links, sizeof (links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		/*
		 * Add to the unlinked set because a new reference could be
		 * taken concurrently resulting in a deferred destruction.
		 */
		zfs_unlinked_add(zp, tx);
		mutex_exit(&zp->z_lock);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	/* Log the remove before the tx commits and locks drop. */
	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);
	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);

	/*
	 * Release zp synchronously only when it is being deleted now;
	 * otherwise hand the release to zfs_zrele_async().
	 */
	if (delete_now)
		zrele(zp);
	else
		zfs_zrele_async(zp);

	if (xzp) {
		zfs_znode_update_vfs(xzp);
		zfs_zrele_async(xzp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Create a new directory and insert it into dzp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dzp	- znode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		flags	- case flags.
 *		vsecp	- ACL to be set
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	zpp	- znode of created directory.
1160 * 1161 * RETURN: 0 if success 1162 * error code if failure 1163 * 1164 * Timestamps: 1165 * dzp - ctime|mtime updated 1166 * zpp - ctime|mtime|atime updated 1167 */ 1168 int 1169 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, 1170 cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) 1171 { 1172 znode_t *zp; 1173 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1174 zilog_t *zilog; 1175 zfs_dirlock_t *dl; 1176 uint64_t txtype; 1177 dmu_tx_t *tx; 1178 int error; 1179 int zf = ZNEW; 1180 uid_t uid; 1181 gid_t gid = crgetgid(cr); 1182 zfs_acl_ids_t acl_ids; 1183 boolean_t fuid_dirtied; 1184 boolean_t waited = B_FALSE; 1185 1186 ASSERT(S_ISDIR(vap->va_mode)); 1187 1188 /* 1189 * If we have an ephemeral id, ACL, or XVATTR then 1190 * make sure file system is at proper version 1191 */ 1192 1193 uid = crgetuid(cr); 1194 if (zfsvfs->z_use_fuids == B_FALSE && 1195 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1196 return (SET_ERROR(EINVAL)); 1197 1198 if (dirname == NULL) 1199 return (SET_ERROR(EINVAL)); 1200 1201 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1202 return (error); 1203 zilog = zfsvfs->z_log; 1204 1205 if (dzp->z_pflags & ZFS_XATTR) { 1206 zfs_exit(zfsvfs, FTAG); 1207 return (SET_ERROR(EINVAL)); 1208 } 1209 1210 if (zfsvfs->z_utf8 && u8_validate(dirname, 1211 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1212 zfs_exit(zfsvfs, FTAG); 1213 return (SET_ERROR(EILSEQ)); 1214 } 1215 if (flags & FIGNORECASE) 1216 zf |= ZCILOOK; 1217 1218 if (vap->va_mask & ATTR_XVATTR) { 1219 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1220 crgetuid(cr), cr, vap->va_mode)) != 0) { 1221 zfs_exit(zfsvfs, FTAG); 1222 return (error); 1223 } 1224 } 1225 1226 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1227 vsecp, &acl_ids, mnt_ns)) != 0) { 1228 zfs_exit(zfsvfs, FTAG); 1229 return (error); 1230 } 1231 /* 1232 * First make sure the new directory doesn't exist. 
1233 * 1234 * Existence is checked first to make sure we don't return 1235 * EACCES instead of EEXIST which can cause some applications 1236 * to fail. 1237 */ 1238 top: 1239 *zpp = NULL; 1240 1241 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1242 NULL, NULL))) { 1243 zfs_acl_ids_free(&acl_ids); 1244 zfs_exit(zfsvfs, FTAG); 1245 return (error); 1246 } 1247 1248 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, 1249 mnt_ns))) { 1250 zfs_acl_ids_free(&acl_ids); 1251 zfs_dirent_unlock(dl); 1252 zfs_exit(zfsvfs, FTAG); 1253 return (error); 1254 } 1255 1256 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 1257 zfs_acl_ids_free(&acl_ids); 1258 zfs_dirent_unlock(dl); 1259 zfs_exit(zfsvfs, FTAG); 1260 return (SET_ERROR(EDQUOT)); 1261 } 1262 1263 /* 1264 * Add a new entry to the directory. 1265 */ 1266 tx = dmu_tx_create(zfsvfs->z_os); 1267 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1268 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1269 fuid_dirtied = zfsvfs->z_fuid_dirty; 1270 if (fuid_dirtied) 1271 zfs_fuid_txhold(zfsvfs, tx); 1272 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1273 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1274 acl_ids.z_aclp->z_acl_bytes); 1275 } 1276 1277 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1278 ZFS_SA_BASE_ATTR_SIZE); 1279 1280 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1281 if (error) { 1282 zfs_dirent_unlock(dl); 1283 if (error == ERESTART) { 1284 waited = B_TRUE; 1285 dmu_tx_wait(tx); 1286 dmu_tx_abort(tx); 1287 goto top; 1288 } 1289 zfs_acl_ids_free(&acl_ids); 1290 dmu_tx_abort(tx); 1291 zfs_exit(zfsvfs, FTAG); 1292 return (error); 1293 } 1294 1295 /* 1296 * Create new node. 1297 */ 1298 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1299 1300 /* 1301 * Now put new name in parent dir. 
1302 */ 1303 error = zfs_link_create(dl, zp, tx, ZNEW); 1304 if (error != 0) { 1305 zfs_znode_delete(zp, tx); 1306 remove_inode_hash(ZTOI(zp)); 1307 goto out; 1308 } 1309 1310 if (fuid_dirtied) 1311 zfs_fuid_sync(zfsvfs, tx); 1312 1313 *zpp = zp; 1314 1315 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1316 if (flags & FIGNORECASE) 1317 txtype |= TX_CI; 1318 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1319 acl_ids.z_fuidp, vap); 1320 1321 out: 1322 zfs_acl_ids_free(&acl_ids); 1323 1324 dmu_tx_commit(tx); 1325 1326 zfs_dirent_unlock(dl); 1327 1328 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1329 zil_commit(zilog, 0); 1330 1331 if (error != 0) { 1332 zrele(zp); 1333 } else { 1334 zfs_znode_update_vfs(dzp); 1335 zfs_znode_update_vfs(zp); 1336 } 1337 zfs_exit(zfsvfs, FTAG); 1338 return (error); 1339 } 1340 1341 /* 1342 * Remove a directory subdir entry. If the current working 1343 * directory is the same as the subdir to be removed, the 1344 * remove will fail. 1345 * 1346 * IN: dzp - znode of directory to remove from. 1347 * name - name of directory to be removed. 1348 * cwd - inode of current working directory. 1349 * cr - credentials of caller. 1350 * flags - case flags 1351 * 1352 * RETURN: 0 on success, error code on failure. 1353 * 1354 * Timestamps: 1355 * dzp - ctime|mtime updated 1356 */ 1357 int 1358 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, 1359 int flags) 1360 { 1361 znode_t *zp; 1362 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1363 zilog_t *zilog; 1364 zfs_dirlock_t *dl; 1365 dmu_tx_t *tx; 1366 int error; 1367 int zflg = ZEXISTS; 1368 boolean_t waited = B_FALSE; 1369 1370 if (name == NULL) 1371 return (SET_ERROR(EINVAL)); 1372 1373 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1374 return (error); 1375 zilog = zfsvfs->z_log; 1376 1377 if (flags & FIGNORECASE) 1378 zflg |= ZCILOOK; 1379 top: 1380 zp = NULL; 1381 1382 /* 1383 * Attempt to lock directory; fail if entry doesn't exist. 
1384 */ 1385 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1386 NULL, NULL))) { 1387 zfs_exit(zfsvfs, FTAG); 1388 return (error); 1389 } 1390 1391 if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { 1392 goto out; 1393 } 1394 1395 if (!S_ISDIR(ZTOI(zp)->i_mode)) { 1396 error = SET_ERROR(ENOTDIR); 1397 goto out; 1398 } 1399 1400 if (zp == cwd) { 1401 error = SET_ERROR(EINVAL); 1402 goto out; 1403 } 1404 1405 /* 1406 * Grab a lock on the directory to make sure that no one is 1407 * trying to add (or lookup) entries while we are removing it. 1408 */ 1409 rw_enter(&zp->z_name_lock, RW_WRITER); 1410 1411 /* 1412 * Grab a lock on the parent pointer to make sure we play well 1413 * with the treewalk and directory rename code. 1414 */ 1415 rw_enter(&zp->z_parent_lock, RW_WRITER); 1416 1417 tx = dmu_tx_create(zfsvfs->z_os); 1418 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1419 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1420 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1421 zfs_sa_upgrade_txholds(tx, zp); 1422 zfs_sa_upgrade_txholds(tx, dzp); 1423 dmu_tx_mark_netfree(tx); 1424 error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1425 if (error) { 1426 rw_exit(&zp->z_parent_lock); 1427 rw_exit(&zp->z_name_lock); 1428 zfs_dirent_unlock(dl); 1429 if (error == ERESTART) { 1430 waited = B_TRUE; 1431 dmu_tx_wait(tx); 1432 dmu_tx_abort(tx); 1433 zrele(zp); 1434 goto top; 1435 } 1436 dmu_tx_abort(tx); 1437 zrele(zp); 1438 zfs_exit(zfsvfs, FTAG); 1439 return (error); 1440 } 1441 1442 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1443 1444 if (error == 0) { 1445 uint64_t txtype = TX_RMDIR; 1446 if (flags & FIGNORECASE) 1447 txtype |= TX_CI; 1448 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 1449 B_FALSE); 1450 } 1451 1452 dmu_tx_commit(tx); 1453 1454 rw_exit(&zp->z_parent_lock); 1455 rw_exit(&zp->z_name_lock); 1456 out: 1457 zfs_dirent_unlock(dl); 1458 1459 zfs_znode_update_vfs(dzp); 1460 zfs_znode_update_vfs(zp); 1461 zrele(zp); 1462 1463 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1464 zil_commit(zilog, 0); 1465 1466 zfs_exit(zfsvfs, FTAG); 1467 return (error); 1468 } 1469 1470 /* 1471 * Read directory entries from the given directory cursor position and emit 1472 * name and position for each entry. 1473 * 1474 * IN: ip - inode of directory to read. 1475 * ctx - directory entry context. 1476 * cr - credentials of caller. 1477 * 1478 * RETURN: 0 if success 1479 * error code if failure 1480 * 1481 * Timestamps: 1482 * ip - atime updated 1483 * 1484 * Note that the low 4 bits of the cookie returned by zap is always zero. 1485 * This allows us to use the low range for "special" directory entries: 1486 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1487 * we use the offset 2 for the '.zfs' directory. 
1488 */ 1489 int 1490 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) 1491 { 1492 (void) cr; 1493 znode_t *zp = ITOZ(ip); 1494 zfsvfs_t *zfsvfs = ITOZSB(ip); 1495 objset_t *os; 1496 zap_cursor_t zc; 1497 zap_attribute_t zap; 1498 int error; 1499 uint8_t prefetch; 1500 uint8_t type; 1501 int done = 0; 1502 uint64_t parent; 1503 uint64_t offset; /* must be unsigned; checks for < 1 */ 1504 1505 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1506 return (error); 1507 1508 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1509 &parent, sizeof (parent))) != 0) 1510 goto out; 1511 1512 /* 1513 * Quit if directory has been removed (posix) 1514 */ 1515 if (zp->z_unlinked) 1516 goto out; 1517 1518 error = 0; 1519 os = zfsvfs->z_os; 1520 offset = ctx->pos; 1521 prefetch = zp->z_zn_prefetch; 1522 1523 /* 1524 * Initialize the iterator cursor. 1525 */ 1526 if (offset <= 3) { 1527 /* 1528 * Start iteration from the beginning of the directory. 1529 */ 1530 zap_cursor_init(&zc, os, zp->z_id); 1531 } else { 1532 /* 1533 * The offset is a serialized cursor. 1534 */ 1535 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1536 } 1537 1538 /* 1539 * Transform to file-system independent format 1540 */ 1541 while (!done) { 1542 uint64_t objnum; 1543 /* 1544 * Special case `.', `..', and `.zfs'. 1545 */ 1546 if (offset == 0) { 1547 (void) strcpy(zap.za_name, "."); 1548 zap.za_normalization_conflict = 0; 1549 objnum = zp->z_id; 1550 type = DT_DIR; 1551 } else if (offset == 1) { 1552 (void) strcpy(zap.za_name, ".."); 1553 zap.za_normalization_conflict = 0; 1554 objnum = parent; 1555 type = DT_DIR; 1556 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1557 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 1558 zap.za_normalization_conflict = 0; 1559 objnum = ZFSCTL_INO_ROOT; 1560 type = DT_DIR; 1561 } else { 1562 /* 1563 * Grab next entry. 
1564 */ 1565 if ((error = zap_cursor_retrieve(&zc, &zap))) { 1566 if (error == ENOENT) 1567 break; 1568 else 1569 goto update; 1570 } 1571 1572 /* 1573 * Allow multiple entries provided the first entry is 1574 * the object id. Non-zpl consumers may safely make 1575 * use of the additional space. 1576 * 1577 * XXX: This should be a feature flag for compatibility 1578 */ 1579 if (zap.za_integer_length != 8 || 1580 zap.za_num_integers == 0) { 1581 cmn_err(CE_WARN, "zap_readdir: bad directory " 1582 "entry, obj = %lld, offset = %lld, " 1583 "length = %d, num = %lld\n", 1584 (u_longlong_t)zp->z_id, 1585 (u_longlong_t)offset, 1586 zap.za_integer_length, 1587 (u_longlong_t)zap.za_num_integers); 1588 error = SET_ERROR(ENXIO); 1589 goto update; 1590 } 1591 1592 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 1593 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 1594 } 1595 1596 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), 1597 objnum, type); 1598 if (done) 1599 break; 1600 1601 /* Prefetch znode */ 1602 if (prefetch) { 1603 dmu_prefetch(os, objnum, 0, 0, 0, 1604 ZIO_PRIORITY_SYNC_READ); 1605 } 1606 1607 /* 1608 * Move to the next entry, fill in the previous offset. 1609 */ 1610 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1611 zap_cursor_advance(&zc); 1612 offset = zap_cursor_serialize(&zc); 1613 } else { 1614 offset += 1; 1615 } 1616 ctx->pos = offset; 1617 } 1618 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1619 1620 update: 1621 zap_cursor_fini(&zc); 1622 if (error == ENOENT) 1623 error = 0; 1624 out: 1625 zfs_exit(zfsvfs, FTAG); 1626 1627 return (error); 1628 } 1629 1630 /* 1631 * Get the basic file attributes and place them in the provided kstat 1632 * structure. The inode is assumed to be the authoritative source 1633 * for most of the attributes. However, the znode currently has the 1634 * authoritative atime, blksize, and block count. 1635 * 1636 * IN: ip - inode of file. 1637 * 1638 * OUT: sp - kstat values. 
1639 * 1640 * RETURN: 0 (always succeeds) 1641 */ 1642 int 1643 zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, 1644 struct kstat *sp) 1645 { 1646 znode_t *zp = ITOZ(ip); 1647 zfsvfs_t *zfsvfs = ITOZSB(ip); 1648 uint32_t blksize; 1649 u_longlong_t nblocks; 1650 int error; 1651 1652 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1653 return (error); 1654 1655 mutex_enter(&zp->z_lock); 1656 1657 zpl_generic_fillattr(user_ns, ip, sp); 1658 /* 1659 * +1 link count for root inode with visible '.zfs' directory. 1660 */ 1661 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1662 if (sp->nlink < ZFS_LINK_MAX) 1663 sp->nlink++; 1664 1665 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1666 sp->blksize = blksize; 1667 sp->blocks = nblocks; 1668 1669 if (unlikely(zp->z_blksz == 0)) { 1670 /* 1671 * Block size hasn't been set; suggest maximal I/O transfers. 1672 */ 1673 sp->blksize = zfsvfs->z_max_blksz; 1674 } 1675 1676 mutex_exit(&zp->z_lock); 1677 1678 /* 1679 * Required to prevent NFS client from detecting different inode 1680 * numbers of snapshot root dentry before and after snapshot mount. 1681 */ 1682 if (zfsvfs->z_issnap) { 1683 if (ip->i_sb->s_root->d_inode == ip) 1684 sp->ino = ZFSCTL_INO_SNAPDIRS - 1685 dmu_objset_id(zfsvfs->z_os); 1686 } 1687 1688 zfs_exit(zfsvfs, FTAG); 1689 1690 return (0); 1691 } 1692 1693 /* 1694 * For the operation of changing file's user/group/project, we need to 1695 * handle not only the main object that is assigned to the file directly, 1696 * but also the ones that are used by the file via hidden xattr directory. 1697 * 1698 * Because the xattr directory may contains many EA entries, as to it may 1699 * be impossible to change all of them via the transaction of changing the 1700 * main object's user/group/project attributes. Then we have to change them 1701 * via other multiple independent transactions one by one. It may be not good 1702 * solution, but we have no better idea yet. 
1703 */ 1704 static int 1705 zfs_setattr_dir(znode_t *dzp) 1706 { 1707 struct inode *dxip = ZTOI(dzp); 1708 struct inode *xip = NULL; 1709 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1710 objset_t *os = zfsvfs->z_os; 1711 zap_cursor_t zc; 1712 zap_attribute_t zap; 1713 zfs_dirlock_t *dl; 1714 znode_t *zp = NULL; 1715 dmu_tx_t *tx = NULL; 1716 uint64_t uid, gid; 1717 sa_bulk_attr_t bulk[4]; 1718 int count; 1719 int err; 1720 1721 zap_cursor_init(&zc, os, dzp->z_id); 1722 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { 1723 count = 0; 1724 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { 1725 err = ENXIO; 1726 break; 1727 } 1728 1729 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, 1730 ZEXISTS, NULL, NULL); 1731 if (err == ENOENT) 1732 goto next; 1733 if (err) 1734 break; 1735 1736 xip = ZTOI(zp); 1737 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && 1738 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && 1739 zp->z_projid == dzp->z_projid) 1740 goto next; 1741 1742 tx = dmu_tx_create(os); 1743 if (!(zp->z_pflags & ZFS_PROJID)) 1744 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1745 else 1746 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1747 1748 err = dmu_tx_assign(tx, TXG_WAIT); 1749 if (err) 1750 break; 1751 1752 mutex_enter(&dzp->z_lock); 1753 1754 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { 1755 xip->i_uid = dxip->i_uid; 1756 uid = zfs_uid_read(dxip); 1757 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1758 &uid, sizeof (uid)); 1759 } 1760 1761 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { 1762 xip->i_gid = dxip->i_gid; 1763 gid = zfs_gid_read(dxip); 1764 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1765 &gid, sizeof (gid)); 1766 } 1767 1768 if (zp->z_projid != dzp->z_projid) { 1769 if (!(zp->z_pflags & ZFS_PROJID)) { 1770 zp->z_pflags |= ZFS_PROJID; 1771 SA_ADD_BULK_ATTR(bulk, count, 1772 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 1773 sizeof (zp->z_pflags)); 1774 } 1775 1776 
zp->z_projid = dzp->z_projid; 1777 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), 1778 NULL, &zp->z_projid, sizeof (zp->z_projid)); 1779 } 1780 1781 mutex_exit(&dzp->z_lock); 1782 1783 if (likely(count > 0)) { 1784 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1785 dmu_tx_commit(tx); 1786 } else { 1787 dmu_tx_abort(tx); 1788 } 1789 tx = NULL; 1790 if (err != 0 && err != ENOENT) 1791 break; 1792 1793 next: 1794 if (zp) { 1795 zrele(zp); 1796 zp = NULL; 1797 zfs_dirent_unlock(dl); 1798 } 1799 zap_cursor_advance(&zc); 1800 } 1801 1802 if (tx) 1803 dmu_tx_abort(tx); 1804 if (zp) { 1805 zrele(zp); 1806 zfs_dirent_unlock(dl); 1807 } 1808 zap_cursor_fini(&zc); 1809 1810 return (err == ENOENT ? 0 : err); 1811 } 1812 1813 /* 1814 * Set the file attributes to the values contained in the 1815 * vattr structure. 1816 * 1817 * IN: zp - znode of file to be modified. 1818 * vap - new attribute values. 1819 * If ATTR_XVATTR set, then optional attrs are being set 1820 * flags - ATTR_UTIME set if non-default time values provided. 1821 * - ATTR_NOACLCHECK (CIFS context only). 1822 * cr - credentials of caller. 1823 * mnt_ns - user namespace of the mount 1824 * 1825 * RETURN: 0 if success 1826 * error code if failure 1827 * 1828 * Timestamps: 1829 * ip - ctime updated, mtime updated if size changed. 
1830 */ 1831 int 1832 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) 1833 { 1834 struct inode *ip; 1835 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1836 objset_t *os = zfsvfs->z_os; 1837 zilog_t *zilog; 1838 dmu_tx_t *tx; 1839 vattr_t oldva; 1840 xvattr_t *tmpxvattr; 1841 uint_t mask = vap->va_mask; 1842 uint_t saved_mask = 0; 1843 int trim_mask = 0; 1844 uint64_t new_mode; 1845 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1846 uint64_t xattr_obj; 1847 uint64_t mtime[2], ctime[2], atime[2]; 1848 uint64_t projid = ZFS_INVALID_PROJID; 1849 znode_t *attrzp; 1850 int need_policy = FALSE; 1851 int err, err2 = 0; 1852 zfs_fuid_info_t *fuidp = NULL; 1853 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1854 xoptattr_t *xoap; 1855 zfs_acl_t *aclp; 1856 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1857 boolean_t fuid_dirtied = B_FALSE; 1858 boolean_t handle_eadir = B_FALSE; 1859 sa_bulk_attr_t *bulk, *xattr_bulk; 1860 int count = 0, xattr_count = 0, bulks = 8; 1861 1862 if (mask == 0) 1863 return (0); 1864 1865 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1866 return (err); 1867 ip = ZTOI(zp); 1868 1869 /* 1870 * If this is a xvattr_t, then get a pointer to the structure of 1871 * optional attributes. If this is NULL, then we have a vattr_t. 
1872 */ 1873 xoap = xva_getxoptattr(xvap); 1874 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1875 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1876 if (!dmu_objset_projectquota_enabled(os) || 1877 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1878 zfs_exit(zfsvfs, FTAG); 1879 return (SET_ERROR(ENOTSUP)); 1880 } 1881 1882 projid = xoap->xoa_projid; 1883 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1884 zfs_exit(zfsvfs, FTAG); 1885 return (SET_ERROR(EINVAL)); 1886 } 1887 1888 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1889 projid = ZFS_INVALID_PROJID; 1890 else 1891 need_policy = TRUE; 1892 } 1893 1894 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1895 (xoap->xoa_projinherit != 1896 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1897 (!dmu_objset_projectquota_enabled(os) || 1898 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1899 zfs_exit(zfsvfs, FTAG); 1900 return (SET_ERROR(ENOTSUP)); 1901 } 1902 } 1903 1904 zilog = zfsvfs->z_log; 1905 1906 /* 1907 * Make sure that if we have ephemeral uid/gid or xvattr specified 1908 * that file system is at proper version level 1909 */ 1910 1911 if (zfsvfs->z_use_fuids == B_FALSE && 1912 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 1913 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 1914 (mask & ATTR_XVATTR))) { 1915 zfs_exit(zfsvfs, FTAG); 1916 return (SET_ERROR(EINVAL)); 1917 } 1918 1919 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 1920 zfs_exit(zfsvfs, FTAG); 1921 return (SET_ERROR(EISDIR)); 1922 } 1923 1924 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 1925 zfs_exit(zfsvfs, FTAG); 1926 return (SET_ERROR(EINVAL)); 1927 } 1928 1929 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 1930 xva_init(tmpxvattr); 1931 1932 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1933 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1934 1935 /* 1936 * Immutable files can only alter immutable bit and atime 1937 */ 1938 if ((zp->z_pflags & ZFS_IMMUTABLE) && 
1939 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 1940 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 1941 err = SET_ERROR(EPERM); 1942 goto out3; 1943 } 1944 1945 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 1946 err = SET_ERROR(EPERM); 1947 goto out3; 1948 } 1949 1950 /* 1951 * Verify timestamps doesn't overflow 32 bits. 1952 * ZFS can handle large timestamps, but 32bit syscalls can't 1953 * handle times greater than 2039. This check should be removed 1954 * once large timestamps are fully supported. 1955 */ 1956 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 1957 if (((mask & ATTR_ATIME) && 1958 TIMESPEC_OVERFLOW(&vap->va_atime)) || 1959 ((mask & ATTR_MTIME) && 1960 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 1961 err = SET_ERROR(EOVERFLOW); 1962 goto out3; 1963 } 1964 } 1965 1966 top: 1967 attrzp = NULL; 1968 aclp = NULL; 1969 1970 /* Can this be moved to before the top label? */ 1971 if (zfs_is_readonly(zfsvfs)) { 1972 err = SET_ERROR(EROFS); 1973 goto out3; 1974 } 1975 1976 /* 1977 * First validate permissions 1978 */ 1979 1980 if (mask & ATTR_SIZE) { 1981 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 1982 mnt_ns); 1983 if (err) 1984 goto out3; 1985 1986 /* 1987 * XXX - Note, we are not providing any open 1988 * mode flags here (like FNDELAY), so we may 1989 * block if there are locks present... this 1990 * should be addressed in openat(). 1991 */ 1992 /* XXX - would it be OK to generate a log record here? 
*/ 1993 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 1994 if (err) 1995 goto out3; 1996 } 1997 1998 if (mask & (ATTR_ATIME|ATTR_MTIME) || 1999 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2000 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2001 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2002 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2003 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2004 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2005 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2006 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2007 skipaclchk, cr, mnt_ns); 2008 } 2009 2010 if (mask & (ATTR_UID|ATTR_GID)) { 2011 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2012 int take_owner; 2013 int take_group; 2014 uid_t uid; 2015 gid_t gid; 2016 2017 /* 2018 * NOTE: even if a new mode is being set, 2019 * we may clear S_ISUID/S_ISGID bits. 2020 */ 2021 2022 if (!(mask & ATTR_MODE)) 2023 vap->va_mode = zp->z_mode; 2024 2025 /* 2026 * Take ownership or chgrp to group we are a member of 2027 */ 2028 2029 uid = zfs_uid_to_vfsuid((struct user_namespace *)mnt_ns, 2030 zfs_i_user_ns(ip), vap->va_uid); 2031 gid = zfs_gid_to_vfsgid((struct user_namespace *)mnt_ns, 2032 zfs_i_user_ns(ip), vap->va_gid); 2033 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2034 take_group = (mask & ATTR_GID) && 2035 zfs_groupmember(zfsvfs, gid, cr); 2036 2037 /* 2038 * If both ATTR_UID and ATTR_GID are set then take_owner and 2039 * take_group must both be set in order to allow taking 2040 * ownership. 
2041 * 2042 * Otherwise, send the check through secpolicy_vnode_setattr() 2043 * 2044 */ 2045 2046 if (((idmask == (ATTR_UID|ATTR_GID)) && 2047 take_owner && take_group) || 2048 ((idmask == ATTR_UID) && take_owner) || 2049 ((idmask == ATTR_GID) && take_group)) { 2050 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2051 skipaclchk, cr, mnt_ns) == 0) { 2052 /* 2053 * Remove setuid/setgid for non-privileged users 2054 */ 2055 (void) secpolicy_setid_clear(vap, cr); 2056 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2057 } else { 2058 need_policy = TRUE; 2059 } 2060 } else { 2061 need_policy = TRUE; 2062 } 2063 } 2064 2065 mutex_enter(&zp->z_lock); 2066 oldva.va_mode = zp->z_mode; 2067 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2068 if (mask & ATTR_XVATTR) { 2069 /* 2070 * Update xvattr mask to include only those attributes 2071 * that are actually changing. 2072 * 2073 * the bits will be restored prior to actually setting 2074 * the attributes so the caller thinks they were set. 2075 */ 2076 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2077 if (xoap->xoa_appendonly != 2078 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2079 need_policy = TRUE; 2080 } else { 2081 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2082 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2083 } 2084 } 2085 2086 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2087 if (xoap->xoa_projinherit != 2088 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2089 need_policy = TRUE; 2090 } else { 2091 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2092 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2093 } 2094 } 2095 2096 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2097 if (xoap->xoa_nounlink != 2098 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2099 need_policy = TRUE; 2100 } else { 2101 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2102 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2103 } 2104 } 2105 2106 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2107 if (xoap->xoa_immutable != 2108 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2109 need_policy = TRUE; 2110 } else { 2111 XVA_CLR_REQ(xvap, 
XAT_IMMUTABLE); 2112 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2113 } 2114 } 2115 2116 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2117 if (xoap->xoa_nodump != 2118 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2119 need_policy = TRUE; 2120 } else { 2121 XVA_CLR_REQ(xvap, XAT_NODUMP); 2122 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2123 } 2124 } 2125 2126 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2127 if (xoap->xoa_av_modified != 2128 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2129 need_policy = TRUE; 2130 } else { 2131 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2132 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2133 } 2134 } 2135 2136 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2137 if ((!S_ISREG(ip->i_mode) && 2138 xoap->xoa_av_quarantined) || 2139 xoap->xoa_av_quarantined != 2140 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2141 need_policy = TRUE; 2142 } else { 2143 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2144 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2145 } 2146 } 2147 2148 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2149 mutex_exit(&zp->z_lock); 2150 err = SET_ERROR(EPERM); 2151 goto out3; 2152 } 2153 2154 if (need_policy == FALSE && 2155 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2156 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2157 need_policy = TRUE; 2158 } 2159 } 2160 2161 mutex_exit(&zp->z_lock); 2162 2163 if (mask & ATTR_MODE) { 2164 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2165 mnt_ns) == 0) { 2166 err = secpolicy_setid_setsticky_clear(ip, vap, 2167 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2168 if (err) 2169 goto out3; 2170 trim_mask |= ATTR_MODE; 2171 } else { 2172 need_policy = TRUE; 2173 } 2174 } 2175 2176 if (need_policy) { 2177 /* 2178 * If trim_mask is set then take ownership 2179 * has been granted or write_acl is present and user 2180 * has the ability to modify mode. In that case remove 2181 * UID|GID and or MODE from mask so that 2182 * secpolicy_vnode_setattr() doesn't revoke it. 
2183 */ 2184 2185 if (trim_mask) { 2186 saved_mask = vap->va_mask; 2187 vap->va_mask &= ~trim_mask; 2188 } 2189 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2190 zfs_zaccess_unix, zp); 2191 if (err) 2192 goto out3; 2193 2194 if (trim_mask) 2195 vap->va_mask |= saved_mask; 2196 } 2197 2198 /* 2199 * secpolicy_vnode_setattr, or take ownership may have 2200 * changed va_mask 2201 */ 2202 mask = vap->va_mask; 2203 2204 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2205 handle_eadir = B_TRUE; 2206 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2207 &xattr_obj, sizeof (xattr_obj)); 2208 2209 if (err == 0 && xattr_obj) { 2210 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2211 if (err) 2212 goto out2; 2213 } 2214 if (mask & ATTR_UID) { 2215 new_kuid = zfs_fuid_create(zfsvfs, 2216 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2217 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2218 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2219 new_kuid)) { 2220 if (attrzp) 2221 zrele(attrzp); 2222 err = SET_ERROR(EDQUOT); 2223 goto out2; 2224 } 2225 } 2226 2227 if (mask & ATTR_GID) { 2228 new_kgid = zfs_fuid_create(zfsvfs, 2229 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2230 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2231 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2232 new_kgid)) { 2233 if (attrzp) 2234 zrele(attrzp); 2235 err = SET_ERROR(EDQUOT); 2236 goto out2; 2237 } 2238 } 2239 2240 if (projid != ZFS_INVALID_PROJID && 2241 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2242 if (attrzp) 2243 zrele(attrzp); 2244 err = EDQUOT; 2245 goto out2; 2246 } 2247 } 2248 tx = dmu_tx_create(os); 2249 2250 if (mask & ATTR_MODE) { 2251 uint64_t pmode = zp->z_mode; 2252 uint64_t acl_obj; 2253 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2254 2255 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2256 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2257 err = EPERM; 2258 goto out; 2259 } 2260 2261 if ((err = 
zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2262 goto out; 2263 2264 mutex_enter(&zp->z_lock); 2265 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2266 /* 2267 * Are we upgrading ACL from old V0 format 2268 * to V1 format? 2269 */ 2270 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2271 zfs_znode_acl_version(zp) == 2272 ZFS_ACL_VERSION_INITIAL) { 2273 dmu_tx_hold_free(tx, acl_obj, 0, 2274 DMU_OBJECT_END); 2275 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2276 0, aclp->z_acl_bytes); 2277 } else { 2278 dmu_tx_hold_write(tx, acl_obj, 0, 2279 aclp->z_acl_bytes); 2280 } 2281 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2282 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2283 0, aclp->z_acl_bytes); 2284 } 2285 mutex_exit(&zp->z_lock); 2286 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2287 } else { 2288 if (((mask & ATTR_XVATTR) && 2289 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2290 (projid != ZFS_INVALID_PROJID && 2291 !(zp->z_pflags & ZFS_PROJID))) 2292 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2293 else 2294 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2295 } 2296 2297 if (attrzp) { 2298 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2299 } 2300 2301 fuid_dirtied = zfsvfs->z_fuid_dirty; 2302 if (fuid_dirtied) 2303 zfs_fuid_txhold(zfsvfs, tx); 2304 2305 zfs_sa_upgrade_txholds(tx, zp); 2306 2307 err = dmu_tx_assign(tx, TXG_WAIT); 2308 if (err) 2309 goto out; 2310 2311 count = 0; 2312 /* 2313 * Set each attribute requested. 2314 * We group settings according to the locks they need to acquire. 2315 * 2316 * Note: you cannot set ctime directly, although it will be 2317 * updated as a side-effect of calling this function. 2318 */ 2319 2320 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2321 /* 2322 * For the existed object that is upgraded from old system, 2323 * its on-disk layout has no slot for the project ID attribute. 2324 * But quota accounting logic needs to access related slots by 2325 * offset directly. 
So we need to adjust old objects' layout 2326 * to make the project ID to some unified and fixed offset. 2327 */ 2328 if (attrzp) 2329 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2330 if (err == 0) 2331 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2332 2333 if (unlikely(err == EEXIST)) 2334 err = 0; 2335 else if (err != 0) 2336 goto out; 2337 else 2338 projid = ZFS_INVALID_PROJID; 2339 } 2340 2341 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2342 mutex_enter(&zp->z_acl_lock); 2343 mutex_enter(&zp->z_lock); 2344 2345 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2346 &zp->z_pflags, sizeof (zp->z_pflags)); 2347 2348 if (attrzp) { 2349 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2350 mutex_enter(&attrzp->z_acl_lock); 2351 mutex_enter(&attrzp->z_lock); 2352 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2353 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2354 sizeof (attrzp->z_pflags)); 2355 if (projid != ZFS_INVALID_PROJID) { 2356 attrzp->z_projid = projid; 2357 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2358 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2359 sizeof (attrzp->z_projid)); 2360 } 2361 } 2362 2363 if (mask & (ATTR_UID|ATTR_GID)) { 2364 2365 if (mask & ATTR_UID) { 2366 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2367 new_uid = zfs_uid_read(ZTOI(zp)); 2368 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2369 &new_uid, sizeof (new_uid)); 2370 if (attrzp) { 2371 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2372 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2373 sizeof (new_uid)); 2374 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2375 } 2376 } 2377 2378 if (mask & ATTR_GID) { 2379 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2380 new_gid = zfs_gid_read(ZTOI(zp)); 2381 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2382 NULL, &new_gid, sizeof (new_gid)); 2383 if (attrzp) { 2384 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2385 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2386 sizeof (new_gid)); 2387 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2388 } 2389 } 2390 if 
(!(mask & ATTR_MODE)) { 2391 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2392 NULL, &new_mode, sizeof (new_mode)); 2393 new_mode = zp->z_mode; 2394 } 2395 err = zfs_acl_chown_setattr(zp); 2396 ASSERT(err == 0); 2397 if (attrzp) { 2398 err = zfs_acl_chown_setattr(attrzp); 2399 ASSERT(err == 0); 2400 } 2401 } 2402 2403 if (mask & ATTR_MODE) { 2404 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2405 &new_mode, sizeof (new_mode)); 2406 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2407 ASSERT3P(aclp, !=, NULL); 2408 err = zfs_aclset_common(zp, aclp, cr, tx); 2409 ASSERT0(err); 2410 if (zp->z_acl_cached) 2411 zfs_acl_free(zp->z_acl_cached); 2412 zp->z_acl_cached = aclp; 2413 aclp = NULL; 2414 } 2415 2416 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2417 zp->z_atime_dirty = B_FALSE; 2418 ZFS_TIME_ENCODE(&ip->i_atime, atime); 2419 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2420 &atime, sizeof (atime)); 2421 } 2422 2423 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2424 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2425 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( 2426 vap->va_mtime, ZTOI(zp)); 2427 2428 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2429 mtime, sizeof (mtime)); 2430 } 2431 2432 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2433 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2434 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, 2435 ZTOI(zp)); 2436 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2437 ctime, sizeof (ctime)); 2438 } 2439 2440 if (projid != ZFS_INVALID_PROJID) { 2441 zp->z_projid = projid; 2442 SA_ADD_BULK_ATTR(bulk, count, 2443 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2444 sizeof (zp->z_projid)); 2445 } 2446 2447 if (attrzp && mask) { 2448 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2449 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2450 sizeof (ctime)); 2451 } 2452 2453 /* 2454 * Do this after setting timestamps to prevent timestamp 2455 * update from toggling bit 2456 */ 2457 2458 if (xoap && (mask & 
ATTR_XVATTR)) { 2459 2460 /* 2461 * restore trimmed off masks 2462 * so that return masks can be set for caller. 2463 */ 2464 2465 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2466 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2467 } 2468 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2469 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2470 } 2471 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2472 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2473 } 2474 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2475 XVA_SET_REQ(xvap, XAT_NODUMP); 2476 } 2477 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2478 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2479 } 2480 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2481 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2482 } 2483 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2484 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2485 } 2486 2487 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2488 ASSERT(S_ISREG(ip->i_mode)); 2489 2490 zfs_xvattr_set(zp, xvap, tx); 2491 } 2492 2493 if (fuid_dirtied) 2494 zfs_fuid_sync(zfsvfs, tx); 2495 2496 if (mask != 0) 2497 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2498 2499 mutex_exit(&zp->z_lock); 2500 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2501 mutex_exit(&zp->z_acl_lock); 2502 2503 if (attrzp) { 2504 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2505 mutex_exit(&attrzp->z_acl_lock); 2506 mutex_exit(&attrzp->z_lock); 2507 } 2508 out: 2509 if (err == 0 && xattr_count > 0) { 2510 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2511 xattr_count, tx); 2512 ASSERT(err2 == 0); 2513 } 2514 2515 if (aclp) 2516 zfs_acl_free(aclp); 2517 2518 if (fuidp) { 2519 zfs_fuid_info_free(fuidp); 2520 fuidp = NULL; 2521 } 2522 2523 if (err) { 2524 dmu_tx_abort(tx); 2525 if (attrzp) 2526 zrele(attrzp); 2527 if (err == ERESTART) 2528 goto top; 2529 } else { 2530 if (count > 0) 2531 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2532 dmu_tx_commit(tx); 2533 if (attrzp) { 2534 if (err2 == 0 && handle_eadir) 2535 err = zfs_setattr_dir(attrzp); 2536 
zrele(attrzp);
		}
		zfs_znode_update_vfs(zp);
	}

out2:
	if (os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

out3:
	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
	kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
	kmem_free(tmpxvattr, sizeof (xvattr_t));
	zfs_exit(zfsvfs, FTAG);
	return (err);
}

/*
 * One entry in the singly linked list of parent locks built by
 * zfs_rename_lock() and torn down by zfs_rename_unlock().
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 *
 * Walks the list headed by *zlpp: for every entry the held znode (if any)
 * is released asynchronously, the rwlock is exited, and the entry is freed.
 * On return *zlpp is NULL.
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			zfs_zrele_async(zl->zl_znode);
		rw_exit(zl->zl_rwlock);
		/* Unlink the entry before freeing it. */
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- target directory; the walk starts here.
 *		sdzp	- source directory; the walk stops when it is reached.
 *
 *	OUT:	zlpp	- list of locks taken.  Entries are pushed before the
 *			  checks below run, so the list may be non-empty even
 *			  when an error is returned; the caller releases it
 *			  with zfs_rename_unlock().
 *
 *	RETURN:	0 on success, EINVAL if the walk reaches szp (tdzp is szp
 *		or lies beneath it), or the error returned by zfs_zget().
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = ZTOZSB(zp)->z_root;
	uint64_t	oidp = zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * rw is only RW_READER after a full pass, so
				 * zl has been assigned and equals the list
				 * head at this point.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the head of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (SET_ERROR(EINVAL));

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
			if (error)
				return (error);
			/* The hold is dropped by zfs_rename_unlock(). */
			zl->zl_znode = zp;
		}
		/* Step up: oidp becomes the parent object id of zp. */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdzp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdzp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		flags	- case flags
 *		rflags  - RENAME_* flags
 *		wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
 *		mnt_ns	- user namespace of the mount
 *
 *	RETURN:	0 on success, error code on failure.
2665 * 2666 * Timestamps: 2667 * sdzp,tdzp - ctime|mtime updated 2668 */ 2669 int 2670 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2671 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) 2672 { 2673 znode_t *szp, *tzp; 2674 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2675 zilog_t *zilog; 2676 zfs_dirlock_t *sdl, *tdl; 2677 dmu_tx_t *tx; 2678 zfs_zlock_t *zl; 2679 int cmp, serr, terr; 2680 int error = 0; 2681 int zflg = 0; 2682 boolean_t waited = B_FALSE; 2683 /* Needed for whiteout inode creation. */ 2684 boolean_t fuid_dirtied; 2685 zfs_acl_ids_t acl_ids; 2686 boolean_t have_acl = B_FALSE; 2687 znode_t *wzp = NULL; 2688 2689 2690 if (snm == NULL || tnm == NULL) 2691 return (SET_ERROR(EINVAL)); 2692 2693 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2694 return (SET_ERROR(EINVAL)); 2695 2696 /* Already checked by Linux VFS, but just to make sure. */ 2697 if (rflags & RENAME_EXCHANGE && 2698 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2699 return (SET_ERROR(EINVAL)); 2700 2701 /* 2702 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2703 * right kind of vattr_t for the whiteout file. These are set 2704 * internally by ZFS so should never be incorrect. 2705 */ 2706 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2707 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2708 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2709 2710 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2711 return (error); 2712 zilog = zfsvfs->z_log; 2713 2714 if ((error = zfs_verify_zp(tdzp)) != 0) { 2715 zfs_exit(zfsvfs, FTAG); 2716 return (error); 2717 } 2718 2719 /* 2720 * We check i_sb because snapshots and the ctldir must have different 2721 * super blocks. 
2722 */ 2723 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2724 zfsctl_is_node(ZTOI(tdzp))) { 2725 zfs_exit(zfsvfs, FTAG); 2726 return (SET_ERROR(EXDEV)); 2727 } 2728 2729 if (zfsvfs->z_utf8 && u8_validate(tnm, 2730 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2731 zfs_exit(zfsvfs, FTAG); 2732 return (SET_ERROR(EILSEQ)); 2733 } 2734 2735 if (flags & FIGNORECASE) 2736 zflg |= ZCILOOK; 2737 2738 top: 2739 szp = NULL; 2740 tzp = NULL; 2741 zl = NULL; 2742 2743 /* 2744 * This is to prevent the creation of links into attribute space 2745 * by renaming a linked file into/outof an attribute directory. 2746 * See the comment in zfs_link() for why this is considered bad. 2747 */ 2748 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2749 zfs_exit(zfsvfs, FTAG); 2750 return (SET_ERROR(EINVAL)); 2751 } 2752 2753 /* 2754 * Lock source and target directory entries. To prevent deadlock, 2755 * a lock ordering must be defined. We lock the directory with 2756 * the smallest object id first, or if it's a tie, the one with 2757 * the lexically first name. 2758 */ 2759 if (sdzp->z_id < tdzp->z_id) { 2760 cmp = -1; 2761 } else if (sdzp->z_id > tdzp->z_id) { 2762 cmp = 1; 2763 } else { 2764 /* 2765 * First compare the two name arguments without 2766 * considering any case folding. 2767 */ 2768 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2769 2770 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2771 ASSERT(error == 0 || !zfsvfs->z_utf8); 2772 if (cmp == 0) { 2773 /* 2774 * POSIX: "If the old argument and the new argument 2775 * both refer to links to the same existing file, 2776 * the rename() function shall return successfully 2777 * and perform no other action." 2778 */ 2779 zfs_exit(zfsvfs, FTAG); 2780 return (0); 2781 } 2782 /* 2783 * If the file system is case-folding, then we may 2784 * have some more checking to do. 
A case-folding file 2785 * system is either supporting mixed case sensitivity 2786 * access or is completely case-insensitive. Note 2787 * that the file system is always case preserving. 2788 * 2789 * In mixed sensitivity mode case sensitive behavior 2790 * is the default. FIGNORECASE must be used to 2791 * explicitly request case insensitive behavior. 2792 * 2793 * If the source and target names provided differ only 2794 * by case (e.g., a request to rename 'tim' to 'Tim'), 2795 * we will treat this as a special case in the 2796 * case-insensitive mode: as long as the source name 2797 * is an exact match, we will allow this to proceed as 2798 * a name-change request. 2799 */ 2800 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2801 (zfsvfs->z_case == ZFS_CASE_MIXED && 2802 flags & FIGNORECASE)) && 2803 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2804 &error) == 0) { 2805 /* 2806 * case preserving rename request, require exact 2807 * name matches 2808 */ 2809 zflg |= ZCIEXACT; 2810 zflg &= ~ZCILOOK; 2811 } 2812 } 2813 2814 /* 2815 * If the source and destination directories are the same, we should 2816 * grab the z_name_lock of that directory only once. 2817 */ 2818 if (sdzp == tdzp) { 2819 zflg |= ZHAVELOCK; 2820 rw_enter(&sdzp->z_name_lock, RW_READER); 2821 } 2822 2823 if (cmp < 0) { 2824 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2825 ZEXISTS | zflg, NULL, NULL); 2826 terr = zfs_dirent_lock(&tdl, 2827 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2828 } else { 2829 terr = zfs_dirent_lock(&tdl, 2830 tdzp, tnm, &tzp, zflg, NULL, NULL); 2831 serr = zfs_dirent_lock(&sdl, 2832 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2833 NULL, NULL); 2834 } 2835 2836 if (serr) { 2837 /* 2838 * Source entry invalid or not there. 
2839 */ 2840 if (!terr) { 2841 zfs_dirent_unlock(tdl); 2842 if (tzp) 2843 zrele(tzp); 2844 } 2845 2846 if (sdzp == tdzp) 2847 rw_exit(&sdzp->z_name_lock); 2848 2849 if (strcmp(snm, "..") == 0) 2850 serr = EINVAL; 2851 zfs_exit(zfsvfs, FTAG); 2852 return (serr); 2853 } 2854 if (terr) { 2855 zfs_dirent_unlock(sdl); 2856 zrele(szp); 2857 2858 if (sdzp == tdzp) 2859 rw_exit(&sdzp->z_name_lock); 2860 2861 if (strcmp(tnm, "..") == 0) 2862 terr = EINVAL; 2863 zfs_exit(zfsvfs, FTAG); 2864 return (terr); 2865 } 2866 2867 /* 2868 * If we are using project inheritance, means if the directory has 2869 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2870 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2871 * such case, we only allow renames into our tree when the project 2872 * IDs are the same. 2873 */ 2874 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2875 tdzp->z_projid != szp->z_projid) { 2876 error = SET_ERROR(EXDEV); 2877 goto out; 2878 } 2879 2880 /* 2881 * Must have write access at the source to remove the old entry 2882 * and write access at the target to create the new entry. 2883 * Note that if target and source are the same, this can be 2884 * done in a single check. 2885 */ 2886 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2887 goto out; 2888 2889 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2890 /* 2891 * Check to make sure rename is valid. 2892 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2893 */ 2894 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2895 goto out; 2896 } 2897 2898 /* 2899 * Does target exist? 2900 */ 2901 if (tzp) { 2902 if (rflags & RENAME_NOREPLACE) { 2903 error = SET_ERROR(EEXIST); 2904 goto out; 2905 } 2906 /* 2907 * Source and target must be the same type (unless exchanging). 
2908 */ 2909 if (!(rflags & RENAME_EXCHANGE)) { 2910 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 2911 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 2912 2913 if (s_is_dir != t_is_dir) { 2914 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 2915 goto out; 2916 } 2917 } 2918 /* 2919 * POSIX dictates that when the source and target 2920 * entries refer to the same file object, rename 2921 * must do nothing and exit without error. 2922 */ 2923 if (szp->z_id == tzp->z_id) { 2924 error = 0; 2925 goto out; 2926 } 2927 } else if (rflags & RENAME_EXCHANGE) { 2928 /* Target must exist for RENAME_EXCHANGE. */ 2929 error = SET_ERROR(ENOENT); 2930 goto out; 2931 } 2932 2933 /* Set up inode creation for RENAME_WHITEOUT. */ 2934 if (rflags & RENAME_WHITEOUT) { 2935 /* 2936 * Whiteout files are not regular files or directories, so to 2937 * match zfs_create() we do not inherit the project id. 2938 */ 2939 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 2940 2941 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 2942 if (error) 2943 goto out; 2944 2945 if (!have_acl) { 2946 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 2947 &acl_ids, mnt_ns); 2948 if (error) 2949 goto out; 2950 have_acl = B_TRUE; 2951 } 2952 2953 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 2954 error = SET_ERROR(EDQUOT); 2955 goto out; 2956 } 2957 } 2958 2959 tx = dmu_tx_create(zfsvfs->z_os); 2960 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 2961 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2962 dmu_tx_hold_zap(tx, sdzp->z_id, 2963 (rflags & RENAME_EXCHANGE) ? 
TRUE : FALSE, snm); 2964 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 2965 if (sdzp != tdzp) { 2966 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 2967 zfs_sa_upgrade_txholds(tx, tdzp); 2968 } 2969 if (tzp) { 2970 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 2971 zfs_sa_upgrade_txholds(tx, tzp); 2972 } 2973 if (rflags & RENAME_WHITEOUT) { 2974 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2975 ZFS_SA_BASE_ATTR_SIZE); 2976 2977 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 2978 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 2979 if (!zfsvfs->z_use_sa && 2980 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2981 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2982 0, acl_ids.z_aclp->z_acl_bytes); 2983 } 2984 } 2985 fuid_dirtied = zfsvfs->z_fuid_dirty; 2986 if (fuid_dirtied) 2987 zfs_fuid_txhold(zfsvfs, tx); 2988 zfs_sa_upgrade_txholds(tx, szp); 2989 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2990 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2991 if (error) { 2992 if (zl != NULL) 2993 zfs_rename_unlock(&zl); 2994 zfs_dirent_unlock(sdl); 2995 zfs_dirent_unlock(tdl); 2996 2997 if (sdzp == tdzp) 2998 rw_exit(&sdzp->z_name_lock); 2999 3000 if (error == ERESTART) { 3001 waited = B_TRUE; 3002 dmu_tx_wait(tx); 3003 dmu_tx_abort(tx); 3004 zrele(szp); 3005 if (tzp) 3006 zrele(tzp); 3007 goto top; 3008 } 3009 dmu_tx_abort(tx); 3010 zrele(szp); 3011 if (tzp) 3012 zrele(tzp); 3013 zfs_exit(zfsvfs, FTAG); 3014 return (error); 3015 } 3016 3017 /* 3018 * Unlink the source. 3019 */ 3020 szp->z_pflags |= ZFS_AV_MODIFIED; 3021 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3022 szp->z_pflags |= ZFS_PROJINHERIT; 3023 3024 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3025 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3026 VERIFY0(error); 3027 3028 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3029 if (error) 3030 goto commit; 3031 3032 /* 3033 * Unlink the target. 
3034 */ 3035 if (tzp) { 3036 int tzflg = zflg; 3037 3038 if (rflags & RENAME_EXCHANGE) { 3039 /* This inode will be re-linked soon. */ 3040 tzflg |= ZRENAMING; 3041 3042 tzp->z_pflags |= ZFS_AV_MODIFIED; 3043 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3044 tzp->z_pflags |= ZFS_PROJINHERIT; 3045 3046 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3047 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3048 ASSERT0(error); 3049 } 3050 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3051 if (error) 3052 goto commit_link_szp; 3053 } 3054 3055 /* 3056 * Create the new target links: 3057 * * We always link the target. 3058 * * RENAME_EXCHANGE: Link the old target to the source. 3059 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3060 */ 3061 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3062 if (error) { 3063 /* 3064 * If we have removed the existing target, a subsequent call to 3065 * zfs_link_create() to add back the same entry, but with a new 3066 * dnode (szp), should not fail. 3067 */ 3068 ASSERT3P(tzp, ==, NULL); 3069 goto commit_link_tzp; 3070 } 3071 3072 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3073 case RENAME_EXCHANGE: 3074 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3075 /* 3076 * The same argument as zfs_link_create() failing for 3077 * szp applies here, since the source directory must 3078 * have had an entry we are replacing. 3079 */ 3080 ASSERT0(error); 3081 if (error) 3082 goto commit_unlink_td_szp; 3083 break; 3084 case RENAME_WHITEOUT: 3085 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3086 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3087 if (error) { 3088 zfs_znode_delete(wzp, tx); 3089 remove_inode_hash(ZTOI(wzp)); 3090 goto commit_unlink_td_szp; 3091 } 3092 break; 3093 } 3094 3095 if (fuid_dirtied) 3096 zfs_fuid_sync(zfsvfs, tx); 3097 3098 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3099 case RENAME_EXCHANGE: 3100 zfs_log_rename_exchange(zilog, tx, 3101 (flags & FIGNORECASE ? 
TX_CI : 0), sdzp, sdl->dl_name, 3102 tdzp, tdl->dl_name, szp); 3103 break; 3104 case RENAME_WHITEOUT: 3105 zfs_log_rename_whiteout(zilog, tx, 3106 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3107 tdzp, tdl->dl_name, szp, wzp); 3108 break; 3109 default: 3110 ASSERT0(rflags & ~RENAME_NOREPLACE); 3111 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3112 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3113 break; 3114 } 3115 3116 commit: 3117 dmu_tx_commit(tx); 3118 out: 3119 if (have_acl) 3120 zfs_acl_ids_free(&acl_ids); 3121 3122 zfs_znode_update_vfs(sdzp); 3123 if (sdzp == tdzp) 3124 rw_exit(&sdzp->z_name_lock); 3125 3126 if (sdzp != tdzp) 3127 zfs_znode_update_vfs(tdzp); 3128 3129 zfs_znode_update_vfs(szp); 3130 zrele(szp); 3131 if (wzp) { 3132 zfs_znode_update_vfs(wzp); 3133 zrele(wzp); 3134 } 3135 if (tzp) { 3136 zfs_znode_update_vfs(tzp); 3137 zrele(tzp); 3138 } 3139 3140 if (zl != NULL) 3141 zfs_rename_unlock(&zl); 3142 3143 zfs_dirent_unlock(sdl); 3144 zfs_dirent_unlock(tdl); 3145 3146 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3147 zil_commit(zilog, 0); 3148 3149 zfs_exit(zfsvfs, FTAG); 3150 return (error); 3151 3152 /* 3153 * Clean-up path for broken link state. 3154 * 3155 * At this point we are in a (very) bad state, so we need to do our 3156 * best to correct the state. In particular, all of the nlinks are 3157 * wrong because we were destroying and creating links with ZRENAMING. 3158 * 3159 * In some form, all of these operations have to resolve the state: 3160 * 3161 * * link_destroy() *must* succeed. Fortunately, this is very likely 3162 * since we only just created it. 3163 * 3164 * * link_create()s are allowed to fail (though they shouldn't because 3165 * we only just unlinked them and are putting the entries back 3166 * during clean-up). 
But if they fail, we can just forcefully drop 3167 * the nlink value to (at the very least) avoid broken nlink values 3168 * -- though in the case of non-empty directories we will have to 3169 * panic (otherwise we'd have a leaked directory with a broken ..). 3170 */ 3171 commit_unlink_td_szp: 3172 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3173 commit_link_tzp: 3174 if (tzp) { 3175 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3176 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3177 } 3178 commit_link_szp: 3179 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3180 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3181 goto commit; 3182 } 3183 3184 /* 3185 * Insert the indicated symbolic reference entry into the directory. 3186 * 3187 * IN: dzp - Directory to contain new symbolic link. 3188 * name - Name of directory entry in dip. 3189 * vap - Attributes of new entry. 3190 * link - Name for new symlink entry. 3191 * cr - credentials of caller. 3192 * flags - case flags 3193 * mnt_ns - user namespace of the mount 3194 * 3195 * OUT: zpp - Znode for new symbolic link. 3196 * 3197 * RETURN: 0 on success, error code on failure. 
3198 * 3199 * Timestamps: 3200 * dip - ctime|mtime updated 3201 */ 3202 int 3203 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3204 znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns) 3205 { 3206 znode_t *zp; 3207 zfs_dirlock_t *dl; 3208 dmu_tx_t *tx; 3209 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3210 zilog_t *zilog; 3211 uint64_t len = strlen(link); 3212 int error; 3213 int zflg = ZNEW; 3214 zfs_acl_ids_t acl_ids; 3215 boolean_t fuid_dirtied; 3216 uint64_t txtype = TX_SYMLINK; 3217 boolean_t waited = B_FALSE; 3218 3219 ASSERT(S_ISLNK(vap->va_mode)); 3220 3221 if (name == NULL) 3222 return (SET_ERROR(EINVAL)); 3223 3224 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3225 return (error); 3226 zilog = zfsvfs->z_log; 3227 3228 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3229 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3230 zfs_exit(zfsvfs, FTAG); 3231 return (SET_ERROR(EILSEQ)); 3232 } 3233 if (flags & FIGNORECASE) 3234 zflg |= ZCILOOK; 3235 3236 if (len > MAXPATHLEN) { 3237 zfs_exit(zfsvfs, FTAG); 3238 return (SET_ERROR(ENAMETOOLONG)); 3239 } 3240 3241 if ((error = zfs_acl_ids_create(dzp, 0, 3242 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3243 zfs_exit(zfsvfs, FTAG); 3244 return (error); 3245 } 3246 top: 3247 *zpp = NULL; 3248 3249 /* 3250 * Attempt to lock directory; fail if entry already exists. 
3251 */ 3252 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3253 if (error) { 3254 zfs_acl_ids_free(&acl_ids); 3255 zfs_exit(zfsvfs, FTAG); 3256 return (error); 3257 } 3258 3259 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3260 zfs_acl_ids_free(&acl_ids); 3261 zfs_dirent_unlock(dl); 3262 zfs_exit(zfsvfs, FTAG); 3263 return (error); 3264 } 3265 3266 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3267 zfs_acl_ids_free(&acl_ids); 3268 zfs_dirent_unlock(dl); 3269 zfs_exit(zfsvfs, FTAG); 3270 return (SET_ERROR(EDQUOT)); 3271 } 3272 tx = dmu_tx_create(zfsvfs->z_os); 3273 fuid_dirtied = zfsvfs->z_fuid_dirty; 3274 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3275 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3276 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3277 ZFS_SA_BASE_ATTR_SIZE + len); 3278 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3279 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3280 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3281 acl_ids.z_aclp->z_acl_bytes); 3282 } 3283 if (fuid_dirtied) 3284 zfs_fuid_txhold(zfsvfs, tx); 3285 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3286 if (error) { 3287 zfs_dirent_unlock(dl); 3288 if (error == ERESTART) { 3289 waited = B_TRUE; 3290 dmu_tx_wait(tx); 3291 dmu_tx_abort(tx); 3292 goto top; 3293 } 3294 zfs_acl_ids_free(&acl_ids); 3295 dmu_tx_abort(tx); 3296 zfs_exit(zfsvfs, FTAG); 3297 return (error); 3298 } 3299 3300 /* 3301 * Create a new object for the symlink. 
3302 * for version 4 ZPL datasets the symlink will be an SA attribute 3303 */ 3304 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3305 3306 if (fuid_dirtied) 3307 zfs_fuid_sync(zfsvfs, tx); 3308 3309 mutex_enter(&zp->z_lock); 3310 if (zp->z_is_sa) 3311 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3312 link, len, tx); 3313 else 3314 zfs_sa_symlink(zp, link, len, tx); 3315 mutex_exit(&zp->z_lock); 3316 3317 zp->z_size = len; 3318 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3319 &zp->z_size, sizeof (zp->z_size), tx); 3320 /* 3321 * Insert the new object into the directory. 3322 */ 3323 error = zfs_link_create(dl, zp, tx, ZNEW); 3324 if (error != 0) { 3325 zfs_znode_delete(zp, tx); 3326 remove_inode_hash(ZTOI(zp)); 3327 } else { 3328 if (flags & FIGNORECASE) 3329 txtype |= TX_CI; 3330 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3331 3332 zfs_znode_update_vfs(dzp); 3333 zfs_znode_update_vfs(zp); 3334 } 3335 3336 zfs_acl_ids_free(&acl_ids); 3337 3338 dmu_tx_commit(tx); 3339 3340 zfs_dirent_unlock(dl); 3341 3342 if (error == 0) { 3343 *zpp = zp; 3344 3345 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3346 zil_commit(zilog, 0); 3347 } else { 3348 zrele(zp); 3349 } 3350 3351 zfs_exit(zfsvfs, FTAG); 3352 return (error); 3353 } 3354 3355 /* 3356 * Return, in the buffer contained in the provided uio structure, 3357 * the symbolic path referred to by ip. 3358 * 3359 * IN: ip - inode of symbolic link 3360 * uio - structure to contain the link path. 3361 * cr - credentials of caller. 
*
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 */
int
zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
{
	(void) cr;
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	int		error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* The target is read from the SA attribute or via the legacy path. */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Insert a new entry into directory tdzp referencing szp.
 *
 *	IN:	tdzp	- Directory to contain new entry.
 *		szp	- znode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		flags	- case flags.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdzp - ctime|mtime updated
 *	 szp - ctime updated
 */
int
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
    int flags)
{
	struct inode *sip = ZTOI(szp);
	znode_t		*tzp;
	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;
	boolean_t	is_tmpfile = 0;
	uint64_t	txg;
#ifdef HAVE_TMPFILE
	/* A linkable inode with no links is treated as a tmpfile. */
	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
#endif
	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (S_ISDIR(sip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If we are using project inheritance, means if the directory has
	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
	 * such case, we only allow hard link creation in our tree when the
	 * project IDs are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * We check i_sb because snapshots and the ctldir must have different
	 * super blocks.
	 */
	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* Callers that do not own the file need the basic-link privilege. */
	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
	    cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
	    kcred->user_ns))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
	if (is_tmpfile)
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, tdzp);
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			/* Wait for the tx, then retry from the dirent lock. */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/* unmark z_unlinked so zfs_link_create will not reject */
	if (is_tmpfile)
		szp->z_unlinked = B_FALSE;
	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		/*
		 * tmpfile is created to be in z_unlinkedobj, so remove it.
		 * Also, we don't log in ZIL, because all previous file
		 * operation on the tmpfile are ignored by ZIL. Instead we
		 * always wait for txg to sync to make sure all previous
		 * operation are sync safe.
		 */
		if (is_tmpfile) {
			VERIFY(zap_remove_int(zfsvfs->z_os,
			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
		} else {
			if (flags & FIGNORECASE)
				txtype |= TX_CI;
			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
		}
	} else if (is_tmpfile) {
		/* restore z_unlinked since linking failed */
		szp->z_unlinked = B_TRUE;
	}
	/* Capture the txg before commit; used by the tmpfile wait below. */
	txg = dmu_tx_get_txg(tx);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	/*
	 * NOTE(review): this wait is not conditional on error, so it also
	 * runs when zfs_link_create() failed for a tmpfile -- confirm that
	 * is intended.
	 */
	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);

	zfs_znode_update_vfs(tdzp);
	zfs_znode_update_vfs(szp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Commit callback taking the page as its argument: clears the page's
 * error flag and ends writeback on it.
 */
static void
zfs_putpage_sync_commit_cb(void *arg)
{
	struct page *pp = arg;

	ClearPageError(pp);
	end_page_writeback(pp);
}

/*
 * Same as zfs_putpage_sync_commit_cb(), and additionally decrements
 * z_async_writes_cnt on the znode that owns the page's mapping.
 */
static void
zfs_putpage_async_commit_cb(void *arg)
{
	struct page *pp = arg;
	znode_t *zp = ITOZ(pp->mapping->host);

	ClearPageError(pp);
	end_page_writeback(pp);
	atomic_dec_32(&zp->z_async_writes_cnt);
}

/*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
 *
 *	IN:	ip	 - page mapped for inode.
 *		pp	 - page to push (page is locked)
 *		wbc	 - writeback control data
 *		for_sync - does the caller intend to wait synchronously for the
 *			   page writeback to complete?
3620 * 3621 * RETURN: 0 if success 3622 * error code if failure 3623 * 3624 * Timestamps: 3625 * ip - ctime|mtime updated 3626 */ 3627 int 3628 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, 3629 boolean_t for_sync) 3630 { 3631 znode_t *zp = ITOZ(ip); 3632 zfsvfs_t *zfsvfs = ITOZSB(ip); 3633 loff_t offset; 3634 loff_t pgoff; 3635 unsigned int pglen; 3636 dmu_tx_t *tx; 3637 caddr_t va; 3638 int err = 0; 3639 uint64_t mtime[2], ctime[2]; 3640 sa_bulk_attr_t bulk[3]; 3641 int cnt = 0; 3642 struct address_space *mapping; 3643 3644 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3645 return (err); 3646 3647 ASSERT(PageLocked(pp)); 3648 3649 pgoff = page_offset(pp); /* Page byte-offset in file */ 3650 offset = i_size_read(ip); /* File length in bytes */ 3651 pglen = MIN(PAGE_SIZE, /* Page length in bytes */ 3652 P2ROUNDUP(offset, PAGE_SIZE)-pgoff); 3653 3654 /* Page is beyond end of file */ 3655 if (pgoff >= offset) { 3656 unlock_page(pp); 3657 zfs_exit(zfsvfs, FTAG); 3658 return (0); 3659 } 3660 3661 /* Truncate page length to end of file */ 3662 if (pgoff + pglen > offset) 3663 pglen = offset - pgoff; 3664 3665 #if 0 3666 /* 3667 * FIXME: Allow mmap writes past its quota. The correct fix 3668 * is to register a page_mkwrite() handler to count the page 3669 * against its quota when it is about to be dirtied. 3670 */ 3671 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 3672 KUID_TO_SUID(ip->i_uid)) || 3673 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3674 KGID_TO_SGID(ip->i_gid)) || 3675 (zp->z_projid != ZFS_DEFAULT_PROJID && 3676 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 3677 zp->z_projid))) { 3678 err = EDQUOT; 3679 } 3680 #endif 3681 3682 /* 3683 * The ordering here is critical and must adhere to the following 3684 * rules in order to avoid deadlocking in either zfs_read() or 3685 * zfs_free_range() due to a lock inversion. 3686 * 3687 * 1) The page must be unlocked prior to acquiring the range lock. 
3688 * This is critical because zfs_read() calls find_lock_page() 3689 * which may block on the page lock while holding the range lock. 3690 * 3691 * 2) Before setting or clearing write back on a page the range lock 3692 * must be held in order to prevent a lock inversion with the 3693 * zfs_free_range() function. 3694 * 3695 * This presents a problem because upon entering this function the 3696 * page lock is already held. To safely acquire the range lock the 3697 * page lock must be dropped. This creates a window where another 3698 * process could truncate, invalidate, dirty, or write out the page. 3699 * 3700 * Therefore, after successfully reacquiring the range and page locks 3701 * the current page state is checked. In the common case everything 3702 * will be as is expected and it can be written out. However, if 3703 * the page state has changed it must be handled accordingly. 3704 */ 3705 mapping = pp->mapping; 3706 redirty_page_for_writepage(wbc, pp); 3707 unlock_page(pp); 3708 3709 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 3710 pgoff, pglen, RL_WRITER); 3711 lock_page(pp); 3712 3713 /* Page mapping changed or it was no longer dirty, we're done */ 3714 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { 3715 unlock_page(pp); 3716 zfs_rangelock_exit(lr); 3717 zfs_exit(zfsvfs, FTAG); 3718 return (0); 3719 } 3720 3721 /* Another process started write block if required */ 3722 if (PageWriteback(pp)) { 3723 unlock_page(pp); 3724 zfs_rangelock_exit(lr); 3725 3726 if (wbc->sync_mode != WB_SYNC_NONE) { 3727 /* 3728 * Speed up any non-sync page writebacks since 3729 * they may take several seconds to complete. 3730 * Refer to the comment in zpl_fsync() (when 3731 * HAVE_FSYNC_RANGE is defined) for details. 
3732 */ 3733 if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { 3734 zil_commit(zfsvfs->z_log, zp->z_id); 3735 } 3736 3737 if (PageWriteback(pp)) 3738 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT 3739 folio_wait_bit(page_folio(pp), PG_writeback); 3740 #else 3741 wait_on_page_bit(pp, PG_writeback); 3742 #endif 3743 } 3744 3745 zfs_exit(zfsvfs, FTAG); 3746 return (0); 3747 } 3748 3749 /* Clear the dirty flag the required locks are held */ 3750 if (!clear_page_dirty_for_io(pp)) { 3751 unlock_page(pp); 3752 zfs_rangelock_exit(lr); 3753 zfs_exit(zfsvfs, FTAG); 3754 return (0); 3755 } 3756 3757 /* 3758 * Counterpart for redirty_page_for_writepage() above. This page 3759 * was in fact not skipped and should not be counted as if it were. 3760 */ 3761 wbc->pages_skipped--; 3762 if (!for_sync) 3763 atomic_inc_32(&zp->z_async_writes_cnt); 3764 set_page_writeback(pp); 3765 unlock_page(pp); 3766 3767 tx = dmu_tx_create(zfsvfs->z_os); 3768 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); 3769 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3770 zfs_sa_upgrade_txholds(tx, zp); 3771 3772 err = dmu_tx_assign(tx, TXG_NOWAIT); 3773 if (err != 0) { 3774 if (err == ERESTART) 3775 dmu_tx_wait(tx); 3776 3777 dmu_tx_abort(tx); 3778 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO 3779 filemap_dirty_folio(page_mapping(pp), page_folio(pp)); 3780 #else 3781 __set_page_dirty_nobuffers(pp); 3782 #endif 3783 ClearPageError(pp); 3784 end_page_writeback(pp); 3785 if (!for_sync) 3786 atomic_dec_32(&zp->z_async_writes_cnt); 3787 zfs_rangelock_exit(lr); 3788 zfs_exit(zfsvfs, FTAG); 3789 return (err); 3790 } 3791 3792 va = kmap(pp); 3793 ASSERT3U(pglen, <=, PAGE_SIZE); 3794 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); 3795 kunmap(pp); 3796 3797 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3798 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3799 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, 3800 &zp->z_pflags, 8); 3801 3802 /* Preserve the mtime and ctime provided 
by the inode */ 3803 ZFS_TIME_ENCODE(&ip->i_mtime, mtime); 3804 ZFS_TIME_ENCODE(&ip->i_ctime, ctime); 3805 zp->z_atime_dirty = B_FALSE; 3806 zp->z_seq++; 3807 3808 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3809 3810 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, 3811 for_sync ? zfs_putpage_sync_commit_cb : 3812 zfs_putpage_async_commit_cb, pp); 3813 3814 dmu_tx_commit(tx); 3815 3816 zfs_rangelock_exit(lr); 3817 3818 if (wbc->sync_mode != WB_SYNC_NONE) { 3819 /* 3820 * Note that this is rarely called under writepages(), because 3821 * writepages() normally handles the entire commit for 3822 * performance reasons. 3823 */ 3824 zil_commit(zfsvfs->z_log, zp->z_id); 3825 } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { 3826 /* 3827 * If the caller does not intend to wait synchronously 3828 * for this page writeback to complete and there are active 3829 * synchronous calls on this file, do a commit so that 3830 * the latter don't accidentally end up waiting for 3831 * our writeback to complete. Refer to the comment in 3832 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. 3833 */ 3834 zil_commit(zfsvfs->z_log, zp->z_id); 3835 } 3836 3837 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); 3838 3839 zfs_exit(zfsvfs, FTAG); 3840 return (err); 3841 } 3842 3843 /* 3844 * Update the system attributes when the inode has been dirtied. For the 3845 * moment we only update the mode, atime, mtime, and ctime. 
3846 */ 3847 int 3848 zfs_dirty_inode(struct inode *ip, int flags) 3849 { 3850 znode_t *zp = ITOZ(ip); 3851 zfsvfs_t *zfsvfs = ITOZSB(ip); 3852 dmu_tx_t *tx; 3853 uint64_t mode, atime[2], mtime[2], ctime[2]; 3854 sa_bulk_attr_t bulk[4]; 3855 int error = 0; 3856 int cnt = 0; 3857 3858 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 3859 return (0); 3860 3861 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3862 return (error); 3863 3864 #ifdef I_DIRTY_TIME 3865 /* 3866 * This is the lazytime semantic introduced in Linux 4.0 3867 * This flag will only be called from update_time when lazytime is set. 3868 * (Note, I_DIRTY_SYNC will also set if not lazytime) 3869 * Fortunately mtime and ctime are managed within ZFS itself, so we 3870 * only need to dirty atime. 3871 */ 3872 if (flags == I_DIRTY_TIME) { 3873 zp->z_atime_dirty = B_TRUE; 3874 goto out; 3875 } 3876 #endif 3877 3878 tx = dmu_tx_create(zfsvfs->z_os); 3879 3880 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3881 zfs_sa_upgrade_txholds(tx, zp); 3882 3883 error = dmu_tx_assign(tx, TXG_WAIT); 3884 if (error) { 3885 dmu_tx_abort(tx); 3886 goto out; 3887 } 3888 3889 mutex_enter(&zp->z_lock); 3890 zp->z_atime_dirty = B_FALSE; 3891 3892 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 3893 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 3894 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3895 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3896 3897 /* Preserve the mode, mtime and ctime provided by the inode */ 3898 ZFS_TIME_ENCODE(&ip->i_atime, atime); 3899 ZFS_TIME_ENCODE(&ip->i_mtime, mtime); 3900 ZFS_TIME_ENCODE(&ip->i_ctime, ctime); 3901 mode = ip->i_mode; 3902 3903 zp->z_mode = mode; 3904 3905 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3906 mutex_exit(&zp->z_lock); 3907 3908 dmu_tx_commit(tx); 3909 out: 3910 zfs_exit(zfsvfs, FTAG); 3911 return (error); 3912 } 3913 3914 void 3915 
zfs_inactive(struct inode *ip) 3916 { 3917 znode_t *zp = ITOZ(ip); 3918 zfsvfs_t *zfsvfs = ITOZSB(ip); 3919 uint64_t atime[2]; 3920 int error; 3921 int need_unlock = 0; 3922 3923 /* Only read lock if we haven't already write locked, e.g. rollback */ 3924 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 3925 need_unlock = 1; 3926 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3927 } 3928 if (zp->z_sa_hdl == NULL) { 3929 if (need_unlock) 3930 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3931 return; 3932 } 3933 3934 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 3935 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3936 3937 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3938 zfs_sa_upgrade_txholds(tx, zp); 3939 error = dmu_tx_assign(tx, TXG_WAIT); 3940 if (error) { 3941 dmu_tx_abort(tx); 3942 } else { 3943 ZFS_TIME_ENCODE(&ip->i_atime, atime); 3944 mutex_enter(&zp->z_lock); 3945 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 3946 (void *)&atime, sizeof (atime), tx); 3947 zp->z_atime_dirty = B_FALSE; 3948 mutex_exit(&zp->z_lock); 3949 dmu_tx_commit(tx); 3950 } 3951 } 3952 3953 zfs_zinactive(zp); 3954 if (need_unlock) 3955 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3956 } 3957 3958 /* 3959 * Fill pages with data from the disk. 3960 */ 3961 static int 3962 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) 3963 { 3964 znode_t *zp = ITOZ(ip); 3965 zfsvfs_t *zfsvfs = ITOZSB(ip); 3966 objset_t *os; 3967 struct page *cur_pp; 3968 u_offset_t io_off, total; 3969 size_t io_len; 3970 loff_t i_size; 3971 unsigned page_idx; 3972 int err; 3973 3974 os = zfsvfs->z_os; 3975 io_len = nr_pages << PAGE_SHIFT; 3976 i_size = i_size_read(ip); 3977 io_off = page_offset(pl[0]); 3978 3979 if (io_off + io_len > i_size) 3980 io_len = i_size - io_off; 3981 3982 /* 3983 * Iterate over list of pages and read each page individually. 
3984 */ 3985 page_idx = 0; 3986 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 3987 caddr_t va; 3988 3989 cur_pp = pl[page_idx++]; 3990 va = kmap(cur_pp); 3991 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 3992 DMU_READ_PREFETCH); 3993 kunmap(cur_pp); 3994 if (err) { 3995 /* convert checksum errors into IO errors */ 3996 if (err == ECKSUM) 3997 err = SET_ERROR(EIO); 3998 return (err); 3999 } 4000 } 4001 4002 return (0); 4003 } 4004 4005 /* 4006 * Uses zfs_fillpage to read data from the file and fill the pages. 4007 * 4008 * IN: ip - inode of file to get data from. 4009 * pl - list of pages to read 4010 * nr_pages - number of pages to read 4011 * 4012 * RETURN: 0 on success, error code on failure. 4013 * 4014 * Timestamps: 4015 * vp - atime updated 4016 */ 4017 int 4018 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) 4019 { 4020 znode_t *zp = ITOZ(ip); 4021 zfsvfs_t *zfsvfs = ITOZSB(ip); 4022 int err; 4023 4024 if (pl == NULL) 4025 return (0); 4026 4027 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4028 return (err); 4029 4030 err = zfs_fillpage(ip, pl, nr_pages); 4031 4032 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); 4033 4034 zfs_exit(zfsvfs, FTAG); 4035 return (err); 4036 } 4037 4038 /* 4039 * Check ZFS specific permissions to memory map a section of a file. 
4040 * 4041 * IN: ip - inode of the file to mmap 4042 * off - file offset 4043 * addrp - start address in memory region 4044 * len - length of memory region 4045 * vm_flags- address flags 4046 * 4047 * RETURN: 0 if success 4048 * error code if failure 4049 */ 4050 int 4051 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, 4052 unsigned long vm_flags) 4053 { 4054 (void) addrp; 4055 znode_t *zp = ITOZ(ip); 4056 zfsvfs_t *zfsvfs = ITOZSB(ip); 4057 int error; 4058 4059 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4060 return (error); 4061 4062 if ((vm_flags & VM_WRITE) && (zp->z_pflags & 4063 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4064 zfs_exit(zfsvfs, FTAG); 4065 return (SET_ERROR(EPERM)); 4066 } 4067 4068 if ((vm_flags & (VM_READ | VM_EXEC)) && 4069 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4070 zfs_exit(zfsvfs, FTAG); 4071 return (SET_ERROR(EACCES)); 4072 } 4073 4074 if (off < 0 || len > MAXOFFSET_T - off) { 4075 zfs_exit(zfsvfs, FTAG); 4076 return (SET_ERROR(ENXIO)); 4077 } 4078 4079 zfs_exit(zfsvfs, FTAG); 4080 return (0); 4081 } 4082 4083 /* 4084 * Free or allocate space in a file. Currently, this function only 4085 * supports the `F_FREESP' command. However, this command is somewhat 4086 * misnamed, as its functionality includes the ability to allocate as 4087 * well as free space. 4088 * 4089 * IN: zp - znode of file to free data in. 4090 * cmd - action to take (only F_FREESP supported). 4091 * bfp - section of file to free/alloc. 4092 * flag - current file open mode flags. 4093 * offset - current file offset. 4094 * cr - credentials of caller. 4095 * 4096 * RETURN: 0 on success, error code on failure. 
4097 * 4098 * Timestamps: 4099 * zp - ctime|mtime updated 4100 */ 4101 int 4102 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, 4103 offset_t offset, cred_t *cr) 4104 { 4105 (void) offset; 4106 zfsvfs_t *zfsvfs = ZTOZSB(zp); 4107 uint64_t off, len; 4108 int error; 4109 4110 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4111 return (error); 4112 4113 if (cmd != F_FREESP) { 4114 zfs_exit(zfsvfs, FTAG); 4115 return (SET_ERROR(EINVAL)); 4116 } 4117 4118 /* 4119 * Callers might not be able to detect properly that we are read-only, 4120 * so check it explicitly here. 4121 */ 4122 if (zfs_is_readonly(zfsvfs)) { 4123 zfs_exit(zfsvfs, FTAG); 4124 return (SET_ERROR(EROFS)); 4125 } 4126 4127 if (bfp->l_len < 0) { 4128 zfs_exit(zfsvfs, FTAG); 4129 return (SET_ERROR(EINVAL)); 4130 } 4131 4132 /* 4133 * Permissions aren't checked on Solaris because on this OS 4134 * zfs_space() can only be called with an opened file handle. 4135 * On Linux we can get here through truncate_range() which 4136 * operates directly on inodes, so we need to check access rights. 
4137 */ 4138 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4139 kcred->user_ns))) { 4140 zfs_exit(zfsvfs, FTAG); 4141 return (error); 4142 } 4143 4144 off = bfp->l_start; 4145 len = bfp->l_len; /* 0 means from off to end of file */ 4146 4147 error = zfs_freesp(zp, off, len, flag, TRUE); 4148 4149 zfs_exit(zfsvfs, FTAG); 4150 return (error); 4151 } 4152 4153 int 4154 zfs_fid(struct inode *ip, fid_t *fidp) 4155 { 4156 znode_t *zp = ITOZ(ip); 4157 zfsvfs_t *zfsvfs = ITOZSB(ip); 4158 uint32_t gen; 4159 uint64_t gen64; 4160 uint64_t object = zp->z_id; 4161 zfid_short_t *zfid; 4162 int size, i, error; 4163 4164 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4165 return (error); 4166 4167 if (fidp->fid_len < SHORT_FID_LEN) { 4168 fidp->fid_len = SHORT_FID_LEN; 4169 zfs_exit(zfsvfs, FTAG); 4170 return (SET_ERROR(ENOSPC)); 4171 } 4172 4173 if ((error = zfs_verify_zp(zp)) != 0) { 4174 zfs_exit(zfsvfs, FTAG); 4175 return (error); 4176 } 4177 4178 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4179 &gen64, sizeof (uint64_t))) != 0) { 4180 zfs_exit(zfsvfs, FTAG); 4181 return (error); 4182 } 4183 4184 gen = (uint32_t)gen64; 4185 4186 size = SHORT_FID_LEN; 4187 4188 zfid = (zfid_short_t *)fidp; 4189 4190 zfid->zf_len = size; 4191 4192 for (i = 0; i < sizeof (zfid->zf_object); i++) 4193 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4194 4195 /* Must have a non-zero generation number to distinguish from .zfs */ 4196 if (gen == 0) 4197 gen = 1; 4198 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4199 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4200 4201 zfs_exit(zfsvfs, FTAG); 4202 return (0); 4203 } 4204 4205 #if defined(_KERNEL) 4206 EXPORT_SYMBOL(zfs_open); 4207 EXPORT_SYMBOL(zfs_close); 4208 EXPORT_SYMBOL(zfs_lookup); 4209 EXPORT_SYMBOL(zfs_create); 4210 EXPORT_SYMBOL(zfs_tmpfile); 4211 EXPORT_SYMBOL(zfs_remove); 4212 EXPORT_SYMBOL(zfs_mkdir); 4213 EXPORT_SYMBOL(zfs_rmdir); 4214 EXPORT_SYMBOL(zfs_readdir); 4215 EXPORT_SYMBOL(zfs_getattr_fast); 4216 
EXPORT_SYMBOL(zfs_setattr); 4217 EXPORT_SYMBOL(zfs_rename); 4218 EXPORT_SYMBOL(zfs_symlink); 4219 EXPORT_SYMBOL(zfs_readlink); 4220 EXPORT_SYMBOL(zfs_link); 4221 EXPORT_SYMBOL(zfs_inactive); 4222 EXPORT_SYMBOL(zfs_space); 4223 EXPORT_SYMBOL(zfs_fid); 4224 EXPORT_SYMBOL(zfs_getpage); 4225 EXPORT_SYMBOL(zfs_putpage); 4226 EXPORT_SYMBOL(zfs_dirty_inode); 4227 EXPORT_SYMBOL(zfs_map); 4228 4229 /* CSTYLED */ 4230 module_param(zfs_delete_blocks, ulong, 0644); 4231 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4232 4233 #endif 4234