1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29 /* Portions Copyright 2007 Jeremy Teo */ 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/kmem.h> 41 #include <sys/taskq.h> 42 #include <sys/uio.h> 43 #include <sys/vmsystm.h> 44 #include <sys/atomic.h> 45 #include <sys/pathname.h> 46 #include <sys/cmn_err.h> 47 #include <sys/errno.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zfs_acl.h> 50 #include <sys/zfs_ioctl.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_objset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/policy.h> 60 #include <sys/sunddi.h> 61 #include <sys/sid.h> 62 #include <sys/zfs_ctldir.h> 63 #include <sys/zfs_fuid.h> 64 #include <sys/zfs_quota.h> 65 #include <sys/zfs_sa.h> 66 #include <sys/zfs_vnops.h> 67 #include <sys/zfs_rlock.h> 68 #include <sys/cred.h> 69 #include <sys/zpl.h> 70 #include <sys/zil.h> 71 #include <sys/sa_impl.h> 72 #include <linux/mm_compat.h> 73 74 /* 75 * Programming rules. 76 * 77 * Each vnode op performs some logical unit of work. To do this, the ZPL must 78 * properly lock its in-core state, create a DMU transaction, do the work, 79 * record this work in the intent log (ZIL), commit the DMU transaction, 80 * and wait for the intent log to commit if it is a synchronous operation. 81 * Moreover, the vnode ops must work in both normal and log replay context. 82 * The ordering of events is important to avoid deadlocks and references 83 * to freed memory. The example below illustrates the following Big Rules: 84 * 85 * (1) A check must be made in each zfs thread for a mounted file system. 86 * This is done avoiding races using zfs_enter(zfsvfs). 87 * A zfs_exit(zfsvfs) is needed before all returns. 
Any znodes 88 * must be checked with zfs_verify_zp(zp). Both of these macros 89 * can return EIO from the calling function. 90 * 91 * (2) zrele() should always be the last thing except for zil_commit() (if 92 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 93 * last reference, the vnode/znode can be freed, so the zp may point to 94 * freed memory. Second, the last reference will call zfs_zinactive(), 95 * which may induce a lot of work -- pushing cached pages (which acquires 96 * range locks) and syncing out cached atime changes. Third, 97 * zfs_zinactive() may require a new tx, which could deadlock the system 98 * if you were already holding one. This deadlock occurs because the tx 99 * currently being operated on prevents a txg from syncing, which 100 * prevents the new tx from progressing, resulting in a deadlock. If you 101 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 102 * is a synonym for zrele(). 103 * 104 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 105 * as they can span dmu_tx_assign() calls. 106 * 107 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 108 * dmu_tx_assign(). This is critical because we don't want to block 109 * while holding locks. 110 * 111 * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This 112 * reduces lock contention and CPU usage when we must wait (note that if 113 * throughput is constrained by the storage, nearly every transaction 114 * must wait). 115 * 116 * Note, in particular, that if a lock is sometimes acquired before 117 * the tx assigns, and sometimes after (e.g. z_lock), then failing 118 * to use a non-blocking assign can deadlock the system. The scenario: 119 * 120 * Thread A has grabbed a lock before calling dmu_tx_assign(). 121 * Thread B is in an already-assigned tx, and blocks for this lock. 
122 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 123 * forever, because the previous txg can't quiesce until B's tx commits. 124 * 125 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 126 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 127 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 128 * to indicate that this operation has already called dmu_tx_wait(). 129 * This will ensure that we don't retry forever, waiting a short bit 130 * each time. 131 * 132 * (5) If the operation succeeded, generate the intent log entry for it 133 * before dropping locks. This ensures that the ordering of events 134 * in the intent log matches the order in which they actually occurred. 135 * During ZIL replay the zfs_log_* functions will update the sequence 136 * number to indicate the zil transaction has replayed. 137 * 138 * (6) At the end of each vnode op, the DMU tx must always commit, 139 * regardless of whether there were any errors. 140 * 141 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 142 * to ensure that synchronous semantics are provided when necessary. 143 * 144 * In general, this is how things should be ordered in each vnode op: 145 * 146 * zfs_enter(zfsvfs); // exit if unmounted 147 * top: 148 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 149 * rw_enter(...); // grab any other locks you need 150 * tx = dmu_tx_create(...); // get DMU tx 151 * dmu_tx_hold_*(); // hold each object you might modify 152 * error = dmu_tx_assign(tx, (waited ? 
 *	    TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		zrele(...);		// release held znodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		zfs_exit(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	zrele(...);			// release held znodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	zfs_exit(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a file.  Enforces the ZFS_APPENDONLY attribute (write opens must
 * include O_APPEND) and counts synchronous opens so that on the first
 * O_SYNC open any async transactions already in the ZIL are upgraded
 * to synchronous.
 *
 *	IN:	ip	- inode of file to open.
 *		mode	- block open mode (checked for write access).
 *		flag	- file flags (O_APPEND, O_SYNC).
 *		cr	- credentials of caller (unused).
 *
 * RETURN:	0 on success, error code on failure.
 */
int
zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
{
	(void) cr;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Honor ZFS_APPENDONLY file attribute */
	if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & O_APPEND) == 0)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Keep a count of the synchronous opens in the znode. On first
	 * synchronous open we must convert all previous async transactions
	 * into sync to keep correct ordering.
	 */
	if (flag & O_SYNC) {
		if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
			zil_async_to_sync(zfsvfs->z_log, zp->z_id);
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

/*
 * Close a file.  Undoes the z_sync_cnt accounting done by zfs_open()
 * for O_SYNC opens.
 *
 * RETURN:	0 on success, error code on failure.
 */
int
zfs_close(struct inode *ip, int flag, cred_t *cr)
{
	(void) cr;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Decrement the synchronous opens in the znode */
	if (flag & O_SYNC)
		atomic_dec_32(&zp->z_sync_cnt);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

#if defined(_KERNEL)

static int zfs_fillpage(struct inode *ip, struct page *pp);

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  Update all mapped
 * pages with the contents of the corresponding dmu buffer.
 */
void
update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	/* Offset of 'start' within its page; only nonzero for the first page. */
	int64_t off = start & (PAGE_SIZE - 1);

	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t nbytes = MIN(PAGE_SIZE - off, len);

		/* Only pages already present in the page cache are updated. */
		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
		if (pp) {
			if (mapping_writably_mapped(mp))
				flush_dcache_page(pp);

			void *pb = kmap(pp);
			int error = dmu_read(os, zp->z_id, start + off,
			    nbytes, pb + off, DMU_READ_PREFETCH);
			kunmap(pp);

			if (error) {
				SetPageError(pp);
				ClearPageUptodate(pp);
			} else {
				ClearPageError(pp);
				SetPageUptodate(pp);

				if (mapping_writably_mapped(mp))
					flush_dcache_page(pp);

				mark_page_accessed(pp);
			}

			unlock_page(pp);
			put_page(pp);
		}

		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the I/O data synchronized
 * between the DMU cache and the memory mapped pages.  Preferentially read
 * from memory mapped pages, otherwise fallback to reading through the dmu.
 */
int
mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
{
	struct inode *ip = ZTOI(zp);
	struct address_space *mp = ip->i_mapping;
	int64_t start = uio->uio_loffset;
	int64_t off = start & (PAGE_SIZE - 1);
	int len = nbytes;
	int error = 0;

	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t bytes = MIN(PAGE_SIZE - off, len);

		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
		if (pp) {
			/*
			 * If filemap_fault() retries there exists a window
			 * where the page will be unlocked and not up to date.
			 * In this case we must try and fill the page.
			 */
			if (unlikely(!PageUptodate(pp))) {
				error = zfs_fillpage(ip, pp);
				if (error) {
					unlock_page(pp);
					put_page(pp);
					return (error);
				}
			}

			ASSERT(PageUptodate(pp) || PageDirty(pp));

			unlock_page(pp);

			void *pb = kmap(pp);
			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
			kunmap(pp);

			if (mapping_writably_mapped(mp))
				flush_dcache_page(pp);

			mark_page_accessed(pp);
			put_page(pp);
		} else {
			/* Page not cached; read through the DMU instead. */
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}

		len -= bytes;
		off = 0;

		if (error)
			break;
	}

	return (error);
}
#endif /* _KERNEL */

static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;

/*
 * Write the bytes to a file.
 *
 * IN:	zp	- znode of file to be written to
 *	data	- bytes to write
 *	len	- number of bytes to write
 *	pos	- offset to start writing at
 *
 * OUT:	resid	- remaining bytes to write
 *
 * RETURN:	0 if success
 *		positive error code if failure.  EIO is returned
 *		for a short write when residp isn't provided.
 *
 * Timestamps:
 *	zp - ctime|mtime updated if byte count > 0
 */
int
zfs_write_simple(znode_t *zp, const void *data, size_t len,
    loff_t pos, size_t *residp)
{
	fstrans_cookie_t cookie;
	int error;

	/* Wrap the caller's flat buffer in a single-segment kernel uio. */
	struct iovec iov;
	iov.iov_base = (void *)data;
	iov.iov_len = len;

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);

	cookie = spl_fstrans_mark();
	error = zfs_write(zp, &uio, 0, kcred);
	spl_fstrans_unmark(cookie);

	if (error == 0) {
		if (residp != NULL)
			*residp = zfs_uio_resid(&uio);
		else if (zfs_uio_resid(&uio) != 0)
			/* Short write with no way to report it: fail. */
			error = SET_ERROR(EIO);
	}

	return (error);
}

/* Taskq callback: drop an inode reference outside of the caller's context. */
static void
zfs_rele_async_task(void *arg)
{
	iput(arg);
}

void
zfs_zrele_async(znode_t *zp)
{
	struct inode *ip = ZTOI(zp);
	objset_t *os = ITOZSB(ip)->z_os;

	ASSERT(atomic_read(&ip->i_count) > 0);
	ASSERT(os != NULL);

	/*
	 * If decrementing the count would put us at 0, we can't do it inline
	 * here, because that would be synchronous. Instead, dispatch an iput
	 * to run later.
	 *
	 * For more information on the dangers of a synchronous iput, see the
	 * header comment of this file.
	 */
	if (!atomic_add_unless(&ip->i_count, -1, 1)) {
		/*
		 * atomic_add_unless() refused to decrement because i_count
		 * was 1; hand the final iput() off to the pool's zrele taskq.
		 */
		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
		    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
	}
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held inode reference for it.
 *
 * IN:	zdp	- znode of directory to search.
 *	nm	- name of entry to lookup.
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	cr	- credentials of caller.
 *	direntflags - directory lookup flags
 *	realpnp	- returned pathname.
 *
 * OUT:	zpp	- znode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
int
zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
	int error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		/* "" and "." both resolve to the directory itself. */
		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*zpp = zdp;
				zhold(*zpp);
				return (0);
			}
			return (error);
		}
	}

	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
		return (error);

	*zpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}

		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
		    B_TRUE, cr, zfs_init_idmap))) {
			zrele(*zpp);
			*zpp = NULL;
		}

		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
	if ((error == 0) && (*zpp))
		zfs_znode_update_vfs(*zpp);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the ip of the created or trunc'd file.
 *
 * IN:	dzp	- znode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- file flag.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	zpp	- znode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated if new entry created
 *	zp - ctime|mtime always, atime if new
 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	objset_t *os;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	boolean_t have_acl = B_FALSE;
	boolean_t waited = B_FALSE;
	boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/* Restart point: re-entered after dmu_tx_wait() on ERESTART. */
top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
		    mnt_ns))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/* acl_ids may survive a goto top; only create them once. */
		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				/* Drop locks, wait for the txg, and retry. */
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since, we failed to add the directory entry for it,
			 * delete the newly created dnode.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
		    mnt_ns))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Create an unlinked (tmpfile-style) file object: the new znode is
 * placed directly on the unlinked set rather than linked into a
 * directory.  excl, mode, and flag are unused.
 */
int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t *zp = NULL, *dzp = ITOZ(dip);
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	boolean_t fuid_dirtied;
	boolean_t have_acl = B_FALSE;
	boolean_t waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/* Restart point: re-entered after dmu_tx_wait() on ERESTART. */
top:
	*ipp = NULL;

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		goto out;
	}

	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
		goto out;
	have_acl = B_TRUE;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Add to unlinked set */
	zp->z_unlinked = B_TRUE;
	zfs_unlinked_add(zp, tx);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);
out:

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*ipp = ZTOI(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dzp	- znode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime
 *	ip - ctime (if nlink > 0)
 */

/* Written over a removed SA_ZPL_XATTR attribute when SA removal is unused. */
static uint64_t null_xattr = 0;

int
zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
{
	znode_t *zp;
	znode_t *xzp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	uint64_t acl_obj, xattr_obj;
	uint64_t xattr_obj_unlinked = 0;
	uint64_t obj = 0;
	uint64_t links;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	boolean_t may_delete_now, delete_now = FALSE;
	boolean_t unlinked, toobig = FALSE;
	uint64_t txtype;
	pathname_t *realnmp = NULL;
	pathname_t realnm;
	int error;
	int zflg = ZEXISTS;
	boolean_t waited = B_FALSE;

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

	/* Restart point: re-entered after dmu_tx_wait() on ERESTART. */
top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp))) {
		if (realnmp)
			pn_free(realnmp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * Provisional check: we may destroy the znode inline only if we
	 * hold the sole inode reference and no pages are cached.  This is
	 * re-verified under z_lock after the directory entry is destroyed.
	 */
	mutex_enter(&zp->z_lock);
	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
	    !zn_has_cached_data(zp, 0, LLONG_MAX);
	mutex_exit(&zp->z_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the inode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			if (xzp)
				zrele(xzp);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		zrele(zp);
		if (xzp)
			zrele(xzp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed. Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
		    !zn_has_cached_data(zp, 0, LLONG_MAX) &&
		    xattr_obj == xattr_obj_unlinked &&
		    zfs_external_acl(zp) == acl_obj;
		VERIFY_IMPLY(xattr_obj_unlinked, xzp);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			/* Detach the xattr directory and unlink it too. */
			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = B_TRUE;
			clear_nlink(ZTOI(xzp));
			links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &links, sizeof (links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		/*
		 * Add to the unlinked set because a new reference could be
		 * taken concurrently resulting in a deferred destruction.
		 */
		zfs_unlinked_add(zp, tx);
		mutex_exit(&zp->z_lock);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);
	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);

	/*
	 * If we didn't take the inline-delete path, the release must be
	 * async: a synchronous final zrele() here could deadlock (see the
	 * Big Rules in the header comment).
	 */
	if (delete_now)
		zrele(zp);
	else
		zfs_zrele_async(zp);

	if (xzp) {
		zfs_znode_update_vfs(xzp);
		zfs_zrele_async(xzp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Create a new directory and insert it into dzp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dzp	- znode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	zpp	- znode of created directory.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 *	zpp - ctime|mtime|atime updated
 */
int
zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
    cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
{
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	waited = B_FALSE;	/* set after an ERESTART retry */

	ASSERT(S_ISDIR(vap->va_mode));

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (dirname == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/* A directory cannot be created inside an xattr directory. */
	if (dzp->z_pflags & ZFS_XATTR) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* Reject names that are not valid UTF-8 on utf8only datasets. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * acl_ids is allocated here and must be freed exactly once on
	 * every path below (including the 'out' label at the bottom).
	 */
	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids, mnt_ns)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*zpp = NULL;

	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
	    mnt_ns))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	/*
	 * ERESTART means the txg is being throttled; drop the dirent
	 * lock, wait for the txg to quiesce, and retry from 'top' (the
	 * retry passes TXG_NOTHROTTLE so it cannot restart a second
	 * time for the same reason).
	 */
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	/*
	 * Now put new name in parent dir.
	 */
	error = zfs_link_create(dl, zp, tx, ZNEW);
	if (error != 0) {
		/*
		 * Linking failed: undo the node creation inside the same
		 * tx and drop the inode from the hash; the znode itself
		 * is released via zrele() after the tx commits below.
		 */
		zfs_znode_delete(zp, tx);
		remove_inode_hash(ZTOI(zp));
		goto out;
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	*zpp = zp;

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

out:
	/* Common exit for both the success and link-failure paths. */
	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (error != 0) {
		zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
	}
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dzp	- znode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- inode of current working directory.
 *	cr	- credentials of caller.
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 */
int
zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
    int flags)
{
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;	/* set after an ERESTART retry */

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
		goto out;
	}

	if (!S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/* POSIX: refuse to remove the caller's current working directory. */
	if (zp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	/*
	 * NOTE(review): dmu_tx_mark_netfree() appears to flag the tx as
	 * net-freeing space so it can proceed on a nearly full pool —
	 * confirm against the DMU transaction API.
	 */
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		/* Drop locks before waiting/retrying or bailing out. */
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			goto top;
		}
		dmu_tx_abort(tx);
		zrele(zp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Unlike zfs_remove(), no "unlinked" out-parameter is requested
	 * here (NULL is passed).  The ZIL record is written only when
	 * the destroy succeeded.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
		    B_FALSE);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);
	zrele(zp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Read directory entries from the given directory cursor position and emit
 * name and position for each entry.
 *
 * IN:	ip	- inode of directory to read.
 *	ctx	- directory entry context.
 *	cr	- credentials of caller.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
int
zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
{
	(void) cr;
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	objset_t	*os;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	int		error;
	uint8_t		prefetch;
	uint8_t		type;
	int		done = 0;
	uint64_t	parent;
	uint64_t	offset; /* must be unsigned; checks for < 1 */

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Parent object id is needed to synthesize the '..' entry. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0)
		goto out;

	/*
	 * Quit if directory has been removed (posix)
	 */
	if (zp->z_unlinked)
		goto out;

	error = 0;
	os = zfsvfs->z_os;
	offset = ctx->pos;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0-3 are reserved for
	 * the synthesized entries (see the block comment above), so any
	 * offset <= 3 means we have not yet reached a real ZAP cursor
	 * position.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Transform to file-system independent format
	 */
	while (!done) {
		uint64_t objnum;
		/*
		 * Special case `.', `..', and `.zfs'.  These are
		 * synthesized directly into the zap attribute buffer
		 * rather than read from the ZAP.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.  ENOENT here means we reached
			 * the end of the directory (mapped to success at
			 * the 'update' label).
			 */
			if ((error = zap_cursor_retrieve(&zc, &zap))) {
				if (error == ENOENT)
					break;
				else
					goto update;
			}

			/*
			 * Allow multiple entries provided the first entry is
			 * the object id.  Non-zpl consumers may safely make
			 * use of the additional space.
			 *
			 * XXX: This should be a feature flag for compatibility
			 */
			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers == 0) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld, "
				    "length = %d, num = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset,
				    zap.za_integer_length,
				    (u_longlong_t)zap.za_num_integers);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			/* First integer packs both object number and type. */
			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
		}

		/* A false return from the emitter means the buffer is full. */
		done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
		    objnum, type);
		if (done)
			break;

		if (prefetch)
			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

		/*
		 * Move to the next entry, fill in the previous offset.
		 * Real entries advance the ZAP cursor; synthesized ones
		 * just bump the reserved offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		ctx->pos = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

update:
	zap_cursor_fini(&zc);
	if (error == ENOENT)
		error = 0;
out:
	/* Reached directly (cursor never initialized) on early errors. */
	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Get the basic file attributes and place them in the provided kstat
 * structure.  The inode is assumed to be the authoritative source
 * for most of the attributes.  However, the znode currently has the
 * authoritative atime, blksize, and block count.
 *
 * IN:	ip	- inode of file.
 *
 * OUT:	sp	- kstat values.
1655 * 1656 * RETURN: 0 (always succeeds) 1657 */ 1658 int 1659 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1660 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, 1661 struct kstat *sp) 1662 #else 1663 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) 1664 #endif 1665 { 1666 znode_t *zp = ITOZ(ip); 1667 zfsvfs_t *zfsvfs = ITOZSB(ip); 1668 uint32_t blksize; 1669 u_longlong_t nblocks; 1670 int error; 1671 1672 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1673 return (error); 1674 1675 mutex_enter(&zp->z_lock); 1676 1677 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1678 zpl_generic_fillattr(user_ns, request_mask, ip, sp); 1679 #else 1680 zpl_generic_fillattr(user_ns, ip, sp); 1681 #endif 1682 /* 1683 * +1 link count for root inode with visible '.zfs' directory. 1684 */ 1685 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1686 if (sp->nlink < ZFS_LINK_MAX) 1687 sp->nlink++; 1688 1689 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1690 sp->blksize = blksize; 1691 sp->blocks = nblocks; 1692 1693 if (unlikely(zp->z_blksz == 0)) { 1694 /* 1695 * Block size hasn't been set; suggest maximal I/O transfers. 1696 */ 1697 sp->blksize = zfsvfs->z_max_blksz; 1698 } 1699 1700 mutex_exit(&zp->z_lock); 1701 1702 /* 1703 * Required to prevent NFS client from detecting different inode 1704 * numbers of snapshot root dentry before and after snapshot mount. 1705 */ 1706 if (zfsvfs->z_issnap) { 1707 if (ip->i_sb->s_root->d_inode == ip) 1708 sp->ino = ZFSCTL_INO_SNAPDIRS - 1709 dmu_objset_id(zfsvfs->z_os); 1710 } 1711 1712 zfs_exit(zfsvfs, FTAG); 1713 1714 return (0); 1715 } 1716 1717 /* 1718 * For the operation of changing file's user/group/project, we need to 1719 * handle not only the main object that is assigned to the file directly, 1720 * but also the ones that are used by the file via hidden xattr directory. 
 *
 * Because the xattr directory may contain many EA entries, it may be
 * impossible to change all of them within the single transaction that
 * changes the main object's user/group/project attributes.  So they are
 * changed via multiple independent transactions, one entry at a time.
 * It may not be a good solution, but we have no better idea yet.
 */
static int
zfs_setattr_dir(znode_t *dzp)
{
	struct inode	*dxip = ZTOI(dzp);
	struct inode	*xip = NULL;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	objset_t	*os = zfsvfs->z_os;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	zfs_dirlock_t	*dl;
	znode_t		*zp = NULL;
	dmu_tx_t	*tx = NULL;
	uint64_t	uid, gid;
	sa_bulk_attr_t	bulk[4];
	int		count;
	int		err;

	/*
	 * Walk every entry of the xattr directory; the loop ends with
	 * ENOENT from zap_cursor_retrieve() (end of directory, mapped
	 * to success below) or breaks on a real error.
	 */
	zap_cursor_init(&zc, os, dzp->z_id);
	while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
		count = 0;
		if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
			err = ENXIO;
			break;
		}

		err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
		    ZEXISTS, NULL, NULL);
		if (err == ENOENT)
			goto next;
		if (err)
			break;

		/* Skip entries that already match the directory's ids. */
		xip = ZTOI(zp);
		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
		    zp->z_projid == dzp->z_projid)
			goto next;

		/*
		 * One independent transaction per xattr object.  A
		 * missing ZFS_PROJID flag means the SA layout may need
		 * to grow, hence the "may grow" (B_TRUE) hold.
		 */
		tx = dmu_tx_create(os);
		if (!(zp->z_pflags & ZFS_PROJID))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);

		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err)
			break;

		mutex_enter(&dzp->z_lock);

		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
			xip->i_uid = dxip->i_uid;
			uid = zfs_uid_read(dxip);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &uid, sizeof (uid));
		}

		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
			xip->i_gid = dxip->i_gid;
			gid = zfs_gid_read(dxip);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
			    &gid, sizeof (gid));
		}


		uint64_t projid = dzp->z_projid;
		if (zp->z_projid != projid) {
			if (!(zp->z_pflags & ZFS_PROJID)) {
				/*
				 * sa_add_projid() both upgrades the SA
				 * layout and stores the projid; EEXIST
				 * means it was already present, success
				 * means no separate bulk update is
				 * needed (flagged via INVALID_PROJID).
				 */
				err = sa_add_projid(zp->z_sa_hdl, tx, projid);
				if (unlikely(err == EEXIST)) {
					err = 0;
				} else if (err != 0) {
					goto sa_add_projid_err;
				} else {
					projid = ZFS_INVALID_PROJID;
				}
			}

			if (projid != ZFS_INVALID_PROJID) {
				zp->z_projid = projid;
				SA_ADD_BULK_ATTR(bulk, count,
				    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
				    sizeof (zp->z_projid));
			}
		}

sa_add_projid_err:
		mutex_exit(&dzp->z_lock);

		/*
		 * Commit when there is bulk work or sa_add_projid()
		 * already dirtied the tx; otherwise nothing was changed
		 * and the tx is aborted.
		 */
		if (likely(count > 0)) {
			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
			dmu_tx_commit(tx);
		} else if (projid == ZFS_INVALID_PROJID) {
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
		tx = NULL;
		if (err != 0 && err != ENOENT)
			break;

next:
		if (zp) {
			zrele(zp);
			zp = NULL;
			zfs_dirent_unlock(dl);
		}
		zap_cursor_advance(&zc);
	}

	/* Clean up anything left over from a break out of the loop. */
	if (tx)
		dmu_tx_abort(tx);
	if (zp) {
		zrele(zp);
		zfs_dirent_unlock(dl);
	}
	zap_cursor_fini(&zc);

	/* ENOENT here is the normal end-of-directory termination. */
	return (err == ENOENT ? 0 : err);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	zp	- znode of file to be modified.
 *	vap	- new attribute values.
 *		  If ATTR_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	mnt_ns	- user namespace of the mount
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime updated, mtime updated if size changed.
1866 */ 1867 int 1868 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) 1869 { 1870 struct inode *ip; 1871 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1872 objset_t *os; 1873 zilog_t *zilog; 1874 dmu_tx_t *tx; 1875 vattr_t oldva; 1876 xvattr_t *tmpxvattr; 1877 uint_t mask = vap->va_mask; 1878 uint_t saved_mask = 0; 1879 int trim_mask = 0; 1880 uint64_t new_mode; 1881 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1882 uint64_t xattr_obj; 1883 uint64_t mtime[2], ctime[2], atime[2]; 1884 uint64_t projid = ZFS_INVALID_PROJID; 1885 znode_t *attrzp; 1886 int need_policy = FALSE; 1887 int err, err2 = 0; 1888 zfs_fuid_info_t *fuidp = NULL; 1889 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1890 xoptattr_t *xoap; 1891 zfs_acl_t *aclp; 1892 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1893 boolean_t fuid_dirtied = B_FALSE; 1894 boolean_t handle_eadir = B_FALSE; 1895 sa_bulk_attr_t *bulk, *xattr_bulk; 1896 int count = 0, xattr_count = 0, bulks = 8; 1897 1898 if (mask == 0) 1899 return (0); 1900 1901 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1902 return (err); 1903 ip = ZTOI(zp); 1904 os = zfsvfs->z_os; 1905 1906 /* 1907 * If this is a xvattr_t, then get a pointer to the structure of 1908 * optional attributes. If this is NULL, then we have a vattr_t. 
1909 */ 1910 xoap = xva_getxoptattr(xvap); 1911 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1912 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1913 if (!dmu_objset_projectquota_enabled(os) || 1914 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1915 zfs_exit(zfsvfs, FTAG); 1916 return (SET_ERROR(ENOTSUP)); 1917 } 1918 1919 projid = xoap->xoa_projid; 1920 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1921 zfs_exit(zfsvfs, FTAG); 1922 return (SET_ERROR(EINVAL)); 1923 } 1924 1925 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1926 projid = ZFS_INVALID_PROJID; 1927 else 1928 need_policy = TRUE; 1929 } 1930 1931 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1932 (xoap->xoa_projinherit != 1933 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1934 (!dmu_objset_projectquota_enabled(os) || 1935 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1936 zfs_exit(zfsvfs, FTAG); 1937 return (SET_ERROR(ENOTSUP)); 1938 } 1939 } 1940 1941 zilog = zfsvfs->z_log; 1942 1943 /* 1944 * Make sure that if we have ephemeral uid/gid or xvattr specified 1945 * that file system is at proper version level 1946 */ 1947 1948 if (zfsvfs->z_use_fuids == B_FALSE && 1949 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 1950 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 1951 (mask & ATTR_XVATTR))) { 1952 zfs_exit(zfsvfs, FTAG); 1953 return (SET_ERROR(EINVAL)); 1954 } 1955 1956 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 1957 zfs_exit(zfsvfs, FTAG); 1958 return (SET_ERROR(EISDIR)); 1959 } 1960 1961 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 1962 zfs_exit(zfsvfs, FTAG); 1963 return (SET_ERROR(EINVAL)); 1964 } 1965 1966 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 1967 xva_init(tmpxvattr); 1968 1969 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1970 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 1971 1972 /* 1973 * Immutable files can only alter immutable bit and atime 1974 */ 1975 if ((zp->z_pflags & ZFS_IMMUTABLE) && 
1976 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 1977 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 1978 err = SET_ERROR(EPERM); 1979 goto out3; 1980 } 1981 1982 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 1983 err = SET_ERROR(EPERM); 1984 goto out3; 1985 } 1986 1987 /* 1988 * Verify timestamps doesn't overflow 32 bits. 1989 * ZFS can handle large timestamps, but 32bit syscalls can't 1990 * handle times greater than 2039. This check should be removed 1991 * once large timestamps are fully supported. 1992 */ 1993 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 1994 if (((mask & ATTR_ATIME) && 1995 TIMESPEC_OVERFLOW(&vap->va_atime)) || 1996 ((mask & ATTR_MTIME) && 1997 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 1998 err = SET_ERROR(EOVERFLOW); 1999 goto out3; 2000 } 2001 } 2002 2003 top: 2004 attrzp = NULL; 2005 aclp = NULL; 2006 2007 /* Can this be moved to before the top label? */ 2008 if (zfs_is_readonly(zfsvfs)) { 2009 err = SET_ERROR(EROFS); 2010 goto out3; 2011 } 2012 2013 /* 2014 * First validate permissions 2015 */ 2016 2017 if (mask & ATTR_SIZE) { 2018 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 2019 mnt_ns); 2020 if (err) 2021 goto out3; 2022 2023 /* 2024 * XXX - Note, we are not providing any open 2025 * mode flags here (like FNDELAY), so we may 2026 * block if there are locks present... this 2027 * should be addressed in openat(). 2028 */ 2029 /* XXX - would it be OK to generate a log record here? 
*/ 2030 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2031 if (err) 2032 goto out3; 2033 } 2034 2035 if (mask & (ATTR_ATIME|ATTR_MTIME) || 2036 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2037 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2038 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2039 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2040 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2041 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2042 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2043 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2044 skipaclchk, cr, mnt_ns); 2045 } 2046 2047 if (mask & (ATTR_UID|ATTR_GID)) { 2048 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2049 int take_owner; 2050 int take_group; 2051 uid_t uid; 2052 gid_t gid; 2053 2054 /* 2055 * NOTE: even if a new mode is being set, 2056 * we may clear S_ISUID/S_ISGID bits. 2057 */ 2058 2059 if (!(mask & ATTR_MODE)) 2060 vap->va_mode = zp->z_mode; 2061 2062 /* 2063 * Take ownership or chgrp to group we are a member of 2064 */ 2065 2066 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), 2067 vap->va_uid); 2068 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), 2069 vap->va_gid); 2070 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2071 take_group = (mask & ATTR_GID) && 2072 zfs_groupmember(zfsvfs, gid, cr); 2073 2074 /* 2075 * If both ATTR_UID and ATTR_GID are set then take_owner and 2076 * take_group must both be set in order to allow taking 2077 * ownership. 
2078 * 2079 * Otherwise, send the check through secpolicy_vnode_setattr() 2080 * 2081 */ 2082 2083 if (((idmask == (ATTR_UID|ATTR_GID)) && 2084 take_owner && take_group) || 2085 ((idmask == ATTR_UID) && take_owner) || 2086 ((idmask == ATTR_GID) && take_group)) { 2087 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2088 skipaclchk, cr, mnt_ns) == 0) { 2089 /* 2090 * Remove setuid/setgid for non-privileged users 2091 */ 2092 (void) secpolicy_setid_clear(vap, cr); 2093 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2094 } else { 2095 need_policy = TRUE; 2096 } 2097 } else { 2098 need_policy = TRUE; 2099 } 2100 } 2101 2102 mutex_enter(&zp->z_lock); 2103 oldva.va_mode = zp->z_mode; 2104 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2105 if (mask & ATTR_XVATTR) { 2106 /* 2107 * Update xvattr mask to include only those attributes 2108 * that are actually changing. 2109 * 2110 * the bits will be restored prior to actually setting 2111 * the attributes so the caller thinks they were set. 2112 */ 2113 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2114 if (xoap->xoa_appendonly != 2115 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2116 need_policy = TRUE; 2117 } else { 2118 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2119 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2120 } 2121 } 2122 2123 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2124 if (xoap->xoa_projinherit != 2125 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2126 need_policy = TRUE; 2127 } else { 2128 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2129 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2130 } 2131 } 2132 2133 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2134 if (xoap->xoa_nounlink != 2135 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2136 need_policy = TRUE; 2137 } else { 2138 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2139 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2140 } 2141 } 2142 2143 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2144 if (xoap->xoa_immutable != 2145 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2146 need_policy = TRUE; 2147 } else { 2148 XVA_CLR_REQ(xvap, 
XAT_IMMUTABLE); 2149 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2150 } 2151 } 2152 2153 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2154 if (xoap->xoa_nodump != 2155 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2156 need_policy = TRUE; 2157 } else { 2158 XVA_CLR_REQ(xvap, XAT_NODUMP); 2159 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2160 } 2161 } 2162 2163 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2164 if (xoap->xoa_av_modified != 2165 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2166 need_policy = TRUE; 2167 } else { 2168 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2169 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2170 } 2171 } 2172 2173 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2174 if ((!S_ISREG(ip->i_mode) && 2175 xoap->xoa_av_quarantined) || 2176 xoap->xoa_av_quarantined != 2177 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2178 need_policy = TRUE; 2179 } else { 2180 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2181 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2182 } 2183 } 2184 2185 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2186 mutex_exit(&zp->z_lock); 2187 err = SET_ERROR(EPERM); 2188 goto out3; 2189 } 2190 2191 if (need_policy == FALSE && 2192 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2193 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2194 need_policy = TRUE; 2195 } 2196 } 2197 2198 mutex_exit(&zp->z_lock); 2199 2200 if (mask & ATTR_MODE) { 2201 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2202 mnt_ns) == 0) { 2203 err = secpolicy_setid_setsticky_clear(ip, vap, 2204 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2205 if (err) 2206 goto out3; 2207 trim_mask |= ATTR_MODE; 2208 } else { 2209 need_policy = TRUE; 2210 } 2211 } 2212 2213 if (need_policy) { 2214 /* 2215 * If trim_mask is set then take ownership 2216 * has been granted or write_acl is present and user 2217 * has the ability to modify mode. In that case remove 2218 * UID|GID and or MODE from mask so that 2219 * secpolicy_vnode_setattr() doesn't revoke it. 
2220 */ 2221 2222 if (trim_mask) { 2223 saved_mask = vap->va_mask; 2224 vap->va_mask &= ~trim_mask; 2225 } 2226 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2227 zfs_zaccess_unix, zp); 2228 if (err) 2229 goto out3; 2230 2231 if (trim_mask) 2232 vap->va_mask |= saved_mask; 2233 } 2234 2235 /* 2236 * secpolicy_vnode_setattr, or take ownership may have 2237 * changed va_mask 2238 */ 2239 mask = vap->va_mask; 2240 2241 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2242 handle_eadir = B_TRUE; 2243 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2244 &xattr_obj, sizeof (xattr_obj)); 2245 2246 if (err == 0 && xattr_obj) { 2247 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2248 if (err) 2249 goto out2; 2250 } 2251 if (mask & ATTR_UID) { 2252 new_kuid = zfs_fuid_create(zfsvfs, 2253 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2254 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2255 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2256 new_kuid)) { 2257 if (attrzp) 2258 zrele(attrzp); 2259 err = SET_ERROR(EDQUOT); 2260 goto out2; 2261 } 2262 } 2263 2264 if (mask & ATTR_GID) { 2265 new_kgid = zfs_fuid_create(zfsvfs, 2266 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2267 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2268 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2269 new_kgid)) { 2270 if (attrzp) 2271 zrele(attrzp); 2272 err = SET_ERROR(EDQUOT); 2273 goto out2; 2274 } 2275 } 2276 2277 if (projid != ZFS_INVALID_PROJID && 2278 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2279 if (attrzp) 2280 zrele(attrzp); 2281 err = EDQUOT; 2282 goto out2; 2283 } 2284 } 2285 tx = dmu_tx_create(os); 2286 2287 if (mask & ATTR_MODE) { 2288 uint64_t pmode = zp->z_mode; 2289 uint64_t acl_obj; 2290 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2291 2292 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2293 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2294 err = EPERM; 2295 goto out; 2296 } 2297 2298 if ((err = 
zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2299 goto out; 2300 2301 mutex_enter(&zp->z_lock); 2302 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2303 /* 2304 * Are we upgrading ACL from old V0 format 2305 * to V1 format? 2306 */ 2307 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2308 zfs_znode_acl_version(zp) == 2309 ZFS_ACL_VERSION_INITIAL) { 2310 dmu_tx_hold_free(tx, acl_obj, 0, 2311 DMU_OBJECT_END); 2312 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2313 0, aclp->z_acl_bytes); 2314 } else { 2315 dmu_tx_hold_write(tx, acl_obj, 0, 2316 aclp->z_acl_bytes); 2317 } 2318 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2319 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2320 0, aclp->z_acl_bytes); 2321 } 2322 mutex_exit(&zp->z_lock); 2323 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2324 } else { 2325 if (((mask & ATTR_XVATTR) && 2326 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2327 (projid != ZFS_INVALID_PROJID && 2328 !(zp->z_pflags & ZFS_PROJID))) 2329 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2330 else 2331 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2332 } 2333 2334 if (attrzp) { 2335 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2336 } 2337 2338 fuid_dirtied = zfsvfs->z_fuid_dirty; 2339 if (fuid_dirtied) 2340 zfs_fuid_txhold(zfsvfs, tx); 2341 2342 zfs_sa_upgrade_txholds(tx, zp); 2343 2344 err = dmu_tx_assign(tx, TXG_WAIT); 2345 if (err) 2346 goto out; 2347 2348 count = 0; 2349 /* 2350 * Set each attribute requested. 2351 * We group settings according to the locks they need to acquire. 2352 * 2353 * Note: you cannot set ctime directly, although it will be 2354 * updated as a side-effect of calling this function. 2355 */ 2356 2357 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2358 /* 2359 * For the existed object that is upgraded from old system, 2360 * its on-disk layout has no slot for the project ID attribute. 2361 * But quota accounting logic needs to access related slots by 2362 * offset directly. 
So we need to adjust old objects' layout 2363 * to make the project ID to some unified and fixed offset. 2364 */ 2365 if (attrzp) 2366 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2367 if (err == 0) 2368 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2369 2370 if (unlikely(err == EEXIST)) 2371 err = 0; 2372 else if (err != 0) 2373 goto out; 2374 else 2375 projid = ZFS_INVALID_PROJID; 2376 } 2377 2378 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2379 mutex_enter(&zp->z_acl_lock); 2380 mutex_enter(&zp->z_lock); 2381 2382 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2383 &zp->z_pflags, sizeof (zp->z_pflags)); 2384 2385 if (attrzp) { 2386 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2387 mutex_enter(&attrzp->z_acl_lock); 2388 mutex_enter(&attrzp->z_lock); 2389 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2390 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2391 sizeof (attrzp->z_pflags)); 2392 if (projid != ZFS_INVALID_PROJID) { 2393 attrzp->z_projid = projid; 2394 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2395 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2396 sizeof (attrzp->z_projid)); 2397 } 2398 } 2399 2400 if (mask & (ATTR_UID|ATTR_GID)) { 2401 2402 if (mask & ATTR_UID) { 2403 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2404 new_uid = zfs_uid_read(ZTOI(zp)); 2405 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2406 &new_uid, sizeof (new_uid)); 2407 if (attrzp) { 2408 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2409 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2410 sizeof (new_uid)); 2411 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2412 } 2413 } 2414 2415 if (mask & ATTR_GID) { 2416 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2417 new_gid = zfs_gid_read(ZTOI(zp)); 2418 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2419 NULL, &new_gid, sizeof (new_gid)); 2420 if (attrzp) { 2421 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2422 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2423 sizeof (new_gid)); 2424 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2425 } 2426 } 2427 if 
(!(mask & ATTR_MODE)) { 2428 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2429 NULL, &new_mode, sizeof (new_mode)); 2430 new_mode = zp->z_mode; 2431 } 2432 err = zfs_acl_chown_setattr(zp); 2433 ASSERT(err == 0); 2434 if (attrzp) { 2435 err = zfs_acl_chown_setattr(attrzp); 2436 ASSERT(err == 0); 2437 } 2438 } 2439 2440 if (mask & ATTR_MODE) { 2441 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2442 &new_mode, sizeof (new_mode)); 2443 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2444 ASSERT3P(aclp, !=, NULL); 2445 err = zfs_aclset_common(zp, aclp, cr, tx); 2446 ASSERT0(err); 2447 if (zp->z_acl_cached) 2448 zfs_acl_free(zp->z_acl_cached); 2449 zp->z_acl_cached = aclp; 2450 aclp = NULL; 2451 } 2452 2453 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2454 zp->z_atime_dirty = B_FALSE; 2455 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); 2456 ZFS_TIME_ENCODE(&tmp_atime, atime); 2457 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2458 &atime, sizeof (atime)); 2459 } 2460 2461 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2462 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2463 zpl_inode_set_mtime_to_ts(ZTOI(zp), 2464 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); 2465 2466 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2467 mtime, sizeof (mtime)); 2468 } 2469 2470 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2471 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2472 zpl_inode_set_ctime_to_ts(ZTOI(zp), 2473 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); 2474 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2475 ctime, sizeof (ctime)); 2476 } 2477 2478 if (projid != ZFS_INVALID_PROJID) { 2479 zp->z_projid = projid; 2480 SA_ADD_BULK_ATTR(bulk, count, 2481 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2482 sizeof (zp->z_projid)); 2483 } 2484 2485 if (attrzp && mask) { 2486 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2487 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2488 sizeof (ctime)); 2489 } 2490 2491 /* 2492 * Do this after setting timestamps to 
prevent timestamp 2493 * update from toggling bit 2494 */ 2495 2496 if (xoap && (mask & ATTR_XVATTR)) { 2497 2498 /* 2499 * restore trimmed off masks 2500 * so that return masks can be set for caller. 2501 */ 2502 2503 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2504 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2505 } 2506 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2507 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2508 } 2509 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2510 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2511 } 2512 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2513 XVA_SET_REQ(xvap, XAT_NODUMP); 2514 } 2515 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2516 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2517 } 2518 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2519 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2520 } 2521 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2522 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2523 } 2524 2525 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2526 ASSERT(S_ISREG(ip->i_mode)); 2527 2528 zfs_xvattr_set(zp, xvap, tx); 2529 } 2530 2531 if (fuid_dirtied) 2532 zfs_fuid_sync(zfsvfs, tx); 2533 2534 if (mask != 0) 2535 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2536 2537 mutex_exit(&zp->z_lock); 2538 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2539 mutex_exit(&zp->z_acl_lock); 2540 2541 if (attrzp) { 2542 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2543 mutex_exit(&attrzp->z_acl_lock); 2544 mutex_exit(&attrzp->z_lock); 2545 } 2546 out: 2547 if (err == 0 && xattr_count > 0) { 2548 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2549 xattr_count, tx); 2550 ASSERT(err2 == 0); 2551 } 2552 2553 if (aclp) 2554 zfs_acl_free(aclp); 2555 2556 if (fuidp) { 2557 zfs_fuid_info_free(fuidp); 2558 fuidp = NULL; 2559 } 2560 2561 if (err) { 2562 dmu_tx_abort(tx); 2563 if (attrzp) 2564 zrele(attrzp); 2565 if (err == ERESTART) 2566 goto top; 2567 } else { 2568 if (count > 0) 2569 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2570 dmu_tx_commit(tx); 2571 if 
(attrzp) { 2572 if (err2 == 0 && handle_eadir) 2573 err = zfs_setattr_dir(attrzp); 2574 zrele(attrzp); 2575 } 2576 zfs_znode_update_vfs(zp); 2577 } 2578 2579 out2: 2580 if (os->os_sync == ZFS_SYNC_ALWAYS) 2581 zil_commit(zilog, 0); 2582 2583 out3: 2584 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); 2585 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); 2586 kmem_free(tmpxvattr, sizeof (xvattr_t)); 2587 zfs_exit(zfsvfs, FTAG); 2588 return (err); 2589 } 2590 2591 typedef struct zfs_zlock { 2592 krwlock_t *zl_rwlock; /* lock we acquired */ 2593 znode_t *zl_znode; /* znode we held */ 2594 struct zfs_zlock *zl_next; /* next in list */ 2595 } zfs_zlock_t; 2596 2597 /* 2598 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2599 */ 2600 static void 2601 zfs_rename_unlock(zfs_zlock_t **zlpp) 2602 { 2603 zfs_zlock_t *zl; 2604 2605 while ((zl = *zlpp) != NULL) { 2606 if (zl->zl_znode != NULL) 2607 zfs_zrele_async(zl->zl_znode); 2608 rw_exit(zl->zl_rwlock); 2609 *zlpp = zl->zl_next; 2610 kmem_free(zl, sizeof (*zl)); 2611 } 2612 } 2613 2614 /* 2615 * Search back through the directory tree, using the ".." entries. 2616 * Lock each directory in the chain to prevent concurrent renames. 2617 * Fail any attempt to move a directory into one of its own descendants. 2618 * XXX - z_parent_lock can overlap with map or grow locks 2619 */ 2620 static int 2621 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2622 { 2623 zfs_zlock_t *zl; 2624 znode_t *zp = tdzp; 2625 uint64_t rootid = ZTOZSB(zp)->z_root; 2626 uint64_t oidp = zp->z_id; 2627 krwlock_t *rwlp = &szp->z_parent_lock; 2628 krw_t rw = RW_WRITER; 2629 2630 /* 2631 * First pass write-locks szp and compares to zp->z_id. 2632 * Later passes read-lock zp and compare to zp->z_parent. 2633 */ 2634 do { 2635 if (!rw_tryenter(rwlp, rw)) { 2636 /* 2637 * Another thread is renaming in this path. 2638 * Note that if we are a WRITER, we don't have any 2639 * parent_locks held yet. 
2640 */ 2641 if (rw == RW_READER && zp->z_id > szp->z_id) { 2642 /* 2643 * Drop our locks and restart 2644 */ 2645 zfs_rename_unlock(&zl); 2646 *zlpp = NULL; 2647 zp = tdzp; 2648 oidp = zp->z_id; 2649 rwlp = &szp->z_parent_lock; 2650 rw = RW_WRITER; 2651 continue; 2652 } else { 2653 /* 2654 * Wait for other thread to drop its locks 2655 */ 2656 rw_enter(rwlp, rw); 2657 } 2658 } 2659 2660 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2661 zl->zl_rwlock = rwlp; 2662 zl->zl_znode = NULL; 2663 zl->zl_next = *zlpp; 2664 *zlpp = zl; 2665 2666 if (oidp == szp->z_id) /* We're a descendant of szp */ 2667 return (SET_ERROR(EINVAL)); 2668 2669 if (oidp == rootid) /* We've hit the top */ 2670 return (0); 2671 2672 if (rw == RW_READER) { /* i.e. not the first pass */ 2673 int error = zfs_zget(ZTOZSB(zp), oidp, &zp); 2674 if (error) 2675 return (error); 2676 zl->zl_znode = zp; 2677 } 2678 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), 2679 &oidp, sizeof (oidp)); 2680 rwlp = &zp->z_parent_lock; 2681 rw = RW_READER; 2682 2683 } while (zp->z_id != sdzp->z_id); 2684 2685 return (0); 2686 } 2687 2688 /* 2689 * Move an entry from the provided source directory to the target 2690 * directory. Change the entry name as indicated. 2691 * 2692 * IN: sdzp - Source directory containing the "old entry". 2693 * snm - Old entry name. 2694 * tdzp - Target directory to contain the "new entry". 2695 * tnm - New entry name. 2696 * cr - credentials of caller. 2697 * flags - case flags 2698 * rflags - RENAME_* flags 2699 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). 2700 * mnt_ns - user namespace of the mount 2701 * 2702 * RETURN: 0 on success, error code on failure. 
2703 * 2704 * Timestamps: 2705 * sdzp,tdzp - ctime|mtime updated 2706 */ 2707 int 2708 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2709 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) 2710 { 2711 znode_t *szp, *tzp; 2712 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2713 zilog_t *zilog; 2714 zfs_dirlock_t *sdl, *tdl; 2715 dmu_tx_t *tx; 2716 zfs_zlock_t *zl; 2717 int cmp, serr, terr; 2718 int error = 0; 2719 int zflg = 0; 2720 boolean_t waited = B_FALSE; 2721 /* Needed for whiteout inode creation. */ 2722 boolean_t fuid_dirtied; 2723 zfs_acl_ids_t acl_ids; 2724 boolean_t have_acl = B_FALSE; 2725 znode_t *wzp = NULL; 2726 2727 2728 if (snm == NULL || tnm == NULL) 2729 return (SET_ERROR(EINVAL)); 2730 2731 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2732 return (SET_ERROR(EINVAL)); 2733 2734 /* Already checked by Linux VFS, but just to make sure. */ 2735 if (rflags & RENAME_EXCHANGE && 2736 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2737 return (SET_ERROR(EINVAL)); 2738 2739 /* 2740 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2741 * right kind of vattr_t for the whiteout file. These are set 2742 * internally by ZFS so should never be incorrect. 2743 */ 2744 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2745 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2746 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2747 2748 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2749 return (error); 2750 zilog = zfsvfs->z_log; 2751 2752 if ((error = zfs_verify_zp(tdzp)) != 0) { 2753 zfs_exit(zfsvfs, FTAG); 2754 return (error); 2755 } 2756 2757 /* 2758 * We check i_sb because snapshots and the ctldir must have different 2759 * super blocks. 
2760 */ 2761 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2762 zfsctl_is_node(ZTOI(tdzp))) { 2763 zfs_exit(zfsvfs, FTAG); 2764 return (SET_ERROR(EXDEV)); 2765 } 2766 2767 if (zfsvfs->z_utf8 && u8_validate(tnm, 2768 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2769 zfs_exit(zfsvfs, FTAG); 2770 return (SET_ERROR(EILSEQ)); 2771 } 2772 2773 if (flags & FIGNORECASE) 2774 zflg |= ZCILOOK; 2775 2776 top: 2777 szp = NULL; 2778 tzp = NULL; 2779 zl = NULL; 2780 2781 /* 2782 * This is to prevent the creation of links into attribute space 2783 * by renaming a linked file into/outof an attribute directory. 2784 * See the comment in zfs_link() for why this is considered bad. 2785 */ 2786 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2787 zfs_exit(zfsvfs, FTAG); 2788 return (SET_ERROR(EINVAL)); 2789 } 2790 2791 /* 2792 * Lock source and target directory entries. To prevent deadlock, 2793 * a lock ordering must be defined. We lock the directory with 2794 * the smallest object id first, or if it's a tie, the one with 2795 * the lexically first name. 2796 */ 2797 if (sdzp->z_id < tdzp->z_id) { 2798 cmp = -1; 2799 } else if (sdzp->z_id > tdzp->z_id) { 2800 cmp = 1; 2801 } else { 2802 /* 2803 * First compare the two name arguments without 2804 * considering any case folding. 2805 */ 2806 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2807 2808 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2809 ASSERT(error == 0 || !zfsvfs->z_utf8); 2810 if (cmp == 0) { 2811 /* 2812 * POSIX: "If the old argument and the new argument 2813 * both refer to links to the same existing file, 2814 * the rename() function shall return successfully 2815 * and perform no other action." 2816 */ 2817 zfs_exit(zfsvfs, FTAG); 2818 return (0); 2819 } 2820 /* 2821 * If the file system is case-folding, then we may 2822 * have some more checking to do. 
A case-folding file 2823 * system is either supporting mixed case sensitivity 2824 * access or is completely case-insensitive. Note 2825 * that the file system is always case preserving. 2826 * 2827 * In mixed sensitivity mode case sensitive behavior 2828 * is the default. FIGNORECASE must be used to 2829 * explicitly request case insensitive behavior. 2830 * 2831 * If the source and target names provided differ only 2832 * by case (e.g., a request to rename 'tim' to 'Tim'), 2833 * we will treat this as a special case in the 2834 * case-insensitive mode: as long as the source name 2835 * is an exact match, we will allow this to proceed as 2836 * a name-change request. 2837 */ 2838 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2839 (zfsvfs->z_case == ZFS_CASE_MIXED && 2840 flags & FIGNORECASE)) && 2841 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2842 &error) == 0) { 2843 /* 2844 * case preserving rename request, require exact 2845 * name matches 2846 */ 2847 zflg |= ZCIEXACT; 2848 zflg &= ~ZCILOOK; 2849 } 2850 } 2851 2852 /* 2853 * If the source and destination directories are the same, we should 2854 * grab the z_name_lock of that directory only once. 2855 */ 2856 if (sdzp == tdzp) { 2857 zflg |= ZHAVELOCK; 2858 rw_enter(&sdzp->z_name_lock, RW_READER); 2859 } 2860 2861 if (cmp < 0) { 2862 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2863 ZEXISTS | zflg, NULL, NULL); 2864 terr = zfs_dirent_lock(&tdl, 2865 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2866 } else { 2867 terr = zfs_dirent_lock(&tdl, 2868 tdzp, tnm, &tzp, zflg, NULL, NULL); 2869 serr = zfs_dirent_lock(&sdl, 2870 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2871 NULL, NULL); 2872 } 2873 2874 if (serr) { 2875 /* 2876 * Source entry invalid or not there. 
2877 */ 2878 if (!terr) { 2879 zfs_dirent_unlock(tdl); 2880 if (tzp) 2881 zrele(tzp); 2882 } 2883 2884 if (sdzp == tdzp) 2885 rw_exit(&sdzp->z_name_lock); 2886 2887 if (strcmp(snm, "..") == 0) 2888 serr = EINVAL; 2889 zfs_exit(zfsvfs, FTAG); 2890 return (serr); 2891 } 2892 if (terr) { 2893 zfs_dirent_unlock(sdl); 2894 zrele(szp); 2895 2896 if (sdzp == tdzp) 2897 rw_exit(&sdzp->z_name_lock); 2898 2899 if (strcmp(tnm, "..") == 0) 2900 terr = EINVAL; 2901 zfs_exit(zfsvfs, FTAG); 2902 return (terr); 2903 } 2904 2905 /* 2906 * If we are using project inheritance, means if the directory has 2907 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2908 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2909 * such case, we only allow renames into our tree when the project 2910 * IDs are the same. 2911 */ 2912 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2913 tdzp->z_projid != szp->z_projid) { 2914 error = SET_ERROR(EXDEV); 2915 goto out; 2916 } 2917 2918 /* 2919 * Must have write access at the source to remove the old entry 2920 * and write access at the target to create the new entry. 2921 * Note that if target and source are the same, this can be 2922 * done in a single check. 2923 */ 2924 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2925 goto out; 2926 2927 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2928 /* 2929 * Check to make sure rename is valid. 2930 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2931 */ 2932 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2933 goto out; 2934 } 2935 2936 /* 2937 * Does target exist? 2938 */ 2939 if (tzp) { 2940 if (rflags & RENAME_NOREPLACE) { 2941 error = SET_ERROR(EEXIST); 2942 goto out; 2943 } 2944 /* 2945 * Source and target must be the same type (unless exchanging). 
2946 */ 2947 if (!(rflags & RENAME_EXCHANGE)) { 2948 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 2949 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 2950 2951 if (s_is_dir != t_is_dir) { 2952 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 2953 goto out; 2954 } 2955 } 2956 /* 2957 * POSIX dictates that when the source and target 2958 * entries refer to the same file object, rename 2959 * must do nothing and exit without error. 2960 */ 2961 if (szp->z_id == tzp->z_id) { 2962 error = 0; 2963 goto out; 2964 } 2965 } else if (rflags & RENAME_EXCHANGE) { 2966 /* Target must exist for RENAME_EXCHANGE. */ 2967 error = SET_ERROR(ENOENT); 2968 goto out; 2969 } 2970 2971 /* Set up inode creation for RENAME_WHITEOUT. */ 2972 if (rflags & RENAME_WHITEOUT) { 2973 /* 2974 * Whiteout files are not regular files or directories, so to 2975 * match zfs_create() we do not inherit the project id. 2976 */ 2977 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 2978 2979 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 2980 if (error) 2981 goto out; 2982 2983 if (!have_acl) { 2984 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 2985 &acl_ids, mnt_ns); 2986 if (error) 2987 goto out; 2988 have_acl = B_TRUE; 2989 } 2990 2991 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 2992 error = SET_ERROR(EDQUOT); 2993 goto out; 2994 } 2995 } 2996 2997 tx = dmu_tx_create(zfsvfs->z_os); 2998 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 2999 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3000 dmu_tx_hold_zap(tx, sdzp->z_id, 3001 (rflags & RENAME_EXCHANGE) ? 
TRUE : FALSE, snm); 3002 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3003 if (sdzp != tdzp) { 3004 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3005 zfs_sa_upgrade_txholds(tx, tdzp); 3006 } 3007 if (tzp) { 3008 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3009 zfs_sa_upgrade_txholds(tx, tzp); 3010 } 3011 if (rflags & RENAME_WHITEOUT) { 3012 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3013 ZFS_SA_BASE_ATTR_SIZE); 3014 3015 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 3016 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3017 if (!zfsvfs->z_use_sa && 3018 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3019 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3020 0, acl_ids.z_aclp->z_acl_bytes); 3021 } 3022 } 3023 fuid_dirtied = zfsvfs->z_fuid_dirty; 3024 if (fuid_dirtied) 3025 zfs_fuid_txhold(zfsvfs, tx); 3026 zfs_sa_upgrade_txholds(tx, szp); 3027 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3028 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3029 if (error) { 3030 if (zl != NULL) 3031 zfs_rename_unlock(&zl); 3032 zfs_dirent_unlock(sdl); 3033 zfs_dirent_unlock(tdl); 3034 3035 if (sdzp == tdzp) 3036 rw_exit(&sdzp->z_name_lock); 3037 3038 if (error == ERESTART) { 3039 waited = B_TRUE; 3040 dmu_tx_wait(tx); 3041 dmu_tx_abort(tx); 3042 zrele(szp); 3043 if (tzp) 3044 zrele(tzp); 3045 goto top; 3046 } 3047 dmu_tx_abort(tx); 3048 zrele(szp); 3049 if (tzp) 3050 zrele(tzp); 3051 zfs_exit(zfsvfs, FTAG); 3052 return (error); 3053 } 3054 3055 /* 3056 * Unlink the source. 3057 */ 3058 szp->z_pflags |= ZFS_AV_MODIFIED; 3059 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3060 szp->z_pflags |= ZFS_PROJINHERIT; 3061 3062 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3063 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3064 VERIFY0(error); 3065 3066 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3067 if (error) 3068 goto commit; 3069 3070 /* 3071 * Unlink the target. 
3072 */ 3073 if (tzp) { 3074 int tzflg = zflg; 3075 3076 if (rflags & RENAME_EXCHANGE) { 3077 /* This inode will be re-linked soon. */ 3078 tzflg |= ZRENAMING; 3079 3080 tzp->z_pflags |= ZFS_AV_MODIFIED; 3081 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3082 tzp->z_pflags |= ZFS_PROJINHERIT; 3083 3084 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3085 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3086 ASSERT0(error); 3087 } 3088 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3089 if (error) 3090 goto commit_link_szp; 3091 } 3092 3093 /* 3094 * Create the new target links: 3095 * * We always link the target. 3096 * * RENAME_EXCHANGE: Link the old target to the source. 3097 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3098 */ 3099 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3100 if (error) { 3101 /* 3102 * If we have removed the existing target, a subsequent call to 3103 * zfs_link_create() to add back the same entry, but with a new 3104 * dnode (szp), should not fail. 3105 */ 3106 ASSERT3P(tzp, ==, NULL); 3107 goto commit_link_tzp; 3108 } 3109 3110 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3111 case RENAME_EXCHANGE: 3112 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3113 /* 3114 * The same argument as zfs_link_create() failing for 3115 * szp applies here, since the source directory must 3116 * have had an entry we are replacing. 3117 */ 3118 ASSERT0(error); 3119 if (error) 3120 goto commit_unlink_td_szp; 3121 break; 3122 case RENAME_WHITEOUT: 3123 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3124 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3125 if (error) { 3126 zfs_znode_delete(wzp, tx); 3127 remove_inode_hash(ZTOI(wzp)); 3128 goto commit_unlink_td_szp; 3129 } 3130 break; 3131 } 3132 3133 if (fuid_dirtied) 3134 zfs_fuid_sync(zfsvfs, tx); 3135 3136 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3137 case RENAME_EXCHANGE: 3138 zfs_log_rename_exchange(zilog, tx, 3139 (flags & FIGNORECASE ? 
TX_CI : 0), sdzp, sdl->dl_name, 3140 tdzp, tdl->dl_name, szp); 3141 break; 3142 case RENAME_WHITEOUT: 3143 zfs_log_rename_whiteout(zilog, tx, 3144 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3145 tdzp, tdl->dl_name, szp, wzp); 3146 break; 3147 default: 3148 ASSERT0(rflags & ~RENAME_NOREPLACE); 3149 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3150 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3151 break; 3152 } 3153 3154 commit: 3155 dmu_tx_commit(tx); 3156 out: 3157 if (have_acl) 3158 zfs_acl_ids_free(&acl_ids); 3159 3160 zfs_znode_update_vfs(sdzp); 3161 if (sdzp == tdzp) 3162 rw_exit(&sdzp->z_name_lock); 3163 3164 if (sdzp != tdzp) 3165 zfs_znode_update_vfs(tdzp); 3166 3167 zfs_znode_update_vfs(szp); 3168 zrele(szp); 3169 if (wzp) { 3170 zfs_znode_update_vfs(wzp); 3171 zrele(wzp); 3172 } 3173 if (tzp) { 3174 zfs_znode_update_vfs(tzp); 3175 zrele(tzp); 3176 } 3177 3178 if (zl != NULL) 3179 zfs_rename_unlock(&zl); 3180 3181 zfs_dirent_unlock(sdl); 3182 zfs_dirent_unlock(tdl); 3183 3184 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3185 zil_commit(zilog, 0); 3186 3187 zfs_exit(zfsvfs, FTAG); 3188 return (error); 3189 3190 /* 3191 * Clean-up path for broken link state. 3192 * 3193 * At this point we are in a (very) bad state, so we need to do our 3194 * best to correct the state. In particular, all of the nlinks are 3195 * wrong because we were destroying and creating links with ZRENAMING. 3196 * 3197 * In some form, all of these operations have to resolve the state: 3198 * 3199 * * link_destroy() *must* succeed. Fortunately, this is very likely 3200 * since we only just created it. 3201 * 3202 * * link_create()s are allowed to fail (though they shouldn't because 3203 * we only just unlinked them and are putting the entries back 3204 * during clean-up). 
But if they fail, we can just forcefully drop 3205 * the nlink value to (at the very least) avoid broken nlink values 3206 * -- though in the case of non-empty directories we will have to 3207 * panic (otherwise we'd have a leaked directory with a broken ..). 3208 */ 3209 commit_unlink_td_szp: 3210 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3211 commit_link_tzp: 3212 if (tzp) { 3213 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3214 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3215 } 3216 commit_link_szp: 3217 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3218 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3219 goto commit; 3220 } 3221 3222 /* 3223 * Insert the indicated symbolic reference entry into the directory. 3224 * 3225 * IN: dzp - Directory to contain new symbolic link. 3226 * name - Name of directory entry in dip. 3227 * vap - Attributes of new entry. 3228 * link - Name for new symlink entry. 3229 * cr - credentials of caller. 3230 * flags - case flags 3231 * mnt_ns - user namespace of the mount 3232 * 3233 * OUT: zpp - Znode for new symbolic link. 3234 * 3235 * RETURN: 0 on success, error code on failure. 
3236 * 3237 * Timestamps: 3238 * dip - ctime|mtime updated 3239 */ 3240 int 3241 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3242 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) 3243 { 3244 znode_t *zp; 3245 zfs_dirlock_t *dl; 3246 dmu_tx_t *tx; 3247 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3248 zilog_t *zilog; 3249 uint64_t len = strlen(link); 3250 int error; 3251 int zflg = ZNEW; 3252 zfs_acl_ids_t acl_ids; 3253 boolean_t fuid_dirtied; 3254 uint64_t txtype = TX_SYMLINK; 3255 boolean_t waited = B_FALSE; 3256 3257 ASSERT(S_ISLNK(vap->va_mode)); 3258 3259 if (name == NULL) 3260 return (SET_ERROR(EINVAL)); 3261 3262 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3263 return (error); 3264 zilog = zfsvfs->z_log; 3265 3266 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3267 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3268 zfs_exit(zfsvfs, FTAG); 3269 return (SET_ERROR(EILSEQ)); 3270 } 3271 if (flags & FIGNORECASE) 3272 zflg |= ZCILOOK; 3273 3274 if (len > MAXPATHLEN) { 3275 zfs_exit(zfsvfs, FTAG); 3276 return (SET_ERROR(ENAMETOOLONG)); 3277 } 3278 3279 if ((error = zfs_acl_ids_create(dzp, 0, 3280 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3281 zfs_exit(zfsvfs, FTAG); 3282 return (error); 3283 } 3284 top: 3285 *zpp = NULL; 3286 3287 /* 3288 * Attempt to lock directory; fail if entry already exists. 
3289 */ 3290 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3291 if (error) { 3292 zfs_acl_ids_free(&acl_ids); 3293 zfs_exit(zfsvfs, FTAG); 3294 return (error); 3295 } 3296 3297 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3298 zfs_acl_ids_free(&acl_ids); 3299 zfs_dirent_unlock(dl); 3300 zfs_exit(zfsvfs, FTAG); 3301 return (error); 3302 } 3303 3304 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3305 zfs_acl_ids_free(&acl_ids); 3306 zfs_dirent_unlock(dl); 3307 zfs_exit(zfsvfs, FTAG); 3308 return (SET_ERROR(EDQUOT)); 3309 } 3310 tx = dmu_tx_create(zfsvfs->z_os); 3311 fuid_dirtied = zfsvfs->z_fuid_dirty; 3312 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3313 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3314 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3315 ZFS_SA_BASE_ATTR_SIZE + len); 3316 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3317 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3318 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3319 acl_ids.z_aclp->z_acl_bytes); 3320 } 3321 if (fuid_dirtied) 3322 zfs_fuid_txhold(zfsvfs, tx); 3323 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3324 if (error) { 3325 zfs_dirent_unlock(dl); 3326 if (error == ERESTART) { 3327 waited = B_TRUE; 3328 dmu_tx_wait(tx); 3329 dmu_tx_abort(tx); 3330 goto top; 3331 } 3332 zfs_acl_ids_free(&acl_ids); 3333 dmu_tx_abort(tx); 3334 zfs_exit(zfsvfs, FTAG); 3335 return (error); 3336 } 3337 3338 /* 3339 * Create a new object for the symlink. 
3340 * for version 4 ZPL datasets the symlink will be an SA attribute 3341 */ 3342 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3343 3344 if (fuid_dirtied) 3345 zfs_fuid_sync(zfsvfs, tx); 3346 3347 mutex_enter(&zp->z_lock); 3348 if (zp->z_is_sa) 3349 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3350 link, len, tx); 3351 else 3352 zfs_sa_symlink(zp, link, len, tx); 3353 mutex_exit(&zp->z_lock); 3354 3355 zp->z_size = len; 3356 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3357 &zp->z_size, sizeof (zp->z_size), tx); 3358 /* 3359 * Insert the new object into the directory. 3360 */ 3361 error = zfs_link_create(dl, zp, tx, ZNEW); 3362 if (error != 0) { 3363 zfs_znode_delete(zp, tx); 3364 remove_inode_hash(ZTOI(zp)); 3365 } else { 3366 if (flags & FIGNORECASE) 3367 txtype |= TX_CI; 3368 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3369 3370 zfs_znode_update_vfs(dzp); 3371 zfs_znode_update_vfs(zp); 3372 } 3373 3374 zfs_acl_ids_free(&acl_ids); 3375 3376 dmu_tx_commit(tx); 3377 3378 zfs_dirent_unlock(dl); 3379 3380 if (error == 0) { 3381 *zpp = zp; 3382 3383 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3384 zil_commit(zilog, 0); 3385 } else { 3386 zrele(zp); 3387 } 3388 3389 zfs_exit(zfsvfs, FTAG); 3390 return (error); 3391 } 3392 3393 /* 3394 * Return, in the buffer contained in the provided uio structure, 3395 * the symbolic path referred to by ip. 3396 * 3397 * IN: ip - inode of symbolic link 3398 * uio - structure to contain the link path. 3399 * cr - credentials of caller. 
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 */
int
zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
{
	(void) cr;
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	int		error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/*
	 * z_lock serializes access to the znode's attributes while the
	 * link target is copied out.  When the symlink is stored as a
	 * system attribute (z_is_sa) read it directly from the SA;
	 * otherwise fall back to zfs_sa_readlink().
	 */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Insert a new entry into directory tdzp referencing szp.
 *
 * IN:	tdzp	- Directory to contain new entry.
 *	szp	- znode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdzp - ctime|mtime updated
 *	 szp - ctime updated
 */
int
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
    int flags)
{
	struct inode *sip = ZTOI(szp);
	znode_t		*tzp;
	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;
	boolean_t	is_tmpfile = 0;
	uint64_t	txg;
#ifdef HAVE_TMPFILE
	/* An O_TMPFILE inode: no links yet, but explicitly linkable */
	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
#endif
	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (S_ISDIR(sip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If we are using project inheritance, means if the directory has
	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
	 * such case, we only allow hard link creation in our tree when the
	 * project IDs are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * We check i_sb because snapshots and the ctldir must have different
	 * super blocks.
	 */
	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* Only the owner (or a privileged caller) may create the link */
	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
	    cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
	if (is_tmpfile)
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, tdzp);
	/*
	 * On ERESTART drop the dirent lock, wait for the txg, and retry
	 * from "top:".  TXG_NOTHROTTLE is used on the retry so a second
	 * throttle cannot starve us after we already waited once.
	 */
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/* unmark z_unlinked so zfs_link_create will not reject */
	if (is_tmpfile)
		szp->z_unlinked = B_FALSE;
	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		/*
		 * tmpfile is created to be in z_unlinkedobj, so remove it.
		 * Also, we don't log in ZIL, because all previous file
		 * operation on the tmpfile are ignored by ZIL. Instead we
		 * always wait for txg to sync to make sure all previous
		 * operation are sync safe.
		 */
		if (is_tmpfile) {
			VERIFY(zap_remove_int(zfsvfs->z_os,
			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
		} else {
			if (flags & FIGNORECASE)
				txtype |= TX_CI;
			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
		}
	} else if (is_tmpfile) {
		/* restore z_unlinked since linking failed */
		szp->z_unlinked = B_TRUE;
	}
	txg = dmu_tx_get_txg(tx);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	/* tmpfile links are not logged to the ZIL; force the txg out */
	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);

	zfs_znode_update_vfs(tdzp);
	zfs_znode_update_vfs(szp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Commit callback for a synchronous page writeback: clear any error
 * state on the page and mark its writeback complete.
 */
static void
zfs_putpage_sync_commit_cb(void *arg)
{
	struct page *pp = arg;

	ClearPageError(pp);
	end_page_writeback(pp);
}

/*
 * Commit callback for an asynchronous page writeback: as above, but
 * additionally drop the znode's count of in-flight async writes.
 */
static void
zfs_putpage_async_commit_cb(void *arg)
{
	struct page *pp = arg;
	znode_t *zp = ITOZ(pp->mapping->host);

	ClearPageError(pp);
	end_page_writeback(pp);
	atomic_dec_32(&zp->z_async_writes_cnt);
}

/*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
 *
 * IN:	ip	 - page mapped for inode.
 *	pp	 - page to push (page is locked)
 *	wbc	 - writeback control data
 *	for_sync - does the caller intend to wait synchronously for the
 *		   page writeback to complete?
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime|mtime updated
 */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
    boolean_t for_sync)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	loff_t offset;
	loff_t pgoff;
	unsigned int pglen;
	dmu_tx_t *tx;
	caddr_t va;
	int err = 0;
	uint64_t mtime[2], ctime[2];
	inode_timespec_t tmp_ts;
	sa_bulk_attr_t bulk[3];
	int cnt = 0;
	struct address_space *mapping;

	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (err);

	ASSERT(PageLocked(pp));

	pgoff = page_offset(pp);	/* Page byte-offset in file */
	offset = i_size_read(ip);	/* File length in bytes */
	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);

	/* Page is beyond end of file */
	if (pgoff >= offset) {
		unlock_page(pp);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Truncate page length to end of file */
	if (pgoff + pglen > offset)
		pglen = offset - pgoff;

#if 0
	/*
	 * FIXME: Allow mmap writes past its quota.  The correct fix
	 * is to register a page_mkwrite() handler to count the page
	 * against its quota when it is about to be dirtied.
	 */
	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
	    KUID_TO_SUID(ip->i_uid)) ||
	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
	    KGID_TO_SGID(ip->i_gid)) ||
	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
	    zp->z_projid))) {
		err = EDQUOT;
	}
#endif

	/*
	 * The ordering here is critical and must adhere to the following
	 * rules in order to avoid deadlocking in either zfs_read() or
	 * zfs_free_range() due to a lock inversion.
	 *
	 * 1) The page must be unlocked prior to acquiring the range lock.
	 *    This is critical because zfs_read() calls find_lock_page()
	 *    which may block on the page lock while holding the range lock.
	 *
	 * 2) Before setting or clearing write back on a page the range lock
	 *    must be held in order to prevent a lock inversion with the
	 *    zfs_free_range() function.
	 *
	 * This presents a problem because upon entering this function the
	 * page lock is already held.  To safely acquire the range lock the
	 * page lock must be dropped.  This creates a window where another
	 * process could truncate, invalidate, dirty, or write out the page.
	 *
	 * Therefore, after successfully reacquiring the range and page locks
	 * the current page state is checked.  In the common case everything
	 * will be as is expected and it can be written out.  However, if
	 * the page state has changed it must be handled accordingly.
	 */
	mapping = pp->mapping;
	redirty_page_for_writepage(wbc, pp);
	unlock_page(pp);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    pgoff, pglen, RL_WRITER);
	lock_page(pp);

	/* Page mapping changed or it was no longer dirty, we're done */
	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Another process started write block if required */
	if (PageWriteback(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);

		if (wbc->sync_mode != WB_SYNC_NONE) {
			/*
			 * Speed up any non-sync page writebacks since
			 * they may take several seconds to complete.
			 * Refer to the comment in zpl_fsync() (when
			 * HAVE_FSYNC_RANGE is defined) for details.
			 */
			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
				zil_commit(zfsvfs->z_log, zp->z_id);
			}

			if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
				folio_wait_bit(page_folio(pp), PG_writeback);
#else
				wait_on_page_bit(pp, PG_writeback);
#endif
		}

		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Clear the dirty flag now that the required locks are held */
	if (!clear_page_dirty_for_io(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/*
	 * Counterpart for redirty_page_for_writepage() above.  This page
	 * was in fact not skipped and should not be counted as if it were.
	 */
	wbc->pages_skipped--;
	if (!for_sync)
		atomic_inc_32(&zp->z_async_writes_cnt);
	set_page_writeback(pp);
	unlock_page(pp);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		/* Assignment failed: re-dirty the page and undo writeback */
		dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
#else
		__set_page_dirty_nobuffers(pp);
#endif
		ClearPageError(pp);
		end_page_writeback(pp);
		if (!for_sync)
			atomic_dec_32(&zp->z_async_writes_cnt);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}

	va = kmap(pp);
	ASSERT3U(pglen, <=, PAGE_SIZE);
	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
	kunmap(pp);

	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/* Preserve the mtime and ctime provided by the inode */
	tmp_ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, mtime);
	tmp_ts = zpl_inode_get_ctime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, ctime);
	zp->z_atime_dirty = B_FALSE;
	zp->z_seq++;

	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

	boolean_t commit = B_FALSE;
	if (wbc->sync_mode != WB_SYNC_NONE) {
		/*
		 * Note that this is rarely called under writepages(), because
		 * writepages() normally handles the entire commit for
		 * performance reasons.
		 */
		commit = B_TRUE;
	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
		/*
		 * If the caller does not intend to wait synchronously
		 * for this page writeback to complete and there are active
		 * synchronous calls on this file, do a commit so that
		 * the latter don't accidentally end up waiting for
		 * our writeback to complete. Refer to the comment in
		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
		 */
		commit = B_TRUE;
	}

	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
	    for_sync ? zfs_putpage_sync_commit_cb :
	    zfs_putpage_async_commit_cb, pp);

	dmu_tx_commit(tx);

	zfs_rangelock_exit(lr);

	if (commit)
		zil_commit(zfsvfs->z_log, zp->z_id);

	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);

	zfs_exit(zfsvfs, FTAG);
	return (err);
}

/*
 * Update the system attributes when the inode has been dirtied.  For the
 * moment we only update the mode, atime, mtime, and ctime.
 */
int
zfs_dirty_inode(struct inode *ip, int flags)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	dmu_tx_t *tx;
	uint64_t mode, atime[2], mtime[2], ctime[2];
	inode_timespec_t tmp_ts;
	sa_bulk_attr_t bulk[4];
	int error = 0;
	int cnt = 0;

	/* Nothing to persist for read-only filesystems or snapshots */
	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return (0);

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

#ifdef I_DIRTY_TIME
	/*
	 * This is the lazytime semantic introduced in Linux 4.0
	 * This flag will only be called from update_time when lazytime is set.
	 * (Note, I_DIRTY_SYNC will also set if not lazytime)
	 * Fortunately mtime and ctime are managed within ZFS itself, so we
	 * only need to dirty atime.
	 */
	if (flags == I_DIRTY_TIME) {
		zp->z_atime_dirty = B_TRUE;
		goto out;
	}
#endif

	tx = dmu_tx_create(zfsvfs->z_os);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	mutex_enter(&zp->z_lock);
	zp->z_atime_dirty = B_FALSE;

	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

	/* Preserve the mode, atime, mtime, and ctime provided by the inode */
	tmp_ts = zpl_inode_get_atime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, atime);
	tmp_ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, mtime);
	tmp_ts = zpl_inode_get_ctime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, ctime);
	mode = ip->i_mode;

	zp->z_mode = mode;

	error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
	mutex_exit(&zp->z_lock);

	dmu_tx_commit(tx);
out:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Tear down the in-core znode: flush a dirty atime out to the SA when
 * safe to do so, then release the znode via zfs_zinactive().
 */
void
zfs_inactive(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint64_t atime[2];
	int error;
	int need_unlock = 0;

	/* Only read lock if we haven't already write locked, e.g. rollback */
	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
		need_unlock = 1;
		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	}
	/* Already torn down (no SA handle); nothing left to do */
	if (zp->z_sa_hdl == NULL) {
		if (need_unlock)
			rw_exit(&zfsvfs->z_teardown_inactive_lock);
		return;
	}

	/* Persist a dirtied atime, but not for unlinked files */
	if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			inode_timespec_t tmp_atime;
			tmp_atime = zpl_inode_get_atime(ip);
			ZFS_TIME_ENCODE(&tmp_atime, atime);
			mutex_enter(&zp->z_lock);
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&atime, sizeof (atime), tx);
			zp->z_atime_dirty = B_FALSE;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	if (need_unlock)
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/*
 * Fill pages with data from the disk.
 */
static int
zfs_fillpage(struct inode *ip, struct page *pp)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	loff_t i_size = i_size_read(ip);
	u_offset_t io_off = page_offset(pp);
	size_t io_len = PAGE_SIZE;

	/* Callers must not pass a page entirely beyond EOF */
	ASSERT3U(io_off, <, i_size);

	/* Clamp the read length at end of file */
	if (io_off + io_len > i_size)
		io_len = i_size - io_off;

	void *va = kmap(pp);
	int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
	    io_len, va, DMU_READ_PREFETCH);
	/* Zero-fill the portion of the page past EOF */
	if (io_len != PAGE_SIZE)
		memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
	kunmap(pp);

	if (error) {
		/* convert checksum errors into IO errors */
		if (error == ECKSUM)
			error = SET_ERROR(EIO);

		SetPageError(pp);
		ClearPageUptodate(pp);
	} else {
		ClearPageError(pp);
		SetPageUptodate(pp);
	}

	return (error);
}

/*
 * Uses zfs_fillpage to read data from the file and fill the page.
 *
 * IN:	ip	- inode of file to get data from.
 *	pp	- page to read
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
int
zfs_getpage(struct inode *ip, struct page *pp)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	znode_t *zp = ITOZ(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	error = zfs_fillpage(ip, pp);
	if (error == 0)
		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Check ZFS specific permissions to memory map a section of a file.
 *
 * IN:	ip	- inode of the file to mmap
 *	off	- file offset
 *	addrp	- start address in memory region
 *	len	- length of memory region
 *	vm_flags- address flags
 *
 * RETURN:	0 if success
 *		error code if failure
 */
int
zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
    unsigned long vm_flags)
{
	(void) addrp;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/*
	 * Deny writable shared mappings of files flagged immutable,
	 * read-only, or append-only.
	 */
	if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
	    (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* Quarantined files may not be mapped for read or execute */
	if ((vm_flags & (VM_READ | VM_EXEC)) &&
	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* Reject mappings extending beyond the maximum file offset */
	if (off < 0 || len > MAXOFFSET_T - off) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENXIO));
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

/*
 * Free or allocate space in a file.  Currently, this function only
 * supports the `F_FREESP' command.  However, this command is somewhat
 * misnamed, as its functionality includes the ability to allocate as
 * well as free space.
 *
 * IN:	zp	- znode of file to free data in.
 *	cmd	- action to take (only F_FREESP supported).
 *	bfp	- section of file to free/alloc.
 *	flag	- current file open mode flags.
 *	offset	- current file offset.
 *	cr	- credentials of caller.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	zp - ctime|mtime updated
 */
int
zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
    offset_t offset, cred_t *cr)
{
	(void) offset;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t off, len;
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (cmd != F_FREESP) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/* A negative length is never valid */
	if (bfp->l_len < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Permissions aren't checked on Solaris because on this OS
	 * zfs_space() can only be called with an opened file handle.
	 * On Linux we can get here through truncate_range() which
	 * operates directly on inodes, so we need to check access rights.
	 */
	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	off = bfp->l_start;
	len = bfp->l_len; /* 0 means from off to end of file */

	error = zfs_freesp(zp, off, len, flag, TRUE);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Build a short file identifier (zfid_short_t) for the inode from its
 * object number and generation, for use by file-handle based lookups.
 */
int
zfs_fid(struct inode *ip, fid_t *fidp)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint32_t gen;
	uint64_t gen64;
	uint64_t object = zp->z_id;
	zfid_short_t *zfid;
	int size, i, error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	/* Caller's buffer is too small: report the required length */
	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	if ((error = zfs_verify_zp(zp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	gen = (uint32_t)gen64;

	size = SHORT_FID_LEN;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Pack the object number a byte at a time, least-significant first */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/* Tunable: files larger than this many blocks are deleted asynchronously */
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
#endif