1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 27 */ 28 29 /* Portions Copyright 2007 Jeremy Teo */ 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/kmem.h> 41 #include <sys/taskq.h> 42 #include <sys/uio.h> 43 #include <sys/vmsystm.h> 44 #include <sys/atomic.h> 45 #include <sys/pathname.h> 46 #include <sys/cmn_err.h> 47 #include <sys/errno.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zfs_acl.h> 50 #include <sys/zfs_ioctl.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_objset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/policy.h> 60 #include <sys/sunddi.h> 61 #include <sys/sid.h> 62 #include <sys/zfs_ctldir.h> 63 #include <sys/zfs_fuid.h> 64 #include <sys/zfs_quota.h> 65 #include <sys/zfs_sa.h> 66 #include <sys/zfs_vnops.h> 67 #include <sys/zfs_rlock.h> 68 #include <sys/cred.h> 69 #include <sys/zpl.h> 70 #include <sys/zil.h> 71 #include <sys/sa_impl.h> 72 #include <linux/mm_compat.h> 73 74 /* 75 * Programming rules. 76 * 77 * Each vnode op performs some logical unit of work. To do this, the ZPL must 78 * properly lock its in-core state, create a DMU transaction, do the work, 79 * record this work in the intent log (ZIL), commit the DMU transaction, 80 * and wait for the intent log to commit if it is a synchronous operation. 81 * Moreover, the vnode ops must work in both normal and log replay context. 82 * The ordering of events is important to avoid deadlocks and references 83 * to freed memory. The example below illustrates the following Big Rules: 84 * 85 * (1) A check must be made in each zfs thread for a mounted file system. 86 * This is done avoiding races using zfs_enter(zfsvfs). 87 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes 88 * must be checked with zfs_verify_zp(zp). Both of these macros 89 * can return EIO from the calling function. 90 * 91 * (2) zrele() should always be the last thing except for zil_commit() (if 92 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 93 * last reference, the vnode/znode can be freed, so the zp may point to 94 * freed memory. Second, the last reference will call zfs_zinactive(), 95 * which may induce a lot of work -- pushing cached pages (which acquires 96 * range locks) and syncing out cached atime changes. Third, 97 * zfs_zinactive() may require a new tx, which could deadlock the system 98 * if you were already holding one. This deadlock occurs because the tx 99 * currently being operated on prevents a txg from syncing, which 100 * prevents the new tx from progressing, resulting in a deadlock. If you 101 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 102 * is a synonym for zrele(). 103 * 104 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 105 * as they can span dmu_tx_assign() calls. 106 * 107 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 108 * dmu_tx_assign(). This is critical because we don't want to block 109 * while holding locks. 110 * 111 * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This 112 * reduces lock contention and CPU usage when we must wait (note that if 113 * throughput is constrained by the storage, nearly every transaction 114 * must wait). 115 * 116 * Note, in particular, that if a lock is sometimes acquired before 117 * the tx assigns, and sometimes after (e.g. z_lock), then failing 118 * to use a non-blocking assign can deadlock the system. The scenario: 119 * 120 * Thread A has grabbed a lock before calling dmu_tx_assign(). 121 * Thread B is in an already-assigned tx, and blocks for this lock. 122 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 123 * forever, because the previous txg can't quiesce until B's tx commits. 124 * 125 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 126 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 127 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 128 * to indicate that this operation has already called dmu_tx_wait(). 129 * This will ensure that we don't retry forever, waiting a short bit 130 * each time. 131 * 132 * (5) If the operation succeeded, generate the intent log entry for it 133 * before dropping locks. This ensures that the ordering of events 134 * in the intent log matches the order in which they actually occurred. 135 * During ZIL replay the zfs_log_* functions will update the sequence 136 * number to indicate the zil transaction has replayed. 137 * 138 * (6) At the end of each vnode op, the DMU tx must always commit, 139 * regardless of whether there were any errors. 140 * 141 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 142 * to ensure that synchronous semantics are provided when necessary. 143 * 144 * In general, this is how things should be ordered in each vnode op: 145 * 146 * zfs_enter(zfsvfs); // exit if unmounted 147 * top: 148 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 149 * rw_enter(...); // grab any other locks you need 150 * tx = dmu_tx_create(...); // get DMU tx 151 * dmu_tx_hold_*(); // hold each object you might modify 152 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 153 * if (error) { 154 * rw_exit(...); // drop locks 155 * zfs_dirent_unlock(dl); // unlock directory entry 156 * zrele(...); // release held znodes 157 * if (error == ERESTART) { 158 * waited = B_TRUE; 159 * dmu_tx_wait(tx); 160 * dmu_tx_abort(tx); 161 * goto top; 162 * } 163 * dmu_tx_abort(tx); // abort DMU tx 164 * zfs_exit(zfsvfs); // finished in zfs 165 * return (error); // really out of space 166 * } 167 * error = do_real_work(); // do whatever this VOP does 168 * if (error == 0) 169 * zfs_log_*(...); // on success, make ZIL entry 170 * dmu_tx_commit(tx); // commit DMU tx -- error or not 171 * rw_exit(...); // drop locks 172 * zfs_dirent_unlock(dl); // unlock directory entry 173 * zrele(...); // release held znodes 174 * zil_commit(zilog, foid); // synchronous when necessary 175 * zfs_exit(zfsvfs); // finished in zfs 176 * return (error); // done, report error 177 */ 178 int 179 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) 180 { 181 (void) cr; 182 znode_t *zp = ITOZ(ip); 183 zfsvfs_t *zfsvfs = ITOZSB(ip); 184 int error; 185 186 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 187 return (error); 188 189 /* Honor ZFS_APPENDONLY file attribute */ 190 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && 191 ((flag & O_APPEND) == 0)) { 192 zfs_exit(zfsvfs, FTAG); 193 return (SET_ERROR(EPERM)); 194 } 195 196 /* 197 * Keep a count of the synchronous opens in the znode. On first 198 * synchronous open we must convert all previous async transactions 199 * into sync to keep correct ordering. 200 */ 201 if (flag & O_SYNC) { 202 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) 203 zil_async_to_sync(zfsvfs->z_log, zp->z_id); 204 } 205 206 zfs_exit(zfsvfs, FTAG); 207 return (0); 208 } 209 210 int 211 zfs_close(struct inode *ip, int flag, cred_t *cr) 212 { 213 (void) cr; 214 znode_t *zp = ITOZ(ip); 215 zfsvfs_t *zfsvfs = ITOZSB(ip); 216 int error; 217 218 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 219 return (error); 220 221 /* Decrement the synchronous opens in the znode */ 222 if (flag & O_SYNC) 223 atomic_dec_32(&zp->z_sync_cnt); 224 225 zfs_exit(zfsvfs, FTAG); 226 return (0); 227 } 228 229 #if defined(_KERNEL) 230 231 static int zfs_fillpage(struct inode *ip, struct page *pp); 232 233 /* 234 * When a file is memory mapped, we must keep the IO data synchronized 235 * between the DMU cache and the memory mapped pages. Update all mapped 236 * pages with the contents of the coresponding dmu buffer. 237 */ 238 void 239 update_pages(znode_t *zp, int64_t start, int len, objset_t *os) 240 { 241 struct address_space *mp = ZTOI(zp)->i_mapping; 242 int64_t off = start & (PAGE_SIZE - 1); 243 244 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 245 uint64_t nbytes = MIN(PAGE_SIZE - off, len); 246 247 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 248 if (pp) { 249 if (mapping_writably_mapped(mp)) 250 flush_dcache_page(pp); 251 252 void *pb = kmap(pp); 253 int error = dmu_read(os, zp->z_id, start + off, 254 nbytes, pb + off, DMU_READ_PREFETCH); 255 kunmap(pp); 256 257 if (error) { 258 SetPageError(pp); 259 ClearPageUptodate(pp); 260 } else { 261 ClearPageError(pp); 262 SetPageUptodate(pp); 263 264 if (mapping_writably_mapped(mp)) 265 flush_dcache_page(pp); 266 267 mark_page_accessed(pp); 268 } 269 270 unlock_page(pp); 271 put_page(pp); 272 } 273 274 len -= nbytes; 275 off = 0; 276 } 277 } 278 279 /* 280 * When a file is memory mapped, we must keep the I/O data synchronized 281 * between the DMU cache and the memory mapped pages. Preferentially read 282 * from memory mapped pages, otherwise fallback to reading through the dmu. 283 */ 284 int 285 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) 286 { 287 struct inode *ip = ZTOI(zp); 288 struct address_space *mp = ip->i_mapping; 289 int64_t start = uio->uio_loffset; 290 int64_t off = start & (PAGE_SIZE - 1); 291 int len = nbytes; 292 int error = 0; 293 294 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 295 uint64_t bytes = MIN(PAGE_SIZE - off, len); 296 297 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 298 if (pp) { 299 300 /* 301 * If filemap_fault() retries there exists a window 302 * where the page will be unlocked and not up to date. 303 * In this case we must try and fill the page. 304 */ 305 if (unlikely(!PageUptodate(pp))) { 306 error = zfs_fillpage(ip, pp); 307 if (error) { 308 unlock_page(pp); 309 put_page(pp); 310 return (error); 311 } 312 } 313 314 ASSERT(PageUptodate(pp) || PageDirty(pp)); 315 316 unlock_page(pp); 317 318 void *pb = kmap(pp); 319 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); 320 kunmap(pp); 321 322 if (mapping_writably_mapped(mp)) 323 flush_dcache_page(pp); 324 325 mark_page_accessed(pp); 326 put_page(pp); 327 } else { 328 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 329 uio, bytes); 330 } 331 332 len -= bytes; 333 off = 0; 334 335 if (error) 336 break; 337 } 338 339 return (error); 340 } 341 #endif /* _KERNEL */ 342 343 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; 344 345 /* 346 * Write the bytes to a file. 347 * 348 * IN: zp - znode of file to be written to 349 * data - bytes to write 350 * len - number of bytes to write 351 * pos - offset to start writing at 352 * 353 * OUT: resid - remaining bytes to write 354 * 355 * RETURN: 0 if success 356 * positive error code if failure. EIO is returned 357 * for a short write when residp isn't provided. 358 * 359 * Timestamps: 360 * zp - ctime|mtime updated if byte count > 0 361 */ 362 int 363 zfs_write_simple(znode_t *zp, const void *data, size_t len, 364 loff_t pos, size_t *residp) 365 { 366 fstrans_cookie_t cookie; 367 int error; 368 369 struct iovec iov; 370 iov.iov_base = (void *)data; 371 iov.iov_len = len; 372 373 zfs_uio_t uio; 374 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); 375 376 cookie = spl_fstrans_mark(); 377 error = zfs_write(zp, &uio, 0, kcred); 378 spl_fstrans_unmark(cookie); 379 380 if (error == 0) { 381 if (residp != NULL) 382 *residp = zfs_uio_resid(&uio); 383 else if (zfs_uio_resid(&uio) != 0) 384 error = SET_ERROR(EIO); 385 } 386 387 return (error); 388 } 389 390 static void 391 zfs_rele_async_task(void *arg) 392 { 393 iput(arg); 394 } 395 396 void 397 zfs_zrele_async(znode_t *zp) 398 { 399 struct inode *ip = ZTOI(zp); 400 objset_t *os = ITOZSB(ip)->z_os; 401 402 ASSERT(atomic_read(&ip->i_count) > 0); 403 ASSERT(os != NULL); 404 405 /* 406 * If decrementing the count would put us at 0, we can't do it inline 407 * here, because that would be synchronous. Instead, dispatch an iput 408 * to run later. 409 * 410 * For more information on the dangers of a synchronous iput, see the 411 * header comment of this file. 412 */ 413 if (!atomic_add_unless(&ip->i_count, -1, 1)) { 414 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), 415 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); 416 } 417 } 418 419 420 /* 421 * Lookup an entry in a directory, or an extended attribute directory. 422 * If it exists, return a held inode reference for it. 423 * 424 * IN: zdp - znode of directory to search. 425 * nm - name of entry to lookup. 426 * flags - LOOKUP_XATTR set if looking for an attribute. 427 * cr - credentials of caller. 428 * direntflags - directory lookup flags 429 * realpnp - returned pathname. 430 * 431 * OUT: zpp - znode of located entry, NULL if not found. 432 * 433 * RETURN: 0 on success, error code on failure. 434 * 435 * Timestamps: 436 * NA 437 */ 438 int 439 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, 440 int *direntflags, pathname_t *realpnp) 441 { 442 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 443 int error = 0; 444 445 /* 446 * Fast path lookup, however we must skip DNLC lookup 447 * for case folding or normalizing lookups because the 448 * DNLC code only stores the passed in name. This means 449 * creating 'a' and removing 'A' on a case insensitive 450 * file system would work, but DNLC still thinks 'a' 451 * exists and won't let you create it again on the next 452 * pass through fast path. 453 */ 454 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 455 456 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 457 return (SET_ERROR(ENOTDIR)); 458 } else if (zdp->z_sa_hdl == NULL) { 459 return (SET_ERROR(EIO)); 460 } 461 462 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 463 error = zfs_fastaccesschk_execute(zdp, cr); 464 if (!error) { 465 *zpp = zdp; 466 zhold(*zpp); 467 return (0); 468 } 469 return (error); 470 } 471 } 472 473 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) 474 return (error); 475 476 *zpp = NULL; 477 478 if (flags & LOOKUP_XATTR) { 479 /* 480 * We don't allow recursive attributes.. 481 * Maybe someday we will. 482 */ 483 if (zdp->z_pflags & ZFS_XATTR) { 484 zfs_exit(zfsvfs, FTAG); 485 return (SET_ERROR(EINVAL)); 486 } 487 488 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { 489 zfs_exit(zfsvfs, FTAG); 490 return (error); 491 } 492 493 /* 494 * Do we have permission to get into attribute directory? 495 */ 496 497 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, 498 B_TRUE, cr, zfs_init_idmap))) { 499 zrele(*zpp); 500 *zpp = NULL; 501 } 502 503 zfs_exit(zfsvfs, FTAG); 504 return (error); 505 } 506 507 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 508 zfs_exit(zfsvfs, FTAG); 509 return (SET_ERROR(ENOTDIR)); 510 } 511 512 /* 513 * Check accessibility of directory. 514 */ 515 516 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, 517 zfs_init_idmap))) { 518 zfs_exit(zfsvfs, FTAG); 519 return (error); 520 } 521 522 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 523 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 524 zfs_exit(zfsvfs, FTAG); 525 return (SET_ERROR(EILSEQ)); 526 } 527 528 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); 529 if ((error == 0) && (*zpp)) 530 zfs_znode_update_vfs(*zpp); 531 532 zfs_exit(zfsvfs, FTAG); 533 return (error); 534 } 535 536 /* 537 * Perform a linear search in directory for the name of specific inode. 538 * Note we don't pass in the buffer size of name because it's hardcoded to 539 * NAME_MAX+1(256) in Linux. 540 * 541 * IN: dzp - znode of directory to search. 542 * zp - znode of the target 543 * 544 * OUT: name - dentry name of the target 545 * 546 * RETURN: 0 on success, error code on failure. 547 */ 548 int 549 zfs_get_name(znode_t *dzp, char *name, znode_t *zp) 550 { 551 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 552 int error = 0; 553 554 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 555 return (error); 556 557 if ((error = zfs_verify_zp(zp)) != 0) { 558 zfs_exit(zfsvfs, FTAG); 559 return (error); 560 } 561 562 /* ctldir should have got their name in zfs_vget */ 563 if (dzp->z_is_ctldir || zp->z_is_ctldir) { 564 zfs_exit(zfsvfs, FTAG); 565 return (ENOENT); 566 } 567 568 /* buffer len is hardcoded to 256 in Linux kernel */ 569 error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id, 570 ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN); 571 572 zfs_exit(zfsvfs, FTAG); 573 return (error); 574 } 575 576 /* 577 * Attempt to create a new entry in a directory. If the entry 578 * already exists, truncate the file if permissible, else return 579 * an error. Return the ip of the created or trunc'd file. 580 * 581 * IN: dzp - znode of directory to put new file entry in. 582 * name - name of new file entry. 583 * vap - attributes of new file. 584 * excl - flag indicating exclusive or non-exclusive mode. 585 * mode - mode to open file with. 586 * cr - credentials of caller. 587 * flag - file flag. 588 * vsecp - ACL to be set 589 * mnt_ns - user namespace of the mount 590 * 591 * OUT: zpp - znode of created or trunc'd entry. 592 * 593 * RETURN: 0 on success, error code on failure. 594 * 595 * Timestamps: 596 * dzp - ctime|mtime updated if new entry created 597 * zp - ctime|mtime always, atime if new 598 */ 599 int 600 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, 601 int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, 602 zidmap_t *mnt_ns) 603 { 604 znode_t *zp; 605 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 606 zilog_t *zilog; 607 objset_t *os; 608 zfs_dirlock_t *dl; 609 dmu_tx_t *tx; 610 int error; 611 uid_t uid; 612 gid_t gid; 613 zfs_acl_ids_t acl_ids; 614 boolean_t fuid_dirtied; 615 boolean_t have_acl = B_FALSE; 616 boolean_t waited = B_FALSE; 617 boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 618 619 /* 620 * If we have an ephemeral id, ACL, or XVATTR then 621 * make sure file system is at proper version 622 */ 623 624 gid = crgetgid(cr); 625 uid = crgetuid(cr); 626 627 if (zfsvfs->z_use_fuids == B_FALSE && 628 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 629 return (SET_ERROR(EINVAL)); 630 631 if (name == NULL) 632 return (SET_ERROR(EINVAL)); 633 634 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 635 return (error); 636 os = zfsvfs->z_os; 637 zilog = zfsvfs->z_log; 638 639 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 640 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 641 zfs_exit(zfsvfs, FTAG); 642 return (SET_ERROR(EILSEQ)); 643 } 644 645 if (vap->va_mask & ATTR_XVATTR) { 646 if ((error = secpolicy_xvattr((xvattr_t *)vap, 647 crgetuid(cr), cr, vap->va_mode)) != 0) { 648 zfs_exit(zfsvfs, FTAG); 649 return (error); 650 } 651 } 652 653 top: 654 *zpp = NULL; 655 if (*name == '\0') { 656 /* 657 * Null component name refers to the directory itself. 658 */ 659 zhold(dzp); 660 zp = dzp; 661 dl = NULL; 662 error = 0; 663 } else { 664 /* possible igrab(zp) */ 665 int zflg = 0; 666 667 if (flag & FIGNORECASE) 668 zflg |= ZCILOOK; 669 670 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 671 NULL, NULL); 672 if (error) { 673 if (have_acl) 674 zfs_acl_ids_free(&acl_ids); 675 if (strcmp(name, "..") == 0) 676 error = SET_ERROR(EISDIR); 677 zfs_exit(zfsvfs, FTAG); 678 return (error); 679 } 680 } 681 682 if (zp == NULL) { 683 uint64_t txtype; 684 uint64_t projid = ZFS_DEFAULT_PROJID; 685 686 /* 687 * Create a new file object and update the directory 688 * to reference it. 689 */ 690 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, 691 mnt_ns))) { 692 if (have_acl) 693 zfs_acl_ids_free(&acl_ids); 694 goto out; 695 } 696 697 /* 698 * We only support the creation of regular files in 699 * extended attribute directories. 700 */ 701 702 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { 703 if (have_acl) 704 zfs_acl_ids_free(&acl_ids); 705 error = SET_ERROR(EINVAL); 706 goto out; 707 } 708 709 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 710 cr, vsecp, &acl_ids, mnt_ns)) != 0) 711 goto out; 712 have_acl = B_TRUE; 713 714 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 715 projid = zfs_inherit_projid(dzp); 716 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 717 zfs_acl_ids_free(&acl_ids); 718 error = SET_ERROR(EDQUOT); 719 goto out; 720 } 721 722 tx = dmu_tx_create(os); 723 724 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 725 ZFS_SA_BASE_ATTR_SIZE); 726 727 fuid_dirtied = zfsvfs->z_fuid_dirty; 728 if (fuid_dirtied) 729 zfs_fuid_txhold(zfsvfs, tx); 730 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 731 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 732 if (!zfsvfs->z_use_sa && 733 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 734 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 735 0, acl_ids.z_aclp->z_acl_bytes); 736 } 737 738 error = dmu_tx_assign(tx, 739 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 740 if (error) { 741 zfs_dirent_unlock(dl); 742 if (error == ERESTART) { 743 waited = B_TRUE; 744 dmu_tx_wait(tx); 745 dmu_tx_abort(tx); 746 goto top; 747 } 748 zfs_acl_ids_free(&acl_ids); 749 dmu_tx_abort(tx); 750 zfs_exit(zfsvfs, FTAG); 751 return (error); 752 } 753 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 754 755 error = zfs_link_create(dl, zp, tx, ZNEW); 756 if (error != 0) { 757 /* 758 * Since, we failed to add the directory entry for it, 759 * delete the newly created dnode. 760 */ 761 zfs_znode_delete(zp, tx); 762 remove_inode_hash(ZTOI(zp)); 763 zfs_acl_ids_free(&acl_ids); 764 dmu_tx_commit(tx); 765 goto out; 766 } 767 768 if (fuid_dirtied) 769 zfs_fuid_sync(zfsvfs, tx); 770 771 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 772 if (flag & FIGNORECASE) 773 txtype |= TX_CI; 774 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 775 vsecp, acl_ids.z_fuidp, vap); 776 zfs_acl_ids_free(&acl_ids); 777 dmu_tx_commit(tx); 778 } else { 779 int aflags = (flag & O_APPEND) ? V_APPEND : 0; 780 781 if (have_acl) 782 zfs_acl_ids_free(&acl_ids); 783 784 /* 785 * A directory entry already exists for this name. 786 */ 787 /* 788 * Can't truncate an existing file if in exclusive mode. 789 */ 790 if (excl) { 791 error = SET_ERROR(EEXIST); 792 goto out; 793 } 794 /* 795 * Can't open a directory for writing. 796 */ 797 if (S_ISDIR(ZTOI(zp)->i_mode)) { 798 error = SET_ERROR(EISDIR); 799 goto out; 800 } 801 /* 802 * Verify requested access to file. 803 */ 804 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, 805 mnt_ns))) { 806 goto out; 807 } 808 809 mutex_enter(&dzp->z_lock); 810 dzp->z_seq++; 811 mutex_exit(&dzp->z_lock); 812 813 /* 814 * Truncate regular files if requested. 815 */ 816 if (S_ISREG(ZTOI(zp)->i_mode) && 817 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { 818 /* we can't hold any locks when calling zfs_freesp() */ 819 if (dl) { 820 zfs_dirent_unlock(dl); 821 dl = NULL; 822 } 823 error = zfs_freesp(zp, 0, 0, mode, TRUE); 824 } 825 } 826 out: 827 828 if (dl) 829 zfs_dirent_unlock(dl); 830 831 if (error) { 832 if (zp) 833 zrele(zp); 834 } else { 835 zfs_znode_update_vfs(dzp); 836 zfs_znode_update_vfs(zp); 837 *zpp = zp; 838 } 839 840 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 841 zil_commit(zilog, 0); 842 843 zfs_exit(zfsvfs, FTAG); 844 return (error); 845 } 846 847 int 848 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, 849 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, 850 zidmap_t *mnt_ns) 851 { 852 (void) excl, (void) mode, (void) flag; 853 znode_t *zp = NULL, *dzp = ITOZ(dip); 854 zfsvfs_t *zfsvfs = ITOZSB(dip); 855 objset_t *os; 856 dmu_tx_t *tx; 857 int error; 858 uid_t uid; 859 gid_t gid; 860 zfs_acl_ids_t acl_ids; 861 uint64_t projid = ZFS_DEFAULT_PROJID; 862 boolean_t fuid_dirtied; 863 boolean_t have_acl = B_FALSE; 864 boolean_t waited = B_FALSE; 865 866 /* 867 * If we have an ephemeral id, ACL, or XVATTR then 868 * make sure file system is at proper version 869 */ 870 871 gid = crgetgid(cr); 872 uid = crgetuid(cr); 873 874 if (zfsvfs->z_use_fuids == B_FALSE && 875 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 876 return (SET_ERROR(EINVAL)); 877 878 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 879 return (error); 880 os = zfsvfs->z_os; 881 882 if (vap->va_mask & ATTR_XVATTR) { 883 if ((error = secpolicy_xvattr((xvattr_t *)vap, 884 crgetuid(cr), cr, vap->va_mode)) != 0) { 885 zfs_exit(zfsvfs, FTAG); 886 return (error); 887 } 888 } 889 890 top: 891 *ipp = NULL; 892 893 /* 894 * Create a new file object and update the directory 895 * to reference it. 896 */ 897 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 898 if (have_acl) 899 zfs_acl_ids_free(&acl_ids); 900 goto out; 901 } 902 903 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 904 cr, vsecp, &acl_ids, mnt_ns)) != 0) 905 goto out; 906 have_acl = B_TRUE; 907 908 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 909 projid = zfs_inherit_projid(dzp); 910 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 911 zfs_acl_ids_free(&acl_ids); 912 error = SET_ERROR(EDQUOT); 913 goto out; 914 } 915 916 tx = dmu_tx_create(os); 917 918 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 919 ZFS_SA_BASE_ATTR_SIZE); 920 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 921 922 fuid_dirtied = zfsvfs->z_fuid_dirty; 923 if (fuid_dirtied) 924 zfs_fuid_txhold(zfsvfs, tx); 925 if (!zfsvfs->z_use_sa && 926 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 927 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 928 0, acl_ids.z_aclp->z_acl_bytes); 929 } 930 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 931 if (error) { 932 if (error == ERESTART) { 933 waited = B_TRUE; 934 dmu_tx_wait(tx); 935 dmu_tx_abort(tx); 936 goto top; 937 } 938 zfs_acl_ids_free(&acl_ids); 939 dmu_tx_abort(tx); 940 zfs_exit(zfsvfs, FTAG); 941 return (error); 942 } 943 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); 944 945 if (fuid_dirtied) 946 zfs_fuid_sync(zfsvfs, tx); 947 948 /* Add to unlinked set */ 949 zp->z_unlinked = B_TRUE; 950 zfs_unlinked_add(zp, tx); 951 zfs_acl_ids_free(&acl_ids); 952 dmu_tx_commit(tx); 953 out: 954 955 if (error) { 956 if (zp) 957 zrele(zp); 958 } else { 959 zfs_znode_update_vfs(dzp); 960 zfs_znode_update_vfs(zp); 961 *ipp = ZTOI(zp); 962 } 963 964 zfs_exit(zfsvfs, FTAG); 965 return (error); 966 } 967 968 /* 969 * Remove an entry from a directory. 970 * 971 * IN: dzp - znode of directory to remove entry from. 972 * name - name of entry to remove. 973 * cr - credentials of caller. 974 * flags - case flags. 975 * 976 * RETURN: 0 if success 977 * error code if failure 978 * 979 * Timestamps: 980 * dzp - ctime|mtime 981 * ip - ctime (if nlink > 0) 982 */ 983 984 static uint64_t null_xattr = 0; 985 986 int 987 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) 988 { 989 znode_t *zp; 990 znode_t *xzp; 991 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 992 zilog_t *zilog; 993 uint64_t acl_obj, xattr_obj; 994 uint64_t xattr_obj_unlinked = 0; 995 uint64_t obj = 0; 996 uint64_t links; 997 zfs_dirlock_t *dl; 998 dmu_tx_t *tx; 999 boolean_t may_delete_now, delete_now = FALSE; 1000 boolean_t unlinked, toobig = FALSE; 1001 uint64_t txtype; 1002 pathname_t *realnmp = NULL; 1003 pathname_t realnm; 1004 int error; 1005 int zflg = ZEXISTS; 1006 boolean_t waited = B_FALSE; 1007 1008 if (name == NULL) 1009 return (SET_ERROR(EINVAL)); 1010 1011 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1012 return (error); 1013 zilog = zfsvfs->z_log; 1014 1015 if (flags & FIGNORECASE) { 1016 zflg |= ZCILOOK; 1017 pn_alloc(&realnm); 1018 realnmp = &realnm; 1019 } 1020 1021 top: 1022 xattr_obj = 0; 1023 xzp = NULL; 1024 /* 1025 * Attempt to lock directory; fail if entry doesn't exist. 1026 */ 1027 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1028 NULL, realnmp))) { 1029 if (realnmp) 1030 pn_free(realnmp); 1031 zfs_exit(zfsvfs, FTAG); 1032 return (error); 1033 } 1034 1035 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 1036 goto out; 1037 } 1038 1039 /* 1040 * Need to use rmdir for removing directories. 1041 */ 1042 if (S_ISDIR(ZTOI(zp)->i_mode)) { 1043 error = SET_ERROR(EPERM); 1044 goto out; 1045 } 1046 1047 mutex_enter(&zp->z_lock); 1048 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && 1049 !zn_has_cached_data(zp, 0, LLONG_MAX); 1050 mutex_exit(&zp->z_lock); 1051 1052 /* 1053 * We may delete the znode now, or we may put it in the unlinked set; 1054 * it depends on whether we're the last link, and on whether there are 1055 * other holds on the inode. So we dmu_tx_hold() the right things to 1056 * allow for either case. 1057 */ 1058 obj = zp->z_id; 1059 tx = dmu_tx_create(zfsvfs->z_os); 1060 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1061 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1062 zfs_sa_upgrade_txholds(tx, zp); 1063 zfs_sa_upgrade_txholds(tx, dzp); 1064 if (may_delete_now) { 1065 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; 1066 /* if the file is too big, only hold_free a token amount */ 1067 dmu_tx_hold_free(tx, zp->z_id, 0, 1068 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1069 } 1070 1071 /* are there any extended attributes? */ 1072 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1073 &xattr_obj, sizeof (xattr_obj)); 1074 if (error == 0 && xattr_obj) { 1075 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1076 ASSERT0(error); 1077 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1078 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1079 } 1080 1081 mutex_enter(&zp->z_lock); 1082 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1083 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1084 mutex_exit(&zp->z_lock); 1085 1086 /* charge as an update -- would be nice not to charge at all */ 1087 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1088 1089 /* 1090 * Mark this transaction as typically resulting in a net free of space 1091 */ 1092 dmu_tx_mark_netfree(tx); 1093 1094 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1095 if (error) { 1096 zfs_dirent_unlock(dl); 1097 if (error == ERESTART) { 1098 waited = B_TRUE; 1099 dmu_tx_wait(tx); 1100 dmu_tx_abort(tx); 1101 zrele(zp); 1102 if (xzp) 1103 zrele(xzp); 1104 goto top; 1105 } 1106 if (realnmp) 1107 pn_free(realnmp); 1108 dmu_tx_abort(tx); 1109 zrele(zp); 1110 if (xzp) 1111 zrele(xzp); 1112 zfs_exit(zfsvfs, FTAG); 1113 return (error); 1114 } 1115 1116 /* 1117 * Remove the directory entry. 1118 */ 1119 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1120 1121 if (error) { 1122 dmu_tx_commit(tx); 1123 goto out; 1124 } 1125 1126 if (unlinked) { 1127 /* 1128 * Hold z_lock so that we can make sure that the ACL obj 1129 * hasn't changed. Could have been deleted due to 1130 * zfs_sa_upgrade(). 1131 */ 1132 mutex_enter(&zp->z_lock); 1133 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1134 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1135 delete_now = may_delete_now && !toobig && 1136 atomic_read(&ZTOI(zp)->i_count) == 1 && 1137 !zn_has_cached_data(zp, 0, LLONG_MAX) && 1138 xattr_obj == xattr_obj_unlinked && 1139 zfs_external_acl(zp) == acl_obj; 1140 VERIFY_IMPLY(xattr_obj_unlinked, xzp); 1141 } 1142 1143 if (delete_now) { 1144 if (xattr_obj_unlinked) { 1145 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); 1146 mutex_enter(&xzp->z_lock); 1147 xzp->z_unlinked = B_TRUE; 1148 clear_nlink(ZTOI(xzp)); 1149 links = 0; 1150 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1151 &links, sizeof (links), tx); 1152 ASSERT3U(error, ==, 0); 1153 mutex_exit(&xzp->z_lock); 1154 zfs_unlinked_add(xzp, tx); 1155 1156 if (zp->z_is_sa) 1157 error = sa_remove(zp->z_sa_hdl, 1158 SA_ZPL_XATTR(zfsvfs), tx); 1159 else 1160 error = sa_update(zp->z_sa_hdl, 1161 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1162 sizeof (uint64_t), tx); 1163 ASSERT0(error); 1164 } 1165 /* 1166 * Add to the unlinked set because a new reference could be 1167 * taken concurrently resulting in a deferred destruction. 1168 */ 1169 zfs_unlinked_add(zp, tx); 1170 mutex_exit(&zp->z_lock); 1171 } else if (unlinked) { 1172 mutex_exit(&zp->z_lock); 1173 zfs_unlinked_add(zp, tx); 1174 } 1175 1176 txtype = TX_REMOVE; 1177 if (flags & FIGNORECASE) 1178 txtype |= TX_CI; 1179 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); 1180 1181 dmu_tx_commit(tx); 1182 out: 1183 if (realnmp) 1184 pn_free(realnmp); 1185 1186 zfs_dirent_unlock(dl); 1187 zfs_znode_update_vfs(dzp); 1188 zfs_znode_update_vfs(zp); 1189 1190 if (delete_now) 1191 zrele(zp); 1192 else 1193 zfs_zrele_async(zp); 1194 1195 if (xzp) { 1196 zfs_znode_update_vfs(xzp); 1197 zfs_zrele_async(xzp); 1198 } 1199 1200 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1201 zil_commit(zilog, 0); 1202 1203 zfs_exit(zfsvfs, FTAG); 1204 return (error); 1205 } 1206 1207 /* 1208 * Create a new directory and insert it into dzp using the name 1209 * provided. Return a pointer to the inserted directory. 1210 * 1211 * IN: dzp - znode of directory to add subdir to. 1212 * dirname - name of new directory. 1213 * vap - attributes of new directory. 1214 * cr - credentials of caller. 1215 * flags - case flags. 1216 * vsecp - ACL to be set 1217 * mnt_ns - user namespace of the mount 1218 * 1219 * OUT: zpp - znode of created directory. 1220 * 1221 * RETURN: 0 if success 1222 * error code if failure 1223 * 1224 * Timestamps: 1225 * dzp - ctime|mtime updated 1226 * zpp - ctime|mtime|atime updated 1227 */ 1228 int 1229 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, 1230 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) 1231 { 1232 znode_t *zp; 1233 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1234 zilog_t *zilog; 1235 zfs_dirlock_t *dl; 1236 uint64_t txtype; 1237 dmu_tx_t *tx; 1238 int error; 1239 int zf = ZNEW; 1240 uid_t uid; 1241 gid_t gid = crgetgid(cr); 1242 zfs_acl_ids_t acl_ids; 1243 boolean_t fuid_dirtied; 1244 boolean_t waited = B_FALSE; 1245 1246 ASSERT(S_ISDIR(vap->va_mode)); 1247 1248 /* 1249 * If we have an ephemeral id, ACL, or XVATTR then 1250 * make sure file system is at proper version 1251 */ 1252 1253 uid = crgetuid(cr); 1254 if (zfsvfs->z_use_fuids == B_FALSE && 1255 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1256 return (SET_ERROR(EINVAL)); 1257 1258 if (dirname == NULL) 1259 return (SET_ERROR(EINVAL)); 1260 1261 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1262 return (error); 1263 zilog = zfsvfs->z_log; 1264 1265 if (dzp->z_pflags & ZFS_XATTR) { 1266 zfs_exit(zfsvfs, FTAG); 1267 return (SET_ERROR(EINVAL)); 1268 } 1269 1270 if (zfsvfs->z_utf8 && u8_validate(dirname, 1271 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1272 zfs_exit(zfsvfs, FTAG); 1273 return (SET_ERROR(EILSEQ)); 1274 } 1275 if (flags & FIGNORECASE) 1276 zf |= ZCILOOK; 1277 1278 if (vap->va_mask & ATTR_XVATTR) { 1279 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1280 crgetuid(cr), cr, vap->va_mode)) != 0) { 1281 zfs_exit(zfsvfs, FTAG); 1282 return (error); 1283 } 1284 } 1285 1286 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1287 vsecp, &acl_ids, mnt_ns)) != 0) { 1288 zfs_exit(zfsvfs, FTAG); 1289 return (error); 1290 } 1291 /* 1292 * First make sure the new directory doesn't exist. 1293 * 1294 * Existence is checked first to make sure we don't return 1295 * EACCES instead of EEXIST which can cause some applications 1296 * to fail. 1297 */ 1298 top: 1299 *zpp = NULL; 1300 1301 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1302 NULL, NULL))) { 1303 zfs_acl_ids_free(&acl_ids); 1304 zfs_exit(zfsvfs, FTAG); 1305 return (error); 1306 } 1307 1308 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, 1309 mnt_ns))) { 1310 zfs_acl_ids_free(&acl_ids); 1311 zfs_dirent_unlock(dl); 1312 zfs_exit(zfsvfs, FTAG); 1313 return (error); 1314 } 1315 1316 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 1317 zfs_acl_ids_free(&acl_ids); 1318 zfs_dirent_unlock(dl); 1319 zfs_exit(zfsvfs, FTAG); 1320 return (SET_ERROR(EDQUOT)); 1321 } 1322 1323 /* 1324 * Add a new entry to the directory. 1325 */ 1326 tx = dmu_tx_create(zfsvfs->z_os); 1327 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1328 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1329 fuid_dirtied = zfsvfs->z_fuid_dirty; 1330 if (fuid_dirtied) 1331 zfs_fuid_txhold(zfsvfs, tx); 1332 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1333 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1334 acl_ids.z_aclp->z_acl_bytes); 1335 } 1336 1337 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1338 ZFS_SA_BASE_ATTR_SIZE); 1339 1340 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1341 if (error) { 1342 zfs_dirent_unlock(dl); 1343 if (error == ERESTART) { 1344 waited = B_TRUE; 1345 dmu_tx_wait(tx); 1346 dmu_tx_abort(tx); 1347 goto top; 1348 } 1349 zfs_acl_ids_free(&acl_ids); 1350 dmu_tx_abort(tx); 1351 zfs_exit(zfsvfs, FTAG); 1352 return (error); 1353 } 1354 1355 /* 1356 * Create new node. 1357 */ 1358 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1359 1360 /* 1361 * Now put new name in parent dir. 1362 */ 1363 error = zfs_link_create(dl, zp, tx, ZNEW); 1364 if (error != 0) { 1365 zfs_znode_delete(zp, tx); 1366 remove_inode_hash(ZTOI(zp)); 1367 goto out; 1368 } 1369 1370 if (fuid_dirtied) 1371 zfs_fuid_sync(zfsvfs, tx); 1372 1373 *zpp = zp; 1374 1375 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1376 if (flags & FIGNORECASE) 1377 txtype |= TX_CI; 1378 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1379 acl_ids.z_fuidp, vap); 1380 1381 out: 1382 zfs_acl_ids_free(&acl_ids); 1383 1384 dmu_tx_commit(tx); 1385 1386 zfs_dirent_unlock(dl); 1387 1388 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1389 zil_commit(zilog, 0); 1390 1391 if (error != 0) { 1392 zrele(zp); 1393 } else { 1394 zfs_znode_update_vfs(dzp); 1395 zfs_znode_update_vfs(zp); 1396 } 1397 zfs_exit(zfsvfs, FTAG); 1398 return (error); 1399 } 1400 1401 /* 1402 * Remove a directory subdir entry. If the current working 1403 * directory is the same as the subdir to be removed, the 1404 * remove will fail. 1405 * 1406 * IN: dzp - znode of directory to remove from. 1407 * name - name of directory to be removed. 1408 * cwd - inode of current working directory. 1409 * cr - credentials of caller. 1410 * flags - case flags 1411 * 1412 * RETURN: 0 on success, error code on failure. 1413 * 1414 * Timestamps: 1415 * dzp - ctime|mtime updated 1416 */ 1417 int 1418 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, 1419 int flags) 1420 { 1421 znode_t *zp; 1422 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1423 zilog_t *zilog; 1424 zfs_dirlock_t *dl; 1425 dmu_tx_t *tx; 1426 int error; 1427 int zflg = ZEXISTS; 1428 boolean_t waited = B_FALSE; 1429 1430 if (name == NULL) 1431 return (SET_ERROR(EINVAL)); 1432 1433 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1434 return (error); 1435 zilog = zfsvfs->z_log; 1436 1437 if (flags & FIGNORECASE) 1438 zflg |= ZCILOOK; 1439 top: 1440 zp = NULL; 1441 1442 /* 1443 * Attempt to lock directory; fail if entry doesn't exist. 1444 */ 1445 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1446 NULL, NULL))) { 1447 zfs_exit(zfsvfs, FTAG); 1448 return (error); 1449 } 1450 1451 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 1452 goto out; 1453 } 1454 1455 if (!S_ISDIR(ZTOI(zp)->i_mode)) { 1456 error = SET_ERROR(ENOTDIR); 1457 goto out; 1458 } 1459 1460 if (zp == cwd) { 1461 error = SET_ERROR(EINVAL); 1462 goto out; 1463 } 1464 1465 /* 1466 * Grab a lock on the directory to make sure that no one is 1467 * trying to add (or lookup) entries while we are removing it. 1468 */ 1469 rw_enter(&zp->z_name_lock, RW_WRITER); 1470 1471 /* 1472 * Grab a lock on the parent pointer to make sure we play well 1473 * with the treewalk and directory rename code. 1474 */ 1475 rw_enter(&zp->z_parent_lock, RW_WRITER); 1476 1477 tx = dmu_tx_create(zfsvfs->z_os); 1478 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1479 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1480 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1481 zfs_sa_upgrade_txholds(tx, zp); 1482 zfs_sa_upgrade_txholds(tx, dzp); 1483 dmu_tx_mark_netfree(tx); 1484 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1485 if (error) { 1486 rw_exit(&zp->z_parent_lock); 1487 rw_exit(&zp->z_name_lock); 1488 zfs_dirent_unlock(dl); 1489 if (error == ERESTART) { 1490 waited = B_TRUE; 1491 dmu_tx_wait(tx); 1492 dmu_tx_abort(tx); 1493 zrele(zp); 1494 goto top; 1495 } 1496 dmu_tx_abort(tx); 1497 zrele(zp); 1498 zfs_exit(zfsvfs, FTAG); 1499 return (error); 1500 } 1501 1502 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1503 1504 if (error == 0) { 1505 uint64_t txtype = TX_RMDIR; 1506 if (flags & FIGNORECASE) 1507 txtype |= TX_CI; 1508 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 1509 B_FALSE); 1510 } 1511 1512 dmu_tx_commit(tx); 1513 1514 rw_exit(&zp->z_parent_lock); 1515 rw_exit(&zp->z_name_lock); 1516 out: 1517 zfs_dirent_unlock(dl); 1518 1519 zfs_znode_update_vfs(dzp); 1520 zfs_znode_update_vfs(zp); 1521 zrele(zp); 1522 1523 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1524 zil_commit(zilog, 0); 1525 1526 zfs_exit(zfsvfs, FTAG); 1527 return (error); 1528 } 1529 1530 /* 1531 * Read directory entries from the given directory cursor position and emit 1532 * name and position for each entry. 1533 * 1534 * IN: ip - inode of directory to read. 1535 * ctx - directory entry context. 1536 * cr - credentials of caller. 1537 * 1538 * RETURN: 0 if success 1539 * error code if failure 1540 * 1541 * Timestamps: 1542 * ip - atime updated 1543 * 1544 * Note that the low 4 bits of the cookie returned by zap is always zero. 1545 * This allows us to use the low range for "special" directory entries: 1546 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1547 * we use the offset 2 for the '.zfs' directory. 1548 */ 1549 int 1550 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) 1551 { 1552 (void) cr; 1553 znode_t *zp = ITOZ(ip); 1554 zfsvfs_t *zfsvfs = ITOZSB(ip); 1555 objset_t *os; 1556 zap_cursor_t zc; 1557 zap_attribute_t *zap; 1558 int error; 1559 uint8_t prefetch; 1560 uint8_t type; 1561 int done = 0; 1562 uint64_t parent; 1563 uint64_t offset; /* must be unsigned; checks for < 1 */ 1564 1565 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1566 return (error); 1567 1568 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1569 &parent, sizeof (parent))) != 0) 1570 goto out; 1571 1572 /* 1573 * Quit if directory has been removed (posix) 1574 */ 1575 if (zp->z_unlinked) 1576 goto out; 1577 1578 error = 0; 1579 os = zfsvfs->z_os; 1580 offset = ctx->pos; 1581 prefetch = zp->z_zn_prefetch; 1582 zap = zap_attribute_long_alloc(); 1583 1584 /* 1585 * Initialize the iterator cursor. 1586 */ 1587 if (offset <= 3) { 1588 /* 1589 * Start iteration from the beginning of the directory. 1590 */ 1591 zap_cursor_init(&zc, os, zp->z_id); 1592 } else { 1593 /* 1594 * The offset is a serialized cursor. 1595 */ 1596 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1597 } 1598 1599 /* 1600 * Transform to file-system independent format 1601 */ 1602 while (!done) { 1603 uint64_t objnum; 1604 /* 1605 * Special case `.', `..', and `.zfs'. 1606 */ 1607 if (offset == 0) { 1608 (void) strcpy(zap->za_name, "."); 1609 zap->za_normalization_conflict = 0; 1610 objnum = zp->z_id; 1611 type = DT_DIR; 1612 } else if (offset == 1) { 1613 (void) strcpy(zap->za_name, ".."); 1614 zap->za_normalization_conflict = 0; 1615 objnum = parent; 1616 type = DT_DIR; 1617 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1618 (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME); 1619 zap->za_normalization_conflict = 0; 1620 objnum = ZFSCTL_INO_ROOT; 1621 type = DT_DIR; 1622 } else { 1623 /* 1624 * Grab next entry. 1625 */ 1626 if ((error = zap_cursor_retrieve(&zc, zap))) { 1627 if (error == ENOENT) 1628 break; 1629 else 1630 goto update; 1631 } 1632 1633 /* 1634 * Allow multiple entries provided the first entry is 1635 * the object id. Non-zpl consumers may safely make 1636 * use of the additional space. 1637 * 1638 * XXX: This should be a feature flag for compatibility 1639 */ 1640 if (zap->za_integer_length != 8 || 1641 zap->za_num_integers == 0) { 1642 cmn_err(CE_WARN, "zap_readdir: bad directory " 1643 "entry, obj = %lld, offset = %lld, " 1644 "length = %d, num = %lld\n", 1645 (u_longlong_t)zp->z_id, 1646 (u_longlong_t)offset, 1647 zap->za_integer_length, 1648 (u_longlong_t)zap->za_num_integers); 1649 error = SET_ERROR(ENXIO); 1650 goto update; 1651 } 1652 1653 objnum = ZFS_DIRENT_OBJ(zap->za_first_integer); 1654 type = ZFS_DIRENT_TYPE(zap->za_first_integer); 1655 } 1656 1657 done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name), 1658 objnum, type); 1659 if (done) 1660 break; 1661 1662 if (prefetch) 1663 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); 1664 1665 /* 1666 * Move to the next entry, fill in the previous offset. 1667 */ 1668 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1669 zap_cursor_advance(&zc); 1670 offset = zap_cursor_serialize(&zc); 1671 } else { 1672 offset += 1; 1673 } 1674 ctx->pos = offset; 1675 } 1676 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1677 1678 update: 1679 zap_cursor_fini(&zc); 1680 zap_attribute_free(zap); 1681 if (error == ENOENT) 1682 error = 0; 1683 out: 1684 zfs_exit(zfsvfs, FTAG); 1685 1686 return (error); 1687 } 1688 1689 /* 1690 * Get the basic file attributes and place them in the provided kstat 1691 * structure. The inode is assumed to be the authoritative source 1692 * for most of the attributes. However, the znode currently has the 1693 * authoritative atime, blksize, and block count. 1694 * 1695 * IN: ip - inode of file. 1696 * 1697 * OUT: sp - kstat values. 1698 * 1699 * RETURN: 0 (always succeeds) 1700 */ 1701 int 1702 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1703 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, 1704 struct kstat *sp) 1705 #else 1706 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) 1707 #endif 1708 { 1709 znode_t *zp = ITOZ(ip); 1710 zfsvfs_t *zfsvfs = ITOZSB(ip); 1711 uint32_t blksize; 1712 u_longlong_t nblocks; 1713 int error; 1714 1715 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1716 return (error); 1717 1718 mutex_enter(&zp->z_lock); 1719 1720 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1721 zpl_generic_fillattr(user_ns, request_mask, ip, sp); 1722 #else 1723 zpl_generic_fillattr(user_ns, ip, sp); 1724 #endif 1725 /* 1726 * +1 link count for root inode with visible '.zfs' directory. 1727 */ 1728 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1729 if (sp->nlink < ZFS_LINK_MAX) 1730 sp->nlink++; 1731 1732 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1733 sp->blksize = blksize; 1734 sp->blocks = nblocks; 1735 1736 if (unlikely(zp->z_blksz == 0)) { 1737 /* 1738 * Block size hasn't been set; suggest maximal I/O transfers. 1739 */ 1740 sp->blksize = zfsvfs->z_max_blksz; 1741 } 1742 1743 mutex_exit(&zp->z_lock); 1744 1745 /* 1746 * Required to prevent NFS client from detecting different inode 1747 * numbers of snapshot root dentry before and after snapshot mount. 1748 */ 1749 if (zfsvfs->z_issnap) { 1750 if (ip->i_sb->s_root->d_inode == ip) 1751 sp->ino = ZFSCTL_INO_SNAPDIRS - 1752 dmu_objset_id(zfsvfs->z_os); 1753 } 1754 1755 zfs_exit(zfsvfs, FTAG); 1756 1757 return (0); 1758 } 1759 1760 /* 1761 * For the operation of changing file's user/group/project, we need to 1762 * handle not only the main object that is assigned to the file directly, 1763 * but also the ones that are used by the file via hidden xattr directory. 1764 * 1765 * Because the xattr directory may contains many EA entries, as to it may 1766 * be impossible to change all of them via the transaction of changing the 1767 * main object's user/group/project attributes. Then we have to change them 1768 * via other multiple independent transactions one by one. It may be not good 1769 * solution, but we have no better idea yet. 1770 */ 1771 static int 1772 zfs_setattr_dir(znode_t *dzp) 1773 { 1774 struct inode *dxip = ZTOI(dzp); 1775 struct inode *xip = NULL; 1776 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1777 objset_t *os = zfsvfs->z_os; 1778 zap_cursor_t zc; 1779 zap_attribute_t *zap; 1780 zfs_dirlock_t *dl; 1781 znode_t *zp = NULL; 1782 dmu_tx_t *tx = NULL; 1783 uint64_t uid, gid; 1784 sa_bulk_attr_t bulk[4]; 1785 int count; 1786 int err; 1787 1788 zap = zap_attribute_alloc(); 1789 zap_cursor_init(&zc, os, dzp->z_id); 1790 while ((err = zap_cursor_retrieve(&zc, zap)) == 0) { 1791 count = 0; 1792 if (zap->za_integer_length != 8 || zap->za_num_integers != 1) { 1793 err = ENXIO; 1794 break; 1795 } 1796 1797 err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp, 1798 ZEXISTS, NULL, NULL); 1799 if (err == ENOENT) 1800 goto next; 1801 if (err) 1802 break; 1803 1804 xip = ZTOI(zp); 1805 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && 1806 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && 1807 zp->z_projid == dzp->z_projid) 1808 goto next; 1809 1810 tx = dmu_tx_create(os); 1811 if (!(zp->z_pflags & ZFS_PROJID)) 1812 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1813 else 1814 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1815 1816 err = dmu_tx_assign(tx, TXG_WAIT); 1817 if (err) 1818 break; 1819 1820 mutex_enter(&dzp->z_lock); 1821 1822 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { 1823 xip->i_uid = dxip->i_uid; 1824 uid = zfs_uid_read(dxip); 1825 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1826 &uid, sizeof (uid)); 1827 } 1828 1829 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { 1830 xip->i_gid = dxip->i_gid; 1831 gid = zfs_gid_read(dxip); 1832 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1833 &gid, sizeof (gid)); 1834 } 1835 1836 1837 uint64_t projid = dzp->z_projid; 1838 if (zp->z_projid != projid) { 1839 if (!(zp->z_pflags & ZFS_PROJID)) { 1840 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 1841 if (unlikely(err == EEXIST)) { 1842 err = 0; 1843 } else if (err != 0) { 1844 goto sa_add_projid_err; 1845 } else { 1846 projid = ZFS_INVALID_PROJID; 1847 } 1848 } 1849 1850 if (projid != ZFS_INVALID_PROJID) { 1851 zp->z_projid = projid; 1852 SA_ADD_BULK_ATTR(bulk, count, 1853 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 1854 sizeof (zp->z_projid)); 1855 } 1856 } 1857 1858 sa_add_projid_err: 1859 mutex_exit(&dzp->z_lock); 1860 1861 if (likely(count > 0)) { 1862 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1863 dmu_tx_commit(tx); 1864 } else if (projid == ZFS_INVALID_PROJID) { 1865 dmu_tx_commit(tx); 1866 } else { 1867 dmu_tx_abort(tx); 1868 } 1869 tx = NULL; 1870 if (err != 0 && err != ENOENT) 1871 break; 1872 1873 next: 1874 if (zp) { 1875 zrele(zp); 1876 zp = NULL; 1877 zfs_dirent_unlock(dl); 1878 } 1879 zap_cursor_advance(&zc); 1880 } 1881 1882 if (tx) 1883 dmu_tx_abort(tx); 1884 if (zp) { 1885 zrele(zp); 1886 zfs_dirent_unlock(dl); 1887 } 1888 zap_cursor_fini(&zc); 1889 zap_attribute_free(zap); 1890 1891 return (err == ENOENT ? 0 : err); 1892 } 1893 1894 /* 1895 * Set the file attributes to the values contained in the 1896 * vattr structure. 1897 * 1898 * IN: zp - znode of file to be modified. 1899 * vap - new attribute values. 1900 * If ATTR_XVATTR set, then optional attrs are being set 1901 * flags - ATTR_UTIME set if non-default time values provided. 1902 * - ATTR_NOACLCHECK (CIFS context only). 1903 * cr - credentials of caller. 1904 * mnt_ns - user namespace of the mount 1905 * 1906 * RETURN: 0 if success 1907 * error code if failure 1908 * 1909 * Timestamps: 1910 * ip - ctime updated, mtime updated if size changed. 1911 */ 1912 int 1913 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) 1914 { 1915 struct inode *ip; 1916 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1917 objset_t *os; 1918 zilog_t *zilog; 1919 dmu_tx_t *tx; 1920 vattr_t oldva; 1921 xvattr_t *tmpxvattr; 1922 uint_t mask = vap->va_mask; 1923 uint_t saved_mask = 0; 1924 int trim_mask = 0; 1925 uint64_t new_mode; 1926 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1927 uint64_t xattr_obj; 1928 uint64_t mtime[2], ctime[2], atime[2]; 1929 uint64_t projid = ZFS_INVALID_PROJID; 1930 znode_t *attrzp; 1931 int need_policy = FALSE; 1932 int err, err2 = 0; 1933 zfs_fuid_info_t *fuidp = NULL; 1934 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1935 xoptattr_t *xoap; 1936 zfs_acl_t *aclp; 1937 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1938 boolean_t fuid_dirtied = B_FALSE; 1939 boolean_t handle_eadir = B_FALSE; 1940 sa_bulk_attr_t *bulk, *xattr_bulk; 1941 int count = 0, xattr_count = 0, bulks = 8; 1942 1943 if (mask == 0) 1944 return (0); 1945 1946 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1947 return (err); 1948 ip = ZTOI(zp); 1949 os = zfsvfs->z_os; 1950 1951 /* 1952 * If this is a xvattr_t, then get a pointer to the structure of 1953 * optional attributes. If this is NULL, then we have a vattr_t. 1954 */ 1955 xoap = xva_getxoptattr(xvap); 1956 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1957 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1958 if (!dmu_objset_projectquota_enabled(os) || 1959 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1960 zfs_exit(zfsvfs, FTAG); 1961 return (SET_ERROR(ENOTSUP)); 1962 } 1963 1964 projid = xoap->xoa_projid; 1965 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1966 zfs_exit(zfsvfs, FTAG); 1967 return (SET_ERROR(EINVAL)); 1968 } 1969 1970 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1971 projid = ZFS_INVALID_PROJID; 1972 else 1973 need_policy = TRUE; 1974 } 1975 1976 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1977 (xoap->xoa_projinherit != 1978 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1979 (!dmu_objset_projectquota_enabled(os) || 1980 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1981 zfs_exit(zfsvfs, FTAG); 1982 return (SET_ERROR(ENOTSUP)); 1983 } 1984 } 1985 1986 zilog = zfsvfs->z_log; 1987 1988 /* 1989 * Make sure that if we have ephemeral uid/gid or xvattr specified 1990 * that file system is at proper version level 1991 */ 1992 1993 if (zfsvfs->z_use_fuids == B_FALSE && 1994 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 1995 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 1996 (mask & ATTR_XVATTR))) { 1997 zfs_exit(zfsvfs, FTAG); 1998 return (SET_ERROR(EINVAL)); 1999 } 2000 2001 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 2002 zfs_exit(zfsvfs, FTAG); 2003 return (SET_ERROR(EISDIR)); 2004 } 2005 2006 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 2007 zfs_exit(zfsvfs, FTAG); 2008 return (SET_ERROR(EINVAL)); 2009 } 2010 2011 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 2012 xva_init(tmpxvattr); 2013 2014 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 2015 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 2016 2017 /* 2018 * Immutable files can only alter immutable bit and atime 2019 */ 2020 if ((zp->z_pflags & ZFS_IMMUTABLE) && 2021 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 2022 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2023 err = SET_ERROR(EPERM); 2024 goto out3; 2025 } 2026 2027 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 2028 err = SET_ERROR(EPERM); 2029 goto out3; 2030 } 2031 2032 /* 2033 * Verify timestamps doesn't overflow 32 bits. 2034 * ZFS can handle large timestamps, but 32bit syscalls can't 2035 * handle times greater than 2039. This check should be removed 2036 * once large timestamps are fully supported. 2037 */ 2038 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 2039 if (((mask & ATTR_ATIME) && 2040 TIMESPEC_OVERFLOW(&vap->va_atime)) || 2041 ((mask & ATTR_MTIME) && 2042 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2043 err = SET_ERROR(EOVERFLOW); 2044 goto out3; 2045 } 2046 } 2047 2048 top: 2049 attrzp = NULL; 2050 aclp = NULL; 2051 2052 /* Can this be moved to before the top label? */ 2053 if (zfs_is_readonly(zfsvfs)) { 2054 err = SET_ERROR(EROFS); 2055 goto out3; 2056 } 2057 2058 /* 2059 * First validate permissions 2060 */ 2061 2062 if (mask & ATTR_SIZE) { 2063 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 2064 mnt_ns); 2065 if (err) 2066 goto out3; 2067 2068 /* 2069 * XXX - Note, we are not providing any open 2070 * mode flags here (like FNDELAY), so we may 2071 * block if there are locks present... this 2072 * should be addressed in openat(). 2073 */ 2074 /* XXX - would it be OK to generate a log record here? */ 2075 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2076 if (err) 2077 goto out3; 2078 } 2079 2080 if (mask & (ATTR_ATIME|ATTR_MTIME) || 2081 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2082 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2083 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2084 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2085 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2086 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2087 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2088 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2089 skipaclchk, cr, mnt_ns); 2090 } 2091 2092 if (mask & (ATTR_UID|ATTR_GID)) { 2093 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2094 int take_owner; 2095 int take_group; 2096 uid_t uid; 2097 gid_t gid; 2098 2099 /* 2100 * NOTE: even if a new mode is being set, 2101 * we may clear S_ISUID/S_ISGID bits. 2102 */ 2103 2104 if (!(mask & ATTR_MODE)) 2105 vap->va_mode = zp->z_mode; 2106 2107 /* 2108 * Take ownership or chgrp to group we are a member of 2109 */ 2110 2111 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), 2112 vap->va_uid); 2113 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), 2114 vap->va_gid); 2115 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2116 take_group = (mask & ATTR_GID) && 2117 zfs_groupmember(zfsvfs, gid, cr); 2118 2119 /* 2120 * If both ATTR_UID and ATTR_GID are set then take_owner and 2121 * take_group must both be set in order to allow taking 2122 * ownership. 2123 * 2124 * Otherwise, send the check through secpolicy_vnode_setattr() 2125 * 2126 */ 2127 2128 if (((idmask == (ATTR_UID|ATTR_GID)) && 2129 take_owner && take_group) || 2130 ((idmask == ATTR_UID) && take_owner) || 2131 ((idmask == ATTR_GID) && take_group)) { 2132 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2133 skipaclchk, cr, mnt_ns) == 0) { 2134 /* 2135 * Remove setuid/setgid for non-privileged users 2136 */ 2137 (void) secpolicy_setid_clear(vap, cr); 2138 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2139 } else { 2140 need_policy = TRUE; 2141 } 2142 } else { 2143 need_policy = TRUE; 2144 } 2145 } 2146 2147 mutex_enter(&zp->z_lock); 2148 oldva.va_mode = zp->z_mode; 2149 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2150 if (mask & ATTR_XVATTR) { 2151 /* 2152 * Update xvattr mask to include only those attributes 2153 * that are actually changing. 2154 * 2155 * the bits will be restored prior to actually setting 2156 * the attributes so the caller thinks they were set. 2157 */ 2158 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2159 if (xoap->xoa_appendonly != 2160 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2161 need_policy = TRUE; 2162 } else { 2163 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2164 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2165 } 2166 } 2167 2168 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2169 if (xoap->xoa_projinherit != 2170 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2171 need_policy = TRUE; 2172 } else { 2173 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2174 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2175 } 2176 } 2177 2178 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2179 if (xoap->xoa_nounlink != 2180 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2181 need_policy = TRUE; 2182 } else { 2183 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2184 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2185 } 2186 } 2187 2188 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2189 if (xoap->xoa_immutable != 2190 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2191 need_policy = TRUE; 2192 } else { 2193 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2194 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2195 } 2196 } 2197 2198 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2199 if (xoap->xoa_nodump != 2200 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2201 need_policy = TRUE; 2202 } else { 2203 XVA_CLR_REQ(xvap, XAT_NODUMP); 2204 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2205 } 2206 } 2207 2208 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2209 if (xoap->xoa_av_modified != 2210 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2211 need_policy = TRUE; 2212 } else { 2213 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2214 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2215 } 2216 } 2217 2218 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2219 if ((!S_ISREG(ip->i_mode) && 2220 xoap->xoa_av_quarantined) || 2221 xoap->xoa_av_quarantined != 2222 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2223 need_policy = TRUE; 2224 } else { 2225 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2226 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2227 } 2228 } 2229 2230 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2231 mutex_exit(&zp->z_lock); 2232 err = SET_ERROR(EPERM); 2233 goto out3; 2234 } 2235 2236 if (need_policy == FALSE && 2237 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2238 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2239 need_policy = TRUE; 2240 } 2241 } 2242 2243 mutex_exit(&zp->z_lock); 2244 2245 if (mask & ATTR_MODE) { 2246 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2247 mnt_ns) == 0) { 2248 err = secpolicy_setid_setsticky_clear(ip, vap, 2249 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2250 if (err) 2251 goto out3; 2252 trim_mask |= ATTR_MODE; 2253 } else { 2254 need_policy = TRUE; 2255 } 2256 } 2257 2258 if (need_policy) { 2259 /* 2260 * If trim_mask is set then take ownership 2261 * has been granted or write_acl is present and user 2262 * has the ability to modify mode. In that case remove 2263 * UID|GID and or MODE from mask so that 2264 * secpolicy_vnode_setattr() doesn't revoke it. 2265 */ 2266 2267 if (trim_mask) { 2268 saved_mask = vap->va_mask; 2269 vap->va_mask &= ~trim_mask; 2270 } 2271 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2272 zfs_zaccess_unix, zp); 2273 if (err) 2274 goto out3; 2275 2276 if (trim_mask) 2277 vap->va_mask |= saved_mask; 2278 } 2279 2280 /* 2281 * secpolicy_vnode_setattr, or take ownership may have 2282 * changed va_mask 2283 */ 2284 mask = vap->va_mask; 2285 2286 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2287 handle_eadir = B_TRUE; 2288 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2289 &xattr_obj, sizeof (xattr_obj)); 2290 2291 if (err == 0 && xattr_obj) { 2292 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2293 if (err) 2294 goto out2; 2295 } 2296 if (mask & ATTR_UID) { 2297 new_kuid = zfs_fuid_create(zfsvfs, 2298 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2299 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2300 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2301 new_kuid)) { 2302 if (attrzp) 2303 zrele(attrzp); 2304 err = SET_ERROR(EDQUOT); 2305 goto out2; 2306 } 2307 } 2308 2309 if (mask & ATTR_GID) { 2310 new_kgid = zfs_fuid_create(zfsvfs, 2311 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2312 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2313 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2314 new_kgid)) { 2315 if (attrzp) 2316 zrele(attrzp); 2317 err = SET_ERROR(EDQUOT); 2318 goto out2; 2319 } 2320 } 2321 2322 if (projid != ZFS_INVALID_PROJID && 2323 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2324 if (attrzp) 2325 zrele(attrzp); 2326 err = EDQUOT; 2327 goto out2; 2328 } 2329 } 2330 tx = dmu_tx_create(os); 2331 2332 if (mask & ATTR_MODE) { 2333 uint64_t pmode = zp->z_mode; 2334 uint64_t acl_obj; 2335 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2336 2337 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2338 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2339 err = EPERM; 2340 goto out; 2341 } 2342 2343 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2344 goto out; 2345 2346 mutex_enter(&zp->z_lock); 2347 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2348 /* 2349 * Are we upgrading ACL from old V0 format 2350 * to V1 format? 2351 */ 2352 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2353 zfs_znode_acl_version(zp) == 2354 ZFS_ACL_VERSION_INITIAL) { 2355 dmu_tx_hold_free(tx, acl_obj, 0, 2356 DMU_OBJECT_END); 2357 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2358 0, aclp->z_acl_bytes); 2359 } else { 2360 dmu_tx_hold_write(tx, acl_obj, 0, 2361 aclp->z_acl_bytes); 2362 } 2363 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2364 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2365 0, aclp->z_acl_bytes); 2366 } 2367 mutex_exit(&zp->z_lock); 2368 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2369 } else { 2370 if (((mask & ATTR_XVATTR) && 2371 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2372 (projid != ZFS_INVALID_PROJID && 2373 !(zp->z_pflags & ZFS_PROJID))) 2374 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2375 else 2376 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2377 } 2378 2379 if (attrzp) { 2380 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2381 } 2382 2383 fuid_dirtied = zfsvfs->z_fuid_dirty; 2384 if (fuid_dirtied) 2385 zfs_fuid_txhold(zfsvfs, tx); 2386 2387 zfs_sa_upgrade_txholds(tx, zp); 2388 2389 err = dmu_tx_assign(tx, TXG_WAIT); 2390 if (err) 2391 goto out; 2392 2393 count = 0; 2394 /* 2395 * Set each attribute requested. 2396 * We group settings according to the locks they need to acquire. 2397 * 2398 * Note: you cannot set ctime directly, although it will be 2399 * updated as a side-effect of calling this function. 2400 */ 2401 2402 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2403 /* 2404 * For the existed object that is upgraded from old system, 2405 * its on-disk layout has no slot for the project ID attribute. 2406 * But quota accounting logic needs to access related slots by 2407 * offset directly. So we need to adjust old objects' layout 2408 * to make the project ID to some unified and fixed offset. 2409 */ 2410 if (attrzp) 2411 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2412 if (err == 0) 2413 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2414 2415 if (unlikely(err == EEXIST)) 2416 err = 0; 2417 else if (err != 0) 2418 goto out; 2419 else 2420 projid = ZFS_INVALID_PROJID; 2421 } 2422 2423 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2424 mutex_enter(&zp->z_acl_lock); 2425 mutex_enter(&zp->z_lock); 2426 2427 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2428 &zp->z_pflags, sizeof (zp->z_pflags)); 2429 2430 if (attrzp) { 2431 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2432 mutex_enter(&attrzp->z_acl_lock); 2433 mutex_enter(&attrzp->z_lock); 2434 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2435 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2436 sizeof (attrzp->z_pflags)); 2437 if (projid != ZFS_INVALID_PROJID) { 2438 attrzp->z_projid = projid; 2439 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2440 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2441 sizeof (attrzp->z_projid)); 2442 } 2443 } 2444 2445 if (mask & (ATTR_UID|ATTR_GID)) { 2446 2447 if (mask & ATTR_UID) { 2448 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2449 new_uid = zfs_uid_read(ZTOI(zp)); 2450 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2451 &new_uid, sizeof (new_uid)); 2452 if (attrzp) { 2453 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2454 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2455 sizeof (new_uid)); 2456 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2457 } 2458 } 2459 2460 if (mask & ATTR_GID) { 2461 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2462 new_gid = zfs_gid_read(ZTOI(zp)); 2463 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2464 NULL, &new_gid, sizeof (new_gid)); 2465 if (attrzp) { 2466 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2467 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2468 sizeof (new_gid)); 2469 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2470 } 2471 } 2472 if (!(mask & ATTR_MODE)) { 2473 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2474 NULL, &new_mode, sizeof (new_mode)); 2475 new_mode = zp->z_mode; 2476 } 2477 err = zfs_acl_chown_setattr(zp); 2478 ASSERT(err == 0); 2479 if (attrzp) { 2480 err = zfs_acl_chown_setattr(attrzp); 2481 ASSERT(err == 0); 2482 } 2483 } 2484 2485 if (mask & ATTR_MODE) { 2486 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2487 &new_mode, sizeof (new_mode)); 2488 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2489 ASSERT3P(aclp, !=, NULL); 2490 err = zfs_aclset_common(zp, aclp, cr, tx); 2491 ASSERT0(err); 2492 if (zp->z_acl_cached) 2493 zfs_acl_free(zp->z_acl_cached); 2494 zp->z_acl_cached = aclp; 2495 aclp = NULL; 2496 } 2497 2498 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2499 zp->z_atime_dirty = B_FALSE; 2500 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); 2501 ZFS_TIME_ENCODE(&tmp_atime, atime); 2502 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2503 &atime, sizeof (atime)); 2504 } 2505 2506 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2507 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2508 zpl_inode_set_mtime_to_ts(ZTOI(zp), 2509 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); 2510 2511 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2512 mtime, sizeof (mtime)); 2513 } 2514 2515 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2516 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2517 zpl_inode_set_ctime_to_ts(ZTOI(zp), 2518 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); 2519 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2520 ctime, sizeof (ctime)); 2521 } 2522 2523 if (projid != ZFS_INVALID_PROJID) { 2524 zp->z_projid = projid; 2525 SA_ADD_BULK_ATTR(bulk, count, 2526 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2527 sizeof (zp->z_projid)); 2528 } 2529 2530 if (attrzp && mask) { 2531 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2532 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2533 sizeof (ctime)); 2534 } 2535 2536 /* 2537 * Do this after setting timestamps to prevent timestamp 2538 * update from toggling bit 2539 */ 2540 2541 if (xoap && (mask & ATTR_XVATTR)) { 2542 2543 /* 2544 * restore trimmed off masks 2545 * so that return masks can be set for caller. 2546 */ 2547 2548 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2549 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2550 } 2551 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2552 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2553 } 2554 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2555 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2556 } 2557 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2558 XVA_SET_REQ(xvap, XAT_NODUMP); 2559 } 2560 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2561 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2562 } 2563 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2564 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2565 } 2566 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2567 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2568 } 2569 2570 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2571 ASSERT(S_ISREG(ip->i_mode)); 2572 2573 zfs_xvattr_set(zp, xvap, tx); 2574 } 2575 2576 if (fuid_dirtied) 2577 zfs_fuid_sync(zfsvfs, tx); 2578 2579 if (mask != 0) 2580 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2581 2582 mutex_exit(&zp->z_lock); 2583 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2584 mutex_exit(&zp->z_acl_lock); 2585 2586 if (attrzp) { 2587 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2588 mutex_exit(&attrzp->z_acl_lock); 2589 mutex_exit(&attrzp->z_lock); 2590 } 2591 out: 2592 if (err == 0 && xattr_count > 0) { 2593 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2594 xattr_count, tx); 2595 ASSERT(err2 == 0); 2596 } 2597 2598 if (aclp) 2599 zfs_acl_free(aclp); 2600 2601 if (fuidp) { 2602 zfs_fuid_info_free(fuidp); 2603 fuidp = NULL; 2604 } 2605 2606 if (err) { 2607 dmu_tx_abort(tx); 2608 if (attrzp) 2609 zrele(attrzp); 2610 if (err == ERESTART) 2611 goto top; 2612 } else { 2613 if (count > 0) 2614 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2615 dmu_tx_commit(tx); 2616 if (attrzp) { 2617 if (err2 == 0 && handle_eadir) 2618 err = zfs_setattr_dir(attrzp); 2619 zrele(attrzp); 2620 } 2621 zfs_znode_update_vfs(zp); 2622 } 2623 2624 out2: 2625 if (os->os_sync == ZFS_SYNC_ALWAYS) 2626 zil_commit(zilog, 0); 2627 2628 out3: 2629 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); 2630 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); 2631 kmem_free(tmpxvattr, sizeof (xvattr_t)); 2632 zfs_exit(zfsvfs, FTAG); 2633 return (err); 2634 } 2635 2636 typedef struct zfs_zlock { 2637 krwlock_t *zl_rwlock; /* lock we acquired */ 2638 znode_t *zl_znode; /* znode we held */ 2639 struct zfs_zlock *zl_next; /* next in list */ 2640 } zfs_zlock_t; 2641 2642 /* 2643 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2644 */ 2645 static void 2646 zfs_rename_unlock(zfs_zlock_t **zlpp) 2647 { 2648 zfs_zlock_t *zl; 2649 2650 while ((zl = *zlpp) != NULL) { 2651 if (zl->zl_znode != NULL) 2652 zfs_zrele_async(zl->zl_znode); 2653 rw_exit(zl->zl_rwlock); 2654 *zlpp = zl->zl_next; 2655 kmem_free(zl, sizeof (*zl)); 2656 } 2657 } 2658 2659 /* 2660 * Search back through the directory tree, using the ".." entries. 2661 * Lock each directory in the chain to prevent concurrent renames. 2662 * Fail any attempt to move a directory into one of its own descendants. 2663 * XXX - z_parent_lock can overlap with map or grow locks 2664 */ 2665 static int 2666 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2667 { 2668 zfs_zlock_t *zl; 2669 znode_t *zp = tdzp; 2670 uint64_t rootid = ZTOZSB(zp)->z_root; 2671 uint64_t oidp = zp->z_id; 2672 krwlock_t *rwlp = &szp->z_parent_lock; 2673 krw_t rw = RW_WRITER; 2674 2675 /* 2676 * First pass write-locks szp and compares to zp->z_id. 2677 * Later passes read-lock zp and compare to zp->z_parent. 2678 */ 2679 do { 2680 if (!rw_tryenter(rwlp, rw)) { 2681 /* 2682 * Another thread is renaming in this path. 2683 * Note that if we are a WRITER, we don't have any 2684 * parent_locks held yet. 2685 */ 2686 if (rw == RW_READER && zp->z_id > szp->z_id) { 2687 /* 2688 * Drop our locks and restart 2689 */ 2690 zfs_rename_unlock(&zl); 2691 *zlpp = NULL; 2692 zp = tdzp; 2693 oidp = zp->z_id; 2694 rwlp = &szp->z_parent_lock; 2695 rw = RW_WRITER; 2696 continue; 2697 } else { 2698 /* 2699 * Wait for other thread to drop its locks 2700 */ 2701 rw_enter(rwlp, rw); 2702 } 2703 } 2704 2705 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2706 zl->zl_rwlock = rwlp; 2707 zl->zl_znode = NULL; 2708 zl->zl_next = *zlpp; 2709 *zlpp = zl; 2710 2711 if (oidp == szp->z_id) /* We're a descendant of szp */ 2712 return (SET_ERROR(EINVAL)); 2713 2714 if (oidp == rootid) /* We've hit the top */ 2715 return (0); 2716 2717 if (rw == RW_READER) { /* i.e. not the first pass */ 2718 int error = zfs_zget(ZTOZSB(zp), oidp, &zp); 2719 if (error) 2720 return (error); 2721 zl->zl_znode = zp; 2722 } 2723 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), 2724 &oidp, sizeof (oidp)); 2725 rwlp = &zp->z_parent_lock; 2726 rw = RW_READER; 2727 2728 } while (zp->z_id != sdzp->z_id); 2729 2730 return (0); 2731 } 2732 2733 /* 2734 * Move an entry from the provided source directory to the target 2735 * directory. Change the entry name as indicated. 2736 * 2737 * IN: sdzp - Source directory containing the "old entry". 2738 * snm - Old entry name. 2739 * tdzp - Target directory to contain the "new entry". 2740 * tnm - New entry name. 2741 * cr - credentials of caller. 2742 * flags - case flags 2743 * rflags - RENAME_* flags 2744 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). 2745 * mnt_ns - user namespace of the mount 2746 * 2747 * RETURN: 0 on success, error code on failure. 2748 * 2749 * Timestamps: 2750 * sdzp,tdzp - ctime|mtime updated 2751 */ 2752 int 2753 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2754 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) 2755 { 2756 znode_t *szp, *tzp; 2757 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2758 zilog_t *zilog; 2759 zfs_dirlock_t *sdl, *tdl; 2760 dmu_tx_t *tx; 2761 zfs_zlock_t *zl; 2762 int cmp, serr, terr; 2763 int error = 0; 2764 int zflg = 0; 2765 boolean_t waited = B_FALSE; 2766 /* Needed for whiteout inode creation. */ 2767 boolean_t fuid_dirtied; 2768 zfs_acl_ids_t acl_ids; 2769 boolean_t have_acl = B_FALSE; 2770 znode_t *wzp = NULL; 2771 2772 2773 if (snm == NULL || tnm == NULL) 2774 return (SET_ERROR(EINVAL)); 2775 2776 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2777 return (SET_ERROR(EINVAL)); 2778 2779 /* Already checked by Linux VFS, but just to make sure. */ 2780 if (rflags & RENAME_EXCHANGE && 2781 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2782 return (SET_ERROR(EINVAL)); 2783 2784 /* 2785 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2786 * right kind of vattr_t for the whiteout file. These are set 2787 * internally by ZFS so should never be incorrect. 2788 */ 2789 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2790 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2791 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2792 2793 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2794 return (error); 2795 zilog = zfsvfs->z_log; 2796 2797 if ((error = zfs_verify_zp(tdzp)) != 0) { 2798 zfs_exit(zfsvfs, FTAG); 2799 return (error); 2800 } 2801 2802 /* 2803 * We check i_sb because snapshots and the ctldir must have different 2804 * super blocks. 2805 */ 2806 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2807 zfsctl_is_node(ZTOI(tdzp))) { 2808 zfs_exit(zfsvfs, FTAG); 2809 return (SET_ERROR(EXDEV)); 2810 } 2811 2812 if (zfsvfs->z_utf8 && u8_validate(tnm, 2813 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2814 zfs_exit(zfsvfs, FTAG); 2815 return (SET_ERROR(EILSEQ)); 2816 } 2817 2818 if (flags & FIGNORECASE) 2819 zflg |= ZCILOOK; 2820 2821 top: 2822 szp = NULL; 2823 tzp = NULL; 2824 zl = NULL; 2825 2826 /* 2827 * This is to prevent the creation of links into attribute space 2828 * by renaming a linked file into/outof an attribute directory. 2829 * See the comment in zfs_link() for why this is considered bad. 2830 */ 2831 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2832 zfs_exit(zfsvfs, FTAG); 2833 return (SET_ERROR(EINVAL)); 2834 } 2835 2836 /* 2837 * Lock source and target directory entries. To prevent deadlock, 2838 * a lock ordering must be defined. We lock the directory with 2839 * the smallest object id first, or if it's a tie, the one with 2840 * the lexically first name. 2841 */ 2842 if (sdzp->z_id < tdzp->z_id) { 2843 cmp = -1; 2844 } else if (sdzp->z_id > tdzp->z_id) { 2845 cmp = 1; 2846 } else { 2847 /* 2848 * First compare the two name arguments without 2849 * considering any case folding. 2850 */ 2851 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2852 2853 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2854 ASSERT(error == 0 || !zfsvfs->z_utf8); 2855 if (cmp == 0) { 2856 /* 2857 * POSIX: "If the old argument and the new argument 2858 * both refer to links to the same existing file, 2859 * the rename() function shall return successfully 2860 * and perform no other action." 2861 */ 2862 zfs_exit(zfsvfs, FTAG); 2863 return (0); 2864 } 2865 /* 2866 * If the file system is case-folding, then we may 2867 * have some more checking to do. A case-folding file 2868 * system is either supporting mixed case sensitivity 2869 * access or is completely case-insensitive. Note 2870 * that the file system is always case preserving. 2871 * 2872 * In mixed sensitivity mode case sensitive behavior 2873 * is the default. FIGNORECASE must be used to 2874 * explicitly request case insensitive behavior. 2875 * 2876 * If the source and target names provided differ only 2877 * by case (e.g., a request to rename 'tim' to 'Tim'), 2878 * we will treat this as a special case in the 2879 * case-insensitive mode: as long as the source name 2880 * is an exact match, we will allow this to proceed as 2881 * a name-change request. 2882 */ 2883 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2884 (zfsvfs->z_case == ZFS_CASE_MIXED && 2885 flags & FIGNORECASE)) && 2886 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2887 &error) == 0) { 2888 /* 2889 * case preserving rename request, require exact 2890 * name matches 2891 */ 2892 zflg |= ZCIEXACT; 2893 zflg &= ~ZCILOOK; 2894 } 2895 } 2896 2897 /* 2898 * If the source and destination directories are the same, we should 2899 * grab the z_name_lock of that directory only once. 2900 */ 2901 if (sdzp == tdzp) { 2902 zflg |= ZHAVELOCK; 2903 rw_enter(&sdzp->z_name_lock, RW_READER); 2904 } 2905 2906 if (cmp < 0) { 2907 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2908 ZEXISTS | zflg, NULL, NULL); 2909 terr = zfs_dirent_lock(&tdl, 2910 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2911 } else { 2912 terr = zfs_dirent_lock(&tdl, 2913 tdzp, tnm, &tzp, zflg, NULL, NULL); 2914 serr = zfs_dirent_lock(&sdl, 2915 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2916 NULL, NULL); 2917 } 2918 2919 if (serr) { 2920 /* 2921 * Source entry invalid or not there. 2922 */ 2923 if (!terr) { 2924 zfs_dirent_unlock(tdl); 2925 if (tzp) 2926 zrele(tzp); 2927 } 2928 2929 if (sdzp == tdzp) 2930 rw_exit(&sdzp->z_name_lock); 2931 2932 if (strcmp(snm, "..") == 0) 2933 serr = EINVAL; 2934 zfs_exit(zfsvfs, FTAG); 2935 return (serr); 2936 } 2937 if (terr) { 2938 zfs_dirent_unlock(sdl); 2939 zrele(szp); 2940 2941 if (sdzp == tdzp) 2942 rw_exit(&sdzp->z_name_lock); 2943 2944 if (strcmp(tnm, "..") == 0) 2945 terr = EINVAL; 2946 zfs_exit(zfsvfs, FTAG); 2947 return (terr); 2948 } 2949 2950 /* 2951 * If we are using project inheritance, means if the directory has 2952 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2953 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2954 * such case, we only allow renames into our tree when the project 2955 * IDs are the same. 2956 */ 2957 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2958 tdzp->z_projid != szp->z_projid) { 2959 error = SET_ERROR(EXDEV); 2960 goto out; 2961 } 2962 2963 /* 2964 * Must have write access at the source to remove the old entry 2965 * and write access at the target to create the new entry. 2966 * Note that if target and source are the same, this can be 2967 * done in a single check. 2968 */ 2969 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2970 goto out; 2971 2972 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2973 /* 2974 * Check to make sure rename is valid. 2975 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2976 */ 2977 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2978 goto out; 2979 } 2980 2981 /* 2982 * Does target exist? 2983 */ 2984 if (tzp) { 2985 if (rflags & RENAME_NOREPLACE) { 2986 error = SET_ERROR(EEXIST); 2987 goto out; 2988 } 2989 /* 2990 * Source and target must be the same type (unless exchanging). 2991 */ 2992 if (!(rflags & RENAME_EXCHANGE)) { 2993 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 2994 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 2995 2996 if (s_is_dir != t_is_dir) { 2997 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 2998 goto out; 2999 } 3000 } 3001 /* 3002 * POSIX dictates that when the source and target 3003 * entries refer to the same file object, rename 3004 * must do nothing and exit without error. 3005 */ 3006 if (szp->z_id == tzp->z_id) { 3007 error = 0; 3008 goto out; 3009 } 3010 } else if (rflags & RENAME_EXCHANGE) { 3011 /* Target must exist for RENAME_EXCHANGE. */ 3012 error = SET_ERROR(ENOENT); 3013 goto out; 3014 } 3015 3016 /* Set up inode creation for RENAME_WHITEOUT. */ 3017 if (rflags & RENAME_WHITEOUT) { 3018 /* 3019 * Whiteout files are not regular files or directories, so to 3020 * match zfs_create() we do not inherit the project id. 3021 */ 3022 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 3023 3024 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 3025 if (error) 3026 goto out; 3027 3028 if (!have_acl) { 3029 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 3030 &acl_ids, mnt_ns); 3031 if (error) 3032 goto out; 3033 have_acl = B_TRUE; 3034 } 3035 3036 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 3037 error = SET_ERROR(EDQUOT); 3038 goto out; 3039 } 3040 } 3041 3042 tx = dmu_tx_create(zfsvfs->z_os); 3043 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3044 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3045 dmu_tx_hold_zap(tx, sdzp->z_id, 3046 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); 3047 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3048 if (sdzp != tdzp) { 3049 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3050 zfs_sa_upgrade_txholds(tx, tdzp); 3051 } 3052 if (tzp) { 3053 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3054 zfs_sa_upgrade_txholds(tx, tzp); 3055 } 3056 if (rflags & RENAME_WHITEOUT) { 3057 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3058 ZFS_SA_BASE_ATTR_SIZE); 3059 3060 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 3061 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3062 if (!zfsvfs->z_use_sa && 3063 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3064 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3065 0, acl_ids.z_aclp->z_acl_bytes); 3066 } 3067 } 3068 fuid_dirtied = zfsvfs->z_fuid_dirty; 3069 if (fuid_dirtied) 3070 zfs_fuid_txhold(zfsvfs, tx); 3071 zfs_sa_upgrade_txholds(tx, szp); 3072 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3073 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3074 if (error) { 3075 if (zl != NULL) 3076 zfs_rename_unlock(&zl); 3077 zfs_dirent_unlock(sdl); 3078 zfs_dirent_unlock(tdl); 3079 3080 if (sdzp == tdzp) 3081 rw_exit(&sdzp->z_name_lock); 3082 3083 if (error == ERESTART) { 3084 waited = B_TRUE; 3085 dmu_tx_wait(tx); 3086 dmu_tx_abort(tx); 3087 zrele(szp); 3088 if (tzp) 3089 zrele(tzp); 3090 goto top; 3091 } 3092 dmu_tx_abort(tx); 3093 zrele(szp); 3094 if (tzp) 3095 zrele(tzp); 3096 zfs_exit(zfsvfs, FTAG); 3097 return (error); 3098 } 3099 3100 /* 3101 * Unlink the source. 3102 */ 3103 szp->z_pflags |= ZFS_AV_MODIFIED; 3104 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3105 szp->z_pflags |= ZFS_PROJINHERIT; 3106 3107 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3108 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3109 VERIFY0(error); 3110 3111 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3112 if (error) 3113 goto commit; 3114 3115 /* 3116 * Unlink the target. 3117 */ 3118 if (tzp) { 3119 int tzflg = zflg; 3120 3121 if (rflags & RENAME_EXCHANGE) { 3122 /* This inode will be re-linked soon. */ 3123 tzflg |= ZRENAMING; 3124 3125 tzp->z_pflags |= ZFS_AV_MODIFIED; 3126 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3127 tzp->z_pflags |= ZFS_PROJINHERIT; 3128 3129 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3130 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3131 ASSERT0(error); 3132 } 3133 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3134 if (error) 3135 goto commit_link_szp; 3136 } 3137 3138 /* 3139 * Create the new target links: 3140 * * We always link the target. 3141 * * RENAME_EXCHANGE: Link the old target to the source. 3142 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3143 */ 3144 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3145 if (error) { 3146 /* 3147 * If we have removed the existing target, a subsequent call to 3148 * zfs_link_create() to add back the same entry, but with a new 3149 * dnode (szp), should not fail. 3150 */ 3151 ASSERT3P(tzp, ==, NULL); 3152 goto commit_link_tzp; 3153 } 3154 3155 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3156 case RENAME_EXCHANGE: 3157 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3158 /* 3159 * The same argument as zfs_link_create() failing for 3160 * szp applies here, since the source directory must 3161 * have had an entry we are replacing. 3162 */ 3163 ASSERT0(error); 3164 if (error) 3165 goto commit_unlink_td_szp; 3166 break; 3167 case RENAME_WHITEOUT: 3168 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3169 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3170 if (error) { 3171 zfs_znode_delete(wzp, tx); 3172 remove_inode_hash(ZTOI(wzp)); 3173 goto commit_unlink_td_szp; 3174 } 3175 break; 3176 } 3177 3178 if (fuid_dirtied) 3179 zfs_fuid_sync(zfsvfs, tx); 3180 3181 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3182 case RENAME_EXCHANGE: 3183 zfs_log_rename_exchange(zilog, tx, 3184 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3185 tdzp, tdl->dl_name, szp); 3186 break; 3187 case RENAME_WHITEOUT: 3188 zfs_log_rename_whiteout(zilog, tx, 3189 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3190 tdzp, tdl->dl_name, szp, wzp); 3191 break; 3192 default: 3193 ASSERT0(rflags & ~RENAME_NOREPLACE); 3194 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3195 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3196 break; 3197 } 3198 3199 commit: 3200 dmu_tx_commit(tx); 3201 out: 3202 if (have_acl) 3203 zfs_acl_ids_free(&acl_ids); 3204 3205 zfs_znode_update_vfs(sdzp); 3206 if (sdzp == tdzp) 3207 rw_exit(&sdzp->z_name_lock); 3208 3209 if (sdzp != tdzp) 3210 zfs_znode_update_vfs(tdzp); 3211 3212 zfs_znode_update_vfs(szp); 3213 zrele(szp); 3214 if (wzp) { 3215 zfs_znode_update_vfs(wzp); 3216 zrele(wzp); 3217 } 3218 if (tzp) { 3219 zfs_znode_update_vfs(tzp); 3220 zrele(tzp); 3221 } 3222 3223 if (zl != NULL) 3224 zfs_rename_unlock(&zl); 3225 3226 zfs_dirent_unlock(sdl); 3227 zfs_dirent_unlock(tdl); 3228 3229 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3230 zil_commit(zilog, 0); 3231 3232 zfs_exit(zfsvfs, FTAG); 3233 return (error); 3234 3235 /* 3236 * Clean-up path for broken link state. 3237 * 3238 * At this point we are in a (very) bad state, so we need to do our 3239 * best to correct the state. In particular, all of the nlinks are 3240 * wrong because we were destroying and creating links with ZRENAMING. 3241 * 3242 * In some form, all of these operations have to resolve the state: 3243 * 3244 * * link_destroy() *must* succeed. Fortunately, this is very likely 3245 * since we only just created it. 3246 * 3247 * * link_create()s are allowed to fail (though they shouldn't because 3248 * we only just unlinked them and are putting the entries back 3249 * during clean-up). But if they fail, we can just forcefully drop 3250 * the nlink value to (at the very least) avoid broken nlink values 3251 * -- though in the case of non-empty directories we will have to 3252 * panic (otherwise we'd have a leaked directory with a broken ..). 3253 */ 3254 commit_unlink_td_szp: 3255 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3256 commit_link_tzp: 3257 if (tzp) { 3258 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3259 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3260 } 3261 commit_link_szp: 3262 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3263 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3264 goto commit; 3265 } 3266 3267 /* 3268 * Insert the indicated symbolic reference entry into the directory. 3269 * 3270 * IN: dzp - Directory to contain new symbolic link. 3271 * name - Name of directory entry in dip. 3272 * vap - Attributes of new entry. 3273 * link - Name for new symlink entry. 3274 * cr - credentials of caller. 3275 * flags - case flags 3276 * mnt_ns - user namespace of the mount 3277 * 3278 * OUT: zpp - Znode for new symbolic link. 3279 * 3280 * RETURN: 0 on success, error code on failure. 3281 * 3282 * Timestamps: 3283 * dip - ctime|mtime updated 3284 */ 3285 int 3286 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3287 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) 3288 { 3289 znode_t *zp; 3290 zfs_dirlock_t *dl; 3291 dmu_tx_t *tx; 3292 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3293 zilog_t *zilog; 3294 uint64_t len = strlen(link); 3295 int error; 3296 int zflg = ZNEW; 3297 zfs_acl_ids_t acl_ids; 3298 boolean_t fuid_dirtied; 3299 uint64_t txtype = TX_SYMLINK; 3300 boolean_t waited = B_FALSE; 3301 3302 ASSERT(S_ISLNK(vap->va_mode)); 3303 3304 if (name == NULL) 3305 return (SET_ERROR(EINVAL)); 3306 3307 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3308 return (error); 3309 zilog = zfsvfs->z_log; 3310 3311 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3312 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3313 zfs_exit(zfsvfs, FTAG); 3314 return (SET_ERROR(EILSEQ)); 3315 } 3316 if (flags & FIGNORECASE) 3317 zflg |= ZCILOOK; 3318 3319 if (len > MAXPATHLEN) { 3320 zfs_exit(zfsvfs, FTAG); 3321 return (SET_ERROR(ENAMETOOLONG)); 3322 } 3323 3324 if ((error = zfs_acl_ids_create(dzp, 0, 3325 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3326 zfs_exit(zfsvfs, FTAG); 3327 return (error); 3328 } 3329 top: 3330 *zpp = NULL; 3331 3332 /* 3333 * Attempt to lock directory; fail if entry already exists. 3334 */ 3335 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3336 if (error) { 3337 zfs_acl_ids_free(&acl_ids); 3338 zfs_exit(zfsvfs, FTAG); 3339 return (error); 3340 } 3341 3342 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3343 zfs_acl_ids_free(&acl_ids); 3344 zfs_dirent_unlock(dl); 3345 zfs_exit(zfsvfs, FTAG); 3346 return (error); 3347 } 3348 3349 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3350 zfs_acl_ids_free(&acl_ids); 3351 zfs_dirent_unlock(dl); 3352 zfs_exit(zfsvfs, FTAG); 3353 return (SET_ERROR(EDQUOT)); 3354 } 3355 tx = dmu_tx_create(zfsvfs->z_os); 3356 fuid_dirtied = zfsvfs->z_fuid_dirty; 3357 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3358 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3359 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3360 ZFS_SA_BASE_ATTR_SIZE + len); 3361 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3362 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3363 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3364 acl_ids.z_aclp->z_acl_bytes); 3365 } 3366 if (fuid_dirtied) 3367 zfs_fuid_txhold(zfsvfs, tx); 3368 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3369 if (error) { 3370 zfs_dirent_unlock(dl); 3371 if (error == ERESTART) { 3372 waited = B_TRUE; 3373 dmu_tx_wait(tx); 3374 dmu_tx_abort(tx); 3375 goto top; 3376 } 3377 zfs_acl_ids_free(&acl_ids); 3378 dmu_tx_abort(tx); 3379 zfs_exit(zfsvfs, FTAG); 3380 return (error); 3381 } 3382 3383 /* 3384 * Create a new object for the symlink. 3385 * for version 4 ZPL datasets the symlink will be an SA attribute 3386 */ 3387 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3388 3389 if (fuid_dirtied) 3390 zfs_fuid_sync(zfsvfs, tx); 3391 3392 mutex_enter(&zp->z_lock); 3393 if (zp->z_is_sa) 3394 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3395 link, len, tx); 3396 else 3397 zfs_sa_symlink(zp, link, len, tx); 3398 mutex_exit(&zp->z_lock); 3399 3400 zp->z_size = len; 3401 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3402 &zp->z_size, sizeof (zp->z_size), tx); 3403 /* 3404 * Insert the new object into the directory. 3405 */ 3406 error = zfs_link_create(dl, zp, tx, ZNEW); 3407 if (error != 0) { 3408 zfs_znode_delete(zp, tx); 3409 remove_inode_hash(ZTOI(zp)); 3410 } else { 3411 if (flags & FIGNORECASE) 3412 txtype |= TX_CI; 3413 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3414 3415 zfs_znode_update_vfs(dzp); 3416 zfs_znode_update_vfs(zp); 3417 } 3418 3419 zfs_acl_ids_free(&acl_ids); 3420 3421 dmu_tx_commit(tx); 3422 3423 zfs_dirent_unlock(dl); 3424 3425 if (error == 0) { 3426 *zpp = zp; 3427 3428 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3429 zil_commit(zilog, 0); 3430 } else { 3431 zrele(zp); 3432 } 3433 3434 zfs_exit(zfsvfs, FTAG); 3435 return (error); 3436 } 3437 3438 /* 3439 * Return, in the buffer contained in the provided uio structure, 3440 * the symbolic path referred to by ip. 3441 * 3442 * IN: ip - inode of symbolic link 3443 * uio - structure to contain the link path. 3444 * cr - credentials of caller. 3445 * 3446 * RETURN: 0 if success 3447 * error code if failure 3448 * 3449 * Timestamps: 3450 * ip - atime updated 3451 */ 3452 int 3453 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) 3454 { 3455 (void) cr; 3456 znode_t *zp = ITOZ(ip); 3457 zfsvfs_t *zfsvfs = ITOZSB(ip); 3458 int error; 3459 3460 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3461 return (error); 3462 3463 mutex_enter(&zp->z_lock); 3464 if (zp->z_is_sa) 3465 error = sa_lookup_uio(zp->z_sa_hdl, 3466 SA_ZPL_SYMLINK(zfsvfs), uio); 3467 else 3468 error = zfs_sa_readlink(zp, uio); 3469 mutex_exit(&zp->z_lock); 3470 3471 zfs_exit(zfsvfs, FTAG); 3472 return (error); 3473 } 3474 3475 /* 3476 * Insert a new entry into directory tdzp referencing szp. 3477 * 3478 * IN: tdzp - Directory to contain new entry. 3479 * szp - znode of new entry. 3480 * name - name of new entry. 3481 * cr - credentials of caller. 3482 * flags - case flags. 3483 * 3484 * RETURN: 0 if success 3485 * error code if failure 3486 * 3487 * Timestamps: 3488 * tdzp - ctime|mtime updated 3489 * szp - ctime updated 3490 */ 3491 int 3492 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, 3493 int flags) 3494 { 3495 struct inode *sip = ZTOI(szp); 3496 znode_t *tzp; 3497 zfsvfs_t *zfsvfs = ZTOZSB(tdzp); 3498 zilog_t *zilog; 3499 zfs_dirlock_t *dl; 3500 dmu_tx_t *tx; 3501 int error; 3502 int zf = ZNEW; 3503 uint64_t parent; 3504 uid_t owner; 3505 boolean_t waited = B_FALSE; 3506 boolean_t is_tmpfile = 0; 3507 uint64_t txg; 3508 3509 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); 3510 3511 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); 3512 3513 if (name == NULL) 3514 return (SET_ERROR(EINVAL)); 3515 3516 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) 3517 return (error); 3518 zilog = zfsvfs->z_log; 3519 3520 /* 3521 * POSIX dictates that we return EPERM here. 3522 * Better choices include ENOTSUP or EISDIR. 3523 */ 3524 if (S_ISDIR(sip->i_mode)) { 3525 zfs_exit(zfsvfs, FTAG); 3526 return (SET_ERROR(EPERM)); 3527 } 3528 3529 if ((error = zfs_verify_zp(szp)) != 0) { 3530 zfs_exit(zfsvfs, FTAG); 3531 return (error); 3532 } 3533 3534 /* 3535 * If we are using project inheritance, means if the directory has 3536 * ZFS_PROJINHERIT set, then its descendant directories will inherit 3537 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 3538 * such case, we only allow hard link creation in our tree when the 3539 * project IDs are the same. 3540 */ 3541 if (tdzp->z_pflags & ZFS_PROJINHERIT && 3542 tdzp->z_projid != szp->z_projid) { 3543 zfs_exit(zfsvfs, FTAG); 3544 return (SET_ERROR(EXDEV)); 3545 } 3546 3547 /* 3548 * We check i_sb because snapshots and the ctldir must have different 3549 * super blocks. 3550 */ 3551 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { 3552 zfs_exit(zfsvfs, FTAG); 3553 return (SET_ERROR(EXDEV)); 3554 } 3555 3556 /* Prevent links to .zfs/shares files */ 3557 3558 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 3559 &parent, sizeof (uint64_t))) != 0) { 3560 zfs_exit(zfsvfs, FTAG); 3561 return (error); 3562 } 3563 if (parent == zfsvfs->z_shares_dir) { 3564 zfs_exit(zfsvfs, FTAG); 3565 return (SET_ERROR(EPERM)); 3566 } 3567 3568 if (zfsvfs->z_utf8 && u8_validate(name, 3569 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3570 zfs_exit(zfsvfs, FTAG); 3571 return (SET_ERROR(EILSEQ)); 3572 } 3573 if (flags & FIGNORECASE) 3574 zf |= ZCILOOK; 3575 3576 /* 3577 * We do not support links between attributes and non-attributes 3578 * because of the potential security risk of creating links 3579 * into "normal" file space in order to circumvent restrictions 3580 * imposed in attribute space. 3581 */ 3582 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { 3583 zfs_exit(zfsvfs, FTAG); 3584 return (SET_ERROR(EINVAL)); 3585 } 3586 3587 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), 3588 cr, ZFS_OWNER); 3589 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 3590 zfs_exit(zfsvfs, FTAG); 3591 return (SET_ERROR(EPERM)); 3592 } 3593 3594 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, 3595 zfs_init_idmap))) { 3596 zfs_exit(zfsvfs, FTAG); 3597 return (error); 3598 } 3599 3600 top: 3601 /* 3602 * Attempt to lock directory; fail if entry already exists. 3603 */ 3604 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); 3605 if (error) { 3606 zfs_exit(zfsvfs, FTAG); 3607 return (error); 3608 } 3609 3610 tx = dmu_tx_create(zfsvfs->z_os); 3611 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3612 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); 3613 if (is_tmpfile) 3614 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3615 3616 zfs_sa_upgrade_txholds(tx, szp); 3617 zfs_sa_upgrade_txholds(tx, tdzp); 3618 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 3619 if (error) { 3620 zfs_dirent_unlock(dl); 3621 if (error == ERESTART) { 3622 waited = B_TRUE; 3623 dmu_tx_wait(tx); 3624 dmu_tx_abort(tx); 3625 goto top; 3626 } 3627 dmu_tx_abort(tx); 3628 zfs_exit(zfsvfs, FTAG); 3629 return (error); 3630 } 3631 /* unmark z_unlinked so zfs_link_create will not reject */ 3632 if (is_tmpfile) 3633 szp->z_unlinked = B_FALSE; 3634 error = zfs_link_create(dl, szp, tx, 0); 3635 3636 if (error == 0) { 3637 uint64_t txtype = TX_LINK; 3638 /* 3639 * tmpfile is created to be in z_unlinkedobj, so remove it. 3640 * Also, we don't log in ZIL, because all previous file 3641 * operation on the tmpfile are ignored by ZIL. Instead we 3642 * always wait for txg to sync to make sure all previous 3643 * operation are sync safe. 3644 */ 3645 if (is_tmpfile) { 3646 VERIFY(zap_remove_int(zfsvfs->z_os, 3647 zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); 3648 } else { 3649 if (flags & FIGNORECASE) 3650 txtype |= TX_CI; 3651 zfs_log_link(zilog, tx, txtype, tdzp, szp, name); 3652 } 3653 } else if (is_tmpfile) { 3654 /* restore z_unlinked since when linking failed */ 3655 szp->z_unlinked = B_TRUE; 3656 } 3657 txg = dmu_tx_get_txg(tx); 3658 dmu_tx_commit(tx); 3659 3660 zfs_dirent_unlock(dl); 3661 3662 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3663 zil_commit(zilog, 0); 3664 3665 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) 3666 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); 3667 3668 zfs_znode_update_vfs(tdzp); 3669 zfs_znode_update_vfs(szp); 3670 zfs_exit(zfsvfs, FTAG); 3671 return (error); 3672 } 3673 3674 static void 3675 zfs_putpage_sync_commit_cb(void *arg) 3676 { 3677 struct page *pp = arg; 3678 3679 ClearPageError(pp); 3680 end_page_writeback(pp); 3681 } 3682 3683 static void 3684 zfs_putpage_async_commit_cb(void *arg) 3685 { 3686 struct page *pp = arg; 3687 znode_t *zp = ITOZ(pp->mapping->host); 3688 3689 ClearPageError(pp); 3690 end_page_writeback(pp); 3691 atomic_dec_32(&zp->z_async_writes_cnt); 3692 } 3693 3694 /* 3695 * Push a page out to disk, once the page is on stable storage the 3696 * registered commit callback will be run as notification of completion. 3697 * 3698 * IN: ip - page mapped for inode. 3699 * pp - page to push (page is locked) 3700 * wbc - writeback control data 3701 * for_sync - does the caller intend to wait synchronously for the 3702 * page writeback to complete? 3703 * 3704 * RETURN: 0 if success 3705 * error code if failure 3706 * 3707 * Timestamps: 3708 * ip - ctime|mtime updated 3709 */ 3710 int 3711 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, 3712 boolean_t for_sync) 3713 { 3714 znode_t *zp = ITOZ(ip); 3715 zfsvfs_t *zfsvfs = ITOZSB(ip); 3716 loff_t offset; 3717 loff_t pgoff; 3718 unsigned int pglen; 3719 dmu_tx_t *tx; 3720 caddr_t va; 3721 int err = 0; 3722 uint64_t mtime[2], ctime[2]; 3723 inode_timespec_t tmp_ts; 3724 sa_bulk_attr_t bulk[3]; 3725 int cnt = 0; 3726 struct address_space *mapping; 3727 3728 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3729 return (err); 3730 3731 ASSERT(PageLocked(pp)); 3732 3733 pgoff = page_offset(pp); /* Page byte-offset in file */ 3734 offset = i_size_read(ip); /* File length in bytes */ 3735 pglen = MIN(PAGE_SIZE, /* Page length in bytes */ 3736 P2ROUNDUP(offset, PAGE_SIZE)-pgoff); 3737 3738 /* Page is beyond end of file */ 3739 if (pgoff >= offset) { 3740 unlock_page(pp); 3741 zfs_exit(zfsvfs, FTAG); 3742 return (0); 3743 } 3744 3745 /* Truncate page length to end of file */ 3746 if (pgoff + pglen > offset) 3747 pglen = offset - pgoff; 3748 3749 #if 0 3750 /* 3751 * FIXME: Allow mmap writes past its quota. The correct fix 3752 * is to register a page_mkwrite() handler to count the page 3753 * against its quota when it is about to be dirtied. 3754 */ 3755 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 3756 KUID_TO_SUID(ip->i_uid)) || 3757 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3758 KGID_TO_SGID(ip->i_gid)) || 3759 (zp->z_projid != ZFS_DEFAULT_PROJID && 3760 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 3761 zp->z_projid))) { 3762 err = EDQUOT; 3763 } 3764 #endif 3765 3766 /* 3767 * The ordering here is critical and must adhere to the following 3768 * rules in order to avoid deadlocking in either zfs_read() or 3769 * zfs_free_range() due to a lock inversion. 3770 * 3771 * 1) The page must be unlocked prior to acquiring the range lock. 3772 * This is critical because zfs_read() calls find_lock_page() 3773 * which may block on the page lock while holding the range lock. 3774 * 3775 * 2) Before setting or clearing write back on a page the range lock 3776 * must be held in order to prevent a lock inversion with the 3777 * zfs_free_range() function. 3778 * 3779 * This presents a problem because upon entering this function the 3780 * page lock is already held. To safely acquire the range lock the 3781 * page lock must be dropped. This creates a window where another 3782 * process could truncate, invalidate, dirty, or write out the page. 3783 * 3784 * Therefore, after successfully reacquiring the range and page locks 3785 * the current page state is checked. In the common case everything 3786 * will be as is expected and it can be written out. However, if 3787 * the page state has changed it must be handled accordingly. 3788 */ 3789 mapping = pp->mapping; 3790 redirty_page_for_writepage(wbc, pp); 3791 unlock_page(pp); 3792 3793 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 3794 pgoff, pglen, RL_WRITER); 3795 lock_page(pp); 3796 3797 /* Page mapping changed or it was no longer dirty, we're done */ 3798 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { 3799 unlock_page(pp); 3800 zfs_rangelock_exit(lr); 3801 zfs_exit(zfsvfs, FTAG); 3802 return (0); 3803 } 3804 3805 /* Another process started write block if required */ 3806 if (PageWriteback(pp)) { 3807 unlock_page(pp); 3808 zfs_rangelock_exit(lr); 3809 3810 if (wbc->sync_mode != WB_SYNC_NONE) { 3811 /* 3812 * Speed up any non-sync page writebacks since 3813 * they may take several seconds to complete. 3814 * Refer to the comment in zpl_fsync() for details. 3815 */ 3816 if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { 3817 zil_commit(zfsvfs->z_log, zp->z_id); 3818 } 3819 3820 if (PageWriteback(pp)) 3821 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT 3822 folio_wait_bit(page_folio(pp), PG_writeback); 3823 #else 3824 wait_on_page_bit(pp, PG_writeback); 3825 #endif 3826 } 3827 3828 zfs_exit(zfsvfs, FTAG); 3829 return (0); 3830 } 3831 3832 /* Clear the dirty flag the required locks are held */ 3833 if (!clear_page_dirty_for_io(pp)) { 3834 unlock_page(pp); 3835 zfs_rangelock_exit(lr); 3836 zfs_exit(zfsvfs, FTAG); 3837 return (0); 3838 } 3839 3840 /* 3841 * Counterpart for redirty_page_for_writepage() above. This page 3842 * was in fact not skipped and should not be counted as if it were. 3843 */ 3844 wbc->pages_skipped--; 3845 if (!for_sync) 3846 atomic_inc_32(&zp->z_async_writes_cnt); 3847 set_page_writeback(pp); 3848 unlock_page(pp); 3849 3850 tx = dmu_tx_create(zfsvfs->z_os); 3851 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); 3852 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3853 zfs_sa_upgrade_txholds(tx, zp); 3854 3855 err = dmu_tx_assign(tx, TXG_WAIT); 3856 if (err != 0) { 3857 dmu_tx_abort(tx); 3858 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO 3859 filemap_dirty_folio(page_mapping(pp), page_folio(pp)); 3860 #else 3861 __set_page_dirty_nobuffers(pp); 3862 #endif 3863 ClearPageError(pp); 3864 end_page_writeback(pp); 3865 if (!for_sync) 3866 atomic_dec_32(&zp->z_async_writes_cnt); 3867 zfs_rangelock_exit(lr); 3868 zfs_exit(zfsvfs, FTAG); 3869 return (err); 3870 } 3871 3872 va = kmap(pp); 3873 ASSERT3U(pglen, <=, PAGE_SIZE); 3874 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); 3875 kunmap(pp); 3876 3877 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3878 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3879 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, 3880 &zp->z_pflags, 8); 3881 3882 /* Preserve the mtime and ctime provided by the inode */ 3883 tmp_ts = zpl_inode_get_mtime(ip); 3884 ZFS_TIME_ENCODE(&tmp_ts, mtime); 3885 tmp_ts = zpl_inode_get_ctime(ip); 3886 ZFS_TIME_ENCODE(&tmp_ts, ctime); 3887 zp->z_atime_dirty = B_FALSE; 3888 zp->z_seq++; 3889 3890 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3891 3892 boolean_t commit = B_FALSE; 3893 if (wbc->sync_mode != WB_SYNC_NONE) { 3894 /* 3895 * Note that this is rarely called under writepages(), because 3896 * writepages() normally handles the entire commit for 3897 * performance reasons. 3898 */ 3899 commit = B_TRUE; 3900 } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { 3901 /* 3902 * If the caller does not intend to wait synchronously 3903 * for this page writeback to complete and there are active 3904 * synchronous calls on this file, do a commit so that 3905 * the latter don't accidentally end up waiting for 3906 * our writeback to complete. Refer to the comment in 3907 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. 3908 */ 3909 commit = B_TRUE; 3910 } 3911 3912 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, 3913 B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : 3914 zfs_putpage_async_commit_cb, pp); 3915 3916 dmu_tx_commit(tx); 3917 3918 zfs_rangelock_exit(lr); 3919 3920 if (commit) 3921 zil_commit(zfsvfs->z_log, zp->z_id); 3922 3923 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); 3924 3925 zfs_exit(zfsvfs, FTAG); 3926 return (err); 3927 } 3928 3929 /* 3930 * Update the system attributes when the inode has been dirtied. For the 3931 * moment we only update the mode, atime, mtime, and ctime. 3932 */ 3933 int 3934 zfs_dirty_inode(struct inode *ip, int flags) 3935 { 3936 znode_t *zp = ITOZ(ip); 3937 zfsvfs_t *zfsvfs = ITOZSB(ip); 3938 dmu_tx_t *tx; 3939 uint64_t mode, atime[2], mtime[2], ctime[2]; 3940 inode_timespec_t tmp_ts; 3941 sa_bulk_attr_t bulk[4]; 3942 int error = 0; 3943 int cnt = 0; 3944 3945 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 3946 return (0); 3947 3948 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3949 return (error); 3950 3951 #ifdef I_DIRTY_TIME 3952 /* 3953 * This is the lazytime semantic introduced in Linux 4.0 3954 * This flag will only be called from update_time when lazytime is set. 3955 * (Note, I_DIRTY_SYNC will also set if not lazytime) 3956 * Fortunately mtime and ctime are managed within ZFS itself, so we 3957 * only need to dirty atime. 3958 */ 3959 if (flags == I_DIRTY_TIME) { 3960 zp->z_atime_dirty = B_TRUE; 3961 goto out; 3962 } 3963 #endif 3964 3965 tx = dmu_tx_create(zfsvfs->z_os); 3966 3967 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3968 zfs_sa_upgrade_txholds(tx, zp); 3969 3970 error = dmu_tx_assign(tx, TXG_WAIT); 3971 if (error) { 3972 dmu_tx_abort(tx); 3973 goto out; 3974 } 3975 3976 mutex_enter(&zp->z_lock); 3977 zp->z_atime_dirty = B_FALSE; 3978 3979 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 3980 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 3981 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3982 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3983 3984 /* Preserve the mode, mtime and ctime provided by the inode */ 3985 tmp_ts = zpl_inode_get_atime(ip); 3986 ZFS_TIME_ENCODE(&tmp_ts, atime); 3987 tmp_ts = zpl_inode_get_mtime(ip); 3988 ZFS_TIME_ENCODE(&tmp_ts, mtime); 3989 tmp_ts = zpl_inode_get_ctime(ip); 3990 ZFS_TIME_ENCODE(&tmp_ts, ctime); 3991 mode = ip->i_mode; 3992 3993 zp->z_mode = mode; 3994 3995 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3996 mutex_exit(&zp->z_lock); 3997 3998 dmu_tx_commit(tx); 3999 out: 4000 zfs_exit(zfsvfs, FTAG); 4001 return (error); 4002 } 4003 4004 void 4005 zfs_inactive(struct inode *ip) 4006 { 4007 znode_t *zp = ITOZ(ip); 4008 zfsvfs_t *zfsvfs = ITOZSB(ip); 4009 uint64_t atime[2]; 4010 int error; 4011 int need_unlock = 0; 4012 4013 /* Only read lock if we haven't already write locked, e.g. rollback */ 4014 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 4015 need_unlock = 1; 4016 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4017 } 4018 if (zp->z_sa_hdl == NULL) { 4019 if (need_unlock) 4020 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4021 return; 4022 } 4023 4024 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 4025 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4026 4027 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4028 zfs_sa_upgrade_txholds(tx, zp); 4029 error = dmu_tx_assign(tx, TXG_WAIT); 4030 if (error) { 4031 dmu_tx_abort(tx); 4032 } else { 4033 inode_timespec_t tmp_atime; 4034 tmp_atime = zpl_inode_get_atime(ip); 4035 ZFS_TIME_ENCODE(&tmp_atime, atime); 4036 mutex_enter(&zp->z_lock); 4037 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4038 (void *)&atime, sizeof (atime), tx); 4039 zp->z_atime_dirty = B_FALSE; 4040 mutex_exit(&zp->z_lock); 4041 dmu_tx_commit(tx); 4042 } 4043 } 4044 4045 zfs_zinactive(zp); 4046 if (need_unlock) 4047 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4048 } 4049 4050 /* 4051 * Fill pages with data from the disk. 4052 */ 4053 static int 4054 zfs_fillpage(struct inode *ip, struct page *pp) 4055 { 4056 znode_t *zp = ITOZ(ip); 4057 zfsvfs_t *zfsvfs = ITOZSB(ip); 4058 loff_t i_size = i_size_read(ip); 4059 u_offset_t io_off = page_offset(pp); 4060 size_t io_len = PAGE_SIZE; 4061 4062 ASSERT3U(io_off, <, i_size); 4063 4064 if (io_off + io_len > i_size) 4065 io_len = i_size - io_off; 4066 4067 void *va = kmap(pp); 4068 int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, 4069 io_len, va, DMU_READ_PREFETCH); 4070 if (io_len != PAGE_SIZE) 4071 memset((char *)va + io_len, 0, PAGE_SIZE - io_len); 4072 kunmap(pp); 4073 4074 if (error) { 4075 /* convert checksum errors into IO errors */ 4076 if (error == ECKSUM) 4077 error = SET_ERROR(EIO); 4078 4079 SetPageError(pp); 4080 ClearPageUptodate(pp); 4081 } else { 4082 ClearPageError(pp); 4083 SetPageUptodate(pp); 4084 } 4085 4086 return (error); 4087 } 4088 4089 /* 4090 * Uses zfs_fillpage to read data from the file and fill the page. 4091 * 4092 * IN: ip - inode of file to get data from. 4093 * pp - page to read 4094 * 4095 * RETURN: 0 on success, error code on failure. 4096 * 4097 * Timestamps: 4098 * vp - atime updated 4099 */ 4100 int 4101 zfs_getpage(struct inode *ip, struct page *pp) 4102 { 4103 zfsvfs_t *zfsvfs = ITOZSB(ip); 4104 znode_t *zp = ITOZ(ip); 4105 int error; 4106 loff_t i_size = i_size_read(ip); 4107 u_offset_t io_off = page_offset(pp); 4108 size_t io_len = PAGE_SIZE; 4109 4110 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4111 return (error); 4112 4113 ASSERT3U(io_off, <, i_size); 4114 4115 if (io_off + io_len > i_size) 4116 io_len = i_size - io_off; 4117 4118 /* 4119 * It is important to hold the rangelock here because it is possible 4120 * a Direct I/O write or block clone might be taking place at the same 4121 * time that a page is being faulted in through filemap_fault(). With 4122 * Direct I/O writes and block cloning db->db_data will be set to NULL 4123 * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the 4124 * rangelock is not held, then there is a race between faulting in a 4125 * page and writing out a Direct I/O write or block cloning. Without 4126 * the rangelock a NULL pointer dereference can occur in 4127 * dmu_read_impl() for db->db_data during the mempcy operation when 4128 * zfs_fillpage() calls dmu_read(). 4129 */ 4130 zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock, 4131 io_off, io_len, RL_READER); 4132 if (lr == NULL) { 4133 /* 4134 * It is important to drop the page lock before grabbing the 4135 * rangelock to avoid another deadlock between here and 4136 * zfs_write() -> update_pages(). update_pages() holds both the 4137 * rangelock and the page lock. 4138 */ 4139 get_page(pp); 4140 unlock_page(pp); 4141 lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, 4142 io_len, RL_READER); 4143 lock_page(pp); 4144 put_page(pp); 4145 } 4146 error = zfs_fillpage(ip, pp); 4147 zfs_rangelock_exit(lr); 4148 4149 if (error == 0) 4150 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); 4151 4152 zfs_exit(zfsvfs, FTAG); 4153 4154 return (error); 4155 } 4156 4157 /* 4158 * Check ZFS specific permissions to memory map a section of a file. 4159 * 4160 * IN: ip - inode of the file to mmap 4161 * off - file offset 4162 * addrp - start address in memory region 4163 * len - length of memory region 4164 * vm_flags- address flags 4165 * 4166 * RETURN: 0 if success 4167 * error code if failure 4168 */ 4169 int 4170 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, 4171 unsigned long vm_flags) 4172 { 4173 (void) addrp; 4174 znode_t *zp = ITOZ(ip); 4175 zfsvfs_t *zfsvfs = ITOZSB(ip); 4176 int error; 4177 4178 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4179 return (error); 4180 4181 if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && 4182 (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4183 zfs_exit(zfsvfs, FTAG); 4184 return (SET_ERROR(EPERM)); 4185 } 4186 4187 if ((vm_flags & (VM_READ | VM_EXEC)) && 4188 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4189 zfs_exit(zfsvfs, FTAG); 4190 return (SET_ERROR(EACCES)); 4191 } 4192 4193 if (off < 0 || len > MAXOFFSET_T - off) { 4194 zfs_exit(zfsvfs, FTAG); 4195 return (SET_ERROR(ENXIO)); 4196 } 4197 4198 zfs_exit(zfsvfs, FTAG); 4199 return (0); 4200 } 4201 4202 /* 4203 * Free or allocate space in a file. Currently, this function only 4204 * supports the `F_FREESP' command. However, this command is somewhat 4205 * misnamed, as its functionality includes the ability to allocate as 4206 * well as free space. 4207 * 4208 * IN: zp - znode of file to free data in. 4209 * cmd - action to take (only F_FREESP supported). 4210 * bfp - section of file to free/alloc. 4211 * flag - current file open mode flags. 4212 * offset - current file offset. 4213 * cr - credentials of caller. 4214 * 4215 * RETURN: 0 on success, error code on failure. 4216 * 4217 * Timestamps: 4218 * zp - ctime|mtime updated 4219 */ 4220 int 4221 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, 4222 offset_t offset, cred_t *cr) 4223 { 4224 (void) offset; 4225 zfsvfs_t *zfsvfs = ZTOZSB(zp); 4226 uint64_t off, len; 4227 int error; 4228 4229 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4230 return (error); 4231 4232 if (cmd != F_FREESP) { 4233 zfs_exit(zfsvfs, FTAG); 4234 return (SET_ERROR(EINVAL)); 4235 } 4236 4237 /* 4238 * Callers might not be able to detect properly that we are read-only, 4239 * so check it explicitly here. 4240 */ 4241 if (zfs_is_readonly(zfsvfs)) { 4242 zfs_exit(zfsvfs, FTAG); 4243 return (SET_ERROR(EROFS)); 4244 } 4245 4246 if (bfp->l_len < 0) { 4247 zfs_exit(zfsvfs, FTAG); 4248 return (SET_ERROR(EINVAL)); 4249 } 4250 4251 /* 4252 * Permissions aren't checked on Solaris because on this OS 4253 * zfs_space() can only be called with an opened file handle. 4254 * On Linux we can get here through truncate_range() which 4255 * operates directly on inodes, so we need to check access rights. 4256 */ 4257 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4258 zfs_init_idmap))) { 4259 zfs_exit(zfsvfs, FTAG); 4260 return (error); 4261 } 4262 4263 off = bfp->l_start; 4264 len = bfp->l_len; /* 0 means from off to end of file */ 4265 4266 error = zfs_freesp(zp, off, len, flag, TRUE); 4267 4268 zfs_exit(zfsvfs, FTAG); 4269 return (error); 4270 } 4271 4272 int 4273 zfs_fid(struct inode *ip, fid_t *fidp) 4274 { 4275 znode_t *zp = ITOZ(ip); 4276 zfsvfs_t *zfsvfs = ITOZSB(ip); 4277 uint32_t gen; 4278 uint64_t gen64; 4279 uint64_t object = zp->z_id; 4280 zfid_short_t *zfid; 4281 int size, i, error; 4282 4283 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4284 return (error); 4285 4286 if (fidp->fid_len < SHORT_FID_LEN) { 4287 fidp->fid_len = SHORT_FID_LEN; 4288 zfs_exit(zfsvfs, FTAG); 4289 return (SET_ERROR(ENOSPC)); 4290 } 4291 4292 if ((error = zfs_verify_zp(zp)) != 0) { 4293 zfs_exit(zfsvfs, FTAG); 4294 return (error); 4295 } 4296 4297 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4298 &gen64, sizeof (uint64_t))) != 0) { 4299 zfs_exit(zfsvfs, FTAG); 4300 return (error); 4301 } 4302 4303 gen = (uint32_t)gen64; 4304 4305 size = SHORT_FID_LEN; 4306 4307 zfid = (zfid_short_t *)fidp; 4308 4309 zfid->zf_len = size; 4310 4311 for (i = 0; i < sizeof (zfid->zf_object); i++) 4312 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4313 4314 /* Must have a non-zero generation number to distinguish from .zfs */ 4315 if (gen == 0) 4316 gen = 1; 4317 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4318 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4319 4320 zfs_exit(zfsvfs, FTAG); 4321 return (0); 4322 } 4323 4324 #if defined(_KERNEL) 4325 EXPORT_SYMBOL(zfs_open); 4326 EXPORT_SYMBOL(zfs_close); 4327 EXPORT_SYMBOL(zfs_lookup); 4328 EXPORT_SYMBOL(zfs_create); 4329 EXPORT_SYMBOL(zfs_tmpfile); 4330 EXPORT_SYMBOL(zfs_remove); 4331 EXPORT_SYMBOL(zfs_mkdir); 4332 EXPORT_SYMBOL(zfs_rmdir); 4333 EXPORT_SYMBOL(zfs_readdir); 4334 EXPORT_SYMBOL(zfs_getattr_fast); 4335 EXPORT_SYMBOL(zfs_setattr); 4336 EXPORT_SYMBOL(zfs_rename); 4337 EXPORT_SYMBOL(zfs_symlink); 4338 EXPORT_SYMBOL(zfs_readlink); 4339 EXPORT_SYMBOL(zfs_link); 4340 EXPORT_SYMBOL(zfs_inactive); 4341 EXPORT_SYMBOL(zfs_space); 4342 EXPORT_SYMBOL(zfs_fid); 4343 EXPORT_SYMBOL(zfs_getpage); 4344 EXPORT_SYMBOL(zfs_putpage); 4345 EXPORT_SYMBOL(zfs_dirty_inode); 4346 EXPORT_SYMBOL(zfs_map); 4347 4348 module_param(zfs_delete_blocks, ulong, 0644); 4349 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4350 #endif 4351